Merge in upstream updates from latest download of official source
dav committed Jan 15, 2018
1 parent 80be14a commit 551cd5d
Showing 18 changed files with 360 additions and 277 deletions.
20 changes: 15 additions & 5 deletions .gitignore
@@ -1,5 +1,15 @@
src/compute-accuracy
src/distance
src/word-analogy
src/word2phrase
src/word2vec
bin/compute-accuracy
bin/distance
bin/word-analogy
bin/word2phrase
bin/word2vec
data/text8.gz
data/vectors-phrase
data/news.2012.en.shuffled
data/text8
data/text8-vector.bin
data/news.2012.en.shuffled-norm0
data/news.2012.en.shuffled-norm0-phrase0
data/news.2012.en.shuffled-norm0-phrase1
data/news.2012.en.shuffled-norm1-phrase1
data/vectors-phrase.bin
17 changes: 17 additions & 0 deletions Makefile
@@ -0,0 +1,17 @@
DATA_DIR=./data
BIN_DIR=./bin
SRC_DIR=./src

clean:
rm -f $(DATA_DIR)/vectors-phrase
rm -f $(DATA_DIR)/vectors-phrase.bin
rm -f $(DATA_DIR)/news.2012.en.shuffled-norm0
rm -f $(DATA_DIR)/news.2012.en.shuffled-norm0-phrase1
rm -f $(DATA_DIR)/news.2012.en.shuffled-norm1-phrase1

veryclean: clean
rm -f $(DATA_DIR)/news.2012.en.shuffled
rm -f $(DATA_DIR)/news.2012.en.shuffled.gz

build:
cd $(SRC_DIR) && $(MAKE)
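
A quick usage sketch for these targets, assuming the src/ Makefile drops the compiled binaries where the scripts expect them:

    make build       # compile the C tools under src/
    make clean       # remove generated phrase and vector files under data/
    make veryclean   # additionally remove the downloaded news corpus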
3 changes: 2 additions & 1 deletion README.md
@@ -16,6 +16,8 @@ To get started:
cd scripts && ./demo-word.sh

------------------------------------------------------
Original README text follows:


This tool provides an efficient implementation of the continuous bag-of-words and skip-gram architectures for computing vector representations of words. These representations can be subsequently used in many natural language processing applications and for further research.

@@ -39,4 +41,3 @@ The script demo-word.sh downloads a small (100MB) text corpus from the web, and
is finished, the user can interactively explore the similarity of the words.

More information about the scripts is provided at https://code.google.com/p/word2vec/
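
To run the same pipeline by hand, here is a minimal sketch (paths and flags are taken from the scripts in this commit; it assumes the binaries have already been built with `make build`):

    cd scripts
    sh ./create-text8-data.sh                  # download and unpack the text8 corpus into ../data
    sh ./create-text8-vector-data.sh           # train ../data/text8-vector.bin (CBOW, 200 dims)
    ../bin/distance ../data/text8-vector.bin   # interactively explore nearest words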

40 changes: 40 additions & 0 deletions scripts/create-lowercase-phrases-data.sh
@@ -0,0 +1,40 @@
#!/bin/bash

DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

GZIPPED_DATA=$DATA_DIR/news.2012.en.shuffled.gz
TEXT_DATA=$DATA_DIR/news.2012.en.shuffled
NORM0=$DATA_DIR/news.2012.en.shuffled-norm0
PHRASE0=$DATA_DIR/news.2012.en.shuffled-norm0-phrase0
PHRASE1=$DATA_DIR/news.2012.en.shuffled-norm0-phrase1
LOWERCASE_PHRASES=$DATA_DIR/news.2012.en.shuffled-norm1-phrase1
LOWERCASE_PHRASES_VECTOR_DATA=$DATA_DIR/lowercase-vectors-phrase.bin

if [ ! -e $LOWERCASE_PHRASES_VECTOR_DATA ]; then
if [ ! -e $LOWERCASE_PHRASES ]; then
if [ ! -e $TEXT_DATA ]; then
if [ ! -e $GZIPPED_DATA ]; then
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz -O $GZIPPED_DATA
fi
gzip -d $GZIPPED_DATA -f
fi

echo -----------------------------------------------------------------------------------------------------
echo "-- Creating normalized version of word data (output: $NORM0)"

sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < $TEXT_DATA | tr -c "A-Za-z'_ \n" " " > $NORM0

echo -----------------------------------------------------------------------------------------------------
echo "-- Creating lowercased phrases (output: $LOWERCASE_PHRASES)"

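# Two passes of word2phrase: the first joins frequent word bigrams (threshold 200);
# the second pass over that output joins longer phrases (threshold 100).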
time $BIN_DIR/word2phrase -train $NORM0 -output $PHRASE0 -threshold 200 -debug 2
time $BIN_DIR/word2phrase -train $PHRASE0 -output $PHRASE1 -threshold 100 -debug 2
tr A-Z a-z < $PHRASE1 > $LOWERCASE_PHRASES
fi

echo -----------------------------------------------------------------------------------------------------
echo "-- Creating phrases (output: $LOWERCASE_PHRASES_VECTOR_DATA)..."
time $BIN_DIR/word2vec -train $LOWERCASE_PHRASES -output $LOWERCASE_PHRASES_VECTOR_DATA -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
fi
16 changes: 16 additions & 0 deletions scripts/create-text8-data.sh
@@ -0,0 +1,16 @@
#!/bin/bash

DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

TEXT_DATA=$DATA_DIR/text8
ZIPPED_TEXT_DATA="${TEXT_DATA}.gz"

if [ ! -e $TEXT_DATA ]; then
if [ ! -e $ZIPPED_TEXT_DATA ]; then
wget http://mattmahoney.net/dc/text8.zip -O $ZIPPED_TEXT_DATA
fi
gzip -d $ZIPPED_TEXT_DATA -f
fi
17 changes: 17 additions & 0 deletions scripts/create-text8-vector-data.sh
@@ -0,0 +1,17 @@
#!/bin/bash

DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

TEXT_DATA=$DATA_DIR/text8
VECTOR_DATA=$DATA_DIR/text8-vector.bin

if [ ! -e $VECTOR_DATA ]; then
if [ ! -e $TEXT_DATA ]; then
sh ./create-text8-data.sh
fi
echo -----------------------------------------------------------------------------------------------------
echo -- Training vectors...
time $BIN_DIR/word2vec -train $TEXT_DATA -output $VECTOR_DATA -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
fi
15 changes: 2 additions & 13 deletions scripts/demo-analogy.sh
@@ -4,7 +4,6 @@ DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

TEXT_DATA=$DATA_DIR/text8
VECTOR_DATA=$DATA_DIR/text8-vector.bin

pushd ${SRC_DIR} && make; popd
@@ -15,18 +14,8 @@ echo Example input: paris france berlin
echo -----------------------------------------------------------------------------------------------------

if [ ! -e $VECTOR_DATA ]; then

if [ ! -e $TEXT_DATA ]; then
wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
gzip -d $DATA_DIR/text8.gz -f
fi
echo -----------------------------------------------------------------------------------------------------
echo -- Training vectors...
time $BIN_DIR/word2vec -train $TEXT_DATA -output $VECTOR_DATA -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1

sh ./create-text8-vector-data.sh
fi

echo -----------------------------------------------------------------------------------------------------
echo -- analogy...

set -x
$BIN_DIR/word-analogy $VECTOR_DATA
14 changes: 7 additions & 7 deletions scripts/demo-classes.sh
@@ -4,23 +4,23 @@ DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

pushd ${SRC_DIR} && make; popd
sh ./create-text8-data.sh

TEXT_DATA=$DATA_DIR/text8
CLASSES_DATA=$DATA_DIR/classes.txt

pushd ${SRC_DIR} && make; popd


if [ ! -e $CLASSES_DATA ]; then

if [ ! -e $TEXT_DATA ]; then
wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
gzip -d $DATA_DIR/text8.gz -f
fi
sh ./create-text8-data.sh
fi
echo -----------------------------------------------------------------------------------------------------
echo -- Training vectors...
time $BIN_DIR/word2vec -train $TEXT_DATA -output $CLASSES_DATA -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500

time $BIN_DIR/word2vec -train $TEXT_DATA -output $CLASSES_DATA -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
fi

sort $CLASSES_DATA -k 2 -n > $DATA_DIR/classes.sorted.txt

echo The word classes were saved to file $DATA_DIR/classes.sorted.txt
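
Because -classes makes word2vec write one "word class-id" pair per line, the sorted file groups each k-means cluster together. A quick way to peek at a single cluster (class id 42 is an arbitrary example) might be:

    awk '$2 == 42 {print $1}' ../data/classes.sorted.txt | head -n 20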
36 changes: 3 additions & 33 deletions scripts/demo-phrase-accuracy.sh
@@ -4,38 +4,8 @@ DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

TEXT_DATA=$DATA_DIR/text8
PHRASES_DATA=$DATA_DIR/text8-phrases
PHRASES_VECTOR_DATA=$DATA_DIR/vectors-phrase.bin

echo ----------------------------------------------------------------------------------------------------------------
echo Note that the accuracy and coverage of the test set questions are going to be low with this small training corpus
echo To achieve better accuracy, a larger training set is needed
echo ----------------------------------------------------------------------------------------------------------------

pushd ${SRC_DIR} && make; popd
sh ./create-lowercase-phrases-data.sh

if [ ! -e $PHRASES_VECTOR_DATA ]; then

if [ ! -e $PHRASES_DATA ]; then

if [ ! -e $TEXT_DATA ]; then
wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
gzip -d $DATA_DIR/text8.gz -f
fi
echo -----------------------------------------------------------------------------------------------------
echo -- Creating phrases...
time $BIN_DIR/word2phrase -train $DATA_DIR/text8 -output $PHRASES_DATA -threshold 500 -debug 2 -min-count 3

fi

echo -----------------------------------------------------------------------------------------------------
echo -- Training vectors from phrases...
time $BIN_DIR/word2vec -train $PHRASES_DATA -output $PHRASES_VECTOR_DATA -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3

fi

echo -----------------------------------------------------------------------------------------------------
echo -- distance...

$BIN_DIR/compute-accuracy $PHRASES_VECTOR_DATA < $DATA_DIR/questions-phrases.txt
set -x
$BIN_DIR/compute-accuracy $DATA_DIR/lowercase-vectors-phrase.bin < $DATA_DIR/questions-phrases.txt
31 changes: 3 additions & 28 deletions scripts/demo-phrases.sh
@@ -4,33 +4,8 @@ DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

TEXT_DATA=$DATA_DIR/text8
PHRASES_DATA=$DATA_DIR/text8-phrases
PHRASES_VECTOR_DATA=$DATA_DIR/vectors-phrase.bin

pushd ${SRC_DIR} && make; popd
sh ./create-lowercase-phrases-data.sh

if [ ! -e $PHRASES_VECTOR_DATA ]; then

if [ ! -e $PHRASES_DATA ]; then

if [ ! -e $TEXT_DATA ]; then
wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
gzip -d $DATA_DIR/text8.gz -f
fi
echo -----------------------------------------------------------------------------------------------------
echo -- Creating phrases...
time $BIN_DIR/word2phrase -train $DATA_DIR/text8 -output $PHRASES_DATA -threshold 500 -debug 2

fi

echo -----------------------------------------------------------------------------------------------------
echo -- Training vectors from phrases...
time $BIN_DIR/word2vec -train $PHRASES_DATA -output $PHRASES_VECTOR_DATA -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1

fi

echo -----------------------------------------------------------------------------------------------------
echo -- distance...

$BIN_DIR/distance $PHRASES_VECTOR_DATA
set -x
$BIN_DIR/distance $DATA_DIR/lowercase-vectors-phrase.bin
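# Note: word2phrase joins multiword entries with underscores, so typical interactive
# queries to distance look like san_francisco or new_york (available entries depend on the corpus).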
105 changes: 105 additions & 0 deletions scripts/demo-train-big-model-v1.sh
@@ -0,0 +1,105 @@
echo 'Note: this new demo script does not yet make use of the $BIN_DIR, $DATA_DIR structure.'
echo 'Take care to avoid re-downloading and re-training data you already have locally.'
echo 'Edit this script to remove this safety message and the exit below before running.'
exit 1

###############################################################################################
#
# Script for training good word and phrase vector model using public corpora, version 1.0.
# The training time will be from several hours to about a day.
#
# Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains
# a 500-dimensional vector model and evaluates it on word and phrase analogy tasks.
#
###############################################################################################

# This function will convert text to lowercase and remove special characters
normalize_text() {
awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \
-e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
-e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \
-e 's/«/ /g' | tr 0-9 " "
}

mkdir word2vec
cd word2vec

wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz
gzip -d news.2012.en.shuffled.gz
gzip -d news.2013.en.shuffled.gz
normalize_text < news.2012.en.shuffled > data.txt
normalize_text < news.2013.en.shuffled >> data.txt

wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do
normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt
done

wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus
tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt
for i in `ls webbase_all`; do
normalize_text < webbase_all/$i >> data.txt
done

wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e '
# Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
# letters (a-z, converted from A-Z), and spaces (never consecutive).
# All other characters are converted to spaces. Only text which normally appears
# in the web browser is displayed. Tables are removed. Image captions are
# preserved. Links are converted to normal text. Digits are spelled out.
# *** Modified to not spell digits or throw away non-ASCII characters ***
# Written by Matt Mahoney, June 10, 2006. This program is released to the public domain.
$/=">"; # input record separator
while (<>) {
if (/<text /) {$text=1;} # remove all but between <text> ... </text>
if (/#redirect/i) {$text=0;} # remove #REDIRECT
if ($text) {
# Remove any text not normally visible
if (/<\/text>/) {$text=0;}
s/<.*>//; # remove xml tags
s/&amp;/&/g; # decode URL encoded chars
s/&lt;/</g;
s/&gt;/>/g;
s/<ref[^<]*<\/ref>//g; # remove references <ref...> ... </ref>
s/<[^>]*>//g; # remove xhtml tags
s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text
s/\|thumb//ig; # remove images links, preserve caption
s/\|left//ig;
s/\|right//ig;
s/\|\d+px//ig;
s/\[\[image:[^\[\]]*\|//ig;
s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup
s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages
s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text
s/{{[^}]*}}//g; # remove {{icons}} and {tables}
s/{[^}]*}//g;
s/\[//g; # remove [ and ]
s/\]//g;
s/&[^;]*;/ /g; # remove URL encoded chars
$_=" $_ ";
chop;
print $_;
}
}
' | normalize_text | awk '{if (NF>1) print;}' >> data.txt

wget http://word2vec.googlecode.com/svn/trunk/word2vec.c
wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c
wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c
wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt
wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt
gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops
gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops
gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops
./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2
./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2
./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10
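# The integer passed to compute-accuracy after the vector file caps evaluation at the
# N most frequent vocabulary words; a larger cap covers more questions but runs slower.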
./compute-accuracy vectors.bin 400000 < questions-words.txt # should get to almost 78% accuracy on 99.7% of questions
./compute-accuracy vectors.bin 1000000 < questions-phrases.txt # about 78% accuracy with 77% coverage