Merge in upstream updates from latest download of official source
dav committed Jan 15, 2018
1 parent 80be14a commit 551cd5d
Showing 18 changed files with 360 additions and 277 deletions.
20 changes: 15 additions & 5 deletions .gitignore
@@ -1,5 +1,15 @@
src/compute-accuracy
src/distance
src/word-analogy
src/word2phrase
src/word2vec
bin/compute-accuracy
bin/distance
bin/word-analogy
bin/word2phrase
bin/word2vec
data/text8.gz
data/vectors-phrase
data/news.2012.en.shuffled
data/text8
data/text8-vector.bin
data/news.2012.en.shuffled-norm0
data/news.2012.en.shuffled-norm0-phrase0
data/news.2012.en.shuffled-norm0-phrase1
data/news.2012.en.shuffled-norm1-phrase1
data/vectors-phrase.bin
17 changes: 17 additions & 0 deletions Makefile
@@ -0,0 +1,17 @@
DATA_DIR=./data
BIN_DIR=./bin
SRC_DIR=./src

clean:
rm -f $(DATA_DIR)/vectors-phrase
rm -f $(DATA_DIR)/vectors-phrase.bin
rm -f $(DATA_DIR)/news.2012.en.shuffled-norm0
rm -f $(DATA_DIR)/news.2012.en.shuffled-norm0-phrase1
rm -f $(DATA_DIR)/news.2012.en.shuffled-norm1-phrase1

veryclean: clean
rm -f $(DATA_DIR)/news.2012.en.shuffled
rm -f $(DATA_DIR)/news.2012.en.shuffled.gz

build:
cd $(SRC_DIR) && $(MAKE)
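
A quick usage sketch for these targets, assuming the src/ Makefile drops the compiled binaries where the scripts expect them:

    make build       # compile the C tools under src/
    make clean       # remove generated phrase and vector files under data/
    make veryclean   # additionally remove the downloaded news corpus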
3 changes: 2 additions & 1 deletion README.md
@@ -16,6 +16,8 @@ To get started:
cd scripts && ./demo-word.sh

------------------------------------------------------
Original README text follows:


This tool provides an efficient implementation of the continuous bag-of-words and skip-gram architectures for computing vector representations of words. These representations can be subsequently used in many natural language processing applications and for further research.

@@ -39,4 +41,3 @@ The script demo-word.sh downloads a small (100MB) text corpus from the web, and
is finished, the user can interactively explore the similarity of the words.

More information about the scripts is provided at https://code.google.com/p/word2vec/
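
To run the same pipeline by hand, here is a minimal sketch (paths and flags are taken from the scripts in this commit; it assumes the binaries have already been built with `make build`):

    cd scripts
    sh ./create-text8-data.sh                  # download and unpack the text8 corpus into ../data
    sh ./create-text8-vector-data.sh           # train ../data/text8-vector.bin (CBOW, 200 dims)
    ../bin/distance ../data/text8-vector.bin   # interactively explore nearest words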

40 changes: 40 additions & 0 deletions scripts/create-lowercase-phrases-data.sh
@@ -0,0 +1,40 @@
#!/bin/bash

DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

GZIPPED_DATA=$DATA_DIR/news.2012.en.shuffled.gz
TEXT_DATA=$DATA_DIR/news.2012.en.shuffled
NORM0=$DATA_DIR/news.2012.en.shuffled-norm0
PHRASE0=$DATA_DIR/news.2012.en.shuffled-norm0-phrase0
PHRASE1=$DATA_DIR/news.2012.en.shuffled-norm0-phrase1
LOWERCASE_PHRASES=$DATA_DIR/news.2012.en.shuffled-norm1-phrase1
LOWERCASE_PHRASES_VECTOR_DATA=$DATA_DIR/lowercase-vectors-phrase.bin

if [ ! -e $LOWERCASE_PHRASES_VECTOR_DATA ]; then
if [ ! -e $LOWERCASE_PHRASES ]; then
if [ ! -e $TEXT_DATA ]; then
if [ ! -e $GZIPPED_DATA ]; then
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz -O $GZIPPED_DATA
fi
gzip -d $GZIPPED_DATA -f
fi

echo -----------------------------------------------------------------------------------------------------
echo "-- Creating normalized version of word data (output: $NORM0)"

sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < $TEXT_DATA | tr -c "A-Za-z'_ \n" " " > $NORM0

echo -----------------------------------------------------------------------------------------------------
echo "-- Creating lowercased phrases (output: $LOWERCASE_PHRASES)"

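# Two passes of word2phrase: the first joins frequent word bigrams (threshold 200);
# the second pass over that output joins longer phrases (threshold 100).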
time $BIN_DIR/word2phrase -train $NORM0 -output $PHRASE0 -threshold 200 -debug 2
time $BIN_DIR/word2phrase -train $PHRASE0 -output $PHRASE1 -threshold 100 -debug 2
tr A-Z a-z < $PHRASE1 > $LOWERCASE_PHRASES
fi

echo -----------------------------------------------------------------------------------------------------
echo "-- Creating phrases (output: $LOWERCASE_PHRASES_VECTOR_DATA)..."
time $BIN_DIR/word2vec -train $LOWERCASE_PHRASES -output $LOWERCASE_PHRASES_VECTOR_DATA -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
fi
16 changes: 16 additions & 0 deletions scripts/create-text8-data.sh
@@ -0,0 +1,16 @@
#!/bin/bash

DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

TEXT_DATA=$DATA_DIR/text8
ZIPPED_TEXT_DATA="${TEXT_DATA}.gz"

if [ ! -e $TEXT_DATA ]; then
if [ ! -e $ZIPPED_TEXT_DATA ]; then
wget http://mattmahoney.net/dc/text8.zip -O $ZIPPED_TEXT_DATA
fi
gzip -d $ZIPPED_TEXT_DATA -f
fi
17 changes: 17 additions & 0 deletions scripts/create-text8-vector-data.sh
@@ -0,0 +1,17 @@
#!/bin/bash

DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

TEXT_DATA=$DATA_DIR/text8
VECTOR_DATA=$DATA_DIR/text8-vector.bin

if [ ! -e $VECTOR_DATA ]; then
if [ ! -e $TEXT_DATA ]; then
sh ./create-text8-data.sh
fi
echo -----------------------------------------------------------------------------------------------------
echo -- Training vectors...
time $BIN_DIR/word2vec -train $TEXT_DATA -output $VECTOR_DATA -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
fi
15 changes: 2 additions & 13 deletions scripts/demo-analogy.sh
@@ -4,7 +4,6 @@ DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

TEXT_DATA=$DATA_DIR/text8
VECTOR_DATA=$DATA_DIR/text8-vector.bin

pushd ${SRC_DIR} && make; popd
@@ -15,18 +14,8 @@ echo Example input: paris france berlin
echo -----------------------------------------------------------------------------------------------------

if [ ! -e $VECTOR_DATA ]; then

if [ ! -e $TEXT_DATA ]; then
wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
gzip -d $DATA_DIR/text8.gz -f
fi
echo -----------------------------------------------------------------------------------------------------
echo -- Training vectors...
time $BIN_DIR/word2vec -train $TEXT_DATA -output $VECTOR_DATA -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1

sh ./create-text8-vector-data.sh
fi

echo -----------------------------------------------------------------------------------------------------
echo -- analogy...

set -x
$BIN_DIR/word-analogy $VECTOR_DATA
14 changes: 7 additions & 7 deletions scripts/demo-classes.sh
@@ -4,23 +4,23 @@ DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

pushd ${SRC_DIR} && make; popd
sh ./create-text8-data.sh

TEXT_DATA=$DATA_DIR/text8
CLASSES_DATA=$DATA_DIR/classes.txt

pushd ${SRC_DIR} && make; popd


if [ ! -e $CLASSES_DATA ]; then

if [ ! -e $TEXT_DATA ]; then
wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
gzip -d $DATA_DIR/text8.gz -f
fi
sh ./create-text8-data.sh
fi
echo -----------------------------------------------------------------------------------------------------
echo -- Training vectors...
time $BIN_DIR/word2vec -train $TEXT_DATA -output $CLASSES_DATA -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500

time $BIN_DIR/word2vec -train $TEXT_DATA -output $CLASSES_DATA -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
fi

sort $CLASSES_DATA -k 2 -n > $DATA_DIR/classes.sorted.txt

echo The word classes were saved to file $DATA_DIR/classes.sorted.txt
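
Because -classes makes word2vec write one "word class-id" pair per line, the sorted file groups each k-means cluster together. A quick way to peek at a single cluster (class id 42 is an arbitrary example) might be:

    awk '$2 == 42 {print $1}' ../data/classes.sorted.txt | head -n 20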
36 changes: 3 additions & 33 deletions scripts/demo-phrase-accuracy.sh
@@ -4,38 +4,8 @@ DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

TEXT_DATA=$DATA_DIR/text8
PHRASES_DATA=$DATA_DIR/text8-phrases
PHRASES_VECTOR_DATA=$DATA_DIR/vectors-phrase.bin

echo ----------------------------------------------------------------------------------------------------------------
echo Note that the accuracy and coverage of the test set questions are going to be low with this small training corpus
echo To achieve better accuracy, a larger training set is needed
echo ----------------------------------------------------------------------------------------------------------------

pushd ${SRC_DIR} && make; popd
sh ./create-lowercase-phrases-data.sh

if [ ! -e $PHRASES_VECTOR_DATA ]; then

if [ ! -e $PHRASES_DATA ]; then

if [ ! -e $TEXT_DATA ]; then
wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
gzip -d $DATA_DIR/text8.gz -f
fi
echo -----------------------------------------------------------------------------------------------------
echo -- Creating phrases...
time $BIN_DIR/word2phrase -train $DATA_DIR/text8 -output $PHRASES_DATA -threshold 500 -debug 2 -min-count 3

fi

echo -----------------------------------------------------------------------------------------------------
echo -- Training vectors from phrases...
time $BIN_DIR/word2vec -train $PHRASES_DATA -output $PHRASES_VECTOR_DATA -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3

fi

echo -----------------------------------------------------------------------------------------------------
echo -- distance...

$BIN_DIR/compute-accuracy $PHRASES_VECTOR_DATA < $DATA_DIR/questions-phrases.txt
set -x
$BIN_DIR/compute-accuracy $DATA_DIR/lowercase-vectors-phrase.bin < $DATA_DIR/questions-phrases.txt
31 changes: 3 additions & 28 deletions scripts/demo-phrases.sh
@@ -4,33 +4,8 @@ DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

TEXT_DATA=$DATA_DIR/text8
PHRASES_DATA=$DATA_DIR/text8-phrases
PHRASES_VECTOR_DATA=$DATA_DIR/vectors-phrase.bin

pushd ${SRC_DIR} && make; popd
sh ./create-lowercase-phrases-data.sh

if [ ! -e $PHRASES_VECTOR_DATA ]; then

if [ ! -e $PHRASES_DATA ]; then

if [ ! -e $TEXT_DATA ]; then
wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
gzip -d $DATA_DIR/text8.gz -f
fi
echo -----------------------------------------------------------------------------------------------------
echo -- Creating phrases...
time $BIN_DIR/word2phrase -train $DATA_DIR/text8 -output $PHRASES_DATA -threshold 500 -debug 2

fi

echo -----------------------------------------------------------------------------------------------------
echo -- Training vectors from phrases...
time $BIN_DIR/word2vec -train $PHRASES_DATA -output $PHRASES_VECTOR_DATA -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1

fi

echo -----------------------------------------------------------------------------------------------------
echo -- distance...

$BIN_DIR/distance $PHRASES_VECTOR_DATA
set -x
$BIN_DIR/distance $DATA_DIR/lowercase-vectors-phrase.bin
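# Note: word2phrase joins multiword entries with underscores, so typical interactive
# queries to distance look like san_francisco or new_york (available entries depend on the corpus).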
105 changes: 105 additions & 0 deletions scripts/demo-train-big-model-v1.sh
@@ -0,0 +1,105 @@
echo 'Note: this new demo script does not yet make use of the $BIN_DIR, $DATA_DIR structure.'
echo 'Take care to avoid re-downloading and re-training data you already have locally.'
echo 'Edit this script to remove this safety message and the exit below before running.'
exit 1

###############################################################################################
#
# Script for training good word and phrase vector model using public corpora, version 1.0.
# The training time will be from several hours to about a day.
#
# Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains
# a 500-dimensional vector model and evaluates it on word and phrase analogy tasks.
#
###############################################################################################

# This function will convert text to lowercase and remove special characters
normalize_text() {
awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \
-e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
-e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \
-e 's/«/ /g' | tr 0-9 " "
}

mkdir word2vec
cd word2vec

wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz
gzip -d news.2012.en.shuffled.gz
gzip -d news.2013.en.shuffled.gz
normalize_text < news.2012.en.shuffled > data.txt
normalize_text < news.2013.en.shuffled >> data.txt

wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do
normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt
done

wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus
tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt
for i in `ls webbase_all`; do
normalize_text < webbase_all/$i >> data.txt
done

wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e '
# Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
# letters (a-z, converted from A-Z), and spaces (never consecutive).
# All other characters are converted to spaces. Only text which normally appears
# in the web browser is displayed. Tables are removed. Image captions are
# preserved. Links are converted to normal text. Digits are spelled out.
# *** Modified to not spell digits or throw away non-ASCII characters ***
# Written by Matt Mahoney, June 10, 2006. This program is released to the public domain.
$/=">"; # input record separator
while (<>) {
if (/<text /) {$text=1;} # remove all but between <text> ... </text>
if (/#redirect/i) {$text=0;} # remove #REDIRECT
if ($text) {
# Remove any text not normally visible
if (/<\/text>/) {$text=0;}
s/<.*>//; # remove xml tags
s/&amp;/&/g; # decode URL encoded chars
s/&lt;/</g;
s/&gt;/>/g;
s/<ref[^<]*<\/ref>//g; # remove references <ref...> ... </ref>
s/<[^>]*>//g; # remove xhtml tags
s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text
s/\|thumb//ig; # remove images links, preserve caption
s/\|left//ig;
s/\|right//ig;
s/\|\d+px//ig;
s/\[\[image:[^\[\]]*\|//ig;
s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup
s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages
s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text
s/{{[^}]*}}//g; # remove {{icons}} and {tables}
s/{[^}]*}//g;
s/\[//g; # remove [ and ]
s/\]//g;
s/&[^;]*;/ /g; # remove URL encoded chars
$_=" $_ ";
chop;
print $_;
}
}
' | normalize_text | awk '{if (NF>1) print;}' >> data.txt

wget http://word2vec.googlecode.com/svn/trunk/word2vec.c
wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c
wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c
wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt
wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt
gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops
gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops
gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops
./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2
./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2
./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10
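# The integer passed to compute-accuracy after the vector file caps evaluation at the
# N most frequent vocabulary words; a larger cap covers more questions but runs slower.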
./compute-accuracy vectors.bin 400000 < questions-words.txt # should get to almost 78% accuracy on 99.7% of questions
./compute-accuracy vectors.bin 1000000 < questions-phrases.txt # about 78% accuracy with 77% coverage