forked from dav/word2vec
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge in upstream updates from latest download of official source
- Loading branch information
Showing
18 changed files
with
360 additions
and
277 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,15 @@ | ||
src/compute-accuracy | ||
src/distance | ||
src/word-analogy | ||
src/word2phrase | ||
src/word2vec | ||
bin/compute-accuracy | ||
bin/distance | ||
bin/word-analogy | ||
bin/word2phrase | ||
bin/word2vec | ||
data/text8.gz | ||
data/vectors-phrase | ||
data/news.2012.en.shuffled | ||
data/text8 | ||
data/text8-vector.bin | ||
data/news.2012.en.shuffled-norm0 | ||
data/news.2012.en.shuffled-norm0-phrase0 | ||
data/news.2012.en.shuffled-norm0-phrase1 | ||
data/news.2012.en.shuffled-norm1-phrase1 | ||
data/vectors-phrase.bin |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Locations of downloaded/derived data, built binaries, and C sources.
DATA_DIR=./data
BIN_DIR=./bin
SRC_DIR=./src

# These targets are commands, not files they produce.
.PHONY: clean veryclean build

# Remove derived phrase/vector artifacts; keeps the downloaded corpus so it
# does not have to be re-fetched.
clean:
	rm -f $(DATA_DIR)/vectors-phrase
	rm -f $(DATA_DIR)/vectors-phrase.bin
	rm -f $(DATA_DIR)/news.2012.en.shuffled-norm0
	rm -f $(DATA_DIR)/news.2012.en.shuffled-norm0-phrase0
	rm -f $(DATA_DIR)/news.2012.en.shuffled-norm0-phrase1
	rm -f $(DATA_DIR)/news.2012.en.shuffled-norm1-phrase1

# Additionally remove the downloaded news corpus (re-running the phrase
# script will download it again).
veryclean: clean
	rm -f $(DATA_DIR)/news.2012.en.shuffled
	rm -f $(DATA_DIR)/news.2012.en.shuffled.gz

# Build the word2vec tools via the sub-makefile in src/.
# Uses `cd … && $(MAKE)` instead of `pushd … && make; popd`: pushd/popd are
# bash-isms absent from the default /bin/sh recipe shell, `; popd` discarded
# the sub-make's exit status (a failed build looked successful), and $(MAKE)
# propagates -j/-n flags to the recursive invocation.
build:
	cd $(SRC_DIR) && $(MAKE)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#!/bin/bash
# Download the WMT14 news.2012 corpus (if needed), normalize it, build two-pass
# word2phrase output, lowercase it, and train a binary phrase-vector model.
# Each stage is skipped when its output file already exists.

DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

GZIPPED_DATA=$DATA_DIR/news.2012.en.shuffled.gz
TEXT_DATA=$DATA_DIR/news.2012.en.shuffled
NORM0=$DATA_DIR/news.2012.en.shuffled-norm0
PHRASE0=$DATA_DIR/news.2012.en.shuffled-norm0-phrase0
PHRASE1=$DATA_DIR/news.2012.en.shuffled-norm0-phrase1
LOWERCASE_PHRASES=$DATA_DIR/news.2012.en.shuffled-norm1-phrase1
LOWERCASE_PHRASES_VECTOR_DATA=$DATA_DIR/lowercase-vectors-phrase.bin

if [ ! -e "$LOWERCASE_PHRASES_VECTOR_DATA" ]; then
  if [ ! -e "$LOWERCASE_PHRASES" ]; then
    if [ ! -e "$TEXT_DATA" ]; then
      # Was `$GZIPPED__DATA` (double underscore) — an undefined variable, so
      # the download guard never tested the real archive path.
      if [ ! -e "$GZIPPED_DATA" ]; then
        wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz -O "$GZIPPED_DATA"
      fi
      gzip -d "$GZIPPED_DATA" -f
    fi

    echo -----------------------------------------------------------------------------------------------------
    echo -- "Creating normalized version of word data (output: $NORM0)"

    # Normalize apostrophe variants, then squash everything outside the
    # letters/apostrophe/underscore/space/newline set to spaces.
    # Reads $TEXT_DATA (was the same path hard-coded a second time).
    sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < "$TEXT_DATA" | tr -c "A-Za-z'_ \n" " " > "$NORM0"

    echo -----------------------------------------------------------------------------------------------------
    echo "-- Creating lowercased phrases (output: $LOWERCASE_PHRASES)"

    # Two word2phrase passes: first joins very frequent bigrams, second joins
    # the remaining ones at a lower threshold; then lowercase the result.
    time "$BIN_DIR/word2phrase" -train "$NORM0" -output "$PHRASE0" -threshold 200 -debug 2
    time "$BIN_DIR/word2phrase" -train "$PHRASE0" -output "$PHRASE1" -threshold 100 -debug 2
    tr A-Z a-z < "$PHRASE1" > "$LOWERCASE_PHRASES"
  fi

  echo -----------------------------------------------------------------------------------------------------
  echo "-- Creating phrases (output: $LOWERCASE_PHRASES_VECTOR_DATA)..."
  time "$BIN_DIR/word2vec" -train "$LOWERCASE_PHRASES" -output "$LOWERCASE_PHRASES_VECTOR_DATA" -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash
# Download and unpack the text8 corpus into $DATA_DIR, skipping work that is
# already done.

DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

TEXT_DATA=$DATA_DIR/text8
ZIPPED_TEXT_DATA="${TEXT_DATA}.gz"

if [ ! -e "$TEXT_DATA" ]; then
  if [ ! -e "$ZIPPED_TEXT_DATA" ]; then
    # text8.zip is a single-member zip archive; gzip can decompress those, so
    # the download is saved under a .gz name and unpacked with gzip below.
    wget http://mattmahoney.net/dc/text8.zip -O "$ZIPPED_TEXT_DATA"
  fi
  # Produces $DATA_DIR/text8 (= $TEXT_DATA) directly.  The old trailing
  # `mv text8 $TEXT_DATA` referenced ./text8 in the current directory and
  # always failed — removed.
  gzip -d "$ZIPPED_TEXT_DATA" -f
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/bin/bash
# Train a binary CBOW vector model from the text8 corpus, fetching the corpus
# first if it is missing.  Skips training when the output file already exists.

DATA_DIR=../data
BIN_DIR=../bin
SRC_DIR=../src

TEXT_DATA=$DATA_DIR/text8
VECTOR_DATA=$DATA_DIR/text8-vector.bin

if [ ! -e "$VECTOR_DATA" ]; then
  if [ ! -e "$TEXT_DATA" ]; then
    # NOTE(review): this looks for the helper script under $DATA_DIR; if
    # create-text8-data.sh lives next to this script instead, the path should
    # be ./create-text8-data.sh — confirm against the repository layout.
    sh "$DATA_DIR/create-text8-data.sh"
  fi
  echo -----------------------------------------------------------------------------------------------------
  echo "-- Training vectors..."
  time "$BIN_DIR/word2vec" -train "$TEXT_DATA" -output "$VECTOR_DATA" -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
# Safety stop: this script downloads ~8 billion words and trains for hours to
# a day.  Remove the message and `exit 1` below to actually run it.
echo Note: this new demo script is not yet making use of the $BIN, $DATA structure.
echo Take care to avoid re-downloading and training data you already have locally.
echo Edit script to remove this safety message and exit.
exit 1

###############################################################################################
#
# Script for training good word and phrase vector model using public corpora, version 1.0.
# The training time will be from several hours to about a day.
#
# Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains
# a 500-dimensional vector model and evaluates it on word and phrase analogy tasks.
#
###############################################################################################

# This function will convert text to lowercase and remove special characters
normalize_text() {
  awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \
  -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
  -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \
  -e 's/«/ /g' | tr 0-9 " "
}

mkdir word2vec
cd word2vec

# Corpus 1: WMT14 monolingual news crawl (2012 + 2013).
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz
gzip -d news.2012.en.shuffled.gz
gzip -d news.2013.en.shuffled.gz
normalize_text < news.2012.en.shuffled > data.txt
normalize_text < news.2013.en.shuffled >> data.txt

# Corpus 2: 1-billion-word language modeling benchmark.
wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do
  normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt
done

# Corpus 3: UMBC WebBase corpus.
wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus
tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt
for i in `ls webbase_all`; do
  normalize_text < webbase_all/$i >> data.txt
done

# Corpus 4: English Wikipedia dump, filtered to plain text by the inline Perl
# program (Matt Mahoney's Wikipedia filter, modified as noted below).
wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e '
# Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
# letters (a-z, converted from A-Z), and spaces (never consecutive).
# All other characters are converted to spaces. Only text which normally appears
# in the web browser is displayed. Tables are removed. Image captions are
# preserved. Links are converted to normal text. Digits are spelled out.
# *** Modified to not spell digits or throw away non-ASCII characters ***
# Written by Matt Mahoney, June 10, 2006. This program is released to the public domain.
$/=">"; # input record separator
while (<>) {
  if (/<text /) {$text=1;} # remove all but between <text> ... </text>
  if (/#redirect/i) {$text=0;} # remove #REDIRECT
  if ($text) {
    # Remove any text not normally visible
    if (/<\/text>/) {$text=0;}
    s/<.*>//; # remove xml tags
    # Entity decoding: the web scrape had collapsed these to the no-ops
    # s/&/&/g, s/</</g, s/>/>/g; restored to the original escaped forms.
    s/&amp;/&/g; # decode URL encoded chars
    s/&lt;/</g;
    s/&gt;/>/g;
    s/<ref[^<]*<\/ref>//g; # remove references <ref...> ... </ref>
    s/<[^>]*>//g; # remove xhtml tags
    s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text
    s/\|thumb//ig; # remove images links, preserve caption
    s/\|left//ig;
    s/\|right//ig;
    s/\|\d+px//ig;
    s/\[\[image:[^\[\]]*\|//ig;
    s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup
    s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages
    s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text
    s/{{[^}]*}}//g; # remove {{icons}} and {tables}
    s/{[^}]*}//g;
    s/\[//g; # remove [ and ]
    s/\]//g;
    s/&[^;]*;/ /g; # remove URL encoded chars
    $_=" $_ ";
    chop;
    print $_;
  }
}
' | normalize_text | awk '{if (NF>1) print;}' >> data.txt

# Fetch and build the tools, then train phrases and vectors, and evaluate.
wget http://word2vec.googlecode.com/svn/trunk/word2vec.c
wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c
wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c
wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt
wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt
gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops
gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops
gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops
./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2
./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2
./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10
./compute-accuracy vectors.bin 400000 < questions-words.txt # should get to almost 78% accuracy on 99.7% of questions
./compute-accuracy vectors.bin 1000000 < questions-phrases.txt # about 78% accuracy with 77% coverage
Oops, something went wrong.