Commit adc8f2b

Merge branch 'main' of https://github.com/may-/joeynmt into main

2 parents: 4dcddd5 + 4a13290


46 files changed: +27183 −6725 lines

.github/workflows/main.yml

Lines changed: 3 additions & 3 deletions
@@ -35,8 +35,8 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install --upgrade torch==1.11.0+cu115 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu115
-          pip install -e .
+          python -m pip install --upgrade torch torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+          python -m pip install -e .

       # Check code format
       - name: Lint
@@ -48,4 +48,4 @@ jobs:
       # Run unittest
       - name: Test
         run: |
-          python -m pytest
+          python -m unittest
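Context for this change: GitHub-hosted runners have no GPU, so pulling pinned CUDA 11.5 wheels only inflated CI time; the CPU-only wheel index keeps the install small, and dropping the version pins lets CI track current PyTorch releases. As a minimal sketch of the same pattern in a self-contained workflow (job layout, runner image, and action versions are illustrative assumptions, not part of this commit):

```yaml
# Minimal illustrative workflow reusing the install pattern above.
# Job name, runner image, and action versions are assumptions.
name: ci
on: [push, pull_request]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # CPU-only wheels: hosted runners have no GPU
          python -m pip install --upgrade torch torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
          python -m pip install -e .
      - name: Test
        run: python -m unittest
```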

.pylintrc

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ unsafe-load-any-extension=no
 # A comma-separated list of package or module names from where C extensions may
 # be loaded. Extensions are loading into the active Python interpreter and may
 # run arbitrary code
-extension-pkg-whitelist=
+extension-pkg-whitelist=fastBPE

 [MESSAGES CONTROL]

README.md

Lines changed: 8 additions & 11 deletions
@@ -14,7 +14,7 @@ Joey S2T implements the following features:
 - CMVN, SpecAugment
 - WER evaluation

-Furthermore, all the functionalities in JoeyNMT v2.0 are also available from JoeyS2T:
+Furthermore, all the functionalities in JoeyNMT v2 are also available from JoeyS2T:
 - BLEU and ChrF evaluation
 - BPE tokenization (with BPE dropout option)
 - Beam search and greedy decoding (with repetition penalty, ngram blocker)
@@ -26,31 +26,30 @@ Furthermore, all the functionalities in JoeyNMT v2.0 are also available from Joe


 ## Installation
+
 JoeyS2T is built on [PyTorch](https://pytorch.org/). Please make sure you have a compatible environment.
 We tested JoeyS2T with
 - python 3.10
-- torch 1.11.0
-- cuda 11.5
+- torch 1.12.1
+- cuda 11.6

 Clone this repository and install via pip:
 ```bash
 $ git clone https://github.com/may-/joeys2t.git
 $ cd joeynmt
-$ pip install . -e
-```
-Run the unit tests:
-```bash
-$ python -m unittest
+$ pip install -e .
 ```


+
 ## Documentation & Tutorials

 Please check the JoeyNMT's [documentation](https://joeynmt.readthedocs.io) first, if you are not familiar with JoeyNMT yet.

 For details, follow the tutorials in [notebooks](notebooks) dir.
+
 - [quick-start-with-joeynmt2](notebooks/quick-start-with-joeynmt2.ipynb)
-- [speech-to-text-with-joeynmt2](notebooks/joeyS2T_ASR_tutorial.ipynb)
+- [speech-to-text-with-joeynmt2](notebooks/joeyS2T_ASR_tutorial.ipynb)



@@ -67,5 +66,3 @@ Please leave an issue if you have found a bug in the code.

 For general questions, email me at `ohta <at> cl.uni-heidelberg.de`.

-
-
configs/iwslt14_deen_sp.yaml

Lines changed: 5 additions & 4 deletions
@@ -17,10 +17,10 @@ data:
         level: "bpe"
         voc_limit: 32000
         voc_min_freq: 1
-        voc_file: "test/data/iwslt14_sp.vocab"
+        voc_file: "test/data/iwslt14/sp.vocab"
         tokenizer_type: "sentencepiece"
         tokenizer_cfg:
-            model_file: "test/data/iwslt14_sp.model"
+            model_file: "test/data/iwslt14/sp.model"
             model_type: "unigram"
             character_coverage: 1.0
             alpha: 0.1
@@ -33,10 +33,10 @@ data:
         level: "bpe"
         voc_limit: 32000
         voc_min_freq: 1
-        voc_file: "test/data/iwslt14_sp.vocab"
+        voc_file: "test/data/iwslt14/sp.vocab"
         tokenizer_type: "sentencepiece"
         tokenizer_cfg:
-            model_file: "test/data/iwslt14_sp.model"
+            model_file: "test/data/iwslt14/sp.model"
             model_type: "unigram"
             character_coverage: 1.0
             alpha: 0.1
@@ -82,6 +82,7 @@ training:
     overwrite: False
     shuffle: True
     use_cuda: True
+    fp16: True
     print_valid_sents: [0, 1, 2, 3]
     keep_best_ckpts: 5
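The one functional addition here is `fp16: True`, which enables mixed-precision training. A minimal sketch of where the flag sits in a JoeyNMT-style training block (only `fp16` is taken from this hunk; the surrounding keys and values are illustrative assumptions):

```yaml
# Illustrative training block; only fp16 comes from this commit,
# the remaining keys and values are assumed for context.
training:
    random_seed: 42
    optimizer: "adam"
    learning_rate: 0.0002
    batch_size: 4096
    batch_type: "token"
    use_cuda: True
    fp16: True          # mixed-precision training
    shuffle: True
    keep_best_ckpts: 5
```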

configs/jparacrawl_enja_sp.yaml

Lines changed: 7 additions & 7 deletions
@@ -8,7 +8,7 @@ data:
     test: "../datasets/datasets/kftt"
     dataset_type: "huggingface"
     dataset_cfg:
-        name: "en-ja"
+        name: "ja-en"
     sample_train_subset: -1
     sample_dev_subset: 200
     src:
@@ -19,10 +19,10 @@ data:
         level: "bpe"
         voc_limit: 32000
         voc_min_freq: 1
-        voc_file: "/scratch5t/ohta/jparacrawl_v3/spm_en.vocab"
+        voc_file: "subwords/jparacrawl_en.vocab"
         tokenizer_type: "sentencepiece"
         tokenizer_cfg:
-            model_file: "/scratch5t/ohta/jparacrawl_v3/spm_en.model"
+            model_file: "subwords/jparacrawl_en.model"
             model_type: "unigram"
             character_coverage: 1.0
             nbest_size: 10
@@ -35,10 +35,10 @@ data:
         level: "bpe"
         voc_limit: 32000
         voc_min_freq: 1
-        voc_file: "/scratch5t/ohta/jparacrawl_v3/spm_ja.vocab"
+        voc_file: "subwords/jparacrawl_ja.vocab"
         tokenizer_type: "sentencepiece"
         tokenizer_cfg:
-            model_file: "/scratch5t/ohta/jparacrawl_v3/spm_ja.model"
+            model_file: "subwords/jparacrawl_ja.model"
             model_type: "unigram"
             character_coverage: 0.995
             nbest_size: 10
@@ -61,8 +61,8 @@ testing:
     tokenize: "ja-mecab"

 training:
-    #load_model: "/workspace/mitarb/ohta/models/jparacrawl_enja_seed456/best.ckpt"
-    random_seed: 456
+    #load_model: "models/jparacrawl_enja/best.ckpt"
+    random_seed: 42
     optimizer: "adam"
     normalization: "tokens"
     adam_betas: [0.9, 0.98]
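The `name` fix from "en-ja" to "ja-en" is more than cosmetic: for Hugging Face-backed datasets, `name` selects the dataset config, and requesting a config that does not exist fails at load time. A hedged sketch of how the pieces of such a data block fit together (the key layout mirrors the hunks above; the exact values are assumptions):

```yaml
# Illustrative data block for a Hugging Face-backed dataset in this fork;
# the key layout follows the diff above, values are assumptions.
data:
    train: "jparacrawl"
    dev: "wmt21"
    test: "kftt"
    dataset_type: "huggingface"
    dataset_cfg:
        name: "ja-en"           # dataset config name (language pair)
    sample_train_subset: -1     # -1 presumably means: use the full training set
    sample_dev_subset: 200
```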

configs/jparacrawl_jaen_sp.yaml

Lines changed: 8 additions & 8 deletions
@@ -2,9 +2,9 @@ name: "jparacrawl_jaen_sp"
 joeynmt_version: "2.0.0"

 data:
-    train: "../datasets/datasets/jparacrawl"
-    dev: "../datasets/datasets/wmt21"
-    test: "../datasets/datasets/kftt"
+    train: "jparacrawl"
+    dev: "wmt21"
+    test: "kftt"
     dataset_type: "huggingface"
     dataset_cfg:
         name: "ja-en"
@@ -18,10 +18,10 @@ data:
         level: "bpe"
         voc_limit: 32000
         voc_min_freq: 1
-        voc_file: "data/jparacrawl_v3/spm_ja.vocab"
+        voc_file: "subwords/jparacrawl_ja.vocab"
         tokenizer_type: "sentencepiece"
         tokenizer_cfg:
-            model_file: "data/jparacrawl_v3/spm_ja.model"
+            model_file: "subwords/jparacrawl_ja.model"
             model_type: "unigram"
             character_coverage: 1.0
             nbest_size: 10
@@ -34,10 +34,10 @@ data:
         level: "bpe"
         voc_limit: 32000
         voc_min_freq: 1
-        voc_file: "data/jparacrawl_v3/spm_en.vocab"
+        voc_file: "subwords/jparacrawl_en.vocab"
         tokenizer_type: "sentencepiece"
         tokenizer_cfg:
-            model_file: "data/jparacrawl_v3/spm_en.model"
+            model_file: "subwords/jparacrawl_en.model"
             model_type: "unigram"
             character_coverage: 1.0
             nbest_size: 10
@@ -60,7 +60,7 @@ testing:
     tokenize: "intl"

 training:
-    #load_model: "models/jparacrawl_enja/best.ckpt"
+    #load_model: "models/jparacrawl_jaen/best.ckpt"
     random_seed: 42
     optimizer: "adam"
     normalization: "tokens"
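Both sides here use SentencePiece unigram models with `nbest_size: 10`, i.e. subword regularization: during training, segmentations are sampled from the n-best list instead of always taking the single best split. A consolidated sketch of one source-side block (paths mirror the hunks above; `lang`, `max_length`, and `alpha` are illustrative assumptions):

```yaml
# Illustrative source-side block with SentencePiece subword sampling;
# paths follow the diff above, the other values are assumptions.
src:
    lang: "ja"
    max_length: 100
    level: "bpe"
    voc_limit: 32000
    voc_min_freq: 1
    voc_file: "subwords/jparacrawl_ja.vocab"
    tokenizer_type: "sentencepiece"
    tokenizer_cfg:
        model_file: "subwords/jparacrawl_ja.model"
        model_type: "unigram"
        character_coverage: 1.0
        nbest_size: 10          # sample among the 10 best segmentations
        alpha: 0.1              # sampling smoothing (training only)
```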

configs/rnn_small.yaml

Lines changed: 2 additions & 2 deletions
@@ -113,8 +113,8 @@ model: # specify your model architecture here
     initializer: "xavier_uniform"  # initializer for all trainable weights (xavier_uniform, xavier_normal, zeros, normal, uniform)
     init_weight: 0.01              # weight to initialize; for uniform, will use [-weight, weight]
     init_gain: 1.0                 # gain for Xavier initializer (default: 1.0)
-    bias_initializer: "zeros"      # initializer for bias terms (xavier, zeros, normal, uniform)
-    embed_initializer: "normal"    # initializer for embeddings (xavier, zeros, normal, uniform)
+    bias_initializer: "zeros"      # initializer for bias terms (xavier_uniform, xavier_normal, zeros, normal, uniform)
+    embed_initializer: "normal"    # initializer for embeddings (xavier_uniform, xavier_normal, zeros, normal, uniform)
     embed_init_weight: 0.1         # weight to initialize; for uniform, will use [-weight, weight]
     embed_init_gain: 1.0           # gain for Xavier initializer for embeddings (default: 1.0)
     init_rnn_orthogonal: False     # use orthogonal initialization for recurrent weights (default: False)

configs/transformer_reverse.yaml

Lines changed: 2 additions & 2 deletions
@@ -59,9 +59,9 @@ model:
     initializer: "xavier_uniform"        # initializer for all trainable weights (xavier_uniform, xavier_normal, zeros, normal, uniform)
     init_gain: 1.0                       # gain for Xavier initializer (default: 1.0)
     bias_initializer: "zeros"            # initializer for bias terms (xavier_uniform, xavier_normal, zeros, normal, uniform)
-    embed_initializer: "xavier_uniform"  # initializer for embeddings (xavier_uniform, xavier_normal, zeros, normal, uniform)
+    embed_initializer: "xavier_uniform"  # initializer for embeddings (xavier_uniform, xavier_normal, zeros, normal, uniform)
     embed_init_gain: 1.0                 # gain for Xavier initializer for embeddings (default: 1.0)
-    tied_embeddings: True                # tie src and trg embeddings, only applicable if vocabularies are the same, default: False
+    tied_embeddings: True                # tie src and trg embeddings, only applicable if vocabularies are the same, default: False
     tied_softmax: True
     encoder:
         type: "transformer"

configs/transformer_small.yaml

Lines changed: 4 additions & 0 deletions
@@ -114,7 +114,11 @@ model: # specify your model architecture here
     initializer: "xavier_uniform"        # initializer for all trainable weights (xavier_uniform, xavier_normal, zeros, normal, uniform)
     init_gain: 1.0                       # gain for Xavier initializer (default: 1.0)
     bias_initializer: "zeros"            # initializer for bias terms (xavier_uniform, xavier_normal, zeros, normal, uniform)
+<<<<<<< HEAD
     embed_initializer: "xavier_uniform"  # initializer for embeddings (xavier_uniform, xavier_normal, zeros, normal, uniform)
+=======
+    embed_initializer: "xavier_uniform"  # initializer for embeddings (xavier_uniform, xavier_normal, zeros, normal, uniform)
+>>>>>>> 4a132900d3ae55d5df9bae11196ae32a5014efd1
     embed_init_gain: 1.0                 # gain for Xavier initializer for embeddings (default: 1.0)
     tied_embeddings: False               # tie src and trg embeddings, only applicable if vocabularies are the same, default: False
     tied_softmax: True

configs/wmt17_ende_bpe.yaml

Lines changed: 4 additions & 4 deletions
@@ -17,11 +17,11 @@ data:
         normalize: True
         level: "bpe"
         voc_min_freq: 1
-        voc_file: "data/subwords/wmt17_bpe.vocab"
+        voc_file: "subwords/wmt17_bpe.vocab"
         tokenizer_type: "subword-nmt"
         tokenizer_cfg:
             num_merges: 32000
-            codes: "data/subwords/wmt17_bpe.codes"
+            codes: "subwords/wmt17_bpe.codes"
             dropout: 0.1
             pretokenizer: "moses"
     trg:
@@ -32,11 +32,11 @@ data:
         level: "bpe"
         voc_limit: 32000
         voc_min_freq: 1
-        voc_file: "data/subwords/wmt17_bpe.vocab"
+        voc_file: "subwords/wmt17_bpe.vocab"
         tokenizer_type: "subword-nmt"
         tokenizer_cfg:
             num_merges: 32000
-            codes: "data/subwords/wmt17_bpe.codes"
+            codes: "subwords/wmt17_bpe.codes"
             dropout: 0.1
             pretokenizer: "moses"
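Source and target share one subword-nmt BPE code and vocabulary file here, and `dropout: 0.1` enables BPE-dropout, which randomly drops merge operations at training time as a regularizer. A consolidated sketch of the source side after this change (paths mirror the hunks above; `lang` and `lowercase` are illustrative assumptions):

```yaml
# Illustrative source-side block with shared subword-nmt BPE;
# paths follow the diff above, the remaining values are assumptions.
src:
    lang: "en"
    lowercase: False
    normalize: True
    level: "bpe"
    voc_limit: 32000
    voc_min_freq: 1
    voc_file: "subwords/wmt17_bpe.vocab"    # shared by src and trg
    tokenizer_type: "subword-nmt"
    tokenizer_cfg:
        num_merges: 32000
        codes: "subwords/wmt17_bpe.codes"   # shared BPE merge codes
        dropout: 0.1                        # BPE-dropout, training only
        pretokenizer: "moses"
```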

configs/wmt17_ende_sp.yaml

Lines changed: 4 additions & 4 deletions
@@ -18,10 +18,10 @@ data:
         level: "bpe"
         voc_limit: 32000
         voc_min_freq: 1
-        voc_file: "data/spm/wmt17_sp.vocab"
+        voc_file: "subwords/wmt17_sp.vocab"
         tokenizer_type: "sentencepiece"
         tokenizer_cfg:
-            model_file: "data/spm/wmt17_sp.model"
+            model_file: "subwords/wmt17_sp.model"
             model_type: "unigram"
             character_coverage: 1.0
             nbest_size: 10
@@ -35,10 +35,10 @@ data:
         level: "bpe"
         voc_limit: 32000
         voc_min_freq: 1
-        voc_file: "data/spm/wmt17_sp.vocab"
+        voc_file: "subwords/wmt17_sp.vocab"
         tokenizer_type: "sentencepiece"
         tokenizer_cfg:
-            model_file: "data/spm/wmt17_sp.model"
+            model_file: "subwords/wmt17_sp.model"
             model_type: "unigram"
             character_coverage: 1.0
             nbest_size: 10
