
Commit 30cd283

extended to two weight options
1 parent 786695a commit 30cd283

2 files changed: +29, -11 lines changed

configs/testme.yaml

Lines changed: 4 additions & 3 deletions
@@ -63,15 +63,16 @@ model: # specify your model architecture here
             embedding_dim: 20    # size of embeddings
             scale: False         # scale the embeddings by sqrt of their size, default: False
             freeze: True         # if True, embeddings are not updated during training
-        hidden_size: 50          # size of RNN
+        hidden_size: 256         # size of RNN
         bidirectional: True      # use a bi-directional encoder, default: True
         dropout: 0.2             # apply dropout to the inputs to the RNN, default: 0.0
-        num_layers: 1            # stack this many layers of equal size, default: 1
+        num_layers: 3            # stack this many layers of equal size, default: 1
         freeze: False            # if True, encoder parameters are not updated during training (does not include embedding parameters)
         activation: "tanh"       # activation type for 2 layers following the src embeddings (only for speech), default: "relu", other options: "tanh"
         last_activation: "relu"  # non-linear activation after RNNs in speech encoder, default: "None", other options: "tanh", "relu"
-        layer_norm: False        # layer normalization layers for 2 CNNs and RNN layer, default: False
+        layer_norm: True         # layer normalization layers for 2 CNNs and RNN layer, default: False
         emb_norm: False          # layer normalization layers for embeddings, default: False
+        same_weights: True       # use same weights for linear layers, default: False
     decoder:
         rnn_type: "gru"
         embeddings:
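For reference, a minimal sketch of how the new same_weights key would be read from this config and handed on to the encoder. The file path matches this commit, but the model -> encoder nesting is an assumption based on the hunk header and surrounding keys, and this is plain PyYAML rather than JoeyNMT's own config loader; extra keys such as same_weights can reach the constructor through its **kwargs.

```python
# Minimal sketch (assumed config nesting), not JoeyNMT's loader.
import yaml  # PyYAML

with open("configs/testme.yaml", "r", encoding="utf-8") as f:  # path as in this commit
    cfg = yaml.safe_load(f)

enc_cfg = cfg["model"]["encoder"]                  # assumed nesting: model -> encoder
print(enc_cfg.get("same_weights", False))          # True with the values committed here
print(enc_cfg.get("hidden_size"), enc_cfg.get("num_layers"))  # 256 3
```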

joeynmt/encoders.py

Lines changed: 25 additions & 8 deletions
@@ -160,6 +160,7 @@ def __init__(self,
                  last_activation: str = "None",
                  layer_norm: bool = False,
                  emb_norm: bool = False,
+                 same_weights: bool = False,
                  **kwargs) -> None:
         """
         Create a new recurrent encoder.
@@ -181,8 +182,10 @@ def __init__(self,
         self.emb_size = emb_size
         self.lila1 = nn.Linear(emb_size, hidden_size)
         self.lila2 = nn.Linear(hidden_size, hidden_size)
-        self.lila3 = nn.Linear(hidden_size, hidden_size)
-        self.lila4 = nn.Linear(hidden_size, hidden_size)
+        self.same_weights = same_weights
+        if not self.same_weights:
+            self.lila3 = nn.Linear(hidden_size, hidden_size)
+            self.lila4 = nn.Linear(hidden_size, hidden_size)
         self.activation = activation
         self.last_activation = last_activation
         self.conv1 = nn.Sequential(
@@ -201,6 +204,7 @@ def __init__(self,
         self.norm_out = nn.LayerNorm(2 * hidden_size if bidirectional else hidden_size)
         if self.emb_norm:
             self.norm_emb = nn.LayerNorm(emb_size)
+        self.same_weights = same_weights

         rnn = nn.GRU if rnn_type == "gru" else nn.LSTM

@@ -273,10 +277,17 @@ def forward(self, embed_src: Tensor, src_length: Tensor, mask: Tensor, \
         if self.layer_norm:
             conv_out1 = self.norm1(conv_out1)

-        if self.activation == "tanh":
-            lila_out3 = torch.tanh(self.lila3(conv_out1))
+        if not self.same_weights:
+            if self.activation == "tanh":
+                lila_out3 = torch.tanh(self.lila3(conv_out1))
+            else:
+                lila_out3 = torch.relu(self.lila3(conv_out1))
         else:
-            lila_out3 = torch.relu(self.lila3(conv_out1))
+            if self.activation == "tanh":
+                lila_out3 = torch.tanh(self.lila2(conv_out1))
+            else:
+                lila_out3 = torch.relu(self.lila2(conv_out1))
+
         lila_out3 = lila_out3.transpose(1,2)

         conv_out2 = self.conv2(lila_out3)
@@ -286,10 +297,16 @@ def forward(self, embed_src: Tensor, src_length: Tensor, mask: Tensor, \
         if self.layer_norm:
             conv_out2 = self.norm2(conv_out2)

-        if self.activation == "tanh":
-            lila_out4 = torch.tanh(self.lila4(conv_out2))
+        if not self.same_weights:
+            if self.activation == "tanh":
+                lila_out4 = torch.tanh(self.lila4(conv_out1))
+            else:
+                lila_out4 = torch.relu(self.lila4(conv_out1))
         else:
-            lila_out4 = torch.relu(self.lila4(conv_out2))
+            if self.activation == "tanh":
+                lila_out4 = torch.tanh(self.lila2(conv_out1))
+            else:
+                lila_out4 = torch.relu(self.lila2(conv_out1))

         # apply dropout to the rnn input
         lila_do = self.rnn_input_dropout(lila_out4)
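Condensed, the branching above amounts to the two weight options named in the commit message: dedicated lila3/lila4 projections when same_weights is False, or reusing lila2 for both projections when it is True. The standalone sketch below illustrates only that choice; it is not the encoder class changed here, the class and helper names other than lila2/lila3/lila4 and same_weights are made up, and it simply chains the two projections for brevity.

```python
import torch
import torch.nn as nn


class WeightOptionDemo(nn.Module):
    """Toy module showing the two weight options: separate layers vs. shared lila2."""

    def __init__(self, hidden_size: int = 256, same_weights: bool = False,
                 activation: str = "tanh"):
        super().__init__()
        self.same_weights = same_weights
        self.activation = activation
        self.lila2 = nn.Linear(hidden_size, hidden_size)
        if not self.same_weights:
            # extra projections, only allocated when weights are not shared
            self.lila3 = nn.Linear(hidden_size, hidden_size)
            self.lila4 = nn.Linear(hidden_size, hidden_size)

    def _project(self, layer: nn.Linear, x: torch.Tensor) -> torch.Tensor:
        out = layer(x)
        return torch.tanh(out) if self.activation == "tanh" else torch.relu(out)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # pick lila3/lila4 or fall back to lila2, mirroring the committed branches
        out3 = self._project(self.lila2 if self.same_weights else self.lila3, x)
        out4 = self._project(self.lila2 if self.same_weights else self.lila4, out3)
        return out4


x = torch.randn(2, 10, 256)
print(WeightOptionDemo(same_weights=True)(x).shape)   # torch.Size([2, 10, 256])
print(WeightOptionDemo(same_weights=False)(x).shape)  # torch.Size([2, 10, 256])
```

With same_weights=True the module holds a single hidden-to-hidden weight matrix instead of three, which is the parameter saving the flag trades against having independently trained projections.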
