@@ -756,6 +756,7 @@ def __init__(
         softcap_value = 50.,
         use_flex_attn = False,
         gate_values = True,
+        laser = False,
         learned_value_residual_mix = False
     ):
         super().__init__()
@@ -783,6 +784,8 @@ def __init__(
 
         self.softcap_value = softcap_value
 
+        self.laser = laser
+
         self.dropout = nn.Dropout(dropout)
 
         self.to_out = nn.Sequential(
@@ -844,6 +847,12 @@ def forward(
         if exists(rotary_emb):
             q, k = tuple(apply_rotary_emb(rotary_emb, t, freqs_seq_dim = -2) for t in (q, k))
 
+        # laser attention
+
+        if self.laser:
+            v_max = v.amax(dim = -2, keepdim = True).detach()
+            v = (v - v_max).exp()
+
         # whether to use flex attention or not
 
         if should_use_flex_attn:
@@ -878,6 +887,11 @@ def forward(
 
         out = einsum(attn, v, 'b h i j, b h j d -> b h i d')
 
+        # laser attention
+
+        if self.laser:
+            out = log(out) + v_max
+
         # maybe gate values
 
         if exists(self.to_gates):
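
The two added blocks above together implement the laser trick: the values are exponentiated before the attention-weighted sum, and the result is mapped back through a log, so the output is a log of an attention-weighted sum of exp(values) rather than a plain weighted sum. Subtracting the per-feature max of the values before the exp and adding it back after the log only guards against overflow and does not change the result. A minimal standalone sketch of the same computation, assuming plain scaled-dot-product softmax attention and torch (the function and variable names below are illustrative, not from this file):

import torch

def laser_attention(q, k, v):
    # attention weights as usual
    scale = q.shape[-1] ** -0.5
    attn = (q @ k.transpose(-2, -1) * scale).softmax(dim = -1)

    # laser: weighted sum in exp-value space, stabilized by the per-feature max over the sequence
    v_max = v.amax(dim = -2, keepdim = True).detach()
    out = attn @ (v - v_max).exp()
    return out.log() + v_max

# equivalent to log(attn @ v.exp()), just without the overflow risk
q, k, v = (torch.randn(1, 4, 8, 16) for _ in range(3))
attn = (q @ k.transpose(-2, -1) * 16 ** -0.5).softmax(dim = -1)
assert torch.allclose(laser_attention(q, k, v), (attn @ v.exp()).log(), atol = 1e-5)

Detaching v_max is purely an optimization: since the stabilizer cancels exactly in the forward pass, it contributes no gradient signal of its own.
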
@@ -908,6 +922,7 @@ def __init__(
         ff_expansion_factor = 4,
         attn_kwargs: dict = dict(),
         ff_kwargs: dict = dict(),
+        attn_laser = False,
         unet_skips = True,
         use_flex_attn = False
     ):
@@ -932,7 +947,7 @@ def __init__(
 
             skip_proj = Linear(dim * 2, dim, bias = False) if is_latter_half and unet_skips else None
 
-            attn = Attention(dim = dim, dim_head = dim_head, heads = heads, dropout = dropout, use_flex_attn = use_flex_attn, learned_value_residual_mix = not is_first, **attn_kwargs)
+            attn = Attention(dim = dim, dim_head = dim_head, heads = heads, dropout = dropout, use_flex_attn = use_flex_attn, learned_value_residual_mix = not is_first, laser = attn_laser, **attn_kwargs)
 
             ff = FeedForward(dim = dim, expansion_factor = ff_expansion_factor, **ff_kwargs)
 
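
Usage-wise, the new flag threads through from the wrapper shown in the last two hunks down to every Attention layer it constructs. A hypothetical example (the wrapper class name and the dim/depth arguments below are placeholders for whatever this file actually exposes; only attn_laser and laser come from this change):

model = TransformerWrapper(    # placeholder name for the class whose __init__ is shown above
    dim = 512,
    depth = 8,
    attn_laser = True          # every Attention layer is built with laser = True
)

# or directly on a single attention layer
attn = Attention(dim = 512, dim_head = 64, heads = 8, laser = True)
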