
Commit bc02c45

[TK] Add test for fused attention and ops needed (#480)
This patch adds a test for f16 attention. The fused_attention kernel needed dtype conversion, so that is also added. As a side effect, implicit vector conversion (for unary ops) is now also supported.
1 parent d21bb9a · commit bc02c45

File tree: 6 files changed, +194 -39 lines
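The new primitive is a dtype cast: in the test added below, the f32 accumulator block is narrowed to f16 with tkl.to_dtype(qkT, tkl.f16) before the second dot. As a rough analogy for the semantics only (not the kernel API), a plain PyTorch sketch of the narrow/widen casts that the builder lowers to arith.truncf and arith.extf:

import torch

# Analogy only: tkl.to_dtype changes the element type of a value, much like a
# torch dtype cast. Narrowing corresponds to arith.truncf, widening to
# arith.extf in the builder changes below.
acc_f32 = torch.randn(256, 128, dtype=torch.float32)
acc_f16 = acc_f32.to(torch.float16)   # narrow: f32 -> f16
acc_back = acc_f16.to(torch.float32)  # widen:  f16 -> f32
print(acc_f16.dtype, acc_back.dtype)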

core/shark_turbine/kernel/_support/tracing.py

Lines changed: 8 additions & 1 deletion
@@ -203,6 +203,14 @@ def handle_thread_program_id(self, op, axis: int) -> Index:
         )
         return proxy
 
+    def handle_to_dtype(self, op, val, dtype):
+        return self.region_graph.create_proxy(
+            "call_function",
+            op,
+            args=(val, dtype),
+            kwargs={},
+        )
+
     def handle_kernel_buffer_getitem(self, op, kernel_buffer: KernelBuffer, key):
         return self.region_graph.create_proxy(
             "call_function",
@@ -349,7 +357,6 @@ def handle_vector_broadcast_in_dim(self, op, vector, shape, broadcast_dimensions
             i for i in range(len(shape)) if i not in broadcast_dimensions
         )
         permutation = permutation + tuple(broadcast_dimensions)
-        print(permutation)
 
         # Transpose
         return self.region_graph.create_proxy(
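Note that the new tracing handler does not execute the cast; it only records the op as a call_function node in the region graph, mirroring how torch.fx builds graphs. A minimal standalone torch.fx sketch of that recording pattern (to_dtype_stub and this hand-built graph are illustrative stand-ins, not the kernel's RegionGraph API):

import torch
import torch.fx as fx


def to_dtype_stub(val, dtype):
    # Hypothetical stand-in for the traced tk op object; never executed here.
    raise NotImplementedError


# Record the op as a call_function node with its args, roughly the shape of
# what handle_to_dtype does via region_graph.create_proxy("call_function", ...).
graph = fx.Graph()
x = graph.placeholder("x")
casted = graph.call_function(to_dtype_stub, args=(x, torch.float16))
graph.output(casted)
print(graph)  # prints the placeholder, the recorded call_function node, and the output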

core/shark_turbine/kernel/compiler/builder.py

Lines changed: 68 additions & 23 deletions
@@ -32,12 +32,12 @@
     F64Type,
 )
 
-# TODO: Have a way upstream to check if a floating point type.
-FLOAT_TYPES_ASM = {
-    "bf16",
-    "f16",
-    "f32",
-    "f64",
+# TODO: Use FloatType from upstream when available.
+FLOAT_BITWIDTHS = {
+    "bf16": 16,
+    "f16": 16,
+    "f32": 32,
+    "f64": 64,
     # TODO: FP8 types.
 }
 
@@ -87,28 +87,54 @@ def __init__(
 
 class _ScalarBuilder:
     def is_floating_point_type(self, t: IrType) -> bool:
-        return str(t) in FLOAT_TYPES_ASM
+        # TODO: Use FloatType from upstream when available.
+        return str(t) in FLOAT_BITWIDTHS
 
     def is_integer_type(self, t: IrType) -> bool:
         return IntegerType.isinstance(t)
 
     def is_index_type(self, t: IrType) -> bool:
         return IndexType.isinstance(t)
 
-    def promote(self, value: Value, to_type: IrType) -> Value:
-        value_type = value.type
+    def get_typeclass(self, t: IrType, index_same_as_integer=False) -> str:
+        # If this is a vector type, get the element type.
+        if isinstance(t, VectorType):
+            t = t.element_type
+        if self.is_floating_point_type(t):
+            return "float"
+        if self.is_integer_type(t):
+            return "integer"
+        if self.is_index_type(t):
+            return "integer" if index_same_as_integer else "index"
+        raise CodegenError(f"Unknown typeclass for type `{t}`")
+
+    def get_float_bitwidth(self, t: IrType) -> int:
+        # If this is a vector type, get the element type.
+        if isinstance(t, VectorType):
+            t = t.element_type
+        return FLOAT_BITWIDTHS[str(t)]
+
+    def to_dtype(self, value: IRProxyValue, dtype: IrType) -> IRProxyValue:
+        value_type = value.ir_value.type
+        # Create a vector type for dtype if value is a vector.
+        to_type = dtype
+        if isinstance(value_type, VectorType):
+            to_type = VectorType.get(value_type.shape, dtype)
+
         # Short-circuit if already the right type.
         if value_type == to_type:
             return value
 
-        attr_name = f"promote_{value_type}_to_{to_type}"
+        value_typeclass = self.get_typeclass(value_type)
+        to_typeclass = self.get_typeclass(dtype)
+        attr_name = f"to_dtype_{value_typeclass}_to_{to_typeclass}"
         try:
             handler = getattr(self, attr_name)
         except AttributeError:
             raise CodegenError(
                 f"No implemented path to implicitly promote scalar `{value_type}` to `{to_type}` (tried '{attr_name}')"
             )
-        return handler(value, to_type)
+        return IRProxyValue(handler(value.ir_value, to_type))
 
     def constant_attr(self, val: int | float, element_type: IrType) -> Attribute:
         if self.is_integer_type(element_type) or self.is_index_type(element_type):
@@ -153,7 +179,7 @@ def binary_arithmetic(
             f"Cannot perform binary arithmetic operation '{op}' between {lhs_ir_type} and {rhs_ir_type} due to element type mismatch"
         )
 
-        typeclass = "float" if self.is_floating_point_type(lhs_ir_type) else "integer"
+        typeclass = self.get_typeclass(lhs_ir_type, True)
         attr_name = f"binary_{op}_{typeclass}"
         try:
             handler = getattr(self, attr_name)
@@ -176,9 +202,7 @@ def binary_vector_arithmetic(
             f"Cannot perform binary arithmetic operation '{op}' between {lhs_ir.type} and {rhs_ir.type} due to element type mismatch"
        )
 
-        typeclass = (
-            "float" if self.is_floating_point_type(lhs_element_type) else "integer"
-        )
+        typeclass = self.get_typeclass(lhs_element_type, True)
         attr_name = f"binary_{op}_{typeclass}"
         try:
             handler = getattr(self, attr_name)
@@ -190,7 +214,7 @@ def binary_vector_arithmetic(
 
     def unary_arithmetic(self, op: str, val: IRProxyValue) -> IRProxyValue:
         val_ir_type = val.ir_value.type
-        typeclass = "float" if self.is_floating_point_type(val_ir_type) else "integer"
+        typeclass = self.get_typeclass(val_ir_type, True)
         attr_name = f"unary_{op}_{typeclass}"
         try:
             handler = getattr(self, attr_name)
@@ -203,9 +227,7 @@ def unary_arithmetic(self, op: str, val: IRProxyValue) -> IRProxyValue:
     def unary_vector_arithmetic(self, op: str, val: IRProxyValue) -> IRProxyValue:
         val_ir = val.ir_value
         val_element_type = VectorType(val_ir.type).element_type
-        typeclass = (
-            "float" if self.is_floating_point_type(val_element_type) else "integer"
-        )
+        typeclass = self.get_typeclass(val_element_type, True)
         attr_name = f"unary_{op}_{typeclass}"
         try:
             handler = getattr(self, attr_name)
@@ -217,10 +239,33 @@ def unary_vector_arithmetic(self, op: str, val: IRProxyValue) -> IRProxyValue:
 
     ### Specializations
 
-    def promote_index_to_f32(self, value: Value, to_type: IrType) -> Value:
-        i32_type = IntegerType.get_signless(32)
-        i32 = arith_d.index_cast(i32_type, value)
-        return arith_d.sitofp(to_type, i32)
+    # Casting
+    def to_dtype_index_to_integer(self, value: Value, to_type: IrType) -> Value:
+        return arith_d.index_cast(to_type, value)
+
+    def to_dtype_index_to_float(self, value: Value, to_type: IrType) -> Value:
+        # Cast index to integer, and then ask for a integer to float cast.
+        # TODO: I don't really know how to query the machine bitwidth here,
+        # so using 64.
+        casted_to_int = arith_d.index_cast(IntegerType.get_signless(64), value)
+        return self.to_dtype(IRProxyValue(casted_to_int), to_type).ir_value
+
+    def to_dtype_integer_to_float(self, value: Value, to_type: IrType) -> Value:
+        # sitofp
+        casted_to_float = arith_d.sitofp(to_type, value)
+        return self.to_dtype(IRProxyValue(casted_to_float), to_type).ir_value
+
+    def to_dtype_float_to_float(self, value: Value, to_type: IrType) -> Value:
+        # Check bitwidth to determine if we need to extend or narrow
+        from_type = value.type
+        from_bitwidth = self.get_float_bitwidth(from_type)
+        to_bitwidth = self.get_float_bitwidth(to_type)
+        if from_bitwidth < to_bitwidth:
+            return arith_d.extf(to_type, value)
+        elif from_bitwidth > to_bitwidth:
+            return arith_d.truncf(to_type, value)
+        else:
+            raise CodegenError(f"NYI: Cast from {from_type} to {to_type}")
 
     # Binary integer/integer arithmetic.
     def binary_add_integer(self, lhs: IRProxyValue, rhs: IRProxyValue) -> IRProxyValue:
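The builder resolves these casts the same way it already resolves arithmetic: it derives a typeclass string for source and destination, then looks up a to_dtype_<from>_to_<to> method by name with getattr. A simplified, MLIR-free sketch of that dispatch (CastDispatcher and its handlers are illustrative stand-ins, not the real _ScalarBuilder):

FLOAT_BITWIDTHS = {"bf16": 16, "f16": 16, "f32": 32, "f64": 64}


class CastDispatcher:
    # Toy model of the getattr-based cast dispatch used by _ScalarBuilder.

    def to_dtype(self, value, from_tc, to_tc, from_ty, to_ty):
        attr_name = f"to_dtype_{from_tc}_to_{to_tc}"
        try:
            handler = getattr(self, attr_name)
        except AttributeError:
            raise NotImplementedError(f"no cast path (tried '{attr_name}')")
        return handler(value, from_ty, to_ty)

    def to_dtype_integer_to_float(self, value, from_ty, to_ty):
        return float(value)  # stands in for arith.sitofp

    def to_dtype_float_to_float(self, value, from_ty, to_ty):
        # Mirrors the bitwidth comparison above: widen (extf) or narrow (truncf).
        if FLOAT_BITWIDTHS[from_ty] < FLOAT_BITWIDTHS[to_ty]:
            return ("extf", value)
        if FLOAT_BITWIDTHS[from_ty] > FLOAT_BITWIDTHS[to_ty]:
            return ("truncf", value)
        raise NotImplementedError(f"NYI: cast {from_ty} -> {to_ty}")


d = CastDispatcher()
print(d.to_dtype(3, "integer", "float", "i32", "f32"))  # 3.0, via the sitofp path
print(d.to_dtype(1.5, "float", "float", "f32", "f16"))  # ('truncf', 1.5)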

core/shark_turbine/kernel/compiler/vector_codegen.py

Lines changed: 18 additions & 11 deletions
@@ -358,6 +358,18 @@ def _(emitter: ThreadEmitter, node: fx.Node):
     )
 
 
+@handle_op(tkl.to_dtype)
+def _(emitter: ThreadEmitter, node: fx.Node):
+    try:
+        (val, dtype) = node.args
+    except ValueError as e:
+        raise ValidationError("Malformed arguments") from e
+
+    ir_type = cast_dtype(emitter, dtype)
+    casted = cast_vector(emitter, val, element_type=ir_type)
+    emitter.bind_node_proxy(node, IRProxyValue(casted))
+
+
 @handle_op(ops.kernel_buffer_getitem)
 def _(emitter: ThreadEmitter, node: fx.Node):
     try:
@@ -828,24 +840,19 @@ def cast_vector(
     emitter: ThreadEmitter, value, *, element_type: Optional[IrType] = None
 ):
     proxy_value = cast_py_value(emitter, value)
-    value = proxy_value.ir_value
 
-    # Promote scalar types correctly first.
-    if element_type and not ShapedType.isinstance(value.type):
+    # Cast scalar types correctly first.
+    if element_type and not ShapedType.isinstance(proxy_value.ir_value.type):
         # Implicit scalar type promotion.
-        value = ScalarBuilder.promote(value, element_type)
+        proxy_value = ScalarBuilder.to_dtype(proxy_value, element_type)
+
+    value = proxy_value.ir_value
 
     # After scalar promotion, promote to vector.
     if VectorType.isinstance(value.type):
         # Already a vector. Coerce or return.
         if element_type is not None:
-            vector_type = VectorType(value.type)
-            if vector_type.element_type == element_type:
-                return value
-            # TODO: Implement vector element type conversion.
-            raise CodegenError(
-                f"Implicit conversion of vector element types not supported (`{vector_type.element_type}` -> `{element_type}`)"
-            )
+            value = ScalarBuilder.to_dtype(proxy_value, element_type).ir_value
         # No target element_type.
         return value
     else:

core/shark_turbine/kernel/lang/prims.py

Lines changed: 2 additions & 0 deletions
@@ -21,6 +21,7 @@
     "broadcast",
     "broadcast_in_dim",
     "transpose",
+    "to_dtype",
 ]
 
 
@@ -31,6 +32,7 @@ def is_debug() -> bool:
 
 # Core language operations
 program_id = ops.thread_program_id
+to_dtype = ops.to_dtype
 
 # Math Operations
 exp2 = ops.exp2

core/shark_turbine/kernel/ops/core.py

Lines changed: 9 additions & 4 deletions
@@ -1,17 +1,17 @@
-from typing import Any
+from typing import Any, TypeVar
 import typing
 
 if typing.TYPE_CHECKING:
     from ..lang.types import Index, Vector
 
-from .base import (
-    define_op,
-)
+from .base import define_op
+from .._support.dtype import DataType
 
 __all__ = [
     "kernel_buffer_getitem",
     "kernel_buffer_setitem",
     "thread_program_id",
+    "to_dtype",
 ]
 
 
@@ -28,3 +28,8 @@ def kernel_buffer_setitem(kernel_buffer, key, item) -> None:
 @define_op
 def thread_program_id(axis: int) -> "Index":
     ...
+
+
+@define_op
+def to_dtype(val, dtype: DataType):
+    ...

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+import logging
+import unittest
+
+import torch
+import shark_turbine.kernel as tk
+import shark_turbine.kernel.lang as tkl
+
+BATCH = tkl.sym.BATCH
+N_HEADS = tkl.sym.N_HEADS
+N_CTX = tkl.sym.N_CTX
+D_HEAD = tkl.sym.D_HEAD
+
+BLOCK_N = tkl.sym.BLOCK_N
+BLOCK_M = tkl.sym.BLOCK_M
+
+
+class Test(unittest.TestCase):
+    def testFusedAttention(self):
+        @tk.gen.thread(N_CTX // BLOCK_M, BATCH * N_HEADS)
+        def fused_attention(
+            Q: tkl.InputBuffer[BATCH, N_HEADS, N_CTX, D_HEAD].of(tkl.f16),
+            K: tkl.InputBuffer[BATCH, N_HEADS, N_CTX, D_HEAD].of(tkl.f16),
+            V: tkl.InputBuffer[BATCH, N_HEADS, N_CTX, D_HEAD].of(tkl.f16),
+            O: tkl.OutputBuffer[BATCH, N_HEADS, N_CTX, D_HEAD].of(tkl.f16),
+        ):
+            grid_n = tkl.program_id(0)
+            grid_m = tkl.program_id(1)
+
+            batch = grid_m // N_HEADS
+            head = grid_m % N_HEADS
+
+            q = tkl.load(Q, (batch, head, grid_n * BLOCK_M, 0), (BLOCK_M, D_HEAD))
+            acc_init = tkl.constant((BLOCK_M, D_HEAD), tkl.f32, 0.0)
+            max_stat_init = tkl.constant((BLOCK_M,), tkl.f32, -1e9)
+            sum_stat_init = tkl.constant((BLOCK_M,), tkl.f32, 0.0)
+
+            @tkl.for_loop(
+                0, N_CTX, BLOCK_N, init_args=[max_stat_init, sum_stat_init, acc_init]
+            )
+            def body(i, old_max, old_sum, old_acc):
+                k = tkl.load(K, (batch, head, i, 0), (BLOCK_N, D_HEAD))
+                kT = tkl.transpose(k, (1, 0))
+
+                qkT = tkl.constant((BLOCK_M, BLOCK_N), tkl.f32, 0.0)
+                qkT = tkl.dot(q, kT, qkT)
+
+                new_max = tkl.max(qkT, axis=1, acc=old_max)
+                broadcasted_max = tkl.broadcast_in_dim(
+                    new_max, (BLOCK_M, BLOCK_N), (0,)
+                )
+                partial_softmax = tkl.exp2(qkT - broadcasted_max)
+                scale_factor = tkl.exp2(old_max - new_max)
+                scaled_old_sum = scale_factor * old_sum
+                new_sum = tkl.sum(partial_softmax, axis=1, acc=scaled_old_sum)
+                broadcasted_scale_factor = tkl.broadcast_in_dim(
+                    scale_factor, (BLOCK_M, D_HEAD), (0,)
+                )
+                new_acc = old_acc * broadcasted_scale_factor
+
+                v = tkl.load(V, (batch, head, i, 0), (BLOCK_N, D_HEAD))
+                qkT16 = tkl.to_dtype(qkT, tkl.f16)
+                new_acc = tkl.dot(qkT16, v, new_acc)
+
+                return (new_max, new_sum, new_acc)
+
+            sum_stat = body[1]
+            result = body[2]
+            one = tkl.constant((BLOCK_M,), tkl.f32, 1.0)
+            one_by_sum = one / sum_stat
+            result = tkl.broadcast_in_dim(one_by_sum, (BLOCK_M, D_HEAD), (0,)) * result
+            tkl.store(O, (batch, head, grid_n * BLOCK_M, 0), result)
+
+        Q = torch.randn(4, 48, 1024, 64)
+        K = torch.randn(4, 48, 1024, 64)
+        V = torch.randn(4, 48, 1024, 64)
+        O = torch.randn(4, 48, 1024, 64)
+
+        with tk.gen.TestLaunchContext(
+            {
+                BLOCK_N: 128,
+                BLOCK_M: 256,
+            }
+        ):
+            fused_attention(Q, K, V, O)
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+    unittest.main()
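The loop body in this test is the streaming (online) softmax recurrence: it carries a running row max, a running sum of exponentials, and an accumulator that is rescaled whenever the max grows. A small PyTorch reference for one batch/head slice (my own sketch, not part of the commit), checked against a dense softmax; the kernel uses exp2, which matches a softmax of qkT scaled by ln 2:

import math

import torch


def streaming_attention(q, k, v, block_n=128):
    # Running statistics per query row: max, sum of exponentials, accumulator.
    m = torch.full((q.shape[0],), -1e9)
    s = torch.zeros(q.shape[0])
    acc = torch.zeros(q.shape[0], v.shape[1])
    for i in range(0, k.shape[0], block_n):
        kb, vb = k[i : i + block_n], v[i : i + block_n]
        qk = q @ kb.T
        new_m = torch.maximum(m, qk.max(dim=1).values)
        p = torch.exp2(qk - new_m[:, None])  # partial softmax numerator
        scale = torch.exp2(m - new_m)        # rescales the old statistics
        s = scale * s + p.sum(dim=1)
        acc = acc * scale[:, None] + p @ vb
        m = new_m
    return acc / s[:, None]


q = torch.randn(256, 64)
k = torch.randn(1024, 64)
v = torch.randn(1024, 64)
ref = torch.softmax((q @ k.T) * math.log(2.0), dim=-1) @ v
assert torch.allclose(streaming_attention(q, k, v), ref, atol=1e-3)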
