diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0827a15..45b951b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,28 +24,28 @@ concurrency:
 
 jobs:
   build:
-    name: Build and Test
+    name: Test
     strategy:
       matrix:
-        os: [ubuntu-latest]
+        os: [ubuntu-22.04]
         scala: [3, 2.13]
         java: [temurin@21]
         project: [rootNative]
     runs-on: ${{ matrix.os }}
     timeout-minutes: 60
     steps:
-      - name: Install sbt
-        uses: sbt/setup-sbt@v1
-
       - name: Checkout current branch (full)
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           fetch-depth: 0
 
+      - name: Setup sbt
+        uses: sbt/setup-sbt@v1
+
       - name: Setup Java (temurin@21)
         id: setup-java-temurin-21
         if: matrix.java == 'temurin@21'
-        uses: actions/setup-java@v4
+        uses: actions/setup-java@v5
         with:
           distribution: temurin
           java-version: 21
@@ -60,13 +60,13 @@ jobs:
 
       - name: Install brew formulae (ubuntu)
         if: startsWith(matrix.os, 'ubuntu')
-        run: /home/linuxbrew/.linuxbrew/bin/brew install mlpack openblas
+        run: /home/linuxbrew/.linuxbrew/bin/brew install cereal libsvm mlpack openblas
 
       - name: Check that workflows are up to date
         run: sbt githubWorkflowCheck
 
       - name: Check headers and formatting
-        if: matrix.java == 'temurin@21' && matrix.os == 'ubuntu-latest'
+        if: matrix.java == 'temurin@21' && matrix.os == 'ubuntu-22.04'
         run: sbt 'project ${{ matrix.project }}' '++ ${{ matrix.scala }}' headerCheckAll scalafmtCheckAll 'project /' scalafmtSbtCheck
 
       - name: nativeLink
@@ -77,11 +77,11 @@ jobs:
         run: sbt 'project ${{ matrix.project }}' '++ ${{ matrix.scala }}' test
 
       - name: Check binary compatibility
-        if: matrix.java == 'temurin@21' && matrix.os == 'ubuntu-latest'
+        if: matrix.java == 'temurin@21' && matrix.os == 'ubuntu-22.04'
         run: sbt 'project ${{ matrix.project }}' '++ ${{ matrix.scala }}' mimaReportBinaryIssues
 
       - name: Generate API documentation
-        if: matrix.java == 'temurin@21' && matrix.os == 'ubuntu-latest'
+        if: matrix.java == 'temurin@21' && matrix.os == 'ubuntu-22.04'
         run: sbt 'project ${{ matrix.project }}' '++ ${{ matrix.scala }}' doc
 
       - name: Make target directories
@@ -94,7 +94,7 @@ jobs:
 
       - name: Upload target directories
         if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main')
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v5
         with:
           name: target-${{ matrix.os }}-${{ matrix.java }}-${{ matrix.scala }}-${{ matrix.project }}
           path: targets.tar
@@ -105,22 +105,22 @@ jobs:
     if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main')
     strategy:
       matrix:
-        os: [ubuntu-latest]
+        os: [ubuntu-22.04]
         java: [temurin@21]
     runs-on: ${{ matrix.os }}
     steps:
-      - name: Install sbt
-        uses: sbt/setup-sbt@v1
-
       - name: Checkout current branch (full)
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           fetch-depth: 0
 
+      - name: Setup sbt
+        uses: sbt/setup-sbt@v1
+
       - name: Setup Java (temurin@21)
         id: setup-java-temurin-21
         if: matrix.java == 'temurin@21'
-        uses: actions/setup-java@v4
+        uses: actions/setup-java@v5
         with:
           distribution: temurin
           java-version: 21
@@ -131,7 +131,7 @@ jobs:
         run: sbt +update
 
       - name: Download target directories (3, rootNative)
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v6
         with:
           name: target-${{ matrix.os }}-${{ matrix.java }}-3-rootNative
 
@@ -141,7 +141,7 @@ jobs:
           rm targets.tar
 
       - name: Download target directories (2.13, rootNative)
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v6
         with:
           name: target-${{ matrix.os }}-${{ matrix.java }}-2.13-rootNative
 
@@ -179,22 +179,22 @@ jobs:
     if: github.event.repository.fork == false && github.event_name != 'pull_request'
     strategy:
       matrix:
-        os: [ubuntu-latest]
+        os: [ubuntu-22.04]
         java: [temurin@21]
     runs-on: ${{ matrix.os }}
     steps:
-      - name: Install sbt
-        uses: sbt/setup-sbt@v1
-
       - name: Checkout current branch (full)
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           fetch-depth: 0
 
+      - name: Setup sbt
+        uses: sbt/setup-sbt@v1
+
       - name: Setup Java (temurin@21)
         id: setup-java-temurin-21
         if: matrix.java == 'temurin@21'
-        uses: actions/setup-java@v4
+        uses: actions/setup-java@v5
         with:
           distribution: temurin
           java-version: 21
diff --git a/build.sbt b/build.sbt
index 488b333..45a8dd1 100644
--- a/build.sbt
+++ b/build.sbt
@@ -8,10 +8,8 @@ ThisBuild / developers ++= List(
   tlGitHubDev("valencik", "Andrew Valencik"),
 )
 ThisBuild / startYear := Some(2023)
-ThisBuild / tlSonatypeUseLegacyHost := false
-
-ThisBuild / crossScalaVersions := Seq("3.3.4", "2.13.16")
 
+ThisBuild / crossScalaVersions := Seq("3.3.4", "2.13.18")
 ThisBuild / githubWorkflowJavaVersions := Seq(JavaSpec.temurin("21"))
 ThisBuild / githubWorkflowBuildPreamble +=
   WorkflowStep.Run(List("/home/linuxbrew/.linuxbrew/bin/brew update"), name = Some("brew update"))
@@ -19,7 +17,7 @@ ThisBuild / githubWorkflowBuildPreamble ++= nativeBrewInstallWorkflowSteps.value
 
 ThisBuild / Test / testOptions += Tests.Argument("+l") // for munit logging
 
-val CatsEffectVersion = "3.7-8f2b497"
+val CatsEffectVersion = "3.7.0"
 val CatsVersion = "2.12.0"
 val MunitVersion = "1.0.4"
 lazy val root = tlCrossRootProject.aggregate(ir, onnx, runtime)
@@ -40,6 +38,7 @@ lazy val onnx = project
       "com.thesamet.scalapb" %% "scalapb-runtime" % scalapb.compiler.Version.scalapbVersion % "protobuf",
       "org.typelevel" %%% "cats-core" % CatsVersion,
       "org.scalameta" %%% "munit" % MunitVersion % Test,
+      "org.typelevel" %%% "cats-effect" % CatsEffectVersion,
     ),
     nativeConfig ~= { _.withEmbedResources(true) },
     Compile / PB.generate := (Compile / PB.generate).dependsOn(Compile / downloadOnnxProto).value,
@@ -69,9 +68,20 @@ lazy val runtime = project
   .settings(
     name := "vilcacora-runtime",
     libraryDependencies ++= Seq(
-      "org.typelevel" %%% "cats-effect-kernel" % CatsEffectVersion,
+      "org.typelevel" %%% "cats-effect" % CatsEffectVersion,
       "org.typelevel" %%% "cats-core" % CatsVersion,
       "org.scalameta" %%% "munit" % MunitVersion % Test,
+      "org.typelevel" %%% "keypool" % "0.5.0-RC1",
     ),
-    nativeBrewFormulas ++= Set("openblas", "mlpack"),
+    nativeBrewFormulas ++= Set("cereal", "openblas", "mlpack", "libsvm"),
+    nativeConfig ~= { c =>
+      c.withCompileOptions(
+        c.compileOptions ++ Seq(
+          "-fexceptions",
+          "-frtti",
+          "-Wno-inconsistent-missing-override",
+          "-DARMA_DONT_USE_WRAPPER",
+        ),
+      )
+    },
   )
diff --git a/ir/src/main/scala/com/armanbilge/vilcacora/ir/ModelIR.scala b/ir/src/main/scala/com/armanbilge/vilcacora/ir/ModelIR.scala
index 75555c4..1c7f1d9 100644
--- a/ir/src/main/scala/com/armanbilge/vilcacora/ir/ModelIR.scala
+++ b/ir/src/main/scala/com/armanbilge/vilcacora/ir/ModelIR.scala
@@ -85,6 +85,24 @@ object PostTransform {
   }
 }
 
+/** Representation of auto_pad attribute for Conv and Pool operations.
+  */
+sealed abstract class AutoPad
+object AutoPad {
+  case object NotSet extends AutoPad
+  case object SameUpper extends AutoPad
+  case object SameLower extends AutoPad
+  case object Valid extends AutoPad
+
+  def fromString(s: String): Either[String, AutoPad] = s.toUpperCase match {
+    case "NOTSET" => Right(NotSet)
+    case "SAME_UPPER" => Right(SameUpper)
+    case "SAME_LOWER" => Right(SameLower)
+    case "VALID" => Right(Valid)
+    case other => Left(s"Unsupported AutoPad: $other")
+  }
+}
+
 /** An ADT representing a single operation in the computation graph.
   */
 sealed abstract class Operation {
@@ -155,6 +173,95 @@ object Operation {
     override def inputs: List[String] = List(input)
     override def outputs: List[String] = List(output)
   }
+
+  /** Represents a Constant operation that produces a constant tensor.
+    */
+  final case class Constant(
+      output: String,
+      // --- Attributes ---
+      value: Array[Byte],
+      dataType: DataType,
+      shape: List[Int],
+  ) extends Operation {
+    override def inputs: List[String] = List.empty
+    override def outputs: List[String] = List(output)
+  }
+
+  /** Represents an element-wise division operation.
+    */
+  final case class Div(inputA: String, inputB: String, output: String) extends Operation {
+    override def inputs: List[String] = List(inputA, inputB)
+    override def outputs: List[String] = List(output)
+  }
+
+  /** Represents a Convolution operation.
+    */
+  final case class Conv(
+      input: String,
+      weight: String,
+      bias: Option[String],
+      output: String,
+      // --- Attributes ---
+      autoPad: AutoPad,
+      dilations: List[Int],
+      group: Int,
+      kernelShape: List[Int],
+      pads: List[Int],
+      strides: List[Int],
+  ) extends Operation {
+    override def inputs: List[String] = List(input, weight) ++ bias.toList
+    override def outputs: List[String] = List(output)
+  }
+
+  /** Represents a ReLU (Rectified Linear Unit) activation operation.
+    */
+  final case class Relu(input: String, output: String) extends Operation {
+    override def inputs: List[String] = List(input)
+    override def outputs: List[String] = List(output)
+  }
+
+  /** Represents a MaxPool operation.
+    */
+  final case class MaxPool(
+      input: String,
+      output: String,
+      // --- Attributes ---
+      autoPad: AutoPad,
+      ceilMode: Boolean,
+      dilations: List[Int],
+      kernelShape: List[Int],
+      pads: List[Int],
+      storageOrder: Int,
+      strides: List[Int],
+  ) extends Operation {
+    override def inputs: List[String] = List(input)
+    override def outputs: List[String] = List(output)
+  }
+
+  /** Represents a Reshape operation. Takes a shape tensor as input following ONNX specification.
+    */
+  final case class Reshape(
+      input: String,
+      shape: String, // Shape tensor input name
+      output: String,
+      // --- Attributes ---
+      allowzero: Boolean = false,
+  ) extends Operation {
+    override def inputs: List[String] = List(input, shape)
+    override def outputs: List[String] = List(output)
+  }
+
+  /** Represents a Softmax activation operation.
+    */
+  final case class Softmax(
+      input: String,
+      output: String,
+      // --- Attributes ---
+      axis: Int = -1, // Default axis is -1 (last dimension)
+  ) extends Operation {
+    override def inputs: List[String] = List(input)
+    override def outputs: List[String] = List(output)
+  }
   // Add more operations here...
 }
 
diff --git a/onnx/src/main/scala/vilcacora/onnx/ModelLoader.scala b/onnx/src/main/scala/vilcacora/onnx/ModelLoader.scala
new file mode 100644
index 0000000..cbfba1f
--- /dev/null
+++ b/onnx/src/main/scala/vilcacora/onnx/ModelLoader.scala
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2023 Arman Bilge
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package vilcacora.onnx
+
+import cats.effect.IO
+import java.io.InputStream
+import vilcacora.onnx.proto.ModelProto
+
+object ModelLoader {
+
+  /** Loads an ONNX model from classpath as a ModelProto. Use a forward slash for resource path,
+    * e.g., "/mnist.onnx".
+    */
+  def loadModelFromPath(modelPath: String): IO[ModelProto] =
+    IO.blocking {
+      val stream: InputStream = getClass.getResourceAsStream(modelPath)
+      if (stream == null)
+        throw new IllegalArgumentException(s"Resource not found: $modelPath")
+      try ModelProto.parseFrom(stream)
+
+      finally stream.close()
+    }
+}
diff --git a/onnx/src/main/scala/vilcacora/onnx/Translator.scala b/onnx/src/main/scala/vilcacora/onnx/Translator.scala
index a9fbcae..035b906 100644
--- a/onnx/src/main/scala/vilcacora/onnx/Translator.scala
+++ b/onnx/src/main/scala/vilcacora/onnx/Translator.scala
@@ -93,11 +93,69 @@ object Translator {
     // Create allocations for constant tensors, which include initial data.
     val initializerAllocations: Either[String, List[Allocation]] =
       graph.initializer.toList.traverse(createAllocationFromInitializer)
+    // This new section manually creates allocations for intermediate tensors
+    // that are not explicitly declared in the ONNX graph's value_info.
+
+    val manuallyCreatedAllocs = for {
+      valAllocs <- valueAllocations
+      initAllocs <- initializerAllocations
+      // Create a temporary map of all known allocations so far for lookups.
+      existingAllocs = (valAllocs ++ initAllocs).map(a => a.name -> a).toMap
+
+      // Iterate through all nodes to find any that need special handling.
+      newAllocs <- graph.node.toList.flatTraverse { node =>
+        node.opType match {
+          case "SVMClassifier" =>
+            for {
+              _ <- checkArity(node, 1, 2) // Ensure SVMClassifier has 2 outputs
+              scoresOutputName = node.output(1)
+              // Check if an allocation for the scores tensor already exists.
+              allocations <-
+                if (existingAllocs.contains(scoresOutputName)) {
+                  // If it exists, we don't need to do anything.
+                  Right(List.empty[Allocation])
+                } else {
+                  // If it doesn't exist, create it manually.
+                  for {
+                    // Get the input tensor's allocation to infer the batch size.
+                    inputAlloc <- existingAllocs
+                      .get(node.input.head)
+                      .toRight(
+                        s"SVM input '${node.input.head}' not found in allocations.",
+                      )
+                    batchSize <- inputAlloc.shape.headOption.toRight(
+                      s"Input '${inputAlloc.name}' for SVM has no dimensions.",
+                    )
+
+                    // Get the number of classes from the node's attributes.
+                    attributes = new OnnxAttributeHelper(node)
+                    classLabels <- attributes.getInts("classlabels_ints")
+                    numClasses = classLabels.size
+
+                    // The ONNX spec defines the scores output as a float tensor.
+                    // We default to Float32. Shape is [batch_size, num_classes].
+                    scoresAlloc = Allocation(
+                      name = scoresOutputName,
+                      dataType = DataType.Float32,
+                      shape = List(batchSize.toInt, numClasses),
+                      initialData = None,
+                    )
+                  } yield List(scoresAlloc)
+                }
+            } yield allocations
+
+          case _ =>
+            // For all other operators, we assume their outputs are properly declared.
+            Right(List.empty[Allocation])
+        }
+      }
+    } yield newAllocs
 
     for {
       valAllocs <- valueAllocations
       initAllocs <- initializerAllocations
-    } yield (valAllocs ++ initAllocs).map(a => a.name -> a).toMap
+      manualAllocs <- manuallyCreatedAllocs
+    } yield (valAllocs ++ initAllocs ++ manualAllocs).map(a => a.name -> a).toMap
   }
 
   /** Translates a single ONNX `NodeProto` into its corresponding IR `Operation`.
@@ -106,7 +164,7 @@ object Translator {
     val attributes = new OnnxAttributeHelper(node)
     node.opType match {
       // Group simple binary operators
-      case "MatMul" | "Add" | "Mul" =>
+      case "MatMul" | "Add" | "Mul" | "Div" =>
         for {
           // The arity check ensures the .head and (1) accessors below are safe.
           _ <- checkArity(node, expectedInputs = 2, expectedOutputs = 1)
@@ -115,6 +173,7 @@ object Translator {
               Right(Operation.MatMul(node.input.head, node.input(1), node.output.head))
             case "Add" => Right(Operation.Add(node.input.head, node.input(1), node.output.head))
             case "Mul" => Right(Operation.Mul(node.input.head, node.input(1), node.output.head))
+            case "Div" => Right(Operation.Div(node.input.head, node.input(1), node.output.head))
             case _ => Left("Internal error: Unreachable code in operator matching")
           }
         } yield op
@@ -152,6 +211,157 @@ object Translator {
           supportVectors = supportVectors.map(_.toDouble).toArray,
           vectorsPerClass = vectorsPerClass.toList,
         )
+
+      // Represents a ReLU (Rectified Linear Unit) activation operation.
+      case "Relu" =>
+        for {
+          _ <- checkArity(node, expectedInputs = 1, expectedOutputs = 1)
+        } yield Operation.Relu(node.input.head, node.output.head)
+
+      // Represents a Reshape operation.
+      case "Reshape" =>
+        for {
+          _ <- checkArity(node, expectedInputs = 2, expectedOutputs = 1)
+          // The 'allowzero' attribute is optional and defaults to 0 (false).
+          allowzero = node.attribute.find(_.name == "allowzero").map(_.i).getOrElse(0L) != 0L
+        } yield Operation.Reshape(
+          input = node.input.head,
+          shape = node.input(1),
+          output = node.output.head,
+          allowzero = allowzero,
+        )
+
+      // Represents a Convolution operation.
+      case "Conv" =>
+        for {
+          _ <-
+            if ((node.input.size == 2 || node.input.size == 3) && node.output.size == 1)
+              Right(())
+            else
+              Left(
+                s"Node '${node.name}' (opType: Conv) expects 2 or 3 inputs and 1 output, but got ${node.input.size} and ${node.output.size}",
+              )
+
+          // 'kernel_shape' is a required attribute.
+          kernelShape <- attributes.getInts("kernel_shape")
+
+          // Handle optional attributes with defaults as per ONNX specification.
+          autoPadStr = node.attribute
+            .find(_.name == "auto_pad")
+            .map(_.s.toStringUtf8())
+            .getOrElse("NOTSET")
+          autoPad <- AutoPad.fromString(autoPadStr)
+          group = node.attribute.find(_.name == "group").map(_.i).getOrElse(1L)
+
+          spatialDims = kernelShape.size
+          dilations = node.attribute
+            .find(_.name == "dilations")
+            .map(_.ints)
+            .getOrElse(Seq.fill(spatialDims)(1L))
+          pads = node.attribute
+            .find(_.name == "pads")
+            .map(_.ints)
+            .getOrElse(Seq.fill(spatialDims * 2)(0L))
+          strides = node.attribute
+            .find(_.name == "strides")
+            .map(_.ints)
+            .getOrElse(Seq.fill(spatialDims)(1L))
+
+        } yield Operation.Conv(
+          input = node.input.head,
+          weight = node.input(1),
+          bias = if (node.input.size == 3) Some(node.input(2)) else None,
+          output = node.output.head,
+          autoPad = autoPad,
+          dilations = dilations.map(_.toInt).toList,
+          group = group.toInt,
+          kernelShape = kernelShape.map(_.toInt).toList,
+          pads = pads.map(_.toInt).toList,
+          strides = strides.map(_.toInt).toList,
+        )
+
+      // Represents a MaxPool operation.
+      case "MaxPool" =>
+        for {
+          // The IR only uses the first output, but ONNX can have a second (indices).
+          _ <-
+            if (node.input.size == 1 && node.output.nonEmpty) Right(())
+            else
+              Left(
+                s"Node '${node.name}' (opType: MaxPool) expects 1 input and at least 1 output, but got ${node.input.size} and ${node.output.size}",
+              )
+
+          // 'kernel_shape' is a required attribute.
+          kernelShape <- attributes.getInts("kernel_shape")
+
+          // Handle optional attributes with defaults.
+          autoPadStr = node.attribute
+            .find(_.name == "auto_pad")
+            .map(_.s.toStringUtf8())
+            .getOrElse("NOTSET")
+          autoPad <- AutoPad.fromString(autoPadStr)
+          ceilMode = node.attribute.find(_.name == "ceil_mode").map(_.i).getOrElse(0L) != 0L
+          storageOrder = node.attribute.find(_.name == "storage_order").map(_.i).getOrElse(0L)
+
+          spatialDims = kernelShape.size
+          dilations = node.attribute
+            .find(_.name == "dilations")
+            .map(_.ints)
+            .getOrElse(Seq.fill(spatialDims)(1L))
+          pads = node.attribute
+            .find(_.name == "pads")
+            .map(_.ints)
+            .getOrElse(Seq.fill(spatialDims * 2)(0L))
+          strides = node.attribute
+            .find(_.name == "strides")
+            .map(_.ints)
+            .getOrElse(Seq.fill(spatialDims)(1L))
+
+        } yield Operation.MaxPool(
+          input = node.input.head,
+          output = node.output.head,
+          autoPad = autoPad,
+          ceilMode = ceilMode,
+          dilations = dilations.map(_.toInt).toList,
+          kernelShape = kernelShape.map(_.toInt).toList,
+          pads = pads.map(_.toInt).toList,
+          storageOrder = storageOrder.toInt,
+          strides = strides.map(_.toInt).toList,
+        )
+      case "Constant" =>
+        for {
+          // A Constant node has 0 inputs and 1 output.
+          _ <- checkArity(node, expectedInputs = 0, expectedOutputs = 1)
+
+          // The constant's data is stored in a 'value' attribute of type TensorProto.
+          valueAttribute <- node.attribute
+            .find(_.name == "value")
+            .toRight(s"Constant node '${node.name}' is missing the 'value' attribute.")
+
+          tensorProto <- valueAttribute.t
+            .toRight(s"Attribute 'value' in Constant node '${node.name}' is not a tensor.")
+
+          // Use existing helpers to extract the data type and raw bytes.
+          dataType <- fromOnnxDataType(tensorProto.dataType)
+          shape = tensorProto.dims.map(_.toInt).toList
+          data <- extractBytes(tensorProto, dataType)
+
+        } yield Operation.Constant(
+          output = node.output.head,
+          value = data,
+          dataType = dataType,
+          shape = shape,
+        )
+      case "Softmax" =>
+        for {
+          _ <- checkArity(node, expectedInputs = 1, expectedOutputs = 1)
+          // The 'axis' attribute is optional and defaults to -1 in ONNX version 13+
+          axis = node.attribute.find(_.name == "axis").map(_.i).getOrElse(-1L)
+        } yield Operation.Softmax(
+          input = node.input.head,
+          output = node.output.head,
+          axis = axis.toInt,
+        )
       case unsupported => Left(s"Unsupported operation type: $unsupported")
     }
   }
@@ -223,8 +433,8 @@ object Translator {
       case 11 => Right(DataType.Float64)
       case 10 => Right(DataType.Float16)
       case 16 => Right(DataType.BFloat16)
-      case 7 => Right(DataType.Int32)
-      case 6 => Right(DataType.Int64)
+      case 6 => Right(DataType.Int32)
+      case 7 => Right(DataType.Int64)
       case 5 => Right(DataType.Int16)
       case 3 => Right(DataType.Int8)
       case 12 => Right(DataType.UInt32)
@@ -256,8 +466,8 @@ object Translator {
       tensor.dataType match {
         case 1 => tensor.floatData.foreach(buffer.putFloat)
         case 11 => tensor.doubleData.foreach(buffer.putDouble)
-        case 7 => tensor.int32Data.foreach(buffer.putInt)
-        case 6 => tensor.int64Data.foreach(buffer.putLong)
+        case 6 => tensor.int32Data.foreach(buffer.putInt)
+        case 7 => tensor.int64Data.foreach(buffer.putLong)
         // Note: Other types like int16 are typically stored in `rawData` or `int32Data`.
         case unsupportedType =>
           return Left(
@@ -294,5 +504,6 @@ object Translator {
         .get(name)
         .toRight(s"Missing attribute '$name' in node '${node.name}'")
         .map(_.ints)
+
   }
 }
diff --git a/onnx/src/test/scala/vilcacora/onnx/TranslatorInternalsSuite.scala b/onnx/src/test/scala/vilcacora/onnx/TranslatorInternalsSuite.scala
index 27bbbe1..aa64c88 100644
--- a/onnx/src/test/scala/vilcacora/onnx/TranslatorInternalsSuite.scala
+++ b/onnx/src/test/scala/vilcacora/onnx/TranslatorInternalsSuite.scala
@@ -28,9 +28,10 @@ class TranslatorInternalsSuite extends FunSuite {
 
   test("fromOnnxDataType should map known type codes") {
     assertEquals(Translator.fromOnnxDataType(1), Right(DataType.Float32))
-    assertEquals(Translator.fromOnnxDataType(7), Right(DataType.Int32))
+    // ONNX code 6 is INT32, 7 is INT64
+    assertEquals(Translator.fromOnnxDataType(6), Right(DataType.Int32))
+    assertEquals(Translator.fromOnnxDataType(7), Right(DataType.Int64))
   }
-
   test("fromOnnxDataType should fail on unknown type codes") {
     assert(Translator.fromOnnxDataType(999).isLeft)
   }
diff --git a/onnx/src/test/scala/vilcacora/onnx/TranslatorSuite.scala b/onnx/src/test/scala/vilcacora/onnx/TranslatorSuite.scala
index b0b5a1f..a9e49f4 100644
--- a/onnx/src/test/scala/vilcacora/onnx/TranslatorSuite.scala
+++ b/onnx/src/test/scala/vilcacora/onnx/TranslatorSuite.scala
@@ -16,11 +16,12 @@
 
 package vilcacora.onnx
 
-import com.armanbilge.vilcacora.ir.{DataType, Operation, PostTransform, SVMKernel}
-import vilcacora.onnx.proto.{AttributeProto, ModelProto, NodeProto}
+import com.armanbilge.vilcacora.ir.{DataType, Operation, PostTransform, SVMKernel, AutoPad}
+import vilcacora.onnx.proto.{AttributeProto, ModelProto, NodeProto, TensorProto}
 import com.google.protobuf.ByteString
 import munit.FunSuite
 import java.io.InputStream
+import java.nio.{ByteBuffer, ByteOrder}
 
 class TranslatorSuite extends FunSuite {
 
@@ -42,8 +43,8 @@ class TranslatorSuite extends FunSuite {
       input = Seq("in"),
       output = Seq("out"),
       attribute = Seq(
-        AttributeProto(name = "to", i = 6, `type` = AttributeProto.AttributeType.INT),
-      ), // 6 = INT64
+        AttributeProto(name = "to", i = 7, `type` = AttributeProto.AttributeType.INT),
+      ), // 7 = INT64
     )
     assertEquals(Translator.translateNode(node), Right(Operation.Cast("in", "out", DataType.Int64)))
   }
@@ -95,6 +96,173 @@ class TranslatorSuite extends FunSuite {
       case other => fail(s"Expected SVMClassifier but got $other")
     }
   }
+  test("translateNode should translate a Div node") {
+    val node =
+      NodeProto(opType = "Div", input = Seq("Numerator", "Denominator"), output = Seq("Quotient"))
+    assertEquals(
+      Translator.translateNode(node),
+      Right(Operation.Div("Numerator", "Denominator", "Quotient")),
+    )
+  }
+
+  test("translateNode should translate a Relu node") {
+    val node = NodeProto(opType = "Relu", input = Seq("X"), output = Seq("Y"))
+    assertEquals(Translator.translateNode(node), Right(Operation.Relu("X", "Y")))
+  }
+
+  test("translateNode should translate a Reshape node") {
+    val node = NodeProto(opType = "Reshape", input = Seq("data", "shape"), output = Seq("reshaped"))
+    // Test with the default 'allowzero' attribute
+    assertEquals(
+      Translator.translateNode(node),
+      Right(Operation.Reshape("data", "shape", "reshaped", allowzero = false)),
+    )
+  }
+
+  test("translateNode should translate a Constant node") {
+    // Prepare the raw byte data for a Float32 tensor with value [1.0f, 2.0f]
+    val byteBuffer = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN)
+    byteBuffer.putFloat(1.0f)
+    byteBuffer.putFloat(2.0f)
+    val rawBytes = byteBuffer.array()
+
+    // The constant data is stored as a TensorProto inside an AttributeProto
+    val tensorProto = TensorProto(
+      dataType = 1, // Float32
+      dims = Seq(2),
+      rawData = ByteString.copyFrom(rawBytes),
+    )
+    val attribute = AttributeProto(
+      name = "value",
+      t = Some(tensorProto),
+      `type` = AttributeProto.AttributeType.TENSOR,
+    )
+
+    val node = NodeProto(
+      opType = "Constant",
+      input = Nil,
+      output = Seq("const_out"),
+      attribute = Seq(attribute),
+    )
+
+    val result = Translator.translateNode(node)
+    assert(result.isRight, "Constant translation failed")
+    result.foreach {
+      case op: Operation.Constant =>
+        assertEquals(op.output, "const_out")
+        assertEquals(op.dataType, DataType.Float32)
+        assertEquals(op.shape, List(2))
+        assert(op.value.sameElements(rawBytes), "Raw byte data did not match")
+      case other => fail(s"Expected Constant but got $other")
+    }
+  }
+
+  test("translateNode should translate a Conv node") {
+    val node = NodeProto(
+      opType = "Conv",
+      input = Seq("X", "W", "B"), // Input, Weights, Bias
+      output = Seq("Y"),
+      attribute = Seq(
+        AttributeProto(name = "kernel_shape", ints = Seq(3L, 3L)),
+        AttributeProto(name = "strides", ints = Seq(1L, 1L)),
+        AttributeProto(name = "pads", ints = Seq(1L, 1L, 1L, 1L)),
+        AttributeProto(name = "dilations", ints = Seq(1L, 1L)),
+        AttributeProto(name = "group", i = 1L),
+        AttributeProto(name = "auto_pad", s = ByteString.copyFromUtf8("NOTSET")),
+      ),
+    )
+
+    val expected = Operation.Conv(
+      input = "X",
+      weight = "W",
+      bias = Some("B"),
+      output = "Y",
+      autoPad = AutoPad.NotSet,
+      dilations = List(1, 1),
+      group = 1,
+      kernelShape = List(3, 3),
+      pads = List(1, 1, 1, 1),
+      strides = List(1, 1),
+    )
+
+    assertEquals(Translator.translateNode(node), Right(expected))
+  }
+
+  test("translateNode should translate a MaxPool node") {
+    val node = NodeProto(
+      opType = "MaxPool",
+      input = Seq("X"),
+      output = Seq("Y"),
+      attribute = Seq(
+        AttributeProto(name = "kernel_shape", ints = Seq(2L, 2L)),
+        AttributeProto(name = "strides", ints = Seq(2L, 2L)),
+        AttributeProto(name = "pads", ints = Seq(0L, 0L, 0L, 0L)),
+        AttributeProto(name = "ceil_mode", i = 0L),
+        AttributeProto(name = "storage_order", i = 0L),
+        AttributeProto(name = "dilations", ints = Seq(1L, 1L)),
+        AttributeProto(name = "auto_pad", s = ByteString.copyFromUtf8("NOTSET")),
+      ),
+    )
+
+    val expected = Operation.MaxPool(
+      input = "X",
+      output = "Y",
+      autoPad = AutoPad.NotSet,
+      ceilMode = false,
+      dilations = List(1, 1),
+      kernelShape = List(2, 2),
+      pads = List(0, 0, 0, 0),
+      storageOrder = 0,
+      strides = List(2, 2),
+    )
+
+    assertEquals(Translator.translateNode(node), Right(expected))
+  }
+  test("translateNode should translate a Softmax node with default axis") {
+    val node = NodeProto(
+      opType = "Softmax",
+      input = Seq("input_tensor"),
+      output = Seq("output_tensor"),
+      // No axis attribute, should default to -1
+    )
+
+    assertEquals(
+      Translator.translateNode(node),
+      Right(Operation.Softmax("input_tensor", "output_tensor", axis = -1)),
+    )
+  }
+
+  test("translateNode should translate a Softmax node with explicit axis") {
+    val node = NodeProto(
+      opType = "Softmax",
+      input = Seq("logits"),
+      output = Seq("probabilities"),
+      attribute = Seq(
+        AttributeProto(name = "axis", i = 1L, `type` = AttributeProto.AttributeType.INT),
+      ),
+    )
+
+    assertEquals(
+      Translator.translateNode(node),
+      Right(Operation.Softmax("logits", "probabilities", axis = 1)),
+    )
+  }
+
+  test("translateNode should handle Softmax with axis=0") {
+    val node = NodeProto(
+      opType = "Softmax",
+      input = Seq("batch_logits"),
+      output = Seq("batch_probs"),
+      attribute = Seq(
+        AttributeProto(name = "axis", i = 0L, `type` = AttributeProto.AttributeType.INT),
+      ),
+    )
+
+    assertEquals(
+      Translator.translateNode(node),
+      Right(Operation.Softmax("batch_logits", "batch_probs", axis = 0)),
+    )
+  }
 
   // --- End-to-end tests using model files from resources ---
 
diff --git a/project/build.properties b/project/build.properties
index 73df629..bbb0b60 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1 +1 @@
-sbt.version=1.10.7
+sbt.version=1.11.2
diff --git a/project/plugins.sbt b/project/plugins.sbt
index c7a055b..a437639 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,5 +1,8 @@
-addSbtPlugin("org.typelevel" % "sbt-typelevel" % "0.7.4")
-addSbtPlugin("org.scala-native" % "sbt-scala-native" % "0.5.5")
-addSbtPlugin("com.armanbilge" % "sbt-scala-native-config-brew-github-actions" % "0.3.0")
+resolvers += Resolver.sonatypeCentralSnapshots
+
+addSbtPlugin("org.typelevel" % "sbt-typelevel" % "0.8.5")
+addSbtPlugin("org.scala-native" % "sbt-scala-native" % "0.5.9")
+addSbtPlugin("com.armanbilge" % "sbt-scala-native-config-brew-github-actions" % "0.4.0")
 addSbtPlugin("com.thesamet" % "sbt-protoc" % "1.0.7")
+
 libraryDependencies += "com.thesamet.scalapb" %% "compilerplugin" % "0.11.17"
diff --git a/runtime/src/main/resources/scala-native/mlpack_wrapper.cpp b/runtime/src/main/resources/scala-native/mlpack_wrapper.cpp
new file mode 100644
index 0000000..0c73a51
--- /dev/null
+++ b/runtime/src/main/resources/scala-native/mlpack_wrapper.cpp
@@ -0,0 +1,230 @@
+#ifdef MLPACK_Wrapper
+#include <mlpack/core.hpp>
+#include <mlpack/methods/ann/layer/convolution.hpp>
+#include <mlpack/methods/ann/layer/max_pooling.hpp>
+#include <armadillo>
+#include <cstring>
+
+// ---------------- type aliases ----------------
+using FConv       = mlpack::Convolution<arma::fmat>;
+using DConv       = mlpack::Convolution<arma::mat>;
+using FMaxPooling = mlpack::MaxPooling<arma::fmat>;
+using DMaxPooling = mlpack::MaxPooling<arma::mat>;
+
+// ---------------- handle structs --------------
+struct ConvHandleF {
+  FConv*         layer;
+  arma::fmat     inView;
+  arma::fmat     outView;
+};
+
+struct ConvHandleD {
+  DConv*         layer;
+  arma::mat      inView;
+  arma::mat      outView;
+};
+
+struct PoolHandleF {
+  FMaxPooling*   layer;
+  arma::fmat     inView;
+  arma::fmat     outView;
+};
+
+struct PoolHandleD {
+  DMaxPooling*   layer;
+  arma::mat      inView;
+  arma::mat      outView;
+};
+
+struct SoftmaxF { arma::fmat inView, outView; };
+struct SoftmaxD { arma::mat  inView, outView; };
+
+extern "C" {
+
+// =======================================================
+//                 FLOAT 32-bit CONVOLUTION
+// =======================================================
+ConvHandleF* initialise_conv_f(
+    size_t  outMaps, size_t kH, size_t kW,
+    size_t  sH, size_t sW, int autoPad, int useBias,
+    size_t  inH, size_t inW, size_t inC,
+    const float* wPtr, const float* bPtr,
+    float* inputPtr,  float* outputPtr) {
+
+  auto* layer = new FConv(outMaps, kW, kH, sW, sH, 0, 0,
+                          (autoPad ? "same" : "valid"), useBias);
+  layer->InputDimensions() = {inH, inW, inC};
+  layer->ComputeOutputDimensions();
+
+  arma::fcube wCube(const_cast<float*>(wPtr),
+                    kW, kH, outMaps * inC, false, false);
+  layer->Weight() = wCube;
+
+  if (useBias) {
+    arma::fmat bMat(const_cast<float*>(bPtr), outMaps, 1, false, false);
+    layer->Bias() = bMat;
+  }
+
+  const auto& od = layer->OutputDimensions();
+  size_t outElems = od[0] * od[1] * od[2];
+
+  arma::fmat inView (inputPtr,  inH * inW * inC, 1, false, false);
+  arma::fmat outView(outputPtr, outElems,          1, false, false);
+
+  return new ConvHandleF{ layer, std::move(inView), std::move(outView) };
+}
+
+void execute_conv_f(ConvHandleF* h) {
+  h->layer->Forward(h->inView, h->outView);
+}
+
+void cleanup_conv_f(ConvHandleF* h) {
+  delete h->layer;
+  delete h;
+}
+
+// =======================================================
+//                 FLOAT 64-bit CONVOLUTION
+// =======================================================
+ConvHandleD* initialise_conv_d(
+    size_t  outMaps, size_t kH, size_t kW,
+    size_t  sH, size_t sW, int autoPad, int useBias,
+    size_t  inH, size_t inW, size_t inC,
+    const double* wPtr, const double* bPtr,
+    double* inputPtr,  double* outputPtr) {
+
+  auto* layer = new DConv(outMaps, kW, kH, sW, sH, 0, 0,
+                          (autoPad ? "same" : "valid"), useBias);
+  layer->InputDimensions() = {inH, inW, inC};
+  layer->ComputeOutputDimensions();
+
+  arma::cube wCube(const_cast<double*>(wPtr),
+                   kW, kH, outMaps * inC, false, false);
+  layer->Weight() = wCube;
+
+  if (useBias) {
+    arma::mat bMat(const_cast<double*>(bPtr), outMaps, 1, false, false);
+    layer->Bias() = bMat;
+  }
+
+  const auto& od = layer->OutputDimensions();
+  size_t outElems = od[0] * od[1] * od[2];
+
+  arma::mat inView (inputPtr,  inH * inW * inC, 1, false, false);
+  arma::mat outView(outputPtr, outElems,         1, false, false);
+
+  return new ConvHandleD{ layer, std::move(inView), std::move(outView) };
+}
+
+void execute_conv_d(ConvHandleD* h) {
+  h->layer->Forward(h->inView, h->outView);
+}
+
+void cleanup_conv_d(ConvHandleD* h) {
+  delete h->layer;
+  delete h;
+}
+
+// =======================================================
+//                     FLOAT 32-bit MAXPOOL
+// =======================================================
+PoolHandleF* initialise_pool_f(
+    size_t kH, size_t kW,
+    size_t sH, size_t sW,
+    size_t inH, size_t inW, size_t inC,
+    float* inputPtr,  float* outputPtr) {
+
+  auto* layer = new FMaxPooling(kW, kH, sW, sH);
+  layer->InputDimensions() = {inH, inW, inC};
+  layer->ComputeOutputDimensions();
+
+  const auto& od = layer->OutputDimensions();
+  size_t outElems = od[0] * od[1] * od[2];
+
+  arma::fmat inView (inputPtr,  inH * inW * inC, 1, false, false);
+  arma::fmat outView(outputPtr, outElems,         1, false, false);
+
+  return new PoolHandleF{ layer, std::move(inView), std::move(outView) };
+}
+
+void execute_pool_f(PoolHandleF* h) {
+  h->layer->Forward(h->inView, h->outView);
+}
+
+void cleanup_pool_f(PoolHandleF* h) {
+  delete h->layer;
+  delete h;
+}
+
+// =======================================================
+//                     FLOAT 64-bit MAXPOOL
+// =======================================================
+PoolHandleD* initialise_pool_d(
+    size_t kH, size_t kW,
+    size_t sH, size_t sW,
+    size_t inH, size_t inW, size_t inC,
+    double* inputPtr,  double* outputPtr) {
+
+  auto* layer = new DMaxPooling(kW, kH, sW, sH);
+  layer->InputDimensions() = {inH, inW, inC};
+  layer->ComputeOutputDimensions();
+
+  const auto& od = layer->OutputDimensions();
+  size_t outElems = od[0] * od[1] * od[2];
+
+  arma::mat inView (inputPtr,  inH * inW * inC, 1, false, false);
+  arma::mat outView(outputPtr, outElems,         1, false, false);
+
+  return new PoolHandleD{ layer, std::move(inView), std::move(outView) };
+}
+
+void execute_pool_d(PoolHandleD* h) {
+  h->layer->Forward(h->inView, h->outView);
+}
+
+void cleanup_pool_d(PoolHandleD* h) {
+  delete h->layer;
+  delete h;
+}
+
+// =======================================================
+//                          SOFTMAX
+// =======================================================
+void F_perform_softmax_direct(
+    const float* input_ptr, size_t input_size,
+    float* output_ptr) // Scala pre-allocates same size as input
+{
+    // Direct computation - no intermediate allocations
+    float max_val = *std::max_element(input_ptr, input_ptr + input_size);
+    
+    float sum = 0.0f;
+    for (size_t i = 0; i < input_size; ++i) {
+        output_ptr[i] = std::exp(input_ptr[i] - max_val);
+        sum += output_ptr[i];
+    }
+    
+    for (size_t i = 0; i < input_size; ++i) {
+        output_ptr[i] /= sum;
+    }
+}
+void perform_softmax_direct(
+    const double* input_ptr, size_t input_size,
+    double* output_ptr) // Scala pre-allocates same size as input
+{
+    // Direct computation - no intermediate allocations
+    double max_val = *std::max_element(input_ptr, input_ptr + input_size);
+    
+    double sum = 0.0;
+    for (size_t i = 0; i < input_size; ++i) {
+        output_ptr[i] = std::exp(input_ptr[i] - max_val);
+        sum += output_ptr[i];
+    }
+    
+    for (size_t i = 0; i < input_size; ++i) {
+        output_ptr[i] /= sum;
+    }
+
+} 
+
+} // extern "C"
+#endif
\ No newline at end of file
diff --git a/runtime/src/main/resources/scala-native/scala-native.properties b/runtime/src/main/resources/scala-native/scala-native.properties
new file mode 100644
index 0000000..bb3bcfc
--- /dev/null
+++ b/runtime/src/main/resources/scala-native/scala-native.properties
@@ -0,0 +1,2 @@
+compile.include.paths = .
+compile.cpp.options = -std=c++17, -fcxx-exceptions,-Wno-deprecated-declarations
diff --git a/runtime/src/main/resources/scala-native/svm_wrapper.cpp b/runtime/src/main/resources/scala-native/svm_wrapper.cpp
new file mode 100644
index 0000000..512fc6c
--- /dev/null
+++ b/runtime/src/main/resources/scala-native/svm_wrapper.cpp
@@ -0,0 +1,204 @@
+#ifdef SVM_Wrapper
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "svm.h"
+
+// Wrapper functions for LibSVM struct creation and management
+extern "C" {
+
+// Create and initialize svm_parameter
+struct svm_parameter* create_svm_param(
+    int svm_type, 
+    int kernel_type, 
+    int degree, 
+    double gamma, 
+    double coef0
+) {
+    struct svm_parameter *param = (struct svm_parameter *)malloc(sizeof(struct svm_parameter));
+    if (!param) return NULL;
+    
+    // Initialize all fields to safe defaults
+    memset(param, 0, sizeof(struct svm_parameter));
+    
+    // Set provided values
+    param->svm_type = svm_type;
+    param->kernel_type = kernel_type;
+    param->degree = degree;
+    param->gamma = gamma;
+    param->coef0 = coef0;
+    
+
+    
+    return param;
+}
+
+// Create svm_model with proper initialization
+struct svm_model* create_svm_model(
+    struct svm_parameter *param,
+    int nr_class,
+    int l,
+    double *support_vectors,  // flattened array [l * num_features]
+    int num_features,
+    double *coefficients,     // flattened array [(nr_class-1) * l]  
+    double *rho,             // array [nr_class*(nr_class-1)/2]
+    int *class_labels,       // array [nr_class]
+    int *n_sv_per_class      // array [nr_class]
+) {
+    struct svm_model *model = (struct svm_model *)malloc(sizeof(struct svm_model));
+    if (!model) return NULL;
+    
+    // Initialize all fields
+    memset(model, 0, sizeof(struct svm_model));
+    
+    // Basic model properties
+    model->param = *param;  // Copy parameter struct
+    model->nr_class = nr_class;
+    model->l = l;
+    
+    // Allocate and populate support vectors
+    model->SV = (struct svm_node **)malloc(sizeof(struct svm_node*) * l);
+    for (int i = 0; i < l; i++) {
+        model->SV[i] = (struct svm_node *)malloc(sizeof(struct svm_node) * (num_features + 1));
+        
+        // Copy feature values
+        for (int j = 0; j < num_features; j++) {
+            model->SV[i][j].index = j + 1;  // 1-indexed
+            model->SV[i][j].value = support_vectors[i * num_features + j];
+        }
+        // Terminator node
+        model->SV[i][num_features].index = -1;
+        model->SV[i][num_features].value = 0.0;
+    }
+    
+    // Allocate and populate coefficients
+    model->sv_coef = (double **)malloc(sizeof(double*) * (nr_class - 1));
+    for (int i = 0; i < nr_class - 1; i++) {
+        model->sv_coef[i] = (double *)malloc(sizeof(double) * l);
+        for (int j = 0; j < l; j++) {
+            model->sv_coef[i][j] = coefficients[i * l + j];
+        }
+    }
+    
+    // Copy rho (bias terms)
+    int rho_size = nr_class * (nr_class - 1) / 2;
+    model->rho = (double *)malloc(sizeof(double) * rho_size);
+    memcpy(model->rho, rho, sizeof(double) * rho_size);
+    
+    // Copy class labels
+    model->label = (int *)malloc(sizeof(int) * nr_class);
+    memcpy(model->label, class_labels, sizeof(int) * nr_class);
+    
+    // Copy number of SVs per class
+    model->nSV = (int *)malloc(sizeof(int) * nr_class);
+    memcpy(model->nSV, n_sv_per_class, sizeof(int) * nr_class);
+    
+    // Initialize other fields
+    model->probA = NULL;
+    model->probB = NULL; 
+    model->sv_indices = NULL;
+    model->free_sv = 1;
+    
+    return model;
+}
+
+// Prediction with per-class scores
+int svm_predict_with_scores(
+    struct svm_model *model,
+    double *features,        // input features [num_features]
+    int num_features,
+    double *class_scores     // output scores [nr_class] 
+) {
+    // Create input svm_node array
+    struct svm_node *x = (struct svm_node *)malloc(sizeof(struct svm_node) * (num_features + 1));
+    
+    for (int i = 0; i < num_features; i++) {
+        x[i].index = i + 1;
+        x[i].value = features[i];
+    }
+    x[num_features].index = -1;  // terminator
+    
+    // Get decision values from LibSVM
+    int nr_class = model->nr_class;
+    int dec_values_count = (nr_class * (nr_class - 1)) / 2;
+    double *dec_values = (double *)malloc(sizeof(double) * dec_values_count);
+    
+    double predicted_label = svm_predict_values(model, x, dec_values);
+    
+    // Convert to per-class scores
+    if (nr_class == 2) {
+        // Binary case
+        class_scores[0] = -dec_values[0];
+        class_scores[1] = dec_values[0];
+    } else {
+        // Multiclass: OvO to OvR conversion
+        int *votes = (int *)calloc(nr_class, sizeof(int));
+        double *conf = (double *)calloc(nr_class, sizeof(double));
+        
+        int k = 0;
+        for (int i = 0; i < nr_class; i++) {
+            for (int j = i + 1; j < nr_class; j++) {
+                double margin = dec_values[k];
+                
+                if (margin > 0) {
+                    votes[i] += 1;
+                } else {
+                    votes[j] += 1;
+                }
+                
+                conf[i] -= margin;
+                conf[j] += margin;
+                k++;
+            }
+        }
+        
+        // Apply tie-breaking and final scores
+        for (int c = 0; c < nr_class; c++) {
+            double tconf = conf[c] / (3.0 * (fabs(conf[c]) + 1.0));
+            class_scores[c] = (double)votes[c] + tconf;
+        }
+        
+        free(votes);
+        free(conf);
+    }
+    
+    free(x);
+    free(dec_values);
+    
+    return (int)predicted_label;
+}
+
+// Debug function to print model details
+void debug_model_info(struct svm_model *model) {
+    printf("=== SVM Model Debug Info ===\n");
+    printf("nr_class: %d\n", model->nr_class);
+    printf("l (num support vectors): %d\n", model->l);
+    printf("kernel_type: %d\n", model->param.kernel_type);
+    printf("gamma: %f\n", model->param.gamma);
+    printf("coef0: %f\n", model->param.coef0);
+    printf("degree: %d\n", model->param.degree);
+    
+    printf("Class labels: ");
+    for (int i = 0; i < model->nr_class; i++) {
+        printf("%d ", model->label[i]);
+    }
+    printf("\n");
+    
+    printf("Number of SVs per class: ");
+    for (int i = 0; i < model->nr_class; i++) {
+        printf("%d ", model->nSV[i]);
+    }
+    printf("\n");
+    
+    printf("Rho values: ");
+    int rho_size = model->nr_class * (model->nr_class - 1) / 2;
+    for (int i = 0; i < rho_size; i++) {
+        printf("%f ", model->rho[i]);
+    }
+    printf("\n");
+    printf("===========================\n");
+}
+
+} // extern "C"
+#endif
\ No newline at end of file
diff --git a/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/Blas.scala b/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/Blas.scala
new file mode 100644
index 0000000..80ec285
--- /dev/null
+++ b/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/Blas.scala
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2023 Arman Bilge
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.armanbilge.vilcacora.runtime
+
+import scala.scalanative.unsafe._
+
+/** CBLAS constants - defined outside the extern object */
+object BLASConstants {
+  // CBLAS layout options
+  final val CblasRowMajor: CInt = 101
+  final val CblasColMajor: CInt = 102
+
+  // CBLAS transpose options
+  final val CblasNoTrans: CInt = 111
+  final val CblasTrans: CInt = 112
+  final val CblasConjTrans: CInt = 113
+}
+
+/** Scala Native bindings for OpenBLAS CBLAS functions */
+@link("openblas")
+@extern
+object BLAS {
+  // Double precision matrix multiplication
+  def cblas_dgemm(
+      layout: CInt,
+      transA: CInt,
+      transB: CInt,
+      M: CInt,
+      N: CInt,
+      K: CInt,
+      alpha: CDouble,
+      A: Ptr[CDouble],
+      lda: CInt,
+      B: Ptr[CDouble],
+      ldb: CInt,
+      beta: CDouble,
+      C: Ptr[CDouble],
+      ldc: CInt,
+  ): Unit = extern
+
+  // Single precision matrix multiplication
+  def cblas_sgemm(
+      layout: CInt,
+      transA: CInt,
+      transB: CInt,
+      M: CInt,
+      N: CInt,
+      K: CInt,
+      alpha: CFloat,
+      A: Ptr[CFloat],
+      lda: CInt,
+      B: Ptr[CFloat],
+      ldb: CInt,
+      beta: CFloat,
+      C: Ptr[CFloat],
+      ldc: CInt,
+  ): Unit = extern
+}
diff --git a/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/Interpreter.scala b/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/Interpreter.scala
new file mode 100644
index 0000000..76fc23b
--- /dev/null
+++ b/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/Interpreter.scala
@@ -0,0 +1,1012 @@
+/*
+ * Copyright 2023 Arman Bilge
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.armanbilge.vilcacora.runtime
+
+import cats.effect.{IO, Resource}
+import cats.syntax.all._
+import com.armanbilge.vilcacora.ir._
+import com.armanbilge.vilcacora.runtime.LibSVM._
+import scala.scalanative.unsafe._
+import scala.scalanative.libc.stdlib
+import scala.scalanative.libc.string.memcpy
+import scala.scalanative.unsigned._
+import scala.collection.mutable.ListBuffer
+
+import com.armanbilge.vilcacora.runtime.BLAS._
+import com.armanbilge.vilcacora.runtime.BLASConstants._
+
+/** The core execution engine for a translated `ModelIR`. It manages native memory and executes
+  * model operations within a Cats Effect IO context.
+  */
+object Interpreter {
+
+  /** A type alias mapping a tensor's name to its pointer in native memory. */
+  type MemoryMap = Map[String, Ptr[Byte]]
+
+  /** Executes a complete ModelIR graph.
+    *
+    * The process is separated into two stages:
+    *   1. A synchronous validation of the model to fail-fast on unsupported operations. 2. An
+    *      asynchronous, resource-safe execution of the graph operations within an IO context.
+    *
+    * @param model
+    *   The intermediate representation of the model to execute.
+    * @param inputs
+    *   A map of input tensor names to their corresponding Scala arrays.
+    * @return
+    *   An IO containing a map of output tensor names to their resulting Scala arrays.
+    */
+  def execute(
+      model: ModelIR,
+      inputs: Map[String, Array[_]],
+  ): Resource[IO, IO[Map[String, Array[_]]]] = {
+    validateModel(model)
+    val outputArrays: Map[String, Array[_]] = createOutputArrays(model)
+
+    memoryResource(model, inputs, outputArrays).flatMap { memoryMap =>
+      val opResources: List[Resource[IO, IO[Unit]]] =
+        model.operations.map(op => executeOperation(op, memoryMap, model))
+
+      val combined: Resource[IO, List[IO[Unit]]] = opResources.sequence
+
+      combined.map { opIOs =>
+        val runOps: IO[Unit] = opIOs.traverse_(_ *> IO.cede)
+        runOps.as(outputArrays)
+      }
+    }
+  }
+
+  /** Synchronously validates the model definition, throwing a `NotImplementedError` if any
+    * operation or data type cast is not supported. This ensures the interpreter fails before any
+    * memory is allocated or side effects are scheduled.
+    */
+  private def validateModel(model: ModelIR): Unit =
+    model.operations.foreach {
+      case _: Operation.SVMClassifier | _: Operation.Mul |
+          _: Operation.Softmax => // softmax currently here only because axis is ignored
+        () // Supported
+      case op: Operation.Add =>
+        // Validate Add operation broadcasting compatibility
+        val shapeA = model.allocations(op.inputs(0)).shape
+        val shapeB = model.allocations(op.inputs(1)).shape
+        val outputShape = model.allocations(op.outputs.head).shape
+
+        // Validate data type compatibility
+        val inputAAlloc = model.allocations(op.inputs(0))
+        val inputBAlloc = model.allocations(op.inputs(1))
+        val outputAlloc = model.allocations(op.outputs.head)
+
+        require(
+          inputAAlloc.dataType == inputBAlloc.dataType &&
+            inputBAlloc.dataType == outputAlloc.dataType,
+          s"Add operation requires all tensors to have the same data type. " +
+            s"Got: ${inputAAlloc.dataType}, ${inputBAlloc.dataType}, ${outputAlloc.dataType}",
+        )
+
+        // Validate broadcasting compatibility
+        val broadcastedShape = calculateBroadcastShape(shapeA, shapeB)
+        require(
+          broadcastedShape.isDefined && broadcastedShape.get == outputShape,
+          s"Add operation broadcasting incompatible: shapes ${shapeA.mkString("x")} and ${shapeB.mkString("x")} " +
+            s"cannot broadcast to output shape ${outputShape.mkString("x")}. " +
+            s"Expected output shape: ${broadcastedShape.map(_.mkString("x")).getOrElse("incompatible")}",
+        )
+      case op: Operation.Cast =>
+        val from = model.allocations(op.input).dataType
+        val to = model.allocations(op.output).dataType
+        (from, to) match {
+          case (f, t) if f == t => ()
+          case (DataType.Float64, DataType.Float32) => ()
+          case (DataType.Float32, DataType.Float64) => ()
+          case (from, to) =>
+            throw new NotImplementedError(s"Cast from $from to $to is not implemented.")
+        }
+      case op: Operation.Relu =>
+        // Validate ReLU operation requirements
+        val inputAlloc = model.allocations(op.input)
+        inputAlloc.dataType match {
+          case DataType.Float32 | DataType.Float64 => () // Supported
+          case unsupported =>
+            throw new NotImplementedError(
+              s"ReLU operation not implemented for data type: $unsupported",
+            )
+        }
+      case op: Operation.Reshape =>
+        // Validate reshape operation requirements
+        val inputAlloc = model.allocations(op.input)
+        val outputAlloc = model.allocations(op.output)
+        require(
+          inputAlloc.shape.product == outputAlloc.shape.product,
+          s"Reshape operation '${op.input} -> ${op.output}' requires same total elements. " +
+            s"Input shape ${inputAlloc.shape} has ${inputAlloc.shape.product} elements, " +
+            s"output shape ${outputAlloc.shape} has ${outputAlloc.shape.product} elements",
+        )
+
+      case op: Operation.MatMul =>
+        // Validate MatMul operation requirements
+        val inputAAlloc = model.allocations(op.inputA)
+        val inputBAlloc = model.allocations(op.inputB)
+        val outputAlloc = model.allocations(op.output)
+
+        require(
+          inputAAlloc.dataType == inputBAlloc.dataType &&
+            inputBAlloc.dataType == outputAlloc.dataType,
+          s"MatMul operation requires all tensors to have the same data type. " +
+            s"Got: ${inputAAlloc.dataType}, ${inputBAlloc.dataType}, ${outputAlloc.dataType}",
+        )
+
+        // Validate matrix dimensions for multiplication: A[M,K] * B[K,N] = C[M,N]
+        val shapeA = inputAAlloc.shape
+        val shapeB = inputBAlloc.shape
+        val shapeC = outputAlloc.shape
+
+        require(
+          shapeA.length == 2 && shapeB.length == 2 && shapeC.length == 2,
+          s"MatMul requires 2D matrices. Got shapes: A=${shapeA}, B=${shapeB}, C=${shapeC}",
+        )
+
+        val (m, k_a) = (shapeA(0), shapeA(1))
+        val (k_b, n) = (shapeB(0), shapeB(1))
+        val (m_c, n_c) = (shapeC(0), shapeC(1))
+
+        require(
+          k_a == k_b,
+          s"MatMul dimension mismatch: A columns ($k_a) must equal B rows ($k_b)",
+        )
+
+        require(
+          m == m_c && n == n_c,
+          s"MatMul output shape mismatch: expected [$m, $n], got [$m_c, $n_c]",
+        )
+
+        // Validate supported data types
+        inputAAlloc.dataType match {
+          case DataType.Float32 | DataType.Float64 => () // Supported
+          case unsupported =>
+            throw new NotImplementedError(
+              s"MatMul operation not implemented for data type: $unsupported",
+            )
+        }
+      case op: Operation.Conv =>
+        // Validate Conv operation for MLPack compatibility
+        val inputAlloc = model.allocations(op.input)
+
+        // MLPack only supports Float32 tensors (converted to Float64 internally)
+        inputAlloc.dataType match {
+          case DataType.Float32 | DataType.Float64 => () // Supported
+          case unsupported =>
+            throw new NotImplementedError(
+              s"Conv operation with MLPack only supports Float32 or Float64, got: $unsupported",
+            )
+        }
+
+        // MLPack convolution limitations
+        val padsAreZero = op.pads.forall(_ == 0)
+        val autoPadOk = op.autoPad match {
+          case AutoPad.SameUpper | AutoPad.Valid => true
+          case AutoPad.NotSet if padsAreZero => true // treat as VALID
+          case _ => false
+        }
+        require(
+          autoPadOk,
+          s"MLPack Conv supports SAME_UPPER, VALID, or NOTSET with zero pads; " +
+            s"got autoPad=${op.autoPad}, pads=${op.pads}",
+        )
+
+        require(
+          op.group == 1,
+          s"MLPack Conv does not support grouping (group=${op.group}), only group=1",
+        )
+
+        require(
+          op.dilations.forall(_ == 1),
+          s"MLPack Conv does not support dilation (dilations=${op.dilations}), only [1,1]",
+        )
+
+      case op: Operation.MaxPool =>
+        // Validate MaxPool operation for MLPack compatibility
+        val inputAlloc = model.allocations(op.input)
+
+        inputAlloc.dataType match {
+          case DataType.Float32 | DataType.Float64 => () // Supported
+          case unsupported =>
+            throw new NotImplementedError(
+              s"MaxPool operation with MLPack only supports Float32 or Float64, got: $unsupported",
+            )
+        }
+
+        // MLPack does not implement dilation or ceil_mode
+        require(op.dilations.forall(_ == 1), "MLPack MaxPool requires dilations = [1,1]")
+        require(!op.ceilMode, "MLPack MaxPool does not support ceil_mode = true")
+      case other =>
+        throw new NotImplementedError(s"Operation not implemented: ${other.getClass.getSimpleName}")
+    }
+
+  /** A `Resource` that manages all memory for the graph execution.
+    *   - Input and output tensors get direct pointers to the memory of their Scala arrays
+    *     (zero-copy).
+    *   - Intermediate and constant tensors are allocated in native memory using `malloc`. The
+    *     `Resource` guarantees that all `malloc`'d memory is freed after execution.
+    */
+  private def memoryResource(
+      model: ModelIR,
+      inputs: Map[String, Array[_]],
+      outputs: Map[String, Array[_]],
+  ): Resource[IO, MemoryMap] = {
+    val acquire = IO {
+      val mallocedPtrs = ListBuffer.empty[Ptr[Byte]]
+      val memoryMap = model.allocations.map { case (name, allocation) =>
+        val ptr: Ptr[Byte] =
+          if (inputs.contains(name)) {
+            inputs(name).at(0).asInstanceOf[Ptr[Byte]]
+          } else if (outputs.contains(name)) {
+            outputs(name).at(0).asInstanceOf[Ptr[Byte]]
+          } else {
+            val totalBytes = (allocation.shape.product * allocation.dataType.sizeInBytes).toUSize
+            val p = stdlib.malloc(totalBytes)
+            if (p == null) throw new OutOfMemoryError(s"Failed to allocate tensor '$name'")
+
+            allocation.initialData.foreach(data =>
+              memcpy(p, data.at(0).asInstanceOf[Ptr[Byte]], data.length.toUSize),
+            )
+            mallocedPtrs += p
+            p
+          }
+        name -> ptr
+      }
+      (memoryMap.toMap, mallocedPtrs.toList)
+    }
+
+    Resource
+      .make(acquire) { case (_, ptrsToFree) =>
+        IO(ptrsToFree.foreach(stdlib.free))
+      }
+      .map(_._1)
+  }
+
+  /** Dispatches a single operation to its corresponding handler function. */
+  private def executeOperation(
+      op: Operation,
+      memory: MemoryMap,
+      model: ModelIR,
+  ): Resource[IO, IO[Unit]] =
+    op match {
+      case op: Operation.SVMClassifier => handleSvmClassifier(op, memory, model)
+      case op: Operation.Add => Resource.pure(handleAdd(op, memory, model))
+      case op: Operation.Mul => Resource.pure(handleMul(op, memory, model))
+      case op: Operation.Cast => Resource.pure(handleCast(op, memory, model))
+      case op: Operation.Relu => Resource.pure(handleRelu(op, memory, model))
+      case op: Operation.Reshape => Resource.pure(handleReshape(op, memory, model))
+      case op: Operation.Conv => handleConv(op, memory, model)
+      case op: Operation.MaxPool => handleMaxPool(op, memory, model)
+      case op: Operation.MatMul => Resource.pure(handleMatMul(op, memory, model))
+      case op: Operation.Softmax => Resource.pure(handleSoftmax(op, memory, model))
+
+      case other =>
+        // This case is unreachable due to the pre-validation step.
+        // It remains as a safeguard against internal logic errors.
+        throw new NotImplementedError(s"Operation not implemented: ${other.getClass.getSimpleName}")
+    }
+
+  /** Handles element-wise addition for both Float32 and Float64 tensors. */
+  private def handleAdd(op: Operation.Add, memory: MemoryMap, model: ModelIR): IO[Unit] = IO {
+    val shapeA = model.allocations(op.inputs(0)).shape
+    val shapeB = model.allocations(op.inputs(1)).shape
+    val outputShape = model.allocations(op.outputs.head).shape
+    val dataType = model.allocations(op.inputs(0)).dataType
+
+    if (shapeA == shapeB) {
+      // Fast path: same shapes, simple element-wise addition
+      val count = outputShape.product
+      dataType match {
+        case DataType.Float32 =>
+          val inputA = memory(op.inputs(0)).asInstanceOf[Ptr[CFloat]]
+          val inputB = memory(op.inputs(1)).asInstanceOf[Ptr[CFloat]]
+          val output = memory(op.outputs.head).asInstanceOf[Ptr[CFloat]]
+          var i = 0
+          while (i < count) {
+            !(output + i) = !(inputA + i) + !(inputB + i)
+            i += 1
+          }
+
+        case DataType.Float64 =>
+          val inputA = memory(op.inputs(0)).asInstanceOf[Ptr[CDouble]]
+          val inputB = memory(op.inputs(1)).asInstanceOf[Ptr[CDouble]]
+          val output = memory(op.outputs.head).asInstanceOf[Ptr[CDouble]]
+          var i = 0
+          while (i < count) {
+            !(output + i) = !(inputA + i) + !(inputB + i)
+            i += 1
+          }
+
+        case DataType.Int32 =>
+          val inputA = memory(op.inputs(0)).asInstanceOf[Ptr[CInt]]
+          val inputB = memory(op.inputs(1)).asInstanceOf[Ptr[CInt]]
+          val output = memory(op.outputs.head).asInstanceOf[Ptr[CInt]]
+          var i = 0
+          while (i < count) {
+            !(output + i) = !(inputA + i) + !(inputB + i)
+            i += 1
+          }
+
+        case DataType.Int64 =>
+          val inputA = memory(op.inputs(0)).asInstanceOf[Ptr[CLongLong]]
+          val inputB = memory(op.inputs(1)).asInstanceOf[Ptr[CLongLong]]
+          val output = memory(op.outputs.head).asInstanceOf[Ptr[CLongLong]]
+          var i = 0
+          while (i < count) {
+            !(output + i) = !(inputA + i) + !(inputB + i)
+            i += 1
+          }
+
+        case unsupported =>
+          throw new NotImplementedError(
+            s"Add operation not implemented for data type: $unsupported",
+          )
+      }
+    } else {
+      // Broadcasting path: different shapes
+      val outputCount = outputShape.product
+
+      // Pre-calculate strides once (since validation confirmed compatibility, we know this will work)
+      val stridesA = calculateStrides(shapeA)
+      val stridesB = calculateStrides(shapeB)
+      val outputStrides = calculateStrides(outputShape)
+
+      dataType match {
+        case DataType.Float32 =>
+          val inputA = memory(op.inputs(0)).asInstanceOf[Ptr[CFloat]]
+          val inputB = memory(op.inputs(1)).asInstanceOf[Ptr[CFloat]]
+          val output = memory(op.outputs.head).asInstanceOf[Ptr[CFloat]]
+
+          var i = 0
+          while (i < outputCount) {
+            val idxA = calculateBroadcastIndex(i, outputShape, shapeA, outputStrides, stridesA)
+            val idxB = calculateBroadcastIndex(i, outputShape, shapeB, outputStrides, stridesB)
+            !(output + i) = !(inputA + idxA) + !(inputB + idxB)
+            i += 1
+          }
+
+        case DataType.Float64 =>
+          val inputA = memory(op.inputs(0)).asInstanceOf[Ptr[CDouble]]
+          val inputB = memory(op.inputs(1)).asInstanceOf[Ptr[CDouble]]
+          val output = memory(op.outputs.head).asInstanceOf[Ptr[CDouble]]
+
+          var i = 0
+          while (i < outputCount) {
+            val idxA = calculateBroadcastIndex(i, outputShape, shapeA, outputStrides, stridesA)
+            val idxB = calculateBroadcastIndex(i, outputShape, shapeB, outputStrides, stridesB)
+            !(output + i) = !(inputA + idxA) + !(inputB + idxB)
+            i += 1
+          }
+
+        case unsupported =>
+          throw new NotImplementedError(
+            s"Broadcast Add operation not implemented for data type: $unsupported",
+          )
+      }
+    }
+  }
+
+  /** Handles element-wise multiplication for both Float32 and Float64 tensors. */
+  private def handleMul(op: Operation.Mul, memory: MemoryMap, model: ModelIR): IO[Unit] = IO {
+    val count = model.allocations(op.outputs.head).shape.product
+    val inputAAlloc = model.allocations(op.inputs(0))
+
+    inputAAlloc.dataType match {
+      case DataType.Float32 =>
+        val inputA = memory(op.inputs(0)).asInstanceOf[Ptr[CFloat]]
+        val inputB = memory(op.inputs(1)).asInstanceOf[Ptr[CFloat]]
+        val output = memory(op.outputs.head).asInstanceOf[Ptr[CFloat]]
+
+        var i = 0
+        while (i < count) {
+          !(output + i) = !(inputA + i) * !(inputB + i)
+          i += 1
+        }
+
+      case DataType.Float64 =>
+        val inputA = memory(op.inputs(0)).asInstanceOf[Ptr[CDouble]]
+        val inputB = memory(op.inputs(1)).asInstanceOf[Ptr[CDouble]]
+        val output = memory(op.outputs.head).asInstanceOf[Ptr[CDouble]]
+
+        var i = 0
+        while (i < count) {
+          !(output + i) = !(inputA + i) * !(inputB + i)
+          i += 1
+        }
+
+      case DataType.Int32 =>
+        val inputA = memory(op.inputs(0)).asInstanceOf[Ptr[CInt]]
+        val inputB = memory(op.inputs(1)).asInstanceOf[Ptr[CInt]]
+        val output = memory(op.outputs.head).asInstanceOf[Ptr[CInt]]
+
+        var i = 0
+        while (i < count) {
+          !(output + i) = !(inputA + i) * !(inputB + i)
+          i += 1
+        }
+
+      case DataType.Int64 =>
+        val inputA = memory(op.inputs(0)).asInstanceOf[Ptr[CLongLong]]
+        val inputB = memory(op.inputs(1)).asInstanceOf[Ptr[CLongLong]]
+        val output = memory(op.outputs.head).asInstanceOf[Ptr[CLongLong]]
+
+        var i = 0
+        while (i < count) {
+          !(output + i) = !(inputA + i) * !(inputB + i)
+          i += 1
+        }
+
+      case unsupported =>
+        throw new NotImplementedError(s"Mul operation not implemented for data type: $unsupported")
+    }
+  }
+
+  /** Handles casting between supported data types (Float32 <-> Float64). */
+  private def handleCast(op: Operation.Cast, memory: MemoryMap, model: ModelIR): IO[Unit] = IO {
+    val inputAlloc = model.allocations(op.input)
+    val outputAlloc = model.allocations(op.output)
+    val count = inputAlloc.shape.product
+    val inputPtr = memory(op.input)
+    val outputPtr = memory(op.output)
+
+    (inputAlloc.dataType, outputAlloc.dataType) match {
+      case (from, to) if from == to =>
+        ()
+
+      case (DataType.Float64, DataType.Float32) =>
+        val in = inputPtr.asInstanceOf[Ptr[CDouble]]
+        val out = outputPtr.asInstanceOf[Ptr[CFloat]]
+        var i = 0
+        while (i < count) {
+          !(out + i) = (!(in + i)).toFloat
+          i += 1
+        }
+
+      case (DataType.Float32, DataType.Float64) =>
+        val in = inputPtr.asInstanceOf[Ptr[CFloat]]
+        val out = outputPtr.asInstanceOf[Ptr[CDouble]]
+        var i = 0
+        while (i < count) {
+          !(out + i) = (!(in + i)).toDouble
+          i += 1
+        }
+
+      case (from, to) =>
+        // Unreachable due to pre-validation.
+        throw new IllegalStateException(s"Unvalidated cast from $from to $to encountered.")
+    }
+  }
+
+  /** Handles the SVMClassifier operation by constructing a native LibSvm model, performing the
+    * prediction, and writing the results to the output tensors.
+    */
+  /** Handles the SVMClassifier operation using the C++ wrapper for robust LibSVM integration. This
+    * approach eliminates struct layout issues and provides ONNX-compliant per-class scores.
+    */
+  private def handleSvmClassifier(
+      op: Operation.SVMClassifier,
+      memory: MemoryMap,
+      model: ModelIR,
+  ): Resource[IO, IO[Unit]] = {
+    val numFeatures = model.allocations(op.input).shape.last
+
+    // Create SVM model using C++ wrapper with proper resource management
+    createSvmModelResource(op, numFeatures).map { svmModel =>
+      IO {
+        // Get input and output pointers from memory map
+        val inputPtr = memory(op.input).asInstanceOf[Ptr[CDouble]]
+        val scoresPtr = memory(op.outputScores).asInstanceOf[Ptr[CDouble]]
+        val labelPtr = memory(op.outputLabel).asInstanceOf[Ptr[CInt]]
+
+        // Single function call - all complexity handled in C++
+        val predictedLabel = svm_predict_with_scores(
+          svmModel,
+          inputPtr,
+          numFeatures,
+          scoresPtr,
+        )
+
+        // Write back the predicted label
+        !labelPtr = predictedLabel
+
+        ()
+      }
+    }
+  }
+
+  /** Creates an SVM model using the C++ wrapper functions with proper resource management. All
+    * memory allocation and model construction is handled in C for maximum reliability.
+    */
+  private def createSvmModelResource(
+      op: Operation.SVMClassifier,
+      numFeatures: Int,
+  ): Resource[IO, Ptr[Byte]] = {
+    val nrClass = op.classLabels.size
+    val numSupportVectors = op.vectorsPerClass.sum.toInt
+
+    for {
+      // Create SVM parameter using C++ wrapper
+      param <- createSvmParameterResource(op)
+
+      // Create managed arrays for model data
+      supportVectorsPtr <- createManagedDoubleArray(op.supportVectors)
+      coefficientsPtr <- createManagedDoubleArray(op.coefficients)
+      rhoPtr <- createManagedDoubleArray(op.rho.toArray)
+      classLabelsPtr <- createManagedIntArray(op.classLabels.map(_.toInt).toArray)
+      vectorsPerClassPtr <- createManagedIntArray(op.vectorsPerClass.map(_.toInt).toArray)
+
+      // Create the SVM model using C++ wrapper with LibSVM's native cleanup
+      svmModel <- Resource.make(IO {
+        create_svm_model(
+          param,
+          nrClass,
+          numSupportVectors,
+          supportVectorsPtr,
+          numFeatures,
+          coefficientsPtr,
+          rhoPtr,
+          classLabelsPtr,
+          vectorsPerClassPtr,
+        )
+      })(model =>
+        IO {
+          // Use LibSVM's native cleanup function
+          val modelPtrPtr = stdlib.malloc(sizeof[Ptr[Byte]]).asInstanceOf[Ptr[Ptr[Byte]]]
+          !modelPtrPtr = model
+          svm_free_and_destroy_model(modelPtrPtr)
+          stdlib.free(modelPtrPtr.asInstanceOf[Ptr[Byte]])
+        },
+      )
+
+    } yield svmModel
+  }
+
+  /** Creates SVM parameter using C++ wrapper with proper resource management. */
+  private def createSvmParameterResource(op: Operation.SVMClassifier): Resource[IO, Ptr[Byte]] = {
+    val kernelType = op.kernelType match {
+      case SVMKernel.Linear => 0
+      case SVMKernel.Poly => 1
+      case SVMKernel.Rbf => 2
+      case SVMKernel.Sigmoid => 3
+    }
+
+    val gamma = op.kernelParams.headOption.getOrElse(0.0)
+    val coef0 = op.kernelParams.drop(1).headOption.getOrElse(0.0)
+    val degree = op.kernelParams.drop(2).headOption.map(_.toInt).getOrElse(3)
+
+    Resource.make(IO {
+      create_svm_param(
+        svm_type = 0, // C_SVC
+        kernel_type = kernelType,
+        degree = degree,
+        gamma = gamma,
+        coef0 = coef0,
+      )
+    })(param => IO(stdlib.free(param)))
+  }
+
+  /** Helper to create managed double array for C++ wrapper calls. */
+  private def createManagedDoubleArray(values: Array[Double]): Resource[IO, Ptr[CDouble]] =
+    Resource.make(IO {
+      val ptr = stdlib
+        .malloc(sizeof[CDouble] * values.length.toUSize)
+        .asInstanceOf[Ptr[CDouble]]
+      if (ptr == null) throw new OutOfMemoryError(s"Failed to allocate ${values.length} doubles")
+
+      for (i <- values.indices)
+        ptr(i) = values(i)
+      ptr
+    })(ptr => IO(stdlib.free(ptr.asInstanceOf[Ptr[Byte]])))
+
+  /** Helper to create managed int array for C++ wrapper calls. */
+  private def createManagedIntArray(values: Array[Int]): Resource[IO, Ptr[CInt]] =
+    Resource.make(IO {
+      val ptr = stdlib
+        .malloc(sizeof[CInt] * values.length.toUSize)
+        .asInstanceOf[Ptr[CInt]]
+      if (ptr == null) throw new OutOfMemoryError(s"Failed to allocate ${values.length} ints")
+
+      for (i <- values.indices)
+        ptr(i) = values(i)
+      ptr
+    })(ptr => IO(stdlib.free(ptr.asInstanceOf[Ptr[Byte]])))
+
+  /** Creates empty Scala arrays for each graph output. These arrays will be pointed to by the
+    * `memoryResource` and written to directly from native code.
+    */
+  private def createOutputArrays(model: ModelIR): Map[String, Array[_]] =
+    model.graphOutputs.map { name =>
+      val allocation = model.allocations(name)
+      val size = allocation.shape.product
+      val array: Array[_] = allocation.dataType match {
+        case DataType.Float32 => new Array[Float](size)
+        case DataType.Int32 => new Array[Int](size)
+        case DataType.Float64 => new Array[Double](size)
+        case DataType.Int64 => new Array[Long](size)
+        case other => throw new Exception(s"Unsupported output data type: $other")
+      }
+      name -> array
+    }.toMap
+
+  /** Handles ReLU activation function: output = max(0, input) Simple element-wise operation with
+    * good performance in Scala.
+    */
+  private def handleRelu(op: Operation.Relu, memory: MemoryMap, model: ModelIR): IO[Unit] = IO {
+    val count = model.allocations(op.output).shape.product
+    val inputAlloc = model.allocations(op.input)
+
+    inputAlloc.dataType match {
+      case DataType.Float32 =>
+        val input = memory(op.input).asInstanceOf[Ptr[CFloat]]
+        val output = memory(op.output).asInstanceOf[Ptr[CFloat]]
+        var i = 0
+        while (i < count) {
+          val value = !(input + i)
+          !(output + i) = if (value > 0.0f) value else 0.0f
+          i += 1
+        }
+
+      case DataType.Float64 =>
+        val input = memory(op.input).asInstanceOf[Ptr[CDouble]]
+        val output = memory(op.output).asInstanceOf[Ptr[CDouble]]
+        var i = 0
+        while (i < count) {
+          val value = !(input + i)
+          !(output + i) = if (value > 0.0) value else 0.0
+          i += 1
+        }
+
+      case unsupported =>
+        throw new NotImplementedError(
+          s"ReLU operation not implemented for data type: $unsupported",
+        ) // should never happen due to pre-validation
+    }
+  }
+
+  /** Handles Reshape operation: changes the shape of a tensor without modifying its data. This is a
+    * no-op in terms of data, but must ensure the output pointer is correctly set.
+    */
+  private def handleReshape(op: Operation.Reshape, memory: MemoryMap, model: ModelIR): IO[Unit] =
+    IO {
+      val inputAlloc = model.allocations(op.input)
+      val inputPtr = memory(op.input)
+      val outputPtr = memory(op.output)
+      val totalBytes = (inputAlloc.shape.product * inputAlloc.dataType.sizeInBytes).toUSize
+
+      // Copy data from input to output (same data, different shape interpretation)
+      memcpy(outputPtr, inputPtr, totalBytes)
+      ()
+    }
+  // BROADCASTING HELPER METHODS
+  /** Calculates the broadcasted output shape following numpy broadcasting rules. Returns None if
+    * shapes are incompatible for broadcasting.
+    */
+  private def calculateBroadcastShape(shapeA: List[Int], shapeB: List[Int]): Option[List[Int]] = {
+    val maxDims = math.max(shapeA.length, shapeB.length)
+
+    // Pad shapes with leading 1s
+    val paddedA = List.fill(maxDims - shapeA.length)(1) ++ shapeA
+    val paddedB = List.fill(maxDims - shapeB.length)(1) ++ shapeB
+
+    // Use traverse to validate and transform dimensions
+    paddedA.zip(paddedB).traverse { case (dimA, dimB) =>
+      if (dimA == dimB) {
+        Some(dimA)
+      } else if (dimA == 1) {
+        Some(dimB)
+      } else if (dimB == 1) {
+        Some(dimA)
+      } else {
+        None
+      }
+    }
+  }
+
+  private def handleConv(
+      op: Operation.Conv,
+      memory: MemoryMap,
+      model: ModelIR,
+  ): Resource[IO, IO[Unit]] = {
+
+    // All your existing shape/pointer extraction logic
+    val inputAlloc = model.allocations(op.input)
+    val weightAlloc = model.allocations(op.weight)
+
+    val inputShape = inputAlloc.shape
+    val weightShape = weightAlloc.shape
+    val (inputChannels, inputHeight, inputWidth) = (inputShape(1), inputShape(2), inputShape(3))
+    val (outputChannels, kernelHeight, kernelWidth) =
+      (weightShape(0), op.kernelShape(0), op.kernelShape(1))
+
+    val inputPtr = memory(op.input)
+    val weightPtr = memory(op.weight)
+    val outputPtr = memory(op.outputs.head)
+    val biasPtrOpt = op.bias.map(b => memory(b))
+
+    val autoPadValue = op.autoPad match {
+      case AutoPad.SameUpper => 1
+      case AutoPad.Valid => 0
+      case AutoPad.NotSet if op.pads.forall(_ == 0) => 0
+      case other => throw new IllegalArgumentException(s"Unsupported autoPad in handleConv: $other")
+    }
+
+    val useBias = if (biasPtrOpt.isDefined) 1 else 0
+
+    // Two-phase Resource pattern
+    inputAlloc.dataType match {
+      case DataType.Float32 =>
+        Resource
+          .make(IO {
+            // Phase 1: Initialize with all your extracted values
+            MLPack.initialise_conv_f(
+              outputChannels.toUSize,
+              kernelHeight.toUSize,
+              kernelWidth.toUSize,
+              op.strides(0).toUSize,
+              op.strides(1).toUSize,
+              autoPadValue,
+              useBias,
+              inputHeight.toUSize,
+              inputWidth.toUSize,
+              inputChannels.toUSize,
+              weightPtr.asInstanceOf[Ptr[Float]],
+              biasPtrOpt.getOrElse(null).asInstanceOf[Ptr[Float]],
+              inputPtr.asInstanceOf[Ptr[Float]],
+              outputPtr.asInstanceOf[Ptr[Float]],
+            )
+          })(handle =>
+            IO {
+              // Phase 3: Cleanup
+              MLPack.cleanup_conv_f(handle)
+            },
+          )
+          .map { handle =>
+            // Phase 2: Execute
+            IO(MLPack.execute_conv_f(handle))
+          }
+
+      case DataType.Float64 =>
+        Resource
+          .make(IO {
+            MLPack.initialise_conv_d(
+              outputChannels.toUSize,
+              kernelHeight.toUSize,
+              kernelWidth.toUSize,
+              op.strides(0).toUSize,
+              op.strides(1).toUSize,
+              autoPadValue,
+              useBias,
+              inputHeight.toUSize,
+              inputWidth.toUSize,
+              inputChannels.toUSize,
+              weightPtr.asInstanceOf[Ptr[Double]],
+              biasPtrOpt.getOrElse(null).asInstanceOf[Ptr[Double]],
+              inputPtr.asInstanceOf[Ptr[Double]],
+              outputPtr.asInstanceOf[Ptr[Double]],
+            )
+          })(handle =>
+            IO {
+              MLPack.cleanup_conv_d(handle)
+            },
+          )
+          .map { handle =>
+            IO(MLPack.execute_conv_d(handle))
+          }
+
+      case other =>
+        throw new NotImplementedError(s"Conv input data type $other not supported.")
+    }
+  }
+  private def handleMaxPool(
+      op: Operation.MaxPool,
+      memory: MemoryMap,
+      model: ModelIR,
+  ): Resource[IO, IO[Unit]] = {
+
+    // All your existing extraction logic
+    val inputAlloc = model.allocations(op.input)
+    val inputShape = inputAlloc.shape
+
+    val (inputChannels, inputHeight, inputWidth) = (inputShape(1), inputShape(2), inputShape(3))
+    val (kernelHeight, kernelWidth) = (op.kernelShape(0), op.kernelShape(1))
+
+    val inputPtr = memory(op.input)
+    val outputPtr = memory(op.outputs.head)
+
+    // Two-phase Resource pattern
+    inputAlloc.dataType match {
+      case DataType.Float32 =>
+        Resource
+          .make(IO {
+            // Phase 1: Initialize with all your extracted values
+            MLPack.initialise_pool_f(
+              kernelHeight.toUSize,
+              kernelWidth.toUSize,
+              op.strides(0).toUSize,
+              op.strides(1).toUSize,
+              inputHeight.toUSize,
+              inputWidth.toUSize,
+              inputChannels.toUSize,
+              inputPtr.asInstanceOf[Ptr[Float]],
+              outputPtr.asInstanceOf[Ptr[Float]],
+            )
+          })(handle =>
+            IO {
+              // Phase 3: Cleanup
+              MLPack.cleanup_pool_f(handle)
+            },
+          )
+          .map { handle =>
+            // Phase 2: Execute
+            IO(MLPack.execute_pool_f(handle))
+          }
+
+      case DataType.Float64 =>
+        Resource
+          .make(IO {
+            MLPack.initialise_pool_d(
+              kernelHeight.toUSize,
+              kernelWidth.toUSize,
+              op.strides(0).toUSize,
+              op.strides(1).toUSize,
+              inputHeight.toUSize,
+              inputWidth.toUSize,
+              inputChannels.toUSize,
+              inputPtr.asInstanceOf[Ptr[Double]],
+              outputPtr.asInstanceOf[Ptr[Double]],
+            )
+          })(handle =>
+            IO {
+              MLPack.cleanup_pool_d(handle)
+            },
+          )
+          .map { handle =>
+            IO(MLPack.execute_pool_d(handle))
+          }
+
+      case other =>
+        throw new NotImplementedError(s"MaxPool input data type $other not supported.")
+    }
+  }
+
+  /** Handles Matrix Multiplication using OpenBLAS CBLAS functions. Performs C = A * B where A is
+    * [M, K], B is [K, N], and C is [M, N].
+    */
+  private def handleMatMul(op: Operation.MatMul, memory: MemoryMap, model: ModelIR): IO[Unit] = IO {
+    val inputAAlloc = model.allocations(op.inputA)
+    val inputBAlloc = model.allocations(op.inputB)
+
+    val shapeA = inputAAlloc.shape
+    val shapeB = inputBAlloc.shape
+
+    val M = shapeA(0) // rows of A and C
+    val K = shapeA(1) // cols of A, rows of B
+    val N = shapeB(1) // cols of B and C
+
+    val inputAPtr = memory(op.inputA)
+    val inputBPtr = memory(op.inputB)
+    val outputPtr = memory(op.output)
+
+    inputAAlloc.dataType match {
+      case DataType.Float32 =>
+        cblas_sgemm(
+          layout = CblasRowMajor,
+          transA = CblasNoTrans,
+          transB = CblasNoTrans,
+          M = M,
+          N = N,
+          K = K,
+          alpha = 1.0f,
+          A = inputAPtr.asInstanceOf[Ptr[CFloat]],
+          lda = K, // leading dimension of A (number of columns)
+          B = inputBPtr.asInstanceOf[Ptr[CFloat]],
+          ldb = N, // leading dimension of B
+          beta = 0.0f,
+          C = outputPtr.asInstanceOf[Ptr[CFloat]],
+          ldc = N, // leading dimension of C
+        )
+
+      case DataType.Float64 =>
+        cblas_dgemm(
+          layout = CblasRowMajor,
+          transA = CblasNoTrans,
+          transB = CblasNoTrans,
+          M = M,
+          N = N,
+          K = K,
+          alpha = 1.0,
+          A = inputAPtr.asInstanceOf[Ptr[CDouble]],
+          lda = K,
+          B = inputBPtr.asInstanceOf[Ptr[CDouble]],
+          ldb = N,
+          beta = 0.0,
+          C = outputPtr.asInstanceOf[Ptr[CDouble]],
+          ldc = N,
+        )
+
+      case unsupported =>
+        // This should never happen due to pre-validation
+        throw new IllegalStateException(s"Unvalidated MatMul data type: $unsupported")
+    }
+  }
+  private def handleSoftmax(op: Operation.Softmax, memory: MemoryMap, model: ModelIR): IO[Unit] =
+    IO {
+      val inputAlloc = model.allocations(op.input)
+      val inputSize = inputAlloc.shape.product // Total number of elements
+
+      val inputPtr = memory(op.input)
+      val outputPtr = memory(op.output)
+
+      inputAlloc.dataType match {
+        case DataType.Float32 =>
+          MLPack.F_perform_softmax_direct(
+            inputPtr.asInstanceOf[Ptr[CFloat]],
+            inputSize.toUSize,
+            outputPtr.asInstanceOf[Ptr[CFloat]],
+          )
+
+        case DataType.Float64 =>
+          MLPack.perform_softmax_direct(
+            inputPtr.asInstanceOf[Ptr[CDouble]],
+            inputSize.toUSize, // Assuming size fits in Int for the Double version
+            outputPtr.asInstanceOf[Ptr[CDouble]],
+          )
+
+        case other =>
+          throw new NotImplementedError(s"Softmax input data type $other not supported.")
+      }
+    }
+
+  /** Calculate row-major strides for a given shape */
+  private def calculateStrides(shape: List[Int]): List[Int] = {
+    val strides = Array.fill(shape.length)(1)
+    for (i <- shape.length - 2 to 0 by -1)
+      strides(i) = strides(i + 1) * shape(i + 1)
+    strides.toList
+  }
+
+  /** Calculate input index from output linear index considering broadcasting with pre-computed
+    * strides
+    */
+  private def calculateBroadcastIndex(
+      linearIndex: Int,
+      outputShape: List[Int],
+      inputShape: List[Int],
+      outputStrides: List[Int],
+      inputStrides: List[Int],
+  ): Int = {
+    // Pad input shape with leading 1s
+    val paddedInput = List.fill(outputShape.length - inputShape.length)(1) ++ inputShape
+    val paddedInputStrides = List.fill(outputShape.length - inputStrides.length)(0) ++ inputStrides
+
+    var remaining = linearIndex
+    var inputIndex = 0
+
+    for (dim <- outputShape.indices) {
+      val coord = remaining / outputStrides(dim)
+      remaining = remaining % outputStrides(dim)
+
+      // If input dimension is 1, it's broadcasted (use coordinate 0)
+      val inputCoord = if (paddedInput(dim) == 1) 0 else coord
+      inputIndex += inputCoord * paddedInputStrides(dim)
+    }
+
+    inputIndex
+  }
+}
diff --git a/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/InterpreterUtils.scala b/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/InterpreterUtils.scala
new file mode 100644
index 0000000..df2ab0a
--- /dev/null
+++ b/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/InterpreterUtils.scala
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2023 Arman Bilge
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.armanbilge.vilcacora.runtime
+
+import cats.effect.{IO, Resource}
+import com.armanbilge.vilcacora.ir._
+import org.typelevel.keypool.KeyPool
+import scala.concurrent.duration._
+
+/** Utility functions for managing inference sessions and input buffers */
+object InterpreterUtils {
+
+  /** Safely copy data from one array to another using System.arraycopy wrapped in IO */
+  def copyArrayToBuffer[A](
+      source: Array[A],
+      sourcePos: Int,
+      dest: Array[A],
+      destPos: Int,
+      length: Int,
+  ): IO[Unit] =
+    IO {
+      if (source == null || dest == null)
+        throw new NullPointerException("Source and destination arrays must not be null")
+      if (sourcePos < 0 || destPos < 0 || length < 0)
+        throw new IllegalArgumentException("Positions and length must be non-negative")
+      if (sourcePos + length > source.length)
+        throw new IndexOutOfBoundsException("Source range out of bounds")
+      if (destPos + length > dest.length)
+        throw new IndexOutOfBoundsException("Destination range out of bounds")
+      System.arraycopy(source, sourcePos, dest, destPos, length)
+    }
+
+  /** Creates a zero-filled buffer matching given shape and data type */
+  def createBuffer(shape: List[Int], dataType: DataType): Array[_] = {
+    val size = shape.product
+    dataType match {
+      case DataType.Float32 => new Array[Float](size)
+      case DataType.Float64 => new Array[Double](size)
+      case DataType.Int32 => new Array[Int](size)
+      case DataType.Int64 => new Array[Long](size)
+      case other =>
+        throw new IllegalArgumentException(s"Unsupported data type: $other")
+    }
+  }
+
+  /** Creates input buffers for all graph inputs of a model */
+  def createInputBuffers(modelIR: ModelIR): Map[String, Array[_]] =
+    modelIR.graphInputs.map { name =>
+      val alloc = modelIR.allocations(name)
+      name -> createBuffer(alloc.shape, alloc.dataType)
+    }.toMap
+
+  /** Wrapper representing an inference session */
+  final case class InferenceSession(
+      inputs: Map[String, Array[_]],
+      runInference: IO[Map[String, Array[_]]],
+  )
+
+  /** Internal wrapper used by the pool to manage resource lifecycle. Only `.session` should be used
+    * from this — the `release` is handled automatically by the pool.
+    */
+  final case class ManagedInferenceSession(
+      session: InferenceSession,
+      release: IO[Unit],
+  )
+
+  /** A resource managing a single inference session lifecycle */
+  def inferenceSessionResource(
+      modelIR: ModelIR,
+      inputs: Map[String, Array[_]],
+  ): Resource[IO, InferenceSession] =
+    for {
+      run <- Interpreter.execute(modelIR, inputs)
+    } yield InferenceSession(inputs, run)
+
+  /** Creates a KeyPool for managing concurrent inference sessions.
+    *
+    * @param modelIR
+    *   The translated model IR
+    * @param inputFactory
+    *   Optional factory for creating input buffers for each inference session. If not provided,
+    *   input buffers will be automatically allocated based on the model's graph input definitions.
+    * @param maxTotal
+    *   Maximum total concurrent sessions
+    * @param maxPerKey
+    *   Maximum sessions per key (default is Int.MaxValue)
+    * @param idleTimeout
+    *   Duration after which idle sessions are removed (default infinite)
+    * @return
+    *   Resource managing the inference session KeyPool
+    */
+  def inferenceSessionPool(
+      modelIR: ModelIR,
+      inputFactory: Option[() => Map[String, Array[_]]] = None,
+      maxTotal: Int = 4,
+      maxPerKey: Int = Int.MaxValue,
+      idleTimeout: Duration = Duration.Inf,
+  ): Resource[IO, KeyPool[IO, Unit, ManagedInferenceSession]] = {
+    val factory = inputFactory.getOrElse(() => createInputBuffers(modelIR))
+    KeyPool
+      .Builder[IO, Unit, ManagedInferenceSession](
+        create = (_: Unit) => {
+          val inputs = factory()
+          inferenceSessionResource(modelIR, inputs).allocated
+            .map { case (session, release) =>
+              ManagedInferenceSession(session, release)
+            }
+        },
+        destroy = (managed: ManagedInferenceSession) => managed.release,
+      )
+      .withMaxPerKey(_ => maxPerKey)
+      .withMaxTotal(maxTotal)
+      .withIdleTimeAllowedInPool(idleTimeout)
+      .build
+  }
+}
diff --git a/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/LibSvm.scala b/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/LibSvm.scala
new file mode 100644
index 0000000..7e4dfc6
--- /dev/null
+++ b/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/LibSvm.scala
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2023 Arman Bilge
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.armanbilge.vilcacora.runtime
+
+import scala.scalanative.unsafe._
+
+/** A safe, idiomatic Scala API for the svm_wrapper.cpp
+  */
+
+// Scala Native bindings for the C wrapper functions
+@define("SVM_Wrapper")
+@link("svm")
+@extern
+object LibSVM {
+  // Model creation
+  def create_svm_param(
+      svm_type: CInt,
+      kernel_type: CInt,
+      degree: CInt,
+      gamma: CDouble,
+      coef0: CDouble,
+  ): Ptr[Byte] = extern
+
+  def create_svm_model(
+      param: Ptr[Byte],
+      nr_class: CInt,
+      l: CInt,
+      support_vectors: Ptr[CDouble],
+      num_features: CInt,
+      coefficients: Ptr[CDouble],
+      rho: Ptr[CDouble],
+      class_labels: Ptr[CInt],
+      n_sv_per_class: Ptr[CInt],
+  ): Ptr[Byte] = extern
+
+  // Prediction
+  def svm_predict_with_scores(
+      model: Ptr[Byte],
+      features: Ptr[CDouble],
+      num_features: CInt,
+      class_scores: Ptr[CDouble],
+  ): CInt = extern
+
+  // Debug function
+  def debug_model_info(model: Ptr[Byte]): Unit = extern
+
+  // Use LibSVM's native cleanup function
+  def svm_free_and_destroy_model(model_ptr_ptr: Ptr[Ptr[Byte]]): Unit = extern
+}
diff --git a/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/MLPACK.scala b/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/MLPACK.scala
new file mode 100644
index 0000000..345dc44
--- /dev/null
+++ b/runtime/src/main/scala/com/armanbilge/vilcacora/runtime/MLPACK.scala
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2023 Arman Bilge
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.armanbilge.vilcacora.runtime
+
+import scala.scalanative.unsafe._
+
+/** Scala Native bindings for MLPack C++ wrapper functions Two-phase operations: initialization and
+  * execution
+  */
+@define("MLPACK_Wrapper")
+@linkCppRuntime
+@extern
+object MLPack {
+
+  /* opaque handle types */
+  type ConvHandleF = Ptr[Byte]
+  type ConvHandleD = Ptr[Byte]
+  type PoolHandleF = Ptr[Byte]
+  type PoolHandleD = Ptr[Byte]
+  type SoftmaxF = Ptr[Byte]
+  type SoftmaxD = Ptr[Byte]
+
+  /* ---- convolution ---- */
+  def initialise_conv_f(
+      outMaps: CSize,
+      kH: CSize,
+      kW: CSize,
+      sH: CSize,
+      sW: CSize,
+      autoPad: CInt,
+      useBias: CInt,
+      inH: CSize,
+      inW: CSize,
+      inC: CSize,
+      weight: Ptr[Float],
+      bias: Ptr[Float],
+      inputPtr: Ptr[Float],
+      outputPtr: Ptr[Float],
+  ): ConvHandleF = extern
+
+  def execute_conv_f(h: ConvHandleF): Unit = extern
+  def cleanup_conv_f(h: ConvHandleF): Unit = extern
+
+  def initialise_conv_d(
+      outMaps: CSize,
+      kH: CSize,
+      kW: CSize,
+      sH: CSize,
+      sW: CSize,
+      autoPad: CInt,
+      useBias: CInt,
+      inH: CSize,
+      inW: CSize,
+      inC: CSize,
+      weight: Ptr[Double],
+      bias: Ptr[Double],
+      inputPtr: Ptr[Double],
+      outputPtr: Ptr[Double],
+  ): ConvHandleD = extern
+
+  def execute_conv_d(h: ConvHandleD): Unit = extern
+  def cleanup_conv_d(h: ConvHandleD): Unit = extern
+
+  /* ---- max-pool ---- */
+  def initialise_pool_f(
+      kH: CSize,
+      kW: CSize,
+      sH: CSize,
+      sW: CSize,
+      inH: CSize,
+      inW: CSize,
+      inC: CSize,
+      inputPtr: Ptr[Float],
+      outputPtr: Ptr[Float],
+  ): PoolHandleF = extern
+  def execute_pool_f(h: PoolHandleF): Unit = extern
+  def cleanup_pool_f(h: PoolHandleF): Unit = extern
+
+  def initialise_pool_d(
+      kH: CSize,
+      kW: CSize,
+      sH: CSize,
+      sW: CSize,
+      inH: CSize,
+      inW: CSize,
+      inC: CSize,
+      inputPtr: Ptr[Double],
+      outputPtr: Ptr[Double],
+  ): PoolHandleD = extern
+  def execute_pool_d(h: PoolHandleD): Unit = extern
+  def cleanup_pool_d(h: PoolHandleD): Unit = extern
+
+  /* ---- softmax ---- */
+  def F_perform_softmax_direct(
+      input_ptr: Ptr[CFloat],
+      input_size: CSize,
+      output_ptr: Ptr[CFloat], // Same size as input
+  ): Unit = extern
+  def perform_softmax_direct(
+      input_ptr: Ptr[Double],
+      input_size: CSize,
+      output_ptr: Ptr[Double], // Same size as input
+  ): Unit = extern
+}
diff --git a/runtime/src/test/scala/com/armanbilge/vilcacora/runtime/InterpreterSuite.scala b/runtime/src/test/scala/com/armanbilge/vilcacora/runtime/InterpreterSuite.scala
new file mode 100644
index 0000000..3b8bd5d
--- /dev/null
+++ b/runtime/src/test/scala/com/armanbilge/vilcacora/runtime/InterpreterSuite.scala
@@ -0,0 +1,576 @@
+/*
+ * Copyright 2023 Arman Bilge
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.armanbilge.vilcacora.runtime
+
+import cats.effect.unsafe.implicits.global
+import com.armanbilge.vilcacora.ir._
+import munit.FunSuite
+
+class InterpreterSuite extends FunSuite {
+
+  /** Test the Add operation with Float32 tensors */
+  test("Add operation should perform element-wise addition on Float32 tensors") {
+    val inputA = Array(1.0f, 2.0f, 3.0f, 4.0f)
+    val inputB = Array(5.0f, 6.0f, 7.0f, 8.0f)
+
+    val model = ModelIR(
+      name = "add_test",
+      operations = List(
+        Operation.Add("input_a", "input_b", "output"),
+      ),
+      allocations = Map(
+        "input_a" -> Allocation("input_a", DataType.Float32, List(4)),
+        "input_b" -> Allocation("input_b", DataType.Float32, List(4)),
+        "output" -> Allocation("output", DataType.Float32, List(4)),
+      ),
+      graphInputs = List("input_a", "input_b"),
+      graphOutputs = List("output"),
+    )
+
+    val inputs = Map(
+      "input_a" -> inputA,
+      "input_b" -> inputB,
+    )
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("output").asInstanceOf[Array[Float]]
+    val expected = Array(6.0f, 8.0f, 10.0f, 12.0f)
+
+    assertEquals(output.toSeq, expected.toSeq)
+  }
+
+  /** Test the Add operation with broadcasting */
+  test("Add operation with broadcasting should add a vector to each row of a matrix") {
+    val matrix = Array(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f) // Shape [2, 3]
+    val vector = Array(10.0f, 20.0f, 30.0f) // Shape [3]
+
+    val model = ModelIR(
+      name = "add_broadcast_test",
+      operations = List(
+        Operation.Add("matrix", "vector", "output"),
+      ),
+      allocations = Map(
+        "matrix" -> Allocation("matrix", DataType.Float32, List(2, 3)),
+        "vector" -> Allocation("vector", DataType.Float32, List(3)),
+        "output" -> Allocation("output", DataType.Float32, List(2, 3)),
+      ),
+      graphInputs = List("matrix", "vector"),
+      graphOutputs = List("output"),
+    )
+
+    val inputs = Map(
+      "matrix" -> matrix,
+      "vector" -> vector,
+    )
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("output").asInstanceOf[Array[Float]]
+    val expected = Array(11.0f, 22.0f, 33.0f, 14.0f, 25.0f, 36.0f)
+
+    assertEquals(output.toSeq, expected.toSeq)
+  }
+
+  /** Test the Mul operation with Float32 tensors */
+  test("Mul operation should perform element-wise multiplication on Float32 tensors") {
+    val inputA = Array(2.0f, 3.0f, 4.0f, 5.0f)
+    val inputB = Array(1.5f, 2.0f, 2.5f, 3.0f)
+
+    val model = ModelIR(
+      name = "mul_test",
+      operations = List(
+        Operation.Mul("input_a", "input_b", "output"),
+      ),
+      allocations = Map(
+        "input_a" -> Allocation("input_a", DataType.Float32, List(4)),
+        "input_b" -> Allocation("input_b", DataType.Float32, List(4)),
+        "output" -> Allocation("output", DataType.Float32, List(4)),
+      ),
+      graphInputs = List("input_a", "input_b"),
+      graphOutputs = List("output"),
+    )
+
+    val inputs = Map(
+      "input_a" -> inputA,
+      "input_b" -> inputB,
+    )
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("output").asInstanceOf[Array[Float]]
+    val expected = Array(3.0f, 6.0f, 10.0f, 15.0f)
+
+    assertEquals(output.toSeq, expected.toSeq)
+  }
+
+  /** Test Cast operation from Float64 to Float32 */
+  test("Cast operation should convert Float64 to Float32 with appropriate precision loss") {
+    val input = Array(1.123456789, 2.987654321, 3.141592653)
+
+    val model = ModelIR(
+      name = "cast_f64_to_f32_test",
+      operations = List(
+        Operation.Cast("input", "output", DataType.Float32),
+      ),
+      allocations = Map(
+        "input" -> Allocation("input", DataType.Float64, List(3)),
+        "output" -> Allocation("output", DataType.Float32, List(3)),
+      ),
+      graphInputs = List("input"),
+      graphOutputs = List("output"),
+    )
+
+    val inputs = Map("input" -> input)
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("output").asInstanceOf[Array[Float]]
+
+    // Check that values are approximately correct within Float32 precision
+    assertEqualsFloat(output(0), 1.123456789f, 1e-6f)
+    assertEqualsFloat(output(1), 2.987654321f, 1e-6f)
+    assertEqualsFloat(output(2), 3.141592653f, 1e-6f)
+  }
+
+  /** Test Cast operation from Float32 to Float64 */
+  test("Cast operation should convert Float32 to Float64 without precision loss") {
+    val input = Array(1.5f, 2.75f, 3.25f)
+
+    val model = ModelIR(
+      name = "cast_f32_to_f64_test",
+      operations = List(
+        Operation.Cast("input", "output", DataType.Float64),
+      ),
+      allocations = Map(
+        "input" -> Allocation("input", DataType.Float32, List(3)),
+        "output" -> Allocation("output", DataType.Float64, List(3)),
+      ),
+      graphInputs = List("input"),
+      graphOutputs = List("output"),
+    )
+
+    val inputs = Map("input" -> input)
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("output").asInstanceOf[Array[Double]]
+    val expected = Array(1.5, 2.75, 3.25)
+
+    assertEquals(output.toSeq, expected.toSeq)
+  }
+
+  /** Test ReLU operation */
+  test("Relu operation should replace negative values with zero") {
+    val input = Array(-1.0f, 0.0f, 1.5f, -2.5f, 3.0f)
+
+    val model = ModelIR(
+      name = "relu_test",
+      operations = List(
+        Operation.Relu("input", "output"),
+      ),
+      allocations = Map(
+        "input" -> Allocation("input", DataType.Float32, List(5)),
+        "output" -> Allocation("output", DataType.Float32, List(5)),
+      ),
+      graphInputs = List("input"),
+      graphOutputs = List("output"),
+    )
+
+    val inputs = Map("input" -> input)
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("output").asInstanceOf[Array[Float]]
+    val expected = Array(0.0f, 0.0f, 1.5f, 0.0f, 3.0f)
+
+    assertEquals(output.toSeq, expected.toSeq)
+  }
+
+  /** Test Reshape operation */
+  test("Reshape operation should preserve data while changing logical shape") {
+    val input = Array(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f)
+
+    val model = ModelIR(
+      name = "reshape_test",
+      operations = List(
+        // The interpreter's handleReshape is a direct memory copy,
+        // so we just verify the data is preserved.
+        Operation.Reshape("input", "shape", "output"),
+      ),
+      allocations = Map(
+        "input" -> Allocation("input", DataType.Float32, List(6)),
+        "shape" -> Allocation(
+          "shape",
+          DataType.Int64,
+          List(2),
+          Some(Array[Byte](2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0)),
+        ), // Dummy shape tensor
+        "output" -> Allocation("output", DataType.Float32, List(2, 3)),
+      ),
+      graphInputs = List("input"),
+      graphOutputs = List("output"),
+    )
+
+    val inputs = Map("input" -> input)
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("output").asInstanceOf[Array[Float]]
+
+    // The output array is flat, so it should be identical to the input data.
+    assertEquals(output.toSeq, input.toSeq)
+  }
+
+  /** Test Conv (Convolution) operation */
+  test("Conv operation should perform 2D convolution correctly") {
+    // Input: 1 batch, 1 channel, 3x3 image
+    val input = Array(
+      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
+    )
+    // Weights: 1 output channel, 1 input channel, 2x2 kernel
+    val weight = Array(
+      1.0f,
+      1.0f,
+      1.0f,
+      1.0f,
+    )
+    // Bias: 1 bias per output channel
+    val bias = Array(0.5f)
+
+    val model = ModelIR(
+      name = "conv_test",
+      operations = List(
+        Operation.Conv(
+          input = "input",
+          weight = "weight",
+          bias = Some("bias"),
+          output = "output",
+          autoPad = AutoPad.NotSet,
+          dilations = List(1, 1),
+          group = 1,
+          kernelShape = List(2, 2),
+          pads = List(0, 0, 0, 0),
+          strides = List(1, 1),
+        ),
+      ),
+      allocations = Map(
+        "input" -> Allocation("input", DataType.Float32, List(1, 1, 3, 3)),
+        "weight" -> Allocation("weight", DataType.Float32, List(1, 1, 2, 2)),
+        "bias" -> Allocation("bias", DataType.Float32, List(1)),
+        "output" -> Allocation("output", DataType.Float32, List(1, 1, 2, 2)),
+      ),
+      graphInputs = List("input", "weight", "bias"),
+      graphOutputs = List("output"),
+    )
+
+    val inputs = Map(
+      "input" -> input,
+      "weight" -> weight,
+      "bias" -> bias,
+    )
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("output").asInstanceOf[Array[Float]]
+    val expected = Array(12.5f, 16.5f, 24.5f, 28.5f)
+
+    assertEquals(output.toSeq, expected.toSeq)
+  }
+
+  /** Test Identity Conv (1×1 kernel) should return input unchanged */
+  test("Identity convolution should produce identical output for 1×1 kernel") {
+    // Input: 1 batch, 1 channel, 3×3 image
+    val input = Array(
+      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
+    )
+
+    // Weights: 1 output channel, 1 input channel, 1×1 kernel with weight=1
+    val weight = Array(1.0f)
+
+    // No bias
+    val model = ModelIR(
+      name = "identity_conv_test",
+      operations = List(
+        Operation.Conv(
+          input = "input",
+          weight = "weight",
+          bias = None,
+          output = "output",
+          autoPad = AutoPad.NotSet, // VALID
+          dilations = List(1, 1),
+          group = 1,
+          kernelShape = List(1, 1),
+          pads = List(0, 0, 0, 0),
+          strides = List(1, 1),
+        ),
+      ),
+      allocations = Map(
+        "input" -> Allocation("input", DataType.Float32, List(1, 1, 3, 3)),
+        "weight" -> Allocation("weight", DataType.Float32, List(1, 1, 1, 1)),
+        "output" -> Allocation("output", DataType.Float32, List(1, 1, 3, 3)),
+      ),
+      graphInputs = List("input", "weight"),
+      graphOutputs = List("output"),
+    )
+
+    val inputs = Map(
+      "input" -> input,
+      "weight" -> weight,
+    )
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("output").asInstanceOf[Array[Float]]
+    val expected = input
+
+    assertEquals(output.toSeq, expected.toSeq)
+  }
+
+  /** Test MaxPool operation */
+  test("MaxPool operation should find the max value in each window") {
+    // Input: 1 batch, 1 channel, 4x4 image
+    val input = Array(
+      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f,
+      15.0f, 16.0f,
+    )
+
+    val model = ModelIR(
+      name = "maxpool_test",
+      operations = List(
+        Operation.MaxPool(
+          input = "input",
+          output = "output",
+          autoPad = AutoPad.NotSet,
+          ceilMode = false,
+          dilations = List(1, 1),
+          kernelShape = List(2, 2),
+          pads = List(0, 0, 0, 0),
+          storageOrder = 0,
+          strides = List(2, 2),
+        ),
+      ),
+      allocations = Map(
+        "input" -> Allocation("input", DataType.Float32, List(1, 1, 4, 4)),
+        "output" -> Allocation("output", DataType.Float32, List(1, 1, 2, 2)),
+      ),
+      graphInputs = List("input"),
+      graphOutputs = List("output"),
+    )
+
+    val inputs = Map("input" -> input)
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("output").asInstanceOf[Array[Float]]
+    val expected = Array(6.0f, 8.0f, 14.0f, 16.0f)
+
+    assertEquals(output.toSeq, expected.toSeq)
+  }
+
+  /** Test SVM Classifier operation (simplified example) */
+  test("SVM Classifier should handle basic classification (may fail without LibSVM)") {
+    // Simple 2D input for binary classification
+    val input = Array(0.5, 1.5)
+
+    // Minimal SVM model data (this is a simplified example)
+    val supportVectors = Array(
+      0.0,
+      1.0, // Support vector 1
+      1.0,
+      0.0, // Support vector 2
+    )
+    val coefficients = Array(1.0, -1.0) // Dual coefficients
+    val rho = List(0.5) // Decision function constant
+    val classLabels = List(0L, 1L)
+    val vectorsPerClass = List(1L, 1L)
+
+    val model = ModelIR(
+      name = "svm_test",
+      operations = List(
+        Operation.SVMClassifier(
+          input = "input",
+          outputLabel = "label",
+          outputScores = "scores",
+          classLabels = classLabels,
+          coefficients = coefficients,
+          kernelType = SVMKernel.Linear,
+          kernelParams = List(),
+          postTransform = PostTransform.None,
+          rho = rho,
+          supportVectors = supportVectors,
+          vectorsPerClass = vectorsPerClass,
+        ),
+      ),
+      allocations = Map(
+        "input" -> Allocation("input", DataType.Float64, List(1, 2)),
+        "label" -> Allocation("label", DataType.Int32, List(1)),
+        "scores" -> Allocation("scores", DataType.Float64, List(2)),
+      ),
+      graphInputs = List("input"),
+      graphOutputs = List("label", "scores"),
+    )
+
+    val inputs = Map("input" -> input)
+
+    // SVM may fail without LibSVM bindings, so we test both success and graceful failure
+    try {
+      val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+      val label = results("label").asInstanceOf[Array[Int]]
+      val scores = results("scores").asInstanceOf[Array[Double]]
+
+      // If SVM works, verify the outputs are reasonable
+      assert(label.length == 1, "Should produce exactly one label")
+      assert(scores.length == 2, "Should produce scores for 2 classes")
+      assert(classLabels.contains(label(0).toLong), "Label should be one of the class labels")
+    } catch {
+      case _: NotImplementedError | _: UnsatisfiedLinkError =>
+        // Expected when LibSVM bindings are not available
+        () // Test passes - graceful failure is acceptable
+    }
+  }
+
+  /** Test MatMul operation with Float32 matrices */
+  test("MatMul operation should perform matrix multiplication correctly") {
+    // Matrix A: 2x3 matrix [[1, 2, 3], [4, 5, 6]]
+    val matrixA = Array(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f)
+
+    // Matrix B: 3x2 matrix [[1, 2], [3, 4], [5, 6]]
+    val matrixB = Array(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f)
+
+    val model = ModelIR(
+      name = "matmul_test",
+      operations = List(
+        Operation.MatMul("matrix_a", "matrix_b", "output"),
+      ),
+      allocations = Map(
+        "matrix_a" -> Allocation("matrix_a", DataType.Float32, List(2, 3)),
+        "matrix_b" -> Allocation("matrix_b", DataType.Float32, List(3, 2)),
+        "output" -> Allocation("output", DataType.Float32, List(2, 2)),
+      ),
+      graphInputs = List("matrix_a", "matrix_b"),
+      graphOutputs = List("output"),
+    )
+
+    val inputs = Map(
+      "matrix_a" -> matrixA,
+      "matrix_b" -> matrixB,
+    )
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("output").asInstanceOf[Array[Float]]
+
+    // Expected result: [[22, 28], [49, 64]]
+    // Calculation:
+    // [1,2,3] * [1,3,5; 2,4,6] = [1*1+2*3+3*5, 1*2+2*4+3*6] = [22, 28]
+    // [4,5,6] * [1,3,5; 2,4,6] = [4*1+5*3+6*5, 4*2+5*4+6*6] = [49, 64]
+    val expected = Array(22.0f, 28.0f, 49.0f, 64.0f)
+
+    assertEquals(output.toSeq, expected.toSeq)
+  }
+
+  /** Test MatMul operation with different sized matrices (like MNIST FC layer) */
+  test("MatMul operation should handle typical neural network dimensions") {
+    // Simulate flattened feature vector: 1x4 (batch size 1, 4 features)
+    val features = Array(1.0f, 2.0f, 3.0f, 4.0f)
+
+    // Weight matrix: 4x3 (4 inputs, 3 outputs)
+    val weights = Array(
+      0.5f, 1.0f, 1.5f, // weights for input 1
+      0.2f, 0.4f, 0.6f, // weights for input 2
+      0.1f, 0.3f, 0.5f, // weights for input 3
+      0.8f, 0.6f, 0.4f, // weights for input 4
+    )
+
+    val model = ModelIR(
+      name = "matmul_fc_test",
+      operations = List(
+        Operation.MatMul("features", "weights", "output"),
+      ),
+      allocations = Map(
+        "features" -> Allocation("features", DataType.Float32, List(1, 4)),
+        "weights" -> Allocation("weights", DataType.Float32, List(4, 3)),
+        "output" -> Allocation("output", DataType.Float32, List(1, 3)),
+      ),
+      graphInputs = List("features", "weights"),
+      graphOutputs = List("output"),
+    )
+
+    val inputs = Map(
+      "features" -> features,
+      "weights" -> weights,
+    )
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("output").asInstanceOf[Array[Float]]
+
+    // Expected calculation: [1,2,3,4] * weights
+    // Output 1: 1*0.5 + 2*0.2 + 3*0.1 + 4*0.8 = 0.5 + 0.4 + 0.3 + 3.2 = 4.4
+    // Output 2: 1*1.0 + 2*0.4 + 3*0.3 + 4*0.6 = 1.0 + 0.8 + 0.9 + 2.4 = 5.1
+    // Output 3: 1*1.5 + 2*0.6 + 3*0.5 + 4*0.4 = 1.5 + 1.2 + 1.5 + 1.6 = 5.8
+    val expected = Array(4.4f, 5.1f, 5.8f)
+
+    assertEquals(output.length, expected.length)
+    output.zip(expected).foreach { case (actual, exp) =>
+      assertEqualsFloat(actual, exp, 1e-5f)
+    }
+  }
+
+  /** Test a more complex graph with multiple operations */
+  test("Complex graph should correctly chain Add, Cast, and Mul operations") {
+    val inputA = Array(2.0, 4.0, 6.0)
+    val inputB = Array(1.0, 2.0, 3.0)
+    val multiplier = Array(0.5f, 1.5f, 2.5f)
+
+    val model = ModelIR(
+      name = "complex_test",
+      operations = List(
+        // First add the inputs (Float64)
+        Operation.Add("input_a", "input_b", "sum"),
+        // Cast the sum to Float32
+        Operation.Cast("sum", "sum_f32", DataType.Float32),
+        // Multiply with the multiplier
+        Operation.Mul("sum_f32", "multiplier", "final_output"),
+      ),
+      allocations = Map(
+        "input_a" -> Allocation("input_a", DataType.Float64, List(3)),
+        "input_b" -> Allocation("input_b", DataType.Float64, List(3)),
+        "sum" -> Allocation("sum", DataType.Float64, List(3)),
+        "sum_f32" -> Allocation("sum_f32", DataType.Float32, List(3)),
+        "multiplier" -> Allocation("multiplier", DataType.Float32, List(3)),
+        "final_output" -> Allocation("final_output", DataType.Float32, List(3)),
+      ),
+      graphInputs = List("input_a", "input_b", "multiplier"),
+      graphOutputs = List("final_output"),
+    )
+
+    val inputs = Map(
+      "input_a" -> inputA,
+      "input_b" -> inputB,
+      "multiplier" -> multiplier,
+    )
+
+    val results = Interpreter.execute(model, inputs).use(_.map(identity)).unsafeRunSync()
+    val output = results("final_output").asInstanceOf[Array[Float]]
+    val expected = Array(1.5f, 9.0f, 22.5f)
+
+    // Use floating point comparison with tolerance
+    assertEquals(output.length, expected.length)
+    output.zip(expected).foreach { case (actual, exp) =>
+      assertEqualsFloat(actual, exp, 1e-5f)
+    }
+  }
+
+  // Helper method for floating point comparisons
+  private def assertEqualsFloat(actual: Float, expected: Float, tolerance: Float): Unit = {
+    val diff = math.abs(actual - expected)
+    assert(
+      diff <= tolerance,
+      s"Expected $expected ± $tolerance, but got $actual (difference: $diff)",
+    )
+  }
+}