diff --git a/MAINTAINERS b/MAINTAINERS
index 5df6020ed545..5d7a5753b593 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -232,14 +232,25 @@ Hexagon TCG CPUs
 M: Brian Cain
 S: Supported
 F: target/hexagon/
+F: hw/intc/l2vic.[ch]
+F: hw/hexagon/
+F: hw/timer/qct-qtimer.c
+F: include/hw/hexagon/
+F: include/hw/timer/qct-qtimer.h
 X: target/hexagon/idef-parser/
 X: target/hexagon/gen_idef_parser_funcs.py
 F: linux-user/hexagon/
 F: tests/tcg/hexagon/
 F: disas/hexagon.c
 F: configs/targets/hexagon-linux-user/default.mak
+F: configs/devices/hexagon-softmmu/default.mak
 F: docker/dockerfiles/debian-hexagon-cross.docker
 F: gdb-xml/hexagon*.xml
+F: docs/system/target-hexagon.rst
+F: docs/devel/hexagon-sys.rst
+F: docs/devel/hexagon-l2vic.rst
+F: tests/functional/test_hexagon_minivm.py
+F: docs/devel/hexagon-vm.rst
 T: git https://github.com/quic/qemu.git hex-next
 
 Hexagon idef-parser
diff --git a/configs/devices/hexagon-softmmu/default.mak b/configs/devices/hexagon-softmmu/default.mak
new file mode 100644
index 000000000000..37b4f9f3237a
--- /dev/null
+++ b/configs/devices/hexagon-softmmu/default.mak
@@ -0,0 +1,8 @@
+# Default configuration for hexagon-softmmu
+
+# Uncomment the following lines to disable these optional devices:
+
+# Boards are selected by default, uncomment to keep out of the build.
+# CONFIG_HEX_VIRT=n
+# CONFIG_HEX_DSP=n
+# CONFIG_L2VIC=n
diff --git a/configs/targets/hexagon-softmmu.mak b/configs/targets/hexagon-softmmu.mak
new file mode 100644
index 000000000000..03cf1306a348
--- /dev/null
+++ b/configs/targets/hexagon-softmmu.mak
@@ -0,0 +1,10 @@
+# Default configuration for hexagon-softmmu
+
+TARGET_ARCH=hexagon
+TARGET_SUPPORTS_MTTCG=y
+TARGET_XML_FILES=gdb-xml/hexagon-core.xml gdb-xml/hexagon-hvx.xml gdb-xml/hexagon-sys.xml
+TARGET_LONG_BITS=32
+TARGET_NEED_FDT=y
+CONFIG_SEMIHOSTING=y
+CONFIG_ARM_COMPATIBLE_SEMIHOSTING=y
+CONFIG_SEMIHOSTING_USE_STDIO=y
diff --git a/docs/devel/hexagon-l2vic.rst b/docs/devel/hexagon-l2vic.rst
new file mode 100644
index 000000000000..088563627445
--- /dev/null
+++ b/docs/devel/hexagon-l2vic.rst
@@ -0,0 +1,59 @@
+Hexagon L2 Vectored Interrupt Controller
+========================================
+
+
+.. code-block:: none
+
+              +-------+
+              |       |         +----------------+
+              | l2vic |         |  hexagon core  |
+              |       |         |                |
+              | +-----|         |                |
+      ------> |VID0  >------------->irq2 -\      |
+      ------> |      ||         |         |      |
+       ...  > |      ||         |         |      |
+      ------> |      ||         |                |
+              | +-----|         |  / | | \       |
+              |  ...  |         |  | | | |       |
+              | +-----|         | t0 t1 t2 t3 ...|
+      ------> |VIDN   |         |                |
+      ------> |       |         |                |
+      ------> |       |         |                |
+      ------> |       |         |                |
+              | +-----|         |                |
+              |       |         |Global SREG File|
+              | State |         |                |
+              |  [ ]  |<============>[VID ]      |
+              |  [ ]  |<============>[VID1]      |
+              |  [ ]  |         |                |
+              |  [ ]  |         |                |
+              |       |         |                |
+              +-------+         +----------------+
+
+L2VIC/Core Integration
+----------------------
+
+* hexagon core supports 8 external interrupt sources
+* l2vic supports 1024 input interrupts mapped among 4 output interrupts
+* l2vic has four output signals: { VID0, VID1, VID2, VID3 }
+* The l2vic device has a bank of registers per VID that can be used to query
+  the status or assert new interrupts.
+* Interrupts are 'steered' to threads based on { thread priority, 'EX' state,
+  thread interrupt mask, thread interrupt enable, global interrupt enable,
+  etc. }.
+* Any hardware thread could conceivably handle any input interrupt, dependent
+  on state.
+* The system register transfer instruction can read the VID0-VID3 values from
+  the l2vic when reading from hexagon core system registers "VID" and "VID1".
+* When l2vic VID0 has multiple active interrupts, it pulses the VID0 output
+  IRQ and stores the IRQ number for the VID0 register field. Only after this
+  interrupt is cleared can the l2vic pulse the VID0 output IRQ again and
+  provide the next interrupt number on the VID0 register.
+* The ``ciad`` instruction clears the l2vic input interrupt and re-enables the
+  core interrupt. If an l2vic VID0 interrupt is pending when this occurs,
+  the next interrupt should fire and any subsequent reads of the VID register
+  should reflect the newly raised interrupt.
+* In QEMU, on an external interrupt or an unmasked-pending interrupt,
+  all vCPUs are triggered (has_work==true) and each will grab the IO lock
+  while evaluating the steering logic to determine whether it is the thread
+  that must handle the interrupt.
diff --git a/docs/devel/hexagon-sys.rst b/docs/devel/hexagon-sys.rst
new file mode 100644
index 000000000000..3972261a2bbe
--- /dev/null
+++ b/docs/devel/hexagon-sys.rst
@@ -0,0 +1,106 @@
+.. _Hexagon-System-arch:
+
+Hexagon System Architecture
+===========================
+
+The hexagon architecture has some unique elements which are described here.
+
+Interrupts
+----------
+When interrupts arrive at a Hexagon DSP core, they are priority-steered to
+be handled by an eligible hardware thread with the lowest priority.
+
+Memory
+------
+Each hardware thread has an ``SSR.ASID`` field that contains its Address
+Space Identifier. This value is concatenated with a 32-bit virtual address;
+the MMU can then resolve this extended virtual address to a physical address.
+
+TLBs
+----
+The format of a TLB entry is shown below.
+
+.. note::
+    The Small Core DSPs have a different TLB format which is not yet
+    supported.
+
+.. admonition:: Diagram
+
+    .. code:: text
+
+          6                   5                   4               3
+        3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+       |v|g|x|A|A|       |                                             |
+       |a|l|P|1|0| ASID  |              Virtual Page                   |
+       |l|b| | | |       |                                             |
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+          3                   2                   1                 0
+        1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+       | | | | |       |                                             | |
+       |x|w|r|u|Cacheab|             Physical Page                   |S|
+       | | | | |       |                                             | |
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+* ASID: the address-space identifier.
+* A1, A0: the behavior of these cache line attributes is not modeled by QEMU.
+* xP: the extra-physical bit is the most significant physical address bit.
+* S: the S bit and the LSBs of the physical page indicate the page size.
+* val: the 'valid' bit; when set, it indicates that page matching
+  should consider this entry.
+
+.. list-table:: Page sizes
+   :widths: 25 25 50
+   :header-rows: 1
+
+   * - S-bit
+     - Phys page LSBs
+     - Page size
+   * - 1
+     - N/A
+     - 4KB
+   * - 0
+     - 0b1
+     - 16KB
+   * - 0
+     - 0b10
+     - 64KB
+   * - 0
+     - 0b100
+     - 256KB
+   * - 0
+     - 0b1000
+     - 1MB
+   * - 0
+     - 0b10000
+     - 4MB
+   * - 0
+     - 0b100000
+     - 16MB
+
+* glb: if the global bit is set, the ASID is not considered when matching
+  TLBs.
+* Cacheab: the cacheability attributes of TLBs are not modeled; these bits
+  are ignored.
+* RWX: read-, write-, and execute-enable bits. They indicate whether user
+  programs are permitted to read/write/execute the given page.
+* U: indicates whether user programs can access this page.
+
+Scheduler
+---------
+The Hexagon system architecture has a feature to assist the guest OS
+task scheduler.
The guest OS can enable this feature by setting +``SCHEDCFG.EN``. The ``BESTWAIT`` register is programmed by the guest OS +to indicate the priority of the highest priority task waiting to run on a +hardware thread. The reschedule interrupt is triggered when any hardware +thread's priority in ``STID.PRIO`` is worse than the ``BESTWAIT``. When +it is triggered, the ``BESTWAIT.PRIO`` value is reset to 0x1ff. + +HVX Coprocessor +--------------- +The Supervisor Status Register field ``SSR.XA`` binds a DSP hardware thread +to one of the eight possible HVX contexts. The guest OS is responsible for +managing this resource. diff --git a/docs/devel/hexagon-vm.rst b/docs/devel/hexagon-vm.rst new file mode 100644 index 000000000000..fb16d56d59de --- /dev/null +++ b/docs/devel/hexagon-vm.rst @@ -0,0 +1,190 @@ +Hexagon Virtual Machine +======================= + +The hexagon virtual machine is a hypervisor that can partition a single +Hexagon DSP among multiple guest operating systems, and abstracts the +specific details of a DSP architectural revision for the sake of consistency +among generations. + +Events +------ + +The guest operating system should register the Guest Event Vector Base +via the ``vmsetvec`` virtual instruction at system startup. The vector table +and handlers are determined by the guest OS. + +Guests return from event handlers with ``vmrte``. This instruction will restore +the mode (user versus guest), interrupt enable state, PC, SP. + +.. list-table:: Event types + :header-rows: 1 + + * - Number + - Name + - Description + - Maskable + - Detail + * - 0 + - Reserved + - + - + - + * - 1 + - Machine check event + - unrecoverable VM state + - No + - execution terminates if unhandled + * - 2 + - General exception + - internal hardware or software exception + - No + - + * - 3-4 + - Reserved + - + - + - + * - 5 + - ``trap0`` + - ``trap0`` instruction + - No + - + * - 6 + - Reserved + - + - + - + * - 7 + - Interrupt + - external interrupts + - Yes + - increasing interrupt numbers have descending priority + +Startup +------- +In order to transition to user-mode, the guest OS must set the ``UM`` bit in +the guest status register and specify the address to start executing in +user mode in the guest event link register. + +Virtual Instructions +-------------------- + +.. list-table:: Virtual Instructions + :header-rows: 1 + + * - Instruction + - Behavior + - Operand + - Input + - Output + * - vmversion + - returns the VM version + - 0x0 + - requested VM version + - provided VM version + * - vmrte + - return from event + - 0x1 + - Event info in g3:0 + - N/A + * - vmsetvec + - set event vector + - 0x2 + - r0 is set to vector table addr + - r0 is 0 on success, 1 otherwise + * - vmsetie + - set interrupt enabled + - 0x3 + - r0 is set to 1 to enable, 0 to disable + - previous IE bit is stored as LSB of r0 + * - vmgetie + - get interrupt enabled + - 0x4 + - N/A + - current IE bit is stored as LSB of r0 + * - vmintop + - interrupt operation + - 0x5 + - r0 = Interrupt Op, r1-r4: Depends on Op + - r0 - value depends on operation + * - vmclrmap + - clear virtual memory map + - 0xa + - r0 = Interrupt Op, r1-r4: Depends on Op + - r0 - value depends on operation + * - vmnewmap + - set new virtual memory map + - 0xb + - + r0 contains logical address of new segment table + + r1 = type of translations: 0 indicates a logical address of a zero-terminated linear list, 1 indicates a set of page tables. 
+ - r0 contains 0 on success, otherwise negative error code + * - vmcache + - VM cache control: not modeled + - 0xd + - + r0 contains the operation to be performed + + r1 = Starting virtual address + + r2 contains the length in bytes + - r0 contains 0 on success, otherwise -1. Cache behavior is not modeled so this operation always succeeds. + * - vmgettime + - Get virtual machine time + - 0xe + - N/A + - r0 contains the least significant 32 bits of timestamp, r1 contains the most significant 32 bits of timestamp + * - vmsettime + - Set virtual machine time + - 0xf + - r0 contains the least significant 32 bits of timestamp, r1 contains the most significant 32 bits of timestamp + - N/A + * - vmwait + - wait for interrupt + - 0x10 + - N/A + - r0 contains the interrupt number of the interrupt waking the guest + * - vmyield + - voluntarily yield VM task + - 0x11 + - N/A + - N/A + * - vmstart + - Create new virtual processor instance + - 0x12 + - r0 contains the starting execution address, r1 contains the starting stack pointer + - r0 contains the Virtual processor number of new virtual processor on success, otherwise -1 + * - vmstop + - terminate current virtual processor instance + - 0x13 + - N/A + - N/A + * - vmvpid + - get the virtual processor ID + - 0x14 + - N/A + - r0 contains the virtual processor number of virtual processor executing the instruction + * - vmsetregs + - Set guest registers + - 0x15 + - r0-3 hold g0-3 values + - N/A + * - vmgetregs + - Get guest registers + - 0x16 + - N/A + - r0-3 hold g0-3 values + * - vmtimerop + - perform an operation on a system timer + - 0x18 + - + getfreq = 0 + + getres = 1 + + gettime = 2 + + gettimeout = 3 + + settimeout = 4 + + deltatimeout = 5 + - r0 contains result of the timer operation call + * - vmgetinfo + - Get system info + - 0x1a + - Index of the system info parameter: + + + build_id = 0 + + info_boot_flags = 1 + - value of the indicated system info parameter diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index 7a0678cbdd3a..82f788682bb4 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -14,6 +14,9 @@ Details about QEMU's various subsystems including how to add features to them. block-coroutine-wrapper clocks ebpf_rss + hexagon-sys + hexagon-l2vic + hexagon-vm migration/index multi-process reset diff --git a/docs/system/hexagon/cdsp.rst b/docs/system/hexagon/cdsp.rst new file mode 100644 index 000000000000..f755fbe0a5ba --- /dev/null +++ b/docs/system/hexagon/cdsp.rst @@ -0,0 +1,10 @@ +Compute DSP +=========== + +A Hexagon CDSP is designed as a computation offload device for an SoC. The +``V66G_1024`` machine contains: + +* L2VIC interrupt controller +* QTimer timer device + +This machine will support any Hexagon CPU, but will default to ``v66``. diff --git a/docs/system/hexagon/emulation.rst b/docs/system/hexagon/emulation.rst new file mode 100644 index 000000000000..03a6092a1281 --- /dev/null +++ b/docs/system/hexagon/emulation.rst @@ -0,0 +1,16 @@ +.. _Hexagon Emulation: + +Hexagon CPU architecture support +================================ + +QEMU's TCG emulation includes support for v65, v66, v67, v68, v69, v71, v73. +It also has support for the following architecture extensions: + +- HVX (Hexagon Vector eXtensions) + +For information on the specifics of the HVX extension, please refer +to the `Qualcomm Hexagon V69 HVX Programmer's Reference Manual +`_. + +.. 
diff --git a/docs/system/target-hexagon.rst b/docs/system/target-hexagon.rst
new file mode 100644
index 000000000000..894337a533cd
--- /dev/null
+++ b/docs/system/target-hexagon.rst
@@ -0,0 +1,112 @@
+.. _Hexagon-System-emulator:
+
+Hexagon System emulator
+-----------------------
+
+Use the ``qemu-system-hexagon`` executable to simulate a 32-bit Hexagon
+machine.
+
+Hexagon Machines
+================
+
+Hexagon DSPs are suited to various functions and generally appear in a
+"DSP subsystem" of a larger system-on-chip (SoC).
+
+Hexagon DSPs are often included in a subsystem that looks like the diagram
+below. Instructions are loaded into DDR before the DSP is brought out of
+reset and the first instructions are fetched from DDR via the EVB/reset
+vector.
+
+In a real system, a TBU/SMMU would normally arbitrate AXI accesses, but we
+do not need to model that in QEMU.
+
+Hexagon DSP cores use simultaneous multithreading (SMT) with as many as 8
+hardware threads.
+
+.. admonition:: Diagram
+
+  .. code:: text
+
+                  AHB (local) bus                  AXI (global) bus
+                        │                                │
+                        │                                │
+     ┌─────────┐        │    ┌─────────────────┐         │
+     │  L2VIC  ├────────┤    │                 │         │
+     │         ├──┼────────► │   Hexagon DSP   ├─────────┤
+     └─────▲───┘        │    │                 │         │   ┌─────┐
+           │            │    │    N threads    │         │   │ DDR │
+           │            ├────┤                 │         │   │     │
+     ┌─────┴─┐          │    │                 ├─────────┤   │     │
+     │QTimer ├──────────┤    │                 │         │   │     │
+     │       │          │    │   ┌─────────┐   │         │   │     │
+     └───────┘          │    │  ┌─────────┐│   │         │   │     │
+     ┌───────┐          │    │  │ HVX xM  ││   │         │   │     │
+     │QDSP6SS├──────────┤    │  │         │┘   │         │   │     │
+     └───────┘          │    │  └─────────┘    │         │   └─────┘
+                        │    └─────────────────┘         │
+     ┌───────┐          │                                │
+     │  CSR  ├──────────┤    ┌──────┐   ┌───────────┐    │
+     └───────┘          │    │ TCM  │   │   VTCM    ├────┤
+                        │    │      │   │           │    │
+                             └──────┘   │           │    │
+                                        │           │    │
+                                        └───────────┘    │
+
+Components
+----------
+Other than l2vic and HVX, the components below are not implemented in QEMU.
+
+* L2VIC: the L2 vectored interrupt controller. Supports 1024 input
+  interrupts, edge- or level-triggered. The core ISA has system registers
+  ``VID``, ``VID1`` which read through to the L2VIC device.
+* QTimer: ARMSSE-based programmable timer device. Its interrupts are
+  wired to the L2VIC. System registers ``TIMER``, ``UTIMER`` read
+  through to the QTimer device.
+* QDSP6SS: DSP subsystem features, accessible to the entire SoC, including
+  DSP NMI, watchdog, reset, etc.
+* CSR: Configuration/Status Registers.
+* TCM: DSP-exclusive tightly-coupled memory. This memory can be used by
+  DSPs when isolated from DDR and in some bootstrapping modes.
+* VTCM: DSP-exclusive vector tightly-coupled memory. This memory is accessed
+  by some HVX instructions.
+* HVX: the vector coprocessor supports 64- and 128-byte vector registers.
+  64-byte mode is not implemented in QEMU.
+
+
+Bootstrapping
+-------------
+Hexagon systems do not generally have access to a block device, so the
+typical QEMU use case involves loading a binary or ELF file into memory
+and executing from the indicated start address::
+
+   $ qemu-system-hexagon -kernel ./prog -append 'arg1 arg2'
+
+Semihosting
+-----------
+Hexagon supports a semihosting interface similar to other architectures'.
+The ``trap0`` instruction can activate these semihosting calls so that the
+guest software can access the host console and filesystem. Semihosting
+is not yet implemented in QEMU hexagon.
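+
+The build selects ``CONFIG_ARM_COMPATIBLE_SEMIHOSTING``, so the eventual
+guest interface is expected to follow the ARM semihosting convention: an
+operation number plus a pointer to a parameter block. The sketch below is
+illustrative only; ``semihost_call`` and the trap sequence it would wrap
+are hypothetical, not an implemented ABI.
+
+.. code-block:: c
+
+   #define SYS_OPEN   0x01  /* open a file on the host */
+   #define SYS_WRITE0 0x04  /* write a NUL-terminated string to the console */
+
+   /* Parameter block layout for SYS_OPEN, per the ARM semihosting spec. */
+   struct sh_open_args {
+       const char *name;    /* host filename */
+       unsigned long mode;  /* 0 == "r", 4 == "w", ... */
+       unsigned long len;   /* strlen(name) */
+   };
+
+   /* Hypothetical shim: would issue the agreed-upon trap0 encoding. */
+   extern long semihost_call(long op, void *argblock);
+
+   long open_host_file(const char *name, unsigned long mode,
+                       unsigned long len)
+   {
+       struct sh_open_args args = { name, mode, len };
+       return semihost_call(SYS_OPEN, &args);
+   }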
+
+Hexagon Virtual Machine
+-----------------------
+
+The hexagon virtual machine is a hypervisor that can partition a single
+Hexagon DSP among multiple guest operating systems, and abstracts the
+specific details of a DSP architectural revision for the sake of consistency
+among generations.
+
+`minivm <https://github.com/quic/hexagonMVM>`_ is a reference implementation
+of this VM interface.
+
+
+Hexagon Features
+================
+.. toctree::
+   hexagon/emulation
+   hexagon/cdsp
+
diff --git a/docs/system/targets.rst b/docs/system/targets.rst
index 224fadae71c4..e6dcdb9d4161 100644
--- a/docs/system/targets.rst
+++ b/docs/system/targets.rst
@@ -29,3 +29,4 @@ Contents:
    target-sparc64
    target-i386
    target-xtensa
+   target-hexagon
diff --git a/gdb-xml/hexagon-sys.xml b/gdb-xml/hexagon-sys.xml
new file mode 100644
index 000000000000..1d9c21172253
--- /dev/null
+++ b/gdb-xml/hexagon-sys.xml
@@ -0,0 +1,116 @@
+<!-- GDB XML feature description of the Hexagon system registers
+     (the XML element content was lost in extraction and is not
+     reproduced here) -->
diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index c59cd6637b97..ecdbdf623d11 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -196,7 +196,8 @@ SRST
 ERST
 
 #if defined(TARGET_I386) || defined(TARGET_SH4) || defined(TARGET_SPARC) || \
-    defined(TARGET_PPC) || defined(TARGET_XTENSA) || defined(TARGET_M68K)
+    defined(TARGET_PPC) || defined(TARGET_XTENSA) || defined(TARGET_M68K) || \
+    defined(TARGET_HEXAGON)
     {
         .name       = "tlb",
         .args_type  = "",
diff --git a/hw/Kconfig b/hw/Kconfig
index 9a86a6a28a64..4dc7914c13f5 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -67,6 +67,7 @@ source sparc/Kconfig
 source sparc64/Kconfig
 source tricore/Kconfig
 source xtensa/Kconfig
+source hexagon/Kconfig
 
 # Symbols used by multiple targets
 config TEST_DEVICES
diff --git a/hw/hexagon/Kconfig b/hw/hexagon/Kconfig
new file mode 100644
index 000000000000..9a2369974e09
--- /dev/null
+++ b/hw/hexagon/Kconfig
@@ -0,0 +1,15 @@
+config HEX_DSP
+    bool
+    default y
+    depends on HEXAGON && TCG
+    imply PTIMER
+    select L2VIC # Vector PIC
+    select ARM_COMPATIBLE_SEMIHOSTING
+
+config HEX_VIRT
+    bool
+    default y
+    depends on HEX_DSP && FDT
+    select DEVICE_TREE
+    select VIRTIO_MMIO
+    select PL011
diff --git a/hw/hexagon/hexagon_dsp.c b/hw/hexagon/hexagon_dsp.c
new file mode 100644
index 000000000000..f4440de80ce0
--- /dev/null
+++ b/hw/hexagon/hexagon_dsp.c
@@ -0,0 +1,206 @@
+/*
+ * Hexagon DSP Subsystem emulation. This represents a generic DSP
+ * subsystem with few peripherals, like the Compute DSP.
+ *
+ * Copyright (c) 2020-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ * SPDX-License-Identifier: GPL-2.0-or-later + */ + + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "exec/address-spaces.h" +#include "hw/hw.h" +#include "hw/boards.h" +#include "hw/qdev-properties.h" +#include "hw/hexagon/hexagon.h" +#include "hw/loader.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/log.h" +#include "elf.h" +#include "cpu.h" +#include "include/migration/cpu.h" +#include "include/system/system.h" +#include "target/hexagon/internal.h" +#include "system/reset.h" +#include "include/semihosting/semihost.h" + +#include "machine_cfg_v66g_1024.h.inc" + +static hwaddr isdb_secure_flag; +static hwaddr isdb_trusted_flag; +static void hex_symbol_callback(const char *st_name, int st_info, + uint64_t st_value, uint64_t st_size) +{ + if (!g_strcmp0("isdb_secure_flag", st_name)) { + isdb_secure_flag = st_value; + } + if (!g_strcmp0("isdb_trusted_flag", st_name)) { + isdb_trusted_flag = st_value; + } +} + +/* Board init. */ +static struct hexagon_board_boot_info hexagon_binfo; + +static void hexagon_load_kernel(HexagonCPU *cpu) +{ + uint64_t pentry; + long kernel_size; + + kernel_size = load_elf_ram_sym(hexagon_binfo.kernel_filename, NULL, NULL, + NULL, &pentry, NULL, NULL, + &hexagon_binfo.kernel_elf_flags, 0, EM_HEXAGON, 0, 0, + &address_space_memory, false, hex_symbol_callback); + + if (kernel_size <= 0) { + error_report("no kernel file '%s'", + hexagon_binfo.kernel_filename); + exit(1); + } + + qdev_prop_set_uint32(DEVICE(cpu), "exec-start-addr", pentry); +} + +static void hexagon_init_bootstrap(MachineState *machine, HexagonCPU *cpu) +{ + if (machine->kernel_filename) { + hexagon_load_kernel(cpu); + uint32_t mem = 1; + if (isdb_secure_flag) { + cpu_physical_memory_write(isdb_secure_flag, &mem, sizeof(mem)); + } + if (isdb_trusted_flag) { + cpu_physical_memory_write(isdb_trusted_flag, &mem, sizeof(mem)); + } + } +} + +static void do_cpu_reset(void *opaque) +{ + HexagonCPU *cpu = opaque; + CPUState *cs = CPU(cpu); + cpu_reset(cs); +} + +static void hexagon_common_init(MachineState *machine, Rev_t rev, + hexagon_machine_config *m_cfg) +{ + memset(&hexagon_binfo, 0, sizeof(hexagon_binfo)); + if (machine->kernel_filename) { + hexagon_binfo.ram_size = machine->ram_size; + hexagon_binfo.kernel_filename = machine->kernel_filename; + } + + machine->enable_graphics = 0; + + MemoryRegion *address_space = get_system_memory(); + + MemoryRegion *config_table_rom = g_new(MemoryRegion, 1); + memory_region_init_rom(config_table_rom, NULL, "config_table.rom", + sizeof(m_cfg->cfgtable), &error_fatal); + memory_region_add_subregion(address_space, m_cfg->cfgbase, + config_table_rom); + + MemoryRegion *sram = g_new(MemoryRegion, 1); + memory_region_init_ram(sram, NULL, "ddr.ram", + machine->ram_size, &error_fatal); + memory_region_add_subregion(address_space, 0x0, sram); + + Error **errp = NULL; + + for (int i = 0; i < machine->smp.cpus; i++) { + HexagonCPU *cpu = HEXAGON_CPU(object_new(machine->cpu_type)); + CPUHexagonState *env = &cpu->env; + qemu_register_reset(do_cpu_reset, cpu); + + /* + * CPU #0 is the only CPU running at boot, others must be + * explicitly enabled via start instruction. 
+ */ + qdev_prop_set_bit(DEVICE(cpu), "start-powered-off", (i != 0)); + qdev_prop_set_uint32(DEVICE(cpu), "l2vic-base-addr", m_cfg->l2vic_base); + qdev_prop_set_uint32(DEVICE(cpu), "config-table-addr", m_cfg->cfgbase); + qdev_prop_set_uint32(DEVICE(cpu), "qtimer-base-addr", m_cfg->qtmr_region); + qdev_prop_set_uint32(DEVICE(cpu), "hvx-contexts", + m_cfg->cfgtable.ext_contexts); + qdev_prop_set_uint32(DEVICE(cpu), "jtlb-entries", + m_cfg->cfgtable.jtlb_size_entries); + + + if (i == 0) { + hexagon_init_bootstrap(machine, cpu); + if (!qdev_realize_and_unref(DEVICE(cpu), NULL, errp)) { + return; + } + DeviceState *l2vic_dev; + l2vic_dev = sysbus_create_varargs("l2vic", m_cfg->l2vic_base, + /* IRQ#, Evnt#,CauseCode */ + qdev_get_gpio_in(DEVICE(cpu), 0), + qdev_get_gpio_in(DEVICE(cpu), 1), + qdev_get_gpio_in(DEVICE(cpu), 2), + qdev_get_gpio_in(DEVICE(cpu), 3), + qdev_get_gpio_in(DEVICE(cpu), 4), + qdev_get_gpio_in(DEVICE(cpu), 5), + qdev_get_gpio_in(DEVICE(cpu), 6), + qdev_get_gpio_in(DEVICE(cpu), 7), + NULL); + sysbus_mmio_map(SYS_BUS_DEVICE(l2vic_dev), 1, + m_cfg->cfgtable.fastl2vic_base << 16); + } else if (!qdev_realize_and_unref(DEVICE(cpu), NULL, errp)) { + env->dir_list = NULL; + return; + } + + } + + rom_add_blob_fixed_as("config_table.rom", &m_cfg->cfgtable, + sizeof(m_cfg->cfgtable), m_cfg->cfgbase, + &address_space_memory); +} + +static void init_mc(MachineClass *mc) +{ + mc->block_default_type = IF_SD; + mc->default_ram_size = 4 * GiB; + mc->no_parallel = 1; + mc->no_floppy = 1; + mc->no_cdrom = 1; + mc->no_serial = 1; + mc->is_default = false; + mc->max_cpus = 8; + qemu_semihosting_enable(); +} + +/* ----------------------------------------------------------------- */ +/* Core-specific configuration settings are defined below this line. */ +/* Config table values defined in machine_configs.h.inc */ +/* ----------------------------------------------------------------- */ + +static void v66g_1024_config_init(MachineState *machine) +{ + hexagon_common_init(machine, v66_rev, &v66g_1024); +} + +static void v66g_1024_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + + mc->desc = "Hexagon V66G_1024"; + mc->init = v66g_1024_config_init; + init_mc(mc); + mc->is_default = true; + mc->default_cpu_type = TYPE_HEXAGON_CPU_V66; + mc->default_cpus = 4; +} + +static const TypeInfo hexagon_machine_types[] = { + { + .name = MACHINE_TYPE_NAME("V66G_1024"), + .parent = TYPE_MACHINE, + .class_init = v66g_1024_init, + }, +}; + +DEFINE_TYPES(hexagon_machine_types) diff --git a/hw/hexagon/machine_cfg_sa8775_cdsp0.h.inc b/hw/hexagon/machine_cfg_sa8775_cdsp0.h.inc new file mode 100644 index 000000000000..70b1eabfe961 --- /dev/null +++ b/hw/hexagon/machine_cfg_sa8775_cdsp0.h.inc @@ -0,0 +1,63 @@ + +static hexagon_machine_config SA8775P_cdsp0 = { + .cfgbase = 0x24000000 + 0x180000, + .l2tcm_size = 0x00000000, + .l2vic_base = 0x26300000 + 0x90000, + .l2vic_size = 0x00001000, + .csr_base = 0x26300000, + .qtmr_region = 0x26300000 + 0xA1000, + .cfgtable = { + .l2tcm_base = 0x00002400, + .reserved0 = 0x00000000, + .subsystem_base = 0x00002638, + .etm_base = 0x00002419, + .l2cfg_base = 0x0000241a, + .reserved1 = 0x0000241b, + .l1s0_base = 0x00002500, + .axi2_lowaddr = 0x00000000, + .streamer_base = 0x00000000, + .reserved2 = 0x00000000, + .fastl2vic_base = 0x0000241e, + .jtlb_size_entries = 0x00000080, + .coproc_present = 0x00000001, + .ext_contexts = 0x00000004, + .vtcm_base = 0x00002500, + .vtcm_size_kb = 0x00002000, + .l2tag_size = 0x00000400, + .l2ecomem_size = 0x00000000, + 
.thread_enable_mask = 0x0000003f, + .eccreg_base = 0x0000241f, + .l2line_size = 0x00000080, + .tiny_core = 0x00000000, + .l2itcm_size = 0x00000000, + .l2itcm_base = 0x00002400, + .reserved3 = 0x00000000, + .dtm_present = 0x00000000, + .dma_version = 0x00000003, + .hvx_vec_log_length = 0x00000007, + .core_id = 0x00000000, + .core_count = 0x00000000, + .coproc2_reg0 = 0x00000040, + .coproc2_reg1 = 0x00000020, + .v2x_mode = 0x00000001, + .coproc2_reg2 = 0x00000008, + .coproc2_reg3 = 0x00000020, + .coproc2_reg4 = 0x00000000, + .coproc2_reg5 = 0x00000002, + .coproc2_reg6 = 0x00000016, + .coproc2_reg7 = 0x00000006, + .acd_preset = 0x00000001, + .mnd_preset = 0x00000000, + .l1d_size_kb = 0x00000010, + .l1i_size_kb = 0x00000020, + .l1d_write_policy = 0x00000002, + .vtcm_bank_width = 0x00000080, + .reserved3 = 0x00000001, + .reserved4 = 0x00000000, + .reserved5 = 0x00000003, + .coproc2_cvt_mpy_size = 0x0000000a, + .consistency_domain = 0x000000e0, + .capacity_domain = 0x00000080, + .axi3_lowaddr = 0x00000000, + }, +}; diff --git a/hw/hexagon/machine_cfg_v66g_1024.h.inc b/hw/hexagon/machine_cfg_v66g_1024.h.inc new file mode 100644 index 000000000000..8f2a593bb860 --- /dev/null +++ b/hw/hexagon/machine_cfg_v66g_1024.h.inc @@ -0,0 +1,63 @@ + +static hexagon_machine_config v66g_1024 = { + .cfgbase = 0xd8180000, + .l2tcm_size = 0x00000000, + .l2vic_base = 0xfc910000, + .l2vic_size = 0x00001000, + .csr_base = 0xfc900000, + .qtmr_region = 0xfc921000, + .cfgtable = { + .l2tcm_base = 0x0000d800, + .reserved0 = 0x0000d400, + .subsystem_base = 0x0000fc90, + .etm_base = 0x0000d805, + .l2cfg_base = 0x0000d81a, + .reserved1 = 0x00000000, + .l1s0_base = 0x0000d820, + .axi2_lowaddr = 0x00003000, + .streamer_base = 0x00000000, + .reserved2 = 0x0000d819, + .fastl2vic_base = 0x0000d81e, + .jtlb_size_entries = 0x00000080, + .coproc_present = 0x00000001, + .ext_contexts = 0x00000004, + .vtcm_base = 0x0000d820, + .vtcm_size_kb = 0x00000100, + .l2tag_size = 0x00000400, + .l2ecomem_size = 0x00000400, + .thread_enable_mask = 0x0000000f, + .eccreg_base = 0x0000d81f, + .l2line_size = 0x00000080, + .tiny_core = 0x00000000, + .l2itcm_size = 0x00000000, + .l2itcm_base = 0x0000d820, + .reserved3 = 0x00000000, + .dtm_present = 0x00000000, + .dma_version = 0x00000000, + .hvx_vec_log_length = 0x00000080, + .core_id = 0x00000000, + .core_count = 0x00000000, + .coproc2_reg0 = 0x00000000, + .coproc2_reg1 = 0x00000000, + .v2x_mode = 0x00000000, + .coproc2_reg2 = 0x00000000, + .coproc2_reg3 = 0x00000000, + .coproc2_reg4 = 0x00000000, + .coproc2_reg5 = 0x00000000, + .coproc2_reg6 = 0x00000000, + .coproc2_reg7 = 0x00000000, + .acd_preset = 0x00000000, + .mnd_preset = 0x00000000, + .l1d_size_kb = 0x00000000, + .l1i_size_kb = 0x00000000, + .l1d_write_policy = 0x00000000, + .vtcm_bank_width = 0x00000000, + .reserved3 = 0x00000000, + .reserved4 = 0x00000000, + .reserved5 = 0x00000000, + .coproc2_cvt_mpy_size = 0x00000000, + .consistency_domain = 0x00000000, + .capacity_domain = 0x00000000, + .axi3_lowaddr = 0x00000000, + }, +}; diff --git a/hw/hexagon/machine_cfg_v68n_1024.h.inc b/hw/hexagon/machine_cfg_v68n_1024.h.inc new file mode 100644 index 000000000000..257c133df8f3 --- /dev/null +++ b/hw/hexagon/machine_cfg_v68n_1024.h.inc @@ -0,0 +1,64 @@ + +static hexagon_machine_config v68n_1024 = { + .cfgbase = 0xde000000, + .l2tcm_size = 0x00000000, + .l2vic_base = 0xfc910000, + .l2vic_size = 0x00001000, + .csr_base = 0xfc900000, + .qtmr_region = 0xfc921000, + .cfgtable = { + .l2tcm_base = 0x0000d800, + .reserved0 = 0x00000000, + .subsystem_base = 
0x0000fc90, + .etm_base = 0x0000d819, + .l2cfg_base = 0x0000d81a, + .reserved1 = 0x00000000, + .l1s0_base = 0x0000d840, + .axi2_lowaddr = 0x00003000, + .streamer_base = 0x0000d81c, + .reserved2 = 0x0000d81d, + .fastl2vic_base = 0x0000d81e, + .jtlb_size_entries = 0x00000080, + .coproc_present = 0x00000001, + .ext_contexts = 0x00000004, + .vtcm_base = 0x0000d840, + .vtcm_size_kb = 0x00001000, + .l2tag_size = 0x00000400, + .l2ecomem_size = 0x00000400, + .thread_enable_mask = 0x0000003f, + .eccreg_base = 0x0000d81f, + .l2line_size = 0x00000080, + .tiny_core = 0x00000000, + .l2itcm_size = 0x00000000, + .l2itcm_base = 0x0000d820, + .reserved3 = 0x00000000, + .dtm_present = 0x00000000, + .dma_version = 0x00000001, + .hvx_vec_log_length = 0x00000007, + .core_id = 0x00000000, + .core_count = 0x00000000, + .coproc2_reg0 = 0x00000040, + .coproc2_reg1 = 0x00000020, + .v2x_mode = 0x1f1f1f1f, + .coproc2_reg2 = 0x1f1f1f1f, + .coproc2_reg3 = 0x1f1f1f1f, + .coproc2_reg4 = 0x1f1f1f1f, + .coproc2_reg5 = 0x1f1f1f1f, + .coproc2_reg6 = 0x1f1f1f1f, + .coproc2_reg7 = 0x1f1f1f1f, + .acd_preset = 0x1f1f1f1f, + .mnd_preset = 0x1f1f1f1f, + .l1d_size_kb = 0x1f1f1f1f, + .l1i_size_kb = 0x1f1f1f1f, + .l1d_write_policy = 0x1f1f1f1f, + .vtcm_bank_width = 0x1f1f1f1f, + .reserved3 = 0x1f1f1f1f, + .reserved4 = 0x1f1f1f1f, + .reserved5 = 0x1f1f1f1f, + .coproc2_cvt_mpy_size = 0x1f1f1f1f, + .consistency_domain = 0x1f1f1f1f, + .capacity_domain = 0x1f1f1f1f, + .axi3_lowaddr = 0x1f1f1f1f, + }, +}; + diff --git a/hw/hexagon/meson.build b/hw/hexagon/meson.build new file mode 100644 index 000000000000..649ad6dc02b3 --- /dev/null +++ b/hw/hexagon/meson.build @@ -0,0 +1,7 @@ +hexagon_ss = ss.source_set() +hexagon_ss.add(when: 'CONFIG_HEX_DSP', if_true: files('hexagon_dsp.c',)) + +hw_arch += {'hexagon': hexagon_ss} + +hexagon_ss.add(when: 'CONFIG_HEX_VIRT', if_true: files('virt.c',)) + diff --git a/hw/hexagon/virt.c b/hw/hexagon/virt.c new file mode 100644 index 000000000000..1e7ac4e5b70b --- /dev/null +++ b/hw/hexagon/virt.c @@ -0,0 +1,415 @@ +/* + * Hexagon virt emulation + * + * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "exec/address-spaces.h" +#include "hw/char/pl011.h" +#include "hw/core/sysbus-fdt.h" +#include "hw/hexagon/hexagon.h" +#include "hw/hexagon/virt.h" +#include "hw/loader.h" +#include "hw/qdev-properties.h" +#include "hw/register.h" +#include "hw/timer/qct-qtimer.h" +#include "qemu/error-report.h" +#include "qemu/guest-random.h" +#include "qemu/units.h" +#include "elf.h" +#include "machine_cfg_v68n_1024.h.inc" +#include "system/device_tree.h" +#include "system/reset.h" +#include "system/system.h" +#include + +static const int VIRTIO_DEV_COUNT = 2; + +static const MemMapEntry base_memmap[] = { + [VIRT_UART0] = { 0x10000000, 0x00000200 }, + [VIRT_MMIO] = { 0x11000000, 0x1000000, }, + [VIRT_GPT] = { 0xab000000, 0x00001000 }, + [VIRT_FDT] = { 0x99900000, 0x00000200 }, +}; + +static const int irqmap[] = { + [VIRT_MMIO] = 18, /* ...to 18 + VIRTIO_DEV_COUNT - 1 */ + [VIRT_GPT] = 12, + [VIRT_UART0] = 15, + [VIRT_QTMR0] = 2, + [VIRT_QTMR1] = 4, +}; + + +static void create_fdt(HexagonVirtMachineState *vms) +{ + MachineState *ms = MACHINE(vms); + void *fdt = create_device_tree(&vms->fdt_size); + + if (!fdt) { + error_report("create_device_tree() failed"); + exit(1); + } + + ms->fdt = fdt; + + qemu_fdt_setprop_string(fdt, "/", "compatible", "linux,hexagon-virt"); + qemu_fdt_setprop_cell(fdt, "/", "#address-cells", 0x2); + qemu_fdt_setprop_cell(fdt, "/", "#size-cells", 0x1); + qemu_fdt_setprop_string(fdt, "/", "model", "linux,hexagon-virt"); + + qemu_fdt_setprop_string(fdt, "/", "model", "hexagon-virt,qemu"); + qemu_fdt_setprop_string(fdt, "/", "compatible", "qcom,sm8150"); + + qemu_fdt_add_subnode(fdt, "/soc"); + qemu_fdt_setprop_cell(fdt, "/soc", "#address-cells", 0x2); + qemu_fdt_setprop_cell(fdt, "/soc", "#size-cells", 0x1); + qemu_fdt_setprop(fdt, "/soc", "ranges", NULL, 0); + + qemu_fdt_add_subnode(fdt, "/chosen"); + + uint8_t rng_seed[32]; + qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); + qemu_fdt_setprop(fdt, "/chosen", "rng-seed", rng_seed, sizeof(rng_seed)); +} + +static void fdt_add_hvx(HexagonVirtMachineState *vms, + const hexagon_machine_config *m_cfg, Error **errp) +{ + const MachineState *ms = MACHINE(vms); + uint32_t vtcm_size_bytes = m_cfg->cfgtable.vtcm_size_kb * 1024; + if (vtcm_size_bytes > 0) { + memory_region_init_ram(&vms->vtcm, NULL, "vtcm.ram", vtcm_size_bytes, + errp); + memory_region_add_subregion(vms->sys, m_cfg->cfgtable.vtcm_base << 16, + &vms->vtcm); + + qemu_fdt_add_subnode(ms->fdt, "/soc/vtcm"); + qemu_fdt_setprop_string(ms->fdt, "/soc/vtcm", "compatible", + "qcom,hexagon_vtcm"); + + assert(sizeof(m_cfg->cfgtable.vtcm_base) == sizeof(uint32_t)); + qemu_fdt_setprop_cells(ms->fdt, "/soc/vtcm", "reg", 0, + m_cfg->cfgtable.vtcm_base << 16, + vtcm_size_bytes); + } + + if (m_cfg->cfgtable.ext_contexts > 0) { + qemu_fdt_add_subnode(ms->fdt, "/soc/hvx"); + qemu_fdt_setprop_string(ms->fdt, "/soc/hvx", "compatible", + "qcom,hexagon-hvx"); + qemu_fdt_setprop_cells(ms->fdt, "/soc/hvx", "qcom,hvx-max-ctxts", + m_cfg->cfgtable.ext_contexts); + qemu_fdt_setprop_cells(ms->fdt, "/soc/hvx", "qcom,hvx-vlength", + m_cfg->cfgtable.hvx_vec_log_length); + } +} + +static int32_t irq_hvm_ic_phandle = -1; +static void fdt_add_hvm_pic_node(HexagonVirtMachineState *vms, + const hexagon_machine_config *m_cfg) +{ + MachineState *ms = MACHINE(vms); + irq_hvm_ic_phandle = qemu_fdt_alloc_phandle(ms->fdt); + + qemu_fdt_setprop_cell(ms->fdt, "/soc", "interrupt-parent", + irq_hvm_ic_phandle); + + 
qemu_fdt_add_subnode(ms->fdt, "/soc/interrupt-controller"); + qemu_fdt_setprop_cell(ms->fdt, "/soc/interrupt-controller", + "#address-cells", 2); + qemu_fdt_setprop_cell(ms->fdt, "/soc/interrupt-controller", + "#interrupt-cells", 2); + qemu_fdt_setprop_string(ms->fdt, "/soc/interrupt-controller", "compatible", + "qcom,h2-pic,hvm-pic"); + qemu_fdt_setprop(ms->fdt, "/soc/interrupt-controller", + "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, "/soc/interrupt-controller", "phandle", + irq_hvm_ic_phandle); + + sysbus_mmio_map(SYS_BUS_DEVICE(vms->l2vic), 1, + m_cfg->cfgtable.fastl2vic_base << 16); +} + + +static void fdt_add_gpt_node(HexagonVirtMachineState *vms) +{ + g_autofree char *name = NULL; + MachineState *ms = MACHINE(vms); + + name = g_strdup_printf("/soc/gpt@%" PRIx64, + (int64_t)base_memmap[VIRT_GPT].base); + qemu_fdt_add_subnode(ms->fdt, name); + qemu_fdt_setprop_string(ms->fdt, name, "compatible", + "qcom,h2-timer,hvm-timer"); + qemu_fdt_setprop_cells(ms->fdt, name, "interrupts", irqmap[VIRT_GPT], 0); + qemu_fdt_setprop_cells(ms->fdt, name, "reg", 0x0, + base_memmap[VIRT_GPT].base, + base_memmap[VIRT_GPT].size); +} + +static int32_t clock_phandle = -1; +static void fdt_add_clocks(const HexagonVirtMachineState *vms) +{ + MachineState *ms = MACHINE(vms); + clock_phandle = qemu_fdt_alloc_phandle(ms->fdt); + qemu_fdt_add_subnode(ms->fdt, "/apb-pclk"); + qemu_fdt_setprop_string(ms->fdt, "/apb-pclk", "compatible", "fixed-clock"); + qemu_fdt_setprop_cell(ms->fdt, "/apb-pclk", "#clock-cells", 0x0); + qemu_fdt_setprop_cell(ms->fdt, "/apb-pclk", "clock-frequency", 24000000); + qemu_fdt_setprop_string(ms->fdt, "/apb-pclk", "clock-output-names", + "clk24mhz"); + qemu_fdt_setprop_cell(ms->fdt, "/apb-pclk", "phandle", clock_phandle); +} + +static void fdt_add_uart(const HexagonVirtMachineState *vms, int uart) +{ + char *nodename; + hwaddr base = base_memmap[uart].base; + hwaddr size = base_memmap[uart].size; + assert(uart == 0); + int irq = irqmap[VIRT_UART0 + uart]; + const char compat[] = "arm,pl011\0arm,primecell"; + const char clocknames[] = "uartclk\0apb_pclk"; + MachineState *ms = MACHINE(vms); + + pl011_create(base, qdev_get_gpio_in(vms->l2vic, irq), serial_hd(0)); + + nodename = g_strdup_printf("/pl011@%" PRIx64, base); + qemu_fdt_add_subnode(ms->fdt, nodename); + + /* Note that we can't use setprop_string because of the embedded NUL */ + qemu_fdt_setprop(ms->fdt, nodename, "compatible", compat, sizeof(compat)); + qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0, base, size); + qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", 32 + irq, 0); + qemu_fdt_setprop_cells(ms->fdt, nodename, "clocks", clock_phandle, + clock_phandle); + qemu_fdt_setprop(ms->fdt, nodename, "clock-names", clocknames, + sizeof(clocknames)); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + irq_hvm_ic_phandle); + + qemu_fdt_setprop_string(ms->fdt, "/chosen", "stdout-path", nodename); + qemu_fdt_add_subnode(ms->fdt, "/aliases"); + qemu_fdt_setprop_string(ms->fdt, "/aliases", "serial0", nodename); + + g_free(nodename); +} + +static void fdt_add_cpu_nodes(const HexagonVirtMachineState *vms) +{ + MachineState *ms = MACHINE(vms); + qemu_fdt_add_subnode(ms->fdt, "/cpus"); + qemu_fdt_setprop_cell(ms->fdt, "/cpus", "#address-cells", 0x1); + qemu_fdt_setprop_cell(ms->fdt, "/cpus", "#size-cells", 0x0); + + /* cpu nodes */ + for (int num = ms->smp.cpus - 1; num >= 0; num--) { + char *nodename = g_strdup_printf("/cpus/cpu@%d", num); + qemu_fdt_add_subnode(ms->fdt, nodename); + 
qemu_fdt_setprop_string(ms->fdt, nodename, "device_type", "cpu"); + qemu_fdt_setprop_cell(ms->fdt, nodename, "reg", num); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", + qemu_fdt_alloc_phandle(ms->fdt)); + g_free(nodename); + } +} + + +static void fdt_add_virtio_devices(const HexagonVirtMachineState *vms) +{ + MachineState *ms = MACHINE(vms); + /* VirtIO MMIO devices */ + for (int i = 0; i < VIRTIO_DEV_COUNT; i++) { + char *nodename; + int irq = irqmap[VIRT_MMIO] + i; + size_t size = base_memmap[VIRT_MMIO].size; + hwaddr base = base_memmap[VIRT_MMIO].base + i * size; + + nodename = g_strdup_printf("/virtio_mmio@%" PRIx64, base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "virtio,mmio"); + qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", 2, base, 1, + size); + qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", irq, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + irq_hvm_ic_phandle); + + sysbus_create_simple( + "virtio-mmio", base, + qdev_get_gpio_in(vms->l2vic, irqmap[VIRT_MMIO] + i)); + + g_free(nodename); + } +} + +static void create_qtimer(HexagonVirtMachineState *vms, + const hexagon_machine_config *m_cfg) +{ + Error **errp = NULL; + QCTQtimerState *qtimer = QCT_QTIMER(qdev_new(TYPE_QCT_QTIMER)); + + object_property_set_uint(OBJECT(qtimer), "nr_frames", 2, errp); + object_property_set_uint(OBJECT(qtimer), "nr_views", 1, errp); + object_property_set_uint(OBJECT(qtimer), "cnttid", 0x111, errp); + sysbus_realize_and_unref(SYS_BUS_DEVICE(qtimer), errp); + + + sysbus_mmio_map(SYS_BUS_DEVICE(qtimer), 1, m_cfg->qtmr_region); + sysbus_connect_irq(SYS_BUS_DEVICE(qtimer), 0, + qdev_get_gpio_in(vms->l2vic, irqmap[VIRT_QTMR0])); + sysbus_connect_irq(SYS_BUS_DEVICE(qtimer), 1, + qdev_get_gpio_in(vms->l2vic, irqmap[VIRT_QTMR1])); +} + +static void virt_instance_init(Object *obj) +{ + HexagonVirtMachineState *vms = HEXAGON_VIRT_MACHINE(obj); + + create_fdt(vms); +} + +void hexagon_load_fdt(const HexagonVirtMachineState *vms) +{ + MachineState *ms = MACHINE(vms); + hwaddr fdt_addr = base_memmap[VIRT_FDT].base; + uint32_t fdtsize = vms->fdt_size; + + /* copy in the device tree */ + rom_add_blob_fixed_as("fdt", ms->fdt, fdtsize, fdt_addr, + &address_space_memory); + qemu_register_reset_nosnapshotload( + qemu_fdt_randomize_seeds, + rom_ptr_for_as(&address_space_memory, fdt_addr, fdtsize)); +} + +static uint64_t load_kernel(const HexagonVirtMachineState *vms) +{ + MachineState *ms = MACHINE(vms); + uint64_t entry = 0; + if (load_elf_ram_sym(ms->kernel_filename, NULL, NULL, NULL, &entry, NULL, + NULL, NULL, 0, EM_HEXAGON, 0, 0, &address_space_memory, + false, NULL) > 0) { + return entry; + } + error_report("error loading '%s'", ms->kernel_filename); + exit(1); +} + +static void do_cpu_reset(void *opaque) +{ + HexagonCPU *cpu = opaque; + CPUState *cs = CPU(cpu); + cpu_reset(cs); +} + +static void virt_init(MachineState *ms) +{ + HexagonVirtMachineState *vms = HEXAGON_VIRT_MACHINE(ms); + Error **errp = NULL; + const hexagon_machine_config *m_cfg = &v68n_1024; + + qemu_fdt_setprop_string(ms->fdt, "/chosen", "bootargs", ms->kernel_cmdline); + + vms->sys = get_system_memory(); + + memory_region_init_ram(&vms->ram, NULL, "ddr.ram", ms->ram_size, errp); + memory_region_add_subregion(vms->sys, 0x0, &vms->ram); + + if (m_cfg->l2tcm_size) { + memory_region_init_ram(&vms->tcm, NULL, "tcm.ram", m_cfg->l2tcm_size, + errp); + memory_region_add_subregion(vms->sys, m_cfg->cfgtable.l2tcm_base << 16, + &vms->tcm); + } + + 
memory_region_init_rom(&vms->cfgtable, NULL, "config_table.rom", + sizeof(m_cfg->cfgtable), errp); + memory_region_add_subregion(vms->sys, m_cfg->cfgbase, &vms->cfgtable); + fdt_add_hvx(vms, m_cfg, errp); + const char *cpu_model = ms->cpu_type; + + if (!cpu_model) { + cpu_model = HEXAGON_CPU_TYPE_NAME("v73"); + } + + HexagonCPU *cpu_0 = NULL; + for (int i = 0; i < ms->smp.cpus; i++) { + HexagonCPU *cpu = HEXAGON_CPU(object_new(ms->cpu_type)); + qemu_register_reset(do_cpu_reset, cpu); + + if (i == 0) { + cpu_0 = cpu; + if (ms->kernel_filename) { + uint64_t entry = load_kernel(vms); + + qdev_prop_set_uint32(DEVICE(cpu_0), "exec-start-addr", entry); + } + } + qdev_prop_set_bit(DEVICE(cpu), "start-powered-off", (i != 0)); + qdev_prop_set_uint32(DEVICE(cpu), "hvx-contexts", + m_cfg->cfgtable.ext_contexts); + qdev_prop_set_uint32(DEVICE(cpu), "config-table-addr", m_cfg->cfgbase); + qdev_prop_set_uint32(DEVICE(cpu), "l2vic-base-addr", m_cfg->l2vic_base); + qdev_prop_set_uint32(DEVICE(cpu), "qtimer-base-addr", m_cfg->qtmr_region); + qdev_prop_set_uint32(DEVICE(cpu), "jtlb-entries", + m_cfg->cfgtable.jtlb_size_entries); + + if (!qdev_realize_and_unref(DEVICE(cpu), NULL, errp)) { + return; + } + } + vms->l2vic = sysbus_create_varargs( + "l2vic", m_cfg->l2vic_base, qdev_get_gpio_in(DEVICE(cpu_0), 0), + qdev_get_gpio_in(DEVICE(cpu_0), 1), qdev_get_gpio_in(DEVICE(cpu_0), 2), + qdev_get_gpio_in(DEVICE(cpu_0), 3), qdev_get_gpio_in(DEVICE(cpu_0), 4), + qdev_get_gpio_in(DEVICE(cpu_0), 5), qdev_get_gpio_in(DEVICE(cpu_0), 6), + qdev_get_gpio_in(DEVICE(cpu_0), 7), NULL); + + fdt_add_hvm_pic_node(vms, m_cfg); + fdt_add_virtio_devices(vms); + fdt_add_cpu_nodes(vms); + fdt_add_clocks(vms); + fdt_add_uart(vms, VIRT_UART0); + fdt_add_gpt_node(vms); + create_qtimer(vms, m_cfg); + + rom_add_blob_fixed_as("config_table.rom", &m_cfg->cfgtable, + sizeof(m_cfg->cfgtable), m_cfg->cfgbase, + &address_space_memory); + + + hexagon_load_fdt(vms); +} + + +static void virt_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + + mc->init = virt_init; + mc->default_cpu_type = HEXAGON_CPU_TYPE_NAME("v73"); + mc->default_ram_size = 4 * GiB; + mc->max_cpus = 8; + mc->default_cpus = 8; + mc->is_default = false; + mc->default_kernel_irqchip_split = false; + mc->block_default_type = IF_VIRTIO; + mc->default_boot_order = NULL; + mc->no_cdrom = 1; + mc->numa_mem_supported = false; + mc->default_nic = "virtio-mmio-bus"; +} + + +static const TypeInfo virt_machine_types[] = { { + .name = TYPE_HEXAGON_VIRT_MACHINE, + .parent = TYPE_MACHINE, + .instance_size = sizeof(HexagonVirtMachineState), + .class_init = virt_class_init, + .instance_init = virt_instance_init, +} }; + +DEFINE_TYPES(virt_machine_types) diff --git a/hw/intc/Kconfig b/hw/intc/Kconfig index 7547528f2c27..a5b136e2fa72 100644 --- a/hw/intc/Kconfig +++ b/hw/intc/Kconfig @@ -8,6 +8,9 @@ config I8259 config PL190 bool +config L2VIC + bool + config IOAPIC bool select I8259 diff --git a/hw/intc/l2vic.c b/hw/intc/l2vic.c new file mode 100644 index 000000000000..1c450179dd6d --- /dev/null +++ b/hw/intc/l2vic.c @@ -0,0 +1,417 @@ +/* + * QEMU L2VIC Interrupt Controller + * + * Arm PrimeCell PL190 Vector Interrupt Controller was used as a reference. + * Copyright(c) 2020-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/irq.h"
+#include "hw/sysbus.h"
+#include "migration/vmstate.h"
+#include "qemu/log.h"
+#include "qemu/module.h"
+#include "hw/intc/l2vic.h"
+#include "trace.h"
+
+#define L2VICA(s, n) (s[(n) >> 2])
+
+#define TYPE_L2VIC "l2vic"
+OBJECT_DECLARE_SIMPLE_TYPE(L2VICState, L2VIC)
+
+#define SLICE_MAX (L2VIC_INTERRUPT_MAX / 32)
+
+typedef struct L2VICState {
+    SysBusDevice parent_obj;
+
+    QemuMutex active;
+    MemoryRegion iomem;
+    MemoryRegion fast_iomem;
+    uint32_t level;
+    /*
+     * offset 0: vid group 0, etc.; 10 bits in each group
+     * are used:
+     */
+    uint32_t vid_group[4];
+    uint32_t vid0;
+    /* Clear Status of Active Edge interrupt, not used: */
+    uint32_t int_clear[SLICE_MAX] QEMU_ALIGNED(16);
+    /* Enable interrupt source */
+    uint32_t int_enable[SLICE_MAX] QEMU_ALIGNED(16);
+    /* Clear (set to 0) corresponding bit in int_enable */
+    uint32_t int_enable_clear;
+    /* Set (to 1) corresponding bit in int_enable */
+    uint32_t int_enable_set;
+    /* Present for debugging, not used */
+    uint32_t int_pending[SLICE_MAX] QEMU_ALIGNED(16);
+    /* Generate an interrupt */
+    uint32_t int_soft;
+    /* Which enabled interrupt is active */
+    uint32_t int_status[SLICE_MAX] QEMU_ALIGNED(16);
+    /* Edge or Level interrupt */
+    uint32_t int_type[SLICE_MAX] QEMU_ALIGNED(16);
+    /* L2 interrupt group 0-3 0x600-0x7FF */
+    uint32_t int_group_n0[SLICE_MAX] QEMU_ALIGNED(16);
+    uint32_t int_group_n1[SLICE_MAX] QEMU_ALIGNED(16);
+    uint32_t int_group_n2[SLICE_MAX] QEMU_ALIGNED(16);
+    uint32_t int_group_n3[SLICE_MAX] QEMU_ALIGNED(16);
+    qemu_irq irq[8];
+} L2VICState;
+
+
+/*
+ * Find out if this irq is associated with a group other than
+ * the default group
+ */
+static uint32_t *get_int_group(L2VICState *s, int irq)
+{
+    int n = irq & 0x1f;
+    if (n < 8) {
+        return s->int_group_n0;
+    }
+    if (n < 16) {
+        return s->int_group_n1;
+    }
+    if (n < 24) {
+        return s->int_group_n2;
+    }
+    return s->int_group_n3;
+}
+
+static int find_slice(int irq)
+{
+    return irq / 32;
+}
+
+static int get_vid(L2VICState *s, int irq)
+{
+    uint32_t *group = get_int_group(s, irq);
+    uint32_t slice = group[find_slice(irq)];
+    /* Mask with 0x7 to remove the GRP:EN bit */
+    uint32_t val = slice >> ((irq & 0x7) * 4);
+    if (val & 0x8) {
+        return val & 0x7;
+    } else {
+        return 0;
+    }
+}
+
+static inline bool vid_active(L2VICState *s)
+{
+    /* scan all 1024 bits in the int_status array */
+    const int size = sizeof(s->int_status) * CHAR_BIT;
+    const int active_irq = find_first_bit((unsigned long *)s->int_status, size);
+    return active_irq != size;
+}
+
+static bool l2vic_update(L2VICState *s, int irq)
+{
+    if (vid_active(s)) {
+        return true;
+    }
+
+    bool pending = test_bit(irq, (unsigned long *)s->int_pending);
+    bool enable = test_bit(irq, (unsigned long *)s->int_enable);
+    if (pending && enable) {
+        int vid = get_vid(s, irq);
+        set_bit(irq, (unsigned long *)s->int_status);
+        clear_bit(irq, (unsigned long *)s->int_pending);
+        clear_bit(irq, (unsigned long *)s->int_enable);
+        /* ensure the irq line goes low after going high */
+        s->vid0 = irq;
+        s->vid_group[get_vid(s, irq)] = irq;
+
+        /* already low: now call pulse */
+        /* pulse: calls qemu_irq_raise() and then qemu_irq_lower() */
+        qemu_irq_pulse(s->irq[vid + 2]);
+        trace_l2vic_delivered(irq, vid);
+        return true;
+    }
+    return false;
+}
+
+static void l2vic_update_all(L2VICState *s)
+{
+    for (int i = 0; i < L2VIC_INTERRUPT_MAX; i++) {
+        if (l2vic_update(s, i) == true) {
+            /* once vid is active, no-one else can set it until ciad */
+            return;
+        }
+    }
+}
+
+static void l2vic_set_irq(void *opaque, int irq, int level)
+{
+    L2VICState *s = (L2VICState *)opaque;
+    if (level) {
+        qemu_mutex_lock(&s->active);
+        set_bit(irq, (unsigned long *)s->int_pending);
+        qemu_mutex_unlock(&s->active);
+    }
+    l2vic_update(s, irq);
+}
+
+static void l2vic_write(void *opaque, hwaddr offset, uint64_t val,
+                        unsigned size)
+{
+    L2VICState *s = (L2VICState *)opaque;
+    qemu_mutex_lock(&s->active);
+    trace_l2vic_reg_write((unsigned)offset, (uint32_t)val);
+
+    if (offset == L2VIC_VID_0) {
+        if ((int)val != L2VIC_CIAD_INSTRUCTION) {
+            s->vid0 = val;
+        } else {
+            /* ciad issued: clear int_status */
+            clear_bit(s->vid0, (unsigned long *)s->int_status);
+        }
+    } else if (offset >= L2VIC_INT_ENABLEn &&
+               offset < (L2VIC_INT_ENABLE_CLEARn)) {
+        L2VICA(s->int_enable, offset - L2VIC_INT_ENABLEn) = val;
+    } else if (offset >= L2VIC_INT_ENABLE_CLEARn &&
+               offset < L2VIC_INT_ENABLE_SETn) {
+        L2VICA(s->int_enable, offset - L2VIC_INT_ENABLE_CLEARn) &= ~val;
+    } else if (offset >= L2VIC_INT_ENABLE_SETn && offset < L2VIC_INT_TYPEn) {
+        L2VICA(s->int_enable, offset - L2VIC_INT_ENABLE_SETn) |= val;
+    } else if (offset >= L2VIC_INT_TYPEn && offset < L2VIC_INT_TYPEn + 0x80) {
+        L2VICA(s->int_type, offset - L2VIC_INT_TYPEn) = val;
+    } else if (offset >= L2VIC_INT_STATUSn && offset < L2VIC_INT_CLEARn) {
+        L2VICA(s->int_status, offset - L2VIC_INT_STATUSn) = val;
+    } else if (offset >= L2VIC_INT_CLEARn && offset < L2VIC_SOFT_INTn) {
+        L2VICA(s->int_clear, offset - L2VIC_INT_CLEARn) = val;
+    } else if (offset >= L2VIC_INT_PENDINGn &&
+               offset < L2VIC_INT_PENDINGn + 0x80) {
+        L2VICA(s->int_pending, offset - L2VIC_INT_PENDINGn) = val;
+    } else if (offset >= L2VIC_SOFT_INTn && offset < L2VIC_INT_PENDINGn) {
+        L2VICA(s->int_enable, offset - L2VIC_SOFT_INTn) |= val;
+        /*
+         * Need to reverse engineer the actual irq number.
+ */ + int irq = find_first_bit((unsigned long *)&val, + sizeof(s->int_enable[0]) * CHAR_BIT); + hwaddr byteoffset = offset - L2VIC_SOFT_INTn; + g_assert(irq != sizeof(s->int_enable[0]) * CHAR_BIT); + irq += byteoffset * 8; + + /* The soft-int interface only works with edge-triggered interrupts */ + if (test_bit(irq, (unsigned long *)s->int_type)) { + qemu_mutex_unlock(&s->active); + l2vic_set_irq(opaque, irq, 1); + qemu_mutex_lock(&s->active); + } + } else if (offset >= L2VIC_INT_GRPn_0 && offset < L2VIC_INT_GRPn_1) { + L2VICA(s->int_group_n0, offset - L2VIC_INT_GRPn_0) = val; + } else if (offset >= L2VIC_INT_GRPn_1 && offset < L2VIC_INT_GRPn_2) { + L2VICA(s->int_group_n1, offset - L2VIC_INT_GRPn_1) = val; + } else if (offset >= L2VIC_INT_GRPn_2 && offset < L2VIC_INT_GRPn_3) { + L2VICA(s->int_group_n2, offset - L2VIC_INT_GRPn_2) = val; + } else if (offset >= L2VIC_INT_GRPn_3 && offset < L2VIC_INT_GRPn_3 + 0x80) { + L2VICA(s->int_group_n3, offset - L2VIC_INT_GRPn_3) = val; + } else { + qemu_log_mask(LOG_UNIMP, "%s: offset %x unimplemented\n", __func__, + (int)offset); + } + l2vic_update_all(s); + qemu_mutex_unlock(&s->active); + return; +} + +static uint64_t l2vic_read(void *opaque, hwaddr offset, unsigned size) +{ + uint64_t value; + L2VICState *s = (L2VICState *)opaque; + qemu_mutex_lock(&s->active); + + if (offset == L2VIC_VID_GRP_0) { + value = s->vid_group[0]; + } else if (offset == L2VIC_VID_GRP_1) { + value = s->vid_group[1]; + } else if (offset == L2VIC_VID_GRP_2) { + value = s->vid_group[2]; + } else if (offset == L2VIC_VID_GRP_3) { + value = s->vid_group[3]; + } else if (offset == L2VIC_VID_0) { + value = s->vid0; + } else if (offset >= L2VIC_INT_ENABLEn && + offset < L2VIC_INT_ENABLE_CLEARn) { + value = L2VICA(s->int_enable, offset - L2VIC_INT_ENABLEn); + } else if (offset >= L2VIC_INT_ENABLE_CLEARn && + offset < L2VIC_INT_ENABLE_SETn) { + value = 0; + } else if (offset >= L2VIC_INT_ENABLE_SETn && offset < L2VIC_INT_TYPEn) { + value = 0; + } else if (offset >= L2VIC_INT_TYPEn && offset < L2VIC_INT_TYPEn + 0x80) { + value = L2VICA(s->int_type, offset - L2VIC_INT_TYPEn); + } else if (offset >= L2VIC_INT_STATUSn && offset < L2VIC_INT_CLEARn) { + value = L2VICA(s->int_status, offset - L2VIC_INT_STATUSn); + } else if (offset >= L2VIC_INT_CLEARn && offset < L2VIC_SOFT_INTn) { + value = L2VICA(s->int_clear, offset - L2VIC_INT_CLEARn); + } else if (offset >= L2VIC_SOFT_INTn && offset < L2VIC_INT_PENDINGn) { + value = 0; + } else if (offset >= L2VIC_INT_PENDINGn && + offset < L2VIC_INT_PENDINGn + 0x80) { + value = L2VICA(s->int_pending, offset - L2VIC_INT_PENDINGn); + } else if (offset >= L2VIC_INT_GRPn_0 && offset < L2VIC_INT_GRPn_1) { + value = L2VICA(s->int_group_n0, offset - L2VIC_INT_GRPn_0); + } else if (offset >= L2VIC_INT_GRPn_1 && offset < L2VIC_INT_GRPn_2) { + value = L2VICA(s->int_group_n1, offset - L2VIC_INT_GRPn_1); + } else if (offset >= L2VIC_INT_GRPn_2 && offset < L2VIC_INT_GRPn_3) { + value = L2VICA(s->int_group_n2, offset - L2VIC_INT_GRPn_2); + } else if (offset >= L2VIC_INT_GRPn_3 && offset < L2VIC_INT_GRPn_3 + 0x80) { + value = L2VICA(s->int_group_n3, offset - L2VIC_INT_GRPn_3); + } else { + value = 0; + qemu_log_mask(LOG_GUEST_ERROR, "L2VIC: %s: offset 0x%x\n", __func__, + (int)offset); + } + + trace_l2vic_reg_read((unsigned)offset, value); + qemu_mutex_unlock(&s->active); + + return value; +} + +static const MemoryRegionOps l2vic_ops = { + .read = l2vic_read, + .write = l2vic_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .valid.min_access_size = 4, + 
+    .valid.max_access_size = 4,
+    .valid.unaligned = false,
+};
+
+#define FASTL2VIC_ENABLE 0x0
+#define FASTL2VIC_DISABLE 0x1
+#define FASTL2VIC_INT 0x2
+
+static void fastl2vic_write(void *opaque, hwaddr offset, uint64_t val,
+                            unsigned size)
+{
+    if (offset == 0) {
+        uint32_t cmd = (val >> 16) & 0x3;
+        uint32_t irq = val & 0x3ff;
+        uint32_t slice = (irq / 32) * 4;
+        val = 1 << (irq % 32);
+
+        if (cmd == FASTL2VIC_ENABLE) {
+            l2vic_write(opaque, L2VIC_INT_ENABLE_SETn + slice, val, size);
+            return;
+        } else if (cmd == FASTL2VIC_DISABLE) {
+            l2vic_write(opaque, L2VIC_INT_ENABLE_CLEARn + slice, val, size);
+            return;
+        } else if (cmd == FASTL2VIC_INT) {
+            l2vic_write(opaque, L2VIC_SOFT_INTn + slice, val, size);
+            return;
+        }
+        qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid write cmd %" PRIu32 "\n",
+                      __func__, cmd);
+        return;
+    }
+    qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid write offset 0x%08" HWADDR_PRIx
+                  "\n", __func__, offset);
+}
+
+static const MemoryRegionOps fastl2vic_ops = {
+    .write = fastl2vic_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .valid.min_access_size = 4,
+    .valid.max_access_size = 4,
+    .valid.unaligned = false,
+};
+
+static void l2vic_reset_hold(Object *obj, G_GNUC_UNUSED ResetType res_type)
+{
+    L2VICState *s = L2VIC(obj);
+    memset(s->int_clear, 0, sizeof(s->int_clear));
+    memset(s->int_enable, 0, sizeof(s->int_enable));
+    memset(s->int_pending, 0, sizeof(s->int_pending));
+    memset(s->int_status, 0, sizeof(s->int_status));
+    memset(s->int_type, 0, sizeof(s->int_type));
+    memset(s->int_group_n0, 0, sizeof(s->int_group_n0));
+    memset(s->int_group_n1, 0, sizeof(s->int_group_n1));
+    memset(s->int_group_n2, 0, sizeof(s->int_group_n2));
+    memset(s->int_group_n3, 0, sizeof(s->int_group_n3));
+    s->int_soft = 0;
+    s->vid0 = 0;
+
+    l2vic_update_all(s);
+}
+
+
+static void reset_irq_handler(void *opaque, int irq, int level)
+{
+    L2VICState *s = (L2VICState *)opaque;
+    Object *obj = OBJECT(opaque);
+    if (level) {
+        l2vic_reset_hold(obj, RESET_TYPE_COLD);
+    }
+    l2vic_update_all(s);
+}
+
+static void l2vic_init(Object *obj)
+{
+    DeviceState *dev = DEVICE(obj);
+    L2VICState *s = L2VIC(obj);
+    SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
+    int i;
+
+    memory_region_init_io(&s->iomem, obj, &l2vic_ops, s, "l2vic", 0x1000);
+    sysbus_init_mmio(sbd, &s->iomem);
+    memory_region_init_io(&s->fast_iomem, obj, &fastl2vic_ops, s, "fast",
+                          0x10000);
+    sysbus_init_mmio(sbd, &s->fast_iomem);
+
+    qdev_init_gpio_in(dev, l2vic_set_irq, L2VIC_INTERRUPT_MAX);
+    qdev_init_gpio_in_named(dev, reset_irq_handler, "reset", 1);
+    for (i = 0; i < 8; i++) {
+        sysbus_init_irq(sbd, &s->irq[i]);
+    }
+    qemu_mutex_init(&s->active); /* TODO: remove this; it is an experiment */
+}
+
+static const VMStateDescription vmstate_l2vic = {
+    .name = "l2vic",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields =
+        (VMStateField[]){
+            VMSTATE_UINT32(level, L2VICState),
+            VMSTATE_UINT32_ARRAY(vid_group, L2VICState, 4),
+            VMSTATE_UINT32(vid0, L2VICState),
+            VMSTATE_UINT32_ARRAY(int_enable, L2VICState, SLICE_MAX),
+            VMSTATE_UINT32(int_enable_clear, L2VICState),
+            VMSTATE_UINT32(int_enable_set, L2VICState),
+            VMSTATE_UINT32_ARRAY(int_type, L2VICState, SLICE_MAX),
+            VMSTATE_UINT32_ARRAY(int_status, L2VICState, SLICE_MAX),
+            VMSTATE_UINT32_ARRAY(int_clear, L2VICState, SLICE_MAX),
+            VMSTATE_UINT32(int_soft, L2VICState),
+            VMSTATE_UINT32_ARRAY(int_pending, L2VICState, SLICE_MAX),
+            VMSTATE_UINT32_ARRAY(int_group_n0, L2VICState, SLICE_MAX),
+            VMSTATE_UINT32_ARRAY(int_group_n1, L2VICState, SLICE_MAX),
+            VMSTATE_UINT32_ARRAY(int_group_n2, L2VICState, SLICE_MAX),
VMSTATE_UINT32_ARRAY(int_group_n3, L2VICState, SLICE_MAX), + VMSTATE_END_OF_LIST() } +}; + +static void l2vic_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + ResettableClass *rc = RESETTABLE_CLASS(klass); + + dc->vmsd = &vmstate_l2vic; + rc->phases.hold = l2vic_reset_hold; +} + +static const TypeInfo l2vic_info = { + .name = TYPE_L2VIC, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(L2VICState), + .instance_init = l2vic_init, + .class_init = l2vic_class_init, +}; + +static void l2vic_register_types(void) +{ + type_register_static(&l2vic_info); +} + +type_init(l2vic_register_types) diff --git a/hw/intc/meson.build b/hw/intc/meson.build index 602da304b02d..35f4a7bad5ef 100644 --- a/hw/intc/meson.build +++ b/hw/intc/meson.build @@ -67,6 +67,8 @@ specific_ss.add(when: 'CONFIG_PSERIES', if_true: files('xics_spapr.c', 'spapr_xi specific_ss.add(when: 'CONFIG_XIVE', if_true: files('xive.c')) specific_ss.add(when: ['CONFIG_KVM', 'CONFIG_XIVE'], if_true: files('spapr_xive_kvm.c')) + +specific_ss.add(when: 'CONFIG_L2VIC', if_true: files('l2vic.c')) specific_ss.add(when: 'CONFIG_M68K_IRQC', if_true: files('m68k_irqc.c')) specific_ss.add(when: 'CONFIG_LOONGSON_IPI_COMMON', if_true: files('loongson_ipi_common.c')) specific_ss.add(when: 'CONFIG_LOONGSON_IPI', if_true: files('loongson_ipi.c')) diff --git a/hw/intc/trace-events b/hw/intc/trace-events index 3dcf14719833..bc66260fc0cb 100644 --- a/hw/intc/trace-events +++ b/hw/intc/trace-events @@ -303,6 +303,10 @@ sh_intc_register(const char *s, int id, unsigned short v, int c, int m) "%s %u - sh_intc_read(unsigned size, uint64_t offset, unsigned long val) "size %u 0x%" PRIx64 " -> 0x%lx" sh_intc_write(unsigned size, uint64_t offset, unsigned long val) "size %u 0x%" PRIx64 " <- 0x%lx" sh_intc_set(int id, int enable) "setting interrupt group %d to %d" +# l2vic.c +l2vic_reg_write(unsigned int addr, uint32_t value) "addr: 0x%03x value: 0x%08"PRIx32 +l2vic_reg_read(unsigned int addr, uint32_t value) "addr: 0x%03x value: 0x%08"PRIx32 +l2vic_delivered(int irq, int vid) "l2vic: delivered %d (vid %d)" # loongson_ipi.c loongson_ipi_read(unsigned size, uint64_t addr, uint64_t val) "size: %u addr: 0x%"PRIx64 "val: 0x%"PRIx64 diff --git a/hw/meson.build b/hw/meson.build index b91f761fe08a..6aaf469f95e4 100644 --- a/hw/meson.build +++ b/hw/meson.build @@ -66,3 +66,4 @@ subdir('sparc') subdir('sparc64') subdir('tricore') subdir('xtensa') +subdir('hexagon') diff --git a/hw/timer/meson.build b/hw/timer/meson.build index f5f9eed2d0a9..6c30bf602226 100644 --- a/hw/timer/meson.build +++ b/hw/timer/meson.build @@ -34,3 +34,5 @@ specific_ss.add(when: 'CONFIG_IBEX', if_true: files('ibex_timer.c')) system_ss.add(when: 'CONFIG_SIFIVE_PWM', if_true: files('sifive_pwm.c')) specific_ss.add(when: 'CONFIG_AVR_TIMER16', if_true: files('avr_timer16.c')) + +specific_ss.add(when: 'CONFIG_HEX_DSP', if_true: files('qct-qtimer.c')) diff --git a/hw/timer/qct-qtimer.c b/hw/timer/qct-qtimer.c new file mode 100644 index 000000000000..413f7249eef0 --- /dev/null +++ b/hw/timer/qct-qtimer.c @@ -0,0 +1,519 @@ +/* + * Qualcomm QCT QTimer + * + * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+
+#include "qemu/osdep.h"
+#include "hw/irq.h"
+#include "hw/qdev-properties.h"
+#include "hw/timer/qct-qtimer.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+#include "qemu/log.h"
+#include "qemu/module.h"
+#include "qemu/timer.h"
+
+/* Common timer implementation. */
+
+#define QTIMER_MEM_SIZE_BYTES 0x1000
+#define QTIMER_MEM_REGION_SIZE_BYTES 0x1000
+#define QTIMER_DEFAULT_FREQ_HZ 19200000ULL
+#define QTMR_TIMER_INDEX_MASK (0xf000)
+#define HIGH_32(val) (0x0ffffffffULL & ((val) >> 32))
+#define LOW_32(val) (0x0ffffffffULL & (val))
+
+/*
+ * QTimer version reg:
+ *
+ *  3                   2                   1
+ *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |     Major     |     Minor     |             Step              |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+static unsigned int TIMER_VERSION = 0x20020000;
+
+/*
+ * qct_qtimer_read/write:
+ * if offset < 0x1000 read restricted registers:
+ * QCT_QTIMER_AC_CNTFREQ/CNTSR/CNTTID/CNTACR/CNTOFF_(LO/HI)/QCT_QTIMER_VERSION
+ */
+static uint64_t qct_qtimer_read(void *opaque, hwaddr offset, unsigned size)
+{
+ QCTQtimerState *s = (QCTQtimerState *)opaque;
+ uint32_t frame = 0;
+
+ switch (offset) {
+ case QCT_QTIMER_AC_CNTFRQ:
+ return s->freq;
+ case QCT_QTIMER_AC_CNTSR:
+ return s->secure;
+ case QCT_QTIMER_AC_CNTTID:
+ return s->cnttid;
+ case QCT_QTIMER_AC_CNTACR_START ... QCT_QTIMER_AC_CNTACR_END:
+ frame = (offset - QCT_QTIMER_AC_CNTACR_START) / 0x4;
+ if (frame >= s->nr_frames) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: QCT_QTIMER_AC_CNT: Bad offset %x\n", __func__,
+ (int)offset);
+ return 0x0;
+ }
+ return s->timer[frame].cnt_ctrl;
+ case QCT_QTIMER_VERSION:
+ return TIMER_VERSION;
+ default:
+ qemu_log_mask(LOG_GUEST_ERROR, "%s: QCT_QTIMER_AC_CNT: Bad offset %x\n",
+ __func__, (int)offset);
+ return 0x0;
+ }
+}
+
+static void qct_qtimer_write(void *opaque, hwaddr offset, uint64_t value,
+ unsigned size)
+{
+ QCTQtimerState *s = (QCTQtimerState *)opaque;
+ uint32_t frame = 0;
+
+ if (offset < 0x1000) {
+ switch (offset) {
+ case QCT_QTIMER_AC_CNTFRQ:
+ s->freq = value;
+ return;
+ case QCT_QTIMER_AC_CNTSR:
+ if (value > 0xFF) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: QCT_QTIMER_AC_CNTSR: Bad value %x\n",
+ __func__, (int)value);
+ } else {
+ s->secure = value;
+ }
+ return;
+ case QCT_QTIMER_AC_CNTACR_START ... 
QCT_QTIMER_AC_CNTACR_END:
+ frame = (offset - QCT_QTIMER_AC_CNTACR_START) / 0x4;
+ if (frame >= s->nr_frames) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: QCT_QTIMER_AC_CNT: Bad offset %x\n",
+ __func__, (int)offset);
+ return;
+ }
+ s->timer[frame].cnt_ctrl = value;
+ return;
+ default:
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: QCT_QTIMER_AC_CNT: Bad offset %x\n", __func__,
+ (int)offset);
+ return;
+ }
+ } else {
+ qemu_log_mask(LOG_GUEST_ERROR, "%s: Bad offset %x\n", __func__,
+ (int)offset);
+ }
+}
+
+static const MemoryRegionOps qct_qtimer_ops = {
+ .read = qct_qtimer_read,
+ .write = qct_qtimer_write,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static const VMStateDescription vmstate_qct_qtimer = {
+ .name = "qct-qtimer",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]){ VMSTATE_END_OF_LIST() }
+};
+
+static void qct_qtimer_init(Object *obj)
+{
+ QCTQtimerState *s = QCT_QTIMER(obj);
+
+ object_property_add_uint32_ptr(obj, "secure", &s->secure,
+ OBJ_PROP_FLAG_READ);
+ object_property_add_uint32_ptr(obj, "frame_id", &s->frame_id,
+ OBJ_PROP_FLAG_READ);
+}
+
+static void hex_timer_update(QCTHextimerState *s)
+{
+ /* Update interrupts. */
+ int level = s->int_level && (s->control & QCT_QTIMER_CNTP_CTL_ENABLE);
+ qemu_set_irq(s->irq, level);
+}
+
+static MemTxResult hex_timer_read(void *opaque, hwaddr offset, uint64_t *data,
+ unsigned size, MemTxAttrs attrs)
+{
+ QCTQtimerState *qct_s = (QCTQtimerState *)opaque;
+ uint32_t slot_nr = (offset & 0xF000) >> 12;
+ uint32_t reg_offset = offset & 0xFFF;
+ uint32_t view = slot_nr % qct_s->nr_views;
+ uint32_t frame = slot_nr / qct_s->nr_views;
+
+ if (frame >= qct_s->nr_frames) {
+ *data = 0;
+ return MEMTX_ACCESS_ERROR;
+ }
+ QCTHextimerState *s = &qct_s->timer[frame];
+
+ /*
+ * This is the case where we have 2 views, but the second one is not
+ * implemented. 
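+ *
+ * (Editorial aside.) Worked decode, assuming nr_views == 2: each slot is
+ * a 4 KiB page, so slot_nr = offset >> 12, view = slot_nr % 2 and
+ * frame = slot_nr / 2; offset 0x3010 is slot 3, i.e. frame 1, view 1,
+ * reg_offset 0x10 (QCT_QTIMER_CNT_FREQ). A second view is implemented
+ * only when bit 2 of the frame's cnttid nibble,
+ * (cnttid >> (frame * 4)) & 0x4, is set.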
+ */ + if (view && !(qct_s->cnttid & (0x4 << (frame * 4)))) { + *data = 0; + return MEMTX_OK; + } + + switch (reg_offset) { + case (QCT_QTIMER_CNT_FREQ): /* Ticks/Second */ + if (!(s->cnt_ctrl & QCT_QTIMER_AC_CNTACR_RFRQ)) { + return MEMTX_ACCESS_ERROR; + } + + if (view && !((s->cntpl0acr & QCT_QTIMER_CNTPL0ACR_PL0PCTEN) || + (s->cntpl0acr & QCT_QTIMER_CNTPL0ACR_PL0VCTEN))) { + return MEMTX_ACCESS_ERROR; + } + + *data = s->freq; + return MEMTX_OK; + case (QCT_QTIMER_CNTP_CVAL_LO): /* TimerLoad */ + if (!(s->cnt_ctrl & QCT_QTIMER_AC_CNTACR_RWPT)) { + return MEMTX_ACCESS_ERROR; + } + + if (view && !(s->cntpl0acr & QCT_QTIMER_CNTPL0ACR_PL0CTEN)) { + return MEMTX_ACCESS_ERROR; + } + + *data = LOW_32((s->cntval)); + return MEMTX_OK; + case (QCT_QTIMER_CNTP_CVAL_HI): /* TimerLoad */ + if (!(s->cnt_ctrl & QCT_QTIMER_AC_CNTACR_RWPT)) { + return MEMTX_ACCESS_ERROR; + } + + if (view && !(s->cntpl0acr & QCT_QTIMER_CNTPL0ACR_PL0CTEN)) { + return MEMTX_ACCESS_ERROR; + } + + *data = HIGH_32((s->cntval)); + return MEMTX_OK; + case QCT_QTIMER_CNTPCT_LO: + if (!(s->cnt_ctrl & QCT_QTIMER_AC_CNTACR_RPCT)) { + return MEMTX_ACCESS_ERROR; + } + + if (view && !(s->cntpl0acr & QCT_QTIMER_CNTPL0ACR_PL0PCTEN)) { + return MEMTX_ACCESS_ERROR; + } + + *data = LOW_32((s->cntpct + (ptimer_get_count(s->timer)))); + return MEMTX_OK; + case QCT_QTIMER_CNTPCT_HI: + if (!(s->cnt_ctrl & QCT_QTIMER_AC_CNTACR_RPCT)) { + return MEMTX_ACCESS_ERROR; + } + + if (view && !(s->cntpl0acr & QCT_QTIMER_CNTPL0ACR_PL0PCTEN)) { + return MEMTX_ACCESS_ERROR; + } + + *data = HIGH_32((s->cntpct + (ptimer_get_count(s->timer)))); + return MEMTX_OK; + case (QCT_QTIMER_CNTP_TVAL): /* CVAL - CNTP */ + if (!(s->cnt_ctrl & QCT_QTIMER_AC_CNTACR_RWPT)) { + return MEMTX_ACCESS_ERROR; + } + + if (view && !(s->cntpl0acr & QCT_QTIMER_CNTPL0ACR_PL0CTEN)) { + return MEMTX_ACCESS_ERROR; + } + + *data = + (s->cntval - (HIGH_32((s->cntpct + (ptimer_get_count(s->timer)))) + + LOW_32((s->cntpct + (ptimer_get_count(s->timer)))))); + return MEMTX_OK; + case (QCT_QTIMER_CNTP_CTL): /* TimerMIS */ + if (!(s->cnt_ctrl & QCT_QTIMER_AC_CNTACR_RWPT)) { + return MEMTX_ACCESS_ERROR; + } + + if (view && !(s->cntpl0acr & QCT_QTIMER_CNTPL0ACR_PL0CTEN)) { + return MEMTX_ACCESS_ERROR; + } + + *data = s->int_level; + return MEMTX_OK; + case QCT_QTIMER_CNTPL0ACR: + if (view) { + *data = 0; + } else { + *data = s->cntpl0acr; + } + return MEMTX_OK; + + case QCT_QTIMER_VERSION: + *data = TIMER_VERSION; + return MEMTX_OK; + + default: + qemu_log_mask(LOG_GUEST_ERROR, "%s: Bad offset %x\n", __func__, + (int)offset); + *data = 0; + return MEMTX_ACCESS_ERROR; + } +} + +/* + * Reset the timer limit after settings have changed. + * May only be called from inside a ptimer transaction block. + */ +static void hex_timer_recalibrate(QCTHextimerState *s, int reload) +{ + uint64_t limit; + /* Periodic. */ + limit = s->limit; + ptimer_set_limit(s->timer, limit, reload); +} + +static MemTxResult hex_timer_write(void *opaque, hwaddr offset, uint64_t value, + unsigned size, MemTxAttrs attrs) +{ + QCTQtimerState *qct_s = (QCTQtimerState *)opaque; + uint32_t slot_nr = (offset & 0xF000) >> 12; + uint32_t reg_offset = offset & 0xFFF; + uint32_t view = slot_nr % qct_s->nr_views; + uint32_t frame = slot_nr / qct_s->nr_views; + + if (frame >= qct_s->nr_frames) { + return MEMTX_ACCESS_ERROR; + } + QCTHextimerState *s = &qct_s->timer[frame]; + + /* + * This is the case where we have 2 views, but the second one is not + * implemented. 
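+ *
+ * (Editorial aside, illustrative guest code.) CNTPCT is exposed as two
+ * 32-bit halves, so a guest would use the usual hi/lo/hi retry loop to
+ * avoid torn reads (readl() standing in for its 32-bit MMIO load):
+ *
+ *     uint32_t hi, lo;
+ *     do {
+ *         hi = readl(frame_base + QCT_QTIMER_CNTPCT_HI);
+ *         lo = readl(frame_base + QCT_QTIMER_CNTPCT_LO);
+ *     } while (hi != readl(frame_base + QCT_QTIMER_CNTPCT_HI));
+ *     uint64_t now = ((uint64_t)hi << 32) | lo;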
+ */ + if (view && !(qct_s->cnttid & (0x4 << (frame * 4)))) { + return MEMTX_OK; + } + + switch (reg_offset) { + case (QCT_QTIMER_CNTP_CVAL_LO): /* TimerLoad */ + if (!(s->cnt_ctrl & QCT_QTIMER_AC_CNTACR_RWPT)) { + return MEMTX_ACCESS_ERROR; + } + + if (view && !(s->cntpl0acr & QCT_QTIMER_CNTPL0ACR_PL0CTEN)) { + return MEMTX_ACCESS_ERROR; + } + + + s->int_level = 0; + s->cntval = value; + ptimer_transaction_begin(s->timer); + if (s->control & QCT_QTIMER_CNTP_CTL_ENABLE) { + /* + * Pause the timer if it is running. This may cause some + * inaccuracy due to rounding, but avoids other issues. + */ + ptimer_stop(s->timer); + } + hex_timer_recalibrate(s, 1); + if (s->control & QCT_QTIMER_CNTP_CTL_ENABLE) { + ptimer_run(s->timer, 0); + } + ptimer_transaction_commit(s->timer); + break; + case (QCT_QTIMER_CNTP_CVAL_HI): + if (!(s->cnt_ctrl & QCT_QTIMER_AC_CNTACR_RWPT)) { + return MEMTX_ACCESS_ERROR; + } + + if (view && !(s->cntpl0acr & QCT_QTIMER_CNTPL0ACR_PL0CTEN)) { + return MEMTX_ACCESS_ERROR; + } + + break; + case (QCT_QTIMER_CNTP_CTL): /* Timer control register */ + if (!(s->cnt_ctrl & QCT_QTIMER_AC_CNTACR_RWPT)) { + return MEMTX_ACCESS_ERROR; + } + + if (view && !(s->cntpl0acr & QCT_QTIMER_CNTPL0ACR_PL0CTEN)) { + return MEMTX_ACCESS_ERROR; + } + + ptimer_transaction_begin(s->timer); + if (s->control & QCT_QTIMER_CNTP_CTL_ENABLE) { + /* + * Pause the timer if it is running. This may cause some + * inaccuracy due to rounding, but avoids other issues. + */ + ptimer_stop(s->timer); + } + s->control = value; + hex_timer_recalibrate(s, s->control & QCT_QTIMER_CNTP_CTL_ENABLE); + ptimer_set_freq(s->timer, s->freq); + ptimer_set_period(s->timer, 1); + if (s->control & QCT_QTIMER_CNTP_CTL_ENABLE) { + ptimer_run(s->timer, 0); + } + ptimer_transaction_commit(s->timer); + break; + case (QCT_QTIMER_CNTP_TVAL): /* CVAL - CNTP */ + if (!(s->cnt_ctrl & QCT_QTIMER_AC_CNTACR_RWPT)) { + return MEMTX_ACCESS_ERROR; + } + + if (view && !(s->cntpl0acr & QCT_QTIMER_CNTPL0ACR_PL0CTEN)) { + return MEMTX_ACCESS_ERROR; + } + + ptimer_transaction_begin(s->timer); + if (s->control & QCT_QTIMER_CNTP_CTL_ENABLE) { + /* + * Pause the timer if it is running. This may cause some + * inaccuracy due to rounding, but avoids other issues. 
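+ *
+ * (Editorial aside.) CNTP_TVAL is a relative count: the assignment below
+ * arms the compare at CVAL = CNTPCT + value, so with cntpct == 1000 a
+ * write of 500 fires the timer at count 1500, i.e. 500 ticks from now;
+ * reads return CVAL - CNTPCT, counting down toward the match.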
+ */ + ptimer_stop(s->timer); + } + s->cntval = s->cntpct + value; + ptimer_set_freq(s->timer, s->freq); + ptimer_set_period(s->timer, 1); + if (s->control & QCT_QTIMER_CNTP_CTL_ENABLE) { + ptimer_run(s->timer, 0); + } + ptimer_transaction_commit(s->timer); + break; + case QCT_QTIMER_CNTPL0ACR: + if (view) { + break; + } + + s->cntpl0acr = value; + break; + default: + qemu_log_mask(LOG_GUEST_ERROR, "%s: Bad offset %x\n", __func__, + (int)offset); + return MEMTX_ACCESS_ERROR; + } + hex_timer_update(s); + return MEMTX_OK; +} + +static void hex_timer_tick(void *opaque) +{ + QCTHextimerState *s = (QCTHextimerState *)opaque; + if ((s->cntpct >= s->cntval) && (s->int_level != 1)) { + s->int_level = 1; + hex_timer_update(s); + return; + } + s->cntpct += s->limit; +} + +static const MemoryRegionOps hex_timer_ops = { + .read_with_attrs = hex_timer_read, + .write_with_attrs = hex_timer_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static const VMStateDescription vmstate_hex_timer = { + .name = "hex_timer", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]){ VMSTATE_UINT32(control, QCTHextimerState), + VMSTATE_UINT32(cnt_ctrl, QCTHextimerState), + VMSTATE_UINT64(cntpct, QCTHextimerState), + VMSTATE_UINT64(cntval, QCTHextimerState), + VMSTATE_UINT64(limit, QCTHextimerState), + VMSTATE_UINT32(int_level, QCTHextimerState), + VMSTATE_PTIMER(timer, QCTHextimerState), + VMSTATE_END_OF_LIST() } +}; + +static void qct_qtimer_realize(DeviceState *dev, Error **errp) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + QCTQtimerState *s = QCT_QTIMER(dev); + unsigned int i; + + if (s->nr_frames > QCT_QTIMER_TIMER_FRAME_ELTS) { + error_setg(errp, "nr_frames too high"); + return; + } + + if (s->nr_views > QCT_QTIMER_TIMER_VIEW_ELTS) { + error_setg(errp, "nr_views too high"); + return; + } + + memory_region_init_io(&s->iomem, OBJECT(sbd), &qct_qtimer_ops, s, "qutimer", + QTIMER_MEM_SIZE_BYTES); + sysbus_init_mmio(sbd, &s->iomem); + + memory_region_init_io(&s->view_iomem, OBJECT(sbd), &hex_timer_ops, s, + "qutimer_views", + QTIMER_MEM_SIZE_BYTES * s->nr_frames * s->nr_views); + sysbus_init_mmio(sbd, &s->view_iomem); + + for (i = 0; i < s->nr_frames; i++) { + s->timer[i].limit = 1; + s->timer[i].control = QCT_QTIMER_CNTP_CTL_ENABLE; + s->timer[i].cnt_ctrl = + (QCT_QTIMER_AC_CNTACR_RWPT | QCT_QTIMER_AC_CNTACR_RWVT | + QCT_QTIMER_AC_CNTACR_RVOFF | QCT_QTIMER_AC_CNTACR_RFRQ | + QCT_QTIMER_AC_CNTACR_RPVCT | QCT_QTIMER_AC_CNTACR_RPCT); + s->timer[i].qtimer = s; + s->timer[i].freq = QTIMER_DEFAULT_FREQ_HZ; + + s->secure |= (1 << i); + + sysbus_init_irq(sbd, &(s->timer[i].irq)); + + (s->timer[i]).timer = + ptimer_init(hex_timer_tick, &s->timer[i], PTIMER_POLICY_LEGACY); + vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_hex_timer, + &s->timer[i]); + } +} + +static const Property qct_qtimer_properties[] = { + DEFINE_PROP_UINT32("freq", QCTQtimerState, freq, QTIMER_DEFAULT_FREQ_HZ), + DEFINE_PROP_UINT32("nr_frames", QCTQtimerState, nr_frames, 2), + DEFINE_PROP_UINT32("nr_views", QCTQtimerState, nr_views, 1), + DEFINE_PROP_UINT32("cnttid", QCTQtimerState, cnttid, 0x11), +}; + +static void qct_qtimer_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *k = DEVICE_CLASS(klass); + + device_class_set_props(k, qct_qtimer_properties); + k->realize = qct_qtimer_realize; + k->vmsd = &vmstate_qct_qtimer; +} + +static const TypeInfo qct_qtimer_info = { + .name = TYPE_QCT_QTIMER, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(QCTQtimerState), + .instance_init = qct_qtimer_init, + 
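/*
+ * (Editorial aside, illustrative only; addresses and IRQ wiring are
+ * hypothetical.) Board code would typically instantiate this device along
+ * the following lines:
+ *
+ *     DeviceState *dev = qdev_new(TYPE_QCT_QTIMER);
+ *     qdev_prop_set_uint32(dev, "nr_frames", 2);
+ *     qdev_prop_set_uint32(dev, "nr_views", 1);
+ *     sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
+ *     sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, csr_base);
+ *     sysbus_mmio_map(SYS_BUS_DEVICE(dev), 1, frame_base);
+ *     sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, l2vic_input_irq);
+ */
+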
.class_init = qct_qtimer_class_init, +}; + +static void qct_qtimer_register_types(void) +{ + type_register_static(&qct_qtimer_info); +} + +type_init(qct_qtimer_register_types) diff --git a/include/hw/hexagon/hexagon.h b/include/hw/hexagon/hexagon.h new file mode 100644 index 000000000000..ce356325fcd7 --- /dev/null +++ b/include/hw/hexagon/hexagon.h @@ -0,0 +1,150 @@ +/* + * Hexagon Baseboard System emulation. + * + * Copyright (c) 2020-2024 Qualcomm Innovation Center, Inc. All Rights Reserved. + * SPDX-License-Identifier: GPL-2.0-or-later + */ + + +#ifndef HW_HEXAGON_H +#define HW_HEXAGON_H + +#include "exec/memory.h" + +struct hexagon_board_boot_info { + uint64_t ram_size; + const char *kernel_filename; + uint32_t kernel_elf_flags; +}; + +typedef enum { + unknown_rev = 0, + v66_rev = 0xa666, + v67_rev = 0x2667, + v68_rev = 0x8d68, + v69_rev = 0x8c69, + v71_rev = 0x8c71, + v73_rev = 0x8c73, + v73m_rev = 0xcc73, +} Rev_t; +#define HEXAGON_LATEST_REV v73 +#define HEXAGON_LATEST_REV_UPPER V73 + +/* + * Config table address bases represent bits [35:16]. + */ +#define HEXAGON_CFG_ADDR_BASE(addr) (((addr) >> 16) & 0x0fffff) + +#define HEXAGON_CFGSPACE_ENTRIES (128) + +typedef union { + struct { + /* Base address of L2TCM space */ + uint32_t l2tcm_base; + uint32_t reserved0; + /* Base address of subsystem space */ + uint32_t subsystem_base; + /* Base address of ETM space */ + uint32_t etm_base; + /* Base address of L2 configuration space */ + uint32_t l2cfg_base; + uint32_t reserved1; + /* Base address of L1S */ + uint32_t l1s0_base; + /* Base address of AXI2 */ + uint32_t axi2_lowaddr; + /* Base address of streamer base */ + uint32_t streamer_base; + uint32_t reserved2; + /* Base address of fast L2VIC */ + uint32_t fastl2vic_base; + /* Number of entries in JTLB */ + uint32_t jtlb_size_entries; + /* Coprocessor type */ + uint32_t coproc_present; + /* Number of extension execution contexts available */ + uint32_t ext_contexts; + /* Base address of Hexagon Vector Tightly Coupled Memory (VTCM) */ + uint32_t vtcm_base; + /* Size of VTCM (in KB) */ + uint32_t vtcm_size_kb; + /* L2 tag size */ + uint32_t l2tag_size; + /* Amount of physical L2 memory in released version */ + uint32_t l2ecomem_size; + /* Hardware threads available on the core */ + uint32_t thread_enable_mask; + /* Base address of the ECC registers */ + uint32_t eccreg_base; + /* L2 line size */ + uint32_t l2line_size; + /* Small Core processor (also implies audio extension) */ + uint32_t tiny_core; + /* Size of L2TCM */ + uint32_t l2itcm_size; + /* Base address of L2-ITCM */ + uint32_t l2itcm_base; + uint32_t reserved3; + /* DTM is present */ + uint32_t dtm_present; + /* Version of the DMA */ + uint32_t dma_version; + /* Native HVX vector length in log of bytes */ + uint32_t hvx_vec_log_length; + /* Core ID of the multi-core */ + uint32_t core_id; + /* Number of multi-core cores */ + uint32_t core_count; + uint32_t coproc2_reg0; + uint32_t coproc2_reg1; + /* Supported HVX vector length */ + uint32_t v2x_mode; + uint32_t coproc2_reg2; + uint32_t coproc2_reg3; + uint32_t coproc2_reg4; + uint32_t coproc2_reg5; + uint32_t coproc2_reg6; + uint32_t coproc2_reg7; + /* Voltage droop mitigation technique parameter */ + uint32_t acd_preset; + /* Voltage droop mitigation technique parameter */ + uint32_t mnd_preset; + /* L1 data cache size (in KB) */ + uint32_t l1d_size_kb; + /* L1 instruction cache size in (KB) */ + uint32_t l1i_size_kb; + /* L1 data cache write policy: see HexagonL1WritePolicy */ + uint32_t l1d_write_policy; + /* VTCM bank width 
*/ + uint32_t vtcm_bank_width; + uint32_t reserved4; + uint32_t reserved5; + uint32_t reserved6; + uint32_t coproc2_cvt_mpy_size; + uint32_t consistency_domain; + uint32_t capacity_domain; + uint32_t axi3_lowaddr; + uint32_t coproc2_int8_subcolumns; + uint32_t corecfg_present; + uint32_t coproc2_fp16_acc_exp; + uint32_t AXIM2_secondary_base; + }; + uint32_t raw[HEXAGON_CFGSPACE_ENTRIES]; +} hexagon_config_table; + +typedef struct { + /* Base address of config table */ + uint32_t cfgbase; + /* Size of L2 TCM */ + uint32_t l2tcm_size; + /* Base address of L2VIC */ + uint32_t l2vic_base; + /* Size of L2VIC region */ + uint32_t l2vic_size; + /* QTimer csr base */ + uint32_t csr_base; + uint32_t qtmr_region; + hexagon_config_table cfgtable; +} hexagon_machine_config; + +#endif diff --git a/include/hw/hexagon/virt.h b/include/hw/hexagon/virt.h new file mode 100644 index 000000000000..0c165a786d30 --- /dev/null +++ b/include/hw/hexagon/virt.h @@ -0,0 +1,41 @@ +/* + * Definitions for hexagon virt board. + * + * Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_HEXAGONVIRT_H +#define HW_HEXAGONVIRT_H + +#include "hw/boards.h" +#include "target/hexagon/cpu.h" + +struct HexagonVirtMachineState { + /*< private >*/ + MachineState parent_obj; + + int fdt_size; + MemoryRegion *sys; + MemoryRegion cfgtable; + MemoryRegion ram; + MemoryRegion tcm; + MemoryRegion vtcm; + DeviceState *l2vic; +}; + +void hexagon_load_fdt(const struct HexagonVirtMachineState *vms); + +enum { + VIRT_UART0, + VIRT_QTMR0, + VIRT_QTMR1, + VIRT_GPT, + VIRT_MMIO, + VIRT_FDT, +}; + +#define TYPE_HEXAGON_VIRT_MACHINE MACHINE_TYPE_NAME("virt") +OBJECT_DECLARE_SIMPLE_TYPE(HexagonVirtMachineState, HEXAGON_VIRT_MACHINE) + +#endif /* HW_HEXAGONVIRT_H */ diff --git a/include/hw/intc/l2vic.h b/include/hw/intc/l2vic.h new file mode 100644 index 000000000000..ed8ccf33b1f8 --- /dev/null +++ b/include/hw/intc/l2vic.h @@ -0,0 +1,37 @@ +/* + * QEMU L2VIC Interrupt Controller + * + * Copyright(c) 2020-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#define L2VIC_VID_GRP_0 0x0 /* Read */ +#define L2VIC_VID_GRP_1 0x4 /* Read */ +#define L2VIC_VID_GRP_2 0x8 /* Read */ +#define L2VIC_VID_GRP_3 0xC /* Read */ +#define L2VIC_VID_0 0x10 /* Read SOFTWARE DEFINED */ +#define L2VIC_VID_1 0x14 /* Read SOFTWARE DEFINED NOT YET USED */ +#define L2VIC_INT_ENABLEn 0x100 /* Read/Write */ +#define L2VIC_INT_ENABLE_CLEARn 0x180 /* Write */ +#define L2VIC_INT_ENABLE_SETn 0x200 /* Write */ +#define L2VIC_INT_TYPEn 0x280 /* Read/Write */ +#define L2VIC_INT_STATUSn 0x380 /* Read */ +#define L2VIC_INT_CLEARn 0x400 /* Write */ +#define L2VIC_SOFT_INTn 0x480 /* Write */ +#define L2VIC_INT_PENDINGn 0x500 /* Read */ +#define L2VIC_INT_GRPn_0 0x600 /* Read/Write */ +#define L2VIC_INT_GRPn_1 0x680 /* Read/Write */ +#define L2VIC_INT_GRPn_2 0x700 /* Read/Write */ +#define L2VIC_INT_GRPn_3 0x780 /* Read/Write */ + +#define L2VIC_INTERRUPT_MAX 1024 +#define L2VIC_CIAD_INSTRUCTION -1 +/* + * Note about l2vic groups: + * Each interrupt to L2VIC can be configured to associate with one of + * four groups. 
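+ * (Editorial aside.) Following the per-group mapping listed below: an
+ * input configured for group 2 is latched into VID group register 2, read
+ * back at L2VIC_VID_GRP_2 (offset 0x8), and delivered on core IRQ4, so the
+ * handler's SSR cause code is 0xC2 + group = 0xC4.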
+ * Group 0 interrupts go to IRQ2 via VID 0 (SSR: 0xC2, the default) + * Group 1 interrupts go to IRQ3 via VID 1 (SSR: 0xC3) + * Group 2 interrupts go to IRQ4 via VID 2 (SSR: 0xC4) + * Group 3 interrupts go to IRQ5 via VID 3 (SSR: 0xC5) + */ diff --git a/include/hw/timer/qct-qtimer.h b/include/hw/timer/qct-qtimer.h new file mode 100644 index 000000000000..90f7981ccf8d --- /dev/null +++ b/include/hw/timer/qct-qtimer.h @@ -0,0 +1,85 @@ +/* + * Qualcomm QCT QTimer + * + * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#ifndef TIMER_QCT_QTIMER_H +#define TIMER_QCT_QTIMER_H + +#include "hw/ptimer.h" +#include "hw/sysbus.h" + +#define TYPE_QCT_QTIMER "qct-qtimer" +#define TYPE_QCT_HEXTIMER "qct-hextimer" +OBJECT_DECLARE_SIMPLE_TYPE(QCTQtimerState, QCT_QTIMER) +OBJECT_DECLARE_SIMPLE_TYPE(QCTHextimerState, QCT_HEXTIMER) + +struct QCTHextimerState { + QCTQtimerState *qtimer; + ptimer_state *timer; + uint64_t cntval; /* + * Physical timer compare value interrupt when cntpct > + * cntval + */ + uint64_t cntpct; /* Physical counter */ + uint32_t control; + uint32_t cnt_ctrl; + uint32_t cntpl0acr; + uint64_t limit; + uint32_t freq; + uint32_t int_level; + qemu_irq irq; +}; + +#define QCT_QTIMER_TIMER_FRAME_ELTS (8) +#define QCT_QTIMER_TIMER_VIEW_ELTS (2) +struct QCTQtimerState { + SysBusDevice parent_obj; + + MemoryRegion iomem; + MemoryRegion view_iomem; + uint32_t secure; + struct QCTHextimerState timer[QCT_QTIMER_TIMER_FRAME_ELTS]; + uint32_t frame_id; + uint32_t freq; + uint32_t nr_frames; + uint32_t nr_views; + uint32_t cnttid; +}; + +#define QCT_QTIMER_AC_CNTFRQ (0x000) +#define QCT_QTIMER_AC_CNTSR (0x004) +#define QCT_QTIMER_AC_CNTSR_NSN_1 (1 << 0) +#define QCT_QTIMER_AC_CNTSR_NSN_2 (1 << 1) +#define QCT_QTIMER_AC_CNTSR_NSN_3 (1 << 2) +#define QCT_QTIMER_AC_CNTTID (0x08) +#define QCT_QTIMER_AC_CNTACR_0 (0x40) +#define QCT_QTIMER_AC_CNTACR_1 (0x44) +#define QCT_QTIMER_AC_CNTACR_2 (0x48) +#define QCT_QTIMER_AC_CNTACR_RWPT (1 << 5) /* R/W of CNTP_* regs */ +#define QCT_QTIMER_AC_CNTACR_RWVT (1 << 4) /* R/W of CNTV_* regs */ +#define QCT_QTIMER_AC_CNTACR_RVOFF (1 << 3) /* R/W of CNTVOFF register */ +#define QCT_QTIMER_AC_CNTACR_RFRQ (1 << 2) /* R/W of CNTFRQ register */ +#define QCT_QTIMER_AC_CNTACR_RPVCT (1 << 1) /* R/W of CNTVCT register */ +#define QCT_QTIMER_AC_CNTACR_RPCT (1 << 0) /* R/W of CNTPCT register */ +#define QCT_QTIMER_VERSION (0x0fd0) +#define QCT_QTIMER_CNTPCT_LO (0x000) +#define QCT_QTIMER_CNTPCT_HI (0x004) +#define QCT_QTIMER_CNT_FREQ (0x010) +#define QCT_QTIMER_CNTPL0ACR (0x014) +#define QCT_QTIMER_CNTPL0ACR_PL0CTEN (1 << 9) +#define QCT_QTIMER_CNTPL0ACR_PL0TVEN (1 << 8) +#define QCT_QTIMER_CNTPL0ACR_PL0VCTEN (1 << 1) +#define QCT_QTIMER_CNTPL0ACR_PL0PCTEN (1 << 0) +#define QCT_QTIMER_CNTP_CVAL_LO (0x020) +#define QCT_QTIMER_CNTP_CVAL_HI (0x024) +#define QCT_QTIMER_CNTP_TVAL (0x028) +#define QCT_QTIMER_CNTP_CTL (0x02c) +#define QCT_QTIMER_CNTP_CTL_ISTAT (1 << 2) +#define QCT_QTIMER_CNTP_CTL_INTEN (1 << 1) +#define QCT_QTIMER_CNTP_CTL_ENABLE (1 << 0) +#define QCT_QTIMER_AC_CNTACR_START 0x40 +#define QCT_QTIMER_AC_CNTACR_END 0x5C + +#endif /* TIMER_QCT_QTIMER_H */ diff --git a/include/semihosting/common-semi.h b/include/semihosting/common-semi.h index 0a91db7c4149..58dfb99d7a5b 100644 --- a/include/semihosting/common-semi.h +++ b/include/semihosting/common-semi.h @@ -34,6 +34,7 @@ #ifndef COMMON_SEMI_H #define COMMON_SEMI_H +void common_semi_cb(CPUState *cs, uint64_t ret, int err); void 
do_common_semihosting(CPUState *cs); #endif /* COMMON_SEMI_H */ diff --git a/include/semihosting/semihost.h b/include/semihosting/semihost.h index 97d2a2ba996d..6e0776610651 100644 --- a/include/semihosting/semihost.h +++ b/include/semihosting/semihost.h @@ -51,6 +51,11 @@ static inline const char *semihosting_get_cmdline(void) { return NULL; }
+
+static inline const char *semihosting_get_usefs(void)
+{
+ return NULL;
+}
 #else /* !CONFIG_USER_ONLY */ /** * semihosting_enabled: @@ -63,6 +68,7 @@ SemihostingTarget semihosting_get_target(void); const char *semihosting_get_arg(int i); int semihosting_get_argc(void); const char *semihosting_get_cmdline(void); +const char *semihosting_get_usefs(void); void semihosting_arg_fallback(const char *file, const char *cmd); /* for vl.c hooks */ void qemu_semihosting_enable(void); diff --git a/include/semihosting/syscalls.h b/include/semihosting/syscalls.h index 6627c45fb281..dec2ee0ad4ac 100644 --- a/include/semihosting/syscalls.h +++ b/include/semihosting/syscalls.h @@ -75,4 +75,6 @@ void semihost_sys_gettimeofday(CPUState *cs, gdb_syscall_complete_cb complete, void semihost_sys_poll_one(CPUState *cs, gdb_syscall_complete_cb complete, int fd, GIOCondition cond, int timeout); +void semihost_sys_ftruncate(CPUState *cs, gdb_syscall_complete_cb complete, + int fd, off_t len); #endif /* SEMIHOSTING_SYSCALLS_H */ diff --git a/qapi/machine.json b/qapi/machine.json index a6b8795b09ed..a7070bad4d52 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -33,7 +33,7 @@ # Since: 3.0 ## { 'enum' : 'SysEmuTarget', - 'data' : [ 'aarch64', 'alpha', 'arm', 'avr', 'hppa', 'i386', + 'data' : [ 'aarch64', 'alpha', 'arm', 'avr', 'hexagon', 'hppa', 'i386', 'loongarch64', 'm68k', 'microblaze', 'microblazeel', 'mips', 'mips64', 'mips64el', 'mipsel', 'or1k', 'ppc', 'ppc64', 'riscv32', 'riscv64', 'rx', 's390x', 'sh4', diff --git a/qemu-options.hx b/qemu-options.hx index dc694a99a30a..888b3092bef7 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -5110,7 +5110,7 @@ ERST DEF("semihosting", 0, QEMU_OPTION_semihosting, "-semihosting semihosting mode\n", QEMU_ARCH_ARM | QEMU_ARCH_M68K | QEMU_ARCH_XTENSA | - QEMU_ARCH_MIPS | QEMU_ARCH_RISCV) + QEMU_ARCH_MIPS | QEMU_ARCH_RISCV | QEMU_ARCH_HEXAGON) SRST ``-semihosting`` Enable :ref:`Semihosting` mode (ARM, M68K, Xtensa, MIPS, RISC-V only). @@ -5126,11 +5126,11 @@ DEF("semihosting-config", HAS_ARG, QEMU_OPTION_semihosting_config, "-semihosting-config [enable=on|off][,target=native|gdb|auto][,chardev=id][,userspace=on|off][,arg=str[,...]]\n" \ " semihosting configuration\n", QEMU_ARCH_ARM | QEMU_ARCH_M68K | QEMU_ARCH_XTENSA | -QEMU_ARCH_MIPS | QEMU_ARCH_RISCV) +QEMU_ARCH_MIPS | QEMU_ARCH_RISCV | QEMU_ARCH_HEXAGON) SRST -``-semihosting-config [enable=on|off][,target=native|gdb|auto][,chardev=id][,userspace=on|off][,arg=str[,...]]`` - Enable and configure :ref:`Semihosting` (ARM, M68K, Xtensa, MIPS, RISC-V - only). +``-semihosting-config [enable=on|off][,target=native|gdb|auto][,chardev=id][,userspace=on|off][,usefs=path][,arg=str[,...]]`` + Enable and configure :ref:`Semihosting` (ARM, M68K, Xtensa, MIPS, RISC-V,
+ Hexagon only).
 .. warning:: Note that this allows guest direct access to the host filesystem, so @@ -5152,6 +5152,11 @@ SRST only be used if all guest code is trusted (for example, in bare-metal test case code).
+ ``usefs=path``
+ Sets a fallback directory for the ``open`` semihosting call. If the
+ requested file is not found, QEMU retries the open with the given
+ directory prepended. 
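+
+ For example (paths are hypothetical), with
+ ``-semihosting-config enable=on,usefs=/srv/hexagon-root`` a guest
+ ``open("lib/cal.bin", ...)`` that fails with ``ENOENT`` is retried as
+ ``/srv/hexagon-root/lib/cal.bin``.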
+ ``arg=str1,arg=str2,...`` Allows the user to pass input arguments, and can be used multiple times to build up a list. The old-style diff --git a/semihosting/arm-compat-semi.c b/semihosting/arm-compat-semi.c index 86e5260e504b..e4825a866718 100644 --- a/semihosting/arm-compat-semi.c +++ b/semihosting/arm-compat-semi.c @@ -85,7 +85,30 @@ #define O_BINARY 0 #endif -static int gdb_open_modeflags[12] = { +struct semihosting_opt_callbacks { + void (*set_err)(CPUState *cs, target_ulong err); + void (*prepare_for_read)(CPUState *cs, target_ulong fd, target_ulong buf, + target_ulong len); +} opt_callbacks; + +#define SEMIHOSTING_REGISTER_OPT_CALLBACKS(callbacks) \ + struct semihosting_opt_callbacks opt_callbacks = callbacks; + +#define CALL_OPT_CALLBACK(FN, ARGS...) do { \ + if (opt_callbacks.FN) { \ + opt_callbacks.FN(ARGS); \ + } \ +} while (0) + +#include "common-semi-target.h" + +#ifdef SEMIHOSTING_EXT_OPEN_MODES +#define GDB_OPEN_MODES_NR 14 +#else +#define GDB_OPEN_MODES_NR 12 +#endif + +static int gdb_open_modeflags[GDB_OPEN_MODES_NR] = { GDB_O_RDONLY, GDB_O_RDONLY, GDB_O_RDWR, @@ -98,6 +121,10 @@ static int gdb_open_modeflags[12] = { GDB_O_WRONLY | GDB_O_CREAT | GDB_O_APPEND, GDB_O_RDWR | GDB_O_CREAT | GDB_O_APPEND, GDB_O_RDWR | GDB_O_CREAT | GDB_O_APPEND, +#ifdef SEMIHOSTING_EXT_OPEN_MODES + GDB_O_RDWR | GDB_O_CREAT, + GDB_O_RDWR | GDB_O_CREAT | GDB_O_EXCL, +#endif }; #ifndef CONFIG_USER_ONLY @@ -180,17 +207,10 @@ static LayoutInfo common_semi_find_bases(CPUState *cs) * error indication (0 on success, non-0 for error) which the caller * should check. */ - -#define GET_ARG(n) do { \ - if (is_64bit_semihosting(env)) { \ - if (get_user_u64(arg ## n, args + (n) * 8)) { \ - goto do_fault; \ - } \ - } else { \ - if (get_user_u32(arg ## n, args + (n) * 4)) { \ - goto do_fault; \ - } \ - } \ +#define GET_ARG(n) do { \ + if (common_semi_read_arg_word(env, &arg ## n, args, n)) { \ + goto do_fault; \ + } \ } while (0) #define SET_ARG(n, val) \ @@ -223,7 +243,7 @@ static inline uint32_t get_swi_errno(CPUState *cs) #endif } -static void common_semi_cb(CPUState *cs, uint64_t ret, int err) +void common_semi_cb(CPUState *cs, uint64_t ret, int err) { if (err) { #ifdef CONFIG_USER_ONLY @@ -231,6 +251,7 @@ static void common_semi_cb(CPUState *cs, uint64_t ret, int err) ts->swi_errno = err; #else syscall_err = err; + CALL_OPT_CALLBACK(set_err, cs, err); #endif } common_semi_set_ret(cs, ret); @@ -386,7 +407,7 @@ void do_common_semihosting(CPUState *cs) if (!s) { goto do_fault; } - if (arg1 >= 12) { + if (arg1 >= GDB_OPEN_MODES_NR) { unlock_user(s, arg0, 0); common_semi_cb(cs, -1, EINVAL); break; @@ -466,6 +487,7 @@ void do_common_semihosting(CPUState *cs) GET_ARG(0); GET_ARG(1); GET_ARG(2); + CALL_OPT_CALLBACK(prepare_for_read, cs, arg0, arg1, arg2); semihost_sys_read(cs, common_semi_rw_cb, arg0, arg1, arg2); break; diff --git a/semihosting/config.c b/semihosting/config.c index 56283b5c3c38..a64a8dfd27da 100644 --- a/semihosting/config.c +++ b/semihosting/config.c @@ -46,6 +46,9 @@ QemuOptsList qemu_semihosting_config_opts = { }, { .name = "arg", .type = QEMU_OPT_STRING, + }, { + .name = "usefs", + .type = QEMU_OPT_STRING, }, { /* end of list */ } }, @@ -58,6 +61,7 @@ typedef struct SemihostingConfig { char **argv; int argc; const char *cmdline; /* concatenated argv */ + const char *usefs; } SemihostingConfig; static SemihostingConfig semihosting; @@ -94,6 +98,11 @@ const char *semihosting_get_cmdline(void) return semihosting.cmdline; } +const char *semihosting_get_usefs(void) +{ + return semihosting.usefs; 
+} + static int add_semihosting_arg(void *opaque, const char *name, const char *val, Error **errp) @@ -144,6 +153,8 @@ int qemu_semihosting_config_options(const char *optstr) true); semihosting.userspace_enabled = qemu_opt_get_bool(opts, "userspace", false); + semihosting.usefs = qemu_opt_get(opts, "usefs"); + const char *target = qemu_opt_get(opts, "target"); /* setup of chardev is deferred until they are initialised */ semihost_chardev = qemu_opt_get(opts, "chardev"); diff --git a/semihosting/guestfd.c b/semihosting/guestfd.c index d3241434c516..4d846f4e5d10 100644 --- a/semihosting/guestfd.c +++ b/semihosting/guestfd.c @@ -23,6 +23,18 @@ GuestFD console_in_gf; GuestFD console_out_gf; #endif +static void semihosting_use_stdio(void) +{ + console_in_gf.type = GuestFDHost; + console_in_gf.hostfd = 0; + console_out_gf.type = GuestFDHost; + console_out_gf.hostfd = 1; + guestfd_array = g_array_set_size(guestfd_array, 3); + associate_guestfd(0, 0); + associate_guestfd(1, 1); + associate_guestfd(2, 2); +} + void qemu_semihosting_guestfd_init(void) { /* New entries zero-initialized, i.e. type GuestFDUnused */ @@ -36,8 +48,12 @@ void qemu_semihosting_guestfd_init(void) console_out_gf.type = GuestFDGDB; console_out_gf.hostfd = 2; } else { +#ifdef CONFIG_SEMIHOSTING_USE_STDIO + semihosting_use_stdio(); +#else console_in_gf.type = GuestFDConsole; console_out_gf.type = GuestFDConsole; +#endif } #else /* Otherwise, the stdio file descriptors apply. */ diff --git a/semihosting/syscalls.c b/semihosting/syscalls.c index f6451d9bb0e6..e790c79efe85 100644 --- a/semihosting/syscalls.c +++ b/semihosting/syscalls.c @@ -13,6 +13,7 @@ #include "semihosting/guestfd.h" #include "semihosting/syscalls.h" #include "semihosting/console.h" +#include "semihosting/semihost.h" #ifdef CONFIG_USER_ONLY #include "qemu.h" #else @@ -261,7 +262,8 @@ static void host_open(CPUState *cs, gdb_syscall_complete_cb complete, { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); char *p; - int ret, host_flags = O_BINARY; + int ret, err, host_flags = O_BINARY; + const char *usefs = semihosting_get_usefs(); ret = validate_lock_user_string(&p, cs, fname, fname_len); if (ret < 0) { @@ -287,9 +289,17 @@ static void host_open(CPUState *cs, gdb_syscall_complete_cb complete, } ret = open(p, host_flags, mode); + err = errno; + if (ret < 0 && err == ENOENT && usefs) { + g_autoptr(GString) usefs_fname = g_string_new(NULL); + g_string_append_printf(usefs_fname, "%s/%s", usefs, p); + ret = open(usefs_fname->str, host_flags, mode); + err = errno; + } + if (ret < 0) { qemu_log_mask(LOG_GUEST_ERROR, "%s: failed to open %s\n", __func__, p); - complete(cs, -1, errno); + complete(cs, -1, err); } else { int guestfd = alloc_guestfd(); associate_guestfd(guestfd, ret); @@ -542,6 +552,13 @@ static void host_poll_one(CPUState *cs, gdb_syscall_complete_cb complete, } #endif +static void host_ftruncate(CPUState *cs, gdb_syscall_complete_cb complete, + GuestFD *gf, off_t len) +{ + int err = ftruncate(gf->hostfd, len); + complete(cs, err, err < 0 ? errno : 0); +} + /* * Static file semihosting syscall implementations. 
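+ *
+ * (Editorial aside, illustrative only.) A target's semihosting handler
+ * would typically dispatch to the ftruncate helper above with the common
+ * completion callback, e.g.:
+ *
+ *     semihost_sys_ftruncate(cs, common_semi_cb, fd, len);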
*/ @@ -983,3 +1000,22 @@ void semihost_sys_poll_one(CPUState *cs, gdb_syscall_complete_cb complete, } } #endif
+
+void semihost_sys_ftruncate(CPUState *cs, gdb_syscall_complete_cb complete,
+ int fd, off_t len)
+{
+ GuestFD *gf = get_guestfd(fd);
+ if (!gf) {
+ complete(cs, -1, EBADF);
+ return;
+ }
+
+ switch (gf->type) {
+ case GuestFDHost:
+ host_ftruncate(cs, complete, gf, len);
+ break;
+ default:
+ fprintf(stderr,
+ "ftruncate call not implemented for this semihosting mode.\n");
+ g_assert_not_reached();
+ }
+} diff --git a/target/Kconfig b/target/Kconfig index d0c7b59d9c71..37781146b9bb 100644 --- a/target/Kconfig +++ b/target/Kconfig @@ -16,6 +16,7 @@ source sh4/Kconfig source sparc/Kconfig source tricore/Kconfig source xtensa/Kconfig +source hexagon/Kconfig config TARGET_BIG_ENDIAN bool diff --git a/target/arm/common-semi-target.h b/target/arm/common-semi-target.h index da51f2d7f540..69429a45c652 100644 --- a/target/arm/common-semi-target.h +++ b/target/arm/common-semi-target.h @@ -12,6 +12,17 @@ #include "target/arm/cpu-qom.h"
+static inline bool common_semi_read_arg_word(CPUArchState *env,
+ target_ulong *save_to,
+ target_ulong args_addr,
+ int arg_num)
+{
+ if (is_64bit_semihosting(env)) {
+ return get_user_u64(*save_to, args_addr + (arg_num) * 8);
+ }
+ return get_user_u32(*save_to, args_addr + (arg_num) * 4);
+}
+ static inline target_ulong common_semi_arg(CPUState *cs, int argno) { ARMCPU *cpu = ARM_CPU(cs); diff --git a/target/hexagon/Kconfig b/target/hexagon/Kconfig new file mode 100644 index 000000000000..7e556f350633 --- /dev/null +++ b/target/hexagon/Kconfig @@ -0,0 +1,2 @@ +config HEXAGON + bool diff --git a/target/hexagon/arch.c b/target/hexagon/arch.c index d053d6848715..87c2f6a53f6c 100644 --- a/target/hexagon/arch.c +++ b/target/hexagon/arch.c @@ -208,6 +208,11 @@ void arch_fpop_start(CPUHexagonState *env) * model it in qemu user mode. */ #define RAISE_FP_EXCEPTION do {} while (0) +#else + /* + * To be implemented. + */ +#define RAISE_FP_EXCEPTION do { g_assert_not_reached(); } while (0) #endif #define SOFTFLOAT_TEST_FLAG(FLAG, MYF, MYE) \ diff --git a/target/hexagon/attribs_def.h.inc b/target/hexagon/attribs_def.h.inc index 9e3a05f88281..e6523a739b10 100644 --- a/target/hexagon/attribs_def.h.inc +++ b/target/hexagon/attribs_def.h.inc @@ -19,20 +19,41 @@ DEF_ATTRIB(AA_DUMMY, "Dummy Zeroth Attribute", "", "") /* Misc */ +DEF_ATTRIB(FAKEINSN, "Not a real instruction", "", "") +DEF_ATTRIB(MAPPING, "Not real -- asm mapped", "", "") +DEF_ATTRIB(CONDMAPPING, "Not real -- mapped based on values", "", "") DEF_ATTRIB(EXTENSION, "Extension instruction", "", "") +DEF_ATTRIB(SHARED_EXTENSION, "Shared extension instruction", "", "") +DEF_ATTRIB(CABAC,
+ "Cabac Instruction. 
Used in conjunction with QDSP6_CABAC_PRESENT", "",
+ "")
+DEF_ATTRIB(EXPERIMENTAL, "This may not work correctly; not supported by RTL.",
+ "", "")
 DEF_ATTRIB(PRIV, "Not available in user or guest mode", "", "") DEF_ATTRIB(GUEST, "Not available in user mode", "", "") DEF_ATTRIB(FPOP, "Floating Point Operation", "", "")
+DEF_ATTRIB(FPDOUBLE, "Double-precision Floating Point Operation", "", "")
+DEF_ATTRIB(FPSINGLE, "Single-precision Floating Point Operation", "", "")
+DEF_ATTRIB(SFMAKE, "Single Float Make", "", "")
+DEF_ATTRIB(DFMAKE, "Double Float Make", "", "")
+
+DEF_ATTRIB(NO_TIMING_LOG, "Does not get logged to the timing model", "", "")
 DEF_ATTRIB(EXTENDABLE, "Immediate may be extended", "", "")
+DEF_ATTRIB(EXT_UPPER_IMMED, "Extend upper case immediate", "", "")
+DEF_ATTRIB(EXT_LOWER_IMMED, "Extend lower case immediate", "", "")
+DEF_ATTRIB(MUST_EXTEND, "Immediate must be extended", "", "")
+DEF_ATTRIB(NA_NT, "Non-Allocating Non-Temporal instruction", "", "")
+DEF_ATTRIB(INVPRED, "The predicate is inverted for true/false sense", "", "")
 DEF_ATTRIB(ARCHV2, "V2 architecture", "", "") DEF_ATTRIB(ARCHV3, "V3 architecture", "", "") DEF_ATTRIB(ARCHV4, "V4 architecture", "", "") DEF_ATTRIB(ARCHV5, "V5 architecture", "", "")
+DEF_ATTRIB(PACKED, "Packable instruction", "", "")
 DEF_ATTRIB(SUBINSN, "sub-instruction", "", "") /* Load and Store attributes */ @@ -46,21 +67,48 @@ DEF_ATTRIB(MEMSIZE_4B, "Memory width is 4 bytes", "", "") DEF_ATTRIB(MEMSIZE_8B, "Memory width is 8 bytes", "", "") DEF_ATTRIB(SCALAR_LOAD, "Load is scalar", "", "") DEF_ATTRIB(SCALAR_STORE, "Store is scalar", "", "") -DEF_ATTRIB(REGWRSIZE_1B, "Memory width is 1 byte", "", "") -DEF_ATTRIB(REGWRSIZE_2B, "Memory width is 2 bytes", "", "") -DEF_ATTRIB(REGWRSIZE_4B, "Memory width is 4 bytes", "", "") -DEF_ATTRIB(REGWRSIZE_8B, "Memory width is 8 bytes", "", "") +DEF_ATTRIB(REGWRSIZE_1B, "ETM Memory width is 1 byte", "", "") +DEF_ATTRIB(REGWRSIZE_2B, "ETM Memory width is 2 bytes", "", "") +DEF_ATTRIB(REGWRSIZE_4B, "ETM Memory width is 4 bytes", "", "") +DEF_ATTRIB(REGWRSIZE_8B, "ETM Memory width is 8 bytes", "", "") DEF_ATTRIB(MEMLIKE, "Memory-like instruction", "", "") DEF_ATTRIB(MEMLIKE_PACKET_RULES, "follows Memory-like packet rules", "", "")
+DEF_ATTRIB(CACHEOP, "Cache operation", "", "")
+DEF_ATTRIB(COPBYADDRESS, "Cache operation by address", "", "")
+DEF_ATTRIB(COPBYIDX, "Cache operation by index", "", "")
 DEF_ATTRIB(RELEASE, "Releases a lock", "", "") DEF_ATTRIB(ACQUIRE, "Acquires a lock", "", "")
+DEF_ATTRIB(LLSC, "load-locked/store-conditional instruction", "", "")
 DEF_ATTRIB(RLS_INNER, "Store release inner visibility", "", "")
+DEF_ATTRIB(RLS_OUTER, "Store release outer visibility", "", "")
 DEF_ATTRIB(RLS_ALL_THREAD, "Store release among all threads", "", "") DEF_ATTRIB(RLS_SAME_THREAD, "Store release with the same thread", "", "")
+/* Load and Store Addressing Mode Attributes */
+DEF_ATTRIB(EA_REG_ONLY, "EA = input register only", "", "")
+DEF_ATTRIB(EA_IMM_ONLY, "EA = immediate only", "", "")
+DEF_ATTRIB(EA_REG_PLUS_IMM, "EA = register plus immediate", "", "")
+DEF_ATTRIB(EA_REG_PLUS_REGSCALED, "EA = register plus scaled register", "", "")
+DEF_ATTRIB(EA_IMM_PLUS_REGSCALED, "EA = immediate plus scaled register", "", "")
+DEF_ATTRIB(EA_BREV_REG, "EA = bit-reversed input register", "", "")
+DEF_ATTRIB(EA_GP_IMM, "EA = GP plus immediate (unless extended)", "", "")
+DEF_ATTRIB(EA_PAGECROSS, "EA calculation can have a Page Cross Stall", "", "")
+
+DEF_ATTRIB(PM_ANY, "Post Modify", "", "")
+DEF_ATTRIB(PM_I, "Post Modify by 
Immediate", "", "") +DEF_ATTRIB(PM_M, "Post Modify by M register", "", "") +DEF_ATTRIB(PM_CIRI, "Post Modify with Circular Addressing by immediate", "", "") +DEF_ATTRIB(PM_CIRR, "Post Modify with Circular Addressing by I field", "", "") + +DEF_ATTRIB(VMEM, "VMEM-type", "", "") +DEF_ATTRIB(VBUF, "Touches the VBUF", "", "") +DEF_ATTRIB(VDBG, "Vector debugging instruction", "", "") + /* V6 Vector attributes */ DEF_ATTRIB(CVI, "Executes on the HVX extension", "", "") +DEF_ATTRIB(NT_VMEM, "Non-temporal memory access", "", "") +DEF_ATTRIB(VMEMU, "Unaligned memory access", "", "") DEF_ATTRIB(CVI_NEW, "New value memory instruction executes on HVX", "", "") DEF_ATTRIB(CVI_VM, "Memory instruction executes on HVX", "", "") @@ -69,109 +117,415 @@ DEF_ATTRIB(CVI_VP_VS, "Double vector permute/shft insn executes on HVX", "", "") DEF_ATTRIB(CVI_VX, "Multiply instruction executes on HVX", "", "") DEF_ATTRIB(CVI_VX_DV, "Double vector multiply insn executes on HVX", "", "") DEF_ATTRIB(CVI_VS, "Shift instruction executes on HVX", "", "") -DEF_ATTRIB(CVI_VS_3SRC, "This shift needs to borrow a source register", "", "") +DEF_ATTRIB( + CVI_VS_3SRC, + "This shift instruction needs to borrow a source register from the VP slot", + "", "") DEF_ATTRIB(CVI_VS_VX, "Permute/shift and multiply insn executes on HVX", "", "") DEF_ATTRIB(CVI_VA, "ALU instruction executes on HVX", "", "") +DEF_ATTRIB(CVI_VA_2SRC, + "This alu instruction executes on multimedia vector engine and " + "requires two vectro sources", + "", "") DEF_ATTRIB(CVI_VA_DV, "Double vector alu instruction executes on HVX", "", "") DEF_ATTRIB(CVI_4SLOT, "Consumes all the vector execution resources", "", "") DEF_ATTRIB(CVI_TMP, "Transient Memory Load not written to register", "", "") DEF_ATTRIB(CVI_REMAP, "Register Renaming not written to register file", "", "") +DEF_ATTRIB(CVI_TMP_SRC, "Transient reassign", "", "") +DEF_ATTRIB(CVI_EXTRACT, "HVX Extract Instruction that goes through L2", "", "") +DEF_ATTRIB(CVI_EARLY, "HVX instructions that require early sources", "", "") +DEF_ATTRIB(CVI_LATE, "HVX insn that always require late sources", "", "") +DEF_ATTRIB(CVI_VV_LATE, "HVX insn that always require late Vv source", "", "") +DEF_ATTRIB(CVI_REQUIRES_TMPLOAD, ".tmp load must be included in packet", "", "") +DEF_ATTRIB(CVI_PUMP_2X, "Goes through the pipeline twice", "", "") +DEF_ATTRIB(CVI_PUMP_4X, "Goes through the pipeline four times", "", "") DEF_ATTRIB(CVI_GATHER, "CVI Gather operation", "", "") DEF_ATTRIB(CVI_SCATTER, "CVI Scatter operation", "", "") DEF_ATTRIB(CVI_SCATTER_RELEASE, "CVI Store Release for scatter", "", "") +DEF_ATTRIB(CVI_GATHER_RELEASE, "CVI Store Release for gather", "", "") DEF_ATTRIB(CVI_TMP_DST, "CVI instruction that doesn't write a register", "", "") +DEF_ATTRIB(CVI_SCATTER_WORD_ACC, "CVI Scatter Word Accum (second pass)", "", "") +DEF_ATTRIB(CVI_SCATTER_ACC, "CVI Scatter Accumulate", "", "") +DEF_ATTRIB(CVI_VX_VSRC0_IS_DST, + "For the assembler to handle the special case of non-linear " + "instructions with Vxx specified both as src and dst in syntax ", + "", "") + +DEF_ATTRIB(CVI_VX_ACC_FWD, "VX Accumulator Forwarding", "", "") + +DEF_ATTRIB(CVI_VX_NO_TMP_LD, + "VX Accumulator renaming not allowed from tmp load instruction", "", + "") + +DEF_ATTRIB(RESTRICT_CVI_NOVP, + "Instructions with this attribute are assigned to the original " + "shift unit and can not be assigned to the shift/permute unit", + "", "") + +DEF_ATTRIB(CVI_GATHER_ADDR_2B, "CVI Scatter/Gather address is halfword", "", "") +DEF_ATTRIB(CVI_GATHER_ADDR_4B, "CVI 
Scatter/Gather address is word", "", "")
+
+DEF_ATTRIB(VFETCH, "memory fetch op to L2 for a single vector", "", "")
+
 DEF_ATTRIB(CVI_SLOT23, "Can execute in slot 2 or slot 3 (HVX)", "", "") -DEF_ATTRIB(VTCM_ALLBANK_ACCESS, "Allocates in all VTCM schedulers.", "", "") +DEF_ATTRIB(HVX_FLT, "This is a floating point HVX instruction.", "", "")
+
+DEF_ATTRIB(
+ VTCM_ALLBANK_ACCESS,
+ "This instruction allocates in all VTCM schedulers due to a region access.",
+ "", "")
+DEF_ATTRIB(XUMINOR, "XU minor SMTable instruction", "", "")
+
+DEF_ATTRIB(SYNC_MARKER, "This instruction needs a sync marker.", "", "")
+
 /* Change-of-flow attributes */ DEF_ATTRIB(JUMP, "Jump-type instruction", "", "")
+DEF_ATTRIB(DIRECT, "Uses a PC-relative immediate field", "", "")
 DEF_ATTRIB(INDIRECT, "Absolute register jump", "", "")
+DEF_ATTRIB(CJUMP, "Conditional jump", "", "")
 DEF_ATTRIB(CALL, "Function call instruction", "", "")
+DEF_ATTRIB(RET, "Function return instruction", "", "")
+DEF_ATTRIB(PERM, "Permute instruction", "", "")
 DEF_ATTRIB(COF, "Change-of-flow instruction", "", "") DEF_ATTRIB(HINTED_COF, "This instruction is a hinted change-of-flow", "", "") DEF_ATTRIB(CONDEXEC, "May be cancelled by a predicate", "", "")
+DEF_ATTRIB(DOTOLD, "Uses a predicate generated in a previous packet", "", "")
+DEF_ATTRIB(DOTNEW, "Uses a predicate generated in the same packet", "", "")
 DEF_ATTRIB(DOTNEWVALUE, "Uses a register value generated in this pkt", "", "") DEF_ATTRIB(NEWCMPJUMP, "Compound compare and jump", "", "") DEF_ATTRIB(NVSTORE, "New-value store", "", "") DEF_ATTRIB(MEMOP, "memop", "", "") -DEF_ATTRIB(ROPS_2, "Compound instruction worth 2 RISC-ops", "", "") -DEF_ATTRIB(ROPS_3, "Compound instruction worth 3 RISC-ops", "", "") +DEF_ATTRIB(ROPS_2, "Compound instruction worth 2 wimpy RISC-ops", "", "") +DEF_ATTRIB(ROPS_3, "Compound instruction worth 3 wimpy RISC-ops", "", "")
+
 /* access to implicit registers */ DEF_ATTRIB(IMPLICIT_WRITES_LR, "Writes the link register", "", "UREG.LR")
+DEF_ATTRIB(IMPLICIT_READS_LR, "Reads the link register", "UREG.LR", "")
+DEF_ATTRIB(IMPLICIT_READS_LC0, "Reads loop count for loop 0", "UREG.LC0", "")
+DEF_ATTRIB(IMPLICIT_READS_LC1, "Reads loop count for loop 1", "UREG.LC1", "")
+DEF_ATTRIB(IMPLICIT_READS_SA0, "Reads start address for loop 0", "UREG.SA0", "")
+DEF_ATTRIB(IMPLICIT_READS_SA1, "Reads start address for loop 1", "UREG.SA1", "")
+DEF_ATTRIB(IMPLICIT_WRITES_PC, "Writes the program counter", "", "UREG.PC")
+DEF_ATTRIB(IMPLICIT_READS_PC, "Reads the program counter", "UREG.PC", "")
 DEF_ATTRIB(IMPLICIT_WRITES_SP, "Writes the stack pointer", "", "UREG.SP")
+DEF_ATTRIB(IMPLICIT_READS_SP, "Reads the stack pointer", "UREG.SP", "")
 DEF_ATTRIB(IMPLICIT_WRITES_FP, "Writes the frame pointer", "", "UREG.FP")
+DEF_ATTRIB(IMPLICIT_READS_FP, "Reads the frame pointer", "UREG.FP", "")
+DEF_ATTRIB(IMPLICIT_WRITES_GP, "Writes the GP register", "", "UREG.GP")
+DEF_ATTRIB(IMPLICIT_READS_GP, "Reads the GP register", "UREG.GP", "")
 DEF_ATTRIB(IMPLICIT_WRITES_LC0, "Writes loop count for loop 0", "", "UREG.LC0") DEF_ATTRIB(IMPLICIT_WRITES_LC1, "Writes loop count for loop 1", "", "UREG.LC1") DEF_ATTRIB(IMPLICIT_WRITES_SA0, "Writes start addr for loop 0", "", "UREG.SA0") DEF_ATTRIB(IMPLICIT_WRITES_SA1, "Writes start addr for loop 1", "", "UREG.SA1")
+DEF_ATTRIB(IMPLICIT_WRITES_R00, "Writes Register 0", "", "UREG.R00")
 DEF_ATTRIB(IMPLICIT_WRITES_P0, "Writes Predicate 0", "", "UREG.P0") DEF_ATTRIB(IMPLICIT_WRITES_P1, "Writes Predicate 1", "", "UREG.P1") DEF_ATTRIB(IMPLICIT_WRITES_P2, "Writes Predicate 1", "", 
"UREG.P2") DEF_ATTRIB(IMPLICIT_WRITES_P3, "May write Predicate 3", "", "UREG.P3") -DEF_ATTRIB(IMPLICIT_READS_PC, "Reads the PC register", "", "") -DEF_ATTRIB(IMPLICIT_READS_P0, "Reads the P0 register", "", "") -DEF_ATTRIB(IMPLICIT_READS_P1, "Reads the P1 register", "", "") -DEF_ATTRIB(IMPLICIT_READS_P2, "Reads the P2 register", "", "") -DEF_ATTRIB(IMPLICIT_READS_P3, "Reads the P3 register", "", "") +DEF_ATTRIB(IMPLICIT_READS_R00, "Reads Register 0", "UREG.R00", "") +DEF_ATTRIB(IMPLICIT_READS_P0, "Reads Predicate 0", "UREG.P0", "") +DEF_ATTRIB(IMPLICIT_READS_P1, "Reads Predicate 1", "UREG.P1", "") +DEF_ATTRIB(IMPLICIT_READS_P3, "Reads Predicate 3", "UREG.P3", "") +DEF_ATTRIB(IMPLICIT_READS_Q3, "Reads Vector Predicate 3", "UREG.Q3", "") +DEF_ATTRIB(IMPLICIT_READS_CS, "Reads the CS/M register", "UREG.CS", "") +DEF_ATTRIB(IMPLICIT_READS_FRAMEKEY, "Reads FRAMEKEY", "UREG.FRAMEKEY", "") +DEF_ATTRIB(IMPLICIT_READS_FRAMELIMIT, "Reads FRAMELIMIT", "UREG.FRAMELIMIT", "") +DEF_ATTRIB(IMPLICIT_READS_ELR, "Reads the ELR register", "MREG.ELR", "") +DEF_ATTRIB(IMPLICIT_READS_SGP0, "Reads the SGP0 register", "MREG.SGP0", "") +DEF_ATTRIB(IMPLICIT_READS_SGP1, "Reads the SGP1 register", "MREG.SGP1", "") +DEF_ATTRIB(IMPLICIT_WRITES_SGP0, "Reads the SGP0 register", "", "MREG.SGP0") +DEF_ATTRIB(IMPLICIT_WRITES_SGP1, "Reads the SGP1 register", "", "MREG.SGP1") +DEF_ATTRIB(IMPLICIT_WRITES_STID_PRIO_ANYTHREAD, "Reads", "", "MREG.STID.PRIO") +DEF_ATTRIB(IMPLICIT_WRITES_SRBIT, "Writes the OVF bit", "", "UREG.SR.OVF") +DEF_ATTRIB(IMPLICIT_WRITES_FPFLAGS, "May write FP flags", "", "UREG.SR.FPFLAGS") +DEF_ATTRIB(IMPLICIT_WRITES_LPCFG, "Writes the loop config", "", "UREG.SR.LPCFG") +DEF_ATTRIB(IMPLICIT_WRITES_CVBITS, "Writes the CV flags", "", "UREG.SR.CV") +DEF_ATTRIB(IMPLICIT_READS_FPRND, "May read FP rnd mode", "UREG.SR.FPRND", "") +DEF_ATTRIB(IMPLICIT_READS_SSR, "May read SSR values", "MREG.SSR", "") +DEF_ATTRIB(IMPLICIT_READS_CCR, "May read CCR values", "MREG.CCR", "") +DEF_ATTRIB(IMPLICIT_WRITES_CCR, "May write CCR values", "", "MREG.CCR") +DEF_ATTRIB(IMPLICIT_WRITES_SSR, "May write SSR values", "", "MREG.SSR") +DEF_ATTRIB(IMPLICIT_READS_GELR, "May read GELR values", "GREG.GELR", "") +DEF_ATTRIB(IMPLICIT_READS_GEVB, "May read GEVB values", "MREG.GEVB", "") +DEF_ATTRIB(IMPLICIT_READS_GSR, "May read GSR values", "GREG.GSR", "") +DEF_ATTRIB(IMPLICIT_READS_GOSP, "May read GOSP values", "GREG.GOSP", "") +DEF_ATTRIB(IMPLICIT_WRITES_GELR, "May write GELR values", "", "GREG.GELR") +DEF_ATTRIB(IMPLICIT_WRITES_GSR, "May write GSR values", "", "GREG.GSR") +DEF_ATTRIB(IMPLICIT_WRITES_GOSP, "May write GOSP values", "", "GREG.GOSP") +DEF_ATTRIB(IMPLICIT_READS_IPENDAD_IPEND, "May read", "MREG.IPENDAD.IPEND", "") +DEF_ATTRIB(IMPLICIT_WRITES_IPENDAD_IPEND, "May write", "", "MREG.IPENDAD.IPEND") +DEF_ATTRIB(IMPLICIT_READS_IPENDAD_IAD, "May read", "MREG.IPENDAD.IAD", "") +DEF_ATTRIB(IMPLICIT_WRITES_IPENDAD_IAD, "May write", "", "MREG.IPENDAD.IAD") +DEF_ATTRIB(IMPLICIT_WRITES_IMASK_ANYTHREAD, "May write", "", "MREG.IMASK") +DEF_ATTRIB(IMPLICIT_READS_IMASK_ANYTHREAD, "May read", "MREG.IMASK", "") +DEF_ATTRIB(IMPLICIT_READS_SYSCFG_K0LOCK, "May read", "MREG.SYSCFG.K0LOCK", "") +DEF_ATTRIB(IMPLICIT_WRITES_SYSCFG_K0LOCK, "May write", "", "MREG.SYSCFG.K0LOCK") +DEF_ATTRIB(IMPLICIT_READS_SYSCFG_TLBLOCK, "May read", "MREG.SYSCFG.TLBLOCK", "") +DEF_ATTRIB(IMPLICIT_WRITES_SYSCFG_TLBLOCK, "May wr", "", "MREG.SYSCFG.TLBLOCK") +DEF_ATTRIB(IMPLICIT_WRITES_SYSCFG_GCA, "May write", "", "MREG.SYSCFG.GCA") +DEF_ATTRIB(IMPLICIT_READS_SYSCFG_GCA, "May 
read", "MREG.SYSCFG.GCA", "") DEF_ATTRIB(IMPLICIT_WRITES_USR, "May write USR", "", "") -DEF_ATTRIB(IMPLICIT_READS_SP, "Reads the SP register", "", "") + +/* Other things the instruction does */ +DEF_ATTRIB(ACC, "Has a multiply", "", "") +DEF_ATTRIB(MPY, "Has a multiply", "", "") +DEF_ATTRIB(SATURATE, "Does signed saturation", "", "") +DEF_ATTRIB(USATURATE, "Does unsigned saturation", "", "") +DEF_ATTRIB(CIRCADDR, "Uses circular addressing mode", "", "") +DEF_ATTRIB(BREVADDR, "Uses bit reverse addressing mode", "", "") +DEF_ATTRIB(BIDIRSHIFTL, "Uses a bidirectional shift left", "", "") +DEF_ATTRIB(BIDIRSHIFTR, "Uses a bidirectional shift right", "", "") +DEF_ATTRIB(BRANCHADDER, "Contains a PC-plus-immediate operation.", "", "") +DEF_ATTRIB(CRSLOT23, "Can execute in slot 2 or slot 3 (CR)", "", "") DEF_ATTRIB(COMMUTES, "The operation is communitive", "", "") DEF_ATTRIB(DEALLOCRET, "dealloc_return", "", "") DEF_ATTRIB(DEALLOCFRAME, "deallocframe", "", "") -DEF_ATTRIB(CRSLOT23, "Can execute in slot 2 or slot 3 (CR)", "", "") +/* Instruction Types */ + +DEF_ATTRIB(IT_ALU, "ALU type", "", "") +DEF_ATTRIB(IT_ALU_ADDSUB, "ALU add or subtract type", "", "") +DEF_ATTRIB(IT_ALU_MINMAX, "ALU MIN or MAX type", "", "") +DEF_ATTRIB(IT_ALU_MOVE, "ALU data movement type", "", "") +DEF_ATTRIB(IT_ALU_LOGICAL, "ALU logical operation type", "", "") +DEF_ATTRIB(IT_ALU_SHIFT, "ALU shift operation type", "", "") +DEF_ATTRIB(IT_ALU_SHIFT_AND_OP, "ALU shift and additional op type", "", "") +DEF_ATTRIB(IT_ALU_CMP, "ALU compare operation type", "", "") + +DEF_ATTRIB(IT_LOAD, "Loads from memory", "", "") +DEF_ATTRIB(IT_STORE, "Stores to memory", "", "") + +DEF_ATTRIB(IT_MPY, "Multiply type", "", "") +DEF_ATTRIB(IT_MPY_32, "32-bit Multiply type", "", "") + +DEF_ATTRIB(IT_COF, "Change-of-flow type", "", "") +DEF_ATTRIB(IT_HWLOOP, "Sets up hardware loop registers", "", "") + +DEF_ATTRIB(IT_MISC, "misc instruction type", "", "") + DEF_ATTRIB(IT_NOP, "nop instruction", "", "") DEF_ATTRIB(IT_EXTENDER, "constant extender instruction", "", "") +/* Exceptions the instruction can generate */ + +DEF_ATTRIB(EXCEPTION_TLB, "Can generate a TLB Miss Exception", "", "") +DEF_ATTRIB(EXCEPTION_ACCESS, "Can generate Access Violation Exception", "", "") +DEF_ATTRIB(EXCEPTION_SWI, "Software Interrupt (trap) exception", "", "") + + +/* Documentation Notes */ +DEF_ATTRIB(NOTE_ARCHV2, "Only available in the V2 architecture", "", "") + +DEF_ATTRIB(NOTE_PACKET_PC, "The PC is the addr of the start of the pkt", "", "") + +DEF_ATTRIB(NOTE_PACKET_NPC, "Next PC is the address following pkt", "", "") + +DEF_ATTRIB(NOTE_CONDITIONAL, "can be conditionally executed", "", "") + +DEF_ATTRIB(NOTE_NEWVAL_SLOT0, "New-value oprnd must execute on slot 0", "", "") + +DEF_ATTRIB(NOTE_RELATIVE_ADDRESS, "A PC-relative address is formed", "", "") + +DEF_ATTRIB(NOTE_LA_RESTRICT, "Cannot be in the last pkt of a HW loop", "", "") + +DEF_ATTRIB(NOTE_OOBVSHIFT, "Possible shift overflow", "", "") +DEF_ATTRIB(NOTE_BIDIRSHIFT, "Bidirectional shift", "", "") + +DEF_ATTRIB(NOTE_CVFLAGS, "Sets the Carry and Overflow flags in USR.", "", "") +DEF_ATTRIB(NOTE_SR_OVF_WHEN_SATURATING, "Might set OVF bit", "", "") +DEF_ATTRIB(NOTE_STNT, + "Non Temporal Data. The :nt appendix is a hint to the " + "microarchitecture indicating that the life of the cache line is " + "short. 
+           "make replacement and allocation decisions.",
+           "", "")
+DEF_ATTRIB(NOTE_PRIV, "Monitor-level feature", "", "")
+DEF_ATTRIB(NOTE_GUEST, "Guest-level feature", "", "")
+DEF_ATTRIB(NOTE_NOPACKET, "solo instruction", "", "")
+DEF_ATTRIB(NOTE_AXOK, "May only be grouped with ALU32 or non-FP XTYPE.", "", "")
+DEF_ATTRIB(NOTE_NOSLOT1, "Packet with this insn must have slot 1 empty", "", "")
+DEF_ATTRIB(NOTE_SLOT1_AOK, "Packet must have slot 1 empty or ALU32", "", "")
+DEF_ATTRIB(NOTE_NOSLOT01, "Packet must have both slot 0 and 1 empty", "", "")
+DEF_ATTRIB(NOTE_NEEDS_MEMLD, "Must be grouped with a memory load", "", "")
+DEF_ATTRIB(NOTE_LATEPRED, "The predicate cannot be used as a .new", "", "")
+DEF_ATTRIB(NOTE_COMPAT_ACCURACY, "In the future accuracy may increase", "", "")
+DEF_ATTRIB(NOTE_NVSLOT0, "Can execute only in slot 0 (ST)", "", "")
+DEF_ATTRIB(NOTE_DEPRECATED, "Will be deprecated in a future version.", "", "")
+DEF_ATTRIB(NOTE_NONAPALIV1, "May not work correctly in Napali V1.", "", "")
+DEF_ATTRIB(NOTE_NOLAHAINAV1, "This may not work correctly in Lahaina V1.", "",
+           "")
+DEF_ATTRIB(NOTE_BADTAG_UNDEF, "Undefined if a tag is non-present", "", "")
+DEF_ATTRIB(NOTE_NOSLOT2_MPY, "Packet cannot have a slot 2 multiply", "", "")
+DEF_ATTRIB(NOTE_HVX_ONLY, "Only available on a core with HVX.", "", "")
+
+DEF_ATTRIB(NOTE_NOCOF_RESTRICT, "Cannot be grouped with any COF", "", "")
+DEF_ATTRIB(NOTE_BRANCHADDER_MAX1, "One PC-plus-offset calculation", "", "")
+
+DEF_ATTRIB(NOTE_CRSLOT23, "Execute on either slot2 or slot3 (CR)", "", "")
+DEF_ATTRIB(NOTE_EXTENSION_AUDIO, "Hexagon audio extensions", "", "")
+DEF_ATTRIB(NOTE_FETCHNT,
+           "Non Temporal Data Cache Prefetch. The :nt appendix is a hint to "
+           "the microarchitecture indicating that the life of the cache line "
+           "fetched is short. This information is used throughout the cache "
+           "hierarchy to make replacement and allocation decisions.",
+           "", "")
+DEF_ATTRIB(NOTE_VECX_V67, "This instruction is only available on V67", "", "")
+
+DEF_ATTRIB(NOTE_NOVP,
+           "This instruction cannot be paired with an HVX permute instruction",
+           "", "")
+DEF_ATTRIB(NOTE_VA_UNARY,
+           "If a packet contains this instruction and an HVX ALU op then the "
+           "ALU op must be unary.",
+           "", "")
+
+
+/* V6 MMVector Notes for Documentation */
+DEF_ATTRIB(NOTE_ANY_RESOURCE, "Can use any HVX resource.", "", "")
+DEF_ATTRIB(NOTE_ANY2_RESOURCE, "Uses any pair of the HVX resources", "", "")
+DEF_ATTRIB(NOTE_PERMUTE_RESOURCE, "Uses the HVX permute resource.", "", "")
+DEF_ATTRIB(NOTE_SHIFT_RESOURCE, "Uses the HVX shift resource.", "", "")
+DEF_ATTRIB(NOTE_MPY_RESOURCE, "Uses an HVX multiply resource.", "", "")
+DEF_ATTRIB(NOTE_MPYDV_RESOURCE, "Uses both HVX multiply resources.", "", "")
+DEF_ATTRIB(NOTE_NT_VMEM, "Non-temporal hint to the micro-architecture", "", "")
+DEF_ATTRIB(NOTE_ALL_RESOURCE, "Uses all HVX resources.", "", "")
+DEF_ATTRIB(NOTE_VMEM, "Immediates are in multiples of vector length.", "", "")
+DEF_ATTRIB(NOTE_ANY_VS_VX_RESOURCE, "Consumes two resources", "", "")
+
+DEF_ATTRIB(NOTE_RT8, "Input scalar register Rt is limited to R0-R7", "", "")
+
+DEF_ATTRIB(NOTE_MX, "This is an in-memory matrix multiply instruction.", "", "")
+DEF_ATTRIB(NOTE_VX_ACC_FWD,
+           "The accumulator (Vxx) source of this instruction must be generated "
+           "in the previous packet to avoid a stall. The accumulator cannot "
+           "come from a .tmp operation.",
+           "", "")
+DEF_ATTRIB(NOTE_TMP_NO_VX,
+           "The tmp load instruction destination register cannot be an "
+           "accumulator register.",
+           "", "")
+
+DEF_ATTRIB(
+    NOTE_NO_ECC,
+    "ECC is not supported for scatter and gather instructions. Enabling ECC "
+    "with unprotected access instructions results in undetermined behavior.",
+    "", "")
+
+/* FP8 instructions */
+DEF_ATTRIB(HVX_FP8, "HVX FP8 extension instruction", "", "")
+DEF_ATTRIB(HVX_IEEE_FP_OUT_8, "HVX IEEE FP extension instruction: 8-bit output",
+           "", "")
+
 /* Restrictions to make note of */
+DEF_ATTRIB(RESTRICT_LOOP_LA, "Cannot be in the last packet of a loop", "", "")
+DEF_ATTRIB(RESTRICT_NEEDS_MEMLD, "Must be grouped with a load", "", "")
 DEF_ATTRIB(RESTRICT_COF_MAX1, "One change-of-flow per packet", "", "")
 DEF_ATTRIB(RESTRICT_NOPACKET, "Not allowed in a packet", "", "")
+DEF_ATTRIB(RESTRICT_NOSRMOVE, "Do not write SR in the same packet", "", "")
 DEF_ATTRIB(RESTRICT_SLOT0ONLY, "Must execute on slot0", "", "")
 DEF_ATTRIB(RESTRICT_SLOT1ONLY, "Must execute on slot1", "", "")
 DEF_ATTRIB(RESTRICT_SLOT2ONLY, "Must execute on slot2", "", "")
 DEF_ATTRIB(RESTRICT_SLOT3ONLY, "Must execute on slot3", "", "")
+DEF_ATTRIB(RESTRICT_NOSLOT2_MPY, "A packet cannot have a slot 2 mpy", "", "")
 DEF_ATTRIB(RESTRICT_NOSLOT1, "No slot 1 instruction in parallel", "", "")
+DEF_ATTRIB(RESTRICT_SLOT1_AOK, "Slot 1 insn must be empty or A-type", "", "")
+DEF_ATTRIB(RESTRICT_NOSLOT01, "No slot 0 or 1 instructions in parallel", "", "")
+DEF_ATTRIB(RESTRICT_NOSLOT1_STORE, "Packet must not have slot 1 store", "", "")
+DEF_ATTRIB(RESTRICT_NOSLOT0_LOAD, "Packet must not have a slot 0 load", "", "")
+DEF_ATTRIB(RESTRICT_NOCOF, "Cannot be grouped with any COF", "", "")
+DEF_ATTRIB(RESTRICT_BRANCHADDER_MAX1, "One PC-plus-offset calculation", "", "")
 DEF_ATTRIB(RESTRICT_PREFERSLOT0, "Try to encode into slot 0", "", "")
+DEF_ATTRIB(RESTRICT_SINGLE_MEM_FIRST, "Single memory op must be last", "", "")
 DEF_ATTRIB(RESTRICT_PACKET_AXOK, "May exist with A-type or X-type", "", "")
+DEF_ATTRIB(RESTRICT_PACKET_SOMEREGS_OK, "Relaxed grouping rules", "", "")
+DEF_ATTRIB(RESTRICT_LATEPRED, "Predicate cannot be used as a .new.", "", "")
+
+DEF_ATTRIB(PAIR_1OF2, "For assembler", "", "")
+DEF_ATTRIB(PAIR_2OF2, "For assembler", "", "")
+DEF_ATTRIB(NOTE_MX_PAIR,
+           "Weights and Activations need to be paired in a packet.", "", "")
+DEF_ATTRIB(NOTE_RESTRICT_CVI_NOVP,
+           "This instruction cannot use the permute/shift resource", "", "")
+
+/* Performance-based preferences */
+DEF_ATTRIB(PREFER_SLOT3, "Complex XU preferring slot3", "", "")
+
+DEF_ATTRIB(RELAX_COF_1ST, "COF can be first in assembly order", "", "")
+DEF_ATTRIB(RELAX_COF_2ND, "COF can be second in assembly order", "", "")
 DEF_ATTRIB(ICOP, "Instruction cache op", "", "")
+DEF_ATTRIB(INTRINSIC_RETURNS_UNSIGNED, "Intrinsic returns an unsigned", "", "")
+
+DEF_ATTRIB(PRED_BIT_1, "The branch uses bit 1 as the prediction bit", "", "")
+DEF_ATTRIB(PRED_BIT_4, "The branch uses bit 4 as the prediction bit", "", "")
+DEF_ATTRIB(PRED_BIT_8, "The branch uses bit 8 as the prediction bit", "", "")
+DEF_ATTRIB(PRED_BIT_12, "The branch uses bit 12 as the prediction bit", "", "")
+DEF_ATTRIB(PRED_BIT_13, "The branch uses bit 13 as the prediction bit", "", "")
+DEF_ATTRIB(PRED_BIT_7, "The branch uses bit 7 as the prediction bit", "", "")
+DEF_ATTRIB(HWLOOP0_SETUP, "Sets up HW loop0", "", "")
+DEF_ATTRIB(HWLOOP1_SETUP, "Sets up HW loop1", "", "")
 DEF_ATTRIB(HWLOOP0_END, "Ends HW loop0", "", "")
 DEF_ATTRIB(HWLOOP1_END, "Ends HW loop1", "", "")
 DEF_ATTRIB(RET_TYPE, "return type", "", "")
+DEF_ATTRIB(HINTJR, "hintjr type", "", "")
 DEF_ATTRIB(DCZEROA, "dczeroa type", "", "")
+DEF_ATTRIB(ICTAGOP, "ictag op type", "", "")
 DEF_ATTRIB(ICFLUSHOP, "icflush op type", "", "")
 DEF_ATTRIB(DCFLUSHOP, "dcflush op type", "", "")
+DEF_ATTRIB(DCTAGOP, "dctag op type", "", "")
 DEF_ATTRIB(L2FLUSHOP, "l2flush op type", "", "")
+DEF_ATTRIB(L2TAGOP, "l2tag op type", "", "")
 DEF_ATTRIB(DCFETCH, "dcfetch type", "", "")
+DEF_ATTRIB(BIMODAL_BRANCH, "Updates the bimodal branch predictor", "", "")
+DEF_ATTRIB(VECINSN, "Long Vector Instruction", "", "")
+DEF_ATTRIB(MEMSIZE_32B, "Memory width is 32 bytes", "", "")
+DEF_ATTRIB(FOUR_PHASE, "Four Phase Instruction", "", "")
 DEF_ATTRIB(L2FETCH, "Instruction is l2fetch type", "", "")
+DEF_ATTRIB(PREDUSE_BSB, "Instructions need back-skip-back scheduling", "", "")
 DEF_ATTRIB(ICINVA, "icinva", "", "")
 DEF_ATTRIB(DCCLEANINVA, "dccleaninva", "", "")
+DEF_ATTRIB(EXTENSION_AUDIO, "audio extension", "", "")
+
+DEF_ATTRIB(MEMCPY, "memcpy or dma-type instruction", "", "")
 DEF_ATTRIB(NO_INTRINSIC, "Don't generate an intrinsic", "", "")
-/* Documentation Notes */
-DEF_ATTRIB(NOTE_CONDITIONAL, "can be conditionally executed", "", "")
-DEF_ATTRIB(NOTE_NEWVAL_SLOT0, "New-value oprnd must execute on slot 0", "", "")
-DEF_ATTRIB(NOTE_PRIV, "Monitor-level feature", "", "")
-DEF_ATTRIB(NOTE_NOPACKET, "solo instruction", "", "")
-DEF_ATTRIB(NOTE_AXOK, "May only be grouped with ALU32 or non-FP XTYPE.", "", "")
-DEF_ATTRIB(NOTE_LATEPRED, "The predicate can not be used as a .new", "", "")
-DEF_ATTRIB(NOTE_NVSLOT0, "Can execute only in slot 0 (ST)", "", "")
-DEF_ATTRIB(NOTE_NOVP, "Cannot be paired with a HVX permute instruction", "", "")
-DEF_ATTRIB(NOTE_VA_UNARY, "Combined with HVX ALU op (must be unary)", "", "")
+DEF_ATTRIB(NO_XML, "Don't generate XML docs for this instruction", "", "")
-/* V6 MMVector Notes for Documentation */
-DEF_ATTRIB(NOTE_SHIFT_RESOURCE, "Uses the HVX shift resource.", "", "")
-/* Restrictions to make note of */
-DEF_ATTRIB(RESTRICT_NOSLOT1_STORE, "Packet must not have slot 1 store", "", "")
-DEF_ATTRIB(RESTRICT_LATEPRED, "Predicate can not be used as a .new.", "", "")
+DEF_ATTRIB(DMA, "User-DMA instruction", "", "")
+DEF_ATTRIB(VERIF_DMASTEP,
+           "Hiphop needs to step dma prior to executing this packet", "", "")
+DEF_ATTRIB(VERIF_DMATICK,
+           "DMA gets a tick in verif mode for this instruction after a commit",
+           "", "")
+
+DEF_ATTRIB(HVX_IEEE_FP, "HVX IEEE FP extension instruction", "", "")
+DEF_ATTRIB(NOTE_HVX_IEEE_FP,
+           "Only supported on the HVX cores with the IEEE FP extension", "", "")
+
+DEF_ATTRIB(HVX_IEEE_FP_DV_ONE,
+           "HVX IEEE FP extension instruction - dual pipes: P2 and P3 - output "
+           "only on P2",
+           "", "")
+DEF_ATTRIB(HVX_IEEE_FP_ACC, "HVX IEEE FP accumulate instruction", "", "")
+DEF_ATTRIB(HVX_IEEE_BF,
+           "HVX IEEE BF extension instruction: 16-bit bfloat input", "", "")
+DEF_ATTRIB(HVX_IEEE_FP_OUT_BF,
+           "HVX IEEE FP extension instruction: 16-bit bfloat output", "", "")
+DEF_ATTRIB(HVX_IEEE_FP_OUT_16,
+           "HVX IEEE FP extension instruction: 16-bit output", "", "")
+DEF_ATTRIB(HVX_IEEE_FP_OUT_32,
+           "HVX IEEE FP extension instruction: 32-bit output", "", "")
+DEF_ATTRIB(HVX_IEEE_FP_BINARY_LATE,
+           "HVX IEEE FP extension instruction: Both inputs can arrive late", "",
+           "")
 /* Keep this as the last attribute: */
 DEF_ATTRIB(ZZ_LASTATTRIB, "Last attribute in the file", "", "")
diff --git a/target/hexagon/common-semi-target.h 
b/target/hexagon/common-semi-target.h new file mode 100644 index 000000000000..759aaeba905f --- /dev/null +++ b/target/hexagon/common-semi-target.h @@ -0,0 +1,87 @@ +/* + * Target-specific parts of semihosting/arm-compat-semi.c. + * + * Copyright(c) 2025 Qualcomm Innovation Center, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef TARGET_HEXAGON_COMMON_SEMI_TARGET_H +#define TARGET_HEXAGON_COMMON_SEMI_TARGET_H + +#include "cpu.h" +#include "cpu_helper.h" +#include "qemu/log.h" +#include "semihosting/uaccess.h" + +static inline bool common_semi_read_arg_word(CPUArchState *env, + target_ulong *save_to, + target_ulong args_addr, + int arg_num) +{ + hexagon_read_memory(env, args_addr + (arg_num) * 4, 4, save_to, 0); + return false; +} + +static inline target_ulong common_semi_arg(CPUState *cs, int argno) +{ + CPUHexagonState *env = cpu_env(cs); + return arch_get_thread_reg(env, HEX_REG_R00 + argno); +} + +static inline void common_semi_set_ret(CPUState *cs, target_ulong ret) +{ + CPUHexagonState *env = cpu_env(cs); + arch_set_thread_reg(env, HEX_REG_R00, ret); +} + +static inline void hex_semi_set_err(CPUState *cs, target_ulong err) +{ + CPUHexagonState *env = cpu_env(cs); + arch_set_thread_reg(env, HEX_REG_R01, err); +} + +static inline bool common_semi_sys_exit_extended(CPUState *cs, int nr) +{ + return false; +} + +static inline bool is_64bit_semihosting(CPUArchState *env) +{ + return false; +} + +static inline target_ulong common_semi_stack_bottom(CPUState *cs) +{ + CPUHexagonState *env = cpu_env(cs); + return arch_get_thread_reg(env, HEX_REG_SP); +} + +static inline bool common_semi_has_synccache(CPUArchState *env) +{ + return false; +} + +static inline void hex_prepare_for_read(CPUState *cs, target_ulong fd, + target_ulong buf, target_ulong len) +{ + CPUHexagonState *env = cpu_env(cs); + /* + * Need to make sure the page we are going to write to is available. + * The file pointer advances with the read. If the write to bufaddr + * faults the swi function will be restarted but the file pointer + * will be wrong. + */ + hexagon_touch_memory(env, buf, len, 0); +} + +const struct semihosting_opt_callbacks hex_opt_callbacks = { + .prepare_for_read = hex_prepare_for_read, + .set_err = hex_semi_set_err, +}; + +SEMIHOSTING_REGISTER_OPT_CALLBACKS(hex_opt_callbacks) + +#define SEMIHOSTING_EXT_OPEN_MODES + +#endif diff --git a/target/hexagon/cpu-param.h b/target/hexagon/cpu-param.h index 45ee7b46409c..d414ca89d690 100644 --- a/target/hexagon/cpu-param.h +++ b/target/hexagon/cpu-param.h @@ -18,9 +18,18 @@ #ifndef HEXAGON_CPU_PARAM_H #define HEXAGON_CPU_PARAM_H +#ifdef CONFIG_USER_ONLY #define TARGET_PAGE_BITS 16 /* 64K pages */ +#else +#define TARGET_PAGE_BITS 12 /* 4K pages */ +#endif #define TARGET_PHYS_ADDR_SPACE_BITS 36 #define TARGET_VIRT_ADDR_SPACE_BITS 32 +/* + * Hexagon processors have a strong memory model. + */ +#define TCG_GUEST_DEFAULT_MO (TCG_MO_ALL) + #endif diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c index a9beb9a17572..f90e8f726a88 100644 --- a/target/hexagon/cpu.c +++ b/target/hexagon/cpu.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved. + * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,13 +26,32 @@ #include "fpu/softfloat-helpers.h" #include "tcg/tcg.h" #include "exec/gdbstub.h" +#include "cpu_helper.h" +#include "max.h" +#include "hex_mmu.h" +#include "hw/hexagon/hexagon.h" + +#ifndef CONFIG_USER_ONLY +#include "macros.h" +#include "sys_macros.h" +#include "qemu/main-loop.h" +#include "hex_interrupts.h" +#include "hexswi.h" +#endif + +#define DEFINE_STD_CPU_INIT_FUNC(REV) \ + static void hexagon_##REV##_cpu_init(Object *obj) \ + { \ + HexagonCPU *cpu = HEXAGON_CPU(obj); \ + cpu->rev_reg = REV##_rev; \ + } -static void hexagon_v66_cpu_init(Object *obj) { } -static void hexagon_v67_cpu_init(Object *obj) { } -static void hexagon_v68_cpu_init(Object *obj) { } -static void hexagon_v69_cpu_init(Object *obj) { } -static void hexagon_v71_cpu_init(Object *obj) { } -static void hexagon_v73_cpu_init(Object *obj) { } +DEFINE_STD_CPU_INIT_FUNC(v66) +DEFINE_STD_CPU_INIT_FUNC(v67) +DEFINE_STD_CPU_INIT_FUNC(v68) +DEFINE_STD_CPU_INIT_FUNC(v69) +DEFINE_STD_CPU_INIT_FUNC(v71) +DEFINE_STD_CPU_INIT_FUNC(v73) static ObjectClass *hexagon_cpu_class_by_name(const char *cpu_model) { @@ -50,6 +69,18 @@ static ObjectClass *hexagon_cpu_class_by_name(const char *cpu_model) } static const Property hexagon_cpu_properties[] = { +#if !defined(CONFIG_USER_ONLY) + DEFINE_PROP_UINT32("jtlb-entries", HexagonCPU, num_tlbs, MAX_TLB_ENTRIES), + DEFINE_PROP_UINT32("l2vic-base-addr", HexagonCPU, l2vic_base_addr, + 0xffffffffULL), + DEFINE_PROP_UINT32("qtimer-base-addr", HexagonCPU, qtimer_base_addr, + 0xffffffffULL), + DEFINE_PROP_UINT32("hvx-contexts", HexagonCPU, hvx_contexts, 0), + DEFINE_PROP_UINT32("exec-start-addr", HexagonCPU, boot_addr, 0xffffffffULL), + DEFINE_PROP_UINT64("config-table-addr", HexagonCPU, config_table_addr, + 0xffffffffULL), +#endif + DEFINE_PROP_UINT32("dsp-rev", HexagonCPU, rev_reg, 0), DEFINE_PROP_BOOL("lldb-compat", HexagonCPU, lldb_compat, false), DEFINE_PROP_UNSIGNED("lldb-stack-adjust", HexagonCPU, lldb_stack_adjust, 0, qdev_prop_uint32, target_ulong), @@ -62,11 +93,41 @@ const char * const hexagon_regnames[TOTAL_PER_THREAD_REGS] = { "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31", "sa0", "lc0", "sa1", "lc1", "p3_0", "c5", "m0", "m1", - "usr", "pc", "ugp", "gp", "cs0", "cs1", "c14", "c15", - "c16", "c17", "c18", "c19", "pkt_cnt", "insn_cnt", "hvx_cnt", "c23", - "c24", "c25", "c26", "c27", "c28", "c29", "c30", "c31", + "usr", "pc", "ugp", "gp", "cs0", "cs1", "upcyclelo", "upcyclehi", + "framelimit", "framekey", "pktcountlo", "pktcounthi", "upmucnt0", + "upmucnt1", "upmucnt2", "upmucnt3", "upmucnt4", "upmucnt5", "upmucnt6", + "upmucnt7", "c28", "c29", "utimerlo", "utimerhi", }; +#ifndef CONFIG_USER_ONLY +const char * const hexagon_sregnames[] = { + "sgp0", "sgp1", "stid", "elr", "badva0", + "badva1", "ssr", "ccr", "htid", "badva", + "imask", "gevb", "vwctrl", "s13", "s14", + "s15", "evb", "modectl", "syscfg", "segment", + "ipendad", "vid", "vid1", "bestwait", "s24", + "schedcfg", "s26", "cfgbase", "diag", "rev", + "pcyclelo", "pcyclehi", "isdbst", "isdbcfg0", "isdbcfg1", + "livelock", "brkptpc0", "brkptcfg0", "brkptpc1", "brkptcfg1", + "isdbmbxin", "isdbmbxout", "isdben", "isdbgpr", "pmucnt4", + "pmucnt5", "pmucnt6", "pmucnt7", "pmucnt0", "pmucnt1", + "pmucnt2", "pmucnt3", "pmuevtcfg", "pmustid0", "pmuevtcfg1", + "pmustid1", "timerlo", "timerhi", "pmucfg", "rgdr2", + 
"rgdr", "turkey", "duck", "chicken", +}; + +G_STATIC_ASSERT(NUM_SREGS == ARRAY_SIZE(hexagon_sregnames)); + +const char * const hexagon_gregnames[] = { + "gelr", "gsr", "gosp", "gbadva", "gcommit1t", + "gcommit2t", "gcommit3t", "gcommit4t", "gcommit5t", "gcommit6t", + "gpcycle1t", "gpcycle2t", "gpcycle3t", "gpcycle4t", "gpcycle5t", + "gpcycle6t", "gpmucnt4", "gpmucnt5", "gpmucnt6", "gpmucnt7", + "gcommit7t", "gcommit8t", "gpcycle7t", "gpcycle8t", "gpcyclelo", + "gpcyclehi", "gpmucnt0", "gpmucnt1", "gpmucnt2", "gpmucnt3", + "g30", "g31", +}; +#endif /* * One of the main debugging techniques is to use "-d cpu" and compare against * LLDB output when single stepping. However, the target and qemu put the @@ -176,7 +237,7 @@ void hexagon_debug_qreg(CPUHexagonState *env, int regnum) print_qreg(stdout, env, regnum, false); } -static void hexagon_dump(CPUHexagonState *env, FILE *f, int flags) +void hexagon_dump(CPUHexagonState *env, FILE *f, int flags) { HexagonCPU *cpu = env_archcpu(env); @@ -216,8 +277,7 @@ static void hexagon_dump(CPUHexagonState *env, FILE *f, int flags) qemu_fprintf(f, " cs0 = 0x00000000\n"); qemu_fprintf(f, " cs1 = 0x00000000\n"); #else - print_reg(f, env, HEX_REG_CAUSE); - print_reg(f, env, HEX_REG_BADVA); + print_reg(f, env, HEX_SREG_BADVA); print_reg(f, env, HEX_REG_CS0); print_reg(f, env, HEX_REG_CS1); #endif @@ -262,9 +322,28 @@ static void hexagon_cpu_synchronize_from_tb(CPUState *cs, cpu_env(cs)->gpr[HEX_REG_PC] = tb->pc; } +#ifndef CONFIG_USER_ONLY +bool hexagon_thread_is_enabled(CPUHexagonState *env) +{ + target_ulong modectl = arch_get_system_reg(env, HEX_SREG_MODECTL); + uint32_t thread_enabled_mask = GET_FIELD(MODECTL_E, modectl); + bool E_bit = thread_enabled_mask & (0x1 << env->threadId); + + return E_bit; +} +#endif + static bool hexagon_cpu_has_work(CPUState *cs) { +#ifndef CONFIG_USER_ONLY + CPUHexagonState *env = cpu_env(cs); + + return hexagon_thread_is_enabled(env) && + (cs->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_SWI + | CPU_INTERRUPT_K0_UNLOCK | CPU_INTERRUPT_TLB_UNLOCK)); +#else return true; +#endif } static void hexagon_restore_state_to_opc(CPUState *cs, @@ -274,6 +353,29 @@ static void hexagon_restore_state_to_opc(CPUState *cs, cpu_env(cs)->gpr[HEX_REG_PC] = data[0]; } + +#ifndef CONFIG_USER_ONLY +static void mmu_reset(CPUHexagonState *env) +{ + CPUState *cs = env_cpu(env); + if (cs->cpu_index == 0) { + memset(env->hex_tlb, 0, sizeof(*env->hex_tlb)); + } +} + +void hexagon_cpu_soft_reset(CPUHexagonState *env) +{ + BQL_LOCK_GUARD(); + arch_set_system_reg(env, HEX_SREG_SSR, 0); + hexagon_ssr_set_cause(env, HEX_CAUSE_RESET); + + target_ulong evb = arch_get_system_reg(env, HEX_SREG_EVB); + arch_set_thread_reg(env, HEX_REG_PC, evb); +} +#endif + + +#define HEXAGON_CFG_ADDR_BASE(addr) (((addr) >> 16) & 0x0fffff) static void hexagon_cpu_reset_hold(Object *obj, ResetType type) { CPUState *cs = CPU(obj); @@ -288,6 +390,45 @@ static void hexagon_cpu_reset_hold(Object *obj, ResetType type) set_float_detect_tininess(float_tininess_before_rounding, &env->fp_status); /* Default NaN value: sign bit set, all frac bits set */ set_float_default_nan_pattern(0b11111111, &env->fp_status); + +#ifndef CONFIG_USER_ONLY + HexagonCPU *cpu = HEXAGON_CPU(cs); + + if (cs->cpu_index == 0) { + memset(env->g_sreg, 0, sizeof(target_ulong) * NUM_SREGS); + } + memset(env->t_sreg, 0, sizeof(target_ulong) * NUM_SREGS); + memset(env->greg, 0, sizeof(target_ulong) * NUM_GREGS); + + if (cs->cpu_index == 0) { + arch_set_system_reg(env, HEX_SREG_REV, cpu->rev_reg); + 
arch_set_system_reg(env, HEX_SREG_MODECTL, 0x1); + *(env->g_pcycle_base) = 0; + } + + memset(env->gpr, 0, sizeof(target_ulong) * TOTAL_PER_THREAD_REGS); + memset(env->pred, 0, sizeof(target_ulong) * NUM_PREGS); + memset(env->VRegs, 0, sizeof(MMVector) * NUM_VREGS); + memset(env->QRegs, 0, sizeof(MMQReg) * NUM_QREGS); + memset(env->vstore_pending, 0, sizeof(target_ulong) * VSTORES_MAX); + env->t_cycle_count = 0; + env->vtcm_pending = false; + + mmu_reset(env); + arch_set_system_reg(env, HEX_SREG_HTID, cs->cpu_index); + hexagon_cpu_soft_reset(env); + env->threadId = cs->cpu_index; + env->tlb_lock_state = HEX_LOCK_UNLOCKED; + env->k0_lock_state = HEX_LOCK_UNLOCKED; + env->tlb_lock_count = 0; + env->k0_lock_count = 0; + env->next_PC = 0; + env->wait_next_pc = 0; + env->cause_code = -1; + arch_set_thread_reg(env, HEX_REG_PC, cpu->boot_addr); + arch_set_system_reg(env, HEX_SREG_CFGBASE, + HEXAGON_CFG_ADDR_BASE(cpu->config_table_addr)); +#endif } static void hexagon_cpu_disas_set_info(CPUState *s, disassemble_info *info) @@ -308,29 +449,291 @@ static void hexagon_cpu_realize(DeviceState *dev, Error **errp) return; } +#ifndef CONFIG_USER_ONLY + HexagonCPU *cpu = HEXAGON_CPU(cs); + if (cpu->num_tlbs > MAX_TLB_ENTRIES) { + error_setg(errp, "Number of TLBs selected is invalid"); + return; + } +#endif + gdb_register_coprocessor(cs, hexagon_hvx_gdb_read_register, hexagon_hvx_gdb_write_register, gdb_find_static_feature("hexagon-hvx.xml"), 0); +#ifndef CONFIG_USER_ONLY + gdb_register_coprocessor(cs, hexagon_sys_gdb_read_register, + hexagon_sys_gdb_write_register, + gdb_find_static_feature("hexagon-sys.xml"), 0); +#endif + qemu_init_vcpu(cs); - cpu_reset(cs); + CPUHexagonState *env = cpu_env(cs); +#ifndef CONFIG_USER_ONLY + hex_mmu_realize(env); + if (cs->cpu_index == 0) { + env->g_sreg = g_new0(target_ulong, NUM_SREGS); + } else { + CPUState *cpu0 = qemu_get_cpu(0); + CPUHexagonState *env0 = cpu_env(cpu0); + env->g_sreg = env0->g_sreg; + } +#endif + if (cs->cpu_index == 0) { + env->g_pcycle_base = g_malloc0(sizeof(*env->g_pcycle_base)); + } else { + CPUState *cpu0 = qemu_get_cpu(0); + env->g_pcycle_base = cpu_env(cpu0)->g_pcycle_base; + } mcc->parent_realize(dev, errp); } +#if !defined(CONFIG_USER_ONLY) +static void hexagon_cpu_set_irq(void *opaque, int irq, int level) +{ + HexagonCPU *cpu = HEXAGON_CPU(opaque); + CPUState *cs = CPU(cpu); + CPUHexagonState *env = cpu_env(cs); + + switch (irq) { + case HEXAGON_CPU_IRQ_0 ... 
HEXAGON_CPU_IRQ_7: + qemu_log_mask(CPU_LOG_INT, "%s: irq %d, level %d\n", + __func__, irq, level); + if (level) { + hex_raise_interrupts(env, 1 << irq, CPU_INTERRUPT_HARD); + } + break; + default: + g_assert_not_reached(); + } +} +#endif + + static void hexagon_cpu_init(Object *obj) { +#if !defined(CONFIG_USER_ONLY) + HexagonCPU *cpu = HEXAGON_CPU(obj); + qdev_init_gpio_in(DEVICE(cpu), hexagon_cpu_set_irq, 8); +#endif } #include "accel/tcg/cpu-ops.h" +#if !defined(CONFIG_USER_ONLY) +static bool get_physical_address(CPUHexagonState *env, hwaddr *phys, int *prot, + int *size, int32_t *excp, target_ulong address, + MMUAccessType access_type, int mmu_idx) + +{ + if (hexagon_cpu_mmu_enabled(env)) { + return hex_tlb_find_match(env, address, access_type, phys, prot, size, + excp, mmu_idx); + } else { + *phys = address & 0xFFFFFFFF; + *prot = PAGE_VALID | PAGE_READ | PAGE_WRITE | PAGE_EXEC; + *size = TARGET_PAGE_SIZE; + return true; + } +} + +/* qemu seems to only want to know about TARGET_PAGE_SIZE pages */ +static void find_qemu_subpage(vaddr *addr, hwaddr *phys, int page_size) +{ + vaddr page_start = *addr & ~((vaddr)(page_size - 1)); + vaddr offset = ((*addr - page_start) / TARGET_PAGE_SIZE) * TARGET_PAGE_SIZE; + *addr = page_start + offset; + *phys += offset; +} + +static hwaddr hexagon_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) +{ + CPUHexagonState *env = cpu_env(cs); + hwaddr phys_addr; + int prot; + int page_size = 0; + int32_t excp = 0; + int mmu_idx = MMU_KERNEL_IDX; + + if (get_physical_address(env, &phys_addr, &prot, &page_size, &excp, + addr, 0, mmu_idx)) { + find_qemu_subpage(&addr, &phys_addr, page_size); + return phys_addr; + } + + return -1; +} + + +#define INVALID_BADVA 0xbadabada + +static void set_badva_regs(CPUHexagonState *env, target_ulong VA, int slot, + MMUAccessType access_type) +{ + arch_set_system_reg(env, HEX_SREG_BADVA, VA); + + if (access_type == MMU_INST_FETCH || slot == 0) { + arch_set_system_reg(env, HEX_SREG_BADVA0, VA); + arch_set_system_reg(env, HEX_SREG_BADVA1, INVALID_BADVA); + SET_SSR_FIELD(env, SSR_V0, 1); + SET_SSR_FIELD(env, SSR_V1, 0); + SET_SSR_FIELD(env, SSR_BVS, 0); + } else if (slot == 1) { + arch_set_system_reg(env, HEX_SREG_BADVA0, INVALID_BADVA); + arch_set_system_reg(env, HEX_SREG_BADVA1, VA); + SET_SSR_FIELD(env, SSR_V0, 0); + SET_SSR_FIELD(env, SSR_V1, 1); + SET_SSR_FIELD(env, SSR_BVS, 1); + } else { + g_assert_not_reached(); + } +} + +static void raise_tlbmiss_exception(CPUState *cs, target_ulong VA, int slot, + MMUAccessType access_type) +{ + CPUHexagonState *env = cpu_env(cs); + + set_badva_regs(env, VA, slot, access_type); + + switch (access_type) { + case MMU_INST_FETCH: + cs->exception_index = HEX_EVENT_TLB_MISS_X; + if ((VA & ~TARGET_PAGE_MASK) == 0) { + env->cause_code = HEX_CAUSE_TLBMISSX_CAUSE_NEXTPAGE; + } else { + env->cause_code = HEX_CAUSE_TLBMISSX_CAUSE_NORMAL; + } + break; + case MMU_DATA_LOAD: + cs->exception_index = HEX_EVENT_TLB_MISS_RW; + env->cause_code = HEX_CAUSE_TLBMISSRW_CAUSE_READ; + break; + case MMU_DATA_STORE: + cs->exception_index = HEX_EVENT_TLB_MISS_RW; + env->cause_code = HEX_CAUSE_TLBMISSRW_CAUSE_WRITE; + break; + } +} + +static void raise_perm_exception(CPUState *cs, target_ulong VA, int slot, + MMUAccessType access_type, int32_t excp) +{ + CPUHexagonState *env = cpu_env(cs); + + set_badva_regs(env, VA, slot, access_type); + cs->exception_index = excp; +} + +static const char *access_type_names[] = { "MMU_DATA_LOAD ", "MMU_DATA_STORE", + "MMU_INST_FETCH" }; + +static const char *mmu_idx_names[] = { 
"MMU_USER_IDX", "MMU_GUEST_IDX", + "MMU_KERNEL_IDX" }; + +static bool hexagon_tlb_fill(CPUState *cs, vaddr address, int size, + MMUAccessType access_type, int mmu_idx, bool probe, + uintptr_t retaddr) +{ + CPUHexagonState *env = cpu_env(cs); + static int slot = 0 /* This is always zero for now */; + hwaddr phys; + int prot = 0; + int page_size = 0; + int32_t excp = 0; + bool ret = 0; + + qemu_log_mask( + CPU_LOG_MMU, + "%s: tid = 0x%x, pc = 0x%08" PRIx32 ", vaddr = 0x%08" VADDR_PRIx + ", size = %d, %s,\tprobe = %d, %s\n", + __func__, env->threadId, env->gpr[HEX_REG_PC], address, size, + access_type_names[access_type], probe, mmu_idx_names[mmu_idx]); + ret = get_physical_address(env, &phys, &prot, &page_size, &excp, address, + access_type, mmu_idx); + if (ret) { + if (!excp) { + find_qemu_subpage(&address, &phys, page_size); + tlb_set_page(cs, address, phys, prot, mmu_idx, TARGET_PAGE_SIZE); + return ret; + } else { + raise_perm_exception(cs, address, slot, access_type, excp); + do_raise_exception(env, cs->exception_index, env->gpr[HEX_REG_PC], + retaddr); + } + } + if (probe) { + return false; + } + raise_tlbmiss_exception(cs, address, slot, access_type); + do_raise_exception(env, cs->exception_index, env->gpr[HEX_REG_PC], retaddr); +} + + +#include "hw/core/sysemu-cpu-ops.h" + +static const struct SysemuCPUOps hexagon_sysemu_ops = { + .get_phys_page_debug = hexagon_cpu_get_phys_page_debug, +}; + +static bool hexagon_cpu_exec_interrupt(CPUState *cs, int interrupt_request) +{ + CPUHexagonState *env = cpu_env(cs); + if (interrupt_request & CPU_INTERRUPT_TLB_UNLOCK) { + cs->halted = false; + cpu_reset_interrupt(cs, CPU_INTERRUPT_TLB_UNLOCK); + return true; + } + if (interrupt_request & CPU_INTERRUPT_K0_UNLOCK) { + cs->halted = false; + cpu_reset_interrupt(cs, CPU_INTERRUPT_K0_UNLOCK); + return true; + } + if (interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_SWI)) { + return hex_check_interrupts(env); + } + return false; +} + +#endif + static const TCGCPUOps hexagon_tcg_ops = { .initialize = hexagon_translate_init, .translate_code = hexagon_translate_code, .synchronize_from_tb = hexagon_cpu_synchronize_from_tb, .restore_state_to_opc = hexagon_restore_state_to_opc, +#if !defined(CONFIG_USER_ONLY) + .cpu_exec_interrupt = hexagon_cpu_exec_interrupt, + .tlb_fill = hexagon_tlb_fill, + .cpu_exec_halt = hexagon_cpu_has_work, + .do_interrupt = hexagon_cpu_do_interrupt, +#endif /* !CONFIG_USER_ONLY */ }; +static int hexagon_cpu_mmu_index(CPUState *cs, bool ifetch) +{ +#ifndef CONFIG_USER_ONLY + BQL_LOCK_GUARD(); + CPUHexagonState *env = cpu_env(cs); + uint32_t syscfg = arch_get_system_reg(env, HEX_SREG_SYSCFG); + uint8_t mmuen = GET_SYSCFG_FIELD(SYSCFG_MMUEN, syscfg); + if (!mmuen) { + return MMU_KERNEL_IDX; + } + + int cpu_mode = get_cpu_mode(env); + if (cpu_mode == HEX_CPU_MODE_MONITOR) { + return MMU_KERNEL_IDX; + } else if (cpu_mode == HEX_CPU_MODE_GUEST) { + return MMU_GUEST_IDX; + } +#endif + + return MMU_USER_IDX; +} + + static void hexagon_cpu_class_init(ObjectClass *c, void *data) { HexagonCPUClass *mcc = HEXAGON_CPU_CLASS(c); @@ -347,6 +750,7 @@ static void hexagon_cpu_class_init(ObjectClass *c, void *data) cc->class_by_name = hexagon_cpu_class_by_name; cc->has_work = hexagon_cpu_has_work; + cc->mmu_index = hexagon_cpu_mmu_index; cc->dump_state = hexagon_dump_state; cc->set_pc = hexagon_cpu_set_pc; cc->get_pc = hexagon_cpu_get_pc; @@ -355,9 +759,39 @@ static void hexagon_cpu_class_init(ObjectClass *c, void *data) cc->gdb_stop_before_watchpoint = true; cc->gdb_core_xml_file = 
"hexagon-core.xml"; cc->disas_set_info = hexagon_cpu_disas_set_info; +#ifndef CONFIG_USER_ONLY + cc->sysemu_ops = &hexagon_sysemu_ops; + dc->vmsd = &vmstate_hexagon_cpu; +#endif +#ifdef CONFIG_TCG cc->tcg_ops = &hexagon_tcg_ops; +#endif } +#ifndef CONFIG_USER_ONLY +uint32_t hexagon_greg_read(CPUHexagonState *env, uint32_t reg) +{ + target_ulong ssr = arch_get_system_reg(env, HEX_SREG_SSR); + int ssr_ce = GET_SSR_FIELD(SSR_CE, ssr); + + if (reg <= HEX_GREG_G3) { + return env->greg[reg]; + } + switch (reg) { + case HEX_GREG_GPCYCLELO: + return ssr_ce ? hexagon_get_sys_pcycle_count_low(env) : 0; + + case HEX_GREG_GPCYCLEHI: + return ssr_ce ? hexagon_get_sys_pcycle_count_high(env) : 0; + + default: + qemu_log_mask(LOG_UNIMP, "reading greg %" PRId32 + " not yet supported.\n", reg); + return 0; + } +} +#endif + #define DEFINE_CPU(type_name, initfn) \ { \ .name = type_name, \ diff --git a/target/hexagon/cpu.h b/target/hexagon/cpu.h index f78c8f9c2a00..70ed3d5ba7b9 100644 --- a/target/hexagon/cpu.h +++ b/target/hexagon/cpu.h @@ -20,14 +20,21 @@ #include "fpu/softfloat-types.h" +#define NUM_GREGS 32 +#define GREG_WRITES_MAX 2 +#define NUM_SREGS 64 +#define SREG_WRITES_MAX 2 + #include "cpu-qom.h" #include "exec/cpu-defs.h" +#include "exec/cpu-common.h" #include "hex_regs.h" #include "mmvec/mmvec.h" #include "hw/registerfields.h" #ifndef CONFIG_USER_ONLY -#error "Hexagon does not support system emulation" +#include "reg_fields.h" +typedef struct CPUHexagonTLBContext CPUHexagonTLBContext; #endif #define NUM_PREGS 4 @@ -38,10 +45,47 @@ #define REG_WRITES_MAX 32 #define PRED_WRITES_MAX 5 /* 4 insns + endloop */ #define VSTORES_MAX 2 +#define THREADS_MAX 8 +#define VECTOR_UNIT_MAX 8 -#define CPU_RESOLVING_TYPE TYPE_HEXAGON_CPU +#ifndef CONFIG_USER_ONLY +#define CPU_INTERRUPT_SWI CPU_INTERRUPT_TGT_INT_0 +#define CPU_INTERRUPT_K0_UNLOCK CPU_INTERRUPT_TGT_INT_1 +#define CPU_INTERRUPT_TLB_UNLOCK CPU_INTERRUPT_TGT_INT_2 + +#define HEX_CPU_MODE_USER 1 +#define HEX_CPU_MODE_GUEST 2 +#define HEX_CPU_MODE_MONITOR 3 + +#define HEX_EXE_MODE_OFF 1 +#define HEX_EXE_MODE_RUN 2 +#define HEX_EXE_MODE_WAIT 3 +#define HEX_EXE_MODE_DEBUG 4 +#endif -#define MMU_USER_IDX 0 +#define MMU_USER_IDX 0 +#ifndef CONFIG_USER_ONLY +#define MMU_GUEST_IDX 1 +#define MMU_KERNEL_IDX 2 + +#define HEXAGON_CPU_IRQ_0 0 +#define HEXAGON_CPU_IRQ_1 1 +#define HEXAGON_CPU_IRQ_2 2 +#define HEXAGON_CPU_IRQ_3 3 +#define HEXAGON_CPU_IRQ_4 4 +#define HEXAGON_CPU_IRQ_5 5 +#define HEXAGON_CPU_IRQ_6 6 +#define HEXAGON_CPU_IRQ_7 7 + +typedef enum { + HEX_LOCK_UNLOCKED = 0, + HEX_LOCK_WAITING = 1, + HEX_LOCK_OWNER = 2, + HEX_LOCK_QUEUED = 3 +} hex_lock_state_t; +#endif + +#define CPU_RESOLVING_TYPE TYPE_HEXAGON_CPU typedef struct { target_ulong va; @@ -75,12 +119,33 @@ typedef struct { typedef struct CPUArchState { target_ulong gpr[TOTAL_PER_THREAD_REGS]; target_ulong pred[NUM_PREGS]; + target_ulong cause_code; /* For comparing with LLDB on target - see adjust_stack_ptrs function */ target_ulong last_pc_dumped; target_ulong stack_start; uint8_t slot_cancelled; + uint64_t t_cycle_count; + uint64_t *g_pcycle_base; +#ifndef CONFIG_USER_ONLY + /* Some system registers are per thread and some are global. 
*/ + target_ulong t_sreg[NUM_SREGS]; + target_ulong *g_sreg; + + target_ulong greg[NUM_GREGS]; + target_ulong wait_next_pc; + + /* This alias of CPUState.cpu_index is used by imported sources: */ + target_ulong threadId; + hex_lock_state_t tlb_lock_state; + hex_lock_state_t k0_lock_state; + target_ulong tlb_lock_count; + target_ulong k0_lock_count; + CPUHexagonTLBContext *hex_tlb; + GList *dir_list; +#endif + target_ulong next_PC; target_ulong new_value_usr; MemLog mem_log_stores[STORES_MAX]; @@ -123,19 +188,48 @@ struct ArchCPU { CPUHexagonState env; + uint32_t rev_reg; bool lldb_compat; target_ulong lldb_stack_adjust; bool short_circuit; +#ifndef CONFIG_USER_ONLY + uint32_t num_tlbs; + uint32_t l2vic_base_addr; + uint32_t qtimer_base_addr; + uint32_t hvx_contexts; + uint32_t boot_addr; + uint64_t config_table_addr; +#endif }; #include "cpu_bits.h" FIELD(TB_FLAGS, IS_TIGHT_LOOP, 0, 1) +FIELD(TB_FLAGS, MMU_INDEX, 1, 3) +FIELD(TB_FLAGS, PCYCLE_ENABLED, 4, 1) G_NORETURN void hexagon_raise_exception_err(CPUHexagonState *env, uint32_t exception, uintptr_t pc); +#ifndef CONFIG_USER_ONLY +/* + * @return true if the @a thread_env hardware thread is + * not stopped. + */ +bool hexagon_thread_is_enabled(CPUHexagonState *thread_env); +uint32_t hexagon_greg_read(CPUHexagonState *env, uint32_t reg); +uint32_t hexagon_sreg_read(CPUHexagonState *env, uint32_t reg); +void hexagon_gdb_sreg_write(CPUHexagonState *env, uint32_t reg, uint32_t val); +void hexagon_cpu_soft_reset(CPUHexagonState *env); +#endif + +#include "exec/cpu-all.h" + +#ifndef CONFIG_USER_ONLY +#include "cpu_helper.h" +#endif + static inline void cpu_get_tb_cpu_state(CPUHexagonState *env, vaddr *pc, uint64_t *cs_base, uint32_t *flags) { @@ -145,10 +239,27 @@ static inline void cpu_get_tb_cpu_state(CPUHexagonState *env, vaddr *pc, if (*pc == env->gpr[HEX_REG_SA0]) { hex_flags = FIELD_DP32(hex_flags, TB_FLAGS, IS_TIGHT_LOOP, 1); } - *flags = hex_flags; if (*pc & PCALIGN_MASK) { hexagon_raise_exception_err(env, HEX_CAUSE_PC_NOT_ALIGNED, 0); } +#ifndef CONFIG_USER_ONLY + target_ulong syscfg = arch_get_system_reg(env, HEX_SREG_SYSCFG); + + bool pcycle_enabled = extract32(syscfg, + reg_field_info[SYSCFG_PCYCLEEN].offset, + reg_field_info[SYSCFG_PCYCLEEN].width); + + hex_flags = FIELD_DP32(hex_flags, TB_FLAGS, MMU_INDEX, + cpu_mmu_index(env_cpu(env), false)); + + if (pcycle_enabled) { + hex_flags = FIELD_DP32(hex_flags, TB_FLAGS, PCYCLE_ENABLED, 1); + } +#else + hex_flags = FIELD_DP32(hex_flags, TB_FLAGS, PCYCLE_ENABLED, true); + hex_flags = FIELD_DP32(hex_flags, TB_FLAGS, MMU_INDEX, MMU_USER_IDX); +#endif + *flags = hex_flags; } typedef HexagonCPU ArchCPU; @@ -156,7 +267,4 @@ typedef HexagonCPU ArchCPU; void hexagon_translate_init(void); void hexagon_translate_code(CPUState *cs, TranslationBlock *tb, int *max_insns, vaddr pc, void *host_pc); - -#include "exec/cpu-all.h" - #endif /* HEXAGON_CPU_H */ diff --git a/target/hexagon/cpu_bits.h b/target/hexagon/cpu_bits.h index ff596e2a94c9..c7cc426ec888 100644 --- a/target/hexagon/cpu_bits.h +++ b/target/hexagon/cpu_bits.h @@ -24,19 +24,88 @@ #define PCALIGN_MASK (PCALIGN - 1) enum hex_event { - HEX_EVENT_NONE = -1, - HEX_EVENT_TRAP0 = 0x008, + HEX_EVENT_NONE = -1, + HEX_EVENT_RESET = 0x0, + HEX_EVENT_IMPRECISE = 0x1, + HEX_EVENT_PRECISE = 0x2, + HEX_EVENT_TLB_MISS_X = 0x4, + HEX_EVENT_TLB_MISS_RW = 0x6, + HEX_EVENT_TRAP0 = 0x8, + HEX_EVENT_TRAP1 = 0x9, + HEX_EVENT_FPTRAP = 0xb, + HEX_EVENT_DEBUG = 0xc, + HEX_EVENT_INT0 = 0x10, + HEX_EVENT_INT1 = 0x11, + HEX_EVENT_INT2 = 0x12, + HEX_EVENT_INT3 = 0x13, 
+ HEX_EVENT_INT4 = 0x14, + HEX_EVENT_INT5 = 0x15, + HEX_EVENT_INT6 = 0x16, + HEX_EVENT_INT7 = 0x17, + HEX_EVENT_INT8 = 0x18, + HEX_EVENT_INT9 = 0x19, + HEX_EVENT_INTA = 0x1a, + HEX_EVENT_INTB = 0x1b, + HEX_EVENT_INTC = 0x1c, + HEX_EVENT_INTD = 0x1d, + HEX_EVENT_INTE = 0x1e, + HEX_EVENT_INTF = 0x1f, }; enum hex_cause { HEX_CAUSE_NONE = -1, - HEX_CAUSE_TRAP0 = 0x172, - HEX_CAUSE_FETCH_NO_UPAGE = 0x012, - HEX_CAUSE_INVALID_PACKET = 0x015, - HEX_CAUSE_INVALID_OPCODE = 0x015, - HEX_CAUSE_PC_NOT_ALIGNED = 0x01e, - HEX_CAUSE_PRIV_NO_UREAD = 0x024, - HEX_CAUSE_PRIV_NO_UWRITE = 0x025, + HEX_CAUSE_RESET = 0x000, + HEX_CAUSE_BIU_PRECISE = 0x001, + HEX_CAUSE_UNSUPORTED_HVX_64B = 0x002, /* QEMU-specific */ + HEX_CAUSE_DOUBLE_EXCEPT = 0x003, + HEX_CAUSE_TRAP0 = 0x008, + HEX_CAUSE_TRAP1 = 0x009, + HEX_CAUSE_FETCH_NO_XPAGE = 0x011, + HEX_CAUSE_FETCH_NO_UPAGE = 0x012, + HEX_CAUSE_INVALID_PACKET = 0x015, + HEX_CAUSE_INVALID_OPCODE = 0x015, + HEX_CAUSE_NO_COPROC_ENABLE = 0x016, + HEX_CAUSE_NO_COPROC2_ENABLE = 0x018, + HEX_CAUSE_PRIV_USER_NO_GINSN = 0x01a, + HEX_CAUSE_PRIV_USER_NO_SINSN = 0x01b, + HEX_CAUSE_REG_WRITE_CONFLICT = 0x01d, + HEX_CAUSE_PC_NOT_ALIGNED = 0x01e, + HEX_CAUSE_MISALIGNED_LOAD = 0x020, + HEX_CAUSE_MISALIGNED_STORE = 0x021, + HEX_CAUSE_PRIV_NO_READ = 0x022, + HEX_CAUSE_PRIV_NO_WRITE = 0x023, + HEX_CAUSE_PRIV_NO_UREAD = 0x024, + HEX_CAUSE_PRIV_NO_UWRITE = 0x025, + HEX_CAUSE_COPROC_LDST = 0x026, + HEX_CAUSE_STACK_LIMIT = 0x027, + HEX_CAUSE_VWCTRL_WINDOW_MISS = 0x029, + HEX_CAUSE_IMPRECISE_NMI = 0x043, + HEX_CAUSE_IMPRECISE_MULTI_TLB_MATCH = 0x044, + HEX_CAUSE_TLBMISSX_CAUSE_NORMAL = 0x060, + HEX_CAUSE_TLBMISSX_CAUSE_NEXTPAGE = 0x061, + HEX_CAUSE_TLBMISSRW_CAUSE_READ = 0x070, + HEX_CAUSE_TLBMISSRW_CAUSE_WRITE = 0x071, + HEX_CAUSE_DEBUG_SINGLESTEP = 0x80, + HEX_CAUSE_FPTRAP_CAUSE_BADFLOAT = 0x0bf, + HEX_CAUSE_INT0 = 0x0c0, + HEX_CAUSE_INT1 = 0x0c1, + HEX_CAUSE_INT2 = 0x0c2, + HEX_CAUSE_INT3 = 0x0c3, + HEX_CAUSE_INT4 = 0x0c4, + HEX_CAUSE_INT5 = 0x0c5, + HEX_CAUSE_INT6 = 0x0c6, + HEX_CAUSE_INT7 = 0x0c7, + HEX_CAUSE_VIC0 = 0x0c2, + HEX_CAUSE_VIC1 = 0x0c3, + HEX_CAUSE_VIC2 = 0x0c4, + HEX_CAUSE_VIC3 = 0x0c5, +}; + +enum data_cache_state { + HEX_DC_STATE_INVALID = 0x0, + HEX_DC_STATE_VALID = 0x1, + HEX_DC_STATE_RESERVED = 0x2, + HEX_DC_STATE_UNUSED_WT = 0x3, }; #define PACKET_WORDS_MAX 4 diff --git a/target/hexagon/cpu_helper.c b/target/hexagon/cpu_helper.c new file mode 100644 index 000000000000..5a651de0514f --- /dev/null +++ b/target/hexagon/cpu_helper.c @@ -0,0 +1,582 @@ +/* + * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "cpu_helper.h" +#include "system/cpus.h" +#ifdef CONFIG_USER_ONLY +#include "qemu.h" +#include "exec/helper-proto.h" +#else +#include "hw/boards.h" +#include "hw/hexagon/hexagon.h" +#include "hex_interrupts.h" +#include "hex_mmu.h" +#endif +#include "exec/exec-all.h" +#include "exec/cpu_ldst.h" +#include "qemu/log.h" +#include "tcg/tcg-op.h" +#include "internal.h" +#include "macros.h" +#include "sys_macros.h" +#include "arch.h" + + +#ifndef CONFIG_USER_ONLY + +static bool hexagon_read_memory_small(CPUHexagonState *env, target_ulong addr, + int byte_count, unsigned char *dstbuf, + int mmu_idx, uintptr_t retaddr) + { + /* handle small sizes */ + switch (byte_count) { + case 1: + *dstbuf = cpu_ldub_mmuidx_ra(env, addr, mmu_idx, retaddr); + return true; + + case 2: + if (QEMU_IS_ALIGNED(addr, 2)) { + *(unsigned short *)dstbuf = + cpu_lduw_mmuidx_ra(env, addr, mmu_idx, retaddr); + return true; + } + break; + + case 4: + if (QEMU_IS_ALIGNED(addr, 4)) { + *(uint32_t *)dstbuf = + cpu_ldl_mmuidx_ra(env, addr, mmu_idx, retaddr); + return true; + } + break; + + case 8: + if (QEMU_IS_ALIGNED(addr, 8)) { + *(uint64_t *)dstbuf = + cpu_ldq_mmuidx_ra(env, addr, mmu_idx, retaddr); + return true; + } + break; + + default: + /* larger request, handle elsewhere */ + return false; + } + + /* not aligned, copy bytes */ + for (int i = 0; i < byte_count; ++i) { + *dstbuf++ = cpu_ldub_mmuidx_ra(env, addr++, mmu_idx, retaddr); + } + return true; +} + +void hexagon_read_memory(CPUHexagonState *env, target_ulong vaddr, int size, + void *retptr, uintptr_t retaddr) +{ + BQL_LOCK_GUARD(); + CPUState *cs = env_cpu(env); + unsigned mmu_idx = cpu_mmu_index(cs, false); + if (!hexagon_read_memory_small(env, vaddr, size, retptr, mmu_idx, retaddr)) { + cpu_abort(cs, "%s: ERROR: bad size = %d!\n", __func__, size); + } +} + +static bool hexagon_write_memory_small(CPUHexagonState *env, target_ulong addr, + int byte_count, unsigned char *srcbuf, + int mmu_idx, uintptr_t retaddr) +{ + /* handle small sizes */ + switch (byte_count) { + case 1: + cpu_stb_mmuidx_ra(env, addr, *srcbuf, mmu_idx, retaddr); + return true; + + case 2: + if (QEMU_IS_ALIGNED(addr, 2)) { + cpu_stw_mmuidx_ra(env, addr, *(uint16_t *)srcbuf, mmu_idx, retaddr); + return true; + } + break; + + case 4: + if (QEMU_IS_ALIGNED(addr, 4)) { + cpu_stl_mmuidx_ra(env, addr, *(uint32_t *)srcbuf, mmu_idx, retaddr); + return true; + } + break; + + case 8: + if (QEMU_IS_ALIGNED(addr, 8)) { + cpu_stq_mmuidx_ra(env, addr, *(uint64_t *)srcbuf, mmu_idx, retaddr); + return true; + } + break; + + default: + /* larger request, handle elsewhere */ + return false; + } + + /* not aligned, copy bytes */ + for (int i = 0; i < byte_count; ++i) { + cpu_stb_mmuidx_ra(env, addr++, *srcbuf++, mmu_idx, retaddr); + } + + return true; +} + +void hexagon_write_memory(CPUHexagonState *env, target_ulong vaddr, + int size, uint64_t data, uintptr_t retaddr) +{ + CPUState *cs = env_cpu(env); + unsigned mmu_idx = cpu_mmu_index(cs, false); + if (!hexagon_write_memory_small(env, vaddr, size, (unsigned char *)&data, + mmu_idx, retaddr)) { + cpu_abort(cs, "%s: ERROR: bad size = %d!\n", __func__, size); + } +} + +static inline uint32_t page_start(uint32_t addr) +{ + uint32_t page_align = ~(TARGET_PAGE_SIZE - 1); + return addr & page_align; +} + +void hexagon_touch_memory(CPUHexagonState *env, uint32_t start_addr, + uint32_t length, uintptr_t retaddr) +{ + unsigned int warm; + uint32_t first = 
page_start(start_addr);
+    uint32_t last = page_start(start_addr + length - 1);
+    for (uint32_t page = first; page <= last; page += TARGET_PAGE_SIZE) {
+        hexagon_read_memory(env, page, 1, &warm, retaddr);
+    }
+}
+
+uint32_t hexagon_get_pmu_counter(CPUHexagonState *cur_env, int index)
+{
+    g_assert_not_reached();
+}
+
+uint32_t arch_get_system_reg(CPUHexagonState *env, uint32_t reg)
+{
+    if (reg == HEX_SREG_PCYCLELO) {
+        return hexagon_get_sys_pcycle_count_low(env);
+    } else if (reg == HEX_SREG_PCYCLEHI) {
+        return hexagon_get_sys_pcycle_count_high(env);
+    }
+
+    g_assert(reg < NUM_SREGS);
+    return reg < HEX_SREG_GLB_START ? env->t_sreg[reg] : env->g_sreg[reg];
+}
+
+#endif
+
+uint64_t hexagon_get_sys_pcycle_count(CPUHexagonState *env)
+{
+    uint64_t cycles = 0;
+    CPUState *cs;
+    CPU_FOREACH(cs) {
+        CPUHexagonState *env_ = cpu_env(cs);
+        cycles += env_->t_cycle_count;
+    }
+    return *(env->g_pcycle_base) + cycles;
+}
+
+uint32_t hexagon_get_sys_pcycle_count_high(CPUHexagonState *env)
+{
+    return hexagon_get_sys_pcycle_count(env) >> 32;
+}
+
+uint32_t hexagon_get_sys_pcycle_count_low(CPUHexagonState *env)
+{
+    return extract64(hexagon_get_sys_pcycle_count(env), 0, 32);
+}
+
+void hexagon_set_sys_pcycle_count_high(CPUHexagonState *env,
+                                       uint32_t cycles_hi)
+{
+    uint64_t cur_cycles = hexagon_get_sys_pcycle_count(env);
+    uint64_t cycles =
+        ((uint64_t)cycles_hi << 32) | extract64(cur_cycles, 0, 32);
+    hexagon_set_sys_pcycle_count(env, cycles);
+}
+
+void hexagon_set_sys_pcycle_count_low(CPUHexagonState *env,
+                                      uint32_t cycles_lo)
+{
+    uint64_t cur_cycles = hexagon_get_sys_pcycle_count(env);
+    /* Keep the high word in place while replacing the low word */
+    uint64_t cycles = (extract64(cur_cycles, 32, 32) << 32) | cycles_lo;
+    hexagon_set_sys_pcycle_count(env, cycles);
+}
+
+void hexagon_set_sys_pcycle_count(CPUHexagonState *env, uint64_t cycles)
+{
+    *(env->g_pcycle_base) = cycles;
+
+    CPUState *cs;
+    CPU_FOREACH(cs) {
+        CPUHexagonState *env_ = cpu_env(cs);
+        env_->t_cycle_count = 0;
+    }
+}
+
+#ifndef CONFIG_USER_ONLY
+
+static void set_wait_mode(CPUHexagonState *env)
+{
+    g_assert(bql_locked());
+
+    const uint32_t modectl = arch_get_system_reg(env, HEX_SREG_MODECTL);
+    uint32_t thread_wait_mask = GET_FIELD(MODECTL_W, modectl);
+    thread_wait_mask |= 0x1 << env->threadId;
+    SET_SYSTEM_FIELD(env, HEX_SREG_MODECTL, MODECTL_W, thread_wait_mask);
+}
+
+void hexagon_wait_thread(CPUHexagonState *env, target_ulong PC)
+{
+    g_assert(bql_locked());
+
+    if (qemu_loglevel_mask(LOG_GUEST_ERROR) &&
+        (env->k0_lock_state != HEX_LOCK_UNLOCKED ||
+         env->tlb_lock_state != HEX_LOCK_UNLOCKED)) {
+        qemu_log("WARNING: executing wait() with an acquired lock "
+                 "may lead to deadlock\n");
+    }
+    g_assert(get_exe_mode(env) != HEX_EXE_MODE_WAIT);
+
+    CPUState *cs = env_cpu(env);
+    /*
+     * The addition of cpu_has_work is borrowed from arm's wfi helper
+     * and is critical for our stability.
+     */
+    if ((cs->exception_index != HEX_EVENT_NONE) ||
+        (cpu_has_work(cs))) {
+        qemu_log_mask(CPU_LOG_INT,
+                      "%s: thread %d skipping WAIT mode, have some work\n",
+                      __func__, env->threadId);
+        return;
+    }
+    set_wait_mode(env);
+    env->wait_next_pc = PC + 4;
+
+    cpu_interrupt(cs, CPU_INTERRUPT_HALT);
+}
+
+static void hexagon_resume_thread(CPUHexagonState *env)
+{
+    CPUState *cs = env_cpu(env);
+    clear_wait_mode(env);
+    /*
+     * The wait instruction keeps the PC pointing to itself
+     * so that it has an opportunity to check for interrupts.
+     *
+     * When we come out of wait mode, adjust the PC to the
+     * next executable instruction.
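+     * (wait_next_pc was recorded as PC + 4 by hexagon_wait_thread()
+     * when this thread entered wait mode.)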
+     */
+    env->gpr[HEX_REG_PC] = env->wait_next_pc;
+    ASSERT_DIRECT_TO_GUEST_UNSET(env, cs->exception_index);
+    cs->halted = false;
+    cs->exception_index = HEX_EVENT_NONE;
+    qemu_cpu_kick(cs);
+}
+
+void hexagon_resume_threads(CPUHexagonState *current_env, uint32_t mask)
+{
+    CPUState *cs;
+    CPUHexagonState *env;
+
+    g_assert(bql_locked());
+    CPU_FOREACH(cs) {
+        env = cpu_env(cs);
+        g_assert(env->threadId < THREADS_MAX);
+        if ((mask & (0x1 << env->threadId))) {
+            if (get_exe_mode(env) == HEX_EXE_MODE_WAIT) {
+                hexagon_resume_thread(env);
+            }
+        }
+    }
+}
+
+
+static MMVector VRegs[VECTOR_UNIT_MAX][NUM_VREGS];
+static MMQReg QRegs[VECTOR_UNIT_MAX][NUM_QREGS];
+
+/*
+ * EXT_CONTEXTS
+ * SSR.XA  2              4              6              8
+ * 000     HVX Context 0  HVX Context 0  HVX Context 0  HVX Context 0
+ * 001     HVX Context 1  HVX Context 1  HVX Context 1  HVX Context 1
+ * 010     HVX Context 0  HVX Context 2  HVX Context 2  HVX Context 2
+ * 011     HVX Context 1  HVX Context 3  HVX Context 3  HVX Context 3
+ * 100     HVX Context 0  HVX Context 0  HVX Context 4  HVX Context 4
+ * 101     HVX Context 1  HVX Context 1  HVX Context 5  HVX Context 5
+ * 110     HVX Context 0  HVX Context 2  HVX Context 2  HVX Context 6
+ * 111     HVX Context 1  HVX Context 3  HVX Context 3  HVX Context 7
+ */
+static int parse_context_idx(CPUHexagonState *env, uint8_t XA)
+{
+    int ret;
+    HexagonCPU *cpu = env_archcpu(env);
+    if (cpu->hvx_contexts == 6 && XA >= 6) {
+        ret = XA - 6 + 2;
+    } else {
+        ret = XA % cpu->hvx_contexts;
+    }
+    g_assert(ret >= 0 && ret < VECTOR_UNIT_MAX);
+    return ret;
+}
+
+static void check_overcommitted_hvx(CPUHexagonState *env, uint32_t ssr)
+{
+    if (!GET_FIELD(SSR_XE, ssr)) {
+        return;
+    }
+
+    uint8_t XA = GET_SSR_FIELD(SSR_XA, ssr);
+
+    CPUState *cs;
+    CPU_FOREACH(cs) {
+        CPUHexagonState *env_ = cpu_env(cs);
+        if (env_ == env) {
+            continue;
+        }
+        /* Check if another thread has the XE bit set and same XA */
+        uint32_t ssr_ = arch_get_system_reg(env_, HEX_SREG_SSR);
+        if (GET_SSR_FIELD(SSR_XE2, ssr_) && GET_FIELD(SSR_XA, ssr_) == XA) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "setting SSR.XA '%d' on thread %d but thread"
+                          " %d has same extension active\n", XA, env->threadId,
+                          env_->threadId);
+        }
+    }
+}
+
+void hexagon_modify_ssr(CPUHexagonState *env, uint32_t new, uint32_t old)
+{
+    g_assert(bql_locked());
+
+    bool old_EX = GET_SSR_FIELD(SSR_EX, old);
+    bool old_UM = GET_SSR_FIELD(SSR_UM, old);
+    bool old_GM = GET_SSR_FIELD(SSR_GM, old);
+    bool old_IE = GET_SSR_FIELD(SSR_IE, old);
+    uint8_t old_XA = GET_SSR_FIELD(SSR_XA, old);
+    bool new_EX = GET_SSR_FIELD(SSR_EX, new);
+    bool new_UM = GET_SSR_FIELD(SSR_UM, new);
+    bool new_GM = GET_SSR_FIELD(SSR_GM, new);
+    bool new_IE = GET_SSR_FIELD(SSR_IE, new);
+    uint8_t new_XA = GET_SSR_FIELD(SSR_XA, new);
+
+    if ((old_EX != new_EX) ||
+        (old_UM != new_UM) ||
+        (old_GM != new_GM)) {
+        hex_mmu_mode_change(env);
+    }
+
+    uint8_t old_asid = GET_SSR_FIELD(SSR_ASID, old);
+    uint8_t new_asid = GET_SSR_FIELD(SSR_ASID, new);
+    if (new_asid != old_asid) {
+        CPUState *cs = env_cpu(env);
+        tlb_flush(cs);
+    }
+
+    if (old_XA != new_XA) {
+        int old_unit = parse_context_idx(env, old_XA);
+        int new_unit = parse_context_idx(env, new_XA);
+
+        /* Ownership exchange */
+        memcpy(VRegs[old_unit], env->VRegs, sizeof(env->VRegs));
+        memcpy(QRegs[old_unit], env->QRegs, sizeof(env->QRegs));
+        memcpy(env->VRegs, VRegs[new_unit], sizeof(env->VRegs));
+        memcpy(env->QRegs, QRegs[new_unit], sizeof(env->QRegs));
+
+        check_overcommitted_hvx(env, new);
+    }
+
+    /* See if the interrupts have been enabled or we have exited EX mode */
+    if ((new_IE && 
!old_IE) || + (!new_EX && old_EX)) { + hex_interrupt_update(env); + } +} + +void clear_wait_mode(CPUHexagonState *env) +{ + g_assert(bql_locked()); + + const uint32_t modectl = arch_get_system_reg(env, HEX_SREG_MODECTL); + uint32_t thread_wait_mask = GET_FIELD(MODECTL_W, modectl); + thread_wait_mask &= ~(0x1 << env->threadId); + SET_SYSTEM_FIELD(env, HEX_SREG_MODECTL, MODECTL_W, thread_wait_mask); +} + +void hexagon_ssr_set_cause(CPUHexagonState *env, uint32_t cause) +{ + g_assert(bql_locked()); + + const uint32_t old = arch_get_system_reg(env, HEX_SREG_SSR); + SET_SYSTEM_FIELD(env, HEX_SREG_SSR, SSR_EX, 1); + SET_SYSTEM_FIELD(env, HEX_SREG_SSR, SSR_CAUSE, cause); + const uint32_t new = arch_get_system_reg(env, HEX_SREG_SSR); + + hexagon_modify_ssr(env, new, old); +} + + +int get_exe_mode(CPUHexagonState *env) +{ + g_assert(bql_locked()); + + target_ulong modectl = arch_get_system_reg(env, HEX_SREG_MODECTL); + uint32_t thread_enabled_mask = GET_FIELD(MODECTL_E, modectl); + bool E_bit = thread_enabled_mask & (0x1 << env->threadId); + uint32_t thread_wait_mask = GET_FIELD(MODECTL_W, modectl); + bool W_bit = thread_wait_mask & (0x1 << env->threadId); + target_ulong isdbst = arch_get_system_reg(env, HEX_SREG_ISDBST); + uint32_t debugmode = GET_FIELD(ISDBST_DEBUGMODE, isdbst); + bool D_bit = debugmode & (0x1 << env->threadId); + + /* Figure 4-2 */ + if (!D_bit && !W_bit && !E_bit) { + return HEX_EXE_MODE_OFF; + } + if (!D_bit && !W_bit && E_bit) { + return HEX_EXE_MODE_RUN; + } + if (!D_bit && W_bit && E_bit) { + return HEX_EXE_MODE_WAIT; + } + if (D_bit && !W_bit && E_bit) { + return HEX_EXE_MODE_DEBUG; + } + g_assert_not_reached(); +} + +static void set_enable_mask(CPUHexagonState *env) +{ + g_assert(bql_locked()); + + const uint32_t modectl = arch_get_system_reg(env, HEX_SREG_MODECTL); + uint32_t thread_enabled_mask = GET_FIELD(MODECTL_E, modectl); + thread_enabled_mask |= 0x1 << env->threadId; + SET_SYSTEM_FIELD(env, HEX_SREG_MODECTL, MODECTL_E, thread_enabled_mask); +} + +static uint32_t clear_enable_mask(CPUHexagonState *env) +{ + g_assert(bql_locked()); + + const uint32_t modectl = arch_get_system_reg(env, HEX_SREG_MODECTL); + uint32_t thread_enabled_mask = GET_FIELD(MODECTL_E, modectl); + thread_enabled_mask &= ~(0x1 << env->threadId); + SET_SYSTEM_FIELD(env, HEX_SREG_MODECTL, MODECTL_E, thread_enabled_mask); + return thread_enabled_mask; +} +static void do_start_thread(CPUState *cs, run_on_cpu_data tbd) +{ + BQL_LOCK_GUARD(); + + CPUHexagonState *env = cpu_env(cs); + + hexagon_cpu_soft_reset(env); + + set_enable_mask(env); + + cs->halted = 0; + cs->exception_index = HEX_EVENT_NONE; + cpu_resume(cs); +} + +void hexagon_start_threads(CPUHexagonState *current_env, uint32_t mask) +{ + CPUState *cs; + CPU_FOREACH(cs) { + CPUHexagonState *env = cpu_env(cs); + if (!(mask & (0x1 << env->threadId))) { + continue; + } + + if (current_env->threadId != env->threadId) { + async_safe_run_on_cpu(cs, do_start_thread, RUN_ON_CPU_NULL); + } + } +} + +/* + * When we have all threads stopped, the return + * value to the shell is register 2 from thread 0. 
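+ * The guest runtime is expected to have placed the exit status in r2
+ * before stopping the last thread; hexagon_stop_thread() passes that
+ * value straight to exit().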
+ */
+static target_ulong get_thread0_r2(void)
+{
+    CPUState *cs;
+    CPU_FOREACH(cs) {
+        CPUHexagonState *thread = cpu_env(cs);
+        if (thread->threadId == 0) {
+            return thread->gpr[2];
+        }
+    }
+    g_assert_not_reached();
+}
+
+void hexagon_stop_thread(CPUHexagonState *env)
+{
+    BQL_LOCK_GUARD();
+
+    uint32_t thread_enabled_mask = clear_enable_mask(env);
+    CPUState *cs = env_cpu(env);
+    cpu_interrupt(cs, CPU_INTERRUPT_HALT);
+    if (!thread_enabled_mask) {
+        /* All threads are stopped, exit */
+        exit(get_thread0_r2());
+    }
+}
+
+static bool sys_in_monitor_mode_ssr(uint32_t ssr)
+{
+    return GET_SSR_FIELD(SSR_EX, ssr) != 0 || GET_SSR_FIELD(SSR_UM, ssr) == 0;
+}
+
+static bool sys_in_guest_mode_ssr(uint32_t ssr)
+{
+    return GET_SSR_FIELD(SSR_EX, ssr) == 0 &&
+           GET_SSR_FIELD(SSR_UM, ssr) != 0 &&
+           GET_SSR_FIELD(SSR_GM, ssr) != 0;
+}
+
+static bool sys_in_user_mode_ssr(uint32_t ssr)
+{
+    return GET_SSR_FIELD(SSR_EX, ssr) == 0 &&
+           GET_SSR_FIELD(SSR_UM, ssr) != 0 &&
+           GET_SSR_FIELD(SSR_GM, ssr) == 0;
+}
+
+int get_cpu_mode(CPUHexagonState *env)
+{
+    uint32_t ssr = arch_get_system_reg(env, HEX_SREG_SSR);
+
+    if (sys_in_monitor_mode_ssr(ssr)) {
+        return HEX_CPU_MODE_MONITOR;
+    } else if (sys_in_guest_mode_ssr(ssr)) {
+        return HEX_CPU_MODE_GUEST;
+    } else if (sys_in_user_mode_ssr(ssr)) {
+        return HEX_CPU_MODE_USER;
+    }
+    return HEX_CPU_MODE_MONITOR;
+}
+
+#endif
diff --git a/target/hexagon/cpu_helper.h b/target/hexagon/cpu_helper.h
new file mode 100644
index 000000000000..f86f5e744fd4
--- /dev/null
+++ b/target/hexagon/cpu_helper.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HEXAGON_CPU_HELPER_H
+#define HEXAGON_CPU_HELPER_H
+
+void hexagon_read_memory(CPUHexagonState *env, target_ulong vaddr, int size,
+                         void *retptr, uintptr_t retaddr);
+void hexagon_write_memory(CPUHexagonState *env, target_ulong vaddr,
+                          int size, uint64_t data, uintptr_t retaddr);
+void hexagon_touch_memory(CPUHexagonState *env, uint32_t start_addr,
+                          uint32_t length, uintptr_t retaddr);
+uint32_t hexagon_get_pmu_counter(CPUHexagonState *cur_env, int index);
+uint64_t hexagon_get_sys_pcycle_count(CPUHexagonState *env);
+uint32_t hexagon_get_sys_pcycle_count_low(CPUHexagonState *env);
+uint32_t hexagon_get_sys_pcycle_count_high(CPUHexagonState *env);
+void hexagon_set_sys_pcycle_count(CPUHexagonState *env, uint64_t);
+void hexagon_set_sys_pcycle_count_low(CPUHexagonState *env, uint32_t);
+void hexagon_set_sys_pcycle_count_high(CPUHexagonState *env, uint32_t);
+void hexagon_modify_ssr(CPUHexagonState *env, uint32_t new, uint32_t old);
+int get_cpu_mode(CPUHexagonState *env);
+int get_exe_mode(CPUHexagonState *env);
+void clear_wait_mode(CPUHexagonState *env);
+void hexagon_ssr_set_cause(CPUHexagonState *env, uint32_t cause);
+void hexagon_start_threads(CPUHexagonState *env, uint32_t mask);
+void hexagon_stop_thread(CPUHexagonState *env);
+void hexagon_wait_thread(CPUHexagonState *env, target_ulong PC);
+void hexagon_resume_threads(CPUHexagonState *env, uint32_t mask);
+
+static inline void arch_set_thread_reg(CPUHexagonState *env, uint32_t reg,
+                                       uint32_t val)
+{
+    g_assert(reg < TOTAL_PER_THREAD_REGS);
+    env->gpr[reg] = val;
+}
+
+static inline uint32_t arch_get_thread_reg(CPUHexagonState *env, uint32_t reg)
+{
+    g_assert(reg < TOTAL_PER_THREAD_REGS);
+    return env->gpr[reg];
+}
+
+#ifndef CONFIG_USER_ONLY
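+/*
+ * System registers below HEX_SREG_GLB_START are per-thread state
+ * (t_sreg); the rest are chip-wide and live in the g_sreg array that
+ * every vCPU shares (CPU 0 allocates it in hexagon_cpu_realize() and
+ * the other threads alias it).
+ */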
+static inline void arch_set_system_reg(CPUHexagonState *env, uint32_t reg,
+                                       uint32_t val)
+{
+    g_assert(reg < NUM_SREGS);
+    if (reg < HEX_SREG_GLB_START) {
+        env->t_sreg[reg] = val;
+    } else {
+        env->g_sreg[reg] = val;
+    }
+}
+#endif
+
+uint32_t arch_get_system_reg(CPUHexagonState *env, uint32_t reg);
+
+#endif
diff --git a/target/hexagon/decode.c b/target/hexagon/decode.c
index 23deba2426f8..41bf03c9b513 100644
--- a/target/hexagon/decode.c
+++ b/target/hexagon/decode.c
@@ -193,6 +193,8 @@ static bool decode_opcode_can_jump(int opcode)
     if ((GET_ATTRIB(opcode, A_JUMP)) ||
         (GET_ATTRIB(opcode, A_CALL)) ||
         (opcode == J2_trap0) ||
+        (opcode == J2_trap1) ||
+        (opcode == J2_rte) ||
         (opcode == J2_pause)) {
         /* Exception to A_JUMP attribute */
         if (opcode == J4_hintjumpr) {
@@ -236,9 +238,9 @@ static void decode_set_insn_attr_fields(Packet *pkt)
         if (GET_ATTRIB(opcode, A_SCALAR_STORE) &&
             !GET_ATTRIB(opcode, A_MEMSIZE_0B)) {
             if (pkt->insn[i].slot == 0) {
-                pkt->pkt_has_store_s0 = true;
+                pkt->pkt_has_scalar_store_s0 = true;
             } else {
-                pkt->pkt_has_store_s1 = true;
+                pkt->pkt_has_scalar_store_s1 = true;
             }
         }
     }
@@ -371,6 +373,18 @@ static void decode_shuffle_for_execution(Packet *packet)
             break;
         }
     }
+    /*
+     * And at the very end, move any RTEs, since they update
+     * user/supervisor mode.
+     */
+#if !defined(CONFIG_USER_ONLY)
+    for (i = 0; i < last_insn; i++) {
+        if (packet->insn[i].opcode == J2_rte) {
+            decode_send_insn_to(packet, i, last_insn);
+            break;
+        }
+    }
+#endif
 }
 
 static void
diff --git a/target/hexagon/gdbstub.c b/target/hexagon/gdbstub.c
index 12d6b3bbcbb1..8476199b753e 100644
--- a/target/hexagon/gdbstub.c
+++ b/target/hexagon/gdbstub.c
@@ -76,6 +76,51 @@ int hexagon_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n)
     g_assert_not_reached();
 }
 
+#ifndef CONFIG_USER_ONLY
+int hexagon_sys_gdb_read_register(CPUState *cs, GByteArray *mem_buf, int n)
+{
+    CPUHexagonState *env = cpu_env(cs);
+
+    if (n < NUM_SREGS) {
+        return gdb_get_regl(mem_buf, hexagon_sreg_read(env, n));
+    }
+    n -= NUM_SREGS;
+
+    if (n < NUM_GREGS) {
+        return gdb_get_regl(mem_buf, hexagon_greg_read(env, n));
+    }
+    n -= NUM_GREGS;
+
+    n -= TOTAL_PER_THREAD_REGS;
+
+    if (n < NUM_PREGS) {
+        /* A read must copy out of env, not store into it */
+        return gdb_get_reg8(mem_buf, env->pred[n] & 0xff);
+    }
+
+    n -= NUM_PREGS;
+
+    g_assert_not_reached();
+}
+
+int hexagon_sys_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n)
+{
+    CPUHexagonState *env = cpu_env(cs);
+
+    if (n < NUM_SREGS) {
+        hexagon_gdb_sreg_write(env, n, ldtul_p(mem_buf));
+        return sizeof(target_ulong);
+    }
+    n -= NUM_SREGS;
+
+    if (n < NUM_GREGS) {
+        env->greg[n] = ldtul_p(mem_buf);
+        /* Return the number of bytes consumed, not the value written */
+        return sizeof(target_ulong);
+    }
+    n -= NUM_GREGS;
+
+    g_assert_not_reached();
+}
+#endif
+
 static int gdb_get_vreg(CPUHexagonState *env, GByteArray *mem_buf, int n)
 {
     int total = 0;
diff --git a/target/hexagon/gen_analyze_funcs.py b/target/hexagon/gen_analyze_funcs.py
index 3ac7cc2cfe57..dfdf5f3b87ba 100755
--- a/target/hexagon/gen_analyze_funcs.py
+++ b/target/hexagon/gen_analyze_funcs.py
@@ -22,6 +22,8 @@ import string
 import hex_common
 
+def has_analyze_func(reg, mode):
+    return callable(getattr(reg, f"analyze_{mode}", None))
 
 ##
 ## Generate the code to analyze the instruction
@@ -42,6 +44,14 @@ def gen_analyze_func(f, tag, regs, imms):
     f.write(f"static void analyze_{tag}(DisasContext *ctx)\n")
     f.write("{\n")
 
+    if hex_common.tag_ignore(tag):
+        f.write("}\n\n")
+        return
+
+    if ("A_PRIV" in hex_common.attribdict[tag] or
+        "A_GUEST" in hex_common.attribdict[tag]):
+        f.write("#ifndef CONFIG_USER_ONLY\n")
+
     f.write("    Insn *insn G_GNUC_UNUSED = ctx->insn;\n")
 
     if (hex_common.is_hvx_insn(tag)):
         if hex_common.has_hvx_helper(tag):
@@ -58,22 +68,27 @@ def gen_analyze_func(f, tag, regs, imms):
     for regno, register in enumerate(regs):
         reg_type, reg_id = register
         reg = hex_common.get_register(tag, reg_type, reg_id)
-        reg.decl_reg_num(f, regno)
+        if has_analyze_func(reg, "read") or has_analyze_func(reg, "write"):
+            reg.decl_reg_num(f, regno)
 
     ## Analyze the register reads
     for regno, register in enumerate(regs):
         reg_type, reg_id = register
         reg = hex_common.get_register(tag, reg_type, reg_id)
-        if reg.is_read():
+        if reg.is_read() and has_analyze_func(reg, "read"):
             reg.analyze_read(f, regno)
 
     ## Analyze the register writes
     for regno, register in enumerate(regs):
         reg_type, reg_id = register
         reg = hex_common.get_register(tag, reg_type, reg_id)
-        if reg.is_written():
+        if reg.is_written() and has_analyze_func(reg, "write"):
             reg.analyze_write(f, tag, regno)
 
+    if ("A_PRIV" in hex_common.attribdict[tag] or
+        "A_GUEST" in hex_common.attribdict[tag]):
+        f.write("#endif /* !CONFIG_USER_ONLY */\n")
+
     f.write("}\n\n")
diff --git a/target/hexagon/gen_helper_funcs.py b/target/hexagon/gen_helper_funcs.py
index c1f806ac4b25..32e3bac74625 100755
--- a/target/hexagon/gen_helper_funcs.py
+++ b/target/hexagon/gen_helper_funcs.py
@@ -69,7 +69,7 @@ def gen_helper_function(f, tag, tagregs, tagimms):
     if hex_common.need_slot(tag):
         if "A_LOAD" in hex_common.attribdict[tag]:
             f.write(hex_common.code_fmt(f"""\
-                bool pkt_has_store_s1 = slotval & 0x1;
+                bool pkt_has_scalar_store_s1 = slotval & 0x1;
             """))
         f.write(hex_common.code_fmt(f"""\
             uint32_t slot = slotval >> 1;
@@ -109,26 +109,23 @@ def main():
     tagimms = hex_common.get_tagimms()
 
     with open(args.out, "w") as f:
-        for tag in hex_common.tags:
-            ## Skip the priv instructions
-            if "A_PRIV" in hex_common.attribdict[tag]:
+        for tag in hex_common.get_user_tags():
+            if hex_common.tag_ignore(tag):
                 continue
-            ## Skip the guest instructions
-            if "A_GUEST" in hex_common.attribdict[tag]:
-                continue
-            ## Skip the diag instructions
-            if tag == "Y6_diag":
-                continue
-            if tag == "Y6_diag0":
+            if hex_common.skip_qemu_helper(tag):
                 continue
-            if tag == "Y6_diag1":
+            if hex_common.is_idef_parser_enabled(tag):
                 continue
+            gen_helper_function(f, tag, tagregs, tagimms)
+
+        f.write("#if !defined(CONFIG_USER_ONLY)\n")
+        for tag in hex_common.get_sys_tags():
             if hex_common.skip_qemu_helper(tag):
                 continue
             if hex_common.is_idef_parser_enabled(tag):
                 continue
             gen_helper_function(f, tag, tagregs, tagimms)
+        f.write("#endif\n")
 
 
 if __name__ == "__main__":
diff --git a/target/hexagon/gen_helper_protos.py b/target/hexagon/gen_helper_protos.py
index 77f8e0a6a322..59c8bdd05c0f 100755
--- a/target/hexagon/gen_helper_protos.py
+++ b/target/hexagon/gen_helper_protos.py
@@ -59,27 +59,28 @@ def main():
     tagimms = hex_common.get_tagimms()
 
     with open(args.out, "w") as f:
-        for tag in hex_common.tags:
-            ## Skip the priv instructions
-            if "A_PRIV" in hex_common.attribdict[tag]:
+        for tag in hex_common.get_user_tags():
+            if hex_common.tag_ignore(tag):
                 continue
-            ## Skip the guest instructions
-            if "A_GUEST" in hex_common.attribdict[tag]:
-                continue
-            ## Skip the diag instructions
-            if tag == "Y6_diag":
-                continue
-            if tag == "Y6_diag0":
+
+            if hex_common.skip_qemu_helper(tag):
                 continue
-            if tag == "Y6_diag1":
+            if hex_common.is_idef_parser_enabled(tag):
                 continue
+            gen_helper_prototype(f, tag, tagregs, tagimms)
+
+        f.write("#if !defined(CONFIG_USER_ONLY)\n")
+        for tag in hex_common.get_sys_tags():
+            if hex_common.tag_ignore(tag):
+                continue
if hex_common.skip_qemu_helper(tag): continue if hex_common.is_idef_parser_enabled(tag): continue gen_helper_prototype(f, tag, tagregs, tagimms) + f.write("#endif\n") if __name__ == "__main__": diff --git a/target/hexagon/gen_idef_parser_funcs.py b/target/hexagon/gen_idef_parser_funcs.py index 2f6e826f76d6..32bce9b00286 100644 --- a/target/hexagon/gen_idef_parser_funcs.py +++ b/target/hexagon/gen_idef_parser_funcs.py @@ -60,6 +60,8 @@ def main(): f.write('#include "macros.h.inc"\n\n') for tag in hex_common.tags: + if hex_common.tag_ignore(tag): + continue ## Skip the priv instructions if "A_PRIV" in hex_common.attribdict[tag]: continue diff --git a/target/hexagon/gen_op_attribs.py b/target/hexagon/gen_op_attribs.py index bbbb02df3a23..94dd1f876b21 100755 --- a/target/hexagon/gen_op_attribs.py +++ b/target/hexagon/gen_op_attribs.py @@ -38,7 +38,7 @@ def main(): ## Generate all the attributes associated with each instruction ## with open(args.out, "w") as f: - for tag in hex_common.tags: + for tag in hex_common.get_all_tags(): f.write( f"OP_ATTRIB({tag},ATTRIBS(" f'{",".join(sorted(hex_common.attribdict[tag]))}))\n' diff --git a/target/hexagon/gen_opcodes_def.py b/target/hexagon/gen_opcodes_def.py index 94a19ff412e2..17ba3f9db95e 100755 --- a/target/hexagon/gen_opcodes_def.py +++ b/target/hexagon/gen_opcodes_def.py @@ -37,7 +37,10 @@ def main(): ## Generate a list of all the opcodes ## with open(args.out, "w") as f: - for tag in hex_common.tags: + for tag in hex_common.get_user_tags(): + f.write(f"OPCODE({tag}),\n") + + for tag in hex_common.get_sys_tags(): f.write(f"OPCODE({tag}),\n") diff --git a/target/hexagon/gen_semantics.c b/target/hexagon/gen_semantics.c index 4a2bdd70e9cc..ed66ae4ec241 100644 --- a/target/hexagon/gen_semantics.c +++ b/target/hexagon/gen_semantics.c @@ -106,7 +106,7 @@ int main(int argc, char *argv[]) /* * Process the macros for HVX */ -#define DEF_MACRO(MNAME, BEH, ATTRS) \ +#define DEF_MACRO(MNAME, PARAMS, SDESC, LDESC, BEH, ATTRS) \ fprintf(outfile, "MACROATTRIB( \\\n" \ " \"%s\", \\\n" \ " \"\"\"%s\"\"\", \\\n" \ diff --git a/target/hexagon/gen_tcg.h b/target/hexagon/gen_tcg.h index 8a3b801287c7..146aadc73764 100644 --- a/target/hexagon/gen_tcg.h +++ b/target/hexagon/gen_tcg.h @@ -488,6 +488,7 @@ /* dczeroa clears the 32 byte cache line at the address given */ #define fGEN_TCG_Y2_dczeroa(SHORTCODE) SHORTCODE +#define fGEN_TCG_Y2_dczeroa_nt(SHORTCODE) SHORTCODE /* In linux-user mode, these are not modelled, suppress compiler warning */ #define fGEN_TCG_Y2_dcinva(SHORTCODE) \ @@ -1133,6 +1134,9 @@ RdV, tcg_constant_tl(0)); \ } while (0) +#define fGEN_TCG_Y2_break(SHORTCODE) +#define fGEN_TCG_J2_unpause(SHORTCODE) + #define fGEN_TCG_J2_pause(SHORTCODE) \ do { \ uiV = uiV; \ @@ -1342,6 +1346,11 @@ RsV = RsV; \ uiV = uiV; \ } while (0) +#define fGEN_TCG_Y2_dcfetchbo_nt(SHORTCODE) \ + do { \ + RsV = RsV; \ + uiV = uiV; \ + } while (0) #define fGEN_TCG_L2_loadw_aq(SHORTCODE) SHORTCODE #define fGEN_TCG_L4_loadd_aq(SHORTCODE) SHORTCODE @@ -1361,13 +1370,6 @@ #define fGEN_TCG_S2_storew_rl_st_vi(SHORTCODE) SHORTCODE #define fGEN_TCG_S4_stored_rl_st_vi(SHORTCODE) SHORTCODE -#define fGEN_TCG_J2_trap0(SHORTCODE) \ - do { \ - uiV = uiV; \ - tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], ctx->pkt->pc); \ - TCGv excp = tcg_constant_tl(HEX_EVENT_TRAP0); \ - gen_helper_raise_exception(tcg_env, excp); \ - } while (0) #endif #define fGEN_TCG_A2_nop(SHORTCODE) do { } while (0) diff --git a/target/hexagon/gen_tcg_func_table.py b/target/hexagon/gen_tcg_func_table.py index 
299a39b1aa02..70c8db5c44c8 100755 --- a/target/hexagon/gen_tcg_func_table.py +++ b/target/hexagon/gen_tcg_func_table.py @@ -41,19 +41,9 @@ def main(): f.write("#define HEXAGON_FUNC_TABLE_H\n\n") f.write("const SemanticInsn opcode_genptr[XX_LAST_OPCODE] = {\n") + for tag in hex_common.tags: - ## Skip the priv instructions - if "A_PRIV" in hex_common.attribdict[tag]: - continue - ## Skip the guest instructions - if "A_GUEST" in hex_common.attribdict[tag]: - continue - ## Skip the diag instructions - if tag == "Y6_diag": - continue - if tag == "Y6_diag0": - continue - if tag == "Y6_diag1": + if hex_common.tag_ignore(tag): continue f.write(f" [{tag}] = generate_{tag},\n") diff --git a/target/hexagon/gen_tcg_funcs.py b/target/hexagon/gen_tcg_funcs.py index c2ba91ddc044..65bfa046b867 100755 --- a/target/hexagon/gen_tcg_funcs.py +++ b/target/hexagon/gen_tcg_funcs.py @@ -21,7 +21,7 @@ import re import string import hex_common - +from textwrap import dedent ## ## Generate the TCG code to call the helper @@ -50,6 +50,18 @@ def gen_tcg_func(f, tag, regs, imms): f.write(" Insn *insn G_GNUC_UNUSED = ctx->insn;\n") + if "A_PRIV" in hex_common.attribdict[tag]: + f.write(dedent("""\ +#ifdef CONFIG_USER_ONLY + hex_gen_exception_end_tb(ctx, HEX_CAUSE_PRIV_USER_NO_SINSN); +#else +""")) + if "A_GUEST" in hex_common.attribdict[tag]: + f.write(dedent("""\ +#ifdef CONFIG_USER_ONLY + hex_gen_exception_end_tb(ctx, HEX_CAUSE_PRIV_USER_NO_GINSN); +#else +""")) if hex_common.need_ea(tag): f.write(" TCGv EA G_GNUC_UNUSED = tcg_temp_new();\n") @@ -97,6 +109,11 @@ def gen_tcg_func(f, tag, regs, imms): if reg.is_written(): reg.log_write(f, tag) + if ( + "A_PRIV" in hex_common.attribdict[tag] + or "A_GUEST" in hex_common.attribdict[tag] + ): + f.write("#endif /* CONFIG_USER_ONLY */\n") f.write("}\n\n") @@ -121,18 +138,7 @@ def main(): f.write('#include "idef-generated-emitter.h.inc"\n\n') for tag in hex_common.tags: - ## Skip the priv instructions - if "A_PRIV" in hex_common.attribdict[tag]: - continue - ## Skip the guest instructions - if "A_GUEST" in hex_common.attribdict[tag]: - continue - ## Skip the diag instructions - if tag == "Y6_diag": - continue - if tag == "Y6_diag0": - continue - if tag == "Y6_diag1": + if hex_common.tag_ignore(tag): continue gen_def_tcg_func(f, tag, tagregs, tagimms) diff --git a/target/hexagon/gen_tcg_sys.h b/target/hexagon/gen_tcg_sys.h new file mode 100644 index 000000000000..e56553462fb0 --- /dev/null +++ b/target/hexagon/gen_tcg_sys.h @@ -0,0 +1,128 @@ +/* + * Copyright(c) 2022-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HEXAGON_GEN_TCG_SYS_H +#define HEXAGON_GEN_TCG_SYS_H + +/* System mode instructions */ +#define fGEN_TCG_Y2_swi(SHORTCODE) \ + gen_helper_swi(tcg_env, RsV) + +#define fGEN_TCG_Y2_cswi(SHORTCODE) \ + gen_helper_cswi(tcg_env, RsV) + +#define fGEN_TCG_Y2_ciad(SHORTCODE) \ + gen_helper_ciad(tcg_env, RsV) + +#define fGEN_TCG_Y4_siad(SHORTCODE) \ + gen_helper_siad(tcg_env, RsV) + +#define fGEN_TCG_Y2_iassignw(SHORTCODE) \ + gen_helper_iassignw(tcg_env, RsV) + +#define fGEN_TCG_Y2_iassignr(SHORTCODE) \ + gen_helper_iassignr(RdV, tcg_env, RsV) + +#define fGEN_TCG_Y2_getimask(SHORTCODE) \ + gen_helper_getimask(RdV, tcg_env, RsV) + +#define fGEN_TCG_Y2_setimask(SHORTCODE) \ + gen_helper_setimask(tcg_env, PtV, RsV) + +#define fGEN_TCG_Y2_setprio(SHORTCODE) \ + gen_helper_setprio(tcg_env, PtV, RsV) + +#define fGEN_TCG_Y2_crswap0(SHORTCODE) \ + do { \ + TCGv tmp = tcg_temp_new(); \ + tcg_gen_mov_tl(tmp, RxV); \ + tcg_gen_mov_tl(RxV, hex_t_sreg[HEX_SREG_SGP0]); \ + tcg_gen_mov_tl(ctx->t_sreg_new_value[HEX_SREG_SGP0], tmp); \ + } while (0) + +#define fGEN_TCG_Y4_crswap1(SHORTCODE) \ + do { \ + TCGv tmp = tcg_temp_new(); \ + tcg_gen_mov_tl(tmp, RxV); \ + tcg_gen_mov_tl(RxV, hex_t_sreg[HEX_SREG_SGP1]); \ + tcg_gen_mov_tl(ctx->t_sreg_new_value[HEX_SREG_SGP1], tmp); \ + } while (0) + +#define fGEN_TCG_Y4_crswap10(SHORTCODE) \ + do { \ + g_assert_not_reached(); \ + TCGv_i64 tmp = tcg_temp_new_i64(); \ + tcg_gen_mov_i64(tmp, RxxV); \ + tcg_gen_concat_i32_i64(RxxV, \ + hex_t_sreg[HEX_SREG_SGP0], \ + hex_t_sreg[HEX_SREG_SGP1]); \ + tcg_gen_extrl_i64_i32(ctx->t_sreg_new_value[HEX_SREG_SGP0], tmp); \ + tcg_gen_extrh_i64_i32(ctx->t_sreg_new_value[HEX_SREG_SGP1], tmp); \ + } while (0) + +#define fGEN_TCG_Y2_wait(SHORTCODE) \ + do { \ + RsV = RsV; \ + gen_helper_wait(tcg_env, tcg_constant_tl(ctx->pkt->pc)); \ + } while (0) + +#define fGEN_TCG_Y2_resume(SHORTCODE) \ + gen_helper_resume(tcg_env, RsV) + +#define fGEN_TCG_Y2_start(SHORTCODE) \ + gen_helper_start(tcg_env, RsV) + +#define fGEN_TCG_Y2_stop(SHORTCODE) \ + do { \ + RsV = RsV; \ + gen_helper_stop(tcg_env); \ + } while (0) + +#define fGEN_TCG_Y2_tfrscrr(SHORTCODE) \ + tcg_gen_mov_tl(RdV, SsV) + +#define fGEN_TCG_Y2_tfrsrcr(SHORTCODE) \ + tcg_gen_mov_tl(SdV, RsV) + +#define fGEN_TCG_Y4_tfrscpp(SHORTCODE) \ + tcg_gen_mov_i64(RddV, SssV) + +#define fGEN_TCG_Y4_tfrspcp(SHORTCODE) \ + tcg_gen_mov_i64(SddV, RssV) + +#define fGEN_TCG_G4_tfrgcrr(SHORTCODE) \ + tcg_gen_mov_tl(RdV, GsV) + +#define fGEN_TCG_G4_tfrgrcr(SHORTCODE) \ + tcg_gen_mov_tl(GdV, RsV) + +#define fGEN_TCG_G4_tfrgcpp(SHORTCODE) \ + tcg_gen_mov_i64(RddV, GssV) + +#define fGEN_TCG_G4_tfrgpcp(SHORTCODE) \ + tcg_gen_mov_i64(GddV, RssV) + + +/* + * rte (return from exception) + * Clear the EX bit in SSR + * Jump to ELR + */ +#define fGEN_TCG_J2_rte(SHORTCODE) \ + do { \ + TCGv new_ssr = tcg_temp_new(); \ + tcg_gen_deposit_tl(new_ssr, hex_t_sreg[HEX_SREG_SSR], \ + tcg_constant_tl(0), \ + reg_field_info[SSR_EX].offset, \ + reg_field_info[SSR_EX].width); \ + gen_log_sreg_write(ctx, HEX_SREG_SSR, new_ssr); \ + gen_jumpr(ctx, hex_t_sreg[HEX_SREG_ELR]); \ + } while (0) + +#define fGEN_TCG_Y4_nmi(SHORTCODE) \ + gen_helper_nmi(tcg_env, RsV) + +#endif diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c index 2c5e15cfcf6f..1dde04529bbe 100644 --- a/target/hexagon/genptr.c +++ b/target/hexagon/genptr.c @@ -23,6 +23,7 @@ #include "exec/helper-gen.h" #include "insn.h" #include "opcodes.h" +#include "sys_macros.h" #include "translate.h" #define 
QEMU_GENERATE /* Used internally by macros.h */ #include "macros.h" @@ -30,6 +31,10 @@ #undef QEMU_GENERATE #include "gen_tcg.h" #include "gen_tcg_hvx.h" +#ifndef CONFIG_USER_ONLY +#include "gen_tcg_sys.h" +#endif + #include "genptr.h" TCGv gen_read_reg(TCGv result, int num) @@ -128,6 +133,164 @@ TCGv get_result_pred(DisasContext *ctx, int pnum) } } +#ifndef CONFIG_USER_ONLY +G_GNUC_UNUSED +static bool greg_writable(int rnum, bool pair) +{ + if (pair) { + if (rnum < HEX_GREG_G3) { + return true; + } + qemu_log_mask(LOG_UNIMP, + "Warning: ignoring write to guest register pair G%d:%d\n", + rnum + 1, rnum); + } else { + if (rnum <= HEX_GREG_G3) { + return true; + } + qemu_log_mask(LOG_UNIMP, + "Warning: ignoring write to guest register G%d\n", rnum); + } + return false; +} + +G_GNUC_UNUSED +static void check_greg_impl(int rnum, bool pair) +{ + if (pair && (!greg_implemented(rnum) || !greg_implemented(rnum + 1))) { + qemu_log_mask(LOG_UNIMP, + "Warning: guest register pair G%d:%d is unimplemented or " + "reserved. Read will yield 0.\n", + rnum + 1, rnum); + } else if (!pair && !greg_implemented(rnum)) { + qemu_log_mask(LOG_UNIMP, + "Warning: guest register G%d is unimplemented or reserved." + " Read will yield 0.\n", rnum); + } +} + +G_GNUC_UNUSED +static inline void gen_log_greg_write(DisasContext *ctx, int rnum, TCGv val) +{ + tcg_gen_mov_tl(ctx->greg_new_value[rnum], val); +} + +G_GNUC_UNUSED +static void gen_log_greg_write_pair(DisasContext *ctx, int rnum, TCGv_i64 val) +{ + TCGv val32 = tcg_temp_new(); + + /* Low word */ + tcg_gen_extrl_i64_i32(val32, val); + gen_log_greg_write(ctx, rnum, val32); + + /* High word */ + tcg_gen_extrh_i64_i32(val32, val); + gen_log_greg_write(ctx, rnum + 1, val32); +} + +static const target_ulong sreg_immut_masks[NUM_SREGS] = { + [HEX_SREG_STID] = 0xff00ff00, + [HEX_SREG_ELR] = 0x00000003, + [HEX_SREG_SSR] = 0x00008000, + [HEX_SREG_CCR] = 0x10e0ff24, + [HEX_SREG_HTID] = IMMUTABLE, + [HEX_SREG_IMASK] = 0xffff0000, + [HEX_SREG_GEVB] = 0x000000ff, + [HEX_SREG_EVB] = 0x000000ff, + [HEX_SREG_MODECTL] = IMMUTABLE, + [HEX_SREG_SYSCFG] = 0x80001c00, + [HEX_SREG_IPENDAD] = IMMUTABLE, + [HEX_SREG_VID] = 0xfc00fc00, + [HEX_SREG_VID1] = 0xfc00fc00, + [HEX_SREG_BESTWAIT] = 0xfffffe00, + [HEX_SREG_SCHEDCFG] = 0xfffffef0, + [HEX_SREG_CFGBASE] = IMMUTABLE, + [HEX_SREG_REV] = IMMUTABLE, + [HEX_SREG_ISDBST] = IMMUTABLE, + [HEX_SREG_ISDBCFG0] = 0xe0000000, + [HEX_SREG_BRKPTPC0] = 0x00000003, + [HEX_SREG_BRKPTCFG0] = 0xfc007000, + [HEX_SREG_BRKPTPC1] = 0x00000003, + [HEX_SREG_BRKPTCFG1] = 0xfc007000, + [HEX_SREG_ISDBMBXIN] = IMMUTABLE, + [HEX_SREG_ISDBEN] = 0xfffffffe, + [HEX_SREG_TIMERLO] = IMMUTABLE, + [HEX_SREG_TIMERHI] = IMMUTABLE, +}; + +G_GNUC_UNUSED +static void gen_log_sreg_write(DisasContext *ctx, int rnum, TCGv val) +{ + const target_ulong reg_mask = sreg_immut_masks[rnum]; + + if (reg_mask != IMMUTABLE) { + if (rnum < HEX_SREG_GLB_START) { + gen_masked_reg_write(val, hex_t_sreg[rnum], reg_mask); + tcg_gen_mov_tl(ctx->t_sreg_new_value[rnum], val); + } else { + gen_masked_reg_write(val, hex_g_sreg[rnum], reg_mask); + gen_helper_sreg_write(tcg_env, tcg_constant_i32(rnum), val); + } + } +} + +G_GNUC_UNUSED +static void gen_log_sreg_write_pair(DisasContext *ctx, int rnum, TCGv_i64 val) +{ + TCGv val32 = tcg_temp_new(); + + /* Low word */ + tcg_gen_extrl_i64_i32(val32, val); + gen_log_sreg_write(ctx, rnum, val32); + + /* High word */ + tcg_gen_extrh_i64_i32(val32, val); + gen_log_sreg_write(ctx, rnum + 1, val32); +} + +G_GNUC_UNUSED +static void gen_read_sreg(TCGv dst, int 
reg_num) +{ + if (reg_num >= HEX_SREG_GLB_START || reg_num == HEX_SREG_BADVA) { + gen_helper_sreg_read(dst, tcg_env, tcg_constant_i32(reg_num)); + } else { + tcg_gen_mov_tl(dst, hex_t_sreg[reg_num]); + } +} + +G_GNUC_UNUSED +static void gen_read_sreg_pair(TCGv_i64 dst, int reg_num) +{ + if (reg_num < HEX_SREG_GLB_START) { + if (reg_num + 1 == HEX_SREG_BADVA) { + TCGv badva = tcg_temp_new(); + gen_helper_sreg_read(badva, tcg_env, + tcg_constant_tl(HEX_SREG_BADVA)); + tcg_gen_concat_i32_i64(dst, hex_t_sreg[reg_num], badva); + } else { + tcg_gen_concat_i32_i64(dst, hex_t_sreg[reg_num], + hex_t_sreg[reg_num + 1]); + } + } else { + gen_helper_sreg_read_pair(dst, tcg_env, tcg_constant_tl(reg_num)); + } +} + +G_GNUC_UNUSED +static void gen_read_greg(TCGv dst, int reg_num) +{ + gen_helper_greg_read(dst, tcg_env, tcg_constant_tl(reg_num)); +} + +G_GNUC_UNUSED +static void gen_read_greg_pair(TCGv_i64 dst, int reg_num) +{ + gen_helper_greg_read_pair(dst, tcg_env, tcg_constant_tl(reg_num)); +} +#endif + + void gen_log_pred_write(DisasContext *ctx, int pnum, TCGv val) { TCGv pred = get_result_pred(ctx, pnum); @@ -183,6 +346,11 @@ static inline void gen_read_ctrl_reg(DisasContext *ctx, const int reg_num, } else if (reg_num == HEX_REG_QEMU_HVX_CNT) { tcg_gen_addi_tl(dest, hex_gpr[HEX_REG_QEMU_HVX_CNT], ctx->num_hvx_insns); + } else if ((reg_num == HEX_REG_PKTCNTLO) + || (reg_num == HEX_REG_PKTCNTHI) + || (reg_num == HEX_REG_UTIMERLO) + || (reg_num == HEX_REG_UTIMERHI)) { + gen_helper_creg_read(dest, tcg_env, tcg_constant_tl(reg_num)); } else { tcg_gen_mov_tl(dest, hex_gpr[reg_num]); } @@ -211,6 +379,10 @@ static inline void gen_read_ctrl_reg_pair(DisasContext *ctx, const int reg_num, tcg_gen_addi_tl(hvx_cnt, hex_gpr[HEX_REG_QEMU_HVX_CNT], ctx->num_hvx_insns); tcg_gen_concat_i32_i64(dest, hvx_cnt, hex_gpr[reg_num + 1]); + } else if ((reg_num == HEX_REG_PKTCNTLO) + || (reg_num == HEX_REG_UTIMERLO) + || (reg_num == HEX_REG_UPCYCLELO)) { + gen_helper_creg_read_pair(dest, tcg_env, tcg_constant_i32(reg_num)); } else { tcg_gen_concat_i32_i64(dest, hex_gpr[reg_num], @@ -395,7 +567,8 @@ static inline void gen_store_conditional8(DisasContext *ctx, #ifndef CONFIG_HEXAGON_IDEF_PARSER static TCGv gen_slotval(DisasContext *ctx) { - int slotval = (ctx->pkt->pkt_has_store_s1 & 1) | (ctx->insn->slot << 1); + int slotval = + (ctx->pkt->pkt_has_scalar_store_s1 & 1) | (ctx->insn->slot << 1); return tcg_constant_tl(slotval); } #endif @@ -471,14 +644,15 @@ static void gen_write_new_pc_addr(DisasContext *ctx, TCGv addr, tcg_gen_brcondi_tl(cond, pred, 0, pred_false); } + TCGv PC_wr = ctx->need_next_pc ? 
hex_next_PC : hex_gpr[HEX_REG_PC]; if (ctx->pkt->pkt_has_multi_cof) { /* If there are multiple branches in a packet, ignore the second one */ - tcg_gen_movcond_tl(TCG_COND_NE, hex_gpr[HEX_REG_PC], + tcg_gen_movcond_tl(TCG_COND_NE, PC_wr, ctx->branch_taken, tcg_constant_tl(0), - hex_gpr[HEX_REG_PC], addr); + PC_wr, addr); tcg_gen_movi_tl(ctx->branch_taken, 1); } else { - tcg_gen_mov_tl(hex_gpr[HEX_REG_PC], addr); + tcg_gen_mov_tl(PC_wr, addr); } if (cond != TCG_COND_ALWAYS) { diff --git a/target/hexagon/helper.h b/target/hexagon/helper.h index f8baa599c88c..b381e0e116b3 100644 --- a/target/hexagon/helper.h +++ b/target/hexagon/helper.h @@ -18,7 +18,7 @@ #include "internal.h" #include "helper_protos_generated.h.inc" -DEF_HELPER_FLAGS_2(raise_exception, TCG_CALL_NO_RETURN, noreturn, env, i32) +DEF_HELPER_FLAGS_3(raise_exception, TCG_CALL_NO_RETURN, noreturn, env, i32, i32) DEF_HELPER_2(commit_store, void, env, int) DEF_HELPER_3(gather_store, void, env, i32, int) DEF_HELPER_1(commit_hvx_stores, void, env) @@ -107,3 +107,31 @@ DEF_HELPER_4(probe_noshuf_load, void, env, i32, int, int) DEF_HELPER_2(probe_pkt_scalar_store_s0, void, env, int) DEF_HELPER_2(probe_hvx_stores, void, env, int) DEF_HELPER_2(probe_pkt_scalar_hvx_stores, void, env, int) + +DEF_HELPER_2(creg_read, i32, env, i32) +DEF_HELPER_2(creg_read_pair, i64, env, i32) +#if !defined(CONFIG_USER_ONLY) +DEF_HELPER_2(swi, void, env, i32) +DEF_HELPER_2(cswi, void, env, i32) +DEF_HELPER_2(ciad, void, env, i32) +DEF_HELPER_2(siad, void, env, i32) +DEF_HELPER_2(iassignw, void, env, i32) +DEF_HELPER_2(iassignr, i32, env, i32) +DEF_HELPER_2(getimask, i32, env, i32) +DEF_HELPER_3(setimask, void, env, i32, i32) +DEF_HELPER_2(sreg_read, i32, env, i32) +DEF_HELPER_2(sreg_read_pair, i64, env, i32) +DEF_HELPER_2(greg_read, i32, env, i32) +DEF_HELPER_2(greg_read_pair, i64, env, i32) +DEF_HELPER_3(sreg_write, void, env, i32, i32) +DEF_HELPER_3(sreg_write_pair, void, env, i32, i64) +DEF_HELPER_3(setprio, void, env, i32, i32) +DEF_HELPER_2(start, void, env, i32) +DEF_HELPER_1(stop, void, env) +DEF_HELPER_2(wait, void, env, i32) +DEF_HELPER_2(resume, void, env, i32) +DEF_HELPER_2(nmi, void, env, i32) +DEF_HELPER_1(resched, void, env) +DEF_HELPER_3(modify_ssr, void, env, i32, i32) +DEF_HELPER_1(pending_interrupt, void, env) +#endif diff --git a/target/hexagon/hex_common.py b/target/hexagon/hex_common.py index 758e5fd12dfe..4ce275363acf 100755 --- a/target/hexagon/hex_common.py +++ b/target/hexagon/hex_common.py @@ -33,6 +33,41 @@ overrides = {} # tags with helper overrides idef_parser_enabled = {} # tags enabled for idef-parser + +def is_sysemu_tag(tag): + return "A_PRIV" in attribdict[tag] or "A_GUEST" in attribdict[tag] + + +def tag_ignore(tag): + tag_skips = ( + "Y6_diag", + "Y6_diag0", + "Y6_diag1", + ) + attr_skips = ( + "A_FAKEINSN", + "A_MAPPING", + ) + return tag in tag_skips or \ + any(attr in attribdict[tag] for attr in attr_skips) + + +def get_sys_tags(): + return sorted( + tag for tag in frozenset(tags) if is_sysemu_tag(tag) + ) + + +def get_user_tags(): + return sorted( + tag for tag in frozenset(tags) if not is_sysemu_tag(tag) + ) + + +def get_all_tags(): + return get_user_tags() + get_sys_tags() + + # We should do this as a hash for performance, # but to keep order let's keep it as a list. 
def uniquify(seq): @@ -93,8 +128,13 @@ def calculate_attribs(): add_qemu_macro_attrib("fTRAP", "A_IMPLICIT_READS_PC") add_qemu_macro_attrib("fSET_OVERFLOW", "A_IMPLICIT_WRITES_USR") add_qemu_macro_attrib("fSET_LPCFG", "A_IMPLICIT_WRITES_USR") + add_qemu_macro_attrib("fLOAD_LOCKED", "A_LLSC") + add_qemu_macro_attrib("fSTORE_LOCKED", "A_LLSC") + add_qemu_macro_attrib("fCLEAR_RTE_EX", "A_IMPLICIT_WRITES_SSR") add_qemu_macro_attrib("fLOAD", "A_SCALAR_LOAD") add_qemu_macro_attrib("fSTORE", "A_SCALAR_STORE") + add_qemu_macro_attrib("fSET_K0_LOCK", "A_IMPLICIT_READS_PC") + add_qemu_macro_attrib("fSET_TLB_LOCK", "A_IMPLICIT_READS_PC") add_qemu_macro_attrib('fLSBNEW0', 'A_IMPLICIT_READS_P0') add_qemu_macro_attrib('fLSBNEW0NOT', 'A_IMPLICIT_READS_P0') add_qemu_macro_attrib('fREAD_P0', 'A_IMPLICIT_READS_P0') @@ -213,9 +253,12 @@ def is_hvx_insn(tag): def need_env(tag): return ("A_STORE" in attribdict[tag] or "A_LOAD" in attribdict[tag] or + "A_DMA" in attribdict[tag] or "A_CVI_GATHER" in attribdict[tag] or "A_CVI_SCATTER" in attribdict[tag] or - "A_IMPLICIT_WRITES_USR" in attribdict[tag]) + "A_IMPLICIT_WRITES_USR" in attribdict[tag] or + "A_PRIV" in attribdict[tag] or + "J2_trap" in tag) def need_slot(tag): @@ -224,6 +267,9 @@ def need_slot(tag): and "A_CVI_GATHER" not in attribdict[tag] and ("A_STORE" in attribdict[tag] or "A_LOAD" in attribdict[tag]) + and tag != "L4_loadw_phys" + and tag != "L6_memcpy" + and tag != "Y6_dmlink" ): return 1 else: @@ -247,7 +293,11 @@ def need_next_PC(tag): def need_pkt_has_multi_cof(tag): - return "A_COF" in attribdict[tag] + if "A_JUMP" in attribdict[tag] or "A_CALL" in attribdict[tag]: + if tag == "J4_hintjumpr": + return False + return True + return False def need_pkt_need_commit(tag): @@ -366,12 +416,16 @@ def helper_proto_type(self): return "s32" def helper_arg_type(self): return "int32_t" + def is_pair(self): + return False class Pair(Scalar): def helper_proto_type(self): return "s64" def helper_arg_type(self): return "int64_t" + def is_pair(self): + return True class Hvx: def is_scalar_reg(self): @@ -1009,6 +1063,120 @@ def analyze_write(self, f, tag, regno): ctx_log_qreg_write(ctx, {self.reg_num}, insn_has_hvx_helper); """)) +class GuestRegister(Register): + def gen_check_impl(self, f, regno): + if self.is_written(): + f.write(code_fmt(f"""\ + if (!greg_writable(insn->regno[{regno}], + {str(self.is_pair()).lower()})) {{ + return; + }} + """)) + else: + f.write(code_fmt(f"""\ +check_greg_impl(insn->regno[{regno}], {str(self.is_pair()).lower()}); + """)) + +class GuestDest(GuestRegister, Single, Dest): + def decl_tcg(self, f, tag, regno): + self.decl_reg_num(f, regno) + self.gen_check_impl(f, regno) + f.write(code_fmt(f"""\ + TCGv {self.reg_tcg()} = tcg_temp_new(); + gen_read_greg({self.reg_tcg()}, {self.reg_num}); + """)) + def log_write(self, f, tag): + f.write(code_fmt(f"""\ + gen_log_greg_write(ctx, {self.reg_num}, {self.reg_tcg()}); + """)) + def analyze_write(self, f, tag, regno): + f.write(code_fmt(f"""\ + ctx_log_greg_write(ctx, {self.reg_num}); + """)) + +class GuestSource(GuestRegister, Single, OldSource): + def decl_tcg(self, f, tag, regno): + self.decl_reg_num(f, regno); + self.gen_check_impl(f, regno) + f.write(code_fmt(f"""\ + TCGv {self.reg_tcg()} = tcg_temp_new(); + gen_read_greg({self.reg_tcg()}, {self.reg_num}); + """)) + +class GuestPairDest(GuestRegister, Pair, Dest): + def decl_tcg(self, f, tag, regno): + self.decl_reg_num(f, regno) + self.gen_check_impl(f, regno) + f.write(code_fmt(f"""\ + TCGv_i64 {self.reg_tcg()} = tcg_temp_new_i64(); + 
gen_read_greg_pair({self.reg_tcg()}, {self.reg_num}); + """)) + def log_write(self, f, tag): + f.write(code_fmt(f"""\ + gen_log_greg_write_pair(ctx, {self.reg_num}, {self.reg_tcg()}); + """)) + def analyze_write(self, f, tag, regno): + f.write(code_fmt(f"""\ + ctx_log_greg_write_pair(ctx, {self.reg_num}); + """)) + +class GuestPairSource(GuestRegister, Pair, OldSource): + def decl_tcg(self, f, tag, regno): + self.decl_reg_num(f, regno) + self.gen_check_impl(f, regno) + f.write(code_fmt(f"""\ + TCGv_i64 {self.reg_tcg()} = tcg_temp_new_i64(); + gen_read_greg_pair({self.reg_tcg()}, {self.reg_num}); + """)) + +class SystemDest(Register, Single, Dest): + def decl_tcg(self, f, tag, regno): + self.decl_reg_num(f, regno) + f.write(code_fmt(f"""\ + TCGv {self.reg_tcg()} = tcg_temp_new(); + gen_read_sreg({self.reg_tcg()}, {self.reg_num}); + """)) + def log_write(self, f, tag): + f.write(code_fmt(f"""\ + gen_log_sreg_write(ctx, {self.reg_num}, {self.reg_tcg()}); + """)) + def analyze_write(self, f, tag, regno): + f.write(code_fmt(f"""\ + ctx_log_sreg_write(ctx, {self.reg_num}); + """)) + +class SystemSource(Register, Single, OldSource): + def decl_tcg(self, f, tag, regno): + self.decl_reg_num(f, regno); + f.write(code_fmt(f"""\ + TCGv {self.reg_tcg()} = tcg_temp_new(); + gen_read_sreg({self.reg_tcg()}, {self.reg_num}); + """)) + +class SystemPairDest(Register, Pair, Dest): + def decl_tcg(self, f, tag, regno): + self.decl_reg_num(f, regno) + f.write(code_fmt(f"""\ + TCGv_i64 {self.reg_tcg()} = tcg_temp_new_i64(); + gen_read_sreg_pair({self.reg_tcg()}, {self.reg_num}); + """)) + def log_write(self, f, tag): + f.write(code_fmt(f"""\ + gen_log_sreg_write_pair(ctx, {self.reg_num}, {self.reg_tcg()}); + """)) + def analyze_write(self, f, tag, regno): + f.write(code_fmt(f"""\ + ctx_log_sreg_write_pair(ctx, {self.reg_num}); + """)) + +class SystemPairSource(Register, Pair, OldSource): + def decl_tcg(self, f, tag, regno): + self.decl_reg_num(f, regno) + f.write(code_fmt(f"""\ + TCGv_i64 {self.reg_tcg()} = tcg_temp_new_i64(); + gen_read_sreg_pair({self.reg_tcg()}, {self.reg_num}); + """)) + def init_registers(): regs = { GprDest("R", "d"), @@ -1055,6 +1223,16 @@ def init_registers(): QRegSource("Q", "u"), QRegSource("Q", "v"), QRegReadWrite("Q", "x"), + + # system regs + GuestDest("G", "d"), + GuestSource("G", "s"), + GuestPairDest("G", "dd"), + GuestPairSource("G", "ss"), + SystemDest("S", "d"), + SystemSource("S", "s"), + SystemPairDest("S", "dd"), + SystemPairSource("S", "ss"), } for reg in regs: registers[f"{reg.regtype}{reg.regid}"] = reg @@ -1070,11 +1248,18 @@ def init_registers(): for reg in new_regs: new_registers[f"{reg.regtype}{reg.regid}"] = reg +def is_new_reg(tag, regid): + if regid[0] in "NO": + return True + return regid[0] == "P" and \ + f"{regid}N" in semdict[tag] and \ + f"{regid}V" not in semdict[tag] + def get_register(tag, regtype, regid): - if f"{regtype}{regid}V" in semdict[tag]: - return registers[f"{regtype}{regid}"] - else: - return new_registers[f"{regtype}{regid}"] + regid = f"{regtype}{regid}" + is_new = is_new_reg(tag, regid) + reg = new_registers[regid] if is_new else registers[regid] + return reg def helper_ret_type(tag, regs): ## If there is a scalar result, it is the return type @@ -1187,6 +1372,7 @@ def parse_common_args(desc): parser.add_argument("semantics", help="semantics file") parser.add_argument("overrides", help="overrides file") parser.add_argument("overrides_vec", help="vector overrides file") + parser.add_argument("overrides_sys", help="system overrides file") 
parser.add_argument("out", help="output file") parser.add_argument("--idef-parser", help="file of instructions translated by idef-parser") @@ -1194,6 +1380,7 @@ def parse_common_args(desc): read_semantics_file(args.semantics) read_overrides_file(args.overrides) read_overrides_file(args.overrides_vec) + read_overrides_file(args.overrides_sys) if args.idef_parser: read_idef_parser_enabled_file(args.idef_parser) calculate_attribs() diff --git a/target/hexagon/hex_interrupts.c b/target/hexagon/hex_interrupts.c new file mode 100644 index 000000000000..fd00bcfb9a57 --- /dev/null +++ b/target/hexagon/hex_interrupts.c @@ -0,0 +1,324 @@ +/* + * Copyright(c) 2022-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "qemu/main-loop.h" +#include "cpu.h" +#include "hex_interrupts.h" +#include "macros.h" +#include "sys_macros.h" +#include "system/cpus.h" + +static bool hex_is_qualified_for_int(CPUHexagonState *env, int int_num); + +static bool get_syscfg_gie(CPUHexagonState *env) +{ + target_ulong syscfg = arch_get_system_reg(env, HEX_SREG_SYSCFG); + return GET_SYSCFG_FIELD(SYSCFG_GIE, syscfg); +} + +static bool get_ssr_ex(CPUHexagonState *env) +{ + target_ulong ssr = arch_get_system_reg(env, HEX_SREG_SSR); + return GET_SSR_FIELD(SSR_EX, ssr); +} + +static bool get_ssr_ie(CPUHexagonState *env) +{ + target_ulong ssr = arch_get_system_reg(env, HEX_SREG_SSR); + return GET_SSR_FIELD(SSR_IE, ssr); +} + +/* Do these together so we only have to call hexagon_modify_ssr once */ +static void set_ssr_ex_cause(CPUHexagonState *env, int ex, uint32_t cause) +{ + target_ulong old = arch_get_system_reg(env, HEX_SREG_SSR); + SET_SYSTEM_FIELD(env, HEX_SREG_SSR, SSR_EX, ex); + SET_SYSTEM_FIELD(env, HEX_SREG_SSR, SSR_CAUSE, cause); + target_ulong new = arch_get_system_reg(env, HEX_SREG_SSR); + hexagon_modify_ssr(env, new, old); +} + +static bool get_iad_bit(CPUHexagonState *env, int int_num) +{ + target_ulong ipendad = arch_get_system_reg(env, HEX_SREG_IPENDAD); + target_ulong iad = GET_FIELD(IPENDAD_IAD, ipendad); + return extract32(iad, int_num, 1); +} + +static void set_iad_bit(CPUHexagonState *env, int int_num, int val) +{ + target_ulong ipendad = arch_get_system_reg(env, HEX_SREG_IPENDAD); + target_ulong iad = GET_FIELD(IPENDAD_IAD, ipendad); + iad = deposit32(iad, int_num, 1, val); + fSET_FIELD(ipendad, IPENDAD_IAD, iad); + arch_set_system_reg(env, HEX_SREG_IPENDAD, ipendad); +} + +static uint32_t get_ipend(CPUHexagonState *env) +{ + target_ulong ipendad = arch_get_system_reg(env, HEX_SREG_IPENDAD); + return GET_FIELD(IPENDAD_IPEND, ipendad); +} + +static inline bool get_ipend_bit(CPUHexagonState *env, int int_num) +{ + target_ulong ipendad = arch_get_system_reg(env, HEX_SREG_IPENDAD); + target_ulong ipend = GET_FIELD(IPENDAD_IPEND, ipendad); + return extract32(ipend, int_num, 1); +} + +static void clear_ipend(CPUHexagonState *env, uint32_t mask) +{ + target_ulong ipendad = arch_get_system_reg(env, HEX_SREG_IPENDAD); + target_ulong ipend = GET_FIELD(IPENDAD_IPEND, ipendad); + ipend &= ~mask; + fSET_FIELD(ipendad, IPENDAD_IPEND, ipend); + arch_set_system_reg(env, HEX_SREG_IPENDAD, ipendad); +} + +static void set_ipend(CPUHexagonState *env, uint32_t mask) +{ + target_ulong ipendad = arch_get_system_reg(env, HEX_SREG_IPENDAD); + target_ulong ipend = GET_FIELD(IPENDAD_IPEND, ipendad); + ipend |= mask; + fSET_FIELD(ipendad, IPENDAD_IPEND, ipend); + arch_set_system_reg(env, HEX_SREG_IPENDAD, ipendad); +} 
+ +static void set_ipend_bit(CPUHexagonState *env, int int_num, int val) +{ + target_ulong ipendad = arch_get_system_reg(env, HEX_SREG_IPENDAD); + target_ulong ipend = GET_FIELD(IPENDAD_IPEND, ipendad); + ipend = deposit32(ipend, int_num, 1, val); + fSET_FIELD(ipendad, IPENDAD_IPEND, ipend); + arch_set_system_reg(env, HEX_SREG_IPENDAD, ipendad); +} + +static bool get_imask_bit(CPUHexagonState *env, int int_num) +{ + target_ulong imask = arch_get_system_reg(env, HEX_SREG_IMASK); + return extract32(imask, int_num, 1); +} + +static uint32_t get_prio(CPUHexagonState *env) +{ + target_ulong stid = arch_get_system_reg(env, HEX_SREG_STID); + return extract32(stid, reg_field_info[STID_PRIO].offset, + reg_field_info[STID_PRIO].width); +} + +static void set_elr(CPUHexagonState *env, target_ulong val) +{ + arch_set_system_reg(env, HEX_SREG_ELR, val); +} + +static bool get_schedcfgen(CPUHexagonState *env) +{ + target_ulong schedcfg = arch_get_system_reg(env, HEX_SREG_SCHEDCFG); + return extract32(schedcfg, reg_field_info[SCHEDCFG_EN].offset, + reg_field_info[SCHEDCFG_EN].width); +} + +static bool is_lowest_prio(CPUHexagonState *env, int int_num) +{ + uint32_t my_prio = get_prio(env); + CPUState *cs; + + CPU_FOREACH(cs) { + CPUHexagonState *hex_env = cpu_env(cs); + if (!hex_is_qualified_for_int(hex_env, int_num)) { + continue; + } + + /* Note that lower values indicate *higher* priority */ + if (my_prio < get_prio(hex_env)) { + return false; + } + } + return true; +} + +static bool hex_is_qualified_for_int(CPUHexagonState *env, int int_num) +{ + bool syscfg_gie = get_syscfg_gie(env); + bool iad = get_iad_bit(env, int_num); + bool ssr_ie = get_ssr_ie(env); + bool ssr_ex = get_ssr_ex(env); + bool imask = get_imask_bit(env, int_num); + + return syscfg_gie && !iad && ssr_ie && !ssr_ex && !imask; +} + +static void clear_pending_locks(CPUHexagonState *env) +{ + g_assert(bql_locked()); + if (env->k0_lock_state == HEX_LOCK_WAITING) { + env->k0_lock_state = HEX_LOCK_UNLOCKED; + } + if (env->tlb_lock_state == HEX_LOCK_WAITING) { + env->tlb_lock_state = HEX_LOCK_UNLOCKED; + } +} + +static bool should_not_exec(CPUHexagonState *env) +{ + return (get_exe_mode(env) == HEX_EXE_MODE_WAIT); +} + +static void restore_state(CPUHexagonState *env, bool int_accepted) +{ + CPUState *cs = env_cpu(env); + cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD | CPU_INTERRUPT_SWI); + if (!int_accepted && should_not_exec(env)) { + cpu_interrupt(cs, CPU_INTERRUPT_HALT); + } +} + +static void hex_accept_int(CPUHexagonState *env, int int_num) +{ + CPUState *cs = env_cpu(env); + target_ulong evb = arch_get_system_reg(env, HEX_SREG_EVB); + const int exe_mode = get_exe_mode(env); + const bool in_wait_mode = exe_mode == HEX_EXE_MODE_WAIT; + + set_ipend_bit(env, int_num, 0); + set_iad_bit(env, int_num, 1); + set_ssr_ex_cause(env, 1, HEX_CAUSE_INT0 | int_num); + cs->exception_index = HEX_EVENT_INT0 + int_num; + env->cause_code = HEX_EVENT_INT0 + int_num; + clear_pending_locks(env); + if (in_wait_mode) { + qemu_log_mask(CPU_LOG_INT, + "%s: thread %d resuming, exiting WAIT mode\n", + __func__, env->threadId); + set_elr(env, env->wait_next_pc); + clear_wait_mode(env); + cs->halted = false; + } else if (env->k0_lock_state == HEX_LOCK_WAITING) { + g_assert_not_reached(); + } else { + set_elr(env, env->gpr[HEX_REG_PC]); + } + env->gpr[HEX_REG_PC] = evb | (cs->exception_index << 2); + if (get_ipend(env) == 0) { + restore_state(env, true); + } +} + + +bool hex_check_interrupts(CPUHexagonState *env) +{ + CPUState *cs = env_cpu(env); + bool int_handled = 
false;
+    bool ssr_ex = get_ssr_ex(env);
+    int max_ints = 32;
+    bool schedcfgen;
+
+    /* Early exit if nothing pending */
+    if (get_ipend(env) == 0) {
+        restore_state(env, false);
+        return false;
+    }
+
+    BQL_LOCK_GUARD();
+    /* Only check priorities when schedcfgen is set */
+    schedcfgen = get_schedcfgen(env);
+    for (int i = 0; i < max_ints; i++) {
+        if (!get_iad_bit(env, i) && get_ipend_bit(env, i)) {
+            qemu_log_mask(CPU_LOG_INT,
+                          "%s: thread[%d] pc = 0x%x found int %d\n", __func__,
+                          env->threadId, env->gpr[HEX_REG_PC], i);
+            if (hex_is_qualified_for_int(env, i) &&
+                (!schedcfgen || is_lowest_prio(env, i))) {
+                qemu_log_mask(CPU_LOG_INT, "%s: thread[%d] int %d handled\n",
+                              __func__, env->threadId, i);
+                hex_accept_int(env, i);
+                int_handled = true;
+                break;
+            }
+            bool syscfg_gie = get_syscfg_gie(env);
+            bool iad = get_iad_bit(env, i);
+            bool ssr_ie = get_ssr_ie(env);
+            bool imask = get_imask_bit(env, i);
+
+            qemu_log_mask(CPU_LOG_INT,
+                          "%s: thread[%d] int %d not handled, qualified: %d, "
+                          "schedcfg_en: %d, low prio %d\n",
+                          __func__, env->threadId, i,
+                          hex_is_qualified_for_int(env, i), schedcfgen,
+                          is_lowest_prio(env, i));
+
+            qemu_log_mask(CPU_LOG_INT,
+                          "%s: thread[%d] int %d not handled, GIE %d, iad %d, "
+                          "SSR:IE %d, SSR:EX: %d, imask bit %d\n",
+                          __func__, env->threadId, i, syscfg_gie, iad, ssr_ie,
+                          ssr_ex, imask);
+        }
+    }
+
+    /*
+     * If we didn't handle the interrupt and it wasn't
+     * because we were in EX state, then we won't be able
+     * to execute the interrupt on this CPU unless something
+     * changes in the CPU state.  Clear the interrupt_request bits
+     * while preserving the IPEND bits, and we can re-assert the
+     * interrupt_request bit(s) when we execute one of those instructions.
+     */
+    if (!int_handled && !ssr_ex) {
+        restore_state(env, int_handled);
+    } else if (int_handled) {
+        assert(!cs->halted);
+    }
+
+    return int_handled;
+}
+
+void hex_clear_interrupts(CPUHexagonState *env, uint32_t mask, uint32_t type)
+{
+    if (mask == 0) {
+        return;
+    }
+
+    /*
+     * Clear the interrupt(s), then let all CPUs re-evaluate their
+     * pending-interrupt state
+     */
+    BQL_LOCK_GUARD();
+    clear_ipend(env, mask);
+    hex_interrupt_update(env);
+}
+
+void hex_raise_interrupts(CPUHexagonState *env, uint32_t mask, uint32_t type)
+{
+    g_assert(bql_locked());
+    if (mask == 0) {
+        return;
+    }
+
+    /*
+     * Notify all CPUs that the interrupt has happened
+     */
+    set_ipend(env, mask);
+    hex_interrupt_update(env);
+}
+
+void hex_interrupt_update(CPUHexagonState *env)
+{
+    CPUState *cs;
+
+    g_assert(bql_locked());
+    if (get_ipend(env) != 0) {
+        CPU_FOREACH(cs) {
+            CPUHexagonState *hex_env = cpu_env(cs);
+            const int exe_mode = get_exe_mode(hex_env);
+            if (exe_mode != HEX_EXE_MODE_OFF) {
+                cs->interrupt_request |= CPU_INTERRUPT_SWI;
+                cpu_resume(cs);
+            }
+        }
+    }
+}
diff --git a/target/hexagon/hex_interrupts.h b/target/hexagon/hex_interrupts.h
new file mode 100644
index 000000000000..17a243946ce2
--- /dev/null
+++ b/target/hexagon/hex_interrupts.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright(c) 2022-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HEX_INTERRUPTS_H
+#define HEX_INTERRUPTS_H
+
+bool hex_check_interrupts(CPUHexagonState *env);
+void hex_clear_interrupts(CPUHexagonState *env, uint32_t mask, uint32_t type);
+void hex_raise_interrupts(CPUHexagonState *env, uint32_t mask, uint32_t type);
+void hex_interrupt_update(CPUHexagonState *env);
+
+#endif
diff --git a/target/hexagon/hex_mmu.c b/target/hexagon/hex_mmu.c
new file mode 100644
index 000000000000..8037528a2ccd
--- /dev/null
+++ b/target/hexagon/hex_mmu.c
@@ -0,0 +1,603 @@
+/*
+ * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "qemu/qemu-print.h"
+#include "cpu.h"
+#include "system/cpus.h"
+#include "internal.h"
+#include "exec/exec-all.h"
+#include "hex_mmu.h"
+#include "macros.h"
+#include "sys_macros.h"
+#include "reg_fields.h"
+
+#define GET_TLB_FIELD(ENTRY, FIELD) \
+    ((uint64_t)fEXTRACTU_BITS(ENTRY, reg_field_info[FIELD].width, \
+                              reg_field_info[FIELD].offset))
+
+/* PPD (physical page descriptor) */
+static inline uint64_t GET_PPD(uint64_t entry)
+{
+    return GET_TLB_FIELD(entry, PTE_PPD) |
+           (GET_TLB_FIELD(entry, PTE_PA35) << reg_field_info[PTE_PPD].width);
+}
+
+#define NO_ASID (1 << 8)
+
+typedef enum {
+    PGSIZE_4K,
+    PGSIZE_16K,
+    PGSIZE_64K,
+    PGSIZE_256K,
+    PGSIZE_1M,
+    PGSIZE_4M,
+    PGSIZE_16M,
+    PGSIZE_64M,
+    PGSIZE_256M,
+    PGSIZE_1G,
+    NUM_PGSIZE_TYPES
+} tlb_pgsize_t;
+
+static const char *pgsize_str[NUM_PGSIZE_TYPES] = {
+    "4K",
+    "16K",
+    "64K",
+    "256K",
+    "1M",
+    "4M",
+    "16M",
+    "64M",
+    "256M",
+    "1G",
+};
+
+#define INVALID_MASK 0xffffffffLL
+
+static const uint64_t encmask_2_mask[] = {
+    0x0fffLL, /* 4k, 0000 */
+    0x3fffLL, /* 16k, 0001 */
+    0xffffLL, /* 64k, 0010 */
+    0x3ffffLL, /* 256k, 0011 */
+    0xfffffLL, /* 1m, 0100 */
+    0x3fffffLL, /* 4m, 0101 */
+    0xffffffLL, /* 16m, 0110 */
+    0x3ffffffLL, /* 64m, 0111 */
+    0xfffffffLL, /* 256m, 1000 */
+    0x3fffffffLL, /* 1g, 1001 */
+    INVALID_MASK, /* RSVD, 1010 */
+};
+
+/*
+ * @return the page size type from @a entry.
+ */
+static inline tlb_pgsize_t hex_tlb_pgsize_type(uint64_t entry)
+{
+    if (entry == 0) {
+        qemu_log_mask(CPU_LOG_MMU, "%s: Supplied TLB entry was 0!\n", __func__);
+        return 0;
+    }
+    tlb_pgsize_t size = ctz64(entry);
+    g_assert(size < NUM_PGSIZE_TYPES);
+    return size;
+}
+
+/*
+ * @return the page size of @a entry, in bytes.
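+ *
+ * Page sizes scale by 4x per type, i.e. 1 << (TARGET_PAGE_BITS + 2 * type);
+ * e.g. assuming 4K base pages (TARGET_PAGE_BITS == 12), PGSIZE_16K yields
+ * 1 << (12 + 2) == 16K.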
+ */ +static inline uint64_t hex_tlb_page_size_bytes(uint64_t entry) +{ + return 1ull << (TARGET_PAGE_BITS + 2 * hex_tlb_pgsize_type(entry)); +} + +static inline uint64_t hex_tlb_phys_page_num(uint64_t entry) +{ + uint32_t ppd = GET_PPD(entry); + return ppd >> 1; +} + +static inline uint64_t hex_tlb_phys_addr(uint64_t entry) +{ + uint64_t pagemask = encmask_2_mask[hex_tlb_pgsize_type(entry)]; + uint64_t pagenum = hex_tlb_phys_page_num(entry); + uint64_t PA = (pagenum << TARGET_PAGE_BITS) & (~pagemask); + return PA; +} + +static inline uint64_t hex_tlb_virt_addr(uint64_t entry) +{ + return (uint64_t)GET_TLB_FIELD(entry, PTE_VPN) << TARGET_PAGE_BITS; +} + +static bool hex_dump_mmu_entry(FILE *f, uint64_t entry) +{ + if (GET_TLB_FIELD(entry, PTE_V)) { + fprintf(f, "0x%016" PRIx64 ": ", entry); + uint64_t PA = hex_tlb_phys_addr(entry); + uint64_t VA = hex_tlb_virt_addr(entry); + fprintf(f, "V:%" PRId64 " G:%" PRId64 " A1:%" PRId64 " A0:%" PRId64, + GET_TLB_FIELD(entry, PTE_V), GET_TLB_FIELD(entry, PTE_G), + GET_TLB_FIELD(entry, PTE_ATR1), GET_TLB_FIELD(entry, PTE_ATR0)); + fprintf(f, " ASID:0x%02" PRIx64 " VA:0x%08" PRIx64, + GET_TLB_FIELD(entry, PTE_ASID), VA); + fprintf(f, + " X:%" PRId64 " W:%" PRId64 " R:%" PRId64 " U:%" PRId64 + " C:%" PRId64, + GET_TLB_FIELD(entry, PTE_X), GET_TLB_FIELD(entry, PTE_W), + GET_TLB_FIELD(entry, PTE_R), GET_TLB_FIELD(entry, PTE_U), + GET_TLB_FIELD(entry, PTE_C)); + fprintf(f, " PA:0x%09" PRIx64 " SZ:%s (0x%" PRIx64 ")", PA, + pgsize_str[hex_tlb_pgsize_type(entry)], + hex_tlb_page_size_bytes(entry)); + fprintf(f, "\n"); + return true; + } + + /* Not valid */ + return false; +} + +void dump_mmu(CPUHexagonState *env) +{ + HexagonCPU *cpu = env_archcpu(env); + for (uint32_t i = 0; i < cpu->num_tlbs; i++) { + uint64_t entry = env->hex_tlb->entries[i]; + if (GET_TLB_FIELD(entry, PTE_V)) { + qemu_printf("[%03" PRIu32 "] ", i); + qemu_printf("0x%016" PRIx64 ": ", entry); + uint64_t PA = hex_tlb_phys_addr(entry); + uint64_t VA = hex_tlb_virt_addr(entry); + qemu_printf( + "V:%" PRId64 " G:%" PRId64 " A1:%" PRId64 " A0:%" PRId64, + GET_TLB_FIELD(entry, PTE_V), GET_TLB_FIELD(entry, PTE_G), + GET_TLB_FIELD(entry, PTE_ATR1), GET_TLB_FIELD(entry, PTE_ATR0)); + qemu_printf(" ASID:0x%02" PRIx64 " VA:0x%08" PRIx64, + GET_TLB_FIELD(entry, PTE_ASID), VA); + qemu_printf( + " X:%" PRId64 " W:%" PRId64 " R:%" PRId64 " U:%" PRId64 + " C:%" PRId64, + GET_TLB_FIELD(entry, PTE_X), GET_TLB_FIELD(entry, PTE_W), + GET_TLB_FIELD(entry, PTE_R), GET_TLB_FIELD(entry, PTE_U), + GET_TLB_FIELD(entry, PTE_C)); + qemu_printf(" PA:0x%09" PRIx64 " SZ:%s (0x%" PRIx64 ")", PA, + pgsize_str[hex_tlb_pgsize_type(entry)], + hex_tlb_page_size_bytes(entry)); + qemu_printf("\n"); + } + } +} + +static inline void hex_log_tlbw(uint32_t index, uint64_t entry) +{ + if (qemu_loglevel_mask(CPU_LOG_MMU)) { + if (qemu_log_enabled()) { + FILE *logfile = qemu_log_trylock(); + if (logfile) { + fprintf(logfile, "tlbw[%03d]: ", index); + if (!hex_dump_mmu_entry(logfile, entry)) { + fprintf(logfile, "invalid\n"); + } + qemu_log_unlock(logfile); + } + } + } +} + +void hex_tlbw(CPUHexagonState *env, uint32_t index, uint64_t value) +{ + uint32_t myidx = fTLB_NONPOW2WRAP(fTLB_IDXMASK(index)); + bool old_entry_valid = GET_TLB_FIELD(env->hex_tlb->entries[myidx], PTE_V); + if (old_entry_valid && hexagon_cpu_mmu_enabled(env)) { + CPUState *cs = env_cpu(env); + + tlb_flush(cs); + } + env->hex_tlb->entries[myidx] = (value); + hex_log_tlbw(myidx, value); +} + +void hex_mmu_realize(CPUHexagonState *env) +{ + CPUState *cs = 
env_cpu(env);
+    if (cs->cpu_index == 0) {
+        env->hex_tlb = g_malloc0(sizeof(CPUHexagonTLBContext));
+    } else {
+        CPUState *cpu0_s = NULL;
+        CPUHexagonState *env0 = NULL;
+        CPU_FOREACH(cpu0_s) {
+            assert(cpu0_s->cpu_index == 0);
+            env0 = &(HEXAGON_CPU(cpu0_s)->env);
+            break;
+        }
+        env->hex_tlb = env0->hex_tlb;
+    }
+}
+
+void hex_mmu_on(CPUHexagonState *env)
+{
+    CPUState *cs = env_cpu(env);
+    qemu_log_mask(CPU_LOG_MMU, "Hexagon MMU turned on!\n");
+    tlb_flush(cs);
+}
+
+void hex_mmu_off(CPUHexagonState *env)
+{
+    CPUState *cs = env_cpu(env);
+    qemu_log_mask(CPU_LOG_MMU, "Hexagon MMU turned off!\n");
+    tlb_flush(cs);
+}
+
+void hex_mmu_mode_change(CPUHexagonState *env)
+{
+    qemu_log_mask(CPU_LOG_MMU, "Hexagon mode change!\n");
+    CPUState *cs = env_cpu(env);
+    tlb_flush(cs);
+}
+
+static inline bool hex_tlb_entry_match_noperm(uint64_t entry, uint32_t asid,
+                                              uint64_t VA)
+{
+    if (GET_TLB_FIELD(entry, PTE_V)) {
+        if (GET_TLB_FIELD(entry, PTE_G)) {
+            /* Global entry - ignore ASID */
+        } else if (asid != NO_ASID) {
+            uint32_t tlb_asid = GET_TLB_FIELD(entry, PTE_ASID);
+            if (tlb_asid != asid) {
+                return false;
+            }
+        }
+
+        uint64_t page_size = hex_tlb_page_size_bytes(entry);
+        uint64_t page_start =
+            ROUND_DOWN(hex_tlb_virt_addr(entry), page_size);
+        if (page_start <= VA && VA < page_start + page_size) {
+            return true;
+        }
+    }
+    return false;
+}
+
+static inline void hex_tlb_entry_get_perm(CPUHexagonState *env, uint64_t entry,
+                                          MMUAccessType access_type,
+                                          int mmu_idx, int *prot,
+                                          int32_t *excp)
+{
+    bool perm_x = GET_TLB_FIELD(entry, PTE_X);
+    bool perm_w = GET_TLB_FIELD(entry, PTE_W);
+    bool perm_r = GET_TLB_FIELD(entry, PTE_R);
+    bool perm_u = GET_TLB_FIELD(entry, PTE_U);
+    bool user_idx = mmu_idx == MMU_USER_IDX;
+
+    if (mmu_idx == MMU_KERNEL_IDX) {
+        *prot = PAGE_VALID | PAGE_READ | PAGE_WRITE | PAGE_EXEC;
+        return;
+    }
+
+    *prot = PAGE_VALID;
+    switch (access_type) {
+    case MMU_INST_FETCH:
+        if (user_idx && !perm_u) {
+            *excp = HEX_EVENT_PRECISE;
+            env->cause_code = HEX_CAUSE_FETCH_NO_UPAGE;
+        } else if (!perm_x) {
+            *excp = HEX_EVENT_PRECISE;
+            env->cause_code = HEX_CAUSE_FETCH_NO_XPAGE;
+        }
+        break;
+    case MMU_DATA_LOAD:
+        if (user_idx && !perm_u) {
+            *excp = HEX_EVENT_PRECISE;
+            env->cause_code = HEX_CAUSE_PRIV_NO_UREAD;
+        } else if (!perm_r) {
+            *excp = HEX_EVENT_PRECISE;
+            env->cause_code = HEX_CAUSE_PRIV_NO_READ;
+        }
+        break;
+    case MMU_DATA_STORE:
+        if (user_idx && !perm_u) {
+            *excp = HEX_EVENT_PRECISE;
+            env->cause_code = HEX_CAUSE_PRIV_NO_UWRITE;
+        } else if (!perm_w) {
+            *excp = HEX_EVENT_PRECISE;
+            env->cause_code = HEX_CAUSE_PRIV_NO_WRITE;
+        }
+        break;
+    }
+
+    if (!user_idx || perm_u) {
+        if (perm_x) {
+            *prot |= PAGE_EXEC;
+        }
+        if (perm_r) {
+            *prot |= PAGE_READ;
+        }
+        if (perm_w) {
+            *prot |= PAGE_WRITE;
+        }
+    }
+}
+
+static inline bool hex_tlb_entry_match(CPUHexagonState *env, uint64_t entry,
+                                       uint8_t asid, target_ulong VA,
+                                       MMUAccessType access_type, hwaddr *PA,
+                                       int *prot, int *size, int32_t *excp,
+                                       int mmu_idx)
+{
+    if (hex_tlb_entry_match_noperm(entry, asid, VA)) {
+        hex_tlb_entry_get_perm(env, entry, access_type, mmu_idx, prot, excp);
+        *PA = hex_tlb_phys_addr(entry);
+        *size = hex_tlb_page_size_bytes(entry);
+        return true;
+    }
+    return false;
+}
+
+bool hex_tlb_find_match(CPUHexagonState *env, target_ulong VA,
+                        MMUAccessType access_type, hwaddr *PA, int *prot,
+                        int *size, int32_t *excp, int mmu_idx)
+{
+    *PA = 0;
+    *prot = 0;
+    *size = 0;
+    *excp = 0;
+    uint32_t ssr = arch_get_system_reg(env, HEX_SREG_SSR);
+    uint8_t asid = GET_SSR_FIELD(SSR_ASID, ssr);
+    int
i; + HexagonCPU *cpu = env_archcpu(env); + for (i = 0; i < cpu->num_tlbs; i++) { + uint64_t entry = env->hex_tlb->entries[i]; + if (hex_tlb_entry_match(env, entry, asid, VA, access_type, PA, prot, + size, excp, mmu_idx)) { + return true; + } + } + return false; +} + +static uint32_t hex_tlb_lookup_by_asid(CPUHexagonState *env, uint32_t asid, + uint32_t VA) +{ + uint32_t not_found = 0x80000000; + uint32_t idx = not_found; + int i; + + HexagonCPU *cpu = env_archcpu(env); + for (i = 0; i < cpu->num_tlbs; i++) { + uint64_t entry = env->hex_tlb->entries[i]; + if (hex_tlb_entry_match_noperm(entry, asid, VA)) { + if (idx != not_found) { + env->cause_code = HEX_CAUSE_IMPRECISE_MULTI_TLB_MATCH; + break; + } + idx = i; + } + } + + if (idx == not_found) { + qemu_log_mask(CPU_LOG_MMU, "%s: 0x%x, 0x%08x => NOT FOUND\n", + __func__, asid, VA); + } else { + qemu_log_mask(CPU_LOG_MMU, "%s: 0x%x, 0x%08x => %d\n", + __func__, asid, VA, idx); + } + + return idx; +} + +/* Called from tlbp instruction */ +uint32_t hex_tlb_lookup(CPUHexagonState *env, uint32_t ssr, uint32_t VA) +{ + return hex_tlb_lookup_by_asid(env, GET_SSR_FIELD(SSR_ASID, ssr), VA); +} + +static bool hex_tlb_is_match(CPUHexagonState *env, + uint64_t entry1, uint64_t entry2, + bool consider_gbit) +{ + bool valid1 = GET_TLB_FIELD(entry1, PTE_V); + bool valid2 = GET_TLB_FIELD(entry2, PTE_V); + uint64_t size1 = hex_tlb_page_size_bytes(entry1); + uint64_t vaddr1 = ROUND_DOWN(hex_tlb_virt_addr(entry1), size1); + uint64_t size2 = hex_tlb_page_size_bytes(entry2); + uint64_t vaddr2 = ROUND_DOWN(hex_tlb_virt_addr(entry2), size2); + int asid1 = GET_TLB_FIELD(entry1, PTE_ASID); + int asid2 = GET_TLB_FIELD(entry2, PTE_ASID); + bool gbit1 = GET_TLB_FIELD(entry1, PTE_G); + bool gbit2 = GET_TLB_FIELD(entry2, PTE_G); + + if (!valid1 || !valid2) { + return false; + } + + if (((vaddr1 <= vaddr2) && (vaddr2 < (vaddr1 + size1))) || + ((vaddr2 <= vaddr1) && (vaddr1 < (vaddr2 + size2)))) { + if (asid1 == asid2) { + return true; + } + if ((consider_gbit && gbit1) || gbit2) { + return true; + } + } + return false; +} + +/* + * Return codes: + * 0 or positive index of match + * -1 multiple matches + * -2 no match + */ +int hex_tlb_check_overlap(CPUHexagonState *env, uint64_t entry, uint64_t index) +{ + int matches = 0; + int last_match = 0; + int i; + + HexagonCPU *cpu = env_archcpu(env); + for (i = 0; i < cpu->num_tlbs; i++) { + if (hex_tlb_is_match(env, entry, env->hex_tlb->entries[i], false)) { + matches++; + last_match = i; + } + } + + if (matches == 1) { + return last_match; + } + if (matches == 0) { + return -2; + } + return -1; +} + +static inline void print_thread(const char *str, CPUState *cs) +{ + g_assert(bql_locked()); + CPUHexagonState *thread = cpu_env(cs); + bool is_stopped = cpu_is_stopped(cs); + int exe_mode = get_exe_mode(thread); + hex_lock_state_t lock_state = thread->tlb_lock_state; + qemu_log_mask(CPU_LOG_MMU, + "%s: threadId = %d: %s, exe_mode = %s, tlb_lock_state = %s\n", + str, + thread->threadId, + is_stopped ? "stopped" : "running", + exe_mode == HEX_EXE_MODE_OFF ? "off" : + exe_mode == HEX_EXE_MODE_RUN ? "run" : + exe_mode == HEX_EXE_MODE_WAIT ? "wait" : + exe_mode == HEX_EXE_MODE_DEBUG ? "debug" : + "unknown", + lock_state == HEX_LOCK_UNLOCKED ? "unlocked" : + lock_state == HEX_LOCK_WAITING ? "waiting" : + lock_state == HEX_LOCK_OWNER ? 
"owner" : + "unknown"); +} + +static inline void print_thread_states(const char *str) +{ + CPUState *cs; + CPU_FOREACH(cs) { + print_thread(str, cs); + } +} + +void hex_tlb_lock(CPUHexagonState *env) +{ + qemu_log_mask(CPU_LOG_MMU, "hex_tlb_lock: %d\n", env->threadId); + BQL_LOCK_GUARD(); + g_assert((env->tlb_lock_count == 0) || (env->tlb_lock_count == 1)); + + uint32_t syscfg = arch_get_system_reg(env, HEX_SREG_SYSCFG); + uint8_t tlb_lock = GET_SYSCFG_FIELD(SYSCFG_TLBLOCK, syscfg); + if (tlb_lock) { + if (env->tlb_lock_state == HEX_LOCK_QUEUED) { + env->next_PC += 4; + env->tlb_lock_count++; + env->tlb_lock_state = HEX_LOCK_OWNER; + SET_SYSCFG_FIELD(env, SYSCFG_TLBLOCK, 1); + return; + } + if (env->tlb_lock_state == HEX_LOCK_OWNER) { + qemu_log_mask(CPU_LOG_MMU | LOG_GUEST_ERROR, + "Double tlblock at PC: 0x%x, thread may hang\n", + env->next_PC); + env->next_PC += 4; + CPUState *cs = env_cpu(env); + cpu_interrupt(cs, CPU_INTERRUPT_HALT); + return; + } + env->tlb_lock_state = HEX_LOCK_WAITING; + CPUState *cs = env_cpu(env); + cpu_interrupt(cs, CPU_INTERRUPT_HALT); + } else { + env->next_PC += 4; + env->tlb_lock_count++; + env->tlb_lock_state = HEX_LOCK_OWNER; + SET_SYSCFG_FIELD(env, SYSCFG_TLBLOCK, 1); + } + + if (qemu_loglevel_mask(CPU_LOG_MMU)) { + qemu_log_mask(CPU_LOG_MMU, "Threads after hex_tlb_lock:\n"); + print_thread_states("\tThread"); + } +} + +void hex_tlb_unlock(CPUHexagonState *env) +{ + BQL_LOCK_GUARD(); + g_assert((env->tlb_lock_count == 0) || (env->tlb_lock_count == 1)); + + /* Nothing to do if the TLB isn't locked by this thread */ + uint32_t syscfg = arch_get_system_reg(env, HEX_SREG_SYSCFG); + uint8_t tlb_lock = GET_SYSCFG_FIELD(SYSCFG_TLBLOCK, syscfg); + if ((tlb_lock == 0) || + (env->tlb_lock_state != HEX_LOCK_OWNER)) { + qemu_log_mask(LOG_GUEST_ERROR, + "thread %d attempted to tlbunlock without having the " + "lock, tlb_lock state = %d\n", + env->threadId, env->tlb_lock_state); + g_assert(env->tlb_lock_state != HEX_LOCK_WAITING); + return; + } + + env->tlb_lock_count--; + env->tlb_lock_state = HEX_LOCK_UNLOCKED; + SET_SYSCFG_FIELD(env, SYSCFG_TLBLOCK, 0); + + /* Look for a thread to unlock */ + unsigned int this_threadId = env->threadId; + CPUHexagonState *unlock_thread = NULL; + CPUState *cs; + CPU_FOREACH(cs) { + CPUHexagonState *thread = cpu_env(cs); + + /* + * The hardware implements round-robin fairness, so we look for threads + * starting at env->threadId + 1 and incrementing modulo the number of + * threads. + * + * To implement this, we check if thread is a earlier in the modulo + * sequence than unlock_thread. 
+ * if unlock thread is higher than this thread + * thread must be between this thread and unlock_thread + * else + * thread higher than this thread is ahead of unlock_thread + * thread must be lower then unlock thread + */ + if (thread->tlb_lock_state == HEX_LOCK_WAITING) { + if (!unlock_thread) { + unlock_thread = thread; + } else if (unlock_thread->threadId > this_threadId) { + if (this_threadId < thread->threadId && + thread->threadId < unlock_thread->threadId) { + unlock_thread = thread; + } + } else { + if (thread->threadId > this_threadId) { + unlock_thread = thread; + } + if (thread->threadId < unlock_thread->threadId) { + unlock_thread = thread; + } + } + } + } + if (unlock_thread) { + cs = env_cpu(unlock_thread); + print_thread("\tWaiting thread found", cs); + unlock_thread->tlb_lock_state = HEX_LOCK_QUEUED; + SET_SYSCFG_FIELD(unlock_thread, SYSCFG_TLBLOCK, 1); + cpu_interrupt(cs, CPU_INTERRUPT_TLB_UNLOCK); + } + + if (qemu_loglevel_mask(CPU_LOG_MMU)) { + qemu_log_mask(CPU_LOG_MMU, "Threads after hex_tlb_unlock:\n"); + print_thread_states("\tThread"); + } + +} + diff --git a/target/hexagon/hex_mmu.h b/target/hexagon/hex_mmu.h new file mode 100644 index 000000000000..fae8aefcac1d --- /dev/null +++ b/target/hexagon/hex_mmu.h @@ -0,0 +1,30 @@ +/* + * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HEXAGON_MMU_H +#define HEXAGON_MMU_H + +#include "max.h" + +struct CPUHexagonTLBContext { + uint64_t entries[MAX_TLB_ENTRIES]; +}; + +extern void hex_tlbw(CPUHexagonState *env, uint32_t index, uint64_t value); +extern uint32_t hex_tlb_lookup(CPUHexagonState *env, uint32_t ssr, uint32_t VA); +extern void hex_mmu_realize(CPUHexagonState *env); +extern void hex_mmu_on(CPUHexagonState *env); +extern void hex_mmu_off(CPUHexagonState *env); +extern void hex_mmu_mode_change(CPUHexagonState *env); +extern bool hex_tlb_find_match(CPUHexagonState *env, target_ulong VA, + MMUAccessType access_type, hwaddr *PA, int *prot, + int *size, int32_t *excp, int mmu_idx); +extern int hex_tlb_check_overlap(CPUHexagonState *env, uint64_t entry, + uint64_t index); +extern void hex_tlb_lock(CPUHexagonState *env); +extern void hex_tlb_unlock(CPUHexagonState *env); +void dump_mmu(CPUHexagonState *env); +#endif diff --git a/target/hexagon/hex_regs.h b/target/hexagon/hex_regs.h index bddfc28021c6..ea8c62eba9ce 100644 --- a/target/hexagon/hex_regs.h +++ b/target/hexagon/hex_regs.h @@ -81,4 +81,119 @@ enum { HEX_REG_UTIMERHI = 63, }; +#ifndef CONFIG_USER_ONLY + +#define HEX_GREG_VALUES \ + DECL_HEX_GREG(G0, 0) \ + DECL_HEX_GREG(GELR, 0) \ + DECL_HEX_GREG(G1, 1) \ + DECL_HEX_GREG(GSR, 1) \ + DECL_HEX_GREG(G2, 2) \ + DECL_HEX_GREG(GOSP, 2) \ + DECL_HEX_GREG(G3, 3) \ + DECL_HEX_GREG(GBADVA, 3) \ + DECL_HEX_GREG(GCYCLE_1T, 10) \ + DECL_HEX_GREG(GCYCLE_2T, 11) \ + DECL_HEX_GREG(GCYCLE_3T, 12) \ + DECL_HEX_GREG(GCYCLE_4T, 13) \ + DECL_HEX_GREG(GCYCLE_5T, 14) \ + DECL_HEX_GREG(GCYCLE_6T, 15) \ + DECL_HEX_GREG(GPMUCNT4, 16) \ + DECL_HEX_GREG(GPMUCNT5, 17) \ + DECL_HEX_GREG(GPMUCNT6, 18) \ + DECL_HEX_GREG(GPMUCNT7, 19) \ + DECL_HEX_GREG(GPCYCLELO, 24) \ + DECL_HEX_GREG(GPCYCLEHI, 25) \ + DECL_HEX_GREG(GPMUCNT0, 26) \ + DECL_HEX_GREG(GPMUCNT1, 27) \ + DECL_HEX_GREG(GPMUCNT2, 28) \ + DECL_HEX_GREG(GPMUCNT3, 29) \ + DECL_HEX_GREG_DONE + +#define DECL_HEX_GREG_DONE +#define DECL_HEX_GREG(name, val) HEX_GREG_ ##name = val, +enum hex_greg { + HEX_GREG_VALUES +}; +#undef DECL_HEX_GREG +#undef DECL_HEX_GREG_DONE + +#define DECL_HEX_GREG_DONE 0 
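+/*
+ * Together with the DECL_HEX_GREG_DONE above, the redefinition below
+ * makes HEX_GREG_VALUES expand to an OR-chain bitmap terminated by 0:
+ *
+ *     (1 << 0) | (1 << 0) | (1 << 1) | ... | (1 << 29) | 0
+ *
+ * Aliased names (G0/GELR, G1/GSR, ...) set the same bit twice, which
+ * is harmless under bitwise OR; greg_implemented() then tests a single
+ * bit of the resulting mask.
+ */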
+#define DECL_HEX_GREG(_, val) (1 << val) | +static inline bool greg_implemented(enum hex_greg greg) +{ +#if NUM_GREGS > 32 +#error "NUM_GREGS too large for greg_implemented(): update `impl_bitmap`" +#endif + static int32_t impl_bitmap = HEX_GREG_VALUES; + return impl_bitmap & (1 << greg); +} +#undef DECL_HEX_GREG +#undef DECL_HEX_GREG_DONE + +#endif /* CONFIG_USER_ONLY */ + +enum { + HEX_SREG_SGP0 = 0, + HEX_SREG_SGP1 = 1, + HEX_SREG_STID = 2, + HEX_SREG_ELR = 3, + HEX_SREG_BADVA0 = 4, + HEX_SREG_BADVA1 = 5, + HEX_SREG_SSR = 6, + HEX_SREG_CCR = 7, + HEX_SREG_HTID = 8, + HEX_SREG_BADVA = 9, + HEX_SREG_IMASK = 10, + HEX_SREG_GEVB = 11, + HEX_SREG_GLB_START = 16, + HEX_SREG_EVB = 16, + HEX_SREG_MODECTL = 17, + HEX_SREG_SYSCFG = 18, + HEX_SREG_IPENDAD = 20, + HEX_SREG_VID = 21, + HEX_SREG_VID1 = 22, + HEX_SREG_BESTWAIT = 23, + HEX_SREG_IEL = 24, + HEX_SREG_SCHEDCFG = 25, + HEX_SREG_IAHL = 26, + HEX_SREG_CFGBASE = 27, + HEX_SREG_DIAG = 28, + HEX_SREG_REV = 29, + HEX_SREG_PCYCLELO = 30, + HEX_SREG_PCYCLEHI = 31, + HEX_SREG_ISDBST = 32, + HEX_SREG_ISDBCFG0 = 33, + HEX_SREG_ISDBCFG1 = 34, + HEX_SREG_LIVELOCK = 35, + HEX_SREG_BRKPTPC0 = 36, + HEX_SREG_BRKPTCFG0 = 37, + HEX_SREG_BRKPTPC1 = 38, + HEX_SREG_BRKPTCFG1 = 39, + HEX_SREG_ISDBMBXIN = 40, + HEX_SREG_ISDBMBXOUT = 41, + HEX_SREG_ISDBEN = 42, + HEX_SREG_ISDBGPR = 43, + HEX_SREG_PMUCNT4 = 44, + HEX_SREG_PMUCNT5 = 45, + HEX_SREG_PMUCNT6 = 46, + HEX_SREG_PMUCNT7 = 47, + HEX_SREG_PMUCNT0 = 48, + HEX_SREG_PMUCNT1 = 49, + HEX_SREG_PMUCNT2 = 50, + HEX_SREG_PMUCNT3 = 51, + HEX_SREG_PMUEVTCFG = 52, + HEX_SREG_PMUSTID0 = 53, + HEX_SREG_PMUEVTCFG1 = 54, + HEX_SREG_PMUSTID1 = 55, + HEX_SREG_TIMERLO = 56, + HEX_SREG_TIMERHI = 57, + HEX_SREG_PMUCFG = 58, + HEX_SREG_S59 = 59, + HEX_SREG_S60 = 60, + HEX_SREG_S61 = 61, + HEX_SREG_S62 = 62, + HEX_SREG_S63 = 63, +}; + #endif diff --git a/target/hexagon/hexswi.c b/target/hexagon/hexswi.c new file mode 100644 index 000000000000..a08d7f68917c --- /dev/null +++ b/target/hexagon/hexswi.c @@ -0,0 +1,728 @@ +/* + * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#ifdef CONFIG_USER_ONLY
+#include "exec/helper-proto.h"
+#include "qemu.h"
+#endif
+#include "exec/cpu_ldst.h"
+#include "exec/exec-all.h"
+#include "qemu/log.h"
+#include "qemu/main-loop.h"
+#include "arch.h"
+#include "internal.h"
+#include "macros.h"
+#include "sys_macros.h"
+#include "tcg/tcg-op.h"
+#ifndef CONFIG_USER_ONLY
+#include "hex_mmu.h"
+#include "hexswi.h"
+#include "semihosting/common-semi.h"
+#include "semihosting/syscalls.h"
+#include "semihosting/guestfd.h"
+#endif
+
+#ifndef CONFIG_USER_ONLY
+
+/* non-arm-compatible semihosting calls */
+#define HEXAGON_SPECIFIC_SWI_FLAGS \
+    DEF_SWI_FLAG(EXCEPTION, 0x18) \
+    DEF_SWI_FLAG(READ_CYCLES, 0x40) \
+    DEF_SWI_FLAG(PROF_ON, 0x41) \
+    DEF_SWI_FLAG(PROF_OFF, 0x42) \
+    DEF_SWI_FLAG(WRITECREG, 0x43) \
+    DEF_SWI_FLAG(READ_TCYCLES, 0x44) \
+    DEF_SWI_FLAG(READ_ICOUNT, 0x47) \
+    DEF_SWI_FLAG(PROF_STATSRESET, 0x48) \
+    DEF_SWI_FLAG(DUMP_PMU_STATS, 0x4a) \
+    DEF_SWI_FLAG(READ_PCYCLES, 0x52) \
+    DEF_SWI_FLAG(COREDUMP, 0xCD) \
+    DEF_SWI_FLAG(FTELL, 0x100) \
+    DEF_SWI_FLAG(FSTAT, 0x101) \
+    DEF_SWI_FLAG(STAT, 0x103) \
+    DEF_SWI_FLAG(GETCWD, 0x104) \
+    DEF_SWI_FLAG(ACCESS, 0x105) \
+    DEF_SWI_FLAG(OPENDIR, 0x180) \
+    DEF_SWI_FLAG(CLOSEDIR, 0x181) \
+    DEF_SWI_FLAG(READDIR, 0x182) \
+    DEF_SWI_FLAG(EXEC, 0x185) \
+    DEF_SWI_FLAG(FTRUNC, 0x186)
+
+#define DEF_SWI_FLAG(name, val) HEX_SYS_ ##name = val,
+enum hex_swi_flag {
+    HEXAGON_SPECIFIC_SWI_FLAGS
+};
+#undef DEF_SWI_FLAG
+
+#define DEF_SWI_FLAG(_, val) case val:
+static inline bool is_hexagon_specific_swi_flag(enum hex_swi_flag what_swi)
+{
+    switch (what_swi) {
+    HEXAGON_SPECIFIC_SWI_FLAGS
+        return true;
+    }
+    return false;
+}
+#undef DEF_SWI_FLAG
+
+/* We start from 1 as 0 is used to signal an error from opendir() */
+static const int DIR_INDEX_OFFSET = 1;
+
+static void common_semi_ftell_cb(CPUState *cs, uint64_t ret, int err)
+{
+    if (err) {
+        ret = -1;
+    }
+    common_semi_cb(cs, ret, err);
+}
+
+static void coredump(CPUHexagonState *env)
+{
+    uint32_t ssr = arch_get_system_reg(env, HEX_SREG_SSR);
+    printf("CRASH!\n");
+    printf("I think the exception was: ");
+    switch (GET_SSR_FIELD(SSR_CAUSE, ssr)) {
+    case 0x43:
+        printf("0x43, NMI");
+        break;
+    case 0x42:
+        printf("0x42, Data abort");
+        break;
+    case 0x44:
+        printf("0x44, Multi TLB match");
+        break;
+    case HEX_CAUSE_BIU_PRECISE:
+        printf("0x%x, Bus Error (Precise BIU error)",
+               HEX_CAUSE_BIU_PRECISE);
+        break;
+    case HEX_CAUSE_DOUBLE_EXCEPT:
+        printf("0x%x, Exception observed when EX = 1 (double exception)",
+               HEX_CAUSE_DOUBLE_EXCEPT);
+        break;
+    case HEX_CAUSE_FETCH_NO_XPAGE:
+        printf("0x%x, Privilege violation: User/Guest mode execute"
+               " to page with no execute permissions",
+               HEX_CAUSE_FETCH_NO_XPAGE);
+        break;
+    case HEX_CAUSE_FETCH_NO_UPAGE:
+        printf("0x%x, Privilege violation: "
+               "User mode execute to page with no user permissions",
+               HEX_CAUSE_FETCH_NO_UPAGE);
+        break;
+    case HEX_CAUSE_INVALID_PACKET:
+        printf("0x%x, Invalid packet",
+               HEX_CAUSE_INVALID_PACKET);
+        break;
+    case HEX_CAUSE_PRIV_USER_NO_GINSN:
+        printf("0x%x, Privilege violation: guest mode insn in user mode",
+               HEX_CAUSE_PRIV_USER_NO_GINSN);
+        break;
+    case HEX_CAUSE_PRIV_USER_NO_SINSN:
+        printf("0x%x, Privilege violation: "
+               "monitor mode insn in user/guest mode",
+               HEX_CAUSE_PRIV_USER_NO_SINSN);
+        break;
+    case HEX_CAUSE_REG_WRITE_CONFLICT:
+        printf("0x%x, Multiple writes to same register",
+               HEX_CAUSE_REG_WRITE_CONFLICT);
+        break;
+    case HEX_CAUSE_PC_NOT_ALIGNED:
+        printf("0x%x, PC not aligned",
+               HEX_CAUSE_PC_NOT_ALIGNED);
+        break;
+    case HEX_CAUSE_MISALIGNED_LOAD:
+        printf("0x%x, Misaligned Load @ 0x%x",
+               HEX_CAUSE_MISALIGNED_LOAD,
+               arch_get_system_reg(env, HEX_SREG_BADVA));
+        break;
+    case HEX_CAUSE_MISALIGNED_STORE:
+        printf("0x%x, Misaligned Store @ 0x%x",
+               HEX_CAUSE_MISALIGNED_STORE,
+               arch_get_system_reg(env, HEX_SREG_BADVA));
+        break;
+    case HEX_CAUSE_PRIV_NO_READ:
+        printf("0x%x, Privilege violation: "
+               "user/guest read permission @ 0x%x",
+               HEX_CAUSE_PRIV_NO_READ,
+               arch_get_system_reg(env, HEX_SREG_BADVA));
+        break;
+    case HEX_CAUSE_PRIV_NO_WRITE:
+        printf("0x%x, Privilege violation: "
+               "user/guest write permission @ 0x%x",
+               HEX_CAUSE_PRIV_NO_WRITE,
+               arch_get_system_reg(env, HEX_SREG_BADVA));
+        break;
+    case HEX_CAUSE_PRIV_NO_UREAD:
+        printf("0x%x, Privilege violation: user read permission @ 0x%x",
+               HEX_CAUSE_PRIV_NO_UREAD,
+               arch_get_system_reg(env, HEX_SREG_BADVA));
+        break;
+    case HEX_CAUSE_PRIV_NO_UWRITE:
+        printf("0x%x, Privilege violation: user write permission @ 0x%x",
+               HEX_CAUSE_PRIV_NO_UWRITE,
+               arch_get_system_reg(env, HEX_SREG_BADVA));
+        break;
+    case HEX_CAUSE_COPROC_LDST:
+        printf("0x%x, Coprocessor VMEM address error @ 0x%x",
+               HEX_CAUSE_COPROC_LDST,
+               arch_get_system_reg(env, HEX_SREG_BADVA));
+        break;
+    case HEX_CAUSE_STACK_LIMIT:
+        printf("0x%x, Stack limit check error", HEX_CAUSE_STACK_LIMIT);
+        break;
+    case HEX_CAUSE_FPTRAP_CAUSE_BADFLOAT:
+        printf("0x%x, Floating-Point: Execution of Floating-Point "
+               "instruction resulted in exception",
+               HEX_CAUSE_FPTRAP_CAUSE_BADFLOAT);
+        break;
+    case HEX_CAUSE_NO_COPROC_ENABLE:
+        printf("0x%x, Illegal Execution of Coprocessor Instruction",
+               HEX_CAUSE_NO_COPROC_ENABLE);
+        break;
+    case HEX_CAUSE_NO_COPROC2_ENABLE:
+        printf("0x%x, "
+               "Illegal Execution of Secondary Coprocessor Instruction",
+               HEX_CAUSE_NO_COPROC2_ENABLE);
+        break;
+    case HEX_CAUSE_UNSUPORTED_HVX_64B:
+        printf("0x%x, "
+               "Unsupported Execution of Coprocessor Instruction with 64-bit Mode On",
+               HEX_CAUSE_UNSUPORTED_HVX_64B);
+        break;
+    case HEX_CAUSE_VWCTRL_WINDOW_MISS:
+        printf("0x%x, "
+               "Thread accessing a region outside VWCTRL window",
+               HEX_CAUSE_VWCTRL_WINDOW_MISS);
+        break;
+    default:
+        printf("Don't know");
+        break;
+    }
+    printf("\nRegister Dump:\n");
+    hexagon_dump(env, stdout, 0);
+}
+
+static void sim_handle_trap0(CPUHexagonState *env)
+{
+    g_assert(bql_locked());
+    target_ulong what_swi = arch_get_thread_reg(env, HEX_REG_R00);
+    target_ulong swi_info = arch_get_thread_reg(env, HEX_REG_R01);
+    uintptr_t retaddr = 0;
+    CPUState *cs = env_cpu(env);
+
+    if (!is_hexagon_specific_swi_flag(what_swi)) {
+        do_common_semihosting(cs);
+        return;
+    }
+
+    switch (what_swi) {
+
+    case HEX_SYS_EXCEPTION:
+        arch_set_system_reg(env, HEX_SREG_MODECTL, 0);
+        exit(arch_get_thread_reg(env, HEX_REG_R02));
+        break;
+
+    case HEX_SYS_WRITECREG:
+        fprintf(stdout, "%c", swi_info);
+        fflush(stdout);
+        common_semi_cb(cs, 0, 0);
+        break;
+
+    case HEX_SYS_STAT:
+    case HEX_SYS_FSTAT:
+    {
+        /*
+         * This must match the caller's definition; it would be in the
+         * caller's angel.h or equivalent header.
+ */ + struct __SYS_STAT { + uint64_t dev; + uint64_t ino; + uint32_t mode; + uint32_t nlink; + uint64_t rdev; + uint32_t size; + uint32_t __pad1; + uint32_t atime; + uint32_t mtime; + uint32_t ctime; + uint32_t __pad2; + } sys_stat; + struct stat st_buf; + uint8_t *st_bufptr = (uint8_t *)&sys_stat; + int rc, err = 0; + char filename[BUFSIZ]; + target_ulong physicalFilenameAddr; + target_ulong statBufferAddr; + hexagon_read_memory(env, swi_info, 4, &physicalFilenameAddr, retaddr); + + if (what_swi == HEX_SYS_STAT) { + int i = 0; + do { + hexagon_read_memory(env, physicalFilenameAddr + i, 1, + &filename[i], retaddr); + i++; + } while ((i < BUFSIZ) && filename[i - 1]); + rc = stat(filename, &st_buf); + err = errno; + } else{ + int fd = physicalFilenameAddr; + GuestFD *gf = get_guestfd(fd); + if (gf->type != GuestFDHost) { + fprintf(stderr, "fstat semihosting only implemented for native mode.\n"); + g_assert_not_reached(); + } + rc = fstat(gf->hostfd, &st_buf); + err = errno; + } + if (rc == 0) { + sys_stat.dev = st_buf.st_dev; + sys_stat.ino = st_buf.st_ino; + sys_stat.mode = st_buf.st_mode; + sys_stat.nlink = (uint32_t) st_buf.st_nlink; + sys_stat.rdev = st_buf.st_rdev; + sys_stat.size = (uint32_t) st_buf.st_size; +#if defined(__linux__) + sys_stat.atime = (uint32_t) st_buf.st_atim.tv_sec; + sys_stat.mtime = (uint32_t) st_buf.st_mtim.tv_sec; + sys_stat.ctime = (uint32_t) st_buf.st_ctim.tv_sec; +#elif defined(_WIN32) + sys_stat.atime = st_buf.st_atime; + sys_stat.mtime = st_buf.st_mtime; + sys_stat.ctime = st_buf.st_ctime; +#endif + } + hexagon_read_memory(env, swi_info + 4, 4, &statBufferAddr, retaddr); + + for (int i = 0; i < sizeof(sys_stat); i++) { + hexagon_write_memory(env, statBufferAddr + i, 1, st_bufptr[i], + retaddr); + } + common_semi_cb(cs, rc, err); + } + break; + + case HEX_SYS_FTRUNC: + { + int fd; + off_t size_limit; + hexagon_read_memory(env, swi_info, 4, &fd, retaddr); + hexagon_read_memory(env, swi_info + 4, 8, &size_limit, retaddr); + semihost_sys_ftruncate(cs, common_semi_cb, fd, size_limit); + } + break; + + case HEX_SYS_ACCESS: + { + char filename[BUFSIZ]; + uint32_t FileNameAddr; + uint32_t BufferMode; + int rc, err = 0; + + int i = 0; + + hexagon_read_memory(env, swi_info, 4, &FileNameAddr, retaddr); + do { + hexagon_read_memory(env, FileNameAddr + i, 1, &filename[i], retaddr); + i++; + } while ((i < BUFSIZ) && (filename[i - 1])); + filename[i] = 0; + + hexagon_read_memory(env, swi_info + 4, 4, &BufferMode, retaddr); + + rc = access(filename, BufferMode); + if (rc != 0) { + err = errno; + } + common_semi_cb(cs, rc, err); + } + break; + + case HEX_SYS_GETCWD: + { + char cwdPtr[PATH_MAX]; + uint32_t BufferAddr; + uint32_t BufferSize; + uint32_t rc = 0, err = 0; + + hexagon_read_memory(env, swi_info, 4, &BufferAddr, retaddr); + hexagon_read_memory(env, swi_info + 4, 4, &BufferSize, retaddr); + + if (!getcwd(cwdPtr, PATH_MAX)) { + err = errno; + } else { + size_t cwd_size = strlen(cwdPtr); + if (cwd_size > BufferSize) { + err = ERANGE; + } else { + for (int i = 0; i < cwd_size; i++) { + hexagon_write_memory(env, BufferAddr + i, 1, + (uint64_t)cwdPtr[i], retaddr); + } + rc = BufferAddr; + } + } + common_semi_cb(cs, rc, err); + break; + } + + case HEX_SYS_EXEC: + { + qemu_log_mask(LOG_UNIMP, "SYS_EXEC is deprecated\n"); + common_semi_cb(cs, -1, ENOSYS); + } + break; + + case HEX_SYS_OPENDIR: + { + DIR *dir; + char buf[BUFSIZ]; + int rc = 0, err = 0; + + int i = 0; + do { + hexagon_read_memory(env, swi_info + i, 1, &buf[i], retaddr); + i++; + } while (buf[i - 1]); + + dir = 
opendir(buf); + if (dir != NULL) { + env->dir_list = g_list_append(env->dir_list, dir); + rc = g_list_index(env->dir_list, dir) + DIR_INDEX_OFFSET; + } else { + err = errno; + } + common_semi_cb(cs, rc, err); + break; + } + + case HEX_SYS_READDIR: + { + struct dirent *host_dir_entry = NULL; + int dir_index = swi_info - DIR_INDEX_OFFSET; + DIR *dir = g_list_nth_data(env->dir_list, dir_index); + uint32_t rc = 0, err = 0; + + if (dir) { + errno = 0; + host_dir_entry = readdir(dir); + if (host_dir_entry == NULL) { + err = errno; + } + } else { + err = EBADF; + } + + if (host_dir_entry) { + uint32_t guest_dir_entry = arch_get_thread_reg(env, HEX_REG_R02); + hexagon_write_memory(env, guest_dir_entry, 4, host_dir_entry->d_ino, + retaddr); + for (int i = 0; i < sizeof(host_dir_entry->d_name); i++) { + hexagon_write_memory(env, guest_dir_entry + 4 + i, 1, + host_dir_entry->d_name[i], retaddr); + if (!host_dir_entry->d_name[i]) { + break; + } + } + rc = guest_dir_entry; + } + common_semi_cb(cs, rc, err); + break; + } + + case HEX_SYS_CLOSEDIR: + { + DIR *dir; + int ret = 0, err = 0; + + dir = g_list_nth_data(env->dir_list, swi_info); + if (dir != NULL) { + ret = closedir(dir); + if (ret != 0) { + err = errno; + } + } else { + err = EBADF; + } + common_semi_cb(cs, ret, err); + break; + } + + case HEX_SYS_COREDUMP: + coredump(env); + break; + + case HEX_SYS_FTELL: + { + int fd; + hexagon_read_memory(env, swi_info, 4, &fd, retaddr); + semihost_sys_lseek(cs, common_semi_ftell_cb, fd, 0, GDB_SEEK_CUR); + } + break; + + case HEX_SYS_READ_CYCLES: + case HEX_SYS_READ_TCYCLES: + case HEX_SYS_READ_ICOUNT: + { + arch_set_thread_reg(env, HEX_REG_R00, 0); + arch_set_thread_reg(env, HEX_REG_R01, 0); + break; + } + + case HEX_SYS_READ_PCYCLES: + { + arch_set_thread_reg(env, HEX_REG_R00, + arch_get_system_reg(env, HEX_SREG_PCYCLELO)); + arch_set_thread_reg(env, HEX_REG_R01, + arch_get_system_reg(env, HEX_SREG_PCYCLEHI)); + break; + } + + case HEX_SYS_PROF_ON: + case HEX_SYS_PROF_OFF: + case HEX_SYS_PROF_STATSRESET: + case HEX_SYS_DUMP_PMU_STATS: + common_semi_cb(cs, -1, ENOSYS); + qemu_log_mask(LOG_UNIMP, "SWI call %x is unimplemented in QEMU\n", + what_swi); + break; + + default: + qemu_log_mask(LOG_GUEST_ERROR, "error: unknown swi call 0x%x\n", what_swi); + cpu_abort(cs, "Hexagon Unsupported swi call 0x%x\n", what_swi); + } +} + +static void set_addresses(CPUHexagonState *env, target_ulong pc_offset, + target_ulong exception_index) + +{ + arch_set_system_reg(env, HEX_SREG_ELR, + arch_get_thread_reg(env, HEX_REG_PC) + pc_offset); + arch_set_thread_reg(env, HEX_REG_PC, + arch_get_system_reg(env, HEX_SREG_EVB) | + (exception_index << 2)); +} + +static const char *event_name[] = { + [HEX_EVENT_RESET] = "HEX_EVENT_RESET", + [HEX_EVENT_IMPRECISE] = "HEX_EVENT_IMPRECISE", + [HEX_EVENT_TLB_MISS_X] = "HEX_EVENT_TLB_MISS_X", + [HEX_EVENT_TLB_MISS_RW] = "HEX_EVENT_TLB_MISS_RW", + [HEX_EVENT_TRAP0] = "HEX_EVENT_TRAP0", + [HEX_EVENT_TRAP1] = "HEX_EVENT_TRAP1", + [HEX_EVENT_FPTRAP] = "HEX_EVENT_FPTRAP", + [HEX_EVENT_DEBUG] = "HEX_EVENT_DEBUG", + [HEX_EVENT_INT0] = "HEX_EVENT_INT0", + [HEX_EVENT_INT1] = "HEX_EVENT_INT1", + [HEX_EVENT_INT2] = "HEX_EVENT_INT2", + [HEX_EVENT_INT3] = "HEX_EVENT_INT3", + [HEX_EVENT_INT4] = "HEX_EVENT_INT4", + [HEX_EVENT_INT5] = "HEX_EVENT_INT5", + [HEX_EVENT_INT6] = "HEX_EVENT_INT6", + [HEX_EVENT_INT7] = "HEX_EVENT_INT7", + [HEX_EVENT_INT8] = "HEX_EVENT_INT8", + [HEX_EVENT_INT9] = "HEX_EVENT_INT9", + [HEX_EVENT_INTA] = "HEX_EVENT_INTA", + [HEX_EVENT_INTB] = "HEX_EVENT_INTB", + [HEX_EVENT_INTC] = 
"HEX_EVENT_INTC", + [HEX_EVENT_INTD] = "HEX_EVENT_INTD", + [HEX_EVENT_INTE] = "HEX_EVENT_INTE", + [HEX_EVENT_INTF] = "HEX_EVENT_INTF" +}; + +void hexagon_cpu_do_interrupt(CPUState *cs) + +{ + CPUHexagonState *env = cpu_env(cs); + BQL_LOCK_GUARD(); + + qemu_log_mask(CPU_LOG_INT, "\t%s: event 0x%x:%s, cause 0x%x(%d)\n", + __func__, cs->exception_index, + event_name[cs->exception_index], env->cause_code, + env->cause_code); + + env->llsc_addr = ~0; + + uint32_t ssr = arch_get_system_reg(env, HEX_SREG_SSR); + if (GET_SSR_FIELD(SSR_EX, ssr) == 1) { + arch_set_system_reg(env, HEX_SREG_DIAG, env->cause_code); + env->cause_code = HEX_CAUSE_DOUBLE_EXCEPT; + cs->exception_index = HEX_EVENT_PRECISE; + } + + switch (cs->exception_index) { + case HEX_EVENT_TRAP0: + if (env->cause_code == 0) { + sim_handle_trap0(env); + } + + hexagon_ssr_set_cause(env, env->cause_code); + set_addresses(env, 4, cs->exception_index); + break; + + case HEX_EVENT_TRAP1: + hexagon_ssr_set_cause(env, env->cause_code); + set_addresses(env, 4, cs->exception_index); + break; + + case HEX_EVENT_TLB_MISS_X: + switch (env->cause_code) { + case HEX_CAUSE_TLBMISSX_CAUSE_NORMAL: + case HEX_CAUSE_TLBMISSX_CAUSE_NEXTPAGE: + qemu_log_mask(CPU_LOG_MMU, + "TLB miss EX exception (0x%x) caught: " + "Cause code (0x%x) " + "TID = 0x%" PRIx32 ", PC = 0x%" PRIx32 + ", BADVA = 0x%" PRIx32 "\n", + cs->exception_index, env->cause_code, env->threadId, + arch_get_thread_reg(env, HEX_REG_PC), + arch_get_system_reg(env, HEX_SREG_BADVA)); + + hexagon_ssr_set_cause(env, env->cause_code); + set_addresses(env, 0, cs->exception_index); + break; + + default: + cpu_abort(cs, + "1:Hexagon exception %d/0x%x: " + "Unknown cause code %d/0x%x\n", + cs->exception_index, cs->exception_index, env->cause_code, + env->cause_code); + break; + } + break; + + case HEX_EVENT_TLB_MISS_RW: + switch (env->cause_code) { + case HEX_CAUSE_TLBMISSRW_CAUSE_READ: + case HEX_CAUSE_TLBMISSRW_CAUSE_WRITE: + qemu_log_mask(CPU_LOG_MMU, + "TLB miss RW exception (0x%x) caught: " + "Cause code (0x%x) " + "TID = 0x%" PRIx32 ", PC = 0x%" PRIx32 + ", BADVA = 0x%" PRIx32 "\n", + cs->exception_index, env->cause_code, env->threadId, + env->gpr[HEX_REG_PC], + arch_get_system_reg(env, HEX_SREG_BADVA)); + + hexagon_ssr_set_cause(env, env->cause_code); + set_addresses(env, 0, cs->exception_index); + /* env->sreg[HEX_SREG_BADVA] is set when the exception is raised */ + break; + + default: + cpu_abort(cs, + "2:Hexagon exception %d/0x%x: " + "Unknown cause code %d/0x%x\n", + cs->exception_index, cs->exception_index, env->cause_code, + env->cause_code); + break; + } + break; + + case HEX_EVENT_FPTRAP: + hexagon_ssr_set_cause(env, env->cause_code); + arch_set_thread_reg(env, HEX_REG_PC, + arch_get_system_reg(env, HEX_SREG_EVB) | + (cs->exception_index << 2)); + break; + + case HEX_EVENT_DEBUG: + hexagon_ssr_set_cause(env, env->cause_code); + set_addresses(env, 0, cs->exception_index); + qemu_log_mask(LOG_UNIMP, "single-step exception is not handled\n"); + break; + + case HEX_EVENT_PRECISE: + switch (env->cause_code) { + case HEX_CAUSE_FETCH_NO_XPAGE: + case HEX_CAUSE_FETCH_NO_UPAGE: + case HEX_CAUSE_PRIV_NO_READ: + case HEX_CAUSE_PRIV_NO_UREAD: + case HEX_CAUSE_PRIV_NO_WRITE: + case HEX_CAUSE_PRIV_NO_UWRITE: + case HEX_CAUSE_MISALIGNED_LOAD: + case HEX_CAUSE_MISALIGNED_STORE: + case HEX_CAUSE_PC_NOT_ALIGNED: + qemu_log_mask(CPU_LOG_MMU, + "MMU permission exception (0x%x) caught: " + "Cause code (0x%x) " + "TID = 0x%" PRIx32 ", PC = 0x%" PRIx32 + ", BADVA = 0x%" PRIx32 "\n", + cs->exception_index, 
env->cause_code, env->threadId, + env->gpr[HEX_REG_PC], + arch_get_system_reg(env, HEX_SREG_BADVA)); + + + hexagon_ssr_set_cause(env, env->cause_code); + set_addresses(env, 0, cs->exception_index); + /* env->sreg[HEX_SREG_BADVA] is set when the exception is raised */ + break; + + case HEX_CAUSE_DOUBLE_EXCEPT: + case HEX_CAUSE_PRIV_USER_NO_SINSN: + case HEX_CAUSE_PRIV_USER_NO_GINSN: + case HEX_CAUSE_INVALID_OPCODE: + case HEX_CAUSE_NO_COPROC_ENABLE: + case HEX_CAUSE_NO_COPROC2_ENABLE: + case HEX_CAUSE_UNSUPORTED_HVX_64B: + case HEX_CAUSE_REG_WRITE_CONFLICT: + case HEX_CAUSE_VWCTRL_WINDOW_MISS: + hexagon_ssr_set_cause(env, env->cause_code); + set_addresses(env, 0, cs->exception_index); + break; + + case HEX_CAUSE_COPROC_LDST: + hexagon_ssr_set_cause(env, env->cause_code); + set_addresses(env, 0, cs->exception_index); + break; + + case HEX_CAUSE_STACK_LIMIT: + hexagon_ssr_set_cause(env, env->cause_code); + set_addresses(env, 0, cs->exception_index); + break; + + default: + cpu_abort(cs, + "3:Hexagon exception %d/0x%x: " + "Unknown cause code %d/0x%x\n", + cs->exception_index, cs->exception_index, env->cause_code, + env->cause_code); + break; + } + break; + + case HEX_EVENT_IMPRECISE: + qemu_log_mask(LOG_UNIMP, + "Imprecise exception: this case is not yet handled"); + break; + + default: + qemu_log_mask(LOG_UNIMP, + "Hexagon Unsupported exception 0x%x/0x%x\n", + cs->exception_index, env->cause_code); + break; + } + + cs->exception_index = HEX_EVENT_NONE; +} + +void register_trap_exception(CPUHexagonState *env, int traptype, int imm, + target_ulong PC) +{ + CPUState *cs = env_cpu(env); + + cs->exception_index = (traptype == 0) ? HEX_EVENT_TRAP0 : HEX_EVENT_TRAP1; + ASSERT_DIRECT_TO_GUEST_UNSET(env, cs->exception_index); + + env->cause_code = imm; + env->gpr[HEX_REG_PC] = PC; + cpu_loop_exit(cs); +} +#endif diff --git a/target/hexagon/hexswi.h b/target/hexagon/hexswi.h new file mode 100644 index 000000000000..5d232cb06cb0 --- /dev/null +++ b/target/hexagon/hexswi.h @@ -0,0 +1,17 @@ +/* + * Copyright(c) 2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HEXSWI_H +#define HEXSWI_H + + +#include "cpu.h" + +void hexagon_cpu_do_interrupt(CPUState *cpu); +void register_trap_exception(CPUHexagonState *env, int type, int imm, + target_ulong PC); + +#endif /* HEXSWI_H */ diff --git a/target/hexagon/idef-parser/README.rst b/target/hexagon/idef-parser/README.rst index 7199177ee33e..235e3debee3c 100644 --- a/target/hexagon/idef-parser/README.rst +++ b/target/hexagon/idef-parser/README.rst @@ -637,7 +637,7 @@ tinycode for the Hexagon ``add`` instruction :: ---- 00021094 - mov_i32 pkt_has_store_s1,$0x0 + mov_i32 pkt_has_scalar_store_s1,$0x0 add_i32 tmp0,r2,r2 mov_i32 loc2,tmp0 mov_i32 new_r1,loc2 diff --git a/target/hexagon/idef-parser/parser-helpers.c b/target/hexagon/idef-parser/parser-helpers.c index a7dcd85fe43d..3316c230f8a5 100644 --- a/target/hexagon/idef-parser/parser-helpers.c +++ b/target/hexagon/idef-parser/parser-helpers.c @@ -1725,7 +1725,7 @@ void gen_cancel(Context *c, YYLTYPE *locp) void gen_load_cancel(Context *c, YYLTYPE *locp) { - OUT(c, locp, "if (insn->slot == 0 && pkt->pkt_has_store_s1) {\n"); + OUT(c, locp, "if (insn->slot == 0 && pkt->pkt_has_scalar_store_s1) {\n"); OUT(c, locp, "ctx->s1_store_processed = false;\n"); OUT(c, locp, "process_store(ctx, 1);\n"); OUT(c, locp, "}\n"); @@ -1750,7 +1750,7 @@ void gen_load(Context *c, YYLTYPE *locp, HexValue *width, /* Lookup the effective address EA */ find_variable(c, locp, ea, ea); - OUT(c, locp, "if (insn->slot == 0 && pkt->pkt_has_store_s1) {\n"); + OUT(c, locp, "if (insn->slot == 0 && pkt->pkt_has_scalar_store_s1) {\n"); OUT(c, locp, "probe_noshuf_load(", ea, ", ", width, ", ctx->mem_idx);\n"); OUT(c, locp, "process_store(ctx, 1);\n"); OUT(c, locp, "}\n"); diff --git a/target/hexagon/imported/encode_pp.def b/target/hexagon/imported/encode_pp.def index 0cd30a5e8575..2c45388ab629 100644 --- a/target/hexagon/imported/encode_pp.def +++ b/target/hexagon/imported/encode_pp.def @@ -1,5 +1,5 @@ /* - * Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved. + * Copyright(c) 2019-2020 Qualcomm Innovation Center, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ */ /* + * encode32.def * Encodings for 32 bit instructions * */ @@ -341,6 +342,8 @@ DEF_ENC32(L4_pload##TAG##tnew_abs,ICLASS_LD" 1 11 "OPC" iiiii PP110tti 1--ddd DEF_ENC32(L4_pload##TAG##fnew_abs,ICLASS_LD" 1 11 "OPC" iiiii PP111tti 1--ddddd") + + /* 0 000 misc: dealloc,loadw_locked,dcfetch */ STD_LD_ENC(bzw4,"0 101") STD_LD_ENC(bzw2,"0 011") @@ -375,6 +378,7 @@ DEF_ANTICLASS32(ICLASS_LD" 1110 000----- PP------ --------",LD_ADDR_POST_REG) DEF_ENC32(L2_deallocframe, ICLASS_LD" 000 0 000 sssss PP0----- ---ddddd") DEF_ENC32(L4_return, ICLASS_LD" 011 0 000 sssss PP0000-- ---ddddd") + DEF_ENC32(L4_return_t, ICLASS_LD" 011 0 000 sssss PP0100vv ---ddddd") DEF_ENC32(L4_return_f, ICLASS_LD" 011 0 000 sssss PP1100vv ---ddddd") DEF_ENC32(L4_return_tnew_pt, ICLASS_LD" 011 0 000 sssss PP0110vv ---ddddd") @@ -382,15 +386,19 @@ DEF_ENC32(L4_return_fnew_pt, ICLASS_LD" 011 0 000 sssss PP1110vv ---ddddd") DEF_ENC32(L4_return_tnew_pnt, ICLASS_LD" 011 0 000 sssss PP0010vv ---ddddd") DEF_ENC32(L4_return_fnew_pnt, ICLASS_LD" 011 0 000 sssss PP1010vv ---ddddd") -DEF_ENC32(L2_loadw_locked,ICLASS_LD" 001 0 000 sssss PP000--- 000ddddd") - +/** Load Acquire Store Release Encoding **/ +DEF_ENC32(L4_loadw_phys, ICLASS_LD" 001 0 000 sssss PP1ttttt -00ddddd") +DEF_ENC32(L2_loadw_locked, ICLASS_LD" 001 0 000 sssss PP000--- 000ddddd") +DEF_ENC32(L4_loadd_locked, ICLASS_LD" 001 0 000 sssss PP010--- 000ddddd") DEF_ENC32(L2_loadw_aq, ICLASS_LD" 001 0 000 sssss PP001--- 000ddddd") DEF_ENC32(L4_loadd_aq, ICLASS_LD" 001 0 000 sssss PP011--- 000ddddd") -DEF_ENC32(R6_release_at_vi, ICLASS_ST" 000 01 11sssss PP0ttttt --0011dd") -DEF_ENC32(R6_release_st_vi, ICLASS_ST" 000 01 11sssss PP0ttttt --1011dd") + +DEF_ENC32(S2_storew_locked, ICLASS_ST" 000 01 01sssss PP-ttttt ----00dd") +DEF_ENC32(S4_stored_locked, ICLASS_ST" 000 01 11sssss PP0ttttt ----00dd") + DEF_ENC32(S2_storew_rl_at_vi, ICLASS_ST" 000 01 01sssss PP-ttttt --0010dd") DEF_ENC32(S2_storew_rl_st_vi, ICLASS_ST" 000 01 01sssss PP-ttttt --1010dd") @@ -398,13 +406,11 @@ DEF_ENC32(S2_storew_rl_st_vi, ICLASS_ST" 000 01 01sssss PP-ttttt --1010dd") DEF_ENC32(S4_stored_rl_at_vi, ICLASS_ST" 000 01 11sssss PP0ttttt --0010dd") DEF_ENC32(S4_stored_rl_st_vi, ICLASS_ST" 000 01 11sssss PP0ttttt --1010dd") -DEF_ENC32(L4_loadd_locked,ICLASS_LD" 001 0 000 sssss PP010--- 000ddddd") -DEF_EXT_SPACE(EXTRACTW, ICLASS_LD" 001 0 000 iiiii PP0iiiii -01iiiii") -DEF_ENC32(Y2_dcfetchbo, ICLASS_LD" 010 0 000 sssss PP0--iii iiiiiiii") - - - +DEF_ENC32(R6_release_at_vi, ICLASS_ST" 000 01 11sssss PP0ttttt --0011dd") +DEF_ENC32(R6_release_st_vi, ICLASS_ST" 000 01 11sssss PP0ttttt --1011dd") +DEF_EXT_SPACE(EXTRACTW, ICLASS_LD" 001 0 000 iiiii PP0iiiii 001iiiii") +DEF_ENC32(Y2_dcfetchbo, ICLASS_LD" 010 0 000 sssss PP0--iii iiiiiiii") @@ -488,13 +494,17 @@ STD_PST_ENC(rinew, "1 101","10ttt") /* x bus/cache */ /* x store/cache */ DEF_ENC32(S2_allocframe, ICLASS_ST" 000 01 00xxxxx PP000iii iiiiiiii") -DEF_ENC32(S2_storew_locked,ICLASS_ST" 000 01 01sssss PP-ttttt ----00dd") -DEF_ENC32(S4_stored_locked,ICLASS_ST" 000 01 11sssss PP0ttttt ----00dd") +DEF_ENC32(Y5_l2locka, ICLASS_ST" 000 01 11sssss PP1----- ------dd") DEF_ENC32(Y2_dczeroa, ICLASS_ST" 000 01 10sssss PP0----- --------") -DEF_ENC32(Y2_barrier, ICLASS_ST" 100 00 00----- PP------ 000-----") +DEF_ENC32(Y2_barrier, ICLASS_ST" 100 00 00----- PP------ 000-----") DEF_ENC32(Y2_syncht, ICLASS_ST" 100 00 
10----- PP------ --------") +DEF_ENC32(Y2_l2kill, ICLASS_ST" 100 00 01----- PP-000-- --------") +DEF_ENC32(Y5_l2gunlock, ICLASS_ST" 100 00 01----- PP-010-- --------") +DEF_ENC32(Y5_l2gclean, ICLASS_ST" 100 00 01----- PP-100-- --------") +DEF_ENC32(Y5_l2gcleaninv, ICLASS_ST" 100 00 01----- PP-110-- --------") +DEF_ENC32(Y2_l2cleaninvidx,ICLASS_ST" 100 00 11sssss PP------ --------") @@ -502,9 +512,34 @@ DEF_ENC32(Y2_dccleana, ICLASS_ST" 000 00 00sssss PP------ --------") DEF_ENC32(Y2_dcinva, ICLASS_ST" 000 00 01sssss PP------ --------") DEF_ENC32(Y2_dccleaninva, ICLASS_ST" 000 00 10sssss PP------ --------") -DEF_ENC32(Y4_l2fetch, ICLASS_ST" 011 00 00sssss PP-ttttt 000-----") +/* Super */ +DEF_ENC32(Y2_dckill, ICLASS_ST" 001 00 00----- PP------ --------") +DEF_ENC32(Y2_dccleanidx, ICLASS_ST" 001 00 01sssss PP------ --------") +DEF_ENC32(Y2_dcinvidx, ICLASS_ST" 001 00 10sssss PP------ --------") +DEF_ENC32(Y2_dccleaninvidx,ICLASS_ST" 001 00 11sssss PP------ --------") + +DEF_ENC32(Y2_dctagw ,ICLASS_ST" 010 00 00sssss PP-ttttt --------") +DEF_ENC32(Y2_dctagr ,ICLASS_ST" 010 00 01sssss PP------ ---ddddd") + +DEF_ENC32(Y4_l2tagw ,ICLASS_ST" 010 00 10sssss PP0ttttt --------") +DEF_ENC32(Y4_l2tagr ,ICLASS_ST" 010 00 11sssss PP------ ---ddddd") + +DEF_ENC32(Y4_l2fetch, ICLASS_ST" 011 00 00sssss PP-ttttt 000-----") +DEF_ENC32(Y5_l2cleanidx, ICLASS_ST" 011 00 01sssss PP------ --------") +DEF_ENC32(Y5_l2invidx, ICLASS_ST" 011 00 10sssss PP------ --------") +DEF_ENC32(Y5_l2unlocka, ICLASS_ST" 011 00 11sssss PP------ --------") DEF_ENC32(Y5_l2fetch, ICLASS_ST" 011 01 00sssss PP-ttttt --------") +DEF_ENC32(Y6_l2gcleanpa, ICLASS_ST" 011 01 01----- PP-ttttt --------") +DEF_ENC32(Y6_l2gcleaninvpa,ICLASS_ST" 011 01 10----- PP-ttttt --------") + + + + + + + + /*******************************/ /* */ /* */ @@ -547,13 +582,23 @@ DEF_ENC32(J2_jumprfnewpt, ICLASS_J" 0011 011sssss PP-11-uu --------") DEF_FIELDROW_DESC32(ICLASS_J" 0100 -------- PP------ --------","[#4] (#u8) ") DEF_ENC32(J2_trap0, ICLASS_J" 0100 00------ PP-iiiii ---iii--") -DEF_ENC32(J2_pause, ICLASS_J" 0100 01------ PP-iiiii ---iii--") +DEF_ENC32(J2_trap1, ICLASS_J" 0100 10-xxxxx PP-iiiii ---iii--") +DEF_ENC32(J2_pause, ICLASS_J" 0100 01----ii PP-iiiii ---iii--") + +DEF_FIELDROW_DESC32(ICLASS_J" 0101 -------- PP------ --------","[#5] Rd=(Rs) ") +DEF_ENC32(Y2_icdatar, ICLASS_J" 0101 101sssss PP------ ---ddddd") +DEF_ENC32(Y2_ictagr, ICLASS_J" 0101 111sssss PP------ ---ddddd") +DEF_ENC32(Y2_ictagw, ICLASS_J" 0101 110sssss PP0ttttt --------") +DEF_ENC32(Y2_icdataw, ICLASS_J" 0101 110sssss PP1ttttt --------") DEF_FIELDROW_DESC32(ICLASS_J" 0110 -------- PP------ --------","[#6] icop(Rs) ") DEF_ENC32(Y2_icinva, ICLASS_J" 0110 110sssss PP000--- --------") +DEF_ENC32(Y2_icinvidx, ICLASS_J" 0110 110sssss PP001--- --------") +DEF_ENC32(Y2_ickill, ICLASS_J" 0110 110----- PP010--- --------") DEF_FIELDROW_DESC32(ICLASS_J" 0111 -------- PP------ --------","[#7] () ") DEF_ENC32(Y2_isync, ICLASS_J" 0111 11000000 PP0---00 00000010") +DEF_ENC32(J2_rte, ICLASS_J" 0111 111----- PP00---- 000-----") /* JUMP */ DEF_FIELDROW_DESC32(ICLASS_J" 100- -------- PP------ --------","[#8,9] PC=(#r22)") @@ -591,7 +636,6 @@ DEF_ENC32(J2_callf, ICLASS_J" 1101 ii1iiiii PPi-0-uu iiiiiii-") /*******************************/ -/* EJP: this has to match what we have in htmldocs.py... 
so I will call it CJ, we can change it */ DEF_CLASS32(ICLASS_CJ" 0--- -------- PP------ --------",CJ) DEF_FIELDROW_DESC32(ICLASS_CJ" 00-- -------- -------- --------","[#0-3] pd=cmp.xx(R,#u5) ; if ([!]p0.new) jump:[h] #s9:2 ") @@ -738,12 +782,30 @@ DEF_ENC32(J2_jumprltezpt,ICLASS_CR" 0001 11isssss PPi1iiii iiiiiii-") DEF_FIELDROW_DESC32( ICLASS_CR" 0010 -------- PP------ --------","[#2] Cd=Rs ") DEF_ENC32(A2_tfrrcr, ICLASS_CR" 0010 001sssss PP------ ---ddddd") +DEF_ENC32(G4_tfrgrcr, ICLASS_CR" 0010 000sssss PP------ ---ddddd") +DEF_ENC32(Y4_trace, ICLASS_CR" 0010 010sssss PP------ 000-----") +DEF_ENC32(Y6_diag, ICLASS_CR" 0010 010sssss PP------ 001-----") +DEF_ENC32(Y6_diag0, ICLASS_CR" 0010 010sssss PP-ttttt 010-----") +DEF_ENC32(Y6_diag1, ICLASS_CR" 0010 010sssss PP-ttttt 011-----") + +DEF_ENC32(Y6_dmcfgrd,"10101000000sssssPP------101ddddd") +DEF_ENC32(Y6_dmcfgwr,"10101000000sssssPP-ttttt110-----") +DEF_ENC32(Y6_dmlink,"10100110000sssssPP-ttttt010-----") +DEF_ENC32(Y6_dmpause,"10101000000-----PP------011ddddd") +DEF_ENC32(Y6_dmpoll,"10101000000-----PP------010ddddd") +DEF_ENC32(Y6_dmresume,"10100110000sssssPP------100-----") +DEF_ENC32(Y6_dmstart,"10100110000sssssPP------001-----") +DEF_ENC32(Y6_dmsyncht,"10101000000-----PP-----0111ddddd") +DEF_ENC32(Y6_dmtlbsynch,"10101000000-----PP-----1111ddddd") +DEF_ENC32(Y6_dmwait,"10101000000-----PP------001ddddd") DEF_FIELDROW_DESC32( ICLASS_CR" 0011 -------- PP------ --------","[#3] Cdd=Rss ") DEF_ENC32(A4_tfrpcp, ICLASS_CR" 0011 001sssss PP------ ---ddddd") +DEF_ENC32(G4_tfrgpcp, ICLASS_CR" 0011 000sssss PP------ ---ddddd") DEF_FIELDROW_DESC32( ICLASS_CR" 1000 -------- PP------ --------","[#8] Rdd=Css ") DEF_ENC32(A4_tfrcpp, ICLASS_CR" 1000 000sssss PP------ ---ddddd") +DEF_ENC32(G4_tfrgcpp, ICLASS_CR" 1000 001sssss PP------ ---ddddd") DEF_FIELDROW_DESC32( ICLASS_CR" 1001 -------- PP------ --------","[#9] (#r8,#U10)") DEF_ENC32(J2_ploop1si, ICLASS_CR" 1001 101IIIII PP-iiiii IIIii-II") @@ -754,6 +816,7 @@ DEF_ENC32(J2_loop1i, ICLASS_CR" 1001 001IIIII PP-iiiii IIIii-II") DEF_FIELDROW_DESC32( ICLASS_CR" 1010 -------- PP------ --------","[#10] Rd=Cs ") DEF_ENC32(A2_tfrcrr, ICLASS_CR" 1010 000sssss PP------ ---ddddd") +DEF_ENC32(G4_tfrgcrr, ICLASS_CR" 1010 001sssss PP------ ---ddddd") DEF_ENC32(C4_addipc, ICLASS_CR" 1010 01001001 PP-iiiii i--ddddd") @@ -776,8 +839,66 @@ DEF_ENC32(C4_and_orn, ICLASS_CR" 1011 1011--ss PP0---tt uu----dd") DEF_ENC32(C4_or_andn, ICLASS_CR" 1011 1101--ss PP0---tt uu----dd") DEF_ENC32(C4_or_orn, ICLASS_CR" 1011 1111--ss PP0---tt uu----dd") -DEF_ENC32(C4_fastcorner9, ICLASS_CR"1011 0000--ss PP1---tt 1--1--dd") -DEF_ENC32(C4_fastcorner9_not, ICLASS_CR"1011 0001--ss PP1---tt 1--1--dd") +DEF_ENC32(C4_fastcorner9, ICLASS_CR"1011 0000--ss PP1---tt 1--1--dd") +DEF_ENC32(C4_fastcorner9_not, ICLASS_CR"1011 0001--ss PP1---tt 1--1--dd") + + + +/* Supervisor CR ops */ +/* Interrupts */ +DEF_FIELDROW_DESC32( ICLASS_CR" 0100 -------- PP------ --------","[#4] (Rs,Pt)") +DEF_ENC32(Y2_swi, ICLASS_CR" 0100 000sssss PP------ 000-----") +DEF_ENC32(Y2_cswi, ICLASS_CR" 0100 000sssss PP------ 001-----") +DEF_ENC32(Y2_iassignw, ICLASS_CR" 0100 000sssss PP------ 010-----") +DEF_ENC32(Y2_ciad, ICLASS_CR" 0100 000sssss PP------ 011-----") +DEF_ENC32(Y2_setimask, ICLASS_CR" 0100 100sssss PP----tt 000-----") +DEF_ENC32(Y2_setprio, ICLASS_CR" 0100 100sssss PP----tt 001-----") +DEF_ENC32(Y4_siad, ICLASS_CR" 0100 100sssss PP------ 011-----") + +DEF_ENC32(Y2_wait, ICLASS_CR" 0100 010sssss PP------ 000-----") +DEF_ENC32(Y2_resume, ICLASS_CR" 0100 010sssss 
PP------ 001-----") +DEF_ENC32(Y2_stop, ICLASS_CR" 0100 011sssss PP------ 000-----") +DEF_ENC32(Y2_start, ICLASS_CR" 0100 011sssss PP------ 001-----") +DEF_ENC32(Y4_nmi, ICLASS_CR" 0100 011sssss PP------ 010-----") + +DEF_FIELDROW_DESC32( ICLASS_CR" 0101 -------- PP------ --------","[#5] Rx ") +DEF_ENC32(Y2_crswap0, ICLASS_CR" 0101 000xxxxx PP------ --------") +DEF_ENC32(Y4_crswap1, ICLASS_CR" 0101 001xxxxx PP------ --------") + +DEF_FIELDROW_DESC32( ICLASS_CR" 0110 -------- PP------ --------","[#6] Rd=(Rs)") +DEF_ENC32(Y2_getimask, ICLASS_CR" 0110 000sssss PP------ ---ddddd") +DEF_ENC32(Y2_iassignr, ICLASS_CR" 0110 011sssss PP------ ---ddddd") + +DEF_FIELDROW_DESC32( ICLASS_CR" 0111 -------- PP------ --------","[#7] cr=Rs ") +DEF_ENC32(Y2_tfrsrcr, ICLASS_CR" 0111 00-sssss PP------ -ddddddd") + +DEF_FIELDROW_DESC32( ICLASS_CR" 1100 -------- PP------ --------","[#12] ") +DEF_ENC32(Y2_break, ICLASS_CR" 1100 001----- PP------ 000-----") +DEF_ENC32(Y2_tlblock, ICLASS_CR" 1100 001----- PP------ 001-----") +DEF_ENC32(Y2_tlbunlock,ICLASS_CR" 1100 001----- PP------ 010-----") +DEF_ENC32(Y2_k0lock, ICLASS_CR" 1100 001----- PP------ 011-----") +DEF_ENC32(Y2_k0unlock, ICLASS_CR" 1100 001----- PP------ 100-----") +DEF_ENC32(Y2_tlbp, ICLASS_CR" 1100 100sssss PP------ ---ddddd") +DEF_ENC32(Y5_tlboc, ICLASS_CR" 1100 111sssss PP------ ---ddddd") +DEF_ENC32(Y5_tlbasidi, ICLASS_CR" 1100 101sssss PP------ --------") +DEF_ENC32(Y2_tlbr, ICLASS_CR" 1100 010sssss PP------ ---ddddd") +DEF_ENC32(Y2_tlbw, ICLASS_CR" 1100 000sssss PP0ttttt --------") +DEF_ENC32(Y5_ctlbw, ICLASS_CR" 1100 110sssss PP0ttttt ---ddddd") + +DEF_FIELDROW_DESC32( ICLASS_CR" 1101 -------- PP------ --------","[#13] Rxx ") +DEF_ENC32(Y4_crswap10, ICLASS_CR" 1101 10-xxxxx PP------ ---00000") +DEF_ENC32(Y4_tfrspcp, ICLASS_CR" 1101 00-sssss PP------ -ddddddd") + +DEF_FIELDROW_DESC32( ICLASS_CR" 1110 -------- PP------ --------","[#14] Rd=cr ") +DEF_ENC32(Y2_tfrscrr, ICLASS_CR" 1110 1sssssss PP------ ---ddddd") + +DEF_FIELDROW_DESC32( ICLASS_CR" 1111 -------- PP------ --------","[#15] Rdd=Sss ") +DEF_ENC32(Y4_tfrscpp, ICLASS_CR" 1111 0sssssss PP------ ---ddddd") + + + + + @@ -956,9 +1077,9 @@ MPY_ENC(F2_dfmin, "1000","ddddd","0","0","1","1","11") MPY_ENC(F2_dfmax, "1000","ddddd","0","1","0","0","11") MPY_ENC(F2_dfmpyll, "1000","ddddd","0","1","0","1","11") -MPY_ENC(M7_dcmpyrw, "1000","ddddd","0","0","0","1","10") +MPY_ENC(M7_dcmpyrw, "1000","ddddd","0","0","0","1","10") MPY_ENC(M7_dcmpyrwc, "1000","ddddd","0","0","1","1","10") -MPY_ENC(M7_dcmpyiw, "1000","ddddd","0","1","1","0","10") +MPY_ENC(M7_dcmpyiw, "1000","ddddd","0","1","1","0","10") MPY_ENC(M7_dcmpyiwc, "1000","ddddd","0","1","1","1","10") @@ -967,14 +1088,14 @@ DEF_FIELDROW_DESC32(ICLASS_M" 1001 -------- PP------ --------","[#9] Rd=(Rss,Rtt MPY_ENC(M2_vdmpyrs_s0, "1001","ddddd","0","0","0","0","00") MPY_ENC(M2_vdmpyrs_s1, "1001","ddddd","0","0","0","1","00") -MPY_ENC(M7_wcmpyrw, "1001","ddddd","0","0","1","0","00") +MPY_ENC(M7_wcmpyrw, "1001","ddddd","0","0","1","0","00") MPY_ENC(M7_wcmpyrw_rnd, "1001","ddddd","0","0","1","1","00") -MPY_ENC(M7_wcmpyiw, "1001","ddddd","0","1","0","0","00") +MPY_ENC(M7_wcmpyiw, "1001","ddddd","0","1","0","0","00") MPY_ENC(M7_wcmpyiw_rnd, "1001","ddddd","0","1","0","1","00") -MPY_ENC(M7_wcmpyrwc, "1001","ddddd","0","1","1","0","00") +MPY_ENC(M7_wcmpyrwc, "1001","ddddd","0","1","1","0","00") MPY_ENC(M7_wcmpyrwc_rnd, "1001","ddddd","0","1","1","1","00") -MPY_ENC(M7_wcmpyiwc, "1001","ddddd","1","0","0","0","00") +MPY_ENC(M7_wcmpyiwc, 
"1001","ddddd","1","0","0","0","00") MPY_ENC(M7_wcmpyiwc_rnd, "1001","ddddd","1","0","0","1","00") @@ -1030,10 +1151,10 @@ MPY_ENC(F2_dfmpylh, "1010","xxxxx","0","0","0","0","11") MPY_ENC(F2_dfmpyhh, "1010","xxxxx","0","0","0","1","11") -MPY_ENC(M7_dcmpyrw_acc, "1010","xxxxx","0","0","0","1","10") -MPY_ENC(M7_dcmpyrwc_acc, "1010","xxxxx","0","0","1","1","10") -MPY_ENC(M7_dcmpyiw_acc, "1010","xxxxx","0","1","1","0","10") -MPY_ENC(M7_dcmpyiwc_acc, "1010","xxxxx","1","0","1","0","10") +MPY_ENC(M7_dcmpyrw_acc, "1010","xxxxx","0","0","0","1","10") +MPY_ENC(M7_dcmpyrwc_acc, "1010","xxxxx","0","0","1","1","10") +MPY_ENC(M7_dcmpyiw_acc, "1010","xxxxx","0","1","1","0","10") +MPY_ENC(M7_dcmpyiwc_acc, "1010","xxxxx","1","0","1","0","10") @@ -1063,7 +1184,6 @@ SP_MPY(M2_mpy_sat_rnd, "1100","ddddd","1","1","0") SP_MPY(M2_mpyu, "1100","ddddd","0","0","1") DEF_FIELDROW_DESC32(ICLASS_M" 1101 -------- PP------ --------","[#13] Rd=(Rs,Rt)") -/* EJP: same as mpyi MPY_ENC(M2_mpyui, "1101","ddddd","0","0","1","0","00") */ MPY_ENC(M2_mpyi, "1101","ddddd","0","0","0","0","00") MPY_ENC(M2_mpy_up, "1101","ddddd","0","0","0","0","01") MPY_ENC(M2_mpyu_up, "1101","ddddd","0","0","1","0","01") @@ -1266,7 +1386,6 @@ DEF_ENC32(C2_cmovenewif,ICLASS_ALU2op" 1110 1uu0iiii PP1iiiii iiiddddd") DEF_ENC32(C2_cmoveit, ICLASS_ALU2op" 1110 0uu0iiii PP0iiiii iiiddddd") DEF_ENC32(C2_cmoveif, ICLASS_ALU2op" 1110 1uu0iiii PP0iiiii iiiddddd") - DEF_FIELDROW_DESC32( ICLASS_ALU2op" 1111 -------- PP------ --------","[#15] nop") DEF_ENC32(A2_nop, ICLASS_ALU2op" 1111 -------- PP------ --------") @@ -1408,9 +1527,6 @@ DEF_FIELDROW_DESC32(ICLASS_ALU3op" 1110 -------- PP------ --------","[#14] Rese - - - /*******************************/ /* */ /* */ @@ -1508,7 +1624,6 @@ SH_RRI6_ENC(S6_rol_i_##TAGEND,MAJ4,MIN3,SMOD1 "11",DSTCHARS) DEF_FIELDROW_DESC32(ICLASS_S2op" 0000 -------- PP------ --------","[#0] Rdd=(Rss,#u6)") -/* EJP: there is actually quite a bit of space here, look at the reserved bits */ I6SHIFTTYPES(p, "0000","000","0","ddddd") I5SHIFTTYPES_NOROL(vw, "0000","010","0","ddddd") I4SHIFTTYPES(vh, "0000","100","0","ddddd") @@ -1620,8 +1735,8 @@ SH2_RR_ENC(A2_roundsat, "1000","110","-","001","ddddd") SH_RRI5_ENC(S2_asr_i_svw_trun, "1000","110", "010","ddddd") SH_RRI5_ENC(A4_bitspliti, "1000","110", "100","ddddd") -SH_RRI5_ENC(A7_clip, "1000","110", "101","ddddd") -SH_RRI5_ENC(A7_vclip, "1000","110", "110","ddddd") +SH_RRI5_ENC(A7_clip, "1000","110", "101","ddddd") +SH_RRI5_ENC(A7_vclip, "1000","110", "110","ddddd") SH2_RR_ENC(S4_clbpnorm, "1000","011","-","000","ddddd") @@ -1743,10 +1858,11 @@ SH_RRR_ENC(S2_shuffob, "0001","00-","-","10-","ddddd") SH_RRR_ENC(S2_shuffeh, "0001","00-","-","11-","ddddd") SH_RRR_ENC(S2_shuffoh, "0001","10-","-","000","ddddd") +// 001 SH_RRR_ENC(S2_vtrunewh, "0001","10-","-","010","ddddd") -SH_RRR_ENC(S6_vtrunehb_ppp, "0001","10-","-","011","ddddd") +SH_RRR_ENC(S6_vtrunehb_ppp, "0001","10-","-","011","ddddd") SH_RRR_ENC(S2_vtrunowh, "0001","10-","-","100","ddddd") -SH_RRR_ENC(S6_vtrunohb_ppp, "0001","10-","-","101","ddddd") +SH_RRR_ENC(S6_vtrunohb_ppp, "0001","10-","-","101","ddddd") SH_RRR_ENC(S2_lfsp, "0001","10-","-","110","ddddd") SH_RRR_ENC(S4_vxaddsubw, "0001","01-","-","000","ddddd") @@ -1780,8 +1896,6 @@ SH_RRR_ENC(S4_vrcrotate, "0011","11-","i","11i","ddddd") DEF_FIELDROW_DESC32(ICLASS_S3op" 0100 -------- PP------ --------","[#4] Rd=(Rs,Rt,#u3)") DEF_ENC32(S2_addasl_rrri, ICLASS_S3op" 0100 000 sssss PP0ttttt iiiddddd") - - DEF_FIELDROW_DESC32(ICLASS_S3op" 0101 -------- PP------ --------","[#5] 
Rd=(Rss,Rt)") SH_RRR_ENC(S2_asr_r_svw_trun, "0101","---","-","010","ddddd") SH_RRR_ENC(M4_cmpyi_wh, "0101","---","-","100","ddddd") @@ -1841,6 +1955,7 @@ DEF_FIELDROW_DESC32(ICLASS_S3op" 1010 -------- PP------ --------","[#10] Rxx=(Rs SH_RRR_ENC(S2_insertp_rp, "1010","0--","0","---","xxxxx") SH_RRR_ENC(M4_xor_xacc, "1010","10-","0","000","xxxxx") + DEF_FIELDROW_DESC32(ICLASS_S3op" 1011 -------- PP------ --------","[#11] Rxx=(Rss,Rt)") RSHIFTTYPES(p_or, "1011","000","-","-","xxxxx") RSHIFTTYPES(p_and, "1011","010","-","-","xxxxx") @@ -1848,19 +1963,19 @@ RSHIFTTYPES(p_nac, "1011","100","-","-","xxxxx") RSHIFTTYPES(p_acc, "1011","110","-","-","xxxxx") RSHIFTTYPES(p_xor, "1011","011","-","-","xxxxx") -SH_RRR_ENCX(A4_vrmaxh, "1011","001","0","001","uuuuu") -SH_RRR_ENCX(A4_vrmaxuh, "1011","001","1","001","uuuuu") -SH_RRR_ENCX(A4_vrmaxw, "1011","001","0","010","uuuuu") -SH_RRR_ENCX(A4_vrmaxuw, "1011","001","1","010","uuuuu") +SH_RRR_ENCX(A4_vrmaxh, "1011","001","0","001","uuuuu") +SH_RRR_ENCX(A4_vrmaxuh, "1011","001","1","001","uuuuu") +SH_RRR_ENCX(A4_vrmaxw, "1011","001","0","010","uuuuu") +SH_RRR_ENCX(A4_vrmaxuw, "1011","001","1","010","uuuuu") -SH_RRR_ENCX(A4_vrminh, "1011","001","0","101","uuuuu") -SH_RRR_ENCX(A4_vrminuh, "1011","001","1","101","uuuuu") -SH_RRR_ENCX(A4_vrminw, "1011","001","0","110","uuuuu") -SH_RRR_ENCX(A4_vrminuw, "1011","001","1","110","uuuuu") +SH_RRR_ENCX(A4_vrminh, "1011","001","0","101","uuuuu") +SH_RRR_ENCX(A4_vrminuh, "1011","001","1","101","uuuuu") +SH_RRR_ENCX(A4_vrminw, "1011","001","0","110","uuuuu") +SH_RRR_ENCX(A4_vrminuw, "1011","001","1","110","uuuuu") -SH_RRR_ENC(S2_vrcnegh, "1011","001","1","111","xxxxx") +SH_RRR_ENC(S2_vrcnegh, "1011","001","1","111","xxxxx") -SH_RRR_ENC(S4_vrcrotate_acc, "1011","101","i","--i","xxxxx") +SH_RRR_ENC(S4_vrcrotate_acc, "1011","101","i","--i","xxxxx") DEF_FIELDROW_DESC32(ICLASS_S3op" 1100 -------- PP------ --------","[#12] Rx=(Rs,Rt)") @@ -1874,11 +1989,6 @@ DEF_FIELDROW_DESC32(ICLASS_S3op" 1101 -------- PP------ --------","[#13] Reserve DEF_FIELDROW_DESC32(ICLASS_S3op" 1110 -------- PP------ --------","[#14] Reserved") -DEF_FIELDROW_DESC32(ICLASS_S3op" 1111 -------- PP------ --------","[#14] User Instruction") - - - - @@ -2129,3 +2239,5 @@ OP_OPI_RI(lsr,"1") DEF_FIELDROW_DESC32(ICLASS_ALU64" 1111 -------- PP------ --------","[#15] Rd=(Rs,Ru,#u6:2)") DEF_ENC32(M4_mpyri_addr_u2, ICLASS_ALU64" 1111 0ii sssss PPiddddd iiiuuuuu") DEF_ENC32(M4_mpyri_addr, ICLASS_ALU64" 1111 1ii sssss PPiddddd iiiuuuuu") + + diff --git a/target/hexagon/imported/ldst.idef b/target/hexagon/imported/ldst.idef index 53198176a994..4e1e5d5326dd 100644 --- a/target/hexagon/imported/ldst.idef +++ b/target/hexagon/imported/ldst.idef @@ -203,6 +203,9 @@ Q6INSN(S2_storew_locked,"memw_locked(Rs32,Pd4)=Rt32", ATTRIBS(A_REGWRSIZE_4B,A_M Q6INSN(L4_loadd_locked,"Rdd32=memd_locked(Rs32)", ATTRIBS(A_REGWRSIZE_8B,A_MEMSIZE_8B,A_LOAD,A_RESTRICT_SLOT0ONLY,A_RESTRICT_PACKET_AXOK,A_NOTE_AXOK), "Load double with lock", { fEA_REG(RsV); fLOAD_LOCKED(1,8,u,EA,RddV) }) +Q6INSN(L4_loadw_phys,"Rd32=memw_phys(Rs32,Rt32)", ATTRIBS(A_REGWRSIZE_4B,A_PRIV,A_RESTRICT_SLOT0ONLY,A_NOTE_PRIV,A_MEMSIZE_4B,A_LOAD,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET), "Load word from physical address", +{ fLOAD_PHYS(1,4,u,RsV,RtV,RdV); }) + Q6INSN(S4_stored_locked,"memd_locked(Rs32,Pd4)=Rtt32", ATTRIBS(A_REGWRSIZE_8B,A_MEMSIZE_8B,A_STORE,A_RESTRICT_SLOT0ONLY,A_RESTRICT_PACKET_AXOK,A_NOTE_AXOK,A_RESTRICT_LATEPRED,A_NOTE_LATEPRED), "Store word with lock", { fEA_REG(RsV); fSTORE_LOCKED(1,8,EA,RttV,PdV) }) 
diff --git a/target/hexagon/imported/macros.def b/target/hexagon/imported/macros.def
old mode 100755
new mode 100644
index 4bbcfdd5e194..f24f89f36126
--- a/target/hexagon/imported/macros.def
+++ b/target/hexagon/imported/macros.def
@@ -353,6 +353,12 @@ DEF_MACRO(
 ()
 )
 
+DEF_MACRO(
+    fREAD_SSR, /* read SSR register */
+    (READ_RREG(REG_SSR)), /* behavior */
+    ()
+)
+
 DEF_MACRO(
 fWRITE_LR, /* write lr */
 WRITE_RREG(REG_LR,A), /* behavior */
@@ -371,12 +377,36 @@ DEF_MACRO(
 (A_IMPLICIT_WRITES_SP)
 )
 
+DEF_MACRO(
+    fWRITE_GOSP, /* write gosp */
+    WRITE_RREG(REG_GOSP,A), /* behavior */
+    (A_IMPLICIT_WRITES_GOSP)
+)
+
 DEF_MACRO(
 fREAD_SP, /* read stack pointer */
 (READ_RREG(REG_SP)), /* behavior */
 ()
 )
 
+DEF_MACRO(
+    fREAD_GOSP, /* read guest other stack pointer */
+    (READ_RREG(REG_GOSP)), /* behavior */
+    ()
+)
+
+DEF_MACRO(
+    fREAD_GELR, /* read guest exception link register */
+    (READ_RREG(REG_GELR)), /* behavior */
+    ()
+)
+
+DEF_MACRO(
+    fREAD_GEVB, /* read guest event vector base */
+    (READ_RREG(REG_GEVB)), /* behavior */
+    ()
+)
+
 DEF_MACRO(
 fREAD_CSREG, /* read CS register */
 (READ_RREG(REG_CSA+N)), /* behavior */
@@ -570,6 +600,11 @@ DEF_MACRO(
 WRITE_PREG(3,VAL), /* behavior */
 (A_IMPLICIT_WRITES_P3)
 )
+DEF_MACRO(
+    fWRITE_P3_LATE, /* write Predicate 3, late */
+    {WRITE_PREG(3,VAL); fHIDE(MARK_LATE_PRED_WRITE(3))}, /* behavior */
+    (A_IMPLICIT_WRITES_P3,A_RESTRICT_LATEPRED)
+)
 
 DEF_MACRO(
 fPART1, /* write Predicate 0 */
@@ -660,6 +695,7 @@ DEF_MACRO(
 ((size8s_t)((size2s_t)(A))), /* optional attributes */
 )
 
+
 DEF_MACRO(
 fCAST2_8u, /* macro name */
 ((size8u_t)((size2u_t)(A))),
@@ -1532,18 +1568,209 @@ DEF_MACRO(fECHO,
 /* OS interface and stop/wait */
 /********************************************/
 
+DEF_MACRO(RUNNABLE_THREADS_MAX,
+    (thread->processor_ptr->runnable_threads_max),
+    ()
+)
+
+DEF_MACRO(THREAD_IS_ON,
+    ((PROC->arch_proc_options->thread_enable_mask>>TNUM) & 0x1),
+    ()
+)
+
+DEF_MACRO(THREAD_EN_MASK,
+    ((PROC->arch_proc_options->thread_enable_mask)),
+    ()
+)
+
+DEF_MACRO(READ_IMASK,
+    (((TH) >= (thread->processor_ptr->runnable_threads_max)) ? 0 : (thread->processor_ptr->thread[TH]->Regs[REG_IMASK])),
+    ()
+)
+DEF_MACRO(WRITE_IMASK,
+    if ((TH) < (thread->processor_ptr->runnable_threads_max)) { thread->processor_ptr->thread[TH]->Regs[REG_IMASK]=(VAL & reg_mutability[REG_IMASK] ); },
+    (A_IMPLICIT_WRITES_IMASK_ANYTHREAD)
+)
+
+DEF_MACRO(WRITE_PRIO,
+    {
+        if ((TH) < (thread->processor_ptr->runnable_threads_max)) {
+            size4u_t tid_reg = thread->processor_ptr->thread[TH]->Regs[REG_TID];
+            fINSERT_BITS(tid_reg, reg_field_info[STID_PRIO].width,
+                         reg_field_info[STID_PRIO].offset, VAL);
+            LOG_OTHER_THREAD_REG_WRITE(thread,REG_TID,tid_reg,TH);
+        }
+    },
+    (A_IMPLICIT_WRITES_STID_PRIO_ANYTHREAD)
+)
+
+DEF_MACRO(DO_IASSIGNW,
+    {
+        int i;
+        int intbitpos = ((REG>>16)&0xF);
+        for (i = 0; i < RUNNABLE_THREADS_MAX; i++) {
+            if (((thread->processor_ptr->arch_proc_options->thread_enable_mask>>i) & 0x1)) {
+                fINSERT_BITS(thread->processor_ptr->thread[i]->Regs[REG_IMASK],1, intbitpos, (REG>>i) & 1);
+            }
+        }
+    },
+    (A_IMPLICIT_WRITES_IMASK_ANYTHREAD)
+)
+
+DEF_MACRO(fDO_NMI,
+    {
+        int i;
+        for (i = 0; i < RUNNABLE_THREADS_MAX; i++) {
+            if (((thread->processor_ptr->arch_proc_options->thread_enable_mask>>i) & 0x1)) {
+                if (SREG & (1<<i)) {
+                    register_nmi_interrupt(thread->processor_ptr->thread[i]);
+                }
+            }
+        }
+    },
+)
+
+DEF_MACRO(fDO_TRACE,
+    {
+        fHIDE(HEX_CALLBACK(thread->processor_ptr->options->trace_callback,
+                           thread->system_ptr,thread->processor_ptr,
+                           thread->threadId,SREG);)
+    },
+)
+
+DEF_MACRO(DO_IASSIGNR,
+    {
+        int i;
+        int result=0;
+        int intbitpos = ((SREG>>16)&0xF);
+        for (i = 0; i < RUNNABLE_THREADS_MAX; i++) {
+            if (((thread->processor_ptr->arch_proc_options->thread_enable_mask>>i) & 0x1)) {
+                result |= (((thread->processor_ptr->thread[i]->Regs[REG_IMASK]>>intbitpos)&1)<<i);
+            }
+        }
+        DREG = result;
+    },
+    ()
+)
+
+DEF_MACRO(DO_SWI,
+    {
+        fHIDE(HEX_CALLBACK(thread->processor_ptr->options->swi_callback,
+                           thread->system_ptr,thread->processor_ptr,
+                           thread->threadId,REG));
+        LOG_GLOBAL_REG_WRITE(REG_IPEND,(GLOBAL_REG_READ(REG_IPEND) | (REG & GLOBAL_REG_READ(REG_IEL))));
+    },
+    (A_EXCEPTION_SWI)
+)
+
+DEF_MACRO(DO_CSWI,
+    LOG_GLOBAL_REG_WRITE(REG_IPEND,GLOBAL_REG_READ(REG_IPEND) & ~((REG) & GLOBAL_REG_READ(REG_IEL)));,
+    ()
+)
+
+DEF_MACRO(DO_CIAD,
+    sys_ciad(thread,VAL); LOG_GLOBAL_REG_WRITE(REG_IAD,GLOBAL_REG_READ(REG_IAD) & ~(VAL));,
+    (A_EXCEPTION_SWI)
+)
+
+DEF_MACRO(DO_SIAD,
+    sys_siad(thread,VAL); LOG_GLOBAL_REG_WRITE(REG_IAD,GLOBAL_REG_READ(REG_IAD) | (VAL));,
+    (A_EXCEPTION_SWI)
+)
+
+DEF_MACRO(fBREAK,
+    {isdb_brkpt_insn(thread->processor_ptr,thread->threadId);},
+    ()
+)
+
 DEF_MACRO(fPAUSE,
 {sys_pause(thread, insn->slot, IMM);},
 ()
 )
 
+
 DEF_MACRO(fTRAP,
 warn("Trap NPC=%x ",fREAD_NPC());
 warn("Trap exception, PCYCLE=%lld TYPE=%d NPC=%x IMM=0x%x",thread->processor_ptr->pstats[pcycles],TRAPTYPE,fREAD_NPC(),IMM);
 register_trap_exception(thread,fREAD_NPC(),TRAPTYPE,IMM);,
+ (A_EXCEPTION_SWI)
+)
+
+DEF_MACRO(fINTERNAL_CLEAR_SAMEPAGE,
+    /* force re-xlate at next fetch, refresh of in_user_mode, etc. */
+    /* Permissions change too... */
+    sys_utlb_invalidate(thread->processor_ptr,thread),
+    /* NOTHING */
+)
+
+DEF_MACRO(fCLEAR_RTE_EX,
+    {
+        fLOG_REG_FIELD(SSR,SSR_EX,0);
+        fINTERNAL_CLEAR_SAMEPAGE();
+    },
+    ()
+)
+
+DEF_MACRO(fTLB_LOCK_AVAILABLE,
+    (fREAD_GLOBAL_REG_FIELD(SYSCONF,SYSCFG_TLBLOCK) == 0),
+    ()
+)
+
+DEF_MACRO(fK0_LOCK_AVAILABLE,
+    (fREAD_GLOBAL_REG_FIELD(SYSCONF,SYSCFG_K0LOCK) == 0),
+    ()
+)
+
+DEF_MACRO(fSET_TLB_LOCK,
+    {
+        if (fTLB_LOCK_AVAILABLE()) {
+            fLOG_GLOBAL_REG_FIELD(SYSCONF,SYSCFG_TLBLOCK,1);
+        } else {
+            sys_waiting_for_tlb_lock(thread);
+        }
+    },
+    ()
+)
+
+DEF_MACRO(fSET_K0_LOCK,
+    {
+        if (fK0_LOCK_AVAILABLE() && sys_k0lock_queue_ready(thread)) {
+            warn("k0lock: T%d: PC=0x%x: PCycle=%lld",thread->threadId,thread->Regs[REG_PC],thread->processor_ptr->pstats[pcycles]);
+            fLOG_GLOBAL_REG_FIELD(SYSCONF,SYSCFG_K0LOCK,1);
+        } else {
+            warn("k0lock_waiting: T%d: PC=0x%x: PCycle=%lld",thread->threadId,thread->Regs[REG_PC],thread->processor_ptr->pstats[pcycles]);
+            sys_waiting_for_k0_lock(thread);
+        }
+    },
+    ()
+)
+
+DEF_MACRO(fCLEAR_TLB_LOCK,
+    {
+        int i;
+        fLOG_GLOBAL_REG_FIELD(SYSCONF,SYSCFG_TLBLOCK,0);
+        for (i = 0; i < RUNNABLE_THREADS_MAX; i++) {
+            if (((thread->processor_ptr->arch_proc_options->thread_enable_mask>>i) & 0x1)) {
+                thread->processor_ptr->thread[i]->cu_tlb_lock_waiting = 0;
+            }
+        }
+    },
+    ()
+)
+
+DEF_MACRO(fCLEAR_K0_LOCK,
+    do {
+        warn("k0unlock: T%d: PC=0x%x: Pcycle=%lld",thread->threadId,thread->Regs[REG_PC], thread->processor_ptr->pstats[pcycles]);
+        sys_initiate_clear_k0_lock(thread);
+    } while (0),
+    ()
+)
+
 DEF_MACRO(fALIGN_REG_FIELD_VALUE,
 ((VAL)<<reg_field_info[FIELD].offset),
 ()
 )
+
+DEF_MACRO(fWRITE_GLOBAL_REG_FIELD,
+    fINSERT_BITS(thread->processor_ptr->global_regs[REG_##REG],
+                 reg_field_info[FIELD].width,
+                 reg_field_info[FIELD].offset,VAL),
+)
+
+DEF_MACRO(fLOG_GLOBAL_REG_FIELD,
+    LOG_MASKED_GLOBAL_REG_WRITE(REG_##REG,
+                                fALIGN_REG_FIELD_VALUE(FIELD,VAL),
+                                fGET_REG_FIELD_MASK(FIELD)),
+    ()
+)
+
 DEF_MACRO(fREAD_REG_FIELD,
 fEXTRACTU_BITS(thread->Regs[REG_##REG],
 reg_field_info[FIELD].width,
@@ -1561,6 +1808,13 @@ DEF_MACRO(fREAD_REG_FIELD,
 /* ATTRIBS */
 )
 
+DEF_MACRO(fREAD_GLOBAL_REG_FIELD,
+    fEXTRACTU_BITS(thread->processor_ptr->global_regs[REG_##REG],
+                   reg_field_info[FIELD].width,
+                   reg_field_info[FIELD].offset),
+    /* ATTRIBS */
+)
+
 DEF_MACRO(fGET_FIELD,
 fEXTRACTU_BITS(VAL,
 reg_field_info[FIELD].width,
@@ -1576,6 +1830,185 @@ DEF_MACRO(fSET_FIELD,
 /* ATTRIBS */
 )
 
+DEF_MACRO(fSET_RUN_MODE_NOW,
+    {thread->processor_ptr->global_regs[REG_MODECTL] |= (1<<TNUM);
+     thread->last_commit_cycle = thread->processor_ptr->pcycle_counter;
+     sys_recalc_num_running_threads(thread->processor_ptr);},
+)
+
+DEF_MACRO(fIN_DEBUG_MODE,
+    (thread->debug_mode || (fREAD_GLOBAL_REG_FIELD(ISDBST,ISDBST_DEBUGMODE) & (1<<TNUM))),
+    ()
+)
+
+DEF_MACRO(fIN_DEBUG_MODE_NO_ISDB,
+    (thread->debug_mode),
+    ()
+)
+
+DEF_MACRO(fIN_DEBUG_MODE_WARN,
+    do {
+        if (fREAD_GLOBAL_REG_FIELD(ISDBST,ISDBST_DEBUGMODE) & (1<<TNUM)) {
+            warn("thread %d: ISDB debug mode set", TNUM);
+        }
+        sys_recalc_num_running_threads(thread->processor_ptr);
+    } while (0),
+    /* NOTHING */
+)
+
+DEF_MACRO(fGET_RUN_MODE,
+    ((thread->processor_ptr->global_regs[REG_MODECTL]>>TNUM)&0x1),
+)
+
+DEF_MACRO(fSET_WAIT_MODE,
+    {fLOG_GLOBAL_REG_FIELD(MODECTL,MODECTL_W,
+     fREAD_GLOBAL_REG_FIELD(MODECTL,MODECTL_W) | 1<<(TNUM))},
+    /* NOTHING */
+)
+
+DEF_MACRO(fCLEAR_WAIT_MODE,
+    {thread->processor_ptr->global_regs[REG_MODECTL] &= ~(1<<(TNUM+16));
+     thread->last_commit_cycle = thread->processor_ptr->pcycle_counter;
+     sys_recalc_num_running_threads(thread->processor_ptr);},
+)
+
+DEF_MACRO(fGET_WAIT_MODE,
+    ((thread->processor_ptr->global_regs[REG_MODECTL]>>(TNUM+16))&0x1),
+)
+
+DEF_MACRO(fRESET_THREAD,
+    register_reset_interrupt(T,NUM),
+)
+
+DEF_MACRO(fREAD_CURRENT_EVB,
+    (GLOBAL_REG_READ(REG_EVB)),
+    /* nothing */
+)
+
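+/*
+ * The TLB index helpers a few macros below cope with JTLB sizes that
+ * are not a power of two: fPOW2_ROUNDUP smears the high bit down and
+ * adds one, fTLB_IDXMASK masks with that rounded size minus one, and
+ * fTLB_NONPOW2WRAP folds out-of-range indices back into the table.
+ * Worked example for a (hypothetical) 192-entry JTLB:
+ *
+ *     fPOW2_ROUNDUP(192)    = fPOW2_HELP_ROUNDUP(191) + 1 = 256
+ *     fTLB_IDXMASK(200)     = 200 & (256 - 1)             = 200
+ *     fTLB_NONPOW2WRAP(200) = 200 - 192                   = 8
+ */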
+DEF_MACRO(fREAD_ELR, + READ_RREG(REG_ELR), + () +) + +DEF_MACRO(fPOW2_HELP_ROUNDUP, + ((VAL) | ((VAL) >> 1) | ((VAL) >> 2) | ((VAL) >> 4) | ((VAL) >> 8) | ((VAL) >> 16)), + () +) + +DEF_MACRO(fPOW2_ROUNDUP, + fPOW2_HELP_ROUNDUP((VAL)-1)+1, + () +) + +DEF_MACRO(fTLB_IDXMASK, + ((INDEX) & (fPOW2_ROUNDUP(fCAST4u(thread->processor_ptr->arch_proc_options->jtlb_size)) - 1)), + () +) + +DEF_MACRO(fTLB_NONPOW2WRAP, + (((INDEX) >= thread->processor_ptr->arch_proc_options->jtlb_size) ? ((INDEX) - thread->processor_ptr->arch_proc_options->jtlb_size) : (INDEX)), + /* ATTRIBS */ +) + +DEF_MACRO(fTLBW, + do {size4u_t __myidx = fTLB_NONPOW2WRAP(fTLB_IDXMASK(INDEX)); + TLB_REG_WRITE(__myidx,VALUE); + fHIDE(HEX_CALLBACK(thread->processor_ptr->options->tlbw_callback,thread->system_ptr,thread->processor_ptr,thread->threadId,__myidx);) + fHIDE(sys_tlb_write(thread,__myidx,VALUE);)} while (0), + /* ATTRIBS */ +) + +DEF_MACRO(fTLB_ENTRY_OVERLAP, + fHIDE( (sys_check_overlap(thread,VALUE)!=-2) ), + /* ATTRIBS */ +) + +DEF_MACRO(fTLB_ENTRY_OVERLAP_IDX, + fHIDE(sys_check_overlap(thread,VALUE)), + /* ATTRIBS */ +) + + +DEF_MACRO(fTLBR, + TLB_REG_READ(fTLB_NONPOW2WRAP(fTLB_IDXMASK(INDEX))), + /* ATTRIBS */ +) + +DEF_MACRO(fTLBP, + tlb_lookup(thread,((TLBHI)>>12),((TLBHI)<<12),1), + /* attribs */ +) + + + +DEF_MACRO(READ_SGP0, + READ_RREG(REG_SGP), + () +) + +DEF_MACRO(READ_SGP1, + READ_RREG(REG_SGP+1), + () +) + +DEF_MACRO(READ_SGP10, + READ_RREG_PAIR(REG_SGP), + () +) + +DEF_MACRO(READ_UGP, + READ_RREG(REG_UGP), +) + +DEF_MACRO(WRITE_SGP0, + WRITE_RREG(REG_SGP,VAL), + (A_IMPLICIT_WRITES_SGP0) +) + +DEF_MACRO(WRITE_SGP1, + WRITE_RREG(REG_SGP+1,VAL), + (A_IMPLICIT_WRITES_SGP1) +) + +DEF_MACRO(WRITE_SGP10, + WRITE_RREG_PAIR(REG_SGP,VAL), + (A_IMPLICIT_WRITES_SGP0,A_IMPLICIT_WRITES_SGP1) +) + +DEF_MACRO(WRITE_UGP, + WRITE_RREG(REG_UGP,VAL), +) + +DEF_MACRO(fSTART, + fLOG_GLOBAL_REG_FIELD(MODECTL,MODECTL_E, fREAD_GLOBAL_REG_FIELD(MODECTL,MODECTL_E) | (((REG & ((1<processor_ptr))), + () +) + +DEF_MACRO(fRESUME, + fLOG_GLOBAL_REG_FIELD(MODECTL,MODECTL_W, + fREAD_GLOBAL_REG_FIELD(MODECTL,MODECTL_W) & (~(REG))), + () +) + +DEF_MACRO(fGET_TNUM, + thread->threadId, + () +) + /********************************************/ /* Cache Management */ /********************************************/ @@ -1602,6 +2035,11 @@ DEF_MACRO(fISYNC, ) +DEF_MACRO(fICFETCH, + , + () +) + DEF_MACRO(fDCFETCH, sys_dcfetch(thread, (REG), insn->slot), (A_MEMLIKE) @@ -1615,6 +2053,34 @@ DEF_MACRO(fICINVA, (A_ICINVA) ) +DEF_MACRO(fDCTAGR, + ({DST=sys_dctagr(thread, INDEX, insn->slot,DSTREGNO);})/* FIXME */, + () +) + +DEF_MACRO(fDCTAGW, + (sys_dctagw(thread, INDEX, PART2, insn->slot)), + () +) +DEF_MACRO(fICTAGR, + ({DST=sys_ictagr(thread, INDEX, insn->slot,REGNO);}), + () +) + +DEF_MACRO(fICDATAR, + ({DST=sys_icdatar(thread, INDEX, insn->slot);}), + () +) + +DEF_MACRO(fICTAGW, + (sys_ictagw(thread, INDEX, PART2, insn->slot)), + () +) +DEF_MACRO(fICDATAW, + ({ fHIDE(); }), + () +) + DEF_MACRO(fL2FETCH, sys_l2fetch(thread, ADDR,HEIGHT,WIDTH,STRIDE,FLAGS, insn->slot), (A_MEMLIKE,A_L2FETCH) @@ -1635,6 +2101,12 @@ DEF_MACRO(fDCZEROA, (A_MEMLIKE) ) +DEF_MACRO(fDCINVA, + sys_dcinva(thread, (REG)), + (A_MEMLIKE) +) + + DEF_MACRO(fCHECKFORPRIV, {sys_check_privs(thread); if (EXCEPTION_DETECTED) return; }, () @@ -1645,6 +2117,16 @@ DEF_MACRO(fCHECKFORGUEST, () ) +DEF_MACRO(fTAKEN_INTERRUPT_EDGECLEAR, + { proc->global_regs[REG_IPEND] &= ~(INT_NUMTOMASK(intnum) & proc->global_regs[REG_IEL]); }, + () +) + +DEF_MACRO(fSET_IAD, + { 
sys_siad(thread,INT_NUMTOMASK(intnum)); thread->processor_ptr->global_regs[REG_IAD] |= INT_NUMTOMASK(intnum); }, + () +) + DEF_MACRO(fBRANCH_SPECULATE_STALL, { sys_speculate_branch_stall(thread, insn->slot, JUMP_COND(JUMP_PRED_SET), @@ -1664,3 +2146,79 @@ DEF_MACRO(IV1DEAD, , () ) + +DEF_MACRO(fIN_MONITOR_MODE, + sys_in_monitor_mode(thread), + () +) + +DEF_MACRO(fIN_USER_MODE, + sys_in_user_mode(thread), + () +) + +DEF_MACRO(fIN_GUEST_MODE, + sys_in_guest_mode(thread), + () +) + +DEF_MACRO(fGRE_ENABLED, + fREAD_REG_FIELD(CCR,CCR_GRE), + () +) + +DEF_MACRO(fGTE_ENABLED, + fREAD_REG_FIELD(CCR,CCR_GRE), + () +) + +DEF_MACRO(fTRAP1_VIRTINSN, + ((fIN_GUEST_MODE()) + && (fGRE_ENABLED()) + && ( ((IMM) == 1) + || ((IMM) == 3) + || ((IMM) == 4) + || ((IMM) == 6))), + () +) + +DEF_MACRO(fVIRTINSN_RTE, + do { + thread->trap1_info = TRAP1_VIRTINSN_RTE; + fLOG_REG_FIELD(SSR,SSR_SS,fREAD_REG_FIELD(GSR,GSR_SS)); + fLOG_REG_FIELD(CCR,CCR_GIE,fREAD_REG_FIELD(GSR,GSR_IE)); + fLOG_REG_FIELD(SSR,SSR_GM,!fREAD_REG_FIELD(GSR,GSR_UM)); + fBRANCH((fREAD_GELR() & -4),COF_TYPE_RTE); + fINTERNAL_CLEAR_SAMEPAGE(); + } while (0), + (A_IMPLICIT_WRITES_CCR,A_IMPLICIT_WRITES_SSR) +) + +DEF_MACRO(fVIRTINSN_SETIE, + do { + fLOG_REG_FIELD(CCR,CCR_GIE,(REG) & 1); + REG = fREAD_REG_FIELD(CCR,CCR_GIE); + thread->trap1_info = TRAP1_VIRTINSN_SETIE; + } while (0), + (A_IMPLICIT_WRITES_CCR) +) + +DEF_MACRO(fVIRTINSN_GETIE, + { + thread->trap1_info = TRAP1_VIRTINSN_GETIE; + REG = fREAD_REG_FIELD(CCR,CCR_GIE); + }, + () +) + +DEF_MACRO(fVIRTINSN_SPSWAP, + do { + if (fREAD_REG_FIELD(GSR,GSR_UM)) { + size4u_t TEMP = REG; + REG = fREAD_GOSP(); + fWRITE_GOSP(TEMP); + thread->trap1_info = TRAP1_VIRTINSN_SPSWAP; + } + } while (0), + (A_IMPLICIT_WRITES_GOSP) +) diff --git a/target/hexagon/imported/mmvec/encode_ext.def b/target/hexagon/imported/mmvec/encode_ext.def index 402438f566c1..9df920476441 100644 --- a/target/hexagon/imported/mmvec/encode_ext.def +++ b/target/hexagon/imported/mmvec/encode_ext.def @@ -647,7 +647,7 @@ DEF_ENC(V6_vsubububb_sat, ICLASS_CJ" 1 110 101 vvvvv PP 0 uuuuu 101 ddddd") DEF_ENC(V6_vmpyewuh_64, ICLASS_CJ" 1 110 101 vvvvv PP 0 uuuuu 110 ddddd") DEF_FIELDROW_DESC32( ICLASS_CJ" 1 110 --0 ----- PP 1 ----- ----- ---","Vx32=Vu32") -DEF_ENC(V6_vunpackob, ICLASS_CJ" 1 110 --0 ---00 PP 1 uuuuu 000 xxxxx") // +DEF_ENC(V6_vunpackob, ICLASS_CJ" 1 110 --0 --000 PP 1 uuuuu 000 xxxxx") // DEF_ENC(V6_vunpackoh, ICLASS_CJ" 1 110 --0 ---00 PP 1 uuuuu 001 xxxxx") // //DEF_ENC(V6_vunpackow, ICLASS_CJ" 1 110 --0 ---00 PP 1 uuuuu 010 xxxxx") // @@ -804,5 +804,31 @@ DEF_ENC(V6_vmpyewuh, ICLASS_CJ" 1 111 111 vvvvv PP 0 uuuuu 101 ddddd") DEF_ENC(V6_vmpyowh, ICLASS_CJ" 1 111 111 vvvvv PP 0 uuuuu 111 ddddd") DEF_ENC(V6_vmpyuhvs,"00011111110vvvvvPP1uuuuu111ddddd") +DEF_ENC(V6_vadd_hf,"00011111011vvvvvPP1uuuuu011ddddd") +DEF_ENC(V6_vadd_sf,"00011111101vvvvvPP1uuuuu001ddddd") +DEF_ENC(V6_vadd_qf16,"00011111011vvvvvPP1uuuuu010ddddd") +DEF_ENC(V6_vadd_qf16_mix,"00011111011vvvvvPP1uuuuu100ddddd") +DEF_ENC(V6_vadd_qf32,"00011111101vvvvvPP1uuuuu000ddddd") +DEF_ENC(V6_vadd_qf32_mix,"00011111101vvvvvPP1uuuuu010ddddd") + +DEF_ENC(V6_vconv_hf_qf16,"00011110--0--100PP1uuuuu011ddddd") +DEF_ENC(V6_vconv_hf_qf32,"00011110--0--100PP1uuuuu110ddddd") +DEF_ENC(V6_vconv_sf_qf32,"00011110--0--100PP1uuuuu000ddddd") + +DEF_ENC(V6_vmpy_qf16,"00011111111vvvvvPP1uuuuu011ddddd") +DEF_ENC(V6_vmpy_qf16_hf,"00011111111vvvvvPP1uuuuu100ddddd") +DEF_ENC(V6_vmpy_qf16_mix_hf,"00011111111vvvvvPP1uuuuu101ddddd") +DEF_ENC(V6_vmpy_qf32,"00011111111vvvvvPP1uuuuu000ddddd") 
+DEF_ENC(V6_vmpy_qf32_hf,"00011111111vvvvvPP1uuuuu111ddddd")
+DEF_ENC(V6_vmpy_qf32_mix_hf,"00011111100vvvvvPP1uuuuu000ddddd")
+DEF_ENC(V6_vmpy_qf32_qf16,"00011111111vvvvvPP1uuuuu110ddddd")
+DEF_ENC(V6_vmpy_qf32_sf,"00011111111vvvvvPP1uuuuu001ddddd")
+
+DEF_ENC(V6_vsub_hf,"00011111011vvvvvPP1uuuuu110ddddd")
+DEF_ENC(V6_vsub_sf,"00011111101vvvvvPP1uuuuu100ddddd")
+DEF_ENC(V6_vsub_qf32,"00011111101vvvvvPP1uuuuu011ddddd")
+DEF_ENC(V6_vsub_qf32_mix,"00011111101vvvvvPP1uuuuu101ddddd")
+DEF_ENC(V6_vsub_qf16,"00011111011vvvvvPP1uuuuu101ddddd")
+DEF_ENC(V6_vsub_qf16_mix,"00011111011vvvvvPP1uuuuu111ddddd")
 #endif /* NO MMVEC */
diff --git a/target/hexagon/imported/mmvec/ext.idef b/target/hexagon/imported/mmvec/ext.idef
index 03d31f6181d7..1b7c5afb42f7 100644
--- a/target/hexagon/imported/mmvec/ext.idef
+++ b/target/hexagon/imported/mmvec/ext.idef
@@ -1400,6 +1400,376 @@ ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus_acc, "Vxx32+=vmpyhus(Vu32,Vv32)","
 VxxV.v[1].w[i] += fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i])))
+
+/* VMPY_QF32 */
+/* multiply qf32 input, produce qf32 output*/
+ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpy_qf32,"Vd32.qf32=vmpy(Vu32.qf32,Vv32.qf32)","Vector multiply: qf32 output from qf32 input",
+    fHIDE(unfloat )u = fPARSEQF32(VuV.qf32[i]);
+    fHIDE(unfloat )v = fPARSEQF32(VvV.qf32[i]);
+    VdV.qf32[i] = fRNDSATQF32(u.exp+v.exp, u.sig*v.sig, 0))
+
+/* VMPY_QF32_SF */
+/* multiply ieee sf input, produce qf32 output*/
+ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpy_qf32_sf,"Vd32.qf32=vmpy(Vu32.sf,Vv32.sf)","Vector multiply: qf32 output from IEEE sf input",
+    fHIDE(unfloat )u = fPARSESF(VuV.sf[i]);
+    fHIDE(unfloat )v = fPARSESF(VvV.sf[i]);
+    VdV.qf32[i] = fRNDSATQF32(u.exp+v.exp, u.sig*v.sig, 0);
+    if(u.sign^v.sign) VdV.qf32[i] = fNEGQF32(VdV.qf32[i]))
+
+
+/* VMPY_QF16 */
+ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(16,vmpy_qf16,"Vd32.qf16=vmpy(Vu32.qf16,Vv32.qf16)","Vector multiply: qf16 output from qf16 input",
+    fHIDE(unfloat )u = fPARSEQF16(VuV.qf16[i]);
+    fHIDE(unfloat )v = fPARSEQF16(VvV.qf16[i]);
+    VdV.qf16[i] = fRNDSATQF16(u.exp+v.exp, u.sig*v.sig, 0))
+
+/* VMPY_QF16_HF */
+ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(16,vmpy_qf16_hf,"Vd32.qf16=vmpy(Vu32.hf,Vv32.hf)","Vector multiply: qf16 output from ieee hf input",
+    fHIDE(unfloat )u = fPARSEHF(VuV.hf[i]);
+    fHIDE(unfloat )v = fPARSEHF(VvV.hf[i]);
+    VdV.qf16[i] = fRNDSATQF16(u.exp+v.exp, u.sig*v.sig, 0);
+    if(u.sign^v.sign) VdV.qf16[i] = fNEGQF16(VdV.qf16[i]))
+
+/* VMPY_QF16_with_QF16_HF */
+/* get the magnitude of qf16 before multiply */
+ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(16,vmpy_qf16_mix_hf,"Vd32.qf16=vmpy(Vu32.qf16,Vv32.hf)","Vector multiply: qf16 output from mixed input of qf16 and ieee hf",
+    fHIDE(unfloat )u = fPARSEQF16(VuV.qf16[i]);
+    fHIDE(unfloat )v = fPARSEHF(VvV.hf[i]);
+    VdV.qf16[i] = fRNDSATQF16(u.exp+v.exp, u.sig*v.sig, 0);
+    if(v.sign) VdV.qf16[i] = fNEGQF16(VdV.qf16[i]))
+
+/* VMPY_QF32_QF16 */
+ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpy_qf32_qf16,"Vdd32.qf32=vmpy(Vu32.qf16,Vv32.qf16)","Vector multiply: double qf32 output from qf16 input",
+    fHIDE(unfloat )u0 = fPARSEQF16(VuV.w[i] & 0xFFFF);
+    fHIDE(unfloat )u1 = fPARSEQF16((VuV.w[i]>>16) & 0xFFFF);
+    fHIDE(unfloat )v0 = fPARSEQF16(VvV.w[i] & 0xFFFF);
+    fHIDE(unfloat )v1 = fPARSEQF16((VvV.w[i]>>16) & 0xFFFF);
+    VddV.v[0].qf32[i] = fRNDSATQF32(u0.exp+v0.exp, u0.sig*v0.sig, 0);
+    VddV.v[1].qf32[i] = fRNDSATQF32(u1.exp+v1.exp, u1.sig*v1.sig, 0))
+
+/* VMPY_QF32_HF */
+ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpy_qf32_hf,"Vdd32.qf32=vmpy(Vu32.hf,Vv32.hf)","Vector multiply: double qf32
output from ieee hf input", + fHIDE(unfloat )u0 = fPARSEHF(VuV.w[i] & 0xFFFF); + fHIDE(unfloat )u1 = fPARSEHF((VuV.w[i]>>16) & 0xFFFF); + fHIDE(unfloat )v0 = fPARSEHF(VvV.w[i] & 0xFFFF); + fHIDE(unfloat )v1 = fPARSEHF((VvV.w[i]>>16) & 0xFFFF); + VddV.v[0].qf32[i] = fRNDSATQF32(u0.exp+v0.exp, u0.sig*v0.sig, 0); + VddV.v[1].qf32[i] = fRNDSATQF32(u1.exp+v1.exp, u1.sig*v1.sig, 0); + if(u0.sign^v0.sign) VddV.v[0].qf32[i] = fNEGQF32(VddV.v[0].qf32[i]); + if(u1.sign^v1.sign) VddV.v[1].qf32[i] = fNEGQF32(VddV.v[1].qf32[i])) + +/* VMPY_QF32_with_QF16_HF */ +ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpy_qf32_mix_hf,"Vdd32.qf32=vmpy(Vu32.qf16,Vv32.hf)","Vector multiply: double qf32 output from mixed input of qf16 and ieee hf", + fHIDE(unfloat )u0 = fPARSEQF16(VuV.w[i] & 0xFFFF); + fHIDE(unfloat )u1 = fPARSEQF16((VuV.w[i]>>16) & 0xFFFF); + fHIDE(unfloat )v0 = fPARSEHF(VvV.w[i] & 0xFFFF); + fHIDE(unfloat )v1 = fPARSEHF((VvV.w[i]>>16) & 0xFFFF); + VddV.v[0].qf32[i] = fRNDSATQF32(u0.exp+v0.exp, u0.sig*v0.sig, 0); + VddV.v[1].qf32[i] = fRNDSATQF32(u1.exp+v1.exp, u1.sig*v1.sig, 0); + if(v0.sign) VddV.v[0].qf32[i] = fNEGQF32(VddV.v[0].qf32[i]); + if(v1.sign) VddV.v[1].qf32[i] = fNEGQF32(VddV.v[1].qf32[i])) + +/* VADD_QF32 */ +ITERATOR_INSN_SHIFT_SLOT(32,vadd_qf32,"Vd32.qf32=vadd(Vu32.qf32,Vv32.qf32)","Vector addition of qf32 input", + fHIDE(unfloat )u = fPARSEQF32(VuV.qf32[i]); + fHIDE(unfloat )v = fPARSEQF32(VvV.qf32[i]); + fHIDE(size2s_t exp=0;) + if (u.exp>v.exp) { + exp = u.exp+((u.sig==0.0)? (-(FRAC_SF+1)):ilogb(u.sig)); + if (expv.exp) ? (sig_u-sig)+sig_v : (sig_v-sig)+sig_u; + VdV.qf32[i] = fRNDSATQF32(exp, sig, sig_low)) + +/* VADD_SF */ +ITERATOR_INSN_SHIFT_SLOT(32,vadd_sf,"Vd32.qf32=vadd(Vu32.sf,Vv32.sf)","Vector addition of sf input", + fHIDE(unfloat )u = fPARSESF(VuV.sf[i]); + fHIDE(unfloat )v = fPARSESF(VvV.sf[i]); + fHIDE(size2s_t exp=0;) + if (u.exp>v.exp) { + exp = u.exp+((u.sig==0.0)? (-(FRAC_SF+1)):ilogb(u.sig)); + if (expv.exp) ? (sig_u-sig)+sig_v : (sig_v-sig)+sig_u; + } + else if((u.sign==0) && (v.sign==1)) + { + sig = sig_u - sig_v; + sig_low = (u.exp>v.exp) ? (sig_u-sig)-sig_v : sig_u-(sig_v+sig); + } + else{ + sig = sig_v - sig_u; + sig_low = (v.exp>u.exp) ? (sig_v-sig)-sig_u : sig_v-(sig_u+sig); + } + VdV.qf32[i] = fRNDSATQF32(exp, sig, sig_low); + //printf("ARCHSIM: output:%x\\n", VdV.qf32[i]); + if(u.sign && v.sign) VdV.qf32[i] = fNEGQF32(VdV.qf32[i])) + +/* VADD_QF32_MIX */ +ITERATOR_INSN_SHIFT_SLOT(32,vadd_qf32_mix,"Vd32.qf32=vadd(Vu32.qf32,Vv32.sf)","Vector addition of mixed qf32 and sf", + fHIDE(unfloat )u = fPARSEQF32(VuV.qf32[i]); + fHIDE(unfloat )v = fPARSESF(VvV.sf[i]); + if(v.sign) v.sig = (-1.0)*v.sig; + fHIDE(size2s_t exp=0;) + if (u.exp>v.exp) { + exp = u.exp+((u.sig==0.0)? (-(FRAC_SF+1)):ilogb(u.sig)); + if (expv.exp) ? (sig_u-sig)+sig_v : (sig_v-sig)+sig_u; + VdV.qf32[i] = fRNDSATQF32(exp, sig, sig_low)) + +/* VSUB_QF32 */ +ITERATOR_INSN_SHIFT_SLOT(32,vsub_qf32,"Vd32.qf32=vsub(Vu32.qf32,Vv32.qf32)","Vector subtraction of qf32 input", + fHIDE(unfloat )u = fPARSEQF32(VuV.qf32[i]); + fHIDE(unfloat )v = fPARSEQF32(VvV.qf32[i]); + fHIDE(size2s_t exp=0;) + if (u.exp>v.exp) { + exp = u.exp+((u.sig==0.0)? (-(FRAC_SF+1)):ilogb(u.sig)); + if (expv.exp) ? 
(sig_u-sig)-sig_v : (sig_u-(sig_v+sig)); + VdV.qf32[i] = fRNDSATQF32(exp, sig, sig_low)) + +/* VSUB_SF */ +ITERATOR_INSN_SHIFT_SLOT(32,vsub_sf,"Vd32.qf32=vsub(Vu32.sf,Vv32.sf)","Vector subtraction of ieee sf input", + fHIDE(unfloat )u = fPARSESF(VuV.sf[i]); + fHIDE(unfloat )v = fPARSESF(VvV.sf[i]); + fHIDE(size2s_t exp=0;) + if (u.exp>v.exp) { + exp = u.exp+((u.sig==0.0)? (-(FRAC_SF+1)):ilogb(u.sig)); + if (expv.exp) ? (sig_u-sig)-sig_v : (sig_u-(sig_v+sig)); + } + else if(u.sign ^ v.sign){ + sig = sig_u + sig_v; + sig_low = (u.exp>v.exp) ? (sig_u-sig)+sig_v : (sig_v-sig)+sig_u; + } + else{ + sig = sig_v - sig_u; + sig_low = (v.exp>u.exp) ? (sig_v-sig)-sig_u : sig_v-(sig_u+sig); + } + VdV.qf32[i] = fRNDSATQF32(exp, sig, sig_low); + if((u.sign==1) && (v.sign==0)) VdV.qf32[i] = fNEGQF32(VdV.qf32[i])) + +/* VSUB_QF32_MIX */ +ITERATOR_INSN_SHIFT_SLOT(32,vsub_qf32_mix,"Vd32.qf32=vsub(Vu32.qf32,Vv32.sf)","Vector subtraction of mixed qf32 input and sf", + fHIDE(unfloat )u = fPARSEQF32(VuV.qf32[i]); + fHIDE(unfloat )v = fPARSESF(VvV.sf[i]); + if(v.sign) v.sig = (-1.0)*v.sig; + fHIDE(size2s_t exp=0;) + if (u.exp>v.exp) { + exp = u.exp+((u.sig==0.0)? (-(FRAC_SF+1)):ilogb(u.sig)); + if (expv.exp) ? (sig_u-sig)-sig_v : (sig_u-(sig_v+sig)); + VdV.qf32[i] = fRNDSATQF32(exp, sig, sig_low)) + +/* VADD_QF16 */ +ITERATOR_INSN_SHIFT_SLOT(16,vadd_qf16,"Vd32.qf16=vadd(Vu32.qf16,Vv32.qf16)","Vector addition of qf16 input", + fHIDE(unfloat )u = fPARSEQF16(VuV.qf16[i]); + fHIDE(unfloat )v = fPARSEQF16(VvV.qf16[i]); + fHIDE(size2s_t exp=0;) + if (u.exp>v.exp) { + exp = u.exp+((u.sig==0.0)? (-(FRAC_HF+1)):ilogb(u.sig)); + if (expv.exp) ? (sig_u-sig)+sig_v : (sig_v-sig)+sig_u; + VdV.qf16[i] = fRNDSATQF16(exp, sig, sig_low)) + +/* VADD_HF */ +ITERATOR_INSN_SHIFT_SLOT(16,vadd_hf,"Vd32.qf16=vadd(Vu32.hf,Vv32.hf)","Vector addition of hf input", + fHIDE(unfloat )u = fPARSEHF(VuV.hf[i]); + fHIDE(unfloat )v = fPARSEHF(VvV.hf[i]); + fHIDE(size2s_t exp=0;) + if (u.exp>v.exp) { + exp = u.exp+((u.sig==0.0)? (-(FRAC_HF+1)):ilogb(u.sig)); + if (expv.exp) ? (sig_u-sig)+sig_v : (sig_v-sig)+sig_u; + } + else if((u.sign==0) && (v.sign==1)) + { + sig = sig_u - sig_v; + sig_low = (u.exp>v.exp) ? (sig_u-sig)-sig_v : sig_u-(sig_v+sig); + } + else{ + sig = sig_v - sig_u; + sig_low = (v.exp>u.exp) ? (sig_v-sig)-sig_u : sig_v-(sig_u+sig); + } + VdV.qf16[i] = fRNDSATQF16(exp, sig, sig_low); + if(u.sign && v.sign) + VdV.qf16[i] = fNEGQF16(VdV.qf16[i])) + +/* VADD_QF16_MIX */ +ITERATOR_INSN_SHIFT_SLOT(16,vadd_qf16_mix,"Vd32.qf16=vadd(Vu32.qf16,Vv32.hf)","Vector addition of mixed qf16 and hf", + fHIDE(unfloat )u = fPARSEQF16(VuV.qf16[i]); + fHIDE(unfloat )v = fPARSEHF(VvV.hf[i]); + if(v.sign) v.sig = (-1.0)*v.sig; + fHIDE(size2s_t exp=0;) + if (u.exp>v.exp) { + exp = u.exp+((u.sig==0.0)? (-(FRAC_HF+1)):ilogb(u.sig)); + if (expv.exp) ? (sig_u-sig)+sig_v : (sig_v-sig)+sig_u; + VdV.qf16[i] = fRNDSATQF16(exp, sig, sig_low)) + +/* VSUB_QF16 */ +ITERATOR_INSN_SHIFT_SLOT(16,vsub_qf16,"Vd32.qf16=vsub(Vu32.qf16,Vv32.qf16)","Vector subtraction of qf16 input", + fHIDE(unfloat )u = fPARSEQF16(VuV.qf16[i]); + fHIDE(unfloat )v = fPARSEQF16(VvV.qf16[i]); + fHIDE(size2s_t exp=0;) + if (u.exp>v.exp) { + exp = u.exp+((u.sig==0.0)? (-(FRAC_HF+1)):ilogb(u.sig)); + if (expv.exp) ? 
(sig_u-sig)-sig_v : (sig_u-(sig_v+sig)); + VdV.qf16[i] = fRNDSATQF16(exp, sig, sig_low)) + +/* VSUB_HF */ +ITERATOR_INSN_SHIFT_SLOT(16,vsub_hf,"Vd32.qf16=vsub(Vu32.hf,Vv32.hf)","Vector subtraction of hf input", + fHIDE(unfloat )u = fPARSEHF(VuV.hf[i]); + fHIDE(unfloat )v = fPARSEHF(VvV.hf[i]); + fHIDE(size2s_t exp=0;) + if (u.exp>v.exp) { + exp = u.exp+((u.sig==0.0)? (-(FRAC_HF+1)):ilogb(u.sig)); + if (expv.exp) ? (sig_u-sig)-sig_v : (sig_u-(sig_v+sig)); + } + else if(u.sign ^ v.sign){ + sig = sig_u + sig_v; + sig_low = (u.exp>v.exp) ? (sig_u-sig)+sig_v : (sig_v-sig)+sig_u; + } + else{ + sig = sig_v - sig_u; + sig_low = (v.exp>u.exp) ? (sig_v-sig)-sig_u : sig_v-(sig_u+sig); + } + VdV.qf16[i] = fRNDSATQF16(exp, sig, sig_low); + if((u.sign==1) && (v.sign==0)) + VdV.qf16[i] = fNEGQF16(VdV.qf16[i])) + + +/* VSUB_QF16_MIXED */ +ITERATOR_INSN_SHIFT_SLOT(16,vsub_qf16_mix,"Vd32.qf16=vsub(Vu32.qf16,Vv32.hf)","Vector subtraction of mixed qf16 and hf", + fHIDE(unfloat )u = fPARSEQF16(VuV.qf16[i]); + fHIDE(unfloat )v = fPARSEHF(VvV.hf[i]); + if(v.sign) v.sig = (-1.0)*v.sig; + fHIDE(size2s_t exp=0;) + if (u.exp>v.exp) { + exp = u.exp+((u.sig==0.0)? (-(FRAC_HF+1)):ilogb(u.sig)); + if (expv.exp) ? (sig_u-sig)-sig_v : (sig_u-(sig_v+sig)); + VdV.qf16[i] = fRNDSATQF16(exp, sig, sig_low)) + +// FP Convert QF32/W/UW to ieee SF +ITERATOR_INSN_SHIFT_SLOT(32,vconv_sf_qf32,"Vd32.sf=Vu32.qf32","Vector conversion of qf32 format to ieee SF", + fHIDE(unfloat )u = fPARSEQF32(VuV.qf32[i]); + VdV.sf[i] = fRNDSATSF(u.exp, u.sig)) + +// FP Convert QF16/H/UH to ieee HF +ITERATOR_INSN_SHIFT_SLOT(16,vconv_hf_qf16,"Vd32.hf=Vu32.qf16","Vector conversion of qf16 format to ieee HF", + fHIDE(unfloat )u = fPARSEQF16(VuV.qf16[i]); + VdV.hf[i] = fRNDSATHF(u.exp, u.sig)) + +// FP Convert double QF32 to two packed ieee HF in one vector +ITERATOR_INSN_SHIFT_SLOT(32,vconv_hf_qf32,"Vd32.hf=Vuu32.qf32","Vector conversion of double qf32 to ieee HF", + fHIDE(unfloat )u0 = fPARSEQF32(VuuV.v[0].qf32[i]); + fHIDE(unfloat )u1 = fPARSEQF32(VuuV.v[1].qf32[i]); + VdV.hf[2*i] = fRNDSATHF(u0.exp, u0.sig); + VdV.hf[2*i+1] = fRNDSATHF(u1.exp, u1.sig)) ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih,"Vd32=vmpyih(Vu32,Vv32)","Vd32.h=vmpyi(Vu32.h,Vv32.h)", diff --git a/target/hexagon/imported/mmvec/macros.def b/target/hexagon/imported/mmvec/macros.def index 7e5438a99802..e9524aa56d1e 100755 --- a/target/hexagon/imported/mmvec/macros.def +++ b/target/hexagon/imported/mmvec/macros.def @@ -15,46 +15,76 @@ * along with this program; if not, see . */ -DEF_MACRO(fDUMPQ, +DEF_MACRO(fDUMPQ,(STR,REG), + "dump REG", + "dump REG", do { printf(STR ":" #REG ": 0x%016llx\n",REG.ud[0]); } while (0), () ) -DEF_MACRO(fUSE_LOOKUP_ADDRESS_BY_REV, - PROC->arch_proc_options->mmvec_use_full_va_for_lookup, +DEF_MACRO(fUSE_LOOKUP_ADDRESS_BY_REV,(PROC), + "", + "Use full VA address for lookup and exception based on REV ", + PROC->arch_proc_options->HVX_USE_FULL_VA, () ) -DEF_MACRO(fUSE_LOOKUP_ADDRESS, +DEF_MACRO(fUSE_LOOKUP_ADDRESS,(), + "", + "Use full VA address for lookup and exception", 1, () ) -DEF_MACRO(fNOTQ, +DEF_MACRO(fRT8NOTE, (), + "", + "", + , + (A_NOTE_RT8) +) + +DEF_MACRO(fCVI_VX_NO_TMP_LD, (), + "", + "", + , + (A_CVI_VX_NO_TMP_LD) +) +DEF_MACRO(fNOTQ,(VAL), + "~VAL", + "~VAL", + /* Will break Visual Studio? 
*/ ({mmqreg_t _ret = {0}; int _i_; for (_i_ = 0; _i_ < fVECSIZE()/64; _i_++) _ret.ud[_i_] = ~VAL.ud[_i_]; _ret;}), () ) -DEF_MACRO(fGETQBITS, +DEF_MACRO(fGETQBITS,(REG,WIDTH,MASK,BITNO), + "REG[BITNO+WIDTH-1:BITNO]", + "Get MASK bits at BITNO from REG", ((MASK) & (REG.w[(BITNO)>>5] >> ((BITNO) & 0x1f))), () ) -DEF_MACRO(fGETQBIT, +DEF_MACRO(fGETQBIT,(REG,BITNO), + "REG[BITNO]", + "Get bit BITNO from REG", fGETQBITS(REG,1,1,BITNO), () ) -DEF_MACRO(fGENMASKW, +DEF_MACRO(fGENMASKW,(QREG,IDX), + "maskw(QREG,IDX)", + "Generate mask from QREG for word IDX", (((fGETQBIT(QREG,(IDX*4+0)) ? 0xFF : 0x0) << 0) |((fGETQBIT(QREG,(IDX*4+1)) ? 0xFF : 0x0) << 8) |((fGETQBIT(QREG,(IDX*4+2)) ? 0xFF : 0x0) << 16) |((fGETQBIT(QREG,(IDX*4+3)) ? 0xFF : 0x0) << 24)), () ) -DEF_MACRO(fGET10BIT, +DEF_MACRO(fGET10BIT,(COE,VAL,POS), + "COE=(((((fGETUBYTE(3,VAL) >> (2 * POS)) & 3) << 8) | fGETUBYTE(POS,VAL)) << 6) >> 6;", + "Get 10-bit coefficient from current word value and byte position", { COE = (((((fGETUBYTE(3,VAL) >> (2 * POS)) & 3) << 8) | fGETUBYTE(POS,VAL)) << 6); COE >>= 6; @@ -62,62 +92,160 @@ DEF_MACRO(fGET10BIT, () ) -DEF_MACRO(fVMAX, +DEF_MACRO(fVMAX,(X,Y), + "max(X,Y)", + "", (X>Y) ? X : Y, () ) -DEF_MACRO(fGETNIBBLE, +DEF_MACRO(fREAD_VEC, + (DST,IDX), + "DST=VREG[IDX]", /* short desc */ + "Read Vector IDX", /* long desc */ + (DST = READ_VREG(fMODCIRCU((IDX),5))), + () +) +DEF_MACRO(fREAD_ZVEC, + (DST,IDX), + "DST=ZREG[IDX]", /* short desc */ + "Read Vector IDX", /* long desc */ + (DST = READ_ZREG(fMODCIRCU((IDX),5))), + () +) + +DEF_MACRO(fREAD_ZVEC_WORD, + (DST,IDX), + "DST=ZReg.uw[IDX]", /* short desc */ + "Read Z Vector IDX", /* long desc */ + { + mmvector_t ZReg = READ_ZREG(0); + DST = ZReg.uw[IDX]; + + }, + () +) +DEF_MACRO(fREAD_ZVEC_ALL, + (DST,N,NZ), + "", /* short desc */ + "Read Z Vector IDX", /* long desc */ + { + int __idx = 0; + for (__idx = 0; __idx < NZ/N; __idx++) { + memcpy(&DST[N*__idx], &THREAD2STRUCT->ZRegs[__idx], N); + } + }, + () +) +DEF_MACRO(fZREGB, + (Z,IDX), + "ZREG.b[IDX]", /* short desc */ + "Read Z IDX", /* long desc */ + ((size1s_t)Z[IDX]), + () +) +DEF_MACRO(fZREGUB, + (Z,IDX), + "ZREG.ub[IDX]", /* short desc */ + "Read Z IDX", /* long desc */ + ((size1u_t)Z[IDX]), + () +) +DEF_MACRO(fZREGH, + (Z,IDX), + "ZREG.h[IDX]", /* short desc */ + "Read Z IDX", /* long desc */ + ((size2s_t)Z[IDX]), + () +) +DEF_MACRO(fZREGUB, + (Z,IDX), + "ZREG.ub[IDX]", /* short desc */ + "Read Z IDX", /* long desc */ + ((size1u_t)Z[IDX]), + () +) + +DEF_MACRO(fGETNIBBLE,(IDX,SRC), + "SRC.s4[IDX]", + "Get nibble", ( fSXTN(4,8,(SRC >> (4*IDX)) & 0xF) ), () ) -DEF_MACRO(fGETCRUMB, +DEF_MACRO(fGETCRUMB,(IDX,SRC), + "SRC.s2[IDX]", + "Get 2bits", ( fSXTN(2,8,(SRC >> (2*IDX)) & 0x3) ), () ) -DEF_MACRO(fGETCRUMB_SYMMETRIC, +DEF_MACRO(fGETCRUMB_SYMMETRIC,(IDX,SRC), + "SRC.s2[IDX] >= 0 ? (2-SRC.s2[IDX]) : SRC.s2[IDX]", + "Get 2bits", ( (fGETCRUMB(IDX,SRC)>=0 ? (2-fGETCRUMB(IDX,SRC)) : fGETCRUMB(IDX,SRC) ) ), () ) +//#define ZERO_OFFSET_2B +(fGETCRUMB(z,VuV.uw[i])>=0) #define ZERO_OFFSET_2B + -DEF_MACRO(fGENMASKH, +DEF_MACRO(fWRITE_VEC, + (IDX,VAR), + "VREG[IDX]=VAR", /* short desc */ + "Write Vector IDX", /* long desc */ + (WRITE_VREG(fMODCIRCU((IDX),5),VAR)), + () +) + +DEF_MACRO(fGENMASKH,(QREG,IDX), + "maskh(QREG,IDX)", + "generate mask from QREG for halfword IDX", (((fGETQBIT(QREG,(IDX*2+0)) ? 0xFF : 0x0) << 0) |((fGETQBIT(QREG,(IDX*2+1)) ? 
0xFF : 0x0) << 8)), () ) -DEF_MACRO(fGETMASKW, +DEF_MACRO(fGETMASKW,(VREG,QREG,IDX), + "VREG.w[IDX] & fGENMASKW(QREG,IDX)", + "Mask word IDX from VREG using QREG", (VREG.w[IDX] & fGENMASKW((QREG),IDX)), () ) -DEF_MACRO(fGETMASKH, +DEF_MACRO(fGETMASKH,(VREG,QREG,IDX), + "VREG.h[IDX] & fGENMASKH(QREG,IDX)", + "Mask word IDX from VREG using QREG", (VREG.h[IDX] & fGENMASKH((QREG),IDX)), () ) -DEF_MACRO(fCONDMASK8, +DEF_MACRO(fCONDMASK8,(QREG,IDX,YESVAL,NOVAL), + "QREG.IDX ? YESVAL : NOVAL", + "QREG.IDX ? YESVAL : NOVAL", (fGETQBIT(QREG,IDX) ? (YESVAL) : (NOVAL)), () ) -DEF_MACRO(fCONDMASK16, +DEF_MACRO(fCONDMASK16,(QREG,IDX,YESVAL,NOVAL), + "select_bytes(QREG,IDX,YESVAL,NOVAL)", + "select_bytes(QREG,IDX,YESVAL,NOVAL)", ((fGENMASKH(QREG,IDX) & (YESVAL)) | (fGENMASKH(fNOTQ(QREG),IDX) & (NOVAL))), () ) -DEF_MACRO(fCONDMASK32, +DEF_MACRO(fCONDMASK32,(QREG,IDX,YESVAL,NOVAL), + "select_bytes(QREG,IDX,YESVAL,NOVAL)", + "select_bytes(QREG,IDX,YESVAL,NOVAL)", ((fGENMASKW(QREG,IDX) & (YESVAL)) | (fGENMASKW(fNOTQ(QREG),IDX) & (NOVAL))), () ) -DEF_MACRO(fSETQBITS, +DEF_MACRO(fSETQBITS,(REG,WIDTH,MASK,BITNO,VAL), + "REG[BITNO+WIDTH-1:BITNO] = VAL", + "Put bits into REG", do { size4u_t __TMP = (VAL); REG.w[(BITNO)>>5] &= ~((MASK) << ((BITNO) & 0x1f)); @@ -126,58 +254,101 @@ DEF_MACRO(fSETQBITS, () ) -DEF_MACRO(fSETQBIT, +DEF_MACRO(fSETQBIT,(REG,BITNO,VAL), + "REG[BITNO]=VAL", + "Put bit into REG", fSETQBITS(REG,1,1,BITNO,VAL), () ) -DEF_MACRO(fVBYTES, +DEF_MACRO(fVBYTES,(), + "VWIDTH", + "Number of bytes in a vector", (fVECSIZE()), () ) -DEF_MACRO(fVHALVES, +DEF_MACRO(fVHALVES,(), + "VWIDTH/2", + "Number of halves in a vector", (fVECSIZE()/2), () ) -DEF_MACRO(fVWORDS, +DEF_MACRO(fVWORDS,(), + "VWIDTH/2", + "Number of words in a vector", (fVECSIZE()/4), () ) -DEF_MACRO(fVDWORDS, +DEF_MACRO(fVDWORDS,(), + "VWIDTH/8", + "Number of double words in a vector", (fVECSIZE()/8), () ) -DEF_MACRO(fVALIGN, +DEF_MACRO(fVALIGN, (ADDR, LOG2_ALIGNMENT), + "ADDR = ADDR & ~(LOG2_ALIGNMENT-1)", + "Align to Element Size", ( ADDR = ADDR & ~(LOG2_ALIGNMENT-1)), () ) -DEF_MACRO(fVLASTBYTE, +DEF_MACRO(fVLASTBYTE, (ADDR, LOG2_ALIGNMENT), + "ADDR = ADDR | (LOG2_ALIGNMENT-1)", + "Set LSB of length to last byte", ( ADDR = ADDR | (LOG2_ALIGNMENT-1)), () ) -DEF_MACRO(fVELEM, +DEF_MACRO(fVELEM, (WIDTH), + "VBITS/WIDTH", + "Number of WIDTH-bit elements in a vector", ((fVECSIZE()*8)/WIDTH), () ) -DEF_MACRO(fVECLOGSIZE, +DEF_MACRO(fVECLOGSIZE,(), + "log2(VECTOR_SIZE)", + "Log base 2 of the number of bytes in a vector", (mmvec_current_veclogsize(thread)), () ) -DEF_MACRO(fVECSIZE, +DEF_MACRO(fVBUF_IDX,(EA), + "(EA >> log2(VECTOR_SIZE)) & 0xFF", + "(EA >> log2(VECTOR_SIZE)) & 0xFF", + (((EA) >> fVECLOGSIZE()) & 0xFF), + (A_FAKEINSN) +) + +DEF_MACRO(fREAD_VBUF,(IDX,WIDX), + "vbuf[IDX].w[WIDX]", + "vbuf[IDX].w[WIDX]", + READ_VBUF(IDX,WIDX), + (A_FAKEINSN) +) + +DEF_MACRO(fLOG_VBUF,(IDX,VAL,WIDX), + "vbuf[IDX].w[WIDX] = VAL", + "vbuf[IDX].w[WIDX] = VAL", + LOG_VBUF(IDX,VAL,WIDX), + (A_FAKEINSN) +) + +DEF_MACRO(fVECSIZE,(), + "VBYTES", + "Number of bytes in a vector currently", (1<VRegs_updated & (((VRegMask)1)<future_VRegs[VNUM] : mmvec_zero_vector()), (A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY) ) DEF_MACRO( fV_AL_CHECK, + (EA,MASK), + "", + "", if ((EA) & (MASK)) { warn("aligning misaligned vector. 
PC=%08x EA=%08x",thread->Regs[REG_PC],(EA)); }, () ) -DEF_MACRO(fSCATTER_INIT, +DEF_MACRO(fSCATTER_INIT, ( REGION_START, LENGTH, ELEMENT_SIZE), + "", + "", { mem_vector_scatter_init(thread, insn, REGION_START, LENGTH, ELEMENT_SIZE); if (EXCEPTION_DETECTED) return; }, - (A_STORE,A_MEMLIKE,A_RESTRICT_SLOT0ONLY) + (A_STORE,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST,A_RESTRICT_SLOT0ONLY) ) -DEF_MACRO(fGATHER_INIT, +DEF_MACRO(fGATHER_INIT, ( REGION_START, LENGTH, ELEMENT_SIZE), + "", + "", { mem_vector_gather_init(thread, insn, REGION_START, LENGTH, ELEMENT_SIZE); if (EXCEPTION_DETECTED) return; }, - (A_LOAD,A_MEMLIKE,A_RESTRICT_SLOT1ONLY) + (A_LOAD,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST) ) -DEF_MACRO(fSCATTER_FINISH, +DEF_MACRO(fSCATTER_FINISH, (OP), + "", + "", { if (EXCEPTION_DETECTED) return; mem_vector_scatter_finish(thread, insn, OP); @@ -229,7 +413,9 @@ DEF_MACRO(fSCATTER_FINISH, () ) -DEF_MACRO(fGATHER_FINISH, +DEF_MACRO(fGATHER_FINISH, (), + "", + "", { if (EXCEPTION_DETECTED) return; mem_vector_gather_finish(thread, insn); @@ -238,7 +424,9 @@ DEF_MACRO(fGATHER_FINISH, ) -DEF_MACRO(CHECK_VTCM_PAGE, +DEF_MACRO(CHECK_VTCM_PAGE, (FLAG, BASE, LENGTH, OFFSET, ALIGNMENT), + "FLAG=((BASE+OFFSET) < (BASE+LENGTH))", + "FLAG=((BASE+OFFSET) < (BASE+LENGTH))", { int slot = insn->slot; paddr_t pa = thread->mem_access[slot].paddr+OFFSET; @@ -247,7 +435,9 @@ DEF_MACRO(CHECK_VTCM_PAGE, }, () ) -DEF_MACRO(COUNT_OUT_OF_BOUNDS, +DEF_MACRO(COUNT_OUT_OF_BOUNDS, (FLAG, SIZE), + " ", + "", { if (!FLAG) { @@ -258,7 +448,9 @@ DEF_MACRO(COUNT_OUT_OF_BOUNDS, () ) -DEF_MACRO(fLOG_SCATTER_OP, +DEF_MACRO(fLOG_SCATTER_OP, (SIZE), + " ", + " ", { // Log the size and indicate that the extension ext.c file needs to increment right before memory write THREAD2STRUCT->vtcm_log.op = 1; @@ -269,7 +461,9 @@ DEF_MACRO(fLOG_SCATTER_OP, -DEF_MACRO(fVLOG_VTCM_WORD_INCREMENT, +DEF_MACRO(fVLOG_VTCM_WORD_INCREMENT, (EA,OFFSET,INC,IDX,ALIGNMENT,LEN), + "if (RtV <= EA <= RtV + LEN) *EA += INC.uw[IDX] ", + "if (RtV <= EA <= RtV + LEN) *EA += INC.uw[IDX] ", { int slot = insn->slot; int log_bank = 0; @@ -287,7 +481,9 @@ DEF_MACRO(fVLOG_VTCM_WORD_INCREMENT, () ) -DEF_MACRO(fVLOG_VTCM_HALFWORD_INCREMENT, +DEF_MACRO(fVLOG_VTCM_HALFWORD_INCREMENT, (EA,OFFSET,INC,IDX,ALIGNMENT,LEN), + "if (RtV <= EA <= RtV + LEN) *EA += INC.uh[IDX] ", + "if (RtV <= EA <= RtV + LEN) *EA += INC.uh[IDX] ", { int slot = insn->slot; int log_bank = 0; @@ -304,7 +500,9 @@ DEF_MACRO(fVLOG_VTCM_HALFWORD_INCREMENT, () ) -DEF_MACRO(fVLOG_VTCM_HALFWORD_INCREMENT_DV, +DEF_MACRO(fVLOG_VTCM_HALFWORD_INCREMENT_DV, (EA,OFFSET,INC,IDX,IDX2,IDX_H,ALIGNMENT,LEN), + "if (RtV <= EA <= RtV + LEN) *EA += INC.w[IDX2].uh[IDX_H] ", + "if (RtV <= EA <= RtV + LEN) *EA += INC.w[IDX2].uh[IDX_H] ", { int slot = insn->slot; int log_bank = 0; @@ -323,7 +521,9 @@ DEF_MACRO(fVLOG_VTCM_HALFWORD_INCREMENT_DV, -DEF_MACRO(GATHER_FUNCTION, +DEF_MACRO(GATHER_FUNCTION, (EA,OFFSET,IDX, LEN, ELEMENT_SIZE, BANK_IDX, QVAL), +"", +"", { int slot = insn->slot; int i0; @@ -336,6 +536,9 @@ DEF_MACRO(GATHER_FUNCTION, log_byte = ((OFFSET>=0)&&((pa+i0)<=pa_high)) && QVAL; log_bank |= (log_byte<system_ptr, thread->threadId, thread->mem_access[slot].paddr+OFFSET+i0); +#ifdef VERIFICATION + warn("Gather[%d] sim_mem_read1 pa:%llx val: %x", ELEMENT_SIZE*IDX+i0, thread->mem_access[slot].paddr+OFFSET+i0, B); +#endif THREAD2STRUCT->tmp_VRegs[0].ub[ELEMENT_SIZE*IDX+i0] = B; LOG_VTCM_BYTE(pa+i0,log_byte,B,ELEMENT_SIZE*IDX+i0); } @@ -346,38 +549,50 @@ DEF_MACRO(GATHER_FUNCTION, -DEF_MACRO(fVLOG_VTCM_GATHER_WORD, 
+DEF_MACRO(fVLOG_VTCM_GATHER_WORD, (EA,OFFSET,IDX, LEN), + "if (RtV <= EA <= RtV + LEN) TEMP.uw[IDX] = *EA ", + "if (RtV <= EA <= RtV + LEN) TEMP.uw[IDX] = *EA ", { GATHER_FUNCTION(EA,OFFSET,IDX, LEN, 4, IDX, 1); }, () ) -DEF_MACRO(fVLOG_VTCM_GATHER_HALFWORD, +DEF_MACRO(fVLOG_VTCM_GATHER_HALFWORD, (EA,OFFSET,IDX, LEN), + " if (RtV <= EA <= RtV + LEN) TEMP.uh[IDX] = *EA ", + " if (RtV <= EA <= RtV + LEN) TEMP.uh[IDX] = *EA ", { GATHER_FUNCTION(EA,OFFSET,IDX, LEN, 2, IDX, 1); }, () ) -DEF_MACRO(fVLOG_VTCM_GATHER_HALFWORD_DV, +DEF_MACRO(fVLOG_VTCM_GATHER_HALFWORD_DV, (EA,OFFSET,IDX,IDX2,IDX_H, LEN), + "if (RtV <= EA <= RtV + LEN) TEMP.uw[IDX2].uh[IDX_H] = *EA ", + "if (RtV <= EA <= RtV + LEN) TEMP.uw[IDX2].uh[IDX_H] = *EA ", { GATHER_FUNCTION(EA,OFFSET,IDX, LEN, 2, (2*IDX2+IDX_H), 1); }, () ) -DEF_MACRO(fVLOG_VTCM_GATHER_WORDQ, +DEF_MACRO(fVLOG_VTCM_GATHER_WORDQ, (EA,OFFSET,IDX, Q, LEN), + " if ( (RtV <= EA <= RtV + LEN) & Q) TEMP.uw[IDX] = *EA ", + " if ( (RtV <= EA <= RtV + LEN) & Q) TEMP.uw[IDX] = *EA ", { GATHER_FUNCTION(EA,OFFSET,IDX, LEN, 4, IDX, fGETQBIT(QsV,4*IDX+i0)); }, () ) -DEF_MACRO(fVLOG_VTCM_GATHER_HALFWORDQ, +DEF_MACRO(fVLOG_VTCM_GATHER_HALFWORDQ, (EA,OFFSET,IDX, Q, LEN), + " if ( (RtV <= EA <= RtV + LEN) & Q) TEMP.uh[IDX] = *EA ", + " if ( (RtV <= EA <= RtV + LEN) & Q) TEMP.uh[IDX] = *EA ", { GATHER_FUNCTION(EA,OFFSET,IDX, LEN, 2, IDX, fGETQBIT(QsV,2*IDX+i0)); }, () ) -DEF_MACRO(fVLOG_VTCM_GATHER_HALFWORDQ_DV, +DEF_MACRO(fVLOG_VTCM_GATHER_HALFWORDQ_DV, (EA,OFFSET,IDX,IDX2,IDX_H, Q, LEN), + " if ( (RtV <= EA <= RtV + LEN) & Q) TEMP.uw[IDX2].uh[IDX_H] = *EA ", + " if ( (RtV <= EA <= RtV + LEN) & Q) TEMP.uw[IDX2].uh[IDX_H] = *EA ", { GATHER_FUNCTION(EA,OFFSET,IDX, LEN, 2, (2*IDX2+IDX_H), fGETQBIT(QsV,2*IDX+i0)); }, @@ -385,7 +600,9 @@ DEF_MACRO(fVLOG_VTCM_GATHER_HALFWORDQ_DV, ) -DEF_MACRO(DEBUG_LOG_ADDR, +DEF_MACRO(DEBUG_LOG_ADDR, (OFFSET), + " ", + " ", { if (thread->processor_ptr->arch_proc_options->mmvec_network_addr_log2) @@ -393,6 +610,7 @@ DEF_MACRO(DEBUG_LOG_ADDR, int slot = insn->slot; paddr_t pa = thread->mem_access[slot].paddr+OFFSET; + // pa = pa & ~(ALIGNMENT-1); } }, () @@ -404,7 +622,9 @@ DEF_MACRO(DEBUG_LOG_ADDR, -DEF_MACRO(SCATTER_OP_WRITE_TO_MEM, +DEF_MACRO(SCATTER_OP_WRITE_TO_MEM, (TYPE), + " Read, accumulate, and write to VTCM", + " ", { for (int i = 0; i < mmvecx->vtcm_log.size; i+=sizeof(TYPE)) { @@ -430,7 +650,9 @@ DEF_MACRO(SCATTER_OP_WRITE_TO_MEM, () ) -DEF_MACRO(SCATTER_FUNCTION, +DEF_MACRO(SCATTER_FUNCTION, (EA,OFFSET,IDX, LEN, ELEMENT_SIZE, BANK_IDX, QVAL, IN), +"", +"", { int slot = insn->slot; int i0; @@ -449,26 +671,34 @@ DEF_MACRO(SCATTER_FUNCTION, () ) -DEF_MACRO(fVLOG_VTCM_HALFWORD, +DEF_MACRO(fVLOG_VTCM_HALFWORD, (EA,OFFSET,IN,IDX, LEN), + "if (RtV <= EA <= RtV + LEN) *EA = IN.uh[IDX] ", + "if (RtV <= EA <= RtV + LEN) *EA = IN.uh[IDX] ", { SCATTER_FUNCTION (EA,OFFSET,IDX, LEN, 2, IDX, 1, IN); }, () ) -DEF_MACRO(fVLOG_VTCM_WORD, +DEF_MACRO(fVLOG_VTCM_WORD, (EA,OFFSET,IN,IDX,LEN), + "if (RtV <= EA <= RtV + LEN) *EA = IN.uw[IDX] ", + "if (RtV <= EA <= RtV + LEN) *EA = IN.uw[IDX] ", { SCATTER_FUNCTION (EA,OFFSET,IDX, LEN, 4, IDX, 1, IN); }, () ) -DEF_MACRO(fVLOG_VTCM_HALFWORDQ, +DEF_MACRO(fVLOG_VTCM_HALFWORDQ, (EA,OFFSET,IN,IDX,Q,LEN), + " if ( (RtV <= EA <= RtV + LEN) & Q) *EA = IN.uh[IDX] ", + " if ( (RtV <= EA <= RtV + LEN) & Q) *EA = IN.uh[IDX] ", { SCATTER_FUNCTION (EA,OFFSET,IDX, LEN, 2, IDX, fGETQBIT(QsV,2*IDX+i0), IN); }, () ) -DEF_MACRO(fVLOG_VTCM_WORDQ, +DEF_MACRO(fVLOG_VTCM_WORDQ, (EA,OFFSET,IN,IDX,Q,LEN), + " if ( (RtV <= EA <= RtV + 
LEN) & Q) *EA = IN.uw[IDX] ", + " if ( (RtV <= EA <= RtV + LEN) & Q) *EA = IN.uw[IDX] ", { SCATTER_FUNCTION (EA,OFFSET,IDX, LEN, 4, IDX, fGETQBIT(QsV,4*IDX+i0), IN); }, @@ -479,14 +709,18 @@ DEF_MACRO(fVLOG_VTCM_WORDQ, -DEF_MACRO(fVLOG_VTCM_HALFWORD_DV, +DEF_MACRO(fVLOG_VTCM_HALFWORD_DV, (EA,OFFSET,IN,IDX,IDX2,IDX_H, LEN), + "if (RtV <= EA <= RtV + LEN) *EA = IN.w[IDX2].uh[IDX_H] ", + "if (RtV <= EA <= RtV + LEN) *EA = IN.w[IDX2].uh[IDX_H] ", { SCATTER_FUNCTION (EA,OFFSET,IDX, LEN, 2, (2*IDX2+IDX_H), 1, IN); }, () ) -DEF_MACRO(fVLOG_VTCM_HALFWORDQ_DV, +DEF_MACRO(fVLOG_VTCM_HALFWORDQ_DV, (EA,OFFSET,IN,IDX,Q,IDX2,IDX_H, LEN), + " if ( (RtV <= EA <= RtV + LEN) & Q) *EA = IN.w[IDX2].uh[IDX_H] ", + " if ( (RtV <= EA <= RtV + LEN) & Q) *EA = IN.w[IDX2].uh[IDX_H] ", { SCATTER_FUNCTION (EA,OFFSET,IDX, LEN, 2, (2*IDX2+IDX_H), fGETQBIT(QsV,2*IDX+i0), IN); }, @@ -498,39 +732,161 @@ DEF_MACRO(fVLOG_VTCM_HALFWORDQ_DV, -DEF_MACRO(fSTORERELEASE, +DEF_MACRO(fSTORERELEASE, (EA,TYPE), + "char* addr = EA&~(ALIGNMENT-1); Zero Byte Store Release (Non-blocking Sync)", + "Zero Byte Store Release (Sync)", { fV_AL_CHECK(EA,fVECSIZE()-1); mem_store_release(thread, insn, fVECSIZE(), EA&~(fVECSIZE()-1), EA, TYPE, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); }, - (A_STORE,A_MEMLIKE) + (A_STORE,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST) ) -DEF_MACRO(fVFETCH_AL, +DEF_MACRO(fVFETCH_AL, (EA), + "Prefetch vector into L2 cache at EA", + "Prefetch vector into L2 cache at EA", { fV_AL_CHECK(EA,fVECSIZE()-1); mem_fetch_vector(thread, insn, EA&~(fVECSIZE()-1), insn->slot, fVECSIZE()); }, - (A_LOAD,A_MEMLIKE) + (A_LOAD,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST,A_RESTRICT_NOSLOT1_STORE) ) -DEF_MACRO(fLOADMMV_AL, +DEF_MACRO(fLOADMMV_AL, (EA, ALIGNMENT, LEN, DST), + "char* addr = EA&~(ALIGNMENT-1); for (i=0; ilast_pkt->double_access_vec = 0; mem_load_vector_oddva(thread, insn, EA&~(ALIGNMENT-1), EA, insn->slot, LEN, &DST.ub[0], LEN, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); }, - (A_LOAD,A_MEMLIKE) + (A_LOAD,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST,A_RESTRICT_NOSLOT1_STORE) ) -DEF_MACRO(fLOADMMV, +DEF_MACRO(fLOADMMV, (EA, DST), + "DST = *(EA&~(ALIGNMENT-1))", + "Load vector from memory at EA (forced alignment) to DST.", fLOADMMV_AL(EA,fVECSIZE(),fVECSIZE(),DST), () ) -DEF_MACRO(fLOADMMVQ, +DEF_MACRO(fLOADMMZ, (EA,DST), + "DST[EA[7]] = *(EA)", + "Load splatter register from memory at EA (forced alignment) to DST.", + { + mmvector_t load_vec; + fV_AL_CHECK(EA,fVECSIZE()-1); + mem_load_vector_oddva(thread, insn, EA&~(fVECSIZE()-1), EA, insn->slot, fVECSIZE(), &load_vec.ub[0], fVECSIZE(), fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); + int idx = (EA & 0x80)>0; + DST.v[idx] = load_vec; + + }, + () +) +DEF_MACRO(fLOADZ_LOAD, (EA,EAU,WIDTH,DST), + "", + "", + { + thread->last_pkt->ext_slot_cancelled = 0; + thread->last_pkt->double_access_vec = 0; + int etm_size = ((EA % width) ==0) ? fVECSIZE() : 0; + if (thread->processor_ptr->options->testgen_mode) + etm_size = ((EA % width) ==0) ? 
WIDTH : 0; + + mem_load_vector_oddva(thread, insn, EA, EAU, insn->slot, WIDTH, &DST.ub[0], etm_size, fUSE_LOOKUP_ADDRESS()); + }, + (A_LOAD,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST,A_RESTRICT_NOSLOT1_STORE) +) + +DEF_MACRO(fELSE_CANCELZ, (), + "", + "", + else { + if (thread->last_pkt) { + thread->mem_access[insn->slot].dropped_z = 1; + thread->last_pkt->ext_slot_cancelled |= (1<slot); + } + }, + (A_LOAD,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST,A_RESTRICT_NOSLOT1_STORE) +) + + + + +DEF_MACRO(fPOST_INC4, (R), + "R+=4", + "", + R+=4; + , + (A_CVI_Z_INC_4) +) +DEF_MACRO(fPOST_INC8, (R), + "R+=8", + "", + R+=8; + , + (A_CVI_Z_INC_8) +) +DEF_MACRO(fPOST_INC16, (R), + "R+=16", + "", + R+=16; + , + (A_CVI_Z_INC_16) +) + +DEF_MACRO(fEXTRACTZ, + (DST,IDX), + "DST=ZREG[IDX]", /* short desc */ + "Read Vector IDX", /* long desc */ + (DST = READ_ZREG(fMODCIRCU((IDX),5))), + () +) + +DEF_MACRO(fLOADZ_UPDATE, (EA,WIDTH,ZN,N,SRC), + "for(i = 0; i < width; i++) ZREG.b[(EA+i)%ZN] = *(EA+i)", + "Load splatter register from memory at EA (forced alignment) to DST.", + { + mmvector_t Z[2]; + Z[0] = READ_ZREG(0); + Z[1] = READ_ZREG(1); + for(int k = 0; k < WIDTH; k++) { + int element_idx = (EA+k)%N; + int z_idx = ((EA+k)%ZN)/N; + Z[z_idx].ub[element_idx] = SRC.ub[k]; + } + + WRITE_EXT_ZREG(0,Z[0],0); + WRITE_EXT_ZREG(1,Z[1],0); + }, + (A_LOAD,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST,A_RESTRICT_NOSLOT1_STORE) +) +DEF_MACRO(fSTOREZ, (EA,WIDTH,ZN,N), + "for(i = 0; i < width; i++) *(EA+i) = ZREG.b[(EA+i)%ZN]", + "Store splatter register from memory at EA (forced alignment) to DST.", + { + mmvector_t store_vec; + mmvector_t maskvec = {0}; + mmvector_t Z[2]; + Z[0] = READ_ZREG(0); + Z[1] = READ_ZREG(1); + + for(int k = 0; k < WIDTH; k++) { + int element_idx = (EA+k)%N; + int z_idx = ((EA+k)%ZN)/N; + store_vec.ub[k] = Z[z_idx].ub[element_idx]; + maskvec.ub[k] = 1; + } + mem_store_vector_oddva(thread, insn, EA, EA, insn->slot, WIDTH, &store_vec.ub[0], &maskvec.ub[0], 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); + }, + (A_STORE,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST) +) + +DEF_MACRO(fLOADMMVQ, (EA,DST,QVAL), + "DST = vmux(QVAL,*(EA&~(ALIGNMENT-1)),0)", + "Load vector from memory at EA (forced alignment) to DST.", do { int __i; fLOADMMV_AL(EA,fVECSIZE(),fVECSIZE(),DST); @@ -539,7 +895,9 @@ DEF_MACRO(fLOADMMVQ, () ) -DEF_MACRO(fLOADMMVNQ, +DEF_MACRO(fLOADMMVNQ, (EA,DST,QVAL), + "DST = vmux(QVAL,0,*(EA&~(ALIGNMENT-1)))", + "Load vector from memory at EA (forced alignment) to DST.", do { int __i; fLOADMMV_AL(EA,fVECSIZE(),fVECSIZE(),DST); @@ -548,7 +906,9 @@ DEF_MACRO(fLOADMMVNQ, () ) -DEF_MACRO(fLOADMMVU_AL, +DEF_MACRO(fLOADMMVU_AL, (EA, ALIGNMENT, LEN, DST), + "char* addr = EA; for (i=0; iprocessor_ptr)); }, - (A_LOAD,A_MEMLIKE) + (A_LOAD,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST,A_RESTRICT_NOSLOT1_STORE) ) -DEF_MACRO(fLOADMMVU, +DEF_MACRO(fLOADMMVU, (EA, DST), + "DST = *EA", + "Load vector from memory at EA (unaligned) to DST.", { /* if address happens to be aligned, only do aligned load */ thread->last_pkt->pkt_has_vtcm_access = 0; @@ -579,63 +941,79 @@ DEF_MACRO(fLOADMMVU, () ) -DEF_MACRO(fSTOREMMV_AL, +DEF_MACRO(fSTOREMMV_AL, (EA, ALIGNMENT, LEN, SRC), + "char* addr = EA&~(ALIGNMENT-1); for (i=0; islot, LEN, &SRC.ub[0], 0, 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); + mem_store_vector_oddva(thread, insn, EA&~(ALIGNMENT-1), EA, insn->slot, LEN, SRC.ub, 0, 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); }, - (A_STORE,A_MEMLIKE) + (A_STORE,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST) ) -DEF_MACRO(fSTOREMMV, 
+DEF_MACRO(fSTOREMMV, (EA, SRC), + "*(EA&~(ALIGNMENT-1)) = SRC", + "Store vector SRC to memory at EA (unaligned).", fSTOREMMV_AL(EA,fVECSIZE(),fVECSIZE(),SRC), () ) -DEF_MACRO(fSTOREMMVQ_AL, +DEF_MACRO(fSTOREMMVQ_AL, (EA, ALIGNMENT, LEN, SRC, MASK), + "char* addr = EA&~(ALIGNMENT-1); for (i=0; islot, LEN, &SRC.ub[0], &maskvec.ub[0], 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); + mem_store_vector_oddva(thread, insn, EA&~(ALIGNMENT-1), EA, insn->slot, LEN, SRC.ub, &maskvec.ub[0], 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); } while (0), - (A_STORE,A_MEMLIKE) + (A_STORE,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST) ) -DEF_MACRO(fSTOREMMVQ, +DEF_MACRO(fSTOREMMVQ, (EA, SRC, MASK), + "*(EA&~(ALIGNMENT-1)) = SRC", + "Masked store vector SRC to memory at EA (forced alignment).", fSTOREMMVQ_AL(EA,fVECSIZE(),fVECSIZE(),SRC,MASK), () ) -DEF_MACRO(fSTOREMMVNQ_AL, +DEF_MACRO(fSTOREMMVNQ_AL, (EA, ALIGNMENT, LEN, SRC, MASK), + "char* addr = EA&~(ALIGNMENT-1); for (i=0; islot, LEN, &SRC.ub[0], &maskvec.ub[0], 1, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); + mem_store_vector_oddva(thread, insn, EA&~(ALIGNMENT-1), EA, insn->slot, LEN, SRC.ub, &maskvec.ub[0], 1, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); }, - (A_STORE,A_MEMLIKE) + (A_STORE,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST) ) -DEF_MACRO(fSTOREMMVNQ, +DEF_MACRO(fSTOREMMVNQ, (EA, SRC, MASK), + "*(EA&~(ALIGNMENT-1)) = SRC", + "Masked negated store vector SRC to memory at EA (forced alignment).", fSTOREMMVNQ_AL(EA,fVECSIZE(),fVECSIZE(),SRC,MASK), () ) -DEF_MACRO(fSTOREMMVU_AL, +DEF_MACRO(fSTOREMMVU_AL, (EA, ALIGNMENT, LEN, SRC), + "char* addr = EA; for (i=0; iLEN) size1 = LEN; size2 = LEN-size1; mem_store_vector_oddva(thread, insn, EA+size1, EA+fVECSIZE(), /* slot */ 1, size2, &SRC.ub[size1], 0, 0, fUSE_LOOKUP_ADDRESS()); - mem_store_vector_oddva(thread, insn, EA, EA, /* slot */ 0, size1, &SRC.ub[0], 0, 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); + mem_store_vector_oddva(thread, insn, EA, EA, /* slot */ 0, size1, SRC.ub, 0, 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); }, - (A_STORE,A_MEMLIKE) + (A_STORE,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST) ) -DEF_MACRO(fSTOREMMVU, +DEF_MACRO(fSTOREMMVU, (EA, SRC), + "*EA = SRC", + "Store vector SRC to memory at EA (unaligned).", { thread->last_pkt->pkt_has_vtcm_access = 0; thread->last_pkt->pkt_access_count = 0; @@ -651,7 +1029,9 @@ DEF_MACRO(fSTOREMMVU, () ) -DEF_MACRO(fSTOREMMVQU_AL, +DEF_MACRO(fSTOREMMVQU_AL, (EA, ALIGNMENT, LEN, SRC, MASK), + "char* addr = EA; for (i=0; iLEN) size1 = LEN; size2 = LEN-size1; mem_store_vector_oddva(thread, insn, EA+size1, EA+fVECSIZE(),/* slot */ 1, size2, &SRC.ub[size1], &maskvec.ub[size1], 0, fUSE_LOOKUP_ADDRESS()); - mem_store_vector_oddva(thread, insn, EA, /* slot */ 0, size1, &SRC.ub[0], &maskvec.ub[0], 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); + mem_store_vector_oddva(thread, insn, EA, /* slot */ 0, size1, SRC.ub, &maskvec.ub[0], 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); }, - (A_STORE,A_MEMLIKE) + (A_STORE,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST) ) -DEF_MACRO(fSTOREMMVQU, +DEF_MACRO(fSTOREMMVQU, (EA, SRC, MASK), + "*EA = SRC", + "Store vector SRC to memory at EA (unaligned).", { thread->last_pkt->pkt_has_vtcm_access = 0; thread->last_pkt->pkt_access_count = 0; @@ -682,7 +1064,9 @@ DEF_MACRO(fSTOREMMVQU, () ) -DEF_MACRO(fSTOREMMVNQU_AL, +DEF_MACRO(fSTOREMMVNQU_AL, (EA, ALIGNMENT, LEN, SRC, MASK), + "char* addr = EA; for (i=0; iLEN) size1 = LEN; size2 = LEN-size1; mem_store_vector_oddva(thread, insn, EA+size1, 
EA+fVECSIZE(), /* slot */ 1, size2, &SRC.ub[size1], &maskvec.ub[size1], 1, fUSE_LOOKUP_ADDRESS()); - mem_store_vector_oddva(thread, insn, EA, EA, /* slot */ 0, size1, &SRC.ub[0], &maskvec.ub[0], 1, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); + mem_store_vector_oddva(thread, insn, EA, EA, /* slot */ 0, size1, SRC.ub, &maskvec.ub[0], 1, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); }, - (A_STORE,A_MEMLIKE) + (A_STORE,A_MEMLIKE,A_RESTRICT_SINGLE_MEM_FIRST) ) -DEF_MACRO(fSTOREMMVNQU, +DEF_MACRO(fSTOREMMVNQU, (EA, SRC, MASK), + "*EA = SRC", + "Store vector SRC to memory at EA (unaligned).", { thread->last_pkt->pkt_has_vtcm_access = 0; thread->last_pkt->pkt_access_count = 0; @@ -716,127 +1102,446 @@ DEF_MACRO(fSTOREMMVNQU, -DEF_MACRO(fVFOREACH, +DEF_MACRO(fVFOREACH,(WIDTH, VAR), + "for (VAR = 0; VAR < VELEM(WIDTH); VAR++)", + "For VAR in each WIDTH-bit vector index", for (VAR = 0; VAR < fVELEM(WIDTH); VAR++), /* NOTHING */ ) -DEF_MACRO(fVARRAY_ELEMENT_ACCESS, +DEF_MACRO(fVARRAY_ELEMENT_ACCESS, (ARRAY, TYPE, INDEX), + "ARRAY.TYPE[INDEX]", + "Access element of type TYPE at position INDEX of flattened ARRAY", ARRAY.v[(INDEX) / (fVECSIZE()/(sizeof(ARRAY.TYPE[0])))].TYPE[(INDEX) % (fVECSIZE()/(sizeof(ARRAY.TYPE[0])))], () ) -DEF_MACRO(fVNEWCANCEL, +DEF_MACRO(fVNEWCANCEL,(REGNUM), + "Ignore current value for register REGNUM", + "Ignore current value for register REGNUM", do { THREAD2STRUCT->VRegs_select &= ~(1<<(REGNUM)); } while (0), () ) -DEF_MACRO(fTMPVDATA, +DEF_MACRO(fTMPVDATA,(), + "Data from .tmp load", + "Data from .tmp load and clear tmp status", mmvec_vtmp_data(thread), - (A_CVI) + (A_CVI,A_CVI_REQUIRES_TMPLOAD) ) -DEF_MACRO(fVSATDW, +DEF_MACRO(fVSATDW, (U,V), + "usat_32(U:V)", + "Use 32-bits of U as MSW and 32-bits of V as LSW and saturate the resultant 64-bits to 32 bits", fVSATW( ( ( ((long long)U)<<32 ) | fZXTN(32,64,V) ) ), /* attribs */ ) -DEF_MACRO(fVASL_SATHI, +DEF_MACRO(fVASL_SATHI, (U,V), + "uasl_sathi(U:V)", + "Use 32-bits of U as MSW and 32-bits of V as LSW, left shift by 1 and saturate the result and take high word", fVSATW(((U)<<1) | ((V)>>31)), /* attribs */ ) -DEF_MACRO(fVUADDSAT, +DEF_MACRO(fVUADDSAT,(WIDTH,U,V), + "usat_##WIDTH(U+V)", + "Add WIDTH-bit values U and V with saturation", fVSATUN( WIDTH, fZXTN(WIDTH, 2*WIDTH, U) + fZXTN(WIDTH, 2*WIDTH, V)), /* attribs */ ) -DEF_MACRO(fVSADDSAT, - fVSATN( WIDTH, fSXTN(WIDTH, 2*WIDTH, U) + fSXTN(WIDTH, 2*WIDTH, V)), +DEF_MACRO(fVSADDSAT,(WIDTH,U,V), + "sat_##WIDTH(U+V)", + "Add WIDTH-bit values U and V with saturation", + ({size8s_t tmp5 = fSXTN(WIDTH, 2*WIDTH, U); + size8s_t tmp6 = fSXTN(WIDTH, 2*WIDTH, V); + size8s_t tmp7 = tmp5 + tmp6; + fVSATN( WIDTH, tmp7); + }), /* attribs */ ) -DEF_MACRO(fVUSUBSAT, +DEF_MACRO(fVUSUBSAT,(WIDTH,U,V), + "usat_##WIDTH(U-V)", + "sub WIDTH-bit values U and V with saturation", fVSATUN( WIDTH, fZXTN(WIDTH, 2*WIDTH, U) - fZXTN(WIDTH, 2*WIDTH, V)), /* attribs */ ) -DEF_MACRO(fVSSUBSAT, +DEF_MACRO(fVSSUBSAT,(WIDTH,U,V), + "sat_##WIDTH(U-V)", + "sub WIDTH-bit values U and V with saturation", fVSATN( WIDTH, fSXTN(WIDTH, 2*WIDTH, U) - fSXTN(WIDTH, 2*WIDTH, V)), /* attribs */ ) -DEF_MACRO(fVAVGU, +DEF_MACRO(fVAVGU,(WIDTH,U,V), + "(U+V)/2", + "average WIDTH-bit values U and V with saturation", ((fZXTN(WIDTH, 2*WIDTH, U) + fZXTN(WIDTH, 2*WIDTH, V))>>1), /* attribs */ ) -DEF_MACRO(fVAVGURND, +DEF_MACRO(fVAVGURND,(WIDTH,U,V), + "(U+V+1)/2", + "average WIDTH-bit values U and V with saturation", ((fZXTN(WIDTH, 2*WIDTH, U) + fZXTN(WIDTH, 2*WIDTH, V)+1)>>1), /* attribs */ ) -DEF_MACRO(fVNAVGU, 
+DEF_MACRO(fVNAVGU,(WIDTH,U,V), + "(U-V)/2", + "average WIDTH-bit values U and V with saturation", ((fZXTN(WIDTH, 2*WIDTH, U) - fZXTN(WIDTH, 2*WIDTH, V))>>1), /* attribs */ ) -DEF_MACRO(fVNAVGURNDSAT, +DEF_MACRO(fVNAVGURNDSAT,(WIDTH,U,V), + "(U-V+1)/2", + "average WIDTH-bit values U and V with saturation", fVSATUN(WIDTH,((fZXTN(WIDTH, 2*WIDTH, U) - fZXTN(WIDTH, 2*WIDTH, V)+1)>>1)), /* attribs */ ) -DEF_MACRO(fVAVGS, +DEF_MACRO(fVAVGS,(WIDTH,U,V), + "(U+V)/2", + "average WIDTH-bit values U and V with saturation", ((fSXTN(WIDTH, 2*WIDTH, U) + fSXTN(WIDTH, 2*WIDTH, V))>>1), /* attribs */ ) -DEF_MACRO(fVAVGSRND, +DEF_MACRO(fVAVGSRND,(WIDTH,U,V), + "(U+V+1)/2", + "average WIDTH-bit values U and V with saturation", ((fSXTN(WIDTH, 2*WIDTH, U) + fSXTN(WIDTH, 2*WIDTH, V)+1)>>1), /* attribs */ ) -DEF_MACRO(fVNAVGS, +DEF_MACRO(fVNAVGS,(WIDTH,U,V), + "(U-V)/2", + "average WIDTH-bit values U and V with saturation", ((fSXTN(WIDTH, 2*WIDTH, U) - fSXTN(WIDTH, 2*WIDTH, V))>>1), /* attribs */ ) -DEF_MACRO(fVNAVGSRND, +DEF_MACRO(fVNAVGSRND,(WIDTH,U,V), + "(U-V+1)/2", + "average WIDTH-bit values U and negative V followed by rounding", ((fSXTN(WIDTH, 2*WIDTH, U) - fSXTN(WIDTH, 2*WIDTH, V)+1)>>1), /* attribs */ ) -DEF_MACRO(fVNAVGSRNDSAT, +DEF_MACRO(fVNAVGSRNDSAT,(WIDTH,U,V), + "(U-V+1)/2", + "average WIDTH-bit values U and V with saturation", fVSATN(WIDTH,((fSXTN(WIDTH, 2*WIDTH, U) - fSXTN(WIDTH, 2*WIDTH, V)+1)>>1)), /* attribs */ ) -DEF_MACRO(fVNOROUND, +DEF_MACRO(fVNOROUND,(VAL,SHAMT), + "VAL", + "VAL", VAL, /* NOTHING */ ) -DEF_MACRO(fVNOSAT, +DEF_MACRO(fVNOSAT,(VAL), + "VAL", + "VAL", VAL, /* NOTHING */ ) -DEF_MACRO(fVROUND, +DEF_MACRO(fVROUND,(VAL,SHAMT), + "VAL + (1<<(SHAMT-1))", + "VAL + RNDBIT", ((VAL) + (((SHAMT)>0)?(1LL<<((SHAMT)-1)):0)), /* NOTHING */ ) -DEF_MACRO(fCARRY_FROM_ADD32, +DEF_MACRO(fCARRY_FROM_ADD32,(A,B,C), + "carry_from(A,B,C)", + "carry_from(A,B,C)", (((fZXTN(32,64,A)+fZXTN(32,64,B)+C) >> 32) & 1), /* NOTHING */ ) -DEF_MACRO(fUARCH_NOTE_PUMP_4X, +DEF_MACRO(fUARCH_NOTE_PUMP_4X,(), + "", + "", , - () + (A_CVI_PUMP_4X) ) -DEF_MACRO(fUARCH_NOTE_PUMP_2X, +DEF_MACRO(fUARCH_NOTE_PUMP_2X,(), + "", + "", , + (A_CVI_PUMP_2X) +) + +DEF_MACRO(fVDOCHKPAGECROSS,(BASE,SUM), + "", + "", + if (UNLIKELY(thread->timing_on)) { + thread->mem_access[slot].check_page_crosses = 1; + thread->mem_access[slot].page_cross_base = BASE; + thread->mem_access[slot].page_cross_sum = SUM; + }, + (A_EA_PAGECROSS) +) + +/* FP instructions */ +/*Qfloat Macros for muls*/ +DEF_MACRO(fPARSEQF32,(A), + "A", + "Parsing QF32 to extract exp/sig", + parse_qf32(A), + (A_HVX_FLT) +) + +DEF_MACRO(fRNDSATQF32,(A,B,C), + "rnd_sat(A,B,C)", + "Rnd/Sat/Norm of Vector Multiply of two QF32 inputs", + rnd_sat_qf32(A,B,C), + (A_HVX_FLT) +) + +DEF_MACRO(fPARSEQF16,(A), + "A", + "Parsing QF16 to extract exp/sig", + parse_qf16(A), + (A_HVX_FLT) +) + +DEF_MACRO(fRNDSATQF16,(A,B,C), + "rnd_sat(A,B,C)", + "Rnd/Sat/Norm of Vector Multiply of two QF16 inputs", + rnd_sat_qf16(A,B,C), () ) +/*Qfloat Macros for others*/ +DEF_MACRO(fPARSESF,(A), + "A", + "Parsing IEEE SF to extract sign/exp/sig", + parse_sf(A), + (A_HVX_FLT) +) + +DEF_MACRO(fRNDSATSF,(A,B), + "rnd_sat(A,B)", + "Rnd/Sat/Norm of Vector Multiply of two IEEE SF inputs", + rnd_sat_sf(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fPARSEHF,(A), + "A", + "Parsing IEEE HF to extract sign/exp/sig", + parse_hf(A), + (A_HVX_FLT) +) + +DEF_MACRO(fRNDSATHF,(A,B), + "rnd_sat(A,B)", + "Rnd/Sat/Norm of Vector Multiply of two IEEE HF inputs", + rnd_sat_hf(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fRNDSATW,(A,B), + 
"rnd_sat(A,B)", + "Rnd/Sat/Norm of Vector convert of W inputs", + rnd_sat_w(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fRNDSATUW,(A,B), + "rnd_sat(A,B)", + "Rnd/Sat/Norm of Vector convert of UW inputs", + rnd_sat_uw(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fRNDSATH,(A,B), + "rnd_sat(A,B)", + "Rnd/Sat/Norm of Vector convert of H inputs", + rnd_sat_h(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fRNDSATUH,(A,B), + "rnd_sat(A,B)", + "Rnd/Sat/Norm of Vector convert of UW inputs", + rnd_sat_uh(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fRNDSATB,(A,B), + "rnd_sat(A,B)", + "Rnd/Sat/Norm of Vector convert of B inputs", + rnd_sat_b(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fRNDSATUB,(A,B), + "rnd_sat(A,B)", + "Rnd/Sat/Norm of Vector convert of UB inputs", + rnd_sat_ub(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fNEGQF32,(A), + "-(A)", + "Take Ones complement", + negate32(A), + (A_HVX_FLT) +) + +DEF_MACRO(fNEGQF16,(A), + "-(A)", + "Take Ones complement", + negate16(A), + (A_HVX_FLT) +) + +DEF_MACRO(fNEGSF,(A), + "-(A)", + "Change sign", + negate_sf(A), + (A_HVX_FLT) +) +DEF_MACRO(fNEGHF,(A), + "-(A)", + "Change sign", + negate_hf(A), + (A_HVX_FLT) +) + +//FP vector compare +DEF_MACRO(fCMPGT_QF32,(A,B), + "(A > B)", + "Vector compare of QF32 format", + cmpgt_qf32(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fCMPGT_QF16,(A,B), + "(A > B)", + "Vector compare of QF16 format", + cmpgt_qf16(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fCMPGT_SF,(A,B), + "(A > B)", + "Vector compare of SF format", + cmpgt_sf(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fCMPGT_HF,(A,B), + "(A > B)", + "Vector compare of HF format", + cmpgt_hf(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fCMPGT_BF,(A,B), + "(A > B)", + "Vector compare of BF format", + cmpgt_sf(((int)A) << 16,((int)B) << 16), + (A_HVX_FLT) +) + +DEF_MACRO(fCMPGT_QF32_SF,(A,B), + "(A > B)", + "Vector compare of QF32/SF format", + cmpgt_qf32_sf(A,B), + (A_HVX_FLT) +) + +DEF_MACRO(fCMPGT_QF16_HF,(A,B), + "(A > B)", + "Vector compare of QF16/HF format", + cmpgt_qf16_hf(A,B), + (A_HVX_FLT) +) + +//VMAX/VMIN_QF32/QF16 +DEF_MACRO(fMAX_QF32,(X,Y), + "max(X,Y)", + "", + max_qf32(X,Y), + (A_HVX_FLT) +) +DEF_MACRO(fMIN_QF32,(X,Y), + "min(X,Y)", + "", + min_qf32(X,Y), + (A_HVX_FLT) +) +DEF_MACRO(fMAX_QF32_SF,(X,Y), + "max(X,Y)", + "", + max_qf32_sf(X,Y), + (A_HVX_FLT) +) +DEF_MACRO(fMIN_QF32_SF,(X,Y), + "min(X,Y)", + "", + min_qf32_sf(X,Y), + (A_HVX_FLT) +) +DEF_MACRO(fMAX_QF16,(X,Y), + "max(X,Y)", + "", + max_qf16(X,Y), + (A_HVX_FLT) +) +DEF_MACRO(fMIN_QF16,(X,Y), + "min(X,Y)", + "", + min_qf16(X,Y), + (A_HVX_FLT) +) +DEF_MACRO(fMAX_QF16_HF,(X,Y), + "max(X,Y)", + "", + max_qf16_hf(X,Y), + (A_HVX_FLT) +) +DEF_MACRO(fMIN_QF16_HF,(X,Y), + "min(X,Y)", + "", + min_qf16_hf(X,Y), + (A_HVX_FLT) +) + +//MAX/MIN_SF/HF +DEF_MACRO(fMAX_SF,(X,Y), + "max(X,Y)", + "", + max_sf(X,Y), + (A_HVX_FLT) +) +DEF_MACRO(fMIN_SF,(X,Y), + "min(X,Y)", + "", + min_sf(X,Y), + (A_HVX_FLT) +) +DEF_MACRO(fMAX_HF,(X,Y), + "max(X,Y)", + "", + max_hf(X,Y), + (A_HVX_FLT) +) +DEF_MACRO(fMIN_HF,(X,Y), + "min(X,Y)", + "", + min_hf(X,Y), + (A_HVX_FLT) +) + diff --git a/target/hexagon/imported/system.idef b/target/hexagon/imported/system.idef index 7c6568e75e42..aa57149a1ceb 100644 --- a/target/hexagon/imported/system.idef +++ b/target/hexagon/imported/system.idef @@ -25,44 +25,301 @@ /* User->OS interface */ /********************************************/ -Q6INSN(J2_trap0,"trap0(#u8)",ATTRIBS(A_COF), +Q6INSN(J2_trap0,"trap0(#u8)",ATTRIBS(A_COF,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET), "Trap to Operating System", - fTRAP(0,uiV); + fTRAP(0,uiV); ) -Q6INSN(J2_pause,"pause(#u8)",ATTRIBS(A_COF), 
+Q6INSN(J2_trap1,"trap1(Rx32,#u8)",ATTRIBS(A_COF,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET), +"Trap to Operating System", + /* + * Note: if RxV is not written, we get the same as the input. + * Since trap1 is SOLO, this means the register will effectively not be updated + */ + if (!fTRAP1_VIRTINSN(uiV)) { + fTRAP(1,uiV); + } else if (uiV == 1) { + fVIRTINSN_RTE(uiV,RxV); + } else if (uiV == 3) { + fVIRTINSN_SETIE(uiV,RxV); + } else if (uiV == 4) { + fVIRTINSN_GETIE(uiV,RxV); + } else if (uiV == 6) { + fVIRTINSN_SPSWAP(uiV,RxV); + }) + +Q6INSN(J2_pause,"pause(#u8)",ATTRIBS(A_COF,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET), "Enter low-power state for #u8 cycles",{fPAUSE(uiV);}) -Q6INSN(Y2_icinva,"icinva(Rs32)",ATTRIBS(A_ICOP,A_ICFLUSHOP),"Instruction Cache Invalidate Address",{fEA_REG(RsV); fICINVA(EA);}) +Q6INSN(J2_rte, "rte", ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NO_TIMING_LOG), +"Return from Exception", +{ +fHIDE(if((thread->timing_on) && (thread->status & EXEC_STATUS_REPLAY)) { return; }) +fHIDE(CALLBACK(thread->processor_ptr->options->rte_callback, + thread->system_ptr,thread->processor_ptr, + thread->threadId,0);) +fCLEAR_RTE_EX(); +fBRANCH(fREAD_ELR(),COF_TYPE_RTE);}) + + +/********************************************/ +/* Interrupt Management */ +/********************************************/ + +Q6INSN(Y2_swi,"swi(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK),"Software Interrupt",{DO_SWI(RsV);}) +Q6INSN(Y2_cswi,"cswi(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK),"Cancel Software Interrupt",{DO_CSWI(RsV);}) +Q6INSN(Y2_ciad,"ciad(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK),"Re-enable interrupt in IAD",{DO_CIAD(RsV);}) +Q6INSN(Y4_siad,"siad(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK),"Disable interrupt in IAD",{DO_SIAD(RsV);}) +Q6INSN(Y2_iassignr,"Rd32=iassignr(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK),"Read interrupt to thread assignments",{DO_IASSIGNR(RsV,RdV);}) +Q6INSN(Y2_iassignw,"iassignw(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK),"Write interrupt to thread assignments",{DO_IASSIGNW(RsV);}) + + +Q6INSN(Y2_getimask,"Rd32=getimask(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK),"Read imask register of another thread", +{RdV = READ_IMASK(RsV & thread->processor_ptr->thread_system_mask); }) + +Q6INSN(Y2_setimask,"setimask(Pt4,Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK),"Change imask register of another thread", +{fPREDUSE_TIMING();WRITE_IMASK(PtV & thread->processor_ptr->thread_system_mask,RsV); }) + + + +/********************************************/ +/* TLB management */ +/********************************************/ + +Q6INSN(Y2_tlbw,"tlbw(Rss32,Rt32)", ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET), +"Write TLB entry", {fTLBW(RtV,RssV);}) + +Q6INSN(Y5_ctlbw,"Rd32=ctlbw(Rss32,Rt32)", ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET), +"Conditional Write TLB entry", +{ + if (fTLB_ENTRY_OVERLAP( (1LL<<63) | RssV )) { + RdV=fTLB_ENTRY_OVERLAP_IDX( (1LL<<63) | RssV); + } else { + fTLBW(RtV,RssV); + RdV=0x80000000; + } +}) + +Q6INSN(Y5_tlboc,"Rd32=tlboc(Rss32)", ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET), +"TLB overlap check", +{ + if (fTLB_ENTRY_OVERLAP( (1LL<<63) | RssV )) { + RdV=fTLB_ENTRY_OVERLAP_IDX( (1LL<<63) | RssV); + } else { + RdV=0x80000000; + } +}) + +Q6INSN(Y2_tlbr,"Rdd32=tlbr(Rs32)", 
ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET), "Read TLB entry", +{RddV = fTLBR(RsV);}) + +Q6INSN(Y2_tlbp,"Rd32=tlbp(Rs32)", ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET), "Probe TLB", {RdV=fTLBP(RsV);}) + +Q6INSN(Y5_tlbasidi,"tlbinvasid(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET), "Invalidate ASID", +{ + fHIDE(int i;) + fHIDE(unsigned int NUM_TLB_ENTRIES = NUM_TLB_REGS(thread->processor_ptr);) + for (i = 0; i < NUM_TLB_ENTRIES; i++) { + if ((fGET_FIELD(fTLBR(i),PTE_G) == 0) && + (fGET_FIELD(fTLBR(i),PTE_ASID) == fEXTRACTU_RANGE(RsV,26,20))) { + fTLBW(i,fTLBR(i) & ~(1ULL << 63)); + } + } +}) + +Q6INSN(Y2_tlblock,"tlblock", ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_NO_TIMING_LOG), "Lock TLB", +{fSET_TLB_LOCK();}) + +Q6INSN(Y2_tlbunlock,"tlbunlock", ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET), "Unlock TLB", +{fCLEAR_TLB_LOCK();}) + +Q6INSN(Y2_k0lock,"k0lock", ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_NO_TIMING_LOG), "Lock K0", +{fSET_K0_LOCK();}) + +Q6INSN(Y2_k0unlock,"k0unlock", ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET), "Unlock K0", +{fCLEAR_K0_LOCK();}) + +/********************************************/ +/* Supervisor Reg Management */ +/********************************************/ + +Q6INSN(Y2_crswap0,"crswap(Rx32,sgp0)",ATTRIBS(A_PRIV,A_NOTE_PRIV), "Swap system general pointer 0 with GPR", +{fHIDE(size4s_t tmp;) tmp = RxV; RxV = READ_SGP0(); WRITE_SGP0(tmp);}) +Q6INSN(Y4_crswap1,"crswap(Rx32,sgp1)",ATTRIBS(A_PRIV,A_NOTE_PRIV), "Swap system general pointer 1 with GPR", +{fHIDE(size4s_t tmp;) tmp = RxV; RxV = READ_SGP1(); WRITE_SGP1(tmp);}) + +Q6INSN(Y4_crswap10,"crswap(Rxx32,sgp1:0)",ATTRIBS(A_PRIV,A_NOTE_PRIV), "Swap system general purpose 0/1 with GPR Pair", +{fHIDE(size8s_t tmp;) tmp = RxxV; RxxV=READ_SGP10(); WRITE_SGP10(tmp);}) + +Q6INSN(Y2_tfrscrr,"Rd32=Ss128",ATTRIBS(A_PRIV,A_NOTE_PRIV),"Transfer Supervisor Reg to GPR", {RdV=SsV;}) +Q6INSN(Y2_tfrsrcr,"Sd128=Rs32",ATTRIBS(A_PRIV,A_NOTE_PRIV),"Transfer GPR to Supervisor Reg", {SdV=RsV;}) +Q6INSN(Y4_tfrscpp,"Rdd32=Sss128",ATTRIBS(A_PRIV,A_NOTE_PRIV),"Transfer Supervisor Reg to GPR", {RddV=SssV;}) +Q6INSN(Y4_tfrspcp,"Sdd128=Rss32",ATTRIBS(A_PRIV,A_NOTE_PRIV),"Transfer GPR to Supervisor Reg", {SddV=RssV;}) + +Q6INSN(G4_tfrgcrr,"Rd32=Gs32",ATTRIBS(A_GUEST,A_NOTE_GUEST),"Transfer Guest Reg to GPR", {RdV=GsV;}) +Q6INSN(G4_tfrgrcr,"Gd32=Rs32",ATTRIBS(A_GUEST,A_NOTE_GUEST),"Transfer GPR to Guest Reg", {GdV=RsV;}) +Q6INSN(G4_tfrgcpp,"Rdd32=Gss32",ATTRIBS(A_GUEST,A_NOTE_GUEST),"Transfer Guest Reg to GPR", {RddV=GssV;}) +Q6INSN(G4_tfrgpcp,"Gdd32=Rss32",ATTRIBS(A_GUEST,A_NOTE_GUEST),"Transfer GPR to Guest Reg", {GddV=RssV;}) + + + +Q6INSN(Y2_setprio,"setprio(Pt4,Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV),"Change TID Prio of another thread", +{fPREDUSE_TIMING();WRITE_PRIO(PtV & thread->processor_ptr->thread_system_mask,RsV); }) + + + + +/********************************************/ +/* Power Management / Thread on/off */ +/********************************************/ +Q6INSN(Y6_diag,"diag(Rs32)",ATTRIBS(),"Send value to Diag trace module",{ +}) +Q6INSN(Y6_diag0,"diag0(Rss32,Rtt32)",ATTRIBS(),"Send values of two register to DIAG Trace. Set X=0",{ +}) +Q6INSN(Y6_diag1,"diag1(Rss32,Rtt32)",ATTRIBS(),"Send values of two register to DIAG Trace. 
Set X=1",{ +}) + -Q6INSN(Y2_isync,"isync",ATTRIBS(),"Memory Synchronization",{fISYNC();}) -Q6INSN(Y2_barrier,"barrier",ATTRIBS(A_RESTRICT_SLOT0ONLY),"Memory Barrier",{fBARRIER();}) -Q6INSN(Y2_syncht,"syncht",ATTRIBS(A_RESTRICT_SLOT0ONLY),"Memory Synchronization",{fSYNCH();}) +Q6INSN(Y4_trace,"trace(Rs32)",ATTRIBS(A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK),"Send value to ETM trace",{ + fDO_TRACE(RsV); +}) + +Q6INSN(Y2_stop,"stop(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET),"Stop thread(s)",{ + fHIDE(RsV=RsV;) + if (!fIN_DEBUG_MODE_NO_ISDB(fGET_TNUM())) fCLEAR_RUN_MODE(fGET_TNUM()); +}) + +Q6INSN(Y4_nmi,"nmi(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_NO_TIMING_LOG),"Raise NMI on thread(s)",{ + fDO_NMI(RsV); +}) + +Q6INSN(Y2_start,"start(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET),"Start thread(s)",fSTART(RsV);) + +Q6INSN(Y2_wait,"wait(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_NO_TIMING_LOG),"Make thread(s) wait",{ + fHIDE(RsV=RsV;) + if (!fIN_DEBUG_MODE(fGET_TNUM())) fSET_WAIT_MODE(fGET_TNUM()); + fIN_DEBUG_MODE_WARN(fGET_TNUM()); +}) + +Q6INSN(Y2_resume,"resume(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET),"Make thread(s) stop waiting",fRESUME(RsV);) + +Q6INSN(Y2_break,"brkpt",ATTRIBS(A_NOTE_NOPACKET,A_RESTRICT_NOPACKET),"Breakpoint",{fBREAK();}) + + +/********************************************/ +/* Cache Management */ +/********************************************/ + +Q6INSN(Y2_ictagr,"Rd32=ictagr(Rs32)",ATTRIBS(A_ICOP,A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_CACHEOP,A_COPBYIDX,A_ICTAGOP),"Instruction Cache Tag Read",{fICTAGR(RsV,RdV,RdN);}) +Q6INSN(Y2_ictagw,"ictagw(Rs32,Rt32)",ATTRIBS(A_ICOP,A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_CACHEOP,A_COPBYIDX,A_ICTAGOP),"Instruction Cache Tag Write",{fICTAGW(RsV,RtV);}) +Q6INSN(Y2_icdataw,"icdataw(Rs32,Rt32)",ATTRIBS(A_ICOP,A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_CACHEOP,A_COPBYIDX,A_ICTAGOP),"Instruction Cache Data Write",{fICDATAW(RsV,RtV);}) +Q6INSN(Y2_icdatar,"Rd32=icdatar(Rs32)",ATTRIBS(A_ICOP,A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_CACHEOP,A_COPBYIDX,A_ICTAGOP),"Instruction Cache Data Read",{fICDATAR(RsV, RdV);}) +Q6INSN(Y2_icinva,"icinva(Rs32)",ATTRIBS(A_ICOP,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_CACHEOP,A_COPBYADDRESS,A_ICFLUSHOP),"Instruction Cache Invalidate Address",{fEA_REG(RsV); fICINVA(EA);}) +Q6INSN(Y2_icinvidx,"icinvidx(Rs32)",ATTRIBS(A_ICOP,A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_CACHEOP,A_COPBYIDX,A_ICFLUSHOP),"Instruction Cache Invalidate Index",{fICINVIDX(RsV);}) +Q6INSN(Y2_ickill,"ickill",ATTRIBS(A_ICOP,A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_CACHEOP,A_ICFLUSHOP),"Instruction Cache Invalidate",{fICKILL();}) + +Q6INSN(Y2_isync,"isync",ATTRIBS(A_NOTE_NOPACKET,A_RESTRICT_NOPACKET),"Memory Synchronization",{fISYNC();}) +Q6INSN(Y2_barrier,"barrier",ATTRIBS(A_NOTE_NOPACKET,A_RESTRICT_SLOT0ONLY,A_RESTRICT_PACKET_AXOK),"Memory Barrier",{fBARRIER();}) +Q6INSN(Y2_syncht,"syncht",ATTRIBS(A_NOTE_NOPACKET,A_RESTRICT_SLOT0ONLY,A_RESTRICT_NOPACKET),"Memory Synchronization",{fSYNCH();}) + + +Q6INSN(Y2_dcfetchbo,"dcfetch(Rs32+#u11:3)",ATTRIBS(A_RESTRICT_PREFERSLOT0,A_DCFETCH,A_RESTRICT_NOSLOT1_STORE),"Data Cache Prefetch",{fEA_RI(RsV,uiV); fDCFETCH(EA);}) +Q6INSN(Y2_dckill,"dckill",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_SLOT0ONLY,A_RESTRICT_NOPACKET,A_CACHEOP,A_DCFLUSHOP),"Data Cache 
Invalidate",{fDCKILL();}) -Q6INSN(Y2_dcfetchbo,"dcfetch(Rs32+#u11:3)",ATTRIBS(A_RESTRICT_PREFERSLOT0,A_DCFETCH),"Data Cache Prefetch",{fEA_RI(RsV,uiV); fDCFETCH(EA);}) +Q6INSN(Y2_dczeroa,"dczeroa(Rs32)",ATTRIBS(A_STORE,A_RESTRICT_SLOT1_AOK,A_NOTE_SLOT1_AOK,A_RESTRICT_SLOT0ONLY,A_CACHEOP,A_COPBYADDRESS,A_DCZEROA),"Zero an aligned 32-byte cacheline",{fEA_REG(RsV); fDCZEROA(EA);}) +Q6INSN(Y2_dccleana,"dccleana(Rs32)",ATTRIBS(A_RESTRICT_SLOT1_AOK,A_NOTE_SLOT1_AOK,A_RESTRICT_SLOT0ONLY,A_CACHEOP,A_COPBYADDRESS,A_DCFLUSHOP),"Data Cache Clean Address",{fEA_REG(RsV); fDCCLEANA(EA);}) +Q6INSN(Y2_dccleanidx,"dccleanidx(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_RESTRICT_PACKET_AXOK,A_NOTE_AXOK,A_RESTRICT_SLOT0ONLY,A_CACHEOP,A_COPBYIDX,A_DCFLUSHOP),"Data Cache Clean Index",{fDCCLEANIDX(RsV);}) +Q6INSN(Y2_dccleaninva,"dccleaninva(Rs32)",ATTRIBS(A_RESTRICT_SLOT1_AOK,A_NOTE_SLOT1_AOK,A_RESTRICT_SLOT0ONLY,A_CACHEOP,A_COPBYADDRESS,A_DCFLUSHOP),"Data Cache Clean and Invalidate Address",{fEA_REG(RsV); fDCCLEANINVA(EA);}) +Q6INSN(Y2_dccleaninvidx,"dccleaninvidx(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_RESTRICT_PACKET_AXOK,A_NOTE_AXOK,A_RESTRICT_SLOT0ONLY,A_CACHEOP,A_COPBYIDX,A_DCFLUSHOP),"Data Cache Clean and Invalidate Index",{fDCCLEANINVIDX(RsV);}) +Q6INSN(Y2_dcinva,"dcinva(Rs32)",ATTRIBS(A_RESTRICT_SLOT1_AOK,A_NOTE_SLOT1_AOK,A_RESTRICT_SLOT0ONLY,A_CACHEOP,A_COPBYADDRESS,A_DCFLUSHOP),"Data Cache Invalidate Address",{fEA_REG(RsV); fDCCLEANINVA(EA);}) +Q6INSN(Y2_dcinvidx,"dcinvidx(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_RESTRICT_PACKET_AXOK,A_NOTE_AXOK,A_RESTRICT_SLOT0ONLY,A_CACHEOP,A_COPBYIDX,A_DCFLUSHOP),"Data Cache Invalidate Index",{fDCINVIDX(RsV);}) +Q6INSN(Y2_dctagr,"Rd32=dctagr(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_RESTRICT_PACKET_AXOK,A_NOTE_AXOK,A_RESTRICT_SLOT0ONLY,A_CACHEOP,A_COPBYIDX,A_DCTAGOP),"Data Cache Tag Read",{fDCTAGR(RsV,RdV,RdN);}) +Q6INSN(Y2_dctagw,"dctagw(Rs32,Rt32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_RESTRICT_SLOT0ONLY,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_CACHEOP,A_COPBYIDX,A_DCTAGOP),"Data Cache Tag Write",{fDCTAGW(RsV,RtV);}) -Q6INSN(Y2_dczeroa,"dczeroa(Rs32)",ATTRIBS(A_STORE,A_RESTRICT_SLOT0ONLY,A_DCZEROA),"Zero an aligned 32-byte cacheline",{fEA_REG(RsV); fDCZEROA(EA);}) -Q6INSN(Y2_dccleana,"dccleana(Rs32)",ATTRIBS(A_RESTRICT_SLOT0ONLY,A_DCFLUSHOP),"Data Cache Clean Address",{fEA_REG(RsV); fDCCLEANA(EA);}) -Q6INSN(Y2_dccleaninva,"dccleaninva(Rs32)",ATTRIBS(A_RESTRICT_SLOT0ONLY,A_DCFLUSHOP),"Data Cache Clean and Invalidate Address",{fEA_REG(RsV); fDCCLEANINVA(EA);}) -Q6INSN(Y2_dcinva,"dcinva(Rs32)",ATTRIBS(A_RESTRICT_SLOT0ONLY,A_DCFLUSHOP),"Data Cache Invalidate Address",{fEA_REG(RsV); fDCCLEANINVA(EA);}) +Q6INSN(Y2_l2kill,"l2kill",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_SLOT0ONLY,A_RESTRICT_NOPACKET,A_CACHEOP,A_L2FLUSHOP),"L2 Cache Invalidate",{fL2KILL();}) +Q6INSN(Y4_l2tagw,"l2tagw(Rs32,Rt32)",ATTRIBS(A_PRIV,A_NOTE_BADTAG_UNDEF,A_NOTE_PRIV,A_RESTRICT_SLOT0ONLY,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_CACHEOP,A_COPBYIDX,A_L2TAGOP),"L2 Cache Tag Write",{fL2TAGW(RsV,RtV);}) +Q6INSN(Y4_l2tagr,"Rd32=l2tagr(Rs32)",ATTRIBS(A_PRIV,A_NOTE_BADTAG_UNDEF,A_NOTE_PRIV,A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK,A_RESTRICT_SLOT0ONLY,A_CACHEOP,A_COPBYIDX,A_L2TAGOP),"L2 Cache Tag Read",{fL2TAGR(RsV,RdV,RdN);}) +Q6INSN(Y2_l2cleaninvidx,"l2cleaninvidx(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK,A_RESTRICT_SLOT0ONLY,A_CACHEOP,A_COPBYIDX,A_L2FLUSHOP),"L2 Cache Clean and Invalidate Index",{fL2CLEANINVIDX(RsV); }) 
+Q6INSN(Y5_l2cleanidx,"l2cleanidx(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK,A_RESTRICT_SLOT0ONLY,A_CACHEOP,A_COPBYIDX,A_L2FLUSHOP),"L2 Cache Clean by Index",{fL2CLEANIDX(RsV); }) +Q6INSN(Y5_l2invidx,"l2invidx(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_AXOK,A_RESTRICT_PACKET_AXOK,A_RESTRICT_SLOT0ONLY,A_CACHEOP,A_COPBYIDX,A_L2FLUSHOP),"L2 Cache Invalidate by Index",{fL2INVIDX(RsV); }) -Q6INSN(Y4_l2fetch,"l2fetch(Rs32,Rt32)",ATTRIBS(A_RESTRICT_SLOT0ONLY),"L2 Cache Prefetch", + + +Q6INSN(Y4_l2fetch,"l2fetch(Rs32,Rt32)",ATTRIBS(A_RESTRICT_SLOT0ONLY,A_RESTRICT_PACKET_AXOK,A_NOTE_AXOK),"L2 Cache Prefetch", { fL2FETCH(RsV, - (RtV&0xff), /*height*/ - ((RtV>>8)&0xff), /*width*/ - ((RtV>>16)&0xffff), /*stride*/ - 0); /*extra attrib flags*/ + (RtV&0xff), /*height*/ + ((RtV>>8)&0xff), /*width*/ + ((RtV>>16)&0xffff), /*stride*/ + 0); /*extra attrib flags*/ +}) + +Q6INSN(Y6_dmstart,"dmstart(Rs32)",ATTRIBS(A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_DMA,A_RESTRICT_SLOT0ONLY,A_SYNC_MARKER,A_NO_TIMING_LOG),"DMA Start", { + fUNIMP(); +}) + +Q6INSN(Y6_dmlink,"dmlink(Rs32,Rt32)",ATTRIBS(A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_DMA,A_RESTRICT_SLOT0ONLY,A_SYNC_MARKER,A_NO_TIMING_LOG),"DMA Link", { + fUNIMP(); }) +Q6INSN(Y6_dmpoll,"Rd32=dmpoll",ATTRIBS(A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_DMA,A_RESTRICT_SLOT0ONLY,A_NO_TIMING_LOG),"DMA Poll", { + fUNIMP(); +}) + +Q6INSN(Y6_dmwait,"Rd32=dmwait",ATTRIBS(A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_DMA,A_RESTRICT_SLOT0ONLY,A_NO_TIMING_LOG),"DMA Wait", { + fUNIMP(); +}) + +Q6INSN(Y6_dmsyncht,"Rd32=dmsyncht",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_DMA,A_RESTRICT_SLOT0ONLY,A_NO_TIMING_LOG),"DMA SynchT",{ + fUNIMP(); +}) +Q6INSN(Y6_dmtlbsynch,"Rd32=dmtlbsynch",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_DMA,A_RESTRICT_SLOT0ONLY,A_NO_TIMING_LOG),"DMA TLB Synch",{ + fUNIMP(); +}) + +Q6INSN(Y6_dmcfgrd,"Rd32=dmcfgrd(Rs32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_DMA,A_RESTRICT_SLOT0ONLY,A_NO_TIMING_LOG), + "DMA Config Read", { + fUNIMP(); +}) + +Q6INSN(Y6_dmcfgwr,"dmcfgwr(Rs32,Rt32)",ATTRIBS(A_NOTE_PRIV,A_PRIV,A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_DMA,A_RESTRICT_SLOT0ONLY,A_NO_TIMING_LOG), + "DMA Config Write", { + fUNIMP(); +}) + +Q6INSN(Y6_dmpause,"Rd32=dmpause",ATTRIBS(A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_DMA,A_RESTRICT_SLOT0ONLY,A_NO_TIMING_LOG),"DMA Pause",{ + fUNIMP(); +}) + +Q6INSN(Y6_dmresume,"dmresume(Rs32)",ATTRIBS(A_NOTE_NOPACKET,A_RESTRICT_NOPACKET,A_DMA,A_RESTRICT_SLOT0ONLY,A_SYNC_MARKER,A_NO_TIMING_LOG),"DMA Resume",{ + fUNIMP(); +}) -Q6INSN(Y5_l2fetch,"l2fetch(Rs32,Rtt32)",ATTRIBS(A_RESTRICT_SLOT0ONLY),"L2 Cache Prefetch", +Q6INSN(Y5_l2fetch,"l2fetch(Rs32,Rtt32)",ATTRIBS(A_RESTRICT_SLOT0ONLY,A_RESTRICT_PACKET_AXOK,A_NOTE_AXOK),"L2 Cache Prefetch", { fL2FETCH(RsV, - fGETUHALF(0,RttV), /*height*/ - fGETUHALF(1,RttV), /*width*/ - fGETUHALF(2,RttV), /*stride*/ - fGETUHALF(3,RttV)); /*flags*/ + fGETUHALF(0,RttV), /*height*/ + fGETUHALF(1,RttV), /*width*/ + fGETUHALF(2,RttV), /*stride*/ + fGETUHALF(3,RttV)); /*flags*/ }) + +Q6INSN(Y5_l2locka,"Pd4=l2locka(Rs32)", ATTRIBS(A_PRIV,A_NOTE_PRIV,A_CACHEOP,A_COPBYADDRESS,A_RESTRICT_SLOT0ONLY,A_RESTRICT_PACKET_AXOK,A_NOTE_AXOK,A_RESTRICT_LATEPRED,A_NOTE_LATEPRED), +"Lock L2 cache line by address", { fEA_REG(RsV); fL2LOCKA(EA,PdV,PdN); fHIDE(MARK_LATE_PRED_WRITE(PdN)) }) + + +Q6INSN(Y5_l2unlocka,"l2unlocka(Rs32)", ATTRIBS(A_PRIV,A_NOTE_PRIV,A_CACHEOP,A_COPBYADDRESS,A_RESTRICT_SLOT0ONLY,A_RESTRICT_PACKET_AXOK,A_NOTE_AXOK), "UnLock L2 cache line 
by address", { fEA_REG(RsV); fL2UNLOCKA(EA); }) + + + +Q6INSN(Y5_l2gunlock,"l2gunlock",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_SLOT0ONLY,A_RESTRICT_NOPACKET,A_CACHEOP,A_L2FLUSHOP),"L2 Global Unlock",{fL2UNLOCK();}) + +Q6INSN(Y5_l2gclean,"l2gclean",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_SLOT0ONLY,A_RESTRICT_NOPACKET,A_CACHEOP,A_L2FLUSHOP),"L2 Global Clean",{fL2CLEAN();}) + +Q6INSN(Y5_l2gcleaninv,"l2gcleaninv",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_SLOT0ONLY,A_RESTRICT_NOPACKET,A_CACHEOP,A_L2FLUSHOP),"L2 Global Clean and Invalidate",{fL2CLEANINV();}) + +Q6INSN(Y6_l2gcleanpa,"l2gclean(Rtt32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_SLOT0ONLY,A_RESTRICT_NOPACKET,A_CACHEOP,A_L2FLUSHOP),"L2 Global Clean by PA Range",{fL2CLEANPA(RttV);}) + +Q6INSN(Y6_l2gcleaninvpa,"l2gcleaninv(Rtt32)",ATTRIBS(A_PRIV,A_NOTE_PRIV,A_NOTE_NOPACKET,A_RESTRICT_SLOT0ONLY,A_RESTRICT_NOPACKET,A_CACHEOP,A_L2FLUSHOP),"L2 Global Clean and Invalidate by PA Range",{fL2CLEANINVPA(RttV);}) diff --git a/target/hexagon/insn.h b/target/hexagon/insn.h index 24dcf7fe9f38..5d59430da9e1 100644 --- a/target/hexagon/insn.h +++ b/target/hexagon/insn.h @@ -66,8 +66,8 @@ struct Packet { bool pkt_has_dczeroa; - bool pkt_has_store_s0; - bool pkt_has_store_s1; + bool pkt_has_scalar_store_s0; + bool pkt_has_scalar_store_s1; bool pkt_has_hvx; Insn *vhist_insn; diff --git a/target/hexagon/internal.h b/target/hexagon/internal.h index 32e96f00d97a..ff89c9cda43f 100644 --- a/target/hexagon/internal.h +++ b/target/hexagon/internal.h @@ -22,13 +22,32 @@ int hexagon_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg); int hexagon_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg); +#ifndef CONFIG_USER_ONLY +int hexagon_sys_gdb_read_register(CPUState *cs, GByteArray *mem_buf, int n); +int hexagon_sys_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n); +#endif int hexagon_hvx_gdb_read_register(CPUState *env, GByteArray *mem_buf, int n); int hexagon_hvx_gdb_write_register(CPUState *env, uint8_t *mem_buf, int n); void hexagon_debug_vreg(CPUHexagonState *env, int regnum); void hexagon_debug_qreg(CPUHexagonState *env, int regnum); void hexagon_debug(CPUHexagonState *env); +void hexagon_dump(CPUHexagonState *env, FILE *f, int flags); extern const char * const hexagon_regnames[TOTAL_PER_THREAD_REGS]; +extern const char * const hexagon_sregnames[]; +extern const char * const hexagon_gregnames[]; + +void G_NORETURN do_raise_exception(CPUHexagonState *env, + uint32_t exception, + target_ulong PC, + uintptr_t retaddr); + +#define hexagon_cpu_mmu_enabled(env) \ + GET_SYSCFG_FIELD(SYSCFG_MMUEN, arch_get_system_reg(env, HEX_SREG_SYSCFG)) + +#ifndef CONFIG_USER_ONLY +extern const VMStateDescription vmstate_hexagon_cpu; +#endif #endif diff --git a/target/hexagon/machine.c b/target/hexagon/machine.c new file mode 100644 index 000000000000..79e9b7effa5e --- /dev/null +++ b/target/hexagon/machine.c @@ -0,0 +1,85 @@ +/* + * Copyright(c) 2023-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "migration/cpu.h" +#include "cpu.h" +#include "hex_mmu.h" + +static int get_u64_ptr(QEMUFile *f, void *pv, size_t size, + const VMStateField *field) +{ + uint64_t *p = pv; + *p = qemu_get_be64(f); + return 0; +} + +static int put_u64_ptr(QEMUFile *f, void *pv, size_t size, + const VMStateField *field, JSONWriter *vmdesc) +{ + qemu_put_be64(f, *((uint64_t *)pv)); + return 0; +} + +const VMStateInfo vmstate_info_uint64_ptr = { + .name = "uint64_t_pointer", + .get = get_u64_ptr, + .put = put_u64_ptr, +}; + +static int get_hex_tlb_ptr(QEMUFile *f, void *pv, size_t size, + const VMStateField *field) +{ + CPUHexagonTLBContext *tlb = pv; + for (int i = 0; i < ARRAY_SIZE(tlb->entries); i++) { + tlb->entries[i] = qemu_get_be64(f); + } + return 0; +} + +static int put_hex_tlb_ptr(QEMUFile *f, void *pv, size_t size, + const VMStateField *field, JSONWriter *vmdesc) +{ + CPUHexagonTLBContext *tlb = pv; + for (int i = 0; i < ARRAY_SIZE(tlb->entries); i++) { + qemu_put_be64(f, tlb->entries[i]); + } + return 0; +} + +const VMStateInfo vmstate_info_hex_tlb_ptr = { + .name = "hex_tlb_pointer", + .get = get_hex_tlb_ptr, + .put = put_hex_tlb_ptr, +}; + +const VMStateDescription vmstate_hexagon_cpu = { + .name = "cpu", + .version_id = 0, + .minimum_version_id = 0, + .fields = (VMStateField[]) { + VMSTATE_CPU(), + VMSTATE_UINTTL_ARRAY(env.gpr, HexagonCPU, TOTAL_PER_THREAD_REGS), + VMSTATE_UINTTL_ARRAY(env.pred, HexagonCPU, NUM_PREGS), + VMSTATE_UINTTL_ARRAY(env.t_sreg, HexagonCPU, NUM_SREGS), + VMSTATE_UINTTL_ARRAY(env.greg, HexagonCPU, NUM_GREGS), + VMSTATE_UINTTL(env.next_PC, HexagonCPU), + VMSTATE_UINTTL(env.tlb_lock_state, HexagonCPU), + VMSTATE_UINTTL(env.k0_lock_state, HexagonCPU), + VMSTATE_UINTTL(env.tlb_lock_count, HexagonCPU), + VMSTATE_UINTTL(env.k0_lock_count, HexagonCPU), + VMSTATE_UINTTL(env.threadId, HexagonCPU), + VMSTATE_UINTTL(env.cause_code, HexagonCPU), + VMSTATE_UINTTL(env.wait_next_pc, HexagonCPU), + VMSTATE_POINTER(env.hex_tlb, HexagonCPU, 0, + vmstate_info_hex_tlb_ptr, CPUHexagonTLBContext *), + VMSTATE_UINT64(env.t_cycle_count, HexagonCPU), + VMSTATE_POINTER(env.g_pcycle_base, HexagonCPU, 0, + vmstate_info_uint64_ptr, uint64_t *), + VMSTATE_END_OF_LIST() + }, +}; + diff --git a/target/hexagon/macros.h b/target/hexagon/macros.h index ee3d4c88e7bd..01469a28a0cc 100644 --- a/target/hexagon/macros.h +++ b/target/hexagon/macros.h @@ -82,7 +82,7 @@ */ #define CHECK_NOSHUF(VA, SIZE) \ do { \ - if (insn->slot == 0 && ctx->pkt->pkt_has_store_s1) { \ + if (insn->slot == 0 && ctx->pkt->pkt_has_scalar_store_s1) { \ probe_noshuf_load(VA, SIZE, ctx->mem_idx); \ process_store(ctx, 1); \ } \ @@ -93,11 +93,11 @@ TCGLabel *noshuf_label = gen_new_label(); \ tcg_gen_brcondi_tl(TCG_COND_EQ, PRED, 0, noshuf_label); \ GET_EA; \ - if (insn->slot == 0 && ctx->pkt->pkt_has_store_s1) { \ + if (insn->slot == 0 && ctx->pkt->pkt_has_scalar_store_s1) { \ probe_noshuf_load(EA, SIZE, ctx->mem_idx); \ } \ gen_set_label(noshuf_label); \ - if (insn->slot == 0 && ctx->pkt->pkt_has_store_s1) { \ + if (insn->slot == 0 && ctx->pkt->pkt_has_scalar_store_s1) { \ process_store(ctx, 1); \ } \ } while (0) @@ -524,7 +524,7 @@ static inline TCGv gen_read_ireg(TCGv result, TCGv val, int shift) #define fLOAD(NUM, SIZE, SIGN, EA, DST) \ do { \ - check_noshuf(env, pkt_has_store_s1, slot, EA, SIZE, GETPC()); \ + check_noshuf(env, pkt_has_scalar_store_s1, slot, EA, SIZE, GETPC()); \ DST = (size##SIZE##SIGN##_t)MEM_LOAD##SIZE(env, EA, GETPC()); \ } 
while (0) #endif @@ -537,9 +537,6 @@ static inline TCGv gen_read_ireg(TCGv result, TCGv val, int shift) #ifdef CONFIG_USER_ONLY #define fFRAMECHECK(ADDR, EA) do { } while (0) /* Not modelled in linux-user */ -#else -/* System mode not implemented yet */ -#define fFRAMECHECK(ADDR, EA) g_assert_not_reached(); #endif #ifdef QEMU_GENERATE @@ -630,8 +627,18 @@ static inline TCGv gen_read_ireg(TCGv result, TCGv val, int shift) #define fCONSTLL(A) A##LL #define fECHO(A) (A) -#define fTRAP(TRAPTYPE, IMM) helper_raise_exception(env, HEX_EXCP_TRAP0) +#ifdef CONFIG_USER_ONLY +#define fTRAP(TRAPTYPE, IMM) \ + do { \ + hexagon_raise_exception_err(env, HEX_EVENT_TRAP0, PC); \ + } while (0) +#endif + +#define fDO_TRACE(SREG) +#define fBREAK() +#define fUNPAUSE() #define fPAUSE(IMM) +#define fDCFETCH(REG) #define fALIGN_REG_FIELD_VALUE(FIELD, VAL) \ ((VAL) << reg_field_info[FIELD].offset) @@ -642,16 +649,43 @@ static inline TCGv gen_read_ireg(TCGv result, TCGv val, int shift) reg_field_info[FIELD].width, \ reg_field_info[FIELD].offset) +#define fGET_FIELD(VAL, FIELD) \ + fEXTRACTU_BITS(VAL, \ + reg_field_info[FIELD].width, \ + reg_field_info[FIELD].offset) +#define fSET_FIELD(VAL, FIELD, NEWVAL) \ + fINSERT_BITS(VAL, \ + reg_field_info[FIELD].width, \ + reg_field_info[FIELD].offset, \ + (NEWVAL)) + #ifdef QEMU_GENERATE #define fDCZEROA(REG) \ do { \ ctx->dczero_addr = tcg_temp_new(); \ tcg_gen_mov_tl(ctx->dczero_addr, (REG)); \ } while (0) +#else +#define fDCZEROA(REG) ((void) REG) #endif #define fBRANCH_SPECULATE_STALL(DOTNEWVAL, JUMP_COND, SPEC_DIR, HINTBITNUM, \ STRBITNUM) /* Nothing */ +#ifdef CONFIG_USER_ONLY +/* + * This macro can only be true in guest mode. + * In user mode, the 4 VIRTINSN's can't be reached + */ +#define fTRAP1_VIRTINSN(IMM) (false) +#define fVIRTINSN_SPSWAP(IMM, REG) g_assert_not_reached() +#define fVIRTINSN_GETIE(IMM, REG) g_assert_not_reached() +#define fVIRTINSN_SETIE(IMM, REG) g_assert_not_reached() +#define fVIRTINSN_RTE(IMM, REG) g_assert_not_reached() +#endif #endif + +#define fPREDUSE_TIMING() + +#define fUNIMP() qemu_log_mask(LOG_UNIMP, "Unimplemented instruction\n") diff --git a/target/hexagon/max.h b/target/hexagon/max.h new file mode 100644 index 000000000000..0f595bcb736d --- /dev/null +++ b/target/hexagon/max.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HEXAGON_MAX_H +#define HEXAGON_MAX_H + +#define MAX_EXT_CONTEXTS 8 +#define MAX_L2_INTERLEAVES 2 +#define MAX_VFIFO_COUNT 4 + +#define SLOTS_MAX 4 + +#define REG_WRITES_MAX 32 +#define PRED_WRITES_MAX 5 +#define STORES_MAX 2 +#define LOADS_MAX 2 +#define MAX_PRED 4 + +#define PACKET_BYTES_MAX 16 +#define MAX_TLB_ENTRIES 1024 +#define DTLB_ENTRIES 16 +#define ITLB_ENTRIES 16 + +#endif /* HEXAGON_MAX_H */ diff --git a/target/hexagon/meson.build b/target/hexagon/meson.build index bb4ebaae816e..280b5dc58ac5 100644 --- a/target/hexagon/meson.build +++ b/target/hexagon/meson.build @@ -20,6 +20,7 @@ hexagon_ss = ss.source_set() hex_common_py = 'hex_common.py' gen_tcg_h = meson.current_source_dir() / 'gen_tcg.h' gen_tcg_hvx_h = meson.current_source_dir() / 'gen_tcg_hvx.h' +gen_tcg_sys_h = meson.current_source_dir() / 'gen_tcg_sys.h' idef_parser_dir = meson.current_source_dir() / 'idef-parser' # @@ -244,11 +245,13 @@ decodetree_trans_funcs_generated = custom_target( command: [python, files('gen_trans_funcs.py'), semantics_generated, '@OUTPUT@'], ) hexagon_ss.add(decodetree_trans_funcs_generated) +hexagon_softmmu_ss = ss.source_set() hexagon_ss.add(files( 'cpu.c', 'translate.c', 'op_helper.c', + 'cpu_helper.c', 'gdbstub.c', 'genptr.c', 'reg_fields.c', @@ -260,6 +263,16 @@ hexagon_ss.add(files( 'fma_emu.c', 'mmvec/decode_ext_mmvec.c', 'mmvec/system_ext_mmvec.c', + 'mmvec/mmvec_qfloat.c', + 'mmvec/kvx_ieee.c', +)) + +hexagon_softmmu_ss.add(files( + 'hex_mmu.c', + 'hex_interrupts.c', + 'hexswi.c', + 'machine.c', + 'monitor.c', )) # @@ -271,7 +284,8 @@ hexagon_ss.add(files( # idef-generated-enabled-instructions # idef_parser_enabled = get_option('hexagon_idef_parser') -if idef_parser_enabled and 'hexagon-linux-user' in target_dirs +if idef_parser_enabled and ('hexagon-linux-user' in target_dirs or + 'hexagon-softmmu' in target_dirs) idef_parser_input_generated = custom_target( 'idef_parser_input.h.inc', output: 'idef_parser_input.h.inc', @@ -346,12 +360,12 @@ if idef_parser_enabled and 'hexagon-linux-user' in target_dirs # Setup input and dependencies for the next step, this depends on whether or # not idef-parser is enabled helper_dep = [semantics_generated, idef_generated_tcg_c, idef_generated_tcg] - helper_in = [semantics_generated, gen_tcg_h, gen_tcg_hvx_h, '--idef-parser', idef_generated_list] + helper_in = [semantics_generated, gen_tcg_h, gen_tcg_hvx_h, gen_tcg_sys_h, '--idef-parser', idef_generated_list] else # Setup input and dependencies for the next step, this depends on whether or # not idef-parser is enabled helper_dep = [semantics_generated] - helper_in = [semantics_generated, gen_tcg_h, gen_tcg_hvx_h] + helper_in = [semantics_generated, gen_tcg_h, gen_tcg_hvx_h, gen_tcg_sys_h] endif # @@ -365,7 +379,7 @@ helper_protos_generated = custom_target( 'helper_protos_generated.h.inc', output: 'helper_protos_generated.h.inc', depends: helper_dep, - depend_files: [hex_common_py, gen_tcg_h, gen_tcg_hvx_h], + depend_files: [hex_common_py, gen_tcg_h, gen_tcg_hvx_h, gen_tcg_sys_h], command: [python, files('gen_helper_protos.py'), helper_in, '@OUTPUT@'], ) hexagon_ss.add(helper_protos_generated) @@ -374,7 +388,7 @@ helper_funcs_generated = custom_target( 'helper_funcs_generated.c.inc', output: 'helper_funcs_generated.c.inc', depends: helper_dep, - depend_files: [hex_common_py, gen_tcg_h, gen_tcg_hvx_h], + depend_files: [hex_common_py, gen_tcg_h, gen_tcg_hvx_h, gen_tcg_sys_h], command: [python, files('gen_helper_funcs.py'), helper_in, 
'@OUTPUT@'],
 )
 hexagon_ss.add(helper_funcs_generated)
 
@@ -383,7 +397,7 @@ tcg_funcs_generated = custom_target(
     'tcg_funcs_generated.c.inc',
     output: 'tcg_funcs_generated.c.inc',
     depends: helper_dep,
-    depend_files: [hex_common_py, gen_tcg_h, gen_tcg_hvx_h],
+    depend_files: [hex_common_py, gen_tcg_h, gen_tcg_hvx_h, gen_tcg_sys_h],
     command: [python, files('gen_tcg_funcs.py'), helper_in, '@OUTPUT@'],
 )
 hexagon_ss.add(tcg_funcs_generated)
 
@@ -392,9 +406,10 @@ analyze_funcs_generated = custom_target(
     'analyze_funcs_generated.c.inc',
     output: 'analyze_funcs_generated.c.inc',
     depends: helper_dep,
-    depend_files: [hex_common_py, gen_tcg_h, gen_tcg_hvx_h],
+    depend_files: [hex_common_py, gen_tcg_h, gen_tcg_hvx_h, gen_tcg_sys_h],
     command: [python, files('gen_analyze_funcs.py'), helper_in, '@OUTPUT@'],
 )
 hexagon_ss.add(analyze_funcs_generated)
 
 target_arch += {'hexagon': hexagon_ss}
+target_system_arch += {'hexagon': hexagon_softmmu_ss}
diff --git a/target/hexagon/mmvec/kvx_ieee.c b/target/hexagon/mmvec/kvx_ieee.c
new file mode 100644
index 000000000000..3e67230f62e4
--- /dev/null
+++ b/target/hexagon/mmvec/kvx_ieee.c
@@ -0,0 +1,1460 @@
+/*
+ * Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "kvx_ieee.h"
+#include "kvx_mac_reduce.c"
+#include "qemu/host-utils.h"
+
+uint32_t shiftRightJam32( uint32_t a, uint_fast16_t dist )
+{
+    return
+        (dist < 31) ? a>>dist | ((uint32_t) (a<<(-dist & 31)) != 0) : (a != 0);
+}
+
+uint_fast8_t countLeadingZeros16( uint16_t a )
+{
+    return clz16(a);
+}
+
+struct exp8_sig16 normSubnormalF16Sig( uint_fast16_t sig )
+{
+    int_fast8_t shiftDist;
+    struct exp8_sig16 z;
+
+    shiftDist = countLeadingZeros16( sig ) - 5;
+    z.exp = 1 - shiftDist;
+    z.sig = sig<<shiftDist;
+    return z;
+}
+
+/*
+ * NOTE: the span from the z.sig shift above through the first half of this
+ * function was lost when the patch text was reflowed; the lines here are an
+ * assumed reconstruction following the Berkeley SoftFloat round-to-nearest-
+ * even sequence that this file derives from, spliced to the surviving tail.
+ */
+uint16_t roundPackToF16( bool sign, int_fast16_t exp, uint_fast16_t sig )
+{
+    bool roundNearEven = true;
+    uint_fast8_t roundIncrement = 0x8;
+    uint_fast8_t roundBits = sig & 0xF;
+
+    if ( 0x1D <= (unsigned int) exp ) {
+        if ( exp < 0 ) {
+            sig = shiftRightJam32( sig, -exp );
+            exp = 0;
+            roundBits = sig & 0xF;
+        } else if ( (0x1D < exp) || (0x8000 <= sig + roundIncrement) ) {
+            return packToF16UI( sign, 0x1F, 0 );
+        }
+    }
+
+    sig = (sig + roundIncrement)>>4;
+    sig &= ~(uint_fast16_t) (! (roundBits ^ 8) & roundNearEven);
+    if ( ! sig ) exp = 0;
+
+    return packToF16UI( sign, exp, sig );
+
+}
+
+
+uint32_t fp_mult_sf_sf (uint32_t op1, uint32_t op2)
+{
+
+    union ui32_f32 u_op1;
+    union ui32_f32 u_op2;
+    union ui32_f32 u_rslt;
+
+    float a,b,rslt;
+    uint32_t result;
+
+    #ifdef DEBUG
+    printf("fp_mult_sf_sf");
+    printf("Debug : op1 =0x%08x\n",op1);
+    printf("Debug : op2 =0x%08x\n",op2);
+    #endif
+
+    if(isNaNF32UI(op1) || isNaNF32UI(op2))
+        return FP32_DEF_NAN;
+
+    u_op1.ui = op1;
+    u_op2.ui = op2;
+    a = u_op1.f;
+    b = u_op2.f;
+    rslt = a*b;
+    u_rslt.f = rslt;
+    result = u_rslt.ui;
+
+    result = isNaNF32UI(result) ?
FP32_DEF_NAN : result;
+
+    #ifdef DEBUG
+    printf("Debug : a = %f\n",a);
+    printf("Debug : b = %f\n",b);
+    printf("Debug : rslt = %f\n",rslt);
+    printf("Debug : result =0x%08x\n",result);
+    #endif
+
+    return result;
+}
+
+uint32_t fp_add_sf_sf (uint32_t op1, uint32_t op2)
+{
+    union ui32_f32 u_op1;
+    union ui32_f32 u_op2;
+    union ui32_f32 u_rslt;
+
+    float a,b,rslt;
+    uint32_t result;
+
+    #ifdef DEBUG
+    printf("fp_add_sf_sf");
+    printf("Debug : op1 =0x%08x\n",op1);
+    printf("Debug : op2 =0x%08x\n",op2);
+    #endif
+
+    if(isNaNF32UI(op1) || isNaNF32UI(op2))
+        return FP32_DEF_NAN;
+
+    u_op1.ui = op1;
+    u_op2.ui = op2;
+    a = u_op1.f;
+    b = u_op2.f;
+    rslt = a+b;
+    u_rslt.f = rslt;
+    result = u_rslt.ui;
+    result = isNaNF32UI(result) ? FP32_DEF_NAN : result;
+
+    #ifdef DEBUG
+    printf("Debug : a = %f\n",a);
+    printf("Debug : b = %f\n",b);
+    printf("Debug : rslt = %f\n",rslt);
+    printf("Debug : result =0x%08x\n",result);
+    #endif
+
+    return result;
+}
+
+uint32_t fp_sub_sf_sf (uint32_t op1, uint32_t op2)
+{
+    union ui32_f32 u_op1;
+    union ui32_f32 u_op2;
+    union ui32_f32 u_rslt;
+
+    float a,b,rslt;
+    uint32_t result;
+
+    #ifdef DEBUG
+    printf("Debug : op1 =0x%08x\n",op1);
+    printf("Debug : op2 =0x%08x\n",op2);
+    #endif
+
+    if(isNaNF32UI(op1) || isNaNF32UI(op2))
+        return FP32_DEF_NAN;
+
+    u_op1.ui = op1;
+    u_op2.ui = op2;
+    a = u_op1.f;
+    b = u_op2.f;
+    rslt = a-b;
+    u_rslt.f = rslt;
+    result = u_rslt.ui;
+    result = isNaNF32UI(result) ? FP32_DEF_NAN : result;
+
+    #ifdef DEBUG
+    printf("Debug : a = %f\n",a);
+    printf("Debug : b = %f\n",b);
+    printf("Debug : rslt = %f\n",rslt);
+    printf("Debug : result =0x%08x\n",result);
+    #endif
+
+    return result;
+}
+
+//--------------------------------------------------------------
+//Function to convert FP32 to FP16
+//--------------------------------------------------------------
+
+uint16_t f32_to_f16 ( uint32_t a)
+{
+    bool sign;
+    int_fast16_t exp;
+    uint_fast32_t frac;
+    uint_fast16_t frac16;
+
+    sign = signF32UI( a );
+    exp  = expF32UI ( a );
+    frac = fracF32UI( a );
+
+    // Inf and NaN case
+    if ( exp == 0xFF ) {
+        if ( frac ) {
+            return FP16_DEF_NAN;
+        } else {
+            return packToF16UI( sign, 0x1F, 0 );
+        }
+    }
+
+    /*------------------------------------------------------------------------
+    frac>>9 : keeping 14 bits of precision out of 23 bits in FP32
+    (frac & 0x1FF) != 0) : setting the sticky bit required for rounding
+    *------------------------------------------------------------------------*/
+    frac16 = frac>>9 | ((frac & 0x1FF) != 0);
+
+    //If input was a Zero
+    if ( ! (exp | frac16) ) {
+        return packToF16UI( sign, 0, 0 );
+    }
+
+    return roundPackToF16( sign, exp - 0x71, frac16 | 0x4000 );
+
+}
+
+//--------------------------------------------------------------
+//Function to convert FP16 to FP32
+//--------------------------------------------------------------
+
+uint32_t f16_to_f32( uint16_t a )
+{
+    bool sign;
+    int_fast8_t exp;
+    uint_fast16_t frac;
+    struct exp8_sig16 normExpSig;
+
+    sign = signF16UI( a );
+    exp  = expF16UI ( a );
+    frac = fracF16UI( a );
+
+
+    if ( exp == 0x1F ) {
+        if ( frac ) {
+            return FP32_DEF_NAN;
+        } else {
+            return packToF32UI( sign, 0xFF, 0 );
+        }
+    }
+
+
+    if ( !
frac ) { + return packToF32UI( sign, 0, 0 ); + } + normExpSig = normSubnormalF16Sig( frac ); + exp = normExpSig.exp - 1; + frac = normExpSig.sig; + } + + + return packToF32UI( sign, exp + 0x70, (uint_fast32_t) frac<<13 ); + +} + +uint16_t fp_mult_hf_hf (uint16_t op1, uint16_t op2) +{ + + union ui32_f32 u_op1; + union ui32_f32 u_op2; + union ui32_f32 u_rslt; + + uint32_t op1_f32; + uint32_t op2_f32; + + float a,b,rslt; + uint32_t result_f32; + uint16_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%08x\n",op1); + printf("Debug : op2 =0x%08x\n",op2); + #endif + + if(isNaNF16UI(op1) || isNaNF16UI(op2)) + return FP16_DEF_NAN; + + op1_f32 = f16_to_f32(op1); + op2_f32 = f16_to_f32(op2); + + u_op1.ui = op1_f32; + u_op2.ui = op2_f32; + a = u_op1.f; + b = u_op2.f; + rslt = a*b; + u_rslt.f = rslt; + result_f32 = u_rslt.ui; + + result = f32_to_f16(result_f32); + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : b = %f\n",b); + printf("Debug : rslt = %f\n",rslt); + printf("Debug : result =0x%08x\n",result); + #endif + + return result; +} + +uint16_t fp_add_hf_hf (uint16_t op1, uint16_t op2) +{ + + union ui32_f32 u_op1; + union ui32_f32 u_op2; + union ui32_f32 u_rslt; + + uint32_t op1_f32; + uint32_t op2_f32; + + float a,b,rslt; + uint32_t result_f32; + uint16_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%08x\n",op1); + printf("Debug : op2 =0x%08x\n",op2); + #endif + + if(isNaNF16UI(op1) || isNaNF16UI(op2)) + return FP16_DEF_NAN; + + op1_f32 = f16_to_f32(op1); + op2_f32 = f16_to_f32(op2); + + u_op1.ui = op1_f32; + u_op2.ui = op2_f32; + a = u_op1.f; + b = u_op2.f; + rslt = a+b; + u_rslt.f = rslt; + result_f32 = u_rslt.ui; + + result = f32_to_f16(result_f32); + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : b = %f\n",b); + printf("Debug : rslt = %f\n",rslt); + printf("Debug : result =0x%08x\n",result); + #endif + + return result; +} + +uint16_t fp_sub_hf_hf (uint16_t op1, uint16_t op2) +{ + + union ui32_f32 u_op1; + union ui32_f32 u_op2; + union ui32_f32 u_rslt; + + uint32_t op1_f32; + uint32_t op2_f32; + + float a,b,rslt; + uint32_t result_f32; + uint16_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%08x\n",op1); + printf("Debug : op2 =0x%08x\n",op2); + #endif + + if(isNaNF16UI(op1) || isNaNF16UI(op2)) + return FP16_DEF_NAN; + + op1_f32 = f16_to_f32(op1); + op2_f32 = f16_to_f32(op2); + + u_op1.ui = op1_f32; + u_op2.ui = op2_f32; + a = u_op1.f; + b = u_op2.f; + rslt = a-b; + u_rslt.f = rslt; + result_f32 = u_rslt.ui; + + result = f32_to_f16(result_f32); + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : b = %f\n",b); + printf("Debug : rslt = %f\n",rslt); + printf("Debug : result =0x%08x\n",result); + #endif + + return result; +} + +uint32_t fp_mult_sf_hf (uint16_t op1, uint16_t op2) +{ + + union ui32_f32 u_op1; + union ui32_f32 u_op2; + union ui32_f32 u_rslt; + + uint32_t op1_f32; + uint32_t op2_f32; + + float a,b,rslt; + uint32_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%08x\n",op1); + printf("Debug : op2 =0x%08x\n",op2); + #endif + + if(isNaNF16UI(op1) || isNaNF16UI(op2)) + return FP32_DEF_NAN; + + op1_f32 = f16_to_f32(op1); + op2_f32 = f16_to_f32(op2); + + u_op1.ui = op1_f32; + u_op2.ui = op2_f32; + a = u_op1.f; + b = u_op2.f; + rslt = a*b; + u_rslt.f = rslt; + result = u_rslt.ui; + result = isNaNF32UI(result) ? 
FP32_DEF_NAN : result; + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : b = %f\n",b); + printf("Debug : rslt = %f\n",rslt); + printf("Debug : result =0x%08x\n",result); + #endif + + return result; +} + +uint32_t fp_add_sf_hf (uint16_t op1, uint16_t op2) +{ + + union ui32_f32 u_op1; + union ui32_f32 u_op2; + union ui32_f32 u_rslt; + + uint32_t op1_f32; + uint32_t op2_f32; + + float a,b,rslt; + uint32_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%08x\n",op1); + printf("Debug : op2 =0x%08x\n",op2); + #endif + + if(isNaNF16UI(op1) || isNaNF16UI(op2)) + return FP32_DEF_NAN; + + op1_f32 = f16_to_f32(op1); + op2_f32 = f16_to_f32(op2); + + u_op1.ui = op1_f32; + u_op2.ui = op2_f32; + a = u_op1.f; + b = u_op2.f; + rslt = a+b; + u_rslt.f = rslt; + result = u_rslt.ui; + result = isNaNF32UI(result) ? FP32_DEF_NAN : result; + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : b = %f\n",b); + printf("Debug : rslt = %f\n",rslt); + printf("Debug : result =0x%08x\n",result); + #endif + + return result; +} + +uint32_t fp_sub_sf_hf (uint16_t op1, uint16_t op2) +{ + + union ui32_f32 u_op1; + union ui32_f32 u_op2; + union ui32_f32 u_rslt; + + uint32_t op1_f32; + uint32_t op2_f32; + + float a,b,rslt; + uint32_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%08x\n",op1); + printf("Debug : op2 =0x%08x\n",op2); + #endif + + if(isNaNF16UI(op1) || isNaNF16UI(op2)) + return FP32_DEF_NAN; + + op1_f32 = f16_to_f32(op1); + op2_f32 = f16_to_f32(op2); + + u_op1.ui = op1_f32; + u_op2.ui = op2_f32; + a = u_op1.f; + b = u_op2.f; + rslt = a-b; + u_rslt.f = rslt; + result = u_rslt.ui; + result = isNaNF32UI(result) ? FP32_DEF_NAN : result; + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : b = %f\n",b); + printf("Debug : rslt = %f\n",rslt); + printf("Debug : result =0x%08x\n",result); + #endif + + return result; +} + +uint32_t fp_mult_sf_bf_acc (uint16_t op1, uint16_t op2, uint32_t acc) +{ + union ui32_f32 u_op1; + union ui32_f32 u_op2; + union ui32_f32 u_acc; + union ui32_f32 u_rslt; + + uint32_t op1_f32; + uint32_t op2_f32; + + double a,b,facc,rslt; + uint32_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%04x\n",op1); + printf("Debug : op2 =0x%04x\n",op2); + printf("Debug : acc =0x%08x\n",acc); + #endif + + op1_f32 = ((uint32_t)op1) << 16; + op2_f32 = ((uint32_t)op2) << 16; + + if(isNaNF32UI(op1_f32) || isNaNF32UI(op2_f32) || isNaNF32UI(acc)) + return FP32_DEF_NAN; + + u_op1.ui = op1_f32; + u_op2.ui = op2_f32; + u_acc.ui = acc; + a = u_op1.f; + b = u_op2.f; + facc = u_acc.f; + //rslt = fma(a,b,facc); + rslt = (a * b) + facc; + u_rslt.f = rslt; + result = u_rslt.ui; + result = isNaNF32UI(result) ? 
FP32_DEF_NAN : result; + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : b = %f\n",b); + printf("Debug : facc = %f\n",facc); + printf("Debug : rslt = %f\n",rslt); + printf("Debug : result =0x%04x\n",result); + #endif + + return result; +} + +uint32_t fp_mult_sf_bf (uint16_t op1, uint16_t op2) +{ + uint32_t op1_f32; + uint32_t op2_f32; + op1_f32 = ((uint32_t)op1) << 16; + op2_f32 = ((uint32_t)op2) << 16; + return fp_mult_sf_sf(op1_f32, op2_f32); +} + +uint32_t fp_add_sf_bf (uint16_t op1, uint16_t op2) +{ + uint32_t op1_f32; + uint32_t op2_f32; + op1_f32 = ((uint32_t)op1) << 16; + op2_f32 = ((uint32_t)op2) << 16; + return fp_add_sf_sf(op1_f32, op2_f32); +} + +uint32_t fp_sub_sf_bf (uint16_t op1, uint16_t op2) +{ + uint32_t op1_f32; + uint32_t op2_f32; + op1_f32 = ((uint32_t)op1) << 16; + op2_f32 = ((uint32_t)op2) << 16; + return fp_sub_sf_sf(op1_f32, op2_f32); +} + +uint16_t f16_to_uh( uint16_t op1) +{ + union ui32_f32 u_op1; + + float a,frac; + uint32_t op1_f32; + uint16_t result; + + //converting a NaN to an integral ----> Vx4Rslt is +MAX_INT + if(isNaNF16UI(op1)) + { + result = UHW_MAX; + goto end; + } + //converting a negative floating-point value to + //unsigned integer U(h|b) ----> (Vx4Rslt is 0) + if(signF16UI(op1)) + { + result = 0x0; + goto end; + } + //converting ±Inf to an integral ----> Vx4Rslt is ±MAX_INT + if(isInfF16UI(op1)) + { + result = UHW_MAX; + goto end; + } + //out of range FP to integer ------> Vx4Rslt is ±MAX_INT + + //The default float-to-integer conversion in C does not + //round to the nearest integer, but instead truncates toward zero. + op1_f32 = f16_to_f32(op1); + u_op1.ui = op1_f32; + a = u_op1.f; + frac = a - (float)((uint16_t) a); + //round to the nearest + result = (uint16_t) (a + 0.5); + //Ties to Even + if(frac == 0.5) + { + if((result % 2)) result--; + } + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : a frac = %f\n",frac); + #endif + + end: + #ifdef DEBUG + printf("Debug : result =0x%x\n",result); + #endif + return result; +} + +int16_t f16_to_h( uint16_t op1) +{ + union ui32_f32 u_op1; + + float a,frac; + uint32_t op1_f32; + int16_t result; + + //converting a NaN to an integral ----> Vx4Rslt is +MAX_INT + if(isNaNF16UI(op1)) + { + result = HW_MAX; + goto end; + } + //converting ±Inf to an integral ----> Vx4Rslt is ±MAX_INT + if(isInfF16UI(op1)) + { + result = signF16UI(op1) ? HW_MIN : HW_MAX; + goto end; + } + + //The default float-to-integer conversion in C does not round + //to the nearest integer, but instead truncates toward zero. + op1_f32 = f16_to_f32(op1); + u_op1.ui = op1_f32; + a = u_op1.f; + + //out of range FP to integer ------> Vx4Rslt is ±MAX_INT + if(a > (float)(HW_MAX)) + { + result = HW_MAX; + goto end; + } + if(a < (float)(HW_MIN)) + { + result = HW_MIN; + goto end; + } + + frac = fabs(a - (float)((int16_t) a)); + //round to the nearest + result = (a > 0) ? 
((int16_t) (a + 0.5)) : ((int16_t) (a - 0.5)); + //Ties to Even + if(frac == 0.5) + { + if((result % 2)) + { + if(a > 0) result--; + if(a < 0) result++; + } + } + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : a frac = %f\n",frac); + #endif + + end: + #ifdef DEBUG + printf("Debug : result =0x%04x\n",result); + #endif + return result; +} + +uint8_t f16_to_ub( uint16_t op1) +{ + union ui32_f32 u_op1; + + float a,frac; + uint32_t op1_f32; + uint8_t result; + + //converting a NaN to an integral ----> Vx4Rslt is +MAX_INT + if(isNaNF16UI(op1)) + { + result = UBYTE_MAX; + goto end; + } + //converting a negative floating-point value to + //unsigned integer U(h|b) ----> (Vx4Rslt is 0) + if(signF16UI(op1)) + { + result = 0x0; + goto end; + } + //converting ±Inf to an integral ----> Vx4Rslt is ±MAX_INT + if(isInfF16UI(op1)) + { + result = UBYTE_MAX; + goto end; + } + + //The default float-to-integer conversion in C does + //not round to the nearest integer, but instead truncates toward zero. + op1_f32 = f16_to_f32(op1); + u_op1.ui = op1_f32; + a = u_op1.f; + + //out of range FP to integer ------> Vx4Rslt is ±MAX_INT + if( a > (float)(UBYTE_MAX)) + { + result = UBYTE_MAX; + goto end; + } + + frac = a - (float)((uint16_t) a); + //round to the nearest + result = (uint8_t) (a + 0.5); + //Ties to Even + if(frac == 0.5) + { + if((result % 2)) + { + if(a > 0) result--; + if(a < 0) result++; + } + } + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : a frac = %f\n",frac); + #endif + + end: + #ifdef DEBUG + printf("Debug : result =0x%x\n",result); + #endif + return result; +} + +int8_t f16_to_b( uint16_t op1) +{ + union ui32_f32 u_op1; + + float a,frac; + uint32_t op1_f32; + int16_t result; + + //converting a NaN to an integral ----> Vx4Rslt is +MAX_INT + if(isNaNF16UI(op1)) + { + result = BYTE_MAX; + goto end; + } + //converting ±Inf to an integral ----> Vx4Rslt is ±MAX_INT + if(isInfF16UI(op1)) + { + result = signF16UI(op1) ? BYTE_MIN : BYTE_MAX; + goto end; + } + + //The default float-to-integer conversion in C does not + //round to the nearest integer, but instead truncates toward zero. + op1_f32 = f16_to_f32(op1); + u_op1.ui = op1_f32; + a = u_op1.f; + + //out of range FP to integer ------> Vx4Rslt is ±MAX_INT + if(a > (float)(BYTE_MAX)) + { + result = BYTE_MAX; + goto end; + } + if(a < (float)(BYTE_MIN)) + { + result = BYTE_MIN; + goto end; + } + + frac = fabs(a - (float)((int16_t) a)); + //round to the nearest + result = (a > 0) ? 
((int16_t) (a + 0.5)) : ((int16_t) (a - 0.5)); + //Ties to Even + if(frac == 0.5) + { + if((result % 2)) + { + if(a > 0) result--; + if(a < 0) result++; + } + } + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : a frac = %f\n",frac); + #endif + + end: + #ifdef DEBUG + printf("Debug : result =0x%04x\n",result); + #endif + return result; +} + +uint16_t uh_to_f16(uint16_t op1) +{ + union ui32_f32 u_op1; + + float a; + uint32_t rslt; + uint16_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%08x\n",op1); + #endif + + a = (float) op1; + u_op1.f = a; + rslt = u_op1.ui; + result = f32_to_f16(rslt); + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : rslt = 0x%08x\n",rslt); + printf("Debug : result =0x%04x\n",result); + #endif + + return result; +} + +uint16_t h_to_f16 (int16_t op1) +{ + union ui32_f32 u_op1; + + float a; + uint32_t rslt; + uint16_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%08x\n",op1); + #endif + + a = (float) op1; + u_op1.f = a; + rslt = u_op1.ui; + result = f32_to_f16(rslt); + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : rslt = 0x%08x\n",rslt); + printf("Debug : result =0x%04x\n",result); + #endif + + return result; +} + +uint16_t ub_to_f16(uint8_t op1) +{ + union ui32_f32 u_op1; + + float a; + uint32_t rslt; + uint16_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%08x\n",op1); + #endif + + a = (float) op1; + u_op1.f = a; + rslt = u_op1.ui; + result = f32_to_f16(rslt); + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : rslt = 0x%08x\n",rslt); + printf("Debug : result =0x%04x\n",result); + #endif + + return result; +} + +uint16_t b_to_f16 (int8_t op1) +{ + union ui32_f32 u_op1; + + float a; + uint32_t rslt; + uint16_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%08x\n",op1); + #endif + + a = (float) op1; + u_op1.f = a; + rslt = u_op1.ui; + result = f32_to_f16(rslt); + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : rslt = 0x%08x\n",rslt); + printf("Debug : result =0x%04x\n",result); + #endif + + return result; +} + +uint16_t sf_to_bf (int32_t op1) +{ + uint32_t rslt = op1; + if((rslt & 0x1FFFF) == 0x08000){ + //break; // do not round up if exactly .5 and even already + } + else if ((rslt & 0x8000) == 0x8000){ + rslt += 0x8000; //rounding to nearest number + } + rslt = isNaNF32UI(op1) ? FP32_DEF_NAN : rslt; + uint16_t result = (rslt >> 16); + return result; +} + +uint32_t fp_vdmpy (uint16_t op1_u,uint16_t op1_l,uint16_t op2_u,uint16_t op2_l) +{ + union ui32_f32 u_op; + union ui32_f32 u_rslt; + + uint32_t op1_u_f32, op1_l_f32, op2_u_f32, op2_l_f32; + float f_op1_u, f_op1_l, f_op2_u, f_op2_l; + double f_prod_l, f_prod_u, rslt; + uint32_t result; + + #ifdef DEBUG + printf("Debug : op1_u =0x%04x\n",op1_u); + printf("Debug : op1_l =0x%04x\n",op1_l); + printf("Debug : op2_u =0x%04x\n",op2_u); + printf("Debug : op2_l =0x%04x\n",op2_l); + #endif + + if(isNaNF16UI(op1_u) || isNaNF16UI(op1_l) || isNaNF16UI(op2_u) || + isNaNF16UI(op2_l)) + { result = FP32_DEF_NAN; + goto end; + } + + op1_u_f32 = f16_to_f32(op1_u); + op1_l_f32 = f16_to_f32(op1_l); + op2_u_f32 = f16_to_f32(op2_u); + op2_l_f32 = f16_to_f32(op2_l); + + u_op.ui = op1_u_f32; + f_op1_u = u_op.f; + + u_op.ui = op1_l_f32; + f_op1_l = u_op.f; + + u_op.ui = op2_l_f32; + f_op2_l = u_op.f; + + u_op.ui = op2_u_f32; + f_op2_u = u_op.f; + + f_prod_l = f_op1_l * f_op2_l; + f_prod_u = f_op1_u * f_op2_u; + rslt = f_prod_u + f_prod_l; + + u_rslt.f = rslt; + result = u_rslt.ui; + result = isNaNF32UI(result) ? 
FP32_DEF_NAN : result;
+
+    #ifdef DEBUG
+    printf("Debug : f_op1_u = %f\n",f_op1_u);
+    printf("Debug : f_op1_l = %f\n",f_op1_l);
+    printf("Debug : f_op2_u = %f\n",f_op2_u);
+    printf("Debug : f_op2_l = %f\n",f_op2_l);
+    printf("Debug : f_prod_l = %f\n",f_prod_l);
+    printf("Debug : f_prod_u = %f\n",f_prod_u);
+    printf("Debug : rslt = %f\n",rslt);
+    #endif
+
+end:
+    #ifdef DEBUG
+    printf("Debug : result =0x%08x\n",result);
+    #endif
+    return result;
+}
+
+uint32_t fp_vdmpy_acc_dumb (uint32_t acc,uint16_t op1_u,uint16_t op1_l,
+                            uint16_t op2_u,uint16_t op2_l)
+{
+    union ui32_f32 u_op;
+    union ui32_f32 u_acc;
+    union ui32_f32 u_rslt;
+
+    uint32_t op1_u_f32, op1_l_f32, op2_u_f32, op2_l_f32;
+    float f_op1_u, f_op1_l, f_op2_u, f_op2_l, f_acc;
+    long double f_prod_l, f_prod_u, rslt;
+    uint32_t result;
+
+    #ifdef DEBUG
+    printf("Debug : op1_u =0x%04x\n",op1_u);
+    printf("Debug : op1_l =0x%04x\n",op1_l);
+    printf("Debug : op2_u =0x%04x\n",op2_u);
+    printf("Debug : op2_l =0x%04x\n",op2_l);
+    printf("Debug : acc =0x%08x\n",acc);
+    #endif
+
+    op1_u_f32 = f16_to_f32(op1_u);
+    op1_l_f32 = f16_to_f32(op1_l);
+    op2_u_f32 = f16_to_f32(op2_u);
+    op2_l_f32 = f16_to_f32(op2_l);
+
+    u_op.ui = op1_u_f32;
+    f_op1_u = u_op.f;
+
+    u_op.ui = op1_l_f32;
+    f_op1_l = u_op.f;
+
+    u_op.ui = op2_l_f32;
+    f_op2_l = u_op.f;
+
+    u_op.ui = op2_u_f32;
+    f_op2_u = u_op.f;
+
+    u_acc.ui = acc;
+    f_acc = u_acc.f;
+
+    f_prod_l = (long double)(f_op1_l * f_op2_l);
+    f_prod_u = (long double)(f_op1_u * f_op2_u);
+    rslt = (long double)((long double)f_acc + f_prod_u + f_prod_l);
+
+    u_rslt.f = rslt;
+    result = u_rslt.ui;
+    result = isNaNF32UI(result) ? FP32_DEF_NAN : result;
+
+    #ifdef DEBUG
+    printf("Debug : f_op1_u = %f\n",f_op1_u);
+    printf("Debug : f_op1_l = %f\n",f_op1_l);
+    printf("Debug : f_op2_u = %f\n",f_op2_u);
+    printf("Debug : f_op2_l = %f\n",f_op2_l);
+    printf("Debug : f_acc = %f\n",f_acc);
+    printf("Debug : f_prod_l = %Lf\n",f_prod_l);
+    printf("Debug : f_prod_u = %Lf\n",f_prod_u);
+    printf("Debug : rslt = %Lf\n",rslt);
+    printf("Debug : result =0x%08x\n",result);
+    #endif
+
+    return result;
+}
+
+uint16_t fp_min_hf(uint16_t op1,uint16_t op2)
+{
+    union ui32_f32 u_op1;
+    union ui32_f32 u_op2;
+    union ui32_f32 u_rslt;
+
+    uint32_t op1_f32;
+    uint32_t op2_f32;
+
+    float a,b,rslt;
+    uint32_t result_f32;
+    uint16_t result;
+
+    #ifdef DEBUG
+    printf("Debug : op1 =0x%08x\n",op1);
+    printf("Debug : op2 =0x%08x\n",op2);
+    #endif
+
+    if(isNaNF16UI(op1) || isNaNF16UI(op2))
+        return FP16_DEF_NAN;
+
+    op1_f32 = f16_to_f32(op1);
+    op2_f32 = f16_to_f32(op2);
+
+    u_op1.ui = op1_f32;
+    u_op2.ui = op2_f32;
+    a = u_op1.f;
+    b = u_op2.f;
+
+    rslt = (a>b) ? b : a;
+    // +0 is evaluated equal to -0 in C. Handling that case separately
+    if( (fabs(a) == 0.0f) && (fabs(b) == 0.0f) && (signF16UI(op1) !=
+        signF16UI(op2)) )
+    {
+        rslt = signF16UI(op1) ? a : b;
+    }
+    u_rslt.f = rslt;
+    result_f32 = u_rslt.ui;
+
+    result = f32_to_f16(result_f32);
+
+    #ifdef DEBUG
+    printf("Debug : a = %f\n",a);
+    printf("Debug : b = %f\n",b);
+    printf("Debug : rslt = %f\n",rslt);
+    printf("Debug : result =0x%08x\n",result);
+    #endif
+
+    return result;
+
+}
+
+uint32_t fp_min_sf(uint32_t op1,uint32_t op2)
+{
+    union ui32_f32 u_op1;
+    union ui32_f32 u_op2;
+    union ui32_f32 u_rslt;
+
+    float a,b,rslt;
+    uint32_t result;
+
+    #ifdef DEBUG
+    printf("Debug : op1 =0x%08x\n",op1);
+    printf("Debug : op2 =0x%08x\n",op2);
+    #endif
+
+    if(isNaNF32UI(op1) || isNaNF32UI(op2))
+        return FP32_DEF_NAN;
+
+    u_op1.ui = op1;
+    u_op2.ui = op2;
+    a = u_op1.f;
+    b = u_op2.f;
+    rslt = (a>b) ?
b : a;
+    // +0 is evaluated equal to -0 in C. Handling that case separately
+    if( (fabs(a) == 0.0f) && (fabs(b) == 0.0f) &&
+        (signF32UI(op1) != signF32UI(op2)) )
+    {
+        rslt = signF32UI(op1) ? a : b;
+    }
+    u_rslt.f = rslt;
+    result = u_rslt.ui;
+
+    #ifdef DEBUG
+    printf("Debug : a = %f\n",a);
+    printf("Debug : b = %f\n",b);
+    printf("Debug : rslt = %f\n",rslt);
+    printf("Debug : result =0x%08x\n",result);
+    #endif
+
+    return result;
+}
+
+uint16_t fp_min_bf(uint16_t op1,uint16_t op2)
+{
+    uint32_t op1_f32;
+    uint32_t op2_f32;
+
+    uint32_t result_f32;
+    uint16_t result;
+
+    op1_f32 = ((uint32_t)op1) << 16;
+    op2_f32 = ((uint32_t)op2) << 16;
+
+    result_f32 = fp_min_sf(op1_f32, op2_f32);
+    result_f32 = result_f32 >> 16;
+    result = result_f32 & 0xFFFF;
+    return result;
+}
+
+
+uint16_t fp_max_hf(uint16_t op1,uint16_t op2)
+{
+    union ui32_f32 u_op1;
+    union ui32_f32 u_op2;
+    union ui32_f32 u_rslt;
+
+    uint32_t op1_f32;
+    uint32_t op2_f32;
+
+    float a,b,rslt;
+    uint32_t result_f32;
+    uint16_t result;
+
+    #ifdef DEBUG
+    printf("Debug : op1 =0x%08x\n",op1);
+    printf("Debug : op2 =0x%08x\n",op2);
+    #endif
+
+    if(isNaNF16UI(op1) || isNaNF16UI(op2))
+        return FP16_DEF_NAN;
+
+    op1_f32 = f16_to_f32(op1);
+    op2_f32 = f16_to_f32(op2);
+
+    u_op1.ui = op1_f32;
+    u_op2.ui = op2_f32;
+    a = u_op1.f;
+    b = u_op2.f;
+
+    rslt = (a>b) ? a : b;
+    // +0 is evaluated equal to -0 in C. Handling that case separately
+    if( (fabs(a) == 0.0f) &&
+        (fabs(b) == 0.0f) && (signF16UI(op1) != signF16UI(op2)) )
+    {
+        rslt = signF16UI(op1) ? b : a;
+    }
+    u_rslt.f = rslt;
+    result_f32 = u_rslt.ui;
+
+    result = f32_to_f16(result_f32);
+
+    #ifdef DEBUG
+    printf("Debug : a = %f\n",a);
+    printf("Debug : b = %f\n",b);
+    printf("Debug : rslt = %f\n",rslt);
+    printf("Debug : result =0x%08x\n",result);
+    #endif
+
+    return result;
+
+}
+
+uint32_t fp_max_sf(uint32_t op1,uint32_t op2)
+{
+    union ui32_f32 u_op1;
+    union ui32_f32 u_op2;
+    union ui32_f32 u_rslt;
+
+    float a,b,rslt;
+    uint32_t result;
+
+    #ifdef DEBUG
+    printf("Debug : op1 =0x%08x\n",op1);
+    printf("Debug : op2 =0x%08x\n",op2);
+    #endif
+
+    if(isNaNF32UI(op1) || isNaNF32UI(op2))
+        return FP32_DEF_NAN;
+
+    u_op1.ui = op1;
+    u_op2.ui = op2;
+    a = u_op1.f;
+    b = u_op2.f;
+    rslt = (a>b) ? a : b;
+    // +0 is evaluated equal to -0 in C. Handling that case separately
+    if( (fabs(a) == 0.0f) && (fabs(b) == 0.0f) &&
+        (signF32UI(op1) != signF32UI(op2)) )
+    {
+        rslt = signF32UI(op1) ?
b : a; + } + u_rslt.f = rslt; + result = u_rslt.ui; + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : b = %f\n",b); + printf("Debug : rslt = %f\n",rslt); + printf("Debug : result =0x%08x\n",result); + #endif + + return result; +} + +uint16_t fp_max_bf(uint16_t op1,uint16_t op2) +{ + uint32_t op1_f32; + uint32_t op2_f32; + + uint32_t result_f32; + uint16_t result; + + op1_f32 = ((uint32_t)op1) << 16; + op2_f32 = ((uint32_t)op2) << 16; + + result_f32 = fp_max_sf(op1_f32, op2_f32); + result_f32 = result_f32 >> 16; + result = result_f32 & 0xFFFF; + return result; +} + +uint16_t fp_abs_bf(uint16_t op1) +{ + union ui32_f32 u_op1; + + float result_f; + uint32_t result_f32; + uint16_t result; + + u_op1.ui = ((uint32_t)op1) << 16; + + result_f = fabs(u_op1.f); + u_op1.f = result_f; + result_f32 = u_op1.ui >> 16; + result = result_f32 & 0xFFFF; + return result; +} + +uint16_t fp_neg_bf(uint16_t op1) +{ + union ui32_f32 u_op1; + + float result_f; + uint32_t result_f32; + uint16_t result; + + u_op1.ui = ((uint32_t)op1) << 16; + + result_f = -(u_op1.f); + u_op1.f = result_f; + result_f32 = u_op1.ui >> 16; + result = result_f32 & 0xFFFF; + return result; +} + +//float fmaf( float x, float y, float z ); +uint16_t fp_mult_hf_hf_acc_dumb (uint16_t op1, uint16_t op2, uint16_t acc) +{ + union ui32_f32 u_op1; + union ui32_f32 u_op2; + union ui32_f32 u_acc; + union ui32_f32 u_rslt; + + uint32_t op1_f32; + uint32_t op2_f32; + uint32_t acc_f32; + + float a,b,facc,rslt; + uint32_t result_f32; + uint16_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%04x\n",op1); + printf("Debug : op2 =0x%04x\n",op2); + printf("Debug : acc =0x%04x\n",acc); + #endif + + if(isNaNF16UI(op1) || isNaNF16UI(op2) || isNaNF16UI(acc)) + return FP16_DEF_NAN; + + op1_f32 = f16_to_f32(op1); + op2_f32 = f16_to_f32(op2); + acc_f32 = f16_to_f32(acc); + + u_op1.ui = op1_f32; + u_op2.ui = op2_f32; + u_acc.ui = acc_f32; + a = u_op1.f; + b = u_op2.f; + facc = u_acc.f; + //rslt = fma(a,b,facc); + rslt = (a * b) + facc; + u_rslt.f = rslt; + result_f32 = u_rslt.ui; + + result = f32_to_f16(result_f32); + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : b = %f\n",b); + printf("Debug : facc = %f\n",facc); + printf("Debug : rslt = %f\n",rslt); + printf("Debug : result =0x%04x\n",result); + #endif + + return result; +} + +uint32_t fp_mult_sf_hf_acc (uint16_t op1, uint16_t op2, uint32_t acc) +{ + union ui32_f32 u_op1; + union ui32_f32 u_op2; + union ui32_f32 u_acc; + union ui32_f32 u_rslt; + + uint32_t op1_f32; + uint32_t op2_f32; + + float a,b,facc,rslt; + uint32_t result; + + #ifdef DEBUG + printf("Debug : op1 =0x%04x\n",op1); + printf("Debug : op2 =0x%04x\n",op2); + printf("Debug : acc =0x%08x\n",acc); + #endif + + if(isNaNF16UI(op1) || isNaNF16UI(op2) || isNaNF32UI(acc)) + return FP32_DEF_NAN; + + op1_f32 = f16_to_f32(op1); + op2_f32 = f16_to_f32(op2); + + u_op1.ui = op1_f32; + u_op2.ui = op2_f32; + u_acc.ui = acc; + a = u_op1.f; + b = u_op2.f; + facc = u_acc.f; + //rslt = fma(a,b,facc); + rslt = (a * b) + facc; + u_rslt.f = rslt; + result = u_rslt.ui; + result = isNaNF32UI(result) ? 
FP32_DEF_NAN : result; + + #ifdef DEBUG + printf("Debug : a = %f\n",a); + printf("Debug : b = %f\n",b); + printf("Debug : facc = %f\n",facc); + printf("Debug : rslt = %f\n",rslt); + printf("Debug : result =0x%04x\n",result); + #endif + + return result; +} diff --git a/target/hexagon/mmvec/kvx_ieee.h b/target/hexagon/mmvec/kvx_ieee.h new file mode 100644 index 000000000000..ad80b7023925 --- /dev/null +++ b/target/hexagon/mmvec/kvx_ieee.h @@ -0,0 +1,141 @@ +/* + * Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef KVX_COMPACT_H +#define KVX_COMPACT_H 1 + +#include +#include "hex_arch_types.h" + +//Double precision +#define signF64UI( a ) ((bool) ((uint64_t) (a)>>63)) +#define expF64UI( a ) ((int_fast16_t) ((a)>>52) & 0x7FF) +#define fracF64UI( a ) ((a) & UINT64_C( 0x000FFFFFFFFFFFFF )) +#define packToF64UI( sign, exp, sig ) ((uint64_t) (((uint_fast64_t) (sign)<<63) + ((uint_fast64_t) (exp)<<52) + (sig))) +#define isNaNF64UI( a ) (((~(a) & UINT64_C( 0x7FF0000000000000 )) == 0) && ((a) & UINT64_C( 0x000FFFFFFFFFFFFF ))) + +//SF defines +#define FP32_DEF_NAN 0x7FFFFFFF +#define isNaNF32UI( a ) (((~(a) & 0x7F800000) == 0) && ((a) & 0x007FFFFF)) +#define isInfF32UI( a ) (((~(a) & 0x7F800000) == 0) && (((a) & 0x007FFFFF) == 0)) +#define signF32UI( a ) ((bool) ((uint32_t) (a)>>31)) +#define expF32UI( a ) ((int_fast16_t) ((a)>>23) & 0xFF) +#define fracF32UI( a ) ((a) & 0x007FFFFF) +#define packToF32UI( sign, exp, sig ) (((uint32_t) (sign)<<31) + ((uint32_t) (exp)<<23) + (sig)) + +//HF defines +#define FP16_DEF_NAN 0x7FFF +#define isNaNF16UI( a ) (((~(a) & 0x7C00) == 0) && ((a) & 0x03FF)) +#define isInfF16UI( a ) (((~(a) & 0x7C00) == 0) && (((a) & 0x03FF) == 0)) +#define signF16UI( a ) ((bool) ((uint16_t) (a)>>15)) +#define expF16UI( a ) ((int_fast8_t) ((a)>>10) & 0x1F) +#define fracF16UI( a ) ((a) & 0x03FF) +#define packToF16UI( sign, exp, sig ) (((uint16_t) (sign)<<15) + ((uint16_t) (exp)<<10) + (sig)) + +#define UHW_MIN 0 +#define UHW_MAX 65535 +#define HW_MIN -32768 +#define HW_MAX 32767 + +#define UBYTE_MIN 0 +#define UBYTE_MAX 255 +#define BYTE_MIN -128 +#define BYTE_MAX 127 + +//union ui16_f16 { uint16_t ui; float16_t f; }; +union ui32_f32 { uint32_t ui; float f; }; +union ui64_f64 { uint64_t ui; double f; }; +struct exp8_sig16 { int_fast8_t exp; uint_fast16_t sig; }; + +uint32_t shiftRightJam32( uint32_t a, uint_fast16_t dist ); +uint_fast8_t countLeadingZeros16( uint16_t a ); +struct exp8_sig16 normSubnormalF16Sig( uint_fast16_t sig ); +uint16_t roundPackToF16( bool sign, int_fast16_t exp, uint_fast16_t sig ); + +//-------------------------------------------------------------------------- +// IEEE - FP Convert instructions +//-------------------------------------------------------------------------- +uint16_t f32_to_f16 ( uint32_t a); +uint32_t f16_to_f32( uint16_t a ); + +uint16_t f16_to_uh( uint16_t op1); +int16_t f16_to_h ( 
uint16_t op1); +uint8_t f16_to_ub( uint16_t op1); +int8_t f16_to_b ( uint16_t op1); + +uint16_t uh_to_f16(uint16_t op1); +uint16_t h_to_f16 (int16_t op1); +uint16_t ub_to_f16(uint8_t op1); +uint16_t b_to_f16 (int8_t op1); + +uint16_t sf_to_bf (int32_t op1); + +//-------------------------------------------------------------------------- +// IEEE - FP ADD/SUB/MPY instructions +//-------------------------------------------------------------------------- + +//size4s_t fp_mult(size4s_t input_1, size4s_t input_2); +uint32_t fp_mult_sf_sf (uint32_t op1, uint32_t op2); +uint32_t fp_add_sf_sf (uint32_t op1, uint32_t op2); +uint32_t fp_sub_sf_sf (uint32_t op1, uint32_t op2); + +uint16_t fp_mult_hf_hf (uint16_t op1, uint16_t op2); +uint16_t fp_add_hf_hf (uint16_t op1, uint16_t op2); +uint16_t fp_sub_hf_hf (uint16_t op1, uint16_t op2); + +uint32_t fp_mult_sf_hf (uint16_t op1, uint16_t op2); +uint32_t fp_add_sf_hf (uint16_t op1, uint16_t op2); +uint32_t fp_sub_sf_hf (uint16_t op1, uint16_t op2); + +uint32_t fp_mult_sf_bf (uint16_t op1, uint16_t op2); +uint32_t fp_add_sf_bf (uint16_t op1, uint16_t op2); +uint32_t fp_sub_sf_bf (uint16_t op1, uint16_t op2); + +//-------------------------------------------------------------------------- +// IEEE - FP Accumulate instructions +//-------------------------------------------------------------------------- + +uint16_t fp_mult_hf_hf_acc (uint16_t op1, uint16_t op2, uint16_t acc); +uint32_t fp_mult_sf_bf_acc (uint16_t op1, uint16_t op2, uint32_t acc); +uint32_t fp_mult_sf_hf_acc (uint16_t op1, uint16_t op2, uint32_t acc); + +//-------------------------------------------------------------------------- +// IEEE - FP Reduce instructions +//-------------------------------------------------------------------------- + +uint32_t fp_vdmpy (uint16_t op1_u,uint16_t op1_l,uint16_t op2_u,uint16_t op2_l); +uint32_t fp_vdmpy_acc (uint32_t acc,uint16_t op1_u,uint16_t op1_l,uint16_t op2_u,uint16_t op2_l); + +//-------------------------------------------------------------------------- +// IEEE - FP Select instructions +//-------------------------------------------------------------------------- + +uint16_t fp_min_hf(uint16_t op1,uint16_t op2); +uint16_t fp_max_hf(uint16_t op1,uint16_t op2); +uint32_t fp_min_sf(uint32_t op1,uint32_t op2); +uint32_t fp_max_sf(uint32_t op1,uint32_t op2); +uint16_t fp_min_bf(uint16_t op1,uint16_t op2); +uint16_t fp_max_bf(uint16_t op1,uint16_t op2); +uint16_t fp_abs_bf(uint16_t op1); +uint16_t fp_neg_bf(uint16_t op1); + +//-------------------------------------------------------------------------- +// IEEE - FP Experiment Implementations +//-------------------------------------------------------------------------- +uint16_t fp_mult_hf_hf_acc_dumb (uint16_t op1, uint16_t op2, uint16_t acc); +uint32_t fp_vdmpy_acc_dumb (uint32_t acc,uint16_t op1_u,uint16_t op1_l,uint16_t op2_u,uint16_t op2_l); +#endif diff --git a/target/hexagon/mmvec/kvx_mac_reduce.c b/target/hexagon/mmvec/kvx_mac_reduce.c new file mode 100644 index 000000000000..e11e41ae5891 --- /dev/null +++ b/target/hexagon/mmvec/kvx_mac_reduce.c @@ -0,0 +1,1156 @@ +/* + * Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "qemu/osdep.h" +#include "kvx_ieee.h" + +#define DF_MANTBITS() 52 +#define SF_MANTBITS() 23 +#define HF_MANTBITS() 10 + +#define DF_INF_EXP 0x7ff +#define DF_BIAS 1023 + +#define SF_INF_EXP 0xff +#define SF_BIAS 127 + +#define HF_INF_EXP 0x1f +#define HF_BIAS 15 + +#define WAY_BIG_EXP 4096 + +#define isz(X) (fabs(X) == 0.0f) + + +typedef union { + double f; + size8u_t i; +#ifndef SLOWLARIS + struct { + size8u_t mant:52; + size8u_t exp:11; + size8u_t sign:1; + } x; +#else + struct { + size8u_t sign:1; + size8u_t exp:11; + size8u_t mant:52; + } x; +#endif +} df_t; + +typedef union { + float f; + size4u_t i; +#ifndef SLOWLARIS + struct { + size4u_t mant:23; + size4u_t exp:8; + size4u_t sign:1; + } x; +#else + struct { + size4u_t sign:1; + size4u_t exp:8; + size4u_t mant:23; + } x; +#endif +} sf_t; + +typedef struct { + union { + size8u_t low; + struct { +#ifndef SLOWLARIS + size4u_t w0; + size4u_t w1; +#else + size4u_t w1; + size4u_t w0; +#endif + }; + }; + union { + size8u_t high; + struct { +#ifndef SLOWLARIS + size4u_t w2; + size4u_t w3; +#else + size4u_t w3; + size4u_t w2; +#endif + }; + }; +} int128_t; + +typedef struct { + int128_t mant; + size4s_t exp; + size1u_t sign; + size1u_t guard; + size1u_t round; + size1u_t sticky; +} xf_t; + +static inline void xf_init(xf_t * p) +{ + p->mant.low = 0; + p->mant.high = 0; + p->exp = 0; + p->sign = 0; + p->guard = 0; + p->round = 0; + p->sticky = 0; +} + +size8u_t df_getmant_kvx(df_t a); +size8u_t df_getmant_kvx(df_t a) +{ + //int class = fpclassify(a.f); + //switch (class) { + //case FP_NORMAL: + return (a.x.mant | 1ULL << 52); + //case FP_ZERO: + // return 0; + //case FP_SUBNORMAL: + // return a.x.mant; + //default: + // return -1; + //}; +} + +size4s_t df_getexp_kvx(df_t a); +size4s_t df_getexp_kvx(df_t a) +{ + //int class = fpclassify(a.f); + //switch (class) { + //case FP_NORMAL: + return a.x.exp; + //case FP_SUBNORMAL: + // return a.x.exp + 1; + //default: + // return -1; + //}; +} + +size8u_t sf_getmant_kvx(sf_t a); +size8u_t sf_getmant_kvx(sf_t a) +{ + //case FP_ZERO: + if((a.x.mant == 0) && (a.x.exp == 0)) + return 0; + //case FP_SUBNORMAL: + else if((a.x.mant != 0) && (a.x.exp == 0)) + return a.x.mant; + //case FP_NORMAL: + else if((a.x.exp != 0xFF) && (a.x.exp != 0)) + return (a.x.mant | 1ULL << 23); + //default: + else + return -1; +} + +size4s_t sf_getexp_kvx(sf_t a); +size4s_t sf_getexp_kvx(sf_t a) +{ + //case FP_SUBNORMAL: + if((a.x.mant != 0) && (a.x.exp == 0)) + return a.x.exp + 1; + //case FP_NORMAL: + else if((a.x.exp != 0xFF) && (a.x.exp != 0)) + return a.x.exp; + //default: + else + return -1; +} + +static inline void xf_debug(const char *msg, xf_t a) +{ +#ifdef DEBUG + printf("%s %c0x%016llx_%016llx /%d/%d/%d p%d\n", msg, + a.sign ? 
'-' : '+', a.mant.high, a.mant.low, a.guard, + a.round, a.sticky, a.exp); +#endif +} + +static inline int128_t int128_shl(int128_t a, size4u_t amt) +{ + int128_t ret; + if (amt == 0) + return a; + if (amt > 128) { + ret.high = 0; + ret.low = 0; + return ret; + } + if (amt >= 64) { + amt -= 64; + a.high = a.low; + a.low = 0; + } + ret.high = a.high << amt; + ret.high |= (a.low >> (64 - amt)); + ret.low = a.low << amt; + return ret; +} + +static inline int128_t int128_shr(int128_t a, size4u_t amt) +{ + int128_t ret; + if (amt == 0) + return a; + if (amt > 128) { + ret.high = 0; + ret.low = 0; + return ret; + } + if (amt >= 64) { + amt -= 64; + a.low = a.high; + a.high = 0; + } + ret.low = a.low >> amt; + ret.low |= (a.high << (64 - amt)); + ret.high = a.high >> amt; + return ret; +} + + +#define int128_gt kvx_int128_gt +static inline int kvx_int128_gt(int128_t a, int128_t b) +{ + if (a.high == b.high) + return (a.low > b.low); + return (a.high > b.high); +} + +static inline xf_t xf_norm_left(xf_t a) +{ + a.exp--; + a.mant = int128_shl(a.mant, 1); + a.mant.low |= a.guard; + a.guard = a.round; + a.round = a.sticky; + return a; +} + +static inline xf_t xf_norm_right(xf_t a, int amt) +{ + if (amt > 130) { + a.sticky |= + a.round | a.guard | (a.mant.low != 0) | (a.mant.high != 0); + a.guard = a.round = a.mant.high = a.mant.low = 0; + a.exp += amt; + return a; + + } + while (amt >= 64) { + a.sticky |= a.round | a.guard | (a.mant.low != 0); + a.guard = (a.mant.low >> 63) & 1; + a.round = (a.mant.low >> 62) & 1; + a.mant.low = a.mant.high; + a.mant.high = 0; + a.exp += 64; + amt -= 64; + } + while (amt > 0) { + a.exp++; + a.sticky |= a.round; + a.round = a.guard; + a.guard = a.mant.low & 1; + a.mant = int128_shr(a.mant, 1); + amt--; + } + return a; +} + +#define int128_add kvx_int128_add +static inline int128_t kvx_int128_add(int128_t a, int128_t b) +{ + int128_t ret; + ret.low = a.low + b.low; + if ((ret.low < a.low) || (ret.low < b.low)) { + /* carry into high part */ + a.high += 1; + } + ret.high = a.high + b.high; + return ret; +} + +#define int128_sub kvx_int128_sub +static inline int128_t kvx_int128_sub(int128_t a, int128_t b, int borrow) +{ + int128_t ret; + ret.low = a.low - b.low; + if (ret.low > a.low) { + /* borrow into high part */ + a.high -= 1; + } + ret.high = a.high - b.high; + if (borrow == 0) { + return ret; + } else { + a.high = 0; + a.low = 1; + return int128_sub(ret, a, 0); + } +} + +/* Return an infinity with the same sign as a */ +static inline df_t infinite_df_t(xf_t a) +{ + df_t ret; + ret.x.sign = a.sign; + ret.x.exp = DF_INF_EXP; + ret.x.mant = 0ULL; + return ret; +} + +/* Return a maximum finite value with the same sign as a */ +static inline df_t maxfinite_df_t(xf_t a) +{ + df_t ret; + ret.x.sign = a.sign; + ret.x.exp = DF_INF_EXP - 1; + ret.x.mant = 0x000fffffffffffffULL; + return ret; +} + +static inline df_t f2df_t(double in) +{ + df_t ret; + ret.f = in; + return ret; +} + +/* Return an infinity with the same sign as a */ +static inline sf_t infinite_sf_t(xf_t a) +{ + sf_t ret; + ret.x.sign = a.sign; + ret.x.exp = SF_INF_EXP; + ret.x.mant = 0ULL; + return ret; +} + +/* Return a maximum finite value with the same sign as a */ +static inline sf_t maxfinite_sf_t(xf_t a) +{ + sf_t ret; + ret.x.sign = a.sign; + ret.x.exp = SF_INF_EXP - 1; + ret.x.mant = 0x007fffffUL; + return ret; +} + +static inline sf_t f2sf_t(float in) +{ + sf_t ret; + ret.f = in; + return ret; +} + +#define GEN_XF_ROUND(TYPE,MANTBITS,INF_EXP) \ +TYPE xf_round_kvx_##TYPE(xf_t a); \ +TYPE 
xf_round_kvx_##TYPE(xf_t a) \ +{ \ + TYPE ret; \ + ret.i = 0; \ + ret.x.sign = a.sign; \ + if ((a.mant.high == 0) && (a.mant.low == 0) \ + && ((a.guard | a.round | a.sticky) == 0)) { \ + /* result zero */ \ + /*switch (fegetround()) { */\ + /*case FE_DOWNWARD: */\ + /* return f2##TYPE(-0.0); */\ + /*default: */\ + if(a.sign) return f2##TYPE(-0.0); \ + else return f2##TYPE(0.0); \ + /*} */\ + } \ + /* Normalize right */ \ + /* We want MANTBITS bits of mantissa plus the leading one. */ \ + /* That means that we want MANTBITS+1 bits, or 0x000000000000FF_FFFF */ \ + /* So we need to normalize right while the high word is non-zero and \ + * while the low word is nonzero when masked with 0xffe0_0000_0000_0000 */ \ + xf_debug("input: ", a); \ + while ((a.mant.high != 0) || ((a.mant.low >> (MANTBITS+1)) != 0)) { \ + a = xf_norm_right(a, 1); \ + } \ + xf_debug("norm_right: ", a); \ + /* OK, now normalize left */ \ + /* We want to normalize left until we have a leading one in bit 24 */ \ + /* Theoretically, we only need to shift a maximum of one to the left if we \ + * shifted out lots of bits from B, or if we had no shift / 1 shift sticky shoudl be 0 \ + */ \ + while ((a.mant.low & (1ULL << MANTBITS)) == 0) { \ + a = xf_norm_left(a); \ + } \ + xf_debug("norm_left: ", a); \ + /* OK, now we might need to denormalize because of potential underflow. We need \ + * to do this before rounding, and rounding might make us normal again */ \ + while (a.exp <= 0) { \ + a = xf_norm_right(a, 1 - a.exp); \ + /* Do we have underflow? That's when we get an inexact answer because we \ + * ran out of bits in a denormal. */ \ + if (a.guard || a.round || a.sticky) { \ + /*feraiseexcept(FE_UNDERFLOW);*/ \ + } \ + } \ + xf_debug("norm_denorm: ", a); \ + /* OK, we're relatively canonical... now we need to round */ \ + if (a.guard || a.round || a.sticky) { \ + /*feraiseexcept(FE_INEXACT);*/ \ + /*switch (fegetround()) { */\ + /*case FE_TOWARDZERO: */\ + /* Chop and we're done */ \ + /* break; */\ + /*case FE_UPWARD: */\ + /* if (a.sign == 0) a.mant.low += 1; */\ + /* break; */\ + /*case FE_DOWNWARD: */\ + /* if (a.sign != 0) a.mant.low += 1; */\ + /* break; */\ + /*default: */\ + if (a.round || a.sticky) { \ + /* round up if guard is 1, down if guard is zero */ \ + a.mant.low += a.guard; \ + } else if (a.guard) { \ + /* exactly .5, round up if odd */ \ + a.mant.low += (a.mant.low & 1); \ + } \ + /*break; */\ + /*}*/ \ + } \ + xf_debug("post_round: ", a); \ + /* OK, now we might have carried all the way up. So we might need to shr once */ \ + /* at least we know that the lsb should be zero if we rounded and got a carry out... */ \ + if ((a.mant.low >> (MANTBITS+1)) != 0) { \ + a = xf_norm_right(a, 1); \ + } \ + xf_debug("once_norm_right: ", a); \ + /* Overflow? */ \ + if (a.exp >= INF_EXP) { \ + /* Yep, inf result */ \ + xf_debug("inf: ", a); \ + /*feraiseexcept(FE_OVERFLOW);*/ \ + /*feraiseexcept(FE_INEXACT);*/ \ + /*switch (fegetround()) { */\ + /*case FE_TOWARDZERO: */\ + /* return maxfinite_##TYPE(a); */\ + /*case FE_UPWARD: */\ + /* if (a.sign == 0) */\ + /* return infinite_##TYPE(a); */\ + /* else */\ + /* return maxfinite_##TYPE(a); */\ + /*case FE_DOWNWARD: */\ + /* if (a.sign != 0) */\ + /* return infinite_##TYPE(a); */\ + /* else */\ + /* return maxfinite_##TYPE(a); */\ + /*default: */\ + return infinite_##TYPE(a); \ + /*} */\ + } \ + /* Underflow? */ \ + if (a.mant.low & (1ULL << MANTBITS)) { \ + /* Leading one means: No, we're normal. So, we should be done... 
*/ \ + xf_debug("norm: ", a); \ + ret.x.exp = a.exp; \ + ret.x.mant = a.mant.low; \ + return ret; \ + } \ + xf_debug("denorm: ", a); \ + if (a.exp != 1) \ + /*printf("a.exp == %d\n", a.exp);*/ \ + assert(a.exp == 1); \ + ret.x.exp = 0; \ + ret.x.mant = a.mant.low; \ + return ret; \ +} + +#define GEN_HF_ROUND(TYPE,MANTBITS,INF_EXP) \ +TYPE hf_round_##TYPE(xf_t a); \ +TYPE hf_round_##TYPE(xf_t a) \ +{ \ + TYPE ret; \ + ret.i = 0; \ + ret.x.sign = a.sign; \ + if ((a.mant.high == 0) && (a.mant.low == 0) \ + && ((a.guard | a.round | a.sticky) == 0)) { \ + /* result zero */ \ + /*switch (fegetround()) { */\ + /*case FE_DOWNWARD: */\ + /* return f2##TYPE(-0.0); */\ + /*default: */\ + if(a.sign) return f2##TYPE(-0.0); \ + else return f2##TYPE(0.0); \ + /*} */\ + } \ + /* Normalize right */ \ + /* We want MANTBITS bits of mantissa plus the leading one. */ \ + /* That means that we want MANTBITS+1 bits, or 0x000000000000FF_FFFF */ \ + /* So we need to normalize right while the high word is non-zero and \ + * while the low word is nonzero when masked with 0xffe0_0000_0000_0000 */ \ + xf_debug("input: ", a); \ + while ((a.mant.high != 0) || ((a.mant.low >> (MANTBITS+1)) != 0)) { \ + a = xf_norm_right(a, 1); \ + } \ + xf_debug("norm_right: ", a); \ + /* OK, now normalize left */ \ + /* We want to normalize left until we have a leading one in bit 24 */ \ + /* Theoretically, we only need to shift a maximum of one to the left if we \ + * shifted out lots of bits from B, or if we had no shift / 1 shift sticky shoudl be 0 \ + */ \ + while ((a.mant.low & (1ULL << MANTBITS)) == 0) { \ + a = xf_norm_left(a); \ + } \ + xf_debug("norm_left: ", a); \ + /* OK, now we might need to denormalize because of potential underflow. We need \ + * to do this before rounding, and rounding might make us normal again */ \ + while (a.exp <= 0) { \ + a = xf_norm_right(a, 1 - a.exp); \ + /* Do we have underflow? That's when we get an inexact answer because we \ + * ran out of bits in a denormal. */ \ + if (a.guard || a.round || a.sticky) { \ + /*feraiseexcept(FE_UNDERFLOW);*/ \ + } \ + } \ + xf_debug("norm_denorm: ", a); \ + /* OK, we're relatively canonical... now we need to round */ \ + /*if (a.guard || a.round || a.sticky) { */\ + /*feraiseexcept(FE_INEXACT);*/ \ + /*switch (fegetround()) { */\ + /*case FE_TOWARDZERO: */\ + /* Chop and we're done */ \ + /* break; */\ + /*case FE_UPWARD: */\ + /* if (a.sign == 0) a.mant.low += 1; */\ + /* break; */\ + /*case FE_DOWNWARD: */\ + /* if (a.sign != 0) a.mant.low += 1; */\ + /* break; */\ + /*default: */\ + if (a.round || a.sticky || a.guard) { \ + /* round up if guard is 1, down if guard is zero */ \ + if ((a.mant.low & 0xFFF) == 0) a.mant.low += 1; \ + /* } else if (a.guard) {*/ \ + /* exactly .5, round up if odd */ \ + /* a.mant.low += (a.mant.low & 1); */\ + } \ + /*break; */\ + /*}*/ \ + /*} */\ + xf_debug("post_round: ", a); \ + /* OK, now we might have carried all the way up. So we might need to shr once */ \ + /* at least we know that the lsb should be zero if we rounded and got a carry out... */ \ + if ((a.mant.low >> (MANTBITS+1)) != 0) { \ + a = xf_norm_right(a, 1); \ + } \ + xf_debug("once_norm_right: ", a); \ + /* Overflow? 
*/ \ + if (a.exp >= INF_EXP) { \ + /* Yep, inf result */ \ + xf_debug("inf: ", a); \ + /*feraiseexcept(FE_OVERFLOW);*/ \ + /*feraiseexcept(FE_INEXACT);*/ \ + /*switch (fegetround()) { */\ + /*case FE_TOWARDZERO: */\ + /* return maxfinite_##TYPE(a); */\ + /*case FE_UPWARD: */\ + /* if (a.sign == 0) */\ + /* return infinite_##TYPE(a); */\ + /* else */\ + /* return maxfinite_##TYPE(a); */\ + /*case FE_DOWNWARD: */\ + /* if (a.sign != 0) */\ + /* return infinite_##TYPE(a); */\ + /* else */\ + /* return maxfinite_##TYPE(a); */\ + /*default: */\ + return infinite_##TYPE(a); \ + /*} */\ + } \ + /* Underflow? */ \ + if (a.mant.low & (1ULL << MANTBITS)) { \ + /* Leading one means: No, we're normal. So, we should be done... */ \ + xf_debug("norm: ", a); \ + ret.x.exp = a.exp; \ + ret.x.mant = a.mant.low; \ + return ret; \ + } \ + xf_debug("denorm: ", a); \ + if (a.exp != 1) \ + /*printf("a.exp == %d\n", a.exp);*/ \ + assert(a.exp == 1); \ + ret.x.exp = 0; \ + ret.x.mant = a.mant.low; \ + return ret; \ +} + + +GEN_XF_ROUND(df_t,DF_MANTBITS(),DF_INF_EXP) +GEN_XF_ROUND(sf_t,SF_MANTBITS(),SF_INF_EXP) +GEN_HF_ROUND(sf_t,SF_MANTBITS(),SF_INF_EXP) + +#define int128_mult_6464 kvx_int128_mult_6464 +static inline int128_t kvx_int128_mult_6464(size8u_t ai, size8u_t bi) +{ + int128_t ret; + int128_t a, b; + size8u_t pp0, pp1a, pp1b, pp1s, pp2; + +#ifdef DEBUG + printf("ai/bi: 0x%016llx/0x%016llx\n", ai, bi); +#endif + a.high = b.high = 0; + a.low = ai; + b.low = bi; + pp0 = (size8u_t) a.w0 * (size8u_t) b.w0; + pp1a = (size8u_t) a.w1 * (size8u_t) b.w0; + pp1b = (size8u_t) b.w1 * (size8u_t) a.w0; + pp2 = (size8u_t) a.w1 * (size8u_t) b.w1; +#ifdef DEBUG + printf("pp2/1b/1a/0: 0x%016llx/0x%016llx/0x%016llx/0x%016llx\n", + pp2, pp1b, pp1a, pp0); +#endif + pp1s = pp1a + pp1b; + if ((pp1s < pp1a) || (pp1s < pp1b)) { + pp2 += (1ULL << 32); + } + ret.low = pp0 + (pp1s << 32); + if ((ret.low < pp0) || (ret.low < (pp1s << 32))) + pp2 += 1; + ret.high = pp2 + (pp1s >> 32); +#ifdef DEBUG + printf("pp1s/rethi/retlo: 0x%016llx/0x%016llx/0x%016llx\n", + pp1s, ret.high, ret.low); +#endif + return ret; +} + +xf_t xf_add_kvx(xf_t a, xf_t b); + +xf_t xf_sub_kvx(xf_t a, xf_t b, int negate); +xf_t xf_sub_kvx(xf_t a, xf_t b, int negate) +{ + xf_t ret; + xf_init(&ret); + int borrow; + xf_debug("-->Sub/a: ", a); + xf_debug("-->Sub/b: ", b); + if (a.sign != b.sign) { + b.sign = !b.sign; + return xf_add_kvx(a, b); + } + if (b.exp > a.exp) { + /* small - big == - (big - small) */ + return xf_sub_kvx(b, a, !negate); + } + if ((b.exp == a.exp) && (int128_gt(b.mant, a.mant))) { + /* small - big == - (big - small) */ + return xf_sub_kvx(b, a, !negate); + } + xf_debug("OK: Sub/a: ", a); + xf_debug("OK: Sub/b: ", b); + while (a.exp > b.exp) { + /* Try to normalize exponents: shrink a exponent and grow mantissa */ + if (a.mant.high & (1ULL << 62)) { + /* Can't grow a any more */ + break; + } else { + a = xf_norm_left(a); + } + } + xf_debug("norm_l: Sub/a: ", a); + xf_debug("norm_l: Sub/b: ", b); + while (a.exp > b.exp) { + /* Try to normalize exponents: grow b exponent and shrink mantissa */ + /* Keep around shifted out bits... we might need those later */ + b = xf_norm_right(b, a.exp - b.exp); + } + xf_debug("norm_r: Sub/a: ", a); + xf_debug("norm_r: Sub/b: ", b); + if ((int128_gt(b.mant, a.mant))) { + xf_debug("retry: Sub/a: ", a); + xf_debug("retry: Sub/b: ", b); + return xf_sub_kvx(b, a, !negate); + } + /* OK, now things should be normalized! 
*/ + ret.sign = a.sign; + ret.exp = a.exp; + assert(!int128_gt(b.mant, a.mant)); + borrow = (b.round << 2) | (b.guard << 1) | b.sticky; + ret.mant = int128_sub(a.mant, b.mant, (borrow != 0)); + borrow = 0 - borrow; + ret.guard = (borrow >> 2) & 1; + ret.round = (borrow >> 1) & 1; + ret.sticky = (borrow >> 0) & 1; + if (negate) + ret.sign = !ret.sign; + //According to the IEEE standard, Zero result in a subtraction should always be positive + if ((ret.sign) && ((ret.mant.high == 0) && (ret.mant.low == 0) && ((ret.guard | ret.round | ret.sticky) == 0))) + ret.sign = !ret.sign; + xf_debug("ret: Sub ", ret); + return ret; +} + + +xf_t xf_add_kvx(xf_t a, xf_t b) +{ + xf_t ret; + xf_init(&ret); + xf_debug("-->Add/a: ", a); + xf_debug("-->Add/b: ", b); + if (a.sign != b.sign) { + b.sign = !b.sign; + return xf_sub_kvx(a, b, 0); + } + if (b.exp > a.exp) { + /* small + big == (big + small) */ + return xf_add_kvx(b, a); + } + if ((b.exp == a.exp) && int128_gt(b.mant, a.mant)) { + /* small + big == (big + small) */ + return xf_add_kvx(b, a); + } + xf_debug("OK? Add/a: ", a); + xf_debug("OK? Add/b: ", b); + while (a.exp > b.exp) { + /* Try to normalize exponents: shrink a exponent and grow mantissa */ + if (a.mant.high & (1ULL << 62)) { + /* Can't grow a any more */ + break; + } else { + a = xf_norm_left(a); + } + } + xf_debug("norm_l: Add/a: ", a); + xf_debug("norm_l: Add/b: ", b); + while (a.exp > b.exp) { + /* Try to normalize exponents: grow b exponent and shrink mantissa */ + /* Keep around shifted out bits... we might need those later */ + b = xf_norm_right(b, a.exp - b.exp); + } + xf_debug("norm_r: Add/a: ", a); + xf_debug("norm_r: Add/b: ", b); + /* OK, now things should be normalized! */ + if (int128_gt(b.mant, a.mant)) { + xf_debug("retry: Add/a: ", a); + xf_debug("retry: Add/b: ", b); + return xf_add_kvx(b, a); + }; + ret.sign = a.sign; + ret.exp = a.exp; + assert(!int128_gt(b.mant, a.mant)); + ret.mant = int128_add(a.mant, b.mant); + ret.guard = b.guard; + ret.round = b.round; + ret.sticky = b.sticky; + xf_debug("ret: Add ", ret); + return ret; +} + + +float internal_fma_kvx(float a_in, float b_in, float c_in, int scale); +float internal_fma_kvx(float a_in, float b_in, float c_in, int scale) +{ + sf_t a, b, c; + xf_t prod; + xf_t acc; + xf_t result; +#if 0 + df_t t; + fexcept_t flags_tmp; +#endif + xf_init(&prod); + xf_init(&acc); + xf_init(&result); + a.f = a_in; + b.f = b_in; + c.f = c_in; +// printf("internal_fma_kvxx: 0x%016x * 0x%016x + 0x%016x sc: %d\n", +// fUNFLOAT(a_in), fUNFLOAT(b_in), fUNFLOAT(c_in), scale); +// if (isinf(a.f) || isinf(b.f) || isinf(c.f)) +// return special_fmaf(a, b, c); +// if (isnan(a.f) || isnan(b.f) || isnan(c.f)) +// return special_fmaf(a, b, c); + if ((scale == 0) && (isz(a.f) || isz(b.f))) + return (a.f * b.f + c.f); + /* Is a*b exact? If so, we don't have to go the slow way */ + /* EJP: axe this for simplicity? 
*/ +#if 0 + fegetexceptflag(&flags_tmp, FE_ALL_EXCEPT); + feclearexcept(FE_ALL_EXCEPT); + t.f = a.f * b.f; + if (0 && (scale == 0) && isfinite(t.f) + && fetestexcept(FE_ALL_EXCEPT) == 0) { + /* It's exactly correct, we can just do the add and return */ + fesetexceptflag(&flags_tmp, FE_ALL_EXCEPT); + asm volatile (""); + t.f = (t.f + c.f); + return t.f; + } + fesetexceptflag(&flags_tmp, FE_ALL_EXCEPT); +#endif + /* (a * 2**b) * (c * 2**d) == a*c * 2**(b+d) */ + prod.mant = int128_mult_6464(sf_getmant_kvx(a), sf_getmant_kvx(b)); + /* Note: extracting the mantissa into an int is multiplying by 2**23, so adjust here: */ + prod.exp = sf_getexp_kvx(a) + sf_getexp_kvx(b) - SF_BIAS - 23; + prod.sign = a.x.sign ^ b.x.sign; + if (isz(a.f) || isz(b.f)) prod.exp = -2*WAY_BIG_EXP; + xf_debug("prod: ", prod); + if ((scale > 0) /*&& (fpclassify(c.f) == FP_SUBNORMAL)*/) { + acc.mant = int128_mult_6464(0,0); + acc.exp = -WAY_BIG_EXP; + acc.sign = c.x.sign; + acc.sticky = 1; + xf_debug("special denorm acc: ",acc); + result = xf_add_kvx(prod,acc); + } else if (!isz(c.f)) { + acc.mant = int128_mult_6464(sf_getmant_kvx(c), 1); + acc.exp = sf_getexp_kvx(c); + acc.sign = c.x.sign; + xf_debug("acc: ", acc); + result = xf_add_kvx(prod, acc); + } else { + result = prod; + } + xf_debug("sum: ", result); +#ifdef DEBUG + printf("Scaling: %d\n", scale); +#endif + result.exp += scale; + xf_debug("post-scale: ", result); + return hf_round_sf_t(result).f; +} + +// result = (a*c) + (b*d) + acc +float internal_vdmpy_acc(float a_in, float b_in, float c_in, float d_in, float acc_in, int scale); +float internal_vdmpy_acc(float a_in, float b_in, float c_in, float d_in, float acc_in, int scale) +{ + sf_t a, b, c, d, accm; + xf_t prod1; //a*c + xf_t prod2; //b*d + xf_t acc; + xf_t result_temp; + xf_t result; + + xf_init(&prod1); + xf_init(&prod2); + xf_init(&acc); + xf_init(&result_temp); + xf_init(&result); + + a.f = a_in; + b.f = b_in; + c.f = c_in; + d.f = d_in; + accm.f = acc_in; + + /* (a * 2**b) * (c * 2**d) == a*c * 2**(b+d) */ + prod1.mant = int128_mult_6464(sf_getmant_kvx(a), sf_getmant_kvx(c)); + /* Note: extracting the mantissa into an int is multiplying by 2**23, so adjust here: */ + prod1.exp = sf_getexp_kvx(a) + sf_getexp_kvx(c) - SF_BIAS - 23; + prod1.sign = a.x.sign ^ c.x.sign; + + /* (a * 2**b) * (c * 2**d) == a*c * 2**(b+d) */ + prod2.mant = int128_mult_6464(sf_getmant_kvx(b), sf_getmant_kvx(d)); + /* Note: extracting the mantissa into an int is multiplying by 2**23, so adjust here: */ + prod2.exp = sf_getexp_kvx(b) + sf_getexp_kvx(d) - SF_BIAS - 23; + prod2.sign = b.x.sign ^ d.x.sign; + + + if (isz(a.f) || isz(c.f)) prod1.exp = -2*WAY_BIG_EXP; + if (isz(b.f) || isz(d.f)) prod2.exp = -2*WAY_BIG_EXP; + + xf_debug("prod1: ", prod1); + xf_debug("prod2: ", prod2); + + if ((scale > 0) /*&& (fpclassify(c.f) == FP_SUBNORMAL)*/) { + acc.mant = int128_mult_6464(0,0); + acc.exp = -WAY_BIG_EXP; + acc.sign = c.x.sign; + acc.sticky = 1; + xf_debug("special denorm acc: ",acc); + //result = xf_add_kvx(prod,acc); + } else /*if (!isz(accm.f)) */{ + acc.mant = int128_mult_6464(sf_getmant_kvx(accm), 1); + acc.exp = sf_getexp_kvx(accm); + acc.sign = accm.x.sign; + xf_debug("acc: ", acc); + //result = xf_add_kvx(prod, acc); + } /*else { + result = xf_add_kvx(prod1, prod2); + }*/ + + //Add the 3 numbers: prod1 prod2 acc + //result_temp = xf_add_kvx(prod1,prod2); + //result = xf_add_kvx(result_temp,acc); + result_temp = xf_add_kvx(prod1,prod2); + result = xf_add_kvx(result_temp,acc); + + xf_debug("sum: ", result); +#ifdef DEBUG + 
printf("Scaling: %d\n", scale); +#endif + result.exp += scale; + xf_debug("post-scale: ", result); + return xf_round_kvx_sf_t(result).f; +} + + +uint32_t fp_vdmpy_acc (uint32_t acc,uint16_t op1_u,uint16_t op1_l,uint16_t op2_u,uint16_t op2_l) +{ + union ui32_f32 u_op; + union ui32_f32 u_acc; + union ui32_f32 u_rslt; + + uint32_t op1_u_f32, op1_l_f32, op2_u_f32, op2_l_f32; + float f_op1_u, f_op1_l, f_op2_u, f_op2_l, f_acc; + float f_prod_l = 0, f_prod_u = 0, rslt; + uint32_t result; + +#ifdef DEBUG + printf("Debug : op1_u =0x%04x\n",op1_u); + printf("Debug : op1_l =0x%04x\n",op1_l); + printf("Debug : op2_u =0x%04x\n",op2_u); + printf("Debug : op2_l =0x%04x\n",op2_l); + printf("Debug : acc =0x%08x\n",acc); +#endif + + if(isNaNF16UI(op1_u) || isNaNF16UI(op1_l) || isNaNF16UI(op2_u) || isNaNF16UI(op2_l) || isNaNF32UI(acc)) + return FP32_DEF_NAN; + + op1_u_f32 = f16_to_f32(op1_u); + op1_l_f32 = f16_to_f32(op1_l); + op2_u_f32 = f16_to_f32(op2_u); + op2_l_f32 = f16_to_f32(op2_l); + +#ifdef DEBUG + printf("Debug : op1_u_f32 =0x%08x\n",op1_u_f32); + printf("Debug : op1_l_f32 =0x%08x\n",op1_l_f32); + printf("Debug : op2_u_f32 =0x%08x\n",op2_u_f32); + printf("Debug : op2_l_f32 =0x%08x\n",op2_l_f32); +#endif + + u_op.ui = op1_u_f32; + f_op1_u = u_op.f; + + u_op.ui = op1_l_f32; + f_op1_l = u_op.f; + + u_op.ui = op2_l_f32; + f_op2_l = u_op.f; + + u_op.ui = op2_u_f32; + f_op2_u = u_op.f; + + u_acc.ui = acc; + f_acc = u_acc.f; + +#ifdef DEBUG + printf("Debug_0 : f_op1_u = %f\n",f_op1_u); + printf("Debug_0 : f_op1_l = %f\n",f_op1_l); + printf("Debug_0 : f_op2_u = %f\n",f_op2_u); + printf("Debug_0 : f_op2_l = %f\n",f_op2_l); + printf("Debug_0 : f_acc = %f\n",f_acc); +#endif + + f_prod_l = (f_op1_l * f_op2_l); + f_prod_u = (f_op1_u * f_op2_u); + + if(isInfF16UI(op1_u) || isInfF16UI(op1_l) || isInfF16UI(op2_u) || isInfF16UI(op2_l) || isInfF32UI(acc)) + { + rslt = (f_prod_u + f_prod_l + f_acc); +#ifdef DEBUG + printf("Debug_inf : rslt = %f\n",rslt); +#endif + u_rslt.f = rslt; + result = u_rslt.ui; +#ifdef DEBUG + printf("Debug_inf : result =0x%08x\n",result); +#endif + result = isNaNF32UI(result) ? FP32_DEF_NAN : result; +#ifdef DEBUG + printf("Debug_inf : result final =0x%08x\n",result); +#endif + return result; + } + + //If any of the below is a zero, we can use easy approach + if(isz(f_prod_l) || isz(f_prod_u) || isz(f_acc)) + { + rslt = (f_prod_u + f_prod_l + f_acc); +#ifdef DEBUG + printf("Debug_inf : rslt = %f\n",rslt); +#endif + u_rslt.f = rslt; + result = u_rslt.ui; +#ifdef DEBUG + printf("Debug_inf : result =0x%08x\n",result); +#endif + result = isNaNF32UI(result) ? FP32_DEF_NAN : result; +#ifdef DEBUG + printf("Debug_inf : result final =0x%08x\n",result); +#endif + return result; + } + + +////---------------------------------------------------------------------------------------------------- +// f_prod_l = (f_op1_l * f_op2_l); +// f_prod_u = (f_op1_u * f_op2_u); +// +// printf("Debug_1 : f_prod_l = %f\n",f_prod_l); +// printf("Debug_1 : f_prod_u = %f\n",f_prod_u); +// +// rslt = (f_prod_u + f_prod_l + f_acc); +// printf("Debug_1 : rslt = %f\n",rslt); +// u_rslt.f = rslt; +// result = u_rslt.ui; +// printf("Debug_1 : result =0x%08x\n",result); +////---------------------------------------------------------------------------------------------------- + + rslt = internal_vdmpy_acc(f_op1_u, f_op1_l,f_op2_u,f_op2_l,f_acc,0); + u_rslt.f = rslt; + result = u_rslt.ui; +#ifdef DEBUG + printf("Debug_2 : rslt = %f\n",rslt); + printf("Debug_2 : result =0x%08x\n",result); +#endif + + result = isNaNF32UI(result) ? 
FP32_DEF_NAN : result; + +#ifdef DEBUG + printf("Debug : f_op1_u = %f\n",f_op1_u); + printf("Debug : f_op1_l = %f\n",f_op1_l); + printf("Debug : f_op2_u = %f\n",f_op2_u); + printf("Debug : f_op2_l = %f\n",f_op2_l); + printf("Debug : f_acc = %f\n",f_acc); + printf("Debug : f_prod_l = %f\n",f_prod_l); + printf("Debug : f_prod_u = %f\n",f_prod_u); + printf("Debug : rslt = %f\n",rslt); + printf("Debug : result =0x%08x\n",result); +#endif + + return result; +} + + +uint16_t fp_mult_hf_hf_acc (uint16_t op1, uint16_t op2, uint16_t acc) +{ + union ui32_f32 u_op1; + union ui32_f32 u_op2; + union ui32_f32 u_acc; + union ui32_f32 u_rslt; + + uint32_t op1_f32; + uint32_t op2_f32; + uint32_t acc_f32; + + float a,b,facc,rslt; + uint32_t result_f32; + uint16_t result; + +#ifdef DEBUG + printf("Debug : op1 =0x%04x\n",op1); + printf("Debug : op2 =0x%04x\n",op2); + printf("Debug : acc =0x%04x\n",acc); +#endif + + if(isNaNF16UI(op1) || isNaNF16UI(op2) || isNaNF16UI(acc)) + return FP16_DEF_NAN; + + op1_f32 = f16_to_f32(op1); + op2_f32 = f16_to_f32(op2); + acc_f32 = f16_to_f32(acc); + +#ifdef DEBUG + printf("Debug : op1_f32 = 0x%08x\n",op1_f32); + printf("Debug : op2_f32 = 0x%08x\n",op2_f32); + printf("Debug : acc_f32 = 0x%08x\n",acc_f32); +#endif + + u_op1.ui = op1_f32; + u_op2.ui = op2_f32; + u_acc.ui = acc_f32; + a = u_op1.f; + b = u_op2.f; + facc = u_acc.f; + +#ifdef DEBUG + printf("Debug_1 : a = %f\n",a); + printf("Debug_1 : b = %f\n",b); + printf("Debug_1 : facc = %f\n",facc); +#endif + + if(isInfF16UI(op1) || isInfF16UI(op2) || isInfF16UI(acc)) + { + rslt = (a * b) + facc; +#ifdef DEBUG + printf("Debug_inf : rslt = %f\n",rslt); +#endif + u_rslt.f = rslt; + result_f32 = u_rslt.ui; + result = f32_to_f16(result_f32); +#ifdef DEBUG + printf("Debug_inf : result_f32 =0x%08x\n",result_f32); + printf("Debug_inf : result =0x%04x\n",result); +#endif + result = isNaNF16UI(result) ? FP16_DEF_NAN : result; +#ifdef DEBUG + printf("Debug_inf : result final =0x%04x\n",result); +#endif + return result; + } + +// //---------------------------------------------------------------------------------------------------- +// rslt = (a * b) + facc; +// u_rslt.f = rslt; +// result_f32 = u_rslt.ui; +// printf("Debug_3 : result_f32 =0x%08x\n",result_f32); +// result = f32_to_f16(result_f32); +// printf("Debug_3 : result =0x%04x\n",result); +// //---------------------------------------------------------------------------------------------------- + + //rslt = fma(a,b,facc); + rslt = internal_fma_kvx(a, b, facc, 0); + u_rslt.f = rslt; + result_f32 = u_rslt.ui; +#ifdef DEBUG + printf("Debug_2 : rslt = %f\n",rslt); + printf("Debug_2 : result_f32 =0x%08x\n",result_f32); +#endif + + result = f32_to_f16(result_f32); + +#ifdef DEBUG + printf("Debug_2 : result =0x%04x\n",result); +#endif + + result = isNaNF16UI(result) ? 
FP16_DEF_NAN : result;
+
+#ifdef DEBUG
+    printf("Debug_2 : result final =0x%04x\n",result);
+#endif
+
+    return result;
+}
+
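[Reviewer note, not part of the patch: kvx_int128_mult_6464 above assembles the 128-bit product from four 32x32 partial products and propagates the carries by hand. On compilers that provide unsigned __int128 (a GCC/Clang extension), that carry logic can be cross-checked in a few lines; test_mult_6464 below is a hypothetical test sketch, not patch code.]

    #include <assert.h>
    #include <stdint.h>

    /* Cross-check the hand-carried 64x64->128 multiply against the
     * compiler's unsigned __int128 (GCC/Clang extension). */
    static void test_mult_6464(uint64_t a, uint64_t b)
    {
        uint64_t a0 = (uint32_t)a, a1 = a >> 32;
        uint64_t b0 = (uint32_t)b, b1 = b >> 32;
        uint64_t pp0 = a0 * b0, pp1a = a1 * b0, pp1b = b1 * a0, pp2 = a1 * b1;
        uint64_t pp1s = pp1a + pp1b;
        if (pp1s < pp1a) {              /* carry out of the middle sum */
            pp2 += 1ULL << 32;
        }
        uint64_t low = pp0 + (pp1s << 32);
        if (low < pp0) {                /* carry into the high half */
            pp2 += 1;
        }
        uint64_t high = pp2 + (pp1s >> 32);

        unsigned __int128 ref = (unsigned __int128)a * b;
        assert(low == (uint64_t)ref);
        assert(high == (uint64_t)(ref >> 64));
    }

diff --git a/target/hexagon/mmvec/macros.h b/target/hexagon/mmvec/macros.h
index bcd4a1e8973c..645ec9280972 100644
--- a/target/hexagon/mmvec/macros.h
+++ b/target/hexagon/mmvec/macros.h
@@ -354,3 +354,16 @@
 } while (0);
 #endif
+
+#define fPARSEHF(A) parse_hf(A)
+#define fPARSESF(A) parse_sf(A)
+#define fPARSEQF16(A) parse_qf16(A)
+#define fPARSEQF32(A) parse_qf32(A)
+
+#define fRNDSATHF(A,B) rnd_sat_hf(A,B)
+#define fRNDSATSF(A,B) rnd_sat_sf(A,B)
+#define fRNDSATQF16(A,B,C) rnd_sat_qf16(A,B,C)
+#define fRNDSATQF32(A,B,C) rnd_sat_qf32(A,B,C)
+
+#define fNEGQF16(A) negate16(A)
+#define fNEGQF32(A) negate32(A)
diff --git a/target/hexagon/mmvec/macros_auto.h b/target/hexagon/mmvec/macros_auto.h
new file mode 100644
index 000000000000..479cb225c70c
--- /dev/null
+++ b/target/hexagon/mmvec/macros_auto.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HEXAGON_MMVEC_MACROS_AUTO_H
+#define HEXAGON_MMVEC_MACROS_AUTO_H
+
+
+#include "mmvec/macros.h"
+
+#include "q6v_defines.h"
+#pragma GCC diagnostic ignored "-Wtype-limits"
+#define fDUMPQ(STR,REG) do { printf(STR ":" #REG ": 0x%016llx\n",REG.ud[0]); } while (0)
+#define fRT8NOTE()
+#define fEXPERIMENTAL()
+#define fBFLOAT()
+#define fCVI_VX_NO_TMP_LD()
+#define fNOTQ(VAL) ({mmqreg_t _ret ={0}; int _i_; for (_i_ = 0; _i_ < fVECSIZE()/64; _i_++) _ret.ud[_i_] = ~VAL.ud[_i_]; _ret;})
+#define fGETQBITS(REG,WIDTH,MASK,BITNO) ((MASK) & (REG.w[(BITNO)>>5] >> ((BITNO) & 0x1f)))
+#define fGETQBIT(REG,BITNO) fGETQBITS(REG,1,1,BITNO)
+#define fGENMASKW(QREG,IDX) (((fGETQBIT(QREG,(IDX*4+0)) ? 0xFF : 0x0) << 0) |((fGETQBIT(QREG,(IDX*4+1)) ? 0xFF : 0x0) << 8) |((fGETQBIT(QREG,(IDX*4+2)) ? 0xFF : 0x0) << 16) |((fGETQBIT(QREG,(IDX*4+3)) ? 0xFF : 0x0) << 24))
+#define fGET10BIT(COE,VAL,POS) { COE = (((((fGETUBYTE(3,VAL) >> (2 * POS)) & 3) << 8) | fGETUBYTE(POS,VAL)) << 6); COE >>= 6; }
+#define fVMAX(X,Y) (X>Y) ? X : Y
+#define fREAD_VEC(DST,IDX) (DST = READ_VREG(fMODCIRCU((IDX),5)))
+#define fREAD_ZVEC(DST,IDX) (DST = READ_ZREG(fMODCIRCU((IDX),5)))
+#define fREAD_ZVEC_WORD(DST,IDX) { mmvector_t ZReg = READ_ZREG(0); DST = ZReg.uw[IDX]; }
+#define fREAD_ZVEC_ALL(DST,N,NZ) { int __idx = 0; for (__idx = 0; __idx < NZ/N; __idx++) { memcpy(&DST[N*__idx], &THREAD2STRUCT->ZRegs[__idx], N); } }
+#define fZREGB(Z,IDX) ((size1s_t)Z[IDX])
+#define fZREGUB(Z,IDX) ((size1u_t)Z[IDX])
+#define fZREGH(Z,IDX) ((size2s_t)Z[IDX])
+#define fZREGUB(Z,IDX) ((size1u_t)Z[IDX])
+#define fGETNIBBLE(IDX,SRC) ( fSXTN(4,8,(SRC >> (4*IDX)) & 0xF) )
+#define fGETCRUMB(IDX,SRC) ( fSXTN(2,8,(SRC >> (2*IDX)) & 0x3) )
+#define fGETCRUMB_SYMMETRIC(IDX,SRC) ( (fGETCRUMB(IDX,SRC)>=0 ?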
(2-fGETCRUMB(IDX,SRC)) : fGETCRUMB(IDX,SRC) ) ) +#define fWRITE_VEC(IDX,VAR) (WRITE_VREG(fMODCIRCU((IDX),5),VAR)) +#define fGENMASKH(QREG,IDX) (((fGETQBIT(QREG,(IDX*2+0)) ? 0xFF : 0x0) << 0) |((fGETQBIT(QREG,(IDX*2+1)) ? 0xFF : 0x0) << 8)) +#define fGETMASKW(VREG,QREG,IDX) (VREG.w[IDX] & fGENMASKW((QREG),IDX)) +#define fGETMASKH(VREG,QREG,IDX) (VREG.h[IDX] & fGENMASKH((QREG),IDX)) +#define fCONDMASK8(QREG,IDX,YESVAL,NOVAL) (fGETQBIT(QREG,IDX) ? (YESVAL) : (NOVAL)) +#define fCONDMASK16(QREG,IDX,YESVAL,NOVAL) ((fGENMASKH(QREG,IDX) & (YESVAL)) | (fGENMASKH(fNOTQ(QREG),IDX) & (NOVAL))) +#define fCONDMASK32(QREG,IDX,YESVAL,NOVAL) ((fGENMASKW(QREG,IDX) & (YESVAL)) | (fGENMASKW(fNOTQ(QREG),IDX) & (NOVAL))) +#define fSETQBITS(REG,WIDTH,MASK,BITNO,VAL) do { size4u_t __TMP = (VAL); REG.w[(BITNO)>>5] &= ~((MASK) << ((BITNO) & 0x1f)); REG.w[(BITNO)>>5] |= (((__TMP) & (MASK)) << ((BITNO) & 0x1f)); } while (0) +#define fSETQBIT(REG,BITNO,VAL) fSETQBITS(REG,1,1,BITNO,VAL) +#define fVBYTES() (fVECSIZE()) +#define fVHALVES() (fVECSIZE()/2) +#define fVWORDS() (fVECSIZE()/4) +#define fVDWORDS() (fVECSIZE()/8) +#define fVALIGN(ADDR, LOG2_ALIGNMENT) ( ADDR = ADDR & ~(LOG2_ALIGNMENT-1)) +#define fVLASTBYTE(ADDR, LOG2_ALIGNMENT) ( ADDR = ADDR | (LOG2_ALIGNMENT-1)) +#define fVELEM(WIDTH) ((fVECSIZE()*8)/WIDTH) +#define fVECLOGSIZE() (MAX_VEC_SIZE_LOGBYTES) +#define fVBUF_IDX(EA) (((EA) >> fVECLOGSIZE()) & 0xFF) +#define fREAD_VBUF(IDX,WIDX) READ_VBUF(IDX,WIDX) +#define fLOG_VBUF(IDX,VAL,WIDX) LOG_VBUF(IDX,VAL,WIDX) +#define fVECSIZE() (1<VRegs_updated & (((VRegMask)1)<future_VRegs[VNUM] : mmvec_zero_vector()) +#define fV_AL_CHECK(EA,MASK) if ((EA) & (MASK)) { warn("aligning misaligned vector. PC=%08x EA=%08x",thread->Regs[REG_PC],(EA)); } +#define fSCATTER_INIT( REGION_START, LENGTH, ELEMENT_SIZE) { mem_vector_scatter_init(thread, insn, REGION_START, LENGTH, ELEMENT_SIZE); if (EXCEPTION_DETECTED) return; } +#define fGATHER_INIT( REGION_START, LENGTH, ELEMENT_SIZE) { mem_vector_gather_init(thread, insn, REGION_START, LENGTH, ELEMENT_SIZE); if (EXCEPTION_DETECTED) return; } +#ifdef CONFIG_USER_ONLY +#define fSCATTER_FINISH(OP) +#define fGATHER_FINISH() +#else +#define fSCATTER_FINISH(OP) { if (EXCEPTION_DETECTED) return; mem_vector_scatter_finish(thread, insn, OP); } +#define fGATHER_FINISH() { if (EXCEPTION_DETECTED) return; mem_vector_gather_finish(thread, insn); } +#endif +#define CHECK_VTCM_PAGE(FLAG, BASE, LENGTH, OFFSET, ALIGNMENT) { int slot = insn->slot; paddr_t pa = thread->mem_access[slot].paddr+OFFSET; pa = pa & ~(ALIGNMENT-1); FLAG = (pa < (thread->mem_access[slot].paddr+LENGTH)); } +#define COUNT_OUT_OF_BOUNDS(FLAG, SIZE) { if (!FLAG) { THREAD2STRUCT->vtcm_log.oob_access += SIZE; warn("Scatter/Gather out of bounds of region"); } } +#define fLOG_SCATTER_OP(SIZE) { thread->vtcm_log.op = 1; thread->vtcm_log.op_size = SIZE; } +#define fVLOG_VTCM_GATHER_WORD(EA,OFFSET,IDX, LEN) { GATHER_FUNCTION(EA,OFFSET,IDX, LEN, 4, IDX, 1); } +#define fVLOG_VTCM_GATHER_HALFWORD(EA,OFFSET,IDX, LEN) { GATHER_FUNCTION(EA,OFFSET,IDX, LEN, 2, IDX, 1); } +#define fVLOG_VTCM_GATHER_HALFWORD_DV(EA,OFFSET,IDX,IDX2,IDX_H, LEN) { GATHER_FUNCTION(EA,OFFSET,IDX, LEN, 2, (2*IDX2+IDX_H), 1); } +#define fVLOG_VTCM_GATHER_WORDQ(EA,OFFSET,IDX, Q, LEN) { GATHER_FUNCTION(EA,OFFSET,IDX, LEN, 4, IDX, fGETQBIT(QsV,4*IDX+i0)); } +#define fVLOG_VTCM_GATHER_HALFWORDQ(EA,OFFSET,IDX, Q, LEN) { GATHER_FUNCTION(EA,OFFSET,IDX, LEN, 2, IDX, fGETQBIT(QsV,2*IDX+i0)); } +#define fVLOG_VTCM_GATHER_HALFWORDQ_DV(EA,OFFSET,IDX,IDX2,IDX_H, Q, LEN) { 
GATHER_FUNCTION(EA,OFFSET,IDX, LEN, 2, (2*IDX2+IDX_H), fGETQBIT(QsV,2*IDX+i0)); } +#define DEBUG_LOG_ADDR(OFFSET) { if (thread->processor_ptr->arch_proc_options->mmvec_network_addr_log2) { int slot = insn->slot; paddr_t pa = thread->mem_access[slot].paddr+OFFSET; } } +//#define SCATTER_OP_WRITE_TO_MEM(TYPE) { for (int i = 0; i < mmvecx->vtcm_log.size; i+=sizeof(TYPE)) { if ( mmvecx->vtcm_log.mask.ub[i] != 0) { TYPE dst = 0; TYPE inc = 0; for(int j = 0; j < sizeof(TYPE); j++) { dst |= (sim_mem_read1(thread->system_ptr, thread->threadId, mmvecx->vtcm_log.pa[i+j]) << (8*j)); inc |= mmvecx->vtcm_log.data.ub[j+i] << (8*j); mmvecx->vtcm_log.mask.ub[j+i] = 0; mmvecx->vtcm_log.data.ub[j+i] = 0; mmvecx->vtcm_log.offsets.ub[j+i] = 0; } dst += inc; for(int j = 0; j < sizeof(TYPE); j++) { sim_mem_write1(thread->system_ptr,thread->threadId, mmvecx->vtcm_log.pa[i+j], (dst >> (8*j))& 0xFF ); } } } } +#define fVLOG_VTCM_HALFWORD(EA,OFFSET,IN,IDX, LEN) { SCATTER_FUNCTION (EA,OFFSET,IDX, LEN, 2, IDX, 1, IN); } +#define fVLOG_VTCM_WORD(EA,OFFSET,IN,IDX,LEN) { SCATTER_FUNCTION (EA,OFFSET,IDX, LEN, 4, IDX, 1, IN); } +#define fVLOG_VTCM_HALFWORDQ(EA,OFFSET,IN,IDX,Q,LEN) { SCATTER_FUNCTION (EA,OFFSET,IDX, LEN, 2, IDX, fGETQBIT(QsV,2*IDX+i0), IN); } +#define fVLOG_VTCM_WORDQ(EA,OFFSET,IN,IDX,Q,LEN) { SCATTER_FUNCTION (EA,OFFSET,IDX, LEN, 4, IDX, fGETQBIT(QsV,4*IDX+i0), IN); } +#define fVLOG_VTCM_HALFWORD_DV(EA,OFFSET,IN,IDX,IDX2,IDX_H, LEN) { SCATTER_FUNCTION (EA,OFFSET,IDX, LEN, 2, (2*IDX2+IDX_H), 1, IN); } +#define fVLOG_VTCM_HALFWORDQ_DV(EA,OFFSET,IN,IDX,Q,IDX2,IDX_H, LEN) { SCATTER_FUNCTION (EA,OFFSET,IDX, LEN, 2, (2*IDX2+IDX_H), fGETQBIT(QsV,2*IDX+i0), IN); } +#define fSTORERELEASE(EA,TYPE) { fV_AL_CHECK(EA,fVECSIZE()-1); mem_store_release(thread, insn, fVECSIZE(), EA&~(fVECSIZE()-1), EA, TYPE, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); } +#define fVFETCH_AL(EA) { fV_AL_CHECK(EA,fVECSIZE()-1); mem_fetch_vector(thread, insn, EA&~(fVECSIZE()-1), slot, fVECSIZE()); } +#define fLOADMMV_AL(EA, ALIGNMENT, LEN, DST) { fV_AL_CHECK(EA,ALIGNMENT-1); /*thread->last_pkt->double_access_vec = 0;*/ mem_load_vector_oddva(thread, 0, EA&~(ALIGNMENT-1), EA, slot, LEN, &DST.ub[0], LEN, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); } +#ifdef QEMU_GENERATE +#define fLOADMMV(EA, DST) gen_vreg_load(ctx, DST##_off, EA, true) +#else +#define fLOADMMV(EA, DST) fLOADMMV_AL(EA,fVECSIZE(),fVECSIZE(),DST) +#endif +#define fLOADMMZ(EA,DST) { mmvector_t load_vec; fV_AL_CHECK(EA,fVECSIZE()-1); mem_load_vector_oddva(thread, 0, EA&~(fVECSIZE()-1), EA, slot, fVECSIZE(), &load_vec.ub[0], fVECSIZE(), fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); int idx = (EA & 0x80)>0; DST.v[idx] = load_vec; } +#define fLOADZ_LOAD(EA,EAU,WIDTH,DST) {/* thread->last_pkt->ext_slot_cancelled = 0; thread->last_pkt->double_access_vec = 0;*/ int etm_size = ((EA % width) ==0) ? fVECSIZE() : 0; if (thread->processor_ptr->options->testgen_mode) etm_size = ((EA % width) ==0) ? 
WIDTH : 0; mem_load_vector_oddva(thread, 0, EA, EAU, slot, WIDTH, &DST.ub[0], etm_size, fUSE_LOOKUP_ADDRESS()); } +#define fELSE_CANCELZ() else { /*if (thread->last_pkt) { thread->mem_access[slot].dropped_z = 1; thread->last_pkt->ext_slot_cancelled |= (1<processor_ptr)); } +#define fLOADMMVQ(EA,DST,QVAL) do { int __i; fLOADMMV_AL(EA,fVECSIZE(),fVECSIZE(),DST); fVFOREACH(8,__i) if (!fGETQBIT(QVAL,__i)) DST.b[__i] = 0; } while (0) +#define fLOADMMVNQ(EA,DST,QVAL) do { int __i; fLOADMMV_AL(EA,fVECSIZE(),fVECSIZE(),DST); fVFOREACH(8,__i) if (fGETQBIT(QVAL,__i)) DST.b[__i] = 0; } while (0) +#define fLOADMMVU_AL(EA, ALIGNMENT, LEN, DST) { size4u_t size2 = (EA)&(ALIGNMENT-1); size4u_t size1 = LEN-size2; /*thread->last_pkt->double_access_vec = 1;*/ mem_load_vector_oddva(thread, 0, EA+size1, EA+fVECSIZE(), 1, size2, &DST.ub[size1], size2, fUSE_LOOKUP_ADDRESS()); mem_load_vector_oddva(thread, 0, EA, EA, 0, size1, &DST.ub[0], size1, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); } +#ifdef QEMU_GENERATE +#define fLOADMMVU(EA, DST) gen_vreg_load(ctx, DST##_off, EA, false) +#else +#define fLOADMMVU(EA, DST) { /*thread->last_pkt->pkt_has_vtcm_access = 0; thread->last_pkt->pkt_access_count = 0;*/ if ( (EA & (fVECSIZE()-1)) == 0) { /*thread->last_pkt->pkt_has_vmemu_access = 0; thread->last_pkt->double_access = 0;*/ fLOADMMV_AL(EA,fVECSIZE(),fVECSIZE(),DST); } else { /*thread->last_pkt->pkt_has_vmemu_access = 1; thread->last_pkt->double_access = 1;*/ fLOADMMVU_AL(EA,fVECSIZE(),fVECSIZE(),DST); } } +#endif +#define fSTOREMMV_AL(EA, ALIGNMENT, LEN, SRC) { fV_AL_CHECK(EA,ALIGNMENT-1); mem_store_vector_oddva(thread, 0, EA&~(ALIGNMENT-1), EA, slot, LEN, &SRC.ub[0], 0, 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); } +#ifdef QEMU_GENERATE +#define fSTOREMMV(EA, SRC) gen_vreg_store(ctx, EA, SRC##_off, insn->slot, true) +#else +#define fSTOREMMV(EA, SRC) fSTOREMMV_AL(EA,fVECSIZE(),fVECSIZE(),SRC) +#endif +#define fSTOREMMVQ_AL(EA, ALIGNMENT, LEN, SRC, MASK) do { mmvector_t maskvec; int i; for (i = 0; i < fVECSIZE(); i++) maskvec.ub[i] = fGETQBIT(MASK,i); mem_store_vector_oddva(thread, 0, EA&~(ALIGNMENT-1), EA, slot, LEN, &SRC.ub[0], &maskvec.ub[0], 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); } while (0) +#ifdef QEMU_GENERATE +#define fSTOREMMVQ(EA, SRC, MASK) \ + gen_vreg_masked_store(ctx, EA, SRC##_off, MASK##_off, insn->slot, false) +#else +#define fSTOREMMVQ(EA, SRC, MASK) fSTOREMMVQ_AL(EA,fVECSIZE(),fVECSIZE(),SRC,MASK) +#endif +#define fSTOREMMVNQ_AL(EA, ALIGNMENT, LEN, SRC, MASK) { mmvector_t maskvec; int i; for (i = 0; i < fVECSIZE(); i++) maskvec.ub[i] = fGETQBIT(MASK,i); fV_AL_CHECK(EA,ALIGNMENT-1); mem_store_vector_oddva(thread, 0, EA&~(ALIGNMENT-1), EA, slot, LEN, &SRC.ub[0], &maskvec.ub[0], 1, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); } +#ifdef QEMU_GENERATE +#define fSTOREMMVNQ(EA, SRC, MASK) \ + gen_vreg_masked_store(ctx, EA, SRC##_off, MASK##_off, insn->slot, true) +#else +#define fSTOREMMVNQ(EA, SRC, MASK) fSTOREMMVNQ_AL(EA,fVECSIZE(),fVECSIZE(),SRC,MASK) +#endif +#define fSTOREMMVU_AL(EA, ALIGNMENT, LEN, SRC) { size4u_t size1 = ALIGNMENT-((EA)&(ALIGNMENT-1)); size4u_t size2; if (size1>LEN) size1 = LEN; size2 = LEN-size1; mem_store_vector_oddva(thread, 0, EA+size1, EA+fVECSIZE(), 1, size2, &SRC.ub[size1], 0, 0, fUSE_LOOKUP_ADDRESS()); mem_store_vector_oddva(thread, 0, EA, EA, 0, size1, &SRC.ub[0], 0, 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); } +#ifdef QEMU_GENERATE +#define fSTOREMMVU(EA, SRC) \ + gen_vreg_store(ctx, EA, SRC##_off, insn->slot, false) 
+#else +#define fSTOREMMVU(EA, SRC) { /*thread->last_pkt->pkt_has_vtcm_access = 0; thread->last_pkt->pkt_access_count = 0;*/ if ( (EA & (fVECSIZE()-1)) == 0) { /*thread->last_pkt->double_access = 0;*/ fSTOREMMV_AL(EA,fVECSIZE(),fVECSIZE(),SRC); } else { /*thread->last_pkt->double_access = 1; thread->last_pkt->pkt_has_vmemu_access = 1;*/ fSTOREMMVU_AL(EA,fVECSIZE(),fVECSIZE(),SRC); } } +#endif +#define fSTOREMMVQU_AL(EA, ALIGNMENT, LEN, SRC, MASK) { size4u_t size1 = ALIGNMENT-((EA)&(ALIGNMENT-1)); size4u_t size2; mmvector_t maskvec; int i; for (i = 0; i < fVECSIZE(); i++) maskvec.ub[i] = fGETQBIT(MASK,i); if (size1>LEN) size1 = LEN; size2 = LEN-size1; mem_store_vector_oddva(thread, 0, EA+size1, EA+fVECSIZE(), 1, size2, &SRC.ub[size1], &maskvec.ub[size1], 0, fUSE_LOOKUP_ADDRESS()); mem_store_vector_oddva(thread, 0, EA, 0, size1, &SRC.ub[0], &maskvec.ub[0], 0, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); } +#define fSTOREMMVQU(EA, SRC, MASK) { /*thread->last_pkt->pkt_has_vtcm_access = 0; thread->last_pkt->pkt_access_count = 0;*/ if ( (EA & (fVECSIZE()-1)) == 0) { /*thread->last_pkt->double_access = 0;*/ fSTOREMMVQ_AL(EA,fVECSIZE(),fVECSIZE(),SRC,MASK); } else { /*thread->last_pkt->double_access = 1; thread->last_pkt->pkt_has_vmemu_access = 1;*/ fSTOREMMVQU_AL(EA,fVECSIZE(),fVECSIZE(),SRC,MASK); } } +#define fSTOREMMVNQU_AL(EA, ALIGNMENT, LEN, SRC, MASK) { size4u_t size1 = ALIGNMENT-((EA)&(ALIGNMENT-1)); size4u_t size2; mmvector_t maskvec; int i; for (i = 0; i < fVECSIZE(); i++) maskvec.ub[i] = fGETQBIT(MASK,i); if (size1>LEN) size1 = LEN; size2 = LEN-size1; mem_store_vector_oddva(thread, 0, EA+size1, EA+fVECSIZE(), 1, size2, &SRC.ub[size1], &maskvec.ub[size1], 1, fUSE_LOOKUP_ADDRESS()); mem_store_vector_oddva(thread, 0, EA, EA, 0, size1, &SRC.ub[0], &maskvec.ub[0], 1, fUSE_LOOKUP_ADDRESS_BY_REV(thread->processor_ptr)); } +#define fSTOREMMVNQU(EA, SRC, MASK) { /*thread->last_pkt->pkt_has_vtcm_access = 0; thread->last_pkt->pkt_access_count = 0;*/ if ( (EA & (fVECSIZE()-1)) == 0) { /*thread->last_pkt->double_access = 0;*/ fSTOREMMVNQ_AL(EA,fVECSIZE(),fVECSIZE(),SRC,MASK); } else { /*thread->last_pkt->double_access = 1; thread->last_pkt->pkt_has_vmemu_access = 1;*/ fSTOREMMVNQU_AL(EA,fVECSIZE(),fVECSIZE(),SRC,MASK); } } +#define fVFOREACH(WIDTH, VAR) for (VAR = 0; VAR < fVELEM(WIDTH); VAR++) +#define fVARRAY_ELEMENT_ACCESS(ARRAY, TYPE, INDEX) ARRAY.v[(INDEX) / (fVECSIZE()/(sizeof(ARRAY.TYPE[0])))].TYPE[(INDEX) % (fVECSIZE()/(sizeof(ARRAY.TYPE[0])))] +#define fVNEWCANCEL(REGNUM) do { THREAD2STRUCT->VRegs_select &= ~(1<<(REGNUM)); } while (0) +#define fTMPVDATA() mmvec_vtmp_data(thread) +#define fVSATDW(U,V) fVSATW( ( ( ((long long)U)<<32 ) | fZXTN(32,64,V) ) ) +#define fVASL_SATHI(U,V) fVSATW(((U)<<1) | ((V)>>31)) +#define fVUADDSAT(WIDTH,U,V) fVSATUN( WIDTH, fZXTN(WIDTH, 2*WIDTH, U) + fZXTN(WIDTH, 2*WIDTH, V)) +#define fVSADDSAT(WIDTH,U,V) ({size8s_t tmp5 = fSXTN(WIDTH, 2*WIDTH, U); size8s_t tmp6 = fSXTN(WIDTH, 2*WIDTH, V); size8s_t tmp7 = tmp5 + tmp6; fVSATN( WIDTH, tmp7); }) +#define fVUSUBSAT(WIDTH,U,V) fVSATUN( WIDTH, fZXTN(WIDTH, 2*WIDTH, U) - fZXTN(WIDTH, 2*WIDTH, V)) +#define fVSSUBSAT(WIDTH,U,V) fVSATN( WIDTH, fSXTN(WIDTH, 2*WIDTH, U) - fSXTN(WIDTH, 2*WIDTH, V)) +#define fVAVGU(WIDTH,U,V) ((fZXTN(WIDTH, 2*WIDTH, U) + fZXTN(WIDTH, 2*WIDTH, V))>>1) +#define fVAVGURND(WIDTH,U,V) ((fZXTN(WIDTH, 2*WIDTH, U) + fZXTN(WIDTH, 2*WIDTH, V)+1)>>1) +#define fVNAVGU(WIDTH,U,V) ((fZXTN(WIDTH, 2*WIDTH, U) - fZXTN(WIDTH, 2*WIDTH, V))>>1) +#define fVNAVGURNDSAT(WIDTH,U,V) 
fVSATUN(WIDTH,((fZXTN(WIDTH, 2*WIDTH, U) - fZXTN(WIDTH, 2*WIDTH, V)+1)>>1)) +#define fVAVGS(WIDTH,U,V) ((fSXTN(WIDTH, 2*WIDTH, U) + fSXTN(WIDTH, 2*WIDTH, V))>>1) +#define fVAVGSRND(WIDTH,U,V) ((fSXTN(WIDTH, 2*WIDTH, U) + fSXTN(WIDTH, 2*WIDTH, V)+1)>>1) +#define fVNAVGS(WIDTH,U,V) ((fSXTN(WIDTH, 2*WIDTH, U) - fSXTN(WIDTH, 2*WIDTH, V))>>1) +#define fVNAVGSRND(WIDTH,U,V) ((fSXTN(WIDTH, 2*WIDTH, U) - fSXTN(WIDTH, 2*WIDTH, V)+1)>>1) +#define fVNAVGSRNDSAT(WIDTH,U,V) fVSATN(WIDTH,((fSXTN(WIDTH, 2*WIDTH, U) - fSXTN(WIDTH, 2*WIDTH, V)+1)>>1)) +#define fVNOROUND(VAL,SHAMT) VAL +#define fVNOSAT(VAL) VAL +#define fVROUND(VAL,SHAMT) ((VAL) + (((SHAMT)>0)?(1LL<<((SHAMT)-1)):0)) +#define fCARRY_FROM_ADD32(A,B,C) (((fZXTN(32,64,A)+fZXTN(32,64,B)+C) >> 32) & 1) +#define fUARCH_NOTE_PUMP_4X() +#define fUARCH_NOTE_PUMP_2X() +#define UNLIKELY(X) __builtin_expect((X), 0) +#define fVDOCHKPAGECROSS(BASE,SUM) if (UNLIKELY(thread->timing_on)) { thread->mem_access[slot].check_page_crosses = 1; thread->mem_access[slot].page_cross_base = BASE; thread->mem_access[slot].page_cross_sum = SUM; } +#define fPARSEQF32(A) parse_qf32(A) +#define fRNDSATQF32(A,B,C) rnd_sat_qf32(A,B,C) +#define fPARSEQF16(A) parse_qf16(A) +#define fRNDSATQF16(A,B,C) rnd_sat_qf16(A,B,C) +#define fPARSESF(A) parse_sf(A) +#define fRNDSATSF(A,B) rnd_sat_sf(A,B) +#define fPARSEHF(A) parse_hf(A) +#define fRNDSATHF(A,B) rnd_sat_hf(A,B) +#define fRNDSATW(A,B) rnd_sat_w(A,B) +#define fRNDSATUW(A,B) rnd_sat_uw(A,B) +#define fRNDSATH(A,B) rnd_sat_h(A,B) +#define fRNDSATUH(A,B) rnd_sat_uh(A,B) +#define fRNDSATB(A,B) rnd_sat_b(A,B) +#define fRNDSATUB(A,B) rnd_sat_ub(A,B) +#define fNEGQF32(A) negate32(A) +#define fNEGQF16(A) negate16(A) +#define fNEGSF(A) negate_sf(A) +#define fNEGHF(A) negate_hf(A) +#define fCMPGT_QF32(A,B) cmpgt_qf32(A,B) +#define fCMPGT_QF16(A,B) cmpgt_qf16(A,B) +#define fCMPGT_SF(A,B) cmpgt_sf(A,B) +#define fCMPGT_HF(A,B) cmpgt_hf(A,B) +#define fCMPGT_BF(A,B) cmpgt_sf(((int)A) << 16,((int)B) << 16) +#define fCMPGT_QF32_SF(A,B) cmpgt_qf32_sf(A,B) +#define fCMPGT_QF16_HF(A,B) cmpgt_qf16_hf(A,B) +#define fMAX_QF32(X,Y) max_qf32(X,Y) +#define fMIN_QF32(X,Y) min_qf32(X,Y) +#define fMAX_QF32_SF(X,Y) max_qf32_sf(X,Y) +#define fMIN_QF32_SF(X,Y) min_qf32_sf(X,Y) +#define fMAX_QF16(X,Y) max_qf16(X,Y) +#define fMIN_QF16(X,Y) min_qf16(X,Y) +#define fMAX_QF16_HF(X,Y) max_qf16_hf(X,Y) +#define fMIN_QF16_HF(X,Y) min_qf16_hf(X,Y) +#define fMAX_SF(X,Y) max_sf(X,Y) +#define fMIN_SF(X,Y) min_sf(X,Y) +#define fMAX_HF(X,Y) max_hf(X,Y) +#define fMIN_HF(X,Y) min_hf(X,Y) + +#define fSTOREDOUBLEMMV(EA, SRC) fSTOREMMV_AL(EA,fVECSIZE(),2*fVECSIZE(),SRC) +#endif diff --git a/target/hexagon/mmvec/mmvec.h b/target/hexagon/mmvec/mmvec.h index 52d470709c02..906bf16d8258 100644 --- a/target/hexagon/mmvec/mmvec.h +++ b/target/hexagon/mmvec/mmvec.h @@ -38,6 +38,11 @@ typedef union { int16_t h[MAX_VEC_SIZE_BYTES / 2]; uint8_t ub[MAX_VEC_SIZE_BYTES / 1]; int8_t b[MAX_VEC_SIZE_BYTES / 1]; + int32_t qf32[MAX_VEC_SIZE_BYTES / 4]; + int16_t qf16[MAX_VEC_SIZE_BYTES / 2]; + int32_t sf[MAX_VEC_SIZE_BYTES / 4]; + int16_t hf[MAX_VEC_SIZE_BYTES / 2]; + int16_t bf[MAX_VEC_SIZE_BYTES / 2]; } MMVector; typedef union { diff --git a/target/hexagon/mmvec/mmvec_qfloat.c b/target/hexagon/mmvec/mmvec_qfloat.c new file mode 100644 index 000000000000..060ac4b14d8f --- /dev/null +++ b/target/hexagon/mmvec/mmvec_qfloat.c @@ -0,0 +1,2563 @@ +/* + * Copyright(c) 2019-2020 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#if !defined(__clang__) +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif + +#include "qemu/osdep.h" +#include "mmvec_qfloat.h" +#include + +#define UNUSED(var) do { (void)var; } while (0) + +//Take one's complement of the mantissa for QF32 +size4s_t negate32(size4s_t in) +{ + size4s_t out; + out = in>>8; + out = ~out; + out = (out<<8) | (in & 0xFF); + return out; +} +//Take one's complement of the mantissa for QF16 +size2s_t negate16(size2s_t in) +{ + size2s_t out; + out = in>>5; + out = ~out; + out = (out<<5) | (in & 0x1F); + return out; +} +//Change sign for SF +size4s_t negate_sf(size4s_t in) +{ + size4s_t out; + int sign; + sign = (in>>31) & 1; + sign = ~sign; + out = (sign<<31) | (in & 0x7FFFFFFF); + return out; +} +//Change sign for SF +size2s_t negate_hf(size2s_t in) +{ + size2s_t out; + int sign; + sign = (in>>15) & 1; + sign = ~sign; + out = (sign<<15) | (in & 0x7FFF); + return out; +} +unfloat parse_qf16(size2s_t in) +{ + unfloat out; + + out.sign = (in>>15) & 0x1; + + out.exp = (size1s_t)(0x00 | (in & 0x1F)); + out.exp = out.exp - BIAS_QF16; + + /*implied LSB=1*/ + size2s_t signif; + /*take signif and sign extend, add LSB=1*/ + signif= ((size4s_t)in >> 4) | 1; + + out.sig = (double)signif * epsilon_hf; + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_QF16_parse]in=%x, exp=%d, sig=%10.20f\n", in,out.exp,out.sig); + printf("[ARCH_QF16_parse]exp_d=%d, sig_d=%10.20f\n", ilogb(out.sig),ldexp(out.sig, -ilogb(out.sig))); +#endif + return out; +} +//Take signed int and generate sign, exp and ***signed sig +unfloat parse_qf32(size4s_t in) +{ + unfloat out; + + out.sign = (in>>31) & 0x1; + + out.exp = (size2s_t)(0x0000 | (in & 0xFF)); + out.exp = out.exp - BIAS_QF32; + + /*implied LSB=1*/ + size4s_t signif; + /*take signif and sign extend, add LSB=1*/ + signif= ((size8s_t)in >> 7) | 1; + + out.sig = (double)signif * epsilon; + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_QF32_parse]in=%x, exp=%d, sig=%10.20f\n", in,out.exp,out.sig); + printf("[ARCH_QF32_parse]exp_d=%d, sig_d=%10.20f\n", ilogb(out.sig),ldexp(out.sig, -ilogb(out.sig))); +#endif + return out; +} + +unfloat parse_hf(size2s_t in) +{ + unfloat out; + + out.sign = (in>>15) & 0x1; + out.exp = (size1s_t)( (0x00 | (in>>10)) & 0x1F); + + size2u_t sig; + //take signif and sign extend + sig = (size2u_t)(in & 0x3FF); + + /*implied MSB=1*/ + if(out.exp>0) + sig = (1<<10) | sig; + + out.exp = out.exp - BIAS_HF; + if(out.exp>31) & 0x1; + out.exp = (size2s_t)( (0x0000 | (in>>23)) & 0xFF); + + size4u_t sig; + //take signif and sign extend + sig = (size4u_t)(in & 0x7FFFFF); + + /*implied MSB=1*/ + if(out.exp>0) + sig = (1<<23) | sig; + + out.exp = out.exp - BIAS_SF; + + if(out.exp=0.0)? 
0:1; + +#ifndef DEBUG_MMVEC_QF + UNUSED(R_low); +#endif + + int prod_ovf=0; + if(fabs(sig)>=2.0L && sig != -2.0L) + prod_ovf = 1; + + int E_MIN=E_MIN_QF32; + int E_MAX=E_MAX_QF32; + int BIAS=BIAS_QF32; + double _epsilon=epsilon; + double _units=units; + if(ft==QF32) + { + E_MIN = E_MIN_QF32; + E_MAX = E_MAX_QF32; + BIAS = BIAS_QF32; + _epsilon = epsilon; + _units= units; + } + else if(ft==QF16) + { + E_MIN = E_MIN_QF16; + E_MAX = E_MAX_QF16; + BIAS = BIAS_QF16; + _epsilon = epsilon_hf; + _units= units_hf; + } + else if(ft==SF) + { + E_MIN = E_MIN_SF; + E_MAX = E_MAX_SF; + BIAS = BIAS_SF; + _epsilon = epsilon; + _units= units; + } + else if(ft==HF) + { + E_MIN = E_MIN_HF; + E_MAX = E_MAX_HF; + BIAS = BIAS_HF; + _epsilon = epsilon_hf; + _units= units_hf; + } + + //Set scale factor + if((exp == (E_MIN-1)) || (prod_ovf && (exp0.0) + R_low = 0.25; + else if(sig_low<0.0) + R_low = -0.25; + else + R_low = 0; + + //R2 = floor((R1+R_low)/4.0)*4.0; + //R3 = (R1+R_low) - R2; + R2 = floor(R1/4.0)*4.0; + R3 = R1 - R2; + + //Check for exp overflow/underflow + if(exp>=(E_MAX+1) || (prod_ovf && exp==E_MAX)) + { + exp_ovf=1; + } + else if(exp<=(E_MIN-2)) + { + exp_undf=1; + } + else if(exp == E_MAX)//exp=E_MAX + { + //if(R3-2.0)+sig_low<=0.0 + if((R3==0.0) && (sig_low<0.0)) + { + sig_f = sig_s + (3.0-R3-4.0)*_epsilon; + } + else if((R3<2.0) || (R3==2.0 && sig_low<=0.0)) + //if(R3<=2.0) + { + sig_f = sig_s + (1.0-R3)*_epsilon; + } + else + { + sig_f = sig_s + (3.0-R3)*_epsilon; + } + } + else if(exp == (E_MIN-1)) + { + exp_adj = 1; + if((R3==0.0) && (sig_low<0.0)) + { + sig_f = sig_s + (3.0-R3-4.0)*_epsilon; + } + else if((R3<2.0) || (R3==2.0 && sig_low<=0.0)) + //if(R3<=2.0) + { + sig_f = sig_s + (1.0-R3)*_epsilon; + } + else + { + sig_f = sig_s + (3.0-R3)*_epsilon; + } + } + else if(prod_ovf && (exp < E_MAX)) + { + exp_adj = 1; + if((R3==0.0) && (sig_low<0.0)) + { + sig_f = sig_s + (3.0-R3-4.0)*_epsilon; + } + else if((R3<2.0) || (R3==2.0 && sig_low<=0.0)) + //if(R3<=2.0) + { + sig_f = sig_s + (1.0-R3)*_epsilon; + } + else + { + sig_f = sig_s + (3.0-R3)*_epsilon; + } + } + else if(!prod_ovf) + { + if((R3==0.0) && (sig_low<0.0)) + { + sig_f = sig_s + (3.0-R3-4.0)*_epsilon; + } + else if((R3<1.5) || (R3==1.5 && sig_low<=0.0)) + //if(R3<=1.5) + { + sig_f = sig_s + (1.0-R3)*_epsilon; + } + //else if(R3<=2.5) + else if((R3<2.5) || (R3==2.5 && sig_low<=0.0)) + { + sig_f = (sig + (2.0-R3)*_epsilon)*0.5; + exp_adj=1; + } + else + { + sig_f = sig_s + (3.0-R3)*_epsilon; + } + } + //get the binary bits from the double-precision significand + //Either sig is positive or negative, IEEE double sig has magnitude + //Check for sign at the last stage and take 2's complement if negative + uint64_t sig_64_org, sig_64; + sig_64_org = *(uint64_t *)&sig_f; + sig_64 = sig_64_org; + uint32_t sig_32=0; + int32_t sig_32_out=0; + + int exp_df; + + exp_df = (sig_64_org >> 52) & 0x7FF; + exp_df = exp_df - BIAS_DF; + + if(exp_ovf) + { + exp=E_MAX+BIAS; + if(ft==QF32 || ft==SF) + sig_32 = (sign-1) & 0x7FFFFF; + else if(ft==QF16 || ft==HF) + sig_32 = (sign-1) & 0x3FF; + } + else if(exp_undf) + { + exp=E_MIN+BIAS; + if(ft==QF32 || ft==SF) + sig_32 = ((-1)*sign) & 0x7FFFFF; + else if(ft==QF16 || ft==HF) + sig_32 = ((-1)*sign) & 0x3FF; + } + else + { + exp += BIAS+exp_adj; + //Add MSB, generates 53bits (52+1) + sig_64 = (sig_64_org & 0xFFFFFFFFFFFFF) | 0x10000000000000; + //Shift out exponent 11 bits + sig_64 = sig_64<<11; + sig_64 = (exp_df>=0)? 
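+/*
+ * (What the shift below does, in our words: sig_64 holds the double's
+ * 53-bit significand with the implied integer bit restored and
+ * left-aligned; shifting by exp_df scales it so the 23 (QF32) or 10 (QF16)
+ * stored mantissa bits can be cut from the top. Note the
+ * *(uint64_t *)&sig_f pun above technically breaks strict aliasing; a
+ * well-defined equivalent sketch would be:
+ *
+ *     uint64_t bits;
+ *     memcpy(&bits, &sig_f, sizeof(bits));          // defined type pun
+ *     int e = (int)((bits >> 52) & 0x7FF) - 1023;   // unbiased exponent
+ *     uint64_t m = (bits & 0xFFFFFFFFFFFFFull) | (1ull << 52);
+ * )
+ */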
(sig_64 << exp_df):(sig_64>>abs(exp_df)); + if(ft==QF32) + { + sig_64 = sig_64 >> 41; + sig_32 = sig_64 & 0x7FFFFF; + } + else if(ft==QF16) + { + sig_64 = sig_64 >> 54; + sig_32 = sig_64 & 0x3FF; + } + + if(sign) + sig_32 = ~sig_32; + } + + sig_32_out = (sign<<23) | sig_32; + + if(ft==QF16 ||ft==HF) + sig_32_out = (sign<<10) | sig_32; + + + if( (ft ==QF16) || (ft==QF32)) { + if ((sig == 0.0) && (sig_low == 0.0)) { + exp = 0; + //printf("Squash to zero!\n"); + } + + } + + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_QF_rnd_sat]sign=%d exp_in=%d sig=%10.30f sig_low=%10.30f\n",sign, *exp_in, sig, sig_low); + printf("[ARCH_QF_rnd_sat]sig_s=%10.30f, sig_f=%10.30f\n",sig_s, sig_f); + printf("[ARCH_QF_rnd_sat]prod_ovf=%d exp_adj=%d exp_ovf=%d exp_undf=%d\n",prod_ovf,exp_adj, exp_ovf, exp_undf); + printf("[ARCH_QF_rnd_sat]sig_64_org=%lx sig_64=%lx sig_32=%x exp_df=%d exp=%d\n",sig_64_org, sig_64, sig_32, exp_df, exp); + printf("[ARCH_QF_rnd_sat]R1=%10.30f R_low=%1.128f R2=%10.30f R3=%10.30f eps=%10.30f\n",R1,R_low,R2,R3,_epsilon); + + double final = ldexp(sig_f, (exp-BIAS)); + printf("[ARCH_QF_norm] sig_f:%10.30f, exp-BIAS:%d, ldexp:%10.128f \n",sig_f, exp-BIAS, final); + printf("[ARCH_QF_norm] sig_32_out:%x, exp:%x \n",sig_32_out, exp); +#endif + + *exp_in = exp; + return sig_32_out; +} + +//size4s_t rnd_sat_qf32(int sign, int exp, double sig, double sig_low) +size4s_t rnd_sat_qf32(int exp, double sig, double sig_low) +{ + + //size4u_t sig_32=rnd_sat_qf_sig(sign, &exp, sig, sig_low, QF32); + //size4u_t sig_32=rnd_sat_qf_sig(&exp, sig, sig_low, QF32); + size4s_t sig_32=rnd_sat_qf_sig(&exp, sig, sig_low, QF32); + + size4s_t result; + //result = (sign<<31) | (sig_32 <<8) | (exp & 0xFF); + result = (sig_32 <<8) | (exp & 0xFF); + + return result; +} + + +size4u_t get_ieee_sig(int *exp, double sig, f_type ft); +size4u_t get_ieee_sig(int *exp, double sig, f_type ft) +{ + //Extract bits from double precision significand + uint64_t sig_64_org=0, sig_52=0, sig_53=0; + double value = 0.0; + int exp_d=0, exp_org=*exp; + int E_MIN; + E_MIN = (ft==SF)? E_MIN_SF: E_MIN_HF; + double _epsilon; + _epsilon = (ft==SF)? epsilon: epsilon_hf; + uint32_t sig_32=0; + size4s_t signif=0; + //int sign = (sig>=0.0)? 0:1; + + value = ldexp(sig, exp_org); + + sig_64_org = *(uint64_t *)&value; + exp_d = (sig_64_org >> 52) & 0x7FF; + exp_d = exp_d - BIAS_DF; + sig_52 = (sig_64_org & 0xFFFFFFFFFFFFF); + sig_53 = sig_52 | 0x10000000000000; + + //Check if exp is one less than the MIN + //shifting right the excess amount of bits from E_MIN + int shift = E_MIN - exp_d; + + int lsb =0; + int rem =0; + int sticky =0; + int sig_f =0; +#ifndef DEBUG_MMVEC_QF + UNUSED(lsb); + UNUSED(rem); + UNUSED(sticky); + UNUSED(sig_f); + UNUSED(_epsilon); +#endif + + if(exp_d <= (E_MIN-1)) + { + sig_53 = sig_53 >> shift; + } + + if(shift >=53) + sig_53=0; + + double R1, R2, R3; + if(ft==SF) + { + signif = sig_53 >> 29; + sig_32 = signif & 0x7FFFFF; + + lsb = signif & 1; + rem = (sig_53 >>28) & 1; + sticky = (sig_53 & 0xFFFFFFF)? 1:0; + + R1 = sig_53/pow(2,29); + R2 = floor(R1/2.0)*2; + R3 = R1 - R2; + + if(fabs(value) >= SF_MAX) + { + //sig_32 = (1-sign)*0x7FFFFF; + sig_32 = 0x7FFFFF; + } + else if((R3>0.5 && R3<1.0) || (R3>=1.5)) + { + if(sig_32 == 0x7FFFFF) + { + sig_32 = 0; + exp_d = exp_d +1; + } + else + sig_32 = sig_32 +1; + } + sig_f = 0x800000 | (sig_32 & 0x7FFFFF); + } + else + { + signif = sig_53 >> 42; + sig_32 = signif & 0x3FF; + + lsb = signif & 1; + rem = (sig_53 >> 41) & 1; + sticky = (sig_53 & 0x1FFFFFFFFFF)? 
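+/*
+ * (lsb/rem/sticky here and the R3 arithmetic below are two spellings of
+ * IEEE round-to-nearest-even. With frac = the discarded fraction of an ulp:
+ *
+ *     frac <  0.5            -> truncate
+ *     frac >  0.5            -> increment
+ *     frac == 0.5, lsb == 0  -> truncate  (tie: already even)
+ *     frac == 0.5, lsb == 1  -> increment (tie: round to even)
+ *
+ * R3 = R1 mod 2 carries lsb in its integer part and frac in its fraction,
+ * so "(R3>0.5 && R3<1.0) || (R3>=1.5)" selects exactly the increment rows.)
+ */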
1:0; + + R1 = sig_53/pow(2,42); + R2 = floor(R1/2.0)*2; + R3 = R1 - R2; + + //if((rem==1 && sticky==1) || (lsb==1 && rem==1)) + if(fabs(value) >= HF_MAX) + { + //sig_32 = (1-sign)*0x3FF; + sig_32 = 0x3FF; + } + else if((R3>0.5 && R3<1.0) || (R3>=1.5)) + { + if(sig_32 == 0x3FF) + { + sig_32 = 0; + exp_d = exp_d +1; + } + else + sig_32 = sig_32 +1; + } + sig_f = 0x400 | (sig_32 & 0x3FF); + + } + + if(sig ==0.0 && exp_org == (E_MIN-1)) + { + sig_64_org = 0; + exp_d = 0; + sig_32=0; + sig_f =0; + } + *exp = exp_d; + + + +#ifdef DEBUG_MMVEC_QF + int sign = (sig>=0.0)? 0: 1; + double param = (double)sig_f*_epsilon; + if(sign) param = (-1.0)*param; + int exp_f = (exp_d<=E_MIN-1)? E_MIN: exp_d; + double final = ldexp(param, exp_f); + int exp_1 = (value != 0.0)? ilogb(value): 0; + int exp_2 = (exp_1 > E_MIN)? exp_1: E_MIN; + double sig_1 = ldexp(value, exp_1-exp_2); + + printf("[IEEE_sig]exp_1=%d, exp_2=%d, sig_1=%10.20f\n",exp_1,exp_2,sig_1); + printf("[IEEE_sig]exp_org=%d, sig=%10.20f, value=%10.20f, shift=%d\n",exp_org, sig, value, shift); + printf("[IEEE_sig]sign=%d exp_d=%d sig_64_org=%lx sig_52=%lx sig_53=%lx sig_32=%x signif=%x sig_f=%x\n",sign, exp_d, sig_64_org, sig_52, sig_53, sig_32, signif, sig_f); + printf("[IEEE_sig]lsb=%d, rem=%d, sticky=%d\n",lsb, rem, sticky); + printf("[IEEE_sig] param:%10.20f, exp_d:%d, exp_f:%d, ldexp:%10.20f \n",param, exp_d, exp_f, final); + printf("[IEEE_sig]R1=%lf, R2=%lf, R3=%lf\n",R1, R2, R3); +#endif + + return sig_32; +} + +size2s_t rnd_sat_hf_rint(int exp_in, double sig_in); +size2s_t rnd_sat_hf_rint(int exp_in, double sig_in) +{ + // normalize and decompose again limiting to EMIN of target + double val=0.0; + double den=0.0; + double sig=0.0; + double mant=0.0; + int exp=0, exp_d=0, exp_ub=0; + size2s_t result=0; + + val = ldexp(sig_in, exp); // normalize - convert to simple float (double) + exp_d = (val != 0.0)? ilogb(val): 0; + exp_ub = (exp_d> E_MIN_HF)? exp_d: E_MIN_HF; // EMIN=-14 for fp16 + den = ldexp(val, -exp_ub); // denormalized if we hit EMIN + int sign = (sig<0)? 1:0; + sig = fabs(den); + // round to final mantissa + mant = rint(ldexp(sig, FRAC_HF)); // FRAC=10 for fp16; RNE + // post-round exponent adjust + exp = exp_ub + BIAS_HF; // BIAS=15 for fp16 + // -1 for -1.0 (denorm) or +1 for >=2.0 (round up to next exponent) + int exp_mant = (mant != 0.0)? ilogb(mant): 0; + int exp_adj = (exp_mant-FRAC_HF > -1)? (exp_mant - FRAC_HF): -1; + exp = exp - exp_adj; + // overflow + if (exp>E_MAX_HF) { // +16 for fp16 w/o inf/nan + exp = E_MAX_HF; + mant = -1; + } + // final result// better to use a struct for fp16 instead +// result = (mant&((1<=0.0)? 0:1; + //size4u_t sig_32=0;//rnd_sat_ieee_sig(&exp, sig, sig_low, SF); + size4u_t sig_32 = get_ieee_sig(&exp, sig, HF); + + //exp is unbiased + size2s_t result; + if(exp==(E_MIN_HF-1) && sig==0.0) + { + result = 0; + } + else if(exp > E_MAX_HF) + { + result = (sign<<15) | (0x1F << 10) | 0x3FF; + } + //else if((exp < E_MIN_HF-11) ||((exp == E_MIN_HF-11) && (sig_32 ==0))) + //{ + // result = (sign<<15); + //} + else + { + exp = exp + BIAS_HF; + if(exp < 0) + exp = 0; + else if(exp > 31) + exp = 31; + result = (sign<<15) | ((exp & 0x1F) << 10) | sig_32; + } + + + return result; +} + + +//Take signed sig, produce normalized ieee sf output +size4s_t rnd_sat_sf(int exp, double sig) +{ + + int sign = (sig>=0.0)? 
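+/*
+ * (Assembly sketch for the packing below, with an example value of ours:
+ * for exp = 0, sig = 1.0, get_ieee_sig() leaves exp at 0 and returns a
+ * zero fraction, so the packed word is
+ *
+ *     (0 << 31) | ((0 + BIAS_SF) << 23) | 0  ==  0x3F800000  ==  1.0f
+ *
+ * The clamp below only keeps the biased exponent inside its 8-bit field.)
+ */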
0: 1; + size4u_t sig_32 = get_ieee_sig(&exp, sig, SF); + + size4s_t result; + + if(exp==0 && sig==0.0) + { + result = 0; + } + else + { + exp = exp + BIAS_SF; + if(exp < 0) + exp = 0; + else if(exp > 255) + exp = 255; + result = (sign<<31) | ((exp & 0xFF)<< 23) | (sig_32 & 0x7FFFFF); + } + + return result; +} + +//size2s_t rnd_sat_qf16(int sign, int exp_ab, double sig, double sig_low) +size2s_t rnd_sat_qf16(int exp_ab, double sig, double sig_low) +{ + int exp=exp_ab; + + + //size4u_t sig_32=rnd_sat_qf_sig(&exp, sig, sig_low, QF16); + //printf("sig low=%f sig=%f\n", sig, sig_low); + size4s_t sig_32=rnd_sat_qf_sig(&exp, sig, sig_low, QF16); + + size2s_t result; + result = (sig_32<<5) | (exp & 0x1F); + //result = (sign_ab<<15) | (sig_16<<5) | (exp_ab & 0x1F); + + return result; +} + +size4s_t mpy_qf32(size4s_t in_a, size4s_t in_b ) { + size2s_t exp; + double sig; + + unfloat a, b; + + //Get double precision significands and unbiased exp + a = parse_qf32(in_a); + b = parse_qf32(in_b); + + //Unbiased: after removing bias + exp = a.exp + b.exp; + sig = a.sig * b.sig; + + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_QF32_pre_rnd] a.sig:%10.20f, b.sig:%10.20f, sig:%10.20f, ilogb(sig):%d, exp:%d\n", a.sig, b.sig, sig, ilogb(sig), exp); +#endif + + size4s_t result; + //result = rnd_sat_qf32(sign, exp_ab, sig_ab, 0.0); + result = rnd_sat_qf32(exp, sig, 0.0); + + return result; +} + +size4s_t mpy_qf32_sf(size4s_t in_a, size4s_t in_b ) { + int sign; + size2s_t exp; + double sig; + unfloat a, b; + + //Get double precision significands and unbiased exp + a = parse_sf(in_a); + b = parse_sf(in_b); + + //Unbiased: after removing bias + sign = a.sign ^ b.sign; + exp = a.exp + b.exp; + sig = a.sig * b.sig; + + size4s_t result; + result = rnd_sat_qf32(exp, sig, 0.0); + if(sign) result = negate32(result); + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_SF_parse]sign:%d, a.sig:%10.20f, b.sig:%10.20f, sig:%10.20f exp:%d\n",sign, a.sig, b.sig, sig, exp); +#endif + return result; +} + +size4s_t mpy_qf32_mix_sf(size4s_t in_a, size4s_t in_b ) { + size2s_t exp; + double sig; + unfloat a, b; + + //Get double precision significands and unbiased exp + a = parse_qf32(in_a); + b = parse_sf(in_b); + + //Unbiased: after removing bias + exp = a.exp + b.exp; + sig = a.sig * b.sig; + + size4s_t result; + result = rnd_sat_qf32(exp, sig, 0.0); + if(b.sign) result = negate32(result); + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_SF_parse]a.sign:%d, a.sig:%10.20f, b.sign:%d, b.sig:%10.20f, sig:%10.20f exp:%d\n",a.sign, a.sig, b.sign, b.sig, sig, exp); +#endif + return result; +} + +//QF32 output out of two QF16 muls +size8s_t mpy_qf32_qf16(size4s_t in_a, size4s_t in_b ) { + + double sig_0, sig_1; + int exp_0, exp_1; + + unfloat u0,u1,v0,v1; + + u0 = parse_qf16((in_a & 0xFFFF)); + u1 = parse_qf16(((in_a>>16) & 0xFFFF)); + v0 = parse_qf16((in_b & 0xFFFF)); + v1 = parse_qf16(((in_b>>16) & 0xFFFF)); + + //Unbiased: after removing bias + exp_0 = u0.exp + v0.exp; + exp_1 = u1.exp + v1.exp; + sig_0 = u0.sig * v0.sig; + sig_1 = u1.sig * v1.sig; + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_QF32_QF16_parse]u0.exp:%d, u0.sig:%10.20f, v0.exp:%d, v0.sig:%10.20f, sig_0:%10.20f exp_0:%d\n", u0.exp, u0.sig, v0.exp, v0.sig, sig_0, exp_0); + printf("[ARCH_QF32_QF16_parse]u1.exp:%d, u1.sig:%10.20f, v1.exp:%d, v1.sig:%10.20f, sig_1:%10.20f exp_1:%d\n", u1.exp, u1.sig, v1.exp, v1.sig, sig_1, exp_1); +#endif + + size4s_t result_0, result_1; + size8s_t result; + result_0 = rnd_sat_qf32(exp_0, sig_0, 0.0); + result_1 = rnd_sat_qf32(exp_1, sig_1, 0.0); + + result = 
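+/*
+ * (Lane layout of this widening multiply, as packed just below: each
+ * 32-bit input carries two qf16 lanes and the result carries two qf32
+ * lanes,
+ *
+ *     in_a = [ a1 | a0 ],  in_b = [ b1 | b0 ]      (qf16 halves)
+ *     out  = [ qf32(a1*b1) | qf32(a0*b0) ]         (hi | lo 32 bits)
+ *
+ * so consumers split the pair back apart with >>32 and & 0xFFFFFFFF, as
+ * the conv_*_qf32 helpers later in this file do.)
+ */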
((size8s_t)result_1 <<32) | (result_0 &0xFFFFFFFF); +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_QF32_QF16_norm]result_1:%x, result_0:%x, result:%llx\n",result_1, result_0, result); +#endif + + return result; +} + +//QF32 output out of two HF muls +size8s_t mpy_qf32_hf(size4s_t in_a, size4s_t in_b ) { + + double sig_0, sig_1; + int exp_0, exp_1; + + unfloat u0,u1,v0,v1; + + u0 = parse_hf((in_a & 0xFFFF)); + u1 = parse_hf(((in_a>>16) & 0xFFFF)); + v0 = parse_hf((in_b & 0xFFFF)); + v1 = parse_hf(((in_b>>16) & 0xFFFF)); + + //Unbiased: after removing bias + exp_0 = u0.exp + v0.exp; + exp_1 = u1.exp + v1.exp; + sig_0 = u0.sig * v0.sig; + sig_1 = u1.sig * v1.sig; + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_QF32_HF_parse]u0.exp:%d, u0.sig:%10.20f, v0.exp:%d, v0.sig:%10.20f, sig_0:%10.20f exp_0:%d\n", u0.exp, u0.sig, v0.exp, v0.sig, sig_0, exp_0); + printf("[ARCH_QF32_HF_parse]u1.exp:%d, u1.sig:%10.20f, v1.exp:%d, v1.sig:%10.20f, sig_1:%10.20f exp_1:%d\n", u1.exp, u1.sig, v1.exp, v1.sig, sig_1, exp_1); +#endif + size4s_t result_0, result_1; + size8s_t result; + result_0 = rnd_sat_qf32(exp_0, sig_0, 0.0); + result_1 = rnd_sat_qf32(exp_1, sig_1, 0.0); + + if(u0.sign ^ v0.sign) + result_0 = negate32(result_0); + + if(u1.sign ^ v1.sign) + result_1 = negate32(result_1); + + result = ((size8s_t)result_1 <<32) | (result_0 & 0xFFFFFFFF); +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_QF32_HF_norm]result_1:%x, result_0:%x, result:%llx\n",result_1, result_0, result); +#endif + + return result; +} + +//QF32 output out of mix of QF16 and HF muls +size8s_t mpy_qf32_mix_hf(size4s_t in_a, size4s_t in_b ) { + + double sig_0, sig_1; + int exp_0, exp_1; + + unfloat u0,u1,v0,v1; + + u0 = parse_qf16((in_a & 0xFFFF)); + u1 = parse_qf16(((in_a>>16) & 0xFFFF)); + v0 = parse_hf((in_b & 0xFFFF)); + v1 = parse_hf(((in_b>>16) & 0xFFFF)); + + //Unbiased: after removing bias + exp_0 = u0.exp + v0.exp; + exp_1 = u1.exp + v1.exp; + sig_0 = u0.sig * v0.sig; + sig_1 = u1.sig * v1.sig; + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_QF32_mix_hf_parse]u0.exp:%d, u0.sig:%10.20f, v0.exp:%d, v0.sig:%10.20f, sig_0:%10.20f exp_0:%d\n", u0.exp, u0.sig, v0.exp, v0.sig, sig_0, exp_0); + printf("[ARCH_QF32_mix_hf_parse]u1.exp:%d, u1.sig:%10.20f, v1.exp:%d, v1.sig:%10.20f, sig_1:%10.20f exp_1:%d\n", u1.exp, u1.sig, v1.exp, v1.sig, sig_1, exp_1); +#endif + + size4s_t result_0, result_1; + size8s_t result; + result_0 = rnd_sat_qf32(exp_0, sig_0, 0.0); + result_1 = rnd_sat_qf32(exp_1, sig_1, 0.0); + + if(v0.sign) + result_0 = negate32(result_0); + if(v1.sign) + result_1 = negate32(result_1); + + result = ((size8s_t)result_1 <<32) | (result_0 & 0xFFFFFFFF); + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_QF32_mix_hf_norm]result_1:%x, result_0:%x, result:%llx\n",result_1, result_0, result); +#endif + + return result; +} + +/* VMPY_QF16 */ +//ITERATOR_INSN_MPY_SLOT(16,vmpy_qf16,"Vd32.qf16=vmpy(Vu32.qf16,Vv32.qf16)", +//"Vector multiply of qf16 format", +size2s_t mpy_qf16(size2s_t in_a, size2s_t in_b ) { + size1s_t exp; + double sig; + + unfloat a, b; + + //Get double precision significands and unbiased exp + a = parse_qf16(in_a); + b = parse_qf16(in_b); + + //Unbiased: after removing bias + exp = a.exp + b.exp; + sig = a.sig * b.sig; + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_QF16_parse] a.exp:%d, a.sig:%10.20f, b.exp:%d, b.sig:%10.20f, sig:%10.20f exp:%d\n", a.exp, a.sig, b.exp, b.sig, sig, exp); +#endif + + size2s_t result; + result = rnd_sat_qf16(exp, sig, 0.0); + + return result; +} + +size2s_t mpy_qf16_hf(size2s_t in_a, size2s_t in_b ) { + int sign; + size2s_t exp; + double 
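+/*
+ * (Sign handling differs between the two parse families: parse_hf() and
+ * parse_sf() return a magnitude-only .sig with the sign kept in .sign,
+ * while parse_qf16()/parse_qf32() fold the sign into the signed
+ * significand. Hence the IEEE-input multiplies, like this one, compute
+ * sign = a.sign ^ b.sign and apply it after rounding; conceptually:
+ *
+ *     r = rnd_sat_qf16(a.exp + b.exp, a.sig * b.sig, 0.0);
+ *     if (a.sign ^ b.sign) r = negate16(r);
+ * )
+ */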
sig; + + unfloat a, b; + + //Get double precision significands and unbiased exp + a = parse_hf(in_a); + b = parse_hf(in_b); + + //Unbiased: after removing bias + exp = a.exp + b.exp; + sig = a.sig * b.sig; + sign = a.sign^b.sign; + + size2s_t result; + result = rnd_sat_qf16(exp, sig, 0.0); + if(sign) result = negate16(result); +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_HF_parse]a.exp:%d, a.sig:%10.20f, b.exp:%d, b.sig:%10.20f, sig:%10.20f exp:%d\n",a.exp, a.sig, b.exp, b.sig, sig, exp); +#endif + + return result; +} + +size2s_t mpy_qf16_mix_hf(size2s_t in_a, size2s_t in_b ) { + size2s_t exp; + double sig; + unfloat a, b; + + //Get double precision significands and unbiased exp + a = parse_qf16(in_a); + b = parse_hf(in_b); + + //Unbiased: after removing bias + exp = a.exp + b.exp; + sig = a.sig * b.sig; + + size2s_t result; + result = rnd_sat_qf16(exp, sig, 0.0); + if(b.sign) result = negate16(result); +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_HF_parse]a.exp:%d, a.sig:%10.20f, b.exp:%d, b.sig:%10.20f, sig:%10.20f exp:%d\n",a.exp, a.sig, b.exp, b.sig, sig, exp); +#endif + + return result; +} + +size4s_t add_qf32(size4s_t in_a, size4s_t in_b ) { + size2s_t exp_ab; + + unfloat a, b; + + //Get double precision significands + a = parse_qf32(in_a); + b = parse_qf32(in_b); + + if(a.exp>b.exp){ + exp_ab = a.exp+((a.sig==0.0)? (-(FRAC_SF+1)):ilogb(a.sig)); + if(exp_abb.exp) ? ((sig_a-sig_ab)+sig_b) : ((sig_b-sig_ab)+sig_a); + //sig_low = (b.sign)? (-1.0*epsilon): epsilon; + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_add_qf32] a.exp:%d, b.exp:%d, exp_ab:%d, ilogb(a.sig):%d, ilogb(b.sig):%d\n", a.exp,b.exp,exp_ab, ilogb(a.sig), ilogb(b.sig)); + printf("[ARCH_add_qf32] a.sig:%10.30f, b.sig:%10.30f, sig_a:%10.30f, sig_b:%1.128f, sig_ab:%1.128f, sig_a-sig_ab:%1.128f, sig_low:%1.128f\n", a.sig, b.sig, sig_a, sig_b, sig_ab, sig_a-sig_ab,sig_low); +#endif + + size4s_t result; + + result = rnd_sat_qf32(exp_ab, sig_ab, sig_low); + + return result; +} + + +size4s_t add_sf(size4s_t in_a, size4s_t in_b ) { + size2s_t exp_ab; + + unfloat a, b; + + //Get double precision significands + a = parse_sf(in_a); + b = parse_sf(in_b); + + if(a.exp>b.exp){ + exp_ab = a.exp+((a.sig==0.0)? (-(FRAC_SF+1)):ilogb(a.sig)); + if(exp_abb.exp) ? ((sig_a-sig_ab)+sig_b) : ((sig_b-sig_ab)+sig_a); + } + else if(a.sign==0 && b.sign==1) + { + sig_ab = sig_a - sig_b; + sig_low = (a.exp>b.exp) ? ((sig_a-sig_ab)-sig_b) : (sig_a -(sig_b+sig_ab)); + } + else// if(a.sign==1 && b.sign==0) + { + sig_ab = sig_b - sig_a; + sig_low = (b.exp>a.exp) ? ((sig_b-sig_ab)-sig_a) : (sig_b -(sig_a+sig_ab)); + } + + size4s_t result; + result = rnd_sat_qf32(exp_ab, sig_ab, sig_low); + + if((a.sign==1) && (b.sign== 1)) + result = negate32(result); + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_add_sf] a.exp:%d, b.exp:%d, exp_ab:%d, ilogb(a.sig):%d, ilogb(b.sig):%d\n", a.exp,b.exp,exp_ab, ilogb(a.sig), ilogb(b.sig)); + printf("[ARCH_add_sf] a.sig:%10.30f, b.sig:%10.30f, sig_a:%10.30f, sig_b:%1.128f, sig_ab:%1.128f, sig_b-sig_ab:%1.128f, sig_low:%1.128f\n", a.sig, b.sig, sig_a, sig_b, sig_ab, sig_b-sig_ab,sig_low); + printf("[ARCH_add_sf] result:%x \n\n", result); +#endif + + + return result; +} + +size4s_t add_qf32_mix(size4s_t in_a, size4s_t in_b ) { + int exp_ab; + + unfloat a, b; + + //Get double precision significands + a = parse_qf32(in_a); + b = parse_sf(in_b); + + if(b.sign) b.sig = (-1.0)*b.sig; + + if(a.exp>b.exp){ + exp_ab = a.exp+((a.sig==0.0)? (-(FRAC_SF+1)):ilogb(a.sig)); + if(exp_abb.exp) ? ((sig_a-sig_ab)+sig_b) : ((sig_b-sig_ab)+sig_a); + //sig_low = (b.sign)? 
(-1.0*epsilon): epsilon; + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_add_qf32_mix] a.exp:%d, b.exp:%d, exp_ab:%d, ilogb(a.sig):%d, ilogb(b.sig):%d\n", a.exp,b.exp,exp_ab, ilogb(a.sig), ilogb(b.sig)); + printf("[ARCH_add_qf32_mix] a.sig:%10.30f, b.sig:%10.30f, sig_a:%10.30f, sig_b:%1.128f, sig_ab:%1.128f, sig_a-sig_ab:%1.128f, sig_low:%1.128f\n", a.sig, b.sig, sig_a, sig_b, sig_ab, sig_a-sig_ab,sig_low); +#endif + + size4s_t result; + + result = rnd_sat_qf32(exp_ab, sig_ab, sig_low); + + return result; +} + +size4s_t sub_qf32(size4s_t in_a, size4s_t in_b ) { + size2s_t exp_ab; + + unfloat a, b; + + //Get double precision significands + a = parse_qf32(in_a); + b = parse_qf32(in_b); + + if(a.exp>b.exp){ + exp_ab = a.exp+((a.sig==0.0)? (-(FRAC_SF+1)):ilogb(a.sig)); + if(exp_abb.exp) ? ((sig_a-sig_ab)-sig_b) : (sig_a -(sig_b+sig_ab)); + //sig_low = (b.sign)? (-1.0*epsilon): epsilon; + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_sub_qf32] a.exp:%d, b.exp:%d, exp_ab:%d, ilogb(a.sig):%d, ilogb(b.sig):%d\n", a.exp,b.exp,exp_ab, ilogb(a.sig), ilogb(b.sig)); + printf("[ARCH_sub_qf32] a.sig:%10.30f, b.sig:%10.30f, sig_a:%10.30f, sig_b:%1.128f, sig_ab:%1.128f, sig_a-sig_ab:%1.128f, sig_low:%1.128f\n", a.sig, b.sig, sig_a, sig_b, sig_ab, sig_a-sig_ab,sig_low); + printf("[ARCH_sub_qf32] a:%10.30f, a_adj:%10.30f, fabs(sig_b):%f\n", ldexp(a.sig, a.exp), ldexp(sig_a, exp_ab), fabs(sig_b)); +#endif + + size4s_t result; + + result = rnd_sat_qf32(exp_ab, sig_ab, sig_low); + + return result; +} + +size4s_t sub_sf(size4s_t in_a, size4s_t in_b ) { + size2s_t exp_ab; + unfloat a, b; + + //Get double precision significands + a = parse_sf(in_a); + b = parse_sf(in_b); + + if(a.exp>b.exp){ + exp_ab = a.exp+((a.sig==0.0)? (-(FRAC_SF+1)):ilogb(a.sig)); + if(exp_abb.exp) ? ((sig_a-sig_ab)-sig_b) : (sig_a -(sig_b+sig_ab)); + } + else if(a.sign ^ b.sign) + { + sig_ab = sig_a + sig_b; + sig_low = (a.exp>b.exp) ? ((sig_a-sig_ab)+sig_b) : ((sig_b-sig_ab)+sig_a); + } + else// if(a.sign && b.sign) + { + sig_ab = sig_b - sig_a; + sig_low = (b.exp>a.exp) ? ((sig_b-sig_ab)-sig_a) : (sig_b -(sig_a+sig_ab)); + } + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_sub_sf] a.exp:%d, b.exp:%d, exp_ab:%d, ilogb(a.sig):%d, ilogb(b.sig):%d\n", a.exp,b.exp,exp_ab, ilogb(a.sig), ilogb(b.sig)); + printf("[ARCH_sub_sf] a.sig:%10.30f, b.sig:%10.30f, sig_a:%10.30f, sig_b:%1.128f, sig_ab:%1.128f, sig_b-sig_ab:%1.128f, sig_low:%1.128f\n", a.sig, b.sig, sig_a, sig_b, sig_ab, sig_b-sig_ab,sig_low); +#endif + + size4s_t result; + + result = rnd_sat_qf32(exp_ab, sig_ab, sig_low); + + if((a.sign==1) && (b.sign==0)) + result = negate32(result); + + return result; +} + +size4s_t sub_qf32_mix(size4s_t in_a, size4s_t in_b ) { + size2s_t exp_ab; + + unfloat a, b; + + //Get double precision significands + a = parse_qf32(in_a); + b = parse_sf(in_b); + + if(b.sign) b.sig = (-1.0)*b.sig; + + if(a.exp>b.exp){ + exp_ab = a.exp+((a.sig==0.0)? (-(FRAC_SF+1)):ilogb(a.sig)); + if(exp_abb.exp) ? ((sig_ab-sig_a)-sig_b) : ((sig_ab-sig_b)-sig_a); + //sig_low = (a.exp>b.exp) ? ((sig_ab-sig_a)+sig_b) : (sig_a-(sig_b+sig_ab)); + sig_low = (a.exp>b.exp) ? 
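+/*
+ * (This sig_low term is the classic compensated, Dekker/Knuth two-sum,
+ * error word: given the rounded result sig_ab, re-subtracting recovers
+ * exactly the low-order bits the addition or subtraction discarded, and
+ * rnd_sat_qf32() folds them into its final rounding decision. A
+ * standalone sketch of the idea, assuming |a| >= |b| as the exponent
+ * test here arranges:
+ *
+ *     double s   = a + b;        // rounded sum
+ *     double err = (a - s) + b;  // exactly what rounding discarded
+ * )
+ */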
((sig_a-sig_ab)-sig_b) : (sig_a -(sig_b+sig_ab)); + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_sub_qf32_mix] a.exp:%d, b.exp:%d, exp_ab:%d, ilogb(a.sig):%d, ilogb(b.sig):%d\n", a.exp,b.exp,exp_ab, ilogb(a.sig), ilogb(b.sig)); + printf("[ARCH_sub_qf32_mix] a.sig:%10.30f, b.sig:%10.30f, sig_a:%10.30f, sig_b:%1.128f, sig_ab:%1.128f, sig_a-sig_ab:%1.128f, sig_low:%1.128f\n", a.sig, b.sig, sig_a, sig_b, sig_ab, sig_a-sig_ab,sig_low); +#endif + + size4s_t result; + + result = rnd_sat_qf32(exp_ab, sig_ab, sig_low); + + return result; +} +//add_qf16 +size2s_t add_qf16(size2s_t in_a, size2s_t in_b ) { + size1s_t exp_ab; + unfloat a, b; + + //Get double precision significands + a = parse_qf16(in_a); + b = parse_qf16(in_b); + + if(a.exp>b.exp){ + exp_ab = a.exp+((a.sig==0.0)? (-(FRAC_HF+1)):ilogb(a.sig)); + if(exp_abb.exp) ? ((sig_a-sig_ab)+sig_b) : ((sig_b-sig_ab)+sig_a); + //sig_low = (b.sign)? (-1.0*epsilon): epsilon; + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_add_qf16] a.exp:%d, b.exp:%d, exp_ab:%d, ilogb(a.sig):%d, ilogb(b.sig):%d\n", a.exp,b.exp,exp_ab, ilogb(a.sig), ilogb(b.sig)); + printf("[ARCH_add_qf16] a.sig:%10.30f, b.sig:%10.30f, sig_a:%10.30f, sig_b:%1.128f, sig_ab:%1.128f, sig_a-sig_ab:%1.128f, sig_low:%1.128f\n", a.sig, b.sig, sig_a, sig_b, sig_ab, sig_a-sig_ab,sig_low); +#endif + + size2s_t result; + + result = rnd_sat_qf16(exp_ab, sig_ab, sig_low); + + return result; +} + +size2s_t add_hf(size2s_t in_a, size2s_t in_b ) { + size1s_t exp_ab; + unfloat a, b; + + //Get double precision significands + a = parse_hf(in_a); + b = parse_hf(in_b); + + if(a.exp>b.exp){ + exp_ab = a.exp+((a.sig==0.0)? (-(FRAC_HF+1)):ilogb(a.sig)); + if(exp_abb.exp) ? ((sig_a-sig_ab)+sig_b) : ((sig_b-sig_ab)+sig_a); + } + else if(a.sign==0 && b.sign==1) + { + sig_ab = sig_a - sig_b; + sig_low = (a.exp>b.exp) ? ((sig_a-sig_ab)-sig_b) : (sig_a -(sig_b+sig_ab)); + } + else// if(a.sign==1 && b.sign==0) + { + sig_ab = sig_b - sig_a; + sig_low = (b.exp>a.exp) ? ((sig_b-sig_ab)-sig_a) : (sig_b -(sig_a+sig_ab)); + } + + size2s_t result; + + result = rnd_sat_qf16(exp_ab, sig_ab, sig_low); + if((a.sign==1) && (b.sign== 1)) + result = negate16(result); + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_add_hf] a.exp:%d, b.exp:%d, exp_ab:%d, ilogb(a.sig):%d, ilogb(b.sig):%d\n", a.exp,b.exp,exp_ab, ilogb(a.sig), ilogb(b.sig)); + printf("[ARCH_add_hf] a.sig:%10.30f, b.sig:%10.30f, sig_a:%10.30f, sig_b:%1.128f, sig_ab:%1.128f, sig_b-sig_ab:%1.128f, sig_low:%1.128f\n", a.sig, b.sig, sig_a, sig_b, sig_ab, sig_b-sig_ab,sig_low); + printf("[ARCH_add_sf] result:%x \n\n", result); +#endif + + + return result; +} + +size2s_t add_qf16_mix(size2s_t in_a, size2s_t in_b ) { + size1s_t exp_ab; + unfloat a, b; + + //Get double precision significands + a = parse_qf16(in_a); + b = parse_hf(in_b); + + if(b.sign) b.sig = (-1.0)*b.sig; + + if(a.exp>b.exp){ + exp_ab = a.exp+((a.sig==0.0)? (-(FRAC_HF+1)):ilogb(a.sig)); + if(exp_abb.exp) ? ((sig_a-sig_ab)+sig_b) : ((sig_b-sig_ab)+sig_a); + //sig_low = (b.sign)? 
(-1.0*epsilon): epsilon; + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_add_qf16_mix] a.exp:%d, b.exp:%d, exp_ab:%d, ilogb(a.sig):%d, ilogb(b.sig):%d\n", a.exp,b.exp,exp_ab, ilogb(a.sig), ilogb(b.sig)); + printf("[ARCH_add_qf16_mix] a.sig:%10.30f, b.sig:%10.30f, sig_a:%10.30f, sig_b:%1.128f, sig_ab:%1.128f, sig_a-sig_ab:%1.128f, sig_low:%1.128f\n", a.sig, b.sig, sig_a, sig_b, sig_ab, sig_a-sig_ab,sig_low); +#endif + + size2s_t result; + + result = rnd_sat_qf16(exp_ab, sig_ab, sig_low); + + return result; +} + +size2s_t sub_qf16(size2s_t in_a, size2s_t in_b ) { + size1s_t exp_ab; + + unfloat a, b; + + //Get double precision significands + a = parse_qf16(in_a); + b = parse_qf16(in_b); + + if(a.exp>b.exp){ + exp_ab = a.exp+((a.sig==0.0)? (-(FRAC_HF+1)):ilogb(a.sig)); + if(exp_abb.exp) ? ((sig_a-sig_ab)-sig_b) : (sig_a -(sig_b+sig_ab)); + //sig_low = (a.exp>b.exp) ? ((sig_ab-sig_a)+sig_b) : (sig_a-(sig_b+sig_ab)); + sig_low = (a.exp>b.exp) ? ((sig_a-sig_ab)-sig_b) : (sig_a -(sig_b+sig_ab)); + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_sub_qf16] a.exp:%d, b.exp:%d, exp_ab:%d, ilogb(a.sig):%d, ilogb(b.sig):%d\n", a.exp,b.exp,exp_ab, ilogb(a.sig), ilogb(b.sig)); + printf("[ARCH_sub_qf16] a.sig:%10.30f, b.sig:%10.30f, sig_a:%10.30f, sig_b:%1.128f, sig_ab:%1.128f, sig_a-sig_ab:%1.128f, sig_low:%1.128f\n", a.sig, b.sig, sig_a, sig_b, sig_ab, sig_a-sig_ab,sig_low); + printf("[ARCH_sub_qf32] a:%10.30f, a_adj:%10.30f, fabs(sig_b):%f\n", ldexp(a.sig, a.exp), ldexp(sig_a, exp_ab), fabs(sig_b)); +#endif + + size2s_t result; + + result = rnd_sat_qf16(exp_ab, sig_ab, sig_low); + + return result; +} + + +size2s_t sub_hf(size2s_t in_a, size2s_t in_b ) { + size1s_t exp_ab; + + unfloat a, b; + + //Get double precision significands + a = parse_hf(in_a); + b = parse_hf(in_b); + + if(a.exp>b.exp){ + exp_ab = a.exp+((a.sig==0.0)? (-(FRAC_HF+1)):ilogb(a.sig)); + if(exp_abb.exp) ? ((sig_a-sig_ab)-sig_b) : (sig_a -(sig_b+sig_ab)); + } + else if(a.sign ^ b.sign) + { + sig_ab = sig_a + sig_b; + sig_low = (a.exp>b.exp) ? ((sig_a-sig_ab)+sig_b) : ((sig_b-sig_ab)+sig_a); + } + else// if(a.sign && b.sign) + { + sig_ab = sig_b - sig_a; + sig_low = (b.exp>a.exp) ? ((sig_b-sig_ab)-sig_a) : (sig_b -(sig_a+sig_ab)); + } + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_sub_hf] a.exp:%d, b.exp:%d, exp_ab:%d, ilogb(a.sig):%d, ilogb(b.sig):%d\n", a.exp,b.exp,exp_ab, ilogb(a.sig), ilogb(b.sig)); + printf("[ARCH_sub_hf] a.sig:%10.30f, b.sig:%10.30f, sig_a:%10.30f, sig_b:%1.30f, sig_ab:%1.30f, sig_ab-sig_a:%1.30f, sig_low:%1.30f\n", a.sig, b.sig, sig_a, sig_b, sig_ab, sig_ab-sig_a,sig_low); +#endif + + size2s_t result; + + result = rnd_sat_qf16(exp_ab, sig_ab, sig_low); + if((a.sign==1) && (b.sign==0)) + result = negate16(result); + + return result; +} + +size2s_t sub_qf16_mix(size2s_t in_a, size2s_t in_b ) { + size1s_t exp_ab; + + unfloat a, b; + + //Get double precision significands + a = parse_qf16(in_a); + b = parse_hf(in_b); + + if(b.sign) b.sig = (-1.0)*b.sig; + + if(a.exp>b.exp){ + exp_ab = a.exp+((a.sig==0.0)? (-(FRAC_HF+1)):ilogb(a.sig)); + if(exp_abb.exp) ? ((sig_ab-sig_a)-sig_b) : ((sig_ab-sig_b)-sig_a); + //sig_low = (a.exp>b.exp) ? ((sig_ab-sig_a)+sig_b) : (sig_a-(sig_b+sig_ab)); + sig_low = (a.exp>b.exp) ? 
((sig_a-sig_ab)-sig_b) : (sig_a -(sig_b+sig_ab)); + +#ifdef DEBUG_MMVEC_QF + printf("[ARCH_sub_qf16_mix] a.exp:%d, b.exp:%d, exp_ab:%d, ilogb(a.sig):%d, ilogb(b.sig):%d\n", a.exp,b.exp,exp_ab, ilogb(a.sig), ilogb(b.sig)); + printf("[ARCH_sub_qf16_mix] a.sig:%10.30f, b.sig:%10.30f, sig_a:%10.30f, sig_b:%1.128f, sig_ab:%1.128f, sig_a-sig_ab:%1.128f, sig_low:%1.128f\n", a.sig, b.sig, sig_a, sig_b, sig_ab, sig_a-sig_ab,sig_low); +#endif + + size2s_t result; + + result = rnd_sat_qf16(exp_ab, sig_ab, sig_low); + + return result; +} + +//FP conversion QF32 to IEEE SF +size4s_t conv_sf_qf32(size4s_t a) +{ + + size4s_t result; + unfloat u = parse_qf32(a); + + result = rnd_sat_sf(u.exp, u.sig); + +#ifdef DEBUG_MMVEC_QF + double final = ldexp(u.sig, u.exp); + printf("[SF_parse_conv_sf_qf32] u.sig:%lf, u.exp:%d, ldexp:%10.20f \n",u.sig, u.exp, final); +#endif + + return result; +} + +//FP conversion W to IEEE SF +size4s_t conv_sf_w(size4s_t a) +{ + + size4s_t result; + int exp=0; + double sig=0.0; + if(a !=0) + { + exp = ilogb(a); + sig = (double)a/scalbn(1.0, exp); + } + result = rnd_sat_sf(exp, sig); + +#ifdef DEBUG_MMVEC_QF + double final = ldexp(sig, exp); + printf("[SF_parse_conv_sf_w] sig:%lf, exp:%d, ldexp:%10.20f \n",sig, exp, final); +#endif + + return result; +} + +//FP conversion UW to IEEE SF +size4s_t conv_sf_uw(size4u_t a) +{ + + size4s_t result; + int exp=0; + double sig=0.0; + if(a !=0) + { + exp = ilogb(a); + sig = (double)(unsigned)a/scalbn(1.0, exp); + } + result = rnd_sat_sf(exp, sig); + +//#ifdef DEBUG_MMVEC_QF +// double final = ldexp(sig, exp); +// printf("[SF_parse_conv_sf_uw] sig:%lf, exp:%d, ldexp:%10.20f \n",sig, exp, final); +//#endif + + return result; +} + +//FP conversion QF16 to IEEE HF +size2s_t conv_hf_qf16(size2s_t a) +{ + + size2s_t result; + unfloat u = parse_qf16(a); + + result = rnd_sat_hf(u.exp, u.sig); + +//#ifdef DEBUG_MMVEC_QF +// double final = ldexp(u.sig, u.exp); +// printf("[HF_parse_conv_hf_qf16] u.sig:%lf, u.exp:%d, ldexp:%10.20f \n",u.sig, u.exp, final); +//#endif + + return result; +} + +//FP conversion H to IEEE HF +size2s_t conv_hf_h(size2s_t a) +{ + size2s_t result; + int exp=0; + double sig=0.0; + if(a !=0) + { + exp = ilogb(a); + sig = (double)a/scalbn(1.0, exp); + } + result = rnd_sat_hf(exp, sig); + +#ifdef DEBUG_MMVEC_QF + double final = ldexp(sig, exp); + double f_rint = rint(final); + printf("[HF_parse_conv_hf_h] sig:%lf, exp:%d, ldexp:%10.20f, rint:%lf \n",sig, exp, final, f_rint); +#endif + return result; +} + +//FP conversion UH to IEEE HF +size2s_t conv_hf_uh(size2u_t a) +{ + + size2s_t result; + int exp=0; + double sig=0.0; + if(a !=0) + { + exp = ilogb(a); + sig = (double)(unsigned)a/scalbn(1.0, exp); + } + result = rnd_sat_hf(exp, sig); + +//#ifdef DEBUG_MMVEC_QF +// double final = ldexp(sig, exp); +// printf("[SF_parse_conv_hf_uh] sig:%lf, exp:%d, ldexp:%10.20f \n",sig, exp, final); +//#endif + + return result; +} + +//FP conversion two QF32 to two QF16 +size4s_t conv_hf_qf32(size8s_t a) +{ + + size2s_t result0, result1; + size4s_t result; + size4s_t a0, a1; + a0 = a & 0xFFFFFFFF; + a1 = (a>>32) & 0xFFFFFFFF; + + unfloat u0 = parse_qf32(a0); + unfloat u1 = parse_qf32(a1); + + result0 = rnd_sat_hf(u0.exp, u0.sig); + result1 = rnd_sat_hf(u1.exp, u1.sig); + + result = ((size4s_t)result1 << 16) | (result0 & 0xFFFF); + +/* +#ifdef DEBUG_MMVEC_QF + double final0 = ldexp(u0.sig, u0.exp); + double final1 = ldexp(u1.sig, u1.exp); + + printf("[HF_parse_conv_hf_qf32] u0.sig:%lf, u0.exp:%d, ldexp0:%10.20f \n",u0.sig, u0.exp, final0); + 
printf("[HF_parse_conv_hf_qf32] u1.sig:%lf, u1.exp:%d, ldexp1:%10.20f \n",u1.sig, u1.exp, final1); +#endif +*/ + + return result; +} + +//FP conversion two W to two IEEE HF +size4s_t conv_hf_w(size8s_t a) +{ + size2s_t result0, result1; + size4s_t result; + size4s_t a0, a1; + a0 = a & 0xFFFFFFFF; + a1 = (a>>32) & 0xFFFFFFFF; + + int exp0=0, exp1=0; + double sig0=0.0, sig1=0.0; + if(a0 !=0) + { + exp0 = ilogb(a0); + sig0 = (double)a0/scalbn(1.0, exp0); + } + if(a1 !=0) + { + exp1 = ilogb(a1); + sig1 = (double)a1/scalbn(1.0, exp1); + } + result0 = rnd_sat_hf(exp0, sig0); + result1 = rnd_sat_hf(exp1, sig1); + + result = ((size4s_t)result1 << 16) | (result0 & 0xFFFF); + +/* +#ifdef DEBUG_MMVEC_QF + double final0 = ldexp(sig0, exp0); + double final1 = ldexp(sig1, exp1); + + printf("[HF_parse_conv_hf_w] sig0:%lf, exp0:%d, ldexp0:%10.20f \n",sig0, exp0, final0); + printf("[HF_parse_conv_hf_w] sig1:%lf, exp1:%d, ldexp1:%10.20f \n",sig1, exp1, final1); +#endif +*/ + return result; +} + +//FP conversion two UW to two IEEE HF +size4s_t conv_hf_uw(size8u_t a) +{ + size2s_t result0, result1; + size4s_t result; + size4u_t a0, a1; + a0 = a & 0xFFFFFFFF; + a1 = (a>>32) & 0xFFFFFFFF; + + int exp0=0, exp1=0; + double sig0=0.0, sig1=0.0; + if(a0 !=0) + { + exp0 = ilogb(a0); + sig0 = (double)(unsigned)a0/scalbn(1.0, exp0); + } + if(a1 !=0) + { + exp1 = ilogb(a1); + sig1 = (double)(unsigned)a1/scalbn(1.0, exp1); + } + result0 = rnd_sat_hf(exp0, sig0); + result1 = rnd_sat_hf(exp1, sig1); + + result = ((size4s_t)result1 << 16) | (result0 & 0xFFFF); +/* +#ifdef DEBUG_MMVEC_QF + double final0 = ldexp(sig0, exp0); + double final1 = ldexp(sig1, exp1); + + printf("[HF_parse_conv_hf_uw] sig0:%lf, exp0:%d, ldexp0:%10.20f \n",sig0, exp0, final0); + printf("[HF_parse_conv_hf_uw] sig1:%lf, exp1:%d, ldexp1:%10.20f \n",sig1, exp1, final1); +#endif +*/ + return result; +} + +size4s_t rnd_sat_w(int exp, double sig) +{ + size4s_t result=0; + size4s_t W_MAX = 0x7fffffff; + size4s_t W_MIN = 0x80000000; + + int sign = (sig>=0.0)? 0: 1; + + double R1=0.0; + double R2=0.0; + double R3=0.0; + if(exp > 30) + { + result = (sign)? W_MIN:W_MAX; + result = (sign <<31) | result; + } + else + { + R1 = ldexp(sig, exp); + R2 = floor(R1/2.0)*2; + R3 = R1 - R2; + if(sign==0) + { + if(R3<=0.5) + result = (size4s_t) R1; + else if(R3>0.5 && R3<1.5) + result = (size4s_t) round(R1); + else if(R3>=1.5) + result = (size4s_t) R1+1; + } + else + result = (size4s_t)round(R1); + } + +#ifdef DEBUG_MMVEC_QF + printf("[RND_conv_w_qf32] sig:%lf, exp:%d, R1:%10.20f, R2:%10.20f, R3:%10.20f, result:%x(%d)\n",sig, exp, R1, R2, R3, result, result); +#endif + + return result; +} + +size4u_t rnd_sat_uw(int exp, double sig) +{ + size4u_t result=0; + size4u_t W_MAX = 0xffffffff; + + double R1=0.0; + double R2=0.0; + double R3=0.0; + if(sig<0.0) + result = 0; + else if(exp > 31) + { + result = W_MAX; + } + else + { + R1 = ldexp(sig, exp); + R2 = floor(R1/2.0)*2; + R3 = R1 - R2; + if(R3<=0.5) + result = (size4s_t) R1; + else if(R3>0.5 && R3<1.5) + result = (size4s_t) round(R1); + else if(R3>=1.5) + result = (size4s_t) R1+1; + } + +#ifdef DEBUG_MMVEC_QF + printf("[RND_conv_uw_qf32] sig:%lf, exp:%d, R1:%10.20f, R2:%10.20f, R3:%10.20f, result:%x(%d)\n",sig, exp, R1, R2, R3, result, result); +#endif + + return result; +} + +size2s_t rnd_sat_h(int exp, double sig) +{ + size2s_t result=0; + size2s_t W_MAX = 0x7fff; + size2s_t W_MIN = 0x8000; + + int sign = (sig>=0.0)? 0: 1; + + double R1=0.0; + double R2=0.0; + double R3=0.0; + if(exp > 14) + { + result = (sign)? 
W_MIN:W_MAX; + result = (sign <<15) | result; + } + else + { + R1 = ldexp(sig, exp); + R2 = floor(R1/2.0)*2; + R3 = R1 - R2; + if(sign==0) + { + if(R3<=0.5) + result = (size2s_t) R1; + else if(R3>0.5 && R3<1.5) + result = (size2s_t) round(R1); + else if(R3>=1.5) + result = (size2s_t) R1+1; + } + else + { + if(R3<=0.5 && R3 !=0.0) + result = (size2s_t)R1 -1; + else if(R3>0.5 && R3<1.5) + result = (size2s_t)round(R1); + else// if(R3>=1.5) + result = (size2s_t)R1; + } + } + +#ifdef DEBUG_MMVEC_QF + printf("[RND_conv_h_qf16] sig:%lf, exp:%d, R1:%10.20f, R2:%10.20f, R3:%10.20f, result:%x(%d)\n",sig, exp, R1, R2, R3, result, result); +#endif + + return result; +} + +size2u_t rnd_sat_uh(int exp, double sig) +{ + size2u_t result=0; + size2u_t W_MAX = 0xffff; + + double R1=0.0; + double R2=0.0; + double R3=0.0; + if(sig<0.0) + result = 0; + else if(exp > 15) + { + result = W_MAX; + } + else + { + R1 = ldexp(sig, exp); + R2 = floor(R1/2.0)*2; + R3 = R1 - R2; + if(R3<=0.5) + result = (size2s_t) R1; + else if(R3>0.5 && R3<1.5) + result = (size2s_t) round(R1); + else if(R3>=1.5) + result = (size2s_t) R1+1; + } + +#ifdef DEBUG_MMVEC_QF + printf("[RND_conv_uh_qf16] sig:%lf, exp:%d, R1:%10.20f, R2:%10.20f, R3:%10.20f, result:%x(%d)\n",sig, exp, R1, R2, R3, result, result); +#endif + + return result; +} + +size1s_t rnd_sat_b(int exp, double sig) +{ + size1s_t result=0; + size1s_t W_MAX = 0x7f; + size1s_t W_MIN = 0x80; + + int sign = (sig>=0.0)? 0: 1; + + double R1=0.0; + double R2=0.0; + double R3=0.0; + if(exp > 6) + { + result = (sign)? W_MIN:W_MAX; + result = (sign <<7) | result; + } + else + { + R1 = ldexp(sig, exp); + R2 = floor(R1/2.0)*2; + R3 = R1 - R2; + if(sign==0) + { + if(R3<=0.5) + result = (size1s_t) R1; + else if(R3>0.5 && R3<1.5) + result = (size1s_t) round(R1); + else if(R3>=1.5) + result = (size1s_t) R1+1; + } + else + { + if(R3<=0.5 && R3 !=0.0) + result = (size1s_t)R1 -1; + else if(R3>0.5 && R3<1.5) + result = (size1s_t)round(R1); + else// if(R3>=1.5) + result = (size1s_t)R1; + } + } + +#ifdef DEBUG_MMVEC_QF + printf("[RND_conv_b_qf16] sig:%lf, exp:%d, R1:%10.20f, R2:%10.20f, R3:%10.20f, result:%x(%d)\n",sig, exp, R1, R2, R3, result, result); +#endif + + return result; +} + +size1u_t rnd_sat_ub(int exp, double sig) +{ + size1u_t result=0; + size1u_t W_MAX = 0xff; + + double R1=0.0; + double R2=0.0; + double R3=0.0; + if(sig<0.0) + result = 0; + else if(exp > 7) + { + result = W_MAX; + } + else + { + R1 = ldexp(sig, exp); + R2 = floor(R1/2.0)*2; + R3 = R1 - R2; + + if(R3<=0.5) + result = (size1s_t) R1; + else if(R3>0.5 && R3<1.5) + result = (size1s_t) round(R1); + else if(R3>=1.5) + result = (size1s_t) R1+1; + } + +#ifdef DEBUG_MMVEC_QF + printf("[RND_conv_ub_qf16] sig:%lf, exp:%d, R1:%10.20f, R2:%10.20f, R3:%10.20f, result:%x(%d)\n",sig, exp, R1, R2, R3, result, result); +#endif + + return result; +} + +//FP conversion QF32 to 32bit W +size4s_t conv_w_qf32(size4s_t a) +{ + + size4s_t result; + unfloat u = parse_qf32(a); + + result = rnd_sat_w(u.exp, u.sig); + + return result; +} + +size4s_t conv_w_sf(size4s_t op1) +{ + sf_union input; + size4s_t W_MAX = 0x7fffffff; + size4s_t W_MIN = 0x80000000; + input.i = op1; + size4s_t result; + + if(isNaNF32(op1) || isInfF32(op1) || (input.f >= (float)W_MAX) || (input.f <= (float)W_MIN)) + { + if(input.x.sign == 1){ + result = W_MIN; + } + else{ + result = W_MAX; + } + } + else{ + //convert and round to the zero + result = (int)input.f; + } + +#ifdef DEBUG_MMVEC_QF + printf("Debug : result =0x%08x\n",result); +#endif + return result; +} + +size2s_t 
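+/*
+ * (Behaviour of conv_w_sf above, with illustrative inputs of ours:
+ *
+ *     conv_w_sf(0x3FC00000)   // 1.5f  -> 1           (truncates toward zero)
+ *     conv_w_sf(0x4F000000)   // 2^31  -> 0x7fffffff  (saturates)
+ *     conv_w_sf(0xFF800000)   // -inf  -> 0x80000000  (saturates by sign)
+ *
+ * conv_h_hf below first widens fp16 to fp32 bit-wise, where the +0x1c000
+ * term is the exponent re-bias (127 - 15) << 10, then applies the same
+ * truncate-and-saturate pattern against 16-bit bounds.)
+ */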
conv_h_hf(size2s_t op1) +{ + sf_union input; + size4s_t op1_ext = op1; + size2s_t HW_MAX = 0x7fff; + size2s_t HW_MIN = 0x8000; + input.i = ((op1_ext & 0x8000) << 16) + (((op1_ext & 0x7c00) + 0x1c000) << 13) + ((op1_ext & 0x03ff) << 13); //grabbing sign, exp, and significand and ocnverting to sf32 format + size2s_t result; + + if(isNaNF16(op1) || isInfF16(op1) || (input.f >= (float)HW_MAX) || (input.f <= (float)HW_MIN)) + { + if(input.x.sign == 1){ + result = HW_MIN; + } + else{ + result = HW_MAX; + } + } + else{ + //convert and round to the zero + result = (short)input.f; + } + +#ifdef DEBUG_MMVEC_QF + printf("Debug : result =0x%08x\n",result); +#endif + return result; +} + +//FP conversion QF32 to 32bit UW +size4u_t conv_uw_qf32(size4s_t a) +{ + + size4u_t result; + unfloat u = parse_qf32(a); + + result = rnd_sat_uw(u.exp, u.sig); + + return result; +} + +//FP conversion QF16 to 16bit H +size2s_t conv_h_qf16(size2s_t a) +{ + + size2s_t result; + unfloat u = parse_qf16(a); + + result = rnd_sat_h(u.exp, u.sig); + + return result; +} + +//FP conversion QF32 to 32bit UW +size2u_t conv_uh_qf16(size2s_t a) +{ + + size2u_t result; + unfloat u = parse_qf16(a); + + result = rnd_sat_uh(u.exp, u.sig); + + return result; +} + +//FP conversion double QF32 to double H +size4s_t conv_h_qf32(size8s_t a) +{ + size2s_t result0, result1; + size4s_t result; + size4s_t a0, a1; + a0 = a & 0xFFFFFFFF; + a1 = (a>>32) & 0xFFFFFFFF; + + unfloat u0 = parse_qf32(a0); + unfloat u1 = parse_qf32(a1); + + result0 = rnd_sat_h(u0.exp, u0.sig); + result1 = rnd_sat_h(u1.exp, u1.sig); + + result = ((size4s_t)result1 << 16) | (result0 & 0xFFFF); + +#ifdef DEBUG_MMVEC_QF + double final0 = ldexp(u0.sig, u0.exp); + double final1 = ldexp(u1.sig, u1.exp); + + printf("[H_parse_conv_h_qf32] u0.sig:%lf, u0.exp:%d, ldexp0:%10.20f \n",u0.sig, u0.exp, final0); + printf("[H_parse_conv_h_qf32] u1.sig:%lf, u1.exp:%d, ldexp1:%10.20f \n",u1.sig, u1.exp, final1); +#endif + + return result; +} + +//FP conversion QF32 to 32bit UW +size4u_t conv_uh_qf32(size8s_t a) +{ + size2u_t result0, result1; + size4u_t result; + size4s_t a0, a1; + a0 = a & 0xFFFFFFFF; + a1 = (a>>32) & 0xFFFFFFFF; + + unfloat u0 = parse_qf32(a0); + unfloat u1 = parse_qf32(a1); + + result0 = rnd_sat_uh(u0.exp, u0.sig); + result1 = rnd_sat_uh(u1.exp, u1.sig); + + result = ((size4u_t)result1 << 16) | (result0 & 0xFFFF); + +#ifdef DEBUG_MMVEC_QF + double final0 = ldexp(u0.sig, u0.exp); + double final1 = ldexp(u1.sig, u1.exp); + + printf("[UH_parse_conv_uh_qf32] u0.sig:%lf, u0.exp:%d, ldexp0:%10.20f \n",u0.sig, u0.exp, final0); + printf("[UH_parse_conv_uh_qf32] u1.sig:%lf, u1.exp:%d, ldexp1:%10.20f \n",u1.sig, u1.exp, final1); +#endif + + return result; +} + +//FP conversion double QF16 to double B +size2s_t conv_b_qf16(size4s_t a) +{ + size1s_t result0, result1; + size2s_t result; + size2s_t a0, a1; + a0 = a & 0xFFFF; + a1 = (a>>16) & 0xFFFF; + + unfloat u0 = parse_qf16(a0); + unfloat u1 = parse_qf16(a1); + + result0 = rnd_sat_b(u0.exp, u0.sig); + result1 = rnd_sat_b(u1.exp, u1.sig); + + result = ((size2s_t)result1 << 8) | (result0 & 0xFF); + +#ifdef DEBUG_MMVEC_QF + double final0 = ldexp(u0.sig, u0.exp); + double final1 = ldexp(u1.sig, u1.exp); + + printf("[B_parse_conv_b_qf16] u0.sig:%lf, u0.exp:%d, ldexp0:%10.20f \n",u0.sig, u0.exp, final0); + printf("[B_parse_conv_b_qf16] u1.sig:%lf, u1.exp:%d, ldexp1:%10.20f \n",u1.sig, u1.exp, final1); +#endif + + return result; +} + +//FP conversion QF32 to 32bit UW +size2u_t conv_ub_qf16(size4s_t a) +{ + size1u_t result0, result1; + 
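+/*
+ * The paired converters in this file share one lane recipe: split the
+ * wide input into halves (a & 0xFFFF and a >> 16 here, or the two 32-bit
+ * halves of a 64-bit pair), convert each lane independently, then repack
+ * with shifts and masks. conv_ub_qf16 maps two qf16 lanes to two
+ * saturated unsigned bytes:
+ *
+ *     out = (ub(hi lane) << 8) | ub(lo lane)
+ */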
size2u_t result; + size2s_t a0, a1; + a0 = a & 0xFFFF; + a1 = (a>>16) & 0xFFFF; + + unfloat u0 = parse_qf16(a0); + unfloat u1 = parse_qf16(a1); + + result0 = rnd_sat_ub(u0.exp, u0.sig); + result1 = rnd_sat_ub(u1.exp, u1.sig); + + result = ((size2u_t)result1 << 8) | (result0 & 0xFF); + +#ifdef DEBUG_MMVEC_QF + double final0 = ldexp(u0.sig, u0.exp); + double final1 = ldexp(u1.sig, u1.exp); + + printf("[UB_parse_conv_ub_qf16] u0.sig:%lf, u0.exp:%d, ldexp0:%10.20f \n",u0.sig, u0.exp, final0); + printf("[UB_parse_conv_ub_qf16] u1.sig:%lf, u1.exp:%d, ldexp1:%10.20f \n",u1.sig, u1.exp, final1); +#endif + + return result; +} + +//Neg/Abs +size4s_t neg_qf32(size4s_t a) +{ + size4s_t result; + result = negate32(a); + return result; +} +size4s_t abs_qf32(size4s_t a) +{ + size4s_t result; + if((a>>31) & 1) + result = negate32(a); + else + result = a; + return result; +} +size2s_t neg_qf16(size2s_t a) +{ + size2s_t result; + result = negate16(a); + return result; +} +size2s_t abs_qf16(size2s_t a) +{ + size2s_t result; + if((a>>15) & 1) + result = negate16(a); + else + result = a; + return result; +} +size4s_t neg_sf(size4s_t a) +{ + size4s_t result; + result = negate_sf(a); + return result; +} +size4s_t abs_sf(size4s_t a) +{ + size4s_t result; + if((a>>31) & 1) + result = negate_sf(a); + else + result = a; + return result; +} +size2s_t neg_hf(size2s_t a) +{ + size2s_t result; + result = negate_hf(a); + return result; +} +size2s_t abs_hf(size2s_t a) +{ + size2s_t result; + if((a>>15) & 1) + result = negate_hf(a); + else + result = a; + return result; +} + +//FP Compare +int cmpgt_fp(unfloat a, unfloat b) +{ + int result=0; + double a_d, b_d; + a_d = ldexp(a.sig, a.exp); + b_d = ldexp(b.sig, b.exp); + + //Filter out +0/-0 by checking the sign + if(a_d > b_d) + result=1; + +#ifdef DEBUG_MMVEC_QF + printf("[CMPGT]a:%10.30f, b:%10.30f\n",a_d, b_d); +#endif + + return result; +} + +int cmpgt_qf32(size4s_t in_a, size4s_t in_b) +{ + unfloat a, b; + a= parse_qf32(in_a); + b= parse_qf32(in_b); + + int result=0; + + result = cmpgt_fp(a,b); + + return result; +} + +int cmpgt_qf16(size2s_t in_a, size2s_t in_b) +{ + + unfloat a, b; + a= parse_qf16(in_a); + b= parse_qf16(in_b); + + int result=0; + result = cmpgt_fp(a,b); + + return result; +} + +int cmpgt_sf(size4s_t in_a, size4s_t in_b) +{ + + unfloat a, b; + a= parse_sf(in_a); + b= parse_sf(in_b); + + if(a.sign) + a.sig = (-1.0)*a.sig; + if(b.sign) + b.sig = (-1.0)*b.sig; + + int result=0; + result = cmpgt_fp(a,b); + + return result; +} + +int cmpgt_hf(size2s_t in_a, size2s_t in_b) +{ + + unfloat a, b; + a= parse_hf(in_a); + b= parse_hf(in_b); + + if(a.sign) + a.sig = (-1.0)*a.sig; + if(b.sign) + b.sig = (-1.0)*b.sig; + + int result=0; + result = cmpgt_fp(a,b); + + return result; +} + +int cmpgt_qf32_sf(size4s_t in_a, size4s_t in_b) +{ + unfloat a = parse_qf32(in_a); + unfloat b = parse_sf(in_b); + if(b.sign) + b.sig = (-1.0)*b.sig; + + int result=0; + result = cmpgt_fp(a,b); + + return result; +} + +int cmpgt_qf16_hf(size2s_t in_a, size2s_t in_b) +{ + unfloat a = parse_qf16(in_a); + unfloat b = parse_hf(in_b); + if(b.sign) + b.sig = (-1.0)*b.sig; + + int result=0; + result = cmpgt_fp(a,b); + return result; +} +//max/min + //if a==b, a is returned +size4s_t max_qf32( size4s_t in_a, size4s_t in_b) { return cmpgt_qf32( in_b, in_a) ? in_b : in_a; } +size2s_t max_qf16( size2s_t in_a, size2s_t in_b) { return cmpgt_qf16( in_b, in_a) ? 
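+/*
+ * (Tie semantics: cmpgt_*() is a strict greater-than, so equal operands
+ * make max/min return in_a, matching the "if a==b, a is returned" note
+ * above. For the IEEE variants that rule alone would let
+ * max_sf(-0.0f, +0.0f) return -0.0f, because the ldexp-based compare
+ * treats the two zeros as equal; the is_check_zero_* special cases below
+ * exist so max prefers +0 and min prefers -0, e.g.
+ * max_sf(0x80000000, 0x00000000) == 0x00000000.)
+ */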
in_b : in_a; } + + + +size4s_t is_check_zero_sf(size4s_t in_a); +size4s_t is_check_zero_sf(size4s_t in_a) { + return (in_a == 0) || ((in_a & 0xFFFFFFFF) == 0x80000000); +} +size2s_t is_check_zero_hf(size2s_t in_a); +size2s_t is_check_zero_hf(size2s_t in_a) { + return (in_a == 0) || ((in_a & 0xFFFF) == 0x8000); +} + +size4s_t max_sf( size4s_t in_a, size4s_t in_b) { + if (is_check_zero_sf(in_a) && is_check_zero_sf(in_b) ) { + return (in_a == 0) ? in_a : in_b; // Return in_a if it's positive 0, otherwise return the other one + } + return cmpgt_sf( in_b, in_a) ? in_b : in_a; + +} +size2s_t max_hf( size2s_t in_a, size2s_t in_b) +{ + if (is_check_zero_hf(in_a) && is_check_zero_hf(in_b) ) { + return (in_a == 0) ? in_a : in_b; + } + return cmpgt_hf( in_b, in_a) ? in_b : in_a; +} + + +//size2s_t max_qf16_hf( size2s_t in_a, size2s_t in_b) { return cmpgt_qf16_hf( in_b, in_a) ? in_b : in_a; } +//size4s_t max_qf32_sf( size4s_t in_a, size4s_t in_b) { return cmpgt_qf32_sf( in_b, in_a) ? in_b : in_a; } + +size4s_t min_qf32( size4s_t in_a, size4s_t in_b) { return cmpgt_qf32( in_a, in_b) ? in_b : in_a; } +size2s_t min_qf16( size2s_t in_a, size2s_t in_b) { return cmpgt_qf16( in_a, in_b) ? in_b : in_a; } + +size4s_t min_sf( size4s_t in_a, size4s_t in_b) { + if (is_check_zero_sf(in_a) && is_check_zero_sf(in_b) ) { + return (in_a == 0) ? in_b : in_a; + } + return cmpgt_sf( in_a, in_b) ? in_b : in_a; +} +size2s_t min_hf( size2s_t in_a, size2s_t in_b) { + if (is_check_zero_hf(in_a) && is_check_zero_hf(in_b) ) { + return (in_a == 0) ? in_b : in_a; + } + return cmpgt_hf( in_a, in_b) ? in_b : in_a; +} +//size2s_t min_qf16_hf( size2s_t in_a, size2s_t in_b) { return cmpgt_qf16_hf( in_a, in_b) ? in_b : in_a; } +//size4s_t min_qf32_sf( size4s_t in_a, size4s_t in_b) { return cmpgt_qf32_sf( in_a, in_b) ? 
in_b : in_a; } + + +size4s_t max_qf32_sf(size4s_t in_a, size4s_t in_b) +{ + size4s_t result=0; + unfloat a,b; + a= parse_qf32(in_a); + b= parse_sf(in_b); + if(b.sign) + b.sig = (-1)*b.sig; + + double a_d, b_d; + a_d = ldexp(a.sig, a.exp); + b_d = ldexp(b.sig, b.exp); + + if(a_d >= b_d) + result = in_a; + else + result = in_b; + +#ifdef DEBUG_MMVEC_QF + printf("[max_qf32_sf]a:%10.30f, b:%10.30f\n",a_d, b_d); +#endif + + return result; +} +size4s_t min_qf32_sf(size4s_t in_a, size4s_t in_b) +{ + size4s_t result=0; + unfloat a,b; + a= parse_qf32(in_a); + b= parse_sf(in_b); + if(b.sign) + b.sig = (-1)*b.sig; + double a_d, b_d; + a_d = ldexp(a.sig, a.exp); + b_d = ldexp(b.sig, b.exp); + if(a_d <= b_d) + result = in_a; + else + result = in_b; +#ifdef DEBUG_MMVEC_QF + printf("[min_qf32_sf]a:%10.30f, b:%10.30f\n",a_d, b_d); +#endif + return result; +} + +size2s_t max_qf16_hf(size2s_t in_a, size2s_t in_b) +{ + size2s_t result=0; + unfloat a,b; + a= parse_qf16(in_a); + b= parse_hf(in_b); + if(b.sign) + b.sig = (-1)*b.sig; + double a_d, b_d; + a_d = ldexp(a.sig, a.exp); + b_d = ldexp(b.sig, b.exp); + if(a_d >= b_d) + result = in_a; + else + result = in_b; +#ifdef DEBUG_MMVEC_QF + printf("[max_qf16_hf]a:%10.30f, b:%10.30f\n",a_d, b_d); +#endif + return result; +} +size2s_t min_qf16_hf(size2s_t in_a, size2s_t in_b) +{ + size2s_t result=0; + unfloat a,b; + a= parse_qf16(in_a); + b= parse_hf(in_b); + if(b.sign) + b.sig = (-1)*b.sig; + double a_d, b_d; + a_d = ldexp(a.sig, a.exp); + b_d = ldexp(b.sig, b.exp); + if(a_d <= b_d) + result = in_a; + else + result = in_b; +#ifdef DEBUG_MMVEC_QF + printf("[min_qf16_hf]a:%10.30f, b:%10.30f\n",a_d, b_d); +#endif + return result; +} diff --git a/target/hexagon/mmvec/mmvec_qfloat.h b/target/hexagon/mmvec/mmvec_qfloat.h new file mode 100644 index 000000000000..dc15cd17408b --- /dev/null +++ b/target/hexagon/mmvec/mmvec_qfloat.h @@ -0,0 +1,199 @@ +/* + * Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef MMVEC_QFLOAT_H +#define MMVEC_QFLOAT_H 1 + +#define HF_MAX 131008 //pow(2,17)-pow(2,6) =(2-1.0/pow(2,10))*pow(2,16) +#define HF_MIN 1.0/pow(2,24) +#define SF_MAX pow(2,129)-pow(2,105) //(2-1.0/pow(2,23))*pow(2,128) +#define SF_MIN 1.0/pow(2,149) + +#define E_MAX_QF32 128 +#define E_MIN_QF32 -127 +#define E_MAX_QF16 16 +#define E_MIN_QF16 -15 +#define E_MAX_SF 128 +#define E_MIN_SF -126 +#define E_MAX_HF 16 +#define E_MIN_HF -14 +#define BIAS_QF32 127 +#define BIAS_QF16 15 +#define BIAS_DF 1023 +#define BIAS_SF 127 +#define BIAS_HF 15 +#define FRAC_HF 10 +#define FRAC_SF 23 +#define isNaNF32( a ) (((~(a) & 0x7F800000) == 0) && ((a) & 0x007FFFFF)) +#define isInfF32( a ) (((~(a) & 0x7F800000) == 0) && (((a) & 0x007FFFFF) == 0)) +#define isNaNF16( a ) (((~(a) & 0x7C00) == 0) && ((a) & 0x03FF)) +#define isInfF16( a ) (((~(a) & 0x7C00) == 0) && (((a) & 0x03FF) == 0)) + +//#define MIN(X, Y) (((X) < (Y)) ? 
(X) : (Y)) +//#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) + +#include "cpu.h" +#include "hex_arch_types.h" + +#define epsilon 1.0/pow(2,23) +#define units 1.0*pow(2,23) +#define epsilon_hf 1.0/pow(2,10) +#define units_hf 1.0*pow(2,10) + +typedef struct{ + int sign; + int exp; + double sig; +} unfloat; //Un-Normalized Float + +typedef struct{ + int sign; + int sig; + int exp; +} qf_t; + +typedef struct{ + int32_t sig : 24; + uint32_t exp : 8; +} qf32_t; + +typedef struct{ + int32_t sig : 11; + uint32_t exp : 5; +} qf16_t; + +typedef enum float_type{ + QF32, + QF16, + SF, + HF +} f_type; + +typedef union { + float f; + size4u_t i; + struct { + size4u_t mant:23; + size4u_t exp:8; + size4u_t sign:1; + } x; +} sf_union; + +//MPY +size4s_t mpy_qf32(size4s_t a, size4s_t b); +size4s_t mpy_qf32_sf(size4s_t a, size4s_t b); +size4s_t mpy_qf32_mix_sf(size4s_t a, size4s_t b); +size2s_t mpy_qf16(size2s_t a, size2s_t b); +size2s_t mpy_qf16_hf(size2s_t a, size2s_t b); +size2s_t mpy_qf16_mix_hf(size2s_t a, size2s_t b); +size8s_t mpy_qf32_qf16(size4s_t a, size4s_t b); +size8s_t mpy_qf32_hf(size4s_t a, size4s_t b); +size8s_t mpy_qf32_mix_hf(size4s_t a, size4s_t b); + +unfloat parse_qf32(size4s_t a); +unfloat parse_qf16(size2s_t a); +unfloat parse_sf(size4s_t a); +unfloat parse_hf(size2s_t a); +size4s_t rnd_sat_qf32(int exp, double sig, double sig_low); +size2s_t rnd_sat_qf16(int exp, double sig, double sig_low); +size4s_t rnd_sat_sf(int exp, double sig); +size2s_t rnd_sat_hf(int exp, double sig); +size4s_t rnd_sat_w(int exp, double sig); +size4u_t rnd_sat_uw(int exp, double sig); +size2s_t rnd_sat_h(int exp, double sig); +size2u_t rnd_sat_uh(int exp, double sig); +size1s_t rnd_sat_b(int exp, double sig); +size1u_t rnd_sat_ub(int exp, double sig); +size4s_t negate32(size4s_t); +size2s_t negate16(size2s_t); +size4s_t negate_sf(size4s_t); +size2s_t negate_hf(size2s_t); + +//ADD +size4s_t add_qf32(size4s_t a, size4s_t b); +size4s_t add_sf(size4s_t a, size4s_t b); +size4s_t add_qf32_mix(size4s_t a, size4s_t b); +size2s_t add_qf16(size2s_t a, size2s_t b); +size2s_t add_hf(size2s_t a, size2s_t b); +size2s_t add_qf16_mix(size2s_t a, size2s_t b); + +//SUB +size4s_t sub_qf32(size4s_t a, size4s_t b); +size4s_t sub_sf(size4s_t a, size4s_t b); +size4s_t sub_qf32_mix(size4s_t a, size4s_t b); +size2s_t sub_qf16(size2s_t a, size2s_t b); +size2s_t sub_hf(size2s_t a, size2s_t b); +size2s_t sub_qf16_mix(size2s_t a, size2s_t b); + +//Convert +size4s_t conv_sf_qf32(size4s_t a); +size4s_t conv_sf_w(size4s_t a); +size4s_t conv_sf_uw(size4u_t a); +size2s_t conv_hf_qf16(size2s_t a); +size2s_t conv_hf_h(size2s_t a); +size2s_t conv_hf_uh(size2u_t a); +size4s_t conv_hf_qf32(size8s_t a); +size4s_t conv_hf_w(size8s_t a); +size4s_t conv_hf_uw(size8u_t a); + +size4s_t conv_w_qf32(size4s_t a); +size4u_t conv_uw_qf32(size4s_t a); +size2s_t conv_h_qf16(size2s_t a); +size2u_t conv_uh_qf16(size2s_t a); +size4s_t conv_h_qf32(size8s_t a); +size4u_t conv_uh_qf32(size8s_t a); +size2s_t conv_b_qf16(size4s_t a); +size2u_t conv_ub_qf16(size4s_t a); + +size4s_t conv_w_sf(size4s_t a); +// size4u_t conv_uw_sf(size4s_t a); +size2s_t conv_h_hf(size2s_t a); +// size2u_t conv_uh_sf(size2s_t a); + +//Neg/Abs +size4s_t neg_qf32(size4s_t a); +size4s_t abs_qf32(size4s_t a); +size2s_t neg_qf16(size2s_t a); +size2s_t abs_qf16(size2s_t a); +size4s_t neg_sf(size4s_t a); +size4s_t abs_sf(size4s_t a); +size2s_t neg_hf(size2s_t a); +size2s_t abs_hf(size2s_t a); + +//Compare +int cmpgt_fp(unfloat a, unfloat b); +int cmpgt_qf32(size4s_t a, size4s_t b); +int 
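+/*
+ * (Usage caution, ours: epsilon, units, HF_MIN and SF_MAX above are
+ * defined without enclosing parentheses, so they do not survive every
+ * expression context. With #define epsilon 1.0/pow(2,23), for example,
+ *
+ *     1.0/epsilon   expands to   1.0/1.0/pow(2,23)   ==  2^-23, not 2^23
+ *
+ * The existing uses in mmvec_qfloat.c only multiply by these macros,
+ * which parses correctly; new call sites should add parentheses.)
+ */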
cmpgt_qf16(size2s_t a, size2s_t b); +int cmpgt_sf(size4s_t a, size4s_t b); +int cmpgt_hf(size2s_t a, size2s_t b); +int cmpgt_qf32_sf(size4s_t a, size4s_t b); +int cmpgt_qf16_hf(size2s_t a, size2s_t b); + +//max/min +size4s_t max_qf32(size4s_t a, size4s_t b); +size4s_t min_qf32(size4s_t a, size4s_t b); +size4s_t max_qf32_sf(size4s_t a, size4s_t b); +size4s_t min_qf32_sf(size4s_t a, size4s_t b); +size4s_t max_sf(size4s_t a, size4s_t b); +size4s_t min_sf(size4s_t a, size4s_t b); +size2s_t max_qf16(size2s_t a, size2s_t b); +size2s_t min_qf16(size2s_t a, size2s_t b); +size2s_t max_qf16_hf(size2s_t a, size2s_t b); +size2s_t min_qf16_hf(size2s_t a, size2s_t b); +size2s_t max_hf(size2s_t a, size2s_t b); +size2s_t min_hf(size2s_t a, size2s_t b); +#endif diff --git a/target/hexagon/monitor.c b/target/hexagon/monitor.c new file mode 100644 index 000000000000..534ca2abe63a --- /dev/null +++ b/target/hexagon/monitor.c @@ -0,0 +1,36 @@ +/* + * Copyright(c) 2022-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + + +#include "qemu/osdep.h" +#include "cpu.h" +#include "cpu_bits.h" +#include "monitor/monitor.h" +#include "monitor/hmp-target.h" +#include "monitor/hmp.h" +#include "hex_mmu.h" + +const MonitorDef monitor_defs[] = { + { NULL }, +}; + +const MonitorDef *target_monitor_defs(void) +{ + return monitor_defs; +} + +void hmp_info_tlb(Monitor *mon, const QDict *qdict) +{ +#if !defined(CONFIG_USER_ONLY) + CPUArchState *env = mon_get_cpu_env(mon); + if (!env) { + monitor_printf(mon, "No CPU available\n"); + return; + } + + dump_mmu(env); +#endif +} diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c index 6da8db8ea5c5..e6f11fd5f990 100644 --- a/target/hexagon/op_helper.c +++ b/target/hexagon/op_helper.c @@ -17,6 +17,8 @@ #include "qemu/osdep.h" #include "qemu/log.h" +#include "qemu/main-loop.h" +#include "qemu/timer.h" #include "exec/exec-all.h" #include "exec/cpu_ldst.h" #include "exec/helper-proto.h" @@ -24,31 +26,59 @@ #include "cpu.h" #include "internal.h" #include "macros.h" +#include "sys_macros.h" #include "arch.h" #include "hex_arch_types.h" #include "fma_emu.h" #include "mmvec/mmvec.h" #include "mmvec/macros.h" +#include "mmvec/mmvec_qfloat.h" #include "op_helper.h" +#include "cpu_helper.h" #include "translate.h" +#ifndef CONFIG_USER_ONLY +#include "hex_mmu.h" +#include "hw/intc/l2vic.h" +#include "hw/timer/qct-qtimer.h" +#include "hex_interrupts.h" +#include "hexswi.h" +#endif #define SF_BIAS 127 #define SF_MANTBITS 23 /* Exceptions processing helpers */ +G_NORETURN +void do_raise_exception(CPUHexagonState *env, uint32_t exception, + target_ulong PC, uintptr_t retaddr) +{ + CPUState *cs = env_cpu(env); +#ifdef CONFIG_USER_ONLY + qemu_log_mask(CPU_LOG_INT, "%s: 0x%08x\n", __func__, exception); +#else + qemu_log_mask(CPU_LOG_INT, "%s: 0x%08x, @ %08" PRIx32 "\n", + __func__, exception, PC); + + ASSERT_DIRECT_TO_GUEST_UNSET(env, exception); +#endif + + env->gpr[HEX_REG_PC] = PC; + cs->exception_index = exception; + cpu_loop_exit_restore(cs, retaddr); + cs->halted = false; +} + G_NORETURN void hexagon_raise_exception_err(CPUHexagonState *env, uint32_t exception, uintptr_t pc) { - CPUState *cs = env_cpu(env); - qemu_log_mask(CPU_LOG_INT, "%s: %d\n", __func__, exception); - cs->exception_index = exception; - cpu_loop_exit_restore(cs, pc); + do_raise_exception(env, exception, pc, 0); } -G_NORETURN void HELPER(raise_exception)(CPUHexagonState *env, uint32_t excp) +G_NORETURN void HELPER(raise_exception)(CPUHexagonState 
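+/*
+ * (Control-flow note: do_raise_exception() above records the faulting PC
+ * in HEX_REG_PC, latches the event number in cs->exception_index, and
+ * unwinds through cpu_loop_exit_restore(), which never returns. A
+ * hypothetical call site, with an illustrative event name that is not
+ * taken from this patch:
+ *
+ *     if (!translation_valid) {
+ *         do_raise_exception(env, HEX_EVENT_PRECISE,
+ *                            env->gpr[HEX_REG_PC], GETPC());
+ *     }
+ * )
+ */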
+G_NORETURN void HELPER(raise_exception)(CPUHexagonState *env, uint32_t excp,
+                                        target_ulong PC)
 {
-    hexagon_raise_exception_err(env, excp, 0);
+    hexagon_raise_exception_err(env, excp, PC);
 }
 
 void log_store32(CPUHexagonState *env, target_ulong addr,
@@ -463,11 +493,11 @@ void HELPER(probe_pkt_scalar_hvx_stores)(CPUHexagonState *env, int mask)
  * If the load is in slot 0 and there is a store in slot1 (that
  * wasn't cancelled), we have to do the store first.
  */
-static void check_noshuf(CPUHexagonState *env, bool pkt_has_store_s1,
+static void check_noshuf(CPUHexagonState *env, bool pkt_has_scalar_store_s1,
                          uint32_t slot, target_ulong vaddr, int size,
                          uintptr_t ra)
 {
-    if (slot == 0 && pkt_has_store_s1 &&
+    if (slot == 0 && pkt_has_scalar_store_s1 &&
         ((env->slot_cancelled & (1 << 1)) == 0)) {
         probe_read(env, vaddr, size, MMU_USER_IDX, ra);
         commit_store(env, 1, ra);
@@ -1149,6 +1179,119 @@ float64 HELPER(dfmpyhh)(CPUHexagonState *env, float64 RxxV,
     return RxxV;
 }
 
+#ifndef CONFIG_USER_ONLY
+void HELPER(modify_ssr)(CPUHexagonState *env, uint32_t new, uint32_t old)
+{
+    BQL_LOCK_GUARD();
+    hexagon_modify_ssr(env, new, old);
+}
+
+static void hex_k0_lock(CPUHexagonState *env)
+{
+    BQL_LOCK_GUARD();
+    g_assert((env->k0_lock_count == 0) || (env->k0_lock_count == 1));
+
+    uint32_t syscfg = arch_get_system_reg(env, HEX_SREG_SYSCFG);
+    if (GET_SYSCFG_FIELD(SYSCFG_K0LOCK, syscfg)) {
+        if (env->k0_lock_state == HEX_LOCK_QUEUED) {
+            env->next_PC += 4;
+            env->k0_lock_count++;
+            env->k0_lock_state = HEX_LOCK_OWNER;
+            SET_SYSCFG_FIELD(env, SYSCFG_K0LOCK, 1);
+            return;
+        }
+        if (env->k0_lock_state == HEX_LOCK_OWNER) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "Double k0lock at PC: 0x%x, thread may hang\n",
+                          env->next_PC);
+            env->next_PC += 4;
+            CPUState *cs = env_cpu(env);
+            cpu_interrupt(cs, CPU_INTERRUPT_HALT);
+            return;
+        }
+        env->k0_lock_state = HEX_LOCK_WAITING;
+        CPUState *cs = env_cpu(env);
+        cpu_interrupt(cs, CPU_INTERRUPT_HALT);
+    } else {
+        env->next_PC += 4;
+        env->k0_lock_count++;
+        env->k0_lock_state = HEX_LOCK_OWNER;
+        SET_SYSCFG_FIELD(env, SYSCFG_K0LOCK, 1);
+    }
+}
+
+static void hex_k0_unlock(CPUHexagonState *env)
+{
+    BQL_LOCK_GUARD();
+    g_assert((env->k0_lock_count == 0) || (env->k0_lock_count == 1));
+
+    /* Nothing to do if the k0 isn't locked by this thread */
+    uint32_t syscfg = arch_get_system_reg(env, HEX_SREG_SYSCFG);
+    if ((GET_SYSCFG_FIELD(SYSCFG_K0LOCK, syscfg) == 0) ||
+        (env->k0_lock_state != HEX_LOCK_OWNER)) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "thread %d attempted to unlock k0 without having the "
+                      "lock, k0_lock state = %d, syscfg:k0 = %d\n",
+                      env->threadId, env->k0_lock_state,
+                      GET_SYSCFG_FIELD(SYSCFG_K0LOCK, syscfg));
+        g_assert(env->k0_lock_state != HEX_LOCK_WAITING);
+        return;
+    }
+
+    env->k0_lock_count--;
+    env->k0_lock_state = HEX_LOCK_UNLOCKED;
+    SET_SYSCFG_FIELD(env, SYSCFG_K0LOCK, 0);
+
+    /* Look for a thread to unlock */
+    unsigned int this_threadId = env->threadId;
+    CPUHexagonState *unlock_thread = NULL;
+    CPUState *cs;
+    CPU_FOREACH(cs) {
+        CPUHexagonState *thread = cpu_env(cs);
+
+        /*
+         * The hardware implements round-robin fairness, so we look for threads
+         * starting at env->threadId + 1 and incrementing modulo the number of
+         * threads.
+         *
+         * To implement this, we check if thread is earlier in the modulo
+         * sequence than unlock_thread.
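+         * (Illustrative example: with four threads and this_threadId == 2,
+         * waiting threads are preferred in the order 3, 0, 1.)
+         *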
+         *   if unlock_thread is higher than this thread
+         *     thread must be between this thread and unlock_thread
+         *   else
+         *     a thread higher than this thread is ahead of unlock_thread,
+         *     so thread must be lower than unlock_thread
+         */
+        if (thread->k0_lock_state == HEX_LOCK_WAITING) {
+            if (!unlock_thread) {
+                unlock_thread = thread;
+            } else if (unlock_thread->threadId > this_threadId) {
+                if (this_threadId < thread->threadId &&
+                    thread->threadId < unlock_thread->threadId) {
+                    unlock_thread = thread;
+                }
+            } else {
+                if (thread->threadId > this_threadId) {
+                    unlock_thread = thread;
+                }
+                if (thread->threadId < unlock_thread->threadId) {
+                    unlock_thread = thread;
+                }
+            }
+        }
+    }
+    if (unlock_thread) {
+        cs = env_cpu(unlock_thread);
+        unlock_thread->k0_lock_state = HEX_LOCK_QUEUED;
+        SET_SYSCFG_FIELD(unlock_thread, SYSCFG_K0LOCK, 1);
+        cpu_interrupt(cs, CPU_INTERRUPT_K0_UNLOCK);
+    }
+}
+#endif
+
 /* Histogram instructions */
 
 void HELPER(vhist)(CPUHexagonState *env)
@@ -1314,6 +1457,555 @@ void HELPER(vwhist128qm)(CPUHexagonState *env, int32_t uiV)
     }
 }
 
+#ifndef CONFIG_USER_ONLY
+static void hexagon_set_vid(CPUHexagonState *env, uint32_t offset, int val)
+{
+    g_assert((offset == L2VIC_VID_0) || (offset == L2VIC_VID_1));
+    CPUState *cs = env_cpu(env);
+    HexagonCPU *cpu = HEXAGON_CPU(cs);
+    const hwaddr pend_mem = cpu->l2vic_base_addr + offset;
+    cpu_physical_memory_write(pend_mem, &val, sizeof(val));
+}
+
+static void hexagon_clear_last_irq(CPUHexagonState *env, uint32_t offset)
+{
+    /*
+     * The l2vic is currently the only interrupt controller attached, and
+     * it uses VID0; remove the assert in hexagon_set_vid() if another one
+     * is added.
+     */
+    hexagon_set_vid(env, offset, L2VIC_CIAD_INSTRUCTION);
+}
+
+void HELPER(ciad)(CPUHexagonState *env, uint32_t mask)
+{
+    uint32_t ipendad;
+    uint32_t iad;
+
+    BQL_LOCK_GUARD();
+    ipendad = READ_SREG(HEX_SREG_IPENDAD);
+    iad = fGET_FIELD(ipendad, IPENDAD_IAD);
+    fSET_FIELD(ipendad, IPENDAD_IAD, iad & ~(mask));
+    arch_set_system_reg(env, HEX_SREG_IPENDAD, ipendad);
+    hexagon_clear_last_irq(env, L2VIC_VID_0);
+    hex_interrupt_update(env);
+}
+
+void HELPER(siad)(CPUHexagonState *env, uint32_t mask)
+{
+    uint32_t ipendad;
+    uint32_t iad;
+
+    BQL_LOCK_GUARD();
+    ipendad = READ_SREG(HEX_SREG_IPENDAD);
+    iad = fGET_FIELD(ipendad, IPENDAD_IAD);
+    fSET_FIELD(ipendad, IPENDAD_IAD, iad | mask);
+    arch_set_system_reg(env, HEX_SREG_IPENDAD, ipendad);
+    hex_interrupt_update(env);
+}
+
+void HELPER(swi)(CPUHexagonState *env, uint32_t mask)
+{
+    BQL_LOCK_GUARD();
+    hex_raise_interrupts(env, mask, CPU_INTERRUPT_SWI);
+}
+
+void HELPER(cswi)(CPUHexagonState *env, uint32_t mask)
+{
+    BQL_LOCK_GUARD();
+    hex_clear_interrupts(env, mask, CPU_INTERRUPT_SWI);
+}
+
+void HELPER(iassignw)(CPUHexagonState *env, uint32_t src)
+{
+    uint32_t modectl;
+    uint32_t thread_enabled_mask;
+    CPUState *cpu;
+
+    BQL_LOCK_GUARD();
+    modectl = arch_get_system_reg(env, HEX_SREG_MODECTL);
+    thread_enabled_mask = GET_FIELD(MODECTL_E, modectl);
+
+    CPU_FOREACH(cpu) {
+        CPUHexagonState *thread_env = &(HEXAGON_CPU(cpu)->env);
+        uint32_t thread_id_mask = 0x1 << thread_env->threadId;
+        if (thread_enabled_mask & thread_id_mask) {
+            uint32_t imask = arch_get_system_reg(thread_env, HEX_SREG_IMASK);
+            uint32_t intbitpos = (src >> 16) & 0xF;
+            uint32_t val = (src >> thread_env->threadId) & 0x1;
+            imask = deposit32(imask, intbitpos, 1, val);
+            arch_set_system_reg(thread_env, HEX_SREG_IMASK, imask);
+
+            qemu_log_mask(CPU_LOG_INT, "%s: thread " TARGET_FMT_ld
+                          ", new imask 0x%" PRIx32 "\n", __func__,
+                          thread_env->threadId, imask);
+        }
+    }
+    
hex_interrupt_update(env); +} + +uint32_t HELPER(iassignr)(CPUHexagonState *env, uint32_t src) +{ + uint32_t modectl; + uint32_t thread_enabled_mask; + uint32_t intbitpos; + uint32_t dest_reg; + CPUState *cpu; + + BQL_LOCK_GUARD(); + modectl = arch_get_system_reg(env, HEX_SREG_MODECTL); + thread_enabled_mask = GET_FIELD(MODECTL_E, modectl); + /* src fields are in same position as modectl, but mean different things */ + intbitpos = GET_FIELD(MODECTL_W, src); + dest_reg = 0; + CPU_FOREACH(cpu) { + CPUHexagonState *thread_env = &(HEXAGON_CPU(cpu)->env); + uint32_t thread_id_mask = 0x1 << thread_env->threadId; + if (thread_enabled_mask & thread_id_mask) { + uint32_t imask = arch_get_system_reg(thread_env, HEX_SREG_IMASK); + dest_reg |= ((imask >> intbitpos) & 0x1) << thread_env->threadId; + } + } + + return dest_reg; +} + +void HELPER(start)(CPUHexagonState *env, uint32_t imask) +{ + hexagon_start_threads(env, imask); +} + +void HELPER(stop)(CPUHexagonState *env) +{ + hexagon_stop_thread(env); +} + +static inline QEMU_ALWAYS_INLINE void resched(CPUHexagonState *env) +{ + uint32_t schedcfg; + uint32_t schedcfg_en; + int int_number; + CPUState *cs; + uint32_t lowest_th_prio = 0; /* 0 is highest prio */ + uint32_t bestwait_reg; + uint32_t best_prio; + + BQL_LOCK_GUARD(); + qemu_log_mask(CPU_LOG_INT, "%s: check resched\n", __func__); + schedcfg = arch_get_system_reg(env, HEX_SREG_SCHEDCFG); + schedcfg_en = GET_FIELD(SCHEDCFG_EN, schedcfg); + int_number = GET_FIELD(SCHEDCFG_INTNO, schedcfg); + + if (!schedcfg_en) { + return; + } + + CPU_FOREACH(cs) { + HexagonCPU *thread = HEXAGON_CPU(cs); + CPUHexagonState *thread_env = &(thread->env); + uint32_t th_prio = GET_FIELD( + STID_PRIO, arch_get_system_reg(thread_env, HEX_SREG_STID)); + if (!hexagon_thread_is_enabled(thread_env)) { + continue; + } + + lowest_th_prio = (lowest_th_prio > th_prio) + ? lowest_th_prio + : th_prio; + } + + bestwait_reg = arch_get_system_reg(env, HEX_SREG_BESTWAIT); + best_prio = GET_FIELD(BESTWAIT_PRIO, bestwait_reg); + + /* + * If the lowest priority thread is lower priority than the + * value in the BESTWAIT register, we must raise the reschedule + * interrupt on the lowest priority thread. 
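+     * (Illustrative example: thread priorities { 0x20, 0x40 } give a
+     * lowest_th_prio of 0x40; with BESTWAIT.PRIO at 0x30 the resched
+     * interrupt fires and BESTWAIT is reset to 0x1ff below.)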
+     */
+    if (lowest_th_prio > best_prio) {
+        qemu_log_mask(CPU_LOG_INT,
+                      "%s: raising resched int %d, cur PC 0x" TARGET_FMT_lx "\n",
+                      __func__, int_number, arch_get_thread_reg(env, HEX_REG_PC));
+        SET_SYSTEM_FIELD(env, HEX_SREG_BESTWAIT, BESTWAIT_PRIO, 0x1ff);
+        hex_raise_interrupts(env, 1 << int_number, CPU_INTERRUPT_SWI);
+    }
+}
+
+void HELPER(resched)(CPUHexagonState *env)
+{
+    resched(env);
+}
+
+void HELPER(wait)(CPUHexagonState *env, target_ulong PC)
+{
+    BQL_LOCK_GUARD();
+
+    if (!fIN_DEBUG_MODE(env->threadId)) {
+        hexagon_wait_thread(env, PC);
+    }
+}
+
+void HELPER(resume)(CPUHexagonState *env, uint32_t mask)
+{
+    BQL_LOCK_GUARD();
+    hexagon_resume_threads(env, mask);
+}
+
+uint32_t HELPER(getimask)(CPUHexagonState *env, uint32_t tid)
+{
+    CPUState *cs;
+    CPU_FOREACH(cs) {
+        HexagonCPU *found_cpu = HEXAGON_CPU(cs);
+        CPUHexagonState *found_env = &found_cpu->env;
+        if (found_env->threadId == tid) {
+            target_ulong imask = arch_get_system_reg(found_env, HEX_SREG_IMASK);
+            qemu_log_mask(CPU_LOG_INT, "%s: tid %d imask = 0x%x\n",
+                          __func__, found_env->threadId,
+                          (unsigned)GET_FIELD(IMASK_MASK, imask));
+            return GET_FIELD(IMASK_MASK, imask);
+        }
+    }
+    return 0;
+}
+
+void HELPER(setimask)(CPUHexagonState *env, uint32_t pred, uint32_t imask)
+{
+    CPUState *cs;
+
+    BQL_LOCK_GUARD();
+    CPU_FOREACH(cs) {
+        HexagonCPU *found_cpu = HEXAGON_CPU(cs);
+        CPUHexagonState *found_env = &found_cpu->env;
+
+        if (pred == found_env->threadId) {
+            SET_SYSTEM_FIELD(found_env, HEX_SREG_IMASK, IMASK_MASK, imask);
+            qemu_log_mask(CPU_LOG_INT, "%s: tid %d imask 0x%x\n",
+                          __func__, found_env->threadId, imask);
+            hex_interrupt_update(env);
+            return;
+        }
+    }
+    hex_interrupt_update(env);
+}
+
+static bool handle_pmu_sreg_write(CPUHexagonState *env, uint32_t reg,
+                                  uint32_t val)
+{
+    if (reg == HEX_SREG_PMUSTID0 || reg == HEX_SREG_PMUSTID1
+        || reg == HEX_SREG_PMUCFG || reg == HEX_SREG_PMUEVTCFG
+        || reg == HEX_SREG_PMUEVTCFG1
+        || (reg >= HEX_SREG_PMUCNT4 && reg <= HEX_SREG_PMUCNT3)) {
+        qemu_log_mask(LOG_UNIMP, "PMU registers not yet implemented\n");
+        return true;
+    }
+    return false;
+}
+
+static void modify_syscfg(CPUHexagonState *env, uint32_t val)
+{
+    g_assert(bql_locked());
+
+    uint32_t old;
+    uint32_t syscfg_read_only_mask = 0x80001c00;
+    uint32_t syscfg = arch_get_system_reg(env, HEX_SREG_SYSCFG);
+
+    /* Clear read-only bits if they are set in the new value. */
+    val &= ~syscfg_read_only_mask;
+    /* If read-only bits are currently set in syscfg, keep them set.
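+     * (e.g. writing 0xffffffff over syscfg == 0 yields 0x7fffe3ff: bit 31
+     * and bits 12:10 -- mask 0x80001c00 -- always keep their prior value.)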
*/ + val |= (syscfg & syscfg_read_only_mask); + + uint32_t tmp = val; + old = arch_get_system_reg(env, HEX_SREG_SYSCFG); + arch_set_system_reg(env, HEX_SREG_SYSCFG, tmp); + + /* Check for change in MMU enable */ + target_ulong old_mmu_enable = GET_SYSCFG_FIELD(SYSCFG_MMUEN, old); + uint8_t old_en = GET_SYSCFG_FIELD(SYSCFG_PCYCLEEN, old); + uint8_t old_gie = GET_SYSCFG_FIELD(SYSCFG_GIE, old); + target_ulong new_mmu_enable = + GET_SYSCFG_FIELD(SYSCFG_MMUEN, val); + if (new_mmu_enable && !old_mmu_enable) { + hex_mmu_on(env); + } else if (!new_mmu_enable && old_mmu_enable) { + hex_mmu_off(env); + } + + /* Changing pcycle enable from 0 to 1 resets the counters */ + uint8_t new_en = GET_SYSCFG_FIELD(SYSCFG_PCYCLEEN, val); + CPUState *cs; + if (old_en == 0 && new_en == 1) { + CPU_FOREACH(cs) { + CPUHexagonState *_env = cpu_env(cs); + _env->t_cycle_count = 0; + } + } + + /* See if global interrupts are turned on */ + uint8_t new_gie = GET_SYSCFG_FIELD(SYSCFG_GIE, val); + if (!old_gie && new_gie) { + qemu_log_mask(CPU_LOG_INT, "%s: global interrupts enabled\n", __func__); + hex_interrupt_update(env); + } + + if (qemu_loglevel_mask(LOG_UNIMP)) { + int new_v2x = GET_SYSCFG_FIELD(SYSCFG_V2X, val); + if (!new_v2x) { + qemu_log("HVX: 64 byte vector length is unsupported\n"); + } + } +} + +static uint32_t hexagon_find_last_irq(CPUHexagonState *env, uint32_t vid) +{ + int offset = (vid == HEX_SREG_VID) ? L2VIC_VID_0 : L2VIC_VID_1; + CPUState *cs = env_cpu(env); + HexagonCPU *cpu = HEXAGON_CPU(cs); + const hwaddr pend_mem = cpu->l2vic_base_addr + offset; + uint32_t irq; + cpu_physical_memory_read(pend_mem, &irq, sizeof(irq)); + return irq; +} + +static void hexagon_read_timer(CPUHexagonState *env, uint32_t *low, + uint32_t *high) +{ + CPUState *cs = env_cpu(env); + HexagonCPU *cpu = HEXAGON_CPU(cs); + const hwaddr low_addr = cpu->qtimer_base_addr + QCT_QTIMER_CNTPCT_LO; + const hwaddr high_addr = cpu->qtimer_base_addr + QCT_QTIMER_CNTPCT_HI; + + cpu_physical_memory_read(low_addr, low, sizeof(*low)); + cpu_physical_memory_read(high_addr, high, sizeof(*high)); +} + +static inline bool ssr_ce_enabled(CPUHexagonState *env) +{ + target_ulong ssr = arch_get_system_reg(env, HEX_SREG_SSR); + return GET_SSR_FIELD(SSR_CE, ssr); +} + +static uint32_t creg_read(CPUHexagonState *env, uint32_t reg) +{ + uint32_t low, high; + switch (reg) { + case HEX_REG_UPCYCLELO: + return ssr_ce_enabled(env) ? hexagon_get_sys_pcycle_count_low(env) : 0; + case HEX_REG_UPCYCLEHI: + return ssr_ce_enabled(env) ? hexagon_get_sys_pcycle_count_high(env) : 0; + case HEX_REG_UTIMERLO: + hexagon_read_timer(env, &low, &high); + return low; + case HEX_REG_UTIMERHI: + hexagon_read_timer(env, &low, &high); + return high; + default: + return env->gpr[reg]; + } +} + +uint32_t HELPER(creg_read)(CPUHexagonState *env, uint32_t reg) +{ + return creg_read(env, reg); +} + +uint64_t HELPER(creg_read_pair)(CPUHexagonState *env, uint32_t reg) +{ + return (uint64_t)creg_read(env, reg) | + (((uint64_t)creg_read(env, reg + 1)) << 32); +} + + +static inline QEMU_ALWAYS_INLINE void sreg_write(CPUHexagonState *env, + uint32_t reg, uint32_t val) + +{ + g_assert(bql_locked()); + if ((reg == HEX_SREG_VID) || (reg == HEX_SREG_VID1)) { + hexagon_set_vid(env, (reg == HEX_SREG_VID) ? 
L2VIC_VID_0 : L2VIC_VID_1, + val); + arch_set_system_reg(env, reg, val); + } else if (reg == HEX_SREG_SYSCFG) { + modify_syscfg(env, val); + } else if (reg == HEX_SREG_IMASK) { + val = GET_FIELD(IMASK_MASK, val); + arch_set_system_reg(env, reg, val); + } else if (reg == HEX_SREG_PCYCLELO) { + hexagon_set_sys_pcycle_count_low(env, val); + } else if (reg == HEX_SREG_PCYCLEHI) { + hexagon_set_sys_pcycle_count_high(env, val); + } else if (!handle_pmu_sreg_write(env, reg, val)) { + if (reg >= HEX_SREG_GLB_START) { + arch_set_system_reg(env, reg, val); + } else { + arch_set_system_reg(env, reg, val); + } + } +} + +void HELPER(sreg_write)(CPUHexagonState *env, uint32_t reg, uint32_t val) +{ + BQL_LOCK_GUARD(); + sreg_write(env, reg, val); +} + +void hexagon_gdb_sreg_write(CPUHexagonState *env, uint32_t reg, uint32_t val) +{ + BQL_LOCK_GUARD(); + sreg_write(env, reg, val); + /* + * The above is needed to run special logic for regs like syscfg, but it + * won't set read-only bits. This will: + */ + arch_set_system_reg(env, reg, val); +} + +void HELPER(sreg_write_pair)(CPUHexagonState *env, uint32_t reg, uint64_t val) +{ + BQL_LOCK_GUARD(); + sreg_write(env, reg, val & 0xFFFFFFFF); + sreg_write(env, reg + 1, val >> 32); +} + +static inline QEMU_ALWAYS_INLINE uint32_t sreg_read(CPUHexagonState *env, + uint32_t reg) +{ + g_assert(bql_locked()); + if (reg == HEX_SREG_PMUSTID0 || reg == HEX_SREG_PMUSTID1 + || reg == HEX_SREG_PMUCFG || reg == HEX_SREG_PMUEVTCFG + || reg == HEX_SREG_PMUEVTCFG1 + || (reg >= HEX_SREG_PMUCNT4 && reg <= HEX_SREG_PMUCNT3)) { + qemu_log_mask(LOG_UNIMP, "PMU registers not yet implemented"); + return 0; + } + if ((reg == HEX_SREG_VID) || (reg == HEX_SREG_VID1)) { + const uint32_t vid = hexagon_find_last_irq(env, reg); + arch_set_system_reg(env, reg, vid); + } else if ((reg == HEX_SREG_TIMERLO) || (reg == HEX_SREG_TIMERHI)) { + uint32_t low = 0; + uint32_t high = 0; + hexagon_read_timer(env, &low, &high); + arch_set_system_reg(env, HEX_SREG_TIMERLO, low); + arch_set_system_reg(env, HEX_SREG_TIMERHI, high); + } else if (reg == HEX_SREG_BADVA) { + target_ulong ssr = arch_get_system_reg(env, HEX_SREG_SSR); + if (GET_SSR_FIELD(SSR_BVS, ssr)) { + return arch_get_system_reg(env, HEX_SREG_BADVA1); + } + return arch_get_system_reg(env, HEX_SREG_BADVA0); + } + return arch_get_system_reg(env, reg); +} + +uint32_t HELPER(sreg_read)(CPUHexagonState *env, uint32_t reg) +{ + BQL_LOCK_GUARD(); + return sreg_read(env, reg); +} + +uint32_t hexagon_sreg_read(CPUHexagonState *env, uint32_t reg) +{ + return sreg_read(env, reg); +} + +uint64_t HELPER(sreg_read_pair)(CPUHexagonState *env, uint32_t reg) +{ + BQL_LOCK_GUARD(); + if (reg == HEX_SREG_TIMERLO) { + uint32_t low = 0; + uint32_t high = 0; + hexagon_read_timer(env, &low, &high); + arch_set_system_reg(env, HEX_SREG_TIMERLO, low); + arch_set_system_reg(env, HEX_SREG_TIMERHI, high); + } else if (reg == HEX_SREG_PCYCLELO) { + return hexagon_get_sys_pcycle_count(env); + } + return (uint64_t)sreg_read(env, reg) | + (((uint64_t)sreg_read(env, reg + 1)) << 32); +} + +uint32_t HELPER(greg_read)(CPUHexagonState *env, uint32_t reg) + +{ + return hexagon_greg_read(env, reg); +} + +uint64_t HELPER(greg_read_pair)(CPUHexagonState *env, uint32_t reg) + +{ + if (reg == HEX_GREG_G0 || reg == HEX_GREG_G2) { + return (uint64_t)(env->greg[reg]) | + (((uint64_t)(env->greg[reg + 1])) << 32); + } + switch (reg) { + case HEX_GREG_GPCYCLELO: { + target_ulong ssr = arch_get_system_reg(env, HEX_SREG_SSR); + int ssr_ce = GET_SSR_FIELD(SSR_CE, ssr); + return ssr_ce ? 
hexagon_get_sys_pcycle_count(env) : 0; + } + default: + return (uint64_t)hexagon_greg_read(env, reg) | + ((uint64_t)(hexagon_greg_read(env, reg + 1)) << 32); + } +} + +void HELPER(setprio)(CPUHexagonState *env, uint32_t thread, uint32_t prio) +{ + CPUState *cs; + + BQL_LOCK_GUARD(); + CPU_FOREACH(cs) { + HexagonCPU *found_cpu = HEXAGON_CPU(cs); + CPUHexagonState *found_env = &found_cpu->env; + if (thread == found_env->threadId) { + SET_SYSTEM_FIELD(found_env, HEX_SREG_STID, STID_PRIO, prio); + qemu_log_mask(CPU_LOG_INT, "%s: tid %d prio = 0x%x\n", + __func__, found_env->threadId, prio); + resched(env); + return; + } + } + g_assert_not_reached(); +} + +void HELPER(nmi)(CPUHexagonState *env, uint32_t thread_mask) +{ + g_assert_not_reached(); +} + +void HELPER(pending_interrupt)(CPUHexagonState *env) +{ + BQL_LOCK_GUARD(); + hex_interrupt_update(env); +} +#endif + +#ifdef CONFIG_USER_ONLY +uint32_t HELPER(creg_read)(CPUHexagonState *env, uint32_t reg) +{ + /* These are handled directly by gen_read_ctrl_reg(). */ + g_assert(reg != HEX_REG_UPCYCLELO && reg != HEX_REG_UPCYCLEHI); + + if (reg == HEX_REG_UTIMERHI) { + return cpu_get_host_ticks() >> 32; + } else if (reg == HEX_REG_UTIMERLO) { + return extract32(cpu_get_host_ticks(), 0, 32); + } + return 0; +} + +uint64_t HELPER(creg_read_pair)(CPUHexagonState *env, uint32_t reg) +{ + if (reg == HEX_REG_UPCYCLELO) { + /* Pretend SSR[CE] is always set. */ + return hexagon_get_sys_pcycle_count(env); + } + if (reg == HEX_REG_UTIMERLO) { + return cpu_get_host_ticks(); + } + return 0; +} +#endif + + /* These macros can be referenced in the generated helper functions */ #define warn(...) /* Nothing */ #define fatal(...) g_assert_not_reached(); @@ -1321,4 +2013,5 @@ void HELPER(vwhist128qm)(CPUHexagonState *env, int32_t uiV) #define BOGUS_HELPER(tag) \ printf("ERROR: bogus helper: " #tag "\n") +#include "mmvec/kvx_ieee.h" #include "helper_funcs_generated.c.inc" diff --git a/target/hexagon/printinsn.c b/target/hexagon/printinsn.c index 4865cdd133b5..f780092586cf 100644 --- a/target/hexagon/printinsn.c +++ b/target/hexagon/printinsn.c @@ -24,16 +24,17 @@ static const char *sreg2str(unsigned int reg) { - if (reg < TOTAL_PER_THREAD_REGS) { - return hexagon_regnames[reg]; - } else { - return "???"; +#ifndef CONFIG_USER_ONLY + if (reg < NUM_SREGS) { + return hexagon_sregnames[reg]; } +#endif + return "???"; } static const char *creg2str(unsigned int reg) { - return sreg2str(reg + HEX_REG_SA0); + return hexagon_regnames[reg + HEX_REG_SA0]; } static void snprintinsn(GString *buf, Insn *insn) diff --git a/target/hexagon/reg_fields_def.h.inc b/target/hexagon/reg_fields_def.h.inc index f2a58d486c55..50b8c26f8bfa 100644 --- a/target/hexagon/reg_fields_def.h.inc +++ b/target/hexagon/reg_fields_def.h.inc @@ -39,3 +39,110 @@ DEF_REG_FIELD(USR_FPDBZE, 26, 1) DEF_REG_FIELD(USR_FPOVFE, 27, 1) DEF_REG_FIELD(USR_FPUNFE, 28, 1) DEF_REG_FIELD(USR_FPINPE, 29, 1) + +DEF_REG_FIELD(IPENDAD_IAD, 16, 16) +DEF_REG_FIELD(IPENDAD_IPEND, 0, 16) + +DEF_REG_FIELD(SCHEDCFG_EN, 8, 1) +DEF_REG_FIELD(SCHEDCFG_INTNO, 0, 4) +DEF_REG_FIELD(BESTWAIT_PRIO, 0, 10) + + +/* PTE (aka TLB entry) fields */ +DEF_REG_FIELD(PTE_PPD, 0, 24) +DEF_REG_FIELD(PTE_C, 24, 4) +DEF_REG_FIELD(PTE_U, 28, 1) +DEF_REG_FIELD(PTE_R, 29, 1) +DEF_REG_FIELD(PTE_W, 30, 1) +DEF_REG_FIELD(PTE_X, 31, 1) +DEF_REG_FIELD(PTE_VPN, 32, 20) +DEF_REG_FIELD(PTE_ASID, 52, 7) +DEF_REG_FIELD(PTE_ATR0, 59, 1) +DEF_REG_FIELD(PTE_ATR1, 60, 1) +DEF_REG_FIELD(PTE_PA35, 61, 1) +DEF_REG_FIELD(PTE_G, 62, 1) +DEF_REG_FIELD(PTE_V, 63, 1) + +/* 
SYSCFG fields */ +DEF_REG_FIELD(SYSCFG_MMUEN, 0, 1) +DEF_REG_FIELD(SYSCFG_ICEN, 1, 1) +DEF_REG_FIELD(SYSCFG_DCEN, 2, 1) +DEF_REG_FIELD(SYSCFG_ISDBTRUSTED, 3, 1) +DEF_REG_FIELD(SYSCFG_GIE, 4, 1) +DEF_REG_FIELD(SYSCFG_ISDBREADY, 5, 1) +DEF_REG_FIELD(SYSCFG_PCYCLEEN, 6, 1) +DEF_REG_FIELD(SYSCFG_V2X, 7, 1) +DEF_REG_FIELD(SYSCFG_IGNOREDABORT, 8, 1) +DEF_REG_FIELD(SYSCFG_PM, 9, 1) +DEF_REG_FIELD(SYSCFG_TLBLOCK, 11, 1) +DEF_REG_FIELD(SYSCFG_K0LOCK, 12, 1) +DEF_REG_FIELD(SYSCFG_BQ, 13, 1) +DEF_REG_FIELD(SYSCFG_PRIO, 14, 1) +DEF_REG_FIELD(SYSCFG_DMT, 15, 1) +DEF_REG_FIELD(SYSCFG_L2CFG, 16, 3) +DEF_REG_FIELD(SYSCFG_ITCM, 19, 1) +DEF_REG_FIELD(SYSCFG_L2NWA, 21, 1) +DEF_REG_FIELD(SYSCFG_L2NRA, 22, 1) +DEF_REG_FIELD(SYSCFG_L2WB, 23, 1) +DEF_REG_FIELD(SYSCFG_L2P, 24, 1) +DEF_REG_FIELD(SYSCFG_SLVCTL0, 25, 2) +DEF_REG_FIELD(SYSCFG_SLVCTL1, 27, 2) +DEF_REG_FIELD(SYSCFG_L2PARTSIZE, 29, 2) +DEF_REG_FIELD(SYSCFG_L2GCA, 31, 1) + +/* SSR fields */ +DEF_REG_FIELD(SSR_CAUSE, 0, 8) +DEF_REG_FIELD(SSR_ASID, 8, 7) +DEF_REG_FIELD(SSR_UM, 16, 1) +DEF_REG_FIELD(SSR_EX, 17, 1) +DEF_REG_FIELD(SSR_IE, 18, 1) +DEF_REG_FIELD(SSR_GM, 19, 1) +DEF_REG_FIELD(SSR_V0, 20, 1) +DEF_REG_FIELD(SSR_V1, 21, 1) +DEF_REG_FIELD(SSR_BVS, 22, 1) +DEF_REG_FIELD(SSR_CE, 23, 1) +DEF_REG_FIELD(SSR_PE, 24, 1) +DEF_REG_FIELD(SSR_BP, 25, 1) +DEF_REG_FIELD(SSR_XE2, 26, 1) +DEF_REG_FIELD(SSR_XA, 27, 3) +DEF_REG_FIELD(SSR_SS, 30, 1) +DEF_REG_FIELD(SSR_XE, 31, 1) + +/* misc registers */ +DEF_REG_FIELD(IMASK_MASK, 0, 16) + +DEF_REG_FIELD(STID_PRIO, 16, 8) +DEF_REG_FIELD(STID_STID, 0, 8) + +/* MODECTL fields */ +DEF_REG_FIELD(MODECTL_E, 0, 8) +DEF_REG_FIELD(MODECTL_W, 16, 8) + +DEF_REG_FIELD(CCR_L1ICP, 0, 2) +DEF_REG_FIELD(CCR_L1DCP, 3, 2) +DEF_REG_FIELD(CCR_L2CP, 6, 2) + +DEF_REG_FIELD(CCR_HFI, 16, 1) +DEF_REG_FIELD(CCR_HFD, 17, 1) +DEF_REG_FIELD(CCR_HFIL2, 18, 1) +DEF_REG_FIELD(CCR_HFDL2, 19, 1) +DEF_REG_FIELD(CCR_SFD, 20, 1) + +DEF_REG_FIELD(CCR_GIE, 24, 1) +DEF_REG_FIELD(CCR_GTE, 25, 1) +DEF_REG_FIELD(CCR_GEE, 26, 1) +DEF_REG_FIELD(CCR_GRE, 27, 1) +DEF_REG_FIELD(CCR_VV1, 29, 1) +DEF_REG_FIELD(CCR_VV2, 30, 1) +DEF_REG_FIELD(CCR_VV3, 31, 1) + +/* ISDB ST fields */ +DEF_REG_FIELD(ISDBST_WAITRUN, 24, 8) +DEF_REG_FIELD(ISDBST_ONOFF, 16, 8) +DEF_REG_FIELD(ISDBST_DEBUGMODE, 8, 8) +DEF_REG_FIELD(ISDBST_STUFFSTATUS, 5, 1) +DEF_REG_FIELD(ISDBST_CMDSTATUS, 4, 1) +DEF_REG_FIELD(ISDBST_PROCMODE, 3, 1) +DEF_REG_FIELD(ISDBST_MBXINSTATUS, 2, 1) +DEF_REG_FIELD(ISDBST_MBXOUTSTATUS, 1, 1) +DEF_REG_FIELD(ISDBST_READY, 0, 1) diff --git a/target/hexagon/sys_macros.h b/target/hexagon/sys_macros.h new file mode 100644 index 000000000000..e5dc1ce0ab9f --- /dev/null +++ b/target/hexagon/sys_macros.h @@ -0,0 +1,238 @@ +/* + * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HEXAGON_SYS_MACROS_H +#define HEXAGON_SYS_MACROS_H + +/* + * Macro definitions for Hexagon system mode + */ + +#ifndef CONFIG_USER_ONLY + +#define READ_SREG(NUM) arch_get_system_reg(env, NUM) +#define READ_SGP0() arch_get_system_reg(env, HEX_SREG_SGP0) +#define READ_SGP1() arch_get_system_reg(env, HEX_SREG_SGP1) +#define READ_SGP10() ((uint64_t)arch_get_system_reg(env, HEX_SREG_SGP0) | \ + ((uint64_t)arch_get_system_reg(env, HEX_SREG_SGP1) << 32)) + +#define WRITE_SREG(NUM, VAL) log_sreg_write(env, NUM, VAL, slot) +#define WRITE_SGP0(VAL) log_sreg_write(env, HEX_SREG_SGP0, VAL, slot) +#define WRITE_SGP1(VAL) log_sreg_write(env, HEX_SREG_SGP1, VAL, slot) +#define WRITE_SGP10(VAL) \ + do { \ + log_sreg_write(env, HEX_SREG_SGP0, (VAL) & 0xFFFFFFFF, slot); \ + log_sreg_write(env, HEX_SREG_SGP1, (VAL) >> 32, slot); \ + } while (0) + +#ifdef QEMU_GENERATE +#define GET_SSR_FIELD(RES, FIELD) \ + GET_FIELD(RES, FIELD, hex_t_sreg[HEX_SREG_SSR]) +#else + +#define GET_SSR_FIELD(FIELD, REGIN) \ + (uint32_t)GET_FIELD(FIELD, REGIN) +#define GET_SYSCFG_FIELD(FIELD, REGIN) \ + (uint32_t)GET_FIELD(FIELD, REGIN) +#define SET_SYSTEM_FIELD(ENV, REG, FIELD, VAL) \ + do { \ + uint32_t regval = arch_get_system_reg(ENV, REG); \ + fINSERT_BITS(regval, reg_field_info[FIELD].width, \ + reg_field_info[FIELD].offset, (VAL)); \ + arch_set_system_reg(ENV, REG, regval); \ + } while (0) +#define SET_SSR_FIELD(ENV, FIELD, VAL) \ + SET_SYSTEM_FIELD(ENV, HEX_SREG_SSR, FIELD, VAL) +#define SET_SYSCFG_FIELD(ENV, FIELD, VAL) \ + SET_SYSTEM_FIELD(ENV, HEX_SREG_SYSCFG, FIELD, VAL) + +#define CCR_FIELD_SET(ENV, FIELD) \ + (!!GET_FIELD(FIELD, arch_get_system_reg(ENV, HEX_SREG_CCR))) + +/* + * Direct-to-guest is not implemented yet, continuing would cause unexpected + * behavior, so we abort. 
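+ * (Guest-mode delivery would be selected by the CCR.GTE/GEE/GIE bits
+ * checked below, so the asserts fail fast if a guest enables them.)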
+ */ +#define ASSERT_DIRECT_TO_GUEST_UNSET(ENV, EXCP) \ + do { \ + switch (EXCP) { \ + case HEX_EVENT_TRAP0: \ + g_assert(!CCR_FIELD_SET(ENV, CCR_GTE)); \ + break; \ + case HEX_EVENT_IMPRECISE: \ + case HEX_EVENT_PRECISE: \ + case HEX_EVENT_FPTRAP: \ + g_assert(!CCR_FIELD_SET(ENV, CCR_GEE)); \ + break; \ + default: \ + if ((EXCP) >= HEX_EVENT_INT0) { \ + g_assert(!CCR_FIELD_SET(ENV, CCR_GIE)); \ + } \ + break; \ + } \ + } while (0) +#endif + +#define fREAD_ELR() (READ_SREG(HEX_SREG_ELR)) + +#define fLOAD_PHYS(NUM, SIZE, SIGN, SRC1, SRC2, DST) { \ + const uintptr_t rs = ((unsigned long)(unsigned)(SRC1)) & 0x7ff; \ + const uintptr_t rt = ((unsigned long)(unsigned)(SRC2)) << 11; \ + const uintptr_t addr = rs + rt; \ + cpu_physical_memory_read(addr, &DST, sizeof(uint32_t)); \ +} + +#define fPOW2_HELP_ROUNDUP(VAL) \ + ((VAL) | \ + ((VAL) >> 1) | \ + ((VAL) >> 2) | \ + ((VAL) >> 4) | \ + ((VAL) >> 8) | \ + ((VAL) >> 16)) +#define fPOW2_ROUNDUP(VAL) (fPOW2_HELP_ROUNDUP((VAL) - 1) + 1) + +#define fFRAMECHECK(ADDR, EA) g_assert_not_reached(); + +#define fTRAP(TRAPTYPE, IMM) \ + register_trap_exception(env, TRAPTYPE, IMM, PC) + +#define fVIRTINSN_SPSWAP(IMM, REG) +#define fVIRTINSN_GETIE(IMM, REG) { REG = 0xdeafbeef; } +#define fVIRTINSN_SETIE(IMM, REG) +#define fVIRTINSN_RTE(IMM, REG) +#define fGRE_ENABLED() GET_FIELD(CCR_GRE, READ_SREG(HEX_SREG_CCR)) +#define fTRAP1_VIRTINSN(IMM) \ + (fGRE_ENABLED() && \ + (((IMM) == 1) || ((IMM) == 3) || ((IMM) == 4) || ((IMM) == 6))) + +/* Not modeled in qemu */ + +#define MARK_LATE_PRED_WRITE(RNUM) +#define fICINVIDX(REG) +#define fICKILL() +#define fDCKILL() +#define fL2KILL() +#define fL2UNLOCK() +#define fL2CLEAN() +#define fL2CLEANINV() +#define fL2CLEANPA(REG) +#define fL2CLEANINVPA(REG) +#define fL2CLEANINVIDX(REG) +#define fL2CLEANIDX(REG) +#define fL2INVIDX(REG) +#define fL2TAGR(INDEX, DST, DSTREG) +#define fL2UNLOCKA(VA) ((void) VA) +#define fL2TAGW(INDEX, PART2) +#define fDCCLEANIDX(REG) +#define fDCCLEANINVIDX(REG) + +/* Always succeed: */ +#define fL2LOCKA(EA, PDV, PDN) ((void) EA, PDV = 0xFF) +#define fCLEAR_RTE_EX() \ + do { \ + uint32_t tmp = 0; \ + tmp = arch_get_system_reg(env, HEX_SREG_SSR); \ + fINSERT_BITS(tmp, reg_field_info[SSR_EX].width, \ + reg_field_info[SSR_EX].offset, 0); \ + log_sreg_write(env, HEX_SREG_SSR, tmp, slot); \ + } while (0) + +#define fDCINVIDX(REG) +#define fDCINVA(REG) do { REG = REG; } while (0) /* Nothing to do in qemu */ + +#define fSET_TLB_LOCK() hex_tlb_lock(env); +#define fCLEAR_TLB_LOCK() hex_tlb_unlock(env); + +#define fSET_K0_LOCK() hex_k0_lock(env); +#define fCLEAR_K0_LOCK() hex_k0_unlock(env); + +#define fTLB_IDXMASK(INDEX) \ + ((INDEX) & (fPOW2_ROUNDUP(fCAST4u(env_archcpu(env)->num_tlbs)) - 1)) + +#define fTLB_NONPOW2WRAP(INDEX) \ + (((INDEX) >= env_archcpu(env)->num_tlbs) ? 
\ + ((INDEX) - env_archcpu(env)->num_tlbs) : \ + (INDEX)) + + +#define fTLBW(INDEX, VALUE) \ + hex_tlbw(env, (INDEX), (VALUE)) +#define fTLBW_EXTENDED(INDEX, VALUE) \ + hex_tlbw(env, (INDEX), (VALUE)) +#define fTLB_ENTRY_OVERLAP(VALUE) \ + (hex_tlb_check_overlap(env, VALUE, -1) != -2) +#define fTLB_ENTRY_OVERLAP_IDX(VALUE) \ + hex_tlb_check_overlap(env, VALUE, -1) +#define fTLBR(INDEX) \ + (env->hex_tlb->entries[fTLB_NONPOW2WRAP(fTLB_IDXMASK(INDEX))]) +#define fTLBR_EXTENDED(INDEX) \ + (env->hex_tlb->entries[fTLB_NONPOW2WRAP(fTLB_IDXMASK(INDEX))]) +#define fTLBP(TLBHI) \ + hex_tlb_lookup(env, ((TLBHI) >> 12), ((TLBHI) << 12)) +#define iic_flush_cache(p) + +#define fIN_DEBUG_MODE(TNUM) \ + ((GET_FIELD(ISDBST_DEBUGMODE, arch_get_system_reg(env, HEX_SREG_ISDBST)) \ + & (0x1 << (TNUM))) != 0) + +#define fIN_DEBUG_MODE_NO_ISDB(TNUM) false +#define fIN_DEBUG_MODE_WARN(TNUM) false + +#ifdef QEMU_GENERATE + +/* + * Read tags back as zero for now: + * + * tag value in RD[31:10] for 32k, RD[31:9] for 16k + */ +#define fICTAGR(RS, RD, RD2) \ + do { \ + RD = ctx->zero; \ + } while (0) +#define fICTAGW(RS, RD) +#define fICDATAR(RS, RD) \ + do { \ + RD = ctx->zero; \ + } while (0) +#define fICDATAW(RS, RD) + +#define fDCTAGW(RS, RT) +/* tag: RD[23:0], state: RD[30:29] */ +#define fDCTAGR(INDEX, DST, DST_REG_NUM) \ + do { \ + DST = ctx->zero; \ + } while (0) +#else + +/* + * Read tags back as zero for now: + * + * tag value in RD[31:10] for 32k, RD[31:9] for 16k + */ +#define fICTAGR(RS, RD, RD2) \ + do { \ + RD = 0x00; \ + } while (0) +#define fICTAGW(RS, RD) +#define fICDATAR(RS, RD) \ + do { \ + RD = 0x00; \ + } while (0) +#define fICDATAW(RS, RD) + +#define fDCTAGW(RS, RT) +/* tag: RD[23:0], state: RD[30:29] */ +#define fDCTAGR(INDEX, DST, DST_REG_NUM) \ + do { \ + DST = HEX_DC_STATE_INVALID | 0x00; \ + } while (0) +#endif + +#endif + +#define NUM_TLB_REGS(x) (env_archcpu(env)->num_tlbs) + +#endif diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c index fe7858703c8c..35765d48ba11 100644 --- a/target/hexagon/translate.c +++ b/target/hexagon/translate.c @@ -49,6 +49,7 @@ static const AnalyzeInsn opcode_analyze[XX_LAST_OPCODE] = { TCGv hex_gpr[TOTAL_PER_THREAD_REGS]; TCGv hex_pred[NUM_PREGS]; TCGv hex_slot_cancelled; +TCGv hex_next_PC; TCGv hex_new_value_usr; TCGv hex_store_addr[STORES_MAX]; TCGv hex_store_width[STORES_MAX]; @@ -57,9 +58,19 @@ TCGv_i64 hex_store_val64[STORES_MAX]; TCGv hex_llsc_addr; TCGv hex_llsc_val; TCGv_i64 hex_llsc_val_i64; +TCGv_i64 hex_cycle_count; TCGv hex_vstore_addr[VSTORES_MAX]; TCGv hex_vstore_size[VSTORES_MAX]; TCGv hex_vstore_pending[VSTORES_MAX]; +static bool need_next_PC(DisasContext *ctx); + +#ifndef CONFIG_USER_ONLY +TCGv hex_greg[NUM_GREGS]; +TCGv hex_t_sreg[NUM_SREGS]; +TCGv_ptr hex_g_sreg_ptr; +TCGv hex_g_sreg[NUM_SREGS]; +TCGv hex_cause_code; +#endif static const char * const hexagon_prednames[] = { "p0", "p1", "p2", "p3" @@ -113,11 +124,28 @@ intptr_t ctx_tmp_vreg_off(DisasContext *ctx, int regnum, return offset; } -static void gen_exception_raw(int excp) +static void gen_exception(int excp, target_ulong PC) +{ + gen_helper_raise_exception(tcg_env, tcg_constant_i32(excp), + tcg_constant_tl(PC)); +} + +#ifndef CONFIG_USER_ONLY +static inline void gen_precise_exception(int excp, target_ulong PC) { - gen_helper_raise_exception(tcg_env, tcg_constant_i32(excp)); + tcg_gen_movi_tl(hex_cause_code, excp); + gen_exception(HEX_EVENT_PRECISE, PC); } +static inline void gen_pcycle_counters(DisasContext *ctx) +{ + if (ctx->pcycle_enabled) { + 
tcg_gen_addi_i64(hex_cycle_count, hex_cycle_count, ctx->num_cycles);
+        ctx->num_cycles = 0;
+    }
+}
+#endif
+
 static void gen_exec_counters(DisasContext *ctx)
 {
     tcg_gen_addi_tl(hex_gpr[HEX_REG_QEMU_PKT_CNT],
@@ -126,6 +154,10 @@ static void gen_exec_counters(DisasContext *ctx)
                     hex_gpr[HEX_REG_QEMU_INSN_CNT], ctx->num_insns);
     tcg_gen_addi_tl(hex_gpr[HEX_REG_QEMU_HVX_CNT],
                     hex_gpr[HEX_REG_QEMU_HVX_CNT], ctx->num_hvx_insns);
+
+#ifndef CONFIG_USER_ONLY
+    gen_pcycle_counters(ctx);
+#endif
 }
 
 static bool use_goto_tb(DisasContext *ctx, target_ulong dest)
@@ -156,6 +188,9 @@ static void gen_end_tb(DisasContext *ctx)
 
     gen_exec_counters(ctx);
 
+    if (ctx->need_next_pc) {
+        tcg_gen_mov_tl(hex_gpr[HEX_REG_PC], hex_next_PC);
+    }
     if (ctx->branch_cond != TCG_COND_NEVER) {
         if (ctx->branch_cond != TCG_COND_ALWAYS) {
             TCGLabel *skip = gen_new_label();
@@ -185,13 +220,14 @@ static void gen_end_tb(DisasContext *ctx)
     ctx->base.is_jmp = DISAS_NORETURN;
 }
 
-static void gen_exception_end_tb(DisasContext *ctx, int excp)
+void hex_gen_exception_end_tb(DisasContext *ctx, int excp)
 {
-    gen_exec_counters(ctx);
-    tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], ctx->next_PC);
-    gen_exception_raw(excp);
+#ifdef CONFIG_USER_ONLY
+    gen_exception(excp, ctx->pkt->pc);
+#else
+    gen_precise_exception(excp, ctx->pkt->pc);
+#endif
     ctx->base.is_jmp = DISAS_NORETURN;
-
 }
 
 static int read_packet_words(CPUHexagonState *env, DisasContext *ctx,
@@ -232,6 +268,18 @@ static bool check_for_attrib(Packet *pkt, int attrib)
     return false;
 }
 
+#ifndef CONFIG_USER_ONLY
+static bool check_for_opcode(Packet *pkt, uint16_t opcode)
+{
+    for (int i = 0; i < pkt->num_insns; i++) {
+        if (pkt->insn[i].opcode == opcode) {
+            return true;
+        }
+    }
+    return false;
+}
+#endif
+
 static bool need_slot_cancelled(Packet *pkt)
 {
     /* We only need slot_cancelled for conditional store instructions */
@@ -245,21 +293,111 @@ static bool need_slot_cancelled(Packet *pkt)
     return false;
 }
 
-static bool need_next_PC(DisasContext *ctx)
+#ifndef CONFIG_USER_ONLY
+static bool sreg_write_to_global(int reg_num)
 {
-    Packet *pkt = ctx->pkt;
+    return reg_num == HEX_SREG_SSR ||
+           reg_num == HEX_SREG_STID ||
+           reg_num == HEX_SREG_IMASK ||
+           reg_num == HEX_SREG_IPENDAD ||
+           reg_num == HEX_SREG_BESTWAIT ||
+           reg_num == HEX_SREG_SCHEDCFG;
+}
 
-    /* Check for conditional control flow or HW loop end */
+static bool has_sreg_write_to_global(Packet const *pkt)
+{
     for (int i = 0; i < pkt->num_insns; i++) {
-        uint16_t opcode = pkt->insn[i].opcode;
-        if (GET_ATTRIB(opcode, A_CONDEXEC) && GET_ATTRIB(opcode, A_COF)) {
-            return true;
+        Insn const *insn = &pkt->insn[i];
+        uint16_t opcode = insn->opcode;
+        if (opcode == Y2_tfrsrcr) {
+            /* Write to a single sreg */
+            int reg_num = insn->regno[0];
+            if (sreg_write_to_global(reg_num)) {
+                return true;
+            }
+        } else if (opcode == Y4_tfrspcp) {
+            /* Write to a sreg pair */
+            int reg_num = insn->regno[0];
+            if (sreg_write_to_global(reg_num)) {
+                return true;
+            }
+            if (sreg_write_to_global(reg_num + 1)) {
+                return true;
+            }
         }
-        if (GET_ATTRIB(opcode, A_HWLOOP0_END) ||
-            GET_ATTRIB(opcode, A_HWLOOP1_END)) {
-            return true;
+    }
+    return false;
+}
+#endif
+
+static bool pkt_ends_tb(Packet *pkt)
+{
+    if (pkt->pkt_has_cof) {
+        return true;
+    }
+#ifndef CONFIG_USER_ONLY
+    /* System mode instructions that end the TB */
+    if (check_for_opcode(pkt, Y2_swi) ||
+        check_for_opcode(pkt, Y2_cswi) ||
+        check_for_opcode(pkt, Y2_ciad) ||
+        check_for_opcode(pkt, Y4_siad) ||
+        check_for_opcode(pkt, Y2_wait) ||
+        check_for_opcode(pkt, Y2_resume) ||
+        check_for_opcode(pkt, Y2_iassignw) ||
+        
check_for_opcode(pkt, Y2_setimask) || + check_for_opcode(pkt, Y4_nmi) || + check_for_opcode(pkt, Y2_setprio) || + check_for_opcode(pkt, Y2_start) || + check_for_opcode(pkt, Y2_stop) || + check_for_opcode(pkt, Y2_k0lock) || + check_for_opcode(pkt, Y2_k0unlock) || + check_for_opcode(pkt, Y2_tlblock) || + check_for_opcode(pkt, Y2_tlbunlock) || + check_for_opcode(pkt, Y2_break) || + check_for_opcode(pkt, Y2_isync) || + check_for_opcode(pkt, Y2_syncht) || + check_for_opcode(pkt, Y2_tlbp) || + check_for_opcode(pkt, Y2_tlbw) || + check_for_opcode(pkt, Y5_ctlbw) || + check_for_opcode(pkt, Y5_tlbasidi)) { + return true; + } + + /* + * Check for sreg writes that would end the TB + */ + if (check_for_attrib(pkt, A_IMPLICIT_WRITES_SSR)) { + return true; + } + if (has_sreg_write_to_global(pkt)) { + return true; + } +#endif + return false; +} + + +static bool need_next_PC(DisasContext *ctx) +{ + Packet *pkt = ctx->pkt; + if (pkt->pkt_has_cof || ctx->pkt_ends_tb) { + for (int i = 0; i < pkt->num_insns; i++) { + uint16_t opcode = pkt->insn[i].opcode; + if ((GET_ATTRIB(opcode, A_CONDEXEC) && GET_ATTRIB(opcode, A_COF)) || + GET_ATTRIB(opcode, A_HWLOOP0_END) || + GET_ATTRIB(opcode, A_HWLOOP1_END)) { + return true; + } } } + /* + * We end the TB on some instructions that do not change the flow (for + * other reasons). In these cases, we must set pc too, as the insn won't + * do it themselves. + */ + if (ctx->pkt_ends_tb && !check_for_attrib(pkt, A_COF)) { + return true; + } return false; } @@ -291,6 +429,16 @@ static void mark_implicit_reg_write(DisasContext *ctx, int attrib, int rnum) } } +#ifndef CONFIG_USER_ONLY +static void mark_implicit_sreg_write(DisasContext *ctx, int attrib, int snum) +{ + uint16_t opcode = ctx->insn->opcode; + if (GET_ATTRIB(opcode, attrib)) { + ctx_log_sreg_write(ctx, snum); + } +} +#endif + static void mark_implicit_reg_writes(DisasContext *ctx) { mark_implicit_reg_write(ctx, A_IMPLICIT_WRITES_FP, HEX_REG_FP); @@ -302,6 +450,12 @@ static void mark_implicit_reg_writes(DisasContext *ctx) mark_implicit_reg_write(ctx, A_IMPLICIT_WRITES_SA1, HEX_REG_SA1); mark_implicit_reg_write(ctx, A_IMPLICIT_WRITES_USR, HEX_REG_USR); mark_implicit_reg_write(ctx, A_FPOP, HEX_REG_USR); + +#ifndef CONFIG_USER_ONLY + mark_implicit_sreg_write(ctx, A_IMPLICIT_WRITES_SGP0, HEX_SREG_SGP0); + mark_implicit_sreg_write(ctx, A_IMPLICIT_WRITES_SGP1, HEX_SREG_SGP1); + mark_implicit_sreg_write(ctx, A_IMPLICIT_WRITES_SSR, HEX_SREG_SSR); +#endif } static void mark_implicit_pred_write(DisasContext *ctx, int attrib, int pnum) @@ -400,7 +554,14 @@ static void analyze_packet(DisasContext *ctx) static void gen_start_packet(DisasContext *ctx) { Packet *pkt = ctx->pkt; +#ifndef CONFIG_USER_ONLY + target_ulong next_PC = (check_for_opcode(pkt, Y2_k0lock) || + check_for_opcode(pkt, Y2_tlblock)) ? + ctx->base.pc_next : + ctx->base.pc_next + pkt->encod_pkt_size_in_bytes; +#else target_ulong next_PC = ctx->base.pc_next + pkt->encod_pkt_size_in_bytes; +#endif int i; /* Clear out the disassembly context */ @@ -408,6 +569,10 @@ static void gen_start_packet(DisasContext *ctx) ctx->reg_log_idx = 0; bitmap_zero(ctx->regs_written, TOTAL_PER_THREAD_REGS); bitmap_zero(ctx->predicated_regs, TOTAL_PER_THREAD_REGS); +#ifndef CONFIG_USER_ONLY + ctx->greg_log_idx = 0; + ctx->sreg_log_idx = 0; +#endif ctx->preg_log_idx = 0; bitmap_zero(ctx->pregs_written, NUM_PREGS); ctx->future_vregs_idx = 0; @@ -440,6 +605,25 @@ static void gen_start_packet(DisasContext *ctx) * gen phase, so clear it again. 
*/ bitmap_zero(ctx->pregs_written, NUM_PREGS); +#ifndef CONFIG_USER_ONLY + for (i = 0; i < NUM_SREGS; i++) { + ctx->t_sreg_new_value[i] = NULL; + } + for (i = 0; i < ctx->sreg_log_idx; i++) { + int reg_num = ctx->sreg_log[i]; + if (reg_num < HEX_SREG_GLB_START) { + ctx->t_sreg_new_value[reg_num] = tcg_temp_new(); + tcg_gen_mov_tl(ctx->t_sreg_new_value[reg_num], hex_t_sreg[reg_num]); + } + } + for (i = 0; i < NUM_GREGS; i++) { + ctx->greg_new_value[i] = NULL; + } + for (i = 0; i < ctx->greg_log_idx; i++) { + int reg_num = ctx->greg_log[i]; + ctx->greg_new_value[reg_num] = tcg_temp_new(); + } +#endif /* Initialize the runtime state for packet semantics */ if (need_slot_cancelled(pkt)) { @@ -448,12 +632,14 @@ static void gen_start_packet(DisasContext *ctx) ctx->branch_taken = NULL; if (pkt->pkt_has_cof) { ctx->branch_taken = tcg_temp_new(); - if (pkt->pkt_has_multi_cof) { - tcg_gen_movi_tl(ctx->branch_taken, 0); - } - if (need_next_PC(ctx)) { - tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], next_PC); - } + } + if (pkt->pkt_has_multi_cof) { + tcg_gen_movi_tl(ctx->branch_taken, 0); + } + ctx->pkt_ends_tb = pkt_ends_tb(pkt); + ctx->need_next_pc = need_next_PC(ctx); + if (ctx->need_next_pc) { + tcg_gen_movi_tl(hex_next_PC, next_PC); } /* Preload the predicated registers into get_result_gpr(ctx, i) */ @@ -558,7 +744,7 @@ static void gen_insn(DisasContext *ctx) ctx->insn->generate(ctx); mark_store_width(ctx); } else { - gen_exception_end_tb(ctx, HEX_CAUSE_INVALID_OPCODE); + hex_gen_exception_end_tb(ctx, HEX_CAUSE_INVALID_OPCODE); } } @@ -589,6 +775,59 @@ static void gen_reg_writes(DisasContext *ctx) } } +#ifndef CONFIG_USER_ONLY +static void gen_greg_writes(DisasContext *ctx) +{ + int i; + + for (i = 0; i < ctx->greg_log_idx; i++) { + int reg_num = ctx->greg_log[i]; + + tcg_gen_mov_tl(hex_greg[reg_num], ctx->greg_new_value[reg_num]); + } +} + + +static void gen_sreg_writes(DisasContext *ctx) +{ + int i; + + TCGv old_reg = tcg_temp_new(); + for (i = 0; i < ctx->sreg_log_idx; i++) { + int reg_num = ctx->sreg_log[i]; + + if (reg_num == HEX_SREG_SSR) { + tcg_gen_mov_tl(old_reg, hex_t_sreg[reg_num]); + tcg_gen_mov_tl(hex_t_sreg[reg_num], ctx->t_sreg_new_value[reg_num]); + gen_helper_modify_ssr(tcg_env, ctx->t_sreg_new_value[reg_num], + old_reg); + /* This can change processor state, so end the TB */ + ctx->base.is_jmp = DISAS_NORETURN; + } else if ((reg_num == HEX_SREG_STID) || + (reg_num == HEX_SREG_IMASK) || + (reg_num == HEX_SREG_IPENDAD)) { + if (reg_num < HEX_SREG_GLB_START) { + tcg_gen_mov_tl(old_reg, hex_t_sreg[reg_num]); + tcg_gen_mov_tl(hex_t_sreg[reg_num], + ctx->t_sreg_new_value[reg_num]); + } + /* This can change the interrupt state, so end the TB */ + gen_helper_pending_interrupt(tcg_env); + ctx->base.is_jmp = DISAS_NORETURN; + } else if ((reg_num == HEX_SREG_BESTWAIT) || + (reg_num == HEX_SREG_SCHEDCFG)) { + /* This can trigger resched interrupt, so end the TB */ + gen_helper_resched(tcg_env); + ctx->base.is_jmp = DISAS_NORETURN; + } + + if (reg_num < HEX_SREG_GLB_START) { + tcg_gen_mov_tl(hex_t_sreg[reg_num], ctx->t_sreg_new_value[reg_num]); + } + } +} +#endif + static void gen_pred_writes(DisasContext *ctx) { /* Early exit if not needed or the log is empty */ @@ -693,11 +932,11 @@ static void process_store_log(DisasContext *ctx) * the memory accesses overlap. 
*/ Packet *pkt = ctx->pkt; - if (pkt->pkt_has_store_s1) { + if (pkt->pkt_has_scalar_store_s1) { g_assert(!pkt->pkt_has_dczeroa); process_store(ctx, 1); } - if (pkt->pkt_has_store_s0) { + if (pkt->pkt_has_scalar_store_s0) { g_assert(!pkt->pkt_has_dczeroa); process_store(ctx, 0); } @@ -779,6 +1018,7 @@ static void gen_commit_hvx(DisasContext *ctx) } } +static const int PCYCLES_PER_PACKET = 3; static void update_exec_counters(DisasContext *ctx) { Packet *pkt = ctx->pkt; @@ -798,6 +1038,7 @@ static void update_exec_counters(DisasContext *ctx) } ctx->num_packets++; + ctx->num_cycles += PCYCLES_PER_PACKET; ctx->num_insns += num_real_insns; ctx->num_hvx_insns += num_hvx_insns; } @@ -822,8 +1063,9 @@ static void gen_commit_packet(DisasContext *ctx) * involved in committing the packet. */ Packet *pkt = ctx->pkt; - bool has_store_s0 = pkt->pkt_has_store_s0; - bool has_store_s1 = (pkt->pkt_has_store_s1 && !ctx->s1_store_processed); + bool has_store_s0 = pkt->pkt_has_scalar_store_s0; + bool has_store_s1 = + (pkt->pkt_has_scalar_store_s1 && !ctx->s1_store_processed); bool has_hvx_store = pkt_has_hvx_store(pkt); if (pkt->pkt_has_dczeroa) { /* @@ -886,6 +1128,10 @@ static void gen_commit_packet(DisasContext *ctx) process_store_log(ctx); gen_reg_writes(ctx); +#if !defined(CONFIG_USER_ONLY) + gen_greg_writes(ctx); + gen_sreg_writes(ctx); +#endif gen_pred_writes(ctx); if (pkt->pkt_has_hvx) { gen_commit_hvx(ctx); @@ -898,7 +1144,7 @@ static void gen_commit_packet(DisasContext *ctx) pkt->vhist_insn->generate(ctx); } - if (pkt->pkt_has_cof) { + if (ctx->pkt_ends_tb || ctx->base.is_jmp == DISAS_NORETURN) { gen_end_tb(ctx); } } @@ -912,7 +1158,7 @@ static void decode_and_translate_packet(CPUHexagonState *env, DisasContext *ctx) nwords = read_packet_words(env, ctx, words); if (!nwords) { - gen_exception_end_tb(ctx, HEX_CAUSE_INVALID_PACKET); + hex_gen_exception_end_tb(ctx, HEX_CAUSE_INVALID_PACKET); return; } @@ -927,7 +1173,7 @@ static void decode_and_translate_packet(CPUHexagonState *env, DisasContext *ctx) gen_commit_packet(ctx); ctx->base.pc_next += pkt.encod_pkt_size_in_bytes; } else { - gen_exception_end_tb(ctx, HEX_CAUSE_INVALID_PACKET); + hex_gen_exception_end_tb(ctx, HEX_CAUSE_INVALID_PACKET); } } @@ -938,13 +1184,16 @@ static void hexagon_tr_init_disas_context(DisasContextBase *dcbase, HexagonCPU *hex_cpu = env_archcpu(cpu_env(cs)); uint32_t hex_flags = dcbase->tb->flags; - ctx->mem_idx = MMU_USER_IDX; + ctx->mem_idx = FIELD_EX32(hex_flags, TB_FLAGS, MMU_INDEX); ctx->num_packets = 0; + ctx->num_cycles = 0; ctx->num_insns = 0; ctx->num_hvx_insns = 0; ctx->branch_cond = TCG_COND_NEVER; ctx->is_tight_loop = FIELD_EX32(hex_flags, TB_FLAGS, IS_TIGHT_LOOP); ctx->short_circuit = hex_cpu->short_circuit; + ctx->pcycle_enabled = FIELD_EX32(hex_flags, TB_FLAGS, PCYCLE_ENABLED); + ctx->need_next_pc = false; } static void hexagon_tr_tb_start(DisasContextBase *db, CPUState *cpu) @@ -1050,6 +1299,26 @@ void hexagon_translate_init(void) opcode_init(); +#ifndef CONFIG_USER_ONLY + for (i = 0; i < NUM_GREGS; i++) { + hex_greg[i] = tcg_global_mem_new(tcg_env, + offsetof(CPUHexagonState, greg[i]), + hexagon_gregnames[i]); + } + hex_g_sreg_ptr = tcg_global_mem_new_ptr(tcg_env, + offsetof(CPUHexagonState, g_sreg), "hex_g_sreg_ptr"); + for (i = 0; i < NUM_SREGS; i++) { + if (i < HEX_SREG_GLB_START) { + hex_t_sreg[i] = tcg_global_mem_new(tcg_env, + offsetof(CPUHexagonState, t_sreg[i]), + hexagon_sregnames[i]); + } else { + hex_g_sreg[i] = tcg_global_mem_new(hex_g_sreg_ptr, + i * sizeof(target_ulong), + hexagon_sregnames[i]); + 
} + } +#endif for (i = 0; i < TOTAL_PER_THREAD_REGS; i++) { hex_gpr[i] = tcg_global_mem_new(tcg_env, offsetof(CPUHexagonState, gpr[i]), @@ -1071,6 +1340,15 @@ void hexagon_translate_init(void) offsetof(CPUHexagonState, llsc_val), "llsc_val"); hex_llsc_val_i64 = tcg_global_mem_new_i64(tcg_env, offsetof(CPUHexagonState, llsc_val_i64), "llsc_val_i64"); + hex_cycle_count = tcg_global_mem_new_i64(tcg_env, + offsetof(CPUHexagonState, t_cycle_count), "t_cycle_count"); +#ifndef CONFIG_USER_ONLY + hex_cause_code = tcg_global_mem_new(tcg_env, + offsetof(CPUHexagonState, cause_code), "cause_code"); +#endif + hex_next_PC = tcg_global_mem_new(tcg_env, + offsetof(CPUHexagonState, next_PC), "next_PC"); + for (i = 0; i < STORES_MAX; i++) { snprintf(store_addr_names[i], NAME_LEN, "store_addr_%d", i); hex_store_addr[i] = tcg_global_mem_new(tcg_env, diff --git a/target/hexagon/translate.h b/target/hexagon/translate.h index d251e2233fda..ad1a2f404534 100644 --- a/target/hexagon/translate.h +++ b/target/hexagon/translate.h @@ -39,6 +39,14 @@ typedef struct DisasContext { int reg_log_idx; DECLARE_BITMAP(regs_written, TOTAL_PER_THREAD_REGS); DECLARE_BITMAP(predicated_regs, TOTAL_PER_THREAD_REGS); +#ifndef CONFIG_USER_ONLY + int greg_log[GREG_WRITES_MAX]; + int greg_log_idx; + int sreg_log[SREG_WRITES_MAX]; + int sreg_log_idx; + TCGv t_sreg_new_value[NUM_SREGS]; + TCGv greg_new_value[NUM_GREGS]; +#endif int preg_log[PRED_WRITES_MAX]; int preg_log_idx; DECLARE_BITMAP(pregs_written, NUM_PREGS); @@ -75,10 +83,42 @@ typedef struct DisasContext { TCGv new_pred_value[NUM_PREGS]; TCGv branch_taken; TCGv dczero_addr; + bool pcycle_enabled; + bool pkt_ends_tb; + bool need_next_pc; + uint32_t num_cycles; } DisasContext; bool is_gather_store_insn(DisasContext *ctx); +#ifndef CONFIG_USER_ONLY +static inline void ctx_log_greg_write(DisasContext *ctx, int rnum) +{ + if (rnum <= HEX_GREG_G3) { + ctx->greg_log[ctx->greg_log_idx] = rnum; + ctx->greg_log_idx++; + } +} + +static inline void ctx_log_greg_write_pair(DisasContext *ctx, int rnum) +{ + ctx_log_greg_write(ctx, rnum); + ctx_log_greg_write(ctx, rnum + 1); +} + +static inline void ctx_log_sreg_write(DisasContext *ctx, int rnum) +{ + ctx->sreg_log[ctx->sreg_log_idx] = rnum; + ctx->sreg_log_idx++; +} + +static inline void ctx_log_sreg_write_pair(DisasContext *ctx, int rnum) +{ + ctx_log_sreg_write(ctx, rnum); + ctx_log_sreg_write(ctx, rnum + 1); +} +#endif + static inline void ctx_log_pred_write(DisasContext *ctx, int pnum) { if (!test_bit(pnum, ctx->pregs_written)) { @@ -267,6 +307,7 @@ static inline void ctx_log_qreg_read(DisasContext *ctx, } extern TCGv hex_gpr[TOTAL_PER_THREAD_REGS]; +extern TCGv hex_next_PC; extern TCGv hex_pred[NUM_PREGS]; extern TCGv hex_slot_cancelled; extern TCGv hex_new_value_usr; @@ -280,6 +321,15 @@ extern TCGv_i64 hex_llsc_val_i64; extern TCGv hex_vstore_addr[VSTORES_MAX]; extern TCGv hex_vstore_size[VSTORES_MAX]; extern TCGv hex_vstore_pending[VSTORES_MAX]; +#ifndef CONFIG_USER_ONLY +extern TCGv hex_greg[NUM_GREGS]; +extern TCGv hex_t_sreg[NUM_SREGS]; +extern TCGv_ptr hex_g_sreg_ptr; +extern TCGv hex_g_sreg[NUM_SREGS]; +#endif + + +void hex_gen_exception_end_tb(DisasContext *ctx, int excp); void process_store(DisasContext *ctx, int slot_num); diff --git a/target/riscv/common-semi-target.h b/target/riscv/common-semi-target.h index 7c8a59e0cc3c..ef6929bdfc5a 100644 --- a/target/riscv/common-semi-target.h +++ b/target/riscv/common-semi-target.h @@ -11,6 +11,17 @@ #ifndef TARGET_RISCV_COMMON_SEMI_TARGET_H #define TARGET_RISCV_COMMON_SEMI_TARGET_H 
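+/*
+ * Fetch the arg_num'th word of the semihosting argument block; 64-bit
+ * guests use 8-byte argument slots, 32-bit guests 4-byte slots.
+ */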
+static inline bool common_semi_read_arg_word(CPUArchState *env,
+                                             target_ulong *save_to,
+                                             target_ulong args_addr,
+                                             int arg_num)
+{
+    if (is_64bit_semihosting(env)) {
+        return get_user_u64(*save_to, args_addr + (arg_num) * 8);
+    }
+    return get_user_u32(*save_to, args_addr + (arg_num) * 4);
+}
+
 static inline target_ulong common_semi_arg(CPUState *cs, int argno)
 {
     RISCVCPU *cpu = RISCV_CPU(cs);
diff --git a/tests/functional/meson.build b/tests/functional/meson.build
index 3fd2652c0782..7e361c68dd90 100644
--- a/tests/functional/meson.build
+++ b/tests/functional/meson.build
@@ -140,6 +140,14 @@ tests_i386_system_quick = [
   'migration',
 ]
 
+test_timeouts += {
+  'hexagon_minivm': 180,
+}
+
+tests_hexagon_system_quick = [
+  'hexagon_minivm',
+]
+
 tests_i386_system_thorough = [
   'i386_tuxrun',
 ]
diff --git a/tests/functional/test_hexagon_minivm.py b/tests/functional/test_hexagon_minivm.py
new file mode 100755
index 000000000000..2ba92bcce383
--- /dev/null
+++ b/tests/functional/test_hexagon_minivm.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+#
+# Copyright(c) 2024-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+import os
+from glob import glob
+from qemu_test import QemuSystemTest, Asset
+from qemu_test import wait_for_console_pattern
+
+class MiniVMTest(QemuSystemTest):
+
+    timeout = 180
+    GUEST_ENTRY = 0xc0000000
+
+    REPO = 'https://artifacts.codelinaro.org/artifactory'
+    ASSET_TARBALL = \
+        Asset(f'{REPO}/codelinaro-toolchain-for-hexagon/'
+              '19.1.5/hexagon_minivm_2024_Dec_15.tar.gz',
+              'd7920b5ff14bed5a10b23ada7d4eb927ede08635281f25067e0d5711feee2c2a')
+
+    def test_minivm(self):
+        self.set_machine('virt')
+        self.archive_extract(self.ASSET_TARBALL)
+        rootfs_path = f'{self.workdir}/hexagon-unknown-linux-musl-rootfs'
+        kernel_path = f'{rootfs_path}/boot/minivm'
+
+        assert(os.path.exists(kernel_path))
+        for test_bin_path in glob(f'{rootfs_path}/boot/test_*'):
+            print(f'# Testing "{os.path.basename(test_bin_path)}"')
+
+            vm = self.get_vm()
+            vm.add_args('-kernel', kernel_path,
+                        '-device',
+                        f'loader,addr={hex(self.GUEST_ENTRY)},file={test_bin_path}')
+            vm.launch()
+            vm.wait()
+            self.assertEqual(vm.exitcode(), 0)
+
+if __name__ == '__main__':
+    QemuSystemTest.main()
diff --git a/tests/tcg/hexagon/Makefile.softmmu-target b/tests/tcg/hexagon/Makefile.softmmu-target
new file mode 100644
index 000000000000..0b12f7485b62
--- /dev/null
+++ b/tests/tcg/hexagon/Makefile.softmmu-target
@@ -0,0 +1,114 @@
+##
+## Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+## +## SPDX-License-Identifier: GPL-2.0-or-later +## + +# -*- Mode: makefile -*- +# +# Hexagon SoftMMU tests - included from tests/tcg/Makefile +# + +HEXAGON_SYSTEM_SRC=$(SRC_PATH)/tests/tcg/hexagon/system + +# Set search path for all sources +VPATH += $(HEXAGON_SYSTEM_SRC) + +########### Compiling options +# We force -O0 to avoid optimizations that would break the +# libc simplifications we made at min_libc.c +# +CFLAGS=-mv73 -U__linux__ -G0 -nodefaultlibs -nostdlib -static -fno-PIC -O0 -g -Werror +LDFLAGS=-lclang_rt.builtins-hexagon + +########### QEMU options +QEMU_BASE_MACHINE=-M V66G_1024 -semihosting-config usefs=$(SRC_PATH)/tests/tcg/hexagon/system +QEMU_OPTS+=-display none + +QEMU_OPTS+=$(QEMU_BASE_MACHINE) -kernel + +crt0.o: crt0/crt0.S crt0/crt0.inc +crt0_standalone.o: crt0/crt0_standalone.S crt0/crt0.inc +pte.o: crt0/pte.S +min_libc.o: crt0/min_libc.c +tlb.o: crt0/tlb.c + +CRT0_OBJS=crt0.o crt0_standalone.o pte.o min_libc.o tlb.o + +TESTS_BUILT_WITH_DEFAULT_RULES = \ + semihost \ + mmu_overlap \ + mmu_asids \ + standalone_hw \ + ciad-siad \ + badva \ + vid_reg \ + hvx-multi \ + standalone_vec \ + fastl2vic \ + int_range \ + $() + +TESTS += \ + $(TESTS_BUILT_WITH_DEFAULT_RULES) \ + tlb-miss-tlblock \ + $() + +$(TESTS_BUILT_WITH_DEFAULT_RULES): $(CRT0_OBJS) + +# Build and link the tests +echo-and-run = echo $(1) && $(1) +define build_fn + @if test "$(3)" = LINK; then extra="$(LDFLAGS)"; else extra=-c; fi && \ + $(call echo-and-run, $(CC) $(CFLAGS) $(1) -o $(2) $$extra) +endef + +$(CRT0_OBJS): + $(call build_fn,$<,$@) +$(TESTS_BUILT_WITH_DEFAULT_RULES): + $(call build_fn,$^,$@,LINK) + +%.o: %.S + $(call build_fn,$<,$@) +%.o: %.c + $(call build_fn,$<,$@) + +mmu.h: ../hex_test.h + +semihost.o: semihost.c strutils.h +semihost: semihost.o +mmu_overlap.o: mmu_overlap.c mmu.h +mmu_overlap: mmu_overlap.o +mmu_asids.o: mmu_asids.c mmu.h +mmu_asids: mmu_asids.o +ciad-siad: ciad-siad.o +standalone_hw: standalone_hw.o monitor_insts.o +vid_reg: vid_reg.o +hvx-multi.o: hvx-multi.c ../hvx_misc.h +hvx-multi: hvx-multi.o +standalone_vec.o: standalone_vec.c cfgtable.h +standalone_vec: standalone_vec.o +badva.o: badva.c ../hex_test.h crt0/hexagon_standalone.h +badva: badva.o +fastl2vic.o: fastl2vic.c cfgtable.h +fastl2vic: fastl2vic.o +int_range.o: int_range.c cfgtable.h +int_range: int_range.o + +############# Custom build options + +standalone_vec.o: CFLAGS+= -mv69 -O2 -mhvx -fvectorize +hvx-multi.o: CFLAGS+= -O2 -mhvx + +# We don't want to link this one with crt0 files +tlb-miss-tlblock: tlb-miss-tlblock.o + $(CC) $(CFLAGS) $< -o $@ -nostartfiles -Wl,-Ttext,0x9b800000 -Wl,-entry,0x9b800000 + +############# Custom test rules + +run-semihost: semihost + mkdir -p _semihost_dir + touch _semihost_dir/fileA _semihost_dir/fileB + $(call run-test, $<, $(QEMU) --append "arg1 arg2" $(QEMU_OPTS) $< \ + > $<.stdout) + $(call quiet-command, grep -q "PASS" $<.stdout, "GREP", "PASS") diff --git a/tests/tcg/hexagon/Makefile.target b/tests/tcg/hexagon/Makefile.target index e5182c01d8a0..44dd927b5937 100644 --- a/tests/tcg/hexagon/Makefile.target +++ b/tests/tcg/hexagon/Makefile.target @@ -52,6 +52,7 @@ HEX_TESTS += hvx_misc HEX_TESTS += hvx_histogram HEX_TESTS += invalid-slots HEX_TESTS += unaligned_pc +HEX_TESTS += utimer run-and-check-exception = $(call run-test,$2,$3 2>$2.stderr; \ test $$? 
-eq 1 && grep -q "exception $(strip $1)" $2.stderr) @@ -109,6 +110,7 @@ preg_alias: preg_alias.c hex_test.h read_write_overlap: read_write_overlap.c hex_test.h reg_mut: reg_mut.c hex_test.h unaligned_pc: unaligned_pc.c +utimer: utimer.c hex_test.h # This test has to be compiled for the -mv67t target usr: usr.c hex_test.h diff --git a/tests/tcg/hexagon/hvx_misc.c b/tests/tcg/hexagon/hvx_misc.c index 90c3733da071..319d7c0dd052 100644 --- a/tests/tcg/hexagon/hvx_misc.c +++ b/tests/tcg/hexagon/hvx_misc.c @@ -495,6 +495,28 @@ void test_store_new() check_output_w(__LINE__, 1); } +void test_qfloat() +{ + asm volatile( + "r0 = #0xf\n" + "v0 = vsplat(r0)\n" + "v1 = vsplat(r0)\n" + "{\n" + " v2.qf16 = vadd(v0.qf16, v1.qf16)\n" + "}\n" + "vmem(%0) = v2\n" + : + : "r"(&output[0]) + : "r0", "v0", "v1", "v2", "memory" + ); + + for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) { + expect[0].w[i] = 0x10010; + } + + check_output_w(__LINE__, 1); +} + int main() { init_buffers(); @@ -538,6 +560,8 @@ int main() test_store_new(); + test_qfloat(); + puts(err ? "FAIL" : "PASS"); return err ? 1 : 0; } diff --git a/tests/tcg/hexagon/reg_mut.c b/tests/tcg/hexagon/reg_mut.c index c5a39e55100d..45db9ae5cd15 100644 --- a/tests/tcg/hexagon/reg_mut.c +++ b/tests/tcg/hexagon/reg_mut.c @@ -77,10 +77,10 @@ static inline void write_control_registers(void) check32(result, 0x00000000); WRITE_REG_NOCLOBBER(result, "utimerlo", 0xffffffff); - check32(result, 0x00000000); + check32_ne(result, 0xffffffff); WRITE_REG_NOCLOBBER(result, "utimerhi", 0xffffffff); - check32(result, 0x00000000); + check32_ne(result, 0xffffffff); /* * PC is special. Setting it to these values @@ -107,7 +107,7 @@ static inline void write_control_register_pairs(void) check64(result, 0x0000000000000000); WRITE_REG_NOCLOBBER(result, "c31:30", 0xffffffffffffffff); - check64(result, 0x0000000000000000); + check64_ne(result, 0xffffffffffffffff); WRITE_REG_PAIR_ENCODED(result, "c9:8", (uint64_t) 0x0000000000000000, C9_8_EQ_R1_0); diff --git a/tests/tcg/hexagon/system/badva.c b/tests/tcg/hexagon/system/badva.c new file mode 100644 index 000000000000..1351269d1077 --- /dev/null +++ b/tests/tcg/hexagon/system/badva.c @@ -0,0 +1,335 @@ +/* + * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "crt0/hexagon_standalone.h" + +#define DEBUG 0 + +int err; +#include "../hex_test.h" + +/* volatile because it is written through different MMU mappings */ +typedef volatile int mmu_variable; +mmu_variable data0 = 0xdeadbeef; +mmu_variable data1 = 0xabcdef01; + +#define ONE_MB (1 << 20) +#define INVALID_BADVA 0xbadabada + +static uint32_t read_badva(void) +{ + uint32_t ret; + __asm__ __volatile__("%0 = badva\n\t" : "=r"(ret)); + return ret; +} + +static uint32_t read_badva0(void) +{ + uint32_t ret; + __asm__ __volatile__("%0 = badva0\n\t" : "=r"(ret)); + return ret; +} + +static uint32_t read_badva1(void) +{ + uint32_t ret; + __asm__ __volatile__("%0 = badva1\n\t" : "=r"(ret)); + return ret; +} + +static uint32_t read_ssr(void) +{ + uint32_t ret; + __asm__ __volatile__("%0 = ssr\n\t" : "=r"(ret)); + return ret; +} + +static void write_badva0(uint32_t val) +{ + __asm__ __volatile__("badva0=%0;" : : "r"(val)); + return; +} + +static void write_badva1(uint32_t val) +{ + __asm__ __volatile__("badva1=%0;" : : "r"(val)); + return; +} + +#define SSR_V0_BIT 20 +#define SSR_V1_BIT 21 +#define SSR_BVS_BIT 21 + +static uint32_t read_ssr_v0(void) +{ + return (read_ssr() >> SSR_V0_BIT) & 0x1; +} + +static uint32_t read_ssr_v1(void) +{ + return (read_ssr() >> SSR_V1_BIT) & 0x1; +} + +static uint32_t read_ssr_bvs(void) +{ + return (read_ssr() >> SSR_BVS_BIT) & 0x1; +} + +static void dual_store(mmu_variable *p, mmu_variable *q, uint32_t pval, + uint32_t qval) +{ +#if DEBUG + printf("dual_store:\t0x%p, 0x%p, 0x%lx, 0x%lx\n", p, q, pval, qval); +#endif + + __asm__ __volatile__("r6 = #0\n\t" + "badva0 = r6\n\t" + "badva1 = r6\n\t" + "r6 = ssr\n\t" + "r6 = clrbit(r6, #%4) // V0\n\t" + "r6 = clrbit(r6, #%5) // V1\n\t" + "r6 = clrbit(r6, #%6) // BVS\n\t" + "ssr = r6\n\t" + "{\n\t" + " memw(%0) = %2 // slot 1\n\t" + " memw(%1) = %3 // slot 0\n\t" + "}\n\t" + : "=m"(*p), "=m"(*q) + : "r"(pval), "r"(qval), "i"(SSR_V0_BIT), + "i"(SSR_V1_BIT), "i"(SSR_BVS_BIT) + : "r6"); +} + +static void dual_load(mmu_variable *p, mmu_variable *q, uint32_t *pval, + uint32_t *qval) +{ + uint32_t val0, val1; + +#if DEBUG + printf("dual_load:\t0x%p, 0x%p\n", p, q); +#endif + + __asm__ __volatile__("r6 = #0\n\t" + "badva0 = r6\n\t" + "badva1 = r6\n\t" + "r6 = ssr\n\t" + "r6 = clrbit(r6, #%4) // V0\n\t" + "r6 = clrbit(r6, #%5) // V1\n\t" + "r6 = clrbit(r6, #%6) // BVS\n\t" + "ssr = r6\n\t" + "{\n\t" + " %1 = memw(%3) // slot 1\n\t" + " %0 = memw(%2) // slot 0\n\t" + "}\n\t" + : "=r"(val0), "=r"(val1) + : "m"(*p), "m"(*q), "i"(SSR_V0_BIT), "i"(SSR_V1_BIT), + "i"(SSR_BVS_BIT) + : "r6"); + +#if DEBUG + printf("\t\t0x%lx, 0x%lx\n", val0, val1); +#endif + + *pval = val0; + *qval = val1; +} + +static void load_store(mmu_variable *p, mmu_variable *q, uint32_t *pval, + uint32_t qval) +{ + uint32_t val; + +#if DEBUG + printf("load_store:\t0x%p, 0x%p, 0x%lx\n", p, q, qval); +#endif + + __asm__ __volatile__("r6 = #0\n\t" + "badva0 = r6\n\t" + "badva1 = r6\n\t" + "r6 = ssr\n\t" + "r6 = clrbit(r6, #%4) // V0\n\t" + "r6 = clrbit(r6, #%5) // V1\n\t" + "r6 = clrbit(r6, #%6) // BVS\n\t" + "ssr = r6\n\t" + "{\n\t" + " %0 = memw(%2) // slot 1\n\t" + " memw(%1) = %3 // slot 0\n\t" + "}\n\t" + : "=r"(val), "=m"(*q) + : "m"(*p), "r"(qval), "i"(SSR_V0_BIT), "i"(SSR_V1_BIT), + "i"(SSR_BVS_BIT) + : "r6"); + +#if DEBUG + printf("\t\t0x%lx\n", val); +#endif + + *pval = val; +} + +enum { + TLB_U = (1 << 0), + TLB_R = (1 << 1), + TLB_W = (1 << 2), + TLB_X = (1 << 3), +}; + +uint32_t 
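+/*
+ * Returns the page-size encoding that basic_entry() in crt0/tlb.c ORs into
+ * the low bits of the physical page number; e.g. a 20-bit (1MB) page maps
+ * to 16, i.e. PAGE_1M.
+ */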
add_trans_pgsize(uint32_t page_size_bits) +{ + switch (page_size_bits) { + case 12: /* 4KB */ + return 1; + case 14: /* 16KB */ + return 2; + case 16: /* 64KB */ + return 4; + case 18: /* 256KB */ + return 8; + case 20: /* 1MB */ + return 16; + case 22: /* 4MB */ + return 32; + case 24: /* 16MB */ + return 64; + default: + return 1; + } +} + +int mb_counter = 1; + +static mmu_variable *map_data_address(mmu_variable *p, uint32_t data_offset) +{ + uint32_t page_size_bits = 12; + uint32_t page_size = 1 << page_size_bits; + uint32_t page_align = ~(page_size - 1); + + uint32_t data_addr = (uint32_t)p; + uint32_t data_page = data_addr & page_align; + + uint32_t new_data_page = data_page + data_offset; + uint32_t read_data_addr = data_addr + data_offset; + unsigned int data_perm = TLB_X | TLB_W | TLB_U; + add_translation((void *)new_data_page, (void *)data_page, 0); + + return (mmu_variable *)read_data_addr; +} + +static void test_dual_store(void) +{ + data0 = 0x12345678; + data1 = 0x87654321; + + mmu_variable *new_data0 = map_data_address(&data0, mb_counter * ONE_MB); + mb_counter++; + mmu_variable *new_data1 = map_data_address(&data1, mb_counter * ONE_MB); + mb_counter++; + + dual_store(new_data0, new_data1, 0x1, 0x2); + if (read_badva() == (uint32_t)new_data0) { + check32(read_badva0(), (uint32_t)new_data0); + check32(read_badva1(), INVALID_BADVA); + check32(read_ssr_v0(), 1); + check32(read_ssr_v1(), 0); + check32(read_ssr_bvs(), 0); + } else if (read_badva() == (uint32_t)new_data1) { + check32(read_badva0(), INVALID_BADVA); + check32(read_badva1(), (uint32_t)new_data1); + check32(read_ssr_v0(), 0); + check32(read_ssr_v1(), 1); + check32(read_ssr_bvs(), 1); + } else { + /* Something went wrong! */ + check32(0, 1); + } + check32(data0, 0x1); + check32(data1, 0x2); +} + +static void test_dual_load(void) +{ + uint32_t val0, val1; + + data0 = 0xaabbccdd; + data1 = 0xeeff0011; + + mmu_variable *new_data0 = map_data_address(&data0, mb_counter * ONE_MB); + mb_counter++; + mmu_variable *new_data1 = map_data_address(&data1, mb_counter * ONE_MB); + mb_counter++; + + dual_load(new_data0, new_data1, &val0, &val1); + if (read_badva() == (uint32_t)new_data0) { + check32(read_badva0(), (uint32_t)new_data0); + check32(read_badva1(), INVALID_BADVA); + check32(read_ssr_v0(), 1); + check32(read_ssr_v1(), 0); + check32(read_ssr_bvs(), 0); + } else if (read_badva() == (uint32_t)new_data1) { + check32(read_badva0(), INVALID_BADVA); + check32(read_badva1(), (uint32_t)new_data1); + check32(read_ssr_v0(), 0); + check32(read_ssr_v1(), 1); + check32(read_ssr_bvs(), 1); + } else { + /* Something went wrong! */ + check32(0, 1); + } + check32(val0, 0xaabbccdd); + check32(val1, 0xeeff0011); +} + +static void test_load_store(void) +{ + uint32_t val; + + data0 = 0x11223344; + data1 = 0x55667788; + + mmu_variable *new_data0 = map_data_address(&data0, mb_counter * ONE_MB); + mb_counter++; + mmu_variable *new_data1 = map_data_address(&data1, mb_counter * ONE_MB); + mb_counter++; + + load_store(new_data0, new_data1, &val, 0x123); + if (read_badva() == (uint32_t)new_data1) { + check32(read_badva0(), (uint32_t)new_data1); + check32(read_badva1(), INVALID_BADVA); + check32(read_ssr_v0(), 1); + check32(read_ssr_v1(), 0); + check32(read_ssr_bvs(), 0); + } else if (read_badva() == (uint32_t)new_data0) { + check32(read_badva0(), INVALID_BADVA); + check32(read_badva1(), (uint32_t)new_data0); + check32(read_ssr_v0(), 0); + check32(read_ssr_v1(), 1); + check32(read_ssr_bvs(), 1); + } else { + /* Something went wrong! 
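+           badva matched neither of the two new mappings, so record a
+           failure unconditionally.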
 */
+        check32(0, 1);
+    }
+    check32(val, 0x11223344);
+    check32(data1, 0x123);
+}
+static void test_badva_write(void)
+{
+    uint32_t va = 0x11223344;
+    write_badva0(va);
+    check32(read_badva(), va);
+}
+
+int main()
+{
+    puts("Hexagon badva test");
+
+    test_dual_store();
+    test_dual_load();
+    test_load_store();
+    test_badva_write();
+
+    printf("%s\n", ((err) ? "FAIL" : "PASS"));
+    return err;
+}
diff --git a/tests/tcg/hexagon/system/cfgtable.h b/tests/tcg/hexagon/system/cfgtable.h
new file mode 100644
index 000000000000..fff84ef56950
--- /dev/null
+++ b/tests/tcg/hexagon/system/cfgtable.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright(c) 2023-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef CFGTABLE_H
+#define CFGTABLE_H
+
+#include <stdint.h>
+
+static uint32_t read_cfgtable_field(uint32_t offset)
+{
+    uint32_t val;
+    asm volatile("r0 = cfgbase\n\t"
+                 "r0 = asl(r0, #5)\n\t"
+                 "%0 = memw_phys(%1, r0)\n\t"
+                 : "=r"(val)
+                 : "r"(offset)
+                 : "r0");
+    return val;
+}
+
+#define GET_SUBSYSTEM_BASE() (read_cfgtable_field(0x8) << 16)
+#define GET_FASTL2VIC_BASE() (read_cfgtable_field(0x28) << 16)
+
+static uintptr_t get_vtcm_base(void)
+{
+#if __HEXAGON_ARCH__ == 65
+    return 0xD8200000L;
+#elif __HEXAGON_ARCH__ >= 66
+    int vtcm_offset = 0x038;
+    return read_cfgtable_field(vtcm_offset) << 16;
+#else
+#error "unsupported hexagon revision"
+#endif
+}
+
+#endif /* CFGTABLE_H */
diff --git a/tests/tcg/hexagon/system/ciad-siad.c b/tests/tcg/hexagon/system/ciad-siad.c
new file mode 100644
index 000000000000..e3fbb7a506dc
--- /dev/null
+++ b/tests/tcg/hexagon/system/ciad-siad.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright(c) 2023-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+
+
+static inline void siad(uint32_t val)
+{
+    asm volatile ("siad(%0);"
+                  : : "r"(val));
+    return;
+}
+static inline void ciad(uint32_t val)
+{
+    asm volatile ("ciad(%0);"
+                  : : "r"(val));
+    return;
+}
+
+static inline uint32_t getipendad()
+{
+    uint32_t reg;
+    asm volatile ("%0=s20;"
+                  : "=r"(reg));
+    return reg;
+}
+int
+main(int argc, char *argv[])
+{
+    siad(4);
+    int ipend = getipendad();
+    if (ipend != (0x4 << 16)) {
+        goto fail;
+    }
+    ciad(4);
+    ipend = getipendad();
+    if (ipend) {
+        goto fail;
+    }
+
+    printf("PASS\n");
+    return 0;
+fail:
+    printf("FAIL\n");
+    return 1;
+}
diff --git a/tests/tcg/hexagon/system/crt0/crt0.S b/tests/tcg/hexagon/system/crt0/crt0.S
new file mode 100644
index 000000000000..8a40e39536eb
--- /dev/null
+++ b/tests/tcg/hexagon/system/crt0/crt0.S
@@ -0,0 +1,103 @@
+/*
+ * Copyright(c) 2024-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "crt0.inc"
+    .equ DEFAULT_HEAP_SIZE,  0x4000000  /* 64MB */
+    .equ DEFAULT_STACK_SIZE, 0x100000   /* 1MB */
+
+    .section .start, "ax", @progbits
+    .subsection 0
+    .org 0
+
+    .global _start
+    .type _start, @function
+    .p2align 5
+_start:
+    jump hexagon_start_init
+    jump hexagon_start_main
+    .size _start, . - _start
+
+/*----------------------------------------------------------------------------*/
+
+    .global hexagon_pre_main
+    .type hexagon_pre_main, @function
+
+hexagon_pre_main:
+    /* Mark first stack frame. */
+    fp = #0
+
+    ReadFrom heapBase, r4
+
+    AddrOf DEFAULT_HEAP_SIZE
+    r5 = r0
+
+    r5 = add (r4, r5)       /* Calculate aligned heap top. */
+    r5 = add (r5, #15)
+    r5 = and (r5, #-16)
+    WriteTo heapLimit, r5
+
+    /* Set up stack.
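+       The stack is carved out directly above the heap; both sizes come
+       from the DEFAULT_* constants at the top of this file.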
*/ + AddrOf DEFAULT_STACK_SIZE + r7 = r0 + + r6 = add (r5, r7) /* Assume stack after heap. */ + r6 = and (r6, #-16) + + WriteTo stackBase, r6 + + ReadFrom stackBase, r6 + + r7 = sub (r6, r7) /* Desired stack size. */ + r7 = add (r7, #15) + r7 = and (r7, #-16) + WriteTo stackLimit, r7 + + /* Set stack up. */ + ReadFrom stackBase, r0 + sp = and (r0, #-16) /* Align top of stack. */ + + /* Zero up BSS. */ + AddrOf __bss_start, r0 + AddrOf _end, r2 + AddrOf memset, r28 /* bzero () is deprecated. */ + { r1 = #0 + r2 = sub (r2, r0) + callr r28 } + .size hexagon_pre_main, . - hexagon_pre_main + +/*----------------------------------------------------------------------------*/ + + .global hexagon_start_main + .type hexagon_start_main, @function +hexagon_start_main: + AddrOf _start_main, r28 + callr r28 + /*Stop all threads to terminate execution */ + r0 = #0x3f + stop (r0) + .size hexagon_start_main, . - hexagon_start_main + +/*----------------------------------------------------------------------------*/ + + .data + .global heapBase + .global heapLimit + .global stackBase + .global stackLimit + .global setHeapAngelCallParams + +.HeapParams: +heapBase: + .word end /* Provided by the linker script. */ +heapLimit: + .word end + (DEFAULT_HEAP_SIZE & -16) +stackBase: + .word 0 +stackLimit: + .word end + ((DEFAULT_HEAP_SIZE + 15) & -16) + +setHeapAngelCallParams: + .word .HeapParams diff --git a/tests/tcg/hexagon/system/crt0/crt0.inc b/tests/tcg/hexagon/system/crt0/crt0.inc new file mode 100755 index 000000000000..a28d68c51cd5 --- /dev/null +++ b/tests/tcg/hexagon/system/crt0/crt0.inc @@ -0,0 +1,25 @@ +/* + * Copyright(c) 2024-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + + .macro AddrOf Var, To = r0 + \To\() = ## (\Var) + .endm + + .macro ReadFrom Var, To = r0 + AddrOf \Var, \To + \To = memw (\To) + .endm + + .macro WriteTo Var, From = r0, Ptr = r1 + .ifnc "\From", "\Ptr" + AddrOf \Var, \Ptr + memw (\Ptr) = \From + \From = memw (\Ptr) + .else + .print "Macro arguments \"From\" and \"Ptr\" cannot be the same." + .err + .endif + .endm diff --git a/tests/tcg/hexagon/system/crt0/crt0_standalone.S b/tests/tcg/hexagon/system/crt0/crt0_standalone.S new file mode 100644 index 000000000000..a3ca6ea95da2 --- /dev/null +++ b/tests/tcg/hexagon/system/crt0/crt0_standalone.S @@ -0,0 +1,1206 @@ +/* + * Copyright(c) 2024-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "crt0.inc" + .equ TLB_FIXED_ENTRIES, 6 + + .org 0x20 /* This must be at address 0x20 */ +EventVectorBase: + .word .EventVectors + +/* This can vary based on the revid of the part: + 64, 128, 192. Most are 128 */ +_NumTLBEntries: + .word 127 + +TLBMapTable: + .word UPTE_START + +CoreDump: + .word RegDump + + .subsection 0 + + /* Make sure that data and code don't end up in the same L2 cache-line. */ + .p2align 6, 0 + + .global hexagon_start_init + .type hexagon_start_init, @function +hexagon_start_init: +.Init: + /* Clean up house (make sure that R0 is initialized before DCKILL). */ + dckill + isync + ickill + isync + +.InitSSR: + /* SFD = 0, IE = 0, UM = 0, EX = 0, ASID = 0 */ + r0 = #0 + ssr = r0 + isync + + /* Setup events */ +.InitVector: + ReadFrom EventVectorBase + evb = r0 + +.InitStack: + ReadFrom exc_stack_tops + sgp0 = r0 + +.InitFramekey: + r0 = #0 + framekey = r0 + + /* Configure cycle counter. 
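+       (the insert below sets SYSCFG bit 6, the pcycle counter enable)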
*/ +.InitPcycle: + r1 = #1 + r0 = syscfg + r0 = insert (r1, #1, #6) + syscfg = r0 + + /* Configure IMT/DMT. */ +.InitDMT: + r1 = #1 + r0 = syscfg + r0 = insert (r1, #1, #15) + syscfg = r0 +.InitQoS: + r1 = #1 + r0 = syscfg + r0 = insert (r1, #1, #13) + syscfg = r0 +1: +.InitXE: + r1 = #1 + r0 = ssr + r0 = insert (r1, #1, #31) + ssr = r0 + + //{ 0x4066, 0x4, 0x7F, 0, 4 }, // v66a_512 + { + r0 = #0x2c // JTLB size + r2 = cfgbase + } + r1 = asl(r2, #5) + r0 = memw_phys(r0, r1) + { + r0 = add(r0, #-1); + memw(##_tlbmax) = r0.new + } + + { + r0 = #0x40 // L2 Tag size + r2 = cfgbase + } + r0 = memw_phys(r0, r1) + r1 = #0; + p0 = cmp.eq(r0, #0x400) + { + if (p0) r1 = #5 + if (p0) jump 1f + } + p0 = cmp.eq(r0, #0x200) + { + if (p0) r1 = #4 + if (p0) jump 1f + } + p0 = cmp.eq(r0, #0x100) + { + if (p0) r1 = #3 + if (p0) jump 1f + } + p0 = cmp.eq(r0, #0x080) + { + if (p0) r1 = #2 + if (p0) jump 1f + } +1: + memw(##_l2cfg) = r1 + +/* L2 config sequence: + * 1 - Disable prefetching by clearing HFd/i bits in ssr/ccr + */ + r0 = ccr + r3 = #0 + r0 = insert (r3, #4, #16) /* Clear HFi, HFd, HFiL2 HFdL2 bits */ + ccr = r0 + + /* Configure L2 cache. */ + r0 = syscfg + r0 = insert (r3, #3, #16) /* Set L2 size to 0 via L2CFG. */ + + +/* L2 config sequence: + * 2 - execute an isync which is aligned to a 32byte boundary. + */ + .p2alignl 5, 0x7f00c000 + isync + +/* L2 config sequence: + * 3 - execute an syncht insn to insure there are no outstanding + * memory transactions. + */ + syncht + +/* L2 config sequence: + * 4 - Set the desired L2 size for < V4 (set to 0 for >= V4). + */ + syscfg = r0 + isync + +/* L2 config sequence: + * 5 - Execute the L2KILL insn to initiate the cache. + */ + l2kill + syncht + +/* L2 config sequence: + * 6 - Set the desired L2 size. + */ + r2 = memw(##_l2cfg) + r3 = #0x5 + r3 = min (r2, r3) /* min between desired and hwmax */ + r0 = insert (r3, #4, #16) /* Set L2 size via L2CFG. */ + syscfg = r0 + isync + + /* Configure L1 caches. */ +.InitCache: + r1 = #0 + r1 = #1 + r2 = syscfg + r2 = insert (r1, #1, #1) + r2 = insert (r0, #1, #2) + + r1 = #1 + r2 = insert (r1, #1, #23) + + syscfg = r2 + isync + + /* BEGIN code to turn on translation */ +.InitTLB: + // V65 an later use a table for this stuff, should get a table for all of it! + r0 = memw(##_tlbmax) + + /* Clear TLB and store the number of TLBs */ + { + r3:2 = combine(#0,#0) + memw(##_NumTLBEntries) = r0 + } + + loop0(.InitTLBLoop, r0) +.falign +.InitTLBLoop: + tlbw(r3:2,r0) + r0 = add (r0, #-1) + {}:endloop0 + isync + +.InitTLBGlobal: /* Fixed entry for everything. */ + AddrOf _start, r2 + r2 = lsr (r2, #12) + + AddrOf 0xc3f00000, r1 /* Global, 1-1 mapping. */ + AddrOf 0xf7000000, r0 /* Full perms, fully cacheable WB */ + r1 = or (r1, r2) /* 1M translation */ + r0 |= asl (r2,#1) + r0 = setbit(r0,#4) + r0 = and(r0,#-16) + r2 = #0 + tlbw(r1:0,r3) + + /* TODO Should there be a TLB entry for TCM too? */ + + r0 = syscfg + r0 = setbit (r0, #0) /* Turn the MMU on. */ + syscfg = r0 + isync + +.InitInt: + /* Set up rising edge triggered interrupts */ + r0 = #0 + imask = r0 + r1 = #-1 + cswi (r1) + + /* Enable interrupts globally. */ + r0 = ssr + r0 = setbit (r0, #18) + ssr= r0 + + r0 = syscfg + r0 = setbit (r0, #4) + syscfg = r0 + isync + + /* Set up input params to Angel call */ + r0 = #22 + AddrOf setHeapAngelCallParams, r1 + trap0 (#0) + +.PreMain: + AddrOf hexagon_pre_main, r28 + jumpr r28 + .size hexagon_start_init, . 
- hexagon_start_init + +.global qdsp6_start_init +.set qdsp6_start_init, \ + hexagon_start_init + +/* (At this point the machine is mostly ready for normal execution */ + + /* This code is jumped to when we start a new thread. */ + /* It reads some values out of memory and uses them */ + /* to begin execution. */ + /* The code supports going to a function of the type: */ + /* void foo (void *arg); */ + /* or */ + /* void foo (int arg); */ + /* All we have to do is get the location of "foo", the */ + /* value for "arg", and set up the stack. */ + /* This stuff has been set up for us by thread_create, below.*/ + /* Under the OS, we have no need for this, it is merely for */ + /* trying multithreaded applications on the raw hardware. */ + + .p2align 4 + .weak thread_stop + .type thread_stop, @function +thread_stop: +{ + r0 = htid + r1 = #1 +} + r1 = lsl (r1, r0) + stop (r1) + + .p2align 4 + + .type event_handle_reset, @function + +event_handle_reset: + r1 = htid /* do not alter until final register initialization */ + + { + r28 = ##(start_pc) + r29 = ##(start_sp) + } + + r2 = #0 /* UM = 0 EX = 0 IE = 0 ASID = 0 */ + ssr = r2 + isync + imask = r2 + + r2 = ##(exc_stack_tops) + r2 = memw (r2+r1<<#2) + sgp0 = r2 + + /* Initialize GP to the start of the global data area. */ + //r2 = ##(_SDA_BASE_) + //gp = r2 + + r2.h = #4 + r2.l = #0 + ssr = r2 /* Turn on interrupts */ + + r3 = #1 + r2 = ssr + r2 = insert (r3, #1, #31) + ssr = r2 + + r2.h = #0x1 /* Enable cache fetching */ + usr = r2 + + r0 = #1 + r2 = #1 + r0 |= asl (r2, #1) + r2 = ccr + r2 = insert (r0, #2, #16) + /* Enable dcfetch and l2fetch. */ + r2 = setbit (r2, #20) + ccr = r2 + + isync + + { + r2 = ##framekey_tbl + r3 = ##stack_size + } + { + r2 = memw(r2+r1<<#2) /* load framekey from memory array */ + r3 = memw(r3+r1<<#2) /* load stack_size from memory array */ + } + { + framekey = r2 /* store into framekey register */ + r2 = memw (sp+r1<<#2) + } + r3 = sub(r2, r3) /* framelimt = sp-stack_size) */ + framelimit = r3 /* store into framelimit register */ + + { + r28 = memw (r28+r1<<#2) + sp = memw (sp+r1<<#2) + fp = #0 + } + + { + r0 = ##(start_param) + lr = ##(thread_stop) + } + fp = #0 + r1 = htid + r0 = memw (r0+r1<<#2) + + jump thread_start + + .size event_handle_reset, . - event_handle_reset + + .global __coredump + .type coredump, @function + .set __coredump, coredump +coredump: + r0 = ssr + r0 = clrbit (r0, #16) /* UM = 0 */ + r0 = clrbit (r0, #17) /* EX = 0 */ + ssr = r0 + isync + r0 = #0xCD + trap0 (#0) + r2 = #-1 + r0 = #-1 + stop (r0) + .size event_core_dump, . - event_core_dump + + .type event_handle_nmi, @function +event_handle_nmi: + r0 = #1 + stid = r0 + jump coredump + .size event_handle_nmi, . - event_handle_nmi + + .type event_handle_error, @function +event_handle_error: + r0 = #2 + stid = r0 + jump coredump + .size event_handle_error, . - event_handle_error + + .type event_handle_rsvd, @function +event_handle_rsvd: + r0.h = #0xdead + r0.l = #0xbeef + stid = r0 + jump coredump + .size event_handle_rsvd, . - event_handle_rsvd + + .global thread_start + .type thread_start, @function +thread_start: + jumpr r28 + .size thread_start, . - thread_start + + /* TLB HANDLING */ + /* There are a few strategies we have tried for TLB handling. */ + /* The first is just to map every page 1:1 for virtual:physical */ + /* This means we have nothing to look up but no flexibility */ + /* The strategy implemented here is to divide memory into */ + /* a bunch of 1MB pages. 
Each page is by default set to the */ + /* corresponding physical 1M page, but the translation (and the */ + /* cacheability) can be changed with the add_translation function*/ + /* below. */ + /* We have to keep the table in memory, and it's down in the data*/ + /* section. */ + /* The page at address 0 is always kept in the TLB. */ + /* You will run into problems if the data gets pushed out into */ + /* another page, because you don't have a translation for the */ + /* data you need to do the translation! */ + /* The solution is to put the translation table (and probably */ + /* the TLB fill code) in special section (s) that go near address 0 */ + /* You can set that up in the linker script. */ + /* TLB miss because of eXecution */ + /* See HEXAGON Architecture System-Level Spec for more information */ + + + + .subsection 0 + + .p2align 6 + .global event_handle_tlbmissx + .type event_handle_tlbmissx, @function + +event_handle_tlbmissx: + crswap (sp, sgp0) + sp = add (sp, #-64) + /* Save off state */ + { + memd (sp + #0) = r1:0 + memd (sp + #8) = r3:2 + } + { + memd (sp + #16) = r5:4 + memd (sp + #24) = r7:6 + } + { + memd (sp + #32) = r9:8 + r9 = p3:0 + } + r8 = ssr + r7 = elr + p1 = tstbit (r8, #0) + { + /* Calculate 4K page index */ + r7 = lsr (r7, #12) + /* Check for next page hit */ + if (!p1) jump 1f + r0 = ##(__tlb_idx) + } + r7 = add (r7, #1) +1: + { + r1 = memw(##_tlb_fixed_entries) /* First non-fixed entry. */ + r3 = memw(##_NumTLBEntries) + } + /* Atomically increment index */ + /* NEVER overwrite fixed entries */ +1: + r6 = memw_locked (r0) + { + r6 = add (r6, #1) + /* This was hard coded to p0 = cmp.ge(r6, #NUM_TLB_ENTRIES) + Now we are using 2 registers so switch to the equivalent + p0 = !cmp.gt(r3, r6) */ + p0 = !cmp.gt (r3, r6) + } + /* Will never store a number greater than + _NumTLBEntries in &__tlb_idx */ + r6 = mux (p0, r1, r6) + memw_locked (r0, p0) = r6 + if (!p0) jump 1b /* Retry, lost reservation. */ + + { + r7 = lsr (r7, #8) /* 1M page index */ + r3 = memw (##TLBMapTable) + } + r3 = addasl (r3, r7, #1) + { + r3 = memh (r3) + r7 = asl (r7, #8) /* VPN */ + } + r5 = extractu (r3, #12, #4) + { + r4 = extractu (r3, #4, #0) + r0 = #0x0010 /* 1M */ + r1 = #0 + } + { + r4 = asl (r4, #24) + r1.h = #0xc000 + r0.h = #0xf000 + } +1: + { + r1 = or (r1, r7) /* c000_0000 + VPN */ + r0 |= asl(r5,#9) /* f000_0000 + PPD */ + } + r0 = or (r0, r4) + /* Get Lock */ + tlblock + r5 = tlbp(r1) + p0 = tstbit (r5, #31) + if (!p0) jump 1f + + tlbw(r1:0,r6) + isync + +1: + tlbunlock + + p3:0 = r9 + { + r9:8 = memd (sp + #32) + r7:6 = memd (sp + #24) + } + { + r5:4 = memd (sp + #16) + r3:2 = memd (sp + #8) + } + { + r1:0 = memd (sp + #0) + sp = add (sp, #64) + } + crswap (sp, sgp0) + rte + + .size .event_handle_tlbmissx, . - event_handle_tlbmissx + + /* TLB Miss RW */ + /* Basically the same as TLB MissX, but we get */ + /* The address from BADVA instead of EVB... see the */ + /* HEXAGON Architecture System-level Spec for more details. 
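+       Like the missx handler above, the refill cycles __tlb_idx over
+       the non-fixed entries and takes the translation from the 16-bit
+       PTE table.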
*/ + + .p2align 6 + + .global event_handle_tlbmissrw + .type event_handle_tlbmissrw, @function + +event_handle_tlbmissrw: + crswap (sp, sgp0) + sp = add (sp, #-64) + { + memd (sp + #0) = r1:0 + memd (sp + #8) = r3:2 + } + { + memd (sp + #16) = r5:4 + memd (sp + #24) = r7:6 + } + { + memd (sp + #32) = r9:8 + r8 = ssr + } + r7 = badva + r9 = p3:0 + { + r0 = ##__tlb_idx + r1 = memw(##_tlb_fixed_entries) + } + { + r7 = lsr (r7, #20) + r3 = memw(##_NumTLBEntries) /* 31, 63, 127, or 191 */ + } + /* Atomically increment index */ + /* NEVER overwrite entry 0 */ +1: + r6 = memw_locked (r0) + { + r6 = add (r6, #1) + /* This was hard coded to p0 = cmp.ge(r6, #NUM_TLB_ENTRIES) + Now we are using 2 registers so switch to the equivalent + p0 = !cmp.gt(r3, r6) */ + p0 = !cmp.gt (r3, r6) + } + /* Will never store a number greater than + _NumTLBEntries in &__tlb_idx */ + r6 = mux (p0, r1, r6) + memw_locked (r0, p0) = r6 + if (!p0) jump 1b /* Retry, lost reservation. */ + + r3 = memw (##TLBMapTable) + r3 = addasl (r3, r7, #1) + { + r3 = memh (r3) + r7 = asl (r7, #8) /* VPN */ + } + + r4 = extractu (r3, #4, #0) +.L_OK: + { + r5 = extractu (r3, #12, #4) + r0 = #0x0010 /* 1M */ + r1 = #0 + } + { + r4 = asl (r4, #24) + r1.h = #0xc000 + r0.h = #0xf000 + } +1: + { + r1 = or (r1, r7) /* R5: VPN | C000_0000 */ + r0 |= asl(r5,#9) /* R4: PPD | F000_0000 */ + } + r0 = or (r0, r4) + + tlblock + r5 = tlbp(r1) + p0 = tstbit (r5, #31) + if (!p0) jump 1f + + tlbw(r1:0,r6) + isync + jump 2f +1: + // If we take a miss around a user defined page they need to + // manually create another page or not touch the regions above + // and below their page within a 1M boundary. + r4 = memw(##_tlb_fixed_entries) + p0 = cmp.gt(r4, r5) // r4>r5 == r5 0k L2 cache */ + .byte 0x2 /* rev: 0x1xxx: 128K L2 -> 128k L2 cache */ + .byte 0x3 /* rev: 0x2xxx: 256K L2 -> 256k L2 cache */ + .byte 0x3 /* rev: 0x3xxx: Not valid at this time */ + .byte 0x4 /* rev: 0x4xxx: 512K L2 -> 512k L2 cache */ + .byte 0x4 /* rev: 0x5xxx: Not valid at this time */ + .byte 0x4 /* rev: 0x6xxx: 768K L2 -> 512k L2 cache */ + .byte 0x4 /* rev: 0x7xxx: Not valid at this time */ + .byte 0x5 /* rev: 0x8xxx: 1024K L2 -> 1024 L2 cache */ + .byte 0x4 /* rev: 0x9xxx: Not valid at this time */ + .byte 0x5 /* rev: 0xAxxx: 1536K L2 -> 1024 L2 cache */ + .byte 0x4 /* rev: 0xBxxx: Not valid at this time */ + .byte 0x4 /* rev: 0xCxxx: Not valid at this time */ + .byte 0x4 /* rev: 0xDxxx: Not valid at this time */ + .byte 0x4 /* rev: 0xExxx: Not valid at this time */ + .byte 0x4 /* rev: 0xFxxx: Not valid at this time */ + + + /* Data used for TLB refill */ + + .p2align 6, 0 + + .global __tlb_lock + .set __tlb_lock, tlb_lock +tlb_lock: + .word 0 + .global __tlb_idx + .set __tlb_idx, tlb_idx +tlb_idx: + .word TLB_FIXED_ENTRIES - 1 + + .global _tlb_fixed_entries +_tlb_fixed_entries: + .word TLB_FIXED_ENTRIES diff --git a/tests/tcg/hexagon/system/crt0/hexagon_standalone.h b/tests/tcg/hexagon/system/crt0/hexagon_standalone.h new file mode 100644 index 000000000000..01ca41349f0f --- /dev/null +++ b/tests/tcg/hexagon/system/crt0/hexagon_standalone.h @@ -0,0 +1,103 @@ +/* + * Copyright(c) 2024-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include +#include + +#ifndef _TLB_H +#define _TLB_H + +typedef enum { + SHIFT_4K = 0, + SHIFT_16K, + SHIFT_64K, + SHIFT_256K, + SHIFT_1M, + SHIFT_4M, + SHIFT_16M, + SHIFT_64M, + SHIFT_256M, + SHIFT_1G, +} PageShift; + +typedef enum { + PAGE_4K = 1 << SHIFT_4K, + PAGE_16K = 1 << SHIFT_16K, + PAGE_64K = 1 << SHIFT_64K, + PAGE_256K = 1 << SHIFT_256K, + PAGE_1M = 1 << SHIFT_1M, + PAGE_4M = 1 << SHIFT_4M, + PAGE_16M = 1 << SHIFT_16M, + PAGE_64M = 1 << SHIFT_64M, + PAGE_256M = 1 << SHIFT_256M, + PAGE_1G = 1 << SHIFT_1G, +} PageSize; + + +/* + * TLB entry format: + * + * TLBHI: + * 63 | 62 | 61 | 60:59 | 58 -- 52 | 51 -------- 32 | + * V | G | EP PPNex | ASID | Virtual Page # | + * ------------------------------------------- + * + * V - Valid bit. + * G - Global bit. If set ASID is ignored and the page + * is globally accessible. + * EP - Extra Physical Bit + * PPNex - Extended Physical Page. (V73 and beyond) + * ASID - Address Space Identifier. + * Virtual Page - Virtual Page number. It has a minimum 4K alignment. + * This means the input value is right shifted 12 bits + * and that is what is placed into this field. + * + * TLBLO: + * 31 | 30 | 29 | 28 | 27 -- 24 | 23 --------- 1 | 0 | + * X | W | R | U | C | Physical Page # | S | + * ---------------------------------------------------- + * + * X - Execute Enabled + * W - Write Enabled + * R - Read Enabled + * U - User mode accessible + * C - Cacheablilty attributes: L1/L2 Cacheable Writeback/thru + * Physical Page - Physical Page # + * + */ + +typedef union { + struct { + uint64_t S:1; + uint64_t PPN:23; + uint64_t CacheAttr:4; + uint64_t XWRU:4; + uint64_t VirtualPage:20; + uint64_t ASID:7; +#if __HEXAGON_ARCH__ < 73 + uint64_t A0:1; + uint64_t A1:1; +#else + uint64_t PPN_EX:2; +#endif + uint64_t EP:1; + uint64_t VG:2; + }; + uint64_t raw; +} TLBEntry; + + +#define TLB_NOT_FOUND 0x80000000 + +int add_translation_extended(int index, void *va, uint64_t pa, + unsigned int page_size, unsigned int xwru, + unsigned int cccc, unsigned int asid, + unsigned int aa, unsigned int vg); +void add_translation_fixed(int index, void *va, void *pa, int cccc, + int permissions); +void add_translation(void *va, void *pa, int cccc); + +#endif /* _TLB_H */ diff --git a/tests/tcg/hexagon/system/crt0/min_libc.c b/tests/tcg/hexagon/system/crt0/min_libc.c new file mode 100644 index 000000000000..f44ee49f8f44 --- /dev/null +++ b/tests/tcg/hexagon/system/crt0/min_libc.c @@ -0,0 +1,359 @@ +/* + * Copyright(c) 2024-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +/* + * Small cheat: take size_t, NULL, and other type/symbol definitions from the + * hexagon toolchain. We cannot link with the libc, though, as the actual + * implementation for functions like printf and open are defined for Linux, and + * we are running on "bare metal". + */ +#include +#include +#include +#include + +FILE *const stdout = (FILE *)1; + +void exit(int code) +{ + asm volatile( + "r2 = %0\n" + "stop(r0)\n" + : + : "r"(code) + : "r2"); + __builtin_unreachable(); +} + +/* The assert() macro will use this. 
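+   It prints the failing expression and its location, then terminates
+   via exit(), i.e. the stop instruction above.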
 */
+void __assert_fail(const char *assertion, const char *file, int line,
+                   const char *function)
+{
+    printf("ASSERT fail '%s' at file '%s' line %d function %s\n",
+           assertion, file, line, function);
+    exit(1);
+}
+
+void *memset(void *b, int c, size_t len)
+{
+    for (size_t i = 0; i < len; i++) {
+        ((unsigned char *)b)[i] = (unsigned char)c;
+    }
+    return b;
+}
+
+int memcmp(const void *p1, const void *p2, size_t n)
+{
+    const char *s1 = p1;
+    const char *s2 = p2;
+    for ( ; n && (*s1 == *s2); s1++, s2++, n--) {
+        /* empty */
+    }
+    return n ? *(unsigned char *)s1 - *(unsigned char *)s2 : 0;
+}
+
+int bcmp(const void *s1, const void *s2, size_t n)
+{
+    return __builtin_bcmp(s1, s2, n);
+}
+
+
+#define HEX_SYS_WRITEC 0x03
+#define HEX_SYS_WRITE0 0x04
+#define HEX_SYS_GET_CMDLINE 0x15
+
+/*
+ * Macro flavors:
+ * - DIRECT_SWI takes up to two args and puts them in r1 and r2.
+ * - SWI takes up to four args and puts them in an array, placing the
+ *   array address at r1.
+ */
+
+static int swi_ret, swi_err, swi_args[4];
+#define DO_SWI(CODE, ARG0, ARG1) \
+    do { \
+        asm volatile( \
+            "r0 = %2\n" \
+            "r1 = %3\n" \
+            "r2 = %4\n" \
+            "trap0(#0)\n" \
+            "%0 = r0\n" \
+            "%1 = r1\n" \
+            : "=r"(swi_ret), "=r"(swi_err) \
+            : "r"(CODE), "r"(ARG0), "r"(ARG1) \
+            : "r0", "r1", "r2", "memory" \
+        ); \
+    } while (0)
+
+#define SWI0(CODE) DO_SWI(CODE, swi_args, 0)
+#define SWI1(CODE, ARG0) \
+    do { swi_args[0] = (uint32_t)(ARG0); SWI0(CODE); } while (0)
+#define SWI2(CODE, ARG0, ARG1) \
+    do { swi_args[1] = (uint32_t)(ARG1); SWI1(CODE, ARG0); } while (0)
+#define SWI3(CODE, ARG0, ARG1, ARG2) \
+    do { swi_args[2] = (uint32_t)(ARG2); SWI2(CODE, ARG0, ARG1); } while (0)
+#define SWI4(CODE, ARG0, ARG1, ARG2, ARG3) \
+    do { swi_args[3] = (uint32_t)(ARG3); SWI3(CODE, ARG0, ARG1, ARG2); } while (0)
+
+#define GET_MACRO_5(_1, _2, _3, _4, _5, NAME, ...) NAME
+#define SWI(...) \
+    ({ GET_MACRO_5(__VA_ARGS__, SWI4, SWI3, SWI2, SWI1, SWI0)(__VA_ARGS__); \
+       swi_ret; })
+
+#define DIRECT_SWI0(CODE) DO_SWI(CODE, 0, 0)
+#define DIRECT_SWI1(CODE, ARG1) DO_SWI(CODE, ARG1, 0)
+#define DIRECT_SWI2(CODE, ARG1, ARG2) DO_SWI(CODE, ARG1, ARG2)
+
+#define GET_MACRO_3(_1, _2, _3, NAME, ...) NAME
+#define DIRECT_SWI(...) \
+    ({ GET_MACRO_3(__VA_ARGS__, DIRECT_SWI2, DIRECT_SWI1, DIRECT_SWI0)(__VA_ARGS__); \
+       swi_ret; })
+
+int puts(const char *str)
+{
+    DIRECT_SWI(HEX_SYS_WRITE0, str);
+    DIRECT_SWI(HEX_SYS_WRITE0, "\n");
+    return 0;
+}
+
+int fputs(const char *str, FILE *f)
+{
+    assert(f == stdout); /* Only stdout is supported. */
+    DIRECT_SWI(HEX_SYS_WRITE0, str);
+    return 0;
+}
+
+size_t fwrite(const void *ptr, size_t size, size_t nitems, FILE *f)
+{
+    assert(f == stdout); /* Only stdout is supported.
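+                            (stdout is just the constant handle defined
+                            at the top of this file)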
*/ + for (size_t i = 0; i < size * nitems; i++) { + DIRECT_SWI(HEX_SYS_WRITEC, &ptr[i]); + } + return size * nitems; +} + +int putchar(int c) +{ + DIRECT_SWI(HEX_SYS_WRITEC, &c); + return c; +} + +static char *num_to_s(uint64_t signed_num, uint64_t base) +{ + static char buffer[1024]; + char *bptr = buffer; + uint64_t num; + + if (base == 16) { + num = signed_num; + } else if (base == 10) { + if (signed_num < 0) { + *bptr++ = '-'; + signed_num *= -1; + } + num = signed_num; + } else { + puts("fatal: num_to_s expects base 16 or 10"); + exit(1); + } + + if (!num) { + return "0"; + } + + uint64_t divider = 1; + for (uint64_t n = num; n >= base; n /= base) { + divider *= base; + } + + while (num) { + unsigned int digit = num / divider; + if (digit) { + num %= divider; + divider /= base; + if (digit >= 10) { + *bptr++ = 'a' + (digit - 10); + } else { + *bptr++ = '0' + digit; + } + while (num < divider) { + *bptr++ = '0'; + divider /= base; + } + } else { + divider /= base; + } + } + + *bptr = '\0'; + return buffer; +} + +static int advance_prefix(const char **str_ptr, char *prefix) +{ + const char *str = *str_ptr; + while (*str && *str == *prefix) { + str++; + prefix++; + } + str--; + if (!*prefix) { + *str_ptr = str; + return 1; + } + return 0; +} + +static char *pad0(char *str, int n) +{ + static char buffer[1024]; + int len = strlen(str); + assert(n < 1024); + + int i; + for (i = 0; i < n - len; i++) { + buffer[i] = '0'; + } + strcpy(&buffer[i], str); + return buffer; +} + +/* + * Very simple implementation. No error checking. + * Supported formats are: + * %d, %s, %c, %x, %016llx + */ +int printf(const char *format, ...) +{ + va_list ap; + __builtin_va_start(ap, format); + for (const char *ptr = format; *ptr; ptr++) { + if (*ptr == '%') { + ptr++; + switch (*ptr) { + case 'd': + case 'x': + case 'p': + { + int num = __builtin_va_arg(ap, int); + fputs(num_to_s(num, *ptr == 'd' ? 10 : 16), stdout); + break; + } + case 's': + fputs(__builtin_va_arg(ap, char *), stdout); + break; + case 'c': + putchar(__builtin_va_arg(ap, int)); + break; + case '%': + putchar('%'); + break; + case '0': + if (advance_prefix(&ptr, "016llx")) { + uint64_t num = __builtin_va_arg(ap, uint64_t); + fputs(pad0(num_to_s(num, 16), 16), stdout); + break; + } + /* else: fallthrough */ + default: + fputs("fatal: unknown printf modifier '", stdout); + putchar(*ptr); + puts("'"); + exit(1); + } + } else { + putchar(*ptr); + } + } + __builtin_va_end(ap); + return 1; +} + +size_t strlen(const char *s) +{ + size_t len = 0; + for ( ; *s; s++) { + len++; + } + return len; +} + +char *strcpy(char *dst, const char *src) +{ + int i; + for (i = 0; src[i]; i++) { + dst[i] = src[i]; + } + dst[i] = '\0'; + return dst; +} + +int strcmp(const char *s1, const char *s2) +{ + for ( ; *s1 && (*s1 == *s2); s1++, s2++) { + /* empty */ + } + return *(unsigned char *)s1 - *(unsigned char *)s2; +} + +char *strrchr(const char *s, int c) +{ + for (int i = strlen(s) - 1; i >= 0; i--) { + if (s[i] == c) { + return (char *)&s[i]; + } + } + return NULL; +} + +#define MAX_ARGS 15 +/* + * Very simplistic implementation, using static buffers, and assuming no + * args will contain spaces. 
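+ * The command line comes from the HEX_SYS_GET_CMDLINE semihosting call
+ * and is split in place, turning each space into a terminator.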
+ */
+static inline char **getcmdline(int *argc)
+{
+    static char *args[MAX_ARGS] = { NULL };
+    static char buf[4096]; /* static: args[] keeps pointing into it */
+    char *c;
+    int id = 0;
+
+    assert(!SWI(HEX_SYS_GET_CMDLINE, buf, sizeof(buf)));
+
+    *argc = 1;
+    for (c = buf; *c; c++) {
+        if (*c == ' ' && *(c + 1)) {
+            (*argc)++;
+        }
+    }
+    assert(*argc <= MAX_ARGS);
+
+    if (*argc == 0) {
+        return args;
+    }
+
+    args[id++] = buf;
+    for (c = buf; *c; c++) {
+        if (*c == ' ') {
+            *c = '\0';
+            if (id < *argc) {
+                args[id++] = c + 1;
+            }
+        }
+    }
+    return args;
+}
+
+int main(int argc, char **argv, char **envp);
+void _start_main(void)
+{
+    int argc;
+    char **argv = getcmdline(&argc);
+    /* For now, we ignore envp */
+    char *envp[] = { NULL };
+    exit(main(argc, argv, envp));
+    exit(1);
+}
diff --git a/tests/tcg/hexagon/system/crt0/pte.S b/tests/tcg/hexagon/system/crt0/pte.S
new file mode 100644
index 000000000000..406e45389118
--- /dev/null
+++ b/tests/tcg/hexagon/system/crt0/pte.S
@@ -0,0 +1,80 @@
+/*
+ * Copyright(c) 2024-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+    .section .start, "awx", @progbits
+    .p2align 3
+    .subsection 1
+/* This is the translation table */
+/* We make a table of 2^12 entries */
+/* Each entry is a .hword (16 bits) */
+/* Each entry is initialized to 0 in the 4 LSBs (cached WB, see system spec) */
+/* Each entry is initialized to 1:1 Virtual:Physical in the upper 12 bits. */
+/* We use the preprocessor to avoid copy-paste errors and to avoid */
+/* an 8192-line addition to the file. */
+
+    .set __UPTE_START, UPTE_START
+    .weak __UPTE_START, UPTE_START
+UPTE_START:
+#define TLBENTRY(X) .hword ((((X) >> 16) & (0xfff0)) | 0x7);
+
+#define TLB_1M(X) TLBENTRY ((X) << 20)
+#define TLB_16M(X) \
+    TLB_1M (((X) << 4) + 0) \
+    TLB_1M (((X) << 4) + 1) \
+    TLB_1M (((X) << 4) + 2) \
+    TLB_1M (((X) << 4) + 3) \
+    TLB_1M (((X) << 4) + 4) \
+    TLB_1M (((X) << 4) + 5) \
+    TLB_1M (((X) << 4) + 6) \
+    TLB_1M (((X) << 4) + 7) \
+    TLB_1M (((X) << 4) + 8) \
+    TLB_1M (((X) << 4) + 9) \
+    TLB_1M (((X) << 4) + 10) \
+    TLB_1M (((X) << 4) + 11) \
+    TLB_1M (((X) << 4) + 12) \
+    TLB_1M (((X) << 4) + 13) \
+    TLB_1M (((X) << 4) + 14) \
+    TLB_1M (((X) << 4) + 15)
+
+#define TLB_256M(X) \
+    TLB_16M (((X) << 4) + 0) \
+    TLB_16M (((X) << 4) + 1) \
+    TLB_16M (((X) << 4) + 2) \
+    TLB_16M (((X) << 4) + 3) \
+    TLB_16M (((X) << 4) + 4) \
+    TLB_16M (((X) << 4) + 5) \
+    TLB_16M (((X) << 4) + 6) \
+    TLB_16M (((X) << 4) + 7) \
+    TLB_16M (((X) << 4) + 8) \
+    TLB_16M (((X) << 4) + 9) \
+    TLB_16M (((X) << 4) + 10) \
+    TLB_16M (((X) << 4) + 11) \
+    TLB_16M (((X) << 4) + 12) \
+    TLB_16M (((X) << 4) + 13) \
+    TLB_16M (((X) << 4) + 14) \
+    TLB_16M (((X) << 4) + 15)
+
+#define TLB_4G \
+    TLB_256M (0) \
+    TLB_256M (1) \
+    TLB_256M (2) \
+    TLB_256M (3) \
+    TLB_256M (4) \
+    TLB_256M (5) \
+    TLB_256M (6) \
+    TLB_256M (7) \
+    TLB_256M (8) \
+    TLB_256M (9) \
+    TLB_256M (10) \
+    TLB_256M (11) \
+    TLB_256M (12) \
+    TLB_256M (13) \
+    TLB_256M (14) \
+    TLB_256M (15)
+
+TLB_4G
+
+    .size UPTE_START, . - UPTE_START
diff --git a/tests/tcg/hexagon/system/crt0/tlb.c b/tests/tcg/hexagon/system/crt0/tlb.c
new file mode 100644
index 000000000000..00e07761dbe9
--- /dev/null
+++ b/tests/tcg/hexagon/system/crt0/tlb.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright(c) 2024-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include <stdint.h>
+#include "hexagon_standalone.h"
+
+/*
+ * The following 2 functions use global addressing mode
+ * to avoid GP relative overflows.
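+ * (The ## prefix forces a full 32-bit constant extender instead of a
+ * GP-relative access.)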
+ */ +static inline uint32_t get_tlb_fixed_entries(void) +{ + uint32_t *addr; + asm volatile ("%0=##_tlb_fixed_entries\n\t" + : "=r"(addr)); + return *addr; +} +static inline uint32_t *get_UPTE_START(void) +{ + uint32_t addr; + asm volatile ("%0=##__UPTE_START\n\t" + : "=r"(addr)); + return (uint32_t *)addr; +} + +static inline uint32_t get_ssr(void) +{ + uint32_t reg; + asm volatile ("%0=ssr\n\t" + : "=r"(reg)); + return reg; +} + + +static inline int64_t read_tlb_entry(int index) +{ + uint64_t reg; + asm volatile ("%[reg]=tlbr(%[index])" + : [reg] "=r" (reg) + : [index] "r" (index)); + asm volatile ("isync"); + return reg; +} + + +static inline void write_tlb_entry(TLBEntry tlb, int index) +{ + uint64_t entry = tlb.raw; + asm volatile ("tlblock\n" + "tlbw(%[entry], %[index])\n" + "isync\n" + "tlbunlock\n" + : + : [entry] "r" (entry), [index] "r" (index)); +} + +static inline int32_t tlb_probe(uint32_t va) +{ + uint32_t VirtualPageNumber = va >> 12; + uint32_t ASID = (get_ssr() >> 8) & 0x7f; + uint32_t probe = ((ASID << 20) | VirtualPageNumber) & 0x7ffffff; + uint32_t result = 0; + asm volatile ("%[result]=tlbp(%[probe])" + : [result] "=r" (result) + : [probe] "r" (probe)); + + return result; +} + + +static inline void tlb_invalidate(uint32_t va) +{ + int entry = tlb_probe(va); + if (entry == TLB_NOT_FOUND) { + return; + } + + TLBEntry tlb; + tlb.raw = read_tlb_entry(entry); + tlb.raw = tlb.raw & ~(1ull << 63); /* Clear the V bit. */ + write_tlb_entry(tlb, entry); +} + + +static inline TLBEntry basic_entry(uint32_t va, uint64_t pa, PageSize pagesize) +{ + TLBEntry T; + uint64_t PPN; + T.raw = 0ull; + T.VirtualPage = va >> 12; /* 63-51 */ +#if __HEXAGON_ARCH__ > 72 + T.PPN_EX = (pa & (3ull << 36)) >> 36; +#endif + T.EP = (pa & (1ull << 35)) >> 35; + PPN = pa >> 12ull; + PPN = (PPN << 1ull) | pagesize; + if (pagesize == 1) { + T.S = 1; + } + T.raw |= PPN; + return T; +} +/* + * function: mkentry + * description: + * - Given just a Physical Address (pa) and a Virtual Address (va) + * create a default entry. + * - A user wanting to change the cache attributes or permissions + * can do so prior to writing the entry. + */ +static TLBEntry mkentry(uint32_t va, uint64_t pa, PageSize pagesize) +{ + + /* Make an entry and set some reasonable defaults */ + TLBEntry T = basic_entry(va, pa, pagesize); + + T.CacheAttr = 0x7; + T.XWRU = 0x6; + T.VG = 0x3; + return T; +} + +int add_translation_extended(int index, void *va, uint64_t pa, + unsigned int page_size, unsigned int xwru, + unsigned int cccc, unsigned int asid, + unsigned int aa, unsigned int vg) +{ + uint32_t num_entries = get_tlb_fixed_entries(); + + if ((index < 1) || (index > (num_entries - 1))) { + return -1; + } + + tlb_invalidate((uint32_t)va); + TLBEntry T; + T = basic_entry((uint32_t)va, pa, page_size); + T.ASID = ((uint64_t)asid & 0x7f); + T.CacheAttr = ((uint64_t)cccc & 0xf); + T.XWRU = ((uint64_t)xwru & 0xf); + T.VG = ((uint64_t)vg & 0x3); +#if __HEXAGON_ARCH__ < 73 + T.raw |= ((uint64_t)aa & 0x3) << 59ull; +#endif + write_tlb_entry(T, index); + + return 0; +} + + +void add_translation_fixed(int index, void *va, void *pa, int cccc, + int permissions) +{ + tlb_invalidate((uint32_t)va); + add_translation_extended(index, va, (uint64_t)pa, PAGE_1M, permissions, cccc, + 0, 0, 3); +} + +/* + * The following deals with the PTE software structure. The actual entry will + * not be placed into the TLB until an address fault occurrs. 
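+ * Each 16-bit entry packs a 12-bit physical page number with 4 cache
+ * attribute bits; see SMALL_PTE below.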
+ */ + +typedef union { + struct { + uint16_t cache:4; + uint16_t pa:12; + }; + uint16_t PTE_raw; +} SMALL_PTE; + +static SMALL_PTE *findPTEAddr(uint32_t va) +{ + uint32_t *PTE = get_UPTE_START(); + int index = va >> 20; + return (SMALL_PTE *)PTE + index; +} +static SMALL_PTE findPTEValue(uint32_t va) +{ + SMALL_PTE *A = findPTEAddr(va); + return *A; +} + +/* This function adds a translation into the mapping table, see above */ +/* Because we use 1MB pages, we only need to translate 12 bits. */ +/* We keep those 12 bits plus 4 bits (where we keep the C field, */ +/* see the System-level architecture spec on TLB entries) in */ +/* a 16-bit entry in the table. */ +/* We index into the table using the upper 12 bits. */ +/* As a note, 2 bytes x 2^12 entries == 8KB table */ +void add_translation(void *va, void *pa, int cccc) +{ + SMALL_PTE *S = findPTEAddr((uint32_t)va); + S->pa = (uint32_t)pa >> 20; + S->cache = cccc; +} diff --git a/tests/tcg/hexagon/system/fastl2vic.c b/tests/tcg/hexagon/system/fastl2vic.c new file mode 100644 index 000000000000..a115ae73f799 --- /dev/null +++ b/tests/tcg/hexagon/system/fastl2vic.c @@ -0,0 +1,73 @@ +/* + * Copyright(c) 2024-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +/* + * Test the fastl2vic interface. + * + * hexagon-sim a.out --subsystem_base=0xfab0 --cosim_file q6ss.cfg + */ + +#include "crt0/hexagon_standalone.h" + +#include "cfgtable.h" + +#define CSR_BASE 0xfab00000 +#define L2VIC_BASE ((CSR_BASE) + 0x10000) +#define L2VIC_INT_ENABLE(b, n) \ + ((unsigned int *) ((b) + 0x100 + 4 * (n / 32))) +#define L2VIC_INT_ENABLE_SET(b, n) \ + ((unsigned int *) ((b) + 0x200 + 4 * (n / 32))) + +int main() +{ + int ret = 0; + unsigned int irq_bit; + + /* setup the fastl2vic interface and setup an indirect mapping */ + volatile uint32_t *A = (uint32_t *)0x888e0000; + add_translation_extended(3, (void *)A, GET_FASTL2VIC_BASE(), 16, 7, 4, 0, 0, 3); + + uint32_t l2vic_base = GET_SUBSYSTEM_BASE() + 0x10000; + + /* set and verify an interrupt using the L2VIC_BASE */ + irq_bit = (1 << (66 % 32)); + *L2VIC_INT_ENABLE_SET(l2vic_base, 66) = irq_bit; + if (*L2VIC_INT_ENABLE(l2vic_base, 64) != 0x4) { + ret = __LINE__; + } + + /* set and verify an interrupt using the FASTL2VIC interface */ + *A = 68; + if (*L2VIC_INT_ENABLE(l2vic_base, 64) != 0x14) { + ret = __LINE__; + } + *A = 67; + if (*L2VIC_INT_ENABLE(l2vic_base, 64) != 0x1C) { + ret = __LINE__; + } + + + /* Now clear the lines */ + *A = ((1 << 16) | 68); + if (*L2VIC_INT_ENABLE(l2vic_base, 64) != 0xC) { + ret = __LINE__; + } + *A = ((1 << 16) | 66); + if (*L2VIC_INT_ENABLE(l2vic_base, 64) != 0x8) { + ret = __LINE__; + } + *A = ((1 << 16) | 67); + if (*L2VIC_INT_ENABLE(l2vic_base, 64) != 0x0) { + ret = __LINE__; + } + + if (ret) { + printf("%s: FAIL, last failure near line %d\n", __FILE__, ret); + } else { + printf("PASS\n"); + } + return ret; +} diff --git a/tests/tcg/hexagon/system/hvx-multi.c b/tests/tcg/hexagon/system/hvx-multi.c new file mode 100644 index 000000000000..0d2e90c2c79b --- /dev/null +++ b/tests/tcg/hexagon/system/hvx-multi.c @@ -0,0 +1,119 @@ +/* + * Copyright(c) 2023-2025 Qualcomm Innovation Center, Inc. All Rights Reserved. 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+int err;
+
+#include "../hvx_misc.h"
+
+void set_hvx_context(int n)
+{
+    uint32_t ssr_context_bits = n << 27;
+    asm volatile(
+        "r1 = ssr\n"
+        "r1 = and(r1, ##0xc7ffffff)\n"
+        "r1 = or(r1, %0)\n"
+        "ssr = r1\n"
+        "isync\n"
+        :
+        : "r"(ssr_context_bits)
+        : "r1"
+    );
+}
+
+void setv0(int n)
+{
+    asm volatile(
+        "v0 = vsplat(%0)\n"
+        : : "r"(n) : "v0"
+    );
+}
+
+void store_v0(MMVector *v)
+{
+    asm volatile(
+        "vmemu(%0) = v0\n"
+        :
+        : "r"(v)
+        : "memory"
+    );
+}
+
+uint32_t get_num_contexts(void)
+{
+    const int EXT_CONTEXT_OFFSET = 13;
+    unsigned int cfgbase;
+    asm volatile("%0 = cfgbase\n" : "=r"(cfgbase));
+    uint32_t *cfgtable = (uint32_t *)(cfgbase << 16);
+    return *(cfgtable + EXT_CONTEXT_OFFSET);
+}
+
+uint32_t get_rev(void)
+{
+    uint32_t rev;
+    asm volatile("%0 = rev\n" : "=r"(rev));
+    return rev;
+}
+
+/*
+ * This test verifies that each new context is properly selected and is
+ * independent of the thread.
+ */
+int main()
+{
+    int num_contexts = get_num_contexts();
+    printf("rev=v%x, HVX-contexts=%d\n", (int)(get_rev() & 0xff), num_contexts);
+    memset(&output[0], 0, 8 * sizeof(MMVector));
+
+    /* First set v0 on all the contexts. */
+    for (int i = 0; i < num_contexts; i++) {
+        set_hvx_context(i);
+        setv0(i + 1);
+    }
+
+    /*
+     * Now each context should have its own v0 value. Save it to memory. We
+     * check all possible SSR.XA values to make sure the "aliases" are
+     * implemented correctly.
+     */
+    for (int i = 0; i < 8; i++) {
+        set_hvx_context(i);
+        store_v0(&output[i]);
+    }
+
+
+    /*
+     * Set expected values:
+     *
+     *                             num contexts
+     * SSR.XA        2               4               6               8
+     *  000    HVX Context 0   HVX Context 0   HVX Context 0   HVX Context 0
+     *  001    HVX Context 1   HVX Context 1   HVX Context 1   HVX Context 1
+     *  010    HVX Context 0   HVX Context 2   HVX Context 2   HVX Context 2
+     *  011    HVX Context 1   HVX Context 3   HVX Context 3   HVX Context 3
+     *  100    HVX Context 0   HVX Context 0   HVX Context 4   HVX Context 4
+     *  101    HVX Context 1   HVX Context 1   HVX Context 5   HVX Context 5
+     *  110    HVX Context 0   HVX Context 2   HVX Context 2   HVX Context 6
+     *  111    HVX Context 1   HVX Context 3   HVX Context 3   HVX Context 7
+     */
+    for (int i = 0; i < 8; i++) {
+        int expected = (i % num_contexts) + 1;
+        /* Exception for num_contexts=6 */
+        if (num_contexts == 6 && i >= 6) {
+            expected = (i - 6 + 2) + 1;
+        }
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            expect[i].w[j] = expected;
+        }
+    }
+
+    check_output_w(__LINE__, 8);
+    puts(err ? "FAIL" : "PASS");
+    return !!err;
+}
diff --git a/tests/tcg/hexagon/system/int_range.c b/tests/tcg/hexagon/system/int_range.c
new file mode 100644
index 000000000000..688355886362
--- /dev/null
+++ b/tests/tcg/hexagon/system/int_range.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright(c) 2023-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+/*
+ * Test the range of the l2vic interface.
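+ * All 1024 interrupt enable bits are set, read back, cleared, and
+ * re-checked through the per-word L2VIC_INT_ENABLE* banks.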
+ */
+
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include "cfgtable.h"
+
+#define L2VIC_INT_ENABLE(b, n) \
+    ((volatile unsigned int *)((b) + 0x100 + 4 * (n / 32))) /* device mem */
+
+#define L2VIC_INT_ENABLE_SET(b, n) \
+    ((volatile unsigned int *)((b) + 0x200 + 4 * (n / 32))) /* device mem */
+
+#define L2VIC_INT_ENABLE_CLEAR(b, n) \
+    ((volatile unsigned int *)((b) + 0x180 + 4 * (n / 32))) /* device mem */
+
+#define L2VIC_SOFT_INT_SET(b, n) \
+    ((volatile unsigned int *)((b) + 0x480 + 4 * (n / 32))) /* device mem */
+
+#define L2VIC_INT_TYPE(b, n) \
+    ((volatile unsigned int *)((b) + 0x280 + 4 * (n / 32))) /* device mem */
+
+volatile int pass; /* must use volatile */
+int g_irq;
+volatile uint32_t g_l2vic_base; /* must use volatile */
+
+
+/*
+ * When complete the irqlog will contain the value of the vid when the
+ * handler was active.
+ */
+#define INTMAX 1024
+#define LEFT_SET 666
+
+int main()
+{
+    unsigned int irq_bit;
+    unsigned int left_set = 0;
+    int ret = 0;
+
+    /* setup the fastl2vic interface and setup an indirect mapping */
+    g_l2vic_base = GET_SUBSYSTEM_BASE() + 0x10000;
+
+    /* Setup interrupts */
+    for (int irq = 1; irq < INTMAX; irq++) {
+        irq_bit = (1 << (irq % 32));
+        *L2VIC_INT_ENABLE(g_l2vic_base, irq) |= irq_bit;
+    }
+
+    /* Read them all back and check */
+    for (int irq = 1; irq < INTMAX; irq++) {
+        if ((*L2VIC_INT_ENABLE(g_l2vic_base, irq) & (1 << (irq % 32))) !=
+            (1 << irq % 32)) {
+            printf("%d: ERROR: irq: %d: 0x%x\n", __LINE__, irq,
+                   *L2VIC_INT_ENABLE(g_l2vic_base, irq));
+            ret = 1;
+        }
+    }
+    /* Clear them all, except LEFT_SET (test) */
+    for (int irq = 1; irq < INTMAX; irq++) {
+        if (!(irq % LEFT_SET)) {
+            continue;
+        }
+        irq_bit = (1 << (irq % 32));
+        *L2VIC_INT_ENABLE_CLEAR(g_l2vic_base, irq) |= irq_bit;
+    }
+
+    /* make sure just LEFT_SET is set */
+    for (int irq = 0; irq < INTMAX; irq++) {
+        if ((*L2VIC_INT_ENABLE(g_l2vic_base, irq) & (1 << (irq % 32))) !=
+            (0 << irq % 32)) {
+            if (irq != LEFT_SET) {
+                printf("%d: ERROR: irq: %d: 0x%x\n", __LINE__, irq,
+                       *L2VIC_INT_ENABLE(g_l2vic_base, irq));
+                ret = 1;
+            } else {
+                left_set = irq;
+            }
+        }
+    }
+    if (left_set == LEFT_SET) {
+        printf("PASS\n");
+    }
+    return ret;
+}
diff --git a/tests/tcg/hexagon/system/mmu.h b/tests/tcg/hexagon/system/mmu.h
new file mode 100644
index 000000000000..0856c94ab5dd
--- /dev/null
+++ b/tests/tcg/hexagon/system/mmu.h
@@ -0,0 +1,718 @@
+/*
+ * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef MMU_H
+#define MMU_H
+#include <assert.h>
+#include <stdio.h>
+#include <stdint.h>
+#include "crt0/hexagon_standalone.h"
+
+/*
+ * Helpers for MMU tests
+ */
+
+#define TARGET_PAGE_BITS 12
+#ifndef TLB_NOT_FOUND
+#define TLB_NOT_FOUND (1 << 31)
+#endif
+
+static inline uint32_t page_start(uint32_t addr, uint32_t page_size_bits)
+{
+    uint32_t page_size = 1 << page_size_bits;
+    uint32_t page_align = ~(page_size - 1);
+    return addr & page_align;
+}
+
+/*
+ * The Hexagon standalone runtime leaves TLB entries 1-5 reserved for
+ * user-defined entries.
We'll set them up to map virtual addresses at + * 1MB offsets above the actual physical address + * PA == VA - (entry_num * 1MB) + * + * We'll define some macros/functions to help with the manipulation + */ + +#define ONE_MB (1 << 20) +#define TWO_MB (2 * ONE_MB) +#define THREE_MB (3 * ONE_MB) +#define FOUR_MB (4 * ONE_MB) +#define FIVE_MB (5 * ONE_MB) + +#define ONE_MB_ENTRY 1 +#define TWO_MB_ENTRY 2 +#define THREE_MB_ENTRY 3 +#define FOUR_MB_ENTRY 4 +#define FIVE_MB_ENTRY 5 + +static inline uint32_t tlb_entry_num(uint32_t va) +{ + return va >> 20; +} + +#define fZXTN(N, M, VAL) ((VAL) & ((1LL << (N)) - 1)) +#define fEXTRACTU_BITS(INREG, WIDTH, OFFSET) \ + (fZXTN(WIDTH, 32, (INREG >> OFFSET))) + +#define fINSERT_BITS(REG, WIDTH, OFFSET, INVAL) \ + do { \ + REG = ((REG) & ~(((1LL << (WIDTH)) - 1) << (OFFSET))) | \ + (((INVAL) & ((1LL << (WIDTH)) - 1)) << (OFFSET)); \ + } while (0) + +#define GET_FIELD(ENTRY, FIELD) \ + fEXTRACTU_BITS(ENTRY, reg_field_info[FIELD].width, \ + reg_field_info[FIELD].offset) +#define SET_FIELD(ENTRY, FIELD, VAL) \ + fINSERT_BITS(ENTRY, reg_field_info[FIELD].width, \ + reg_field_info[FIELD].offset, (VAL)) + +typedef struct { + int offset; + int width; +} reg_field_t; + +enum reg_fields_enum { +#define DEF_REG_FIELD(TAG, NAME, START, WIDTH, DESCRIPTION) \ + TAG, +#include "reg_fields_def.h" + NUM_REG_FIELDS +#undef DEF_REG_FIELD +}; + +static const reg_field_t reg_field_info[] = { +#define DEF_REG_FIELD(TAG, NAME, START, WIDTH, DESCRIPTION) \ + { START, WIDTH }, + +#include "reg_fields_def.h" + + { 0, 0 } +#undef DEF_REG_FIELD +}; + +/* + * PPD (physical page descriptor) is formed by putting the PTE_PA35 field + * in the MSB of the PPD + */ +#define GET_PPD(ENTRY) \ + ((GET_FIELD((ENTRY), PTE_PPD) | \ + (GET_FIELD((ENTRY), PTE_PA35) << reg_field_info[PTE_PPD].width))) + +#define NUM_PGSIZE_TYPES (SHIFT_1G + 1) + +static const char *pgsize_str(PageSize pgsize) +{ + static const char *size_str[NUM_PGSIZE_TYPES] = { + "4K", + "16K", + "64K", + "256K", + "1M", + "4M", + "16M", + "64M", + "256M", + "1G" + }; + assert(pgsize); + return size_str[__builtin_ctz(pgsize)]; +} + +static const uint64_t encmask_2_mask[] = { + 0x0fffLL, /* 4k, 0000 */ + 0x3fffLL, /* 16k, 0001 */ + 0xffffLL, /* 64k, 0010 */ + 0x3ffffLL, /* 256k, 0011 */ + 0xfffffLL, /* 1m, 0100 */ + 0x3fffffLL, /* 4m, 0101 */ + 0xffffffLL, /* 16M, 0110 */ + 0xffffffffLL, /* RSVD, 0111 */ +}; + +static inline int hex_tlb_pgsize(uint64_t entry) +{ + assert(entry != 0); + int size = __builtin_ctzll(entry); + assert(size < NUM_PGSIZE_TYPES); + return size; +} + +static inline uint32_t hex_tlb_page_size(uint64_t entry) +{ + return 1 << (TARGET_PAGE_BITS + 2 * hex_tlb_pgsize(entry)); +} + +static inline uint64_t hex_tlb_phys_page_num(uint64_t entry) +{ + uint32_t ppd = GET_PPD(entry); + return ppd >> 1; +} + +static inline uint64_t hex_tlb_phys_addr(uint64_t entry) +{ + uint64_t pagemask = encmask_2_mask[hex_tlb_pgsize(entry)]; + uint64_t pagenum = hex_tlb_phys_page_num(entry); + uint64_t PA = (pagenum << TARGET_PAGE_BITS) & (~pagemask); + return PA; +} + +static inline uint64_t hex_tlb_virt_addr(uint64_t entry) +{ + return GET_FIELD(entry, PTE_VPN) << TARGET_PAGE_BITS; +} + +static inline uint64_t create_mmu_entry(uint8_t G, uint8_t A0, uint8_t A1, + uint8_t ASID, uint32_t VA, + uint8_t X, int8_t W, uint8_t R, + uint8_t U, uint8_t C, uint64_t PA, + PageSize SZ) +{ + uint64_t entry = 0; + SET_FIELD(entry, PTE_V, 1); + SET_FIELD(entry, PTE_G, G); + SET_FIELD(entry, PTE_ATR0, A0); + SET_FIELD(entry, PTE_ATR1, A1); + 
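+    /* ASID and VPN form the key that tlbp() matches on; the remaining
+       fields are attribute and permission bits. */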
SET_FIELD(entry, PTE_ASID, ASID); + SET_FIELD(entry, PTE_VPN, VA >> TARGET_PAGE_BITS); + SET_FIELD(entry, PTE_X, X); + SET_FIELD(entry, PTE_W, W); + SET_FIELD(entry, PTE_R, R); + SET_FIELD(entry, PTE_U, U); + SET_FIELD(entry, PTE_C, C); + SET_FIELD(entry, PTE_PA35, (PA >> (TARGET_PAGE_BITS + 35)) & 1); + SET_FIELD(entry, PTE_PPD, ((PA >> (TARGET_PAGE_BITS - 1)))); + entry |= SZ; + return entry; +} + +static inline uint64_t tlbr(uint32_t i) +{ + uint64_t ret; + asm volatile ("%0 = tlbr(%1)\n\t" : "=r"(ret) : "r"(i)); + return ret; +} + +static inline uint32_t ctlbw(uint64_t entry, uint32_t idx) +{ + uint32_t ret; + asm volatile ("%0 = ctlbw(%1, %2)\n\t" : "=r"(ret) : "r"(entry), "r"(idx)); + return ret; +} + +static inline uint32_t tlbp(uint32_t asid, uint32_t VA) +{ + uint32_t x = ((asid & 0x7f) << 20) | ((VA >> 12) & 0xfffff); + uint32_t ret; + asm volatile ("%0 = tlbp(%1)\n\t" : "=r"(ret) : "r"(x)); + return ret; +} + +static inline void tlbw(uint64_t entry, uint32_t idx) +{ + asm volatile ("tlbw(%0, %1)\n\t" :: "r"(entry), "r"(idx)); +} + +static inline uint32_t tlboc(uint64_t entry) +{ + uint32_t ret; + asm volatile ("%0 = tlboc(%1)\n\t" : "=r"(ret) : "r"(entry)); + return ret; +} + +void tlbinvasid(uint32_t entry_hi) +{ + asm volatile ("tlbinvasid(%0)\n\t" :: "r"(entry_hi)); +} + +static inline void enter_user_mode(void) +{ + asm volatile ("r0 = ssr\n\t" + "r0 = clrbit(r0, #17) // EX\n\t" + "r0 = setbit(r0, #16) // UM\n\t" + "r0 = clrbit(r0, #19) // GM\n\t" + "ssr = r0\n\t" : : : "r0"); +} + +static inline void enter_kernel_mode(void) +{ + asm volatile ("r0 = ssr\n\t" + "r0 = clrbit(r0, #17) // EX\n\t" + "r0 = clrbit(r0, #16) // UM\n\t" + "r0 = clrbit(r0, #19) // GM\n\t" + "ssr = r0\n\t" : : : "r0"); +} + +static inline uint32_t *getevb() +{ + uint32_t reg; + asm volatile ("%0 = evb\n\t" : "=r"(reg)); + return (uint32_t *)reg; +} + +static inline void setevb(void *new_evb) +{ + asm volatile("evb = %0\n\t" : : "r"(new_evb)); +} + +static inline uint32_t getbadva() +{ + uint32_t badva; + asm volatile ("%0 = badva\n\t" : "=r"(badva)); + return badva; +} + +static void inc_elr(uint32_t inc) +{ + + asm volatile ("r1 = %0\n\t" + "r2 = elr\n\t" + "r1 = add(r2, r1)\n\t" + "elr = r1\n\t" + : : "r"(inc) : "r1", "r2"); +} + +static inline void do_coredump(void) +{ + asm volatile("r0 = #2\n\t" + "stid = r0\n\t" + "jump __coredump\n\t" : : : "r0"); +} + +static inline uint32_t getssr(void) +{ + uint32_t ret; + asm volatile ("%0 = ssr\n\t" : "=r"(ret)); + return ret; +} + +static inline void setssr(uint32_t new_ssr) +{ + asm volatile ("ssr = %0\n\t" :: "r"(new_ssr)); +} + +static inline void set_asid(uint32_t asid) +{ + uint32_t ssr = getssr(); + SET_FIELD(ssr, SSR_ASID, asid); + setssr(ssr); +} + +int err; +#include "../hex_test.h" + +static void *old_evb; + +typedef uint64_t exception_vector[2]; +static exception_vector my_exceptions; + +static inline void clear_exception_vector(exception_vector excp) +{ + excp[0] = 0; + excp[1] = 0; +} + +static inline void set_exception_vector_bit(exception_vector excp, uint32_t bit) +{ + if (bit < 64) { + excp[0] |= 1LL << bit; + } else if (bit < 128) { + excp[1] |= 1LL << (bit - 64); + } +} + +#define check_exception_vector(excp, expect) \ + do { \ + check64(excp[0], expect[0]); \ + check64(excp[1], expect[1]); \ + } while (0) + +static inline void print_exception_vector(exception_vector excp) +{ + printf("exceptions (0x%016llx 0x%016llx):", excp[1], excp[0]); + for (int i = 0; i < 64; i++) { + if (excp[0] & (1LL << i)) { + printf(" 0x%x", i); + } + } + for 
(int i = 0; i < 64; i++) { + if (excp[1] & (1LL << i)) { + printf(" 0x%x", i + 64); + } + } + printf("\n"); +} + +/* volatile because it is written through different MMU mappings */ +typedef volatile int mmu_variable; +mmu_variable data = 0xdeadbeef; + +typedef int (*func_t)(void); +/* volatile because it will be invoked via different MMU mappings */ +typedef volatile func_t mmu_func_t; + +/* + * Create a function that returns its (virtual) address + * Write it fully in assembly so we don't have to worry about + * which optimization level we are compiled with + */ +extern int func_return_pc(void); +asm( +".global func_return_pc\n" +".balign 4\n" +".type func_return_pc, @function\n" +"func_return_pc:\n" +" r0 = pc\n" +" jumpr r31\n" +".size func_return_pc, . - func_return_pc\n" +); + +enum { + TLB_U = (1 << 0), + TLB_R = (1 << 1), + TLB_W = (1 << 2), + TLB_X = (1 << 3), +}; + +#define HEX_CAUSE_FETCH_NO_XPAGE 0x011 +#define HEX_CAUSE_FETCH_NO_UPAGE 0x012 +#define HEX_CAUSE_PRIV_NO_READ 0x022 +#define HEX_CAUSE_PRIV_NO_WRITE 0x023 +#define HEX_CAUSE_PRIV_NO_UREAD 0x024 +#define HEX_CAUSE_PRIV_NO_UWRITE 0x025 +#define HEX_CAUSE_IMPRECISE_MULTI_TLB_MATCH 0x044 +#define HEX_CAUSE_TLBMISSX_NORMAL 0x060 +#define HEX_CAUSE_TLBMISSX_NEXTPAGE 0x061 +#define HEX_CAUSE_TLBMISSRW_READ 0x070 +#define HEX_CAUSE_TLBMISSRW_WRITE 0x071 + +/* + * The following lets us override the default exception handlers + * This can be handy for adding code to check that they are called as well + * as special handling needed for the test to succeed. + * + * MY_EVENT_HANDLE Use this to define your own event handler + * DEFAULT_EVENT_HANDLE Use this to point to the default handler + * my_event_vectors New event vector table + * install_my_event_vectors Change from the default event handlers + */ + +extern void *my_event_vectors; + +#define MY_EVENT_HANDLE(name, helper) \ +void name(void) \ +{ \ + asm volatile("crswap(sp, sgp0)\n\t" \ + "memd(sp++#8) = r1:0\n\t" \ + "memd(sp++#8) = r3:2\n\t" \ + "memd(sp++#8) = r5:4\n\t" \ + "memd(sp++#8) = r7:6\n\t" \ + "memd(sp++#8) = r9:8\n\t" \ + "memd(sp++#8) = r11:10\n\t" \ + "memd(sp++#8) = r13:12\n\t" \ + "memd(sp++#8) = r15:14\n\t" \ + "memd(sp++#8) = r17:16\n\t" \ + "memd(sp++#8) = r19:18\n\t" \ + "memd(sp++#8) = r21:20\n\t" \ + "memd(sp++#8) = r23:22\n\t" \ + "memd(sp++#8) = r25:24\n\t" \ + "memd(sp++#8) = r27:26\n\t" \ + "memd(sp++#8) = r31:30\n\t" \ + "r0 = ssr\n\t" \ + "call " #helper "\n\t" \ + "sp = add(sp, #-8)\n\t" \ + "r31:30 = memd(sp++#-8)\n\t" \ + "r27:26 = memd(sp++#-8)\n\t" \ + "r25:24 = memd(sp++#-8)\n\t" \ + "r23:22 = memd(sp++#-8)\n\t" \ + "r21:20 = memd(sp++#-8)\n\t" \ + "r19:18 = memd(sp++#-8)\n\t" \ + "r17:16 = memd(sp++#-8)\n\t" \ + "r15:14 = memd(sp++#-8)\n\t" \ + "r13:12 = memd(sp++#-8)\n\t" \ + "r11:10 = memd(sp++#-8)\n\t" \ + "r9:8 = memd(sp++#-8)\n\t" \ + "r7:6 = memd(sp++#-8)\n\t" \ + "r5:4 = memd(sp++#-8)\n\t" \ + "r3:2 = memd(sp++#-8)\n\t" \ + "r1:0 = memd(sp)\n\t" \ + "crswap(sp, sgp0);\n\t" \ + "rte\n\t"); \ +} + +#ifndef NO_DEFAULT_EVENT_HANDLES + +#define DEFAULT_EVENT_HANDLE(name, offset) \ +void name(void) \ +{ \ + asm volatile("r0 = %0\n\t" \ + "r0 = add(r0, #" #offset ")\n\t" \ + "jumpr r0\n\t" \ + : : "r"(old_evb) : "r0"); \ +} + + +/* Use these values as the offset for DEFAULT_EVENT_HANDLE */ +asm ( +".set HANDLE_RESET_OFFSET, 0x00\n\t" +".set HANDLE_NMI_OFFSET, 0x04\n\t" +".set HANDLE_ERROR_OFFSET, 0x08\n\t" +".set HANDLE_RSVD_OFFSET, 0x0c\n\t" +".set HANDLE_TLBMISSX_OFFSET, 0x10\n\t" +".set HANDLE_TLBMISSRW_OFFSET, 0x18\n\t" +".set 
HANDLE_TRAP0_OFFSET, 0x20\n\t" +".set HANDLE_TRAP1_OFFSET, 0x24\n\t" +".set HANDLE_FPERROR_OFFSET, 0x28\n\t" +".set HANDLE_INT_OFFSET, 0x40\n\t" +); + +asm( +".align 0x1000\n\t" +"my_event_vectors:\n\t" + "jump my_event_handle_reset\n\t" + "jump my_event_handle_nmi\n\t" + "jump my_event_handle_error\n\t" + "jump my_event_handle_rsvd\n\t" + "jump my_event_handle_tlbmissx\n\t" + "jump my_event_handle_rsvd\n\t" + "jump my_event_handle_tlbmissrw\n\t" + "jump my_event_handle_rsvd\n\t" + "jump my_event_handle_trap0\n\t" + "jump my_event_handle_trap1\n\t" + "jump my_event_handle_rsvd\n\t" + "jump my_event_handle_fperror\n\t" + "jump my_event_handle_rsvd\n\t" + "jump my_event_handle_rsvd\n\t" + "jump my_event_handle_rsvd\n\t" + "jump my_event_handle_rsvd\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" + "jump my_event_handle_int\n\t" +); + +#define DEFAULT_EVENT_HANDLES \ +DEFAULT_EVENT_HANDLE(my_event_handle_error, HANDLE_ERROR_OFFSET) \ +DEFAULT_EVENT_HANDLE(my_event_handle_nmi, HANDLE_NMI_OFFSET) \ +DEFAULT_EVENT_HANDLE(my_event_handle_tlbmissrw, HANDLE_TLBMISSRW_OFFSET) \ +DEFAULT_EVENT_HANDLE(my_event_handle_tlbmissx, HANDLE_TLBMISSX_OFFSET) \ +DEFAULT_EVENT_HANDLE(my_event_handle_reset, HANDLE_RESET_OFFSET) \ +DEFAULT_EVENT_HANDLE(my_event_handle_rsvd, HANDLE_RSVD_OFFSET) \ +DEFAULT_EVENT_HANDLE(my_event_handle_trap0, HANDLE_TRAP0_OFFSET) \ +DEFAULT_EVENT_HANDLE(my_event_handle_trap1, HANDLE_TRAP1_OFFSET) \ +DEFAULT_EVENT_HANDLE(my_event_handle_int, HANDLE_INT_OFFSET) \ +DEFAULT_EVENT_HANDLE(my_event_handle_fperror, HANDLE_FPERROR_OFFSET) + +#endif /* NO_DEFAULT_EVENT_HANDLES */ + +/* When a permission error happens, add the permission to the TLB entry */ +void my_event_handle_error_helper(uint32_t ssr) +{ + uint32_t cause = GET_FIELD(ssr, SSR_CAUSE); + uint32_t badva = getbadva(); + uint32_t entry_num = tlb_entry_num(badva); + uint64_t entry; + + set_exception_vector_bit(my_exceptions, cause); + + switch (cause) { + case HEX_CAUSE_FETCH_NO_XPAGE: + entry = tlbr(entry_num); + SET_FIELD(entry, PTE_X, 1); + tlbw(entry, entry_num); + break; + case HEX_CAUSE_FETCH_NO_UPAGE: + entry = tlbr(entry_num); + SET_FIELD(entry, PTE_U, 1); + tlbw(entry, entry_num); + break; + case HEX_CAUSE_PRIV_NO_READ: + entry = tlbr(entry_num); + SET_FIELD(entry, PTE_R, 1); + tlbw(entry, entry_num); + break; + case HEX_CAUSE_PRIV_NO_WRITE: + entry = tlbr(entry_num); + SET_FIELD(entry, PTE_W, 1); + tlbw(entry, entry_num); + break; + case HEX_CAUSE_PRIV_NO_UREAD: + entry = tlbr(entry_num); + SET_FIELD(entry, PTE_U, 1); + tlbw(entry, entry_num); + break; + case 
HEX_CAUSE_PRIV_NO_UWRITE:
+        entry = tlbr(entry_num);
+        SET_FIELD(entry, PTE_U, 1);
+        tlbw(entry, entry_num);
+        break;
+    default:
+        do_coredump();
+        break;
+    }
+}
+
+void my_event_handle_nmi_helper(uint32_t ssr)
+{
+    uint32_t cause = GET_FIELD(ssr, SSR_CAUSE);
+
+    set_exception_vector_bit(my_exceptions, cause);
+
+    switch (cause) {
+    case HEX_CAUSE_IMPRECISE_MULTI_TLB_MATCH:
+        break;
+    default:
+        do_coredump();
+        break;
+    }
+}
+
+/*
+ * When a TLB miss happens, create a mapping.
+ * We'll set different read/write/execute permissions
+ * for different entry numbers.
+ */
+void my_event_handle_tlbmissrw_helper(uint32_t ssr)
+{
+    uint32_t cause = GET_FIELD(ssr, SSR_CAUSE);
+    uint32_t badva = getbadva();
+    uint32_t entry_num = tlb_entry_num(badva);
+    uint32_t VA = page_start(badva, TARGET_PAGE_BITS);
+    uint32_t PA = VA - (entry_num * ONE_MB);
+
+    uint64_t entry =
+        create_mmu_entry(1, 0, 0, 0, VA, 0, 0, 0, 1, 0x3, PA, PAGE_4K);
+    if (entry_num == TWO_MB_ENTRY) {
+        SET_FIELD(entry, PTE_R, 1);
+    }
+    if (entry_num == THREE_MB_ENTRY) {
+        SET_FIELD(entry, PTE_W, 1);
+    }
+
+    set_exception_vector_bit(my_exceptions, cause);
+
+    switch (cause) {
+    case HEX_CAUSE_TLBMISSRW_READ:
+        tlbw(entry, entry_num);
+        break;
+    case HEX_CAUSE_TLBMISSRW_WRITE:
+        tlbw(entry, entry_num);
+        break;
+    default:
+        do_coredump();
+        break;
+    }
+}
+
+void my_event_handle_tlbmissx_helper(uint32_t ssr)
+{
+    uint32_t cause = GET_FIELD(ssr, SSR_CAUSE);
+    uint32_t badva = getbadva();
+    uint32_t entry_num = tlb_entry_num(badva);
+    uint32_t VA = page_start(badva, TARGET_PAGE_BITS);
+    uint32_t PA = VA - (entry_num * ONE_MB);
+
+    uint64_t entry =
+        create_mmu_entry(1, 0, 0, 0, VA, 0, 0, 0, 1, 0x3, PA, PAGE_4K);
+
+    set_exception_vector_bit(my_exceptions, cause);
+
+    switch (cause) {
+    case HEX_CAUSE_TLBMISSX_NORMAL:
+        tlbw(entry, entry_num);
+        break;
+    default:
+        do_coredump();
+        break;
+    }
+}
+
+static inline void install_my_event_vectors(void)
+{
+    old_evb = getevb();
+    setevb(&my_event_vectors);
+}
+
+#define MAKE_GOTO(name) \
+void goto_##name(void) \
+{ \
+    asm volatile("r0 = ##" #name "\n\t" \
+                 "jumpr r0\n\t" \
+                 : : : "r0"); \
+}
+
+#define MAKE_ERR_HANDLER(name, helper_fn) \
+    MY_EVENT_HANDLE(name, helper_fn) \
+    MAKE_GOTO(name)
+
+#define INSTALL_ERR_HANDLER(name) do { \
+    /*                                                           \
+     * Install our own privilege exception handler.              \
+     * The normal behavior is to coredump.                       \
+     * Read and decode the jump displacements from evb.          \
+     * ASSUME a negative displacement, which is the standard.    \
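+     * (Illustrative note: evb + 2 words is the error vector,   \
+     * matching HANDLE_ERROR_OFFSET 0x08 above; the 12-byte      \
+     * memcpy below is sized for the goto_##name stub from       \
+     * MAKE_GOTO -- an immediate-extender word, "r0 = ##name",   \
+     * and "jumpr r0".)                                          \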
+     */ \
+    uint32_t *evb_err = getevb() + 2; \
+    uint32_t err_distance = -(0xfe000000 | *evb_err) << 1; \
+    uint32_t err_handler = (uint32_t)evb_err - err_distance; \
+    memcpy((void *)err_handler, goto_##name, 12); \
+} while (0)
+
+static inline void remove_trans(int index)
+{
+    uint64_t entry = tlbr(index);
+    SET_FIELD(entry, PTE_V, 0);
+    tlbw(entry, index);
+}
+
+static inline void clear_overlapping_entry(unsigned int asid, uint32_t va)
+{
+    int32_t index = tlbp(asid, va);
+    if (index != TLB_NOT_FOUND) {
+        remove_trans(index);
+    }
+}
+
+static void add_trans(int index, uint32_t va, uint64_t pa,
+                      PageSize page_size, uint8_t xwru,
+                      unsigned int asid, uint8_t V, uint8_t G)
+{
+    if (V) {
+        clear_overlapping_entry(asid, va);
+    }
+    assert(!add_translation_extended(index, (void *)va, pa, page_size,
+                                     xwru, 0, asid, 0,
+                                     ((V & 1) << 1) | (G & 1)));
+}
+
+#endif
diff --git a/tests/tcg/hexagon/system/mmu_asids.c b/tests/tcg/hexagon/system/mmu_asids.c
new file mode 100644
index 000000000000..34f25c25a3d7
--- /dev/null
+++ b/tests/tcg/hexagon/system/mmu_asids.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#define DEBUG 0
+
+#include "mmu.h"
+
+DEFAULT_EVENT_HANDLES
+
+void test_asids(void)
+{
+    uint32_t addr = (uint32_t)&data;
+    uint32_t page = page_start(addr, TARGET_PAGE_BITS);
+    uint32_t offset = FIVE_MB;
+    uint32_t new_addr = addr + offset;
+    uint32_t new_page = page + offset;
+    uint64_t entry =
+        create_mmu_entry(0, 0, 0, 1, new_page, 1, 1, 1, 0, 7, page, PAGE_4K);
+    /*
+     * Create a TLB entry for ASID=1
+     * Write it at index 1
+     * Check that it is present
+     * Invalidate the ASID
+     * Check that it is not found
+     */
+    tlbw(entry, 1);
+    check32(tlboc(entry), 1);
+    tlbinvasid(entry >> 32);
+    check32(tlboc(entry), TLB_NOT_FOUND);
+
+    /*
+     * Re-install the entry
+     * Put ourselves in ASID=1
+     * Do a load and a store
+     */
+    data = 0xdeadbeef;
+    tlbw(entry, 1);
+    set_asid(1);
+    check32(*(mmu_variable *)new_addr, 0xdeadbeef);
+    *(mmu_variable *)new_addr = 0xcafebabe;
+    check32(data, 0xcafebabe);
+
+    /*
+     * Make sure a load from ASID 2 gets a different value.
+     * The standalone runtime will create a VA==PA entry on
+     * a TLB miss, so the load will be reading from uninitialized
+     * memory.
+     */
+    set_asid(2);
+    data = 0xdeadbeef;
+    check32_ne(*(mmu_variable *)new_addr, 0xdeadbeef);
+
+    /*
+     * Invalidate the ASID and make sure a load from ASID 1
+     * gets a different value.
+     */
+    tlbinvasid(entry >> 32);
+    set_asid(1);
+    data = 0xcafebabe;
+    check32_ne(*(mmu_variable *)new_addr, 0xcafebabe);
+}
+
+int main()
+{
+    puts("Hexagon MMU ASID test");
+
+    test_asids();
+
+    printf("%s\n", ((err) ? "FAIL" : "PASS"));
+    return err;
+}
diff --git a/tests/tcg/hexagon/system/mmu_overlap.c b/tests/tcg/hexagon/system/mmu_overlap.c
new file mode 100644
index 000000000000..73d0565abed4
--- /dev/null
+++ b/tests/tcg/hexagon/system/mmu_overlap.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
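+
+/*
+ * (Editorial summary: this test exercises TLB overlap detection --
+ * tlboc reports whether an entry collides with an installed
+ * translation, and ctlbw writes a new entry only when no overlap
+ * exists.)
+ */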
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#define DEBUG 0
+
+#include "mmu.h"
+
+DEFAULT_EVENT_HANDLES
+
+void test_overlap(void)
+{
+    uint32_t addr = (uint32_t)&data;
+    uint32_t page = page_start(addr, 20);
+    uint32_t offset = FIVE_MB;
+    uint32_t new_page = page + offset;
+    uint32_t new_addr = addr + offset;
+    uint8_t data_perm = TLB_X | TLB_W | TLB_R | TLB_U;
+    uint64_t entry;
+
+    add_trans(1, new_page, page, PAGE_1M, data_perm, 0, 1, 1);
+    check32(tlbp(0, new_addr), 1);
+
+    /* Check an entry that overlaps with the one we just created */
+    entry =
+        create_mmu_entry(1, 0, 0, 0, new_page, 1, 1, 1, 0, 7, page, PAGE_4K);
+    check32(tlboc(entry), 1);
+    /* Check that conditional TLB write (ctlbw) does NOT write the new entry */
+    check32(ctlbw(entry, 2), 0x1);
+
+    /* Create an entry that does not overlap with the one we just created */
+    entry = create_mmu_entry(1, 0, 0, 0, new_page + ONE_MB, 1, 1, 1, 0, 7, page,
+                             PAGE_4K);
+    check32(tlboc(entry), TLB_NOT_FOUND);
+    /* Check that conditional TLB write (ctlbw) does write the new entry */
+    check32(ctlbw(entry, 2), TLB_NOT_FOUND);
+
+    /* Create an entry that overlaps both of these entries */
+    entry =
+        create_mmu_entry(1, 0, 0, 0, new_page, 1, 1, 1, 0, 7, page, PAGE_4M);
+    check32(tlboc(entry), 0xffffffff);
+
+    /* Clear the TLB entries */
+    remove_trans(1);
+    check32(tlbp(0, new_addr), TLB_NOT_FOUND);
+    remove_trans(2);
+    check32(tlbp(0, (new_addr + ONE_MB)), TLB_NOT_FOUND);
+}
+
+int main()
+{
+    puts("Hexagon MMU overlap test");
+
+    test_overlap();
+
+    printf("%s\n", ((err) ? "FAIL" : "PASS"));
+    return err;
+}
diff --git a/tests/tcg/hexagon/system/monitor_insts.S b/tests/tcg/hexagon/system/monitor_insts.S
new file mode 100644
index 000000000000..8027068511f1
--- /dev/null
+++ b/tests/tcg/hexagon/system/monitor_insts.S
@@ -0,0 +1,18 @@
+/*
+ * Copyright(c) 2020-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+    .text
+    .type test_set_prio, @function
+    .global test_set_prio
+
+test_set_prio:
+    r0 = #3
+    r1 = #1
+    p0 = cmp.eq(r0,r1)
+    setprio(p0, r0)
+    jumpr lr
+
+    .size test_set_prio, . - test_set_prio
diff --git a/tests/tcg/hexagon/system/reg_fields_def.h b/tests/tcg/hexagon/system/reg_fields_def.h
new file mode 100644
index 000000000000..ff2769a1399d
--- /dev/null
+++ b/tests/tcg/hexagon/system/reg_fields_def.h
@@ -0,0 +1,87 @@
+/* PTE (aka TLB entry) fields */
+DEF_REG_FIELD(PTE_PPD,
+    "PPD", 0, 24,
+    "Physical page number that the corresponding virtual page maps to.")
+DEF_REG_FIELD(PTE_C,
+    "C", 24, 4,
+    "Cacheability attributes for the page.")
+DEF_REG_FIELD(PTE_U,
+    "U", 28, 1,
+    "User mode permitted.")
+DEF_REG_FIELD(PTE_R,
+    "R", 29, 1,
+    "Read-enable.")
+DEF_REG_FIELD(PTE_W,
+    "W", 30, 1,
+    "Write-enable.")
+DEF_REG_FIELD(PTE_X,
+    "X", 31, 1,
+    "Execute-enable.")
+DEF_REG_FIELD(PTE_VPN,
+    "VPN", 32, 20,
+    "Virtual page number that is matched against the load or store address.")
+DEF_REG_FIELD(PTE_ASID,
+    "ASID", 52, 7,
+    "7-bit address space identifier (tag extender)")
+DEF_REG_FIELD(PTE_ATR0,
+    "ATR0", 59, 1,
+    "General purpose attribute bit kept as an attribute of each cache line.")
+DEF_REG_FIELD(PTE_ATR1,
+    "ATR1", 60, 1,
+    "General purpose attribute bit kept as an attribute of each cache line.")
+DEF_REG_FIELD(PTE_PA35,
+    "PA35", 61, 1,
+    "The Extra Physical bit is the most-significant physical address bit.")
+DEF_REG_FIELD(PTE_G,
+    "G", 62, 1,
+    "Global bit. If set, then the ASID is ignored in the match.")
+DEF_REG_FIELD(PTE_V,
+    "V", 63, 1,
+    "Valid bit. Indicates whether this entry should be used for matching.")
+
+/* SSR fields */
+DEF_REG_FIELD(SSR_CAUSE,
+    "cause", 0, 8,
+    "8-bit field that contains the reason for various exceptions.")
+DEF_REG_FIELD(SSR_ASID,
+    "asid", 8, 7,
+    "7-bit field that contains the Address Space Identifier.")
+DEF_REG_FIELD(SSR_UM,
+    "um", 16, 1,
+    "read-write bit.")
+DEF_REG_FIELD(SSR_EX,
+    "ex", 17, 1,
+    "set when an interrupt or exception is accepted.")
+DEF_REG_FIELD(SSR_IE,
+    "ie", 18, 1,
+    "indicates whether the global interrupt is enabled.")
+DEF_REG_FIELD(SSR_GM,
+    "gm", 19, 1,
+    "Guest mode bit.")
+DEF_REG_FIELD(SSR_V0,
+    "v0", 20, 1,
+    "if BADVA0 register contents are from a valid slot 0 instruction.")
+DEF_REG_FIELD(SSR_V1,
+    "v1", 21, 1,
+    "if BADVA1 register contents are from a valid slot 1 instruction.")
+DEF_REG_FIELD(SSR_BVS,
+    "bvs", 22, 1,
+    "BADVA Selector.")
+DEF_REG_FIELD(SSR_CE,
+    "ce", 23, 1,
+    "grants user or guest read permissions to the PCYCLE register aliases.")
+DEF_REG_FIELD(SSR_PE,
+    "pe", 24, 1,
+    "grants guest read permissions to the PMU register aliases.")
+DEF_REG_FIELD(SSR_BP,
+    "bp", 25, 1,
+    "Internal Bus Priority bit.")
+DEF_REG_FIELD(SSR_XA,
+    "xa", 27, 3,
+    "Extension Active, which controls operation of an attached coprocessor.")
+DEF_REG_FIELD(SSR_SS,
+    "ss", 30, 1,
+    "Single Step, which enables single-step exceptions.")
+DEF_REG_FIELD(SSR_XE,
+    "xe", 31, 1,
+    "Coprocessor Enable, which enables use of an attached coprocessor.")
diff --git a/tests/tcg/hexagon/system/semihost.c b/tests/tcg/hexagon/system/semihost.c
new file mode 100644
index 000000000000..7a0fa0cb73ff
--- /dev/null
+++ b/tests/tcg/hexagon/system/semihost.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright(c) 2023-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "strutils.h"
+
+/* Defines in order of testing */
+
+/* env/CLI-related */
+#define HEX_SYS_GET_CMDLINE 0x15
+#define HEX_SYS_GETCWD 0x104
+
+/* File manipulation */
+#define HEX_SYS_TMPNAM 0x0d
+#define HEX_SYS_OPEN 0x01
+#define HEX_SYS_ACCESS 0x105
+#define HEX_SYS_ISTTY 0x09
+#define HEX_SYS_WRITE 0x05
+#define HEX_SYS_SEEK 0x0a
+#define HEX_SYS_READ 0x06
+#define HEX_SYS_FTELL 0x100
+#define HEX_SYS_FSTAT 0x101
+#define HEX_SYS_FTRUNC 0x186
+#define HEX_SYS_FLEN 0x0c
+#define HEX_SYS_CLOSE 0x02
+#define HEX_SYS_ERRNO 0x13
+#define HEX_SYS_RENAME 0x0f
+#define HEX_SYS_STAT 0x103
+#define HEX_SYS_REMOVE 0x0e
+
+/* Time */
+#define HEX_SYS_CLOCK 0x10
+#define HEX_SYS_TIME 0x11
+
+/* dirent */
+#define HEX_SYS_OPENDIR 0x180
+#define HEX_SYS_CLOSEDIR 0x181
+#define HEX_SYS_READDIR 0x182
+
+/* STDOUT */
+#define HEX_SYS_WRITEC 0x03
+#define HEX_SYS_WRITE0 0x04
+#define HEX_SYS_WRITECREG 0x43
+
+static uint32_t ret, err, args[4];
+
+/*
+ * Macro flavors:
+ *  - DIRECT_SWI takes up to two args and puts them in r1 and r2.
+ *  - SWI takes up to four args and puts them in an array, placing the
+ *    array address in r1.
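+ *
+ * (Illustrative expansion: SWI(HEX_SYS_WRITE, fd, buf, n) stores fd,
+ * buf and n into args[0..2] and traps with r0 = 0x05 (the op code) and
+ * r1 = &args; the result comes back in ret (r0) and the error code in
+ * err (r1).)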
+ */ + +#define DO_SWI(CODE, ARG0, ARG1) \ + do { \ + asm volatile( \ + "r0 = %2\n" \ + "r1 = %3\n" \ + "r2 = %4\n" \ + "trap0(#0)\n" \ + "%0 = r0\n" \ + "%1 = r1\n" \ + : "=r"(ret), "=r"(err) \ + : "r"(CODE), "r"(ARG0), "r"(ARG1) \ + : "r0", "r1", "r2", "memory" \ + ); \ + } while (0) + +#define SWI0(CODE) DO_SWI(CODE, args, 0) +#define SWI1(CODE, ARG0) \ + do { args[0] = (uint32_t)(ARG0); SWI0(CODE); } while (0) +#define SWI2(CODE, ARG0, ARG1) \ + do { args[1] = (uint32_t)(ARG1); SWI1(CODE, ARG0); } while (0) +#define SWI3(CODE, ARG0, ARG1, ARG2) \ + do { args[2] = (uint32_t)(ARG2); SWI2(CODE, ARG0, ARG1); } while (0) +#define SWI4(CODE, ARG0, ARG1, ARG2, ARG3) \ + do { args[3] = (uint32_t)(ARG3); SWI3(CODE, ARG0, ARG1, ARG2); } while (0) + +#define GET_MACRO_5(_1, _2, _3, _4, _5, NAME, ...) NAME +#define SWI(...) \ + GET_MACRO_5(__VA_ARGS__, SWI4, SWI3, SWI2, SWI1, SWI0)(__VA_ARGS__) + +#define DIRECT_SWI0(CODE) DO_SWI(CODE, 0, 0) +#define DIRECT_SWI1(CODE, ARG1) DO_SWI(CODE, ARG1, 0) +#define DIRECT_SWI2(CODE, ARG1, ARG2) DO_SWI(CODE, ARG1, ARG2) + +#define GET_MACRO_3(_1, _2, _3, NAME, ...) NAME +#define DIRECT_SWI(...) \ + GET_MACRO_3(__VA_ARGS__, DIRECT_SWI2, DIRECT_SWI1, DIRECT_SWI0)(__VA_ARGS__) + +#define is_path_sep(C) ((C) == '/' || (C) == '\\') + +static int path_ends_with(const char *str, const char *suffix) +{ + const char *str_cursor = str + strlen(str) - 1; + const char *suffix_cursor = suffix + strlen(suffix) - 1; + while (str_cursor >= str && suffix_cursor >= suffix) { + /* is_path_sep handles the semihosting-on-Windows case */ + if (*str_cursor != *suffix_cursor && + !(is_path_sep(*str_cursor) && is_path_sep(*suffix_cursor))) { + return 0; + } + str_cursor--; + suffix_cursor--; + } + return 1; +} + +/* + * This must match the caller's definition, it would be in the + * caller's angel.h or equivalent header. 
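+ * (Editorial note: the FSTAT/STAT checks later in this test read the
+ * size, mode and timestamp fields, so the layout below has to stay in
+ * sync with what the semihosting host writes.)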
+ */ +struct __SYS_STAT { + uint64_t dev; + uint64_t ino; + uint32_t mode; + uint32_t nlink; + uint64_t rdev; + uint32_t size; + uint32_t __pad1; + uint32_t atime; + uint32_t mtime; + uint32_t ctime; + uint32_t __pad2; +}; + +int main(int argc, char **argv) +{ + /* GET_CMDLINE */ + char argv_concat[1024]; + char *cursor = argv_concat; + for (int i = 0; i < argc; i++) { + strcpy(cursor, argv[i]); + cursor += strlen(argv[i]); + *cursor = ' '; + cursor++; + } + *(cursor - 1) = '\0'; + char buf[4096]; + SWI(HEX_SYS_GET_CMDLINE, buf, sizeof(buf)); + assert(!ret && !strcmp(buf, argv_concat)); + + /* GETCWD */ + const char *expected_cwd = "tests/tcg/hexagon-softmmu"; + SWI(HEX_SYS_GETCWD, buf, sizeof(buf)); + assert(ret && path_ends_with(buf, expected_cwd)); + + /* TMPNAM */ + char fname[4096]; + SWI(HEX_SYS_TMPNAM, fname, 0, sizeof(fname)); + assert(!ret); + + /* OPEN */ + /* 13 is O_RDWR | O_CREAT | O_EXCL */ + SWI(HEX_SYS_OPEN, fname, 13, strlen(fname)); + int fd = (int)ret; + assert(fd >= 0); + + /* ACCESS */ + SWI(HEX_SYS_ACCESS, fname, R_OK); + assert(!ret); + /* ACCESS with error */ + SWI(HEX_SYS_ACCESS, "non-existent-semihost-file", R_OK); + assert(ret); + assert(err == ENOENT); + + /* ISTTY */ + SWI(HEX_SYS_ISTTY, fd); + assert(!ret); + + /* WRITE */ + char *str = "hello"; + SWI(HEX_SYS_WRITE, fd, str, strlen(str)); + assert(!ret); + + /* SEEK */ + SWI(HEX_SYS_SEEK, fd, 0); + assert(!ret); + + /* READ */ + int n = strlen(str); + SWI(HEX_SYS_READ, fd, buf, n); + buf[n] = '\0'; + assert(!ret && !strcmp(str, buf)); + + /* FTELL */ + SWI(HEX_SYS_FTELL, fd); + assert(ret == strlen(str)); + + /* FSTAT */ + struct __SYS_STAT st; + SWI(HEX_SYS_FSTAT, fd, &st); + assert(!ret); + assert(st.atime && st.ctime && st.mtime); + assert(st.size == strlen(str)); + assert((st.mode & S_IFMT) == S_IFREG); + + /* FTRUNC */ + SWI(HEX_SYS_FTRUNC, fd, 1, 0); + assert(!ret); + + /* FLEN */ + SWI(HEX_SYS_FLEN, fd); + assert(ret == 1); + + /* CLOSE */ + SWI(HEX_SYS_CLOSE, fd); + assert(!ret); + + /* CLOSE w/ error && ERRNO */ + SWI(HEX_SYS_CLOSE, fd); + assert(ret); + assert(err == EBADF); + SWI(HEX_SYS_ERRNO); + assert(ret == EBADF); + + /* RENAME */ + char ogfname[4096]; + int len = strlen(fname); + strcpy(ogfname, fname); + fname[len - 1] = (fname[len - 1] == 'a' ? 
'b' : 'a');
+    SWI(HEX_SYS_RENAME, ogfname, len, fname, len);
+    assert(!ret);
+
+    /* STAT */
+    SWI(HEX_SYS_STAT, fname, &st);
+    assert(!ret);
+    assert(st.atime && st.ctime && st.mtime);
+    assert(st.size == 1);
+    assert((st.mode & S_IFMT) == S_IFREG);
+
+    /* REMOVE */
+    SWI(HEX_SYS_REMOVE, fname, strlen(fname));
+    assert(!ret);
+
+    /* STAT w/ error */
+    SWI(HEX_SYS_STAT, fname, &st);
+    assert(ret);
+    assert(err == ENOENT);
+
+    /* TIME && CLOCK */
+    SWI(HEX_SYS_TIME);
+    assert(ret);
+    SWI(HEX_SYS_CLOCK);
+    assert(ret);
+
+    /* OPENDIR */
+    char *dname = "./_semihost_dir";
+    DIRECT_SWI(HEX_SYS_OPENDIR, dname);
+    assert(ret);
+    int dir_index = ret;
+
+    /* READDIR */
+    char *expected_files[4] = { ".", "..", "fileA", "fileB" };
+    char found_files_buffer[4][256];
+    char *found_files[4];
+    for (int i = 0; 1; i++) {
+        struct __attribute__((__packed__)) {
+            int32_t _;
+            char d_name[256];
+        } dirent;
+        DIRECT_SWI(HEX_SYS_READDIR, dir_index, &dirent);
+        if (!ret) {
+            break;
+        }
+        assert(i < 4);
+        found_files[i] = found_files_buffer[i];
+        strcpy(found_files[i], dirent.d_name);
+    }
+
+    sort_str_arr(found_files, 4);
+    for (int i = 0; i < 4; i++) {
+        assert(!strcmp(found_files[i], expected_files[i]));
+    }
+
+    /* CLOSEDIR */
+    DIRECT_SWI(HEX_SYS_CLOSEDIR, dir_index);
+    assert(!ret);
+
+    /* WRITEC, WRITECREG, WRITE0 */
+    /* We use DIRECT_SWI here to bypass the args array */
+    char *pass = "PASS\n";
+    DIRECT_SWI(HEX_SYS_WRITEC, &pass[0]);
+    DIRECT_SWI(HEX_SYS_WRITECREG, pass[1]);
+    DIRECT_SWI(HEX_SYS_WRITE0, &pass[2]);
+
+    return 0;
+}
diff --git a/tests/tcg/hexagon/system/standalone_hw.c b/tests/tcg/hexagon/system/standalone_hw.c
new file mode 100644
index 000000000000..c67343204a80
--- /dev/null
+++ b/tests/tcg/hexagon/system/standalone_hw.c
@@ -0,0 +1,43 @@
+#include <stdio.h>
+#include <assert.h>
+
+void test_set_prio();
+
+void inst_test()
+{
+    asm volatile("dczeroa(r0)\n\t"
+                 "dccleanidx(r0)\n\t"
+                 "dcinvidx(r0)\n\t"
+                 "r1 = dctagr(r0)\n\t"
+                 "dctagw(r0, r1)\n\t"
+                 "dcfetch(r0)\n\t"
+                 "dccleaninvidx(r0)\n\t"
+                 "l2gclean\n\t"
+                 "l2gclean(r1:0)\n\t"
+                 "l2gcleaninv\n\t"
+                 "l2gcleaninv(r1:0)\n\t"
+                 "l2gunlock\n\t"
+                 "l2kill\n\t"
+                 "trace(r0)\n\t"
+                 "pause(#1)\n\t"
+                 );
+
+    asm volatile("r0 = #0\n\t"
+                 "r1 = iassignr(r0)\n\t"
+                 /* Set interrupt 0 to disabled on all threads */
+                 "r0 = #0\n\t"
+                 "iassignw(r0)\n\t");
+
+    test_set_prio();
+    printf("Executed monitor mode instructions\n");
+}
+
+int main(int argc, const char *argv[])
+{
+    inst_test();
+    printf("Hello, World: (argc: %d)\n", argc);
+    assert(argc >= 1);
+    for (int i = 0; i < argc; i++) {
+        printf("\t> '%s'\n", argv[i]);
+    }
+}
diff --git a/tests/tcg/hexagon/system/standalone_vec.c b/tests/tcg/hexagon/system/standalone_vec.c
new file mode 100644
index 000000000000..eb1b2ef4830c
--- /dev/null
+++ b/tests/tcg/hexagon/system/standalone_vec.c
@@ -0,0 +1,1419 @@
+/*
+ * Copyright(c) 2023-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
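+
+/*
+ * (Editorial summary: every HVX scatter/gather into VTCM below is
+ * mirrored by a scalar C reference implementation, and check_buffer()
+ * compares the two results byte by byte.)
+ */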
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <hexagon_types.h>
+#include <hexagon_protos.h>
+
+#include "cfgtable.h"
+
+int err;
+
+#ifdef __linux__
+#define VTCM_SIZE_KB (2048)
+#define VTCM_BYTES_PER_KB (1024)
+
+static char vtcm_buffer[VTCM_SIZE_KB * VTCM_BYTES_PER_KB]
+    __attribute__((aligned(0x10000)));
+#endif
+
+/* define the number of rows/cols in a square matrix */
+#define MATRIX_SIZE 64
+
+/* define the size of the scatter buffer */
+#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)
+
+#define SCATTER16_BUF_SIZE (2 * SCATTER_BUFFER_SIZE)
+#define SCATTER32_BUF_SIZE (4 * SCATTER_BUFFER_SIZE)
+
+#define GATHER16_BUF_SIZE (2 * MATRIX_SIZE)
+#define GATHER32_BUF_SIZE (4 * MATRIX_SIZE)
+
+uintptr_t VTCM_BASE_ADDRESS;
+uintptr_t VTCM_SCATTER16_ADDRESS;
+uintptr_t VTCM_GATHER16_ADDRESS;
+uintptr_t VTCM_SCATTER32_ADDRESS;
+uintptr_t VTCM_GATHER32_ADDRESS;
+uintptr_t VTCM_SCATTER16_32_ADDRESS;
+uintptr_t VTCM_GATHER16_32_ADDRESS;
+
+/* the vtcm base address */
+unsigned char *vtcm_base;
+
+/* scatter gather 16 bit elements using 16 bit offsets */
+unsigned short *vscatter16;
+unsigned short *vgather16;
+unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
+unsigned short vgather16_ref[MATRIX_SIZE];
+
+/* scatter gather 32 bit elements using 32 bit offsets */
+unsigned int *vscatter32;
+unsigned int *vgather32;
+unsigned int vscatter32_ref[SCATTER_BUFFER_SIZE];
+unsigned int vgather32_ref[MATRIX_SIZE];
+
+/* scatter gather 16 bit elements using 32 bit offsets */
+unsigned short *vscatter16_32;
+unsigned short *vgather16_32;
+unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
+unsigned short vgather16_32_ref[MATRIX_SIZE];
+
+
+/* declare the arrays of offsets */
+unsigned short half_offsets[MATRIX_SIZE];
+unsigned int word_offsets[MATRIX_SIZE];
+
+/* declare the arrays of values */
+unsigned short half_values[MATRIX_SIZE];
+unsigned short half_acc_values[MATRIX_SIZE];
+unsigned short half_q_values[MATRIX_SIZE];
+unsigned int word_values[MATRIX_SIZE];
+unsigned int word_acc_values[MATRIX_SIZE];
+unsigned int word_q_values[MATRIX_SIZE];
+
+/* declare the array of predicates */
+unsigned short half_predicates[MATRIX_SIZE];
+unsigned int word_predicates[MATRIX_SIZE];
+
+/* make this big enough for all the intrinsics */
+unsigned int region_len = 4 * SCATTER_BUFFER_SIZE - 1;
+
+/* optionally add sync instructions */
+#define SYNC_VECTOR 1
+
+/* optionally print cycle counts */
+#define PRINT_CYCLE_COUNTS 0
+
+#if PRINT_CYCLE_COUNTS
+unsigned long long start_cycles;
+#define START_CYCLES start_cycles = hexagon_sim_read_pcycles();
+#define PRINT_CYCLES(x) printf(x, hexagon_sim_read_pcycles() - start_cycles);
+#else
+#define START_CYCLES
+#define PRINT_CYCLES(x)
+#endif
+
+/* define a scratch area for debug and prefill */
+#define SCRATCH_SIZE 0x8800
+
+#define FILL_CHAR '.'
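+
+/*
+ * (Editorial note: FILL_CHAR seeds both the VTCM scratch area and the
+ * scalar reference buffers, so bytes untouched by a scatter compare
+ * equal in check_buffer().)
+ */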
+
+/* fill the vtcm scratch area with the fill char */
+void prefill_vtcm_scratch(void)
+{
+    memset((void *)VTCM_BASE_ADDRESS, FILL_CHAR, SCRATCH_SIZE * sizeof(char));
+}
+
+/* print vtcm scratch buffer in half words */
+void print_vtcm_scratch_16(void)
+{
+    unsigned short *vtmp = (unsigned short *)VTCM_BASE_ADDRESS;
+
+    printf("\n\nPrinting the vtcm scratch in half words");
+
+    for (int i = 0; i < SCRATCH_SIZE; i++) {
+        if ((i % MATRIX_SIZE) == 0) {
+            printf("\n");
+        }
+        for (int j = 0; j < 2; j++) {
+            printf("%c", (char)((vtmp[i] >> j * 8) & 0xff));
+        }
+
+        printf(" ");
+    }
+}
+
+/* print vtcm scratch buffer in words */
+void print_vtcm_scratch_32(void)
+{
+    unsigned int *vtmp = (unsigned int *)VTCM_BASE_ADDRESS;
+
+    printf("\n\nPrinting the vtcm scratch in words");
+
+    for (int i = 0; i < SCRATCH_SIZE; i++) {
+        if ((i % MATRIX_SIZE) == 0) {
+            printf("\n");
+        }
+        for (int j = 0; j < 4; j++) {
+            printf("%c", (char)((vtmp[i] >> j * 8) & 0xff));
+        }
+
+        printf(" ");
+    }
+}
+
+
+/* create byte offsets to be a diagonal of the matrix with 16 bit elements */
+void create_offsets_and_values_16(void)
+{
+    unsigned short half_element = 0;
+    unsigned short half_q_element = 0;
+    char letter = 'A';
+    char q_letter = '@';
+
+    for (int i = 0; i < MATRIX_SIZE; i++) {
+        half_offsets[i] = i * (2 * MATRIX_SIZE + 2);
+
+        half_element = 0;
+        half_q_element = 0;
+        for (int j = 0; j < 2; j++) {
+            half_element |= letter << j * 8;
+            half_q_element |= q_letter << j * 8;
+        }
+
+        half_values[i] = half_element;
+        half_acc_values[i] = ((i % 10) << 8) + (i % 10);
+        half_q_values[i] = half_q_element;
+
+        letter++;
+        /* reset to 'A' */
+        if (letter == 'M') {
+            letter = 'A';
+        }
+    }
+}
+
+/* create a predicate mask for the half word scatter */
+void create_preds_16()
+{
+    for (int i = 0; i < MATRIX_SIZE; i++) {
+        half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
+    }
+}
+
+
+/* create byte offsets to be a diagonal of the matrix with 32 bit elements */
+void create_offsets_and_values_32(void)
+{
+    unsigned int word_element = 0;
+    unsigned int word_q_element = 0;
+    char letter = 'A';
+    char q_letter = '&';
+
+    for (int i = 0; i < MATRIX_SIZE; i++) {
+        word_offsets[i] = i * (4 * MATRIX_SIZE + 4);
+
+        word_element = 0;
+        word_q_element = 0;
+        for (int j = 0; j < 4; j++) {
+            word_element |= letter << j * 8;
+            word_q_element |= q_letter << j * 8;
+        }
+
+        word_values[i] = word_element;
+        word_acc_values[i] = ((i % 10) << 8) + (i % 10);
+        word_q_values[i] = word_q_element;
+
+        letter++;
+        /* reset to 'A' */
+        if (letter == 'M') {
+            letter = 'A';
+        }
+    }
+}
+
+/* create a predicate mask for the word scatter */
+void create_preds_32()
+{
+    for (int i = 0; i < MATRIX_SIZE; i++) {
+        word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ?
~0 : 0; + } +} + + +void dump_buf(char *str, void *addr, int element_size, int byte_len) + +{ + unsigned short *sptr = addr; + unsigned int *ptr = addr; + + printf("\n\nBuffer: %s\n", str); + for (int i = 0; i < byte_len / element_size; ++ptr, ++sptr, ++i) { + if (i != 0 && (i % 16) == 0) { + printf("\n"); + } + if (element_size == 2) { + printf("%c ", *sptr); + } else if (element_size == 4) { + printf("%4.4x ", *ptr); + } + } +} + +/* + * create byte offsets to be a diagonal of the matrix with 16 bit elements and + * 32 bit offsets + */ +void create_offsets_and_values_16_32(void) +{ + unsigned int half_element = 0; + unsigned short half_q_element = 0; + char letter = 'D'; + char q_letter = '$'; + + for (int i = 0; i < MATRIX_SIZE; i++) { + word_offsets[i] = i * (2 * MATRIX_SIZE + 2); + + half_element = 0; + half_q_element = 0; + for (int j = 0; j < 2; j++) { + half_element |= letter << j * 8; + half_q_element |= q_letter << j * 8; + } + + half_values[i] = half_element; + half_acc_values[i] = ((i % 10) << 8) + (i % 10); + half_q_values[i] = half_q_element; + + letter++; + /* reset to 'A' */ + if (letter == 'P') { + letter = 'D'; + } + } + + /* + * dump_buf("word_offsets", word_offsets, sizeof(*word_offsets), + * sizeof(word_offsets)); dump_buf("half_offsets", half_offsets, + * sizeof(*half_offsets), sizeof(half_offsets)); + */ +} + +void create_preds_16_32() +{ + for (int i = 0; i < MATRIX_SIZE; i++) { + half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0; + } +} + +#define SCATTER_RELEASE(ADDR) \ + asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(ADDR)); + +/* scatter the 16 bit elements using intrinsics */ +void vector_scatter_16(void) +{ + START_CYCLES; + + /* copy the offsets and values to vectors */ + HVX_Vector offsets = *(HVX_Vector *)half_offsets; + HVX_Vector values = *(HVX_Vector *)half_values; + + /* do the scatter */ + Q6_vscatter_RMVhV(VTCM_SCATTER16_ADDRESS, region_len, offsets, values); + +#if SYNC_VECTOR + /* do the sync operation */ + SCATTER_RELEASE(vscatter16); + /* + * This dummy load from vscatter16 is to complete the synchronization. + * Normally this load would be deferred as long as possible to minimize + * stalls. + */ + volatile HVX_Vector vDummy = *(HVX_Vector *)vscatter16; +#endif + + PRINT_CYCLES("\nVector Scatter 16 cycles = %llu\n"); +} + +/* scatter-accumulate the 16 bit elements using intrinsics */ +void vector_scatter_acc_16(void) +{ + START_CYCLES; + + /* copy the offsets and values to vectors */ + HVX_Vector offsets = *(HVX_Vector *)half_offsets; + HVX_Vector values = *(HVX_Vector *)half_acc_values; + + /* do the scatter */ + Q6_vscatteracc_RMVhV(VTCM_SCATTER16_ADDRESS, region_len, offsets, values); + +#if SYNC_VECTOR + /* do the sync operation */ + SCATTER_RELEASE(vscatter16); + /* + * This dummy load from vscatter16 is to complete the synchronization. + * Normally this load would be deferred as long as possible to minimize + * stalls. 
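+     * (Editorial note: the scatter_release above posts the sync marker;
+     * this read from the scattered region forces the test to wait for
+     * completion here before the results are checked.)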
+ */ + volatile HVX_Vector vDummy = *(HVX_Vector *)vscatter16; +#endif + + PRINT_CYCLES("\nVector Scatter Acc 16 cycles = %llu\n"); +} + +/* scatter the 16 bit elements using intrinsics */ +void vector_scatter_q_16(void) +{ + START_CYCLES; + + /* copy the offsets and values to vectors */ + HVX_Vector offsets = *(HVX_Vector *)half_offsets; + HVX_Vector values = *(HVX_Vector *)half_q_values; + HVX_Vector pred_reg = *(HVX_Vector *)half_predicates; + HVX_VectorPred preds = Q6_Q_vand_VR(pred_reg, ~0); + + /* do the scatter */ + Q6_vscatter_QRMVhV(preds, VTCM_SCATTER16_ADDRESS, region_len, offsets, + values); + +#if SYNC_VECTOR + /* do the sync operation */ + SCATTER_RELEASE(vscatter16); + /* + * This dummy load from vscatter16 is to complete the synchronization. + * Normally this load would be deferred as long as possible to minimize + * stalls. + */ + volatile HVX_Vector vDummy = *(HVX_Vector *)vscatter16; +#endif + + PRINT_CYCLES("\nVector Scatter Q 16 cycles = %llu\n"); +} + +/* scatter the 32 bit elements using intrinsics */ +void vector_scatter_32(void) +{ + START_CYCLES; + + /* copy the offsets and values to vectors */ + HVX_Vector offsetslo = *(HVX_Vector *)word_offsets; + HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; + HVX_Vector valueslo = *(HVX_Vector *)word_values; + HVX_Vector valueshi = *(HVX_Vector *)&word_values[MATRIX_SIZE / 2]; + + /* do the scatter */ + Q6_vscatter_RMVwV(VTCM_SCATTER32_ADDRESS, region_len, offsetslo, valueslo); + Q6_vscatter_RMVwV(VTCM_SCATTER32_ADDRESS, region_len, offsetshi, valueshi); + +#if SYNC_VECTOR + /* do the sync operation */ + SCATTER_RELEASE(vscatter32); + /* + * This dummy load from vscatter32 is to complete the synchronization. + * Normally this load would be deferred as long as possible to minimize + * stalls. + */ + volatile HVX_Vector vDummy = *(HVX_Vector *)vscatter32; +#endif + + PRINT_CYCLES("\nVector Scatter 32 cycles = %llu\n"); +} + +/* scatter-acc the 32 bit elements using intrinsics */ +void vector_scatter_acc_32(void) +{ + START_CYCLES; + + /* copy the offsets and values to vectors */ + HVX_Vector offsetslo = *(HVX_Vector *)word_offsets; + HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; + HVX_Vector valueslo = *(HVX_Vector *)word_acc_values; + HVX_Vector valueshi = *(HVX_Vector *)&word_acc_values[MATRIX_SIZE / 2]; + + /* do the scatter */ + Q6_vscatteracc_RMVwV(VTCM_SCATTER32_ADDRESS, region_len, offsetslo, + valueslo); + Q6_vscatteracc_RMVwV(VTCM_SCATTER32_ADDRESS, region_len, offsetshi, + valueshi); + +#if SYNC_VECTOR + /* do the sync operation */ + SCATTER_RELEASE(vscatter32); + /* + * This dummy load from vscatter32 is to complete the synchronization. + * Normally this load would be deferred as long as possible to minimize + * stalls. 
+     */
+    volatile HVX_Vector vDummy = *(HVX_Vector *)vscatter32;
+#endif
+
+    PRINT_CYCLES("\nVector Scatter Acc 32 cycles = %llu\n");
+}
+
+/* predicated scatter of the 32 bit elements using intrinsics */
+void vector_scatter_q_32(void)
+{
+    START_CYCLES;
+
+    /* copy the offsets and values to vectors */
+    HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
+    HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
+    HVX_Vector valueslo = *(HVX_Vector *)word_q_values;
+    HVX_Vector valueshi = *(HVX_Vector *)&word_q_values[MATRIX_SIZE / 2];
+    HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
+    HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
+    HVX_VectorPred predslo = Q6_Q_vand_VR(pred_reglo, ~0);
+    HVX_VectorPred predshi = Q6_Q_vand_VR(pred_reghi, ~0);
+
+    /* do the scatter */
+    Q6_vscatter_QRMVwV(predslo, VTCM_SCATTER32_ADDRESS, region_len, offsetslo,
+                       valueslo);
+    Q6_vscatter_QRMVwV(predshi, VTCM_SCATTER32_ADDRESS, region_len, offsetshi,
+                       valueshi);
+
+#if SYNC_VECTOR
+    /* do the sync operation on the 32 bit region this scatter wrote */
+    SCATTER_RELEASE(vscatter32);
+    /*
+     * This dummy load from vscatter32 is to complete the synchronization.
+     * Normally this load would be deferred as long as possible to minimize
+     * stalls.
+     */
+    volatile HVX_Vector vDummy = *(HVX_Vector *)vscatter32;
+#endif
+
+    PRINT_CYCLES("\nVector Scatter Q 32 cycles = %llu\n");
+}
+
+void print_vector(char *str, HVX_Vector *v)
+{
+    unsigned char *ptr = (unsigned char *)v;
+
+    printf("\n\nVector: %s\n", str);
+    for (int i = 0; i < sizeof(HVX_Vector) * 4; ++ptr, ++i) {
+        if (i != 0 && (i % 16) == 0) {
+            printf("\n");
+        }
+        printf("%c ", *ptr);
+    }
+    printf("\n");
+}
+
+void print_vectorpair(char *str, HVX_VectorPair *v)
+{
+    unsigned char *ptr = (unsigned char *)v;
+
+    printf("\n\nVectorPair: %s\n", str);
+    for (int i = 0; i < sizeof(HVX_VectorPair); ++ptr, ++i) {
+        if (i != 0 && (i % 16) == 0) {
+            printf("\n");
+        }
+        printf("%c ", *ptr);
+    }
+    printf("\n");
+}
+
+/* scatter the 16 bit elements with 32 bit offsets using intrinsics */
+void vector_scatter_16_32(void)
+{
+    START_CYCLES;
+
+    /* get the word offsets in a vector pair */
+    HVX_VectorPair offsets = *(HVX_VectorPair *)word_offsets;
+    /* print_vectorpair("word_offsets", (HVX_VectorPair *)&word_offsets); */
+
+    /* these values need to be shuffled for the RMWwV scatter */
+    HVX_Vector values = *(HVX_Vector *)half_values;
+    values = Q6_Vh_vshuff_Vh(values);
+    /* print_vector("values", (HVX_Vector *)&values); */
+
+    /* do the scatter */
+    Q6_vscatter_RMWwV(VTCM_SCATTER16_32_ADDRESS, region_len, offsets, values);
+    /* print_vector("scatter16_32_address", (HVX_Vector */
+    /* *)VTCM_SCATTER16_32_ADDRESS); */
+
+#if SYNC_VECTOR
+    /* do the sync operation */
+    SCATTER_RELEASE(vscatter16_32);
+    /*
+     * This dummy load from vscatter16_32 is to complete the synchronization.
+     * Normally this load would be deferred as long as possible to minimize
+     * stalls.
+ */ + volatile HVX_Vector vDummy = *(HVX_Vector *)vscatter16_32; +#endif + + PRINT_CYCLES("\nVector Scatter 16_32 cycles = %llu\n"); +} + +/* scatter-acc the 16 bit elements with 32 bit offsets using intrinsics */ +void vector_scatter_acc_16_32(void) +{ + START_CYCLES; + + /* get the word offsets in a vector pair */ + HVX_VectorPair offsets = *(HVX_VectorPair *)word_offsets; + /* print_vectorpair("word_offsets", (HVX_VectorPair *)&word_offsets); */ + + /* these values need to be shuffled for the RMWwV scatter */ + HVX_Vector values = *(HVX_Vector *)half_acc_values; + values = Q6_Vh_vshuff_Vh(values); + /* print_vector("values", (HVX_Vector *)&values); */ + + /* do the scatter */ + Q6_vscatteracc_RMWwV(VTCM_SCATTER16_32_ADDRESS, region_len, offsets, + values); + /* print_vector("scatter16_32_address", (HVX_Vector */ + /* *)VTCM_SCATTER16_32_ADDRESS); */ + +#if SYNC_VECTOR + /* do the sync operation */ + SCATTER_RELEASE(vscatter16_32); + /* + * This dummy load from vscatter16_32 is to complete the synchronization. + * Normally this load would be deferred as long as possible to minimize + * stalls. + */ + volatile HVX_Vector vDummy = *(HVX_Vector *)vscatter16_32; +#endif + + PRINT_CYCLES("\nVector Scatter Acc 16_32 cycles = %llu\n"); +} + +/* scatter-acc the 16 bit elements with 32 bit offsets using intrinsics */ +void vector_scatter_q_16_32(void) +{ + START_CYCLES; + + /* get the word offsets in a vector pair */ + HVX_VectorPair offsets = *(HVX_VectorPair *)word_offsets; + /* print_vectorpair("word_offsets", (HVX_VectorPair *)&word_offsets); */ + + /* these values need to be shuffled for the RMWwV scatter */ + HVX_Vector values = *(HVX_Vector *)half_q_values; + values = Q6_Vh_vshuff_Vh(values); + /* print_vector("values", (HVX_Vector *)&values); */ + + HVX_Vector pred_reg = *(HVX_Vector *)half_predicates; + pred_reg = Q6_Vh_vshuff_Vh(pred_reg); + HVX_VectorPred preds = Q6_Q_vand_VR(pred_reg, ~0); + + /* do the scatter */ + Q6_vscatter_QRMWwV(preds, VTCM_SCATTER16_32_ADDRESS, region_len, offsets, + values); + /* print_vector("scatter16_32_address", (HVX_Vector */ + /* *)VTCM_SCATTER16_32_ADDRESS); */ + +#if SYNC_VECTOR + /* do the sync operation */ + SCATTER_RELEASE(vscatter16_32); + /* + * This dummy load from vscatter16_32 is to complete the synchronization. + * Normally this load would be deferred as long as possible to minimize + * stalls. 
+ */ + volatile HVX_Vector vDummy = *(HVX_Vector *)vscatter16_32; +#endif + + PRINT_CYCLES("\nVector Scatter Q 16_32 cycles = %llu\n"); +} + + +/* gather the elements from the scatter16 buffer */ +void vector_gather_16(void) +{ + START_CYCLES; + + HVX_Vector *vgather = (HVX_Vector *)VTCM_GATHER16_ADDRESS; + HVX_Vector offsets = *(HVX_Vector *)half_offsets; + + /* do the gather to the gather16 buffer */ + Q6_vgather_ARMVh(vgather, VTCM_SCATTER16_ADDRESS, region_len, offsets); + + +#if SYNC_VECTOR + /* This dummy read of vgather will stall until completion */ + volatile HVX_Vector vDummy = *(HVX_Vector *)vgather; +#endif + + PRINT_CYCLES("\nVector Gather 16 cycles = %llu\n"); +} + +static unsigned short gather_q_16_init(void) +{ + char letter = '?'; + return letter | (letter << 8); +} + +void vector_gather_q_16(void) +{ + START_CYCLES; + + HVX_Vector *vgather = (HVX_Vector *)VTCM_GATHER16_ADDRESS; + HVX_Vector offsets = *(HVX_Vector *)half_offsets; + HVX_Vector pred_reg = *(HVX_Vector *)half_predicates; + HVX_VectorPred preds = Q6_Q_vand_VR(pred_reg, ~0); + + *vgather = Q6_Vh_vsplat_R(gather_q_16_init()); + /* do the gather to the gather16 buffer */ + Q6_vgather_AQRMVh(vgather, preds, VTCM_SCATTER16_ADDRESS, region_len, + offsets); + + +#if SYNC_VECTOR + /* This dummy read of vgather will stall until completion */ + volatile HVX_Vector vDummy = *(HVX_Vector *)vgather; +#endif + + PRINT_CYCLES("\nVector Gather Q 16 cycles = %llu\n"); +} + + +/* gather the elements from the scatter32 buffer */ +void vector_gather_32(void) +{ + START_CYCLES; + + HVX_Vector *vgatherlo = (HVX_Vector *)VTCM_GATHER32_ADDRESS; + HVX_Vector *vgatherhi = + (HVX_Vector *)(VTCM_GATHER32_ADDRESS + (MATRIX_SIZE * 2)); + HVX_Vector offsetslo = *(HVX_Vector *)word_offsets; + HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; + + /* do the gather to vgather */ + Q6_vgather_ARMVw(vgatherlo, VTCM_SCATTER32_ADDRESS, region_len, offsetslo); + Q6_vgather_ARMVw(vgatherhi, VTCM_SCATTER32_ADDRESS, region_len, offsetshi); + +#if SYNC_VECTOR + /* This dummy read of vgatherhi will stall until completion */ + volatile HVX_Vector vDummy = *(HVX_Vector *)vgatherhi; +#endif + + PRINT_CYCLES("\nVector Gather 32 cycles = %llu\n"); +} + +static unsigned int gather_q_32_init(void) +{ + char letter = '?'; + return letter | (letter << 8) | (letter << 16) | (letter << 24); +} + +void vector_gather_q_32(void) +{ + START_CYCLES; + + HVX_Vector *vgatherlo = (HVX_Vector *)VTCM_GATHER32_ADDRESS; + HVX_Vector *vgatherhi = + (HVX_Vector *)(VTCM_GATHER32_ADDRESS + (MATRIX_SIZE * 2)); + HVX_Vector offsetslo = *(HVX_Vector *)word_offsets; + HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; + HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates; + HVX_VectorPred predslo = Q6_Q_vand_VR(pred_reglo, ~0); + HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2]; + HVX_VectorPred predshi = Q6_Q_vand_VR(pred_reghi, ~0); + + *vgatherlo = Q6_Vh_vsplat_R(gather_q_32_init()); + *vgatherhi = Q6_Vh_vsplat_R(gather_q_32_init()); + /* do the gather to vgather */ + Q6_vgather_AQRMVw(vgatherlo, predslo, VTCM_SCATTER32_ADDRESS, region_len, + offsetslo); + Q6_vgather_AQRMVw(vgatherhi, predshi, VTCM_SCATTER32_ADDRESS, region_len, + offsetshi); + +#if SYNC_VECTOR + /* This dummy read of vgatherhi will stall until completion */ + volatile HVX_Vector vDummy = *(HVX_Vector *)vgatherhi; +#endif + + PRINT_CYCLES("\nVector Gather Q 32 cycles = %llu\n"); +} + +/* gather the elements from the scatter16_32 buffer */ +void 
vector_gather_16_32(void) +{ + START_CYCLES; + + /* get the vtcm address to gather from */ + HVX_Vector *vgather = (HVX_Vector *)VTCM_GATHER16_32_ADDRESS; + + /* get the word offsets in a vector pair */ + HVX_VectorPair offsets = *(HVX_VectorPair *)word_offsets; + + /* do the gather to vgather */ + Q6_vgather_ARMWw(vgather, VTCM_SCATTER16_32_ADDRESS, region_len, offsets); + + /* the read of gather will stall until completion */ + volatile HVX_Vector values = *(HVX_Vector *)vgather; + + /* deal the elements to get the order back */ + values = Q6_Vh_vdeal_Vh(values); + + /* write it back to vtcm address */ + *(HVX_Vector *)vgather = values; + + + PRINT_CYCLES("\nVector Gather 16_32 cycles = %llu\n"); +} + +void vector_gather_q_16_32(void) +{ + START_CYCLES; + + /* get the vtcm address to gather from */ + HVX_Vector *vgather = (HVX_Vector *)VTCM_GATHER16_32_ADDRESS; + + /* get the word offsets in a vector pair */ + HVX_VectorPair offsets = *(HVX_VectorPair *)word_offsets; + HVX_Vector pred_reg = *(HVX_Vector *)half_predicates; + pred_reg = Q6_Vh_vshuff_Vh(pred_reg); + HVX_VectorPred preds = Q6_Q_vand_VR(pred_reg, ~0); + + *vgather = Q6_Vh_vsplat_R(gather_q_16_init()); + /* do the gather to vgather */ + Q6_vgather_AQRMWw(vgather, preds, VTCM_SCATTER16_32_ADDRESS, region_len, + offsets); + + /* the read of gather will stall until completion */ + volatile HVX_Vector values = *(HVX_Vector *)vgather; + + /* deal the elements to get the order back */ + values = Q6_Vh_vdeal_Vh(values); + + /* write it back to vtcm address */ + *(HVX_Vector *)vgather = values; + + + PRINT_CYCLES("\nVector Gather Q 16_32 cycles = %llu\n"); +} + + +static void check_buffer(const char *name, void *c, void *r, size_t size) +{ + char *check = (char *)c; + char *ref = (char *)r; + /* printf("check buffer %s 0x%x, 0x%x, %d\n", name, check, ref, size); */ + for (int i = 0; i < size; i++) { + if (check[i] != ref[i]) { + printf("Error %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i, check[i], + check[i], ref[i], ref[i]); + err++; + } + } +} + + +/* + * These scalar functions are the C equivalents of the vector functions that + * use HVX + */ + +/* scatter the 16 bit elements using C */ +void scalar_scatter_16(unsigned short *vscatter16) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; ++i) { + vscatter16[half_offsets[i] / 2] = half_values[i]; + } + + PRINT_CYCLES("\nScalar Scatter 16 cycles = %llu\n"); +} + +void check_scatter_16() +{ + memset(vscatter16_ref, FILL_CHAR, + SCATTER_BUFFER_SIZE * sizeof(unsigned short)); + scalar_scatter_16(vscatter16_ref); + check_buffer("check_scatter_16", vscatter16, vscatter16_ref, + SCATTER_BUFFER_SIZE * sizeof(unsigned short)); +} + +/* scatter the 16 bit elements using C */ +void scalar_scatter_acc_16(unsigned short *vscatter16) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; ++i) { + vscatter16[half_offsets[i] / 2] += half_acc_values[i]; + } + + PRINT_CYCLES("\nScalar Scatter Acc 16 cycles = %llu\n"); +} + +/* scatter the 16 bit elements using C */ +void scalar_scatter_q_16(unsigned short *vscatter16) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; i++) { + if (half_predicates[i]) { + vscatter16[half_offsets[i] / 2] = half_q_values[i]; + } + } + + PRINT_CYCLES("\nScalar Scatter Q 16 cycles = %llu\n"); +} + + +void check_scatter_acc_16() +{ + memset(vscatter16_ref, FILL_CHAR, + SCATTER_BUFFER_SIZE * sizeof(unsigned short)); + scalar_scatter_16(vscatter16_ref); + scalar_scatter_acc_16(vscatter16_ref); + check_buffer("check_scatter_acc_16", vscatter16, vscatter16_ref, + 
SCATTER_BUFFER_SIZE * sizeof(unsigned short)); +} + +void check_scatter_q_16() +{ + memset(vscatter16_ref, FILL_CHAR, + SCATTER_BUFFER_SIZE * sizeof(unsigned short)); + scalar_scatter_16(vscatter16_ref); + scalar_scatter_acc_16(vscatter16_ref); + scalar_scatter_q_16(vscatter16_ref); + check_buffer("check_scatter_q_16", vscatter16, vscatter16_ref, + SCATTER_BUFFER_SIZE * sizeof(unsigned short)); +} + +/* scatter the 32 bit elements using C */ +void scalar_scatter_32(unsigned int *vscatter32) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; ++i) { + vscatter32[word_offsets[i] / 4] = word_values[i]; + } + + PRINT_CYCLES("\n\nScalar Scatter 32 cycles = %llu\n"); +} + +/* scatter the 32 bit elements using C */ +void scalar_scatter_acc_32(unsigned int *vscatter32) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; ++i) { + vscatter32[word_offsets[i] / 4] += word_acc_values[i]; + } + + PRINT_CYCLES("\nScalar Scatter Acc 32 cycles = %llu\n"); +} + +/* scatter the 32 bit elements using C */ +void scalar_scatter_q_32(unsigned int *vscatter32) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; i++) { + if (word_predicates[i]) { + vscatter32[word_offsets[i] / 4] = word_q_values[i]; + } + } + + PRINT_CYCLES("\nScalar Scatter Q 32 cycles = %llu\n"); +} + +void check_scatter_32() +{ + memset(vscatter32_ref, FILL_CHAR, + SCATTER_BUFFER_SIZE * sizeof(unsigned int)); + scalar_scatter_32(vscatter32_ref); + check_buffer("check_scatter_32", vscatter32, vscatter32_ref, + SCATTER_BUFFER_SIZE * sizeof(unsigned int)); +} + +void check_scatter_acc_32() +{ + memset(vscatter32_ref, FILL_CHAR, + SCATTER_BUFFER_SIZE * sizeof(unsigned int)); + scalar_scatter_32(vscatter32_ref); + scalar_scatter_acc_32(vscatter32_ref); + check_buffer("check_scatter_acc_32", vscatter32, vscatter32_ref, + SCATTER_BUFFER_SIZE * sizeof(unsigned int)); +} + +void check_scatter_q_32() +{ + memset(vscatter32_ref, FILL_CHAR, + SCATTER_BUFFER_SIZE * sizeof(unsigned int)); + scalar_scatter_32(vscatter32_ref); + scalar_scatter_acc_32(vscatter32_ref); + scalar_scatter_q_32(vscatter32_ref); + check_buffer("check_scatter_q_32", vscatter32, vscatter32_ref, + SCATTER_BUFFER_SIZE * sizeof(unsigned int)); +} + +/* scatter the 32 bit elements using C */ +void scalar_scatter_16_32(unsigned short *vscatter16_32) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; ++i) { + vscatter16_32[word_offsets[i] / 2] = half_values[i]; + } + + PRINT_CYCLES("\n\nScalar Scatter 16_32 cycles = %llu\n"); +} + +/* scatter the 32 bit elements using C */ +void scalar_scatteracc_16_32(unsigned short *vscatter16_32) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; ++i) { + vscatter16_32[word_offsets[i] / 2] += half_acc_values[i]; + } + + PRINT_CYCLES("\n\nScalar Scatter Acc 16_32 cycles = %llu\n"); +} + +void scalar_scatter_q_16_32(unsigned short *vscatter16_32) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; i++) { + if (half_predicates[i]) { + vscatter16_32[word_offsets[i] / 2] = half_q_values[i]; + } + } + + PRINT_CYCLES("\nScalar Scatter Q 16_32 cycles = %llu\n"); +} + +void check_scatter_16_32() +{ + memset(vscatter16_32_ref, FILL_CHAR, + SCATTER_BUFFER_SIZE * sizeof(unsigned short)); + scalar_scatter_16_32(vscatter16_32_ref); + check_buffer("check_scatter_16_32", vscatter16_32, vscatter16_32_ref, + SCATTER_BUFFER_SIZE * sizeof(unsigned short)); +} + +void check_scatter_acc_16_32() +{ + memset(vscatter16_32_ref, FILL_CHAR, + SCATTER_BUFFER_SIZE * sizeof(unsigned short)); + scalar_scatter_16_32(vscatter16_32_ref); + 
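+    /* (replays the plain scatter, then the accumulating one on top) */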
scalar_scatteracc_16_32(vscatter16_32_ref); + check_buffer("check_scatter_acc_16_32", vscatter16_32, vscatter16_32_ref, + SCATTER_BUFFER_SIZE * sizeof(unsigned short)); +} + +void check_scatter_q_16_32() +{ + memset(vscatter16_32_ref, FILL_CHAR, + SCATTER_BUFFER_SIZE * sizeof(unsigned short)); + scalar_scatter_16_32(vscatter16_32_ref); + scalar_scatteracc_16_32(vscatter16_32_ref); + scalar_scatter_q_16_32(vscatter16_32_ref); + check_buffer("check_scatter_q_16_32", vscatter16_32, vscatter16_32_ref, + SCATTER_BUFFER_SIZE * sizeof(unsigned short)); +} + +/* gather the elements from the scatter buffer using C */ +void scalar_gather_16(unsigned short *vgather16) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; ++i) { + vgather16[i] = vscatter16[half_offsets[i] / 2]; + } + + PRINT_CYCLES("\n\nScalar Gather 16 cycles = %llu\n"); +} + +void scalar_gather_q_16(unsigned short *vgather16) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; ++i) { + if (half_predicates[i]) { + vgather16[i] = vscatter16[half_offsets[i] / 2]; + } + } + + PRINT_CYCLES("\n\nScalar Gather Q 16 cycles = %llu\n"); +} + +void check_gather_16() +{ + memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short)); + scalar_gather_16(vgather16_ref); + check_buffer("check_gather_16", vgather16, vgather16_ref, + MATRIX_SIZE * sizeof(unsigned short)); +} + +void check_gather_q_16() +{ + memset(vgather16_ref, gather_q_16_init(), + MATRIX_SIZE * sizeof(unsigned short)); + scalar_gather_q_16(vgather16_ref); + check_buffer("check_gather_q_16", vgather16, vgather16_ref, + MATRIX_SIZE * sizeof(unsigned short)); +} + +/* gather the elements from the scatter buffer using C */ +void scalar_gather_32(unsigned int *vgather32) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; ++i) { + vgather32[i] = vscatter32[word_offsets[i] / 4]; + } + + PRINT_CYCLES("\n\nScalar Gather 32 cycles = %llu\n"); +} + +void scalar_gather_q_32(unsigned int *vgather32) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; ++i) { + if (word_predicates[i]) { + vgather32[i] = vscatter32[word_offsets[i] / 4]; + } + } + + PRINT_CYCLES("\n\nScalar Gather Q 32 cycles = %llu\n"); +} + + +void check_gather_32(void) +{ + memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int)); + scalar_gather_32(vgather32_ref); + check_buffer("check_gather_32", vgather32, vgather32_ref, + MATRIX_SIZE * sizeof(unsigned int)); +} + +void check_gather_q_32(void) +{ + memset(vgather32_ref, gather_q_32_init(), + MATRIX_SIZE * sizeof(unsigned int)); + scalar_gather_q_32(vgather32_ref); + check_buffer("check_gather_q_32", vgather32, vgather32_ref, + MATRIX_SIZE * sizeof(unsigned int)); +} + +/* gather the elements from the scatter buffer using C */ +void scalar_gather_16_32(unsigned short *vgather16_32) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; ++i) { + vgather16_32[i] = vscatter16_32[word_offsets[i] / 2]; + } + + PRINT_CYCLES("\n\nScalar Gather 16_32 cycles = %llu\n"); +} + +void scalar_gather_q_16_32(unsigned short *vgather16_32) +{ + START_CYCLES; + + for (int i = 0; i < MATRIX_SIZE; ++i) { + if (half_predicates[i]) { + vgather16_32[i] = vscatter16_32[word_offsets[i] / 2]; + } + } + + PRINT_CYCLES("\n\nScalar Gather Q 16_32 cycles = %llu\n"); +} + +void check_gather_16_32(void) +{ + memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short)); + scalar_gather_16_32(vgather16_32_ref); + check_buffer("check_gather_16_32", vgather16_32, vgather16_32_ref, + MATRIX_SIZE * sizeof(unsigned short)); +} + +void check_gather_q_16_32(void) +{ + 
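+    /* seed the reference with the same '?' fill the masked gather uses */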
+    memset(vgather16_32_ref, gather_q_16_init(),
+           MATRIX_SIZE * sizeof(unsigned short));
+    scalar_gather_q_16_32(vgather16_32_ref);
+    check_buffer("check_gather_q_16_32", vgather16_32, vgather16_32_ref,
+                 MATRIX_SIZE * sizeof(unsigned short));
+}
+
+/* These functions print the buffers to the display */
+
+/* print the scatter16 buffer */
+void print_scatter16_buffer(void)
+{
+#if PRINT_DATA
+    /*
+     * printf("\n\nPrinting the 16 bit scatter buffer at 0x%08x",
+     *        VTCM_SCATTER16_ADDRESS);
+     */
+    printf("\n\nPrinting the 16 bit scatter buffer");
+
+    for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
+        if ((i % MATRIX_SIZE) == 0) {
+            printf("\n");
+        }
+
+        for (int j = 0; j < 2; j++) {
+            printf("%c", (unsigned char)((vscatter16[i] >> (j * 8)) & 0xff));
+        }
+
+        printf(" ");
+    }
+    printf("\n");
+#endif
+}
+
+/* print the gather 16 buffer */
+void print_gather_result_16(void)
+{
+#if PRINT_DATA
+    /*
+     * printf("\n\nPrinting the 16 bit gather result at 0x%08x\n",
+     *        VTCM_GATHER16_ADDRESS);
+     */
+    printf("\n\nPrinting the 16 bit gather result\n");
+
+    for (int i = 0; i < MATRIX_SIZE; i++) {
+        for (int j = 0; j < 2; j++) {
+            printf("%c", (unsigned char)((vgather16[i] >> (j * 8)) & 0xff));
+        }
+
+        printf(" ");
+    }
+    printf("\n");
+#endif
+}
+
+/* print the scatter32 buffer */
+void print_scatter32_buffer(void)
+{
+#if PRINT_DATA
+    /*
+     * printf("\n\nPrinting the 32 bit scatter buffer at 0x%08x",
+     *        VTCM_SCATTER32_ADDRESS);
+     */
+    printf("\n\nPrinting the 32 bit scatter buffer");
+
+    for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
+        if ((i % MATRIX_SIZE) == 0) {
+            printf("\n");
+        }
+
+        for (int j = 0; j < 4; j++) {
+            printf("%c", (unsigned char)((vscatter32[i] >> (j * 8)) & 0xff));
+        }
+
+        printf(" ");
+    }
+    printf("\n");
+#endif
+}
+
+/* print the gather 32 buffer */
+void print_gather_result_32(void)
+{
+#if PRINT_DATA
+    /*
+     * printf("\n\nPrinting the 32 bit gather result at 0x%08x\n",
+     *        VTCM_GATHER32_ADDRESS);
+     */
+    printf("\n\nPrinting the 32 bit gather result\n");
+
+    for (int i = 0; i < MATRIX_SIZE; i++) {
+        for (int j = 0; j < 4; j++) {
+            printf("%c", (unsigned char)((vgather32[i] >> (j * 8)) & 0xff));
+        }
+
+        printf(" ");
+    }
+    printf("\n");
+#endif
+}
+
+/* print the scatter16_32 buffer */
+void print_scatter16_32_buffer(void)
+{
+#if PRINT_DATA
+    /*
+     * printf("\n\nPrinting the 16_32 bit scatter buffer at 0x%08x",
+     *        VTCM_SCATTER16_32_ADDRESS);
+     */
+    printf("\n\nPrinting the 16_32 bit scatter buffer");
+
+    for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
+        if ((i % MATRIX_SIZE) == 0) {
+            printf("\n");
+        }
+
+        for (int j = 0; j < 2; j++) {
+            printf("%c", (unsigned char)((vscatter16_32[i] >> (j * 8)) & 0xff));
+        }
+
+        printf(" ");
+    }
+    printf("\n");
+#endif
+}
+
+/* print the gather 16_32 buffer */
+void print_gather_result_16_32(void)
+{
+#if PRINT_DATA
+    /*
+     * printf("\n\nPrinting the 16_32 bit gather result at 0x%08x\n",
+     *        VTCM_GATHER16_32_ADDRESS);
+     */
+    printf("\n\nPrinting the 16_32 bit gather result\n");
+
+    for (int i = 0; i < MATRIX_SIZE; i++) {
+        for (int j = 0; j < 2; j++) {
+            printf("%c", (unsigned char)((vgather16_32[i] >> (j * 8)) & 0xff));
+        }
+
+        printf(" ");
+    }
+    printf("\n");
+#endif
+}
+
+/*
+ * Set up the TCM address translation.
+ * Note: this method is only for the standalone environment;
+ * SDK users should use the "VTCM Manager" to allocate VTCM.
+ */
+void setup_tcm(void)
+{
+    VTCM_BASE_ADDRESS = get_vtcm_base();
+
+    VTCM_SCATTER16_ADDRESS = VTCM_BASE_ADDRESS;
+    VTCM_GATHER16_ADDRESS = VTCM_BASE_ADDRESS + SCATTER16_BUF_SIZE;
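+    /* the remaining buffers are packed back to back in VTCM */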
+    VTCM_SCATTER32_ADDRESS = VTCM_GATHER16_ADDRESS + GATHER16_BUF_SIZE;
+    VTCM_GATHER32_ADDRESS = VTCM_SCATTER32_ADDRESS + SCATTER32_BUF_SIZE;
+    VTCM_SCATTER16_32_ADDRESS = VTCM_GATHER32_ADDRESS + GATHER32_BUF_SIZE;
+    VTCM_GATHER16_32_ADDRESS = VTCM_SCATTER16_32_ADDRESS + SCATTER16_BUF_SIZE;
+
+    /* the vtcm base address */
+    vtcm_base = (unsigned char *)VTCM_BASE_ADDRESS;
+
+    /* scatter gather 16 bit elements using 16 bit offsets */
+    vscatter16 = (unsigned short *)VTCM_SCATTER16_ADDRESS;
+    vgather16 = (unsigned short *)VTCM_GATHER16_ADDRESS;
+
+    /* scatter gather 32 bit elements using 32 bit offsets */
+    vscatter32 = (unsigned int *)VTCM_SCATTER32_ADDRESS;
+    vgather32 = (unsigned int *)VTCM_GATHER32_ADDRESS;
+
+    /* scatter gather 16 bit elements using 32 bit offsets */
+    vscatter16_32 = (unsigned short *)VTCM_SCATTER16_32_ADDRESS;
+    vgather16_32 = (unsigned short *)VTCM_GATHER16_32_ADDRESS;
+}
+
+void inst_test(void)
+{
+    /* Should NOT raise an error when paranoid-commit-state is turned on */
+    uint32_t R = 0;
+    asm volatile("release(%0):at\n\t" : : "r"(R));
+}
+
+int main(void)
+{
+    setup_tcm();
+    prefill_vtcm_scratch();
+
+    /* 16 bit elements with 16 bit offsets */
+    create_offsets_and_values_16();
+    create_preds_16();
+
+#if PRINT_CYCLE_COUNTS
+    scalar_scatter_16(vscatter16);
+#endif
+    vector_scatter_16();
+    print_scatter16_buffer();
+    check_scatter_16();
+
+#if PRINT_CYCLE_COUNTS
+    scalar_gather_16(vgather16);
+#endif
+    vector_gather_16();
+    print_gather_result_16();
+    check_gather_16();
+
+    vector_gather_q_16();
+    print_gather_result_16();
+    check_gather_q_16();
+
+    vector_scatter_acc_16();
+    print_scatter16_buffer();
+    check_scatter_acc_16();
+
+    vector_scatter_q_16();
+    print_scatter16_buffer();
+    check_scatter_q_16();
+
+    /* 32 bit elements with 32 bit offsets */
+    create_offsets_and_values_32();
+    create_preds_32();
+
+#if PRINT_CYCLE_COUNTS
+    scalar_scatter_32(vscatter32);
+#endif
+    vector_scatter_32();
+    print_scatter32_buffer();
+    check_scatter_32();
+
+#if PRINT_CYCLE_COUNTS
+    scalar_gather_32(vgather32);
+#endif
+    vector_gather_32();
+    print_gather_result_32();
+    check_gather_32();
+
+    vector_gather_q_32();
+    print_gather_result_32();
+    check_gather_q_32();
+
+    vector_scatter_acc_32();
+    print_scatter32_buffer();
+    check_scatter_acc_32();
+
+    vector_scatter_q_32();
+    print_scatter32_buffer();
+    check_scatter_q_32();
+
+    /* 16 bit elements with 32 bit offsets */
+    create_offsets_and_values_16_32();
+    create_preds_16_32();
+
+#if PRINT_CYCLE_COUNTS
+    scalar_scatter_16_32(vscatter16_32);
+#endif
+    vector_scatter_16_32();
+    print_scatter16_32_buffer();
+    check_scatter_16_32();
+
+#if PRINT_CYCLE_COUNTS
+    scalar_gather_16_32(vgather16_32);
+#endif
+    vector_gather_16_32();
+    print_gather_result_16_32();
+    check_gather_16_32();
+
+    vector_gather_q_16_32();
+    print_gather_result_16_32();
+    check_gather_q_16_32();
+
+    vector_scatter_acc_16_32();
+    print_scatter16_32_buffer();
+    check_scatter_acc_16_32();
+
+    vector_scatter_q_16_32();
+    print_scatter16_32_buffer();
+    check_scatter_q_16_32();
+
+    inst_test();
+    printf("%s\n", err ? "FAIL" : "PASS");
+    return err;
+}
diff --git a/tests/tcg/hexagon/system/strutils.h b/tests/tcg/hexagon/system/strutils.h
new file mode 100644
index 000000000000..14f4a290b817
--- /dev/null
+++ b/tests/tcg/hexagon/system/strutils.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright(c) 2023-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef STRUTILS_H
+#define STRUTILS_H
+
+#include <string.h>
+
+static inline void sort_str_arr(char **arr, size_t n)
+{
+    for (size_t i = 0; i + 1 < n; i++) {
+        for (size_t j = 0; j + i + 1 < n; j++) {
+            if (strcmp(arr[j], arr[j + 1]) > 0) {
+                char *tmp = arr[j];
+                arr[j] = arr[j + 1];
+                arr[j + 1] = tmp;
+            }
+        }
+    }
+}
+
+#endif
diff --git a/tests/tcg/hexagon/system/tlb-miss-tlblock.S b/tests/tcg/hexagon/system/tlb-miss-tlblock.S
new file mode 100644
index 000000000000..fe07aca47b37
--- /dev/null
+++ b/tests/tcg/hexagon/system/tlb-miss-tlblock.S
@@ -0,0 +1,156 @@
+/*
+ * Copyright(c) 2019-2025 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+/*
+ * Test Purpose:
+ * Verify that tlbmissx and tlbmissrw do not set the syscfg.tl bit.
+ * The HW spec says:
+ * "TLBLOCK is acquired automatically whenever a hardware thread raises a
+ * TLB miss-RW or TLBmiss-X exception."
+ * A casual reader might assume that a miss handler therefore
+ * implicitly holds the lock; apparently that
+ * isn't the case.
+ */
+
+.global start
+start:
+    r0 = ##evb
+    evb = r0
+    r0 = ##0
+    ssr = r0
+    jump #setup
+
+#define tlb_index r11
+#define stack r29
+#define data r18
+tlb_index = ##0x00000007
+
+.org 0x100
+
+evb:
+    jump #reset
+    jump #nmi
+    jump #error
+    jump #0
+    jump #tlbmissx
+    jump #0
+    jump #tlbmissrw
+
+
+setup:
+    {
+        r1 = ##0xc009b800
+        r0 = ##0xf7137010
+    }
+    tlb_index = add(tlb_index, #1)
+    tlbw(r1:0,tlb_index)
+
+/* Enable MMU */
+    r2 = ##0x0085a07f
+    syscfg = r2
+
+/* Test setup */
+    r12 = #0x12
+    r0 = #0x6
+    r7 = ##0x77777777
+    r6 = ##0x66666666
+    data = ##0xf2000000
+    stack = ##0x9ba01000
+    jump ##.L_server_loop
+
+/* event vector handlers */
+reset:
+    r2 = #1
+    stop(r0)
+nmi:
+    r2 = #1
+    stop(r0)
+error:
+    r2 = #1
+    stop(r0)
+
+
+/*
+ * Can only handle a single execute (X) fault.
+ */
+tlbmissx:
+    r0 = syscfg
+    r1 = #0x800
+/*
+ * Fail if we automatically start setting SYSCFG:TL again.
+ */
+    r0 = and(r0, r1)
+    {
+        p0 = cmp.eq(r0, r1); if (p0.new) jump:t .Lfailmissx
+    }
+    {
+        r1 = ##0xc009b900
+        r0 = ##0xf7137210
+    }
+    tlb_index = add(tlb_index, #1)
+    tlbw(r1:0,tlb_index)
+    tlbunlock
+    rte
+    stop(r0);
+.Lfailmissx:
+    r2 = #1
+    stop(r2);
+
+/*
+ * Can only handle one stack fault and one data fault.
+ */
+tlbmissrw:
+    r0 = syscfg
+    r1 = #0x800
+/*
+ * Fail if we automatically start setting SYSCFG:TL again.
+ */
+    r0 = and(r0, r1)
+    {
+        p0 = cmp.eq(r0, r1); if (p0.new) jump:t .Lfailmissrw
+    }
+    r0 = badva
+    p0 = cmp.eq(stack, r0)    // missed the stack
+    if (!p0) jump .Ldata
+    {
+        r1 = ##0xc009ba00
+        r0 = ##0xf7137210
+    }
+    jump #.Ldone
+.Ldata:
+    {
+        r1 = ##0xc00f2000
+        r0 = ##0xf71e4010
+    }
+.Ldone:
+    tlb_index = add(tlb_index, #1)
+    tlbw(r1:0,tlb_index)
+    tlbunlock
+    rte
+.Lfailmissrw:
+    r2 = #1
+    stop(r2);
+
+
+
+.org 0x100000
+    nop
+.Lpass:
+    r2 = #0
+    stop(r0);
+    trap0(#0x18)
+.L_server_loop:
+{
+    p0 = cmp.eq(r0,#-0x1)
+    if (!p0.new) jump:t .Lpass
+    memd(stack) = r7:6;    // S1 store to stack will also fault
+    memw(data) = r12;      // S0 store will fault
+}
+/*
+ * We should not get here:
+ */
+    r2 = #1
+    stop(r0);
diff --git a/tests/tcg/hexagon/system/vid_reg.c b/tests/tcg/hexagon/system/vid_reg.c
new file mode 100644
index 000000000000..25f266f98b2d
--- /dev/null
+++ b/tests/tcg/hexagon/system/vid_reg.c
@@ -0,0 +1,36 @@
+/*
+ * Verify vid reads/writes really update the register.
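+ * A write of L2VIC_NO_PENDING (0xffffffff) must leave the value unchanged.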
+ *
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+
+static inline uint32_t getvid(void)
+{
+    uint32_t reg;
+    asm volatile("%0=vid;" : "=r"(reg));
+    return reg;
+}
+static inline void setvid(uint32_t val)
+{
+    asm volatile("vid=%0;" : : "r"(val));
+    return;
+}
+int main(void)
+{
+    uint32_t testval = 0x3ff03ff;
+    setvid(testval);
+    if (testval != getvid()) {
+        printf("ERROR: vid read returned: 0x%x\n", getvid());
+    }
+    assert(testval == getvid());
+
+    /* L2VIC_NO_PENDING (0xffffffff) should not update the vid */
+    setvid(0xffffffff);
+    if (testval != getvid()) {
+        printf("ERROR: vid read returned: 0x%x\n", getvid());
+    }
+
+    assert(testval == getvid());
+}
diff --git a/tests/tcg/hexagon/utimer.c b/tests/tcg/hexagon/utimer.c
new file mode 100644
index 000000000000..ae3bca320192
--- /dev/null
+++ b/tests/tcg/hexagon/utimer.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright(c) 2022-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+
+static int err;
+
+#include "hex_test.h"
+
+static uint64_t get_time(void)
+{
+    uint64_t time;
+    asm volatile("%0 = utimer\n\t"
+                 : "=r"(time)
+                 :
+                 :
+                 );
+    return time;
+}
+
+static uint64_t get_time_from_regs(void)
+{
+    uint32_t time_low;
+    uint32_t time_high;
+    /* note: reading the utimerhi/utimerlo pair piecewise is not atomic */
+    asm volatile("%0 = utimerhi\n\t"
+                 "%1 = utimerlo\n\t"
+                 : "=r"(time_high), "=r"(time_low)
+                 :
+                 :
+                 );
+    return ((uint64_t)time_high << 32) | (uint64_t)time_low;
+}
+
+
+int main(void)
+{
+    err = 0;
+
+    uint64_t t0 = get_time();
+    check64_ne(t0, 0);
+
+    uint64_t t1 = get_time_from_regs();
+    check64_ne(t1, 0);
+
+    puts(err ? "FAIL" : "PASS");
+    return err;
+}