From ad90d0426b1447f953b1ece5c013e97986959483 Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Wed, 4 Aug 2021 14:11:54 +0100
Subject: [PATCH] julia: initial Julia implementation

---
 README.md                             |   8 +-
 miniBUDE.jl/.JuliaFormatter.toml      |   2 +
 miniBUDE.jl/Manifest.toml             | 489 ++++++++++++++++++++++++++
 miniBUDE.jl/Project.toml              |  19 +
 miniBUDE.jl/README.md                 |  29 ++
 miniBUDE.jl/src/AMDGPU.jl             | 216 ++++++++++++
 miniBUDE.jl/src/BUDE.jl               | 250 +++++++++++++
 miniBUDE.jl/src/CUDA.jl               | 236 +++++++++++++
 miniBUDE.jl/src/KernelAbstractions.jl | 250 +++++++++++++
 miniBUDE.jl/src/Threaded.jl           | 180 ++++++++++
 miniBUDE.jl/src/miniBUDE.jl           |   3 +
 miniBUDE.jl/src/oneAPI.jl             | 201 +++++++++++
 12 files changed, 1881 insertions(+), 2 deletions(-)
 create mode 100644 miniBUDE.jl/.JuliaFormatter.toml
 create mode 100644 miniBUDE.jl/Manifest.toml
 create mode 100644 miniBUDE.jl/Project.toml
 create mode 100644 miniBUDE.jl/README.md
 create mode 100644 miniBUDE.jl/src/AMDGPU.jl
 create mode 100644 miniBUDE.jl/src/BUDE.jl
 create mode 100644 miniBUDE.jl/src/CUDA.jl
 create mode 100644 miniBUDE.jl/src/KernelAbstractions.jl
 create mode 100644 miniBUDE.jl/src/Threaded.jl
 create mode 100644 miniBUDE.jl/src/miniBUDE.jl
 create mode 100644 miniBUDE.jl/src/oneAPI.jl

diff --git a/README.md b/README.md
index f2c67f3..d08eb26 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ Increasing the iteration count has similar performance effects to docking multip
 
 The top-level `data` directory contains the input common to implementations.
 The top-level `makedeck` directory contains an input deck generation program and a set of mol2/bhff input files.
-Each other subdirectory contains a separate implementation:
+Each other subdirectory contains a separate C/C++ implementation:
 
 - [OpenMP](openmp/) for CPUs
 - [OpenMP target](openmp-target/) for GPUs
@@ -18,6 +18,11 @@ Each other subdirectory contains a separate implementation:
 - [SYCL](sycl/) for CPUs and GPUs
 - [Kokkos](kokkos/) for CPUs and GPUs
 
+We also include implementations in emerging programming languages as direct ports of miniBUDE:
+
+- [Julia](miniBUDE.jl) for CPUs (@threads) and GPUs ([CUDA.jl](https://juliagpu.gitlab.io/CUDA.jl/), [AMDGPU.jl](https://amdgpu.juliagpu.org/stable/), [oneAPI.jl](https://github.com/JuliaGPU/oneAPI.jl), etc)
+
+
 ## Building
 
 To build with the default options, type `make` in an implementation directory.
@@ -30,7 +35,6 @@ Refer to each implementation's README for further build instructions.
 To run with the default options, run the binary without any flags.
 To adjust the run time, use `-i` to set the number of iterations.
 For very short runs, e.g. for simulation, use `-n 1024` to reduce the number of poses.
-The maximum number of poses supported is `65536`.
 
 Refer to each implementation's README for further run instructions.
 
diff --git a/miniBUDE.jl/.JuliaFormatter.toml b/miniBUDE.jl/.JuliaFormatter.toml
new file mode 100644
index 0000000..ac95ddd
--- /dev/null
+++ b/miniBUDE.jl/.JuliaFormatter.toml
@@ -0,0 +1,2 @@
+indent = 2
+margin = 100
\ No newline at end of file
diff --git a/miniBUDE.jl/Manifest.toml b/miniBUDE.jl/Manifest.toml
new file mode 100644
index 0000000..e2ff0bc
--- /dev/null
+++ b/miniBUDE.jl/Manifest.toml
@@ -0,0 +1,489 @@
+# This file is machine-generated - editing it directly is not advised
+
+[[AMDGPU]]
+deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Printf", "Random", "Requires", "Setfield", "hsa_rocr_jll", "hsakmt_roct_jll"]
+git-tree-sha1 = "04fdb3923ac6f55fa7347dce0f0f6f10e321e2e9"
+uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+version = "0.2.7"
+
+[[AbstractFFTs]]
+deps = ["LinearAlgebra"]
+git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0"
+uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
+version = "1.0.1"
+
+[[Adapt]]
+deps = ["LinearAlgebra"]
+git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7"
+uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+version = "3.3.1"
+
+[[ArgParse]]
+deps = ["Logging", "TextWrap"]
+git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d"
+uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
+version = "1.1.4"
+
+[[ArgTools]]
+uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
+
+[[Artifacts]]
+uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
+
+[[BFloat16s]]
+deps = ["LinearAlgebra", "Test"]
+git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a"
+uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
+version = "0.1.0"
+
+[[Base64]]
+uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
+
+[[BinaryProvider]]
+deps = ["Libdl", "Logging", "SHA"]
+git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
+uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
+version = "0.5.10"
+
+[[Bzip2_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
+uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
+version = "1.0.8+0"
+
+[[CEnum]]
+git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9"
+uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
+version = "0.4.1"
+
+[[CUDA]]
+deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
+git-tree-sha1 = "364179416eabc34c9ca32126a6bdb431680c3bad"
+uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
+version = "3.2.1"
+
+[[CUDAKernels]]
+deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"]
+git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b"
+uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
+version = "0.3.0"
+
+[[Cassette]]
+git-tree-sha1 = "087e76b8d48c014112ba890892c33be42ad10504"
+uuid = "7057c7e9-c182-5462-911a-8362d720325c"
+version = "0.3.7"
+
+[[ChainRulesCore]]
+deps = ["Compat", "LinearAlgebra", "SparseArrays"]
+git-tree-sha1 = "0b0aa9d61456940511416b59a0e902c57b154956"
+uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+version = "0.10.12"
+
+[[Compat]]
+deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
+git-tree-sha1 = "dc7dedc2c2aa9faf59a55c622760a25cbefbe941"
+uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
+version = "3.31.0"
+
+[[CompilerSupportLibraries_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
+
+[[ConstructionBase]]
+deps = ["LinearAlgebra"]
+git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4"
+uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
+version = "1.3.0"
+
+[[DataStructures]]
+deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
+git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677"
+uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+version = "0.18.9"
+
+[[Dates]]
+deps = ["Printf"]
+uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
+
+[[DelimitedFiles]]
+deps = ["Mmap"]
+uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+
+[[Distributed]]
+deps = ["Random", "Serialization", "Sockets"]
+uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+
+[[DocStringExtensions]]
+deps = ["LibGit2"]
+git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f"
+uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
+version = "0.8.5"
+
+[[Downloads]]
+deps = ["ArgTools", "LibCURL", "NetworkOptions"]
+uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
+
+[[Elfutils_jll]]
+deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"]
+git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436"
+uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a"
+version = "0.182.0+0"
+
+[[ExprTools]]
+git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e"
+pinned = true
+uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
+version = "0.1.3"
+
+[[Future]]
+deps = ["Random"]
+uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
+
+[[GPUArrays]]
+deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
+git-tree-sha1 = "df5b8569904c5c10e84c640984cfff054b18c086"
+uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+version = "6.4.1"
+
+[[GPUCompiler]]
+deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"]
+git-tree-sha1 = "42d635f6d87af125b86288df3819f805fb4d851a"
+uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
+version = "0.11.5"
+
+[[InteractiveUtils]]
+deps = ["Markdown"]
+uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+
+[[JLLWrappers]]
+deps = ["Preferences"]
+git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e"
+uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
+version = "1.3.0"
+
+[[KernelAbstractions]]
+deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"]
+git-tree-sha1 = "5e6c70389c1b1e40adb81664ca8cea6ce8127afc"
+uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+version = "0.7.0"
+
+[[LLVM]]
+deps = ["CEnum", "Libdl", "Printf", "Unicode"]
+git-tree-sha1 = "f57ac3fd2045b50d3db081663837ac5b4096947e"
+uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
+version = "3.9.0"
+
+[[LazyArtifacts]]
+deps = ["Artifacts", "Pkg"]
+uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
+
+[[LibCURL]]
+deps = ["LibCURL_jll", "MozillaCACerts_jll"]
+uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
+
+[[LibCURL_jll]]
+deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
+uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
+
+[[LibGit2]]
+deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
+uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
+
+[[LibSSH2_jll]]
+deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
+uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
+
+[[Libdl]]
+uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+
+[[LinearAlgebra]]
+deps = ["Libdl"]
+uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+
+[[LogExpFunctions]]
+deps = ["DocStringExtensions", "LinearAlgebra"]
+git-tree-sha1 = "7bd5f6565d80b6bf753738d2bc40a5dfea072070"
+uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
+version = "0.2.5"
+
+[[Logging]]
+uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
+
+[[MacroTools]]
+deps = ["Markdown", "Random"]
+git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0"
+uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
+version = "0.5.6"
+
+[[Markdown]]
+deps = ["Base64"]
+uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
+
+[[MbedTLS_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
+
+[[Memoize]]
+deps = ["MacroTools"]
+git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa"
+uuid = "c03570c3-d221-55d1-a50c-7939bbd78826"
+version = "0.4.4"
+
+[[Mmap]]
+uuid = "a63ad114-7e13-5084-954f-fe012c677804"
+
+[[MozillaCACerts_jll]]
+uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
+
+[[NEO_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"]
+git-tree-sha1 = "c753dd029eb0837658bf8eaee041c19e4ce5bb8c"
+uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd"
+version = "21.12.19358+0"
+
+[[NUMA_jll]]
+deps = ["Libdl", "Pkg"]
+git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e"
+uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d"
+version = "2.0.13+1"
+
+[[NetworkOptions]]
+uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
+
+[[OpenSpecFun_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
+uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
+version = "0.5.5+0"
+
+[[OrderedCollections]]
+git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
+uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
+version = "1.4.1"
+
+[[Parameters]]
+deps = ["OrderedCollections", "UnPack"]
+git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345"
+uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
+version = "0.12.2"
+
+[[Pkg]]
+deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
+uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+
+[[Preferences]]
+deps = ["TOML"]
+git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
+uuid = "21216c6a-2e73-6563-6e65-726566657250"
+version = "1.2.2"
+
+[[Printf]]
+deps = ["Unicode"]
+uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+
+[[REPL]]
+deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
+uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
+
+[[ROCKernels]]
+deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"]
+git-tree-sha1 = "41105b861342637dde17797bdd9aaa537aca646b"
+uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
+version = "0.2.0"
+
+[[Random]]
+deps = ["Serialization"]
+uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[[Random123]]
+deps = ["Libdl", "Random", "RandomNumbers"]
+git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3"
+uuid = "74087812-796a-5b5d-8853-05524746bad3"
+version = "1.4.2"
+
+[[RandomNumbers]]
+deps = ["Random", "Requires"]
+git-tree-sha1 = "441e6fc35597524ada7f85e13df1f4e10137d16f"
+uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
+version = "1.4.0"
+
+[[Reexport]]
+git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220"
+uuid = "189a3867-3050-52da-a836-e630ba90ab69"
+version = "1.1.0"
+
+[[Requires]]
+deps = ["UUIDs"]
+git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621"
+uuid = "ae029012-a4dd-5104-9daa-d747884805df"
+version = "1.1.3"
+
+[[SHA]]
+uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+
+[[SPIRV_LLVM_Translator_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf"
+uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb"
+version = "11.0.0+2"
+
+[[SPIRV_Tools_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "3c09b0e79758ddd4ecf41b9edf449a36ad70ef73"
+uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4"
+version = "2020.6.0+0"
+
+[[Scratch]]
+deps = ["Dates"]
+git-tree-sha1 = "0b4b7f1393cff97c33891da2a0bf69c6ed241fda"
+uuid = "6c6a2e73-6563-6170-7368-637461726353"
+version = "1.1.0"
+
+[[Serialization]]
+uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+
+[[Setfield]]
+deps = ["ConstructionBase", "Future", "MacroTools", "Requires"]
+git-tree-sha1 = "d5640fc570fb1b6c54512f0bd3853866bd298b3e"
+uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
+version = "0.7.0"
+
+[[SharedArrays]]
+deps = ["Distributed", "Mmap", "Random", "Serialization"]
+uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
+
+[[Sockets]]
+uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
+
+[[SparseArrays]]
+deps = ["LinearAlgebra", "Random"]
+uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+
+[[SpecialFunctions]]
+deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"]
+git-tree-sha1 = "a50550fa3164a8c46747e62063b4d774ac1bcf49"
+uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
+version = "1.5.1"
+
+[[StaticArrays]]
+deps = ["LinearAlgebra", "Random", "Statistics"]
+git-tree-sha1 = "1b9a0f17ee0adde9e538227de093467348992397"
+uuid = "90137ffa-7385-5640-81b9-e52037218182"
+version = "1.2.7"
+
+[[Statistics]]
+deps = ["LinearAlgebra", "SparseArrays"]
+uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+
+[[TOML]]
+deps = ["Dates"]
+uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
+
+[[Tar]]
+deps = ["ArgTools", "SHA"]
+uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
+
+[[Test]]
+deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
+uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[[TextWrap]]
+git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf"
+uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
+version = "1.0.1"
+
+[[TimerOutputs]]
+deps = ["ExprTools", "Printf"]
+git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7"
+uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
+version = "0.5.12"
+
+[[UUIDs]]
+deps = ["Random", "SHA"]
+uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+
+[[UnPack]]
+git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b"
+uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
+version = "1.0.2"
+
+[[Unicode]]
+uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
+
+[[XZ_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "9f76853ea2ba894054e24640abfb73d73e5a4cb5"
+uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
+version = "5.2.5+0"
+
+[[Zlib_jll]]
+deps = ["Libdl"]
+uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
+
+[[argp_standalone_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34"
+uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3"
+version = "1.3.1+0"
+
+[[fts_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d"
+uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee"
+version = "1.2.7+1"
+
+[[gmmlib_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "4067ef455d4fa67febe26efc3f9565a9bb7ba911"
+uuid = "09858cae-167c-5acb-9302-fddc6874d481"
+version = "20.3.2+0"
+
+[[hsa_rocr_jll]]
+deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"]
+git-tree-sha1 = "42189f176d6ae4f37c0c0e652fec339bb0bfab5d"
+uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a"
+version = "3.7.0+1"
+
+[[hsakmt_roct_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"]
+git-tree-sha1 = "8a9ee6c091e952e4ea6585d15131d43f789ae041"
+uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76"
+version = "3.8.0+0"
+
+[[libigc_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "6140dbf267f7ab57fb791b49f2114374218b5c20"
+uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5"
+version = "1.0.6712+0"
+
+[[nghttp2_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
+
+[[obstack_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e"
+uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79"
+version = "1.2.2+0"
+
+[[oneAPI]]
+deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Loader_jll"]
+git-tree-sha1 = "b4a4b84c864e75fe885a1643525f0c97ce310dd9"
+uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
+version = "0.1.3"
+
+[[oneAPI_Level_Zero_Headers_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "48982fbfd2f3d0a30d644563dcf96892d252b395"
+uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d"
+version = "1.1.2+1"
+
+[[oneAPI_Level_Zero_Loader_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"]
+git-tree-sha1 = "1fa53dfdd32a732f09c254c86403e1abab653fb2"
+uuid = "13eca655-d68d-5b81-8367-6d99d727ab01"
+version = "1.3.6+0"
+
+[[p7zip_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
diff --git a/miniBUDE.jl/Project.toml b/miniBUDE.jl/Project.toml
new file mode 100644
index 0000000..626cbdd
--- /dev/null
+++ b/miniBUDE.jl/Project.toml
@@ -0,0 +1,19 @@
+name = "miniBUDE"
+uuid = "2f7434cc-9e25-426a-818f-4b8ba496e7fb"
+authors = ["Wei-Chen Lin <wl14928@bristol.ac.uk>"]
+version = "0.0.1"
+
+[deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
+ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
+ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
+
+[compat]
+julia = "1.6"
diff --git a/miniBUDE.jl/README.md b/miniBUDE.jl/README.md
new file mode 100644
index 0000000..5248996
--- /dev/null
+++ b/miniBUDE.jl/README.md
@@ -0,0 +1,29 @@
+miniBUDE Julia
+==============
+
+This is an implementation of miniBUDE in Julia which contains the following variants:
+
+ * `Threaded.jl` - Threaded implementation with `Threads.@threads` macros
+ * `CUDA.jl` - Direct port of miniBUDE native CUDA implementation using [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl)
+ * `AMDGPU.jl` - Direct port of miniBUDE's native HIP(via CUDA) implementation using [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl)
+ * `oneAPi.jl` - Direct port of miniBUDE's native SYCL implementation using [oneAPi.jl](https://github.com/JuliaGPU/oneAPI.jl)
+ * `KernelAbstractions.jl` - Direct port of miniBUDE's native CUDA implementation using [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl)
+
+### Build & Run
+
+Prerequisites
+
+ * Julia >= 1.6+
+
+With Julia on path, run the benchmark with:
+
+```shell
+> cd JuliaStream.jl
+> julia --project -e 'import Pkg; Pkg.instantiate()' # only required on first run
+> julia --project src/<IMPL>.jl # e.g `julia --project src/KernelAbstractions.jl`, see variants listed above.
+```
+
+**Important:**
+ * Julia is 1-indexed, so N >= 1 in `--device N` (alternatively, use device name substring).
+ * Thread count for `Threaded.jl` must be set via the `JULIA_NUM_THREADS` environment variable (e.g `export JULIA_NUM_THREADS=$(nproc)`) otherwise it defaults to 1.
+ * Certain implementations such as CUDA and AMDGPU will do hardware detection at runtime and may download and/or compile further software packages for the platform.
diff --git a/miniBUDE.jl/src/AMDGPU.jl b/miniBUDE.jl/src/AMDGPU.jl
new file mode 100644
index 0000000..007efa0
--- /dev/null
+++ b/miniBUDE.jl/src/AMDGPU.jl
@@ -0,0 +1,216 @@
+include("BUDE.jl")
+using AMDGPU, StaticArrays
+
+
+# AMDGPU.agents()'s internal iteration order isn't stable
+function gpu_agents_in_repr_order()
+  # XXX if we select anything other than :gpu, we get 
+  # HSA_STATUS_ERROR_INVALID_AGENT on the first kernel submission
+  sort(AMDGPU.get_agents(:gpu), by = repr)
+end
+
+function devices()::Vector{DeviceWithRepr}
+  try
+    map(d -> (d, repr(d), "AMDGPU.jl"), gpu_agents_in_repr_order())
+  catch
+    # probably unsupported
+    []
+  end
+end
+
+
+function run(params::Params, deck::Deck, device::DeviceWithRepr)
+
+  # XXX AMDGPU doesn't expose an API for setting the default like CUDA.device!()
+  # but AMDGPU.get_default_agent returns DEFAULT_AGENT so we can do it by hand
+  AMDGPU.DEFAULT_AGENT[] = device[1]
+  println("Using GPU HSA device: $(AMDGPU.get_name(device[1])) ($(repr(device[1])))")
+
+
+  protein = ROCArray{Atom}(deck.protein)
+  ligand = ROCArray{Atom}(deck.ligand)
+  forcefield = ROCArray{FFParams}(deck.forcefield)
+  poses = ROCArray{Float32,2}(deck.poses)
+
+  nprotein::Int = length(deck.protein)
+  nligand::Int = length(deck.ligand)
+  nforcefield::Int = length(deck.forcefield)
+  nposes::Int = size(deck.poses)[2]
+
+  etotals = ROCArray{Float32}(undef, nposes)
+  blocks_size = ceil(UInt, nposes / params.ppwi)
+  nthreads = params.wgsize
+
+  println("Using kernel parameters: <<<$(blocks_size),$(nthreads)>>> 1:$nposes")
+
+  # warmup
+  AMDGPU.wait(
+    @roc groupsize = nthreads gridsize = blocks_size fasten_main(
+      nprotein,
+      nligand,
+      nforcefield,
+      nposes,
+      protein,
+      ligand,
+      forcefield,
+      poses,
+      etotals,
+      Val(convert(Int, params.ppwi)),
+    )
+  )
+  elapsed = @elapsed begin
+    for _ = 1:params.iterations
+      AMDGPU.wait(
+        @roc groupsize = nthreads gridsize = blocks_size fasten_main(
+          nprotein,
+          nligand,
+          nforcefield,
+          nposes,
+          protein,
+          ligand,
+          forcefield,
+          poses,
+          etotals,
+          Val(convert(Int, params.ppwi)),
+        )
+      )
+    end
+  end
+
+  (Array{Float32}(etotals), elapsed, params.ppwi)
+end
+
+
+@fastmath function fasten_main(
+  nprotein::Int,
+  nligand::Int,
+  nforcefield::Int,
+  nposes::Int,
+  protein,
+  ligand,
+  forcefield,
+  poses,
+  etotals,
+  ::Val{PPWI},
+) where {PPWI}
+
+  #  Get index of first TD
+  ix = (workgroupIdx().x - 1) * workgroupDim().x * PPWI + workitemIdx().x
+
+  #  Have extra threads do the last member intead of return.
+  #  A return would disable use of barriers, so not using return is better
+  ix = ix <= nposes ? ix : nposes - PPWI
+
+  etot = MArray{Tuple{PPWI},Float32}(undef)
+  transform = MArray{Tuple{PPWI,3},Vec4f32}(undef)
+
+  lsz = workgroupDim().x
+  for i = 1:PPWI
+    index = (ix) + (i - 1) * lsz
+    @inbounds sx::Float32 = sin(poses[1, index])
+    @inbounds cx::Float32 = cos(poses[1, index])
+    @inbounds sy::Float32 = sin(poses[2, index])
+    @inbounds cy::Float32 = cos(poses[2, index])
+    @inbounds sz::Float32 = sin(poses[3, index])
+    @inbounds cz::Float32 = cos(poses[3, index])
+    @inbounds transform[i, 1] = #
+      Vec4f32(cy * cz, sx * sy * cz - cx * sz, cx * sy * cz + sx * sz, poses[4, index])
+    @inbounds transform[i, 2] = #
+      Vec4f32(cy * sz, sx * sy * sz + cx * cz, cx * sy * sz - sx * cz, poses[5, index])
+    @inbounds transform[i, 3] = #
+      Vec4f32(-sy, sx * cy, cx * cy, poses[6, index])
+    etot[i] = 0
+  end
+
+  for il = 1:nligand
+
+    @inbounds l_atom::Atom = ligand[il]
+    @inbounds l_params::FFParams = forcefield[l_atom.type+1]
+    lhphb_ltz::Bool = l_params.hphb < Zero
+    lhphb_gtz::Bool = l_params.hphb > Zero
+
+    lpos = MArray{Tuple{PPWI},Vec3f32}(undef)
+
+    @simd for i = 1:PPWI
+      # Transform ligand atom
+      @inbounds lpos[i] = Vec3f32(
+        transform[i, 1].w +
+        l_atom.x * transform[i, 1].x +
+        l_atom.y * transform[i, 1].y +
+        l_atom.z * transform[i, 1].z,
+        transform[i, 2].w +
+        l_atom.x * transform[i, 2].x +
+        l_atom.y * transform[i, 2].y +
+        l_atom.z * transform[i, 2].z,
+        transform[i, 3].w +
+        l_atom.x * transform[i, 3].x +
+        l_atom.y * transform[i, 3].y +
+        l_atom.z * transform[i, 3].z,
+      )
+    end
+
+    for ip = 1:nprotein
+      @inbounds p_atom = protein[ip]
+      @inbounds p_params::FFParams = forcefield[p_atom.type+1]
+
+      radij::Float32 = p_params.radius + l_params.radius
+      r_radij::Float32 = One / radij
+
+      elcdst::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Four : Two
+      elcdst1::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Quarter : Half
+      type_E::Bool = ((p_params.hbtype == HbtypeE || l_params.hbtype == HbtypeE))
+
+      phphb_ltz::Bool = p_params.hphb < Zero
+      phphb_gtz::Bool = p_params.hphb > Zero
+      phphb_nz::Bool = p_params.hphb != Zero
+      p_hphb::Float32 = p_params.hphb * (phphb_ltz && lhphb_gtz ? -One : One)
+      l_hphb::Float32 = l_params.hphb * (phphb_gtz && lhphb_ltz ? -One : One)
+      distdslv::Float32 =
+        (phphb_ltz ? (lhphb_ltz ? Npnpdist : Nppdist) : (lhphb_ltz ? Nppdist : -Float32Max))
+      r_distdslv::Float32 = One / distdslv
+
+      chrg_init::Float32 = l_params.elsc * p_params.elsc
+      dslv_init::Float32 = p_hphb + l_hphb
+
+
+      @simd for i = 1:PPWI
+        @inbounds x::Float32 = lpos[i].x - p_atom.x
+        @inbounds y::Float32 = lpos[i].y - p_atom.y
+        @inbounds z::Float32 = lpos[i].z - p_atom.z
+
+        distij::Float32 = sqrt(x * x + y * y + z * z)
+
+        # Calculate the sum of the sphere radii
+        distbb::Float32 = distij - radij
+        zone1::Bool = (distbb < Zero)
+
+        # Calculate steric energy
+        @inbounds etot[i] += (One - (distij * r_radij)) * (zone1 ? Two * Hardness : Zero)
+
+        # Calculate formal and dipole charge interactions
+        chrg_e::Float32 =
+          chrg_init * ((zone1 ? One : (One - distbb * elcdst1)) * (distbb < elcdst ? One : Zero))
+        neg_chrg_e::Float32 = -abs(chrg_e)
+        chrg_e = type_E ? neg_chrg_e : chrg_e
+        @inbounds etot[i] += chrg_e * Cnstnt
+
+        # Calculate the two cases for Nonpolar-Polar repulsive interactions
+        coeff::Float32 = (One - (distbb * r_distdslv))
+        dslv_e::Float32 = dslv_init * ((distbb < distdslv && phphb_nz) ? One : Zero)
+        dslv_e *= (zone1 ? One : coeff)
+        @inbounds etot[i] += dslv_e
+      end
+    end
+  end
+
+  td_base = (workgroupIdx().x - 1) * workgroupDim().x * PPWI + workitemIdx().x
+  if td_base <= nposes
+    @simd for i = 1:PPWI
+      @inbounds etotals[td_base+(i-1)*workgroupDim().x] = etot[i] * Half
+    end
+  end
+
+  nothing
+end
+
+main()
diff --git a/miniBUDE.jl/src/BUDE.jl b/miniBUDE.jl/src/BUDE.jl
new file mode 100644
index 0000000..559dcf5
--- /dev/null
+++ b/miniBUDE.jl/src/BUDE.jl
@@ -0,0 +1,250 @@
+using ArgParse
+using Parameters
+using Printf
+using Base: Float64, Float32, Int, Int32
+import Base.read
+
+const DefaultInterations = 8
+const DefaultNPoses = 65536
+const RefNPoses = 65536
+const DefaultWGSize = 64
+const DefaultPPWI = 4
+
+const Zero = 0.0f0
+const Quarter = 0.25f0
+const Half = 0.5f0
+const One = 1.0f0
+const Two = 2.0f0
+const Four = 4.0f0
+const Cnstnt = 45.0f0
+
+const HbtypeF = 70
+const HbtypeE = 69
+const Hardness = 38.0f0
+const Npnpdist = 5.5f0
+const Nppdist = 1.0f0
+
+const Float32Max = typemax(Float32)
+
+struct Vec3f32
+  x::Float32
+  y::Float32
+  z::Float32
+end
+
+struct Vec4f32
+  x::Float32
+  y::Float32
+  z::Float32
+  w::Float32
+end
+
+struct Atom
+  x::Float32
+  y::Float32
+  z::Float32
+  type::Int32
+end
+
+function read(io::IO, ::Type{Atom})
+  Atom(read(io, Float32), read(io, Float32), read(io, Float32), read(io, Int32))
+end
+
+struct FFParams
+  hbtype::Int32
+  radius::Float32
+  hphb::Float32
+  elsc::Float32
+end
+
+function read(io::IO, ::Type{FFParams})
+  FFParams(read(io, Int32), read(io, Float32), read(io, Float32), read(io, Float32))
+end
+
+@with_kw mutable struct Params
+  device::String = "1"
+  list::Bool = false
+  numposes::UInt = DefaultNPoses
+  iterations::UInt = DefaultInterations
+  wgsize::UInt = DefaultWGSize
+  ppwi::UInt = DefaultPPWI
+  deck::String = "../data/bm1"
+end
+
+struct Deck
+  protein::AbstractArray{Atom}
+  ligand::AbstractArray{Atom}
+  forcefield::AbstractArray{FFParams}
+  poses::AbstractArray{Float32,2}
+end
+
+const DeviceWithRepr = Tuple{Any,String,Any}
+
+function parse_options(given::Params)
+  s = ArgParseSettings()
+  @add_arg_table s begin
+    "--list", "-l"
+    help = "List available devices"
+    action = :store_true
+    "--device", "-d"
+    help = "Select device at DEVICE or the device name substring if DEVICE is not an integer, NOTE: Julia is 1-indexed"
+    arg_type = String
+    default = given.device
+    "--numposes", "-n"
+    help = "Compute energies for N poses"
+    arg_type = Int
+    default = convert(Int, given.numposes)
+    "--iterations", "-i"
+    help = "Repeat kernel I times"
+    arg_type = Int
+    default = convert(Int, given.iterations)
+    "--wgsize", "-w"
+    help = "Run with work-group size WGSIZE"
+    arg_type = Int
+    default = convert(Int, given.wgsize)
+    "--ppwi", "-p"
+    help = "Compute PPWI poses per work-item"
+    arg_type = Int
+    default = convert(Int, given.ppwi)
+    "--deck"
+    help = "Use the DECK directory as input deck"
+    arg_type = String
+    default = given.deck
+  end
+  args = parse_args(s)
+  # surely there's a better way than doing this:
+  for (arg, val) in args
+    setproperty!(given, Symbol(arg), val)
+  end
+end
+
+function read_structs(path::String, ::Type{T})::Vector{T} where {T}
+  io = open(path, "r")
+  size = filesize(io)
+  [read(io, T) for _ = 1:size÷sizeof(T)]
+end
+
+function print_timings(params::Params, deck::Deck, elapsedSeconds::Float64, wgsize::UInt)
+
+  # Average time per iteration
+  averageIterationSeconds = elapsedSeconds / params.iterations
+
+  natlig = length(deck.ligand)
+  natpro = length(deck.protein)
+
+  # Compute FLOP/s
+  ops_per_wg = (wgsize * 27 + natlig * (2 + wgsize * 18 + natpro * (10 + wgsize * 30)) + wgsize)
+
+  total_ops = ops_per_wg * (params.numposes / wgsize)
+  flops = total_ops / averageIterationSeconds
+  gflops = flops / 1.0e9
+
+  total_finsts = 25.0 * natpro * natlig * params.numposes
+  finsts = total_finsts / averageIterationSeconds
+  gfinsts = finsts / 1e9
+
+  interactions = params.numposes * natlig * natpro
+  interactions_per_sec = interactions / averageIterationSeconds
+
+  # Print stats
+  @printf "- Kernel time:    %.03f ms\n" (elapsedSeconds * 1000.0)
+  @printf "- Average time:   %.03f ms\n" (averageIterationSeconds * 1000.0)
+  @printf "- Interactions/s: %.03f billion\n" interactions_per_sec / 1e9
+  @printf "- GFLOP/s:        %.03f\n" gflops
+  @printf "- GFInst/s:       %.03f\n" gfinsts
+end
+
+function main()
+
+  params::Params = Params()
+  parse_options(params)
+
+
+  ds = devices()
+
+  if params.list
+    for (i, (_, name, type)) in enumerate(ds)
+      println("[$i] ($type) $name")
+    end
+    exit(0)
+  end
+
+  deviceIndex = try
+    index = parse(Int, params.device)
+    if index < 1 || index > length(ds)
+      error("Device $(index) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed")
+    end
+    index
+  catch
+    index = findfirst(x -> occursin(params.device, "($(x[3])) $(x[2])"), ds)
+    if index === nothing
+      error("No device match the substring `$(params.device)`, see --list for available devices")
+    end
+    index
+  end
+
+
+  poses = permutedims( # reshape is column order so we flip it back again
+    reshape(
+      read_structs("$(params.deck)/poses.in", Float32),  # read poses as one long array
+      (params.numposes, 6), # reshape it to 6 slices of numposes sized rows
+    ),
+  )
+
+  deck = Deck(
+    read_structs("$(params.deck)/protein.in", Atom),
+    read_structs("$(params.deck)/ligand.in", Atom),
+    read_structs("$(params.deck)/forcefield.in", FFParams),
+    poses,
+  )
+
+  println("Poses     : ", size(deck.poses)[2])
+  println("Iterations: ", params.iterations)
+  println("Ligands   : ", length(deck.ligand))
+  println("Protein   : ", length(deck.protein))
+  println("Forcefield: ", length(deck.forcefield))
+  println("Deck      : ", params.deck)
+  println("WGsize    : ", params.wgsize)
+  println("PPWI      : ", params.ppwi)
+
+  (energies, rumtimeSeconds, ppwi) = run(params, deck, ds[deviceIndex])
+
+  print_timings(params, deck, rumtimeSeconds, ppwi)
+
+  output = open("energies.out", "w+")
+  println("\nEnergies:")
+  for i = 1:size(deck.poses)[2]
+    @printf(output, "%7.2f\n", energies[i])
+    if (i < 16)
+      @printf("%7.2f\n", energies[i])
+    end
+  end
+
+  ref_energies = open("$(params.deck)/ref_energies.out", "r")
+  n_ref_poses = size(deck.poses)[2]
+  if n_ref_poses > RefNPoses
+    println("Only validating the first $(RefNPoses) poses")
+    n_ref_poses = RefNPoses
+  end
+
+  maxdiff = 0.0
+  for i = 1:n_ref_poses
+    line = readline(ref_energies)
+    if line == ""
+      error("ran out of ref energies lines to verify")
+    end
+    e = parse(Float32, line)
+    if (abs(e) < 1.0 && abs(energies[i]) < 1.0)
+      continue
+    end
+    diff = abs(e - energies[i]) / e
+    if (diff > 0.01)
+      println("![$(i)] $(e) $(energies[i]) ~ $(diff)")
+      maxdiff = diff
+    end
+  end
+
+  @printf "Largest difference was %.03f%%.\n\n" 100 * maxdiff # Expect numbers to be accurate to 2 decimal places
+  close(ref_energies)
+
+end
diff --git a/miniBUDE.jl/src/CUDA.jl b/miniBUDE.jl/src/CUDA.jl
new file mode 100644
index 0000000..34a27aa
--- /dev/null
+++ b/miniBUDE.jl/src/CUDA.jl
@@ -0,0 +1,236 @@
+include("BUDE.jl")
+using CUDA, StaticArrays
+
+
+function devices()::Vector{DeviceWithRepr}
+  return !CUDA.functional(false) ? [] :
+         map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", "CUDA.jl"), CUDA.devices())
+end
+
+function run(params::Params, deck::Deck, device::DeviceWithRepr)
+  # show_reason is set to true here so it dumps CUDA info 
+  # for us regardless of whether it's functional
+  if !CUDA.functional(true)
+    error("Non-functional CUDA configuration")
+  end
+
+  # so CUDA's device is 0 indexed, so -1 from Julia
+  CUDA.device!(device[1])
+  CUDA.math_mode!(CUDA.FAST_MATH)
+
+  println("Using CUDA device: $(CUDA.name(device[1])) ($(repr(device[1])))")
+
+  protein = CuArray{Atom}(deck.protein)
+  ligand = CuArray{Atom}(deck.ligand)
+  forcefield = CuArray{FFParams}(deck.forcefield)
+  poses = CuArray{Float32,2}(deck.poses)
+
+  nprotein::Int = length(deck.protein)
+  nligand::Int = length(deck.ligand)
+  nforcefield::Int = length(deck.forcefield)
+  nposes::Int = size(deck.poses)[2]
+
+  etotals = CuArray{Float32}(undef, nposes)
+  nblocks = ceil(UInt, ceil(nposes / params.ppwi) / params.wgsize)
+  nthreads = params.wgsize
+
+  println("Using kernel parameters: <<<$(nblocks),$(nthreads)>>> 1:$nposes")
+
+  shared = false
+  shmem_bytes = shared ? sizeof(FFParams) * nforcefield : 0
+
+  # warmup
+  @cuda blocks = nblocks threads = nthreads shmem = shmem_bytes fasten_main(
+    nprotein,
+    nligand,
+    nforcefield,
+    nposes,
+    protein,
+    ligand,
+    forcefield,
+    poses,
+    etotals,
+    Val(shared),
+    Val(convert(Int, params.ppwi)),
+  )
+  CUDA.synchronize()
+  elapsed = @elapsed begin
+    for _ = 1:params.iterations
+      @cuda blocks = nblocks threads = nthreads shmem = shmem_bytes fasten_main(
+        nprotein,
+        nligand,
+        nforcefield,
+        nposes,
+        protein,
+        ligand,
+        forcefield,
+        poses,
+        etotals,
+        Val(shared),
+        Val(convert(Int, params.ppwi)),
+      )
+    end
+    CUDA.synchronize()
+  end
+
+  (Array{Float32}(etotals), elapsed, params.ppwi)
+end
+
+# const NUM_TD_PER_THREAD = 8
+
+@fastmath function fasten_main(
+  nprotein::Int,
+  nligand::Int,
+  nforcefield::Int,
+  nposes::Int,
+  protein_::CuDeviceVector{Atom},
+  ligand_::CuDeviceVector{Atom},
+  forcefield_::CuDeviceVector{FFParams},
+  poses_::CuDeviceMatrix{Float32},
+  etotals::CuDeviceVector{Float32},
+  ::Val{Shared},
+  ::Val{PPWI},
+) where {Shared,PPWI}
+
+
+  protein = CUDA.Const(protein_)
+  ligand = CUDA.Const(ligand_)
+  global_forcefield = CUDA.Const(forcefield_)
+  poses = CUDA.Const(poses_)
+
+  #  Get index of first TD
+  ix = (blockIdx().x - 1) * blockDim().x * PPWI + threadIdx().x
+
+  #  Have extra threads do the last member intead of return.
+  #  A return would disable use of barriers, so not using return is better
+  ix = ix <= nposes ? ix : nposes - PPWI
+
+  etot = MArray{Tuple{PPWI},Float32}(undef)
+  transform = MArray{Tuple{PPWI,3},Vec4f32}(undef)
+
+
+  if Shared
+    forcefield = @cuDynamicSharedMem(FFParams, (nforcefield))
+    if ix < nforcefield
+      @inbounds forcefield[ix] = global_forcefield[ix]
+    end
+  else
+    forcefield = global_forcefield
+  end
+
+
+
+  lsz = blockDim().x
+  @simd for i = 1:PPWI
+    index = (ix) + (i - 1) * lsz
+    @inbounds sx::Float32 = sin(poses[1, index])
+    @inbounds cx::Float32 = cos(poses[1, index])
+    @inbounds sy::Float32 = sin(poses[2, index])
+    @inbounds cy::Float32 = cos(poses[2, index])
+    @inbounds sz::Float32 = sin(poses[3, index])
+    @inbounds cz::Float32 = cos(poses[3, index])
+    @inbounds transform[i, 1] = #
+      Vec4f32(cy * cz, sx * sy * cz - cx * sz, cx * sy * cz + sx * sz, poses[4, index])
+    @inbounds transform[i, 2] = #
+      Vec4f32(cy * sz, sx * sy * sz + cx * cz, cx * sy * sz - sx * cz, poses[5, index])
+    @inbounds transform[i, 3] = #
+      Vec4f32(-sy, sx * cy, cx * cy, poses[6, index])
+    @inbounds etot[i] = 0
+  end
+
+  if Shared
+    CUDA.sync_threads()
+  end
+
+  for il = 1:nligand
+
+    @inbounds l_atom::Atom = ligand[il]
+    @inbounds l_params::FFParams = forcefield[l_atom.type+1]
+    lhphb_ltz::Bool = l_params.hphb < Zero
+    lhphb_gtz::Bool = l_params.hphb > Zero
+
+    lpos = MArray{Tuple{PPWI},Vec3f32}(undef)
+
+    @simd for i = 1:PPWI
+      # Transform ligand atom
+      @inbounds lpos[i] = Vec3f32(
+        transform[i, 1].w +
+        l_atom.x * transform[i, 1].x +
+        l_atom.y * transform[i, 1].y +
+        l_atom.z * transform[i, 1].z,
+        transform[i, 2].w +
+        l_atom.x * transform[i, 2].x +
+        l_atom.y * transform[i, 2].y +
+        l_atom.z * transform[i, 2].z,
+        transform[i, 3].w +
+        l_atom.x * transform[i, 3].x +
+        l_atom.y * transform[i, 3].y +
+        l_atom.z * transform[i, 3].z,
+      )
+    end
+
+    for ip = 1:nprotein
+      @inbounds p_atom = protein[ip]
+      @inbounds p_params::FFParams = forcefield[p_atom.type+1]
+
+      radij::Float32 = p_params.radius + l_params.radius
+      r_radij::Float32 = One / radij
+
+      elcdst::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Four : Two
+      elcdst1::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Quarter : Half
+      type_E::Bool = ((p_params.hbtype == HbtypeE || l_params.hbtype == HbtypeE))
+
+      phphb_ltz::Bool = p_params.hphb < Zero
+      phphb_gtz::Bool = p_params.hphb > Zero
+      phphb_nz::Bool = p_params.hphb != Zero
+      p_hphb::Float32 = p_params.hphb * (phphb_ltz && lhphb_gtz ? -One : One)
+      l_hphb::Float32 = l_params.hphb * (phphb_gtz && lhphb_ltz ? -One : One)
+      distdslv::Float32 =
+        (phphb_ltz ? (lhphb_ltz ? Npnpdist : Nppdist) : (lhphb_ltz ? Nppdist : -Float32Max))
+      r_distdslv::Float32 = One / distdslv
+
+      chrg_init::Float32 = l_params.elsc * p_params.elsc
+      dslv_init::Float32 = p_hphb + l_hphb
+
+
+      @simd for i = 1:PPWI
+        @inbounds x::Float32 = lpos[i].x - p_atom.x
+        @inbounds y::Float32 = lpos[i].y - p_atom.y
+        @inbounds z::Float32 = lpos[i].z - p_atom.z
+
+        distij::Float32 = sqrt(x * x + y * y + z * z)
+
+        # Calculate the sum of the sphere radii
+        distbb::Float32 = distij - radij
+        zone1::Bool = (distbb < Zero)
+
+        # Calculate steric energy
+        @inbounds etot[i] += (One - (distij * r_radij)) * (zone1 ? Two * Hardness : Zero)
+
+        # Calculate formal and dipole charge interactions
+        chrg_e::Float32 =
+          chrg_init * ((zone1 ? One : (One - distbb * elcdst1)) * (distbb < elcdst ? One : Zero))
+        neg_chrg_e::Float32 = -abs(chrg_e)
+        chrg_e = type_E ? neg_chrg_e : chrg_e
+        @inbounds etot[i] += chrg_e * Cnstnt
+
+        # Calculate the two cases for Nonpolar-Polar repulsive interactions
+        coeff::Float32 = (One - (distbb * r_distdslv))
+        dslv_e::Float32 = dslv_init * ((distbb < distdslv && phphb_nz) ? One : Zero)
+        dslv_e *= (zone1 ? One : coeff)
+        @inbounds etot[i] += dslv_e
+      end
+    end
+  end
+
+  td_base = (blockIdx().x - 1) * blockDim().x * PPWI + threadIdx().x
+  if td_base <= nposes
+    @simd for i = 1:PPWI
+      @inbounds etotals[td_base+(i-1)*blockDim().x] = etot[i] * Half
+    end
+  end
+
+  nothing
+end
+
+main()
diff --git a/miniBUDE.jl/src/KernelAbstractions.jl b/miniBUDE.jl/src/KernelAbstractions.jl
new file mode 100644
index 0000000..190cb19
--- /dev/null
+++ b/miniBUDE.jl/src/KernelAbstractions.jl
@@ -0,0 +1,250 @@
+include("BUDE.jl")
+using ROCKernels, CUDAKernels, KernelAbstractions, StaticArrays, CUDA, AMDGPU
+
+@enum Backend cuda rocm cpu
+
+
+function list_rocm_devices()::Vector{DeviceWithRepr}
+  try
+    # AMDGPU.agents()'s internal iteration order isn't stable
+    sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
+    map(x -> (x, repr(x), rocm), sorted)
+  catch
+    # probably unsupported
+    []
+  end
+end
+
+function list_cuda_devices()::Vector{DeviceWithRepr}
+  return !CUDA.functional(false) ? [] :
+         map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices())
+end
+
+function devices()::Vector{DeviceWithRepr}
+  cudas = list_cuda_devices()
+  rocms = list_rocm_devices()
+  cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)]
+  vcat(cpus, cudas, rocms)
+end
+
+function run(params::Params, deck::Deck, device::DeviceWithRepr)
+
+
+  nposes::Int64 = size(deck.poses)[2]
+  blocks_size::Int64 = ceil(Int64, nposes / params.ppwi)
+  nthreads::Int64 = params.wgsize
+
+  (selected, _, backend) = device
+  println("KernelAbstractions Backend: $backend")
+
+  if backend == cpu
+    println("Using CPU with max $(Threads.nthreads()) threads")
+    protein = deck.protein
+    ligand = deck.ligand
+    forcefield = deck.forcefield
+    poses = deck.poses
+    etotals = Array{Float32}(undef, nposes)
+    backend_impl = CPU()
+  elseif backend == cuda
+    CUDA.device!(selected)
+    if CUDA.device() != selected
+      error("Cannot select CUDA device, expecting $selected, but got $(CUDA.device())")
+    end
+    if !CUDA.functional(true)
+      error("Non-functional CUDA configuration")
+    end
+    println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))")
+
+    protein = CuArray{Atom}(deck.protein)
+    ligand = CuArray{Atom}(deck.ligand)
+    forcefield = CuArray{FFParams}(deck.forcefield)
+    poses = CuArray{Float32,2}(deck.poses)
+    etotals = CuArray{Float32}(undef, nposes)
+    backend_impl = CUDADevice()
+  elseif backend == rocm
+    AMDGPU.DEFAULT_AGENT[] = selected
+    if AMDGPU.get_default_agent() != selected
+      error("Cannot select HSA device, expecting $selected, but got $(AMDGPU.get_default_agent())")
+    end
+    println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))")
+
+    protein = ROCArray{Atom}(deck.protein)
+    ligand = ROCArray{Atom}(deck.ligand)
+    forcefield = ROCArray{FFParams}(deck.forcefield)
+    poses = ROCArray{Float32,2}(deck.poses)
+    etotals = ROCArray{Float32}(undef, nposes)
+    backend_impl = ROCDevice()
+  else
+    error("unsupported backend $(backend)")
+  end
+
+  println("Using kernel parameters: <<<$blocks_size,$nthreads>>> 1:$nposes")
+
+
+  kernel! = fasten_main(backend_impl, nthreads, blocks_size)
+
+  wait(
+    kernel!(
+      nthreads,
+      nposes,
+      protein,
+      ligand,
+      forcefield,
+      poses,
+      etotals,
+      Val(convert(Int, params.ppwi)),
+      ndrange = blocks_size,
+    ),
+  )
+
+  elapsed = @elapsed for i = 1:params.iterations
+    wait(
+      kernel!(
+        nthreads,
+        nposes,
+        protein,
+        ligand,
+        forcefield,
+        poses,
+        etotals,
+        Val(convert(Int, params.ppwi)),
+        ndrange = blocks_size,
+      ),
+    )
+  end
+
+  (Array{Float32}(etotals), elapsed, params.ppwi)
+end
+
+@fastmath @kernel function fasten_main(
+  wgsize::Int64,
+  nposes::Int64,
+  @Const(protein),
+  @Const(ligand),
+  @Const(forcefield),
+  @Const(poses),
+  etotals,
+  ::Val{PPWI},
+) where {PPWI}
+
+  gid = @index(Group)
+  lid = @index(Local)
+
+  #  Get index of first TD
+  ix = (gid - 1) * wgsize * PPWI + lid
+
+  #  Have extra threads do the last member intead of return.
+  #  A return would disable use of barriers, so not using return is better
+  ix = ix <= nposes ? ix : nposes - PPWI
+
+  etot = MArray{Tuple{PPWI},Float32}(undef)
+  transform = MArray{Tuple{PPWI,3},Vec4f32}(undef)
+
+  lsz = wgsize
+  for i = 1:PPWI
+    index = (ix) + (i - 1) * lsz
+    @inbounds sx::Float32 = sin(poses[1, index])
+    @inbounds cx::Float32 = cos(poses[1, index])
+    @inbounds sy::Float32 = sin(poses[2, index])
+    @inbounds cy::Float32 = cos(poses[2, index])
+    @inbounds sz::Float32 = sin(poses[3, index])
+    @inbounds cz::Float32 = cos(poses[3, index])
+    @inbounds transform[i, 1] = #
+      Vec4f32(cy * cz, sx * sy * cz - cx * sz, cx * sy * cz + sx * sz, poses[4, index])
+    @inbounds transform[i, 2] = #
+      Vec4f32(cy * sz, sx * sy * sz + cx * cz, cx * sy * sz - sx * cz, poses[5, index])
+    @inbounds transform[i, 3] = #
+      Vec4f32(-sy, sx * cy, cx * cy, poses[6, index])
+    etot[i] = 0
+  end
+
+  @inbounds for l_atom::Atom in ligand
+
+    l_params::FFParams = forcefield[l_atom.type+1]
+    lhphb_ltz::Bool = l_params.hphb < Zero
+    lhphb_gtz::Bool = l_params.hphb > Zero
+
+    lpos = MArray{Tuple{PPWI},Vec3f32}(undef)
+
+    @simd for i = 1:PPWI
+      # Transform ligand atom
+      @inbounds lpos[i] = Vec3f32(
+        transform[i, 1].w +
+        l_atom.x * transform[i, 1].x +
+        l_atom.y * transform[i, 1].y +
+        l_atom.z * transform[i, 1].z,
+        transform[i, 2].w +
+        l_atom.x * transform[i, 2].x +
+        l_atom.y * transform[i, 2].y +
+        l_atom.z * transform[i, 2].z,
+        transform[i, 3].w +
+        l_atom.x * transform[i, 3].x +
+        l_atom.y * transform[i, 3].y +
+        l_atom.z * transform[i, 3].z,
+      )
+    end
+
+    @inbounds for p_atom::Atom in protein
+      @inbounds p_params::FFParams = forcefield[p_atom.type+1]
+
+      radij::Float32 = p_params.radius + l_params.radius
+      r_radij::Float32 = One / radij
+
+      elcdst::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Four : Two
+      elcdst1::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Quarter : Half
+      type_E::Bool = ((p_params.hbtype == HbtypeE || l_params.hbtype == HbtypeE))
+
+      phphb_ltz::Bool = p_params.hphb < Zero
+      phphb_gtz::Bool = p_params.hphb > Zero
+      phphb_nz::Bool = p_params.hphb != Zero
+      p_hphb::Float32 = p_params.hphb * (phphb_ltz && lhphb_gtz ? -One : One)
+      l_hphb::Float32 = l_params.hphb * (phphb_gtz && lhphb_ltz ? -One : One)
+      distdslv::Float32 =
+        (phphb_ltz ? (lhphb_ltz ? Npnpdist : Nppdist) : (lhphb_ltz ? Nppdist : -Float32Max))
+      r_distdslv::Float32 = One / distdslv
+
+      chrg_init::Float32 = l_params.elsc * p_params.elsc
+      dslv_init::Float32 = p_hphb + l_hphb
+
+
+      @simd for i = 1:PPWI
+        @inbounds x::Float32 = lpos[i].x - p_atom.x
+        @inbounds y::Float32 = lpos[i].y - p_atom.y
+        @inbounds z::Float32 = lpos[i].z - p_atom.z
+
+        distij::Float32 = sqrt(x * x + y * y + z * z)
+
+        # Calculate the sum of the sphere radii
+        distbb::Float32 = distij - radij
+        zone1::Bool = (distbb < Zero)
+
+        # Calculate steric energy
+        @inbounds etot[i] += (One - (distij * r_radij)) * (zone1 ? Two * Hardness : Zero)
+
+        # Calculate formal and dipole charge interactions
+        chrg_e::Float32 =
+          chrg_init * ((zone1 ? One : (One - distbb * elcdst1)) * (distbb < elcdst ? One : Zero))
+        neg_chrg_e::Float32 = -abs(chrg_e)
+        chrg_e = type_E ? neg_chrg_e : chrg_e
+        @inbounds etot[i] += chrg_e * Cnstnt
+
+        # Calculate the two cases for Nonpolar-Polar repulsive interactions
+        coeff::Float32 = (One - (distbb * r_distdslv))
+        dslv_e::Float32 = dslv_init * ((distbb < distdslv && phphb_nz) ? One : Zero)
+        dslv_e *= (zone1 ? One : coeff)
+        @inbounds etot[i] += dslv_e
+      end
+    end
+  end
+
+  td_base = (gid - 1) * wgsize * PPWI + lid
+  if td_base <= nposes
+    @simd for i = 1:PPWI
+      @inbounds etotals[td_base+(i-1)*wgsize] = etot[i] * Half
+    end
+  end
+
+  nothing
+end
+
+main()
diff --git a/miniBUDE.jl/src/Threaded.jl b/miniBUDE.jl/src/Threaded.jl
new file mode 100644
index 0000000..7393393
--- /dev/null
+++ b/miniBUDE.jl/src/Threaded.jl
@@ -0,0 +1,180 @@
+include("BUDE.jl")
+using StaticArrays
+
+const Device = (undef, "CPU", "Threaded")
+
+function devices()
+  return [Device]
+end
+
+const Float32Max = typemax(Float32)
+
+function run(params::Params, deck::Deck, _::DeviceWithRepr)
+  println("Using max $(Threads.nthreads()) threads")
+  if params.ppwi != DefaultPPWI
+    @warn "Threaded implementation only uses wgsize, the PPWI argument is ignored"
+  end
+
+  poses = size(deck.poses)[2]
+  etotals = Array{Float32}(undef, poses)
+
+  # warmup
+  fasten_main(
+    Val(convert(Int, params.wgsize)),
+    deck.protein,
+    deck.ligand,
+    deck.forcefield,
+    deck.poses,
+    etotals,
+  )
+
+  elapsed = @elapsed for _ = 1:params.iterations
+    fasten_main(
+      Val(convert(Int, params.wgsize)),
+      deck.protein,
+      deck.ligand,
+      deck.forcefield,
+      deck.poses,
+      etotals,
+    )
+  end
+
+
+  (etotals, elapsed, params.wgsize)
+end
+
+@fastmath function fasten_main(
+  ::Val{WGSIZE},
+  protein::AbstractArray{Atom},
+  ligand::AbstractArray{Atom},
+  forcefield::AbstractArray{FFParams},
+  poses::AbstractArray{Float32,2},
+  etotals::AbstractArray{Float32},
+) where {WGSIZE}
+  nposes::Int = size(poses)[2]
+  numGroups::Int = nposes ÷ WGSIZE
+  nligand::Int = length(ligand)
+  nprotein::Int = length(protein)
+
+  Threads.@threads for group = 1:numGroups
+
+    etot = MArray{Tuple{WGSIZE},Float32}(undef)
+    transform = MArray{Tuple{3,4,WGSIZE},Float32}(undef)
+
+    @simd for i = 1:WGSIZE
+      ix = (group - 1) * (WGSIZE) + i
+      @inbounds sx::Float32 = sin(poses[1, ix])
+      @inbounds cx::Float32 = cos(poses[1, ix])
+      @inbounds sy::Float32 = sin(poses[2, ix])
+      @inbounds cy::Float32 = cos(poses[2, ix])
+      @inbounds sz::Float32 = sin(poses[3, ix])
+      @inbounds cz::Float32 = cos(poses[3, ix])
+      @inbounds transform[1, 1, i] = cy * cz
+      @inbounds transform[1, 2, i] = sx * sy * cz - cx * sz
+      @inbounds transform[1, 3, i] = cx * sy * cz + sx * sz
+      @inbounds transform[1, 4, i] = poses[4, ix]
+      @inbounds transform[2, 1, i] = cy * sz
+      @inbounds transform[2, 2, i] = sx * sy * sz + cx * cz
+      @inbounds transform[2, 3, i] = cx * sy * sz - sx * cz
+      @inbounds transform[2, 4, i] = poses[5, ix]
+      @inbounds transform[3, 1, i] = -sy
+      @inbounds transform[3, 2, i] = sx * cy
+      @inbounds transform[3, 3, i] = cx * cy
+      @inbounds transform[3, 4, i] = poses[6, ix]
+      @inbounds etot[i] = Zero
+    end
+
+    for il = 1:nligand
+      @inbounds l_atom::Atom = ligand[il]
+
+      @inbounds l_params::FFParams = forcefield[l_atom.type+1]
+      lhphb_ltz::Bool = l_params.hphb < Zero
+      lhphb_gtz::Bool = l_params.hphb > Zero
+
+
+      lpos_x = MArray{Tuple{WGSIZE},Float32}(undef)
+      lpos_y = MArray{Tuple{WGSIZE},Float32}(undef)
+      lpos_z = MArray{Tuple{WGSIZE},Float32}(undef)
+
+      @simd for i = 1:WGSIZE
+        @inbounds lpos_x[i] = (
+          transform[1, 4, i] +
+          l_atom.x * transform[1, 1, i] +
+          l_atom.y * transform[1, 2, i] +
+          l_atom.z * transform[1, 3, i]
+        )
+        @inbounds lpos_y[i] = (
+          transform[2, 4, i] +
+          l_atom.x * transform[2, 1, i] +
+          l_atom.y * transform[2, 2, i] +
+          l_atom.z * transform[2, 3, i]
+        )
+        @inbounds lpos_z[i] = (
+          transform[3, 4, i] +
+          l_atom.x * transform[3, 1, i] +
+          l_atom.y * transform[3, 2, i] +
+          l_atom.z * transform[3, 3, i]
+        )
+      end
+
+      for ip = 1:nprotein
+        @inbounds p_atom = protein[ip]
+        @inbounds p_params::FFParams = forcefield[p_atom.type+1]
+
+        radij::Float32 = p_params.radius + l_params.radius
+        r_radij::Float32 = One / radij
+
+        elcdst::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Four : Two
+        elcdst1::Float32 =
+          (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Quarter : Half
+        type_E::Bool = ((p_params.hbtype == HbtypeE || l_params.hbtype == HbtypeE))
+
+        phphb_ltz::Bool = p_params.hphb < Zero
+        phphb_gtz::Bool = p_params.hphb > Zero
+        phphb_nz::Bool = p_params.hphb != Zero
+        p_hphb::Float32 = p_params.hphb * (phphb_ltz && lhphb_gtz ? -One : One)
+        l_hphb::Float32 = l_params.hphb * (phphb_gtz && lhphb_ltz ? -One : One)
+        distdslv::Float32 =
+          (phphb_ltz ? (lhphb_ltz ? Npnpdist : Nppdist) : (lhphb_ltz ? Nppdist : -Float32Max))
+        r_distdslv::Float32 = One / distdslv
+
+        chrg_init::Float32 = l_params.elsc * p_params.elsc
+        dslv_init::Float32 = p_hphb + l_hphb
+
+        @simd for i = 1:WGSIZE
+          @inbounds x::Float32 = lpos_x[i] - p_atom.x
+          @inbounds y::Float32 = lpos_y[i] - p_atom.y
+          @inbounds z::Float32 = lpos_z[i] - p_atom.z
+
+          distij::Float32 = sqrt(x * x + y * y + z * z)
+
+          # Calculate the sum of the sphere radii
+          distbb::Float32 = distij - radij
+          zone1::Bool = (distbb < Zero)
+
+          # Calculate steric energy
+          @inbounds etot[i] += (One - (distij * r_radij)) * (zone1 ? Two * Hardness : Zero)
+
+          # Calculate formal and dipole charge interactions
+          chrg_e::Float32 =
+            chrg_init * ((zone1 ? One : (One - distbb * elcdst1)) * (distbb < elcdst ? One : Zero))
+          neg_chrg_e::Float32 = -abs(chrg_e)
+          chrg_e = type_E ? neg_chrg_e : chrg_e
+          @inbounds etot[i] += chrg_e * Cnstnt
+
+          # Calculate the two cases for Nonpolar-Polar repulsive interactions
+          coeff::Float32 = (One - (distbb * r_distdslv))
+          dslv_e::Float32 = dslv_init * ((distbb < distdslv && phphb_nz) ? One : Zero)
+          dslv_e *= (zone1 ? One : coeff)
+          @inbounds etot[i] += dslv_e
+        end
+      end
+    end
+    @simd for i = 1:WGSIZE
+      ix = (group - 1) * WGSIZE + i
+      @inbounds etotals[ix] = etot[i] * Half
+    end
+  end
+end
+
+main()
diff --git a/miniBUDE.jl/src/miniBUDE.jl b/miniBUDE.jl/src/miniBUDE.jl
new file mode 100644
index 0000000..8326527
--- /dev/null
+++ b/miniBUDE.jl/src/miniBUDE.jl
@@ -0,0 +1,3 @@
+module JuliaStream end
+
+println("Please run benchmarks directly via `julia --project src/<IMPL>.jl`")
diff --git a/miniBUDE.jl/src/oneAPI.jl b/miniBUDE.jl/src/oneAPI.jl
new file mode 100644
index 0000000..a7f963d
--- /dev/null
+++ b/miniBUDE.jl/src/oneAPI.jl
@@ -0,0 +1,201 @@
+include("BUDE.jl")
+using oneAPI, StaticArrays
+
+function devices()::Vector{DeviceWithRepr}
+  all = map(oneL0.devices, oneL0.drivers()) |> Iterators.flatten |> Iterators.collect
+  map(dev -> (dev, repr("text/plain", dev), "oneAPi.jl"), all)
+end
+
+
+function run(params::Params, deck::Deck, device::DeviceWithRepr)
+
+  oneAPI.allowscalar(false)
+  oneAPI.device!(device[1])
+
+  println("Using L0 device: $(repr("text/plain",device[1]))")
+
+  protein = oneArray{Atom}(deck.protein)
+  ligand = oneArray{Atom}(deck.ligand)
+  forcefield = oneArray{FFParams}(deck.forcefield)
+  poses = oneArray{Float32,2}(deck.poses)
+
+  nprotein::Int = length(deck.protein)
+  nligand::Int = length(deck.ligand)
+  nforcefield::Int = length(deck.forcefield)
+  nposes::Int = size(deck.poses)[2]
+
+  etotals = oneArray{Float32}(undef, nposes)
+  blocks_size = ceil(UInt, nposes / params.ppwi)
+  nthreads = params.wgsize
+
+  println("Using kernel parameters: <<<$(blocks_size),$(nthreads)>>> 1:$nposes")
+
+  # warmup
+  @oneapi items = nthreads groups = blocks_size ÷ nthreads fasten_main(
+    nprotein,
+    nligand,
+    nforcefield,
+    nposes,
+    protein,
+    ligand,
+    forcefield,
+    poses,
+    etotals,
+    Val(convert(Int, params.ppwi)),
+  )
+  oneAPI.synchronize()
+
+  elapsed = @elapsed begin
+    for _ = 1:params.iterations
+      @oneapi items = nthreads groups = blocks_size ÷ nthreads fasten_main(
+        nprotein,
+        nligand,
+        nforcefield,
+        nposes,
+        protein,
+        ligand,
+        forcefield,
+        poses,
+        etotals,
+        Val(convert(Int, params.ppwi)),
+      )
+      oneAPI.synchronize()
+    end
+  end
+
+  (Array{Float32}(etotals), elapsed, params.ppwi)
+end
+
+@fastmath function fasten_main(
+  nprotein::Int,
+  nligand::Int,
+  nforcefield::Int,
+  nposes::Int,
+  protein,
+  ligand,
+  forcefield,
+  poses,
+  etotals,
+  ::Val{PPWI},
+) where {PPWI}
+
+  #  Get index of first TD
+  ix = (get_group_id() - 1) * get_local_size() * PPWI + get_local_id()
+
+  #  Have extra threads do the last member intead of return.
+  #  A return would disable use of barriers, so not using return is better
+  ix = ix <= nposes ? ix : nposes - PPWI
+
+  etot = MArray{Tuple{PPWI},Float32}(undef)
+  transform = MArray{Tuple{PPWI,3},Vec4f32}(undef)
+
+  lsz = get_local_size()
+  @simd for i = 1:PPWI
+    index = (ix) + (i - 1) * lsz
+    @inbounds sx::Float32 = sin(poses[1, index])
+    @inbounds cx::Float32 = cos(poses[1, index])
+    @inbounds sy::Float32 = sin(poses[2, index])
+    @inbounds cy::Float32 = cos(poses[2, index])
+    @inbounds sz::Float32 = sin(poses[3, index])
+    @inbounds cz::Float32 = cos(poses[3, index])
+    @inbounds transform[i, 1] = #
+      Vec4f32(cy * cz, sx * sy * cz - cx * sz, cx * sy * cz + sx * sz, poses[4, index])
+    @inbounds transform[i, 2] = #
+      Vec4f32(cy * sz, sx * sy * sz + cx * cz, cx * sy * sz - sx * cz, poses[5, index])
+    @inbounds transform[i, 3] = #
+      Vec4f32(-sy, sx * cy, cx * cy, poses[6, index])
+    etot[i] = 0
+  end
+
+  for il = 1:nligand
+
+    @inbounds l_atom::Atom = ligand[il]
+    @inbounds l_params::FFParams = forcefield[l_atom.type+1]
+    lhphb_ltz::Bool = l_params.hphb < Zero
+    lhphb_gtz::Bool = l_params.hphb > Zero
+
+    lpos = MArray{Tuple{PPWI},Vec3f32}(undef)
+
+    @simd for i = 1:PPWI
+      # Transform ligand atom
+      @inbounds lpos[i] = Vec3f32(
+        transform[i, 1].w +
+        l_atom.x * transform[i, 1].x +
+        l_atom.y * transform[i, 1].y +
+        l_atom.z * transform[i, 1].z,
+        transform[i, 2].w +
+        l_atom.x * transform[i, 2].x +
+        l_atom.y * transform[i, 2].y +
+        l_atom.z * transform[i, 2].z,
+        transform[i, 3].w +
+        l_atom.x * transform[i, 3].x +
+        l_atom.y * transform[i, 3].y +
+        l_atom.z * transform[i, 3].z,
+      )
+    end
+
+    for ip = 1:nprotein
+      @inbounds p_atom = protein[ip]
+      @inbounds p_params::FFParams = forcefield[p_atom.type+1]
+
+      radij::Float32 = p_params.radius + l_params.radius
+      r_radij::Float32 = One / radij
+
+      elcdst::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Four : Two
+      elcdst1::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Quarter : Half
+      type_E::Bool = ((p_params.hbtype == HbtypeE || l_params.hbtype == HbtypeE))
+
+      phphb_ltz::Bool = p_params.hphb < Zero
+      phphb_gtz::Bool = p_params.hphb > Zero
+      phphb_nz::Bool = p_params.hphb != Zero
+      p_hphb::Float32 = p_params.hphb * (phphb_ltz && lhphb_gtz ? -One : One)
+      l_hphb::Float32 = l_params.hphb * (phphb_gtz && lhphb_ltz ? -One : One)
+      distdslv::Float32 =
+        (phphb_ltz ? (lhphb_ltz ? Npnpdist : Nppdist) : (lhphb_ltz ? Nppdist : -Float32Max))
+      r_distdslv::Float32 = One / distdslv
+
+      chrg_init::Float32 = l_params.elsc * p_params.elsc
+      dslv_init::Float32 = p_hphb + l_hphb
+
+
+      @simd for i = 1:PPWI
+        @inbounds x::Float32 = lpos[i].x - p_atom.x
+        @inbounds y::Float32 = lpos[i].y - p_atom.y
+        @inbounds z::Float32 = lpos[i].z - p_atom.z
+
+        distij::Float32 = sqrt(x * x + y * y + z * z)
+
+        # Calculate the sum of the sphere radii
+        distbb::Float32 = distij - radij
+        zone1::Bool = (distbb < Zero)
+
+        # Calculate steric energy
+        @inbounds etot[i] += (One - (distij * r_radij)) * (zone1 ? Two * Hardness : Zero)
+
+        # Calculate formal and dipole charge interactions
+        chrg_e::Float32 =
+          chrg_init * ((zone1 ? One : (One - distbb * elcdst1)) * (distbb < elcdst ? One : Zero))
+        neg_chrg_e::Float32 = -abs(chrg_e)
+        chrg_e = type_E ? neg_chrg_e : chrg_e
+        @inbounds etot[i] += chrg_e * Cnstnt
+
+        # Calculate the two cases for Nonpolar-Polar repulsive interactions
+        coeff::Float32 = (One - (distbb * r_distdslv))
+        dslv_e::Float32 = dslv_init * ((distbb < distdslv && phphb_nz) ? One : Zero)
+        dslv_e *= (zone1 ? One : coeff)
+        @inbounds etot[i] += dslv_e
+      end
+    end
+  end
+
+  td_base = (get_group_id() - 1) * get_local_size() * PPWI + get_local_id()
+  if td_base <= nposes
+    @simd for i = 1:PPWI
+      @inbounds etotals[td_base+(i-1)*get_local_size()] = etot[i] * Half
+    end
+  end
+
+  nothing
+end
+
+main()