From ad90d0426b1447f953b1ece5c013e97986959483 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 4 Aug 2021 14:11:54 +0100 Subject: [PATCH] julia: initial Julia implementation --- README.md | 8 +- miniBUDE.jl/.JuliaFormatter.toml | 2 + miniBUDE.jl/Manifest.toml | 489 ++++++++++++++++++++++++++ miniBUDE.jl/Project.toml | 19 + miniBUDE.jl/README.md | 29 ++ miniBUDE.jl/src/AMDGPU.jl | 216 ++++++++++++ miniBUDE.jl/src/BUDE.jl | 250 +++++++++++++ miniBUDE.jl/src/CUDA.jl | 236 +++++++++++++ miniBUDE.jl/src/KernelAbstractions.jl | 250 +++++++++++++ miniBUDE.jl/src/Threaded.jl | 180 ++++++++++ miniBUDE.jl/src/miniBUDE.jl | 3 + miniBUDE.jl/src/oneAPI.jl | 201 +++++++++++ 12 files changed, 1881 insertions(+), 2 deletions(-) create mode 100644 miniBUDE.jl/.JuliaFormatter.toml create mode 100644 miniBUDE.jl/Manifest.toml create mode 100644 miniBUDE.jl/Project.toml create mode 100644 miniBUDE.jl/README.md create mode 100644 miniBUDE.jl/src/AMDGPU.jl create mode 100644 miniBUDE.jl/src/BUDE.jl create mode 100644 miniBUDE.jl/src/CUDA.jl create mode 100644 miniBUDE.jl/src/KernelAbstractions.jl create mode 100644 miniBUDE.jl/src/Threaded.jl create mode 100644 miniBUDE.jl/src/miniBUDE.jl create mode 100644 miniBUDE.jl/src/oneAPI.jl diff --git a/README.md b/README.md index f2c67f3..d08eb26 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Increasing the iteration count has similar performance effects to docking multip The top-level `data` directory contains the input common to implementations. The top-level `makedeck` directory contains an input deck generation program and a set of mol2/bhff input files. -Each other subdirectory contains a separate implementation: +Each other subdirectory contains a separate C/C++ implementation: - [OpenMP](openmp/) for CPUs - [OpenMP target](openmp-target/) for GPUs @@ -18,6 +18,11 @@ Each other subdirectory contains a separate implementation: - [SYCL](sycl/) for CPUs and GPUs - [Kokkos](kokkos/) for CPUs and GPUs +We also include implementations in emerging programming languages as direct ports of miniBUDE: + +- [Julia](miniBUDE.jl) for CPUs (@threads) and GPUs ([CUDA.jl](https://juliagpu.gitlab.io/CUDA.jl/), [AMDGPU.jl](https://amdgpu.juliagpu.org/stable/), [oneAPI.jl](https://github.com/JuliaGPU/oneAPI.jl), etc) + + ## Building To build with the default options, type `make` in an implementation directory. @@ -30,7 +35,6 @@ Refer to each implementation's README for further build instructions. To run with the default options, run the binary without any flags. To adjust the run time, use `-i` to set the number of iterations. For very short runs, e.g. for simulation, use `-n 1024` to reduce the number of poses. -The maximum number of poses supported is `65536`. Refer to each implementation's README for further run instructions. diff --git a/miniBUDE.jl/.JuliaFormatter.toml b/miniBUDE.jl/.JuliaFormatter.toml new file mode 100644 index 0000000..ac95ddd --- /dev/null +++ b/miniBUDE.jl/.JuliaFormatter.toml @@ -0,0 +1,2 @@ +indent = 2 +margin = 100 \ No newline at end of file diff --git a/miniBUDE.jl/Manifest.toml b/miniBUDE.jl/Manifest.toml new file mode 100644 index 0000000..e2ff0bc --- /dev/null +++ b/miniBUDE.jl/Manifest.toml @@ -0,0 +1,489 @@ +# This file is machine-generated - editing it directly is not advised + +[[AMDGPU]] +deps = ["AbstractFFTs", "Adapt", "BinaryProvider", "CEnum", "GPUArrays", "GPUCompiler", "LLVM", "Libdl", "LinearAlgebra", "MacroTools", "Printf", "Random", "Requires", "Setfield", "hsa_rocr_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "04fdb3923ac6f55fa7347dce0f0f6f10e321e2e9" +uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" +version = "0.2.7" + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "485ee0867925449198280d4af84bdb46a2a404d0" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.0.1" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "84918055d15b3114ede17ac6a7182f68870c16f7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.3.1" + +[[ArgParse]] +deps = ["Logging", "TextWrap"] +git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" +uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +version = "1.1.4" + +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[BFloat16s]] +deps = ["LinearAlgebra", "Test"] +git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.1.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.10" + +[[Bzip2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.8+0" + +[[CEnum]] +git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.1" + +[[CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] +git-tree-sha1 = "364179416eabc34c9ca32126a6bdb431680c3bad" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "3.2.1" + +[[CUDAKernels]] +deps = ["Adapt", "CUDA", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "81f76297b63c67723b1d60f5e7e002ae3393974b" +uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +version = "0.3.0" + +[[Cassette]] +git-tree-sha1 = "087e76b8d48c014112ba890892c33be42ad10504" +uuid = "7057c7e9-c182-5462-911a-8362d720325c" +version = "0.3.7" + +[[ChainRulesCore]] +deps = ["Compat", "LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "0b0aa9d61456940511416b59a0e902c57b154956" +uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +version = "0.10.12" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "dc7dedc2c2aa9faf59a55c622760a25cbefbe941" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.31.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" + +[[ConstructionBase]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "f74e9d5388b8620b4cee35d4c5a618dd4dc547f4" +uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" +version = "1.3.0" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.9" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.5" + +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + +[[Elfutils_jll]] +deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "XZ_jll", "Zlib_jll", "argp_standalone_jll", "fts_jll", "obstack_jll"] +git-tree-sha1 = "8f9fcde6d89b0a3ca51cb2028beab462705c5436" +uuid = "ab5a07f8-06af-567f-a878-e8bb879eba5a" +version = "0.182.0+0" + +[[ExprTools]] +git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" +pinned = true +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.3" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[GPUArrays]] +deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] +git-tree-sha1 = "df5b8569904c5c10e84c640984cfff054b18c086" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "6.4.1" + +[[GPUCompiler]] +deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "42d635f6d87af125b86288df3819f805fb4d851a" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.11.5" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "642a199af8b68253517b80bd3bfd17eb4e84df6e" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.3.0" + +[[KernelAbstractions]] +deps = ["Adapt", "Cassette", "InteractiveUtils", "MacroTools", "SpecialFunctions", "StaticArrays", "UUIDs"] +git-tree-sha1 = "5e6c70389c1b1e40adb81664ca8cea6ce8127afc" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.7.0" + +[[LLVM]] +deps = ["CEnum", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "f57ac3fd2045b50d3db081663837ac5b4096947e" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "3.9.0" + +[[LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + +[[LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[LogExpFunctions]] +deps = ["DocStringExtensions", "LinearAlgebra"] +git-tree-sha1 = "7bd5f6565d80b6bf753738d2bc40a5dfea072070" +uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +version = "0.2.5" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.6" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + +[[Memoize]] +deps = ["MacroTools"] +git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" +uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" +version = "0.4.4" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NEO_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gmmlib_jll", "libigc_jll", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "c753dd029eb0837658bf8eaee041c19e4ce5bb8c" +uuid = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" +version = "21.12.19358+0" + +[[NUMA_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "778f9bd14400cff2c32ed357e12766ac0e3d766e" +uuid = "7f51dc2b-bb24-59f8-b771-bb1490e4195d" +version = "2.0.13+1" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.5+0" + +[[OrderedCollections]] +git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.4.1" + +[[Parameters]] +deps = ["OrderedCollections", "UnPack"] +git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" +uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" +version = "0.12.2" + +[[Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Preferences]] +deps = ["TOML"] +git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.2.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[ROCKernels]] +deps = ["AMDGPU", "Adapt", "Cassette", "KernelAbstractions", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "41105b861342637dde17797bdd9aaa537aca646b" +uuid = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" +version = "0.2.0" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Random123]] +deps = ["Libdl", "Random", "RandomNumbers"] +git-tree-sha1 = "0e8b146557ad1c6deb1367655e052276690e71a3" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.4.2" + +[[RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "441e6fc35597524ada7f85e13df1f4e10137d16f" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.4.0" + +[[Reexport]] +git-tree-sha1 = "5f6c21241f0f655da3952fd60aa18477cf96c220" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.1.0" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "4036a3bd08ac7e968e27c203d45f5fff15020621" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.3" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[SPIRV_LLVM_Translator_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "8cca87d57f6ddf19373cc9791fddc741406c8fbf" +uuid = "4a5d46fc-d8cf-5151-a261-86b458210efb" +version = "11.0.0+2" + +[[SPIRV_Tools_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "3c09b0e79758ddd4ecf41b9edf449a36ad70ef73" +uuid = "6ac6d60f-d740-5983-97d7-a4482c0689f4" +version = "2020.6.0+0" + +[[Scratch]] +deps = ["Dates"] +git-tree-sha1 = "0b4b7f1393cff97c33891da2a0bf69c6ed241fda" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.1.0" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Setfield]] +deps = ["ConstructionBase", "Future", "MacroTools", "Requires"] +git-tree-sha1 = "d5640fc570fb1b6c54512f0bd3853866bd298b3e" +uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" +version = "0.7.0" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["ChainRulesCore", "LogExpFunctions", "OpenSpecFun_jll"] +git-tree-sha1 = "a50550fa3164a8c46747e62063b4d774ac1bcf49" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.5.1" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "1b9a0f17ee0adde9e538227de093467348992397" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "1.2.7" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + +[[Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TextWrap]] +git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" +uuid = "b718987f-49a8-5099-9789-dcd902bef87d" +version = "1.0.1" + +[[TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "209a8326c4f955e2442c07b56029e88bb48299c7" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.12" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[UnPack]] +git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" +uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +version = "1.0.2" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "9f76853ea2ba894054e24640abfb73d73e5a4cb5" +uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" +version = "5.2.5+0" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[argp_standalone_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "feaf9f6293003c2bf53056fd6930d677ed340b34" +uuid = "c53206cc-00f7-50bf-ad1e-3ae1f6e49bc3" +version = "1.3.1+0" + +[[fts_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "78732b942383d2cb521df8a1a0814911144e663d" +uuid = "d65627f6-89bd-53e8-8ab5-8b75ff535eee" +version = "1.2.7+1" + +[[gmmlib_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4067ef455d4fa67febe26efc3f9565a9bb7ba911" +uuid = "09858cae-167c-5acb-9302-fddc6874d481" +version = "20.3.2+0" + +[[hsa_rocr_jll]] +deps = ["Artifacts", "Elfutils_jll", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg", "Zlib_jll", "hsakmt_roct_jll"] +git-tree-sha1 = "42189f176d6ae4f37c0c0e652fec339bb0bfab5d" +uuid = "dd59ff1a-a01a-568d-8b29-0669330f116a" +version = "3.7.0+1" + +[[hsakmt_roct_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "NUMA_jll", "Pkg"] +git-tree-sha1 = "8a9ee6c091e952e4ea6585d15131d43f789ae041" +uuid = "1cecccd7-a9b6-5045-9cdc-a44c19b16d76" +version = "3.8.0+0" + +[[libigc_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "6140dbf267f7ab57fb791b49f2114374218b5c20" +uuid = "94295238-5935-5bd7-bb0f-b00942e9bdd5" +version = "1.0.6712+0" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" + +[[obstack_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "1c4a6b66e934fc6db4649cb2910c72f53bbfea7e" +uuid = "c88a4935-d25e-5644-aacc-5db6f1b8ef79" +version = "1.2.2+0" + +[[oneAPI]] +deps = ["Adapt", "CEnum", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LinearAlgebra", "NEO_jll", "Printf", "Random", "SPIRV_LLVM_Translator_jll", "SPIRV_Tools_jll", "SpecialFunctions", "oneAPI_Level_Zero_Loader_jll"] +git-tree-sha1 = "b4a4b84c864e75fe885a1643525f0c97ce310dd9" +uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" +version = "0.1.3" + +[[oneAPI_Level_Zero_Headers_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "48982fbfd2f3d0a30d644563dcf96892d252b395" +uuid = "f4bc562b-d309-54f8-9efb-476e56f0410d" +version = "1.1.2+1" + +[[oneAPI_Level_Zero_Loader_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "oneAPI_Level_Zero_Headers_jll"] +git-tree-sha1 = "1fa53dfdd32a732f09c254c86403e1abab653fb2" +uuid = "13eca655-d68d-5b81-8367-6d99d727ab01" +version = "1.3.6+0" + +[[p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" diff --git a/miniBUDE.jl/Project.toml b/miniBUDE.jl/Project.toml new file mode 100644 index 0000000..626cbdd --- /dev/null +++ b/miniBUDE.jl/Project.toml @@ -0,0 +1,19 @@ +name = "miniBUDE" +uuid = "2f7434cc-9e25-426a-818f-4b8ba496e7fb" +authors = ["Wei-Chen Lin "] +version = "0.0.1" + +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" +ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" +ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" + +[compat] +julia = "1.6" diff --git a/miniBUDE.jl/README.md b/miniBUDE.jl/README.md new file mode 100644 index 0000000..5248996 --- /dev/null +++ b/miniBUDE.jl/README.md @@ -0,0 +1,29 @@ +miniBUDE Julia +============== + +This is an implementation of miniBUDE in Julia which contains the following variants: + + * `Threaded.jl` - Threaded implementation with `Threads.@threads` macros + * `CUDA.jl` - Direct port of miniBUDE native CUDA implementation using [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) + * `AMDGPU.jl` - Direct port of miniBUDE's native HIP(via CUDA) implementation using [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) + * `oneAPi.jl` - Direct port of miniBUDE's native SYCL implementation using [oneAPi.jl](https://github.com/JuliaGPU/oneAPI.jl) + * `KernelAbstractions.jl` - Direct port of miniBUDE's native CUDA implementation using [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) + +### Build & Run + +Prerequisites + + * Julia >= 1.6+ + +With Julia on path, run the benchmark with: + +```shell +> cd JuliaStream.jl +> julia --project -e 'import Pkg; Pkg.instantiate()' # only required on first run +> julia --project src/.jl # e.g `julia --project src/KernelAbstractions.jl`, see variants listed above. +``` + +**Important:** + * Julia is 1-indexed, so N >= 1 in `--device N` (alternatively, use device name substring). + * Thread count for `Threaded.jl` must be set via the `JULIA_NUM_THREADS` environment variable (e.g `export JULIA_NUM_THREADS=$(nproc)`) otherwise it defaults to 1. + * Certain implementations such as CUDA and AMDGPU will do hardware detection at runtime and may download and/or compile further software packages for the platform. diff --git a/miniBUDE.jl/src/AMDGPU.jl b/miniBUDE.jl/src/AMDGPU.jl new file mode 100644 index 0000000..007efa0 --- /dev/null +++ b/miniBUDE.jl/src/AMDGPU.jl @@ -0,0 +1,216 @@ +include("BUDE.jl") +using AMDGPU, StaticArrays + + +# AMDGPU.agents()'s internal iteration order isn't stable +function gpu_agents_in_repr_order() + # XXX if we select anything other than :gpu, we get + # HSA_STATUS_ERROR_INVALID_AGENT on the first kernel submission + sort(AMDGPU.get_agents(:gpu), by = repr) +end + +function devices()::Vector{DeviceWithRepr} + try + map(d -> (d, repr(d), "AMDGPU.jl"), gpu_agents_in_repr_order()) + catch + # probably unsupported + [] + end +end + + +function run(params::Params, deck::Deck, device::DeviceWithRepr) + + # XXX AMDGPU doesn't expose an API for setting the default like CUDA.device!() + # but AMDGPU.get_default_agent returns DEFAULT_AGENT so we can do it by hand + AMDGPU.DEFAULT_AGENT[] = device[1] + println("Using GPU HSA device: $(AMDGPU.get_name(device[1])) ($(repr(device[1])))") + + + protein = ROCArray{Atom}(deck.protein) + ligand = ROCArray{Atom}(deck.ligand) + forcefield = ROCArray{FFParams}(deck.forcefield) + poses = ROCArray{Float32,2}(deck.poses) + + nprotein::Int = length(deck.protein) + nligand::Int = length(deck.ligand) + nforcefield::Int = length(deck.forcefield) + nposes::Int = size(deck.poses)[2] + + etotals = ROCArray{Float32}(undef, nposes) + blocks_size = ceil(UInt, nposes / params.ppwi) + nthreads = params.wgsize + + println("Using kernel parameters: <<<$(blocks_size),$(nthreads)>>> 1:$nposes") + + # warmup + AMDGPU.wait( + @roc groupsize = nthreads gridsize = blocks_size fasten_main( + nprotein, + nligand, + nforcefield, + nposes, + protein, + ligand, + forcefield, + poses, + etotals, + Val(convert(Int, params.ppwi)), + ) + ) + elapsed = @elapsed begin + for _ = 1:params.iterations + AMDGPU.wait( + @roc groupsize = nthreads gridsize = blocks_size fasten_main( + nprotein, + nligand, + nforcefield, + nposes, + protein, + ligand, + forcefield, + poses, + etotals, + Val(convert(Int, params.ppwi)), + ) + ) + end + end + + (Array{Float32}(etotals), elapsed, params.ppwi) +end + + +@fastmath function fasten_main( + nprotein::Int, + nligand::Int, + nforcefield::Int, + nposes::Int, + protein, + ligand, + forcefield, + poses, + etotals, + ::Val{PPWI}, +) where {PPWI} + + # Get index of first TD + ix = (workgroupIdx().x - 1) * workgroupDim().x * PPWI + workitemIdx().x + + # Have extra threads do the last member intead of return. + # A return would disable use of barriers, so not using return is better + ix = ix <= nposes ? ix : nposes - PPWI + + etot = MArray{Tuple{PPWI},Float32}(undef) + transform = MArray{Tuple{PPWI,3},Vec4f32}(undef) + + lsz = workgroupDim().x + for i = 1:PPWI + index = (ix) + (i - 1) * lsz + @inbounds sx::Float32 = sin(poses[1, index]) + @inbounds cx::Float32 = cos(poses[1, index]) + @inbounds sy::Float32 = sin(poses[2, index]) + @inbounds cy::Float32 = cos(poses[2, index]) + @inbounds sz::Float32 = sin(poses[3, index]) + @inbounds cz::Float32 = cos(poses[3, index]) + @inbounds transform[i, 1] = # + Vec4f32(cy * cz, sx * sy * cz - cx * sz, cx * sy * cz + sx * sz, poses[4, index]) + @inbounds transform[i, 2] = # + Vec4f32(cy * sz, sx * sy * sz + cx * cz, cx * sy * sz - sx * cz, poses[5, index]) + @inbounds transform[i, 3] = # + Vec4f32(-sy, sx * cy, cx * cy, poses[6, index]) + etot[i] = 0 + end + + for il = 1:nligand + + @inbounds l_atom::Atom = ligand[il] + @inbounds l_params::FFParams = forcefield[l_atom.type+1] + lhphb_ltz::Bool = l_params.hphb < Zero + lhphb_gtz::Bool = l_params.hphb > Zero + + lpos = MArray{Tuple{PPWI},Vec3f32}(undef) + + @simd for i = 1:PPWI + # Transform ligand atom + @inbounds lpos[i] = Vec3f32( + transform[i, 1].w + + l_atom.x * transform[i, 1].x + + l_atom.y * transform[i, 1].y + + l_atom.z * transform[i, 1].z, + transform[i, 2].w + + l_atom.x * transform[i, 2].x + + l_atom.y * transform[i, 2].y + + l_atom.z * transform[i, 2].z, + transform[i, 3].w + + l_atom.x * transform[i, 3].x + + l_atom.y * transform[i, 3].y + + l_atom.z * transform[i, 3].z, + ) + end + + for ip = 1:nprotein + @inbounds p_atom = protein[ip] + @inbounds p_params::FFParams = forcefield[p_atom.type+1] + + radij::Float32 = p_params.radius + l_params.radius + r_radij::Float32 = One / radij + + elcdst::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Four : Two + elcdst1::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Quarter : Half + type_E::Bool = ((p_params.hbtype == HbtypeE || l_params.hbtype == HbtypeE)) + + phphb_ltz::Bool = p_params.hphb < Zero + phphb_gtz::Bool = p_params.hphb > Zero + phphb_nz::Bool = p_params.hphb != Zero + p_hphb::Float32 = p_params.hphb * (phphb_ltz && lhphb_gtz ? -One : One) + l_hphb::Float32 = l_params.hphb * (phphb_gtz && lhphb_ltz ? -One : One) + distdslv::Float32 = + (phphb_ltz ? (lhphb_ltz ? Npnpdist : Nppdist) : (lhphb_ltz ? Nppdist : -Float32Max)) + r_distdslv::Float32 = One / distdslv + + chrg_init::Float32 = l_params.elsc * p_params.elsc + dslv_init::Float32 = p_hphb + l_hphb + + + @simd for i = 1:PPWI + @inbounds x::Float32 = lpos[i].x - p_atom.x + @inbounds y::Float32 = lpos[i].y - p_atom.y + @inbounds z::Float32 = lpos[i].z - p_atom.z + + distij::Float32 = sqrt(x * x + y * y + z * z) + + # Calculate the sum of the sphere radii + distbb::Float32 = distij - radij + zone1::Bool = (distbb < Zero) + + # Calculate steric energy + @inbounds etot[i] += (One - (distij * r_radij)) * (zone1 ? Two * Hardness : Zero) + + # Calculate formal and dipole charge interactions + chrg_e::Float32 = + chrg_init * ((zone1 ? One : (One - distbb * elcdst1)) * (distbb < elcdst ? One : Zero)) + neg_chrg_e::Float32 = -abs(chrg_e) + chrg_e = type_E ? neg_chrg_e : chrg_e + @inbounds etot[i] += chrg_e * Cnstnt + + # Calculate the two cases for Nonpolar-Polar repulsive interactions + coeff::Float32 = (One - (distbb * r_distdslv)) + dslv_e::Float32 = dslv_init * ((distbb < distdslv && phphb_nz) ? One : Zero) + dslv_e *= (zone1 ? One : coeff) + @inbounds etot[i] += dslv_e + end + end + end + + td_base = (workgroupIdx().x - 1) * workgroupDim().x * PPWI + workitemIdx().x + if td_base <= nposes + @simd for i = 1:PPWI + @inbounds etotals[td_base+(i-1)*workgroupDim().x] = etot[i] * Half + end + end + + nothing +end + +main() diff --git a/miniBUDE.jl/src/BUDE.jl b/miniBUDE.jl/src/BUDE.jl new file mode 100644 index 0000000..559dcf5 --- /dev/null +++ b/miniBUDE.jl/src/BUDE.jl @@ -0,0 +1,250 @@ +using ArgParse +using Parameters +using Printf +using Base: Float64, Float32, Int, Int32 +import Base.read + +const DefaultInterations = 8 +const DefaultNPoses = 65536 +const RefNPoses = 65536 +const DefaultWGSize = 64 +const DefaultPPWI = 4 + +const Zero = 0.0f0 +const Quarter = 0.25f0 +const Half = 0.5f0 +const One = 1.0f0 +const Two = 2.0f0 +const Four = 4.0f0 +const Cnstnt = 45.0f0 + +const HbtypeF = 70 +const HbtypeE = 69 +const Hardness = 38.0f0 +const Npnpdist = 5.5f0 +const Nppdist = 1.0f0 + +const Float32Max = typemax(Float32) + +struct Vec3f32 + x::Float32 + y::Float32 + z::Float32 +end + +struct Vec4f32 + x::Float32 + y::Float32 + z::Float32 + w::Float32 +end + +struct Atom + x::Float32 + y::Float32 + z::Float32 + type::Int32 +end + +function read(io::IO, ::Type{Atom}) + Atom(read(io, Float32), read(io, Float32), read(io, Float32), read(io, Int32)) +end + +struct FFParams + hbtype::Int32 + radius::Float32 + hphb::Float32 + elsc::Float32 +end + +function read(io::IO, ::Type{FFParams}) + FFParams(read(io, Int32), read(io, Float32), read(io, Float32), read(io, Float32)) +end + +@with_kw mutable struct Params + device::String = "1" + list::Bool = false + numposes::UInt = DefaultNPoses + iterations::UInt = DefaultInterations + wgsize::UInt = DefaultWGSize + ppwi::UInt = DefaultPPWI + deck::String = "../data/bm1" +end + +struct Deck + protein::AbstractArray{Atom} + ligand::AbstractArray{Atom} + forcefield::AbstractArray{FFParams} + poses::AbstractArray{Float32,2} +end + +const DeviceWithRepr = Tuple{Any,String,Any} + +function parse_options(given::Params) + s = ArgParseSettings() + @add_arg_table s begin + "--list", "-l" + help = "List available devices" + action = :store_true + "--device", "-d" + help = "Select device at DEVICE or the device name substring if DEVICE is not an integer, NOTE: Julia is 1-indexed" + arg_type = String + default = given.device + "--numposes", "-n" + help = "Compute energies for N poses" + arg_type = Int + default = convert(Int, given.numposes) + "--iterations", "-i" + help = "Repeat kernel I times" + arg_type = Int + default = convert(Int, given.iterations) + "--wgsize", "-w" + help = "Run with work-group size WGSIZE" + arg_type = Int + default = convert(Int, given.wgsize) + "--ppwi", "-p" + help = "Compute PPWI poses per work-item" + arg_type = Int + default = convert(Int, given.ppwi) + "--deck" + help = "Use the DECK directory as input deck" + arg_type = String + default = given.deck + end + args = parse_args(s) + # surely there's a better way than doing this: + for (arg, val) in args + setproperty!(given, Symbol(arg), val) + end +end + +function read_structs(path::String, ::Type{T})::Vector{T} where {T} + io = open(path, "r") + size = filesize(io) + [read(io, T) for _ = 1:size÷sizeof(T)] +end + +function print_timings(params::Params, deck::Deck, elapsedSeconds::Float64, wgsize::UInt) + + # Average time per iteration + averageIterationSeconds = elapsedSeconds / params.iterations + + natlig = length(deck.ligand) + natpro = length(deck.protein) + + # Compute FLOP/s + ops_per_wg = (wgsize * 27 + natlig * (2 + wgsize * 18 + natpro * (10 + wgsize * 30)) + wgsize) + + total_ops = ops_per_wg * (params.numposes / wgsize) + flops = total_ops / averageIterationSeconds + gflops = flops / 1.0e9 + + total_finsts = 25.0 * natpro * natlig * params.numposes + finsts = total_finsts / averageIterationSeconds + gfinsts = finsts / 1e9 + + interactions = params.numposes * natlig * natpro + interactions_per_sec = interactions / averageIterationSeconds + + # Print stats + @printf "- Kernel time: %.03f ms\n" (elapsedSeconds * 1000.0) + @printf "- Average time: %.03f ms\n" (averageIterationSeconds * 1000.0) + @printf "- Interactions/s: %.03f billion\n" interactions_per_sec / 1e9 + @printf "- GFLOP/s: %.03f\n" gflops + @printf "- GFInst/s: %.03f\n" gfinsts +end + +function main() + + params::Params = Params() + parse_options(params) + + + ds = devices() + + if params.list + for (i, (_, name, type)) in enumerate(ds) + println("[$i] ($type) $name") + end + exit(0) + end + + deviceIndex = try + index = parse(Int, params.device) + if index < 1 || index > length(ds) + error("Device $(index) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed") + end + index + catch + index = findfirst(x -> occursin(params.device, "($(x[3])) $(x[2])"), ds) + if index === nothing + error("No device match the substring `$(params.device)`, see --list for available devices") + end + index + end + + + poses = permutedims( # reshape is column order so we flip it back again + reshape( + read_structs("$(params.deck)/poses.in", Float32), # read poses as one long array + (params.numposes, 6), # reshape it to 6 slices of numposes sized rows + ), + ) + + deck = Deck( + read_structs("$(params.deck)/protein.in", Atom), + read_structs("$(params.deck)/ligand.in", Atom), + read_structs("$(params.deck)/forcefield.in", FFParams), + poses, + ) + + println("Poses : ", size(deck.poses)[2]) + println("Iterations: ", params.iterations) + println("Ligands : ", length(deck.ligand)) + println("Protein : ", length(deck.protein)) + println("Forcefield: ", length(deck.forcefield)) + println("Deck : ", params.deck) + println("WGsize : ", params.wgsize) + println("PPWI : ", params.ppwi) + + (energies, rumtimeSeconds, ppwi) = run(params, deck, ds[deviceIndex]) + + print_timings(params, deck, rumtimeSeconds, ppwi) + + output = open("energies.out", "w+") + println("\nEnergies:") + for i = 1:size(deck.poses)[2] + @printf(output, "%7.2f\n", energies[i]) + if (i < 16) + @printf("%7.2f\n", energies[i]) + end + end + + ref_energies = open("$(params.deck)/ref_energies.out", "r") + n_ref_poses = size(deck.poses)[2] + if n_ref_poses > RefNPoses + println("Only validating the first $(RefNPoses) poses") + n_ref_poses = RefNPoses + end + + maxdiff = 0.0 + for i = 1:n_ref_poses + line = readline(ref_energies) + if line == "" + error("ran out of ref energies lines to verify") + end + e = parse(Float32, line) + if (abs(e) < 1.0 && abs(energies[i]) < 1.0) + continue + end + diff = abs(e - energies[i]) / e + if (diff > 0.01) + println("![$(i)] $(e) $(energies[i]) ~ $(diff)") + maxdiff = diff + end + end + + @printf "Largest difference was %.03f%%.\n\n" 100 * maxdiff # Expect numbers to be accurate to 2 decimal places + close(ref_energies) + +end diff --git a/miniBUDE.jl/src/CUDA.jl b/miniBUDE.jl/src/CUDA.jl new file mode 100644 index 0000000..34a27aa --- /dev/null +++ b/miniBUDE.jl/src/CUDA.jl @@ -0,0 +1,236 @@ +include("BUDE.jl") +using CUDA, StaticArrays + + +function devices()::Vector{DeviceWithRepr} + return !CUDA.functional(false) ? [] : + map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", "CUDA.jl"), CUDA.devices()) +end + +function run(params::Params, deck::Deck, device::DeviceWithRepr) + # show_reason is set to true here so it dumps CUDA info + # for us regardless of whether it's functional + if !CUDA.functional(true) + error("Non-functional CUDA configuration") + end + + # so CUDA's device is 0 indexed, so -1 from Julia + CUDA.device!(device[1]) + CUDA.math_mode!(CUDA.FAST_MATH) + + println("Using CUDA device: $(CUDA.name(device[1])) ($(repr(device[1])))") + + protein = CuArray{Atom}(deck.protein) + ligand = CuArray{Atom}(deck.ligand) + forcefield = CuArray{FFParams}(deck.forcefield) + poses = CuArray{Float32,2}(deck.poses) + + nprotein::Int = length(deck.protein) + nligand::Int = length(deck.ligand) + nforcefield::Int = length(deck.forcefield) + nposes::Int = size(deck.poses)[2] + + etotals = CuArray{Float32}(undef, nposes) + nblocks = ceil(UInt, ceil(nposes / params.ppwi) / params.wgsize) + nthreads = params.wgsize + + println("Using kernel parameters: <<<$(nblocks),$(nthreads)>>> 1:$nposes") + + shared = false + shmem_bytes = shared ? sizeof(FFParams) * nforcefield : 0 + + # warmup + @cuda blocks = nblocks threads = nthreads shmem = shmem_bytes fasten_main( + nprotein, + nligand, + nforcefield, + nposes, + protein, + ligand, + forcefield, + poses, + etotals, + Val(shared), + Val(convert(Int, params.ppwi)), + ) + CUDA.synchronize() + elapsed = @elapsed begin + for _ = 1:params.iterations + @cuda blocks = nblocks threads = nthreads shmem = shmem_bytes fasten_main( + nprotein, + nligand, + nforcefield, + nposes, + protein, + ligand, + forcefield, + poses, + etotals, + Val(shared), + Val(convert(Int, params.ppwi)), + ) + end + CUDA.synchronize() + end + + (Array{Float32}(etotals), elapsed, params.ppwi) +end + +# const NUM_TD_PER_THREAD = 8 + +@fastmath function fasten_main( + nprotein::Int, + nligand::Int, + nforcefield::Int, + nposes::Int, + protein_::CuDeviceVector{Atom}, + ligand_::CuDeviceVector{Atom}, + forcefield_::CuDeviceVector{FFParams}, + poses_::CuDeviceMatrix{Float32}, + etotals::CuDeviceVector{Float32}, + ::Val{Shared}, + ::Val{PPWI}, +) where {Shared,PPWI} + + + protein = CUDA.Const(protein_) + ligand = CUDA.Const(ligand_) + global_forcefield = CUDA.Const(forcefield_) + poses = CUDA.Const(poses_) + + # Get index of first TD + ix = (blockIdx().x - 1) * blockDim().x * PPWI + threadIdx().x + + # Have extra threads do the last member intead of return. + # A return would disable use of barriers, so not using return is better + ix = ix <= nposes ? ix : nposes - PPWI + + etot = MArray{Tuple{PPWI},Float32}(undef) + transform = MArray{Tuple{PPWI,3},Vec4f32}(undef) + + + if Shared + forcefield = @cuDynamicSharedMem(FFParams, (nforcefield)) + if ix < nforcefield + @inbounds forcefield[ix] = global_forcefield[ix] + end + else + forcefield = global_forcefield + end + + + + lsz = blockDim().x + @simd for i = 1:PPWI + index = (ix) + (i - 1) * lsz + @inbounds sx::Float32 = sin(poses[1, index]) + @inbounds cx::Float32 = cos(poses[1, index]) + @inbounds sy::Float32 = sin(poses[2, index]) + @inbounds cy::Float32 = cos(poses[2, index]) + @inbounds sz::Float32 = sin(poses[3, index]) + @inbounds cz::Float32 = cos(poses[3, index]) + @inbounds transform[i, 1] = # + Vec4f32(cy * cz, sx * sy * cz - cx * sz, cx * sy * cz + sx * sz, poses[4, index]) + @inbounds transform[i, 2] = # + Vec4f32(cy * sz, sx * sy * sz + cx * cz, cx * sy * sz - sx * cz, poses[5, index]) + @inbounds transform[i, 3] = # + Vec4f32(-sy, sx * cy, cx * cy, poses[6, index]) + @inbounds etot[i] = 0 + end + + if Shared + CUDA.sync_threads() + end + + for il = 1:nligand + + @inbounds l_atom::Atom = ligand[il] + @inbounds l_params::FFParams = forcefield[l_atom.type+1] + lhphb_ltz::Bool = l_params.hphb < Zero + lhphb_gtz::Bool = l_params.hphb > Zero + + lpos = MArray{Tuple{PPWI},Vec3f32}(undef) + + @simd for i = 1:PPWI + # Transform ligand atom + @inbounds lpos[i] = Vec3f32( + transform[i, 1].w + + l_atom.x * transform[i, 1].x + + l_atom.y * transform[i, 1].y + + l_atom.z * transform[i, 1].z, + transform[i, 2].w + + l_atom.x * transform[i, 2].x + + l_atom.y * transform[i, 2].y + + l_atom.z * transform[i, 2].z, + transform[i, 3].w + + l_atom.x * transform[i, 3].x + + l_atom.y * transform[i, 3].y + + l_atom.z * transform[i, 3].z, + ) + end + + for ip = 1:nprotein + @inbounds p_atom = protein[ip] + @inbounds p_params::FFParams = forcefield[p_atom.type+1] + + radij::Float32 = p_params.radius + l_params.radius + r_radij::Float32 = One / radij + + elcdst::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Four : Two + elcdst1::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Quarter : Half + type_E::Bool = ((p_params.hbtype == HbtypeE || l_params.hbtype == HbtypeE)) + + phphb_ltz::Bool = p_params.hphb < Zero + phphb_gtz::Bool = p_params.hphb > Zero + phphb_nz::Bool = p_params.hphb != Zero + p_hphb::Float32 = p_params.hphb * (phphb_ltz && lhphb_gtz ? -One : One) + l_hphb::Float32 = l_params.hphb * (phphb_gtz && lhphb_ltz ? -One : One) + distdslv::Float32 = + (phphb_ltz ? (lhphb_ltz ? Npnpdist : Nppdist) : (lhphb_ltz ? Nppdist : -Float32Max)) + r_distdslv::Float32 = One / distdslv + + chrg_init::Float32 = l_params.elsc * p_params.elsc + dslv_init::Float32 = p_hphb + l_hphb + + + @simd for i = 1:PPWI + @inbounds x::Float32 = lpos[i].x - p_atom.x + @inbounds y::Float32 = lpos[i].y - p_atom.y + @inbounds z::Float32 = lpos[i].z - p_atom.z + + distij::Float32 = sqrt(x * x + y * y + z * z) + + # Calculate the sum of the sphere radii + distbb::Float32 = distij - radij + zone1::Bool = (distbb < Zero) + + # Calculate steric energy + @inbounds etot[i] += (One - (distij * r_radij)) * (zone1 ? Two * Hardness : Zero) + + # Calculate formal and dipole charge interactions + chrg_e::Float32 = + chrg_init * ((zone1 ? One : (One - distbb * elcdst1)) * (distbb < elcdst ? One : Zero)) + neg_chrg_e::Float32 = -abs(chrg_e) + chrg_e = type_E ? neg_chrg_e : chrg_e + @inbounds etot[i] += chrg_e * Cnstnt + + # Calculate the two cases for Nonpolar-Polar repulsive interactions + coeff::Float32 = (One - (distbb * r_distdslv)) + dslv_e::Float32 = dslv_init * ((distbb < distdslv && phphb_nz) ? One : Zero) + dslv_e *= (zone1 ? One : coeff) + @inbounds etot[i] += dslv_e + end + end + end + + td_base = (blockIdx().x - 1) * blockDim().x * PPWI + threadIdx().x + if td_base <= nposes + @simd for i = 1:PPWI + @inbounds etotals[td_base+(i-1)*blockDim().x] = etot[i] * Half + end + end + + nothing +end + +main() diff --git a/miniBUDE.jl/src/KernelAbstractions.jl b/miniBUDE.jl/src/KernelAbstractions.jl new file mode 100644 index 0000000..190cb19 --- /dev/null +++ b/miniBUDE.jl/src/KernelAbstractions.jl @@ -0,0 +1,250 @@ +include("BUDE.jl") +using ROCKernels, CUDAKernels, KernelAbstractions, StaticArrays, CUDA, AMDGPU + +@enum Backend cuda rocm cpu + + +function list_rocm_devices()::Vector{DeviceWithRepr} + try + # AMDGPU.agents()'s internal iteration order isn't stable + sorted = sort(AMDGPU.get_agents(:gpu), by = repr) + map(x -> (x, repr(x), rocm), sorted) + catch + # probably unsupported + [] + end +end + +function list_cuda_devices()::Vector{DeviceWithRepr} + return !CUDA.functional(false) ? [] : + map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices()) +end + +function devices()::Vector{DeviceWithRepr} + cudas = list_cuda_devices() + rocms = list_rocm_devices() + cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)] + vcat(cpus, cudas, rocms) +end + +function run(params::Params, deck::Deck, device::DeviceWithRepr) + + + nposes::Int64 = size(deck.poses)[2] + blocks_size::Int64 = ceil(Int64, nposes / params.ppwi) + nthreads::Int64 = params.wgsize + + (selected, _, backend) = device + println("KernelAbstractions Backend: $backend") + + if backend == cpu + println("Using CPU with max $(Threads.nthreads()) threads") + protein = deck.protein + ligand = deck.ligand + forcefield = deck.forcefield + poses = deck.poses + etotals = Array{Float32}(undef, nposes) + backend_impl = CPU() + elseif backend == cuda + CUDA.device!(selected) + if CUDA.device() != selected + error("Cannot select CUDA device, expecting $selected, but got $(CUDA.device())") + end + if !CUDA.functional(true) + error("Non-functional CUDA configuration") + end + println("Using CUDA device: $(CUDA.name(selected)) ($(repr(selected)))") + + protein = CuArray{Atom}(deck.protein) + ligand = CuArray{Atom}(deck.ligand) + forcefield = CuArray{FFParams}(deck.forcefield) + poses = CuArray{Float32,2}(deck.poses) + etotals = CuArray{Float32}(undef, nposes) + backend_impl = CUDADevice() + elseif backend == rocm + AMDGPU.DEFAULT_AGENT[] = selected + if AMDGPU.get_default_agent() != selected + error("Cannot select HSA device, expecting $selected, but got $(AMDGPU.get_default_agent())") + end + println("Using GPU HSA device: $(AMDGPU.get_name(selected)) ($(repr(selected)))") + + protein = ROCArray{Atom}(deck.protein) + ligand = ROCArray{Atom}(deck.ligand) + forcefield = ROCArray{FFParams}(deck.forcefield) + poses = ROCArray{Float32,2}(deck.poses) + etotals = ROCArray{Float32}(undef, nposes) + backend_impl = ROCDevice() + else + error("unsupported backend $(backend)") + end + + println("Using kernel parameters: <<<$blocks_size,$nthreads>>> 1:$nposes") + + + kernel! = fasten_main(backend_impl, nthreads, blocks_size) + + wait( + kernel!( + nthreads, + nposes, + protein, + ligand, + forcefield, + poses, + etotals, + Val(convert(Int, params.ppwi)), + ndrange = blocks_size, + ), + ) + + elapsed = @elapsed for i = 1:params.iterations + wait( + kernel!( + nthreads, + nposes, + protein, + ligand, + forcefield, + poses, + etotals, + Val(convert(Int, params.ppwi)), + ndrange = blocks_size, + ), + ) + end + + (Array{Float32}(etotals), elapsed, params.ppwi) +end + +@fastmath @kernel function fasten_main( + wgsize::Int64, + nposes::Int64, + @Const(protein), + @Const(ligand), + @Const(forcefield), + @Const(poses), + etotals, + ::Val{PPWI}, +) where {PPWI} + + gid = @index(Group) + lid = @index(Local) + + # Get index of first TD + ix = (gid - 1) * wgsize * PPWI + lid + + # Have extra threads do the last member intead of return. + # A return would disable use of barriers, so not using return is better + ix = ix <= nposes ? ix : nposes - PPWI + + etot = MArray{Tuple{PPWI},Float32}(undef) + transform = MArray{Tuple{PPWI,3},Vec4f32}(undef) + + lsz = wgsize + for i = 1:PPWI + index = (ix) + (i - 1) * lsz + @inbounds sx::Float32 = sin(poses[1, index]) + @inbounds cx::Float32 = cos(poses[1, index]) + @inbounds sy::Float32 = sin(poses[2, index]) + @inbounds cy::Float32 = cos(poses[2, index]) + @inbounds sz::Float32 = sin(poses[3, index]) + @inbounds cz::Float32 = cos(poses[3, index]) + @inbounds transform[i, 1] = # + Vec4f32(cy * cz, sx * sy * cz - cx * sz, cx * sy * cz + sx * sz, poses[4, index]) + @inbounds transform[i, 2] = # + Vec4f32(cy * sz, sx * sy * sz + cx * cz, cx * sy * sz - sx * cz, poses[5, index]) + @inbounds transform[i, 3] = # + Vec4f32(-sy, sx * cy, cx * cy, poses[6, index]) + etot[i] = 0 + end + + @inbounds for l_atom::Atom in ligand + + l_params::FFParams = forcefield[l_atom.type+1] + lhphb_ltz::Bool = l_params.hphb < Zero + lhphb_gtz::Bool = l_params.hphb > Zero + + lpos = MArray{Tuple{PPWI},Vec3f32}(undef) + + @simd for i = 1:PPWI + # Transform ligand atom + @inbounds lpos[i] = Vec3f32( + transform[i, 1].w + + l_atom.x * transform[i, 1].x + + l_atom.y * transform[i, 1].y + + l_atom.z * transform[i, 1].z, + transform[i, 2].w + + l_atom.x * transform[i, 2].x + + l_atom.y * transform[i, 2].y + + l_atom.z * transform[i, 2].z, + transform[i, 3].w + + l_atom.x * transform[i, 3].x + + l_atom.y * transform[i, 3].y + + l_atom.z * transform[i, 3].z, + ) + end + + @inbounds for p_atom::Atom in protein + @inbounds p_params::FFParams = forcefield[p_atom.type+1] + + radij::Float32 = p_params.radius + l_params.radius + r_radij::Float32 = One / radij + + elcdst::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Four : Two + elcdst1::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Quarter : Half + type_E::Bool = ((p_params.hbtype == HbtypeE || l_params.hbtype == HbtypeE)) + + phphb_ltz::Bool = p_params.hphb < Zero + phphb_gtz::Bool = p_params.hphb > Zero + phphb_nz::Bool = p_params.hphb != Zero + p_hphb::Float32 = p_params.hphb * (phphb_ltz && lhphb_gtz ? -One : One) + l_hphb::Float32 = l_params.hphb * (phphb_gtz && lhphb_ltz ? -One : One) + distdslv::Float32 = + (phphb_ltz ? (lhphb_ltz ? Npnpdist : Nppdist) : (lhphb_ltz ? Nppdist : -Float32Max)) + r_distdslv::Float32 = One / distdslv + + chrg_init::Float32 = l_params.elsc * p_params.elsc + dslv_init::Float32 = p_hphb + l_hphb + + + @simd for i = 1:PPWI + @inbounds x::Float32 = lpos[i].x - p_atom.x + @inbounds y::Float32 = lpos[i].y - p_atom.y + @inbounds z::Float32 = lpos[i].z - p_atom.z + + distij::Float32 = sqrt(x * x + y * y + z * z) + + # Calculate the sum of the sphere radii + distbb::Float32 = distij - radij + zone1::Bool = (distbb < Zero) + + # Calculate steric energy + @inbounds etot[i] += (One - (distij * r_radij)) * (zone1 ? Two * Hardness : Zero) + + # Calculate formal and dipole charge interactions + chrg_e::Float32 = + chrg_init * ((zone1 ? One : (One - distbb * elcdst1)) * (distbb < elcdst ? One : Zero)) + neg_chrg_e::Float32 = -abs(chrg_e) + chrg_e = type_E ? neg_chrg_e : chrg_e + @inbounds etot[i] += chrg_e * Cnstnt + + # Calculate the two cases for Nonpolar-Polar repulsive interactions + coeff::Float32 = (One - (distbb * r_distdslv)) + dslv_e::Float32 = dslv_init * ((distbb < distdslv && phphb_nz) ? One : Zero) + dslv_e *= (zone1 ? One : coeff) + @inbounds etot[i] += dslv_e + end + end + end + + td_base = (gid - 1) * wgsize * PPWI + lid + if td_base <= nposes + @simd for i = 1:PPWI + @inbounds etotals[td_base+(i-1)*wgsize] = etot[i] * Half + end + end + + nothing +end + +main() diff --git a/miniBUDE.jl/src/Threaded.jl b/miniBUDE.jl/src/Threaded.jl new file mode 100644 index 0000000..7393393 --- /dev/null +++ b/miniBUDE.jl/src/Threaded.jl @@ -0,0 +1,180 @@ +include("BUDE.jl") +using StaticArrays + +const Device = (undef, "CPU", "Threaded") + +function devices() + return [Device] +end + +const Float32Max = typemax(Float32) + +function run(params::Params, deck::Deck, _::DeviceWithRepr) + println("Using max $(Threads.nthreads()) threads") + if params.ppwi != DefaultPPWI + @warn "Threaded implementation only uses wgsize, the PPWI argument is ignored" + end + + poses = size(deck.poses)[2] + etotals = Array{Float32}(undef, poses) + + # warmup + fasten_main( + Val(convert(Int, params.wgsize)), + deck.protein, + deck.ligand, + deck.forcefield, + deck.poses, + etotals, + ) + + elapsed = @elapsed for _ = 1:params.iterations + fasten_main( + Val(convert(Int, params.wgsize)), + deck.protein, + deck.ligand, + deck.forcefield, + deck.poses, + etotals, + ) + end + + + (etotals, elapsed, params.wgsize) +end + +@fastmath function fasten_main( + ::Val{WGSIZE}, + protein::AbstractArray{Atom}, + ligand::AbstractArray{Atom}, + forcefield::AbstractArray{FFParams}, + poses::AbstractArray{Float32,2}, + etotals::AbstractArray{Float32}, +) where {WGSIZE} + nposes::Int = size(poses)[2] + numGroups::Int = nposes ÷ WGSIZE + nligand::Int = length(ligand) + nprotein::Int = length(protein) + + Threads.@threads for group = 1:numGroups + + etot = MArray{Tuple{WGSIZE},Float32}(undef) + transform = MArray{Tuple{3,4,WGSIZE},Float32}(undef) + + @simd for i = 1:WGSIZE + ix = (group - 1) * (WGSIZE) + i + @inbounds sx::Float32 = sin(poses[1, ix]) + @inbounds cx::Float32 = cos(poses[1, ix]) + @inbounds sy::Float32 = sin(poses[2, ix]) + @inbounds cy::Float32 = cos(poses[2, ix]) + @inbounds sz::Float32 = sin(poses[3, ix]) + @inbounds cz::Float32 = cos(poses[3, ix]) + @inbounds transform[1, 1, i] = cy * cz + @inbounds transform[1, 2, i] = sx * sy * cz - cx * sz + @inbounds transform[1, 3, i] = cx * sy * cz + sx * sz + @inbounds transform[1, 4, i] = poses[4, ix] + @inbounds transform[2, 1, i] = cy * sz + @inbounds transform[2, 2, i] = sx * sy * sz + cx * cz + @inbounds transform[2, 3, i] = cx * sy * sz - sx * cz + @inbounds transform[2, 4, i] = poses[5, ix] + @inbounds transform[3, 1, i] = -sy + @inbounds transform[3, 2, i] = sx * cy + @inbounds transform[3, 3, i] = cx * cy + @inbounds transform[3, 4, i] = poses[6, ix] + @inbounds etot[i] = Zero + end + + for il = 1:nligand + @inbounds l_atom::Atom = ligand[il] + + @inbounds l_params::FFParams = forcefield[l_atom.type+1] + lhphb_ltz::Bool = l_params.hphb < Zero + lhphb_gtz::Bool = l_params.hphb > Zero + + + lpos_x = MArray{Tuple{WGSIZE},Float32}(undef) + lpos_y = MArray{Tuple{WGSIZE},Float32}(undef) + lpos_z = MArray{Tuple{WGSIZE},Float32}(undef) + + @simd for i = 1:WGSIZE + @inbounds lpos_x[i] = ( + transform[1, 4, i] + + l_atom.x * transform[1, 1, i] + + l_atom.y * transform[1, 2, i] + + l_atom.z * transform[1, 3, i] + ) + @inbounds lpos_y[i] = ( + transform[2, 4, i] + + l_atom.x * transform[2, 1, i] + + l_atom.y * transform[2, 2, i] + + l_atom.z * transform[2, 3, i] + ) + @inbounds lpos_z[i] = ( + transform[3, 4, i] + + l_atom.x * transform[3, 1, i] + + l_atom.y * transform[3, 2, i] + + l_atom.z * transform[3, 3, i] + ) + end + + for ip = 1:nprotein + @inbounds p_atom = protein[ip] + @inbounds p_params::FFParams = forcefield[p_atom.type+1] + + radij::Float32 = p_params.radius + l_params.radius + r_radij::Float32 = One / radij + + elcdst::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Four : Two + elcdst1::Float32 = + (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Quarter : Half + type_E::Bool = ((p_params.hbtype == HbtypeE || l_params.hbtype == HbtypeE)) + + phphb_ltz::Bool = p_params.hphb < Zero + phphb_gtz::Bool = p_params.hphb > Zero + phphb_nz::Bool = p_params.hphb != Zero + p_hphb::Float32 = p_params.hphb * (phphb_ltz && lhphb_gtz ? -One : One) + l_hphb::Float32 = l_params.hphb * (phphb_gtz && lhphb_ltz ? -One : One) + distdslv::Float32 = + (phphb_ltz ? (lhphb_ltz ? Npnpdist : Nppdist) : (lhphb_ltz ? Nppdist : -Float32Max)) + r_distdslv::Float32 = One / distdslv + + chrg_init::Float32 = l_params.elsc * p_params.elsc + dslv_init::Float32 = p_hphb + l_hphb + + @simd for i = 1:WGSIZE + @inbounds x::Float32 = lpos_x[i] - p_atom.x + @inbounds y::Float32 = lpos_y[i] - p_atom.y + @inbounds z::Float32 = lpos_z[i] - p_atom.z + + distij::Float32 = sqrt(x * x + y * y + z * z) + + # Calculate the sum of the sphere radii + distbb::Float32 = distij - radij + zone1::Bool = (distbb < Zero) + + # Calculate steric energy + @inbounds etot[i] += (One - (distij * r_radij)) * (zone1 ? Two * Hardness : Zero) + + # Calculate formal and dipole charge interactions + chrg_e::Float32 = + chrg_init * ((zone1 ? One : (One - distbb * elcdst1)) * (distbb < elcdst ? One : Zero)) + neg_chrg_e::Float32 = -abs(chrg_e) + chrg_e = type_E ? neg_chrg_e : chrg_e + @inbounds etot[i] += chrg_e * Cnstnt + + # Calculate the two cases for Nonpolar-Polar repulsive interactions + coeff::Float32 = (One - (distbb * r_distdslv)) + dslv_e::Float32 = dslv_init * ((distbb < distdslv && phphb_nz) ? One : Zero) + dslv_e *= (zone1 ? One : coeff) + @inbounds etot[i] += dslv_e + end + end + end + @simd for i = 1:WGSIZE + ix = (group - 1) * WGSIZE + i + @inbounds etotals[ix] = etot[i] * Half + end + end +end + +main() diff --git a/miniBUDE.jl/src/miniBUDE.jl b/miniBUDE.jl/src/miniBUDE.jl new file mode 100644 index 0000000..8326527 --- /dev/null +++ b/miniBUDE.jl/src/miniBUDE.jl @@ -0,0 +1,3 @@ +module JuliaStream end + +println("Please run benchmarks directly via `julia --project src/.jl`") diff --git a/miniBUDE.jl/src/oneAPI.jl b/miniBUDE.jl/src/oneAPI.jl new file mode 100644 index 0000000..a7f963d --- /dev/null +++ b/miniBUDE.jl/src/oneAPI.jl @@ -0,0 +1,201 @@ +include("BUDE.jl") +using oneAPI, StaticArrays + +function devices()::Vector{DeviceWithRepr} + all = map(oneL0.devices, oneL0.drivers()) |> Iterators.flatten |> Iterators.collect + map(dev -> (dev, repr("text/plain", dev), "oneAPi.jl"), all) +end + + +function run(params::Params, deck::Deck, device::DeviceWithRepr) + + oneAPI.allowscalar(false) + oneAPI.device!(device[1]) + + println("Using L0 device: $(repr("text/plain",device[1]))") + + protein = oneArray{Atom}(deck.protein) + ligand = oneArray{Atom}(deck.ligand) + forcefield = oneArray{FFParams}(deck.forcefield) + poses = oneArray{Float32,2}(deck.poses) + + nprotein::Int = length(deck.protein) + nligand::Int = length(deck.ligand) + nforcefield::Int = length(deck.forcefield) + nposes::Int = size(deck.poses)[2] + + etotals = oneArray{Float32}(undef, nposes) + blocks_size = ceil(UInt, nposes / params.ppwi) + nthreads = params.wgsize + + println("Using kernel parameters: <<<$(blocks_size),$(nthreads)>>> 1:$nposes") + + # warmup + @oneapi items = nthreads groups = blocks_size ÷ nthreads fasten_main( + nprotein, + nligand, + nforcefield, + nposes, + protein, + ligand, + forcefield, + poses, + etotals, + Val(convert(Int, params.ppwi)), + ) + oneAPI.synchronize() + + elapsed = @elapsed begin + for _ = 1:params.iterations + @oneapi items = nthreads groups = blocks_size ÷ nthreads fasten_main( + nprotein, + nligand, + nforcefield, + nposes, + protein, + ligand, + forcefield, + poses, + etotals, + Val(convert(Int, params.ppwi)), + ) + oneAPI.synchronize() + end + end + + (Array{Float32}(etotals), elapsed, params.ppwi) +end + +@fastmath function fasten_main( + nprotein::Int, + nligand::Int, + nforcefield::Int, + nposes::Int, + protein, + ligand, + forcefield, + poses, + etotals, + ::Val{PPWI}, +) where {PPWI} + + # Get index of first TD + ix = (get_group_id() - 1) * get_local_size() * PPWI + get_local_id() + + # Have extra threads do the last member intead of return. + # A return would disable use of barriers, so not using return is better + ix = ix <= nposes ? ix : nposes - PPWI + + etot = MArray{Tuple{PPWI},Float32}(undef) + transform = MArray{Tuple{PPWI,3},Vec4f32}(undef) + + lsz = get_local_size() + @simd for i = 1:PPWI + index = (ix) + (i - 1) * lsz + @inbounds sx::Float32 = sin(poses[1, index]) + @inbounds cx::Float32 = cos(poses[1, index]) + @inbounds sy::Float32 = sin(poses[2, index]) + @inbounds cy::Float32 = cos(poses[2, index]) + @inbounds sz::Float32 = sin(poses[3, index]) + @inbounds cz::Float32 = cos(poses[3, index]) + @inbounds transform[i, 1] = # + Vec4f32(cy * cz, sx * sy * cz - cx * sz, cx * sy * cz + sx * sz, poses[4, index]) + @inbounds transform[i, 2] = # + Vec4f32(cy * sz, sx * sy * sz + cx * cz, cx * sy * sz - sx * cz, poses[5, index]) + @inbounds transform[i, 3] = # + Vec4f32(-sy, sx * cy, cx * cy, poses[6, index]) + etot[i] = 0 + end + + for il = 1:nligand + + @inbounds l_atom::Atom = ligand[il] + @inbounds l_params::FFParams = forcefield[l_atom.type+1] + lhphb_ltz::Bool = l_params.hphb < Zero + lhphb_gtz::Bool = l_params.hphb > Zero + + lpos = MArray{Tuple{PPWI},Vec3f32}(undef) + + @simd for i = 1:PPWI + # Transform ligand atom + @inbounds lpos[i] = Vec3f32( + transform[i, 1].w + + l_atom.x * transform[i, 1].x + + l_atom.y * transform[i, 1].y + + l_atom.z * transform[i, 1].z, + transform[i, 2].w + + l_atom.x * transform[i, 2].x + + l_atom.y * transform[i, 2].y + + l_atom.z * transform[i, 2].z, + transform[i, 3].w + + l_atom.x * transform[i, 3].x + + l_atom.y * transform[i, 3].y + + l_atom.z * transform[i, 3].z, + ) + end + + for ip = 1:nprotein + @inbounds p_atom = protein[ip] + @inbounds p_params::FFParams = forcefield[p_atom.type+1] + + radij::Float32 = p_params.radius + l_params.radius + r_radij::Float32 = One / radij + + elcdst::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Four : Two + elcdst1::Float32 = (p_params.hbtype == HbtypeF && l_params.hbtype == HbtypeF) ? Quarter : Half + type_E::Bool = ((p_params.hbtype == HbtypeE || l_params.hbtype == HbtypeE)) + + phphb_ltz::Bool = p_params.hphb < Zero + phphb_gtz::Bool = p_params.hphb > Zero + phphb_nz::Bool = p_params.hphb != Zero + p_hphb::Float32 = p_params.hphb * (phphb_ltz && lhphb_gtz ? -One : One) + l_hphb::Float32 = l_params.hphb * (phphb_gtz && lhphb_ltz ? -One : One) + distdslv::Float32 = + (phphb_ltz ? (lhphb_ltz ? Npnpdist : Nppdist) : (lhphb_ltz ? Nppdist : -Float32Max)) + r_distdslv::Float32 = One / distdslv + + chrg_init::Float32 = l_params.elsc * p_params.elsc + dslv_init::Float32 = p_hphb + l_hphb + + + @simd for i = 1:PPWI + @inbounds x::Float32 = lpos[i].x - p_atom.x + @inbounds y::Float32 = lpos[i].y - p_atom.y + @inbounds z::Float32 = lpos[i].z - p_atom.z + + distij::Float32 = sqrt(x * x + y * y + z * z) + + # Calculate the sum of the sphere radii + distbb::Float32 = distij - radij + zone1::Bool = (distbb < Zero) + + # Calculate steric energy + @inbounds etot[i] += (One - (distij * r_radij)) * (zone1 ? Two * Hardness : Zero) + + # Calculate formal and dipole charge interactions + chrg_e::Float32 = + chrg_init * ((zone1 ? One : (One - distbb * elcdst1)) * (distbb < elcdst ? One : Zero)) + neg_chrg_e::Float32 = -abs(chrg_e) + chrg_e = type_E ? neg_chrg_e : chrg_e + @inbounds etot[i] += chrg_e * Cnstnt + + # Calculate the two cases for Nonpolar-Polar repulsive interactions + coeff::Float32 = (One - (distbb * r_distdslv)) + dslv_e::Float32 = dslv_init * ((distbb < distdslv && phphb_nz) ? One : Zero) + dslv_e *= (zone1 ? One : coeff) + @inbounds etot[i] += dslv_e + end + end + end + + td_base = (get_group_id() - 1) * get_local_size() * PPWI + get_local_id() + if td_base <= nposes + @simd for i = 1:PPWI + @inbounds etotals[td_base+(i-1)*get_local_size()] = etot[i] * Half + end + end + + nothing +end + +main()