-
Notifications
You must be signed in to change notification settings - Fork 148
Description
Currently, we explicitly reset_du / set_zero! before running the volume integrals. This is most likely a historical reason due to the blending in the Hennemann-Gassner Shock-Capturing which potentially adds up two volume integrals
Trixi.jl/src/solvers/dgsem_tree/dg_1d.jl
Lines 145 to 172 in 0ffa20f
| @inline function flux_differencing_kernel!(du, u, element, | |
| mesh::Union{TreeMesh{1}, StructuredMesh{1}}, | |
| have_nonconservative_terms::False, equations, | |
| volume_flux, dg::DGSEM, cache, alpha = true) | |
| # true * [some floating point value] == [exactly the same floating point value] | |
| # This can (hopefully) be optimized away due to constant propagation. | |
| @unpack derivative_split = dg.basis | |
| # Calculate volume integral in one element | |
| for i in eachnode(dg) | |
| u_node = get_node_vars(u, equations, dg, i, element) | |
| # All diagonal entries of `derivative_split` are zero. Thus, we can skip | |
| # the computation of the diagonal terms. In addition, we use the symmetry | |
| # of the `volume_flux` to save half of the possible two-point flux | |
| # computations. | |
| # x direction | |
| for ii in (i + 1):nnodes(dg) | |
| u_node_ii = get_node_vars(u, equations, dg, ii, element) | |
| flux1 = volume_flux(u_node, u_node_ii, 1, equations) | |
| multiply_add_to_node_vars!(du, alpha * derivative_split[i, ii], flux1, | |
| equations, dg, i, element) | |
| multiply_add_to_node_vars!(du, alpha * derivative_split[ii, i], flux1, | |
| equations, dg, ii, element) | |
| end | |
| end | |
| end |
Trixi.jl/src/solvers/dgsem_tree/dg_1d.jl
Lines 209 to 233 in 0ffa20f
| @inline function fv_kernel!(du, u, | |
| mesh::Union{TreeMesh{1}, StructuredMesh{1}}, | |
| have_nonconservative_terms, equations, | |
| volume_flux_fv, dg::DGSEM, cache, element, alpha = true) | |
| @unpack fstar1_L_threaded, fstar1_R_threaded = cache | |
| @unpack inverse_weights = dg.basis # Plays role of inverse DG-subcell sizes | |
| # Calculate FV two-point fluxes | |
| fstar1_L = fstar1_L_threaded[Threads.threadid()] | |
| fstar1_R = fstar1_R_threaded[Threads.threadid()] | |
| calcflux_fv!(fstar1_L, fstar1_R, u, mesh, | |
| have_nonconservative_terms, equations, | |
| volume_flux_fv, dg, element, cache) | |
| # Calculate FV volume integral contribution | |
| for i in eachnode(dg) | |
| for v in eachvariable(equations) | |
| du[v, i, element] += (alpha * | |
| (inverse_weights[i] * | |
| (fstar1_L[v, i + 1] - fstar1_R[v, i]))) | |
| end | |
| end | |
| return nothing | |
| end |
In practical somewhat large scale simulations these memory accesses are actually comparably expensive, see for instance this 95M DoF Hyperbolic simulation, where resetting du is as expensive as applying the Jacobian:
────────────────────────────────────────────────────────────────────────────────────────────────────
Simulation running 'CompressibleEulerEquations3D' with DGSEM(polydeg=3)
────────────────────────────────────────────────────────────────────────────────────────────────────
#timesteps: 5890 run time: 4.16604353e+03 s
Δt: 1.02347931e-07 └── GC time: 4.31043791e-01 s (0.010%)
sim. time: 6.05000000e+00 (100.000%) time/DOF/rhs!: 5.46942834e-08 s
PID: 6.99376191e-08 s
#DOFs per field: 18869632 alloc'd memory: 17846.699 MiB
#elements: 294838
CL_p : 2.95117804e-01
────────────────────────────────────────────────────────────────────────────────────────────────────
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Trixi.jl Time Allocations
─────────────────────── ────────────────────────
Tot / % measured: 1.16h / 100.0% 651MiB / 99.3%
Section ncalls time %tot avg alloc %tot avg
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────
main loop 1 1.16h 100.0% 1.16h 643MiB 99.4% 643MiB
Paired Explicit Runge-Kutta ODE integration step 5.89k 1.15h 99.0% 701ms 427MiB 66.0% 74.2KiB
rhs! 29.4k 2326s 55.8% 79.0ms 111MiB 17.2% 3.86KiB
interface flux 29.4k 872s 20.9% 29.6ms 15.3MiB 2.4% 544B
volume integral 29.4k 757s 18.2% 25.7ms 13.9MiB 2.2% 496B
surface integral 29.4k 241s 5.8% 8.18ms 10.3MiB 1.6% 368B
prolong2interfaces 29.4k 241s 5.8% 8.17ms 8.54MiB 1.3% 304B
Jacobian 29.4k 93.6s 2.2% 3.18ms 12.6MiB 1.9% 448B
reset ∂u/∂t 29.4k 78.2s 1.9% 2.65ms 0.00B 0.0% 0.00B
boundary flux 29.4k 35.7s 0.9% 1.21ms 41.8MiB 6.5% 1.45KiB
prolong2boundaries 29.4k 7.07s 0.2% 240μs 8.54MiB 1.3% 304B
~rhs!~ 29.4k 1.18s 0.0% 40.1μs 9.33KiB 0.0% 0.32B
prolong2mortars 29.4k 17.1ms 0.0% 580ns 0.00B 0.0% 0.00B
mortar flux 29.4k 12.8ms 0.0% 433ns 0.00B 0.0% 0.00B
source terms 29.4k 3.02ms 0.0% 103ns 0.00B 0.0% 0.00B
rhs! (part.) 70.7k 910s 21.8% 12.9ms 273MiB 42.2% 3.95KiB
volume integral 70.7k 310s 7.4% 4.38ms 33.4MiB 5.2% 496B
interface flux 70.7k 290s 7.0% 4.10ms 38.8MiB 6.0% 576B
prolong2interfaces 70.7k 84.6s 2.0% 1.20ms 22.6MiB 3.5% 336B
boundary flux 70.7k 80.9s 1.9% 1.14ms 100MiB 15.5% 1.45KiB
surface integral 70.7k 69.5s 1.7% 984μs 24.8MiB 3.8% 368B
reset ∂u/∂t 70.7k 51.2s 1.2% 724μs 0.00B 0.0% 0.00B
Jacobian 70.7k 15.7s 0.4% 222μs 30.2MiB 4.7% 448B
prolong2boundaries 70.7k 6.59s 0.2% 93.2μs 22.6MiB 3.5% 336B
~rhs! (part.)~ 70.7k 1.38s 0.0% 19.6μs 9.33KiB 0.0% 0.14B
prolong2mortars 70.7k 36.0ms 0.0% 510ns 0.00B 0.0% 0.00B
mortar flux 70.7k 23.0ms 0.0% 326ns 0.00B 0.0% 0.00B
source terms 70.7k 9.27ms 0.0% 131ns 0.00B 0.0% 0.00B
~Paired Explicit Runge-Kutta ODE integration step~ 5.89k 891s 21.4% 151ms 43.1MiB 6.7% 7.48KiB
Step-Callbacks 5.89k 39.6s 0.9% 6.72ms 216MiB 33.4% 37.6KiB
calculate dt 2.94k 19.6s 0.5% 6.65ms 1.39MiB 0.2% 496B
~Step-Callbacks~ 5.89k 14.4s 0.3% 2.45ms 1.80MiB 0.3% 320B
analyze solution 59 5.60s 0.1% 94.8ms 213MiB 32.9% 3.61MiB
~main loop~ 1 44.8ms 0.0% 44.8ms 1.47KiB 0.0% 1.47KiB
analyze solution 1 192ms 0.0% 192ms 3.61MiB 0.6% 3.61MiB
calculate dt 1 9.35ms 0.0% 9.35ms 496B 0.0% 496B
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────── For this 460M DoF Hyperbolic-Parabolic simulation resetting gradients is as expensive as Jacobian, interface prolongation, interface flux computation(!)
────────────────────────────────────────────────────────────────────────────────────────────────────
Simulation running 'CompressibleEulerEquations3D' with DGSEM(polydeg=4)
────────────────────────────────────────────────────────────────────────────────────────────────────
#timesteps: 222 run time: 3.20091320e+02 s
Δt: 2.78788528e-04 └── GC time: 3.70423828e-01 s (0.116%)
sim. time: 1.00050000e+03 (100.000%) time/DOF/rhs!: 9.55724407e-08 s
PID: 1.10936055e-07 s
#DOFs per field: 26785500 alloc'd memory: 76887.935 MiB
#elements: 214284
CD_p : -7.82537637e+00
CD_f : 5.97248153e-02
CD_p : 2.49420750e+01
CD_f : 6.17264422e-02
────────────────────────────────────────────────────────────────────────────────────────────────────
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Trixi.jl Time Allocations
─────────────────────── ────────────────────────
Tot / % measured: 321s / 100.0% 6.02GiB / 100.0%
Section ncalls time %tot avg alloc %tot avg
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────
main loop 1 320s 99.8% 320s 5.03GiB 83.4% 5.03GiB
Paired Explicit Runge-Kutta ODE integration step 222 315s 98.1% 1.42s 37.3MiB 0.6% 172KiB
rhs_hyperbolic_parabolic! 444 234s 73.1% 528ms 5.00MiB 0.1% 11.5KiB
rhs_parabolic! 444 176s 54.9% 397ms 3.83MiB 0.1% 8.84KiB
calculate gradient 444 99.2s 30.9% 223ms 1.87MiB 0.0% 4.30KiB
surface integral 444 41.6s 13.0% 93.6ms 246KiB 0.0% 567B
volume integral 444 32.6s 10.2% 73.5ms 242KiB 0.0% 558B
reset gradients 444 6.63s 2.1% 14.9ms 0.00B 0.0% 0.00B
Jacobian 444 6.36s 2.0% 14.3ms 479KiB 0.0% 1.08KiB
interface flux 444 6.33s 2.0% 14.3ms 215KiB 0.0% 496B
prolong2interfaces 444 5.52s 1.7% 12.4ms 146KiB 0.0% 336B
boundary flux 444 120ms 0.0% 271μs 430KiB 0.0% 992B
prolong2boundaries 444 55.4ms 0.0% 125μs 146KiB 0.0% 336B
~calculate gradient~ 444 16.0ms 0.0% 36.0μs 7.34KiB 0.0% 16.9B
mortar flux 444 344μs 0.0% 774ns 0.00B 0.0% 0.00B
prolong2mortars 444 255μs 0.0% 575ns 0.00B 0.0% 0.00B
calculate viscous fluxes 444 24.6s 7.7% 55.4ms 271KiB 0.0% 624B
prolong2interfaces 444 18.8s 5.9% 42.3ms 215KiB 0.0% 496B
volume integral 444 13.8s 4.3% 31.1ms 242KiB 0.0% 558B
surface integral 444 5.13s 1.6% 11.6ms 160KiB 0.0% 368B
interface flux 444 4.82s 1.5% 10.9ms 173KiB 0.0% 400B
transform variables 444 4.05s 1.3% 9.12ms 146KiB 0.0% 336B
reset ∂u/∂t 444 2.83s 0.9% 6.37ms 6.48KiB 0.0% 14.9B
Jacobian 444 2.69s 0.8% 6.05ms 146KiB 0.0% 336B
prolong2boundaries 444 168ms 0.1% 379μs 215KiB 0.0% 496B
boundary flux 444 102ms 0.0% 229μs 430KiB 0.0% 992B
~rhs_parabolic!~ 444 20.6ms 0.0% 46.4μs 10.8KiB 0.0% 24.9B
prolong2mortars 444 288μs 0.0% 648ns 0.00B 0.0% 0.00B
mortar flux 444 205μs 0.0% 461ns 0.00B 0.0% 0.00B
rhs! 444 54.1s 16.9% 122ms 1.15MiB 0.0% 2.66KiB
volume integral 444 24.0s 7.5% 54.0ms 146KiB 0.0% 336B
interface flux 444 13.9s 4.3% 31.4ms 194KiB 0.0% 448B
prolong2interfaces 444 5.74s 1.8% 12.9ms 111KiB 0.0% 256B
surface integral 444 4.98s 1.6% 11.2ms 132KiB 0.0% 304B
Jacobian 444 2.69s 0.8% 6.06ms 132KiB 0.0% 304B
reset ∂u/∂t 444 2.46s 0.8% 5.54ms 0.00B 0.0% 0.00B
boundary flux 444 162ms 0.1% 365μs 347KiB 0.0% 800B
prolong2boundaries 444 83.3ms 0.0% 188μs 111KiB 0.0% 256B
~rhs!~ 444 15.8ms 0.0% 35.6μs 9.33KiB 0.0% 21.5B
prolong2mortars 444 375μs 0.0% 845ns 0.00B 0.0% 0.00B
mortar flux 444 212μs 0.0% 478ns 0.00B 0.0% 0.00B
source terms 444 60.5μs 0.0% 136ns 0.00B 0.0% 0.00B
~rhs_hyperbolic_parabolic!~ 444 4.07s 1.3% 9.17ms 15.3KiB 0.0% 35.4B
~Paired Explicit Runge-Kutta ODE integration step~ 222 55.2s 17.2% 249ms 1.52MiB 0.0% 7.00KiB
rhs_hyperbolic_parabolic! (part.) 2.66k 25.0s 7.8% 9.38ms 30.8MiB 0.5% 11.9KiB
calculate gradient 2.66k 8.18s 2.5% 3.07ms 11.3MiB 0.2% 4.35KiB
surface integral 2.66k 2.53s 0.8% 948μs 1.14MiB 0.0% 448B
volume integral 2.66k 2.32s 0.7% 872μs 1.46MiB 0.0% 576B
interface flux 2.66k 1.29s 0.4% 485μs 1.26MiB 0.0% 496B
reset gradients 2.66k 751ms 0.2% 282μs 0.00B 0.0% 0.00B
prolong2interfaces 2.66k 508ms 0.2% 191μs 957KiB 0.0% 368B
boundary flux 2.66k 461ms 0.1% 173μs 2.52MiB 0.0% 992B
Jacobian 2.66k 241ms 0.1% 90.4μs 3.05MiB 0.0% 1.17KiB
prolong2boundaries 2.66k 49.9ms 0.0% 18.7μs 957KiB 0.0% 368B
~calculate gradient~ 2.66k 21.4ms 0.0% 8.04μs 7.34KiB 0.0% 2.82B
mortar flux 2.66k 1.07ms 0.0% 401ns 0.00B 0.0% 0.00B
prolong2mortars 2.66k 929μs 0.0% 349ns 0.00B 0.0% 0.00B
calculate viscous fluxes 2.66k 4.03s 1.3% 1.51ms 1.59MiB 0.0% 624B
interface flux 5.33k 3.16s 1.0% 593μs 2.40MiB 0.0% 472B
volume integral 5.33k 2.76s 0.9% 519μs 2.32MiB 0.0% 456B
prolong2interfaces 5.33k 2.14s 0.7% 403μs 2.07MiB 0.0% 408B
boundary flux 5.33k 1.67s 0.5% 313μs 4.55MiB 0.1% 896B
transform variables 2.66k 1.27s 0.4% 479μs 957KiB 0.0% 368B
surface integral 5.33k 675ms 0.2% 127μs 1.79MiB 0.0% 352B
reset ∂u/∂t 5.33k 488ms 0.2% 91.5μs 0.00B 0.0% 0.00B
~rhs_hyperbolic_parabolic! (part.)~ 2.66k 303ms 0.1% 114μs 11.5KiB 0.0% 4.43B
Jacobian 5.33k 166ms 0.1% 31.2μs 1.79MiB 0.0% 352B
prolong2boundaries 5.33k 135ms 0.0% 25.3μs 2.07MiB 0.0% 408B
mortar flux 5.33k 2.28ms 0.0% 429ns 0.00B 0.0% 0.00B
prolong2mortars 5.33k 1.88ms 0.0% 352ns 0.00B 0.0% 0.00B
source terms 2.66k 338μs 0.0% 127ns 0.00B 0.0% 0.00B
Step-Callbacks 222 5.53s 1.7% 24.9ms 4.99GiB 82.8% 23.0MiB
analyze solution 5 3.42s 1.1% 684ms 4.99GiB 82.8% 1.00GiB
calculate dt 222 2.10s 0.7% 9.47ms 79.8KiB 0.0% 368B
~Step-Callbacks~ 222 6.43ms 0.0% 28.9μs 121KiB 0.0% 560B
~main loop~ 1 1.67ms 0.0% 1.67ms 1.47KiB 0.0% 1.47KiB
analyze solution 1 644ms 0.2% 644ms 1.00GiB 16.6% 1.00GiB
calculate dt 1 8.94ms 0.0% 8.94ms 368B 0.0% 368B
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────One way to implement this would be to add volume kernel functions which specialize on an additional argument similar to non_conservative_terms which is true or false.
Then, only for the true case the multiply_and_add_to_node_vars would be called, and in other cases the variables would simply set.
Currently, the only place where the addition to du is required is the stabilizing volume integral here:
Trixi.jl/src/solvers/dgsem/calc_volume_integral.jl
Lines 217 to 227 in 0ffa20f
| # Calculate DG volume integral contribution | |
| volume_integral_kernel!(du, u, element, mesh, | |
| have_nonconservative_terms, equations, | |
| volume_integral_blend_high_order, dg, cache, | |
| 1 - alpha_element) | |
| # Calculate FV volume integral contribution | |
| volume_integral_kernel!(du, u, element, mesh, | |
| have_nonconservative_terms, equations, | |
| volume_integral_blend_low_order, dg, cache, | |
| alpha_element) |