-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathCudaBVH.cpp
292 lines (229 loc) · 10.9 KB
/
CudaBVH.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
/*
* Copyright (c) 2009-2011, NVIDIA Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA Corporation nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "CudaBVH.h"
#include "SceneLoader.h" // required for triangles and vertices
//Nodes / BVHLayout_Compact (12 floats + 4 ints = 64 bytes)
// innernode contains two childnodes c0 and c1, each having x,y,z coords for AABBhi and AABBlo, 2*2*3 = 12 floats
//
// nodes[innerOfs + 0 ] = Vec4f(c0.lo.x, c0.hi.x, c0.lo.y, c0.hi.y) // 4 floats = 16 bytes
// nodes[innerOfs + 16] = Vec4f(c1.lo.x, c1.hi.x, c1.lo.y, c1.hi.y) // increment nodes array index with 16
// nodes[innerOfs + 32] = Vec4f(c0.lo.z, c0.hi.z, c1.lo.z, c1.hi.z)
// nodes[innerOfs + 48] = Vec4i(c0.innerOfs or ~c0.triOfs, c1.innerOfs or ~c1.triOfs, 0, 0) // either inner or leaf, two dummy zeros at the end
// CudaBVH Compact: Vec4f(c0.lo.x, c0.hi.x, c0.lo.y, c0.hi.y)
// CudaBVH Compact: Vec4f(c1.lo.x, c1.hi.x, c1.lo.y, c1.hi.y)
// CudaBVH Compact: Vec4f(c0.lo.z, c0.hi.z, c1.lo.z, c1.hi.z)
// CudaBVH Compact: Vec4f(c0.innerOfs or ~c0.triOfs, c1.innerOfs or ~c1.triOfs, 0, 0)
// BVH node bounds: c0.lo.x, c0.hi.x, c0.lo.y, c0.hi.y, c0.lo.z, c0.hi.z
static int woopcount = 0; // counts Woopified triangles
CudaBVH::CudaBVH(const BVH& bvh, BVHLayout layout)
: m_layout(layout)
{
FW_ASSERT(layout >= 0 && layout < BVHLayout_Max);
if (layout == BVHLayout_Compact)
{
createCompact(bvh, 1);
return;
}
if (layout == BVHLayout_Compact2)
{
createCompact(bvh, 16);
return;
}
}
CudaBVH::~CudaBVH(void)
{
}
//------------------------------------------------------------------------
Vec2i CudaBVH::getNodeSubArray(int idx) const
{
FW_ASSERT(idx >= 0 && idx < 4);
S32 size = (S32)m_nodes.getSize();
if (m_layout == BVHLayout_SOA_AOS || m_layout == BVHLayout_SOA_SOA)
return Vec2i((size >> 2) * idx, (size >> 2));
return Vec2i(0, size);
}
//------------------------------------------------------------------------
Vec2i CudaBVH::getTriWoopSubArray(int idx) const
{
FW_ASSERT(idx >= 0 && idx < 4);
S32 size = (S32)m_triWoop.getSize();
if (m_layout == BVHLayout_AOS_SOA || m_layout == BVHLayout_SOA_SOA)
return Vec2i((size >> 2) * idx, (size >> 2));
return Vec2i(0, size);
}
//------------------------------------------------------------------------
CudaBVH& CudaBVH::operator=(CudaBVH& other)
{
if (&other != this)
{
m_layout = other.m_layout;
m_nodes = other.m_nodes;
m_triWoop = other.m_triWoop;
m_triIndex = other.m_triIndex;
}
return *this;
}
namespace detail
{
struct StackEntry
{
const BVHNode* node;
S32 idx;
StackEntry(const BVHNode* n = NULL, int i = 0) : node(n), idx(i) {}
};
}
void CudaBVH::createCompact(const BVH& bvh, int nodeOffsetSizeDiv)
{
using namespace detail; // for StackEntry
int leafcount = 0; // counts leafnodes
// construct and initialize data arrays which will be copied to CudaBVH buffers (last part of this function).
Array<Vec4i> nodeData(NULL, 4);
Array<Vec4i> triWoopData;
Array<Vec4i> triDebugData; // array for regular (non-woop) triangles
Array<S32> triIndexData;
// construct a stack (array of stack entries) to help in filling the data arrays
Array<StackEntry> stack(StackEntry(bvh.getRoot(), 0)); // initialise stack to rootnode
while (stack.getSize()) // while stack is not empty
{
StackEntry e = stack.removeLast(); // pop the stack
FW_ASSERT(e.node->getNumChildNodes() == 2);
const AABB* cbox[2];
int cidx[2]; // stores indices to both children
// Process children.
// for each child in entry e
for (int i = 0; i < 2; i++)
{
const BVHNode* child = e.node->getChildNode(i); // current childnode
cbox[i] = &child->m_bounds; // current child's AABB
////////////////////////////
/// INNER NODE
//////////////////////////////
// Inner node => push to stack.
if (!child->isLeaf()) // no leaf, thus an inner node
{ // compute childindex
cidx[i] = nodeData.getNumBytes() / nodeOffsetSizeDiv; // nodeOffsetSizeDiv is 1 for Fermi kernel, 16 for Kepler kernel
// push the current child on the stack
stack.add(StackEntry(child, nodeData.getSize()));
nodeData.add(NULL, 4); /// adds 4 * Vec4i per inner node or 4 * 16 bytes/Vec4i = 64 bytes of empty data per inner node
continue; // process remaining childnode (if any)
}
//////////////////////
/// LEAF NODE
/////////////////////
// Leaf => append triangles.
const LeafNode* leaf = reinterpret_cast<const LeafNode*>(child);
// index of a leafnode is a negative number, hence the ~
cidx[i] = ~triWoopData.getSize(); // leafs must be stored as negative (bitwise complement) in order to be recognised by pathtracer as a leaf
// for each triangle in leaf, range of triangle index j from m_lo to m_hi
for (int j = leaf->m_lo; j < leaf->m_hi; j++)
{
// transform the triangle's vertices to Woop triangle (simple transform to right angled triangle, see paper by Sven Woop)
woopifyTri(bvh, j); /// j is de triangle index in triIndex array
if (m_woop[0].x == 0.0f) m_woop[0].x = 0.0f; // avoid degenerate coordinates
// add the transformed woop triangle to triWoopData
triWoopData.add((Vec4i*)m_woop, 3);
triDebugData.add((Vec4i*)m_debugtri, 3);
// add tri index for current triangle to triIndexData
triIndexData.add(bvh.getTriIndices()[j]);
triIndexData.add(0); // zero padding because CUDA kernel uses same index for vertex array (3 vertices per triangle)
triIndexData.add(0); // and array of triangle indices
}
// Leaf node terminator to indicate end of leaf, stores hexadecimal value 0x80000000 (= 2147483648 in decimal)
triWoopData.add(0x80000000); // leafnode terminator code indicates the last triangle of the leaf node
triDebugData.add(0x80000000);
// add extra zero to triangle indices array to indicate end of leaf
triIndexData.add(0); // terminates triIndexdata for current leaf
leafcount++;
}
// Write entry for current node.
/// 4 Vec4i per node (according to compact bvh node layout)
Vec4i* dst = nodeData.getPtr(e.idx);
///std::cout << "e.idx: " << e.idx << " cidx[0]: " << cidx[0] << " cidx[1]: " << cidx[1] << "\n";
dst[0] = Vec4i(floatToBits(cbox[0]->min().x), floatToBits(cbox[0]->max().x), floatToBits(cbox[0]->min().y), floatToBits(cbox[0]->max().y));
dst[1] = Vec4i(floatToBits(cbox[1]->min().x), floatToBits(cbox[1]->max().x), floatToBits(cbox[1]->min().y), floatToBits(cbox[1]->max().y));
dst[2] = Vec4i(floatToBits(cbox[0]->min().z), floatToBits(cbox[0]->max().z), floatToBits(cbox[1]->min().z), floatToBits(cbox[1]->max().z));
dst[3] = Vec4i(cidx[0], cidx[1], 0, 0);
} // end of while loop, will iteratively empty the stack
m_leafnodecount = leafcount;
m_tricount = woopcount;
// Write data arrays to arrays of CudaBVH
m_gpuNodes = (Vec4i*) malloc(nodeData.getNumBytes());
m_gpuNodesSize = nodeData.getSize();
for (int i = 0; i < nodeData.getSize(); i++){
m_gpuNodes[i].x = nodeData.get(i).x;
m_gpuNodes[i].y = nodeData.get(i).y;
m_gpuNodes[i].z = nodeData.get(i).z;
m_gpuNodes[i].w = nodeData.get(i).w; // child indices
}
m_gpuTriWoop = (Vec4i*) malloc(triWoopData.getSize() * sizeof(Vec4i));
m_gpuTriWoopSize = triWoopData.getSize();
for (int i = 0; i < triWoopData.getSize(); i++){
m_gpuTriWoop[i].x = triWoopData.get(i).x;
m_gpuTriWoop[i].y = triWoopData.get(i).y;
m_gpuTriWoop[i].z = triWoopData.get(i).z;
m_gpuTriWoop[i].w = triWoopData.get(i).w;
}
m_debugTri = (Vec4i*)malloc(triDebugData.getSize() * sizeof(Vec4i));
m_debugTriSize = triDebugData.getSize();
for (int i = 0; i < triDebugData.getSize(); i++){
m_debugTri[i].x = triDebugData.get(i).x;
m_debugTri[i].y = triDebugData.get(i).y;
m_debugTri[i].z = triDebugData.get(i).z;
m_debugTri[i].w = triDebugData.get(i).w;
}
m_gpuTriIndices = (S32*) malloc(triIndexData.getSize() * sizeof(S32));
m_gpuTriIndicesSize = triIndexData.getSize();
for (int i = 0; i < triIndexData.getSize(); i++){
m_gpuTriIndices[i] = triIndexData.get(i);
}
}
//------------------------------------------------------------------------
void CudaBVH::woopifyTri(const BVH& bvh, int triIdx)
{
woopcount++;
// fetch the 3 vertex indices of this triangle
const Vec3i& vtxInds = bvh.getScene()->getTriangle(bvh.getTriIndices()[triIdx]).vertices;
const Vec3f& v0 = bvh.getScene()->getVertex(vtxInds.x);
const Vec3f& v1 = bvh.getScene()->getVertex(vtxInds.y);
const Vec3f& v2 = bvh.getScene()->getVertex(vtxInds.z);
// regular triangles (for debugging only)
m_debugtri[0] = Vec4f(v0.x, v0.y, v0.z, 0.0f);
m_debugtri[1] = Vec4f(v1.x, v1.y, v1.z, 0.0f);
m_debugtri[2] = Vec4f(v2.x, v2.y, v2.z, 0.0f);
Mat4f mtx;
// compute edges and transform them with a matrix
mtx.setCol(0, Vec4f(v0 - v2, 0.0f)); // sets matrix column 0 equal to a Vec4f(Vec3f, 0.0f )
mtx.setCol(1, Vec4f(v1 - v2, 0.0f));
mtx.setCol(2, Vec4f(cross(v0 - v2, v1 - v2), 0.0f));
mtx.setCol(3, Vec4f(v2, 1.0f));
mtx = invert(mtx);
/// m_woop[3] stores 3 transformed triangle edges
m_woop[0] = Vec4f(mtx(2, 0), mtx(2, 1), mtx(2, 2), -mtx(2, 3)); // elements of 3rd row of inverted matrix
m_woop[1] = mtx.getRow(0);
m_woop[2] = mtx.getRow(1);
}
//------------------------------------------------------------------------