Skip to content

Commit a751a33

Browse files
author
David Svitov
committed
Implement StopThePop as an optional configuration
1 parent a976565 commit a751a33

File tree

13 files changed

+1238
-38
lines changed

13 files changed

+1238
-38
lines changed

cuda_rasterizer/auxiliary.h

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#define MIDDEPTH_OFFSET 5
2828
#define DISTORTION_OFFSET 6
2929
#define MEDIAN_WEIGHT_OFFSET 7
30+
#define OUTPUT_CHANNELS 8
3031

3132
// distortion helper macros
3233
#define BACKFACE_CULL 1
@@ -35,6 +36,16 @@
3536
#define FAR_PLANE 100.0
3637
#define DETACH_WEIGHT 0
3738

39+
// Compile-time switches and constants added for the optional StopThePop path.
// NOTE(review): TILE_SORTING is not referenced in the visible hunks; presumably
// it toggles StopThePop's per-tile sorting — confirm against the stopthepop_2DGS headers.
#define TILE_SORTING 0
40+
// When non-zero, FORWARD::render launches renderBufferCUDA and BACKWARD::render
// launches renderkBufferBackwardCUDA instead of the renderCUDA kernels.
#define PIXEL_RESORTING 0
41+
// NOTE(review): not used in the visible hunks; presumably the length of the
// per-pixel resorting buffer — confirm.
#define BUFFER_LENGTH 8
42+
43+
// When non-zero, preprocessCUDA computes the rectangle with getRect (per-axis
// extent) unless the radius exceeds MAX_BILLBOARD_SIZE; when zero it always
// uses getRectOld with the scalar radius.
#define FAST_INFERENCE 0
44+
// Radius threshold above which preprocessCUDA falls back to getRectOld even
// with FAST_INFERENCE enabled. NOTE(review): units presumably pixels — confirm.
#define MAX_BILLBOARD_SIZE 1000
45+
46+
// NOTE(review): presumably the number of threads per warp; not used in the visible hunks.
constexpr uint32_t WARP_SIZE = 32U;
47+
// All 32 bits set. NOTE(review): presumably the participant mask for *_sync
// warp intrinsics — confirm at the use sites.
constexpr uint32_t WARP_MASK = 0xFFFFFFFFU;
48+
3849
// Spherical harmonics coefficients
3950
__device__ const float SH_C0 = 0.28209479177387814f;
4051
__device__ const float SH_C1 = 0.4886025119029199f;
@@ -55,12 +66,33 @@ __device__ const float SH_C3[] = {
5566
-0.5900435899266435f
5667
};
5768

69+
// Exchanges the values held by `a` and `b` using one temporary copy.
// T must be copy-constructible and copy-assignable.
template<typename T>
__device__ void swap_T(T& a, T& b)
{
	const T saved_b = b;
	b = a;
	a = saved_b;
}
76+
5877
// Converts a normalized coordinate `v` to a pixel coordinate for an image
// extent of `S` pixels, computing ((v + 1) * S - 1) * 0.5.
// The literals are double, so the arithmetic is carried out in double
// precision and narrowed to float only on return.
__forceinline__ __device__ float ndc2Pix(float v, int S)
{
	const double shifted = v + 1.0;
	const double scaled = shifted * S - 1.0;
	return scaled * 0.5;
}
6281

63-
__forceinline__ __device__ void getRect(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid)
82+
83+
// Computes the block-index bounds covered by a footprint centred at `p` with
// per-axis half-extent `max_radius`.
//   rect_min = floor((p - max_radius) / BLOCK), clamped to [0, grid]
//   rect_max = ceil ((p + max_radius) / BLOCK), clamped to [0, grid]
// where BLOCK is BLOCK_X for the x axis and BLOCK_Y for the y axis.
__forceinline__ __device__ void getRect(const float2 p, float2 max_radius, uint2& rect_min, uint2& rect_max, dim3 grid)
{
	// Lower corner: round down before clamping.
	const int lo_x = (int)floorf((p.x - max_radius.x) / BLOCK_X);
	const int lo_y = (int)floorf((p.y - max_radius.y) / BLOCK_Y);
	rect_min.x = min(grid.x, max((int)0, lo_x));
	rect_min.y = min(grid.y, max((int)0, lo_y));

	// Upper corner: round up before clamping.
	const int hi_x = (int)ceilf((p.x + max_radius.x) / BLOCK_X);
	const int hi_y = (int)ceilf((p.y + max_radius.y) / BLOCK_Y);
	rect_max.x = min(grid.x, max((int)0, hi_x));
	rect_max.y = min(grid.y, max((int)0, hi_y));
}
94+
95+
__forceinline__ __device__ void getRectOld(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid)
6496
{
6597
rect_min = {
6698
min(grid.x, max((int)0, (int)((p.x - max_radius) / BLOCK_X))),
@@ -261,7 +293,7 @@ scale_to_mat(const float3 scale, const float glob_scale) {
261293
glm::mat3 S = glm::mat3(1.f);
262294
S[0][0] = glob_scale * scale.x;
263295
S[1][1] = glob_scale * scale.y;
264-
S[2][2] = glob_scale * scale.z;
296+
//S[2][2] = glob_scale * scale.z;
265297
return S;
266298
}
267299

@@ -276,4 +308,4 @@ throw std::runtime_error(cudaGetErrorString(ret)); \
276308
} \
277309
}
278310

279-
#endif
311+
#endif

cuda_rasterizer/backward.cu

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "auxiliary.h"
1515
#include <cooperative_groups.h>
1616
#include <cooperative_groups/reduce.h>
17+
#include "stopthepop_2DGS/resorted_render.cuh"
1718
namespace cg = cooperative_groups;
1819

1920
// Backward pass for conversion of spherical harmonics to RGB for
@@ -257,11 +258,6 @@ renderCUDA(
257258
float last_alpha = 0;
258259
float last_color[C] = { 0 };
259260

260-
// Gradient of pixel coordinate w.r.t. normalized
261-
// screen-space viewport coordinates (-1 to 1)
262-
const float ddelx_dx = 0.5 * W;
263-
const float ddely_dy = 0.5 * H;
264-
265261
// Traverse all Gaussians
266262
for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
267263
{
@@ -744,6 +740,8 @@ void BACKWARD::render(
744740
const float* depths,
745741
const float* final_Ts,
746742
const uint32_t* n_contrib,
743+
const float* out_color,
744+
const float* out_others,
747745
const float* dL_dpixels,
748746
const float* dL_depths,
749747
float * dL_dtransMat,
@@ -753,6 +751,34 @@ void BACKWARD::render(
753751
float* dL_dtexture_alpha,
754752
float* dL_dtexture_color)
755753
{
754+
#if PIXEL_RESORTING
755+
renderkBufferBackwardCUDA<NUM_CHANNELS> << <grid, block >> >(
756+
ranges,
757+
point_list,
758+
W, H,
759+
focal_x, focal_y,
760+
bg_color,
761+
texture_alpha,
762+
texture_color,
763+
texture_size,
764+
means2D,
765+
normal_array,
766+
transMats,
767+
colors,
768+
depths,
769+
final_Ts,
770+
n_contrib,
771+
out_color,
772+
out_others,
773+
dL_dpixels,
774+
dL_depths,
775+
dL_dtransMat,
776+
dL_dmean2D,
777+
dL_dnormal3D,
778+
dL_dcolors,
779+
dL_dtexture_alpha,
780+
dL_dtexture_color);
781+
#else
756782
renderCUDA<NUM_CHANNELS> << <grid, block >> >(
757783
ranges,
758784
point_list,
@@ -777,4 +803,5 @@ void BACKWARD::render(
777803
dL_dcolors,
778804
dL_dtexture_alpha,
779805
dL_dtexture_color);
806+
#endif
780807
}

cuda_rasterizer/backward.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ namespace BACKWARD
3737
const float* depths,
3838
const float* final_Ts,
3939
const uint32_t* n_contrib,
40+
const float* out_color,
41+
const float* out_others,
4042
const float* dL_dpixels,
4143
const float* dL_depths,
4244
float * dL_dtransMat,

cuda_rasterizer/forward.cu

Lines changed: 93 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
#include "forward.h"
1313
#include "grid_sample.h"
1414
#include "auxiliary.h"
15+
#include "stopthepop_2DGS/stopthepop_common.cuh"
16+
#include "stopthepop_2DGS/resorted_render.cuh"
1517
#include <cooperative_groups.h>
1618
#include <cooperative_groups/reduce.h>
1719
namespace cg = cooperative_groups;
@@ -181,6 +183,7 @@ __global__ void preprocessCUDA(int P, int D, int M,
181183
const float tan_fovx, const float tan_fovy,
182184
const float focal_x, const float focal_y,
183185
int* radii,
186+
float2* rects,
184187
float2* points_xy_image,
185188
float* depths,
186189
float* transMats,
@@ -233,9 +236,18 @@ __global__ void preprocessCUDA(int P, int D, int M,
233236
float radius = ceil(truncated_R * max(max(extent.x, extent.y), FilterSize));
234237

235238
uint2 rect_min, rect_max;
236-
getRect(center, radius, rect_min, rect_max, grid);
237-
if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0)
238-
return;
239+
#if FAST_INFERENCE
240+
if (radius > MAX_BILLBOARD_SIZE)
241+
getRectOld(center, radius, rect_min, rect_max, grid);
242+
else
243+
getRect(center, extent, rect_min, rect_max, grid);
244+
#else
245+
getRectOld(center, radius, rect_min, rect_max, grid);
246+
#endif
247+
248+
if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0) {
249+
return;
250+
}
239251

240252
// compute colors
241253
if (colors_precomp == nullptr) {
@@ -246,6 +258,7 @@ __global__ void preprocessCUDA(int P, int D, int M,
246258
}
247259

248260
depths[idx] = p_view.z;
261+
rects[idx] = extent;
249262
radii[idx] = (int)radius;
250263
points_xy_image[idx] = center;
251264
// store them in float4
@@ -299,7 +312,6 @@ renderCUDA(
299312

300313
// Allocate storage for batches of collectively fetched data.
301314
__shared__ int collected_id[BLOCK_SIZE];
302-
__shared__ float2 collected_xy[BLOCK_SIZE];
303315
__shared__ float3 collected_normal[BLOCK_SIZE];
304316
__shared__ float3 collected_Tu[BLOCK_SIZE];
305317
__shared__ float3 collected_Tv[BLOCK_SIZE];
@@ -319,7 +331,7 @@ renderCUDA(
319331
float dist1 = {0};
320332
float dist2 = {0};
321333
float distortion = {0};
322-
float median_depth = {0};
334+
float median_depth = {100};
323335
float median_weight = {0};
324336
float median_contributor = {-1};
325337

@@ -339,7 +351,6 @@ renderCUDA(
339351
{
340352
int coll_id = point_list[range.x + progress];
341353
collected_id[block.thread_rank()] = coll_id;
342-
collected_xy[block.thread_rank()] = points_xy_image[coll_id];
343354
collected_normal[block.thread_rank()] = normal_array[coll_id];
344355
collected_Tu[block.thread_rank()] = {transMats[9 * coll_id+0], transMats[9 * coll_id+1], transMats[9 * coll_id+2]};
345356
collected_Tv[block.thread_rank()] = {transMats[9 * coll_id+3], transMats[9 * coll_id+4], transMats[9 * coll_id+5]};
@@ -409,7 +420,7 @@ renderCUDA(
409420
float error = mapped_depth * mapped_depth * A + dist2 - 2 * mapped_depth * dist1;
410421
distortion += error * alpha * T;
411422

412-
if (T > 0.5) {
423+
if (T > 0.5 && alpha > 0.05) {
413424
median_depth = depth;
414425
median_weight = alpha * T;
415426
median_contributor = contributor;
@@ -484,25 +495,48 @@ void FORWARD::render(
484495
float* out_others,
485496
float* impact)
486497
{
487-
renderCUDA<NUM_CHANNELS> << <grid, block >> > (
488-
ranges,
489-
point_list,
490-
W, H,
491-
focal_x, focal_y,
492-
means2D,
493-
colors,
494-
texture_alpha,
495-
texture_color,
496-
texture_size,
497-
transMats,
498-
depths,
499-
normal_array,
500-
final_T,
501-
n_contrib,
502-
bg_color,
503-
out_color,
504-
out_others,
505-
impact);
498+
499+
#if PIXEL_RESORTING
500+
renderBufferCUDA<NUM_CHANNELS> << <grid, block >> > (
501+
ranges,
502+
point_list,
503+
W, H,
504+
focal_x, focal_y,
505+
means2D,
506+
colors,
507+
texture_alpha,
508+
texture_color,
509+
texture_size,
510+
transMats,
511+
depths,
512+
normal_array,
513+
final_T,
514+
n_contrib,
515+
bg_color,
516+
out_color,
517+
out_others,
518+
impact);
519+
#else
520+
renderCUDA<NUM_CHANNELS> << <grid, block >> > (
521+
ranges,
522+
point_list,
523+
W, H,
524+
focal_x, focal_y,
525+
means2D,
526+
colors,
527+
texture_alpha,
528+
texture_color,
529+
texture_size,
530+
transMats,
531+
depths,
532+
normal_array,
533+
final_T,
534+
n_contrib,
535+
bg_color,
536+
out_color,
537+
out_others,
538+
impact);
539+
#endif
506540
}
507541

508542
void FORWARD::preprocess(int P, int D, int M,
@@ -521,6 +555,7 @@ void FORWARD::preprocess(int P, int D, int M,
521555
const float focal_x, const float focal_y,
522556
const float tan_fovx, const float tan_fovy,
523557
int* radii,
558+
float2* rects,
524559
float2* means2D,
525560
float* depths,
526561
float* transMats,
@@ -547,6 +582,7 @@ void FORWARD::preprocess(int P, int D, int M,
547582
tan_fovx, tan_fovy,
548583
focal_x, focal_y,
549584
radii,
585+
rects,
550586
means2D,
551587
depths,
552588
transMats,
@@ -557,3 +593,34 @@ void FORWARD::preprocess(int P, int D, int M,
557593
prefiltered
558594
);
559595
}
596+
597+
// Host-side launcher for the duplicateWithKeys_extended<false, true> kernel.
// Launches ceil(P / 256) blocks of 256 threads on the default stream and
// forwards every argument to the kernel unchanged.
// NOTE(review): the kernel is defined outside this file (presumably in one of
// the stopthepop_2DGS headers) and presumably fills gaussian_keys_unsorted /
// gaussian_values_unsorted — confirm against its definition.
void FORWARD::duplicate(
	int P,
	int W, int H,
	const float focal_x, const float focal_y,
	const float2* means2D,
	const float* depths,
	const float2* scales,
	const float* view2gaussians,
	const uint32_t* offsets,
	const int* radii,
	const float2* rects,
	uint64_t* gaussian_keys_unsorted,
	uint32_t* gaussian_values_unsorted,
	dim3 grid)
{
	// A launch with zero blocks is an invalid configuration and would leave an
	// error for the next CUDA error check to trip over. With no elements there
	// is nothing to process, so skip the launch.
	if (P <= 0)
		return;

	constexpr int THREADS_PER_BLOCK = 256;
	const int num_blocks = (P + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

	duplicateWithKeys_extended<false, true> << <num_blocks, THREADS_PER_BLOCK >> >(
		P, W, H, focal_x, focal_y,
		means2D,
		depths,
		scales,
		view2gaussians,
		offsets,
		radii,
		rects,
		gaussian_keys_unsorted,
		gaussian_values_unsorted,
		grid
	);
}

cuda_rasterizer/forward.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ namespace FORWARD
3737
const float focal_x, float focal_y,
3838
const float tan_fovx, float tan_fovy,
3939
int* radii,
40+
float2* rects,
4041
float2* points_xy_image,
4142
float* depths,
4243
float* transMats,
@@ -67,6 +68,21 @@ namespace FORWARD
6768
float* out_color,
6869
float* out_others,
6970
float* impact);
71+
72+
void duplicate(
73+
int P,
74+
int W, int H,
75+
const float focal_x, const float focal_y,
76+
const float2 *means2D,
77+
const float* depths,
78+
const float2* scales,
79+
const float* view2gaussians,
80+
const uint32_t* offsets,
81+
const int* radii,
82+
const float2* rects,
83+
uint64_t* gaussian_keys_unsorted,
84+
uint32_t* gaussian_values_unsorted,
85+
dim3 grid);
7086
}
7187

7288

cuda_rasterizer/rasterizer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ namespace CudaRasterizer
7575
const float* campos,
7676
const float tan_fovx, float tan_fovy,
7777
const int* radii,
78+
const float* out_color,
79+
const float* out_others,
7880
char* geom_buffer,
7981
char* binning_buffer,
8082
char* image_buffer,

0 commit comments

Comments
 (0)