david-svitov
diff --git a/‎cuda_rasterizer/auxiliary.h‎
Lines changed: 35 additions & 3 deletions b/‎cuda_rasterizer/auxiliary.h‎
Lines changed: 35 additions & 3 deletions
diff --git a/‎cuda_rasterizer/backward.cu‎
Lines changed: 32 additions & 5 deletions b/‎cuda_rasterizer/backward.cu‎
Lines changed: 32 additions & 5 deletions
diff --git a/‎cuda_rasterizer/backward.h‎
Lines changed: 2 additions & 0 deletions b/‎cuda_rasterizer/backward.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎cuda_rasterizer/forward.cu‎
Lines changed: 93 additions & 26 deletions b/‎cuda_rasterizer/forward.cu‎
Lines changed: 93 additions & 26 deletions
diff --git a/‎cuda_rasterizer/forward.h‎
Lines changed: 16 additions & 0 deletions b/‎cuda_rasterizer/forward.h‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎cuda_rasterizer/rasterizer.h‎
Lines changed: 2 additions & 0 deletions b/‎cuda_rasterizer/rasterizer.h‎
Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,7 @@
 #define MIDDEPTH_OFFSET 5
 #define DISTORTION_OFFSET 6
 #define MEDIAN_WEIGHT_OFFSET 7
+#define OUTPUT_CHANNELS 8
 
 // distortion helper macros
 #define BACKFACE_CULL 1
@@ -35,6 +36,16 @@
 #define FAR_PLANE 100.0
 #define DETACH_WEIGHT 0
 
+#define TILE_SORTING 0 // 0/1 compile-time flag; no use is visible in this patch (NOTE(review): presumably read by the stopthepop_2DGS headers - confirm)
+#define PIXEL_RESORTING 0 // non-zero: FORWARD::render / BACKWARD::render launch renderBufferCUDA / renderkBufferBackwardCUDA instead of renderCUDA
+#define BUFFER_LENGTH 8 // no use is visible in this patch (NOTE(review): presumably the per-pixel resorting buffer size - confirm)
+
+#define FAST_INFERENCE 0 // non-zero: preprocessCUDA calls the float2 getRect(center, extent, ...) unless radius > MAX_BILLBOARD_SIZE
+#define MAX_BILLBOARD_SIZE 1000 // radius above which the FAST_INFERENCE path falls back to getRectOld
+
+constexpr uint32_t WARP_SIZE = 32U; // threads per warp
+constexpr uint32_t WARP_MASK = 0xFFFFFFFFU; // all 32 bits set (NOTE(review): presumably the full-warp mask for *_sync intrinsics - confirm at use sites)
+
 // Spherical harmonics coefficients
 __device__ const float SH_C0 = 0.28209479177387814f;
 __device__ const float SH_C1 = 0.4886025119029199f;
@@ -55,12 +66,33 @@ __device__ const float SH_C3[] = {
 	-0.5900435899266435f
 };
 
+template<typename T>
+__device__ void swap_T(T& a, T& b) // exchanges the values held by a and b
+{
+	const T held_b = b; // keep b's old value while b is overwritten
+	b = a;
+	a = held_b;
+}
+
 __forceinline__ __device__ float ndc2Pix(float v, int S)
 {
 	return ((v + 1.0) * S - 1.0) * 0.5;
 }
 
-__forceinline__ __device__ void getRect(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid)
+
+__forceinline__ __device__ void getRect(const float2 p, float2 max_radius, uint2& rect_min, uint2& rect_max, dim3 grid)
+{
+	// Tile-space bounds of the box p +/- max_radius: floor for the lower corner, ceil for the upper.
+	const int lo_x = (int)floorf((p.x - max_radius.x) / BLOCK_X);
+	const int lo_y = (int)floorf((p.y - max_radius.y) / BLOCK_Y);
+	const int hi_x = (int)ceilf((p.x + max_radius.x) / BLOCK_X);
+	const int hi_y = (int)ceilf((p.y + max_radius.y) / BLOCK_Y);
+	// Clamp every index into [0, grid] so the rectangle never leaves the tile grid.
+	rect_min = { min(grid.x, max((int)0, lo_x)), min(grid.y, max((int)0, lo_y)) };
+	rect_max = { min(grid.x, max((int)0, hi_x)), min(grid.y, max((int)0, hi_y)) };
+}
+
+__forceinline__ __device__ void getRectOld(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid)
 {
 	rect_min = {
 		min(grid.x, max((int)0, (int)((p.x - max_radius) / BLOCK_X))),
@@ -261,7 +293,7 @@ scale_to_mat(const float3 scale, const float glob_scale) {
 	glm::mat3 S = glm::mat3(1.f);
 	S[0][0] = glob_scale * scale.x;
 	S[1][1] = glob_scale * scale.y;
-	S[2][2] = glob_scale * scale.z;
+	//S[2][2] = glob_scale * scale.z;
 	return S;
 }
 
@@ -276,4 +308,4 @@ throw std::runtime_error(cudaGetErrorString(ret)); \
 } \
 }
 
-#endif
+#endif
@@ -14,6 +14,7 @@
 #include "auxiliary.h"
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
+#include "stopthepop_2DGS/resorted_render.cuh"
 namespace cg = cooperative_groups;
 
 // Backward pass for conversion of spherical harmonics to RGB for
@@ -257,11 +258,6 @@ renderCUDA(
 	float last_alpha = 0;
 	float last_color[C] = { 0 };
 
-	// Gradient of pixel coordinate w.r.t. normalized 
-	// screen-space viewport corrdinates (-1 to 1)
-	const float ddelx_dx = 0.5 * W;
-	const float ddely_dy = 0.5 * H;
-
 	// Traverse all Gaussians
 	for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
 	{
@@ -744,6 +740,8 @@ void BACKWARD::render(
 	const float* depths,
 	const float* final_Ts,
 	const uint32_t* n_contrib,
+	const float* out_color,
+	const float* out_others,
 	const float* dL_dpixels,
 	const float* dL_depths,
 	float * dL_dtransMat,
@@ -753,6 +751,34 @@ void BACKWARD::render(
 	float* dL_dtexture_alpha,
 	float* dL_dtexture_color)
 {
+#if PIXEL_RESORTING
+    renderkBufferBackwardCUDA<NUM_CHANNELS> << <grid, block >> >(
+		ranges,
+		point_list,
+		W, H,
+		focal_x, focal_y,
+		bg_color,
+		texture_alpha,
+		texture_color,
+		texture_size,
+		means2D,
+		normal_array,
+		transMats,
+		colors,
+		depths,
+		final_Ts,
+		n_contrib,
+		out_color,
+	    out_others,
+		dL_dpixels,
+		dL_depths,
+		dL_dtransMat,
+		dL_dmean2D,
+		dL_dnormal3D,
+		dL_dcolors,
+		dL_dtexture_alpha,
+		dL_dtexture_color);
+#else
 	renderCUDA<NUM_CHANNELS> << <grid, block >> >(
 		ranges,
 		point_list,
@@ -777,4 +803,5 @@ void BACKWARD::render(
 		dL_dcolors,
 		dL_dtexture_alpha,
 		dL_dtexture_color);
+#endif
 }
@@ -37,6 +37,8 @@ namespace BACKWARD
 		const float* depths,
 		const float* final_Ts,
 		const uint32_t* n_contrib,
+		const float* out_color,
+		const float* out_others,
 		const float* dL_dpixels,
 		const float* dL_depths,
 		float * dL_dtransMat,
 
@@ -12,6 +12,8 @@
 #include "forward.h"
 #include "grid_sample.h"
 #include "auxiliary.h"
+#include "stopthepop_2DGS/stopthepop_common.cuh"
+#include "stopthepop_2DGS/resorted_render.cuh"
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 namespace cg = cooperative_groups;
@@ -181,6 +183,7 @@ __global__ void preprocessCUDA(int P, int D, int M,
 	const float tan_fovx, const float tan_fovy,
 	const float focal_x, const float focal_y,
 	int* radii,
+    float2* rects,
 	float2* points_xy_image,
 	float* depths,
 	float* transMats,
@@ -233,9 +236,18 @@ __global__ void preprocessCUDA(int P, int D, int M,
 	float radius = ceil(truncated_R * max(max(extent.x, extent.y), FilterSize));
 
 	uint2 rect_min, rect_max;
-	getRect(center, radius, rect_min, rect_max, grid);
-	if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0)
-		return;
+#if FAST_INFERENCE
+	if (radius > MAX_BILLBOARD_SIZE)
+	    getRectOld(center, radius, rect_min, rect_max, grid);
+	else
+	    getRect(center, extent, rect_min, rect_max, grid);
+#else
+    getRectOld(center, radius, rect_min, rect_max, grid);
+#endif
+
+	if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0) {
+    		return;
+	}
 
 	// compute colors 
 	if (colors_precomp == nullptr) {
@@ -246,6 +258,7 @@ __global__ void preprocessCUDA(int P, int D, int M,
 	}
 
 	depths[idx] = p_view.z;
+    rects[idx] = extent;
 	radii[idx] = (int)radius;
 	points_xy_image[idx] = center;
 	// store them in float4
@@ -299,7 +312,6 @@ renderCUDA(
 
 	// Allocate storage for batches of collectively fetched data.
 	__shared__ int collected_id[BLOCK_SIZE];
-	__shared__ float2 collected_xy[BLOCK_SIZE];
 	__shared__ float3 collected_normal[BLOCK_SIZE];
 	__shared__ float3 collected_Tu[BLOCK_SIZE];
 	__shared__ float3 collected_Tv[BLOCK_SIZE];
@@ -319,7 +331,7 @@ renderCUDA(
 	float dist1 = {0};
 	float dist2 = {0};
 	float distortion = {0};
-	float median_depth = {0};
+	float median_depth = {100};
 	float median_weight = {0};
 	float median_contributor = {-1};
 
@@ -339,7 +351,6 @@ renderCUDA(
 		{
 			int coll_id = point_list[range.x + progress];
 			collected_id[block.thread_rank()] = coll_id;
-			collected_xy[block.thread_rank()] = points_xy_image[coll_id];
 			collected_normal[block.thread_rank()] = normal_array[coll_id];
 			collected_Tu[block.thread_rank()] = {transMats[9 * coll_id+0], transMats[9 * coll_id+1], transMats[9 * coll_id+2]};
 			collected_Tv[block.thread_rank()] = {transMats[9 * coll_id+3], transMats[9 * coll_id+4], transMats[9 * coll_id+5]};
@@ -409,7 +420,7 @@ renderCUDA(
 			float error = mapped_depth * mapped_depth * A + dist2 - 2 * mapped_depth * dist1;
 			distortion += error * alpha * T;
 
-			if (T > 0.5) {
+			if (T > 0.5 && alpha > 0.05) {
 				median_depth = depth;
 				median_weight = alpha * T;
 				median_contributor = contributor;
@@ -484,25 +495,48 @@ void FORWARD::render(
 	float* out_others,
 	float* impact)
 {
-	renderCUDA<NUM_CHANNELS> << <grid, block >> > (
-		ranges,
-		point_list,
-		W, H,
-		focal_x, focal_y,
-		means2D,
-		colors,
-		texture_alpha,
-		texture_color,
-		texture_size,
-		transMats,
-		depths,
-		normal_array,
-		final_T,
-		n_contrib,
-		bg_color,
-		out_color,
-		out_others,
-		impact);
+
+#if PIXEL_RESORTING
+    renderBufferCUDA<NUM_CHANNELS> << <grid, block >> > (
+	    ranges,
+	    point_list,
+	    W, H,
+	    focal_x, focal_y,
+	    means2D,
+	    colors,
+	    texture_alpha,
+	    texture_color,
+	    texture_size,
+	    transMats,
+	    depths,
+	    normal_array,
+	    final_T,
+	    n_contrib,
+	    bg_color,
+	    out_color,
+	    out_others,
+	    impact);
+#else
+    renderCUDA<NUM_CHANNELS> << <grid, block >> > (
+	    ranges,
+	    point_list,
+	    W, H,
+	    focal_x, focal_y,
+	    means2D,
+	    colors,
+	    texture_alpha,
+	    texture_color,
+	    texture_size,
+	    transMats,
+	    depths,
+	    normal_array,
+	    final_T,
+	    n_contrib,
+	    bg_color,
+	    out_color,
+	    out_others,
+	    impact);
+#endif
 }
 
 void FORWARD::preprocess(int P, int D, int M,
@@ -521,6 +555,7 @@ void FORWARD::preprocess(int P, int D, int M,
 	const float focal_x, const float focal_y,
 	const float tan_fovx, const float tan_fovy,
 	int* radii,
+    float2* rects,
 	float2* means2D,
 	float* depths,
 	float* transMats,
@@ -547,6 +582,7 @@ void FORWARD::preprocess(int P, int D, int M,
 		tan_fovx, tan_fovy,
 		focal_x, focal_y,
 		radii,
+		rects,
 		means2D,
 		depths,
 		transMats,
@@ -557,3 +593,34 @@ void FORWARD::preprocess(int P, int D, int M,
 		prefiltered
 		);
 }
+
+// Launches duplicateWithKeys_extended<false, true> over P elements using
+// 256-thread blocks ((P + 255) / 256 of them); all arguments are forwarded.
+void FORWARD::duplicate(
+	int P,
+	int W, int H,
+	const float focal_x, const float focal_y,
+	const float2* means2D,
+	const float* depths,
+	const float2* scales,
+	const float* view2gaussians,
+	const uint32_t* offsets,
+	const int* radii,
+	const float2* rects,
+	uint64_t* gaussian_keys_unsorted,
+	uint32_t* gaussian_values_unsorted,
+	dim3 grid)
+{
+	// A launch with zero blocks is an invalid configuration, so skip it
+	// when there is nothing to process.
+	if (P <= 0)
+		return;
+
+	duplicateWithKeys_extended<false, true> << <(P + 255) / 256, 256 >> >(
+		P, W, H, focal_x, focal_y,
+		means2D, depths, scales, view2gaussians,
+		offsets, radii, rects,
+		gaussian_keys_unsorted,
+		gaussian_values_unsorted,
+		grid);
+}
@@ -37,6 +37,7 @@ namespace FORWARD
 		const float focal_x, float focal_y,
 		const float tan_fovx, float tan_fovy,
 		int* radii,
+		float2* rects,
 		float2* points_xy_image,
 		float* depths,
 		float* transMats,
@@ -67,6 +68,21 @@ namespace FORWARD
 		float* out_color,
 		float* out_others,
 		float* impact);
+		
+    void duplicate(
+		int P,
+		int W, int H,
+		const float focal_x, const float focal_y,
+		const float2 *means2D,
+		const float* depths,
+		const float2* scales,
+		const float* view2gaussians,
+		const uint32_t* offsets,
+		const int* radii,
+		const float2* rects,
+		uint64_t* gaussian_keys_unsorted,
+		uint32_t* gaussian_values_unsorted,
+		dim3 grid);
 }
 
 
 
@@ -75,6 +75,8 @@ namespace CudaRasterizer
 			const float* campos,
 			const float tan_fovx, float tan_fovy,
 			const int* radii,
+			const float* out_color,
+			const float* out_others,
 			char* geom_buffer,
 			char* binning_buffer,
 			char* image_buffer,