cuda

How to avoid using local memory implicitly in CUDA?


I am working on a CUDA software path tracing renderer, and now I am stuck in the problem of suboptimal L1TEX local load/store access pattern. NCU told me that the bottleneck is within the following code (metric is L2 Theoretical Sectors Local). The following code is basically a stackless linear BVH traversal, and I marked the local memory access with \**\ comments.

__device__ float ray_intersect_bvh(
    const Ray& ray,
    const cudaTextureObject_t bvh_leaves,
    const cudaTextureObject_t node_fronts,
    const cudaTextureObject_t node_backs,
    ConstF4Ptr cached_nodes,
    const PrecomputedArray& verts,
    int& min_index,
    int& min_obj_idx,
    float& prim_u,
    float& prim_v,
    const int node_num,
    const int cache_num,
    float min_dist
) {
    bool valid_cache = false;         // whether we find valid node-intersection in cached nodes
    int node_idx     = 0;             // BVH tree node index
    float aabb_tmin  = 0;             // minimum intersection time of AABB (positive)
    // The following while lobe checks cached (in shared memory) node intersection
    // near root layers of the BVH tree are cached for faster access
    while (node_idx < cache_num && !valid_cache) {
        const LinearNode node(
            cached_nodes[node_idx],
            cached_nodes[node_idx + cache_num]
        );
        bool intersect_node = node.aabb.intersect(ray, aabb_tmin) && aabb_tmin < min_dist;
        int all_offset = node.aabb.base(), gmem_index = node.aabb.prim_cnt();
        int increment = (!intersect_node) * all_offset + (intersect_node && all_offset != 1) * 1;

        node_idx += increment;

        if (intersect_node && all_offset == 1) {
            valid_cache = true;
            node_idx = gmem_index;
        }
    }
    // if we find a valid intersection in cached nodes, we continue traversal in texture memory
    if (valid_cache) {
        while (node_idx < node_num) {
            const LinearNode node(tex1Dfetch<float4>(node_fronts, node_idx),
                            tex1Dfetch<float4>(node_backs, node_idx));
            bool intersect_node = node.aabb.intersect(ray, aabb_tmin) && aabb_tmin < min_dist;
            int beg_idx = 0, end_idx = 0;
            node.get_range(beg_idx, end_idx);
            
            int increment = (!intersect_node) * (end_idx < 0 ? -end_idx : 1) + int(intersect_node);
            if (intersect_node && end_idx > 0) {
                end_idx += beg_idx;
                // For BVH leaf node: traverse all the triangles within the range
                for (int idx = beg_idx; idx < end_idx; idx ++) {   /* Up to 44%.12 L2 theoretical sectors are accessed for this line */
                    /* SASS for the above line: LDL.LU, load 32 bit. */
                    bool valid  = false;
                    int2 obj_prim_idx = tex1Dfetch<int2>(bvh_leaves, idx);
                    float it_u = 0, it_v = 0, dist = Primitive::intersect(ray, verts, obj_prim_idx.y, it_u, it_v, obj_prim_idx.x >= 0);
                    valid = dist > EPSILON && dist < min_dist;
                    min_dist = valid ? dist : min_dist;
                    prim_u   = valid ? it_u : prim_u;
                    prim_v   = valid ? it_v : prim_v;
                    min_index = valid ? obj_prim_idx.y : min_index;
                    min_obj_idx = valid ? obj_prim_idx.x : min_obj_idx;
                }
            }
            node_idx += increment;
        }
    }
    return min_dist;
}

Also, the AABB intersection test has local memory issues, too. This is more severe, as NCU reported a bigger issue in local memory store pattern:

__device__ bool intersect(const Ray& ray, float& t_near) const {
    // Vec3 is a simple encapsulation of float3
    Vec3 invDir = ray.d.rcp();
    Vec3 t1s = (mini - ray.o) * invDir;             // long scoreboard
    Vec3 t2s = (maxi - ray.o) * invDir;

    float tmin = t1s.minimize(t2s).max_elem();
    float tmax = t1s.maximize(t2s).min_elem();
    t_near = tmin;
    return (tmax > tmin) && (tmax > 0);             /* SASS: STL, store 32 bit. local memory access problem */
}

Some other contexts:

I wonder why compiler does this implicit local memory access, where I think there should be sufficient registers to use. I also want to know how to avoid this. I know that there is a keyword register in CPP, though it is mostly useless, it kind of implies that there is way to hint the the compiler.

The profiling report (with source attached within the report) can be found here, using NCU (I am using version 2024.3.1) can open it. In source, check out the L2 Theoretical Sectors Local.

The code itself can be found here.


Solution

  • Looking at your Github repository, you enabled separate compilation, which forces the compiler to spill some registers to local memory.

    I can reproduce this locally and I can confirm that, when disabling separate compilation, local memory accesses are gone. You can verify this on Godbolt:

    More information on the performance impact of separate compilation can be found in this post on NVIDIA developers blog.