Comments (6)
Here is some more.
#2 0x00007ffff7c64729 in __assert_fail_base (fmt=0x7ffff7dfa588 <_nl_C_codeset+16> "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x7fff3a80c5eb <cutlass::NUM_THREADS_PER_QUAD_PAIR+40171> "false",
file=0x7fff3a80c590 <cutlass::NUM_THREADS_PER_QUAD_PAIR+40080> "/home/dronelab/instant-ngp/dependencies/tiny-cuda-nn/include/tiny-cuda-nn/cutlass_matmul.h", line=363,
function=0x7fff3a811490 <cutlass::NUM_THREADS_PER_QUAD_PAIR+60304> "void tcnn::fc_multiply_impl(cudaStream_t, const typename Gemm::Arguments&) [with Gemm = cutlass::gemm::device::Gemm<cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::layout::Column"...) at assert.c:92
#3 0x00007ffff7c76006 in __GI___assert_fail (assertion=0x7fff3a80c5eb <cutlass::NUM_THREADS_PER_QUAD_PAIR+40171> "false",
file=0x7fff3a80c590 <cutlass::NUM_THREADS_PER_QUAD_PAIR+40080> "/home/dronelab/instant-ngp/dependencies/tiny-cuda-nn/include/tiny-cuda-nn/cutlass_matmul.h", line=363,
function=0x7fff3a811490 <cutlass::NUM_THREADS_PER_QUAD_PAIR+60304> "void tcnn::fc_multiply_impl(cudaStream_t, const typename Gemm::Arguments&) [with Gemm = cutlass::gemm::device::Gemm<cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::layout::Column"...) at assert.c:101
#4 0x00007fff3a6aa0a6 in void tcnn::fc_multiply_impl<cutlass::gemm::device::Gemm<cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false>, cutlass::gemm::GemmShape<16, 8, 8, false>, tcnn::ActivationEpilogue<cutlass::half_t, 8, cutlass::half_t, cutlass::half_t, (cutlass::FloatRoundStyle)2>, cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, 2, 8, 8, false, cutlass::arch::OpMultiplyAdd, false> >(CUstream_st*, cutlass::gemm::device::Gemm<cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false>, cutlass::gemm::GemmShape<16, 8, 8, false>, tcnn::ActivationEpilogue<cutlass::half_t, 8, cutlass::half_t, cutlass::half_t, (cutlass::FloatRoundStyle)2>, cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, 2, 8, 8, false, cutlass::arch::OpMultiplyAdd, false>::Arguments const&) ()
from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#5 0x00007fff3a696980 in void tcnn::fc_multiply<tcnn::LayerConfig<cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false> >, __half, (tcnn::MatrixLayout)0, __half, (tcnn::MatrixLayout)1, __half, (tcnn::MatrixLayout)0, __half, (tcnn::MatrixLayout)0>(CUstream_st*, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)1> const&, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0>&, tcnn::Activation, bool, bool) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#6 0x00007fff3a690f01 in void tcnn::fc_multiply<tcnn::LayerConfig<cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false> >, __half, (tcnn::MatrixLayout)0, __half, (tcnn::MatrixLayout)1, __half, __half>(CUstream_st*, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)1> const&, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, tcnn::Activation, bool, bool) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#7 0x00007fff3a68c6a6 in void tcnn::fc_multiply<tcnn::LayerConfig<cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false> >, __half, (tcnn::MatrixLayout)0, __half, __half, __half>(CUstream_st*, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, tcnn::Activation, bool, bool)
() from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#8 0x00007fff3a688bf7 in void tcnn::fc_multiply<tcnn::LayerConfig<cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false> >, __half, (tcnn::MatrixLayout)0, __half, __half>(CUstream_st*, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, tcnn::Activation) ()
from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#9 0x00007fff3a6d34ef in bool tcnn::compute_layer<tcnn::LayerConfig<cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false> >, __half>(CUstream_st*, bool, tcnn::Activation, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, tcnn::GPUMatrixDynamic<__half>&) ()
from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#10 0x00007fff3a6d3388 in bool tcnn::compute_inference_layer<tcnn::LayerConfig<cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false> >, __half>(CUstream_st*, tcnn::Activation, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#11 0x00007fff3a6d327f in tcnn::CutlassMLP<__half>::inference_mixed_precision(CUstream_st*, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, bool)::{lambda()#1}::operator()() const ()
from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#12 0x00007fff3a6d3b4c in std::_Function_handler<void (), tcnn::CutlassMLP<__half>::inference_mixed_precision(CUstream_st*, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, bool)::{lambda()#1}>::_M_invoke(std::_Any_data const&) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#13 0x00007fff3a220574 in std::function<void ()>::operator()() const (this=0x7fffffffcec0) at /usr/include/c++/9/bits/std_function.h:688
#14 0x00007fff3a3ca70a in tcnn::CudaGraph::capture_and_execute(CUstream_st*, bool, std::function<void ()>) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#15 0x00007fff3a6d01b7 in tcnn::CutlassMLP<__half>::inference_mixed_precision(CUstream_st*, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, bool) ()
from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#16 0x00007fff3a3e98ad in ngp::SHNerfNetwork<__half>::density(CUstream_st*, tcnn::PitchedPtr<float const> const&, tcnn::GPUMatrixDynamic<__half>&, bool) ()
from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#17 0x00007fff3a3d5b23 in ngp::Testbed::update_density_grid_nerf(float, unsigned int, unsigned int, CUstream_st*) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#18 0x00007fff3a3db6ca in ngp::Testbed::training_prep_nerf(unsigned int, unsigned int, CUstream_st*) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
from tiny-cuda-nn.
Okay I can see that you guys were trying to fix this but can you give me the gist of why there needs to be a separate function for the last layer? The hidden layers work with width 64 so why is there a separate function?
from tiny-cuda-nn.
I'm not sure what the first half of your comment refers to.
As for why there's a separate kernel call: the fully fused kernel's launch parameters and computation structure is optimized specifically for the hidden width of the network, and, optionally, single-wmma-fragment outputs (16-wide).
Supporting generic 64 x OUTPUT_WIDTH outputs would add overhead, both implementation- and perf-wise, which may or may not be larger than the cost of the current approach of falling back to CUTLASS. I'm open to external contributions that attempt & benchmark this, but currently do not have the cycles to dive into it myself.
The error itself that you're running into seems to be related to CUTLASS itself. Other people had this error in a related project and it turned out to be caused by a mismatch between their GPU and their compilation settings: NVlabs/instant-ngp#219 . It's probably worth double-checking that your GPU's compute capability matches the one that's passed to the compiler. (Either through CMake if you're using tiny-cuda-nn natively, or through setup.py if you're going for the bindings.)
from tiny-cuda-nn.
Thanks for the help. My compute capability has been set correctly (8.6 for the RTX 3090). The second comment is a stack trace for the error. Can you help me figure out how to support generic 64 x OUTPUT_WIDTH? I need this capability for something I am testing. I can accept a performance hit.
from tiny-cuda-nn.
I'll hit you with the deets:
NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6
Commit: ad31b9d2289582bafc0e07f3465f070de8dd1134
-- The C compiler identification is GNU 9.4.0
-- The CXX compiler identification is GNU 9.4.0
-- The CUDA compiler identification is NVIDIA 11.6.55
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: /usr/local/cuda/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- No release type specified. Setting to 'Release'.
-- Looking for pthread.h
-- Looking for pthread.h - found
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
-- Looking for pthread_create in pthreads
-- Looking for pthread_create in pthreads - not found
-- Looking for pthread_create in pthread
-- Looking for pthread_create in pthread - found
-- Found Threads: TRUE
-- Using X11 for window creation
-- Found X11: /usr/include
-- Looking for XOpenDisplay in /usr/lib/x86_64-linux-gnu/libX11.so;/usr/lib/x86_64-linux-gnu/libXext.so
-- Looking for XOpenDisplay in /usr/lib/x86_64-linux-gnu/libX11.so;/usr/lib/x86_64-linux-gnu/libXext.so - found
-- Looking for gethostbyname
-- Looking for gethostbyname - found
-- Looking for connect
-- Looking for connect - found
-- Looking for remove
-- Looking for remove - found
-- Looking for shmat
-- Looking for shmat - found
-- Found GLEW: /usr/include (found version "2.1.0")
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version "4.5")
-- Found OpenMP: TRUE (found version "4.5")
-- OptiX_INSTALL_DIR value: /usr/local/NVIDIA-OptiX-SDK-7.4.0-linux64-x86_64
-- Found Python: /home/dronelab/miniconda3/envs/nerf/bin/python3.7 (found suitable version "3.7.11", minimum required is "3.7") found components: Interpreter Development Development.Module Development.Embed
-- pybind11 v2.7.1
CMake Warning (dev) at /snap/cmake/1035/share/cmake-3.22/Modules/CMakeDependentOption.cmake:84 (message):
Policy CMP0127 is not set: cmake_dependent_option() supports full Condition
Syntax. Run "cmake --help-policy CMP0127" for policy details. Use the
cmake_policy command to set the policy and suppress this warning.
Call Stack (most recent call first):
dependencies/pybind11/CMakeLists.txt:98 (cmake_dependent_option)
This warning is for project developers. Use -Wno-dev to suppress it.
-- Performing Test HAS_FLTO
-- Performing Test HAS_FLTO - Success
-- Targeting GPU architectures: 86
-- Configuring done
-- Generating done
-- Build files have been written to: /home/dronelab/instant-ngp-ori/build
from tiny-cuda-nn.
Latest changes fixed this issue. Thanks for the help!
from tiny-cuda-nn.
Related Issues (20)
- Is the RTX4070ti supported?
- install issue HOT 6
- Add auxiliary losses directly imposed on params HOT 1
- README executable instructions include non-executable shell prompts
- Question about the bounding box
- initiailization of hash grid
- tinycudann ImportError: tinycudann_bindings/_80_C.cpython-38-x86_64-linux-gnu.so: undefined symbol: HOT 2
- Enable 5D grids HOT 2
- Link Against tiny-cuda-nn in C++ Program HOT 1
- [Question]: can tiny-cuda-nn build a network with layer's bias=0?
- Problems encountered during installation HOT 1
- Already setted the CUDA_HOME but still:CUDA_HOME environment variable is not set. Please set it to your CUDA install root. HOT 1
- Manual installation with torch extension fails: parameter packs not expanding after cmake build success HOT 1
- Tiny cuda nn compilation issue
- pt
- pip install of tiny-cuda-nn does not install it in
- tinycudnn not working with conda environment
- pip install:g++ error
- Inconsistent number of parameters loading encoding with tiny-cuda-nn pytorch bindings and instant-ngp
- OSError: Could not find compatible tinycudann extension for compute capability 87. HOT 1
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from tiny-cuda-nn.