Comments (6)
Here is some more.
#2 0x00007ffff7c64729 in __assert_fail_base (fmt=0x7ffff7dfa588 <_nl_C_codeset+16> "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x7fff3a80c5eb <cutlass::NUM_THREADS_PER_QUAD_PAIR+40171> "false",
file=0x7fff3a80c590 <cutlass::NUM_THREADS_PER_QUAD_PAIR+40080> "/home/dronelab/instant-ngp/dependencies/tiny-cuda-nn/include/tiny-cuda-nn/cutlass_matmul.h", line=363,
function=0x7fff3a811490 <cutlass::NUM_THREADS_PER_QUAD_PAIR+60304> "void tcnn::fc_multiply_impl(cudaStream_t, const typename Gemm::Arguments&) [with Gemm = cutlass::gemm::device::Gemm<cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::layout::Column"...) at assert.c:92
#3 0x00007ffff7c76006 in __GI___assert_fail (assertion=0x7fff3a80c5eb <cutlass::NUM_THREADS_PER_QUAD_PAIR+40171> "false",
file=0x7fff3a80c590 <cutlass::NUM_THREADS_PER_QUAD_PAIR+40080> "/home/dronelab/instant-ngp/dependencies/tiny-cuda-nn/include/tiny-cuda-nn/cutlass_matmul.h", line=363,
function=0x7fff3a811490 <cutlass::NUM_THREADS_PER_QUAD_PAIR+60304> "void tcnn::fc_multiply_impl(cudaStream_t, const typename Gemm::Arguments&) [with Gemm = cutlass::gemm::device::Gemm<cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::layout::Column"...) at assert.c:101
#4 0x00007fff3a6aa0a6 in void tcnn::fc_multiply_impl<cutlass::gemm::device::Gemm<cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false>, cutlass::gemm::GemmShape<16, 8, 8, false>, tcnn::ActivationEpilogue<cutlass::half_t, 8, cutlass::half_t, cutlass::half_t, (cutlass::FloatRoundStyle)2>, cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, 2, 8, 8, false, cutlass::arch::OpMultiplyAdd, false> >(CUstream_st*, cutlass::gemm::device::Gemm<cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false>, cutlass::gemm::GemmShape<16, 8, 8, false>, tcnn::ActivationEpilogue<cutlass::half_t, 8, cutlass::half_t, cutlass::half_t, (cutlass::FloatRoundStyle)2>, cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, 2, 8, 8, false, cutlass::arch::OpMultiplyAdd, false>::Arguments const&) ()
from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#5 0x00007fff3a696980 in void tcnn::fc_multiply<tcnn::LayerConfig<cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false> >, __half, (tcnn::MatrixLayout)0, __half, (tcnn::MatrixLayout)1, __half, (tcnn::MatrixLayout)0, __half, (tcnn::MatrixLayout)0>(CUstream_st*, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)1> const&, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0>&, tcnn::Activation, bool, bool) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#6 0x00007fff3a690f01 in void tcnn::fc_multiply<tcnn::LayerConfig<cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false> >, __half, (tcnn::MatrixLayout)0, __half, (tcnn::MatrixLayout)1, __half, __half>(CUstream_st*, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)1> const&, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, tcnn::Activation, bool, bool) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#7 0x00007fff3a68c6a6 in void tcnn::fc_multiply<tcnn::LayerConfig<cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false> >, __half, (tcnn::MatrixLayout)0, __half, __half, __half>(CUstream_st*, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, tcnn::Activation, bool, bool)
() from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#8 0x00007fff3a688bf7 in void tcnn::fc_multiply<tcnn::LayerConfig<cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false> >, __half, (tcnn::MatrixLayout)0, __half, __half>(CUstream_st*, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, tcnn::Activation) ()
from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#9 0x00007fff3a6d34ef in bool tcnn::compute_layer<tcnn::LayerConfig<cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false> >, __half>(CUstream_st*, bool, tcnn::Activation, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, tcnn::GPUMatrixDynamic<__half>&) ()
from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#10 0x00007fff3a6d3388 in bool tcnn::compute_inference_layer<tcnn::LayerConfig<cutlass::gemm::GemmShape<128, 32, 32, false>, cutlass::gemm::GemmShape<32, 32, 32, false> >, __half>(CUstream_st*, tcnn::Activation, tcnn::GPUMatrix<__half, (tcnn::MatrixLayout)0> const&, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#11 0x00007fff3a6d327f in tcnn::CutlassMLP<__half>::inference_mixed_precision(CUstream_st*, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, bool)::{lambda()#1}::operator()() const ()
from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#12 0x00007fff3a6d3b4c in std::_Function_handler<void (), tcnn::CutlassMLP<__half>::inference_mixed_precision(CUstream_st*, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, bool)::{lambda()#1}>::_M_invoke(std::_Any_data const&) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#13 0x00007fff3a220574 in std::function<void ()>::operator()() const (this=0x7fffffffcec0) at /usr/include/c++/9/bits/std_function.h:688
#14 0x00007fff3a3ca70a in tcnn::CudaGraph::capture_and_execute(CUstream_st*, bool, std::function<void ()>) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#15 0x00007fff3a6d01b7 in tcnn::CutlassMLP<__half>::inference_mixed_precision(CUstream_st*, tcnn::GPUMatrixDynamic<__half> const&, tcnn::GPUMatrixDynamic<__half>&, bool) ()
from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#16 0x00007fff3a3e98ad in ngp::SHNerfNetwork<__half>::density(CUstream_st*, tcnn::PitchedPtr<float const> const&, tcnn::GPUMatrixDynamic<__half>&, bool) ()
from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#17 0x00007fff3a3d5b23 in ngp::Testbed::update_density_grid_nerf(float, unsigned int, unsigned int, CUstream_st*) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
#18 0x00007fff3a3db6ca in ngp::Testbed::training_prep_nerf(unsigned int, unsigned int, CUstream_st*) () from /home/dronelab/instant-ngp/build/pyngp.cpython-37m-x86_64-linux-gnu.so
from tiny-cuda-nn.
Okay I can see that you guys were trying to fix this but can you give me the gist of why there needs to be a separate function for the last layer? The hidden layers work with width 64 so why is there a separate function?
from tiny-cuda-nn.
I'm not sure what the first half of your comment refers to.
As for why there's a separate kernel call: the fully fused kernel's launch parameters and computation structure is optimized specifically for the hidden width of the network, and, optionally, single-wmma-fragment outputs (16-wide).
Supporting generic 64 x OUTPUT_WIDTH outputs would add overhead, both implementation- and perf-wise, which may or may not be larger than the cost of the current approach of falling back to CUTLASS. I'm open to external contributions that attempt & benchmark this, but currently do not have the cycles to dive into it myself.
The error itself that you're running into seems to be related to CUTLASS itself. Other people had this error in a related project and it turned out to be caused by a mismatch between their GPU and their compilation settings: NVlabs/instant-ngp#219 . It's probably worth double-checking that your GPU's compute capability matches the one that's passed to the compiler. (Either through CMake if you're using tiny-cuda-nn natively, or through setup.py if you're going for the bindings.)
from tiny-cuda-nn.
Thanks for the help. My compute capability has been set correctly (8.6 for the RTX 3090). The second comment is a stack trace for the error. Can you help me figure out how to support generic 64 x OUTPUT_WIDTH? I need this capability for something I am testing. I can accept a performance hit.
from tiny-cuda-nn.
I'll hit you with the deets:
NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6
Commit: ad31b9d2289582bafc0e07f3465f070de8dd1134
-- The C compiler identification is GNU 9.4.0
-- The CXX compiler identification is GNU 9.4.0
-- The CUDA compiler identification is NVIDIA 11.6.55
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: /usr/local/cuda/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- No release type specified. Setting to 'Release'.
-- Looking for pthread.h
-- Looking for pthread.h - found
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
-- Looking for pthread_create in pthreads
-- Looking for pthread_create in pthreads - not found
-- Looking for pthread_create in pthread
-- Looking for pthread_create in pthread - found
-- Found Threads: TRUE
-- Using X11 for window creation
-- Found X11: /usr/include
-- Looking for XOpenDisplay in /usr/lib/x86_64-linux-gnu/libX11.so;/usr/lib/x86_64-linux-gnu/libXext.so
-- Looking for XOpenDisplay in /usr/lib/x86_64-linux-gnu/libX11.so;/usr/lib/x86_64-linux-gnu/libXext.so - found
-- Looking for gethostbyname
-- Looking for gethostbyname - found
-- Looking for connect
-- Looking for connect - found
-- Looking for remove
-- Looking for remove - found
-- Looking for shmat
-- Looking for shmat - found
-- Found GLEW: /usr/include (found version "2.1.0")
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version "4.5")
-- Found OpenMP: TRUE (found version "4.5")
-- OptiX_INSTALL_DIR value: /usr/local/NVIDIA-OptiX-SDK-7.4.0-linux64-x86_64
-- Found Python: /home/dronelab/miniconda3/envs/nerf/bin/python3.7 (found suitable version "3.7.11", minimum required is "3.7") found components: Interpreter Development Development.Module Development.Embed
-- pybind11 v2.7.1
CMake Warning (dev) at /snap/cmake/1035/share/cmake-3.22/Modules/CMakeDependentOption.cmake:84 (message):
Policy CMP0127 is not set: cmake_dependent_option() supports full Condition
Syntax. Run "cmake --help-policy CMP0127" for policy details. Use the
cmake_policy command to set the policy and suppress this warning.
Call Stack (most recent call first):
dependencies/pybind11/CMakeLists.txt:98 (cmake_dependent_option)
This warning is for project developers. Use -Wno-dev to suppress it.
-- Performing Test HAS_FLTO
-- Performing Test HAS_FLTO - Success
-- Targeting GPU architectures: 86
-- Configuring done
-- Generating done
-- Build files have been written to: /home/dronelab/instant-ngp-ori/build
from tiny-cuda-nn.
Latest changes fixed this issue. Thanks for the help!
from tiny-cuda-nn.
Related Issues (20)
- Is the RTX4070ti supported?
- install issue HOT 6
- Add auxiliary losses directly imposed on params HOT 1
- README executable instructions include non-executable shell prompts
- Question about the bounding box
- initiailization of hash grid
- tinycudann ImportError: tinycudann_bindings/_80_C.cpython-38-x86_64-linux-gnu.so: undefined symbol: HOT 2
- Enable 5D grids HOT 2
- Link Against tiny-cuda-nn in C++ Program HOT 1
- [Question]: can tiny-cuda-nn build a network with layer's bias=0?
- Problems encountered during installation HOT 1
- Already setted the CUDA_HOME but still:CUDA_HOME environment variable is not set. Please set it to your CUDA install root. HOT 1
- Manual installation with torch extension fails: parameter packs not expanding after cmake build success HOT 1
- Tiny cuda nn compilation issue
- pt
- pip install of tiny-cuda-nn does not install it in
- tinycudnn not working with conda environment
- pip install:g++ error
- Inconsistent number of parameters loading encoding with tiny-cuda-nn pytorch bindings and instant-ngp
- OSError: Could not find compatible tinycudann extension for compute capability 87. HOT 1
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from tiny-cuda-nn.