/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu: In instantiation of ‘void HostLayerNormGradient(const V*, const U*, const U*, at::Tensor*, int, int, const V*, const V*, double, T*, V*, V*) [with T = float; U = float; V = c10::Half]’:
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu:800:95: required from here
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu:737:138: warning: ‘T* at::Tensor::data() const [with T = float]’ is deprecated: Tensor.data<T>() is deprecated. Please use Tensor.data_ptr<T>() instead. [-Wdeprecated-declarations]
737 | cuComputePartGradGammaBeta<<<blocks2, threads2, nshared2, stream>>>(
| ^
/usr/local/lib/python3.10/dist-packages/torch/include/ATen/core/TensorBody.h:245:1: note: declared here
245 | T * data() const {
| ^ ~~
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu:737:210: warning: ‘T* at::Tensor::data() const [with T = float]’ is deprecated: Tensor.data<T>() is deprecated. Please use Tensor.data_ptr<T>() instead. [-Wdeprecated-declarations]
737 | cuComputePartGradGammaBeta<<<blocks2, threads2, nshared2, stream>>>(
| ^
/usr/local/lib/python3.10/dist-packages/torch/include/ATen/core/TensorBody.h:245:1: note: declared here
245 | T * data() const {
| ^ ~~
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu:737:247: warning: ‘T* at::Tensor::data() const [with T = float]’ is deprecated: Tensor.data<T>() is deprecated. Please use Tensor.data_ptr<T>() instead. [-Wdeprecated-declarations]
737 | cuComputePartGradGammaBeta<<<blocks2, threads2, nshared2, stream>>>(
| ^
/usr/local/lib/python3.10/dist-packages/torch/include/ATen/core/TensorBody.h:245:1: note: declared here
245 | T * data() const {
| ^ ~~
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu:750:137: warning: ‘T* at::Tensor::data() const [with T = float]’ is deprecated: Tensor.data<T>() is deprecated. Please use Tensor.data_ptr<T>() instead. [-Wdeprecated-declarations]
750 | cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, stream>>>(
| ^
/usr/local/lib/python3.10/dist-packages/torch/include/ATen/core/TensorBody.h:245:1: note: declared here
245 | T * data() const {
| ^ ~~
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu:750:174: warning: ‘T* at::Tensor::data() const [with T = float]’ is deprecated: Tensor.data<T>() is deprecated. Please use Tensor.data_ptr<T>() instead. [-Wdeprecated-declarations]
750 | cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, stream>>>(
| ^
/usr/local/lib/python3.10/dist-packages/torch/include/ATen/core/TensorBody.h:245:1: note: declared here
245 | T * data() const {
| ^ ~~
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu:768:129: warning: ‘T* at::Tensor::data() const [with T = float]’ is deprecated: Tensor.data<T>() is deprecated. Please use Tensor.data_ptr<T>() instead. [-Wdeprecated-declarations]
768 | cuComputeGradInput<<<blocks1, threads1, nshared, stream>>>(
| ^
/usr/local/lib/python3.10/dist-packages/torch/include/ATen/core/TensorBody.h:245:1: note: declared here
245 | T * data() const {
| ^ ~~
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu: In instantiation of ‘void HostLayerNormGradient(const V*, const U*, const U*, at::Tensor*, int, int, const V*, const V*, double, T*, V*, V*) [with T = float; U = float; V = c10::BFloat16]’:
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu:800:103: required from here
[same six ‘Tensor.data<T>() is deprecated’ warnings as above, all with T = float]
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu: In instantiation of ‘void HostLayerNormGradient(const V*, const U*, const U*, at::Tensor*, int, int, const V*, const V*, double, T*, V*, V*) [with T = c10::Half; U = float; V = c10::Half]’:
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu:800:127: required from here
[same six warnings, with T = c10::Half at 737:138 and 768:129, T = float elsewhere]
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu: In instantiation of ‘void HostLayerNormGradient(const V*, const U*, const U*, at::Tensor*, int, int, const V*, const V*, double, T*, V*, V*) [with T = c10::BFloat16; U = float; V = c10::BFloat16]’:
/epfllm/Megatron-LLM/megatron/fused_kernels/layer_norm_cuda_kernel.cu:800:138: required from here
[same six warnings, with T = c10::BFloat16 at 737:138 and 768:129, T = float elsewhere]
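The warnings name their own fix: the kernel launches at lines 737, 750, and 768 of layer_norm_cuda_kernel.cu obtain raw pointers via the deprecated Tensor.data<T>() accessor and should use Tensor.data_ptr<T>() instead. A minimal stand-alone sketch of the swap (illustrative tensor and names, not the kernel's actual arguments):

    #include <torch/torch.h>

    int main() {
      // Stand-in for one of the at::Tensor temporaries passed to the kernels.
      at::Tensor part_grad_gamma = torch::zeros({4096}, torch::kFloat);

      // Deprecated accessor; this is what trips -Wdeprecated-declarations:
      //   float* p = part_grad_gamma.data<float>();

      // Current accessor; returns the same raw pointer, without the warning:
      float* p = part_grad_gamma.data_ptr<float>();
      (void)p;
      return 0;
    }

The warnings are benign for now, since the deprecated accessor currently forwards to data_ptr<T>(), but the rename is the forward-compatible spelling.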
[3/3] c++ layer_norm_cuda.o layer_norm_cuda_kernel.cuda.o -shared -L/usr/local/lib/python3.10/dist-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o fused_mix_prec_layer_norm_cuda.so
Loading extension module fused_mix_prec_layer_norm_cuda...
Detected CUDA files, patching ldflags
Emitting ninja build file /epfllm/Megatron-LLM/megatron/fused_kernels/build/build.ninja...
Building extension module fused_dense_cuda...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/3] c++ -MMD -MF fused_weight_gradient_dense.o.d -DTORCH_EXTENSION_NAME=fused_dense_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /usr/local/lib/python3.10/dist-packages/torch/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.10/dist-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -O3 -c /epfllm/Megatron-LLM/megatron/fused_kernels/fused_weight_gradient_dense.cpp -o fused_weight_gradient_dense.o
[2/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=fused_dense_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /usr/local/lib/python3.10/dist-packages/torch/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.10/dist-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options '-fPIC' -O3 -gencode arch=compute_70,code=sm_70 --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -gencode arch=compute_80,code=sm_80 -std=c++17 -c /epfllm/Megatron-LLM/megatron/fused_kernels/fused_weight_gradient_dense.cu -o fused_weight_gradient_dense.cuda.o
[3/3] c++ fused_weight_gradient_dense.o fused_weight_gradient_dense.cuda.o -shared -L/usr/local/lib/python3.10/dist-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o fused_dense_cuda.so
Loading extension module fused_dense_cuda...
Building model ...
/epfllm/Megatron-LLM/megatron/model/llama_model.py:38: UserWarning: Llama is not intended to use dropout
warnings.warn( "Llama is not intended to use dropout")
/epfllm/Megatron-LLM/megatron/model/llama_model.py:40: UserWarning: Llama is not intended to use dropout
warnings.warn( "Llama is not intended to use dropout")
loading release checkpoint from ./model
checkpoint version 3.0
successfully loaded checkpoint from ./model at iteration 0
using world size: 4, data-parallel-size: 1, tensor-model-parallel size: 4, pipeline-model-parallel size: 1
setting global batch size to 1
accumulate and all-reduce gradients in fp32 for bfloat16 data type.
using torch.bfloat16 for parameters ...
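The reported world size is consistent with the three parallelism degrees: 1 (data) × 4 (tensor) × 1 (pipeline) = 4 ranks. A trivial stand-alone check of that factorization (illustrative only, not Megatron's own bookkeeping):

    #include <cassert>

    int main() {
      const int data_parallel = 1, tensor_parallel = 4, pipeline_parallel = 1;
      // Every rank occupies exactly one (DP, TP, PP) coordinate, so the
      // coordinate grid must cover the whole world.
      const int world_size = data_parallel * tensor_parallel * pipeline_parallel;
      assert(world_size == 4);
      return 0;
    }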
------------------------ arguments ------------------------
accumulate_allreduce_grads_in_fp32 .............. True
adam_beta1 ...................................... 0.9
adam_beta2 ...................................... 0.999
adam_eps ........................................ 1e-08
adlr_autoresume ................................. False
adlr_autoresume_interval ........................ 1000
apply_query_key_layer_scaling ................... True
apply_residual_connection_post_layernorm ........ False
async_tensor_model_parallel_allreduce ........... True
attention_dropout ............................... 0.1
attention_softmax_in_fp32 ....................... False
barrier_with_L1_time ............................ True
bert_load ....................................... None
bf16 ............................................ True
bias_dropout_fusion ............................. False
bias_gelu_fusion ................................ False
biencoder_projection_dim ........................ 0
biencoder_shared_query_context_model ............ False
block_data_path ................................. None
classes_fraction ................................ 1.0
clip_grad ....................................... 1.0
consumed_train_samples .......................... 0
consumed_valid_samples .......................... 0
data_impl ....................................... infer
data_parallel_random_init ....................... False
data_parallel_size .............................. 1
data_path ....................................... None
data_per_class_fraction ......................... 1.0
data_sharding ................................... True
dataloader_type ................................. single
DDP_impl ........................................ local
decoder_num_layers .............................. None
decoder_seq_length .............................. None
dino_bottleneck_size ............................ 256
dino_freeze_last_layer .......................... 1
dino_head_hidden_size ........................... 2048
dino_local_crops_number ......................... 10
dino_local_img_size ............................. 96
dino_norm_last_layer ............................ False
dino_teacher_temp ............................... 0.07
dino_warmup_teacher_temp ........................ 0.04
dino_warmup_teacher_temp_epochs ................. 30
distribute_saved_activations .................... False
distributed_backend ............................. nccl
embedding_path .................................. None
empty_unused_memory_level ....................... 0
encoder_num_layers .............................. 32
encoder_seq_length .............................. 4096
end_weight_decay ................................ 0.01
eod_mask_loss ................................... False
eval_interval ................................... 1000
eval_iters ...................................... 100
evidence_data_path .............................. None
exit_duration_in_mins ........................... None
exit_interval ................................... None
exit_signal_handler ............................. False
ffn_hidden_size ................................. 11008
finetune ........................................ False
fp16 ............................................ False
fp16_lm_cross_entropy ........................... False
fp32_residual_connection ........................ False
fp8_amax_compute_algo ........................... most_recent
fp8_amax_history_len ............................ 1
fp8_e4m3 ........................................ False
fp8_hybrid ...................................... False
fp8_interval .................................... 1
fp8_margin ...................................... 0
fp8_wgrad ....................................... True
global_batch_size ............................... 1
glu_activation .................................. swiglu
gradient_accumulation_fusion .................... True
head_lr_mult .................................... 1.0
hidden_dropout .................................. 0.1
hidden_size ..................................... 4096
hysteresis ...................................... 2
ict_head_size ................................... None
ict_load ........................................ None
img_h ........................................... 224
img_w ........................................... 224
indexer_batch_size .............................. 128
indexer_log_interval ............................ 1000
inference_batch_times_seqlen_threshold .......... 512
init_method_std ................................. 0.02
init_method_xavier_uniform ...................... False
initial_loss_scale .............................. 4294967296
iter_per_epoch .................................. 1250
kv_channels ..................................... 128
layernorm_epsilon ............................... 1e-05
lima_dropout .................................... False
load ............................................ None
local_rank ...................................... None
log_batch_size_to_tensorboard ................... False
log_interval .................................... 100
log_memory_to_tensorboard ....................... False
log_num_zeros_in_grad ........................... False
log_params_norm ................................. False
log_timers_to_tensorboard ....................... False
log_validation_ppl_to_tensorboard ............... False
log_world_size_to_tensorboard ................... False
loss_scale ...................................... None
loss_scale_window ............................... 1000
lr .............................................. None
lr_decay_iters .................................. None
lr_decay_samples ................................ None
lr_decay_style .................................. linear
lr_warmup_fraction .............................. None
lr_warmup_iters ................................. 0
lr_warmup_samples ............................... 0
make_vocab_size_divisible_by .................... 128
mask_prob ....................................... 0.15
masked_softmax_fusion ........................... False
max_position_embeddings ......................... 4096
max_tokens_to_oom ............................... 12000
merge_file ...................................... None
metrics ......................................... []
micro_batch_size ................................ 1
min_loss_scale .................................. 1.0
min_lr .......................................... 0.0
mmap_warmup ..................................... False
new_tokens ...................................... True
no_load_optim ................................... True
no_load_rng ..................................... True
no_persist_layer_norm ........................... False
no_save_optim ................................... True
no_save_rng ..................................... True
num_attention_heads ............................. 32
num_attention_heads_kv .......................... 32
num_channels .................................... 3
num_classes ..................................... 1000
num_layers ...................................... 32
num_layers_per_virtual_pipeline_stage ........... None
num_workers ..................................... 2
onnx_safe ....................................... None
optimizer ....................................... adam
override_opt_param_scheduler .................... False
parallel_attn ................................... False
parallel_layernorm .............................. False
params_dtype .................................... torch.bfloat16
patch_dim ....................................... 16
perform_initialization .......................... False
pipeline_model_parallel_size .................... 1
pipeline_model_parallel_split_rank .............. None
position_embedding_type ......................... PositionEmbeddingType.rotary
query_in_block_prob ............................. 0.1
rampup_batch_size ............................... None
rank ............................................ 0
recompute_granularity ........................... None
recompute_method ................................ None
recompute_num_layers ............................ 1
reset_attention_mask ............................ False
reset_position_ids .............................. False
retriever_report_topk_accuracies ................ []
retriever_score_scaling ......................... False
retriever_seq_length ............................ 256
rope_scaling_factor ............................. 1.0
rope_theta ...................................... 10000.0
sample_rate ..................................... 1.0
save ............................................ ./model_sharded
save_interval ................................... 1
scalar_loss_mask ................................ 0.0
scatter_gather_tensors_in_pipeline .............. True
seed ............................................ 1234
seq_length ...................................... 4096
sequence_parallel ............................... False
sgd_momentum .................................... 0.9
short_seq_prob .................................. 0.1
skip_iters ...................................... []
split ........................................... 969, 30, 1
standalone_embedding_stage ...................... False
start_weight_decay .............................. 0.01
tensor_model_parallel_size ...................... 4
tensorboard_dir ................................. None
tensorboard_log_interval ........................ 1
tensorboard_queue_size .......................... 1000
test_data_path .................................. None
tie_embed_logits ................................ False
timing_log_level ................................ 0
timing_log_option ............................... minmax
titles_data_path ................................ None
tokenizer_model ................................. None
tokenizer_type .................................. SentencePieceTokenizer
train_data_path ................................. None
train_iters ..................................... None
train_samples ................................... None
transformer_impl ................................ local
transformer_pipeline_model_parallel_size ........ 1
use_bias ........................................ False
use_checkpoint_args ............................. False
use_checkpoint_opt_param_scheduler .............. False
use_contiguous_buffers_in_local_ddp ............. True
use_cpu_initialization .......................... True
use_distributed_optimizer ....................... False
use_flash_attn .................................. False
use_one_sent_docs ............................... False
use_post_ln ..................................... False
use_ring_exchange_p2p ........................... False
use_rms_norm .................................... True
valid_data_path ................................. None
variable_seq_lengths ............................ False
virtual_pipeline_model_parallel_size ............ None
vocab_extra_ids ................................. 0
vocab_extra_ids_list ............................ None
vocab_file ...................................... None
wandb_api_key ................................... None
wandb_entity .................................... meditron
wandb_id ........................................ None
wandb_logger .................................... False
wandb_project ................................... None
wandb_resume .................................... False
weight_decay .................................... 0.01
weight_decay_incr_style ......................... constant
world_size ...................................... 4
-------------------- end of arguments ---------------------
setting number of micro-batches to constant 1
Setting consumed_train_samples to 0 and consumed_valid_samples to 0
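The constant micro-batch count of 1 follows from the batch arithmetic in the argument dump above: global_batch_size 1 divided by (micro_batch_size 1 × data_parallel_size 1). A sketch of that relation (assuming the usual Megatron formula, not this repository's code):

    #include <cassert>

    int main() {
      const int global_batch_size = 1, micro_batch_size = 1, data_parallel_size = 1;
      // Each data-parallel replica consumes micro_batch_size samples per
      // micro-batch, so this many micro-batches make up one global batch.
      const int num_micro_batches =
          global_batch_size / (micro_batch_size * data_parallel_size);
      assert(num_micro_batches == 1);
      return 0;
    }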
sending embeddings
sending lm_head
Detected CUDA files, patching ldflags
Emitting ninja build file /epfllm/Megatron-LLM/megatron/fused_kernels/build/build.ninja...
sending transformer layer 0
Building extension module fused_mix_prec_layer_norm_cuda...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module fused_mix_prec_layer_norm_cuda...
Detected CUDA files, patching ldflags
Emitting ninja build file /epfllm/Megatron-LLM/megatron/fused_kernels/build/build.ninja...
Building extension module fused_dense_cuda...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module fused_dense_cuda...
Bus error (core dumped)