Hi, I am running the file 'pretrain_OAG.py', but I hit the following error after a few iterations. I could not figure out how to solve it. Do you have any idea? Thank you~
Start Pretraining...
Data Preparation: 73.9s
Epoch: 1, (1 / 266) 41.3s LR: 0.00005 Train Loss: (5.224, 10.440) Valid Loss: (5.086, 10.286) NDCG: 0.273 Norm: 0.604 queue: 12
UPDATE!!!
Data Preparation: 21.6s
Epoch: 1, (2 / 266) 40.3s LR: 0.00006 Train Loss: (4.914, 10.121) Valid Loss: (4.820, 9.884) NDCG: 0.361 Norm: 0.660 queue: 12
UPDATE!!!
Data Preparation: 22.7s
Epoch: 1, (3 / 266) 40.5s LR: 0.00007 Train Loss: (4.821, 9.512) Valid Loss: (4.682, 8.894) NDCG: 0.374 Norm: 0.729 queue: 12
UPDATE!!!
Data Preparation: 22.2s
Epoch: 1, (4 / 266) 40.5s LR: 0.00007 Train Loss: (4.712, 8.381) Valid Loss: (4.597, 7.592) NDCG: 0.362 Norm: 0.841 queue: 12
UPDATE!!!
Data Preparation: 22.8s
Epoch: 1, (5 / 266) 40.8s LR: 0.00008 Train Loss: (4.673, 7.576) Valid Loss: (4.740, 7.292) NDCG: 0.354 Norm: 0.905 queue: 12
UPDATE!!!
Data Preparation: 21.2s
Epoch: 1, (6 / 266) 40.6s LR: 0.00009 Train Loss: (4.560, 7.215) Valid Loss: (4.421, 6.747) NDCG: 0.361 Norm: 0.991 queue: 12
UPDATE!!!
Data Preparation: 28.1s
Epoch: 1, (7 / 266) 40.7s LR: 0.00010 Train Loss: (4.552, 6.979) Valid Loss: (4.371, 6.690) NDCG: 0.382 Norm: 1.057 queue: 12
UPDATE!!!
Data Preparation: 22.1s
Epoch: 1, (8 / 266) 40.4s LR: 0.00011 Train Loss: (4.519, 6.856) Valid Loss: (4.848, 6.588) NDCG: 0.348 Norm: 1.117 queue: 12
Data Preparation: 22.0s
Epoch: 1, (9 / 266) 40.1s LR: 0.00012 Train Loss: (4.421, 6.804) Valid Loss: (4.393, 6.605) NDCG: 0.383 Norm: 1.147 queue: 12
UPDATE!!!
Data Preparation: 25.9s
Epoch: 1, (10 / 266) 40.0s LR: 0.00013 Train Loss: (4.369, 6.741) Valid Loss: (4.654, 6.518) NDCG: 0.361 Norm: 1.180 queue: 12
Data Preparation: 22.3s
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [12,0,0], thread: [328,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [12,0,0], thread: [329,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [12,0,0], thread: [330,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [12,0,0], thread: [331,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [12,0,0], thread: [332,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [12,0,0], thread: [333,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [12,0,0], thread: [334,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [12,0,0], thread: [335,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [0,0,0], thread: [16,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [0,0,0], thread: [17,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [0,0,0], thread: [18,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [0,0,0], thread: [19,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [0,0,0], thread: [20,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [0,0,0], thread: [21,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [0,0,0], thread: [22,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
/opt/conda/conda-bld/pytorch_1570710743984/work/aten/src/THC/THCTensorScatterGather.cu:130: void THCudaTensor_scatterKernel(TensorInfo<Real, IndexType>, TensorInfo<Real, IndexType>, TensorInfo<long, IndexType>, int, IndexType) [with IndexType = unsigned int, Real = float, Dims = 2]: block: [0,0,0], thread: [23,0,0] Assertion indexValue >= 0 && indexValue < tensor.sizes[dim]
failed.
Traceback (most recent call last):
File "pretrain_OAG.py", line 262, in <module>
loss.backward()
File "/opt/conda/lib/python3.6/site-packages/torch/tensor.py", line 150, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/opt/conda/lib/python3.6/site-packages/torch/autograd/__init__.py", line 99, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: copy_if failed to synchronize: device-side assert triggered