I'm having an issue where some combinations of hyper-parameters result in 'nan loss encountered'. I suspect the cause is too high a learning rate coupled with AMP
enabled.
The issue is that when this occurs, allennlp-optuna crashes. It would be preferable if such an exception simply resulted in the trial being marked as failed and the search continuing from the next trial.
2021-11-22 20:26:30,886 - CRITICAL - root - Uncaught exception
Traceback (most recent call last):
File "bin/allennlp", line 8, in <module>
sys.exit(run())
File "site-packages/allennlp/__main__.py", line 46, in run
main(prog="allennlp")
File "site-packages/allennlp/commands/__init__.py", line 122, in main
args.func(args)
File "site-packages/allennlp_optuna/commands/tune.py", line 89, in tune
study.optimize(objective, n_trials=n_trials, timeout=timeout)
File "site-packages/optuna/study/study.py", line 400, in optimize
_optimize(
File "site-packages/optuna/study/_optimize.py", line 66, in _optimize
_optimize_sequential(
File "site-packages/optuna/study/_optimize.py", line 163, in _optimize_sequential
trial = _run_trial(study, func, catch)
File "site-packages/optuna/study/_optimize.py", line 264, in _run_trial
raise func_err
File "site-packages/optuna/study/_optimize.py", line 213, in _run_trial
value_or_values = func(trial)
File "site-packages/allennlp_optuna/commands/tune.py", line 57, in _objective
return executor.run()
File "site-packages/optuna/integration/allennlp/_executor.py", line 215, in run
allennlp.commands.train.train_model(
File "site-packages/allennlp/commands/train.py", line 254, in train_model
model = _train_worker(
File "site-packages/allennlp/commands/train.py", line 504, in _train_worker
metrics = train_loop.run()
File "site-packages/allennlp/commands/train.py", line 577, in run
return self.trainer.train()
File "site-packages/allennlp/training/gradient_descent_trainer.py", line 750, in train
metrics, epoch = self._try_train()
File "site-packages/allennlp/training/gradient_descent_trainer.py", line 773, in _try_train
train_metrics = self._train_epoch(epoch)
File "site-packages/allennlp/training/gradient_descent_trainer.py", line 495, in _train_epoch
raise ValueError("nan loss encountered")
ValueError: nan loss encountered