Even if it is not our error, I think we could handle it more gracefully, for example by giving a better error message, or by retrying the job-status query a couple of times until some maximum number of retries or a timeout is reached (I did something similar for gbasf2 in #108). But for that, knowing the cause would be useful.
-- Failed to fetch ads from: <131.169.223.41:9618?addrs=131.169.223.41-9618+[2001-638-700-10df--1-29]-9618&alias=bird-htc-sched13.desy.de&noUDP&sock=schedd_1649_a7c2_7> : bird-htc-sched13.desy.de
SECMAN:2007:Failed to end classad message.
INFO: Worker Worker(salt=379961246, workers=30, host=naf-belle12.desy.de, username=alina, pid=11924) was stopped. Shutting down Keep-Alive thread
-- Failed to fetch ads from: <131.169.223.41:9618?addrs=131.169.223.41-9618+[2001-638-700-10df--1-29]-9618&alias=bird-htc-sched13.desy.de&noUDP&sock=schedd_1649_a7c2_7> : bird-htc-sched13.desy.de
SECMAN:2007:Failed to end classad message.
Traceback (most recent call last):
File "/afs/desy.de/user/a/alina/.local/lib/python3.8/site-packages/luigi/interface.py", line 173, in _schedule_and_run
success &= worker.run()
File "/afs/desy.de/user/a/alina/.local/lib/python3.8/site-packages/luigi/worker.py", line 1203, in run
self._handle_next_task()
File "/afs/desy.de/user/a/alina/.local/lib/python3.8/site-packages/luigi/worker.py", line 1058, in _handle_next_task
self._purge_children() # Deal with subprocess failures
File "/afs/desy.de/user/a/alina/.local/lib/python3.8/site-packages/luigi/worker.py", line 1034, in _purge_children
if not p.is_alive() and p.exitcode:
File "/afs/desy.de/user/a/alina/.local/lib/python3.8/site-packages/b2luigi/batch/processes/__init__.py", line 135, in is_alive
job_status = self.get_job_status()
File "/afs/desy.de/user/a/alina/.local/lib/python3.8/site-packages/b2luigi/batch/processes/htcondor.py", line 154, in get_job_status
job_status = _batch_job_status_cache[self._batch_job_id]
File "/cvmfs/belle.cern.ch/el7/externals/v01-10-00/Linux_x86_64/common/lib/python3.8/site-packages/cachetools/ttl.py", line 81, in __getitem__
return self.__missing__(key)
File "/afs/desy.de/user/a/alina/.local/lib/python3.8/site-packages/b2luigi/batch/cache.py", line 28, in __missing__
self._ask_for_job_status(job_id=None)
File "/afs/desy.de/user/a/alina/.local/lib/python3.8/site-packages/b2luigi/batch/processes/htcondor.py", line 46, in _ask_for_job_status
output = subprocess.check_output(q_cmd)
File "/cvmfs/belle.cern.ch/el7/externals/v01-10-00/Linux_x86_64/common/lib/python3.8/subprocess.py", line 415, in check_output
return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
File "/cvmfs/belle.cern.ch/el7/externals/v01-10-00/Linux_x86_64/common/lib/python3.8/subprocess.py", line 516, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['condor_q', '-json', '-attributes', 'ClusterId,JobStatus,ExitStatus']' returned non-zero exit status 1.