I've run into this specific issue a few times, so I'm opening an issue to log my progress on it. It's probably related to issue #35 (at least it comes and goes with different joblib versions), but I want a dedicated log for this specific error message.
Below is the error message:
In [4]: dat.blech_clust_run(umap=True)
Running Blech Clust...
[Parallel(n_jobs=11)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done 3 tasks | elapsed: 2.7s
[Parallel(n_jobs=11)]: Done 10 tasks | elapsed: 3.8s
[Parallel(n_jobs=11)]: Done 19 tasks | elapsed: 4.0s
Exception in blechpy.datastructures.dataset.blech_clust_run
---------------------------------------------------------------------------
TerminatedWorkerError Traceback (most recent call last)
<ipython-input-4-4069635d4cce> in <module>
----> 1 dat.blech_clust_run(umap=True)
~/Documents/CodeVault/blechpy/blechpy/utils/decorators.py in wrapper(*args, **kwargs)
24 sys.stdout = old_out
25 if fail is not False:
---> 26 raise fail
27
28 else:
~/Documents/CodeVault/blechpy/blechpy/utils/decorators.py in wrapper(*args, **kwargs)
16 sys.stdout = f
17 try:
---> 18 func(*args, **kwargs)
19 fail = False
20 except Exception as e:
~/Documents/CodeVault/blechpy/blechpy/datastructures/dataset.py in blech_clust_run(self, data_quality, multi_process, n_cores, umap)
885
886 results = Parallel(n_jobs=n_cores, verbose=10)(delayed(run_joblib_process)
--> 887 (co) for co in clust_objs)
888
889
~/anaconda3/envs/blech_dev/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1054
1055 with self._backend.retrieval_context():
-> 1056 self.retrieve()
1057 # Make sure that we get a last message telling us we are done
1058 elapsed_time = time.time() - self._start_time
~/anaconda3/envs/blech_dev/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
933 try:
934 if getattr(self._backend, 'supports_timeout', False):
--> 935 self._output.extend(job.get(timeout=self.timeout))
936 else:
937 self._output.extend(job.get())
~/anaconda3/envs/blech_dev/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~/anaconda3/envs/blech_dev/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~/anaconda3/envs/blech_dev/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.
The exit codes of the workers are {SIGKILL(-9)}
Below is the debugging trace:
In [5]: %debug
> /home/dsvedberg/anaconda3/envs/blech_dev/lib/python3.7/concurrent/futures/_base.py(384)__get_result()
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ipdb> u
> /home/dsvedberg/anaconda3/envs/blech_dev/lib/python3.7/concurrent/futures/_base.py(435)result()
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
ipdb> u
> /home/dsvedberg/anaconda3/envs/blech_dev/lib/python3.7/site-packages/joblib/_parallel_backends.py(542)wrap_future_result()
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
ipdb> u
> /home/dsvedberg/anaconda3/envs/blech_dev/lib/python3.7/site-packages/joblib/parallel.py(935)retrieve()
933 try:
934 if getattr(self._backend, 'supports_timeout', False):
--> 935 self._output.extend(job.get(timeout=self.timeout))
936 else:
937 self._output.extend(job.get())
ipdb> u
> /home/dsvedberg/anaconda3/envs/blech_dev/lib/python3.7/site-packages/joblib/parallel.py(1056)__call__()
1054
1055 with self._backend.retrieval_context():
-> 1056 self.retrieve()
1057 # Make sure that we get a last message telling us we are done
1058 elapsed_time = time.time() - self._start_time
ipdb> u
> /home/dsvedberg/Documents/CodeVault/blechpy/blechpy/datastructures/dataset.py(887)blech_clust_run()
885
886 results = Parallel(n_jobs=n_cores, verbose=10)(delayed(run_joblib_process)
--> 887 (co) for co in clust_objs)
888
889
ipdb> u
> /home/dsvedberg/Documents/CodeVault/blechpy/blechpy/utils/decorators.py(18)wrapper()
16 sys.stdout = f
17 try:
---> 18 func(*args, **kwargs)
19 fail = False
20 except Exception as e:
ipdb> u
> /home/dsvedberg/Documents/CodeVault/blechpy/blechpy/utils/decorators.py(26)wrapper()
24 sys.stdout = old_out
25 if fail is not False:
---> 26 raise fail
27
28 else:
ipdb> u
> <ipython-input-4-4069635d4cce>(1)<module>()
----> 1 dat.blech_clust_run(umap=True)
Below is the pip freeze output for the environment that was used:
(blech_dev) dsvedberg@dsvedberg-Z370M-DS3H:~$ pip freeze
anyio==3.6.1
appdirs==1.4.4
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
attrs==21.4.0
Babel==2.10.1
backcall==0.1.0
beautifulsoup4==4.11.1
bleach==5.0.0
-e git+https://github.com/danielsvedberg/blechpy.git@2685e920dde53c4ba6acc505aaea021518df2524#egg=blechpy
bokeh==1.4.0
certifi==2021.10.8
cffi==1.14.5
chardet==4.0.0
Click==7.0
cloudpickle==1.3.0
colorama==0.4.4
colorcet==2.0.2
cryptography==3.4.7
cycler==0.10.0
dask==2.11.0
datashader==0.10.0
datashape==0.5.2
debugpy==1.5.1
decorator==4.4.2
defusedxml==0.7.1
dill==0.3.1.1
distributed==2.11.0
docutils==0.18.1
easygui==0.98.1
entrypoints==0.3
fastjsonschema==2.15.3
feather-format==0.4.1
fsspec==0.6.2
h5py==2.10.0
HeapDict==1.0.1
holoviews==1.12.7
idna==2.10
imageio==2.8.0
importlib-metadata==4.6.0
importlib-resources==5.7.1
ipykernel==6.9.1
ipython==7.32.0
ipython-genutils==0.2.0
jedi==0.16.0
jeepney==0.6.0
Jinja2==2.11.1
joblib==1.1.0
json5==0.9.8
jsonschema==4.5.1
jupyter-client==6.1.12
jupyter-core==4.6.3
jupyter-server==1.17.0
jupyterlab==3.4.2
jupyterlab-pygments==0.2.2
jupyterlab-server==2.14.0
keyring==23.0.1
keyrings.alt==4.0.2
kiwisolver==1.1.0
littleutils==0.2.2
llvmlite==0.31.0
locket==0.2.0
Mako==1.1.3
Markdown==3.3.3
MarkupSafe==1.1.1
matplotlib==3.2.0
matplotlib-inline==0.1.3
mistune==0.8.4
mkl-fft==1.3.1
mkl-random==1.2.2
mkl-service==2.4.0
msgpack==1.0.0
multipledispatch==0.6.0
nbclassic==0.3.7
nbclient==0.6.4
nbconvert==6.5.0
nbformat==5.4.0
nest-asyncio==1.5.4
networkx==2.4
notebook==6.4.11
notebook-shim==0.1.0
numba==0.48.0
numexpr==2.7.1
numpy==1.18.5
outdated==0.2.0
packaging==20.3
pandas==1.3.5
pandas-flavor==0.2.0
pandocfilters==1.5.0
param==1.9.3
parso==0.6.2
partd==1.1.0
patsy==0.5.1
pdoc3==0.9.2
pexpect==4.8.0
pickleshare==0.7.5
Pillow==7.0.0
pingouin==0.3.8
pkginfo==1.8.2
prometheus-client==0.14.1
prompt-toolkit==3.0.3
psutil==5.9.0
ptyprocess==0.6.0
pushbullet-tools==0.0.7
pyarrow==0.17.0
pycparser==2.20
pyct==0.4.6
Pygments==2.9.0
pyparsing==2.4.6
pyrsistent==0.18.1
python-dateutil==2.8.1
pytz==2019.3
pyviz-comms==0.7.3
PyWavelets==1.1.1
PyYAML==5.4
pyzmq==22.3.0
readme-renderer==32.0
requests==2.25.1
requests-toolbelt==0.9.1
rfc3986==2.0.0
scikit-image==0.16.2
scikit-learn==0.22.2.post1
scipy==1.3.3
seaborn==0.11.1
SecretStorage==3.3.1
Send2Trash==1.8.0
six==1.16.0
sniffio==1.2.0
sortedcontainers==2.1.0
soupsieve==2.3.2.post1
spyder-kernels==2.1.3
statsmodels==0.11.1
tables==3.6.1
tabulate==0.8.7
tblib==1.6.0
terminado==0.15.0
tinycss2==1.1.1
toolz==0.10.0
tornado==6.1
tqdm==4.61.1
traitlets==5.1.1
twine==3.8.0
typing-extensions==3.10.0.0
umap-learn==0.4.6
urllib3==1.26.6
wcwidth==0.1.8
webencodings==0.5.1
websocket-client==1.3.2
wurlitzer==3.0.2
xarray==0.15.0
zict==2.0.0
zipp==3.4.1
I've had this specific issue a few times so I'm opening an issue for it to log my progress on it. It's probably related to issue #35 (at least it comes and goes with different joblib versions) but I definitely want a log for this specific error message.
Below is the error message:
In [4]: dat.blech_clust_run(umap=True) Running Blech Clust... [Parallel(n_jobs=11)]: Using backend LokyBackend with 11 concurrent workers. [Parallel(n_jobs=11)]: Done 3 tasks | elapsed: 2.7s [Parallel(n_jobs=11)]: Done 10 tasks | elapsed: 3.8s [Parallel(n_jobs=11)]: Done 19 tasks | elapsed: 4.0s Exception in blechpy.datastructures.dataset.blech_clust_run --------------------------------------------------------------------------- TerminatedWorkerError Traceback (most recent call last) <ipython-input-4-4069635d4cce> in <module> ----> 1 dat.blech_clust_run(umap=True) ~/Documents/CodeVault/blechpy/blechpy/utils/decorators.py in wrapper(*args, **kwargs) 24 sys.stdout = old_out 25 if fail is not False: ---> 26 raise fail 27 28 else: ~/Documents/CodeVault/blechpy/blechpy/utils/decorators.py in wrapper(*args, **kwargs) 16 sys.stdout = f 17 try: ---> 18 func(*args, **kwargs) 19 fail = False 20 except Exception as e: ~/Documents/CodeVault/blechpy/blechpy/datastructures/dataset.py in blech_clust_run(self, data_quality, multi_process, n_cores, umap) 885 886 results = Parallel(n_jobs=n_cores, verbose=10)(delayed(run_joblib_process) --> 887 (co) for co in clust_objs) 888 889 ~/anaconda3/envs/blech_dev/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable) 1054 1055 with self._backend.retrieval_context(): -> 1056 self.retrieve() 1057 # Make sure that we get a last message telling us we are done 1058 elapsed_time = time.time() - self._start_time ~/anaconda3/envs/blech_dev/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self) 933 try: 934 if getattr(self._backend, 'supports_timeout', False): --> 935 self._output.extend(job.get(timeout=self.timeout)) 936 else: 937 self._output.extend(job.get()) ~/anaconda3/envs/blech_dev/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout) 540 AsyncResults.get from multiprocessing.""" 541 try: --> 542 return future.result(timeout=timeout) 543 except CfTimeoutError as e: 544 
raise TimeoutError from e ~/anaconda3/envs/blech_dev/lib/python3.7/concurrent/futures/_base.py in result(self, timeout) 433 raise CancelledError() 434 elif self._state == FINISHED: --> 435 return self.__get_result() 436 else: 437 raise TimeoutError() ~/anaconda3/envs/blech_dev/lib/python3.7/concurrent/futures/_base.py in __get_result(self) 382 def __get_result(self): 383 if self._exception: --> 384 raise self._exception 385 else: 386 return self._result TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {SIGKILL(-9)}

Below is the debugging trace:
Below is the pip freeze output for the environment that was used: