Hi, I am trying to follow the instructions in the "Full-page curriculum learning (from system-level to full-page)" section to train a model.
The bug: finetuning eventually fails with a segmentation fault. The timing is not deterministic, but for me it usually happened after a few hours of training.

Console error:
Epoch 14/9999 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 83/83 0:00:15 • 0:00:00 6.02it/s v_num: ylku loss_step: 0.979 stage_step: 2.000 loss_epoch: 0.344 stage_epoch: 2.000 val_CER: 84.374 val_SER: 83.285
Epoch 17/9999 ━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━ 25/83 0:00:06 • 0:00:11 5.74it/s v_num: ylku loss_step: 0.059 stage_step: 2.000 loss_epoch: 0.284 stage_epoch:
Epoch 17/9999 ━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━ 25/83 0:00:08 • 0:00:11 5.74it/s v_num: ylku loss_step: 0.059 stage_step: 2.000 loss_epoch: 0.284 stage_epoch: 2.000 val_CER: 91.542 val_SER: 104.510 val_LER: 95.428
Traceback (most recent call last):
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1243, in _try_get_data
    data = self._data_queue.get(timeout=timeout)
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd
    fd = df.detach()
  File "/usr/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/usr/lib/python3.10/multiprocessing/resource_sharer.py", line 86, in get_connection
    c = Client(address, authkey=process.current_process().authkey)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 508, in Client
    answer_challenge(c, authkey)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 752, in answer_challenge
    message = connection.recv_bytes(256)         # reject large message
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/torch/utils/data/_utils/signal_handling.py", line 73, in handler
    _error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 503424) is killed by signal: Segmentation fault.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/opensource_models/SMT/fp-train-2.py", line 63, in <module>
    fire.Fire(launch)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/fire/core.py", line 135, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/fire/core.py", line 468, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/opensource_models/SMT/fp-train-2.py", line 60, in launch
    main(config_path, starting_checkpoint)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/opensource_models/SMT/fp-train-2.py", line 53, in main
    trainer.fit(model_wrapper,datamodule=datamodule)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 584, in fit
    call._call_and_handle_interrupt(
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 49, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 630, in _fit_impl
    self._run(model, ckpt_path=ckpt_path, weights_only=weights_only)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 1079, in _run
    results = self._run_stage()
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 1123, in _run_stage
    self.fit_loop.run()
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 217, in run
    self.advance()
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py", line 465, in advance
    self.epoch_loop.run(self._data_fetcher)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 153, in run
    self.advance(data_fetcher)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 311, in advance
    batch, _, __ = next(data_fetcher)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/loops/fetchers.py", line 134, in __next__
    batch = super().__next__()
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/loops/fetchers.py", line 61, in __next__
    batch = next(self.iterator)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/utilities/combined_loader.py", line 341, in __next__
    out = next(self._iterator)
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/lightning/pytorch/utilities/combined_loader.py", line 78, in __next__
    out[i] = next(self.iterators[i])
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 701, in __next__
    data = self._next_data()
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1448, in _next_data
    idx, data = self._get_data()
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1412, in _get_data
    success, data = self._try_get_data()
  File "/mnt/silenos/silenos1/nlp/projekty/music_ocr/venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1256, in _try_get_data
    raise RuntimeError(
RuntimeError: DataLoader worker (pid(s) 503424) exited unexpectedly
wandb:
wandb: 🚀 View run SMT-FP-CL at: https://wandb.ai/stepan-omelka-masarykova-univerzita/SMT-FP/runs/n0deylku
(venv) xomelka1@apollo:/mnt/silenos/silenos1/nlp/projekty/music_ocr/opensource_models/SMT$

Steps to reproduce:

prep - create the folder Generator/paper_textures in the root of the project and upload at least one image of paper into it (otherwise training fails; apart from that, this step is not relevant to the bug itself), as in the sketch below.
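For the prep step, a minimal sketch of how a placeholder texture could be generated - the assumption that a plain off-white image is accepted as a "paper texture" is mine, not something the repo prescribes; substitute a real scan if it isn't:

from pathlib import Path
from PIL import Image

# Hypothetical placeholder for Generator/paper_textures; assumes any RGB
# image file works as a texture.
texture_dir = Path("Generator/paper_textures")
texture_dir.mkdir(parents=True, exist_ok=True)
# A plain off-white, roughly A4-sized image as a stand-in texture.
Image.new("RGB", (2480, 3508), color=(245, 242, 233)).save(texture_dir / "blank_paper.png")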
pretrain weights, for example Polish scores, with this config:

{
"data": {
"data_path": "antoniorv6/grandstaff-ekern",
"batch_size": 1,
"vocab_name": "Polish_Scores_BeKern",
"num_workers": 20,
"krn_format": "bekern",
"reduce_ratio": 0.5
},
"checkpoint": {
"dirpath": "weights/Polish_Scores",
"filename": "FP-Polish_Scores-system-level",
"monitor": "val_SER"
}
}
run:

python fp-train-1.py config/FP-Polish_Scores/pretraining.json

in fp-train-2.py, optionally adjust the Trainer so that finetuning keeps running until the crash:

trainer = Trainer(max_epochs=10000, min_steps=300000 - skip_steps,
                  check_val_every_n_epoch=5,
                  logger=wandb_logger,
                  callbacks=[checkpointer, stage_checkpointer, early_stopping],  # remove early_stopping from this list
                  precision='16-mixed')
This is not absolutely required - the training can fail even before it would stop on its own. Do it just to guarantee the bug is reproduced.

run fp-train-2.py:

python fp-train-2.py config/FP-Polish_Scores/finetuning.json --starting_checkpoint weights/Polish_Scores/FP-Polish_Scores-system-level.ckpt

This error happened repeatedly. I tried to reduce the number of workers to 4, but it still failed with the same error.
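One detail that stands out in the traceback: the worker dies while the parent process is rebuilding tensor storage from a file descriptor (the rebuild_storage_fd frame). A generic PyTorch experiment for that failure mode - a standard knob, not anything from the SMT code - would be to switch the tensor sharing strategy:

import torch.multiprocessing

# Standard PyTorch setting; put it at the top of fp-train-2.py.
# The default 'file_descriptor' strategy passes storage fds between worker
# and parent (where the traceback dies); 'file_system' shares tensors
# through files in shared memory instead, sidestepping fd passing.
torch.multiprocessing.set_sharing_strategy('file_system')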
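To help triage whether the segfault lives in the data pipeline or in training itself, a hypothetical isolation harness - the datamodule construction is a placeholder for whatever fp-train-2.py actually builds from finetuning.json, not the repo's real API:

def stress_dataloader(datamodule, num_batches: int = 200_000) -> None:
    # Iterate only the train dataloader, no model or Trainer in the loop,
    # to see whether the workers still segfault after a few hours.
    datamodule.setup("fit")
    seen = 0
    while seen < num_batches:  # cycle epochs the way real training would
        for batch in datamodule.train_dataloader():
            seen += 1
            if seen % 1_000 == 0:
                print(f"{seen} batches fetched without a crash")
            if seen >= num_batches:
                return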