Skip to content

Commit 4cdb0bf

Browse files
benjeffery and mergify[bot]
authored and committed
Improve error messages for legacy and compressed formats
1 parent 5ee30fb commit 4cdb0bf

File tree

9 files changed

+128
-16
lines changed

9 files changed

+128
-16
lines changed

python/_tskitmodule.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* MIT License
33
*
4-
* Copyright (c) 2019-2022 Tskit Developers
4+
* Copyright (c) 2019-2023 Tskit Developers
55
* Copyright (c) 2015-2018 University of Oxford
66
*
77
* Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -221,10 +221,10 @@ handle_library_error(int err)
221221
{
222222
int kas_err;
223223
const char *not_kas_format_msg
224-
= "File not in kastore format. If this file "
225-
"was generated by msprime < 0.6.0 (June 2018) it uses the old HDF5-based "
226-
"format which can no longer be read directly. Please convert to the new "
227-
"kastore format using the ``tskit upgrade`` command.";
224+
= "File not in kastore format. Either the file is corrupt or it is not a "
225+
"tskit tree sequence file. It may be a legacy HDF file upgradable with "
226+
"`tskit upgrade` or a compressed tree sequence file that can be decompressed "
227+
"with `tszip`.";
228228
const char *ibd_pairs_not_stored_msg
229229
= "Sample pairs are not stored by default "
230230
"in the IdentitySegments object returned by ibd_segments(), and you have "

python/requirements/CI-complete/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,5 @@ pytest==7.1.3
1919
pytest-cov==4.0.0
2020
pytest-xdist==2.5.0
2121
svgwrite==1.4.3
22+
tszip==0.2.2
2223
xmlunittest==0.5.0

python/requirements/CI-tests-pip/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ biopython==1.79
1111
dendropy==4.5.2
1212
networkx==2.6.3 # Held at 2.6.3 for Python 3.7 compatibility
1313
msgpack==1.0.4
14-
newick==1.3.2
14+
newick==1.3.2
15+
tszip==0.2.2

python/requirements/development.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ sphinx-jupyterbook-latex
3636
sphinxcontrib-prettyspecialmethods
3737
tqdm
3838
tskit-book-theme
39+
tszip
3940
pydata_sphinx_theme>=0.7.2
4041
svgwrite>=1.1.10
4142
xmlunittest

python/tests/test_file_format.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# MIT License
22
#
3-
# Copyright (c) 2018-2022 Tskit Developers
3+
# Copyright (c) 2018-2023 Tskit Developers
44
# Copyright (c) 2016-2018 University of Oxford
55
#
66
# Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -36,12 +36,12 @@
3636
import msprime
3737
import numpy as np
3838
import pytest
39+
import tszip as tszip
3940

4041
import tests.tsutil as tsutil
4142
import tskit
4243
import tskit.exceptions as exceptions
4344

44-
4545
CURRENT_FILE_MAJOR = 12
4646
CURRENT_FILE_MINOR = 7
4747

@@ -262,11 +262,17 @@ def test_format_too_old_raised_for_hdf5(self):
262262
]
263263
for filename in files:
264264
path = os.path.join(test_data_dir, "hdf5-formats", filename)
265+
265266
with pytest.raises(
266267
exceptions.FileFormatError,
267-
match="uses the old HDF5-based format which can no longer",
268+
match="appears to be in HDF5 format",
268269
):
269270
tskit.load(path)
271+
with pytest.raises(
272+
exceptions.FileFormatError,
273+
match="appears to be in HDF5 format",
274+
):
275+
tskit.TableCollection.load(path)
270276

271277
def test_msprime_v_0_5_0(self):
272278
path = os.path.join(test_data_dir, "hdf5-formats", "msprime-0.5.0_v10.0.hdf5")
@@ -511,6 +517,14 @@ def test_no_h5py(self):
511517
with pytest.raises(ImportError, match=msg):
512518
tskit.dump_legacy(ts, path)
513519

520+
def test_tszip_file(self):
    """
    Loading a tszip-compressed file through either loader must raise a
    FileFormatError whose message identifies the zip format.
    """
    tree_sequence = msprime.simulate(5)
    tszip.compress(tree_sequence, self.temp_file)
    # Both public entry points are expected to report the same error.
    for loader in (tskit.load, tskit.TableCollection.load):
        with pytest.raises(
            tskit.FileFormatError, match="appears to be in zip format"
        ):
            loader(self.temp_file)
527+
514528

515529
class TestDumpFormat(TestFileFormat):
516530
"""

python/tests/test_fileobj.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# MIT License
22
#
3-
# Copyright (c) 2018-2022 Tskit Developers
3+
# Copyright (c) 2018-2023 Tskit Developers
44
#
55
# Permission is hereby granted, free of charge, to any person obtaining a copy
66
# of this software and associated documentation files (the "Software"), to deal
@@ -35,6 +35,7 @@
3535
import traceback
3636

3737
import pytest
38+
import tszip
3839
from pytest import fixture
3940

4041
import tskit
@@ -308,3 +309,59 @@ def verify_stream(self, ts_list, client_fd):
308309
def test_single_then_multi(self, ts_fixture, replicate_ts_fixture, client_fd):
309310
self.verify_stream([ts_fixture], client_fd)
310311
self.verify_stream(replicate_ts_fixture, client_fd)
312+
313+
314+
def write_to_fifo(path, file_path):
    """
    Write the full binary contents of ``file_path`` to ``path``.

    ``path`` is opened for writing first, then ``file_path`` for reading.
    """
    with open(path, "wb") as sink, open(file_path, "rb") as source:
        payload = source.read()
        sink.write(payload)
318+
319+
320+
def read_from_fifo(path, expected_exception, error_text, read_func):
    """
    Open ``path`` and check that ``read_func`` applied to the open handle
    raises ``expected_exception`` with a message matching ``error_text``.
    """
    # The handle is opened before entering pytest.raises, so an error from
    # open() itself is not mistaken for the expected exception.
    with open(path) as stream, pytest.raises(expected_exception, match=error_text):
        read_func(stream)
324+
325+
326+
def write_and_read_from_fifo(fifo_path, file_path, expected_exception, error_text):
    """
    Stream ``file_path`` through a FIFO into each tskit loader and check that
    loading fails as expected.

    A FIFO is created at ``fifo_path``. For each of ``tskit.load`` and
    ``tskit.TableCollection.load`` a reader process runs ``read_from_fifo``
    and a writer process runs ``write_to_fifo``; each is given up to three
    seconds to finish.

    :param fifo_path: Location at which the FIFO is created.
    :param file_path: File whose bytes are written into the FIFO.
    :param expected_exception: Exception type the loader must raise.
    :param str error_text: Pattern the exception message must match.
    :raises AssertionError: If the reader process does not finish in time or
        exits with a non-zero status.
    """
    os.mkfifo(fifo_path)
    for read_func in [tskit.load, tskit.TableCollection.load]:
        read_process = multiprocessing.Process(
            target=read_from_fifo,
            args=(fifo_path, expected_exception, error_text, read_func),
        )
        read_process.start()
        write_process = multiprocessing.Process(
            target=write_to_fifo, args=(fifo_path, file_path)
        )
        write_process.start()
        try:
            write_process.join(timeout=3)
            read_process.join(timeout=3)
            # The pytest.raises check runs inside the reader process, so its
            # outcome is only visible to this process through the exit code.
            assert not read_process.is_alive(), (
                f"reader for {read_func!r} did not finish within the timeout"
            )
            assert read_process.exitcode == 0, (
                f"reader for {read_func!r} exited with code "
                f"{read_process.exitcode}"
            )
        finally:
            # Never leave a child blocked on the FIFO behind us. The writer's
            # exit status is deliberately not asserted: it may fail with a
            # broken pipe if the reader closes the FIFO early.
            for process in (write_process, read_process):
                if process.is_alive():
                    process.terminate()
                    process.join()
340+
341+
342+
@pytest.mark.skipif(IS_WINDOWS, reason="No FIFOs on Windows")
class TestBadStream:
    """
    Streams non-kastore data through a FIFO and checks that loading reports
    that the input is not in kastore format.
    """

    def test_bad_stream(self, tmp_path):
        """Arbitrary bytes are rejected."""
        source = tmp_path / "bad_file"
        source.write_bytes(b"bad data")
        write_and_read_from_fifo(
            tmp_path / "fifo", source, tskit.FileFormatError, "not in kastore format"
        )

    def test_legacy_stream(self, tmp_path):
        """A legacy HDF5 file from the test data directory is rejected."""
        data_dir = os.path.join(os.path.dirname(__file__), "data", "hdf5-formats")
        source = os.path.join(data_dir, "msprime-0.3.0_v2.0.hdf5")
        write_and_read_from_fifo(
            tmp_path / "fifo", source, tskit.FileFormatError, "not in kastore format"
        )

    def test_tszip_stream(self, tmp_path, ts_fixture):
        """A tszip-compressed tree sequence is rejected."""
        source = tmp_path / "tszip_file"
        tszip.compress(ts_fixture, source)
        write_and_read_from_fifo(
            tmp_path / "fifo", source, tskit.FileFormatError, "not in kastore format"
        )

python/tskit/tables.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3197,12 +3197,18 @@ def __getstate__(self):
31973197
def load(cls, file_or_path, *, skip_tables=False, skip_reference_sequence=False):
31983198
file, local_file = util.convert_file_like_to_open_file(file_or_path, "rb")
31993199
ll_tc = _tskit.TableCollection()
3200-
ll_tc.load(
3201-
file,
3202-
skip_tables=skip_tables,
3203-
skip_reference_sequence=skip_reference_sequence,
3204-
)
3205-
return TableCollection(ll_tables=ll_tc)
3200+
try:
3201+
ll_tc.load(
3202+
file,
3203+
skip_tables=skip_tables,
3204+
skip_reference_sequence=skip_reference_sequence,
3205+
)
3206+
return TableCollection(ll_tables=ll_tc)
3207+
except tskit.FileFormatError as e:
3208+
util.raise_known_file_format_errors(file, e)
3209+
finally:
3210+
if local_file:
3211+
file.close()
32063212

32073213
def dump(self, file_or_path):
32083214
"""

python/tskit/trees.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4093,6 +4093,8 @@ def load(cls, file_or_path, *, skip_tables=False, skip_reference_sequence=False)
40934093
skip_reference_sequence=skip_reference_sequence,
40944094
)
40954095
return TreeSequence(ts)
4096+
except tskit.FileFormatError as e:
4097+
util.raise_known_file_format_errors(file, e)
40964098
finally:
40974099
if local_file:
40984100
file.close()

python/tskit/util.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
Module responsible for various utility functions used in other modules.
2424
"""
2525
import dataclasses
26+
import io
2627
import itertools
2728
import json
2829
import numbers
@@ -749,3 +750,32 @@ def random_nucleotides(length: numbers.Number, *, seed: Union[int, None] = None)
749750
encoded_nucleotides = np.array(list(map(ord, "ACTG")), dtype=np.int8)
750751
a = rng.choice(encoded_nucleotides, size=int(length))
751752
return a.tobytes().decode("ascii")
753+
754+
755+
def raise_known_file_format_errors(open_file, existing_exception):
    """
    Sniffs the file for pk-zip or hdf header bytes, then raises an exception
    if these are detected, if not raises the existing exception.

    This function always raises. If the file cannot be sniffed for any
    reason, ``existing_exception`` is raised so that a failure while
    sniffing never hides the original error.

    :param open_file: An open file object; it is rewound to the start and
        its first four bytes are read.
    :param Exception existing_exception: The exception raised by the failed
        load, re-raised (or chained as the cause) by this function.
    :raises tskit.FileFormatError: If the header matches HDF5 or zip.
    """
    try:
        open_file.seek(0)
        header = open_file.read(4)
    except (OSError, ValueError):
        # If we can't seek or read, we can't sniff the file.
        # io.UnsupportedOperation (non-seekable streams such as FIFOs) is a
        # subclass of both OSError and ValueError, so it is still covered.
        # Catching the base classes also covers a text-mode handle raising
        # UnicodeDecodeError on binary header bytes, and a closed file.
        raise existing_exception
    # A text-mode handle yields ``str`` here, which never equals the byte
    # signatures below, so it falls through to the original exception.
    # Check for HDF5 header bytes
    if header == b"\x89HDF":
        raise tskit.FileFormatError(
            "The specified file appears to be in HDF5 format. This file "
            "may have been generated by msprime < 0.6.0 (June 2018) which "
            "can no longer be read directly. Please convert to the new "
            "kastore format using the ``tskit upgrade`` command."
        ) from existing_exception
    # Check for the "PK" signature that starts a zip archive
    if header[:2] == b"\x50\x4b":
        raise tskit.FileFormatError(
            "The specified file appears to be in zip format, so may be a compressed "
            "tree sequence. Try using the tszip module to decompress this file before "
            "loading. `pip install tszip; tsunzip <filename>` or use "
            "`tszip.decompress` in Python code."
        ) from existing_exception
    raise existing_exception

0 commit comments

Comments
 (0)