From 494b4ea3ed97862b8eb22129e304321e6f62d08f Mon Sep 17 00:00:00 2001 From: Cade Mirchandani Date: Sat, 25 Jan 2025 12:50:44 -0800 Subject: [PATCH] migrate to maturin for pyd4 build --- .github/workflows/build-test-pyd4.yml | 235 +++++++++++++++++++++--- .gitignore | 1 + pyd4/Cargo.toml | 8 +- pyd4/pyproject.toml | 23 +-- pyd4/{ => python}/pyd4/__init__.py | 246 ++++++++++++++++++-------- pyd4/src/lib.rs | 2 +- 6 files changed, 398 insertions(+), 117 deletions(-) rename pyd4/{ => python}/pyd4/__init__.py (79%) diff --git a/.github/workflows/build-test-pyd4.yml b/.github/workflows/build-test-pyd4.yml index 16bca9b..ba5ac7e 100644 --- a/.github/workflows/build-test-pyd4.yml +++ b/.github/workflows/build-test-pyd4.yml @@ -1,42 +1,223 @@ -name: Build pyd4 +# This file is autogenerated by maturin v1.8.1 +# To update, run +# +# maturin generate-ci github --pytest +# +name: pyd4 CI on: push: - branches: [master] + branches: + - main + - master + tags: + - '*' pull_request: - branches: [master] + workflow_dispatch: + +permissions: + contents: read jobs: - build-and-test: + linux: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - - name: Set up Rust - uses: actions-rs/toolchain@v1 + - uses: actions/setup-python@v5 with: - toolchain: stable - profile: minimal + python-version: '3.12' + - uses: dtolnay/rust-toolchain@stable + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: x86_64 + args: --release --out dist --find-interpreter -m ./pyd4/Cargo.toml + # - name: Upload wheels + # uses: actions/upload-artifact@v4 + # with: + # name: wheels-linux-${{ matrix.platform.target }} + # path: dist + - name: pytest + shell: bash + run: | + set -e + python3 -m venv .venv + source .venv/bin/activate + pip install pyd4 --find-links dist --force-reinstall + pip install pytest + pytest + - - name: Install UV - uses: astral-sh/setup-uv@v3 + # musllinux: + # runs-on: ${{ matrix.platform.runner }} + # strategy: + # matrix: + # platform: + # - runner: ubuntu-22.04 + # target: x86_64 + # - runner: ubuntu-22.04 + # target: x86 + # - runner: ubuntu-22.04 + # target: aarch64 + # - runner: ubuntu-22.04 + # target: armv7 + # steps: + # - uses: actions/checkout@v4 + # - uses: actions/setup-python@v5 + # with: + # python-version: 3.x + # - name: Build wheels + # uses: PyO3/maturin-action@v1 + # with: + # target: ${{ matrix.platform.target }} + # args: --release --out dist --find-interpreter + # sccache: 'true' + # manylinux: musllinux_1_2 + # - name: Upload wheels + # uses: actions/upload-artifact@v4 + # with: + # name: wheels-musllinux-${{ matrix.platform.target }} + # path: dist + # - name: pytest + # if: ${{ startsWith(matrix.platform.target, 'x86_64') }} + # uses: addnab/docker-run-action@v3 + # with: + # image: alpine:latest + # options: -v ${{ github.workspace }}:/io -w /io + # run: | + # set -e + # apk add py3-pip py3-virtualenv + # python3 -m virtualenv .venv + # source .venv/bin/activate + # pip install pyd4 --no-index --find-links dist --force-reinstall + # pip install pytest + # pytest + # - name: pytest + # if: ${{ !startsWith(matrix.platform.target, 'x86') }} + # uses: uraimo/run-on-arch-action@v2 + # with: + # arch: ${{ matrix.platform.target }} + # distro: alpine_latest + # githubToken: ${{ github.token }} + # install: | + # apk add py3-virtualenv + # run: | + # set -e + # python3 -m virtualenv .venv + # source .venv/bin/activate + # pip install pytest + # pip install pyd4 --find-links dist --force-reinstall + # pytest - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version-file: "pyd4/pyproject.toml" + # windows: + # runs-on: ${{ matrix.platform.runner }} + # strategy: + # matrix: + # platform: + # - runner: windows-latest + # target: x64 + # - runner: windows-latest + # target: x86 + # steps: + # - uses: actions/checkout@v4 + # - uses: actions/setup-python@v5 + # with: + # python-version: 3.x + # architecture: ${{ matrix.platform.target }} + # - name: Build wheels + # uses: PyO3/maturin-action@v1 + # with: + # target: ${{ matrix.platform.target }} + # args: --release --out dist --find-interpreter + # sccache: 'true' + # - name: Upload wheels + # uses: actions/upload-artifact@v4 + # with: + # name: wheels-windows-${{ matrix.platform.target }} + # path: dist + # - name: pytest + # if: ${{ !startsWith(matrix.platform.target, 'aarch64') }} + # shell: bash + # run: | + # set -e + # python3 -m venv .venv + # source .venv/Scripts/activate + # pip install pyd4 --find-links dist --force-reinstall + # pip install pytest + # pytest - - name: Install project - run: | - cd pyd4 - uv sync --all-extras --dev + # macos: + # runs-on: ${{ matrix.platform.runner }} + # strategy: + # matrix: + # platform: + # - runner: macos-13 + # target: x86_64 + # - runner: macos-14 + # target: aarch64 + # steps: + # - uses: actions/checkout@v4 + # - uses: actions/setup-python@v5 + # with: + # python-version: 3.x + # - name: Build wheels + # uses: PyO3/maturin-action@v1 + # with: + # target: ${{ matrix.platform.target }} + # args: --release --out dist --find-interpreter + # sccache: 'true' + # - name: Upload wheels + # uses: actions/upload-artifact@v4 + # with: + # name: wheels-macos-${{ matrix.platform.target }} + # path: dist + # - name: pytest + # run: | + # set -e + # python3 -m venv .venv + # source .venv/bin/activate + # pip install pyd4 --find-links dist --force-reinstall + # pip install pytest + # pytest - - name: Build - run: | - cd pyd4 - uv build + # sdist: + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v4 + # - name: Build sdist + # uses: PyO3/maturin-action@v1 + # with: + # command: sdist + # args: --out dist + # - name: Upload sdist + # uses: actions/upload-artifact@v4 + # with: + # name: wheels-sdist + # path: dist - - name: Test - run: | - cd pyd4 - uv run pytest \ No newline at end of file + # release: + # name: Release + # runs-on: ubuntu-latest + # if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} + # needs: [linux, musllinux, windows, macos, sdist] + # permissions: + # # Use to sign the release artifacts + # id-token: write + # # Used to upload release artifacts + # contents: write + # # Used to generate artifact attestation + # attestations: write + # steps: + # - uses: actions/download-artifact@v4 + # - name: Generate artifact attestation + # uses: actions/attest-build-provenance@v1 + # with: + # subject-path: 'wheels-*/*' + # - name: Publish to PyPI + # if: ${{ startsWith(github.ref, 'refs/tags/') }} + # uses: PyO3/maturin-action@v1 + # env: + # MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + # with: + # command: upload + # args: --non-interactive --skip-existing wheels-*/* \ No newline at end of file diff --git a/.gitignore b/.gitignore index e1f745a..119835c 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ pyd4/*.egg-info/** pyd4/build/** pyd4/**/__pycache__/** .ipynb* +dist \ No newline at end of file diff --git a/pyd4/Cargo.toml b/pyd4/Cargo.toml index 4d1e6da..aea661a 100644 --- a/pyd4/Cargo.toml +++ b/pyd4/Cargo.toml @@ -12,5 +12,9 @@ env_logger = "0.9.0" [lib] crate-type = ["cdylib"] -name = "pyd4" -path = "src/lib.rs" \ No newline at end of file +name = "_pyd4" +path = "src/lib.rs" + +[features] +extension-module = ["pyo3/extension-module"] +default = ["extension-module"] \ No newline at end of file diff --git a/pyd4/pyproject.toml b/pyd4/pyproject.toml index 0eedf27..aa0f418 100644 --- a/pyd4/pyproject.toml +++ b/pyd4/pyproject.toml @@ -1,13 +1,13 @@ [build-system] -requires = ["setuptools", "setuptools-rust", "wheel"] -build-backend = "setuptools.build_meta" +requires = ["maturin>=1,<2"] +build-backend = "maturin" [project] name = "pyd4" -version = "0.3.6.2" -requires-python = ">=3.12" +version = "0.3.10" +requires-python = ">=3.10" description = "Python binding for D4 file format" -dependencies = ["numpy>=2.2.2"] +dependencies = ["numpy>1.24.4"] readme = "README.md" classifiers = [ "License :: OSI Approved :: MIT License", @@ -19,15 +19,10 @@ classifiers = [ "Operating System :: MacOS :: MacOS X", ] -[tool.setuptools.packages] -find = { where = ["pyd4"] } +[tool.maturin] +python-source = "python" +module-name = "pyd4._pyd4" # Points to Rust -[[tool.setuptools-rust.ext-modules]] -target = "pyd4" -path = "Cargo.toml" -binding = "PyO3" [dependency-groups] -dev = [ - "pytest>=8.3.4", -] \ No newline at end of file +dev = ["pytest>=8.3.4", "maturin>=1,<2"] \ No newline at end of file diff --git a/pyd4/pyd4/__init__.py b/pyd4/python/pyd4/__init__.py similarity index 79% rename from pyd4/pyd4/__init__.py rename to pyd4/python/pyd4/__init__.py index d0a5516..a2dc4c2 100644 --- a/pyd4/pyd4/__init__.py +++ b/pyd4/python/pyd4/__init__.py @@ -2,7 +2,13 @@ The Python Binding for the D4 file format. """ -from .pyd4 import D4File as D4FileImpl, D4Iter, D4Builder as D4BuilderImpl, D4Writer as D4WriterImpl, D4Merger as D4MergerImpl +from ._pyd4 import ( + D4File as D4FileImpl, + D4Iter, + D4Builder as D4BuilderImpl, + D4Writer as D4WriterImpl, + D4Merger as D4MergerImpl, +) import numpy import ctypes @@ -13,37 +19,48 @@ import math from pathlib import Path -def bam_to_d4(bam_file, output = None, compression = False, encode_dict = "auto", read_flags = 0xffff, reference_genome = None): + +def bam_to_d4( + bam_file, + output=None, + compression=False, + encode_dict="auto", + read_flags=0xFFFF, + reference_genome=None, +): """ Create a coverage profile from a given BAM/CRAM file and return the opened D4 file. - This function provide a fast way to profile per-base coverage and load it as numpy array. - The following code will compute the depth for hg002 and load the per-base converage as + This function provide a fast way to profile per-base coverage and load it as numpy array. + The following code will compute the depth for hg002 and load the per-base converage as numpy array. (Typically this takes < 2 min) chr1_coverage = bam_to_d4("hg002.bam").load_to_np("1") - + If the output parameter is given, the D4 file will be placed to the path 'output' described. """ if output == None: - fp = tempfile.NamedTemporaryFile(delete = False, suffix = ".d4") + fp = tempfile.NamedTemporaryFile(delete=False, suffix=".d4") output = fp.name + def remove_temp_file(): os.remove(output) + atexit.register(remove_temp_file) cmd_line = ["d4tools", "create", bam_file, output] if compression: cmd_line.append("-z") if encode_dict != "auto": - cmd_line.append("--dict-range=%s"%encode_dict) + cmd_line.append("--dict-range=%s" % encode_dict) else: cmd_line.append("--dict-auto") - if read_flags != 0xffff: - cmd_line.append("--bam-flag=%s"%read_flags) + if read_flags != 0xFFFF: + cmd_line.append("--bam-flag=%s" % read_flags) if reference_genome != None: cmd_line.append("--ref=" + reference_genome) subprocess.run(cmd_line) return D4File(output) + def enumerate_values(inf, chrom, begin, end): """ A helper function that can enumerate all the values in given range. @@ -51,16 +68,22 @@ def enumerate_values(inf, chrom, begin, end): for pos,val in pyd4.enumerate_values(input, "1", 0, 10000): print(pos, val) - + """ if inf.__class__ == list: + def gen(): iters = [x.value_iter(chrom, begin, end) for x in inf] denom = [x.get_denominator() for x in inf] for pos in range(begin, end): yield (chrom, pos, [f.__next__() / d for f, d in zip(iters, denom)]) + return gen() - return map(lambda p: (chrom, p[0], p[1]), zip(range(begin, end), inf.value_iter(chrom, begin, end))) + return map( + lambda p: (chrom, p[0], p[1]), + zip(range(begin, end), inf.value_iter(chrom, begin, end)), + ) + def open_all_tracks(fp): """ @@ -69,65 +92,84 @@ def open_all_tracks(fp): f = D4File(fp) return [f.open_track(track_label) for track_label in f.list_tracks()] + class D4Merger(D4MergerImpl): """ The helper class to make multi-track D4 files. This class enables merge multiple single track D4 file into one multi-track file """ + def __del__(self): self.merge() + def add_track(self, path): tag = str(Path(path).stem) self.add_tagged_track(tag, path) return self + def add_tagged_track(self, tag, path): super().add_tagged_track(tag, path) return self + class D4Matrix: """ Higher level abstraction for a multitrack D4 file """ - def __init__(self, tracks, track_names = None): + + def __init__(self, tracks, track_names=None): self.tracks = tracks self.track_names = track_names + def enumerate_values(self, chrom, begin, end): """ Enumerate values in the given range """ return enumerate_values(self.tracks, chrom, begin, end) + def __getitem__(self, key): data = [track[key] for track in self.tracks] return numpy.stack(data) - def resample(self, regions, method = "mean", bin_size = 1000, allow_bin_size_adjustment = True): - data = [track.resample(regions, method, bin_size, allow_bin_size_adjustment) for track in self.tracks] + + def resample( + self, regions, method="mean", bin_size=1000, allow_bin_size_adjustment=True + ): + data = [ + track.resample(regions, method, bin_size, allow_bin_size_adjustment) + for track in self.tracks + ] ret = [] for idx in range(len(data[0])): region_data = [track_data[idx] for track_data in data] ret.append(numpy.stack(region_data)) return ret + + class D4Writer: def __init__(self, writer_obj): self._inner = writer_obj + def __del__(self): if self._inner: self._inner.close() + def close(self): """ Manually close the D4 writer. Unless the D4 writer is closed, the output file - may be incompleted and unable to read correctly. + may be incompleted and unable to read correctly. - This will be automatically called when the writer gets deleted. + This will be automatically called when the writer gets deleted. You can also call this function explicitly so that the file will be complete right after this invocation. """ if self._inner != None: self._inner.close() self._inner = None + def write_np_array(self, chr, pos, data): """ - Write a numpy array to a D4 file - The data will be stored from the locus chr:pos specified + Write a numpy array to a D4 file + The data will be stored from the locus chr:pos specified """ if self._inner == None: raise RuntimeError("Unable to write a closed D4 file") @@ -138,45 +180,54 @@ def write_np_array(self, chr, pos, data): data_ptr = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)) data_addr = ctypes.cast(data_ptr, ctypes.c_void_p).value self._inner.write(chr, pos, data_addr, data.shape[0]) + + class D4Builder(D4BuilderImpl): """ The helper class to build a new D4 file """ + def __init__(self, output_path): self.output_path = output_path self.index = "" - def enable_secondary_table_compression(self, level = 5): + + def enable_secondary_table_compression(self, level=5): """ Enable the secondary table compression for the d4 file to created """ self.set_compression(level) return self + def set_dict_bits(self, n): """ Set how many bits we want to use for the primary table. This function will encode the value range starting from 0. """ - self.dict_range(0, 1< nth: return value value += 1 return 0 + def median(self): """ Compute the median value of the histogram """ return self.percentile(50) + def std(self): """ Compute the standard deviation of this histogram @@ -297,31 +368,34 @@ def std(self): ex = sum / self.total_count() esx = square_sum / self.total_count() return math.sqrt(esx - ex * ex) + + class D4File(D4FileImpl): """ - The python wrapper for a D4 file reader. - - 'mean', 'median', 'percentile' method supports various 'regions' parameter: - - # Single chromosome, this will return a single value - self.mean("chr1") - # List of chromosomes, this will return a list of values - self.mean(["chr1", "chr2"]) - # Region specification as "chr:begin-end" or "chr:begin-" - self.mean("chr1:0-10000") - # List of region specification - self.mean(["chr1:1000-", "chr2:0-1000"]) - # A tuple of (chr, begin, end) or (chr, begin) - self.mean(("chr1", 0, 10000)) - # A list of tuple - self.mean([("chr1", 0, 10000)]) + The python wrapper for a D4 file reader. + + 'mean', 'median', 'percentile' method supports various 'regions' parameter: + + # Single chromosome, this will return a single value + self.mean("chr1") + # List of chromosomes, this will return a list of values + self.mean(["chr1", "chr2"]) + # Region specification as "chr:begin-end" or "chr:begin-" + self.mean("chr1:0-10000") + # List of region specification + self.mean(["chr1:1000-", "chr2:0-1000"]) + # A tuple of (chr, begin, end) or (chr, begin) + self.mean(("chr1", 0, 10000)) + # A list of tuple + self.mean([("chr1", 0, 10000)]) """ - def create_on_same_genome(self, output, seqs = None): + + def create_on_same_genome(self, output, seqs=None): """ - Create a new D4 file which would use the same reference genome. + Create a new D4 file which would use the same reference genome. - Use 'seqs' parameter to selectively choose which chromosome to select + Use 'seqs' parameter to selectively choose which chromosome to select """ ret = D4Builder(output) if seqs != None: @@ -331,46 +405,51 @@ def create_on_same_genome(self, output, seqs = None): ret.add_seq(seq, this_seqs[seq]) ret.dup_dict(self) return ret + def percentile(self, regions, nth): """ Return the percentile value in the given regions. """ - def collect_region(name, begin, end): + + def collect_region(name, begin, end): return (name, begin, end) + region_spec = self._for_each_region(regions, collect_region) histo_result = super().histogram(region_spec, 0, 1000) ret = [] for result, (chr, begin, end) in zip(histo_result, region_spec): ret.append(self._percentile_impl(result, begin, end, nth)) return ret - def _percentile_impl(self, result, chrom, begin = 0, end = None, nth = 50): + + def _percentile_impl(self, result, chrom, begin=0, end=None, nth=50): if end == None: chroms = dict(self.chroms()) end = chroms[chrom] hist, below, above = super().histogram([(chrom, begin, end)], 0, 65536)[0] total = end - begin - if nth < below * 100.0 / total or \ - 100.0 - above * 100.0 / total < nth: + if nth < below * 100.0 / total or 100.0 - above * 100.0 / total < nth: data = self[(chrom, begin, end)] - low,high = data.min(),data.max() + 1 + low, high = data.min(), data.max() + 1 while high - low > 1: mid = (high + low) // 2 p = (data < mid).sum() * 100.0 / total if p < nth: low = mid - else: + else: high = mid return low acc = below - for value,count in hist: + for value, count in hist: if (acc + count) * 100.0 / total > nth: return value acc += count + def enumerate_values(self, chrom, begin, end): """ Enuemrate all the values in given range """ return enumerate_values([self], chrom, begin, end) + def open_all_tracks(self): """ Open all the tracks that are living in this file @@ -378,41 +457,50 @@ def open_all_tracks(self): tracks = self.list_tracks() return D4Matrix( [D4File(self.get_track_specifier(track_label)) for track_label in tracks], - track_names = tracks + track_names=tracks, ) + def chrom_names(self): """ Return a list of chromosome names """ return list(map(lambda x: x[0], self.chroms())) + def histogram(self, regions, min=0, max=1024): """ Returns the value histogram for given regions """ is_list = type(regions) == list - regions = self._for_each_region(regions, lambda name, begin, end: (name, begin, end), False) + regions = self._for_each_region( + regions, lambda name, begin, end: (name, begin, end), False + ) ret = super().histogram(regions, min, max) if not is_list: return Histogram(ret[0]) return list(map(Histogram, ret)) + def median(self, regions): """ return the median value for the given regions """ - return self.percentile(regions, nth = 50) + return self.percentile(regions, nth=50) + def mean(self, regions): """ - Compute the mean depth of the given region. - """ + Compute the mean depth of the given region. + """ is_list = type(regions) == list - regions = self._for_each_region(regions, lambda name, begin, end: (name, begin, end), False) + regions = self._for_each_region( + regions, lambda name, begin, end: (name, begin, end), False + ) ret = super().mean(regions) if not is_list: return ret[0] return ret + def _parse_region(self, key): chroms = dict(self.chroms()) - splitted = key.split(":",1) + splitted = key.split(":", 1) chr = splitted[0] if len(splitted) == 1: return (chr, 0, chroms[chr]) @@ -423,6 +511,7 @@ def _parse_region(self, key): return (chr, int(begin), chroms[chr]) else: return (chr, int(begin), int(end)) + def __getitem__(self, key): if type(key) == str: key = self._parse_region(key) @@ -430,7 +519,8 @@ def __getitem__(self, key): return self.load_to_np(key) else: raise ValueError("Unspported region specification") - def _for_each_region(self, regions, func, unpack_single_value = True): + + def _for_each_region(self, regions, func, unpack_single_value=True): ret = [] chroms = dict(self.chroms()) single_value = False @@ -455,7 +545,10 @@ def _for_each_region(self, regions, func, unpack_single_value = True): if unpack_single_value: return ret if not single_value else ret[0] return ret - def resample(self, regions, method = "mean", bin_size = 1000, allow_bin_size_adjustment = True): + + def resample( + self, regions, method="mean", bin_size=1000, allow_bin_size_adjustment=True + ): """ Re-sample the given region and return the value as an numpy array """ @@ -465,6 +558,7 @@ def resample(self, regions, method = "mean", bin_size = 1000, allow_bin_size_adj bin_size = 65536 else: bin_size -= bin_size % 65536 + def split_region(chr, begin, end): ret = [] while begin < end: @@ -472,6 +566,7 @@ def split_region(chr, begin, end): ret.append((chr, begin, bin_end)) begin = bin_end return ret + splitted = self._for_each_region(regions, split_region, False) size = [] tasks = [] @@ -486,9 +581,9 @@ def split_region(chr, begin, end): values = self.median(tasks) else: raise TypeError("Unsupported resample method") - ret = [numpy.zeros(shape = (size[i])) for i in range(0, idx)] + ret = [numpy.zeros(shape=(size[i])) for i in range(0, idx)] idx = 0 - ofs = 0 + ofs = 0 for val in values: if ofs >= size[idx]: ofs = 0 @@ -496,12 +591,14 @@ def split_region(chr, begin, end): ret[idx][ofs] = val ofs += 1 return (ret[0] if unpack and len(ret) == 1 else ret, bin_size) + def load_to_np(self, regions): """ Load regions as numpy array. It's similar to the __getitem__ operator. """ + def load_to_np_impl(name, begin, end): - buf = numpy.zeros(shape=(end - begin,), dtype = numpy.int32) + buf = numpy.zeros(shape=(end - begin,), dtype=numpy.int32) buf_ptr = buf.ctypes.data_as(ctypes.POINTER(ctypes.c_uint32)) buf_addr = ctypes.cast(buf_ptr, ctypes.c_void_p).value self.load_values_to_buffer(name, begin, end, buf_addr) @@ -510,5 +607,8 @@ def load_to_np_impl(name, begin, end): return buf else: return buf / denom + return self._for_each_region(regions, load_to_np_impl) -__all__ = [ 'D4File', 'D4Iter', 'D4Matrix', 'D4Builder'] + + +__all__ = ["D4File", "D4Iter", "D4Matrix", "D4Builder"] diff --git a/pyd4/src/lib.rs b/pyd4/src/lib.rs index e100d24..e0420a0 100644 --- a/pyd4/src/lib.rs +++ b/pyd4/src/lib.rs @@ -77,7 +77,7 @@ impl ReaderWrapper { } #[pymodule] -pub fn pyd4(_py: Python<'_>, m: &PyModule) -> PyResult<()> { +pub fn _pyd4(_py: Python<'_>, m: &PyModule) -> PyResult<()> { env_logger::init(); m.add_class::()?; m.add_class::()?;