From 503edac506ce85928f367deb13ea8dd7bcef4bcf Mon Sep 17 00:00:00 2001 From: copenri Date: Sun, 9 Mar 2025 17:10:16 -0400 Subject: [PATCH 1/5] Add test cases for output utils Fixes: #35 --- .github/workflows/run_tests.yml | 28 +++++++ src/eastr/extract_junctions.py | 5 +- src/eastr/output_utils.py | 118 ++++++++++++++-------------- src/eastr/output_utils_test.py | 131 ++++++++++++++++++++++++++++++++ src/eastr/run_eastr.py | 8 +- src/eastr/utils.py | 34 +++++++-- src/eastr/utils_test.py | 37 +++++++++ 7 files changed, 292 insertions(+), 69 deletions(-) create mode 100644 .github/workflows/run_tests.yml create mode 100644 src/eastr/output_utils_test.py create mode 100644 src/eastr/utils_test.py diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml new file mode 100644 index 0000000..76da00c --- /dev/null +++ b/.github/workflows/run_tests.yml @@ -0,0 +1,28 @@ +name: Run Unit Tests + +on: + pull_request: + +jobs: + tests: + name: Run Python Tests + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + with: + submodules: 'recursive' + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + - name: Install dependencies + run: python -m pip install --upgrade pip setuptools wheel + - name: Test with pytest + run: | + pip install pytest pytest-cov + pip install -e . 
+ pytest src/eastr --cov=eastr diff --git a/src/eastr/extract_junctions.py b/src/eastr/extract_junctions.py index 9eace1e..d2e013f 100644 --- a/src/eastr/extract_junctions.py +++ b/src/eastr/extract_junctions.py @@ -5,6 +5,7 @@ import shlex import subprocess +from eastr import utils import pandas as pd this_directory = pathlib.Path(__file__).resolve().parent @@ -40,7 +41,7 @@ def get_junctions_multi_bed(bed_list:list, p) -> dict: dd = collections.defaultdict(dict) for i, d in enumerate(results): - name2 = os.path.splitext(os.path.basename(bed_list[i]))[0] + name2 = utils.sanitize_name(bed_list[i]) for key, (name, score) in d.items(): if key not in dd: dd[key]['samples']= set() @@ -54,7 +55,7 @@ def get_junctions_multi_bed(bed_list:list, p) -> dict: def junction_extractor(bam_path:str, out_path:str) -> dict: check_for_dependency() - name = os.path.splitext(os.path.basename(bam_path))[0] + name = utils.sanitize_name(bam_path) cmd = f"{JUNCTION_CMD} -o {out_path} {bam_path}" a = subprocess.Popen(shlex.split(cmd), stdout=subprocess.DEVNULL) b = a.communicate() diff --git a/src/eastr/output_utils.py b/src/eastr/output_utils.py index f2df037..3dc778c 100644 --- a/src/eastr/output_utils.py +++ b/src/eastr/output_utils.py @@ -7,7 +7,7 @@ import subprocess import sys import tempfile -from typing import List, Union +from typing import List, Optional, Union from eastr import alignment_utils from eastr import utils @@ -16,74 +16,112 @@ # This should exist with source after compilation. 
VACUUM_CMD = os.path.join(this_directory, 'vacuum') -def out_junctions_filelist(bam_list:list, gtf_path, bed_list, out_junctions, suffix="") -> Union[List[str], None, str]: +def out_junctions_filelist( + bam_list: Optional[list[str]] = None, + bed_list: Optional[list[str]] = None, + gtf_path: Optional[str] = None, + out_junctions: Optional[str] = None, + suffix: Optional[str] = None, +) -> Union[List[str], None, str]: if out_junctions is None: return None if gtf_path: - if utils.check_directory_or_file(out_junctions) == 'dir': - out_junctions= out_junctions + "/" + os.path.splitext(os.path.basename(gtf_path)[0]) + suffix + ".bed" + if os.path.isdir(out_junctions): + out_junctions = os.path.join( + out_junctions, + utils.sanitize_and_update_extension(gtf_path, f"{suffix}.bed"), + ) return out_junctions if bed_list: - if suffix in ["_original_junctions", ""]: + if suffix in {"_original_junctions", "", None}: return None if len(bed_list) == 1: - if utils.check_directory_or_file(out_junctions) == 'dir': - out_junctions = f"{out_junctions}/{os.path.splitext(os.path.basename(gtf_path))[0]}{suffix}.bed" + if os.path.isdir(out_junctions): + out_junctions = os.path.join( + out_junctions, + utils.sanitize_and_update_extension(gtf_path, f"{suffix}.bed"), + ) path = os.path.dirname(out_junctions) utils.make_dir(path) return [out_junctions] - if utils.check_directory_or_file(out_junctions) == 'file': + if not os.path.isdir(out_junctions): print("ERROR: the path provided for the output bed files is a file path, not a directory") sys.exit(1) utils.make_dir(out_junctions) result = [] for bed in bed_list: - result.append(f"{out_junctions}/{os.path.splitext(os.path.basename(bed))[0]}{suffix}.bed") + result.append( + os.path.join( + out_junctions, + utils.sanitize_and_update_extension(bed, f"{suffix}.bed"), + ) + ) return result if len(bam_list) == 1: - if utils.check_directory_or_file(out_junctions) == 'dir': - out_junctions = 
f"{out_junctions}/{os.path.splitext(os.path.basename(bam_list[0]))[0]}{suffix}.bed" + if os.path.isdir(out_junctions): + out_junctions = os.path.join( + out_junctions, + utils.sanitize_and_update_extension(bam_list[0], f"{suffix}.bed"), + ) path = os.path.dirname(out_junctions) utils.make_dir(path) return [out_junctions] - if utils.check_directory_or_file(out_junctions) == 'file': + if not os.path.isdir(out_junctions): print("ERROR: the path provided for the output bed files is a file path, not a directory") sys.exit(1) utils.make_dir(out_junctions) result = [] for bam in bam_list: - result.append(out_junctions + "/" + os.path.splitext(os.path.basename(bam))[0] + suffix + ".bed") + result.append( + os.path.join( + out_junctions, + utils.sanitize_and_update_extension(bam, f"{suffix}.bed"), + ) + ) return result -def out_filtered_bam_filelist(bam_list:list, out_filtered_bam, suffix="_EASTR_filtered") -> Union[List[str], None]: +def out_filtered_bam_filelist( + bam_list: Optional[list[str]] = None, + out_filtered_bam: Optional[str] = None, + suffix: Optional[str] = None, +) -> Union[List[str], None]: result = None if bam_list is None or out_filtered_bam is None: - return + return result + + if suffix is None: + suffix = "_EASTR_filtered" if len(bam_list) == 1: - if utils.check_directory_or_file(out_filtered_bam) == 'dir': - out_filtered_bam = out_filtered_bam + "/" + os.path.splitext(os.path.basename(bam_list[0]))[0] + suffix + ".bam" + if os.path.isdir(out_filtered_bam): + out_filtered_bam = os.path.join( + out_filtered_bam, + utils.sanitize_and_update_extension(bam_list[0], f"{suffix}.bam"), + ) path = os.path.dirname(out_filtered_bam) utils.make_dir(path) result = [out_filtered_bam] - else: - if utils.check_directory_or_file(out_filtered_bam) == 'file': + if not os.path.isdir(out_filtered_bam): print("ERROR: the path provided for the output file is a file, not a directory") sys.exit(1) utils.make_dir(out_filtered_bam) result = [] for bam in bam_list: - 
result.append(out_filtered_bam + "/" + os.path.splitext(os.path.basename(bam))[0] + suffix + ".bam") - + result.append( + os.path.join( + out_filtered_bam, + utils.sanitize_and_update_extension(bam, f"{suffix}.bam"), + ) + ) return result def writer_spurious_dict_bam_to_bed(spurious_dict, named_keys, scoring, writer): @@ -160,7 +198,7 @@ def create_sample_to_bed_dict(sample_names, out_removed_junctions_filelist): def spurious_dict_bed_by_sample_to_bed(spurious_dict, bed_list, out_removed_junctions_filelist, scoring): - sample_names = [os.path.splitext(os.path.basename(bed_path))[0] for bed_path in bed_list] + sample_names = [utils.sanitize_name(bed_path) for bed_path in bed_list] sample_to_bed = create_sample_to_bed_dict(sample_names, out_removed_junctions_filelist) if out_removed_junctions_filelist is None: @@ -188,7 +226,7 @@ def spurious_dict_bed_by_sample_to_bed(spurious_dict, bed_list, out_removed_junc def spurious_dict_bam_by_sample_to_bed(spurious_dict, bam_list, out_removed_junctions_filelist, scoring): #get sample name from bam_list - sample_names = [os.path.splitext(os.path.basename(bam_path))[0] for bam_path in bam_list] + sample_names = [utils.sanitize_name(bam_path) for bam_path in bam_list] #dictionary where the key is the sample name and the value is a file path sample_to_bed = {} @@ -227,7 +265,7 @@ def filter_bam_with_vacuum(bam_path, spurious_junctions_bed, out_bam_path, verbo if verbose: vacuum_cmd = f"{vacuum_cmd} -V" if removed_alignments_bam: - out_bam_name = os.path.splitext(out_bam_path)[0] + out_bam_name, _ = os.path.splitext(out_bam_path) vacuum_cmd = f'{vacuum_cmd} -r {out_bam_name}_removed_alignments.bam' vacuum_cmd = f'{vacuum_cmd} -o {out_bam_path} {bam_path} {spurious_junctions_bed}' vacuum_cmd = shlex.split(vacuum_cmd) @@ -256,7 +294,7 @@ def filter_multi_bam_with_vacuum(bam_list, sample_to_bed, out_bam_list, p, verbo removed_alignments_bam = [False for bam in bam_list] - sample_names = 
[os.path.splitext(os.path.basename(bam_path))[0] for bam_path in bam_list] + sample_names = [utils.sanitize_name(bam_path) for bam_path in bam_list] #run filter_bam_with_vacuum in parallel with multiprocessing starmap pool = multiprocessing.Pool(processes=p) with pool: @@ -266,36 +304,6 @@ def filter_multi_bam_with_vacuum(bam_list, sample_to_bed, out_bam_list, p, verbo for out in outs: print(out) -#def write_gtf_to_bed(spurious_dict, out_removed_junctions_filelist, scoring): - # sorted_keys = spurious_dict.keys() - # named_keys = {} - # for i, key in enumerate(sorted_keys): - # name = "JUNC{}".format(i+1) - # named_keys[key] = name - - # out_io_dict = {} - # for i,sample in enumerate(sample_names): - # out_io_dict[sample] = [StringIO(), out_removed_junctions_filelist[i]] - - # for key, value in spurious_dict.items(): - # name = named_keys[key] - # samples = value['samples'] - # score2 = alignment_utils.calc_alignment_score(value['hit'],scoring) - # for i, sample in enumerate(samples): - # score, sample_id = sample - # out_io_dict[sample_id][0].write(f"{key[0]}\t{key[1]}\t{key[2]}\t{name}\t{score}\t{key[3]}\t{score2}") - - # for (out,filepath) in out_io_dict.values(): - # with open(filepath, 'w') as out_removed_junctions: - # out_removed_junctions.write(out.getvalue()) - -# if __name__ == '__main__': -# # import time -# # start = time.time() -# # spurious_alignments, NH = get_spurious_alignments(bam_path, spurious_introns) -# # end = time.time() -# # print(f"took {(end-start)/60} mins")) - def check_for_dependency(): """Check if runtime dependency exists.""" if not os.path.exists(VACUUM_CMD): diff --git a/src/eastr/output_utils_test.py b/src/eastr/output_utils_test.py new file mode 100644 index 0000000..3333190 --- /dev/null +++ b/src/eastr/output_utils_test.py @@ -0,0 +1,131 @@ +"""eastr output utils test.""" + +import tempfile +import unittest + +from eastr import output_utils + + +class OutputUtilsTest(unittest.TestCase): + out_junctions_test_folder = None + + 
def setUp(self): + self.out_junctions_test_folder = tempfile.TemporaryDirectory() + + def tearDown(self): + self.out_junctions_test_folder.cleanup() + + def test_out_junctions_filelist_no_out_junctions(self): + """Expect that function should return None if out_junctions is None.""" + results = output_utils.out_junctions_filelist( + bam_list=None, + gtf_path=None, + bed_list=None, + out_junctions=None, + suffix="test" + ) + self.assertIsNone(results) + + def test_out_junctions_filelist_gtf_path(self): + """Expect that function should return a filelist for a gtf_path.""" + with tempfile.NamedTemporaryFile(suffix=".gtf") as gtfp: + results = output_utils.out_junctions_filelist( + bam_list=None, + gtf_path=gtfp.name, + bed_list=None, + out_junctions=self.out_junctions_test_folder.name, + suffix="test" + ) + self.assertIsNotNone(results) + self.assertTrue(results.endswith("test.bed")) + + def test_out_junctions_filelist_bedlist(self): + """Expect that function should return a filelist for a bedlist.""" + # No results for suffix with "_original_junctions" + results = output_utils.out_junctions_filelist( + bam_list=None, + gtf_path=None, + bed_list=["test"], + out_junctions=self.out_junctions_test_folder.name, + suffix="_original_junctions" + ) + self.assertIsNone(results) + + # No results for empty suffix + results = output_utils.out_junctions_filelist( + bam_list=None, + gtf_path=None, + bed_list=["test"], + out_junctions=self.out_junctions_test_folder.name, + suffix="" + ) + self.assertIsNone(results) + + # Results for valid bed_list and suffix + sample_bed_list = ["test1", "test2"] + results = output_utils.out_junctions_filelist( + bam_list=None, + gtf_path=None, + bed_list=sample_bed_list, + out_junctions=self.out_junctions_test_folder.name, + suffix="_valid" + ) + self.assertIsNotNone(results) + for i, result in enumerate(results): + self.assertTrue(result.endswith(f"{sample_bed_list[i]}_valid.bed")) + + def test_out_filtered_bam_filelist_none(self): + """Expect
that function should return None for empty bamlist.""" + results = output_utils.out_filtered_bam_filelist( + bam_list=None, + out_filtered_bam="test", + suffix=None, + ) + self.assertIsNone(results) + + def test_out_filtered_bam_filelist_empty_out_filtered_bam(self): + """Expect that function should return None for empty out_filtered_bam.""" + results = output_utils.out_filtered_bam_filelist( + bam_list=["test"], + out_filtered_bam=None, + suffix=None, + ) + self.assertIsNone(results) + + def test_out_filtered_bam_filelist_single_bam_list(self): + """Expect that function should return a single filtered output bam.""" + results = output_utils.out_filtered_bam_filelist( + bam_list=["test"], + out_filtered_bam=self.out_junctions_test_folder.name, + suffix=None, + ) + self.assertIsNotNone(results) + self.assertEqual(len(results), 1) + self.assertTrue(results[0].endswith("test_EASTR_filtered.bam")) + + def test_out_filtered_bam_filelist_single_bam_list_suffix(self): + """Expect that function should return a single output bam with suffix.""" + results = output_utils.out_filtered_bam_filelist( + bam_list=["test"], + out_filtered_bam=self.out_junctions_test_folder.name, + suffix="_test", + ) + self.assertIsNotNone(results) + self.assertEqual(len(results), 1) + self.assertTrue(results[0].endswith("test_test.bam")) + + def test_out_filtered_bam_filelist_multiple_bam_list_suffix(self): + """Expect that function should return multiple output bams with suffix.""" + bam_list = ["test1", "test2"] + results = output_utils.out_filtered_bam_filelist( + bam_list=bam_list, + out_filtered_bam=self.out_junctions_test_folder.name, + suffix="_test", + ) + self.assertIsNotNone(results) + self.assertEqual(len(results), 2) + for i, result in enumerate(results): + self.assertTrue(result.endswith(f"{bam_list[i]}_test.bam")) + +if __name__ == "__main__": + unittest.main() diff --git a/src/eastr/run_eastr.py b/src/eastr/run_eastr.py index abaa267..bc3ea4d 100644 --- a/src/eastr/run_eastr.py +++
b/src/eastr/run_eastr.py @@ -223,8 +223,8 @@ def main(): is_bam = True #if a single bam file is provided - extension = os.path.splitext(os.path.basename(bam_list))[1] - if extension in ['.bam','.cram','.sam']: + extension = utils.get_file_extension(bam_list) + if extension in {'.bam','.cram','.sam'}: bam_list = [bam_list] else: @@ -236,8 +236,8 @@ def main(): elif bed_list: - extension = os.path.splitext(os.path.basename(bed_list))[1] - if extension in ['.bed']: + extension = utils.get_file_extension(bed_list) + if extension in {'.bed'}: bed_list = [bed_list] else: diff --git a/src/eastr/utils.py b/src/eastr/utils.py index 565ffeb..dffc390 100644 --- a/src/eastr/utils.py +++ b/src/eastr/utils.py @@ -8,10 +8,10 @@ def index_fasta(ref_fa): if not os.path.exists(f"{ref_fa}.fai"): pysam.faidx(ref_fa) -#Make a new directory -def make_dir(path): +def make_dir(path: str): + """Make a directory, if it exists just skip.""" directory = os.path.join(path) - os.makedirs(directory,exist_ok=True) + os.makedirs(directory, exist_ok=True) def get_chroms_list_from_fasta(ref_fa): @@ -22,8 +22,26 @@ def get_chroms_list_from_fasta(ref_fa): chrom_sizes[chrom] = fasta.get_reference_length(chrom) return chrom_sizes -def check_directory_or_file(path:str) -> str: - if os.path.splitext(os.path.basename(path))[1]!='': - return 'file' - else: - return 'dir' +def sanitize_name(path: str) -> str: + """Return base name from path without extension. + Example: + input: /var/tmp/file.ext + output: file + """ + base_name = os.path.basename(path) + name_without_extension, _ = os.path.splitext(base_name) + return name_without_extension + +def get_file_extension(path: str) -> str: + """Return file extension name from a file path. 
+ Example: + input: /var/tmp/file.ext + output: ext + """ + base_name = os.path.basename(path) + _, extension = os.path.splitext(base_name) + return extension + +def sanitize_and_update_extension(path, extension) -> str: + base_name = sanitize_name(path) + return f"{base_name}{extension}" diff --git a/src/eastr/utils_test.py b/src/eastr/utils_test.py new file mode 100644 index 0000000..351dc9c --- /dev/null +++ b/src/eastr/utils_test.py @@ -0,0 +1,37 @@ +"""eastr utilities test.""" + +import unittest + +from eastr import utils + + +class UtilsTest(unittest.TestCase): + + def test_sanitize_name(self): + path = "/var/tmp/file.extension" + result = utils.sanitize_name(path) + expected = "file" + self.assertEqual(result, expected) + + def test_sanitize_name_two(self): + path = ".../file.extension" + result = utils.sanitize_name(path) + expected = "file" + self.assertEqual(result, expected) + + def test_get_file_extension(self): + path = ".../file.extension" + result = utils.get_file_extension(path) + expected = ".extension" + self.assertEqual(result, expected) + + def test_sanitize_extension(self): + path = ".../file.extension" + extension = "_suffix.bam" + result = utils.sanitize_and_update_extension(path, extension) + expected = "file_suffix.bam" + self.assertEqual(result, expected) + + +if __name__ == "__main__": + unittest.main() From 503f58c52aac67805868158aa17eec12d878448e Mon Sep 17 00:00:00 2001 From: copenri Date: Sun, 9 Mar 2025 17:15:43 -0400 Subject: [PATCH 2/5] Install dependency for pysam --- .github/workflows/run_tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 76da00c..61902a3 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -20,9 +20,8 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'pip' - name: Install dependencies - run: python -m pip install --upgrade pip setuptools wheel + run: python -m pip install 
--upgrade pip setuptools wheel pysam - name: Test with pytest run: | pip install pytest pytest-cov - pip install -e . pytest src/eastr --cov=eastr From 4bfee91303285356e3e17f61b4d94a5b6444b363 Mon Sep 17 00:00:00 2001 From: copenri Date: Sun, 9 Mar 2025 17:17:03 -0400 Subject: [PATCH 3/5] Add mappy to dependency list --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 61902a3..4b987ae 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -20,7 +20,7 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'pip' - name: Install dependencies - run: python -m pip install --upgrade pip setuptools wheel pysam + run: python -m pip install --upgrade pip setuptools wheel pysam mappy - name: Test with pytest run: | pip install pytest pytest-cov From 509ea12700a4404694625c406cae6b0ce3fe0f46 Mon Sep 17 00:00:00 2001 From: copenri Date: Sun, 9 Mar 2025 17:18:14 -0400 Subject: [PATCH 4/5] Add verbose statement --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 4b987ae..9809983 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -24,4 +24,4 @@ jobs: - name: Test with pytest run: | pip install pytest pytest-cov - pytest src/eastr --cov=eastr + pytest -vv src/eastr --cov=eastr From 3c6c7f133f72f2036f43e40c0e6f682cf6e44bbe Mon Sep 17 00:00:00 2001 From: copenri Date: Sun, 9 Mar 2025 17:20:53 -0400 Subject: [PATCH 5/5] Force color --- .github/workflows/run_tests.yml | 4 +++- src/eastr/utils.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 9809983..94448db 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -10,6 +10,8 @@ jobs: strategy: matrix: 
python-version: ["3.12", "3.13"] + env: + PY_COLORS: "1" steps: - uses: actions/checkout@v4 with: @@ -21,7 +23,7 @@ jobs: cache: 'pip' - name: Install dependencies run: python -m pip install --upgrade pip setuptools wheel pysam mappy - - name: Test with pytest + - name: Run tests run: | pip install pytest pytest-cov pytest -vv src/eastr --cov=eastr diff --git a/src/eastr/utils.py b/src/eastr/utils.py index dffc390..31051a5 100644 --- a/src/eastr/utils.py +++ b/src/eastr/utils.py @@ -36,7 +36,7 @@ def get_file_extension(path: str) -> str: """Return file extension name from a file path. Example: input: /var/tmp/file.ext - output: ext + output: .ext """ base_name = os.path.basename(path) _, extension = os.path.splitext(base_name)