From 6e8437172dafe4139049d4e7c8707943ea9c57ac Mon Sep 17 00:00:00 2001
From: Shlok Natarajan
Date: Mon, 14 Jul 2025 12:27:19 -0700
Subject: [PATCH 1/2] feat: copy markdown helper

---
 pixi.toml                              |  1 +
 src/components/annotations_pipeline.py |  2 +-
 src/copy_markdown.py                   | 39 ++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 src/copy_markdown.py

diff --git a/pixi.toml b/pixi.toml
index b9ec1b2..c9cb954 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -17,6 +17,7 @@ update-download-map = "python -c 'from src.fetch_articles.article_downloader imp
 download-articles = "python -m src.fetch_articles.article_downloader"
 download-data = "gdown --fuzzy https://drive.google.com/file/d/1qtQWvi0x_k5_JofgrfsgkWzlIdb6isr9/view && unzip autogkb-data.zip && rm autogkb-data.zip"
 setup-repo = "pixi install && pixi run download-data"
+copy-markdown = "python -m src.copy_markdown"
 
 [dependencies]
 seaborn = ">=0.13.2,<0.14"
diff --git a/src/components/annotations_pipeline.py b/src/components/annotations_pipeline.py
index 8e6ba13..1111929 100644
--- a/src/components/annotations_pipeline.py
+++ b/src/components/annotations_pipeline.py
@@ -40,7 +40,7 @@ def generate_final_structure(self):
             "functional_annotations": self.functional_annotations,
         }
 
-    def run(self, save_path: str = "data/extractions"):
+    def run(self, save_path: str = "data/annotations"):
         logger.info("Getting Study Parameters")
         self.study_parameters = get_study_parameters(self.article_text)
 
diff --git a/src/copy_markdown.py b/src/copy_markdown.py
new file mode 100644
index 0000000..cc9aa6e
--- /dev/null
+++ b/src/copy_markdown.py
@@ -0,0 +1,39 @@
+from typing import List
+import os
+import shutil
+from loguru import logger
+from pathlib import Path
+"""
+Pass in a list of PMCIDs and have those files copied to a new folder
+"""
+
+def copy_markdown(pmcids: List[str]) -> None:
+    succesful = 0
+    for pmcid in pmcids:
+        try:
+            source_file = Path("data") / "articles" / f"{pmcid}.md"
+            destination = Path("data") / "extractions" / "markdown"
+            os.makedirs(destination, exist_ok=True)
+            shutil.copy2(source_file, destination / f"{pmcid}.md")
+            succesful += 1
+        except Exception as e:
+            logger.error(e)
+    logger.info(f"Copied {succesful}/{len(pmcids)} markdown to data/extractions/markdown")
+
+def main():
+    pmcids = [
+        "PMC4737107",
+        "PMC5712579",
+        "PMC5728534",
+        "PMC5749368",
+        "PMC11730665"
+    ]
+    copy_markdown(pmcids=pmcids)
+
+if __name__ == "__main__":
+    main()
+
+
+
+
+

From a7a4c73c511db92e36d55a7266be5d0a4f517a22 Mon Sep 17 00:00:00 2001
From: Shlok Natarajan
Date: Tue, 15 Jul 2025 10:48:44 -0700
Subject: [PATCH 2/2] chore: black formatting

---
Note (review): the source_file/destination -/+ pair in the hunk below is a
whitespace-only change. The exact pre-format spacing is not preserved in this
copy, so both sides read alike -- confirm against the original commit.

 src/copy_markdown.py | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/src/copy_markdown.py b/src/copy_markdown.py
index cc9aa6e..e432cd1 100644
--- a/src/copy_markdown.py
+++ b/src/copy_markdown.py
@@ -3,37 +3,32 @@
 import shutil
 from loguru import logger
 from pathlib import Path
+
 """
 Pass in a list of PMCIDs and have those files copied to a new folder
 """
 
+
 def copy_markdown(pmcids: List[str]) -> None:
     succesful = 0
     for pmcid in pmcids:
         try:
-            source_file = Path("data") / "articles" / f"{pmcid}.md"
-            destination = Path("data") / "extractions" / "markdown"
+            source_file = Path("data") / "articles" / f"{pmcid}.md"
+            destination = Path("data") / "extractions" / "markdown"
             os.makedirs(destination, exist_ok=True)
             shutil.copy2(source_file, destination / f"{pmcid}.md")
             succesful += 1
         except Exception as e:
             logger.error(e)
-    logger.info(f"Copied {succesful}/{len(pmcids)} markdown to data/extractions/markdown")
+    logger.info(
+        f"Copied {succesful}/{len(pmcids)} markdown to data/extractions/markdown"
+    )
 
+
 def main():
-    pmcids = [
-        "PMC4737107",
-        "PMC5712579",
-        "PMC5728534",
-        "PMC5749368",
-        "PMC11730665"
-    ]
+    pmcids = ["PMC4737107", "PMC5712579", "PMC5728534", "PMC5749368", "PMC11730665"]
     copy_markdown(pmcids=pmcids)
 
+
 if __name__ == "__main__":
     main()
-
-
-
-
-