diff --git a/pixi.toml b/pixi.toml
index b9ec1b2..c9cb954 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -17,6 +17,7 @@ update-download-map = "python -c 'from src.fetch_articles.article_downloader imp
 download-articles = "python -m src.fetch_articles.article_downloader"
 download-data = "gdown --fuzzy https://drive.google.com/file/d/1qtQWvi0x_k5_JofgrfsgkWzlIdb6isr9/view && unzip autogkb-data.zip && rm autogkb-data.zip"
 setup-repo = "pixi install && pixi run download-data"
+copy-markdown = "python -m src.copy_markdown"
 
 [dependencies]
 seaborn = ">=0.13.2,<0.14"
diff --git a/src/copy_markdown.py b/src/copy_markdown.py
new file mode 100644
index 0000000..e432cd1
--- /dev/null
+++ b/src/copy_markdown.py
@@ -0,0 +1,56 @@
+"""
+Pass in a list of PMCIDs and have those files copied to a new folder
+"""
+
+from typing import List
+import os
+import shutil
+from loguru import logger
+from pathlib import Path
+
+
+def copy_markdown(pmcids: List[str]) -> None:
+    """Copy each article's markdown file into data/extractions/markdown.
+
+    For every PMCID, ``data/articles/<pmcid>.md`` is copied with
+    ``shutil.copy2`` to ``data/extractions/markdown/<pmcid>.md``. A file that
+    cannot be copied is logged and skipped, and a summary count is logged at
+    the end. Paths are relative to the current working directory.
+
+    Args:
+        pmcids: PMCIDs whose markdown files should be copied.
+    """
+    source_dir = Path("data") / "articles"
+    destination = Path("data") / "extractions" / "markdown"
+    successful = 0
+    try:
+        # The destination is the same for every file, so create it once.
+        os.makedirs(destination, exist_ok=True)
+    except OSError as e:
+        # Nothing can be copied without a destination; report it and fall
+        # through to the summary line.
+        logger.error(e)
+    else:
+        for pmcid in pmcids:
+            file_name = f"{pmcid}.md"
+            try:
+                shutil.copy2(source_dir / file_name, destination / file_name)
+            except (OSError, ValueError) as e:
+                # Best effort: log the failure (missing file, permissions,
+                # malformed id) and move on to the next article.
+                logger.error(e)
+            else:
+                successful += 1
+    logger.info(
+        f"Copied {successful}/{len(pmcids)} markdown to data/extractions/markdown"
+    )
+
+
+def main() -> None:
+    """Copy the markdown for a fixed set of PMCIDs."""
+    pmcids = ["PMC4737107", "PMC5712579", "PMC5728534", "PMC5749368", "PMC11730665"]
+    copy_markdown(pmcids=pmcids)
+
+
+if __name__ == "__main__":
+    main()