diff --git a/.gitignore b/.gitignore index a7d460d..c66234c 100644 --- a/.gitignore +++ b/.gitignore @@ -95,6 +95,7 @@ Pipfile.lock ## Bibliography auxiliary files (bibtex/biblatex/biber): *.bbl +!tests/cited.bbl *.bcf *.blg *-blx.aux diff --git a/bibtextools/__main__.py b/bibtextools/__main__.py index ff79e63..e556044 100644 --- a/bibtextools/__main__.py +++ b/bibtextools/__main__.py @@ -5,6 +5,7 @@ from .modernize_bib_file import modernize_bib_main from .clean_bib_file import clean_bib_file_main from .combine_bib_files import combine_bib_files_main +from .filter_bib_file import filter_cited_main from .util import write_bib_database DEFAULT_REMOVE = ["abstract", "annote", @@ -60,6 +61,14 @@ def get_arg_parser(): parser_combine.add_argument("--force", action="store_true", help="Force the automatic removal of duplicate entries with the same citation (the shorter one will be removed) and skip the interactive prompt.") parser_combine.add_argument("-o", "--output", help="Output file for the new bib entries. If not specified, it will be the input file with a 'clean-' prefix.") parser_combine.add_argument("bib_files", nargs="+") + + parser_filter = subparsers.add_parser("filter-cited", + help="Filter a bib file to keep only entries that are cited in a given .bbl file.") + parser_filter.add_argument("--bbl", dest="bbl_file", required=True, + help="Path to the .bbl file whose citations define which entries to keep.") + parser_filter.add_argument("-v", "--verbose", action="count", default=0, help="Verbosity level. -v is info and -vv is debug") + parser_filter.add_argument("-o", "--output", help="Output file for the filtered bib entries. 
# biblatex/biber .bbl files introduce every reference with
# ``\entry{<key>}{<type>}{<options>}``; the first brace group is the key.
_BIBLATEX_ENTRY_RE = re.compile(r"\\entry\{([^}]+)\}")

# Classic BibTeX .bbl files use ``\bibitem[<label>]{<key>}`` where the label
# is optional. DOTALL is required because the label may be wrapped over
# several lines (e.g. long author lists); without it such items would not
# match at all and their keys would be silently lost.
_BIBTEX_ITEM_RE = re.compile(r"\\bibitem(?:\[.*?\])?\{([^}]+)\}", re.DOTALL)


def get_bbl_keys(bbl_file, encoding="utf-8"):
    """Extract citation keys from a .bbl file, auto-detecting the backend.

    Parameters
    ----------
    bbl_file : str or path-like
        Path to the .bbl file to scan.
    encoding : str
        Text encoding used to read the file.

    Returns
    -------
    tuple of (set of str, str)
        The citation keys found and the detected backend. The backend is
        ``"biblatex"`` when only ``\\entry{...}`` records are present,
        ``"bibtex"`` when only ``\\bibitem{...}`` records are present, and
        ``"unknown"`` when both kinds or neither kind were found (in which
        case the union of all keys is returned).
    """
    with open(bbl_file, encoding=encoding) as _bbl_file:
        content = _bbl_file.read()
    biblatex_keys = set(_BIBLATEX_ENTRY_RE.findall(content))
    bibtex_keys = set(_BIBTEX_ITEM_RE.findall(content))
    if biblatex_keys and not bibtex_keys:
        return biblatex_keys, "biblatex"
    if bibtex_keys and not biblatex_keys:
        return bibtex_keys, "bibtex"
    # Mixed or empty result: report everything that was found and let the
    # caller decide what to do with an undetermined backend.
    return biblatex_keys | bibtex_keys, "unknown"


def filter_cited_main(bib_file, bbl_file, verbose=logging.WARN, encoding="utf-8"):
    """Keep only the entries of a bib file that are cited in a .bbl file.

    Parameters
    ----------
    bib_file : str or path-like
        Bibliography file that is loaded via ``load_bib_file``.
    bbl_file : str or path-like
        .bbl file whose citation keys decide which entries are kept.
    verbose : int
        Level that is passed to ``setLevel`` of the ``filter_cited`` logger.
    encoding : str
        Text encoding used for reading both files.

    Returns
    -------
    list
        The entries of ``bib_file`` whose ID occurs in ``bbl_file``, in the
        order in which the loaded database lists them.
    """
    logging.basicConfig(format="%(asctime)s - [%(levelname)8s]: %(message)s")
    logger = logging.getLogger("filter_cited")
    logger.setLevel(verbose)
    logger.info("Filtering entries of %s using citations from %s", bib_file, bbl_file)

    bib_database = load_bib_file(bib_file, abbr=None, encoding=encoding)
    entries = bib_database.get_entry_list()
    cited_keys, backend = get_bbl_keys(bbl_file, encoding=encoding)
    logger.info("Detected bbl backend: %s", backend)
    logger.info("Found %d cited keys in bbl file", len(cited_keys))

    bib_ids = {entry.get(KEY_ID) for entry in entries}
    kept = [entry for entry in entries if entry.get(KEY_ID) in cited_keys]
    unused = bib_ids - cited_keys  # present in the bib file but never cited
    missing = cited_keys - bib_ids  # cited but absent from the bib file

    logger.info("Keeping %d of %d entries", len(kept), len(entries))
    if unused:
        logger.info("Dropping %d uncited entries: %s",
                    len(unused), ", ".join(sorted(unused)))
    if missing:
        # A warning (not info): the document cites something this bib file
        # cannot provide, which the user most likely wants to know about.
        logger.warning("%d cited keys are not present in the bib file: %s",
                       len(missing), ", ".join(sorted(missing)))
    return kept
def test_main_filter_cited(tmpdir, bib_file=BIB_MAIN, bbl_file=BBL_FILE):
    """The click ``filter-cited`` command keeps only the entries cited in the .bbl file."""
    out_file = os.path.join(tmpdir, "filter-cited-old.bib")
    runner = CliRunner()
    # Pass the arguments as a list so that no shell-style splitting/quoting of
    # the (temporary) paths is involved.
    result = runner.invoke(
        main,
        ["-o", out_file, "filter-cited", bib_file, "--bbl", bbl_file],
    )
    # Check the exit code first: if the command failed, the output file may not
    # exist and opening it would hide the real error behind a FileNotFoundError.
    assert result.exit_code == 0, result.output
    with open(out_file, encoding="utf-8") as _bib_file:
        parser = BibTexParser(homogenize_fields=True, common_strings=True)
        bib_database = bibtexparser.load(_bib_file, parser=parser)
    entries = bib_database.get_entry_list()
    kept_ids = {entry["ID"] for entry in entries}
    # "NotInBib" is listed in the .bbl file but absent from the bib file, so it
    # must not appear in the filtered output.
    assert kept_ids == {"Key123", "Conference2015"}
= bib_database.get_entry_list() + kept_ids = {entry["ID"] for entry in entries} + assert kept_ids == {"Key123", "Conference2015"}