From d93525524e4939e3093f52a280f5f14ab8897e14 Mon Sep 17 00:00:00 2001
From: Adrian Edin
Date: Mon, 13 Apr 2026 09:14:57 +0200
Subject: [PATCH] Add filter-cited subcommand to keep only entries cited in a .bbl file.

Parses citation keys from a biblatex or bibtex .bbl file (auto-detected)
and filters the input .bib so only the cited entries are kept, warning on
cited keys that are missing from the bib. Wired into both the argparse and
click CLIs, with matching tests and a tests/cited.bbl fixture un-ignored
via .gitignore.
---
 .gitignore                     |  1 +
 bibtextools/__main__.py        | 11 ++++++++
 bibtextools/cli.py             | 12 ++++++++
 bibtextools/filter_bib_file.py | 62 ++++++++++++++++++++++++++++++++++++++++++
 tests/cited.bbl                |  3 ++
 tests/test_click.py            | 12 ++++++++
 tests/test_main.py             | 13 +++++++++
 7 files changed, 114 insertions(+)
 create mode 100644 bibtextools/filter_bib_file.py
 create mode 100644 tests/cited.bbl

diff --git a/.gitignore b/.gitignore
index a7d460d..c66234c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -95,6 +95,7 @@ Pipfile.lock
 
 ## Bibliography auxiliary files (bibtex/biblatex/biber):
 *.bbl
+!tests/cited.bbl
 *.bcf
 *.blg
 *-blx.aux
diff --git a/bibtextools/__main__.py b/bibtextools/__main__.py
index ff79e63..e556044 100644
--- a/bibtextools/__main__.py
+++ b/bibtextools/__main__.py
@@ -5,6 +5,7 @@
 from .modernize_bib_file import modernize_bib_main
 from .clean_bib_file import clean_bib_file_main
 from .combine_bib_files import combine_bib_files_main
+from .filter_bib_file import filter_cited_main
 from .util import write_bib_database
 
 DEFAULT_REMOVE = ["abstract", "annote",
@@ -60,6 +61,14 @@ def get_arg_parser():
     parser_combine.add_argument("--force", action="store_true", help="Force the automatic removal of duplicate entries with the same citation (the shorter one will be removed) and skip the interactive prompt.")
     parser_combine.add_argument("-o", "--output", help="Output file for the new bib entries. If not specified, it will be the input file with a 'clean-' prefix.")
     parser_combine.add_argument("bib_files", nargs="+")
+
+    parser_filter = subparsers.add_parser("filter-cited",
+                                          help="Filter a bib file to keep only entries that are cited in a given .bbl file.")
+    parser_filter.add_argument("--bbl", dest="bbl_file", required=True,
+                               help="Path to the .bbl file whose citations define which entries to keep.")
+    parser_filter.add_argument("-v", "--verbose", action="count", default=0, help="Verbosity level. -v is info and -vv is debug")
+    parser_filter.add_argument("-o", "--output", help="Output file for the filtered bib entries. If not specified, it will be the input file with a 'filter-cited-' prefix.")
+    parser_filter.add_argument("bib_file")
     return parser
 
 
@@ -81,6 +90,8 @@ def main():
         clean_entries = clean_bib_file_main(**args)
     elif command == "combine":
         clean_entries = combine_bib_files_main(**args)
+    elif command == "filter-cited":
+        clean_entries = filter_cited_main(**args)
     if output is None:
         if command == "combine":
             _bib_file_name = args['bib_files'][0]
diff --git a/bibtextools/cli.py b/bibtextools/cli.py
index 80259ff..2602b9b 100644
--- a/bibtextools/cli.py
+++ b/bibtextools/cli.py
@@ -6,6 +6,7 @@
 from . import __version__
 from .clean_bib_file import clean_bib_file_main
 from .combine_bib_files import combine_bib_files_main
+from .filter_bib_file import filter_cited_main
 from .modernize_bib_file import modernize_bib_main
 from .util import write_bib_database
 
@@ -87,6 +88,17 @@ def combine(ctx, bib_file, allow_duplicates, replace_ids, force):
                                               verbose=ctx.parent.params['verbose'])
     return combined_entries, bib_file
 
+@main.command("filter-cited")
+@click.pass_context
+@click.argument("bib_file", required=True, type=click.Path(exists=True))
+@click.option("--bbl", "bbl_file", required=True,
+              type=click.Path(exists=True, dir_okay=False))
+def filter_cited(ctx, bib_file, bbl_file):
+    kept_entries = filter_cited_main(bib_file=bib_file,
+                                     bbl_file=bbl_file,
+                                     verbose=ctx.parent.params['verbose'])
+    return kept_entries, bib_file
+
 @main.result_callback()
 @click.pass_context
 def save_file(ctx, result, output, verbose):
diff --git a/bibtextools/filter_bib_file.py b/bibtextools/filter_bib_file.py
new file mode 100644
index 0000000..492fb88
--- /dev/null
+++ b/bibtextools/filter_bib_file.py
@@ -0,0 +1,62 @@
+import logging
+import re
+
+from .const import KEY_ID
+from .util import load_bib_file
+
+# biblatex writes one ``\entry{key}{type}{options}`` item per cited entry.
+_BIBLATEX_ENTRY = re.compile(r"\\entry\{([^}]+)\}")
+# bibtex writes ``\bibitem[label]{key}``; the optional label may be wrapped
+# over several lines (e.g. with natbib styles), so "." must match newlines.
+_BIBTEX_ITEM = re.compile(r"\\bibitem\s*(?:\[.*?\]\s*)?\{([^}]+)\}", re.DOTALL)
+
+
+def get_bbl_keys(bbl_file, encoding="utf-8"):
+    """Extract citation keys from a .bbl file, auto-detecting the backend.
+
+    Returns a tuple ``(keys, backend)`` where ``keys`` is the set of citation
+    keys found and ``backend`` is ``"biblatex"`` or ``"bibtex"`` when exactly
+    one flavour matched, and ``"unknown"`` when both or neither matched.
+    """
+    with open(bbl_file, encoding=encoding) as _bbl_file:
+        content = _bbl_file.read()
+    biblatex_keys = set(_BIBLATEX_ENTRY.findall(content))
+    bibtex_keys = set(_BIBTEX_ITEM.findall(content))
+    if biblatex_keys and not bibtex_keys:
+        return biblatex_keys, "biblatex"
+    if bibtex_keys and not biblatex_keys:
+        return bibtex_keys, "bibtex"
+    return biblatex_keys | bibtex_keys, "unknown"
+
+
+def filter_cited_main(bib_file, bbl_file, verbose=logging.WARN, encoding="utf-8"):
+    """Return the entries of ``bib_file`` that are cited in ``bbl_file``.
+
+    Entries keep their order from the bib file. Cited keys that are missing
+    from the bib file are reported with a warning; an empty key set is also
+    reported, because it results in an empty output.
+    """
+    logging.basicConfig(format="%(asctime)s - [%(levelname)8s]: %(message)s")
+    logger = logging.getLogger("filter_cited")
+    logger.setLevel(verbose)
+    logger.info("Filtering entries of %s using citations from %s", bib_file, bbl_file)
+    bib_database = load_bib_file(bib_file, abbr=None, encoding=encoding)
+    entries = bib_database.get_entry_list()
+    cited_keys, backend = get_bbl_keys(bbl_file, encoding=encoding)
+    logger.info("Detected bbl backend: %s", backend)
+    logger.info("Found %d cited keys in bbl file", len(cited_keys))
+    if not cited_keys:
+        logger.warning("No citation keys found in %s; every entry will be dropped",
+                       bbl_file)
+    bib_ids = {entry.get(KEY_ID) for entry in entries}
+    kept = [entry for entry in entries if entry.get(KEY_ID) in cited_keys]
+    unused = bib_ids - cited_keys
+    missing = cited_keys - bib_ids
+    logger.info("Keeping %d of %d entries", len(kept), len(entries))
+    if unused:
+        logger.info("Dropping %d uncited entries: %s",
+                    len(unused), ", ".join(sorted(unused)))
+    if missing:
+        logger.warning("%d cited keys are not present in the bib file: %s",
+                       len(missing), ", ".join(sorted(missing)))
+    return kept
diff --git a/tests/cited.bbl b/tests/cited.bbl
new file mode 100644
index 0000000..d069f11
--- /dev/null
+++ b/tests/cited.bbl
@@ -0,0 +1,3 @@
+\entry{Key123}{article}{}
+\entry{Conference2015}{inproceedings}{}
+\entry{NotInBib}{article}{}
diff --git a/tests/test_click.py b/tests/test_click.py
index cd7c4a4..512d526 100644
--- a/tests/test_click.py
+++ b/tests/test_click.py
@@ -13,6 +13,7 @@
 BIB_MAIN = "old.bib"
 SECOND_BIB = "unicode.bib"
 CLEAN_BIB_MAIN = "clean-old.bib"
+BBL_FILE = "cited.bbl"
 
 def test_main_modern(tmpdir, bib_file=BIB_MAIN):
     out_file = os.path.join(tmpdir, CLEAN_BIB_MAIN)
@@ -31,3 +32,14 @@ def test_main_combine(tmpdir, bib_file=[BIB_MAIN, SECOND_BIB]):
         parser = BibTexParser(homogenize_fields=True, common_strings=True)
         bib_database = bibtexparser.load(_bib_file, parser=parser)
     assert len(bib_database.get_entry_list()) == 8
+
+def test_main_filter_cited(tmpdir, bib_file=BIB_MAIN, bbl_file=BBL_FILE):
+    out_file = os.path.join(tmpdir, "filter-cited-old.bib")
+    runner = CliRunner()
+    result = runner.invoke(main, f'-o "{out_file}" filter-cited {bib_file} --bbl {bbl_file}')
+    with open(out_file, encoding="utf-8") as _bib_file:
+        parser = BibTexParser(homogenize_fields=True, common_strings=True)
+        bib_database = bibtexparser.load(_bib_file, parser=parser)
+    entries = bib_database.get_entry_list()
+    kept_ids = {entry["ID"] for entry in entries}
+    assert kept_ids == {"Key123", "Conference2015"} and result.exit_code == 0
diff --git a/tests/test_main.py b/tests/test_main.py
index 136b39d..edb429c 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -12,6 +12,7 @@
 BIB_MAIN = "old.bib"
 SECOND_BIB = "unicode.bib"
 CLEAN_BIB_MAIN = "clean-old.bib"
+BBL_FILE = "cited.bbl"
 
 def test_main_modern(tmpdir, bib_file=BIB_MAIN):
     out_file = os.path.join(tmpdir, CLEAN_BIB_MAIN)
@@ -30,3 +31,15 @@ def test_main_combine(tmpdir, bib_file=[BIB_MAIN, SECOND_BIB]):
         parser = BibTexParser(homogenize_fields=True, common_strings=True)
         bib_database = bibtexparser.load(_bib_file, parser=parser)
     assert len(bib_database.get_entry_list()) == 8
+
+def test_main_filter_cited(tmpdir, bib_file=BIB_MAIN, bbl_file=BBL_FILE):
+    out_file = os.path.join(tmpdir, "filter-cited-old.bib")
+    sys.argv = [sys.argv[0], 'filter-cited', bib_file,
+                '--bbl', bbl_file, '-o', '{}'.format(out_file)]
+    main()
+    with open(out_file, encoding="utf-8") as _bib_file:
+        parser = BibTexParser(homogenize_fields=True, common_strings=True)
+        bib_database = bibtexparser.load(_bib_file, parser=parser)
+    entries = bib_database.get_entry_list()
+    kept_ids = {entry["ID"] for entry in entries}
+    assert kept_ids == {"Key123", "Conference2015"}