Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ Pipfile.lock

## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
!tests/cited.bbl
*.bcf
*.blg
*-blx.aux
Expand Down
11 changes: 11 additions & 0 deletions bibtextools/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .modernize_bib_file import modernize_bib_main
from .clean_bib_file import clean_bib_file_main
from .combine_bib_files import combine_bib_files_main
from .filter_bib_file import filter_cited_main
from .util import write_bib_database

DEFAULT_REMOVE = ["abstract", "annote",
Expand Down Expand Up @@ -60,6 +61,14 @@ def get_arg_parser():
parser_combine.add_argument("--force", action="store_true", help="Force the automatic removal of duplicate entries with the same citation (the shorter one will be removed) and skip the interactive prompt.")
parser_combine.add_argument("-o", "--output", help="Output file for the new bib entries. If not specified, it will be the input file with a 'clean-' prefix.")
parser_combine.add_argument("bib_files", nargs="+")

parser_filter = subparsers.add_parser("filter-cited",
help="Filter a bib file to keep only entries that are cited in a given .bbl file.")
parser_filter.add_argument("--bbl", dest="bbl_file", required=True,
help="Path to the .bbl file whose citations define which entries to keep.")
parser_filter.add_argument("-v", "--verbose", action="count", default=0, help="Verbosity level. -v is info and -vv is debug")
parser_filter.add_argument("-o", "--output", help="Output file for the filtered bib entries. If not specified, it will be the input file with a 'filter-cited-' prefix.")
parser_filter.add_argument("bib_file")
return parser


Expand All @@ -81,6 +90,8 @@ def main():
clean_entries = clean_bib_file_main(**args)
elif command == "combine":
clean_entries = combine_bib_files_main(**args)
elif command == "filter-cited":
clean_entries = filter_cited_main(**args)
if output is None:
if command == "combine":
_bib_file_name = args['bib_files'][0]
Expand Down
12 changes: 12 additions & 0 deletions bibtextools/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from . import __version__
from .clean_bib_file import clean_bib_file_main
from .combine_bib_files import combine_bib_files_main
from .filter_bib_file import filter_cited_main
from .modernize_bib_file import modernize_bib_main
from .util import write_bib_database

Expand Down Expand Up @@ -87,6 +88,17 @@ def combine(ctx, bib_file, allow_duplicates, replace_ids, force):
verbose=ctx.parent.params['verbose'])
return combined_entries, bib_file

@main.command("filter-cited")
@click.pass_context
@click.argument("bib_file", required=True, type=click.Path(exists=True))
@click.option("--bbl", "bbl_file", required=True,
              type=click.Path(exists=True, dir_okay=False))
def filter_cited(ctx, bib_file, bbl_file):
    # Sub-command "filter-cited": keep only the entries of `bib_file` whose
    # keys appear in `bbl_file`.  Documented with comments instead of a
    # docstring so the help text click prints for this command is unchanged.
    #
    # The verbosity is read from the parent (group-level) context parameters.
    verbosity = ctx.parent.params['verbose']
    cited_entries = filter_cited_main(
        bib_file=bib_file,
        bbl_file=bbl_file,
        verbose=verbosity,
    )
    # Returned as an (entries, source bib file) pair.
    return cited_entries, bib_file

@main.result_callback()
@click.pass_context
def save_file(ctx, result, output, verbose):
Expand Down
42 changes: 42 additions & 0 deletions bibtextools/filter_bib_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import logging
import re

from .const import KEY_ID
from .util import load_bib_file


def get_bbl_keys(bbl_file, encoding="utf-8"):
    r"""Extract citation keys from a .bbl file, auto-detecting the backend.

    Two key syntaxes are recognised:

    * ``\entry{key}`` -- reported as the ``"biblatex"`` backend.
    * ``\bibitem{key}`` or ``\bibitem[label]{key}`` -- reported as the
      ``"bibtex"`` backend.  The optional ``[label]`` may span several
      lines.

    Parameters
    ----------
    bbl_file : path-like
        Path of the .bbl file to read.
    encoding : str
        Text encoding used to open the file.

    Returns
    -------
    tuple
        ``(keys, backend)`` where ``keys`` is a set of key strings and
        ``backend`` is ``"biblatex"``, ``"bibtex"`` or ``"unknown"``.
        ``"unknown"`` is returned (with the union of all keys found) when
        both syntaxes occur in the file or when neither does.
    """
    with open(bbl_file, encoding=encoding) as handle:
        content = handle.read()

    biblatex_keys = set(re.findall(r"\\entry\{([^}]+)\}", content))
    # re.DOTALL lets the optional "[label]" part cross line breaks.  Without
    # it, a label wrapped over several lines is not matched and the key that
    # follows it is silently dropped.
    bibtex_keys = set(
        re.findall(
            r"\\bibitem(?:\[.*?\])?\{([^}]+)\}",
            content,
            flags=re.DOTALL,
        )
    )

    if biblatex_keys and not bibtex_keys:
        return biblatex_keys, "biblatex"
    if bibtex_keys and not biblatex_keys:
        return bibtex_keys, "bibtex"
    return biblatex_keys | bibtex_keys, "unknown"


def filter_cited_main(bib_file, bbl_file, verbose=logging.WARN, encoding="utf-8"):
    """Return the entries of ``bib_file`` whose IDs are cited in ``bbl_file``.

    The bib file is loaded with ``load_bib_file`` and the cited keys are
    read with ``get_bbl_keys``.  Entries whose ``KEY_ID`` value is among the
    cited keys are kept, in their original order.  Dropped (uncited) entries
    are reported at INFO level; cited keys that have no matching entry are
    reported at WARNING level.

    Parameters
    ----------
    bib_file : path-like
        Bib file to filter.
    bbl_file : path-like
        .bbl file whose keys decide which entries are kept.
    verbose : int
        Level passed to ``setLevel`` on the ``"filter_cited"`` logger.
    encoding : str
        Encoding used for reading both files.

    Returns
    -------
    list
        The kept entries.
    """
    logging.basicConfig(format="%(asctime)s - [%(levelname)8s]: %(message)s")
    log = logging.getLogger("filter_cited")
    log.setLevel(verbose)
    log.info("Filtering entries of %s using citations from %s", bib_file, bbl_file)

    database = load_bib_file(bib_file, abbr=None, encoding=encoding)
    all_entries = database.get_entry_list()

    cited_keys, backend = get_bbl_keys(bbl_file, encoding=encoding)
    log.info("Detected bbl backend: %s", backend)
    log.info("Found %d cited keys in bbl file", len(cited_keys))

    # One pass over the entries: record every ID and collect the cited ones.
    known_ids = set()
    kept = []
    for entry in all_entries:
        entry_id = entry.get(KEY_ID)
        known_ids.add(entry_id)
        if entry_id in cited_keys:
            kept.append(entry)

    uncited_ids = known_ids - cited_keys
    absent_keys = cited_keys - known_ids

    log.info("Keeping %d of %d entries", len(kept), len(all_entries))
    if uncited_ids:
        log.info("Dropping %d uncited entries: %s",
                 len(uncited_ids), ", ".join(sorted(uncited_ids)))
    if absent_keys:
        log.warning("%d cited keys are not present in the bib file: %s",
                    len(absent_keys), ", ".join(sorted(absent_keys)))
    return kept
3 changes: 3 additions & 0 deletions tests/cited.bbl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
\entry{Key123}{article}{}
\entry{Conference2015}{inproceedings}{}
\entry{NotInBib}{article}{}
12 changes: 12 additions & 0 deletions tests/test_click.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
BIB_MAIN = "old.bib"
SECOND_BIB = "unicode.bib"
CLEAN_BIB_MAIN = "clean-old.bib"
BBL_FILE = "cited.bbl"

def test_main_modern(tmpdir, bib_file=BIB_MAIN):
out_file = os.path.join(tmpdir, CLEAN_BIB_MAIN)
Expand All @@ -31,3 +32,14 @@ def test_main_combine(tmpdir, bib_file=[BIB_MAIN, SECOND_BIB]):
parser = BibTexParser(homogenize_fields=True, common_strings=True)
bib_database = bibtexparser.load(_bib_file, parser=parser)
assert len(bib_database.get_entry_list()) == 8

def test_main_filter_cited(tmpdir, bib_file=BIB_MAIN, bbl_file=BBL_FILE):
    """The click ``filter-cited`` command keeps only the cited entries.

    The expected result contains ``Key123`` and ``Conference2015`` only; the
    third key listed in the .bbl fixture (``NotInBib``) must not appear in
    the written output.
    """
    out_file = os.path.join(tmpdir, "filter-cited-old.bib")
    runner = CliRunner()
    result = runner.invoke(main, f'-o "{out_file}" filter-cited {bib_file} --bbl {bbl_file}')
    # Check the exit code before touching the output file: if the command
    # failed, the file may not exist, and a FileNotFoundError would hide the
    # real error.  The captured CLI output is attached for diagnosis.
    assert result.exit_code == 0, result.output
    with open(out_file, encoding="utf-8") as _bib_file:
        parser = BibTexParser(homogenize_fields=True, common_strings=True)
        bib_database = bibtexparser.load(_bib_file, parser=parser)
    kept_ids = {entry["ID"] for entry in bib_database.get_entry_list()}
    assert kept_ids == {"Key123", "Conference2015"}
13 changes: 13 additions & 0 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
BIB_MAIN = "old.bib"
SECOND_BIB = "unicode.bib"
CLEAN_BIB_MAIN = "clean-old.bib"
BBL_FILE = "cited.bbl"

def test_main_modern(tmpdir, bib_file=BIB_MAIN):
out_file = os.path.join(tmpdir, CLEAN_BIB_MAIN)
Expand All @@ -30,3 +31,15 @@ def test_main_combine(tmpdir, bib_file=[BIB_MAIN, SECOND_BIB]):
parser = BibTexParser(homogenize_fields=True, common_strings=True)
bib_database = bibtexparser.load(_bib_file, parser=parser)
assert len(bib_database.get_entry_list()) == 8

def test_main_filter_cited(tmpdir, bib_file=BIB_MAIN, bbl_file=BBL_FILE):
    """The argparse ``filter-cited`` command keeps only the cited entries.

    The expected result contains ``Key123`` and ``Conference2015`` only; the
    third key listed in the .bbl fixture (``NotInBib``) must not appear in
    the written output.
    """
    out_file = os.path.join(tmpdir, "filter-cited-old.bib")
    # main() reads its arguments from sys.argv.  Restore the original value
    # afterwards so the override does not leak into later tests.
    saved_argv = sys.argv
    sys.argv = [saved_argv[0], 'filter-cited', bib_file,
                '--bbl', bbl_file, '-o', out_file]
    try:
        main()
    finally:
        sys.argv = saved_argv
    with open(out_file, encoding="utf-8") as _bib_file:
        parser = BibTexParser(homogenize_fields=True, common_strings=True)
        bib_database = bibtexparser.load(_bib_file, parser=parser)
    kept_ids = {entry["ID"] for entry in bib_database.get_entry_list()}
    assert kept_ids == {"Key123", "Conference2015"}