A Rust library for parsing and deduplicating academic citations.
[dependencies]
biblib = "0.4"

For minimal builds:
[dependencies]
biblib = { version = "0.4", default-features = false, features = ["ris", "regex"] }

| Format | Feature | Description |
|---|---|---|
| RIS | `ris` | Research Information Systems format |
| PubMed | `pubmed` | MEDLINE/PubMed .nbib files |
| EndNote XML | `xml` | EndNote XML export format |
| CSV | `csv` | Configurable delimited files |

All format features are enabled by default.
use biblib::{CitationParser, RisParser};
let ris_content = r#"TY - JOUR
TI - Machine Learning in Healthcare
AU - Smith, John
AU - Doe, Jane
PY - 2023
ER -"#;
let parser = RisParser::new();
let citations = parser.parse(ris_content).unwrap();
println!("Title: {}", citations[0].title);
println!("Authors: {:?}", citations[0].authors);

use biblib::detect_and_parse;
let content = "TY - JOUR\nTI - Example\nER -";
let (citations, format) = detect_and_parse(content).unwrap();
println!("Detected format: {}", format); // "RIS"

use biblib::dedupe::{Deduplicator, DeduplicatorConfig};
let config = DeduplicatorConfig {
group_by_year: true, // Group by year for performance
run_in_parallel: true, // Use parallel processing
source_preferences: vec!["PubMed".to_string()], // Prefer PubMed records
};
let deduplicator = Deduplicator::new().with_config(config);
let groups = deduplicator.find_duplicates(&citations).unwrap();
for group in groups {
if !group.duplicates.is_empty() {
println!("Kept: {}", group.unique.title);
println!("Duplicates: {}", group.duplicates.len());
}
}

use biblib::csv::{CsvParser, CsvConfig};
use biblib::CitationParser;
let mut config = CsvConfig::new();
config
.set_delimiter(b';')
.set_header_mapping("title", vec!["Article Name".to_string()])
.set_header_mapping("authors", vec!["Writers".to_string()]);
let parser = CsvParser::with_config(config);
let citations = parser.parse("Article Name;Writers\nMy Paper;Smith J").unwrap();

Each parsed citation contains:

| Field | Type | Description |
|---|---|---|
| `title` | `String` | Work title |
| `authors` | `Vec<Author>` | Authors with name, given name, affiliations |
| `journal` | `Option<String>` | Full journal name |
| `journal_abbr` | `Option<String>` | Journal abbreviation |
| `date` | `Option<Date>` | Year, month, day |
| `volume` | `Option<String>` | Volume number |
| `issue` | `Option<String>` | Issue number |
| `pages` | `Option<String>` | Page range |
| `doi` | `Option<String>` | Digital Object Identifier |
| `pmid` | `Option<String>` | PubMed ID |
| `pmc_id` | `Option<String>` | PubMed Central ID |
| `issn` | `Vec<String>` | ISSNs |
| `abstract_text` | `Option<String>` | Abstract |
| `keywords` | `Vec<String>` | Keywords |
| `urls` | `Vec<String>` | Related URLs |
| `mesh_terms` | `Vec<String>` | MeSH terms (PubMed) |
| `extra_fields` | `HashMap` | Additional format-specific fields |

| Feature | Dependencies | Description |
|---|---|---|
| `ris` | - | RIS format parser |
| `pubmed` | - | PubMed/MEDLINE parser |
| `xml` | `quick-xml` | EndNote XML parser |
| `csv` | `csv` | CSV parser |
| `dedupe` | `rayon`, `strsim` | Deduplication engine |
| `regex` | `regex` | Full regex support |
| `lite` | `regex-lite` | Lightweight regex (smaller binary) |
| `diagnostics` | `ariadne` | Pretty, coloured error output with source context |

Default: all features enabled except lite and diagnostics.
Note: At least one of `regex` or `lite` must always be enabled — the crate will not compile without one of them. They are mutually exclusive; do not enable both.
All parse errors carry a 1-based line number and, where available, a byte-offset span pointing to the problematic citation record:
use biblib::{CitationParser, RisParser, ValueError};
match RisParser::new().parse(input) {
Ok(citations) => println!("Parsed {} citations", citations.len()),
Err(e) => {
eprintln!("Parse error: {}", e); // includes "at line N" when known
if let ValueError::MissingValue { key, .. } = &e.error {
eprintln!("Missing required field: {}", key);
}
}
}

Enable the `diagnostics` feature for human-friendly, coloured output powered by ariadne:
[dependencies]
biblib = { version = "0.4", features = ["diagnostics"] }

use biblib::{RisParser, parse_with_diagnostics};
let source = std::fs::read_to_string("citations.ris")?;
match parse_with_diagnostics(&RisParser::new(), &source, "citations.ris") {
Ok(citations) => println!("Parsed {} citations", citations.len()),
Err(diagnostic) => eprintln!("{}", diagnostic),
// Error: Error in RIS format at line 5: Missing value for TI
// ╭─[citations.ris:5:1]
// 5 │ TY - JOUR
// │ ──────────── Missing value for TI
// ╰───
}

You can also call `error.to_diagnostic(filename, source)` directly on any `ParseError`.
- Parsing Guide — Format-specific tag mappings, date formats, and author handling
- Deduplication Guide — Matching algorithm, similarity thresholds, and configuration
- API Docs — Complete API reference
Licensed under either of Apache License 2.0 or MIT at your option.