@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import ast
+import contextlib
 import csv
 import logging
 import os
@@ -30,11 +31,13 @@
 )
 
 from paperqa.utils import (
+    compute_unique_doc_id,
     create_bibtex_key,
     encode_id,
     format_bibtex,
     get_citation_ids,
     maybe_get_date,
+    md5sum,
 )
 from paperqa.version import __version__ as pqa_version
 
@@ -53,7 +56,10 @@
     "docname",
     "dockey",
     "citation",
+    "content_hash",  # Metadata providers won't give this
 }
+# Sentinel to autopopulate a field within model_validator
+AUTOPOPULATE_VALUE = ""  # NOTE: this is falsy by design
 
 
 class Doc(Embeddable):
@@ -62,6 +68,13 @@ class Doc(Embeddable):
     docname: str
     dockey: DocKey
     citation: str
+    content_hash: str | None = Field(
+        default=AUTOPOPULATE_VALUE,
+        description=(
+            "Optional hash of the document's contents (to reiterate, not a file path"
+            " to the document, but a hash of the document's contents themselves)."
+        ),
+    )
     # Sort the serialization to minimize the diff of serialized objects
     fields_to_overwrite_from_metadata: Annotated[set[str], PlainSerializer(sorted)] = (
         Field(
@@ -160,10 +173,6 @@ def __hash__(self) -> int:
         return hash((self.name, self.text))
 
 
-# Sentinel to autopopulate a field within model_validator
-AUTOPOPULATE_VALUE = ""  # NOTE: this is falsy by design
-
-
 class Context(BaseModel):
     """A class to hold the context of a question."""
 
@@ -570,8 +579,8 @@ class DocDetails(Doc):
     doc_id: str | None = Field(
         default=None,
         description=(
-            "Unique ID for this document. Simple ways to acquire one include"
-            " hashing the DOI or stringifying a UUID."
+            "Unique ID for this document. A simple and robust way to acquire one is"
+            " hashing the paper's content hash concatenated with the lowercased DOI."
         ),
     )
     file_location: str | os.PathLike | None = Field(
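
For context, a minimal sketch of how compute_unique_doc_id might be implemented, assuming it simply reuses encode_id (already imported in this module) on the lowercased DOI concatenated with the content hash, with a UUID fallback when neither is available. The real helper lives in paperqa.utils and may differ in ordering and fallback behavior.

from uuid import uuid4

from paperqa.utils import encode_id


def compute_unique_doc_id(doi: str | None, content_hash: str | None) -> str:
    """Sketch: hash the lowercased DOI concatenated with the content hash."""
    if not doi and not content_hash:
        # Nothing stable to hash, so fall back to a random UUID (assumption)
        return encode_id(uuid4())
    return encode_id(f"{(doi or '').lower()}{content_hash or ''}")
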
@@ -630,9 +639,9 @@ def lowercase_doi_and_populate_doc_id(cls, data: dict[str, Any]) -> dict[str, Any]:
                     doi = doi.replace(url_prefix_to_remove, "")
             data["doi"] = doi.lower()
             if not data.get("doc_id"):  # keep user defined doc_ids
-                data["doc_id"] = encode_id(doi.lower())
+                data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))
         elif not data.get("doc_id"):  # keep user defined doc_ids
-            data["doc_id"] = encode_id(uuid4())
+            data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))
 
         if "dockey" in data.get(
             "fields_to_overwrite_from_metadata",
@@ -838,6 +847,17 @@ def populate_bibtex_key_citation(cls, data: dict[str, Any]) -> dict[str, Any]:
838847 data ["citation" ] = data .get ("title" ) or CITATION_FALLBACK_DATA ["title" ]
839848 return data
840849
850+ @classmethod
851+ def populate_content_hash (cls , data : dict [str , Any ]) -> dict [str , Any ]:
852+ if ( # Check for missing or autopopulate value, but preserve `None`
853+ data .get ("content_hash" , AUTOPOPULATE_VALUE ) == AUTOPOPULATE_VALUE
854+ ):
855+ data ["content_hash" ] = None # Assume we don't have it
856+ if data .get ("file_location" ): # Try to update it
857+ with contextlib .suppress (FileNotFoundError ):
858+ data ["content_hash" ] = md5sum (data ["file_location" ])
859+ return data
860+
841861 @model_validator (mode = "before" )
842862 @classmethod
843863 def validate_all_fields (cls , data : Mapping [str , Any ]) -> dict [str , Any ]:
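
A brief illustration of the sentinel semantics above, calling populate_content_hash directly (the file path is hypothetical): a content_hash that is missing or left at the empty-string AUTOPOPULATE_VALUE is filled in from file_location when possible, while an explicit None is treated as an opt-out and preserved.

data = DocDetails.populate_content_hash({"file_location": "paper.pdf"})
# If paper.pdf exists, data["content_hash"] is now md5sum("paper.pdf"); otherwise None

data = DocDetails.populate_content_hash(
    {"file_location": "paper.pdf", "content_hash": None}
)
assert data["content_hash"] is None  # explicit None is never overwritten
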
@@ -857,6 +877,7 @@ def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
                 data[possibly_str_field], str
             ):
                 data[possibly_str_field] = ast.literal_eval(data[possibly_str_field])
+        data = cls.populate_content_hash(data)
         data = cls.lowercase_doi_and_populate_doc_id(data)
         data = cls.remove_invalid_authors(data)
         data = cls.misc_string_cleaning(data)
@@ -1017,6 +1038,14 @@ def __add__(self, other: DocDetails | int) -> DocDetails: # noqa: PLR0912
                 )
             else:
                 merged_data[field] = max(self_value, other_value)
+            elif field == "content_hash" and (  # noqa: PLR0916
+                # Hashes are both present but differ
+                (self_value and other_value and self_value != other_value)
+                # One hash is explicitly disabled (not autopopulated)
+                or (self_value is None or other_value is None)
+            ):
+                # We don't know which to pick, so just discard the value
+                merged_data[field] = None
 
             else:
                 # Prefer non-null values, default preference for 'other' object.
@@ -1031,10 +1060,13 @@ def __add__(self, other: DocDetails | int) -> DocDetails: # noqa: PLR0912
                     else self_value
                 )
 
-        # Recalculate doc_id if doi has changed
-        if merged_data["doi"] != self.doi:
-            merged_data["doc_id"] = (
-                encode_id(merged_data["doi"].lower()) if merged_data["doi"] else None
+        if (
+            merged_data["doi"] != self.doi
+            or merged_data["content_hash"] != self.content_hash
+        ):
+            # Recalculate doc_id if doi or content hash has changed
+            merged_data["doc_id"] = compute_unique_doc_id(
+                merged_data["doi"], merged_data.get("content_hash")
             )
 
         # Create and return new DocDetails instance
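
Finally, a hedged sketch of the merge semantics introduced above; it assumes a DocDetails can be built from just a DOI and a content hash (in practice more fields may be required). Adding two records whose hashes conflict discards the hash, and doc_id is then recomputed from the DOI alone.

a = DocDetails(doi="10.1000/example", content_hash="d41d8cd98f00b204e9800998ecf8427e")
b = DocDetails(doi="10.1000/example", content_hash="9e107d9d372bb6826bd81d3542a419d6")
merged = a + b
assert merged.content_hash is None  # conflicting hashes are discarded
assert merged.doc_id == compute_unique_doc_id(merged.doi, None)  # recomputed sans hash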