From a0035070a96ed2876ddde9a2bd855f2eaac0ca08 Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Mon, 12 May 2025 13:26:18 -0700 Subject: [PATCH 01/18] added subject correction --- ames/matchers/__init__.py | 4 + ames/matchers/caltechdata.py | 596 ++++++++++++++++++++--------------- run_subject_id_correction.py | 29 ++ 3 files changed, 377 insertions(+), 252 deletions(-) create mode 100644 run_subject_id_correction.py diff --git a/ames/matchers/__init__.py b/ames/matchers/__init__.py index a5923f56..07911bba 100644 --- a/ames/matchers/__init__.py +++ b/ames/matchers/__init__.py @@ -2,6 +2,7 @@ from .caltechdata import match_codemeta from .caltechdata import add_thesis_doi from .caltechdata import add_usage +from .caltechdata import edit_subject from .datacite import update_datacite_metadata from .datacite import update_datacite_media from .datacite import submit_report @@ -24,3 +25,6 @@ from .caltechauthors import save_metadata_to_file from .caltechauthors import add_related_identifiers_from_csv from .caltechauthors import add_authors_affiliations + + + diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py index 845ee502..b866fc8c 100644 --- a/ames/matchers/caltechdata.py +++ b/ames/matchers/caltechdata.py @@ -1,5 +1,5 @@ import os, json -from caltechdata_api import caltechdata_edit +from caltechdata_api import caltechdata_edit, get_metadata from ames import codemeta_to_datacite from ames.harvesters import get_records from progressbar import progressbar @@ -11,264 +11,356 @@ import requests + + +def edit_subject(record, token, correction_subjects, test=True): + + + if test: + rurl = "https://data.caltechlibrary.dev/api/records/" + record + else: + rurl = "https://data.caltechlibrary.dev/api/records/" + record + + + headers = { + "Authorization": "Bearer %s" % token, + "Content-type": "application/json", + } + + + data = requests.get(rurl, headers=headers).json() + + + json_string = json.dumps(data["metadata"], indent=4) + + + + + metadata = get_metadata( + record, + production=False, + validate=True, + emails=False, + schema="43", + token=False, + authors=False, +) + print(metadata["subjects"]) + + + if metadata["subjects"] : + for i in metadata["subjects"]: + for each_correct_subject in correction_subjects.keys(): + if i["subject"] == each_correct_subject and "id" not in i: + i["id"] = correction_subjects[each_correct_subject] + i["subject"] = each_correct_subject + + + + + caltechdata_edit( + record, + metadata=metadata, + token=token, + production=not test, + publish=True, + ) + + + metadata = get_metadata( + record, + production=False, + validate=True, + emails=False, + schema="43", + token=False, + authors=False, + ) + + return metadata + + + + + + + def match_cd_refs(): - token = os.environ["RDMTOK"] - - matches = [] - collection = "caltechdata.ds" - keys = dataset.keys(collection) - if "mediaupdate" in keys: - keys.remove("mediaupdate") - - # Get event data results - event_data = "crossref_refs.ds" - event_keys = dataset.keys(event_data) - event_keys.remove("captured") - f_name = "match_cd_refs" - dot_paths = [".obj_id", ".id", ".subj_id"] - labels = ["obj_id", "id", "subj_id"] - print("Getting Event Data Records") - if dataset.has_frame(event_data, f_name): - if not dataset.frame_reframe(event_data, f_name, event_keys): - err = dataset.error_message() - print(f"Failed to reframe {f_name} in {event_data}, {err}") - exit() - elif not dataset.frame_create(event_data, f_name, event_keys, dot_paths, labels): - err = dataset.error_message() - print(f"Failed to 
create frame {f_name} in {event_data}, {err}") - exit() - grid = dataset.frame_grid(event_data, f_name) - df = pd.DataFrame(np.array(grid), columns=["obj_id", "id", "subj_id"]) - grouped = df.groupby(["obj_id"]) - groups = grouped.groups - # Look at all CaltechDATA records - for k in keys: - # Collect matched new links for the record - record_matches = [] - metadata, err = dataset.read(collection, k) - for idv in metadata["identifiers"]: - if idv["identifierType"] == "oai": - rdm_id = idv["identifier"].split("oai:data.caltech.edu:")[1] - if err != "": - print(f"Unexpected error on read: {err}") - doi = "https://doi.org/" + k - if doi in groups: - hits = grouped.get_group(doi) - print(hits) - for index, h in hits.iterrows(): - # Trigger for whether we already have this link - new = True - if "relatedIdentifiers" in metadata: - for m in metadata["relatedIdentifiers"]: - if m["relatedIdentifier"] in h["subj_id"]: - new = False - if new == True: - match = h["subj_id"] - print(match) - print(h["obj_id"]) - inputv = input("Do you approve this link? Type Y or N: ") - if inputv == "Y": - record_matches.append(match) - # If we have to update record - if len(record_matches) > 0: - ids = [] - if "relatedIdentifiers" in metadata: - for m in metadata["relatedIdentifiers"]: - ids.append(m) - matches.append([k, record_matches]) - # Now collect identifiers for record - for match in record_matches: - split = match.split("doi.org/") - new_id = { - "relatedIdentifier": split[1], - "relatedIdentifierType": "DOI", - "relationType": "IsCitedBy", - } - ids.append(new_id) - metadata["relatedIdentifiers"] = ids - response = caltechdata_edit( - rdm_id, metadata, token, production=True, publish=True - ) - print(response) - return matches + token = os.environ["RDMTOK"] + + + matches = [] + collection = "caltechdata.ds" + keys = dataset.keys(collection) + if "mediaupdate" in keys: + keys.remove("mediaupdate") + + + # Get event data results + event_data = "crossref_refs.ds" + event_keys = dataset.keys(event_data) + event_keys.remove("captured") + f_name = "match_cd_refs" + dot_paths = [".obj_id", ".id", ".subj_id"] + labels = ["obj_id", "id", "subj_id"] + print("Getting Event Data Records") + if dataset.has_frame(event_data, f_name): + if not dataset.frame_reframe(event_data, f_name, event_keys): + err = dataset.error_message() + print(f"Failed to reframe {f_name} in {event_data}, {err}") + exit() + elif not dataset.frame_create(event_data, f_name, event_keys, dot_paths, labels): + err = dataset.error_message() + print(f"Failed to create frame {f_name} in {event_data}, {err}") + exit() + grid = dataset.frame_grid(event_data, f_name) + df = pd.DataFrame(np.array(grid), columns=["obj_id", "id", "subj_id"]) + grouped = df.groupby(["obj_id"]) + groups = grouped.groups + # Look at all CaltechDATA records + for k in keys: + # Collect matched new links for the record + record_matches = [] + metadata, err = dataset.read(collection, k) + for idv in metadata["identifiers"]: + if idv["identifierType"] == "oai": + rdm_id = idv["identifier"].split("oai:data.caltech.edu:")[1] + if err != "": + print(f"Unexpected error on read: {err}") + doi = "https://doi.org/" + k + if doi in groups: + hits = grouped.get_group(doi) + print(hits) + for index, h in hits.iterrows(): + # Trigger for whether we already have this link + new = True + if "relatedIdentifiers" in metadata: + for m in metadata["relatedIdentifiers"]: + if m["relatedIdentifier"] in h["subj_id"]: + new = False + if new == True: + match = h["subj_id"] + print(match) + 
print(h["obj_id"]) + inputv = input("Do you approve this link? Type Y or N: ") + if inputv == "Y": + record_matches.append(match) + # If we have to update record + if len(record_matches) > 0: + ids = [] + if "relatedIdentifiers" in metadata: + for m in metadata["relatedIdentifiers"]: + ids.append(m) + matches.append([k, record_matches]) + # Now collect identifiers for record + for match in record_matches: + split = match.split("doi.org/") + new_id = { + "relatedIdentifier": split[1], + "relatedIdentifierType": "DOI", + "relationType": "IsCitedBy", + } + ids.append(new_id) + metadata["relatedIdentifiers"] = ids + response = caltechdata_edit( + rdm_id, metadata, token, production=True, publish=True + ) + print(response) + return matches + + def match_codemeta(): - collection = "github_records.ds" - keys = dataset.keys(collection) - for k in keys: - existing, err = dataset.read(collection, k) - if err != "": - print(f"Unexpected error on read: {err}") - if "completed" not in existing: - print("Processing new record ", k) - if dataset.attachments(collection, k) != "": - dataset.detach(collection, k) - - # Update CaltechDATA - token = os.environ["TINDTOK"] - - infile = open("codemeta.json", "r") - try: - meta = json.load(infile) - except: - print("Invalid json file - Skipping forever ", k) - else: - standardized = codemeta_to_datacite(meta) - - # Check that all records have a GitHub subject tag - add = True - for s in standardized["subjects"]: - if s["subject"] == "Github": - add = False - if s["subject"] == "GitHub": - add = False - if add == True: - standardized["subjects"].append({"subject": "GitHub"}) - response = caltechdata_edit(k, standardized, token, {}, {}, True) - print(response) - os.system("rm codemeta.json") - - existing["completed"] = "True" - if not dataset.update(collection, k, existing): - err = dataset.error_message() - print(f"Unexpected error on read: {err}") + collection = "github_records.ds" + keys = dataset.keys(collection) + for k in keys: + existing, err = dataset.read(collection, k) + if err != "": + print(f"Unexpected error on read: {err}") + if "completed" not in existing: + print("Processing new record ", k) + if dataset.attachments(collection, k) != "": + dataset.detach(collection, k) + + + # Update CaltechDATA + token = os.environ["TINDTOK"] + + + infile = open("codemeta.json", "r") + try: + meta = json.load(infile) + except: + print("Invalid json file - Skipping forever ", k) + else: + standardized = codemeta_to_datacite(meta) + + + # Check that all records have a GitHub subject tag + add = True + for s in standardized["subjects"]: + if s["subject"] == "Github": + add = False + if s["subject"] == "GitHub": + add = False + if add == True: + standardized["subjects"].append({"subject": "GitHub"}) + response = caltechdata_edit(k, standardized, token, {}, {}, True) + print(response) + os.system("rm codemeta.json") + + + existing["completed"] = "True" + if not dataset.update(collection, k, existing): + err = dataset.error_message() + print(f"Unexpected error on read: {err}") + + def add_usage(collection, token, usage_collection, production=True): - """Add in usage text in the description field""" - keys = dataset.keys(collection) - biggest_views = 0 - biggest_views_record = "" - biggest_downloads = 0 - biggest_downloads_record = "" - total_views = 0 - total_downloads = 0 - for k in keys: - record, err = dataset.read(collection, k) - if err != "": - print(err) - exit() - usage, err = dataset.read(usage_collection, k) - views = usage["grand-total-unique-investigations"] 
-        downloads = usage["grand-total-unique-requests"]
-        if views > biggest_views:
-            biggest_views = views
-            biggest_views_record = k
-        if downloads > biggest_downloads:
-            biggest_downloads = downloads
-            biggest_downloads_record = k
-        total_views += views
-        total_downloads += downloads
-        date = datetime.fromisoformat(usage["dataset-dates"][0]["value"])
-        now = datetime.today()
-        first = date.strftime("%B %d, %Y")
-        last = now.strftime("%B %d, %Y")
-        if views > 1:
-            u_txt = (
-                "<br>Unique Views: "
-                + str(views)
-                + "<br>Unique Downloads: "
-                + str(downloads)
-                + "<br> between "
-                + first
-                + " and "
-                + last
-                + '<br><a href="https://data.caltech.edu/stats">More info on how stats are collected</a><br>"'
-            )
-            description = record["descriptions"]
-            use_exists = False
-            for d in description:
-                descr_text = d["description"]
-                # We always update an existing listing
-                if descr_text.startswith("<br>
Unique Views:"): - d["description"] = u_txt - use_exists = True - # Otherwise we add a new one - if use_exists == False: - description.append({"descriptionType": "Other", "description": u_txt}) - response = caltechdata_edit( - k, {"descriptions": description}, token, {}, {}, production - ) - print(response) - print(f"Most downloads {biggest_downloads} for record {biggest_downloads_record}") - print(f"Most views {biggest_views} for record {biggest_views_record}") - print(f"Total downloads {total_downloads}") - print(f"Total views {total_views}") + """Add in usage text in the description field""" + keys = dataset.keys(collection) + biggest_views = 0 + biggest_views_record = "" + biggest_downloads = 0 + biggest_downloads_record = "" + total_views = 0 + total_downloads = 0 + for k in keys: + record, err = dataset.read(collection, k) + if err != "": + print(err) + exit() + usage, err = dataset.read(usage_collection, k) + views = usage["grand-total-unique-investigations"] + downloads = usage["grand-total-unique-requests"] + if views > biggest_views: + biggest_views = views + biggest_views_record = k + if downloads > biggest_downloads: + biggest_downloads = downloads + biggest_downloads_record = k + total_views += views + total_downloads += downloads + date = datetime.fromisoformat(usage["dataset-dates"][0]["value"]) + now = datetime.today() + first = date.strftime("%B %d, %Y") + last = now.strftime("%B %d, %Y") + if views > 1: + u_txt = ( + "
Unique Views: "
+                + str(views)
+                + "<br>Unique Downloads: "
+                + str(downloads)
+                + "<br> between "
+                + first
+                + " and "
+                + last
+                + '<br><a href="https://data.caltech.edu/stats">More info on how stats are collected</a><br>"'
+            )
+            description = record["descriptions"]
+            use_exists = False
+            for d in description:
+                descr_text = d["description"]
+                # We always update an existing listing
+                if descr_text.startswith("<br>
Unique Views:"): + d["description"] = u_txt + use_exists = True + # Otherwise we add a new one + if use_exists == False: + description.append({"descriptionType": "Other", "description": u_txt}) + response = caltechdata_edit( + k, {"descriptions": description}, token, {}, {}, production + ) + print(response) + print(f"Most downloads {biggest_downloads} for record {biggest_downloads_record}") + print(f"Most views {biggest_views} for record {biggest_views_record}") + print(f"Total downloads {total_downloads}") + print(f"Total views {total_views}") + + def add_thesis_doi(data_collection, thesis_collection, token, production=True): - """Add in theis DOI to CaltechDATA records""" - - # Search across CaltechTHESIS DOIs - dot_paths = ["._Key", ".doi", ".official_url", ".related_url"] - labels = ["eprint_id", "doi", "official_url", "related_url"] - keys = dataset.keys(thesis_collection) - all_metadata = get_records(dot_paths, "dois", thesis_collection, keys, labels) - dois = [] - for metadata in progressbar(all_metadata, redirect_stdout=True): - if "doi" in metadata: - record_doi = metadata["doi"].strip() - if "related_url" in metadata and "items" in metadata["related_url"]: - items = metadata["related_url"]["items"] - for item in items: - if "url" in item: - url = item["url"].strip() - if "type" in item: - itype = item["type"].strip().lower() - if itype == "doi": - if idutils.is_doi(url): - doi = "10." + url.split("10.")[1] - prefix = doi.split("/")[0] - if prefix == "10.22002": - dois.append([doi, record_doi]) - else: - print("Ignoring non-DOI") - print(metadata["eprint_id"]) - print(url.split("10.")) - for doi_link in dois: - cd_doi = doi_link[0] - thesis_doi = doi_link[1] - # Exclude tombstone records - if cd_doi != "10.22002/D1.1987": - print("Checking " + cd_doi) - record, err = dataset.read(data_collection, cd_doi) - if err != "": - print(err) - exit() - - for idv in record["identifiers"]: - if idv["identifierType"] == "oai": - record_number = idv["identifier"].split("data.caltech.edu:")[1] - - done = False - if "relatedIdentifiers" in record: - for idv in record["relatedIdentifiers"]: - identifier = idv["relatedIdentifier"] - if identifier == thesis_doi: - done = True - if done == False: - identifiers = record["relatedIdentifiers"] - identifiers.append( - { - "relatedIdentifier": thesis_doi, - "relatedIdentifierType": "DOI", - "relationType": "IsSupplementTo", - } - ) - record["relatedIdentifiers"] = identifiers - else: - record["relatedIdentifiers"] = [ - { - "relatedIdentifier": thesis_doi, - "relatedIdentifierType": "DOI", - "relationType": "IsSupplementTo", - } - ] - if done == False: - print("Adding " + thesis_doi + " to " + cd_doi) - response = caltechdata_edit( - record_number, record, token, {}, True, publish=True - ) - print(response) + """Add in theis DOI to CaltechDATA records""" + + + # Search across CaltechTHESIS DOIs + dot_paths = ["._Key", ".doi", ".official_url", ".related_url"] + labels = ["eprint_id", "doi", "official_url", "related_url"] + keys = dataset.keys(thesis_collection) + all_metadata = get_records(dot_paths, "dois", thesis_collection, keys, labels) + dois = [] + for metadata in progressbar(all_metadata, redirect_stdout=True): + if "doi" in metadata: + record_doi = metadata["doi"].strip() + if "related_url" in metadata and "items" in metadata["related_url"]: + items = metadata["related_url"]["items"] + for item in items: + if "url" in item: + url = item["url"].strip() + if "type" in item: + itype = item["type"].strip().lower() + if itype == "doi": + if 
idutils.is_doi(url): + doi = "10." + url.split("10.")[1] + prefix = doi.split("/")[0] + if prefix == "10.22002": + dois.append([doi, record_doi]) + else: + print("Ignoring non-DOI") + print(metadata["eprint_id"]) + print(url.split("10.")) + for doi_link in dois: + cd_doi = doi_link[0] + thesis_doi = doi_link[1] + # Exclude tombstone records + if cd_doi != "10.22002/D1.1987": + print("Checking " + cd_doi) + record, err = dataset.read(data_collection, cd_doi) + if err != "": + print(err) + exit() + + + for idv in record["identifiers"]: + if idv["identifierType"] == "oai": + record_number = idv["identifier"].split("data.caltech.edu:")[1] + + + done = False + if "relatedIdentifiers" in record: + for idv in record["relatedIdentifiers"]: + identifier = idv["relatedIdentifier"] + if identifier == thesis_doi: + done = True + if done == False: + identifiers = record["relatedIdentifiers"] + identifiers.append( + { + "relatedIdentifier": thesis_doi, + "relatedIdentifierType": "DOI", + "relationType": "IsSupplementTo", + } + ) + record["relatedIdentifiers"] = identifiers + else: + record["relatedIdentifiers"] = [ + { + "relatedIdentifier": thesis_doi, + "relatedIdentifierType": "DOI", + "relationType": "IsSupplementTo", + } + ] + if done == False: + print("Adding " + thesis_doi + " to " + cd_doi) + response = caltechdata_edit( + record_number, record, token, {}, True, publish=True + ) + print(response) + + + diff --git a/run_subject_id_correction.py b/run_subject_id_correction.py new file mode 100644 index 00000000..ba0e3c38 --- /dev/null +++ b/run_subject_id_correction.py @@ -0,0 +1,29 @@ +from caltechdata_api import get_metadata +from ames.matchers import edit_subject +import os + + +def all_corrected(): + + + record = "2d2wf-j0256" + subjects_to_correct = {'Biological sciences': 'http://www.oecd.org/science/inno/38235147.pdf?1.6', 'Chemical sciences': 'http://www.oecd.org/science/inno/38235147.pdf?1.4', 'Computer and information sciences': 'http://www.oecd.org/science/inno/38235147.pdf?1.2'} + + + metadata = edit_subject("2d2wf-j0256", os.environ.get("CALTECH_DATA_API"), subjects_to_correct) + + + + + for i in metadata["subjects"]: + for each_correct_subject in subjects_to_correct.keys(): + if "id" in i.keys(): + if i["subject"] == each_correct_subject and i["id"] != subjects_to_correct[each_correct_subject]: + print(i["subject"],"'s id wasn't added.") + return False + print("All subject ids were added") + return True + + +all_corrected() + From 86f391caadb6c2b4b3cc3e39b57616f6568a4e50 Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Mon, 12 May 2025 13:29:43 -0700 Subject: [PATCH 02/18] added subject correction --- ames/matchers/__init__.py | 3 - ames/matchers/caltechdata.py | 641 +++++++++++++++++------------------ run_subject_id_correction.py | 41 +-- 3 files changed, 324 insertions(+), 361 deletions(-) diff --git a/ames/matchers/__init__.py b/ames/matchers/__init__.py index 07911bba..be300d1b 100644 --- a/ames/matchers/__init__.py +++ b/ames/matchers/__init__.py @@ -25,6 +25,3 @@ from .caltechauthors import save_metadata_to_file from .caltechauthors import add_related_identifiers_from_csv from .caltechauthors import add_authors_affiliations - - - diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py index b866fc8c..92283ab6 100644 --- a/ames/matchers/caltechdata.py +++ b/ames/matchers/caltechdata.py @@ -11,356 +11,319 @@ import requests - - def edit_subject(record, token, correction_subjects, test=True): - - if test: - rurl = 
"https://data.caltechlibrary.dev/api/records/" + record - else: - rurl = "https://data.caltechlibrary.dev/api/records/" + record - - - headers = { - "Authorization": "Bearer %s" % token, - "Content-type": "application/json", - } - - - data = requests.get(rurl, headers=headers).json() - - - json_string = json.dumps(data["metadata"], indent=4) - - - - - metadata = get_metadata( - record, - production=False, - validate=True, - emails=False, - schema="43", - token=False, - authors=False, -) - print(metadata["subjects"]) - - - if metadata["subjects"] : - for i in metadata["subjects"]: - for each_correct_subject in correction_subjects.keys(): - if i["subject"] == each_correct_subject and "id" not in i: - i["id"] = correction_subjects[each_correct_subject] - i["subject"] = each_correct_subject - - - - - caltechdata_edit( - record, - metadata=metadata, - token=token, - production=not test, - publish=True, - ) - - - metadata = get_metadata( - record, - production=False, - validate=True, - emails=False, - schema="43", - token=False, - authors=False, - ) - - return metadata - - - - - + if test: + rurl = "https://data.caltechlibrary.dev/api/records/" + record + else: + rurl = "https://data.caltechlibrary.dev/api/records/" + record + + headers = { + "Authorization": "Bearer %s" % token, + "Content-type": "application/json", + } + + data = requests.get(rurl, headers=headers).json() + + json_string = json.dumps(data["metadata"], indent=4) + + metadata = get_metadata( + record, + production=False, + validate=True, + emails=False, + schema="43", + token=False, + authors=False, + ) + print(metadata["subjects"]) + + if metadata["subjects"]: + for i in metadata["subjects"]: + for each_correct_subject in correction_subjects.keys(): + if i["subject"] == each_correct_subject and "id" not in i: + i["id"] = correction_subjects[each_correct_subject] + i["subject"] = each_correct_subject + + caltechdata_edit( + record, + metadata=metadata, + token=token, + production=not test, + publish=True, + ) + + metadata = get_metadata( + record, + production=False, + validate=True, + emails=False, + schema="43", + token=False, + authors=False, + ) + + return metadata def match_cd_refs(): - token = os.environ["RDMTOK"] - - - matches = [] - collection = "caltechdata.ds" - keys = dataset.keys(collection) - if "mediaupdate" in keys: - keys.remove("mediaupdate") - - - # Get event data results - event_data = "crossref_refs.ds" - event_keys = dataset.keys(event_data) - event_keys.remove("captured") - f_name = "match_cd_refs" - dot_paths = [".obj_id", ".id", ".subj_id"] - labels = ["obj_id", "id", "subj_id"] - print("Getting Event Data Records") - if dataset.has_frame(event_data, f_name): - if not dataset.frame_reframe(event_data, f_name, event_keys): - err = dataset.error_message() - print(f"Failed to reframe {f_name} in {event_data}, {err}") - exit() - elif not dataset.frame_create(event_data, f_name, event_keys, dot_paths, labels): - err = dataset.error_message() - print(f"Failed to create frame {f_name} in {event_data}, {err}") - exit() - grid = dataset.frame_grid(event_data, f_name) - df = pd.DataFrame(np.array(grid), columns=["obj_id", "id", "subj_id"]) - grouped = df.groupby(["obj_id"]) - groups = grouped.groups - # Look at all CaltechDATA records - for k in keys: - # Collect matched new links for the record - record_matches = [] - metadata, err = dataset.read(collection, k) - for idv in metadata["identifiers"]: - if idv["identifierType"] == "oai": - rdm_id = idv["identifier"].split("oai:data.caltech.edu:")[1] - if err != "": - 
print(f"Unexpected error on read: {err}") - doi = "https://doi.org/" + k - if doi in groups: - hits = grouped.get_group(doi) - print(hits) - for index, h in hits.iterrows(): - # Trigger for whether we already have this link - new = True - if "relatedIdentifiers" in metadata: - for m in metadata["relatedIdentifiers"]: - if m["relatedIdentifier"] in h["subj_id"]: - new = False - if new == True: - match = h["subj_id"] - print(match) - print(h["obj_id"]) - inputv = input("Do you approve this link? Type Y or N: ") - if inputv == "Y": - record_matches.append(match) - # If we have to update record - if len(record_matches) > 0: - ids = [] - if "relatedIdentifiers" in metadata: - for m in metadata["relatedIdentifiers"]: - ids.append(m) - matches.append([k, record_matches]) - # Now collect identifiers for record - for match in record_matches: - split = match.split("doi.org/") - new_id = { - "relatedIdentifier": split[1], - "relatedIdentifierType": "DOI", - "relationType": "IsCitedBy", - } - ids.append(new_id) - metadata["relatedIdentifiers"] = ids - response = caltechdata_edit( - rdm_id, metadata, token, production=True, publish=True - ) - print(response) - return matches - - + token = os.environ["RDMTOK"] + + matches = [] + collection = "caltechdata.ds" + keys = dataset.keys(collection) + if "mediaupdate" in keys: + keys.remove("mediaupdate") + + # Get event data results + event_data = "crossref_refs.ds" + event_keys = dataset.keys(event_data) + event_keys.remove("captured") + f_name = "match_cd_refs" + dot_paths = [".obj_id", ".id", ".subj_id"] + labels = ["obj_id", "id", "subj_id"] + print("Getting Event Data Records") + if dataset.has_frame(event_data, f_name): + if not dataset.frame_reframe(event_data, f_name, event_keys): + err = dataset.error_message() + print(f"Failed to reframe {f_name} in {event_data}, {err}") + exit() + elif not dataset.frame_create(event_data, f_name, event_keys, dot_paths, labels): + err = dataset.error_message() + print(f"Failed to create frame {f_name} in {event_data}, {err}") + exit() + grid = dataset.frame_grid(event_data, f_name) + df = pd.DataFrame(np.array(grid), columns=["obj_id", "id", "subj_id"]) + grouped = df.groupby(["obj_id"]) + groups = grouped.groups + # Look at all CaltechDATA records + for k in keys: + # Collect matched new links for the record + record_matches = [] + metadata, err = dataset.read(collection, k) + for idv in metadata["identifiers"]: + if idv["identifierType"] == "oai": + rdm_id = idv["identifier"].split("oai:data.caltech.edu:")[1] + if err != "": + print(f"Unexpected error on read: {err}") + doi = "https://doi.org/" + k + if doi in groups: + hits = grouped.get_group(doi) + print(hits) + for index, h in hits.iterrows(): + # Trigger for whether we already have this link + new = True + if "relatedIdentifiers" in metadata: + for m in metadata["relatedIdentifiers"]: + if m["relatedIdentifier"] in h["subj_id"]: + new = False + if new == True: + match = h["subj_id"] + print(match) + print(h["obj_id"]) + inputv = input("Do you approve this link? 
Type Y or N: ") + if inputv == "Y": + record_matches.append(match) + # If we have to update record + if len(record_matches) > 0: + ids = [] + if "relatedIdentifiers" in metadata: + for m in metadata["relatedIdentifiers"]: + ids.append(m) + matches.append([k, record_matches]) + # Now collect identifiers for record + for match in record_matches: + split = match.split("doi.org/") + new_id = { + "relatedIdentifier": split[1], + "relatedIdentifierType": "DOI", + "relationType": "IsCitedBy", + } + ids.append(new_id) + metadata["relatedIdentifiers"] = ids + response = caltechdata_edit( + rdm_id, metadata, token, production=True, publish=True + ) + print(response) + return matches def match_codemeta(): - collection = "github_records.ds" - keys = dataset.keys(collection) - for k in keys: - existing, err = dataset.read(collection, k) - if err != "": - print(f"Unexpected error on read: {err}") - if "completed" not in existing: - print("Processing new record ", k) - if dataset.attachments(collection, k) != "": - dataset.detach(collection, k) - - - # Update CaltechDATA - token = os.environ["TINDTOK"] - - - infile = open("codemeta.json", "r") - try: - meta = json.load(infile) - except: - print("Invalid json file - Skipping forever ", k) - else: - standardized = codemeta_to_datacite(meta) - - - # Check that all records have a GitHub subject tag - add = True - for s in standardized["subjects"]: - if s["subject"] == "Github": - add = False - if s["subject"] == "GitHub": - add = False - if add == True: - standardized["subjects"].append({"subject": "GitHub"}) - response = caltechdata_edit(k, standardized, token, {}, {}, True) - print(response) - os.system("rm codemeta.json") - - - existing["completed"] = "True" - if not dataset.update(collection, k, existing): - err = dataset.error_message() - print(f"Unexpected error on read: {err}") - - + collection = "github_records.ds" + keys = dataset.keys(collection) + for k in keys: + existing, err = dataset.read(collection, k) + if err != "": + print(f"Unexpected error on read: {err}") + if "completed" not in existing: + print("Processing new record ", k) + if dataset.attachments(collection, k) != "": + dataset.detach(collection, k) + + # Update CaltechDATA + token = os.environ["TINDTOK"] + + infile = open("codemeta.json", "r") + try: + meta = json.load(infile) + except: + print("Invalid json file - Skipping forever ", k) + else: + standardized = codemeta_to_datacite(meta) + + # Check that all records have a GitHub subject tag + add = True + for s in standardized["subjects"]: + if s["subject"] == "Github": + add = False + if s["subject"] == "GitHub": + add = False + if add == True: + standardized["subjects"].append({"subject": "GitHub"}) + response = caltechdata_edit(k, standardized, token, {}, {}, True) + print(response) + os.system("rm codemeta.json") + + existing["completed"] = "True" + if not dataset.update(collection, k, existing): + err = dataset.error_message() + print(f"Unexpected error on read: {err}") def add_usage(collection, token, usage_collection, production=True): - """Add in usage text in the description field""" - keys = dataset.keys(collection) - biggest_views = 0 - biggest_views_record = "" - biggest_downloads = 0 - biggest_downloads_record = "" - total_views = 0 - total_downloads = 0 - for k in keys: - record, err = dataset.read(collection, k) - if err != "": - print(err) - exit() - usage, err = dataset.read(usage_collection, k) - views = usage["grand-total-unique-investigations"] - downloads = usage["grand-total-unique-requests"] - if views > 
biggest_views:
-            biggest_views = views
-            biggest_views_record = k
-        if downloads > biggest_downloads:
-            biggest_downloads = downloads
-            biggest_downloads_record = k
-        total_views += views
-        total_downloads += downloads
-        date = datetime.fromisoformat(usage["dataset-dates"][0]["value"])
-        now = datetime.today()
-        first = date.strftime("%B %d, %Y")
-        last = now.strftime("%B %d, %Y")
-        if views > 1:
-            u_txt = (
-                "<br>Unique Views: "
-                + str(views)
-                + "<br>Unique Downloads: "
-                + str(downloads)
-                + "<br> between "
-                + first
-                + " and "
-                + last
-                + '<br><a href="https://data.caltech.edu/stats">More info on how stats are collected</a><br>"'
-            )
-            description = record["descriptions"]
-            use_exists = False
-            for d in description:
-                descr_text = d["description"]
-                # We always update an existing listing
-                if descr_text.startswith("<br>
Unique Views:"): - d["description"] = u_txt - use_exists = True - # Otherwise we add a new one - if use_exists == False: - description.append({"descriptionType": "Other", "description": u_txt}) - response = caltechdata_edit( - k, {"descriptions": description}, token, {}, {}, production - ) - print(response) - print(f"Most downloads {biggest_downloads} for record {biggest_downloads_record}") - print(f"Most views {biggest_views} for record {biggest_views_record}") - print(f"Total downloads {total_downloads}") - print(f"Total views {total_views}") - - + """Add in usage text in the description field""" + keys = dataset.keys(collection) + biggest_views = 0 + biggest_views_record = "" + biggest_downloads = 0 + biggest_downloads_record = "" + total_views = 0 + total_downloads = 0 + for k in keys: + record, err = dataset.read(collection, k) + if err != "": + print(err) + exit() + usage, err = dataset.read(usage_collection, k) + views = usage["grand-total-unique-investigations"] + downloads = usage["grand-total-unique-requests"] + if views > biggest_views: + biggest_views = views + biggest_views_record = k + if downloads > biggest_downloads: + biggest_downloads = downloads + biggest_downloads_record = k + total_views += views + total_downloads += downloads + date = datetime.fromisoformat(usage["dataset-dates"][0]["value"]) + now = datetime.today() + first = date.strftime("%B %d, %Y") + last = now.strftime("%B %d, %Y") + if views > 1: + u_txt = ( + "
Unique Views: "
+                + str(views)
+                + "<br>Unique Downloads: "
+                + str(downloads)
+                + "<br> between "
+                + first
+                + " and "
+                + last
+                + '<br><a href="https://data.caltech.edu/stats">More info on how stats are collected</a><br>"'
+            )
+            description = record["descriptions"]
+            use_exists = False
+            for d in description:
+                descr_text = d["description"]
+                # We always update an existing listing
+                if descr_text.startswith("<br>
Unique Views:"): + d["description"] = u_txt + use_exists = True + # Otherwise we add a new one + if use_exists == False: + description.append({"descriptionType": "Other", "description": u_txt}) + response = caltechdata_edit( + k, {"descriptions": description}, token, {}, {}, production + ) + print(response) + print(f"Most downloads {biggest_downloads} for record {biggest_downloads_record}") + print(f"Most views {biggest_views} for record {biggest_views_record}") + print(f"Total downloads {total_downloads}") + print(f"Total views {total_views}") def add_thesis_doi(data_collection, thesis_collection, token, production=True): - """Add in theis DOI to CaltechDATA records""" - - - # Search across CaltechTHESIS DOIs - dot_paths = ["._Key", ".doi", ".official_url", ".related_url"] - labels = ["eprint_id", "doi", "official_url", "related_url"] - keys = dataset.keys(thesis_collection) - all_metadata = get_records(dot_paths, "dois", thesis_collection, keys, labels) - dois = [] - for metadata in progressbar(all_metadata, redirect_stdout=True): - if "doi" in metadata: - record_doi = metadata["doi"].strip() - if "related_url" in metadata and "items" in metadata["related_url"]: - items = metadata["related_url"]["items"] - for item in items: - if "url" in item: - url = item["url"].strip() - if "type" in item: - itype = item["type"].strip().lower() - if itype == "doi": - if idutils.is_doi(url): - doi = "10." + url.split("10.")[1] - prefix = doi.split("/")[0] - if prefix == "10.22002": - dois.append([doi, record_doi]) - else: - print("Ignoring non-DOI") - print(metadata["eprint_id"]) - print(url.split("10.")) - for doi_link in dois: - cd_doi = doi_link[0] - thesis_doi = doi_link[1] - # Exclude tombstone records - if cd_doi != "10.22002/D1.1987": - print("Checking " + cd_doi) - record, err = dataset.read(data_collection, cd_doi) - if err != "": - print(err) - exit() - - - for idv in record["identifiers"]: - if idv["identifierType"] == "oai": - record_number = idv["identifier"].split("data.caltech.edu:")[1] - - - done = False - if "relatedIdentifiers" in record: - for idv in record["relatedIdentifiers"]: - identifier = idv["relatedIdentifier"] - if identifier == thesis_doi: - done = True - if done == False: - identifiers = record["relatedIdentifiers"] - identifiers.append( - { - "relatedIdentifier": thesis_doi, - "relatedIdentifierType": "DOI", - "relationType": "IsSupplementTo", - } - ) - record["relatedIdentifiers"] = identifiers - else: - record["relatedIdentifiers"] = [ - { - "relatedIdentifier": thesis_doi, - "relatedIdentifierType": "DOI", - "relationType": "IsSupplementTo", - } - ] - if done == False: - print("Adding " + thesis_doi + " to " + cd_doi) - response = caltechdata_edit( - record_number, record, token, {}, True, publish=True - ) - print(response) - - - + """Add in theis DOI to CaltechDATA records""" + + # Search across CaltechTHESIS DOIs + dot_paths = ["._Key", ".doi", ".official_url", ".related_url"] + labels = ["eprint_id", "doi", "official_url", "related_url"] + keys = dataset.keys(thesis_collection) + all_metadata = get_records(dot_paths, "dois", thesis_collection, keys, labels) + dois = [] + for metadata in progressbar(all_metadata, redirect_stdout=True): + if "doi" in metadata: + record_doi = metadata["doi"].strip() + if "related_url" in metadata and "items" in metadata["related_url"]: + items = metadata["related_url"]["items"] + for item in items: + if "url" in item: + url = item["url"].strip() + if "type" in item: + itype = item["type"].strip().lower() + if itype == "doi": + if 
idutils.is_doi(url): + doi = "10." + url.split("10.")[1] + prefix = doi.split("/")[0] + if prefix == "10.22002": + dois.append([doi, record_doi]) + else: + print("Ignoring non-DOI") + print(metadata["eprint_id"]) + print(url.split("10.")) + for doi_link in dois: + cd_doi = doi_link[0] + thesis_doi = doi_link[1] + # Exclude tombstone records + if cd_doi != "10.22002/D1.1987": + print("Checking " + cd_doi) + record, err = dataset.read(data_collection, cd_doi) + if err != "": + print(err) + exit() + + for idv in record["identifiers"]: + if idv["identifierType"] == "oai": + record_number = idv["identifier"].split("data.caltech.edu:")[1] + + done = False + if "relatedIdentifiers" in record: + for idv in record["relatedIdentifiers"]: + identifier = idv["relatedIdentifier"] + if identifier == thesis_doi: + done = True + if done == False: + identifiers = record["relatedIdentifiers"] + identifiers.append( + { + "relatedIdentifier": thesis_doi, + "relatedIdentifierType": "DOI", + "relationType": "IsSupplementTo", + } + ) + record["relatedIdentifiers"] = identifiers + else: + record["relatedIdentifiers"] = [ + { + "relatedIdentifier": thesis_doi, + "relatedIdentifierType": "DOI", + "relationType": "IsSupplementTo", + } + ] + if done == False: + print("Adding " + thesis_doi + " to " + cd_doi) + response = caltechdata_edit( + record_number, record, token, {}, True, publish=True + ) + print(response) diff --git a/run_subject_id_correction.py b/run_subject_id_correction.py index ba0e3c38..545921e6 100644 --- a/run_subject_id_correction.py +++ b/run_subject_id_correction.py @@ -5,25 +5,28 @@ def all_corrected(): - - record = "2d2wf-j0256" - subjects_to_correct = {'Biological sciences': 'http://www.oecd.org/science/inno/38235147.pdf?1.6', 'Chemical sciences': 'http://www.oecd.org/science/inno/38235147.pdf?1.4', 'Computer and information sciences': 'http://www.oecd.org/science/inno/38235147.pdf?1.2'} - - - metadata = edit_subject("2d2wf-j0256", os.environ.get("CALTECH_DATA_API"), subjects_to_correct) - - - - - for i in metadata["subjects"]: - for each_correct_subject in subjects_to_correct.keys(): - if "id" in i.keys(): - if i["subject"] == each_correct_subject and i["id"] != subjects_to_correct[each_correct_subject]: - print(i["subject"],"'s id wasn't added.") - return False - print("All subject ids were added") - return True + record = "2d2wf-j0256" + subjects_to_correct = { + "Biological sciences": "http://www.oecd.org/science/inno/38235147.pdf?1.6", + "Chemical sciences": "http://www.oecd.org/science/inno/38235147.pdf?1.4", + "Computer and information sciences": "http://www.oecd.org/science/inno/38235147.pdf?1.2", + } + + metadata = edit_subject( + "2d2wf-j0256", os.environ.get("CALTECH_DATA_API"), subjects_to_correct + ) + + for i in metadata["subjects"]: + for each_correct_subject in subjects_to_correct.keys(): + if "id" in i.keys(): + if ( + i["subject"] == each_correct_subject + and i["id"] != subjects_to_correct[each_correct_subject] + ): + print(i["subject"], "'s id wasn't added.") + return False + print("All subject ids were added") + return True all_corrected() - From 47d5a022aa0e71d65abbdf2ca334b6e8832b6e1a Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Mon, 12 May 2025 17:20:51 -0700 Subject: [PATCH 03/18] added name to codemeta --- codemeta.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/codemeta.json b/codemeta.json index 7f6c4edd..85481d36 100755 --- a/codemeta.json +++ b/codemeta.json @@ -39,6 +39,16 @@ "name": "Caltech" }, "@id": 
"https://orcid.org/0009-0002-2450-6471" + }, + { + "@type": "Person", + "givenName": "Alexander", + "familyName": "Abakah", + "affiliation": { + "@type": "Organization", + "name": "Caltech" + }, + "@id": "https://orcid.org/0009-0003-5640-6691" } ], "developmentStatus": "active", From c7d575dfd6067f93960803e9e7d76140eafde93b Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Tue, 13 May 2025 15:17:53 -0700 Subject: [PATCH 04/18] added unit tests to test for the behavour of numerous records and their subjects --- run_subject_id_correction.py | 9 +-- test_subjects.py | 132 +++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 test_subjects.py diff --git a/run_subject_id_correction.py b/run_subject_id_correction.py index 545921e6..584a557c 100644 --- a/run_subject_id_correction.py +++ b/run_subject_id_correction.py @@ -3,9 +3,9 @@ import os -def all_corrected(): +def all_corrected(record): - record = "2d2wf-j0256" + # record = "2d2wf-j0256" subjects_to_correct = { "Biological sciences": "http://www.oecd.org/science/inno/38235147.pdf?1.6", "Chemical sciences": "http://www.oecd.org/science/inno/38235147.pdf?1.4", @@ -13,7 +13,7 @@ def all_corrected(): } metadata = edit_subject( - "2d2wf-j0256", os.environ.get("CALTECH_DATA_API"), subjects_to_correct + record, os.environ.get("CALTECH_DATA_API"), subjects_to_correct ) for i in metadata["subjects"]: @@ -25,8 +25,9 @@ def all_corrected(): ): print(i["subject"], "'s id wasn't added.") return False + print("Final subjects:", metadata["subjects"]) print("All subject ids were added") return True -all_corrected() +# all_corrected() diff --git a/test_subjects.py b/test_subjects.py new file mode 100644 index 00000000..fc8d98a4 --- /dev/null +++ b/test_subjects.py @@ -0,0 +1,132 @@ +import unittest +import random, os +from run_subject_id_correction import all_corrected +from caltechdata_api import caltechdata_write + +os.environ["RDMTOK"] = "FVyjwsxBvfNXm5NmmfL8fKGI8hhA6puT9pNJO8PAyrLlNYdeMjfjhBVvuhbs" + + +titles = [ + "ClimateData2024", + "OceanSalinityRecords", + "GlobalTemperatureSet", + "PlantGrowthStudy", + "SoilCompositionData", + "WildlifeObservation2023", + "AirQualityMetrics", + "RainfallPatterns", + "ForestCoverAnalysis", + "BirdMigrationData" +] + + +subjects = [ + {"subject": "Biological Sciences"}, + {"subject": "Econs"}, + { + "subject": "Mathematics", + }, + {"subject": "biological Sciences"}, + { + "id": "http://www.oecd.org/science/inno/38235147.pdf?1.6", + "subject": "Biological sciences", + "scheme": "FOS" + }, + { + "subject": "Sociology", + }, + {"subject": "Political Science"}, + { + "subject": "Medical Sciences", + }, + {"subject": "Art History"}, + {"subject": "Chemical Sciences"}, + { + "subject": "Psychology", + }, + {"subject": "Law"}, + { + "subject": "Agricultural Sciences", + }, + {"subject": "Engineering"}, + { + "id": "http://www.oecd.org/science/inno/38235147.pdf?1.4", + "subject": "Chemical sciences", + "scheme": "FOS" + }, + {"subject": "Computer and information sciences"}, + { + "subject": "Educational Sciences", + }, + {"subject": "Linguistics"}, + {"subject": "Religious Studies"}, + { + "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2", + "subject": "Computer and information sciences", + "scheme": "FOS" + } +] + + +record_ids = [] + +metadata = { + "titles": [{"title": "enter title"}], + "creators": [ + { + "familyName": "Abakah", + "givenName": "Alexander", + "nameType": "Personal", + "nameIdentifiers": [ + {"nameIdentifier": "0009-0003-5640-6691", 
"nameIdentifierScheme": "ORCID"} + ], + "affiliations": [{"affiliation": "Caltech"}] + }, + + ], + "types": {"resourceType": "Dataset", "resourceTypeGeneral": "Dataset"}, + "descriptions": [{"description": "A data set of forest fires", "descriptionType": "Summary"}], + "dates": [{"date": "2023-11-30", "dateType": "Created"}], + "publisher": "Caltech Library", + "subjects": [{"subject":"Enter Subject"}], +} + + +for title_idx in range(len(titles)): + metadata["titles"][0]["title"] = titles[title_idx] + + number_of_subjects = random.randint(1, len(subjects)) + + + for subject in range(number_of_subjects): + subject_index = random.randint(1, len(subjects) - 1) + if len(metadata["subjects"]) == 1: + metadata["subjects"][0] = subjects[subject_index] + else: + metadata["subjects"].append(subjects[subject_index]) + + response = caltechdata_write( + metadata = metadata, + # files=files, + production=False, + publish= True +) + + + record_ids.append("" + response) + + + + + + +class TestSubjects(unittest.TestCase): + + def test_subject_changes(self): + for i in range(len(record_ids)): + self.assertEqual(all_corrected(record_ids[i]), True) + + + +if __name__ == '__main__': + unittest.main() From 5d06aa5881c7b10b03c101f7ff3f6d37cc20f447 Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Fri, 16 May 2025 14:05:07 -0700 Subject: [PATCH 05/18] Added regression tests for subjects correction --- ames/matchers/caltechdata.py | 2 +- test_subjects.py | 110 +++++++++++++++++++++++++++++++---- 2 files changed, 101 insertions(+), 11 deletions(-) diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py index 92283ab6..1951a723 100644 --- a/ames/matchers/caltechdata.py +++ b/ames/matchers/caltechdata.py @@ -41,7 +41,7 @@ def edit_subject(record, token, correction_subjects, test=True): if metadata["subjects"]: for i in metadata["subjects"]: for each_correct_subject in correction_subjects.keys(): - if i["subject"] == each_correct_subject and "id" not in i: + if i["subject"].lower() == each_correct_subject.lower() and "id" not in i: i["id"] = correction_subjects[each_correct_subject] i["subject"] = each_correct_subject diff --git a/test_subjects.py b/test_subjects.py index fc8d98a4..dba9b22b 100644 --- a/test_subjects.py +++ b/test_subjects.py @@ -1,10 +1,15 @@ import unittest -import random, os +import random, os, copy, time, requests from run_subject_id_correction import all_corrected -from caltechdata_api import caltechdata_write +from caltechdata_api import caltechdata_write, get_metadata os.environ["RDMTOK"] = "FVyjwsxBvfNXm5NmmfL8fKGI8hhA6puT9pNJO8PAyrLlNYdeMjfjhBVvuhbs" +headers = { + "Authorization": "Bearer %s" % "FVyjwsxBvfNXm5NmmfL8fKGI8hhA6puT9pNJO8PAyrLlNYdeMjfjhBVvuhbs", + "Content-type": "application/json", +} + titles = [ "ClimateData2024", @@ -92,6 +97,23 @@ } +# Creating a record with malformed subjects and check correction +malformed_metadata = copy.deepcopy(metadata) +malformed_metadata['subjects'] = [ + {"subject": " Biological sciences "}, # Extra spaces + {"subject": "CHEMICAL SCIENCES"}, # All caps + {"subject": "computer and information sciences"}, # Incorrect capitalization +] + +# Creating a test record +response = caltechdata_write( + metadata=malformed_metadata, + production=False, + publish=True +) +record_ids.append("" + response) + + for title_idx in range(len(titles)): metadata["titles"][0]["title"] = titles[title_idx] @@ -107,7 +129,6 @@ response = caltechdata_write( metadata = metadata, - # files=files, production=False, publish= True ) @@ -118,15 +139,84 @@ - - 
class TestSubjects(unittest.TestCase): - def test_subject_changes(self): - for i in range(len(record_ids)): - self.assertEqual(all_corrected(record_ids[i]), True) - + def setUp(self): + # Initialize test data + self.record_ids = record_ids + + def test_asubject_changes(self): + for i in range(len(self.record_ids)): + self.assertEqual(all_corrected(self.record_ids[i]), True) + + + def test_bsubject_case_normalization(self): + # Test that subjects with different case get normalized + for record_id in self.record_ids: + record_metadata = get_metadata( + record_id, + production=False, + validate=True, + emails=False, + schema="43", + token=False, + authors=False, + ) + for subject_idx in range(len(record_metadata["subjects"])): + if "subject" in record_metadata["subjects"][subject_idx] and isinstance(record_metadata["subjects"][subject_idx]["subject"], str): + # Check that subjects are properly cased (first letter capitalized) + self.assertTrue(record_metadata["subjects"][subject_idx]["subject"][0].isupper(), + f"Subject '{record_metadata["subjects"][subject_idx]["subject"]}' in record' {record_id} 'should start with uppercase") + + def test_csubject_id_present(self): + time.sleep(5) + # Test that subjects have proper IDs where applicable + for record_id in self.record_ids: + rurl = "https://data.caltechlibrary.dev/api/records/" + record_id + data = requests.get(rurl, headers=headers).json() + record_metadata = data["metadata"] + for subject_idx in range(len(record_metadata["subjects"])): + if record_metadata["subjects"][subject_idx]["subject"] in ["Biological sciences", "Chemical sciences", + "Computer and information sciences"]: + self.assertIn("id", record_metadata["subjects"][subject_idx], f"Subject '{record_metadata["subjects"][subject_idx]["subject"]}' in record' {record_id} 'should have an ID") + def test_dsubject_scheme_consistent(self): + # Test that subjects with IDs have consistent schemes + for record_id in self.record_ids: + record_metadata = get_metadata( + record_id, + production=False, + validate=True, + emails=False, + schema="43", + token=False, + authors=False, + ) + for subject_idx in range(len(record_metadata["subjects"])): + if 'id' in record_metadata["subjects"][subject_idx]: + self.assertIn('scheme', record_metadata["subjects"][subject_idx], + f"Subject with ID '{record_metadata["subjects"][subject_idx]["id"]}' should have a scheme") + self.assertEqual(record_metadata["subjects"][subject_idx]["scheme"], 'FOS', + f"Subject scheme for' {record_metadata["subjects"][subject_idx]["subject"]}'in record' {record_id}' should be 'FOS' but was '{record_metadata["subjects"][subject_idx]["scheme"]}'") + + def test_eduplicate_subjects_removed(self): + # Test that duplicate subjects are removed + for record_id in self.record_ids: + record_metadata = get_metadata( + record_id, + production=False, + validate=True, + emails=False, + schema="43", + token=False, + authors=False, + ) + subjects_list = [record_metadata["subjects"][subject_idx]["subject"].lower() for subject_idx in range(len(record_metadata["subjects"]))] + self.assertEqual(len(subjects_list), len(set(subjects_list)), + f"Found duplicate subjects in record {record_id}") + + + if __name__ == '__main__': - unittest.main() + unittest.main() \ No newline at end of file From 18a33238165e0654d9675e4a5b8dfb75bab58e18 Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Sat, 17 May 2025 10:12:04 -0700 Subject: [PATCH 06/18] addded test cases in edit_subjects --- ames/matchers/caltechdata.py | 32 +++++++++++++++++++++++++++++++- 1 
file changed, 31 insertions(+), 1 deletion(-) diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py index 1951a723..622bcac6 100644 --- a/ames/matchers/caltechdata.py +++ b/ames/matchers/caltechdata.py @@ -53,7 +53,7 @@ def edit_subject(record, token, correction_subjects, test=True): publish=True, ) - metadata = get_metadata( + record_metadata = get_metadata( record, production=False, validate=True, @@ -63,6 +63,36 @@ def edit_subject(record, token, correction_subjects, test=True): authors=False, ) + + for subject_idx in range(len(record_metadata["subjects"])): + if "subject" in record_metadata["subjects"][subject_idx] and isinstance(record_metadata["subjects"][subject_idx]["subject"], str): + # Check that subjects are properly cased (first letter capitalized) + assert record_metadata["subjects"][subject_idx]["subject"][0].isupper(), \ + f"Subject '{record_metadata["subjects"][subject_idx]["subject"]}' in record' {record} 'should start with uppercase" + + #check that each subject that should have an id has it + rurl = "https://data.caltechlibrary.dev/api/records/" + record + data = requests.get(rurl, headers=headers).json() + record_metadata = data["metadata"] + for subject_idx in range(len(record_metadata["subjects"])): + if record_metadata["subjects"][subject_idx]["subject"] in ["Biological sciences", "Chemical sciences", + "Computer and information sciences"]: + assert "id" in record_metadata["subjects"][subject_idx], \ + f"Subject '{record_metadata["subjects"][subject_idx]["subject"]}' in record' {record} 'should have an ID" + + #check that the schema is FOS + for subject_idx in range(len(record_metadata["subjects"])): + if 'id' in record_metadata["subjects"][subject_idx]: + assert 'scheme' in record_metadata["subjects"][subject_idx], \ + f"Subject with ID '{record_metadata["subjects"][subject_idx]["id"]}' should have a scheme" + assert record_metadata["subjects"][subject_idx]["scheme"] == 'FOS', \ + f"Subject scheme for' {record_metadata["subjects"][subject_idx]["subject"]}'in record' {record}' should be 'FOS' but was '{record_metadata["subjects"][subject_idx]["scheme"]}'" + + #check that there are no duplicate records. 
+ subjects_list = [record_metadata["subjects"][subject_idx]["subject"].lower() for subject_idx in range(len(record_metadata["subjects"]))] + assert len(subjects_list) == len(set(subjects_list)), \ + f"Found duplicate subjects in record {record}" + return metadata From ce4ff9ebdb31c38f2a805039e6e5895cca4f006f Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Mon, 19 May 2025 01:33:00 -0700 Subject: [PATCH 07/18] added more tests --- ames/matchers/caltechdata.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py index 622bcac6..2643e7b3 100644 --- a/ames/matchers/caltechdata.py +++ b/ames/matchers/caltechdata.py @@ -12,7 +12,6 @@ def edit_subject(record, token, correction_subjects, test=True): - if test: rurl = "https://data.caltechlibrary.dev/api/records/" + record else: @@ -63,7 +62,6 @@ def edit_subject(record, token, correction_subjects, test=True): authors=False, ) - for subject_idx in range(len(record_metadata["subjects"])): if "subject" in record_metadata["subjects"][subject_idx] and isinstance(record_metadata["subjects"][subject_idx]["subject"], str): # Check that subjects are properly cased (first letter capitalized) @@ -93,7 +91,23 @@ def edit_subject(record, token, correction_subjects, test=True): assert len(subjects_list) == len(set(subjects_list)), \ f"Found duplicate subjects in record {record}" - return metadata + #Check that each subject is non-empty (after trimming whitespace) + for idx, sub in enumerate(record_metadata["subjects"]): + trimmed_name = sub["subject"].strip() + assert trimmed_name, f"Subject at index {idx} in record '{record}' is empty or whitespace." + + #Check that “subject” fields are strings (not number, None, etc.) + for idx, sub in enumerate(record_metadata["subjects"]): + assert isinstance(sub["subject"], str), f"Subject at index {idx} in record '{record}' must be a string, got {type(sub['subject'])}." 
+ + # Check that no subject includes illegal characters (example: “@”) + forbidden_chars = ["@", "#", "%"] + for idx, sub in enumerate(record_metadata["subjects"]): + for c in forbidden_chars: + assert c not in sub["subject"], \ + f"Subject '{sub['subject']}' in record '{record}' includes forbidden character '{c}'" + + return record_metadata def match_cd_refs(): From f7cf5a2071e7259f3d043f80979cd80ef12ca2db Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Fri, 23 May 2025 00:58:12 -0700 Subject: [PATCH 08/18] refractoring tests --- test_subjects.py | 282 +++++++++++++++++------------------------------ 1 file changed, 104 insertions(+), 178 deletions(-) diff --git a/test_subjects.py b/test_subjects.py index dba9b22b..5ba2ec57 100644 --- a/test_subjects.py +++ b/test_subjects.py @@ -1,5 +1,5 @@ import unittest -import random, os, copy, time, requests +import os, copy, time, requests from run_subject_id_correction import all_corrected from caltechdata_api import caltechdata_write, get_metadata @@ -10,71 +10,7 @@ "Content-type": "application/json", } - -titles = [ - "ClimateData2024", - "OceanSalinityRecords", - "GlobalTemperatureSet", - "PlantGrowthStudy", - "SoilCompositionData", - "WildlifeObservation2023", - "AirQualityMetrics", - "RainfallPatterns", - "ForestCoverAnalysis", - "BirdMigrationData" -] - - -subjects = [ - {"subject": "Biological Sciences"}, - {"subject": "Econs"}, - { - "subject": "Mathematics", - }, - {"subject": "biological Sciences"}, - { - "id": "http://www.oecd.org/science/inno/38235147.pdf?1.6", - "subject": "Biological sciences", - "scheme": "FOS" - }, - { - "subject": "Sociology", - }, - {"subject": "Political Science"}, - { - "subject": "Medical Sciences", - }, - {"subject": "Art History"}, - {"subject": "Chemical Sciences"}, - { - "subject": "Psychology", - }, - {"subject": "Law"}, - { - "subject": "Agricultural Sciences", - }, - {"subject": "Engineering"}, - { - "id": "http://www.oecd.org/science/inno/38235147.pdf?1.4", - "subject": "Chemical sciences", - "scheme": "FOS" - }, - {"subject": "Computer and information sciences"}, - { - "subject": "Educational Sciences", - }, - {"subject": "Linguistics"}, - {"subject": "Religious Studies"}, - { - "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2", - "subject": "Computer and information sciences", - "scheme": "FOS" - } -] - - -record_ids = [] - +# Base metadata metadata = { "titles": [{"title": "enter title"}], "creators": [ @@ -86,136 +22,126 @@ {"nameIdentifier": "0009-0003-5640-6691", "nameIdentifierScheme": "ORCID"} ], "affiliations": [{"affiliation": "Caltech"}] - }, - + } ], "types": {"resourceType": "Dataset", "resourceTypeGeneral": "Dataset"}, "descriptions": [{"description": "A data set of forest fires", "descriptionType": "Summary"}], "dates": [{"date": "2023-11-30", "dateType": "Created"}], "publisher": "Caltech Library", - "subjects": [{"subject":"Enter Subject"}], + "subjects": [{"subject": "Enter Subject"}], } - -# Creating a record with malformed subjects and check correction +# A version of the metadata that is deliberately malformed malformed_metadata = copy.deepcopy(metadata) -malformed_metadata['subjects'] = [ - {"subject": " Biological sciences "}, # Extra spaces - {"subject": "CHEMICAL SCIENCES"}, # All caps - {"subject": "computer and information sciences"}, # Incorrect capitalization +malformed_metadata["subjects"] = [ + {"subject": " Biological sciences "}, # Extra spaces + {"subject": "CHEMICAL SCIENCES"}, # All caps + {"subject": "computer and information sciences"}, # Incorrect 
capitalization ] -# Creating a test record -response = caltechdata_write( - metadata=malformed_metadata, - production=False, - publish=True -) -record_ids.append("" + response) - - -for title_idx in range(len(titles)): - metadata["titles"][0]["title"] = titles[title_idx] - - number_of_subjects = random.randint(1, len(subjects)) - - - for subject in range(number_of_subjects): - subject_index = random.randint(1, len(subjects) - 1) - if len(metadata["subjects"]) == 1: - metadata["subjects"][0] = subjects[subject_index] - else: - metadata["subjects"].append(subjects[subject_index]) - - response = caltechdata_write( - metadata = metadata, - production=False, - publish= True -) - - - record_ids.append("" + response) - - - class TestSubjects(unittest.TestCase): - - def setUp(self): - # Initialize test data - self.record_ids = record_ids - + def test_asubject_changes(self): - for i in range(len(self.record_ids)): - self.assertEqual(all_corrected(self.record_ids[i]), True) - - + # Create a test record with malformed subjects + test_data = copy.deepcopy(malformed_metadata) + record_id = caltechdata_write( + metadata=test_data, + production=False, + publish=True + ) + # Verify correction + self.assertEqual(all_corrected(record_id), True, f"Subjects in record {record_id} were not corrected properly") + def test_bsubject_case_normalization(self): - # Test that subjects with different case get normalized - for record_id in self.record_ids: - record_metadata = get_metadata( - record_id, - production=False, - validate=True, - emails=False, - schema="43", - token=False, - authors=False, - ) - for subject_idx in range(len(record_metadata["subjects"])): - if "subject" in record_metadata["subjects"][subject_idx] and isinstance(record_metadata["subjects"][subject_idx]["subject"], str): - # Check that subjects are properly cased (first letter capitalized) - self.assertTrue(record_metadata["subjects"][subject_idx]["subject"][0].isupper(), - f"Subject '{record_metadata["subjects"][subject_idx]["subject"]}' in record' {record_id} 'should start with uppercase") - + # Create a record whose subjects need case normalization + test_data = copy.deepcopy(malformed_metadata) + record_id = caltechdata_write( + metadata=test_data, + production=False, + publish=True + ) + record_metadata = get_metadata( + record_id, production=False, validate=True, emails=False, schema="43", token=False, authors=False + ) + for subject_obj in record_metadata.get("subjects", []): + if "subject" in subject_obj and isinstance(subject_obj["subject"], str): + self.assertTrue( + subject_obj["subject"][0].isupper(), + f"Subject '{subject_obj['subject']}' in record {record_id} should start with uppercase" + ) + def test_csubject_id_present(self): + # Create a record with known subjects that should map to IDs + test_data = copy.deepcopy(malformed_metadata) + test_data["subjects"] = [ + {"subject": "Biological sciences"}, + {"subject": "Chemical sciences"}, + {"subject": "Computer and information sciences"}, + ] + record_id = caltechdata_write( + metadata=test_data, + production=False, + publish=True + ) time.sleep(5) - # Test that subjects have proper IDs where applicable - for record_id in self.record_ids: - rurl = "https://data.caltechlibrary.dev/api/records/" + record_id - data = requests.get(rurl, headers=headers).json() - record_metadata = data["metadata"] - for subject_idx in range(len(record_metadata["subjects"])): - if record_metadata["subjects"][subject_idx]["subject"] in ["Biological sciences", "Chemical sciences", - "Computer and 
information sciences"]: - self.assertIn("id", record_metadata["subjects"][subject_idx], f"Subject '{record_metadata["subjects"][subject_idx]["subject"]}' in record' {record_id} 'should have an ID") - + rurl = "https://data.caltechlibrary.dev/api/records/" + record_id + data = requests.get(rurl, headers=headers).json() + record_metadata = data["metadata"] + for subject_obj in record_metadata.get("subjects", []): + if subject_obj["subject"] in ["Biological sciences", "Chemical sciences", "Computer and information sciences"]: + self.assertIn("id", subject_obj, f"Subject '{subject_obj['subject']}' in record {record_id} should have an ID") + def test_dsubject_scheme_consistent(self): - # Test that subjects with IDs have consistent schemes - for record_id in self.record_ids: - record_metadata = get_metadata( - record_id, - production=False, - validate=True, - emails=False, - schema="43", - token=False, - authors=False, - ) - for subject_idx in range(len(record_metadata["subjects"])): - if 'id' in record_metadata["subjects"][subject_idx]: - self.assertIn('scheme', record_metadata["subjects"][subject_idx], - f"Subject with ID '{record_metadata["subjects"][subject_idx]["id"]}' should have a scheme") - self.assertEqual(record_metadata["subjects"][subject_idx]["scheme"], 'FOS', - f"Subject scheme for' {record_metadata["subjects"][subject_idx]["subject"]}'in record' {record_id}' should be 'FOS' but was '{record_metadata["subjects"][subject_idx]["scheme"]}'") - + # Create a record with IDs that should link to scheme FOS + test_data = copy.deepcopy(metadata) + test_data["subjects"] = [ + { + "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2", + "subject": "Computer and information sciences", + "scheme": "FOS" + } + ] + record_id = caltechdata_write( + metadata=test_data, + production=False, + publish=True + ) + record_metadata = get_metadata( + record_id, production=False, validate=True, emails=False, schema="43", token=False, authors=False + ) + for subject_obj in record_metadata.get("subjects", []): + if "id" in subject_obj: + self.assertIn( + "scheme", subject_obj, + f"Subject with ID '{subject_obj['id']}' should have a scheme" + ) + self.assertEqual( + subject_obj["scheme"], "FOS", + f"Subject scheme for '{subject_obj['subject']}' in record {record_id} should be 'FOS'" + ) + def test_eduplicate_subjects_removed(self): - # Test that duplicate subjects are removed - for record_id in self.record_ids: - record_metadata = get_metadata( - record_id, - production=False, - validate=True, - emails=False, - schema="43", - token=False, - authors=False, - ) - subjects_list = [record_metadata["subjects"][subject_idx]["subject"].lower() for subject_idx in range(len(record_metadata["subjects"]))] - self.assertEqual(len(subjects_list), len(set(subjects_list)), - f"Found duplicate subjects in record {record_id}") - - + # Create a record with duplicate subjects + test_data = copy.deepcopy(metadata) + test_data["subjects"] = [ + {"subject": "Biological sciences"}, + {"subject": "biological Sciences"}, + ] + record_id = caltechdata_write( + metadata=test_data, + production=False, + publish=True + ) + record_metadata = get_metadata( + record_id, production=False, validate=True, emails=False, schema="43", token=False, authors=False + ) + subjects_list = [s["subject"].lower() for s in record_metadata.get("subjects", [])] + self.assertEqual( + len(subjects_list), + len(set(subjects_list)), + f"Found duplicate subjects in record {record_id}" + ) if __name__ == '__main__': From 4ca8e24c743087490b6ea3ac9d0f4ac53284b43c 
Mon Sep 17 00:00:00 2001
From: Alexander Abakah
Date: Fri, 23 May 2025 19:28:52 -0700
Subject: [PATCH 09/18] refactoring tests

---
 test_subjects.py | 56 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 15 deletions(-)

diff --git a/test_subjects.py b/test_subjects.py
index 5ba2ec57..7670363a 100644
--- a/test_subjects.py
+++ b/test_subjects.py
@@ -10,7 +10,6 @@
     "Content-type": "application/json",
 }
 
-# Base metadata
 metadata = {
     "titles": [{"title": "enter title"}],
     "creators": [
@@ -42,8 +41,8 @@
 
 class TestSubjects(unittest.TestCase):
 
-    def test_asubject_changes(self):
-        # Create a test record with malformed subjects
+    def test_subject_changes(self):
+        # Creates a test record with malformed subjects
         test_data = copy.deepcopy(malformed_metadata)
         record_id = caltechdata_write(
             metadata=test_data,
@@ -52,9 +51,10 @@ def test_asubject_changes(self):
         )
         # Verify correction
         self.assertEqual(all_corrected(record_id), True, f"Subjects in record {record_id} were not corrected properly")
+        print("Passed test_subject_changes")
 
-    def test_bsubject_case_normalization(self):
-        # Create a record whose subjects need case normalization
+    def test_subject_case_normalization(self):
+        # Creates a record whose subjects need case normalization
         test_data = copy.deepcopy(malformed_metadata)
         record_id = caltechdata_write(
             metadata=test_data,
@@ -70,9 +70,10 @@ def test_bsubject_case_normalization(self):
                 subject_obj["subject"][0].isupper(),
                 f"Subject '{subject_obj['subject']}' in record {record_id} should start with uppercase"
             )
+        print("Passed test_subject_case_normalization")
 
-    def test_csubject_id_present(self):
-        # Create a record with known subjects that should map to IDs
+    def test_subject_id_present(self):
+        # Creates a record with known subjects that should map to IDs
         test_data = copy.deepcopy(malformed_metadata)
         test_data["subjects"] = [
             {"subject": "Biological sciences"},
@@ -91,15 +92,16 @@ def test_csubject_id_present(self):
         for subject_obj in record_metadata.get("subjects", []):
             if subject_obj["subject"] in ["Biological sciences", "Chemical sciences", "Computer and information sciences"]:
                 self.assertIn("id", subject_obj, f"Subject '{subject_obj['subject']}' in record {record_id} should have an ID")
+        print("Passed test_subject_id_present")
 
-    def test_dsubject_scheme_consistent(self):
-        # Create a record with IDs that should link to scheme FOS
+    def test_subject_scheme_consistent(self):
+        # Creates a record with IDs that should link to scheme FOS
         test_data = copy.deepcopy(metadata)
         test_data["subjects"] = [
             {
                 "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2",
                 "subject": "Computer and information sciences",
-                "scheme": "FOS"
+                "scheme": "fos"
             }
         ]
         record_id = caltechdata_write(
@@ -112,16 +114,38 @@ def test_dsubject_scheme_consistent(self):
         )
         record_metadata = get_metadata(
             record_id, production=False, validate=True, emails=False, schema="43", token=False, authors=False
         )
         for subject_obj in record_metadata.get("subjects", []):
             if "id" in subject_obj:
-                self.assertIn(
-                    "scheme", subject_obj,
-                    f"Subject with ID '{subject_obj['id']}' should have a scheme"
-                )
                 self.assertEqual(
                     subject_obj["scheme"], "FOS",
                     f"Subject scheme for '{subject_obj['subject']}' in record {record_id} should be 'FOS'"
                 )
+        print("Passed test_subject_scheme_consistent")
+
+    def test_subject_has_scheme(self):
+        # Creates a record whose subject ID doesn't have a scheme
+        test_data = copy.deepcopy(metadata)
+        test_data["subjects"] = [
+            {
+                "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2",
+                "subject": "Computer and information sciences",
+            }
+        ]
+        record_id = caltechdata_write(
+            metadata=test_data,
+            production=False,
+            publish=True
+        )
+        record_metadata = get_metadata(
+            record_id, production=False, validate=True, emails=False, schema="43", token=False, authors=False
+        )
+        for subject_obj in record_metadata.get("subjects", []):
+            if "id" in subject_obj:
+                self.assertIn(
+                    "scheme", subject_obj,
+                    f"Subject with ID '{subject_obj['id']}' should have a scheme"
+                )
+        print("Passed test_subject_has_scheme")
 
-    def test_eduplicate_subjects_removed(self):
+    def test_duplicate_subjects_removed(self):
         # Create a record with duplicate subjects
         test_data = copy.deepcopy(metadata)
         test_data["subjects"] = [
@@ -142,6 +166,8 @@ def test_duplicate_subjects_removed(self):
             len(set(subjects_list)),
             f"Found duplicate subjects in record {record_id}"
         )
+        print("Passed test_duplicate_subjects_removed")
+
 
 if __name__ == '__main__':

From 1daa180fee0f4ebf4dec3f4b9432854eb3c6fd17 Mon Sep 17 00:00:00 2001
From: Alexander Abakah
Date: Sun, 25 May 2025 11:27:55 -0700
Subject: [PATCH 10/18] refactoring tests

---
 test_subjects.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/test_subjects.py b/test_subjects.py
index 7670363a..d8202bea 100644
--- a/test_subjects.py
+++ b/test_subjects.py
@@ -169,6 +169,52 @@ def test_duplicate_subjects_removed(self):
         print("Passed test_duplicate_subjects_removed")
 
 
+    def test_subjects_with_id_no_text(self):
+        # Create a record with a subject id but no subject text
+
+        # This test would raise a validation error because caltechdata_write expects each subject
+        # to have a subject text.
+        test_data = copy.deepcopy(metadata)
+        test_data["subjects"] = [
+            {
+                "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2",
+            }
+        ]
+
+        try:
+            record_id = caltechdata_write(
+                metadata=test_data,
+                production=False,
+                publish=True
+            )
+
+            print("Passed test_subjects_with_id_no_text")
+
+        except Exception as e:
+            self.fail(f"Each subject with an id should have a subject text: {e}")
+
+    def test_subjects_with_malformed_ids(self):
+        # Create a record with a malformed subject id URL
+
+        test_data = copy.deepcopy(metadata)
+        test_data["subjects"] = [
+            {
+                "id": "http://www.oecd.org/science/inno/382351479.pdf?1.2",
+                "subject": "Computer and information sciences",
+            }
+        ]
+
+        try:
+            record_id = caltechdata_write(
+                metadata=test_data,
+                production=False,
+                publish=True
+            )
+            print("Passed test_subjects_with_malformed_ids")
+        except Exception as e:
+            self.fail(f"Writing a record with a malformed subject id raised an error: {e}")
+
+
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file

From db9103b131089518dd06551d83780cb55c408395 Mon Sep 17 00:00:00 2001
From: Alexander Abakah
Date: Mon, 26 May 2025 10:48:50 -0700
Subject: [PATCH 11/18] complete

---
 ames/matchers/caltechdata.py | 30 +++++++++++++----
 test_subjects.py             | 65 ++++++++++--------------------------
 2 files changed, 41 insertions(+), 54 deletions(-)

diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py
index 2643e7b3..02671b4a 100644
--- a/ames/matchers/caltechdata.py
+++ b/ames/matchers/caltechdata.py
@@ -35,14 +35,32 @@ def edit_subject(record, token, correction_subjects, test=True):
         token=False,
         authors=False,
     )
+    print(metadata["subjects"])
 
-    if metadata["subjects"]:
-        for i in metadata["subjects"]:
-            for each_correct_subject in correction_subjects.keys():
-                if i["subject"].lower() == each_correct_subject.lower() and "id" not in i:
-                    i["id"] = correction_subjects[each_correct_subject]
-                    i["subject"] = each_correct_subject
+    seen_subjects = set()
+    new_subjects = []
+
+    for subject_entry in metadata["subjects"]:
subject_name = subject_entry["subject"].strip().lower() + + if subject_name in seen_subjects: + continue + seen_subjects.add(subject_name) + + + for correct_subject in correction_subjects.keys(): + if subject_name == correct_subject.lower() and "id" not in subject_entry: + subject_entry["id"] = correction_subjects[correct_subject] + subject_entry["subject"] = correct_subject + + new_subjects.append(subject_entry) + + metadata["subjects"] = new_subjects + + + + print(metadata["subjects"]) caltechdata_edit( record, diff --git a/test_subjects.py b/test_subjects.py index d8202bea..96ccc689 100644 --- a/test_subjects.py +++ b/test_subjects.py @@ -61,6 +61,9 @@ def test_subject_case_normalization(self): production=False, publish=True ) + + all_corrected(record_id) + record_metadata = get_metadata( record_id, production=False, validate=True, emails=False, schema="43", token=False, authors=False ) @@ -85,7 +88,9 @@ def test_subject_id_present(self): production=False, publish=True ) - time.sleep(5) + + all_corrected(record_id) + rurl = "https://data.caltechlibrary.dev/api/records/" + record_id data = requests.get(rurl, headers=headers).json() record_metadata = data["metadata"] @@ -109,6 +114,9 @@ def test_subject_scheme_consistent(self): production=False, publish=True ) + + all_corrected(record_id) + record_metadata = get_metadata( record_id, production=False, validate=True, emails=False, schema="43", token=False, authors=False ) @@ -134,6 +142,9 @@ def test_subject_has_scheme(self): production=False, publish=True ) + + all_corrected(record_id) + record_metadata = get_metadata( record_id, production=False, validate=True, emails=False, schema="43", token=False, authors=False ) @@ -157,9 +168,14 @@ def test_duplicate_subjects_removed(self): production=False, publish=True ) + + all_corrected(record_id) + record_metadata = get_metadata( record_id, production=False, validate=True, emails=False, schema="43", token=False, authors=False ) + + subjects_list = [s["subject"].lower() for s in record_metadata.get("subjects", [])] self.assertEqual( len(subjects_list), @@ -169,52 +185,5 @@ def test_duplicate_subjects_removed(self): print("Passed test_duplicate_subjects_removed") - def test_subjects_with_id_no_text(self): - # Create a record with subject id but no subject text - - #This test would raise a validation error because caltechdata_write expects each subject - # to have a subject text. 
- test_data = copy.deepcopy(metadata) - test_data["subjects"] = [ - { - "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2", - } - ] - - try: - record_id = caltechdata_write( - metadata=test_data, - production=False, - publish=True - ) - - print("Passed test_subjects_with_id_no_text") - - except Exception as e: - self.fail(f"Each subject with an id shold have a subject text: {e}") - - def test_subjects_with_malformed_ids(self): - # Create a record with malformed url - - test_data = copy.deepcopy(metadata) - test_data["subjects"] = [ - { - "id": "http://www.oecd.org/science/inno/382351479.pdf?1.2", - "subject": "Computer and information sciences", - } - ] - - try: - record_id = caltechdata_write( - metadata=test_data, - production=False, - publish=True - ) - print("Passed test_subjects_with_malformed_ids") - except Exception as e: - self.fail(f"Subject id: {e} is not correct") - - - if __name__ == '__main__': unittest.main() \ No newline at end of file From e6c0321f95747cf7c132db8bf8dea42ff7091c3a Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Thu, 29 May 2025 10:53:55 -0700 Subject: [PATCH 12/18] added --- run_subject_id_correction.py | 1 - 1 file changed, 1 deletion(-) diff --git a/run_subject_id_correction.py b/run_subject_id_correction.py index 584a557c..8c3a0bfc 100644 --- a/run_subject_id_correction.py +++ b/run_subject_id_correction.py @@ -5,7 +5,6 @@ def all_corrected(record): - # record = "2d2wf-j0256" subjects_to_correct = { "Biological sciences": "http://www.oecd.org/science/inno/38235147.pdf?1.6", "Chemical sciences": "http://www.oecd.org/science/inno/38235147.pdf?1.4", From 6a6d9b3307625fc7f2ef4014e743caf555500608 Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Mon, 9 Jun 2025 08:04:06 -0700 Subject: [PATCH 13/18] new changes --- ames/matchers/caltechdata.py | 49 +----------------------------------- test_subjects.py | 10 +++----- 2 files changed, 5 insertions(+), 54 deletions(-) diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py index 02671b4a..6e6c119e 100644 --- a/ames/matchers/caltechdata.py +++ b/ames/matchers/caltechdata.py @@ -28,7 +28,7 @@ def edit_subject(record, token, correction_subjects, test=True): metadata = get_metadata( record, - production=False, + production=not test, validate=True, emails=False, schema="43", @@ -36,8 +36,6 @@ def edit_subject(record, token, correction_subjects, test=True): authors=False, ) - print(metadata["subjects"]) - seen_subjects = set() new_subjects = [] @@ -80,51 +78,6 @@ def edit_subject(record, token, correction_subjects, test=True): authors=False, ) - for subject_idx in range(len(record_metadata["subjects"])): - if "subject" in record_metadata["subjects"][subject_idx] and isinstance(record_metadata["subjects"][subject_idx]["subject"], str): - # Check that subjects are properly cased (first letter capitalized) - assert record_metadata["subjects"][subject_idx]["subject"][0].isupper(), \ - f"Subject '{record_metadata["subjects"][subject_idx]["subject"]}' in record' {record} 'should start with uppercase" - - #check that each subject that should have an id has it - rurl = "https://data.caltechlibrary.dev/api/records/" + record - data = requests.get(rurl, headers=headers).json() - record_metadata = data["metadata"] - for subject_idx in range(len(record_metadata["subjects"])): - if record_metadata["subjects"][subject_idx]["subject"] in ["Biological sciences", "Chemical sciences", - "Computer and information sciences"]: - assert "id" in record_metadata["subjects"][subject_idx], \ - f"Subject 
'{record_metadata["subjects"][subject_idx]["subject"]}' in record' {record} 'should have an ID" - - #check that the schema is FOS - for subject_idx in range(len(record_metadata["subjects"])): - if 'id' in record_metadata["subjects"][subject_idx]: - assert 'scheme' in record_metadata["subjects"][subject_idx], \ - f"Subject with ID '{record_metadata["subjects"][subject_idx]["id"]}' should have a scheme" - assert record_metadata["subjects"][subject_idx]["scheme"] == 'FOS', \ - f"Subject scheme for' {record_metadata["subjects"][subject_idx]["subject"]}'in record' {record}' should be 'FOS' but was '{record_metadata["subjects"][subject_idx]["scheme"]}'" - - #check that there are no duplicate records. - subjects_list = [record_metadata["subjects"][subject_idx]["subject"].lower() for subject_idx in range(len(record_metadata["subjects"]))] - assert len(subjects_list) == len(set(subjects_list)), \ - f"Found duplicate subjects in record {record}" - - #Check that each subject is non-empty (after trimming whitespace) - for idx, sub in enumerate(record_metadata["subjects"]): - trimmed_name = sub["subject"].strip() - assert trimmed_name, f"Subject at index {idx} in record '{record}' is empty or whitespace." - - #Check that “subject” fields are strings (not number, None, etc.) - for idx, sub in enumerate(record_metadata["subjects"]): - assert isinstance(sub["subject"], str), f"Subject at index {idx} in record '{record}' must be a string, got {type(sub['subject'])}." - - # Check that no subject includes illegal characters (example: “@”) - forbidden_chars = ["@", "#", "%"] - for idx, sub in enumerate(record_metadata["subjects"]): - for c in forbidden_chars: - assert c not in sub["subject"], \ - f"Subject '{sub['subject']}' in record '{record}' includes forbidden character '{c}'" - return record_metadata diff --git a/test_subjects.py b/test_subjects.py index 96ccc689..2975ff07 100644 --- a/test_subjects.py +++ b/test_subjects.py @@ -3,8 +3,6 @@ from run_subject_id_correction import all_corrected from caltechdata_api import caltechdata_write, get_metadata -os.environ["RDMTOK"] = "FVyjwsxBvfNXm5NmmfL8fKGI8hhA6puT9pNJO8PAyrLlNYdeMjfjhBVvuhbs" - headers = { "Authorization": "Bearer %s" % "FVyjwsxBvfNXm5NmmfL8fKGI8hhA6puT9pNJO8PAyrLlNYdeMjfjhBVvuhbs", "Content-type": "application/json", @@ -65,7 +63,7 @@ def test_subject_case_normalization(self): all_corrected(record_id) record_metadata = get_metadata( - record_id, production=False, validate=True, emails=False, schema="43", token=False, authors=False + record_id, production=False ) for subject_obj in record_metadata.get("subjects", []): if "subject" in subject_obj and isinstance(subject_obj["subject"], str): @@ -118,7 +116,7 @@ def test_subject_scheme_consistent(self): all_corrected(record_id) record_metadata = get_metadata( - record_id, production=False, validate=True, emails=False, schema="43", token=False, authors=False + record_id, production=False ) for subject_obj in record_metadata.get("subjects", []): if "id" in subject_obj: @@ -146,7 +144,7 @@ def test_subject_has_scheme(self): all_corrected(record_id) record_metadata = get_metadata( - record_id, production=False, validate=True, emails=False, schema="43", token=False, authors=False + record_id, production=False ) for subject_obj in record_metadata.get("subjects", []): if "id" in subject_obj: @@ -172,7 +170,7 @@ def test_duplicate_subjects_removed(self): all_corrected(record_id) record_metadata = get_metadata( - record_id, production=False, validate=True, emails=False, schema="43", token=False, 
authors=False + record_id, production=False ) From 7f9654d53141eff3a3c6999594dff7387d621a61 Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Mon, 9 Jun 2025 09:57:04 -0700 Subject: [PATCH 14/18] fixed issues upper and lower case --- ames/matchers/caltechdata.py | 13 ++----------- test_subjects.py | 24 +----------------------- 2 files changed, 3 insertions(+), 34 deletions(-) diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py index 6e6c119e..73a5c19c 100644 --- a/ames/matchers/caltechdata.py +++ b/ames/matchers/caltechdata.py @@ -36,27 +36,18 @@ def edit_subject(record, token, correction_subjects, test=True): authors=False, ) - seen_subjects = set() new_subjects = [] - for subject_entry in metadata["subjects"]: - subject_name = subject_entry["subject"].strip().lower() - - if subject_name in seen_subjects: - continue - seen_subjects.add(subject_name) - + for subject_entry in metadata["subjects"]: for correct_subject in correction_subjects.keys(): - if subject_name == correct_subject.lower() and "id" not in subject_entry: + if subject_entry["subject"] == correct_subject and "id" not in subject_entry: subject_entry["id"] = correction_subjects[correct_subject] subject_entry["subject"] = correct_subject new_subjects.append(subject_entry) metadata["subjects"] = new_subjects - - print(metadata["subjects"]) diff --git a/test_subjects.py b/test_subjects.py index 2975ff07..3173154a 100644 --- a/test_subjects.py +++ b/test_subjects.py @@ -51,28 +51,6 @@ def test_subject_changes(self): self.assertEqual(all_corrected(record_id), True, f"Subjects in record {record_id} were not corrected properly") print("Passed test_subject_changes") - def test_subject_case_normalization(self): - # Creates a record whose subjects need case normalization - test_data = copy.deepcopy(malformed_metadata) - record_id = caltechdata_write( - metadata=test_data, - production=False, - publish=True - ) - - all_corrected(record_id) - - record_metadata = get_metadata( - record_id, production=False - ) - for subject_obj in record_metadata.get("subjects", []): - if "subject" in subject_obj and isinstance(subject_obj["subject"], str): - self.assertTrue( - subject_obj["subject"][0].isupper(), - f"Subject '{subject_obj['subject']}' in record {record_id} should start with uppercase" - ) - print("Passed test_subject_case_normalization") - def test_subject_id_present(self): # Creates a record with known subjects that should map to IDs test_data = copy.deepcopy(malformed_metadata) @@ -174,7 +152,7 @@ def test_duplicate_subjects_removed(self): ) - subjects_list = [s["subject"].lower() for s in record_metadata.get("subjects", [])] + subjects_list = [s["subject"] for s in record_metadata.get("subjects", [])] self.assertEqual( len(subjects_list), len(set(subjects_list)), From 06653aa3bdc0015f3a42f7174900ae11461c9ad1 Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Mon, 9 Jun 2025 10:06:08 -0700 Subject: [PATCH 15/18] fixed issues upper and lower case --- ames/matchers/caltechdata.py | 4 ---- test_subjects.py | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py index 73a5c19c..4d7f473d 100644 --- a/ames/matchers/caltechdata.py +++ b/ames/matchers/caltechdata.py @@ -22,10 +22,6 @@ def edit_subject(record, token, correction_subjects, test=True): "Content-type": "application/json", } - data = requests.get(rurl, headers=headers).json() - - json_string = json.dumps(data["metadata"], indent=4) - metadata = get_metadata( record, production=not 
test, diff --git a/test_subjects.py b/test_subjects.py index 3173154a..14e539f8 100644 --- a/test_subjects.py +++ b/test_subjects.py @@ -3,6 +3,8 @@ from run_subject_id_correction import all_corrected from caltechdata_api import caltechdata_write, get_metadata +os.environ["RDMTOK"] = "FVyjwsxBvfNXm5NmmfL8fKGI8hhA6puT9pNJO8PAyrLlNYdeMjfjhBVvuhbs" + headers = { "Authorization": "Bearer %s" % "FVyjwsxBvfNXm5NmmfL8fKGI8hhA6puT9pNJO8PAyrLlNYdeMjfjhBVvuhbs", "Content-type": "application/json", From f421597d97aa3e0ae70130f0d138af2a71827e12 Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Thu, 12 Jun 2025 12:00:10 -0700 Subject: [PATCH 16/18] refactored code --- run_subject_id_correction.py | 27 +++++---------- subjects_to_correct.csv | 4 +++ test_subjects.py | 66 +++++++++++++++++++----------------- 3 files changed, 46 insertions(+), 51 deletions(-) create mode 100644 subjects_to_correct.csv diff --git a/run_subject_id_correction.py b/run_subject_id_correction.py index 8c3a0bfc..eb971153 100644 --- a/run_subject_id_correction.py +++ b/run_subject_id_correction.py @@ -1,32 +1,21 @@ from caltechdata_api import get_metadata from ames.matchers import edit_subject import os +import pandas as pd +df = pd.read_csv("subjects_to_correct.csv") -def all_corrected(record): +subjects_to_correct = dict(zip(df['subject'], df['subject url'])) - subjects_to_correct = { - "Biological sciences": "http://www.oecd.org/science/inno/38235147.pdf?1.6", - "Chemical sciences": "http://www.oecd.org/science/inno/38235147.pdf?1.4", - "Computer and information sciences": "http://www.oecd.org/science/inno/38235147.pdf?1.2", - } +def all_corrected(record, subjects_to_correct = subjects_to_correct): metadata = edit_subject( record, os.environ.get("CALTECH_DATA_API"), subjects_to_correct ) - for i in metadata["subjects"]: - for each_correct_subject in subjects_to_correct.keys(): - if "id" in i.keys(): - if ( - i["subject"] == each_correct_subject - and i["id"] != subjects_to_correct[each_correct_subject] - ): - print(i["subject"], "'s id wasn't added.") - return False - print("Final subjects:", metadata["subjects"]) - print("All subject ids were added") - return True + if metadata: + return True + else: + return False -# all_corrected() diff --git a/subjects_to_correct.csv b/subjects_to_correct.csv new file mode 100644 index 00000000..21abaee6 --- /dev/null +++ b/subjects_to_correct.csv @@ -0,0 +1,4 @@ +subject,subject url +Biological sciences,http://www.oecd.org/science/inno/38235147.pdf?1.6 +Chemical sciences,http://www.oecd.org/science/inno/38235147.pdf?1.4 +Computer and information sciences,http://www.oecd.org/science/inno/38235147.pdf?1.2 diff --git a/test_subjects.py b/test_subjects.py index 14e539f8..88790e18 100644 --- a/test_subjects.py +++ b/test_subjects.py @@ -1,5 +1,6 @@ import unittest import os, copy, time, requests +import pandas as pd from run_subject_id_correction import all_corrected from caltechdata_api import caltechdata_write, get_metadata @@ -10,7 +11,7 @@ "Content-type": "application/json", } -metadata = { +original_metadata = { "titles": [{"title": "enter title"}], "creators": [ { @@ -31,13 +32,31 @@ } # A version of the metadata that is deliberately malformed -malformed_metadata = copy.deepcopy(metadata) +malformed_metadata = copy.deepcopy(original_metadata) malformed_metadata["subjects"] = [ {"subject": " Biological sciences "}, # Extra spaces {"subject": "CHEMICAL SCIENCES"}, # All caps {"subject": "computer and information sciences"}, # Incorrect capitalization ] +df = 
pd.read_csv("subjects_to_correct.csv") + +subjects_to_correct = dict(zip(df['subject'], df['subject url'])) + +def test_change(record_id): + metadata = get_metadata(record_id, production = False) + for i in metadata["subjects"]: + for each_correct_subject in subjects_to_correct.keys(): + if "id" in i.keys(): + if ( + i["subject"] == each_correct_subject + and i["id"] != subjects_to_correct[each_correct_subject] + ): + print(i["subject"], "'s id wasn't added.") + return False + print("Changes made!") + return True + class TestSubjects(unittest.TestCase): @@ -51,6 +70,17 @@ def test_subject_changes(self): ) # Verify correction self.assertEqual(all_corrected(record_id), True, f"Subjects in record {record_id} were not corrected properly") + self.assertEqual(test_change(record_id), True) + print("Passed test_subject_changes") + + #Verify no change was made to original metadata + record_id = caltechdata_write( + metadata=copy.deepcopy(original_metadata), + production=False, + publish=True + ) + self.assertEqual(all_corrected(record_id), True, f"Subjects in original record {record_id} were not edited properly") + self.assertEqual(test_change(record_id), True) print("Passed test_subject_changes") def test_subject_id_present(self): @@ -79,7 +109,7 @@ def test_subject_id_present(self): def test_subject_scheme_consistent(self): # Creates a record with IDs that should link to scheme FOS - test_data = copy.deepcopy(metadata) + test_data = copy.deepcopy(original_metadata) test_data["subjects"] = [ { "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2", @@ -108,7 +138,7 @@ def test_subject_scheme_consistent(self): def test_subject_has_scheme(self): # Creates a record with IDs doesn't have a scheme - test_data = copy.deepcopy(metadata) + test_data = copy.deepcopy(original_metadata) test_data["subjects"] = [ { "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2", @@ -134,34 +164,6 @@ def test_subject_has_scheme(self): ) print("Passed test_subject_has_scheme") - def test_duplicate_subjects_removed(self): - # Create a record with duplicate subjects - test_data = copy.deepcopy(metadata) - test_data["subjects"] = [ - {"subject": "Biological sciences"}, - {"subject": "biological Sciences"}, - ] - record_id = caltechdata_write( - metadata=test_data, - production=False, - publish=True - ) - - all_corrected(record_id) - - record_metadata = get_metadata( - record_id, production=False - ) - - - subjects_list = [s["subject"] for s in record_metadata.get("subjects", [])] - self.assertEqual( - len(subjects_list), - len(set(subjects_list)), - f"Found duplicate subjects in record {record_id}" - ) - print("Passed test_duplicate_subjects_removed") - if __name__ == '__main__': unittest.main() \ No newline at end of file From 14801128a001ed34d21f27b419d65cde2641db33 Mon Sep 17 00:00:00 2001 From: Alexander Abakah Date: Mon, 30 Jun 2025 11:38:36 -0700 Subject: [PATCH 17/18] fixed tokens --- test_subjects.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_subjects.py b/test_subjects.py index 88790e18..d181f149 100644 --- a/test_subjects.py +++ b/test_subjects.py @@ -4,10 +4,10 @@ from run_subject_id_correction import all_corrected from caltechdata_api import caltechdata_write, get_metadata -os.environ["RDMTOK"] = "FVyjwsxBvfNXm5NmmfL8fKGI8hhA6puT9pNJO8PAyrLlNYdeMjfjhBVvuhbs" +token = os.environ["TOKEN"] headers = { - "Authorization": "Bearer %s" % "FVyjwsxBvfNXm5NmmfL8fKGI8hhA6puT9pNJO8PAyrLlNYdeMjfjhBVvuhbs", + "Authorization": "Bearer %s" % token, "Content-type": "application/json", } 
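Between the CSV mapping introduced in PATCH 16 and the environment-sourced token from PATCH 17, the correction pipeline can now be driven without editing any code. The following is a minimal sketch of how the pieces fit together; it assumes subjects_to_correct.csv sits in the working directory, and the record ID "abc12-xyz34" is a hypothetical placeholder, not a record from this series.

    import os
    import pandas as pd
    from ames.matchers import edit_subject

    # Load the subject -> identifier-URL mapping, exactly as PATCH 16 does.
    df = pd.read_csv("subjects_to_correct.csv")
    subjects_to_correct = dict(zip(df["subject"], df["subject url"]))

    # The API token comes from the environment rather than the source (PATCH 17).
    token = os.environ["TOKEN"]

    # Attach missing FOS identifiers on a test-instance record;
    # test=True keeps the edit off production.
    metadata = edit_subject("abc12-xyz34", token, subjects_to_correct, test=True)
    print(metadata["subjects"])
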
From 151c223b84005fb41b103f656c34db2023230ccd Mon Sep 17 00:00:00 2001
From: Alexander Abakah
Date: Mon, 30 Jun 2025 11:49:02 -0700
Subject: [PATCH 18/18] fixed header issues

---
 ames/matchers/caltechdata.py | 5 -----
 test_subjects.py             | 6 ------
 2 files changed, 11 deletions(-)

diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py
index 4d7f473d..5e66a23a 100644
--- a/ames/matchers/caltechdata.py
+++ b/ames/matchers/caltechdata.py
@@ -25,11 +25,6 @@ def edit_subject(record, token, correction_subjects, test=True):
 
     metadata = get_metadata(
         record,
         production=not test,
-        validate=True,
-        emails=False,
-        schema="43",
-        token=False,
-        authors=False,
     )
 
     new_subjects = []
diff --git a/test_subjects.py b/test_subjects.py
index d181f149..645246c7 100644
--- a/test_subjects.py
+++ b/test_subjects.py
@@ -4,12 +4,6 @@
 from run_subject_id_correction import all_corrected
 from caltechdata_api import caltechdata_write, get_metadata
 
-token = os.environ["TOKEN"]
-
-headers = {
-    "Authorization": "Bearer %s" % token,
-    "Content-type": "application/json",
-}
 
 original_metadata = {
     "titles": [{"title": "enter title"}],
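
With the hard-coded headers and token removed in this final patch, both the matcher and the test suite rely on environment configuration. A rough end-to-end usage sketch follows, with the caveat that the CALTECH_DATA_API variable name comes from PATCH 16's run_subject_id_correction.py and the record ID is again a hypothetical placeholder:

    import os
    from run_subject_id_correction import all_corrected

    # run_subject_id_correction loads subjects_to_correct.csv and passes
    # os.environ.get("CALTECH_DATA_API") as the token to edit_subject.
    assert os.environ.get("CALTECH_DATA_API"), "export a test-instance token first"

    # all_corrected returns True when edit_subject hands back corrected metadata.
    if all_corrected("abc12-xyz34"):
        print("Subject IDs attached and record republished.")

The tests themselves run with the stock unittest runner, for example python test_subjects.py, against the data.caltechlibrary.dev test instance.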