diff --git a/ames/matchers/__init__.py b/ames/matchers/__init__.py
index a5923f56..be300d1b 100644
--- a/ames/matchers/__init__.py
+++ b/ames/matchers/__init__.py
@@ -2,6 +2,7 @@
 from .caltechdata import match_codemeta
 from .caltechdata import add_thesis_doi
 from .caltechdata import add_usage
+from .caltechdata import edit_subject
 from .datacite import update_datacite_metadata
 from .datacite import update_datacite_media
 from .datacite import submit_report
diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py
index 845ee502..5e66a23a 100644
--- a/ames/matchers/caltechdata.py
+++ b/ames/matchers/caltechdata.py
@@ -1,5 +1,5 @@
 import os, json
-from caltechdata_api import caltechdata_edit
+from caltechdata_api import caltechdata_edit, get_metadata
 from ames import codemeta_to_datacite
 from ames.harvesters import get_records
 from progressbar import progressbar
@@ -11,6 +11,58 @@
 import requests
 
 
+def edit_subject(record, token, correction_subjects, test=True):
+    """Add missing subject ids to a CaltechDATA record and republish it.
+
+    Each subject entry whose "subject" text is a key of
+    ``correction_subjects`` and that has no "id" yet gets the mapped value
+    as its "id". The edited metadata is written with ``caltechdata_edit``
+    and the record is then fetched again and returned.
+
+    Args:
+        record: Record identifier passed to the caltechdata_api calls.
+        token: API token forwarded to ``caltechdata_edit``.
+        correction_subjects: Mapping of subject text to subject id.
+        test: If true (default), use the non-production instance.
+
+    Returns:
+        The metadata returned by ``get_metadata`` after the edit.
+    """
+    production = not test
+
+    metadata = get_metadata(record, production=production)
+
+    # Entries are updated in place. A record without a "subjects" key is
+    # passed through unchanged instead of raising KeyError.
+    subjects = metadata.get("subjects", [])
+    for subject_entry in subjects:
+        subject = subject_entry["subject"]
+        if subject in correction_subjects and "id" not in subject_entry:
+            subject_entry["id"] = correction_subjects[subject]
+
+    print(subjects)
+
+    caltechdata_edit(
+        record,
+        metadata=metadata,
+        token=token,
+        production=production,
+        publish=True,
+    )
+
+    # Read back from the same instance that was edited, so the returned
+    # metadata reflects this edit for both test and production runs.
+    return get_metadata(
+        record,
+        production=production,
+        validate=True,
+        emails=False,
+        schema="43",
+        token=False,
+        authors=False,
+    )
+
+
 def match_cd_refs():
     token = os.environ["RDMTOK"]
 
diff --git a/codemeta.json b/codemeta.json
index 7f6c4edd..85481d36 100755
--- a/codemeta.json
+++ b/codemeta.json
@@ -39,6 +39,16 @@
                 "name": "Caltech"
             },
             "@id": "https://orcid.org/0009-0002-2450-6471"
+        },
+        {
+            "@type": "Person",
+            "givenName": "Alexander",
+            "familyName": "Abakah",
+            "affiliation": {
+                "@type": "Organization",
+                "name": "Caltech"
+            },
+            "@id": "https://orcid.org/0009-0003-5640-6691"
         }
     ],
     "developmentStatus": "active",
diff --git a/run_subject_id_correction.py b/run_subject_id_correction.py
new file mode 100644
index 00000000..eb971153
--- /dev/null
+++ b/run_subject_id_correction.py
@@ -0,0 +1,20 @@
+from caltechdata_api import get_metadata
+from ames.matchers import edit_subject
+import os
+import pandas as pd
+
+df = pd.read_csv("subjects_to_correct.csv")
+
+subjects_to_correct = dict(zip(df['subject'], df['subject url']))
+
+
+def all_corrected(record, subjects_to_correct=subjects_to_correct):
+    """Run edit_subject on a record and report whether metadata came back.
+
+    edit_subject is called with its default ``test=True``; the token is
+    read from the CALTECH_DATA_API environment variable.
+    """
+    metadata = edit_subject(
+        record, os.environ.get("CALTECH_DATA_API"), subjects_to_correct
+    )
+    return bool(metadata)
diff --git a/subjects_to_correct.csv b/subjects_to_correct.csv
new file mode 100644
index 00000000..21abaee6
--- /dev/null
+++ b/subjects_to_correct.csv
@@ -0,0 +1,4 @@
+subject,subject url
+Biological sciences,http://www.oecd.org/science/inno/38235147.pdf?1.6
+Chemical sciences,http://www.oecd.org/science/inno/38235147.pdf?1.4
+Computer and information sciences,http://www.oecd.org/science/inno/38235147.pdf?1.2
diff --git a/test_subjects.py b/test_subjects.py
new file mode 100644
index 00000000..645246c7
--- /dev/null
+++ b/test_subjects.py
@@ -0,0 +1,167 @@
+import unittest
+import os, copy, time, requests
+import pandas as pd
+from run_subject_id_correction import all_corrected
+from caltechdata_api import caltechdata_write, get_metadata
+
+
+original_metadata = {
+    "titles": [{"title": "enter title"}],
+    "creators": [
+        {
+            "familyName": "Abakah",
+            "givenName": "Alexander",
+            "nameType": "Personal",
+            "nameIdentifiers": [
+                {"nameIdentifier": "0009-0003-5640-6691", "nameIdentifierScheme": "ORCID"}
+            ],
+            "affiliations": [{"affiliation": "Caltech"}]
+        }
+    ],
+    "types": {"resourceType": "Dataset", "resourceTypeGeneral": "Dataset"},
+    "descriptions": [{"description": "A data set of forest fires", "descriptionType": "Summary"}],
+    "dates": [{"date": "2023-11-30", "dateType": "Created"}],
+    "publisher": "Caltech Library",
+    "subjects": [{"subject": "Enter Subject"}],
+}
+
+# A version of the metadata that is deliberately malformed
+malformed_metadata = copy.deepcopy(original_metadata)
+malformed_metadata["subjects"] = [
+    {"subject": " Biological sciences "},  # Extra spaces
+    {"subject": "CHEMICAL SCIENCES"},  # All caps
+    {"subject": "computer and information sciences"},  # Incorrect capitalization
+]
+
+df = pd.read_csv("subjects_to_correct.csv")
+
+subjects_to_correct = dict(zip(df['subject'], df['subject url']))
+
+
+def test_change(record_id):
+    """Return False if a known subject of the record carries an unexpected id."""
+    metadata = get_metadata(record_id, production=False)
+    for entry in metadata["subjects"]:
+        if "id" not in entry:
+            continue
+        name = entry["subject"]
+        if name in subjects_to_correct and entry["id"] != subjects_to_correct[name]:
+            print(name, "'s id wasn't added.")
+            return False
+    print("Changes made!")
+    return True
+
+
+class TestSubjects(unittest.TestCase):
+    """Subject id correction tests; each one writes a record with production=False."""
+
+    def test_subject_changes(self):
+        # Creates a test record with malformed subjects
+        test_data = copy.deepcopy(malformed_metadata)
+        record_id = caltechdata_write(
+            metadata=test_data,
+            production=False,
+            publish=True
+        )
+        # Verify correction
+        self.assertEqual(all_corrected(record_id), True, f"Subjects in record {record_id} were not corrected properly")
+        self.assertEqual(test_change(record_id), True)
+        print("Passed test_subject_changes")
+
+        # Verify no change was made to original metadata
+        record_id = caltechdata_write(
+            metadata=copy.deepcopy(original_metadata),
+            production=False,
+            publish=True
+        )
+        self.assertEqual(all_corrected(record_id), True, f"Subjects in original record {record_id} were not edited properly")
+        self.assertEqual(test_change(record_id), True)
+        print("Passed test_subject_changes")
+
+    def test_subject_id_present(self):
+        # Creates a record with known subjects that should map to IDs
+        test_data = copy.deepcopy(malformed_metadata)
+        test_data["subjects"] = [
+            {"subject": "Biological sciences"},
+            {"subject": "Chemical sciences"},
+            {"subject": "Computer and information sciences"},
+        ]
+        record_id = caltechdata_write(
+            metadata=test_data,
+            production=False,
+            publish=True
+        )
+
+        all_corrected(record_id)
+
+        rurl = "https://data.caltechlibrary.dev/api/records/" + record_id
+        # Request headers are defined here because this module has no
+        # module-level `headers`; the record is read back as JSON.
+        headers = {"accept": "application/json"}
+        data = requests.get(rurl, headers=headers).json()
+        record_metadata = data["metadata"]
+        for subject_obj in record_metadata.get("subjects", []):
+            if subject_obj["subject"] in ["Biological sciences", "Chemical sciences", "Computer and information sciences"]:
+                self.assertIn("id", subject_obj, f"Subject '{subject_obj['subject']}' in record {record_id} should have an ID")
+        print("Passed test_subject_id_present")
+
+    def test_subject_scheme_consistent(self):
+        # Creates a record with IDs that should link to scheme FOS
+        test_data = copy.deepcopy(original_metadata)
+        test_data["subjects"] = [
+            {
+                "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2",
+                "subject": "Computer and information sciences",
+                "scheme": "fos"
+            }
+        ]
+        record_id = caltechdata_write(
+            metadata=test_data,
+            production=False,
+            publish=True
+        )
+
+        all_corrected(record_id)
+
+        record_metadata = get_metadata(
+            record_id, production=False
+        )
+        for subject_obj in record_metadata.get("subjects", []):
+            if "id" in subject_obj:
+                self.assertEqual(
+                    subject_obj["scheme"], "FOS",
+                    f"Subject scheme for '{subject_obj['subject']}' in record {record_id} should be 'FOS'"
+                )
+        print("Passed test_subject_scheme_consistent")
+
+    def test_subject_has_scheme(self):
+        # Creates a record whose subject has an ID but no scheme
+        test_data = copy.deepcopy(original_metadata)
+        test_data["subjects"] = [
+            {
+                "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2",
+                "subject": "Computer and information sciences",
+            }
+        ]
+        record_id = caltechdata_write(
+            metadata=test_data,
+            production=False,
+            publish=True
+        )
+
+        all_corrected(record_id)
+
+        record_metadata = get_metadata(
+            record_id, production=False
+        )
+        for subject_obj in record_metadata.get("subjects", []):
+            if "id" in subject_obj:
+                self.assertIn(
+                    "scheme", subject_obj,
+                    f"Subject with ID '{subject_obj['id']}' should have a scheme"
+                )
+        print("Passed test_subject_has_scheme")
+
+
+if __name__ == '__main__':
+    unittest.main()