caltechlibrary · tmorrell · Jun 30, 2025 · May 12, 2025 · May 12, 2025 · May 13, 2025
diff --git a/ames/matchers/__init__.py b/ames/matchers/__init__.py
@@ -2,6 +2,7 @@
 from .caltechdata import match_codemeta
 from .caltechdata import add_thesis_doi
 from .caltechdata import add_usage
+from .caltechdata import edit_subject
 from .datacite import update_datacite_metadata
 from .datacite import update_datacite_media
 from .datacite import submit_report

diff --git a/ames/matchers/caltechdata.py b/ames/matchers/caltechdata.py
@@ -1,5 +1,5 @@
 import os, json
-from caltechdata_api import caltechdata_edit
+from caltechdata_api import caltechdata_edit, get_metadata
 from ames import codemeta_to_datacite
 from ames.harvesters import get_records
 from progressbar import progressbar
@@ -11,6 +11,58 @@
 import requests
 
 
+def edit_subject(record, token, correction_subjects, test=True):
+    """Attach missing subject IDs to a record's subjects and re-publish it.
+
+    Every subject whose text exactly matches a key of
+    ``correction_subjects`` and that has no ``id`` yet receives the mapped
+    ID; the record is then re-published and fetched again.
+
+    Args:
+        record: Identifier of the record to edit.
+        token: API token passed to ``caltechdata_edit``.
+        correction_subjects: Mapping of subject text to the ID to attach.
+        test: When True (default) work against the test instance,
+            otherwise against production.
+
+    Returns:
+        The record metadata as re-read after the edit.
+    """
+    production = not test
+
+    metadata = get_metadata(record, production=production)
+
+    # A record without subjects has nothing to correct; avoid a KeyError.
+    subjects = metadata.get("subjects", [])
+    for subject_entry in subjects:
+        # Direct dict lookup replaces scanning every correction key.
+        subject_id = correction_subjects.get(subject_entry.get("subject"))
+        if subject_id is not None and "id" not in subject_entry:
+            subject_entry["id"] = subject_id
+
+    print(subjects)
+
+    caltechdata_edit(
+        record,
+        metadata=metadata,
+        token=token,
+        production=production,
+        publish=True,
+    )
+
+    # Read back from the instance that was just edited, so the returned
+    # metadata also reflects the change when test=False.
+    return get_metadata(
+        record,
+        production=production,
+        validate=True,
+        emails=False,
+        schema="43",
+        token=False,
+        authors=False,
+    )
+
+
 def match_cd_refs():
     token = os.environ["RDMTOK"]
 

diff --git a/codemeta.json b/codemeta.json
@@ -39,6 +39,16 @@
                 "name": "Caltech"
             },
             "@id": "https://orcid.org/0009-0002-2450-6471"
+        },
+        {
+            "@type": "Person",
+            "givenName": "Alexander",
+            "familyName": "Abakah",
+            "affiliation": {
+                "@type": "Organization",
+                "name": "Caltech"
+            },
+            "@id": "https://orcid.org/0009-0003-5640-6691"
         }
     ],
     "developmentStatus": "active",

diff --git a/run_subject_id_correction.py b/run_subject_id_correction.py
@@ -0,0 +1,21 @@
+from caltechdata_api import get_metadata
+from ames.matchers import edit_subject
+import os
+import pandas as pd
+
+df = pd.read_csv("subjects_to_correct.csv")
+# Map each subject name to the ID URL it should receive (CSV columns: subject, subject url).
+subjects_to_correct = dict(zip(df['subject'], df['subject url']))
+
+def all_corrected(record, subjects_to_correct=subjects_to_correct):
+    """Run the subject-ID correction on ``record`` and report success.
+
+    The API token comes from the ``CALTECH_DATA_API`` environment variable;
+    the result is True exactly when ``edit_subject`` returns truthy metadata.
+    """
+    api_token = os.environ.get("CALTECH_DATA_API")
+    updated = edit_subject(record, api_token, subjects_to_correct)
+    # Truthiness of the returned metadata is the only success signal.
+    return bool(updated)
+
+
diff --git a/subjects_to_correct.csv b/subjects_to_correct.csv
@@ -0,0 +1,4 @@
+subject,subject url
+Biological sciences,http://www.oecd.org/science/inno/38235147.pdf?1.6
+Chemical sciences,http://www.oecd.org/science/inno/38235147.pdf?1.4
+Computer and information sciences,http://www.oecd.org/science/inno/38235147.pdf?1.2
diff --git a/test_subjects.py b/test_subjects.py
@@ -0,0 +1,163 @@
+import unittest
+import os, copy, time, requests
+import pandas as pd
+from run_subject_id_correction import all_corrected 
+from caltechdata_api import caltechdata_write, get_metadata
+
+
+original_metadata = {  # baseline record; its single subject matches no entry in the CSV
+    "titles": [{"title": "enter title"}],
+    "creators": [
+        {
+            "familyName": "Abakah",
+            "givenName": "Alexander",
+            "nameType": "Personal",
+            "nameIdentifiers": [
+                {"nameIdentifier": "0009-0003-5640-6691", "nameIdentifierScheme": "ORCID"}
+            ],
+            "affiliations": [{"affiliation": "Caltech"}]
+        }
+    ],
+    "types": {"resourceType": "Dataset", "resourceTypeGeneral": "Dataset"},
+    "descriptions": [{"description": "A data set of forest fires", "descriptionType": "Summary"}],
+    "dates": [{"date": "2023-11-30", "dateType": "Created"}],
+    "publisher": "Caltech Library",
+    "subjects": [{"subject": "Enter Subject"}],
+}
+
+# A copy of the baseline whose subjects differ from the CSV names only by spacing or case
+malformed_metadata = copy.deepcopy(original_metadata)
+malformed_metadata["subjects"] = [
+    {"subject": "  Biological sciences  "},  # Extra spaces
+    {"subject": "CHEMICAL SCIENCES"},        # All caps
+    {"subject": "computer and information sciences"},  # All lowercase
+]
+
+df = pd.read_csv("subjects_to_correct.csv")
+# Map each subject name to the ID URL it should receive (CSV columns: subject, subject url).
+subjects_to_correct = dict(zip(df['subject'], df['subject url']))
+
+def test_change(record_id):
+    """Return True when every correctable subject on the record has its ID.
+
+    A subject listed in ``subjects_to_correct`` fails the check when its
+    ``id`` is missing or differs; records come from the test instance."""
+    metadata = get_metadata(record_id, production=False)
+    for entry in metadata.get("subjects", []):
+        expected = subjects_to_correct.get(entry.get("subject"))
+        if expected is not None and entry.get("id") != expected:
+            print(entry["subject"], "'s id wasn't added.")
+            return False
+    print("Changes made!")
+    return True
+
+
+class TestSubjects(unittest.TestCase):
+    """Integration tests for subject-ID correction on the test instance.
+
+    Each test publishes a fresh record, runs ``all_corrected`` on it and
+    then inspects the subjects stored on the record.
+    """
+
+    def _publish(self, metadata):
+        """Publish ``metadata`` as a new test-instance record; return its ID."""
+        return caltechdata_write(
+            metadata=metadata,
+            production=False,
+            publish=True
+        )
+
+    def test_subject_changes(self):
+        """Correction succeeds for malformed and for unrelated subjects."""
+        # Creates a test record with malformed subjects
+        test_data = copy.deepcopy(malformed_metadata)
+        record_id = self._publish(test_data)
+        # Verify correction
+        self.assertTrue(all_corrected(record_id), f"Subjects in record {record_id} were not corrected properly")
+        self.assertTrue(test_change(record_id))
+        print("Passed test_subject_changes")
+
+        # Verify no change was made to original metadata
+        record_id = self._publish(copy.deepcopy(original_metadata))
+        self.assertTrue(all_corrected(record_id), f"Subjects in original record {record_id} were not edited properly")
+        self.assertTrue(test_change(record_id))
+        print("Passed test_subject_changes")
+
+    def test_subject_id_present(self):
+        """Known subjects without an ID get one after correction."""
+        # Creates a record with known subjects that should map to IDs
+        test_data = copy.deepcopy(malformed_metadata)
+        test_data["subjects"] = [
+            {"subject": "Biological sciences"},
+            {"subject": "Chemical sciences"},
+            {"subject": "Computer and information sciences"},
+        ]
+        record_id = self._publish(test_data)
+
+        all_corrected(record_id)
+
+        # Read the record straight from the REST API; the request headers
+        # are built locally and only ask for a JSON response.
+        headers = {"Accept": "application/json"}
+        rurl = "https://data.caltechlibrary.dev/api/records/" + record_id
+        data = requests.get(rurl, headers=headers).json()
+        record_metadata = data["metadata"]
+        for subject_obj in record_metadata.get("subjects", []):
+            if subject_obj["subject"] in ["Biological sciences", "Chemical sciences", "Computer and information sciences"]:
+                self.assertIn("id", subject_obj, f"Subject '{subject_obj['subject']}' in record {record_id} should have an ID")
+        print("Passed test_subject_id_present")
+
+    def test_subject_scheme_consistent(self):
+        """Subjects that carry an ID report the scheme as 'FOS'."""
+        # Creates a record with IDs that should link to scheme FOS
+        test_data = copy.deepcopy(original_metadata)
+        test_data["subjects"] = [
+            {
+                "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2",
+                "subject": "Computer and information sciences",
+                "scheme": "fos"
+            }
+        ]
+        record_id = self._publish(test_data)
+
+        all_corrected(record_id)
+
+        record_metadata = get_metadata(
+            record_id, production=False
+        )
+        for subject_obj in record_metadata.get("subjects", []):
+            if "id" in subject_obj:
+                self.assertEqual(
+                    subject_obj["scheme"], "FOS",
+                    f"Subject scheme for '{subject_obj['subject']}' in record {record_id} should be 'FOS'"
+                )
+        print("Passed test_subject_scheme_consistent")
+
+    def test_subject_has_scheme(self):
+        """Subjects that carry an ID also carry a scheme."""
+        # Creates a record whose subject has an ID but no scheme
+        test_data = copy.deepcopy(original_metadata)
+        test_data["subjects"] = [
+            {
+                "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2",
+                "subject": "Computer and information sciences",
+            }
+        ]
+        record_id = self._publish(test_data)
+
+        all_corrected(record_id)
+
+        record_metadata = get_metadata(
+            record_id, production=False
+        )
+        for subject_obj in record_metadata.get("subjects", []):
+            if "id" in subject_obj:
+                self.assertIn(
+                    "scheme", subject_obj,
+                    f"Subject with ID '{subject_obj['id']}' should have a scheme"
+                )
+        print("Passed test_subject_has_scheme")
+
+
+if __name__ == '__main__':  # run the suite when this file is executed directly
+    unittest.main()