Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ames/matchers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from .caltechdata import match_codemeta
from .caltechdata import add_thesis_doi
from .caltechdata import add_usage
from .caltechdata import edit_subject
from .datacite import update_datacite_metadata
from .datacite import update_datacite_media
from .datacite import submit_report
Expand Down
54 changes: 53 additions & 1 deletion ames/matchers/caltechdata.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os, json
from caltechdata_api import caltechdata_edit
from caltechdata_api import caltechdata_edit, get_metadata
from ames import codemeta_to_datacite
from ames.harvesters import get_records
from progressbar import progressbar
Expand All @@ -11,6 +11,58 @@
import requests


def edit_subject(record, token, correction_subjects, test=True):
    """Attach identifiers to a record's known subjects and publish the edit.

    The record's metadata is fetched, and every subject entry whose
    ``subject`` text exactly matches a key of ``correction_subjects`` and
    which has no ``id`` yet gets ``id`` set to the mapped value.  The
    metadata is then submitted through ``caltechdata_edit`` with
    ``publish=True`` and the record is fetched once more.

    Args:
        record: Identifier of the record to edit.
        token: API token handed to ``caltechdata_edit``.
        correction_subjects: Mapping of subject text to the identifier that
            should be stored as that subject's ``id``.
        test: When true (the default) every API call is made with
            ``production=False``; otherwise with ``production=True``.

    Returns:
        The metadata returned by ``get_metadata`` after the edit, read from
        the same instance (test or production) that was edited.
    """
    production = not test

    metadata = get_metadata(
        record,
        production=production,
    )

    # Entries are updated in place, so the list held by ``metadata`` already
    # reflects the corrections.  A record without a "subjects" key is simply
    # left without one instead of raising KeyError.
    subjects = metadata.get("subjects", [])

    for subject_entry in subjects:
        # Only fill in missing identifiers; never overwrite an existing one.
        if "id" in subject_entry:
            continue
        subject_text = subject_entry["subject"]
        if subject_text in correction_subjects:
            subject_entry["id"] = correction_subjects[subject_text]

    print(subjects)

    caltechdata_edit(
        record,
        metadata=metadata,
        token=token,
        production=production,
        publish=True,
    )

    # Re-read from the instance that was just edited.  This was previously
    # hard-coded to production=False, which returned the wrong record's
    # metadata (or failed) whenever test=False.
    record_metadata = get_metadata(
        record,
        production=production,
        validate=True,
        emails=False,
        schema="43",
        token=False,
        authors=False,
    )

    return record_metadata


def match_cd_refs():
token = os.environ["RDMTOK"]

Expand Down
10 changes: 10 additions & 0 deletions codemeta.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@
"name": "Caltech"
},
"@id": "https://orcid.org/0009-0002-2450-6471"
},
{
"@type": "Person",
"givenName": "Alexander",
"familyName": "Abakah",
"affiliation": {
"@type": "Organization",
"name": "Caltech"
},
"@id": "https://orcid.org/0009-0003-5640-6691"
}
],
"developmentStatus": "active",
Expand Down
21 changes: 21 additions & 0 deletions run_subject_id_correction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from caltechdata_api import get_metadata
from ames.matchers import edit_subject
import os
import pandas as pd

# Load the correction table; the relative path is resolved against the
# current working directory when this module is imported.
df = pd.read_csv("subjects_to_correct.csv")

# Map each value of the 'subject' column to the value of 'subject url' on the same row.
subjects_to_correct = dict(zip(df['subject'], df['subject url']))

def all_corrected(record, subjects_to_correct=subjects_to_correct):
    """Run the subject correction on ``record`` and report whether it returned metadata.

    The API token is read from the ``CALTECH_DATA_API`` environment variable
    and ``edit_subject`` is called with its default ``test`` setting.

    Returns:
        ``True`` when ``edit_subject`` returns a truthy value, else ``False``.
    """
    api_token = os.environ.get("CALTECH_DATA_API")
    updated_metadata = edit_subject(record, api_token, subjects_to_correct)
    return bool(updated_metadata)


4 changes: 4 additions & 0 deletions subjects_to_correct.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
subject,subject url
Biological sciences,http://www.oecd.org/science/inno/38235147.pdf?1.6
Chemical sciences,http://www.oecd.org/science/inno/38235147.pdf?1.4
Computer and information sciences,http://www.oecd.org/science/inno/38235147.pdf?1.2
163 changes: 163 additions & 0 deletions test_subjects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import unittest
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move to the tests directory

import os, copy, time, requests
import pandas as pd
from run_subject_id_correction import all_corrected
from caltechdata_api import caltechdata_write, get_metadata


# Baseline record metadata used by the tests; its only subject
# ("Enter Subject") is not a key of the corrections table below.
original_metadata = {
"titles": [{"title": "enter title"}],
"creators": [
{
"familyName": "Abakah",
"givenName": "Alexander",
"nameType": "Personal",
"nameIdentifiers": [
{"nameIdentifier": "0009-0003-5640-6691", "nameIdentifierScheme": "ORCID"}
],
"affiliations": [{"affiliation": "Caltech"}]
}
],
"types": {"resourceType": "Dataset", "resourceTypeGeneral": "Dataset"},
"descriptions": [{"description": "A data set of forest fires", "descriptionType": "Summary"}],
"dates": [{"date": "2023-11-30", "dateType": "Created"}],
"publisher": "Caltech Library",
"subjects": [{"subject": "Enter Subject"}],
}

# A version of the metadata that is deliberately malformed
# (deep-copied so the baseline dict above is never mutated)
malformed_metadata = copy.deepcopy(original_metadata)
malformed_metadata["subjects"] = [
{"subject": " Biological sciences "}, # Extra spaces
{"subject": "CHEMICAL SCIENCES"}, # All caps
{"subject": "computer and information sciences"}, # Incorrect capitalization
]

# Load the correction table; the relative path is resolved against the
# current working directory when this module is imported.
df = pd.read_csv("subjects_to_correct.csv")

# Map each value of the 'subject' column to the value of 'subject url' on the same row.
subjects_to_correct = dict(zip(df['subject'], df['subject url']))

def test_change(record_id):
    """Check that every known subject on a test-instance record has its expected id.

    The record is fetched with ``production=False``.  For each subject whose
    text is a key of ``subjects_to_correct``, the entry's ``id`` must equal
    the mapped value; subjects that are not in the table are ignored.

    Args:
        record_id: Identifier of the record to inspect.

    Returns:
        ``False`` as soon as a known subject has a missing or different id,
        otherwise ``True``.
    """
    # NOTE(review): despite the ``test_`` prefix this is a helper that needs
    # an argument; a runner that collects by name prefix may try to run it.
    metadata = get_metadata(record_id, production=False)
    for subject_entry in metadata["subjects"]:
        subject_text = subject_entry["subject"]
        if subject_text not in subjects_to_correct:
            continue
        # ``.get`` makes a missing "id" count as a failure.  Previously only
        # entries that already had an id were compared, so a subject whose id
        # was never added slipped through as a pass.
        if subject_entry.get("id") != subjects_to_correct[subject_text]:
            print(subject_text, "'s id wasn't added.")
            return False
    print("Changes made!")
    return True


class TestSubjects(unittest.TestCase):
    """Integration tests for the subject-id correction.

    Every test creates and publishes a record with ``production=False``,
    runs ``all_corrected`` on it and inspects the stored subjects.
    """

    def test_subject_changes(self):
        # Creates a test record with malformed subjects
        test_data = copy.deepcopy(malformed_metadata)
        record_id = caltechdata_write(
            metadata=test_data,
            production=False,
            publish=True,
        )
        # Verify correction
        self.assertTrue(
            all_corrected(record_id),
            f"Subjects in record {record_id} were not corrected properly",
        )
        self.assertTrue(test_change(record_id))
        print("Passed test_subject_changes")

        # Verify the correction also succeeds on the original metadata
        record_id = caltechdata_write(
            metadata=copy.deepcopy(original_metadata),
            production=False,
            publish=True,
        )
        self.assertTrue(
            all_corrected(record_id),
            f"Subjects in original record {record_id} were not edited properly",
        )
        self.assertTrue(test_change(record_id))
        print("Passed test_subject_changes (original metadata)")

    def test_subject_id_present(self):
        # Creates a record with known subjects that should map to IDs
        known_subjects = [
            "Biological sciences",
            "Chemical sciences",
            "Computer and information sciences",
        ]
        test_data = copy.deepcopy(malformed_metadata)
        test_data["subjects"] = [{"subject": name} for name in known_subjects]
        record_id = caltechdata_write(
            metadata=test_data,
            production=False,
            publish=True,
        )

        all_corrected(record_id)

        rurl = "https://data.caltechlibrary.dev/api/records/" + record_id
        # No request headers are sent: the previous code passed an undefined
        # ``headers`` name here and failed with NameError before any
        # assertion ran.
        response = requests.get(rurl)
        # Surface HTTP errors directly instead of a confusing KeyError below.
        response.raise_for_status()
        record_metadata = response.json()["metadata"]
        for subject_obj in record_metadata.get("subjects", []):
            if subject_obj["subject"] in known_subjects:
                self.assertIn(
                    "id",
                    subject_obj,
                    f"Subject '{subject_obj['subject']}' in record {record_id} should have an ID",
                )
        print("Passed test_subject_id_present")

    def test_subject_scheme_consistent(self):
        # Creates a record with IDs that should link to scheme FOS
        test_data = copy.deepcopy(original_metadata)
        test_data["subjects"] = [
            {
                "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2",
                "subject": "Computer and information sciences",
                "scheme": "fos",
            }
        ]
        record_id = caltechdata_write(
            metadata=test_data,
            production=False,
            publish=True,
        )

        all_corrected(record_id)

        record_metadata = get_metadata(record_id, production=False)
        for subject_obj in record_metadata.get("subjects", []):
            if "id" in subject_obj:
                self.assertEqual(
                    subject_obj["scheme"],
                    "FOS",
                    f"Subject scheme for '{subject_obj['subject']}' in record {record_id} should be 'FOS'",
                )
        print("Passed test_subject_scheme_consistent")

    def test_subject_has_scheme(self):
        # Creates a record whose subject has an ID but no scheme
        test_data = copy.deepcopy(original_metadata)
        test_data["subjects"] = [
            {
                "id": "http://www.oecd.org/science/inno/38235147.pdf?1.2",
                "subject": "Computer and information sciences",
            }
        ]
        record_id = caltechdata_write(
            metadata=test_data,
            production=False,
            publish=True,
        )

        all_corrected(record_id)

        record_metadata = get_metadata(record_id, production=False)
        for subject_obj in record_metadata.get("subjects", []):
            if "id" in subject_obj:
                self.assertIn(
                    "scheme",
                    subject_obj,
                    f"Subject with ID '{subject_obj['id']}' should have a scheme",
                )
        print("Passed test_subject_has_scheme")

    def test_unmatched_subject_preserved(self):
        # A subject that is not in the corrections table must survive the edit
        record_id = caltechdata_write(
            metadata=copy.deepcopy(original_metadata),
            production=False,
            publish=True,
        )

        all_corrected(record_id)

        record_metadata = get_metadata(record_id, production=False)
        remaining = [
            subject_obj["subject"]
            for subject_obj in record_metadata.get("subjects", [])
        ]
        self.assertIn(
            "Enter Subject",
            remaining,
            f"Subject 'Enter Subject' in record {record_id} should not be deleted",
        )
        print("Passed test_unmatched_subject_preserved")

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do a test with metadata to verify that "Enter Subject" isn't deleted


if __name__ == '__main__':
unittest.main()