From 430e209d0a80186deaf0b5cc5e723f7b4bb26d1a Mon Sep 17 00:00:00 2001
From: jgw4sq <~jgw4sq@virginia.edu>
Date: Fri, 11 Mar 2016 13:20:41 -0500
Subject: [PATCH] New commit for fixing formating of nist harvester tags

---
Review note: format_tags was reworked after the original commit (single
split pass, no list mutation while iterating, whitespace stripped), so the
post-image blob id on the "index" line below is stale.

 scrapi/harvesters/nist.py | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/scrapi/harvesters/nist.py b/scrapi/harvesters/nist.py
index 3192d2ec..defd5676 100644
--- a/scrapi/harvesters/nist.py
+++ b/scrapi/harvesters/nist.py
@@ -6,13 +6,61 @@
 from __future__ import unicode_literals
 
 from scrapi.base import OAIHarvester
+from scrapi.base.helpers import updated_schema
 
 
 class NistHarvester(OAIHarvester):
     short_name = 'nist'
     long_name = 'NIST MaterialsData'
     url = 'https://materialsdata.nist.gov'
-
     base_url = 'https://materialsdata.nist.gov/dspace/oai/request'
     property_list = ['relation', 'rights', 'identifier', 'type', 'date', 'setSpec']
     timezone_granularity = True
+
+    @property
+    def schema(self):
+        """Return ``self._schema`` updated with a ``subjects`` entry.
+
+        The entry pairs the ``//dc:subject/node()`` XPath with
+        ``format_tags``; combining is delegated to ``updated_schema``.
+        """
+        return updated_schema(self._schema, {'subjects': ('//dc:subject/node()', format_tags)})
+
+
+def format_tags(all_tags):
+    """Flatten NIST ``dc:subject`` values into a list of clean tags.
+
+    The fixed repository-category phrases are removed from each raw
+    value, the remainder is split on both ``::`` and ``,``, and every piece
+    is stripped of surrounding whitespace; empty pieces are dropped.
+
+    :param all_tags: iterable of subject strings; a single string is treated
+        as one value, and ``None`` or an empty value yields no tags.
+    :return: list of tag strings in the order they were encountered.
+    """
+    if not all_tags:
+        return []
+    # unicode_literals makes type('') the text type on Python 2 and 3 alike.
+    if isinstance(all_tags, type('')):
+        # A bare string would otherwise be iterated character by character.
+        # NOTE(review): confirm whether the caller ever passes a lone string.
+        all_tags = [all_tags]
+
+    # Longest phrase first so the shorter ones cannot leave fragments of it.
+    boilerplate = (
+        "Computational File Repository Categories",
+        "Computational File Repository",
+        "File Repository Categories",
+    )
+
+    tags = []
+    for tag in all_tags:
+        for phrase in boilerplate:
+            tag = tag.replace(phrase, '')
+        # Treat "::" and "," as one separator so a value containing both is
+        # split completely and every piece is emitted exactly once.
+        for piece in tag.replace("::", ",").split(","):
+            piece = piece.strip()
+            if piece:
+                tags.append(piece)
+    return tags