From 3c81805b312a1e85c87f4140fcb14810c0bfe364 Mon Sep 17 00:00:00 2001 From: erinspace Date: Mon, 30 Nov 2015 17:40:34 -0500 Subject: [PATCH 1/8] Add AutoOAI harvester that automates much of the oai harvester process --- scrapi/base/__init__.py | 217 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 213 insertions(+), 4 deletions(-) diff --git a/scrapi/base/__init__.py b/scrapi/base/__init__.py index a79a0e89..c1d7858b 100644 --- a/scrapi/base/__init__.py +++ b/scrapi/base/__init__.py @@ -10,6 +10,7 @@ from furl import furl from lxml import etree +from scrapi import requests from scrapi import registry from scrapi import settings from scrapi.base.schemas import OAISCHEMA @@ -108,6 +109,206 @@ def normalize(self, raw_doc): return NormalizedDocument(transformed, clean=True) +class AutoOAIHarvester(XMLHarvester): + """ Take a given URL and approved sets, and harvest everything that repo + has on the sets from all available + """ + + _identify_element = None + _timezone_granularity = None + _metadata_prefixes = None + _property_list = None + _record_encoding = None + + timeout = 0.5 + approved_sets = None + timezone_granularity = False + property_list = ['date', 'type'] + force_request_update = False + verify = True + all_namespaces = {} + + def namespaces(self, element): + namespaces = element.nsmap + + for key, value in namespaces.items(): + if not key: + namespaces['ns0'] = value + namespaces.pop(None) + self.all_namespaces.update(namespaces) + print('UPDATING NSPS WITH {}'.format(namespaces)) + return namespaces + + @property + def identify_element(self): + if self._identify_element: + return self._identify_element + url = furl(self.base_url) + url.args['verb'] = 'Identify' + self._identify_element = etree.XML(requests.get(url.url).content) + + return self._identify_element + + @property + def metadata_prefixes(self): + if self._metadata_prefixes: + return self._metadata_prefixes + url = furl(self.base_url) + url.args['verb'] = 'ListMetadataFormats' + 
xml_content = etree.XML(requests.get(url.url).content) + namespaces = self.namespaces(xml_content) + self._metadata_prefixes = xml_content.xpath('//ns0:metadataPrefix/node()', namespaces=namespaces) + + return self._metadata_prefixes + + @property + def record_encoding(self): + if self._record_encoding: + return self._record_encoding + url = furl(self.base_url) + url.args['verb'] = 'Identify' + + self._record_encoding = requests.get(url.url).encoding + return self._record_encoding + + @property + def long_name(self): + namespaces = self.namespaces(self.identify_element) + return self.identify_element.xpath('//ns0:repositoryName/node()', namespaces=namespaces) + + @property + def timezone_granularity(self): + if self._timezone_granularity: + return self._timezone_granularity + namespaces = self.namespaces(self.identify_element) + granularity = self.identify_element.xpath('//ns0:granularity/node()', namespaces=namespaces) + + if 'hh:mm:ss' in granularity: + return True + else: + return False + + @property + def schema(self): + return self._schema + + @property + def _schema(self): + return updated_schema(OAISCHEMA, self.formatted_properties) + + @property + def formatted_properties(self): + return { + 'otherProperties': build_properties( + *list( + map( + self.format_property, + self.property_list + ) + ) + ) + } + + def format_property(self, property): + if property == 'date': + null_on_error(datetime_formatter) + fn = compose(lambda x: list( + map( + null_on_error(datetime_formatter), + x + ) + ), coerce_to_list, self.resolve_property) + else: + fn = self.resolve_property + inner_tuple = ['//{}:{}/node()'.format(namespace, property) for namespace in self.all_namespaces] + inner_tuple.append(fn) + return (property, tuple(inner_tuple)) + + def resolve_property(self, *args): + ret = [item for sublist in args for item in sublist] + return ret[0] if len(ret) == 1 else ret + + def get_identifiers(self, identifiers_url): + identifier_content = 
requests.get(identifiers_url).content + identifiers = etree.XML(identifier_content) + return identifiers.xpath('//ns0:identifier/node()', namespaces=self.namespaces(identifiers)) + + def get_record(self, record_url): + record_content = requests.get(record_url, throttle=0.5).content + record_xml = etree.XML(record_content) + + # make sure we add all of the namespaces to all_namespaces + metadata = record_xml.xpath('//ns0:metadata', namespaces=self.namespaces(record_xml)) + for child in metadata[0].getchildren(): + self.namespaces(child) + self.namespaces(record_xml) + + return record_xml + + def harvest(self, start_date=None, end_date=None): + start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat() + end_date = (end_date or date.today()).isoformat() + + url = furl(self.base_url) + + if self.timezone_granularity: + start_date += 'T00:00:00Z' + end_date += 'T00:00:00Z' + + records = [] + # Get a list of all identifiers for each metadata prefix given the date range + for prefix in self.metadata_prefixes: + print('checking out the prefix {}'.format(prefix)) + url.args['verb'] = 'ListIdentifiers' + url.args['metadataPrefix'] = prefix + url.args['from'] = start_date + url.args['until'] = end_date + prefix_ids = self.get_identifiers(url.url) + + url.args.pop('from') + url.args.pop('until') + # request each of those identifiers records for that prefix + for identifier in prefix_ids: + url.args['verb'] = 'GetRecord' + url.args['identifier'] = identifier + + records.append(self.get_record(url.url)) + + # For testing only! 
+ if len(records) % 3 == 0: + print('Collected {} records...'.format(len(records))) + return [ + RawDocument({ + 'doc': etree.tostring(record, encoding=self.record_encoding), + 'source': self.short_name, + 'docID': record.xpath('//ns0:header/ns0:identifier', namespaces=self.namespaces(record))[0].text, + 'filetype': 'xml' + }) for record in records + ] + + def normalize(self, raw_doc): + str_result = raw_doc.get('doc') + result = etree.XML(str_result) + + if self.approved_sets: + set_spec = result.xpath( + '//ns0:header/ns0:setSpec/node()', + namespaces=self.namespaces + ) + # check if there's an intersection between the approved sets and the + # setSpec list provided in the record. If there isn't, don't normalize. + if not {x.replace('publication:', '') for x in set_spec}.intersection(self.approved_sets): + logger.info('Series {} not in approved list'.format(set_spec)) + return None + + status = result.xpath('//ns0:header/@status', namespaces=self.namespaces(result)) + if status and status[0] == 'deleted': + logger.info('Deleted record, not normalizing {}'.format(raw_doc['docID'])) + return None + + return super(AutoOAIHarvester, self).normalize(raw_doc) + + class OAIHarvester(XMLHarvester): """ Create a harvester with a oai_dc namespace, that will harvest documents within a certain date range @@ -150,12 +351,21 @@ def _schema(self): @property def formatted_properties(self): return { - 'otherProperties': build_properties(*list(map(self.format_property, self.property_list))) - } + 'otherProperties': build_properties( + *list( + map( + self.format_property, + self.property_list)))} def format_property(self, property): if property == 'date': - fn = compose(lambda x: list(map(null_on_error(datetime_formatter), x)), coerce_to_list, self.resolve_property) + force_date = null_on_error(datetime_formatter) + fn = compose(lambda x: list( + map( + null_on_error(datetime_formatter), + x + ) + ), coerce_to_list, self.resolve_property) else: fn = self.resolve_property return 
(property, ( @@ -169,7 +379,6 @@ def resolve_property(self, dc, ns0): return ret[0] if len(ret) == 1 else ret def harvest(self, start_date=None, end_date=None): - start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat() end_date = (end_date or date.today()).isoformat() From 6874668ccfe448bb93e27404ca5ff1648073b890 Mon Sep 17 00:00:00 2001 From: erinspace Date: Tue, 1 Dec 2015 11:48:00 -0500 Subject: [PATCH 2/8] Update searching for record encoding when there is none --- scrapi/base/__init__.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/scrapi/base/__init__.py b/scrapi/base/__init__.py index c1d7858b..2a50792b 100644 --- a/scrapi/base/__init__.py +++ b/scrapi/base/__init__.py @@ -118,7 +118,7 @@ class AutoOAIHarvester(XMLHarvester): _timezone_granularity = None _metadata_prefixes = None _property_list = None - _record_encoding = None + _record_encoding = 'utf8' timeout = 0.5 approved_sets = None @@ -136,7 +136,6 @@ def namespaces(self, element): namespaces['ns0'] = value namespaces.pop(None) self.all_namespaces.update(namespaces) - print('UPDATING NSPS WITH {}'.format(namespaces)) return namespaces @property @@ -163,12 +162,11 @@ def metadata_prefixes(self): @property def record_encoding(self): - if self._record_encoding: - return self._record_encoding url = furl(self.base_url) url.args['verb'] = 'Identify' - self._record_encoding = requests.get(url.url).encoding + if requests.get(url.url).encoding != 'None': + self._record_encoding = requests.get(url.url).encoding return self._record_encoding @property @@ -274,17 +272,17 @@ def harvest(self, start_date=None, end_date=None): records.append(self.get_record(url.url)) - # For testing only! 
- if len(records) % 3 == 0: - print('Collected {} records...'.format(len(records))) - return [ - RawDocument({ - 'doc': etree.tostring(record, encoding=self.record_encoding), - 'source': self.short_name, - 'docID': record.xpath('//ns0:header/ns0:identifier', namespaces=self.namespaces(record))[0].text, - 'filetype': 'xml' - }) for record in records - ] + try: + return [ + RawDocument({ + 'doc': etree.tostring(record, encoding=self.record_encoding), + 'source': self.short_name, + 'docID': record.xpath('//ns0:header/ns0:identifier', namespaces=self.namespaces(record))[0].text, + 'filetype': 'xml' + }) for record in records + ] + except Exception: + import ipdb; ipdb.set_trace() def normalize(self, raw_doc): str_result = raw_doc.get('doc') From 6e8c4d289a889193b8415bf9151b004c141a1f57 Mon Sep 17 00:00:00 2001 From: erinspace Date: Tue, 1 Dec 2015 11:48:29 -0500 Subject: [PATCH 3/8] Use the amassed all_namespaces dict when transforming string --- scrapi/base/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapi/base/transformer.py b/scrapi/base/transformer.py index 5065de71..e68481c4 100644 --- a/scrapi/base/transformer.py +++ b/scrapi/base/transformer.py @@ -92,7 +92,7 @@ class XMLTransformer(BaseTransformer): namespaces = {} def _transform_string(self, string, doc): - return doc.xpath(string, namespaces=self.namespaces) + return doc.xpath(string, namespaces=self.all_namespaces) @six.add_metaclass(abc.ABCMeta) From 74a759b4c1e45f4af5a4298b371aaf74ead2bd6b Mon Sep 17 00:00:00 2001 From: erinspace Date: Tue, 1 Dec 2015 16:48:03 -0500 Subject: [PATCH 4/8] Fix record encoding and getting the rest of the record --- scrapi/base/__init__.py | 45 +++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/scrapi/base/__init__.py b/scrapi/base/__init__.py index 2a50792b..c0ecdc4a 100644 --- a/scrapi/base/__init__.py +++ b/scrapi/base/__init__.py @@ -118,15 +118,16 @@ class 
AutoOAIHarvester(XMLHarvester): _timezone_granularity = None _metadata_prefixes = None _property_list = None - _record_encoding = 'utf8' + _record_encoding = None timeout = 0.5 + verify = True + all_namespaces = {} approved_sets = None + default_encoding = 'utf8' timezone_granularity = False - property_list = ['date', 'type'] force_request_update = False - verify = True - all_namespaces = {} + property_list = ['date', 'type'] def namespaces(self, element): namespaces = element.nsmap @@ -162,11 +163,17 @@ def metadata_prefixes(self): @property def record_encoding(self): + if self._record_encoding: + return self._record_encoding url = furl(self.base_url) url.args['verb'] = 'Identify' - if requests.get(url.url).encoding != 'None': + encoding = requests.get(url.url).encoding + + if encoding != 'None': self._record_encoding = requests.get(url.url).encoding + else: + self._record_encoding = self.default_encoding return self._record_encoding @property @@ -237,8 +244,9 @@ def get_record(self, record_url): # make sure we add all of the namespaces to all_namespaces metadata = record_xml.xpath('//ns0:metadata', namespaces=self.namespaces(record_xml)) - for child in metadata[0].getchildren(): - self.namespaces(child) + if metadata: + for child in metadata[0].getchildren(): + self.namespaces(child) self.namespaces(record_xml) return record_xml @@ -256,7 +264,8 @@ def harvest(self, start_date=None, end_date=None): records = [] # Get a list of all identifiers for each metadata prefix given the date range for prefix in self.metadata_prefixes: - print('checking out the prefix {}'.format(prefix)) + if url.args.get('identifier'): + url.args.pop('identifier') url.args['verb'] = 'ListIdentifiers' url.args['metadataPrefix'] = prefix url.args['from'] = start_date @@ -269,20 +278,16 @@ def harvest(self, start_date=None, end_date=None): for identifier in prefix_ids: url.args['verb'] = 'GetRecord' url.args['identifier'] = identifier - records.append(self.get_record(url.url)) - try: - return 
[ - RawDocument({ - 'doc': etree.tostring(record, encoding=self.record_encoding), - 'source': self.short_name, - 'docID': record.xpath('//ns0:header/ns0:identifier', namespaces=self.namespaces(record))[0].text, - 'filetype': 'xml' - }) for record in records - ] - except Exception: - import ipdb; ipdb.set_trace() + return [ + RawDocument({ + 'doc': etree.tostring(record, encoding=self.record_encoding), + 'source': self.short_name, + 'docID': record.xpath('//ns0:header/ns0:identifier', namespaces=self.namespaces(record))[0].text, + 'filetype': 'xml' + }) for record in records + ] def normalize(self, raw_doc): str_result = raw_doc.get('doc') From c1ac2d4c5c9b89d4d718ab87009d84869cd995f5 Mon Sep 17 00:00:00 2001 From: erinspace Date: Tue, 1 Dec 2015 16:49:22 -0500 Subject: [PATCH 5/8] Add logging for no found metadata element --- scrapi/base/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scrapi/base/__init__.py b/scrapi/base/__init__.py index c0ecdc4a..0cfeec81 100644 --- a/scrapi/base/__init__.py +++ b/scrapi/base/__init__.py @@ -247,6 +247,8 @@ def get_record(self, record_url): if metadata: for child in metadata[0].getchildren(): self.namespaces(child) + else: + logger.info('No metadata element found, was this a proper request?') self.namespaces(record_xml) return record_xml From b4c54ae652b76c192436357799fc3b8df77fc2b9 Mon Sep 17 00:00:00 2001 From: erinspace Date: Tue, 1 Dec 2015 16:49:46 -0500 Subject: [PATCH 6/8] Add all_namespaces as a default for transformer --- scrapi/base/transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrapi/base/transformer.py b/scrapi/base/transformer.py index e68481c4..eea34505 100644 --- a/scrapi/base/transformer.py +++ b/scrapi/base/transformer.py @@ -90,6 +90,7 @@ def _transform_kwargs(self, t, doc): class XMLTransformer(BaseTransformer): namespaces = {} + all_namespaces = {} def _transform_string(self, string, doc): return doc.xpath(string, namespaces=self.all_namespaces) From 
e4df0c61435a5fdb51f9851d7920cd2a0eab3cc3 Mon Sep 17 00:00:00 2001 From: erinspace Date: Tue, 1 Dec 2015 16:56:38 -0500 Subject: [PATCH 7/8] Remove unused variable --- scrapi/base/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapi/base/__init__.py b/scrapi/base/__init__.py index 0cfeec81..55b8a5fb 100644 --- a/scrapi/base/__init__.py +++ b/scrapi/base/__init__.py @@ -364,7 +364,7 @@ def formatted_properties(self): def format_property(self, property): if property == 'date': - force_date = null_on_error(datetime_formatter) + null_on_error(datetime_formatter) fn = compose(lambda x: list( map( null_on_error(datetime_formatter), From 386b3ec167928431f7c2b8e7e4b10649db6e5bb8 Mon Sep 17 00:00:00 2001 From: erinspace Date: Wed, 2 Dec 2015 08:56:24 -0500 Subject: [PATCH 8/8] Update string transformer looking for different harvesters --- scrapi/base/transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scrapi/base/transformer.py b/scrapi/base/transformer.py index eea34505..5de38f39 100644 --- a/scrapi/base/transformer.py +++ b/scrapi/base/transformer.py @@ -93,7 +93,9 @@ class XMLTransformer(BaseTransformer): all_namespaces = {} def _transform_string(self, string, doc): - return doc.xpath(string, namespaces=self.all_namespaces) + if self.all_namespaces: + return doc.xpath(string, namespaces=self.all_namespaces) + return doc.xpath(string, namespaces=self.namespaces) @six.add_metaclass(abc.ABCMeta)