From bccfadddb7463deb77dd2711a2decb7cbd4e6797 Mon Sep 17 00:00:00 2001 From: MerlinZhang Date: Fri, 11 Mar 2016 16:25:02 -0500 Subject: [PATCH 1/7] [feature/lastharvest] Included LastHarvest model to keep track of most recently harvested metadata for each scrapi source --- api/webview/models.py | 11 +++++++++++ scripts/last_harvest.py | 0 2 files changed, 11 insertions(+) create mode 100644 scripts/last_harvest.py diff --git a/api/webview/models.py b/api/webview/models.py index 3376270f..92d97b16 100644 --- a/api/webview/models.py +++ b/api/webview/models.py @@ -52,3 +52,14 @@ class HarvesterResponse(models.Model): headers_str = models.TextField(null=True) status_code = models.IntegerField(null=True) time_made = models.DateTimeField(auto_now=True) + + +class LastHarvest(models.Model): + + source = models.TextField(primary_key=True) + + last_harvest = models.DateTimeField(auto_now=True) + + def as_json(self): + return dict( + source=self.source, last_harvest=str(self.last_harvest)) \ No newline at end of file diff --git a/scripts/last_harvest.py b/scripts/last_harvest.py new file mode 100644 index 00000000..e69de29b From a060c81af8433852feb80ad35e24a51d1d0186af Mon Sep 17 00:00:00 2001 From: MerlinZhang Date: Fri, 11 Mar 2016 16:25:53 -0500 Subject: [PATCH 2/7] [feature/lastharvest] Added new serializer for LastHarvest model --- api/webview/serializers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/api/webview/serializers.py b/api/webview/serializers.py index 148724e1..c9c6e469 100644 --- a/api/webview/serializers.py +++ b/api/webview/serializers.py @@ -1,6 +1,6 @@ from rest_framework import serializers -from api.webview.models import Document +from api.webview.models import Document, LastHarvest class DocumentSerializer(serializers.ModelSerializer): @@ -10,3 +10,10 @@ class DocumentSerializer(serializers.ModelSerializer): class Meta: model = Document fields = ('key', 'providerUpdatedDateTime', 'source', 'docID', 'raw', 'normalized') + + +class LastHarvestSerializer(serializers.ModelSerializer): + + class Meta: + model = LastHarvest + fields = ('source', 'last_harvest') From 009f32caa8780026b299e731af38020b5a4504ca Mon Sep 17 00:00:00 2001 From: MerlinZhang Date: Fri, 11 Mar 2016 16:27:13 -0500 Subject: [PATCH 3/7] [feature/lastharvest] Added new url to display database information for most recent harvests, sorted by source --- api/webview/urls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/api/webview/urls.py b/api/webview/urls.py index 38f8c658..e7d2a9a9 100644 --- a/api/webview/urls.py +++ b/api/webview/urls.py @@ -5,6 +5,7 @@ url(r'^documents/$', views.DocumentList.as_view()), url(r'^get-api-key/$', views.DocumentList.as_view(), name='get-api-key'), url(r'^documents/status', views.status, name='status'), + url(r'^documents/harvester_status', views.harvester_status, name='harvester_status'), url(r'^documents/(?P\w+)/$', views.DocumentsFromSource.as_view(), name='source'), url(r'^documents/(?P[a-z]+)/(?P(.*))/$', views.document_detail, name='document_detail'), url(r'^institutions', views.institutions, name='institutions'), From 07721accf5210951c18f011701ea46406cad927a Mon Sep 17 00:00:00 2001 From: MerlinZhang Date: Fri, 11 Mar 2016 16:28:06 -0500 Subject: [PATCH 4/7] [feature/lastharvest] Created view to display most recent harvest information as a list of json objects --- api/webview/views.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/api/webview/views.py b/api/webview/views.py index bc077f54..39c0c5bc 100644 --- a/api/webview/views.py +++ b/api/webview/views.py @@ -9,8 +9,8 @@ from elasticsearch import Elasticsearch from scrapi import settings -from api.webview.models import Document -from api.webview.serializers import DocumentSerializer +from api.webview.models import Document, LastHarvest +from api.webview.serializers import DocumentSerializer, LastHarvestSerializer es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT) @@ -71,6 +71,21 @@ def status(request): return HttpResponse(json.dumps({'status': 'ok'}), content_type='application/json', status=200) +@api_view(['GET']) +@xframe_options_exempt +def harvester_status(request): + """ + Show the last harvested dates for each harvester + """ + try: + result_set = LastHarvest.objects.all() + except Document.DoesNotExist: + return Response(status=404) + + harvester_statuses = [obj.as_json() for obj in result_set] + return HttpResponse(json.dumps(harvester_statuses), content_type='application/json', status=200) + + @api_view(['GET', 'POST']) def institutions(request): if not es: From 77654cd7285fbacc2b33d26c1148d327e7d61086 Mon Sep 17 00:00:00 2001 From: MerlinZhang Date: Fri, 11 Mar 2016 16:30:22 -0500 Subject: [PATCH 5/7] [feature/lastharvest] Added method last_harvested() to update LastHarvest database when more recent metadata is harvested Included method call in process_normalized to only store recent updates when the metadata is normalized --- scrapi/processing/postgres.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/scrapi/processing/postgres.py b/scrapi/processing/postgres.py index 865560ef..0681e87e 100644 --- a/scrapi/processing/postgres.py +++ b/scrapi/processing/postgres.py @@ -8,7 +8,9 @@ import django -from api.webview.models import HarvesterResponse, Document, Version +from dateutil.parser import parse + +from api.webview.models import HarvesterResponse, Document, Version, LastHarvest from scrapi import events from scrapi.util import json_without_bytes @@ -100,6 +102,18 @@ def process_raw(self, raw_doc): document.save() + def last_harvested(self, document): + document_date = parse(document.providerUpdatedDateTime) + document_source = document.source + if LastHarvest.objects.filter(source=document_source).exists(): + most_recent = LastHarvest.objects.get(source=document_source) + if document_date > most_recent.last_harvest: + most_recent.last_harvest = document_date + most_recent.save() + else: + new_source = LastHarvest(source=document_source, last_harvest=document_date) + new_source.save() + @events.logged(events.PROCESSING, 'normalized.postgres') def process_normalized(self, raw_doc, normalized): document = self.version(raw=raw_doc, normalized=normalized) @@ -112,6 +126,9 @@ def process_normalized(self, raw_doc, normalized): document.save() + self.last_harvested(document) + + def _get_by_source_id(self, source, docID): try: return Document.objects.get(key=Document._make_key(source, docID)) From bf695defc4da8f9d9fcced3b8613d38a88183f16 Mon Sep 17 00:00:00 2001 From: MerlinZhang Date: Mon, 14 Mar 2016 12:20:37 -0400 Subject: [PATCH 6/7] Fixed pep8 --- api/webview/models.py | 2 +- api/webview/views.py | 2 +- scrapi/processing/postgres.py | 1 - scripts/last_harvest.py | 29 +++++++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/api/webview/models.py b/api/webview/models.py index 92d97b16..99bf9376 100644 --- a/api/webview/models.py +++ b/api/webview/models.py @@ -62,4 +62,4 @@ class LastHarvest(models.Model): def as_json(self): return dict( - source=self.source, last_harvest=str(self.last_harvest)) \ No newline at end of file + source=self.source, last_harvest=str(self.last_harvest)) diff --git a/api/webview/views.py b/api/webview/views.py index 39c0c5bc..66f4e7ec 100644 --- a/api/webview/views.py +++ b/api/webview/views.py @@ -10,7 +10,7 @@ from scrapi import settings from api.webview.models import Document, LastHarvest -from api.webview.serializers import DocumentSerializer, LastHarvestSerializer +from api.webview.serializers import DocumentSerializer es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT) diff --git a/scrapi/processing/postgres.py b/scrapi/processing/postgres.py index 0681e87e..b5ed39bb 100644 --- a/scrapi/processing/postgres.py +++ b/scrapi/processing/postgres.py @@ -128,7 +128,6 @@ def process_normalized(self, raw_doc, normalized): self.last_harvested(document) - def _get_by_source_id(self, source, docID): try: return Document.objects.get(key=Document._make_key(source, docID)) diff --git a/scripts/last_harvest.py b/scripts/last_harvest.py index e69de29b..16477650 100644 --- a/scripts/last_harvest.py +++ b/scripts/last_harvest.py @@ -0,0 +1,29 @@ +# from api.webview.models import Document +# from api.webview.models import LastHarvest +# +# +# import logging +# logger = logging.getLogger(__name__) +# +# +# class LastHarvest(): +# +# def update_harvest(self): +# sources = Document.objects.values('source').distinct() +# print sources +# for source in sources: +# document = Document.objects.values(source=source).order_by('providerUpdatedDateTime')[0] +# document_date = document.providerUpdatedDateTime +# if LastHarvest.objects.filter(source=source).exists(): +# most_recent = LastHarvest.objects.filter(source=source) +# if document_date > most_recent.last_harvest: +# most_recent.last_harvest = document_date +# most_recent.save() +# +# else: +# new_source = LastHarvest(source=source, date = document_date) +# new_source.save() +# +# +# def main(): +# LastHarvest.get_date() From c7357ce5cfc6f8f4b4200f138c07b74b7fb8c6be Mon Sep 17 00:00:00 2001 From: MerlinZhang Date: Tue, 15 Mar 2016 10:46:18 -0400 Subject: [PATCH 7/7] Added migration --- api/webview/migrations/0004_lastharvest.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 api/webview/migrations/0004_lastharvest.py diff --git a/api/webview/migrations/0004_lastharvest.py b/api/webview/migrations/0004_lastharvest.py new file mode 100644 index 00000000..b201a066 --- /dev/null +++ b/api/webview/migrations/0004_lastharvest.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import models, migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('webview', '0003_version_status'), + ] + + operations = [ + migrations.CreateModel( + name='LastHarvest', + fields=[ + ('source', models.TextField(serialize=False, primary_key=True)), + ('last_harvest', models.DateTimeField(auto_now=True)), + ], + ), + ]