diff --git a/api/webview/migrations/0004_lastharvest.py b/api/webview/migrations/0004_lastharvest.py new file mode 100644 index 00000000..b201a066 --- /dev/null +++ b/api/webview/migrations/0004_lastharvest.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import models, migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('webview', '0003_version_status'), + ] + + operations = [ + migrations.CreateModel( + name='LastHarvest', + fields=[ + ('source', models.TextField(serialize=False, primary_key=True)), + ('last_harvest', models.DateTimeField()), + ], + ), + ] diff --git a/api/webview/models.py b/api/webview/models.py index 3376270f..99bf9376 100644 --- a/api/webview/models.py +++ b/api/webview/models.py @@ -52,3 +52,14 @@ class HarvesterResponse(models.Model): headers_str = models.TextField(null=True) status_code = models.IntegerField(null=True) time_made = models.DateTimeField(auto_now=True) + + +class LastHarvest(models.Model): + + source = models.TextField(primary_key=True) + + last_harvest = models.DateTimeField() + + def as_json(self): + return dict( + source=self.source, last_harvest=str(self.last_harvest)) diff --git a/api/webview/serializers.py b/api/webview/serializers.py index 148724e1..c9c6e469 100644 --- a/api/webview/serializers.py +++ b/api/webview/serializers.py @@ -1,6 +1,6 @@ from rest_framework import serializers -from api.webview.models import Document +from api.webview.models import Document, LastHarvest class DocumentSerializer(serializers.ModelSerializer): @@ -10,3 +10,10 @@ class DocumentSerializer(serializers.ModelSerializer): class Meta: model = Document fields = ('key', 'providerUpdatedDateTime', 'source', 'docID', 'raw', 'normalized') + + +class LastHarvestSerializer(serializers.ModelSerializer): + + class Meta: + model = LastHarvest + fields = ('source', 'last_harvest') diff --git a/api/webview/urls.py b/api/webview/urls.py index 
38f8c658..e7d2a9a9 100644 --- a/api/webview/urls.py +++ b/api/webview/urls.py @@ -5,6 +5,7 @@ url(r'^documents/$', views.DocumentList.as_view()), url(r'^get-api-key/$', views.DocumentList.as_view(), name='get-api-key'), url(r'^documents/status', views.status, name='status'), + url(r'^documents/harvester_status', views.harvester_status, name='harvester_status'), url(r'^documents/(?P<source>\w+)/$', views.DocumentsFromSource.as_view(), name='source'), url(r'^documents/(?P<source>[a-z]+)/(?P<docID>(.*))/$', views.document_detail, name='document_detail'), url(r'^institutions', views.institutions, name='institutions'), diff --git a/api/webview/views.py b/api/webview/views.py index bc077f54..66f4e7ec 100644 --- a/api/webview/views.py +++ b/api/webview/views.py @@ -9,7 +9,7 @@ from elasticsearch import Elasticsearch from scrapi import settings -from api.webview.models import Document +from api.webview.models import Document, LastHarvest from api.webview.serializers import DocumentSerializer es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT) @@ -71,6 +71,21 @@ def status(request): return HttpResponse(json.dumps({'status': 'ok'}), content_type='application/json', status=200) +@api_view(['GET']) +@xframe_options_exempt +def harvester_status(request): + """ + Show the last harvested dates for each harvester + """ + try: + result_set = LastHarvest.objects.all() + except LastHarvest.DoesNotExist: + return Response(status=404) + + harvester_statuses = [obj.as_json() for obj in result_set] + return HttpResponse(json.dumps(harvester_statuses), content_type='application/json', status=200) + + @api_view(['GET', 'POST']) def institutions(request): if not es: @@ -9,7 +9,7 @@ import django -from api.webview.models import HarvesterResponse, Document, Version +from dateutil.parser import parse + +from 
api.webview.models import HarvesterResponse, Document, Version, LastHarvest from scrapi import events from scrapi.util import json_without_bytes @@ -100,6 +102,18 @@ def process_raw(self, raw_doc): document.save() + def last_harvested(self, document): + document_date = parse(document.providerUpdatedDateTime) + document_source = document.source + if LastHarvest.objects.filter(source=document_source).exists(): + most_recent = LastHarvest.objects.get(source=document_source) + if document_date > most_recent.last_harvest: + most_recent.last_harvest = document_date + most_recent.save() + else: + new_source = LastHarvest(source=document_source, last_harvest=document_date) + new_source.save() + @events.logged(events.PROCESSING, 'normalized.postgres') def process_normalized(self, raw_doc, normalized): document = self.version(raw=raw_doc, normalized=normalized) @@ -112,6 +126,8 @@ def process_normalized(self, raw_doc, normalized): document.save() + self.last_harvested(document) + def _get_by_source_id(self, source, docID): try: return Document.objects.get(key=Document._make_key(source, docID)) diff --git a/scripts/last_harvest.py b/scripts/last_harvest.py new file mode 100644 index 00000000..16477650 --- /dev/null +++ b/scripts/last_harvest.py @@ -0,0 +1,29 @@ +# from api.webview.models import Document +# from api.webview.models import LastHarvest +# +# +# import logging +# logger = logging.getLogger(__name__) +# +# +# class LastHarvest(): +# +# def update_harvest(self): +# sources = Document.objects.values_list('source', flat=True).distinct() +# print(sources) +# for source in sources: +# document = Document.objects.filter(source=source).order_by('-providerUpdatedDateTime')[0] +# document_date = document.providerUpdatedDateTime +# if LastHarvest.objects.filter(source=source).exists(): +# most_recent = LastHarvest.objects.get(source=source) +# if document_date > most_recent.last_harvest: +# most_recent.last_harvest = document_date +# most_recent.save() +# +# else: +# new_source = 
LastHarvest(source=source, last_harvest=document_date) +# new_source.save() +# +# +# def main(): +# LastHarvest().update_harvest()