CenterForOpenScience · MerlinZhang · Mar 11, 2016 · Mar 11, 2016 · Mar 11, 2016 · Mar 11, 2016
diff --git a/api/webview/migrations/0004_lastharvest.py b/api/webview/migrations/0004_lastharvest.py
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+
+
+class Migration(migrations.Migration):
+    """Create the LastHarvest model: one row per source holding a last_harvest timestamp."""
+    dependencies = [
+        ('webview', '0003_version_status'),  # applied after webview migration 0003_version_status
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='LastHarvest',
+            fields=[
+                ('source', models.TextField(serialize=False, primary_key=True)),  # source name is the primary key
+                ('last_harvest', models.DateTimeField(auto_now=True)),  # auto_now: Django sets this to now on each save()
+            ],
+        ),
+    ]
diff --git a/api/webview/models.py b/api/webview/models.py
@@ -52,3 +52,14 @@ class HarvesterResponse(models.Model):
     headers_str = models.TextField(null=True)
     status_code = models.IntegerField(null=True)
     time_made = models.DateTimeField(auto_now=True)
+
+
+class LastHarvest(models.Model):
+    """Latest harvest timestamp recorded per source (source is the primary key)."""
+    source = models.TextField(primary_key=True)
+    # NOTE(review): auto_now=True makes save() overwrite any assigned value with the current time.
+    last_harvest = models.DateTimeField(auto_now=True)
+
+    def as_json(self):
+        """Return a JSON-serializable dict holding the source and str() of last_harvest."""
+        return {'source': self.source, 'last_harvest': str(self.last_harvest)}
diff --git a/api/webview/serializers.py b/api/webview/serializers.py
@@ -1,6 +1,6 @@
 from rest_framework import serializers
 
-from api.webview.models import Document
+from api.webview.models import Document, LastHarvest
 
 
 class DocumentSerializer(serializers.ModelSerializer):
@@ -10,3 +10,10 @@ class DocumentSerializer(serializers.ModelSerializer):
     class Meta:
         model = Document
         fields = ('key', 'providerUpdatedDateTime', 'source', 'docID', 'raw', 'normalized')
+
+
+class LastHarvestSerializer(serializers.ModelSerializer):
+    """Serialize the source and last_harvest fields of a LastHarvest row."""
+    class Meta:
+        model = LastHarvest
+        fields = ['source', 'last_harvest']
diff --git a/api/webview/urls.py b/api/webview/urls.py
@@ -5,6 +5,7 @@
     url(r'^documents/$', views.DocumentList.as_view()),
     url(r'^get-api-key/$', views.DocumentList.as_view(), name='get-api-key'),
     url(r'^documents/status', views.status, name='status'),
+    url(r'^documents/harvester_status', views.harvester_status, name='harvester_status'),
     url(r'^documents/(?P<source>\w+)/$', views.DocumentsFromSource.as_view(), name='source'),
     url(r'^documents/(?P<source>[a-z]+)/(?P<docID>(.*))/$', views.document_detail, name='document_detail'),
     url(r'^institutions', views.institutions, name='institutions'),

diff --git a/api/webview/views.py b/api/webview/views.py
@@ -9,7 +9,7 @@
 from elasticsearch import Elasticsearch
 
 from scrapi import settings
-from api.webview.models import Document
+from api.webview.models import Document, LastHarvest
 from api.webview.serializers import DocumentSerializer
 
 es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT)
@@ -71,6 +71,21 @@ def status(request):
     return HttpResponse(json.dumps({'status': 'ok'}), content_type='application/json', status=200)
 
 
+@api_view(['GET'])
+@xframe_options_exempt
+def harvester_status(request):
+    """
+    Show the last harvested dates for each harvester.
+
+    Responds with a JSON array of ``{"source": ..., "last_harvest": ...}``
+    objects, one per LastHarvest row; the array is empty when no rows exist.
+    """
+    # QuerySet.all() is lazy and never raises DoesNotExist, so no try/except
+    # is needed here: an empty table simply produces an empty list.
+    harvester_statuses = [obj.as_json() for obj in LastHarvest.objects.all()]
+    return HttpResponse(json.dumps(harvester_statuses), content_type='application/json', status=200)
+
+
 @api_view(['GET', 'POST'])
 def institutions(request):
     if not es:

diff --git a/scrapi/processing/postgres.py b/scrapi/processing/postgres.py
@@ -8,7 +8,9 @@
 
 import django
 
-from api.webview.models import HarvesterResponse, Document, Version
+from dateutil.parser import parse
+
+from api.webview.models import HarvesterResponse, Document, Version, LastHarvest
 
 from scrapi import events
 from scrapi.util import json_without_bytes
@@ -100,6 +102,18 @@ def process_raw(self, raw_doc):
 
         document.save()
 
+    def last_harvested(self, document):
+        """Create or advance the LastHarvest row for document.source; returns None."""
+        document_date = parse(document.providerUpdatedDateTime)
+        try:  # one get() instead of exists() followed by get(): a single query
+            most_recent = LastHarvest.objects.get(source=document.source)
+        except LastHarvest.DoesNotExist:
+            LastHarvest(source=document.source, last_harvest=document_date).save()
+        else:
+            if document_date > most_recent.last_harvest:  # only ever move the date forward
+                most_recent.last_harvest = document_date
+                most_recent.save()  # NOTE(review): auto_now=True may re-stamp this with now()
+
     @events.logged(events.PROCESSING, 'normalized.postgres')
     def process_normalized(self, raw_doc, normalized):
         document = self.version(raw=raw_doc, normalized=normalized)
@@ -112,6 +126,8 @@ def process_normalized(self, raw_doc, normalized):
 
         document.save()
 
+        self.last_harvested(document)
+
     def _get_by_source_id(self, source, docID):
         try:
             return Document.objects.get(key=Document._make_key(source, docID))

diff --git a/scripts/last_harvest.py b/scripts/last_harvest.py
@@ -0,0 +1,29 @@
+# from api.webview.models import Document
+# from api.webview.models import LastHarvest
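+# NOTE: everything in this script is commented out, so it is currently a no-op;
+# the live update logic is the last_harvested method in scrapi/processing/postgres.py.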
+# import logging
+# logger = logging.getLogger(__name__)
+#
+#
+# class LastHarvest():
+#
+#     def update_harvest(self):
+#         sources = Document.objects.values('source').distinct()
+#         print sources
+#         for source in sources:
+#             document = Document.objects.values(source=source).order_by('providerUpdatedDateTime')[0]
+#             document_date = document.providerUpdatedDateTime
+#             if LastHarvest.objects.filter(source=source).exists():
+#                 most_recent = LastHarvest.objects.filter(source=source)
+#                 if document_date > most_recent.last_harvest:
+#                     most_recent.last_harvest = document_date
+#                     most_recent.save()
+#
+#             else:
+#                 new_source = LastHarvest(source=source, date = document_date)
+#                 new_source.save()
+#
+#
+# def main():
+#     LastHarvest.get_date()