Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions api/webview/migrations/0004_lastharvest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from django.db import models, migrations


class Migration(migrations.Migration):
    """Schema migration that creates the ``LastHarvest`` model.

    The model has a text primary key, ``source``, and a ``last_harvest``
    datetime column declared with ``auto_now=True``.
    """

    # Must run after the migration that precedes it in the ``webview`` app.
    dependencies = [('webview', '0003_version_status')]

    operations = [
        migrations.CreateModel(
            name='LastHarvest',
            fields=[
                (
                    'source',
                    models.TextField(primary_key=True, serialize=False),
                ),
                (
                    'last_harvest',
                    models.DateTimeField(auto_now=True),
                ),
            ],
        ),
    ]
11 changes: 11 additions & 0 deletions api/webview/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,14 @@ class HarvesterResponse(models.Model):
headers_str = models.TextField(null=True)
status_code = models.IntegerField(null=True)
time_made = models.DateTimeField(auto_now=True)


class LastHarvest(models.Model):
    """One row per harvester source holding its last-harvest timestamp."""

    # Name of the source; doubles as the primary key, so each source
    # has at most one row.
    source = models.TextField(primary_key=True)

    # ``auto_now=True`` makes Django set this field to the current time
    # whenever the instance is written with ``save()``.
    last_harvest = models.DateTimeField(auto_now=True)

    def as_json(self):
        """Return a plain dict with ``source`` and the stringified ``last_harvest``."""
        harvested_at = str(self.last_harvest)
        return dict(source=self.source, last_harvest=harvested_at)
9 changes: 8 additions & 1 deletion api/webview/serializers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from rest_framework import serializers

from api.webview.models import Document
from api.webview.models import Document, LastHarvest


class DocumentSerializer(serializers.ModelSerializer):
Expand All @@ -10,3 +10,10 @@ class DocumentSerializer(serializers.ModelSerializer):
class Meta:
model = Document
fields = ('key', 'providerUpdatedDateTime', 'source', 'docID', 'raw', 'normalized')


class LastHarvestSerializer(serializers.ModelSerializer):
    """Model serializer exposing the two fields of ``LastHarvest``."""

    class Meta:
        # Serialize LastHarvest rows using exactly these fields.
        model = LastHarvest
        fields = (
            'source',
            'last_harvest',
        )
1 change: 1 addition & 0 deletions api/webview/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
url(r'^documents/$', views.DocumentList.as_view()),
url(r'^get-api-key/$', views.DocumentList.as_view(), name='get-api-key'),
url(r'^documents/status', views.status, name='status'),
url(r'^documents/harvester_status', views.harvester_status, name='harvester_status'),
url(r'^documents/(?P<source>\w+)/$', views.DocumentsFromSource.as_view(), name='source'),
url(r'^documents/(?P<source>[a-z]+)/(?P<docID>(.*))/$', views.document_detail, name='document_detail'),
url(r'^institutions', views.institutions, name='institutions'),
Expand Down
17 changes: 16 additions & 1 deletion api/webview/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from elasticsearch import Elasticsearch

from scrapi import settings
from api.webview.models import Document
from api.webview.models import Document, LastHarvest
from api.webview.serializers import DocumentSerializer

es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT)
Expand Down Expand Up @@ -71,6 +71,21 @@ def status(request):
return HttpResponse(json.dumps({'status': 'ok'}), content_type='application/json', status=200)


@api_view(['GET'])
@xframe_options_exempt
def harvester_status(request):
    """
    Show the last harvested dates for each harvester.

    Responds with HTTP 200 and a JSON array holding the ``as_json()``
    representation of every ``LastHarvest`` row. The array is empty when
    there are no rows.
    """
    # ``objects.all()`` only builds a lazy queryset and never raises
    # ``DoesNotExist``, so the former ``except Document.DoesNotExist``
    # branch (which also named the wrong model) could not be reached.
    harvester_statuses = [obj.as_json() for obj in LastHarvest.objects.all()]
    return HttpResponse(json.dumps(harvester_statuses), content_type='application/json', status=200)


@api_view(['GET', 'POST'])
def institutions(request):
if not es:
Expand Down
18 changes: 17 additions & 1 deletion scrapi/processing/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

import django

from api.webview.models import HarvesterResponse, Document, Version
from dateutil.parser import parse

from api.webview.models import HarvesterResponse, Document, Version, LastHarvest

from scrapi import events
from scrapi.util import json_without_bytes
Expand Down Expand Up @@ -100,6 +102,18 @@ def process_raw(self, raw_doc):

document.save()

def last_harvested(self, document):
    """Record the newest document date seen for the document's source.

    Parses ``document.providerUpdatedDateTime`` and makes sure a
    ``LastHarvest`` row exists for ``document.source``. A missing row is
    created with the parsed date; an existing row is saved again only
    when the parsed date is later than the stored ``last_harvest``.

    :param document: object exposing ``source`` and
        ``providerUpdatedDateTime`` (the latter is handed to
        ``dateutil.parser.parse``).
    """
    document_date = parse(document.providerUpdatedDateTime)

    # One get-or-create call replaces the previous exists()/get()/create
    # sequence: fewer queries, and no window in which another worker can
    # insert the same source between the check and the write.
    most_recent, created = LastHarvest.objects.get_or_create(
        source=document.source,
        defaults={'last_harvest': document_date},
    )

    if not created and document_date > most_recent.last_harvest:
        # NOTE(review): LastHarvest.last_harvest is declared with
        # auto_now=True, which makes Django replace the value with the
        # current time on save() -- confirm whether the document date
        # assigned here is actually meant to be persisted.
        most_recent.last_harvest = document_date
        most_recent.save()

@events.logged(events.PROCESSING, 'normalized.postgres')
def process_normalized(self, raw_doc, normalized):
document = self.version(raw=raw_doc, normalized=normalized)
Expand All @@ -112,6 +126,8 @@ def process_normalized(self, raw_doc, normalized):

document.save()

self.last_harvested(document)

def _get_by_source_id(self, source, docID):
try:
return Document.objects.get(key=Document._make_key(source, docID))
Expand Down
29 changes: 29 additions & 0 deletions scripts/last_harvest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# from api.webview.models import Document

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file needs to be removed from the git repository.

# from api.webview.models import LastHarvest
#
#
# import logging
# logger = logging.getLogger(__name__)
#
#
# class LastHarvest():
#
# def update_harvest(self):
# sources = Document.objects.values('source').distinct()
# print sources
# for source in sources:
# document = Document.objects.values(source=source).order_by('providerUpdatedDateTime')[0]
# document_date = document.providerUpdatedDateTime
# if LastHarvest.objects.filter(source=source).exists():
# most_recent = LastHarvest.objects.filter(source=source)
# if document_date > most_recent.last_harvest:
# most_recent.last_harvest = document_date
# most_recent.save()
#
# else:
# new_source = LastHarvest(source=source, date = document_date)
# new_source.save()
#
#
# def main():
# LastHarvest.get_date()