28 commits
d9257da
Add resume option to OAI-PMH harvests
erinspace Sep 14, 2015
3a56981
Add resume option to biomedcentral
erinspace Sep 15, 2015
9a89778
Add resume is False to test harvesters
erinspace Sep 15, 2015
6c82987
Add resume option to springer
erinspace Sep 15, 2015
b1e18a7
Add resume arg into get records biomed
erinspace Sep 15, 2015
3d198da
Fix scitech harvester with resume
erinspace Sep 15, 2015
892812f
Add break to crossref harvester
erinspace Sep 15, 2015
7526629
Add resume option to plos
erinspace Sep 15, 2015
b1b931a
Add resume to osf harvester
erinspace Sep 15, 2015
319a301
Add resume to harvarddataverse
erinspace Sep 15, 2015
fa5ae2e
Add resume to figshare
erinspace Sep 15, 2015
0cabf79
Add unused resume thing to doepages
erinspace Sep 15, 2015
5eba0fa
Add resume to dataone
erinspace Sep 15, 2015
fb7edbc
Add resume option to ct - only grab first 100 if not
erinspace Sep 15, 2015
e296b63
Add resume to daily ssrn, does nothing though
erinspace Sep 15, 2015
c1b2f65
Add resume option to invoke task
erinspace Sep 15, 2015
4a6a4b3
Add resume option into celery tasks for processing
erinspace Sep 15, 2015
1804c51
Add resume option to test oai harvester
erinspace Sep 15, 2015
98e2f5f
fix task tests for resume option
erinspace Sep 15, 2015
c063c99
Add test for calls with resume is false
erinspace Sep 15, 2015
418e17b
Merge branch 'develop' of github.com:fabianvf/scrapi into feature/no_…
erinspace Sep 21, 2015
c135f08
Add page_limit arg to oai pmh base harvesters
erinspace Sep 21, 2015
14b8028
Add page_limit arg to inv and celery tasks
erinspace Sep 21, 2015
7cf5c57
Update tests with page_limit arg
erinspace Sep 21, 2015
89e1460
Add page_limit arg to test harvesters
erinspace Sep 21, 2015
78dcb72
Add page_limit to non-oai harvesters
erinspace Sep 21, 2015
c7b3cc0
Whitespace flake8 fix
erinspace Sep 21, 2015
f6e8469
Add page limit to the README
erinspace Sep 21, 2015
8 changes: 8 additions & 0 deletions README.md
@@ -214,6 +214,14 @@ Either --start or --end can also be used on their own. Not supplying arguments w

If --end is given with no --start, start will default to the number of days specified in ```settings.DAYS_BACK``` before the given end date.

You can also use the ```page_limit``` (or ```-p```) argument to limit your harvest to a certain number of pages. This is useful for large datasets and for testing locally.

To only harvest 2 pages of data from MIT, run:

```bash
$ invoke harvester mit --page_limit 2
```


### Automated OAI PMH Harvester Creation
Writing a harvester for inclusion with scrAPI? If the provider makes their metadata available using the OAI-PMH standard, then [autooai](https://github.com/erinspace/autooai) is a utility that will do most of the work for you.
24 changes: 15 additions & 9 deletions scrapi/base/__init__.py
@@ -165,7 +165,7 @@ def resolve_property(self, dc, ns0):
ret = dc + ns0
return ret[0] if len(ret) == 1 else ret

def harvest(self, start_date=None, end_date=None):
def harvest(self, start_date=None, end_date=None, page_limit=None):

start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
end_date = (end_date or date.today()).isoformat()
@@ -180,7 +180,7 @@ def harvest(self, start_date=None, end_date=None):
url.args['from'] = start_date
url.args['until'] = end_date

records = self.get_records(url.url, start_date, end_date)
records = self.get_records(url.url, start_date, end_date, page_limit)

rawdoc_list = []
for record in records:
@@ -196,17 +196,23 @@ def harvest(self, start_date=None, end_date=None):

return rawdoc_list

def get_records(self, url, start_date, end_date):
def get_records(self, url, start_date, end_date, page_limit):
url = furl(url)
all_records, token = oai_get_records_and_token(url.url, self.timeout, self.force_request_update, self.namespaces, self.verify)

pages_harvested = 1
while token:
url.remove('from')
url.remove('until')
url.remove('metadataPrefix')
url.args['resumptionToken'] = token[0]
records, token = oai_get_records_and_token(url.url, self.timeout, self.force_request_update, self.namespaces, self.verify)
all_records += records
print("Page limit is {} and pagees harvested is {}".format(page_limit, pages_harvested))
if page_limit and int(page_limit) == int(pages_harvested):
break
else:
url.remove('from')
url.remove('until')
url.remove('metadataPrefix')
url.args['resumptionToken'] = token[0]
records, token = oai_get_records_and_token(url.url, self.timeout, self.force_request_update, self.namespaces, self.verify)
all_records += records
pages_harvested += 1

return all_records

15 changes: 9 additions & 6 deletions scrapi/harvesters/biomedcentral.py
@@ -87,7 +87,7 @@ def schema(self):
)
}

def harvest(self, start_date=None, end_date=None):
def harvest(self, start_date=None, end_date=None, page_limit=None):

start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)

@@ -96,7 +96,7 @@ def harvest(self, start_date=None, end_date=None):
date_number = end_date - start_date

search_url = self.URL.format(date_number.days)
records = self.get_records(search_url)
records = self.get_records(search_url, page_limit)

record_list = []
for record in records:
@@ -115,7 +115,7 @@ def harvest(self, start_date=None, end_date=None):

return record_list

def get_records(self, search_url):
def get_records(self, search_url, page_limit):
now = datetime.now()
records = requests.get(search_url + "#{}".format(date.today()))
page = 1
@@ -132,8 +132,11 @@ def get_records(self, search_url):
continue
all_records.append(record)

page += 1
records = requests.get(search_url + '&page={}#{}'.format(str(page), date.today()), throttle=10)
current_records = len(records.json()['entries'])
if page_limit and int(page_limit) == page:
break
else:
page += 1
records = requests.get(search_url + '&page={}#{}'.format(str(page), date.today()), throttle=10)
current_records = len(records.json()['entries'])

return all_records
4 changes: 3 additions & 1 deletion scrapi/harvesters/clinicaltrials.py
@@ -104,7 +104,7 @@ class ClinicalTrialsHarvester(XMLHarvester):
def namespaces(self):
return None

def harvest(self, start_date=None, end_date=None):
def harvest(self, start_date=None, end_date=None, page_limit=None):
""" First, get a list of all recently updated study urls,
then get the xml one by one and save it into a list
of docs including other information """
@@ -166,6 +166,8 @@ def harvest(self, start_date=None, end_date=None):
official_count += 1
count += 1
if count % 100 == 0:
if page_limit and page_limit == official_count / 100:
break
logger.info("You've requested {} studies, keep going!".format(official_count))
count = 0

5 changes: 4 additions & 1 deletion scrapi/harvesters/crossref.py
@@ -106,7 +106,7 @@ def schema(self):
)
}

def harvest(self, start_date=None, end_date=None):
def harvest(self, start_date=None, end_date=None, page_limit=None):
start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
end_date = end_date or date.today()

@@ -128,4 +128,7 @@ def harvest(self, start_date=None, end_date=None):
'filetype': 'json'
}))

if page_limit and int(page_limit) == i / 1000:
break

return doc_list
2 changes: 1 addition & 1 deletion scrapi/harvesters/dailyssrn.py
@@ -27,7 +27,7 @@ class DailyssrnHarvester(XMLHarvester):
}
}

def harvest(self, start_date=None, end_date=None):
def harvest(self, start_date=None, end_date=None, page_limit=None):

url = 'http://dailyssrn.com/rss/rss-all-2.0.xml'

12 changes: 8 additions & 4 deletions scrapi/harvesters/dataone.py
@@ -139,12 +139,12 @@ class DataOneHarvester(XMLHarvester):
'description': ("str[@name='abstract']/node()", single_result)
}

def harvest(self, start_date=None, end_date=None):
def harvest(self, start_date=None, end_date=None, page_limit=None):

start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
end_date = end_date or date.today()

records = self.get_records(start_date, end_date)
records = self.get_records(start_date, end_date, page_limit)

xml_list = []
for record in records:
@@ -159,7 +159,7 @@ def harvest(self, start_date=None, end_date=None):

return xml_list

def get_records(self, start_date, end_date):
def get_records(self, start_date, end_date, page_limit):
''' helper function to get a response from the DataONE
API, with the specified number of rows.
Returns an etree element with results '''
@@ -183,4 +183,8 @@ def harvest(self, start_date=None, end_date=None):
docs = etree.XML(data.content).xpath('//doc')
for doc in docs:
yield doc
n += 1000

if page_limit and int(page_limit) == n / 1000:
Contributor: This may have different behavior in python3 (division operator semantics changed slightly, IIRC)

Contributor: Also, I would put this check as a condition of the while loop.

Member:

# python3
1 / 2   # 0.5
1 // 2  # 0

# python 2
1 / 2   # 0
1 // 2  # 0

from __future__ import division
1 / 2   # 0.5
1 // 2  # 0

Contributor: thx

break
else:
n += 1000
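
Below is a minimal sketch, not part of the PR itself, of what the two review suggestions above could look like for this DataONE-style pager: the page_limit check becomes part of the while condition, and floor division (`//`) keeps the page arithmetic identical under Python 2 and Python 3. `fetch_page`, `total_records`, and `page_size` are hypothetical stand-ins rather than real scrapi names.

```python
def fetch_page(offset):
    """Hypothetical placeholder: would request and parse one page of results
    starting at `offset` and return them as a list."""
    return []


def get_records(total_records, page_limit=None, page_size=1000):
    n = 0
    all_records = []
    # The page_limit check lives in the while condition, and n // page_size
    # (floor division) counts whole pages the same way in Python 2 and 3.
    while n < total_records and (page_limit is None or n // page_size < int(page_limit)):
        all_records.extend(fetch_page(n))
        n += page_size
    return all_records
```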
2 changes: 1 addition & 1 deletion scrapi/harvesters/doepages.py
@@ -25,7 +25,7 @@ class DoepagesHarvester(XMLHarvester):
'dcq': 'http://purl.org/dc/terms/'
}

def harvest(self, start_date=None, end_date=None):
def harvest(self, start_date=None, end_date=None, page_limit=None):

start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
end_date = end_date or date.today()
13 changes: 8 additions & 5 deletions scrapi/harvesters/figshare.py
@@ -50,7 +50,7 @@ class FigshareHarvester(JSONHarvester):
)
}

def harvest(self, start_date=None, end_date=None):
def harvest(self, start_date=None, end_date=None, page_limit=None):
""" Figshare should always have a 24 hour delay because they
manually go through and check for test projects. Most of them
are removed within 24 hours.
@@ -67,7 +67,7 @@ def harvest(self, start_date=None, end_date=None):
end_date.isoformat()
)

records = self.get_records(search_url)
records = self.get_records(search_url, page_limit)

record_list = []
for record in records:
@@ -86,7 +86,7 @@ def harvest(self, start_date=None, end_date=None):

return record_list

def get_records(self, search_url):
def get_records(self, search_url, page_limit):
records = requests.get(search_url)
total_records = records.json()['items_found']
page = 1
@@ -99,7 +99,10 @@ def get_records(self, search_url):
if len(all_records) < total_records:
all_records.append(record)

page += 1
records = requests.get(search_url + '&page={}'.format(str(page)), throttle=3)
if page_limit and int(page_limit) == page:
break
else:
page += 1
records = requests.get(search_url + '&page={}'.format(str(page)), throttle=3)

return all_records
11 changes: 7 additions & 4 deletions scrapi/harvesters/harvarddataverse.py
@@ -57,7 +57,7 @@ class HarvardDataverseHarvester(JSONHarvester):
)
}

def harvest(self, start_date=None, end_date=None):
def harvest(self, start_date=None, end_date=None, page_limit=None):
start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
end_date = (end_date or date.today()).isoformat()

@@ -69,7 +69,7 @@ def harvest(self, start_date=None, end_date=None):
query.args['order'] = 'asc'
query.args['fq'] = 'dateSort:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date, end_date)

records = self.get_records(query.url)
records = self.get_records(query.url, page_limit)
record_list = []
for record in records:
doc_id = record['global_id']
@@ -87,7 +87,7 @@ def harvest(self, start_date=None, end_date=None):

return record_list

def get_records(self, search_url):
def get_records(self, search_url, page_limit):
Contributor: can you make this into a generator?

records = requests.get(search_url)
total_records = records.json()['data']['total_count']
start = 0
@@ -100,6 +100,9 @@ def get_records(self, search_url):
for record in record_list:
all_records.append(record)

start += self.MAX_ITEMS_PER_REQUEST
if page_limit and int(page_limit) == start / self.MAX_ITEMS_PER_REQUEST:
break
else:
start += self.MAX_ITEMS_PER_REQUEST

return all_records
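
As a follow-up to the "can you make this into a generator?" comment above, here is a rough sketch, not taken from the PR, of a generator-based pager that still honors an optional page_limit: records are yielded as each page arrives instead of being collected into a list. `request_page`, `per_page`, and the `(records, total)` return shape are hypothetical placeholders, not the real Dataverse client code.

```python
def request_page(search_url, start, per_page):
    """Hypothetical placeholder: would call the API and return
    (records_on_this_page, total_record_count)."""
    return [], 0


def iter_records(search_url, page_limit=None, per_page=10):
    start = 0
    pages_fetched = 0
    while True:
        records, total = request_page(search_url, start, per_page)
        for record in records:
            yield record  # hand each record back as soon as its page is parsed
        pages_fetched += 1
        start += per_page
        # Stop once the optional page budget is spent or the result set is exhausted.
        if page_limit is not None and pages_fetched >= int(page_limit):
            break
        if start >= total:
            break
```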
12 changes: 8 additions & 4 deletions scrapi/harvesters/osf.py
@@ -73,14 +73,14 @@ def schema(self):
)
}

def harvest(self, start_date=None, end_date=None):
def harvest(self, start_date=None, end_date=None, page_limit=None):
# Always harvest a 2 day period starting 2 days back to honor time given
# to contributors to cancel a public registration
start_date = start_date or date.today() - timedelta(4)
end_date = end_date or date.today() - timedelta(2)

search_url = self.URL.format(start_date.isoformat(), end_date.isoformat())
records = self.get_records(search_url)
records = self.get_records(search_url, page_limit)

record_list = []
for record in records:
Expand All @@ -99,7 +99,7 @@ def harvest(self, start_date=None, end_date=None):

return record_list

def get_records(self, search_url):
def get_records(self, search_url, page_limit):
records = requests.get(search_url)

total = int(records.json()['counts']['registration'])
@@ -113,6 +113,10 @@ def get_records(self, search_url):
all_records.append(record)

from_arg += 1000
records = requests.get(search_url + '&from={}'.format(str(from_arg)), throttle=10)

if page_limit and int(page_limit) == from_arg / 1000:
Contributor: see above

break
else:
records = requests.get(search_url + '&from={}'.format(str(from_arg)), throttle=10)

return all_records
11 changes: 7 additions & 4 deletions scrapi/harvesters/plos.py
@@ -48,7 +48,7 @@ class PlosHarvester(XMLHarvester):
MAX_ROWS_PER_REQUEST = 999
BASE_URL = 'http://api.plos.org/search'

def fetch_rows(self, start_date, end_date):
def fetch_rows(self, start_date, end_date, page_limit):
query = 'publication_date:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date, end_date)

resp = requests.get(self.BASE_URL, params={
@@ -72,9 +72,12 @@ def fetch_rows(self, start_date, end_date):
for doc in etree.XML(response.content).xpath('//doc'):
yield doc

current_row += self.MAX_ROWS_PER_REQUEST
if page_limit and int(page_limit) == current_row / self.MAX_ROWS_PER_REQUEST:
Contributor: Should also be a condition of the outer loop.

break
else:
current_row += self.MAX_ROWS_PER_REQUEST

def harvest(self, start_date=None, end_date=None):
def harvest(self, start_date=None, end_date=None, page_limit=None):

start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
end_date = end_date or date.today()
@@ -90,7 +93,7 @@ def harvest(self, start_date=None, end_date=None):
'docID': row.xpath("str[@name='id']")[0].text,
})
for row in
self.fetch_rows(start_date.isoformat(), end_date.isoformat())
self.fetch_rows(start_date.isoformat(), end_date.isoformat(), page_limit)
if row.xpath("arr[@name='abstract']")
or row.xpath("str[@name='author_display']")
]
13 changes: 8 additions & 5 deletions scrapi/harvesters/scitech.py
@@ -41,7 +41,7 @@ class SciTechHarvester(XMLHarvester):

schema = DOESCHEMA

def harvest(self, start_date=None, end_date=None):
def harvest(self, start_date=None, end_date=None, page_limit=None):
"""A function for querying the SciTech Connect database for raw XML.
The XML is chunked into smaller pieces, each representing data
about an article/report. If there are multiple pages of results,
@@ -54,10 +54,10 @@ def harvest(self, start_date=None, end_date=None):
'doc': etree.tostring(record),
'docID': six.u(record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]),
})
for record in self._fetch_records(start_date, end_date)
for record in self._fetch_records(start_date, end_date, page_limit)
]

def _fetch_records(self, start_date, end_date):
def _fetch_records(self, start_date, end_date, page_limit):
page = 0
morepages = True

@@ -76,5 +76,8 @@ def _fetch_records(self, start_date, end_date):
for record in xml.xpath('records/record'):
yield record

page += 1
morepages = xml.xpath('//records/@morepages')[0] == 'true'
if page_limit and int(page_limit) == page:
break
else:
page += 1
morepages = xml.xpath('//records/@morepages')[0] == 'true'