-
Notifications
You must be signed in to change notification settings - Fork 47
[WIP] Add "page_limit" argument to harvesters #368
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
d9257da
3a56981
9a89778
6c82987
b1e18a7
3d198da
892812f
7526629
b1b931a
319a301
fa5ae2e
0cabf79
5eba0fa
fb7edbc
e296b63
c1b2f65
4a6a4b3
1804c51
98e2f5f
c063c99
418e17b
c135f08
14b8028
7cf5c57
89e1460
78dcb72
c7b3cc0
f6e8469
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -57,7 +57,7 @@ class HarvardDataverseHarvester(JSONHarvester): | |
| ) | ||
| } | ||
|
|
||
| def harvest(self, start_date=None, end_date=None): | ||
| def harvest(self, start_date=None, end_date=None, page_limit=None): | ||
| start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat() | ||
| end_date = (end_date or date.today()).isoformat() | ||
|
|
||
|
|
@@ -69,7 +69,7 @@ def harvest(self, start_date=None, end_date=None): | |
| query.args['order'] = 'asc' | ||
| query.args['fq'] = 'dateSort:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date, end_date) | ||
|
|
||
| records = self.get_records(query.url) | ||
| records = self.get_records(query.url, page_limit) | ||
| record_list = [] | ||
| for record in records: | ||
| doc_id = record['global_id'] | ||
|
|
@@ -87,7 +87,7 @@ def harvest(self, start_date=None, end_date=None): | |
|
|
||
| return record_list | ||
|
|
||
| def get_records(self, search_url): | ||
| def get_records(self, search_url, page_limit): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you make this into a generator? |
||
| records = requests.get(search_url) | ||
| total_records = records.json()['data']['total_count'] | ||
| start = 0 | ||
|
|
@@ -100,6 +100,9 @@ def get_records(self, search_url): | |
| for record in record_list: | ||
| all_records.append(record) | ||
|
|
||
| start += self.MAX_ITEMS_PER_REQUEST | ||
| if page_limit and int(page_limit) == start / self.MAX_ITEMS_PER_REQUEST: | ||
| break | ||
| else: | ||
| start += self.MAX_ITEMS_PER_REQUEST | ||
|
|
||
| return all_records | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -73,14 +73,14 @@ def schema(self): | |
| ) | ||
| } | ||
|
|
||
| def harvest(self, start_date=None, end_date=None): | ||
| def harvest(self, start_date=None, end_date=None, page_limit=None): | ||
| # Always harvest a 2 day period starting 2 days back to honor time given | ||
| # to contributors to cancel a public registration | ||
| start_date = start_date or date.today() - timedelta(4) | ||
| end_date = end_date or date.today() - timedelta(2) | ||
|
|
||
| search_url = self.URL.format(start_date.isoformat(), end_date.isoformat()) | ||
| records = self.get_records(search_url) | ||
| records = self.get_records(search_url, page_limit) | ||
|
|
||
| record_list = [] | ||
| for record in records: | ||
|
|
@@ -99,7 +99,7 @@ def harvest(self, start_date=None, end_date=None): | |
|
|
||
| return record_list | ||
|
|
||
| def get_records(self, search_url): | ||
| def get_records(self, search_url, page_limit): | ||
| records = requests.get(search_url) | ||
|
|
||
| total = int(records.json()['counts']['registration']) | ||
|
|
@@ -113,6 +113,10 @@ def get_records(self, search_url): | |
| all_records.append(record) | ||
|
|
||
| from_arg += 1000 | ||
| records = requests.get(search_url + '&from={}'.format(str(from_arg)), throttle=10) | ||
|
|
||
| if page_limit and int(page_limit) == from_arg / 1000: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See above. |
||
| break | ||
| else: | ||
| records = requests.get(search_url + '&from={}'.format(str(from_arg)), throttle=10) | ||
|
|
||
| return all_records | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -48,7 +48,7 @@ class PlosHarvester(XMLHarvester): | |
| MAX_ROWS_PER_REQUEST = 999 | ||
| BASE_URL = 'http://api.plos.org/search' | ||
|
|
||
| def fetch_rows(self, start_date, end_date): | ||
| def fetch_rows(self, start_date, end_date, page_limit): | ||
| query = 'publication_date:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date, end_date) | ||
|
|
||
| resp = requests.get(self.BASE_URL, params={ | ||
|
|
@@ -72,9 +72,12 @@ def fetch_rows(self, start_date, end_date): | |
| for doc in etree.XML(response.content).xpath('//doc'): | ||
| yield doc | ||
|
|
||
| current_row += self.MAX_ROWS_PER_REQUEST | ||
| if page_limit and int(page_limit) == current_row / self.MAX_ROWS_PER_REQUEST: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should also be a condition of the outer loop. |
||
| break | ||
| else: | ||
| current_row += self.MAX_ROWS_PER_REQUEST | ||
|
|
||
| def harvest(self, start_date=None, end_date=None): | ||
| def harvest(self, start_date=None, end_date=None, page_limit=None): | ||
|
|
||
| start_date = start_date or date.today() - timedelta(settings.DAYS_BACK) | ||
| end_date = end_date or date.today() | ||
|
|
@@ -90,7 +93,7 @@ def harvest(self, start_date=None, end_date=None): | |
| 'docID': row.xpath("str[@name='id']")[0].text, | ||
| }) | ||
| for row in | ||
| self.fetch_rows(start_date.isoformat(), end_date.isoformat()) | ||
| self.fetch_rows(start_date.isoformat(), end_date.isoformat(), page_limit) | ||
| if row.xpath("arr[@name='abstract']") | ||
| or row.xpath("str[@name='author_display']") | ||
| ] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This may have different behavior in Python 3 (IIRC, the `/` operator performs true division there and returns a float even for two ints; `//` is the floor-division operator).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, I would put this check as a condition of the while loop.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thx