Skip to content

Commit 1658b33

Browse files
committed
Add --start-idx=<n> and --end-idx=<m> options to enable ranged downloads
1 parent ee89550 commit 1658b33

File tree

2 files changed

+71
-46
lines changed

2 files changed

+71
-46
lines changed

AUTHORS.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ Patches and Suggestions
1919
-----------------------
2020

2121
- VM Brasseur
22+
- Russ Magee <rmagee@gmail.com>

internetarchive/cli/ia_download.py

Lines changed: 70 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333
-R, --retries=<retries> Set number of retries to <retries> [default: 5].
3434
-I, --itemlist=<file> Download items from a specified file. Itemlists should
3535
be a plain text file with one identifier per line.
36+
-n, --start-idx=<n> Start immediately at item <n>
37+
-m, --end-idx=<m> End download after item <m>
3638
-S, --search=<query> Download items returned from a specified search query.
3739
-P, --search-parameters=<key:value>... Download items returned from a specified search query.
3840
-g, --glob=<pattern> Only download files whose filename matches the
@@ -110,6 +112,8 @@ def main(argv, session: ArchiveSession) -> None:
110112
'--download-history': Use(bool),
111113
'--parameters': Use(lambda x: get_args_dict(x, query_string=True)),
112114
'--source': list,
115+
'--start-idx': Use(lambda item: item[0] if item else None), #Or(int, None), #Use(lambda x: x[0]),
116+
'--end-idx': Use(lambda item: item[0] if item else None), #Or(int, None), #Use(lambda x: x[0]),
113117
'--exclude-source': list,
114118
'--timeout': Or([], And(Use(lambda t: ast.literal_eval(t[0])), Or(int, float),
115119
error=timeout_msg))
@@ -128,6 +132,16 @@ def main(argv, session: ArchiveSession) -> None:
128132
print(f'{exc}\n{printable_usage(__doc__)}', file=sys.stderr)
129133
sys.exit(1)
130134

135+
if args['--start-idx']:
136+
start_idx = int(args['--start-idx'])-1
137+
else:
138+
start_idx = 0
139+
140+
if args['--end-idx']:
141+
end_idx = int(args['--end-idx'])
142+
else:
143+
end_idx = None
144+
131145
retries = int(args['--retries'])
132146
ids: list[File | str] | Search | TextIO
133147

@@ -165,55 +179,65 @@ def main(argv, session: ArchiveSession) -> None:
165179
else:
166180
files = None
167181

182+
if start_idx != None:
183+
print(f'Starting download at collection item {start_idx+1}')
184+
168185
errors = []
169186
for i, identifier in enumerate(ids):
170-
try:
171-
identifier = identifier.strip()
172-
except AttributeError:
173-
identifier = identifier.get('identifier')
174-
if total_ids > 1:
175-
item_index = f'{i + 1}/{total_ids}'
187+
if end_idx != None and end_idx == i:
188+
print(f'Ending download at specified item {end_idx}.')
189+
break
190+
if start_idx != None and i < start_idx:
191+
pass
176192
else:
177-
item_index = None
178-
179-
try:
180-
item = session.get_item(identifier)
181-
except Exception as exc:
182-
print(f'{identifier}: failed to retrieve item metadata - errors', file=sys.stderr)
183-
raise
184-
if 'You are attempting to make an HTTPS' in str(exc):
185-
print(f'\n{exc}', file=sys.stderr)
186-
sys.exit(1)
187-
else:
188-
continue
189-
190-
# Otherwise, download the entire item.
191-
ignore_history_dir = True if not args['--download-history'] else False
192-
_errors = item.download(
193-
files=files,
194-
formats=args['--format'],
195-
glob_pattern=args['--glob'],
196-
exclude_pattern=args['--exclude'],
197-
dry_run=args['--dry-run'],
198-
verbose=not args['--quiet'],
199-
ignore_existing=args['--ignore-existing'],
200-
checksum=args['--checksum'],
201-
destdir=args['--destdir'],
202-
no_directory=args['--no-directories'],
203-
retries=retries,
204-
item_index=item_index,
205-
ignore_errors=True,
206-
on_the_fly=args['--on-the-fly'],
207-
no_change_timestamp=args['--no-change-timestamp'],
208-
params=args['--parameters'],
209-
ignore_history_dir=ignore_history_dir,
210-
source=args['--source'],
211-
exclude_source=args['--exclude-source'],
212-
stdout=args['--stdout'],
213-
timeout=args['--timeout'],
214-
)
215-
if _errors:
216-
errors.append(_errors)
193+
try:
194+
identifier = identifier.strip()
195+
except AttributeError:
196+
identifier = identifier.get('identifier')
197+
if total_ids > 1:
198+
item_index = f'{i + 1}/{total_ids}'
199+
else:
200+
item_index = None
201+
202+
try:
203+
item = session.get_item(identifier)
204+
except Exception as exc:
205+
print(f'{identifier}: failed to retrieve item metadata - errors', file=sys.stderr)
206+
raise
207+
if 'You are attempting to make an HTTPS' in str(exc):
208+
print(f'\n{exc}', file=sys.stderr)
209+
sys.exit(1)
210+
else:
211+
continue
212+
213+
# Otherwise, download the entire item.
214+
ignore_history_dir = True if not args['--download-history'] else False
215+
_errors = item.download(
216+
files=files,
217+
formats=args['--format'],
218+
glob_pattern=args['--glob'],
219+
exclude_pattern=args['--exclude'],
220+
dry_run=args['--dry-run'],
221+
verbose=not args['--quiet'],
222+
ignore_existing=args['--ignore-existing'],
223+
checksum=args['--checksum'],
224+
destdir=args['--destdir'],
225+
no_directory=args['--no-directories'],
226+
retries=retries,
227+
item_index=item_index,
228+
ignore_errors=True,
229+
on_the_fly=args['--on-the-fly'],
230+
no_change_timestamp=args['--no-change-timestamp'],
231+
params=args['--parameters'],
232+
ignore_history_dir=ignore_history_dir,
233+
source=args['--source'],
234+
exclude_source=args['--exclude-source'],
235+
stdout=args['--stdout'],
236+
timeout=args['--timeout'],
237+
)
238+
if _errors:
239+
errors.append(_errors)
240+
##endif (start_idx)
217241
if errors:
218242
# TODO: add option for a summary/report.
219243
sys.exit(1)

0 commit comments

Comments
 (0)