From 8a1a3cefa5639689bc83aa34df914eec090d408f Mon Sep 17 00:00:00 2001 From: Gabriel Schneider Date: Fri, 26 Nov 2021 11:31:27 -0300 Subject: [PATCH 1/2] improvement: parallelized images download --- main.py | 14 +++++++++++--- requirements.txt | 2 ++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index f46f1d4..fcecedc 100644 --- a/main.py +++ b/main.py @@ -3,6 +3,8 @@ from pycocotools.coco import COCO from tqdm import tqdm import os +import multiprocessing +from joblib import Parallel, delayed anns_path = './annotations/annotations.json' dest_path = './images/' @@ -10,6 +12,7 @@ def try_download(source: str, dest: str, name: str): + print(f'downloading {source}') r = requests.get(source, allow_redirects=True) if r.ok: with open(f'{dest}{name}', 'wb') as f: @@ -25,9 +28,14 @@ def download(img: dict, dest_path: str): def main(): dataset = COCO(anns_path) - for img in tqdm(dataset.imgs.values()): - download(img, dest_path) + download_pool = multiprocessing.Pool() + num_cores = multiprocessing.cpu_count() + nimgs = len(tqdm(dataset.imgs.values())) + inputs = tqdm(dataset.imgs.values()) + + processed_list = Parallel(n_jobs=num_cores)(delayed(download)(img, dest_path) for img in inputs) + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/requirements.txt b/requirements.txt index 4c6535b..dc52a96 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ requests pycocotools tqdm +multiprocessing +joblib From dcce4a4e8ebef35b8c89e1fec679904dbc407326 Mon Sep 17 00:00:00 2001 From: Gabriel Schneider Date: Fri, 26 Nov 2021 11:38:06 -0300 Subject: [PATCH 2/2] newline delete --- main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/main.py b/main.py index fcecedc..042d364 100644 --- a/main.py +++ b/main.py @@ -30,7 +30,6 @@ def main(): download_pool = multiprocessing.Pool() - num_cores = multiprocessing.cpu_count() nimgs = len(tqdm(dataset.imgs.values())) inputs = tqdm(dataset.imgs.values())