|
 import os
-from typing import Any, Dict, List, Optional, Sequence, Union
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Union

 import requests


     convert_export_payload,
     format_dataset_item_response,
     format_prediction_response,
+    paginate_generator,
     serialize_and_write_to_presigned_url,
 )


     EMBEDDING_DIMENSION_KEY,
     EMBEDDINGS_URL_KEY,
     EXPORTED_ROWS,
+    ITEMS_KEY,
     KEEP_HISTORY_KEY,
     MESSAGE_KEY,
     NAME_KEY,

 )
 from .dataset_item_uploader import DatasetItemUploader
 from .deprecation_warning import deprecated
-from .errors import DatasetItemRetrievalError
+from .errors import NucleusAPIError
 from .metadata_manager import ExportMetadataType, MetadataManager
 from .payload_constructor import (
     construct_append_scenes_payload,
@@ -160,25 +162,51 @@ def size(self) -> int:
         dataset_size = DatasetSize.parse_obj(response)
         return dataset_size.count

+    def items_generator(self, page_size=100000) -> Iterable[DatasetItem]:
+        """Generator yielding all dataset items in the dataset.
+
+        ::
+
+            sum_example_field = 0
+            for item in dataset.items_generator():
+                sum_example_field += item.metadata["example_field"]
+
+        Args:
+            page_size (int, optional): Number of items to return per page. If you are
+                experiencing timeouts while using this generator, you can try lowering
+                the page size.
+
+        Yields:
+            DatasetItem objects from the dataset, one at a time.
+        """
+        json_generator = paginate_generator(
+            client=self._client,
+            endpoint=f"dataset/{self.id}/itemsPage",
+            result_key=ITEMS_KEY,
+            page_size=page_size,
+        )
+        for item_json in json_generator:
+            yield DatasetItem.from_json(item_json)
+
     @property
     def items(self) -> List[DatasetItem]:
-        """List of all DatasetItem objects in the Dataset."""
-        response = self._client.make_request(
-            {}, f"dataset/{self.id}/datasetItems", requests.get
-        )
-        dataset_items = response.get("dataset_items", None)
-        error = response.get("error", None)
-        constructed_dataset_items = []
-        if dataset_items:
-            for item in dataset_items:
-                image_url = item.get("original_image_url")
-                metadata = item.get("metadata", None)
-                ref_id = item.get("ref_id", None)
-                dataset_item = DatasetItem(image_url, ref_id, metadata)
-                constructed_dataset_items.append(dataset_item)
-        elif error:
-            raise DatasetItemRetrievalError(message=error)
-        return constructed_dataset_items
+        """List of all DatasetItem objects in the Dataset.
+
+        For fetching more than 200k items see :meth:`NucleusDataset.items_generator`.
+        """
+        try:
+            response = self._client.make_request(
+                {}, f"dataset/{self.id}/datasetItems", requests.get
+            )
+        except NucleusAPIError as e:
+            if e.status_code == 503:
+                e.message += "\nThe server timed out while trying to load your items. Please try iterating over dataset.items_generator() instead."
+            raise e
+        dataset_item_jsons = response.get("dataset_items", None)
+        return [
+            DatasetItem.from_json(item_json)
+            for item_json in dataset_item_jsons
+        ]

     @property
     def scenes(self) -> List[ScenesListEntry]:
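For context on this change, a short usage sketch of the new generator follows. The API key, dataset ID, and the choice of a smaller ``page_size`` are placeholders for illustration; ``NucleusClient`` and ``get_dataset`` are the SDK's standard entry points, and the 200k figure comes from the docstring above::

    import nucleus

    client = nucleus.NucleusClient("YOUR_API_KEY")    # placeholder API key
    dataset = client.get_dataset("YOUR_DATASET_ID")   # placeholder dataset ID

    # Stream items one page at a time instead of materializing the full list,
    # which is what dataset.items does and what can time out past ~200k items.
    total = 0
    for item in dataset.items_generator(page_size=10000):  # lower page_size if requests time out
        total += 1
    print(f"iterated over {total} dataset items")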
|
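The ``paginate_generator`` helper imported above is not shown in this diff. The sketch below is a hypothetical cursor-based implementation that merely matches the call site (``client``, ``endpoint``, ``result_key``, ``page_size``); the request and response field names are assumptions, not the library's actual protocol::

    from typing import Any, Dict, Iterable

    import requests


    def paginate_generator(
        client, endpoint: str, result_key: str, page_size: int = 100000
    ) -> Iterable[Dict[str, Any]]:
        """Hypothetical sketch: yield result_key entries page by page via a cursor."""
        cursor = None  # assumed opaque continuation token returned by the server
        while True:
            response = client.make_request(
                {"pageSize": page_size, "cursor": cursor},  # assumed request fields
                endpoint,
                requests.post,  # assumed HTTP method for the paged endpoint
            )
            # Yield each raw JSON item from the current page.
            for item_json in response.get(result_key, []):
                yield item_json
            cursor = response.get("cursor")  # assumed response field
            if not cursor:
                break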