@@ -109,6 +109,9 @@ def search_all(
        )
        self.setup_search()

+        # Reset the doc_id counter to prevent any initialization during client setup
+        self.__class__._doc_id_counter = None
+
        search_one = functools.partial(self.__class__._search_one, top=top)
        insert_one = functools.partial(self.__class__._insert_one)

@@ -155,88 +158,184 @@ def cycling_query_generator(queries, total_count):
            used_queries = queries_list
            total_query_count = len(used_queries)

-        if parallel == 1:
-            # Single-threaded execution
-            start = time.perf_counter()
-
-            # Process queries
-            results = []
-            total_insert_count = 0
-            total_search_count = 0
-            all_insert_latencies = []
-            all_search_latencies = []
-
-            for query in used_queries:
-                if random.random() < insert_fraction:
-                    precision, latency = insert_one(query)
-                    total_insert_count += 1
-                    all_insert_latencies.append(latency)
-                    results.append(('insert', precision, latency))
-                else:
-                    precision, latency = search_one(query)
-                    total_search_count += 1
-                    all_search_latencies.append(latency)
-                    results.append(('search', precision, latency))
-
-            total_time = time.perf_counter() - start
+        # Interval reporting setup
+        interval_size = 10000  # Report every 10K operations
+        need_interval_reporting = total_query_count >= interval_size
+        interval_counter = 0
+        overall_start_time = time.perf_counter()
+
+        # Calculate total number of intervals for progress tracking
+        total_intervals = (total_query_count + interval_size - 1) // interval_size  # Ceiling division
+
+        # Initialize progress bar for intervals if needed (only if output is to terminal)
+        if need_interval_reporting and os.isatty(1):  # Check if stdout is a terminal
+            interval_pbar = tqdm.tqdm(total=total_intervals, desc="Intervals", unit="interval")
        else:
-            # Dynamically calculate chunk size based on total_query_count
-            chunk_size = max(1, total_query_count // parallel)
-
-            # If used_queries is a generator, we need to handle it differently
-            if hasattr(used_queries, '__next__'):
-                # For generators, we'll create chunks on-the-fly
-                query_chunks = []
-                remaining = total_query_count
-                while remaining > 0:
-                    current_chunk_size = min(chunk_size, remaining)
-                    chunk = [next(used_queries) for _ in range(current_chunk_size)]
-                    query_chunks.append(chunk)
-                    remaining -= current_chunk_size
+            interval_pbar = None
+
+        # Initialize global doc_id offset to ensure uniqueness across intervals
+        global_doc_id_offset = 0
+
+        # Overall accumulators
+        overall_results = []
+        overall_insert_count = 0
+        overall_search_count = 0
+        overall_insert_latencies = []
+        overall_search_latencies = []
+
+        # Interval statistics for output file
+        interval_stats = []
+
+        # Convert generator to iterator for interval processing
+        query_iterator = iter(used_queries)
+
+        # Process queries in intervals of 10K
+        while True:
+            # Get next interval chunk (up to 10K queries)
+            interval_queries = list(islice(query_iterator, interval_size))
+            if not interval_queries:
+                break  # No more queries
+
+            interval_counter += 1
+            current_interval_size = len(interval_queries)
+
+            if parallel == 1:
+                # Single-threaded execution for this interval
+                interval_start = time.perf_counter()
+
+                # Force reset and set doc_id counter offset for single-threaded execution
+                # This ensures we override any previous initialization
+                self.__class__._doc_id_counter = itertools.count(global_doc_id_offset)
+
+                # Process queries for this interval
+                interval_results = []
+                interval_insert_count = 0
+                interval_search_count = 0
+                interval_insert_latencies = []
+                interval_search_latencies = []
+
+                for query in interval_queries:
+                    if random.random() < insert_fraction:
+                        precision, latency = insert_one(query)
+                        interval_insert_count += 1
+                        interval_insert_latencies.append(latency)
+                        interval_results.append(('insert', precision, latency))
+                    else:
+                        precision, latency = search_one(query)
+                        interval_search_count += 1
+                        interval_search_latencies.append(latency)
+                        interval_results.append(('search', precision, latency))
+
+                interval_time = time.perf_counter() - interval_start
            else:
-                # For lists, we can use the chunked_iterable function
-                query_chunks = list(chunked_iterable(used_queries, chunk_size))
-
-            # Create a queue to collect results
-            result_queue = Queue()
-
-            # Create worker processes
-            processes = []
-            for chunk in query_chunks:
-                process = Process(target=worker_function, args=(self, distance, search_one, insert_one,
-                                                                chunk, result_queue, insert_fraction))
-                processes.append(process)
-
-            # Start worker processes
-            for process in processes:
-                process.start()
-
-            # Collect results from all worker processes
-            results = []
-            total_insert_count = 0
-            total_search_count = 0
-            all_insert_latencies = []
-            all_search_latencies = []
-            min_start_time = time.perf_counter()
-
-            for _ in processes:
-                proc_start_time, chunk_results, insert_count, search_count, insert_latencies, search_latencies = result_queue.get()
-                results.extend(chunk_results)
-                total_insert_count += insert_count
-                total_search_count += search_count
-                all_insert_latencies.extend(insert_latencies)
-                all_search_latencies.extend(search_latencies)
+                # Parallel execution for this interval
+                # Dynamically calculate chunk size based on current interval size
+                chunk_size = max(1, current_interval_size // parallel)
+
+                # For interval queries (always a list), use chunked_iterable
+                query_chunks = list(chunked_iterable(interval_queries, chunk_size))
+
+                # Create a queue to collect results
+                result_queue = Queue()
+
+                # Create worker processes
+                processes = []
+                for i, chunk in enumerate(query_chunks):
+                    # Calculate unique doc_id offset for this worker in this interval
+                    worker_doc_id_offset = global_doc_id_offset + (i * 1000000)
+                    process = Process(target=worker_function, args=(self, distance, search_one, insert_one,
+                                                                    chunk, result_queue, insert_fraction, worker_doc_id_offset))
+                    processes.append(process)
+
+                # Start worker processes
+                for process in processes:
+                    process.start()
+
+                # Collect results from all worker processes
+                interval_results = []
+                interval_insert_count = 0
+                interval_search_count = 0
+                interval_insert_latencies = []
+                interval_search_latencies = []
+                min_start_time = time.perf_counter()
+
+                for _ in processes:
+                    proc_start_time, chunk_results, insert_count, search_count, insert_latencies, search_latencies = result_queue.get()
+                    interval_results.extend(chunk_results)
+                    interval_insert_count += insert_count
+                    interval_search_count += search_count
+                    interval_insert_latencies.extend(insert_latencies)
+                    interval_search_latencies.extend(search_latencies)
+
+                    # Update min_start_time if necessary
+                    if proc_start_time < min_start_time:
+                        min_start_time = proc_start_time
+
+                # Stop measuring time for the critical work
+                interval_time = time.perf_counter() - min_start_time
+
+                # Wait for all worker processes to finish
+                for process in processes:
+                    process.join()
+
+            # Accumulate overall results
+            overall_results.extend(interval_results)
+            overall_insert_count += interval_insert_count
+            overall_search_count += interval_search_count
+            overall_insert_latencies.extend(interval_insert_latencies)
+            overall_search_latencies.extend(interval_search_latencies)
+
+            # Update global doc_id offset for next interval
+            if parallel == 1:
+                # For single-threaded, reserve space based on actual inserts in this interval
+                global_doc_id_offset += max(1000000, interval_insert_count * 2)  # Some buffer
+            else:
+                # Reserve space for all parallel workers in this interval
+                global_doc_id_offset += parallel * 1000000
+
+            # Report interval metrics if needed
+            if need_interval_reporting:
+                interval_search_precisions = [result[1] for result in interval_results if result[0] == 'search']

-                # Update min_start_time if necessary
-                if proc_start_time < min_start_time:
-                    min_start_time = proc_start_time
-
-            # Stop measuring time for the critical work
-            total_time = time.perf_counter() - min_start_time
-
-            # Wait for all worker processes to finish
-            for process in processes:
-                process.join()
+                # Create interval statistics for output file
+                interval_stat = {
+                    "interval": interval_counter,
+                    "operations": current_interval_size,
+                    "time_seconds": float(interval_time),  # Ensure it's a float
+                    "rps": float(current_interval_size / interval_time),  # Ensure it's a float
+                    "searches": interval_search_count,
+                    "inserts": interval_insert_count,
+                    "search_precision": float(np.mean(interval_search_precisions)) if interval_search_precisions else None
+                }
+                interval_stats.append(interval_stat)
+
+                # Debug: Print number of intervals collected so far
+                print(f"DEBUG: Collected {len(interval_stats)} intervals so far", flush=True)
+
+                # Update progress bar with same metrics (this goes to terminal)
+                if interval_pbar:
+                    interval_pbar.update(1)
+                    interval_pbar.set_postfix({
+                        'RPS': f"{current_interval_size / interval_time:.1f}",
+                        'Searches': interval_search_count,
+                        'Inserts': interval_insert_count,
+                        'Precision': f"{np.mean(interval_search_precisions):.4f}" if interval_search_precisions else "N/A"
+                    })
+
+        # Close progress bar when done
+        if interval_pbar:
+            interval_pbar.close()
+            print()  # Add a blank line after progress bar
+
+        # Calculate total time for overall metrics
+        total_time = time.perf_counter() - overall_start_time
+
+        # Use overall accumulated results
+        results = overall_results
+        total_insert_count = overall_insert_count
+        total_search_count = overall_search_count
+        all_insert_latencies = overall_insert_latencies
+        all_search_latencies = overall_search_latencies

        # Extract overall precisions and latencies
        all_precisions = [result[1] for result in results]
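Two helpers drive the new loop: `itertools.islice`, which pulls fixed-size chunks off a single query iterator, and `chunked_iterable`, which splits each interval across workers. `chunked_iterable` is defined elsewhere in the module; the sketch below assumes a plausible islice-based implementation and only demonstrates the chunking pattern, not the benchmark itself:

```python
from itertools import islice

def chunked_iterable(iterable, size):
    # Assumed implementation of the helper referenced in the diff; the real
    # one may differ, but islice-based chunking is the usual approach.
    it = iter(iterable)
    while chunk := list(islice(it, size)):
        yield chunk

queries = iter(range(25))   # stand-in for used_queries
interval_size = 10          # the diff uses 10_000
while True:
    # Same pattern as the interval loop: islice returns an empty list once
    # the iterator is exhausted, which terminates the while True loop.
    interval = list(islice(queries, interval_size))
    if not interval:
        break
    # Within an interval, work is split into per-worker chunks.
    print([len(c) for c in chunked_iterable(interval, 4)])
# Prints [4, 4, 2], [4, 4, 2], [4, 1]
```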
@@ -247,6 +346,11 @@ def cycling_query_generator(queries, total_count):

        self.__class__.delete_client()

+
+        if len(interval_stats) > 0:
+            print(f"DEBUG: First interval: {interval_stats[0]}", flush=True)
+            print(f"DEBUG: Last interval: {interval_stats[-1]}", flush=True)
+
        return {
            # Overall metrics
            "total_time": total_time,
@@ -274,6 +378,9 @@ def cycling_query_generator(queries, total_count):
            "actual_insert_fraction": total_insert_count / len(all_latencies) if len(all_latencies) > 0 else 0,
            "target_insert_fraction": insert_fraction,

+            # Interval statistics (only included if intervals were used)
+            "interval_stats": interval_stats if interval_stats else None,
+
            # Legacy compatibility (for existing code that expects these)
            "mean_time": np.mean(all_latencies),
            "mean_precisions": np.mean(search_precisions) if search_precisions else 1.0,  # Only search precisions
@@ -326,7 +433,7 @@ def process_chunk(chunk, search_one, insert_one, insert_fraction):
    return results, insert_count, search_count, insert_latencies, search_latencies

 # Function to be executed by each worker process
-def worker_function(self, distance, search_one, insert_one, chunk, result_queue, insert_fraction=0.0):
+def worker_function(self, distance, search_one, insert_one, chunk, result_queue, insert_fraction=0.0, doc_id_offset=0):
    self.init_client(
        self.host,
        distance,
@@ -335,6 +442,9 @@ def worker_function(self, distance, search_one, insert_one, chunk, result_queue,
    )
    self.setup_search()

+    # Force set the doc_id counter offset for this worker (overrides any previous state)
+    self.__class__._doc_id_counter = itertools.count(doc_id_offset)
+
    start_time = time.perf_counter()
    results, insert_count, search_count, insert_latencies, search_latencies = process_chunk(
        chunk, search_one, insert_one, insert_fraction
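Taken together with the offsets computed in `search_all`, each worker in each interval seeds its counter inside a disjoint 1,000,000-wide id slot, so inserted doc ids never collide as long as no single worker inserts more than a million documents per interval. A plain demonstration of that arithmetic (not benchmark code):

```python
import itertools

parallel = 4
global_doc_id_offset = 0
for interval in range(2):
    for i in range(parallel):
        # Same formula as worker_doc_id_offset in the diff.
        counter = itertools.count(global_doc_id_offset + i * 1_000_000)
        print(f"interval {interval + 1}, worker {i}: first doc_id {next(counter)}")
    # Same reservation rule as the parallel branch of the interval loop.
    global_doc_id_offset += parallel * 1_000_000
# Worker start ids: 0, 1_000_000, 2_000_000, 3_000_000,
# then 4_000_000, 5_000_000, 6_000_000, 7_000_000
```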