From 7c25782b4ab709ba98d35480830bd3517ae7cd86 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Mon, 4 Aug 2025 15:42:22 +0200 Subject: [PATCH 1/9] Use duckdb to speed up dependency management --- audb/core/dependencies.py | 193 +++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + 2 files changed, 193 insertions(+), 1 deletion(-) diff --git a/audb/core/dependencies.py b/audb/core/dependencies.py index 84a94cf3..8109705f 100644 --- a/audb/core/dependencies.py +++ b/audb/core/dependencies.py @@ -7,6 +7,7 @@ import re import tempfile +import duckdb import pandas as pd import pyarrow as pa import pyarrow.csv as csv @@ -79,6 +80,9 @@ def __init__(self): ("version", pa.string()), ] ) + # duckdb connection for fast queries + self._duckdb_conn = None + self._parquet_file = None def __call__(self) -> pd.DataFrame: r"""Return dependencies as a table. @@ -99,6 +103,16 @@ def __contains__(self, file: str) -> bool: ``True`` if a dependency to the file exists """ + if self._duckdb_conn is not None and self._parquet_file is not None: + try: + result = self._duckdb_conn.execute( + f"SELECT COUNT(*) FROM '{self._parquet_file}' WHERE file = ?", + [file], + ).fetchone() + return result[0] > 0 + except Exception: + pass + # Fallback to pandas return file in self._df.index def __eq__(self, other: "Dependencies") -> bool: @@ -140,6 +154,16 @@ def archives(self) -> list[str]: list of archives """ + if self._duckdb_conn is not None and self._parquet_file is not None: + try: + result = self._duckdb_conn.execute( + f"SELECT DISTINCT archive FROM '{self._parquet_file}' " + "ORDER BY archive" + ).fetchall() + return [row[0] for row in result] + except Exception: + pass + # Fallback to pandas return sorted(self._df.archive.unique().tolist()) @property @@ -150,6 +174,14 @@ def attachments(self) -> list[str]: list of attachments """ + if self._duckdb_conn is not None and self._parquet_file is not None: + try: + return self._duckdb_query_files( + f"type = {define.DEPENDENCY_TYPE['attachment']}" + ) + except Exception: + pass + # Fallback to pandas return self._df[ self._df["type"] == define.DEPENDENCY_TYPE["attachment"] ].index.tolist() @@ -162,6 +194,16 @@ def attachment_ids(self) -> list[str]: list of attachment IDs """ + if self._duckdb_conn is not None and self._parquet_file is not None: + try: + result = self._duckdb_conn.execute( + f"SELECT archive FROM '{self._parquet_file}' " + f"WHERE type = {define.DEPENDENCY_TYPE['attachment']}" + ).fetchall() + return [row[0] for row in result] + except Exception: + pass + # Fallback to pandas return self._df[ self._df["type"] == define.DEPENDENCY_TYPE["attachment"] ].archive.tolist() @@ -174,6 +216,15 @@ def files(self) -> list[str]: list of files """ + if self._duckdb_conn is not None and self._parquet_file is not None: + try: + result = self._duckdb_conn.execute( + f"SELECT file FROM '{self._parquet_file}'" + ).fetchall() + return [row[0] for row in result] + except Exception: + pass + # Fallback to pandas return self._df.index.tolist() @property @@ -184,6 +235,14 @@ def media(self) -> list[str]: list of media """ + if self._duckdb_conn is not None and self._parquet_file is not None: + try: + return self._duckdb_query_files( + f"type = {define.DEPENDENCY_TYPE['media']}" + ) + except Exception: + pass + # Fallback to pandas return self._df[ self._df["type"] == define.DEPENDENCY_TYPE["media"] ].index.tolist() @@ -196,6 +255,14 @@ def removed_media(self) -> list[str]: list of media """ + if self._duckdb_conn is not None and self._parquet_file is not None: + try: + 
return self._duckdb_query_files( + f"type = {define.DEPENDENCY_TYPE['media']} AND removed = 1" + ) + except Exception: + pass + # Fallback to pandas return self._df[ (self._df["type"] == define.DEPENDENCY_TYPE["media"]) & (self._df["removed"] == 1) @@ -223,6 +290,14 @@ def tables(self) -> list[str]: list of tables """ + if self._duckdb_conn is not None and self._parquet_file is not None: + try: + return self._duckdb_query_files( + f"type = {define.DEPENDENCY_TYPE['meta']}" + ) + except Exception: + pass + # Fallback to pandas return self._df[ self._df["type"] == define.DEPENDENCY_TYPE["meta"] ].index.tolist() @@ -237,6 +312,17 @@ def archive(self, file: str) -> str: archive name """ + if self._duckdb_conn is not None and self._parquet_file is not None: + try: + result = self._duckdb_conn.execute( + f"SELECT archive FROM '{self._parquet_file}' WHERE file = ?", [file] + ).fetchone() + if result is None: + raise KeyError(f"File '{file}' not found in dependencies") + return result[0] + except duckdb.Error: + pass + # Fallback to pandas return self._df.archive[file] def bit_depth(self, file: str) -> int: @@ -316,6 +402,9 @@ def load(self, path: str): FileNotFoundError: if ``path`` does not exist """ + # Close existing DuckDB connection + self._close_duckdb_connection() + self._df = pd.DataFrame(columns=define.DEPENDENCY_TABLE.keys()) path = audeer.path(path) extension = audeer.file_extension(path) @@ -351,6 +440,8 @@ def load(self, path: str): elif extension == "parquet": table = parquet.read_table(path) self._df = self._table_to_dataframe(table) + # Set up DuckDB connection for fast queries + self._setup_duckdb_connection(path) def removed(self, file: str) -> bool: r"""Check if file is marked as removed. @@ -400,6 +491,8 @@ def save(self, path: str): elif path.endswith("parquet"): table = self._dataframe_to_table(self._df, file_column=True) parquet.write_table(table, path) + # Set up DuckDB connection for the newly saved file + self._setup_duckdb_connection(path) def type(self, file: str) -> int: r"""Type of file.
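The accessor pattern added throughout this patch rests on one DuckDB feature: a Parquet file can be scanned in place, by path, so nothing has to be imported into a table before querying. A minimal, self-contained sketch of the idea (the file deps.parquet and its columns are stand-ins for the real dependency table, not part of the patch):

    import duckdb

    conn = duckdb.connect()  # in-memory database; each query scans the file on disk
    # Write a miniature stand-in for the dependency table
    conn.execute(
        "COPY (SELECT 'file-10.wav' AS file, 'archive-0' AS archive, 1 AS type, 0 AS removed) "
        "TO 'deps.parquet' (FORMAT parquet)"
    )
    # File values are bound as parameters ('?'), as in the methods above
    row = conn.execute(
        "SELECT archive FROM 'deps.parquet' WHERE file = ?",
        ["file-10.wav"],
    ).fetchone()
    print(row[0] if row is not None else "not in dependencies")
    conn.close()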
@@ -441,6 +534,9 @@ def _add_attachment( checksum: checksum of file """ + # Data is being modified, invalidate DuckDB cache + self._close_duckdb_connection() + format = audeer.file_extension(file).lower() self._df.loc[file] = [ @@ -481,6 +577,9 @@ def _add_media( where each tuple holds the values of a new media entry """ + # Data is being modified, invalidate DuckDB cache + self._close_duckdb_connection() + df = pd.DataFrame.from_records( values, columns=["file"] + list(define.DEPENDENCY_TABLE.keys()), @@ -502,6 +601,9 @@ def _add_meta( version: version string """ + # Data is being modified, invalidate DuckDB cache + self._close_duckdb_connection() + format = audeer.file_extension(file).lower() if format == "parquet": archive = "" @@ -538,7 +640,22 @@ def _column_loc( scalar value """ - value = self._df.at[file, column] + # Use DuckDB for fast lookup if parquet file is available + if self._duckdb_conn is not None and self._parquet_file is not None: + try: + result = self._duckdb_conn.execute( + f"SELECT {column} FROM '{self._parquet_file}' WHERE file = ?", + [file], + ).fetchone() + if result is None: + raise KeyError(f"File '{file}' not found in dependencies") + value = result[0] + except (duckdb.Error, Exception): + # Fallback to pandas if DuckDB query fails + value = self._df.at[file, column] + else: + value = self._df.at[file, column] + if dtype is not None: value = dtype(value) return value @@ -579,6 +696,9 @@ def _drop(self, files: Sequence[str]): files: relative file paths """ + # Data is being modified, invalidate DuckDB cache + self._close_duckdb_connection() + # self._df.drop is slow, # see https://stackoverflow.com/a/53394627. # The solution presented in https://stackoverflow.com/a/53395360 @@ -594,6 +714,9 @@ def _remove(self, file: str): file: relative file path """ + # Data is being modified, invalidate DuckDB cache + self._close_duckdb_connection() + self._df.at[file, "removed"] = 1 @staticmethod @@ -664,6 +787,9 @@ def _update_media( where each tuple holds the new values for a media entry """ + # Data is being modified, invalidate DuckDB cache + self._close_duckdb_connection() + df = pd.DataFrame.from_records( values, columns=["file"] + list(define.DEPENDENCY_TABLE.keys()), @@ -683,8 +809,73 @@ def _update_media_version( version: version string """ + # Data is being modified, invalidate DuckDB cache + self._close_duckdb_connection() + self._df.loc[files, "version"] = version + def _setup_duckdb_connection(self, parquet_path: str): + r"""Set up DuckDB connection for fast queries. + + Args: + parquet_path: path to parquet file + + """ + try: + self._duckdb_conn = duckdb.connect() + self._parquet_file = parquet_path + # Test the connection with a simple query + self._duckdb_conn.execute( + f"SELECT COUNT(*) FROM '{parquet_path}'" + ).fetchone() + except Exception: + # If DuckDB setup fails, fall back to pandas + self._duckdb_conn = None + self._parquet_file = None + + def _duckdb_query_files(self, condition: str = None) -> list[str]: + r"""Query files using DuckDB for better performance. 
+ + Args: + condition: SQL WHERE condition (without WHERE keyword) + + Returns: + list of file paths + + """ + if self._duckdb_conn is None or self._parquet_file is None: + # Fallback to pandas + if condition: + # This is a simplified fallback - in practice, condition + # translation would need to be more sophisticated + return self._df.index.tolist() + return self._df.index.tolist() + + try: + query = f"SELECT file FROM '{self._parquet_file}'" + if condition: + query += f" WHERE {condition}" + result = self._duckdb_conn.execute(query).fetchall() + return [row[0] for row in result] + except Exception: + # Fallback to pandas + return self._df.index.tolist() + + def _close_duckdb_connection(self): + r"""Close DuckDB connection if open.""" + if self._duckdb_conn is not None: + try: + self._duckdb_conn.close() + except Exception: + pass + finally: + self._duckdb_conn = None + self._parquet_file = None + + def __del__(self): + r"""Cleanup DuckDB connection on object destruction.""" + self._close_duckdb_connection() + def error_message_missing_object( object_type: str, diff --git a/pyproject.toml b/pyproject.toml index 2f6b857b..b22bbbe9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ 'audiofile >=1.0.0', 'audobject >=0.5.0', 'audresample >=0.1.6', + "duckdb>=1.3.2", 'filelock', 'oyaml', 'pandas >=2.1.0', From a49532970c7e0360ae49ec37bf309896e82d3dbe Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 6 Aug 2025 11:39:13 +0200 Subject: [PATCH 2/9] Try to fix potential errors --- audb/core/dependencies.py | 69 +++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/audb/core/dependencies.py b/audb/core/dependencies.py index 8109705f..012c82e0 100644 --- a/audb/core/dependencies.py +++ b/audb/core/dependencies.py @@ -110,7 +110,12 @@ def __contains__(self, file: str) -> bool: [file], ).fetchone() return result[0] > 0 + except (duckdb.Error, duckdb.InvalidInputException): + # Log DuckDB specific errors but don't fail + pass except Exception: + # Other unexpected errors - close connection and fallback + self._close_duckdb_connection() pass # Fallback to pandas return file in self._df.index @@ -161,7 +166,12 @@ def archives(self) -> list[str]: "ORDER BY archive" ).fetchall() return [row[0] for row in result] + except (duckdb.Error, duckdb.InvalidInputException): + # Log DuckDB specific errors but don't fail + pass except Exception: + # Other unexpected errors - close connection and fallback + self._close_duckdb_connection() pass # Fallback to pandas return sorted(self._df.archive.unique().tolist()) @@ -179,7 +189,10 @@ def attachments(self) -> list[str]: return self._duckdb_query_files( f"type = {define.DEPENDENCY_TYPE['attachment']}" ) + except (duckdb.Error, duckdb.InvalidInputException): + pass except Exception: + self._close_duckdb_connection() pass # Fallback to pandas return self._df[ @@ -201,7 +214,10 @@ def attachment_ids(self) -> list[str]: f"WHERE type = {define.DEPENDENCY_TYPE['attachment']}" ).fetchall() return [row[0] for row in result] + except (duckdb.Error, duckdb.InvalidInputException): + pass except Exception: + self._close_duckdb_connection() pass # Fallback to pandas return self._df[ @@ -222,7 +238,10 @@ def files(self) -> list[str]: f"SELECT file FROM '{self._parquet_file}'" ).fetchall() return [row[0] for row in result] + except (duckdb.Error, duckdb.InvalidInputException): + pass except Exception: + self._close_duckdb_connection() pass # Fallback to pandas return self._df.index.tolist() @@ 
-240,7 +259,10 @@ def media(self) -> list[str]: return self._duckdb_query_files( f"type = {define.DEPENDENCY_TYPE['media']}" ) + except (duckdb.Error, duckdb.InvalidInputException): + pass except Exception: + self._close_duckdb_connection() pass # Fallback to pandas return self._df[ @@ -260,7 +282,10 @@ def removed_media(self) -> list[str]: return self._duckdb_query_files( f"type = {define.DEPENDENCY_TYPE['media']} AND removed = 1" ) + except (duckdb.Error, duckdb.InvalidInputException): + pass except Exception: + self._close_duckdb_connection() pass # Fallback to pandas return self._df[ @@ -295,7 +320,10 @@ def tables(self) -> list[str]: return self._duckdb_query_files( f"type = {define.DEPENDENCY_TYPE['meta']}" ) + except (duckdb.Error, duckdb.InvalidInputException): + pass except Exception: + self._close_duckdb_connection() pass # Fallback to pandas return self._df[ @@ -650,8 +678,12 @@ def _column_loc( if result is None: raise KeyError(f"File '{file}' not found in dependencies") value = result[0] - except (duckdb.Error, Exception): - # Fallback to pandas if DuckDB query fails + except (duckdb.Error, duckdb.InvalidInputException): + # DuckDB specific errors - fallback to pandas + value = self._df.at[file, column] + except Exception: + # Other unexpected errors - close connection and fallback + self._close_duckdb_connection() value = self._df.at[file, column] else: value = self._df.at[file, column] @@ -828,10 +860,19 @@ def _setup_duckdb_connection(self, parquet_path: str): self._duckdb_conn.execute( f"SELECT COUNT(*) FROM '{parquet_path}'" ).fetchone() - except Exception: + except (duckdb.Error, duckdb.InvalidInputException): # If DuckDB setup fails, fall back to pandas self._duckdb_conn = None self._parquet_file = None + except Exception: + # Unexpected error during setup + if hasattr(self, "_duckdb_conn") and self._duckdb_conn is not None: + try: + self._duckdb_conn.close() + except Exception: + pass + self._duckdb_conn = None + self._parquet_file = None def _duckdb_query_files(self, condition: str = None) -> list[str]: r"""Query files using DuckDB for better performance. 
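The error-handling policy this patch converges on is: recover from DuckDB-specific failures by answering from the pandas copy, and close the connection on anything unexpected. Note that duckdb.InvalidInputException derives from duckdb.Error, so the first except clause alone already covers both named exceptions. A condensed sketch of the fallback shape, with a hypothetical count_rows helper standing in for the real methods:

    import duckdb

    def count_rows(conn: duckdb.DuckDBPyConnection, parquet_path: str) -> int | None:
        # None tells the caller to answer from its pandas fallback instead
        try:
            row = conn.execute(f"SELECT COUNT(*) FROM '{parquet_path}'").fetchone()
            return row[0]
        except duckdb.Error:
            return None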
@@ -857,16 +898,28 @@ def _duckdb_query_files(self, condition: str = None) -> list[str]: query += f" WHERE {condition}" result = self._duckdb_conn.execute(query).fetchall() return [row[0] for row in result] + except (duckdb.Error, duckdb.InvalidInputException): + # DuckDB specific errors - fallback to pandas + pass except Exception: - # Fallback to pandas + # Unexpected errors - close connection and fallback + self._close_duckdb_connection() + pass + + # Fallback to pandas + if condition: + # This is a simplified fallback - in practice, condition + # translation would need to be more sophisticated return self._df.index.tolist() + return self._df.index.tolist() def _close_duckdb_connection(self): r"""Close DuckDB connection if open.""" - if self._duckdb_conn is not None: + if hasattr(self, "_duckdb_conn") and self._duckdb_conn is not None: try: self._duckdb_conn.close() except Exception: + # Ignore errors during connection cleanup pass finally: self._duckdb_conn = None @@ -874,7 +927,11 @@ def _close_duckdb_connection(self): def __del__(self): r"""Cleanup DuckDB connection on object destruction.""" - self._close_duckdb_connection() + try: + self._close_duckdb_connection() + except Exception: + # Ignore errors during cleanup to avoid issues during shutdown + pass def error_message_missing_object( From 4cd00931137ab008fab93b06c4369d45bf7f7f3a Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 30 Dec 2025 09:34:10 +0100 Subject: [PATCH 3/9] Simplify code --- audb/core/dependencies.py | 37 +++++-------------------------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/audb/core/dependencies.py b/audb/core/dependencies.py index 012c82e0..b65a93e5 100644 --- a/audb/core/dependencies.py +++ b/audb/core/dependencies.py @@ -884,34 +884,11 @@ def _duckdb_query_files(self, condition: str = None) -> list[str]: list of file paths """ - if self._duckdb_conn is None or self._parquet_file is None: - # Fallback to pandas - if condition: - # This is a simplified fallback - in practice, condition - # translation would need to be more sophisticated - return self._df.index.tolist() - return self._df.index.tolist() - - try: - query = f"SELECT file FROM '{self._parquet_file}'" - if condition: - query += f" WHERE {condition}" - result = self._duckdb_conn.execute(query).fetchall() - return [row[0] for row in result] - except (duckdb.Error, duckdb.InvalidInputException): - # DuckDB specific errors - fallback to pandas - pass - except Exception: - # Unexpected errors - close connection and fallback - self._close_duckdb_connection() - pass - - # Fallback to pandas + query = f"SELECT file FROM '{self._parquet_file}'" if condition: - # This is a simplified fallback - in practice, condition - # translation would need to be more sophisticated - return self._df.index.tolist() - return self._df.index.tolist() + query += f" WHERE {condition}" + result = self._duckdb_conn.execute(query).fetchall() + return [row[0] for row in result] def _close_duckdb_connection(self): r"""Close DuckDB connection if open.""" @@ -927,11 +904,7 @@ def _close_duckdb_connection(self): def __del__(self): r"""Cleanup DuckDB connection on object destruction.""" - try: - self._close_duckdb_connection() - except Exception: - # Ignore errors during cleanup to avoid issues during shutdown - pass + self._close_duckdb_connection() def error_message_missing_object( From e0a08010d98099fd83ce9b23480b4de610cd864b Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 30 Dec 2025 10:25:23 +0100 Subject: [PATCH 4/9] Don't catch 
errors --- audb/core/dependencies.py | 109 +++++++++----------------------------- 1 file changed, 26 insertions(+), 83 deletions(-) diff --git a/audb/core/dependencies.py b/audb/core/dependencies.py index b65a93e5..492fe57a 100644 --- a/audb/core/dependencies.py +++ b/audb/core/dependencies.py @@ -104,19 +104,11 @@ def __contains__(self, file: str) -> bool: """ if self._duckdb_conn is not None and self._parquet_file is not None: - try: - result = self._duckdb_conn.execute( - f"SELECT COUNT(*) FROM '{self._parquet_file}' WHERE file = ?", - [file], - ).fetchone() - return result[0] > 0 - except (duckdb.Error, duckdb.InvalidInputException): - # Log DuckDB specific errors but don't fail - pass - except Exception: - # Other unexpected errors - close connection and fallback - self._close_duckdb_connection() - pass + result = self._duckdb_conn.execute( + f"SELECT COUNT(*) FROM '{self._parquet_file}' WHERE file = ?", + [file], + ).fetchone() + return result[0] > 0 # Fallback to pandas return file in self._df.index @@ -160,19 +152,10 @@ def archives(self) -> list[str]: """ if self._duckdb_conn is not None and self._parquet_file is not None: - try: - result = self._duckdb_conn.execute( - f"SELECT DISTINCT archive FROM '{self._parquet_file}' " - "ORDER BY archive" - ).fetchall() - return [row[0] for row in result] - except (duckdb.Error, duckdb.InvalidInputException): - # Log DuckDB specific errors but don't fail - pass - except Exception: - # Other unexpected errors - close connection and fallback - self._close_duckdb_connection() - pass + result = self._duckdb_conn.execute( + f"SELECT DISTINCT archive FROM '{self._parquet_file}' ORDER BY archive" + ).fetchall() + return [row[0] for row in result] # Fallback to pandas return sorted(self._df.archive.unique().tolist()) @@ -185,15 +168,9 @@ def attachments(self) -> list[str]: """ if self._duckdb_conn is not None and self._parquet_file is not None: - try: - return self._duckdb_query_files( - f"type = {define.DEPENDENCY_TYPE['attachment']}" - ) - except (duckdb.Error, duckdb.InvalidInputException): - pass - except Exception: - self._close_duckdb_connection() - pass + return self._duckdb_query_files( + f"type = {define.DEPENDENCY_TYPE['attachment']}" + ) # Fallback to pandas return self._df[ self._df["type"] == define.DEPENDENCY_TYPE["attachment"] @@ -208,17 +185,11 @@ def attachment_ids(self) -> list[str]: """ if self._duckdb_conn is not None and self._parquet_file is not None: - try: - result = self._duckdb_conn.execute( - f"SELECT archive FROM '{self._parquet_file}' " - f"WHERE type = {define.DEPENDENCY_TYPE['attachment']}" - ).fetchall() - return [row[0] for row in result] - except (duckdb.Error, duckdb.InvalidInputException): - pass - except Exception: - self._close_duckdb_connection() - pass + result = self._duckdb_conn.execute( + f"SELECT archive FROM '{self._parquet_file}' " + f"WHERE type = {define.DEPENDENCY_TYPE['attachment']}" + ).fetchall() + return [row[0] for row in result] # Fallback to pandas return self._df[ self._df["type"] == define.DEPENDENCY_TYPE["attachment"] @@ -233,16 +204,10 @@ def files(self) -> list[str]: """ if self._duckdb_conn is not None and self._parquet_file is not None: - try: - result = self._duckdb_conn.execute( - f"SELECT file FROM '{self._parquet_file}'" - ).fetchall() - return [row[0] for row in result] - except (duckdb.Error, duckdb.InvalidInputException): - pass - except Exception: - self._close_duckdb_connection() - pass + result = self._duckdb_conn.execute( + f"SELECT file FROM '{self._parquet_file}'" + 
).fetchall() + return [row[0] for row in result] # Fallback to pandas return self._df.index.tolist() @@ -255,15 +220,7 @@ def media(self) -> list[str]: """ if self._duckdb_conn is not None and self._parquet_file is not None: - try: - return self._duckdb_query_files( - f"type = {define.DEPENDENCY_TYPE['media']}" - ) - except (duckdb.Error, duckdb.InvalidInputException): - pass - except Exception: - self._close_duckdb_connection() - pass + return self._duckdb_query_files(f"type = {define.DEPENDENCY_TYPE['media']}") # Fallback to pandas return self._df[ self._df["type"] == define.DEPENDENCY_TYPE["media"] @@ -278,15 +235,9 @@ def removed_media(self) -> list[str]: """ if self._duckdb_conn is not None and self._parquet_file is not None: - try: - return self._duckdb_query_files( - f"type = {define.DEPENDENCY_TYPE['media']} AND removed = 1" - ) - except (duckdb.Error, duckdb.InvalidInputException): - pass - except Exception: - self._close_duckdb_connection() - pass + return self._duckdb_query_files( + f"type = {define.DEPENDENCY_TYPE['media']} AND removed = 1" + ) # Fallback to pandas return self._df[ (self._df["type"] == define.DEPENDENCY_TYPE["media"]) @@ -316,15 +267,7 @@ def tables(self) -> list[str]: """ if self._duckdb_conn is not None and self._parquet_file is not None: - try: - return self._duckdb_query_files( - f"type = {define.DEPENDENCY_TYPE['meta']}" - ) - except (duckdb.Error, duckdb.InvalidInputException): - pass - except Exception: - self._close_duckdb_connection() - pass + return self._duckdb_query_files(f"type = {define.DEPENDENCY_TYPE['meta']}") # Fallback to pandas return self._df[ self._df["type"] == define.DEPENDENCY_TYPE["meta"] From c039124ee7530c6d17531e4f77a89b29adb4efd2 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 30 Dec 2025 10:27:25 +0100 Subject: [PATCH 5/9] Store deps file in db_root --- audb/core/api.py | 9 ++++-- audb/core/dependencies.py | 61 +++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/audb/core/api.py b/audb/core/api.py index 1e4fc8d2..4e705616 100644 --- a/audb/core/api.py +++ b/audb/core/api.py @@ -312,7 +312,9 @@ def dependencies( # See https://github.com/audeering/audb/pull/507 # backend_interface = utils.lookup_backend(name, version) - deps = download_dependencies(backend_interface, name, version, verbose) + deps = download_dependencies( + db_root, backend_interface, name, version, verbose + ) # Store as pickle in cache deps.save(cached_deps_file) @@ -513,9 +515,12 @@ def remove_media( for version in versions(name): backend_interface = utils.lookup_backend(name, version) - deps = download_dependencies(backend_interface, name, version, verbose) with tempfile.TemporaryDirectory() as db_root: + deps = download_dependencies( + db_root, backend_interface, name, version, verbose + ) + # Track if we need to upload the dependency table again upload = False diff --git a/audb/core/dependencies.py b/audb/core/dependencies.py index 492fe57a..8cbc0ffc 100644 --- a/audb/core/dependencies.py +++ b/audb/core/dependencies.py @@ -5,7 +5,6 @@ import errno import os import re -import tempfile import duckdb import pandas as pd @@ -945,6 +944,7 @@ def filter_deps( def download_dependencies( + db_root: str, backend_interface: type[audbackend.interface.Base], name: str, version: str, @@ -953,12 +953,12 @@ def download_dependencies( r"""Load dependency file from backend. 
Download dependency file - for requested database - to a temporary folder, + for requested database, and return a dependency object loaded from that file. Args: + db_root: folder to store the dependency file backend_interface: backend interface name: database name version: database version @@ -968,34 +968,33 @@ dependency object """ - with tempfile.TemporaryDirectory() as tmp_root: - # Load `db.parquet` file, - # or if non-existent `db.zip` - # from backend - remote_deps_file = backend_interface.join("/", name, define.DEPENDENCY_FILE) - if backend_interface.exists(remote_deps_file, version): - local_deps_file = os.path.join(tmp_root, define.DEPENDENCY_FILE) - backend_interface.get_file( - remote_deps_file, - local_deps_file, - version, - verbose=verbose, - ) - else: - remote_deps_file = backend_interface.join("/", name, define.DB + ".zip") - local_deps_file = os.path.join( - tmp_root, - define.LEGACY_DEPENDENCY_FILE, - ) - backend_interface.get_archive( - remote_deps_file, - tmp_root, - version, - verbose=verbose, - ) - # Create deps object from downloaded file - deps = Dependencies() - deps.load(local_deps_file) + # Load `db.parquet` file, + # or if non-existent `db.zip` + # from backend + remote_deps_file = backend_interface.join("/", name, define.DEPENDENCY_FILE) + if backend_interface.exists(remote_deps_file, version): + local_deps_file = os.path.join(db_root, define.DEPENDENCY_FILE) + backend_interface.get_file( + remote_deps_file, + local_deps_file, + version, + verbose=verbose, + ) + else: + remote_deps_file = backend_interface.join("/", name, define.DB + ".zip") + local_deps_file = os.path.join( + db_root, + define.LEGACY_DEPENDENCY_FILE, + ) + backend_interface.get_archive( + remote_deps_file, + db_root, + version, + verbose=verbose, + ) + # Create deps object from downloaded file + deps = Dependencies() + deps.load(local_deps_file) return deps From 5719cd708791aa043e94fd873c996e40528680f1 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 30 Dec 2025 11:46:54 +0100 Subject: [PATCH 6/9] Update api.py --- audb/core/api.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/audb/core/api.py b/audb/core/api.py index 4e705616..fbdf1714 100644 --- a/audb/core/api.py +++ b/audb/core/api.py @@ -293,12 +293,12 @@ def dependencies( version, cache_root=cache_root, ) - cached_deps_file = os.path.join(db_root, define.CACHED_DEPENDENCY_FILE) + deps_file = os.path.join(db_root, define.DEPENDENCY_FILE) with FolderLock(db_root): try: deps = Dependencies() - deps.load(cached_deps_file) + deps.load(deps_file) except Exception: # does not catch KeyboardInterrupt # If loading cached file fails, load again from backend # @@ -315,8 +315,9 @@ def dependencies( deps = download_dependencies( db_root, backend_interface, name, version, verbose ) - # Store as pickle in cache - deps.save(cached_deps_file) + # Store as parquet in cache + if not os.path.exists(deps_file): + deps.save(deps_file) return deps From 5be8bca9f6fcdbe44a7cd44631d5be75c334707a Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 30 Dec 2025 11:48:09 +0100 Subject: [PATCH 7/9] Update dependencies.py --- audb/core/dependencies.py | 98 ++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 54 deletions(-) diff --git a/audb/core/dependencies.py b/audb/core/dependencies.py index 8cbc0ffc..e40afa66 100644 --- a/audb/core/dependencies.py +++ b/audb/core/dependencies.py @@ -60,8 +60,11 @@ class Dependencies: """ # noqa: E501 def __init__(self): - self._df =
pd.DataFrame(columns=define.DEPENDENCY_TABLE.keys()) - self._df = self._set_dtypes(self._df) + self.path = None + """Path to dependency file.""" + + self._df = self._init_df() + # pyarrow schema # used for reading and writing files self._schema = pa.schema( @@ -81,7 +84,6 @@ def __init__(self): ) # duckdb connection for fast queries self._duckdb_conn = None - self._parquet_file = None def __call__(self) -> pd.DataFrame: r"""Return dependencies as a table. @@ -102,9 +104,9 @@ def __contains__(self, file: str) -> bool: ``True`` if a dependency to the file exists """ - if self._duckdb_conn is not None and self._parquet_file is not None: + if self._duckdb_conn is not None: result = self._duckdb_conn.execute( - f"SELECT COUNT(*) FROM '{self._parquet_file}' WHERE file = ?", + f"SELECT COUNT(*) FROM '{self.path}' WHERE file = ?", [file], ).fetchone() return result[0] > 0 @@ -150,9 +152,9 @@ def archives(self) -> list[str]: list of archives """ - if self._duckdb_conn is not None and self._parquet_file is not None: + if self._duckdb_conn is not None: result = self._duckdb_conn.execute( - f"SELECT DISTINCT archive FROM '{self._parquet_file}' ORDER BY archive" + f"SELECT DISTINCT archive FROM '{self.path}' ORDER BY archive" ).fetchall() return [row[0] for row in result] # Fallback to pandas @@ -166,7 +168,7 @@ def attachments(self) -> list[str]: list of attachments """ - if self._duckdb_conn is not None and self._parquet_file is not None: + if self._duckdb_conn is not None: return self._duckdb_query_files( f"type = {define.DEPENDENCY_TYPE['attachment']}" ) @@ -183,9 +185,9 @@ def attachment_ids(self) -> list[str]: list of attachment IDs """ - if self._duckdb_conn is not None and self._parquet_file is not None: + if self._duckdb_conn is not None: result = self._duckdb_conn.execute( - f"SELECT archive FROM '{self._parquet_file}' " + f"SELECT archive FROM '{self.path}' " f"WHERE type = {define.DEPENDENCY_TYPE['attachment']}" ).fetchall() return [row[0] for row in result] @@ -202,9 +204,9 @@ def files(self) -> list[str]: list of files """ - if self._duckdb_conn is not None and self._parquet_file is not None: + if self._duckdb_conn is not None: result = self._duckdb_conn.execute( - f"SELECT file FROM '{self._parquet_file}'" + f"SELECT file FROM '{self.path}'" ).fetchall() return [row[0] for row in result] # Fallback to pandas @@ -218,7 +220,7 @@ def media(self) -> list[str]: list of media """ - if self._duckdb_conn is not None and self._parquet_file is not None: + if self._duckdb_conn is not None: return self._duckdb_query_files(f"type = {define.DEPENDENCY_TYPE['media']}") # Fallback to pandas return self._df[ @@ -233,7 +235,7 @@ def removed_media(self) -> list[str]: list of media """ - if self._duckdb_conn is not None and self._parquet_file is not None: + if self._duckdb_conn is not None: return self._duckdb_query_files( f"type = {define.DEPENDENCY_TYPE['media']} AND removed = 1" ) @@ -265,7 +267,7 @@ def tables(self) -> list[str]: list of tables """ - if self._duckdb_conn is not None and self._parquet_file is not None: + if self._duckdb_conn is not None: return self._duckdb_query_files(f"type = {define.DEPENDENCY_TYPE['meta']}") # Fallback to pandas return self._df[ @@ -282,10 +284,10 @@ def archive(self, file: str) -> str: archive name """ - if self._duckdb_conn is not None and self._parquet_file is not None: + if self._duckdb_conn is not None: try: result = self._duckdb_conn.execute( - f"SELECT archive FROM '{self._parquet_file}' WHERE file = ?", [file] + f"SELECT archive FROM '{self.path}' WHERE file 
= ?", [file] ).fetchone() if result is None: raise KeyError(f"File '{file}' not found in dependencies") @@ -372,12 +374,9 @@ def load(self, path: str): FileNotFoundError: if ``path`` does not exists """ - # Close existing DuckDB connection - self._close_duckdb_connection() - - self._df = pd.DataFrame(columns=define.DEPENDENCY_TABLE.keys()) path = audeer.path(path) extension = audeer.file_extension(path) + if extension not in ["csv", "pkl", "parquet"]: raise ValueError( f"File extension of 'path' has to be 'csv', 'pkl', or 'parquet' " @@ -389,6 +388,12 @@ def load(self, path: str): os.strerror(errno.ENOENT), path, ) + + self.path = path + + # Close existing DuckDB connection + self._close_duckdb_connection() + if extension == "pkl": self._df = pd.read_pickle(path) # Correct dtypes @@ -411,7 +416,7 @@ def load(self, path: str): table = parquet.read_table(path) self._df = self._table_to_dataframe(table) # Set up DuckDB connection for fast queries - self._setup_duckdb_connection(path) + self._setup_duckdb_connection() def removed(self, file: str) -> bool: r"""Check if file is marked as removed. @@ -462,7 +467,7 @@ def save(self, path: str): table = self._dataframe_to_table(self._df, file_column=True) parquet.write_table(table, path) # Set up DuckDB connection for the newly saved file - self._setup_duckdb_connection(path) + # self._setup_duckdb_connection() def type(self, file: str) -> int: r"""Type of file. @@ -611,10 +616,10 @@ def _column_loc( """ # Use DuckDB for fast lookup if parquet file is available - if self._duckdb_conn is not None and self._parquet_file is not None: + if self._duckdb_conn is not None: try: result = self._duckdb_conn.execute( - f"SELECT {column} FROM '{self._parquet_file}' WHERE file = ?", + f"SELECT {column} FROM '{self.path}' WHERE file = ?", [file], ).fetchone() if result is None: @@ -681,6 +686,16 @@ def _drop(self, files: Sequence[str]): # isn't. self._df = self._df[~self._df.index.isin(files)] + def _init_df(self) -> pd.DataFrame: + r"""Initialize empty dataframe. + + Returns: + empty dependency table dataframe + + """ + df = pd.DataFrame(columns=define.DEPENDENCY_TABLE.keys()) + return self._set_dtypes(df) + def _remove(self, file: str): r"""Mark file as removed. @@ -788,33 +803,9 @@ def _update_media_version( self._df.loc[files, "version"] = version - def _setup_duckdb_connection(self, parquet_path: str): - r"""Set up DuckDB connection for fast queries. - - Args: - parquet_path: path to parquet file - - """ - try: - self._duckdb_conn = duckdb.connect() - self._parquet_file = parquet_path - # Test the connection with a simple query - self._duckdb_conn.execute( - f"SELECT COUNT(*) FROM '{parquet_path}'" - ).fetchone() - except (duckdb.Error, duckdb.InvalidInputException): - # If DuckDB setup fails, fall back to pandas - self._duckdb_conn = None - self._parquet_file = None - except Exception: - # Unexpected error during setup - if hasattr(self, "_duckdb_conn") and self._duckdb_conn is not None: - try: - self._duckdb_conn.close() - except Exception: - pass - self._duckdb_conn = None - self._parquet_file = None + def _setup_duckdb_connection(self): + r"""Set up DuckDB connection for fast queries.""" + self._duckdb_conn = duckdb.connect() def _duckdb_query_files(self, condition: str = None) -> list[str]: r"""Query files using DuckDB for better performance. 
@@ -826,7 +817,7 @@ def _duckdb_query_files(self, condition: str = None) -> list[str]: list of file paths """ - query = f"SELECT file FROM '{self._parquet_file}'" + query = f"SELECT file FROM '{self.path}'" if condition: query += f" WHERE {condition}" result = self._duckdb_conn.execute(query).fetchall() @@ -842,7 +833,6 @@ def _close_duckdb_connection(self): pass finally: self._duckdb_conn = None - self._parquet_file = None def __del__(self): r"""Cleanup DuckDB connection on object destruction.""" From acf3f707469915f655e737bf404149fa3cfc5f42 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 30 Dec 2025 11:48:42 +0100 Subject: [PATCH 8/9] Update load_to.py --- audb/core/load_to.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/audb/core/load_to.py b/audb/core/load_to.py index d2dc35c8..9e95e259 100644 --- a/audb/core/load_to.py +++ b/audb/core/load_to.py @@ -446,8 +446,12 @@ def load_to( # save dependencies - dep_path_tmp = os.path.join(db_root_tmp, define.DEPENDENCY_FILE) - deps.save(dep_path_tmp) + if deps.path is not None and os.path.basename(deps.path) == define.DEPENDENCY_FILE: + dep_path_tmp = deps.path + else: + # Convert old dependency formats + dep_path_tmp = os.path.join(db_root_tmp, define.DEPENDENCY_FILE) + deps.save(dep_path_tmp) audeer.move_file( dep_path_tmp, os.path.join(db_root, define.DEPENDENCY_FILE), From 6f4e9b7c759bf49611dd2aac3d23b853ffaf31ac Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 30 Dec 2025 15:47:34 +0100 Subject: [PATCH 9/9] Update dependency benchmark script (#530) * Update dependency benchmark script * Better formatting of __ * Don't create lists --- benchmarks/benchmark-dependencies-methods.py | 541 ++++++++----------- 1 file changed, 235 insertions(+), 306 deletions(-) diff --git a/benchmarks/benchmark-dependencies-methods.py b/benchmarks/benchmark-dependencies-methods.py index 55e2cc2a..5ccf1f90 100644 --- a/benchmarks/benchmark-dependencies-methods.py +++ b/benchmarks/benchmark-dependencies-methods.py @@ -27,92 +27,11 @@ cache = audeer.mkdir("./cache") +print(f"audb v{audb.__version__}") -def astype(df, dtype): - """Convert to desired dataframe dtypes.""" - if dtype == "object": - # Use `object` to represent strings - df["archive"] = df["archive"].astype("object") - df["bit_depth"] = df["bit_depth"].astype("int32") - df["channels"] = df["channels"].astype("int32") - df["checksum"] = df["checksum"].astype("object") - df["duration"] = df["duration"].astype("float64") - df["format"] = df["format"].astype("object") - df["removed"] = df["removed"].astype("int32") - df["sampling_rate"] = df["sampling_rate"].astype("int32") - df["type"] = df["type"].astype("int32") - df["version"] = df["version"].astype("object") - df.index = df.index.astype(audb.core.define.DEPENDENCY_INDEX_DTYPE) - # Set dtypes in library - audb.core.define.DEPENDENCY_TABLE = { - "archive": "object", - "bit_depth": "int32", - "channels": "int32", - "checksum": "object", - "duration": "float64", - "format": "object", - "removed": "int32", - "sampling_rate": "int32", - "type": "int32", - "version": "object", - } - elif dtype == "string": - # Use `string` to represent strings - df["archive"] = df["archive"].astype("string") - df["bit_depth"] = df["bit_depth"].astype("int32") - df["channels"] = df["channels"].astype("int32") - df["checksum"] = df["checksum"].astype("string") - df["duration"] = df["duration"].astype("float64") - df["format"] = df["format"].astype("string") - df["removed"] = df["removed"].astype("int32") - df["sampling_rate"] = 
df["sampling_rate"].astype("int32") - df["type"] = df["type"].astype("int32") - df["version"] = df["version"].astype("string") - df.index = df.index.astype(audb.core.define.DEPENDENCY_INDEX_DTYPE) - # Set dtypes in library - audb.core.define.DEPENDENCY_TABLE = { - "archive": "string", - "bit_depth": "int32", - "channels": "int32", - "checksum": "string", - "duration": "float64", - "format": "string", - "removed": "int32", - "sampling_rate": "int32", - "type": "int32", - "version": "string", - } - elif dtype == "pyarrow": - # Use `pyarrow` to represent all dtypes - df["archive"] = df["archive"].astype("string[pyarrow]") - df["bit_depth"] = df["bit_depth"].astype("int32[pyarrow]") - df["channels"] = df["channels"].astype("int32[pyarrow]") - df["checksum"] = df["checksum"].astype("string[pyarrow]") - df["duration"] = df["duration"].astype("float64[pyarrow]") - df["format"] = df["format"].astype("string[pyarrow]") - df["removed"] = df["removed"].astype("int32[pyarrow]") - df["sampling_rate"] = df["sampling_rate"].astype("int32[pyarrow]") - df["type"] = df["type"].astype("int32[pyarrow]") - df["version"] = df["version"].astype("string[pyarrow]") - df.index = df.index.astype(audb.core.define.DEPENDENCY_INDEX_DTYPE) - # Set dtypes in library - audb.core.define.DEPENDENCY_TABLE = { - "archive": "string[pyarrow]", - "bit_depth": "int32[pyarrow]", - "channels": "int32[pyarrow]", - "checksum": "string[pyarrow]", - "duration": "float64[pyarrow]", - "format": "string[pyarrow]", - "removed": "int32[pyarrow]", - "sampling_rate": "int32[pyarrow]", - "type": "int32[pyarrow]", - "version": "string[pyarrow]", - } - return df - -# === Dependencies pandas.DataFrame === -data_cache = audeer.path(cache, "df.pkl") +# === Create legacy CSV dependency table === +data_cache = audeer.path(cache, "df.csv") num_rows = 1000000 if not os.path.exists(data_cache): bit_depths = [0, 16, 24] @@ -140,237 +59,247 @@ def astype(df, dtype): for n in range(num_rows) ] df = pd.DataFrame.from_records(records) - df = df.astype(audb.core.define.DEPENDENDENCY_TABLE) + df = df.astype(audb.core.define.DEPENDENCY_TABLE) df.set_index("file", inplace=True) df.index.name = None df.index = df.index.astype(audb.core.define.DEPENDENCY_INDEX_DTYPE) - df.to_pickle(data_cache) - + df.to_csv(data_cache) -# ===== Benchmark audb.Dependencies ===== +# Prepare deps object deps = audb.Dependencies() deps.load(data_cache) -file = "file-10.wav" +deps_file = audeer.path(cache, audb.core.define.DEPENDENCY_FILE) n_files = 10000 -_files = deps._df.index[:n_files].tolist() -dtypes = ["string", "object", "pyarrow"] -results = pd.DataFrame(columns=dtypes) +_files = deps.files[:n_files] +results = pd.DataFrame(columns=["result"]) results.index.name = "method" -for dtype in dtypes: - deps.load(data_cache) - deps._df = astype(deps._df, dtype) - - # Check we have the expected dtypes - # in dependency table - # and library - if dtype == "pyarrow": - expected_dtype = "string[pyarrow]" - else: - expected_dtype = dtype - assert deps._df.archive.dtype == expected_dtype - assert audb.core.define.DEPENDENCY_TABLE["archive"] == expected_dtype - - method = "Dependencies.__call__()" - t0 = time.time() - deps() - t = time.time() - t0 - results.at[method, dtype] = t - - # Access the index one time. 
- # Further calls will be faster - file in deps - - method = f"Dependencies.__contains__({n_files} files)" - t0 = time.time() - [file in deps for file in _files] - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies.__get_item__({n_files} files)" - t0 = time.time() - [deps[file] for file in _files] - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies.__len__()" - t0 = time.time() - len(deps) - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies.__str__()" - t0 = time.time() - str(deps) - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies.archives" - t0 = time.time() - deps.archives - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies.attachments" - t0 = time.time() - deps.attachments - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies.attachment_ids" - t0 = time.time() - deps.attachment_ids - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies.files" - t0 = time.time() - deps.files - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies.media" - t0 = time.time() - deps.media - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies.removed_media" - t0 = time.time() - deps.removed_media - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies.table_ids" - t0 = time.time() - deps.table_ids - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies.tables" - t0 = time.time() - deps.tables - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies.archive({n_files} files)" - t0 = time.time() - [deps.archive(file) for file in _files] - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies.bit_depth({n_files} files)" - t0 = time.time() - [deps.bit_depth(file) for file in _files] - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies.channels({n_files} files)" - t0 = time.time() - [deps.channels(file) for file in _files] - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies.checksum({n_files} files)" - t0 = time.time() - [deps.checksum(file) for file in _files] - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies.duration({n_files} files)" - t0 = time.time() - [deps.duration(file) for file in _files] - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies.format({n_files} files)" - t0 = time.time() - [deps.format(file) for file in _files] - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies.removed({n_files} files)" - t0 = time.time() - [deps.removed(file) for file in _files] - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies.sampling_rate({n_files} files)" - t0 = time.time() - [deps.sampling_rate(file) for file in _files] - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies.type({n_files} files)" - t0 = time.time() - [deps.type(file) for file in _files] - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies.version({n_files} files)" - t0 = time.time() - [deps.version(file) for file in _files] - t = time.time() - t0 - results.at[method, dtype] = t - - # ------------------------------------------------------------------------- - method = "Dependencies._add_attachment()" - t0 = time.time() - 
deps._add_attachment("attachment.txt", "1.0.0", "archive", "checksum") - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies._add_media({n_files} files)" - values = [ - ( - f"file-new-{n}.wav", # file - f"archive-new-{n}", # archive - 16, # bit_depth - 1, # channels - f"checksum-{n}", # checksum - 0.4, # duration - "wav", # format - 0, # removed - 16000, # sampling_rate - 1, # type - "1.0.0", # version - ) - for n in range(n_files) - ] - t0 = time.time() - deps._add_media(values) - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies._add_meta()" - t0 = time.time() - deps._add_meta("db.new-table.csv", "1.0.0", "archive", "checksum") - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies._drop()" - t0 = time.time() - deps._drop(["file-90000.wav"]) - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies._remove()" - t0 = time.time() - deps._remove(file) - t = time.time() - t0 - results.at[method, dtype] = t - - method = "Dependencies._update_media()" - t0 = time.time() - deps._update_media(values) - t = time.time() - t0 - results.at[method, dtype] = t - - method = f"Dependencies._update_media_version({n_files} files)" - t0 = time.time() - deps._update_media_version([f"file-{n}.wav" for n in range(n_files)], "version") - t = time.time() - t0 - results.at[method, dtype] = t +# ===== Benchmark audb.Dependencies ===== +method = "Dependencies.save()" +t0 = time.time() +deps.save(deps_file) +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies.load()" +deps = audb.Dependencies() +t0 = time.time() +deps.load(deps_file) +t = time.time() - t0 +results.at[method, "result"] = t + +method = r"Dependencies.\_\_call\_\_()" +t0 = time.time() +deps() +t = time.time() - t0 +results.at[method, "result"] = t + +# Access the index one time. 
+# Further calls will be faster +"file-10.wav" in deps + +method = rf"Dependencies.\_\_contains\_\_({n_files} files)" +t0 = time.time() +for file in _files: + _ = file in deps +t = time.time() - t0 +results.at[method, "result"] = t + +method = rf"Dependencies.\_\_get_item\_\_({n_files} files)" +t0 = time.time() +for file in _files: + _ = deps[file] +t = time.time() - t0 +results.at[method, "result"] = t + +method = r"Dependencies.\_\_len\_\_()" +t0 = time.time() +len(deps) +t = time.time() - t0 +results.at[method, "result"] = t + +method = r"Dependencies.\_\_str\_\_()" +t0 = time.time() +str(deps) +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies.archives" +t0 = time.time() +deps.archives +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies.attachments" +t0 = time.time() +deps.attachments +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies.attachment_ids" +t0 = time.time() +deps.attachment_ids +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies.files" +t0 = time.time() +deps.files +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies.media" +t0 = time.time() +deps.media +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies.removed_media" +t0 = time.time() +deps.removed_media +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies.table_ids" +t0 = time.time() +deps.table_ids +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies.tables" +t0 = time.time() +deps.tables +t = time.time() - t0 +results.at[method, "result"] = t + +method = f"Dependencies.archive({n_files} files)" +t0 = time.time() +for file in _files: + _ = deps.archive(file) +t = time.time() - t0 +results.at[method, "result"] = t + +method = f"Dependencies.bit_depth({n_files} files)" +t0 = time.time() +for file in _files: + _ = deps.bit_depth(file) +t = time.time() - t0 +results.at[method, "result"] = t + +method = f"Dependencies.channels({n_files} files)" +t0 = time.time() +for file in _files: + _ = deps.channels(file) +t = time.time() - t0 +results.at[method, "result"] = t + +method = f"Dependencies.checksum({n_files} files)" +t0 = time.time() +for file in _files: + _ = deps.checksum(file) +t = time.time() - t0 +results.at[method, "result"] = t + +method = f"Dependencies.duration({n_files} files)" +t0 = time.time() +for file in _files: + _ = deps.duration(file) +t = time.time() - t0 +results.at[method, "result"] = t + +method = f"Dependencies.format({n_files} files)" +t0 = time.time() +for file in _files: + _ = deps.format(file) +t = time.time() - t0 +results.at[method, "result"] = t + +method = f"Dependencies.removed({n_files} files)" +t0 = time.time() +for file in _files: + _ = deps.removed(file) +t = time.time() - t0 +results.at[method, "result"] = t + +method = f"Dependencies.sampling_rate({n_files} files)" +t0 = time.time() +for file in _files: + _ = deps.sampling_rate(file) +t = time.time() - t0 +results.at[method, "result"] = t + +method = f"Dependencies.type({n_files} files)" +t0 = time.time() +for file in _files: + _ = deps.type(file) +t = time.time() - t0 +results.at[method, "result"] = t + +method = f"Dependencies.version({n_files} files)" +t0 = time.time() +for file in _files: + _ = deps.version(file) +t = time.time() - t0 +results.at[method, "result"] = t + +# ------------------------------------------------------------------------- +method = "Dependencies._add_attachment()" +t0 = 
time.time() +deps._add_attachment("attachment.txt", "1.0.0", "archive", "checksum") +t = time.time() - t0 +results.at[method, "result"] = t + +method = f"Dependencies._add_media({n_files} files)" +values = [ + ( + f"file-new-{n}.wav", # file + f"archive-new-{n}", # archive + 16, # bit_depth + 1, # channels + f"checksum-{n}", # checksum + 0.4, # duration + "wav", # format + 0, # removed + 16000, # sampling_rate + 1, # type + "1.0.0", # version + ) + for n in range(n_files) +] +t0 = time.time() +deps._add_media(values) +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies._add_meta()" +t0 = time.time() +deps._add_meta("db.new-table.csv", "1.0.0", "checksum") +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies._drop()" +t0 = time.time() +deps._drop(["file-90000.wav"]) +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies._remove()" +t0 = time.time() +deps._remove("file-10.wav") +t = time.time() - t0 +results.at[method, "result"] = t + +method = "Dependencies._update_media()" +t0 = time.time() +deps._update_media(values) +t = time.time() - t0 +results.at[method, "result"] = t + +method = f"Dependencies._update_media_version({n_files} files)" +t0 = time.time() +deps._update_media_version([f"file-{n}.wav" for n in range(n_files)], "version") +t = time.time() - t0 +results.at[method, "result"] = t # ===== Save results =====
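Taken together, the series replaces pandas lookups on the dependency table with DuckDB scans over the Parquet file cached in db_root, while keeping the pandas dataframe as the write path and fallback. A short usage sketch of the end result (database name, version, and file are placeholders for whatever is available in your configured repositories):

    import audb

    deps = audb.dependencies("emodb", version="1.4.1")  # cached as db.parquet
    print("wav/03a01Fa.wav" in deps)   # answered by SELECT COUNT(*) ... WHERE file = ?
    print(deps.media[:5])              # answered by the _duckdb_query_files builder
    print(deps.archive(deps.files[0])) # single-row lookup without touching the dataframe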