diff --git a/CHANGELOG.md b/CHANGELOG.md
index efb0a1e..5ab3659 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## 2025-05-16
+- Rework `/ingestion` to use the DuckDB BigQuery extension instead of the BigQuery Python API.
+- Bump DuckDB to `1.2.2`.
+- Bump dbt-duckdb to `1.9.3`.
+
 ## 2025-02-09
 - Bumping DuckDB to `1.2.0`.
 
diff --git a/Dockerfile b/Dockerfile
index a8b1bfe..df1e557 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,7 @@
+ARG PLATFORM=amd64
+
 # Stage 1: Base
-FROM python:3.12 as base
+FROM --platform=linux/${PLATFORM} python:3.12 as base
 
 # Install UV via pip
 RUN pip install uv==0.5.5 --no-cache-dir
diff --git a/README.md b/README.md
index 790eab9..7a715d9 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,10 @@ The project is a monorepo composed of series in 3 parts :
 - transformation, under `transform` folder ([YouTube video](https://www.youtube.com/watch?v=SpfEQQXBGMQ), [Blog](https://motherduck.com/blog/duckdb-dbt-e2e-data-engineering-project-part-2/))
 - Visualization, under `dashboard` folder ([YouTube video](https://youtu.be/ta_Pzc2EEEo), [Blog](https://motherduck.com/blog/duckdb-dashboard-e2e-data-engineering-project-part-3/))
 
-Please refer to the [`CHANGELOG.md`](./CHANGELOG.md) for the latest updates.
+> ⚠️ This project has undergone significant changes since the tutorials were recorded.
+> To follow along with the original tutorial content, please check out the [`feat/tutorial-archive` branch](https://github.com/mehd-io/pypi-duck-flow/tree/feat/tutorial-archive).
+
+You can also refer to the [`CHANGELOG.md`](./CHANGELOG.md) for a complete list of updates.
 
 ## High level architecture
 ![High level architecture](./docs/etl_architecture.png)
diff --git a/ingestion/bigquery.py b/ingestion/bigquery.py
index ffac006..c6e3e2a 100644
--- a/ingestion/bigquery.py
+++ b/ingestion/bigquery.py
@@ -1,11 +1,4 @@
-import os
-from google.cloud import bigquery
-from google.oauth2 import service_account
-from google.auth.exceptions import DefaultCredentialsError
-from loguru import logger
-import time
-from ingestion.models import PypiJobParameters, FileDownloads
-import pyarrow as pa
+from ingestion.models import PypiJobParameters
 
 PYPI_PUBLIC_DATASET = "bigquery-public-data.pypi.file_downloads"
 
@@ -13,61 +6,24 @@ def build_pypi_query(
     params: PypiJobParameters, pypi_public_dataset: str = PYPI_PUBLIC_DATASET
 ) -> str:
-    # Query the public PyPI dataset from BigQuery
+    """Build an optimized BigQuery query for PyPI file downloads
     # /!\ This is a large dataset, filter accordingly /!\
+    """
+
     return f"""
-    SELECT *
+    SELECT
+        timestamp,
+        country_code,
+        url,
+        project,
+        file,
+        details,
+        tls_protocol,
+        tls_cipher
     FROM
         `{pypi_public_dataset}`
     WHERE
-        project = '{params.pypi_project}'
+        project = ''{params.pypi_project}''
         AND {params.timestamp_column} >= TIMESTAMP("{params.start_date}")
         AND {params.timestamp_column} < TIMESTAMP("{params.end_date}")
     """
-
-
-def get_bigquery_client(project_name: str) -> bigquery.Client:
-    """Get Big Query client"""
-    try:
-        service_account_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
-
-        if service_account_path:
-            credentials = service_account.Credentials.from_service_account_file(
-                service_account_path
-            )
-            bigquery_client = bigquery.Client(
-                project=project_name, credentials=credentials
-            )
-            return bigquery_client
-
-        raise EnvironmentError(
-            "No valid credentials found for BigQuery authentication."
- ) - - except DefaultCredentialsError as creds_error: - raise creds_error - - -2 - - -def get_bigquery_result( - query_str: str, bigquery_client: bigquery.Client, model: FileDownloads -) -> pa.Table: - """Get query result from BigQuery and yield rows as dictionaries.""" - try: - # Start measuring time - start_time = time.time() - # Run the query and directly load into a DataFrame - logger.info(f"Running query: {query_str}") - # dataframe = bigquery_client.query(query_str).to_dataframe(dtypes=FileDownloads().pandas_dtypes) - pa_tbl = bigquery_client.query(query_str).to_arrow() - # Log the time taken for query execution and data loading - elapsed_time = time.time() - start_time - logger.info(f"Query executed and data loaded in {elapsed_time:.2f} seconds") - # Iterate over DataFrame rows and yield as dictionaries - return pa_tbl - - except Exception as e: - logger.error(f"Error running query: {e}") - raise diff --git a/ingestion/duck.py b/ingestion/duck.py index 24e4ca2..6f309ab 100644 --- a/ingestion/duck.py +++ b/ingestion/duck.py @@ -1,88 +1,209 @@ import duckdb import os -import pyarrow as pa from loguru import logger -class ArrowTableLoadingBuffer: +class MotherDuckBigQueryLoader: + """ + A specialized loader for transferring data from BigQuery to MotherDuck. + + This class abstracts the implementation details of: + 1. Loading data from BigQuery to local DuckDB + 2. Deleting existing data from MotherDuck for the given date range + 3. Loading data in chunks to MotherDuck for optimal performance + """ + def __init__( self, - duckdb_schema: str, - pyarrow_schema: pa.Schema, - database_name: str, - table_name: str, - dryrun: bool = False, - destination="local", - chunk_size: int = 50000, + motherduck_database: str, + motherduck_table: str, + project_id: str, ): - self.duckdb_schema = duckdb_schema - self.pyarrow_schema = pyarrow_schema - self.dryrun = dryrun - self.database_name = database_name - self.table_name = table_name - self.total_inserted = 0 - self.conn = self.initialize_connection(destination, duckdb_schema) - self.primary_key_exists = "PRIMARY KEY" in duckdb_schema.upper() - self.chunk_size = chunk_size - - def initialize_connection(self, destination, sql): - if destination == "md": - logger.info("Connecting to MotherDuck...") - if not os.environ.get("motherduck_token"): - raise ValueError( - "MotherDuck token is required. Set the environment variable 'MOTHERDUCK_TOKEN'." - ) - conn = duckdb.connect("md:") - if not self.dryrun: + """ + Initialize the MotherDuck BigQuery loader. + + Args: + motherduck_database: Target MotherDuck database + motherduck_table: Target MotherDuck table + project_id: GCP project ID for BigQuery + """ + self.motherduck_database = motherduck_database + self.motherduck_table = motherduck_table + self.project_id = project_id + + # Initialize a single DuckDB connection + self.conn = duckdb.connect(database=":memory:") + + # Load BigQuery extension + self.conn.execute("INSTALL bigquery FROM community;") + self.conn.execute("LOAD bigquery;") + + # Configure BigQuery extension + self.conn.execute("SET bq_experimental_filter_pushdown = true") + self.conn.execute(f"SET bq_query_timeout_ms = 300000") # 5 minutes timeout + + # Attach BigQuery project + self.conn.execute( + f"ATTACH 'project={self.project_id}' AS bq (TYPE bigquery, READ_ONLY)" + ) + + # Attach MotherDuck + if not os.environ.get("motherduck_token"): + raise ValueError( + "MotherDuck token is required. Set the environment variable 'MOTHERDUCK_TOKEN'." 
+ ) + + self.conn.execute("ATTACH 'md:'") + self.conn.execute(f"CREATE DATABASE IF NOT EXISTS {self.motherduck_database}") + + def load_from_bigquery_to_motherduck( + self, + query: str, + timestamp_column: str, + start_date: str, + end_date: str, + chunk_size: int = 100000, # Optimal MotherDuck chunk size + ) -> int: + """ + Load data from BigQuery to MotherDuck. + Use a temporary table in DuckDB to load the data in chunks. + + Returns: + Total number of rows loaded to MotherDuck + """ + logger.info(f"Starting BigQuery to MotherDuck transfer job") + + # Estimate row count (for progress reporting) + estimated_rows = self._estimate_row_count(query) + if estimated_rows > 0: + logger.info(f"Found approximately {estimated_rows:,} rows to process") + + # STEP 2: Create a DuckDB table with BigQuery data + logger.info("Loading data from BigQuery to temporary table") + loaded_rows = self._load_from_bigquery(query) + + if loaded_rows == 0: + logger.warning("No data was loaded from BigQuery") + return 0 + + logger.info(f"Deleting existing data for date range {start_date} to {end_date}") + self._delete_existing_data(timestamp_column, start_date, end_date) + + logger.info("Copying data to MotherDuck") + copied_rows = self._copy_to_motherduck(chunk_size) + + if estimated_rows > 0: + logger.info( + f"Transferred {copied_rows:,} rows ({(copied_rows/estimated_rows)*100:.1f}% of estimated)" + ) + + return copied_rows + + def _estimate_row_count(self, query: str) -> int: + """Estimate the row count for the BigQuery query""" + try: + count_query = f"SELECT COUNT(*) FROM bq.({query})" + result = self.conn.execute(count_query).fetchone() + return result[0] if result else 0 + except Exception as e: + logger.warning(f"Failed to estimate row count: {e}") + return 0 + + def _load_from_bigquery(self, query: str) -> int: + """Load data from BigQuery to local temporary table""" + try: + # Create a temporary table with the result of the BigQuery query + + query_tmp = f""" + CREATE OR REPLACE TABLE memory.temp_table AS + SELECT * FROM bigquery_query('{self.project_id}', '{query}') + """ + logger.info(query_tmp) + self.conn.execute(query_tmp) + # Check row count + result = self.conn.execute( + "SELECT COUNT(*) FROM memory.temp_table" + ).fetchone() + return result[0] if result else 0 + + except Exception as e: + logger.error(f"Error loading data from BigQuery: {e}") + raise + + def _delete_existing_data( + self, timestamp_column: str, start_date: str, end_date: str + ): + """Delete existing data from MotherDuck in the specified date range""" + try: + # Check if table exists in MotherDuck + table_exists = self.conn.execute(f""" + SELECT COUNT(*) > 0 + FROM duckdb_tables() + WHERE database_name = '{self.motherduck_database}' + AND table_name = '{self.motherduck_table}' + """).fetchone()[0] + + if not table_exists: logger.info( - f"Creating database {self.database_name} if it doesn't exist" + f"Table {self.motherduck_database}.main.{self.motherduck_table} does not exist. Skipping delete operation." 
) - conn.execute(f"CREATE DATABASE IF NOT EXISTS {self.database_name}") - conn.execute(f"USE {self.database_name}") - else: - conn = duckdb.connect(database=f"{self.database_name}.db") - if not self.dryrun: - conn.execute(sql) - return conn - - def insert(self, table: pa.Table): - if not self.dryrun: - total_rows = table.num_rows - for batch_start in range(0, total_rows, self.chunk_size): - batch_end = min(batch_start + self.chunk_size, total_rows) - chunk = table.slice(batch_start, batch_end - batch_start) - self.insert_chunk(chunk) - logger.info(f"Inserted chunk {batch_start} to {batch_end}") - self.total_inserted += total_rows - logger.info(f"Total inserted: {self.total_inserted} rows") - - def insert_chunk(self, chunk: pa.Table): - self.conn.register("buffer_table", chunk) - if self.primary_key_exists: - insert_query = f""" - INSERT OR REPLACE INTO {self.table_name} SELECT * FROM buffer_table - """ - else: - insert_query = f"INSERT INTO {self.table_name} SELECT * FROM buffer_table" - self.conn.execute(insert_query) - self.conn.unregister("buffer_table") - - def load_aws_credentials(self, profile: str): - logger.info(f"Loading AWS credentials for profile: {profile}") - self.conn.sql(f"CALL load_aws_credentials('{profile}');") - - def write_to_s3(self, s3_path: str, timestamp_column: str, aws_profile: str): - self.load_aws_credentials(aws_profile) - logger.info(f"Writing data to S3 {s3_path}/{self.table_name}") - self.conn.sql( - f""" - COPY ( - SELECT *, - YEAR({timestamp_column}) AS year, - MONTH({timestamp_column}) AS month - FROM {self.table_name} - ) - TO '{s3_path}/{self.table_name}' - (FORMAT PARQUET, PARTITION_BY (year, month), OVERWRITE_OR_IGNORE 1, COMPRESSION 'ZSTD', ROW_GROUP_SIZE 1000000); + return + + # Use database in MotherDuck + delete_query = f""" + DELETE FROM {self.motherduck_database}.main.{self.motherduck_table} + WHERE {timestamp_column} >= '{start_date}' AND {timestamp_column} < '{end_date}' """ - ) + self.conn.execute(delete_query) + logger.info( + f"Successfully deleted data for date range {start_date} to {end_date}" + ) + + except Exception as e: + logger.error(f"Error deleting existing data: {e}") + raise + + def _copy_to_motherduck(self, chunk_size: int) -> int: + """Copy data from temporary table to MotherDuck in chunks for optimal performance""" + try: + # Ensure target table exists in MotherDuck with the same schema + self.conn.execute(f"USE {self.motherduck_database}") + logger.info( + f"Creating table if not exists {self.motherduck_database}.main.{self.motherduck_table}" + ) + self.conn.execute(f""" + CREATE TABLE IF NOT EXISTS + {self.motherduck_database}.main.{self.motherduck_table} AS + SELECT * FROM memory.temp_table limit 0 + """) + + # Count total rows + total_rows = self.conn.execute( + "SELECT COUNT(*) FROM memory.temp_table" + ).fetchone()[0] + logger.info(f"Total rows to copy: {total_rows}") + if total_rows == 0: + logger.info("No data to copy to MotherDuck") + return 0 + + # Copy data in chunks using rowid for efficient chunking + for chunk_start in range(0, total_rows, chunk_size): + chunk_end = min(chunk_start + chunk_size - 1, total_rows - 1) + logger.info(f"Copying chunk {chunk_start} to {chunk_end}") + self.conn.execute(f""" + INSERT INTO {self.motherduck_database}.main.{self.motherduck_table} + SELECT * FROM memory.temp_table + WHERE rowid BETWEEN {chunk_start} AND {chunk_end} + """) + + chunk_num = (chunk_start // chunk_size) + 1 + total_chunks = (total_rows + chunk_size - 1) // chunk_size + logger.info( + f"Copied chunk 
{chunk_num}/{total_chunks} ({chunk_end-chunk_start+1} rows)" + ) + + return total_rows + + except Exception as e: + logger.error(f"Error copying data to MotherDuck: {e}") + raise diff --git a/ingestion/models.py b/ingestion/models.py index 2adaa13..7f43347 100644 --- a/ingestion/models.py +++ b/ingestion/models.py @@ -1,207 +1,15 @@ -from typing import Type -import pyarrow as pa -from pydantic import BaseModel, Field -from datetime import datetime -from typing import Optional, Union, List, Annotated - - -class File(BaseModel): - filename: Optional[str] - project: Optional[str] - version: Optional[str] - type: Optional[str] - - -class Libc(BaseModel): - lib: Optional[str] - version: Optional[str] - - -class Distro(BaseModel): - name: Optional[str] - version: Optional[str] - id: Optional[str] - libc: Optional[Libc] - - -class Implementation(BaseModel): - name: Optional[str] - version: Optional[str] - - -class Installer(BaseModel): - name: Optional[str] - version: Optional[str] - - -class System(BaseModel): - name: Optional[str] - release: Optional[str] - - -class Details(BaseModel): - installer: Optional[Installer] - python: Optional[str] - implementation: Optional[Implementation] - distro: Optional[Distro] - system: Optional[System] - cpu: Optional[str] - openssl_version: Optional[str] - setuptools_version: Optional[str] - rustc_version: Optional[str] - ci: Optional[bool] - - -class FileDownloads(BaseModel): - timestamp: Optional[datetime] = None - country_code: Optional[str] = None - url: Optional[str] = None - project: Optional[str] = None - file: Optional[File] = None - details: Optional[Details] = None - tls_protocol: Optional[str] = None - tls_cipher: Optional[str] = None - - @classmethod - def duckdb_schema(cls, table_name="pypi_file_downloads"): - return f""" - CREATE TABLE IF NOT EXISTS {table_name} ( - timestamp TIMESTAMP WITH TIME ZONE, - country_code VARCHAR, - url VARCHAR, - project VARCHAR, - file STRUCT("filename" VARCHAR, "project" VARCHAR, "version" VARCHAR, "type" VARCHAR), - details STRUCT("installer" STRUCT("name" VARCHAR, "version" VARCHAR), "python" VARCHAR, "implementation" STRUCT("name" VARCHAR, "version" VARCHAR), "distro" STRUCT("name" VARCHAR, "version" VARCHAR, "id" VARCHAR, "libc" STRUCT("lib" VARCHAR, "version" VARCHAR)), "system" STRUCT("name" VARCHAR, "release" VARCHAR), "cpu" VARCHAR, "openssl_version" VARCHAR, "setuptools_version" VARCHAR, "rustc_version" VARCHAR, "ci" BOOLEAN), - tls_protocol VARCHAR, - tls_cipher VARCHAR - ) - """ - - @classmethod - def pyarrow_schema(cls): - return pa.schema( - [ - pa.field("timestamp", pa.timestamp("us", tz="UTC")), - pa.field("country_code", pa.string()), - pa.field("url", pa.string()), - pa.field("project", pa.string()), - pa.field( - "file", - pa.struct( - [ - pa.field("filename", pa.string()), - pa.field("project", pa.string()), - pa.field("version", pa.string()), - pa.field("type", pa.string()), - ] - ), - ), - pa.field( - "details", - pa.struct( - [ - pa.field( - "installer", - pa.struct( - [ - pa.field("name", pa.string()), - pa.field("version", pa.string()), - ] - ), - ), - pa.field("python", pa.string()), - pa.field( - "implementation", - pa.struct( - [ - pa.field("name", pa.string()), - pa.field("version", pa.string()), - ] - ), - ), - pa.field( - "distro", - pa.struct( - [ - pa.field("name", pa.string()), - pa.field("version", pa.string()), - pa.field("id", pa.string()), - pa.field( - "libc", - pa.struct( - [ - pa.field("lib", pa.string()), - pa.field("version", pa.string()), - ] - ), - ), - ] - ), - ), - 
pa.field( - "system", - pa.struct( - [ - pa.field("name", pa.string()), - pa.field("release", pa.string()), - ] - ), - ), - pa.field("cpu", pa.string()), - pa.field("openssl_version", pa.string()), - pa.field("setuptools_version", pa.string()), - pa.field("rustc_version", pa.string()), - pa.field("ci", pa.bool_()), - ] - ), - ), - pa.field("tls_protocol", pa.string()), - pa.field("tls_cipher", pa.string()), - ] - ) +from pydantic import BaseModel +from typing import Union, List class PypiJobParameters(BaseModel): + """Parameters for PyPI data ingestion job""" + start_date: str = "2019-04-01" end_date: str = "2023-11-30" pypi_project: str = "duckdb" - table_name: str = "pypi_file_downloads" database_name: str = "duckdb_stats" + table_name: str = "pypi_file_downloads" gcp_project: str timestamp_column: str = "timestamp" - destination: Annotated[ - Union[List[str], str], Field(default=["local"]) - ] # local, s3, md - s3_path: Optional[str] - aws_profile: Optional[str] - - -class TableValidationError(Exception): - """Custom exception for Table validation errors.""" - - pass - - -def validate_table(table: pa.Table, model: Type[BaseModel]): - """ - Validates each row of a PyArrow Table against a Pydantic model. - Raises TableValidationError if any row fails validation. - - :param table: PyArrow Table to validate. - :param model: Pydantic model to validate against. - :raises: TableValidationError - """ - errors = [] - - for i in range(table.num_rows): - row = {column: table[column][i].as_py() for column in table.column_names} - try: - model(**row) - except ValidationError as e: - errors.append(f"Row {i} failed validation: {e}") - - if errors: - error_message = "\n".join(errors) - raise TableValidationError( - f"Table validation failed with the following errors:\n{error_message}" - ) + destination: Union[List[str], str] = ["local"] # local, s3, md diff --git a/ingestion/pipeline.py b/ingestion/pipeline.py index 6ae950e..6319a67 100644 --- a/ingestion/pipeline.py +++ b/ingestion/pipeline.py @@ -1,60 +1,33 @@ -from ingestion.bigquery import ( - get_bigquery_client, - get_bigquery_result, - build_pypi_query, -) +from ingestion.bigquery import build_pypi_query from datetime import datetime from loguru import logger -from ingestion.duck import ArrowTableLoadingBuffer +from ingestion.duck import MotherDuckBigQueryLoader import fire -from ingestion.models import ( - validate_table, - FileDownloads, - PypiJobParameters, -) +from ingestion.models import PypiJobParameters def main(params: PypiJobParameters): start_time = datetime.now() - # Loading data from BigQuery - pa_tbl = get_bigquery_result( - query_str=build_pypi_query(params), - bigquery_client=get_bigquery_client(project_name=params.gcp_project), - model=FileDownloads, - ) - validate_table(pa_tbl, FileDownloads) - # Loading to DuckDB - logger.info(f"Sinking data to {params.destination}") - # Determine the initial destination (local or md) - initial_destination = "local" if params.destination == "s3" else params.destination - logger.info(f"Initial data sink: {initial_destination}") - buffer = ArrowTableLoadingBuffer( - duckdb_schema=FileDownloads.duckdb_schema(params.table_name), - pyarrow_schema=FileDownloads.pyarrow_schema(), - database_name=params.database_name, - table_name=params.table_name, - dryrun=False, - destination=initial_destination, + # Initialize the MotherDuck loader + loader = MotherDuckBigQueryLoader( + motherduck_database=params.database_name, + motherduck_table=params.table_name, + project_id=params.gcp_project, ) - 
logger.info(f"Deleting existing data from {params.start_date} to {params.end_date}") - delete_query = f""" - DELETE FROM {params.database_name}.main.{params.table_name} - WHERE {params.timestamp_column} >= '{params.start_date}' AND {params.timestamp_column} < '{params.end_date}' - """ - buffer.conn.execute(delete_query) - logger.info("Existing data deleted") - buffer.insert(pa_tbl) + # Build BigQuery query + query = build_pypi_query(params) + + # Load data from BigQuery to MotherDuck + loader.load_from_bigquery_to_motherduck( + query=query, + timestamp_column=params.timestamp_column, + start_date=params.start_date, + end_date=params.end_date, + ) - # If destination is S3, write to S3 - if params.destination == "s3": - logger.info("Writing data to S3") - buffer.write_to_s3( - s3_path=params.s3_path, - timestamp_column=params.timestamp_column, - aws_profile=params.aws_profile, - ) + # Log total execution time end_time = datetime.now() elapsed = (end_time - start_time).total_seconds() logger.info( diff --git a/ingestion/tests/test_duck.py b/ingestion/tests/test_duck.py index eb118af..311bed8 100644 --- a/ingestion/tests/test_duck.py +++ b/ingestion/tests/test_duck.py @@ -1,137 +1,125 @@ -import pyarrow as pa import pytest -from ingestion.duck import ArrowTableLoadingBuffer +import duckdb +from unittest.mock import patch, MagicMock, ANY, call +from ingestion.duck import MotherDuckBigQueryLoader @pytest.fixture -def buffer_factory(): - def _create_buffer(schema, primary_key=False): - pk_clause = "PRIMARY KEY" if primary_key else "" - return ArrowTableLoadingBuffer( - duckdb_schema=f"CREATE TABLE test_table (id INTEGER {pk_clause}, name VARCHAR)", - pyarrow_schema=pa.schema([("id", pa.int64()), ("name", pa.string())]), - database_name=":memory:", - table_name="test_table", - dryrun=False, - destination="local", - chunk_size=2, # Small chunk size for testing - ) - - return _create_buffer - - -def test_insert_and_query(buffer_factory): - buffer = buffer_factory(schema="CREATE TABLE test_table (id INTEGER, name VARCHAR)") - data = pa.Table.from_pydict( - { - "id": pa.array([1, 2, 3, 4], type=pa.int64()), - "name": pa.array(["Alice", "Bob", "Charlie", "David"]), - } - ) - buffer.insert(data) - - # Check total number of rows - total_rows = buffer.conn.execute("SELECT COUNT(*) FROM test_table").fetchone()[0] - assert total_rows == 4, f"Expected 4 total rows, but found {total_rows}" - - # Verify that all data was inserted correctly - result = buffer.conn.execute( - "SELECT id, name FROM test_table ORDER BY id" - ).fetchall() - expected = [(1, "Alice"), (2, "Bob"), (3, "Charlie"), (4, "David")] - assert result == expected, "Data in the table does not match expected values" - - -def test_insert_chunks(buffer_factory): - buffer = buffer_factory(schema="CREATE TABLE test_table (id INTEGER, name VARCHAR)") - chunk1 = pa.Table.from_pydict( - { - "id": pa.array([1, 2], type=pa.int64()), - "name": pa.array(["Alice", "Bob"]), - } - ) - chunk2 = pa.Table.from_pydict( - { - "id": pa.array([3, 4], type=pa.int64()), - "name": pa.array(["Charlie", "David"]), - } - ) - - buffer.insert(chunk1) - buffer.insert(chunk2) - - # Check total number of rows - total_rows = buffer.conn.execute("SELECT COUNT(*) FROM test_table").fetchone()[0] - assert total_rows == 4, f"Expected 4 total rows, but found {total_rows}" - - # Verify that all data was inserted correctly - result = buffer.conn.execute( - "SELECT id, name FROM test_table ORDER BY id" - ).fetchall() - expected = [(1, "Alice"), (2, "Bob"), (3, "Charlie"), (4, "David")] - 
assert result == expected, "Data in the table does not match expected values" +def mock_duckdb(): + with patch('duckdb.connect') as mock_connect: + mock_conn = MagicMock() + mock_connect.return_value = mock_conn + yield mock_conn -def test_insert_large_table(buffer_factory): - buffer = buffer_factory(schema="CREATE TABLE test_table (id INTEGER, name VARCHAR)") - large_data = pa.Table.from_pydict( - { - "id": pa.array(range(1, 101), type=pa.int64()), - "name": pa.array([f"Name_{i}" for i in range(1, 101)]), - } - ) - buffer.insert(large_data) - - # Check total number of rows - total_rows = buffer.conn.execute("SELECT COUNT(*) FROM test_table").fetchone()[0] - assert total_rows == 100, f"Expected 100 total rows, but found {total_rows}" - - # Verify first and last rows - first_row = buffer.conn.execute( - "SELECT id, name FROM test_table ORDER BY id LIMIT 1" - ).fetchone() - last_row = buffer.conn.execute( - "SELECT id, name FROM test_table ORDER BY id DESC LIMIT 1" - ).fetchone() - - assert first_row == (1, "Name_1"), f"First row does not match expected: {first_row}" - assert last_row == ( - 100, - "Name_100", - ), f"Last row does not match expected: {last_row}" - - -def test_primary_key_handling(buffer_factory): - buffer = buffer_factory( - schema="CREATE TABLE test_table (id INTEGER PRIMARY KEY, name VARCHAR)", - primary_key=True, - ) - - # Insert initial data - initial_data = pa.Table.from_pydict( - { - "id": pa.array([1, 2], type=pa.int64()), - "name": pa.array(["Alice", "Bob"]), - } +@pytest.fixture +def loader(mock_duckdb): + with patch.dict('os.environ', {'motherduck_token': 'test_token', 'HOME': '/tmp'}): + loader = MotherDuckBigQueryLoader( + motherduck_database="test_db", + motherduck_table="test_table", + project_id="test_project" + ) + loader.conn = mock_duckdb + return loader + + +def test_loader_initialization(loader, mock_duckdb): + assert loader.motherduck_database == "test_db" + assert loader.motherduck_table == "test_table" + assert loader.project_id == "test_project" + + # Verify that the necessary DuckDB commands were executed + mock_duckdb.execute.assert_any_call("INSTALL bigquery FROM community;") + mock_duckdb.execute.assert_any_call("LOAD bigquery;") + mock_duckdb.execute.assert_any_call("SET bq_experimental_filter_pushdown = true") + mock_duckdb.execute.assert_any_call("SET bq_query_timeout_ms = 300000") + mock_duckdb.execute.assert_any_call("ATTACH 'project=test_project' AS bq (TYPE bigquery, READ_ONLY)") + mock_duckdb.execute.assert_any_call("ATTACH 'md:'") + mock_duckdb.execute.assert_any_call("CREATE DATABASE IF NOT EXISTS test_db") + + +def test_loader_initialization_without_token(): + with patch.dict('os.environ', {'HOME': '/tmp'}, clear=True): + with pytest.raises(ValueError, match="MotherDuck token is required"): + MotherDuckBigQueryLoader( + motherduck_database="test_db", + motherduck_table="test_table", + project_id="test_project" + ) + + +def test_load_from_bigquery_to_motherduck(loader, mock_duckdb): + # Mock the row count estimation + mock_duckdb.execute.return_value.fetchone.return_value = (1000,) + + # Test the load_from_bigquery_to_motherduck method + query = "SELECT * FROM test_table" + result = loader.load_from_bigquery_to_motherduck( + query=query, + timestamp_column="timestamp", + start_date="2023-01-01", + end_date="2023-01-31", + chunk_size=100 ) - buffer.insert(initial_data) - - # Insert data with a duplicate key - duplicate_data = pa.Table.from_pydict( - { - "id": pa.array([2, 3], type=pa.int64()), - "name": pa.array(["Updated Bob", 
"Charlie"]), - } + + assert result == 1000 + # Verify that the necessary DuckDB commands were executed + mock_duckdb.execute.assert_any_call(ANY) # The exact query might vary, so we use ANY + + +def test_delete_existing_data(loader, mock_duckdb): + # Mock table existence check + mock_duckdb.execute.return_value.fetchone.return_value = (True,) + + # Test the _delete_existing_data method + loader._delete_existing_data( + timestamp_column="timestamp", + start_date="2023-01-01", + end_date="2023-01-31" ) - buffer.insert(duplicate_data) - - # Check total number of rows (should be 3, not 4) - total_rows = buffer.conn.execute("SELECT COUNT(*) FROM test_table").fetchone()[0] - assert total_rows == 3, f"Expected 3 total rows, but found {total_rows}" - - # Verify that the duplicate key was updated - result = buffer.conn.execute( - "SELECT id, name FROM test_table ORDER BY id" - ).fetchall() - expected = [(1, "Alice"), (2, "Updated Bob"), (3, "Charlie")] - assert result == expected, "Data in the table does not match expected values" + + # Verify the delete query was executed with the correct pattern + delete_calls = [call[0][0] for call in mock_duckdb.execute.call_args_list] + assert any("DELETE FROM test_db.main.test_table" in call for call in delete_calls) + assert any("timestamp >= '2023-01-01'" in call for call in delete_calls) + assert any("timestamp < '2023-01-31'" in call for call in delete_calls) + + +def test_copy_to_motherduck(loader, mock_duckdb): + # Mock the row count + mock_duckdb.execute.return_value.fetchone.return_value = (1000,) + + # Test the _copy_to_motherduck method + result = loader._copy_to_motherduck(chunk_size=100) + + assert result == 1000 + + # Get all SQL calls made to execute + sql_calls = [str(call[0][0]) for call in mock_duckdb.execute.call_args_list] + + # Print all SQL calls for debugging + print("\nActual SQL calls:") + for i, call in enumerate(sql_calls): + print(f"{i}: {call}") + + # Verify the table creation + assert any("USE test_db" in call for call in sql_calls) + + # More flexible table creation check + create_table_calls = [call for call in sql_calls if "CREATE TABLE" in call] + assert len(create_table_calls) > 0, "No CREATE TABLE statement found" + assert any("test_db.main.test_table" in call for call in create_table_calls) + assert any("memory.temp_table" in call for call in create_table_calls) + + # Verify the data copying + insert_calls = [call for call in sql_calls if "INSERT INTO" in call] + assert len(insert_calls) > 0, "No INSERT statements found" + assert any("test_db.main.test_table" in call for call in insert_calls) + assert any("memory.temp_table" in call for call in insert_calls) + assert any("rowid BETWEEN" in call for call in insert_calls) + + # Verify chunking behavior + chunk_calls = [call for call in sql_calls if "rowid BETWEEN" in call] + assert len(chunk_calls) > 0, "No chunking calls found" + assert len(chunk_calls) == 10, f"Expected 10 chunks, got {len(chunk_calls)}" diff --git a/ingestion/tests/test_models.py b/ingestion/tests/test_models.py index 899ea09..963ea68 100644 --- a/ingestion/tests/test_models.py +++ b/ingestion/tests/test_models.py @@ -1,181 +1,47 @@ -import pandas as pd import pytest -from pydantic import BaseModel -import duckdb -from ingestion.models import ( - validate_table, - PypiJobParameters, - TableValidationError, - FileDownloads, - File, - Details, - Installer, - Implementation, - Distro, - System, - Libc, -) -from ingestion.bigquery import build_pypi_query -import pyarrow as pa +from ingestion.models import 
PypiJobParameters -class MyModel(BaseModel): - column1: int - column2: str - column3: float +def test_pypi_job_parameters_defaults(): + params = PypiJobParameters(gcp_project="test_project") + assert params.start_date == "2019-04-01" + assert params.end_date == "2023-11-30" + assert params.pypi_project == "duckdb" + assert params.database_name == "duckdb_stats" + assert params.table_name == "pypi_file_downloads" + assert params.timestamp_column == "timestamp" + assert params.destination == ["local"] -@pytest.fixture -def file_downloads_table(): - df = pd.DataFrame( - { - "timestamp": [ - pd.Timestamp("2023-01-01T12:00:00Z"), - pd.Timestamp("2023-01-02T12:00:00Z"), - ], - "country_code": ["US", "CA"], - "url": ["http://example.com/file1", "http://example.com/file2"], - "project": ["project1", "project2"], - "file": [ - File( - filename="file1.txt", project="project1", version="1.0", type="txt" - ).dict(), - File( - filename="file2.txt", project="project2", version="1.0", type="txt" - ).dict(), - ], - "details": [ - Details( - installer=Installer(name="pip", version="21.0"), - python="3.8.5", - implementation=Implementation(name="CPython", version="3.8.5"), - distro=Distro( - name="Ubuntu", - version="20.04", - id="ubuntu2004", - libc=Libc(lib="glibc", version="2.31"), - ), - system=System(name="Linux", release="5.4.0-58-generic"), - cpu="x86_64", - openssl_version="1.1.1", - setuptools_version="50.3.0", - rustc_version="1.47.0", - ci=False, - ).dict(), - Details( - installer=Installer(name="pip", version="21.0"), - python="3.8.5", - implementation=Implementation(name="CPython", version="3.8.5"), - distro=Distro( - name="Ubuntu", - version="20.04", - id="ubuntu2004", - libc=Libc(lib="glibc", version="2.31"), - ), - system=System(name="Linux", release="5.4.0-58-generic"), - cpu="x86_64", - openssl_version="1.1.1", - setuptools_version="50.3.0", - rustc_version="1.47.0", - ci=False, - ).dict(), - ], - "tls_protocol": ["TLSv1.2", "TLSv1.3"], - "tls_cipher": ["AES128-GCM-SHA256", "AES256-GCM-SHA384"], - } - ) - - return pa.Table.from_pandas(df) - - -def test_validate_table_with_valid_data(): - valid_data = { - "column1": pa.array([1, 2]), - "column2": pa.array(["a", "b"]), - "column3": pa.array([1.1, 2.2]), - } - valid_table = pa.table(valid_data) - errors = validate_table(valid_table, MyModel) - assert not errors, f"Validation errors were found in valid data: {errors}" - - -def test_validate_table_with_valid_data(): - valid_data = { - "column1": pa.array([1, 2]), - "column2": pa.array(["a", "b"]), - "column3": pa.array([1.1, 2.2]), - } - valid_table = pa.table(valid_data) - errors = validate_table(valid_table, MyModel) - assert not errors, f"Validation errors were found in valid data: {errors}" - - -def test_file_downloads_validation(file_downloads_table): - try: - validate_table(file_downloads_table, FileDownloads) - except TableValidationError as e: - pytest.fail(f"Table validation failed: {e}") - - -def test_build_pypi_query(): +def test_pypi_job_parameters_custom_values(): params = PypiJobParameters( - table_name="test_table", - s3_path="s3://bucket/path", - aws_profile="test_profile", gcp_project="test_project", - start_date="2019-04-01", - end_date="2023-11-30", - timestamp_column="timestamp", - ) - query = build_pypi_query(params) - expected_query = f""" - SELECT * - FROM - `bigquery-public-data.pypi.file_downloads` - WHERE - project = 'duckdb' - AND timestamp >= TIMESTAMP("2019-04-01") - AND timestamp < TIMESTAMP("2023-11-30") - """ - assert query.strip() == expected_query.strip() - - 
-@pytest.fixture -def file_downloads_df(): - # Set up DuckDB in-memory database - conn = duckdb.connect(database=":memory:", read_only=False) - conn.execute( - """ - CREATE TABLE tbl ( - timestamp TIMESTAMP WITH TIME ZONE, - country_code VARCHAR, - url VARCHAR, - project VARCHAR, - file STRUCT(filename VARCHAR, project VARCHAR, version VARCHAR, type VARCHAR), - details STRUCT( - installer STRUCT(name VARCHAR, version VARCHAR), - python VARCHAR, - implementation STRUCT(name VARCHAR, version VARCHAR), - distro STRUCT( - name VARCHAR, - version VARCHAR, - id VARCHAR, - libc STRUCT(lib VARCHAR, version VARCHAR) - ), - system STRUCT(name VARCHAR, release VARCHAR), - cpu VARCHAR, - openssl_version VARCHAR, - setuptools_version VARCHAR, - rustc_version VARCHAR, - ci BOOLEAN - ), - tls_protocol VARCHAR, - tls_cipher VARCHAR - ) - """ + start_date="2020-01-01", + end_date="2024-01-01", + pypi_project="test_project", + database_name="test_db", + table_name="test_table", + timestamp_column="created_at", + destination="s3" ) - - # Load data from CSV - conn.execute("COPY tbl FROM 'ingestion/tests/sample_file_downloads.csv' (HEADER)") - # Create DataFrame - return conn.execute("SELECT * FROM tbl").df() + assert params.start_date == "2020-01-01" + assert params.end_date == "2024-01-01" + assert params.pypi_project == "test_project" + assert params.database_name == "test_db" + assert params.table_name == "test_table" + assert params.timestamp_column == "created_at" + assert params.destination == "s3" + + +def test_pypi_job_parameters_destination_types(): + # Test with string destination + params_str = PypiJobParameters(gcp_project="test_project", destination="s3") + assert params_str.destination == "s3" + + # Test with list destination + params_list = PypiJobParameters(gcp_project="test_project", destination=["s3", "md"]) + assert params_list.destination == ["s3", "md"] + + # Test with default destination + params_default = PypiJobParameters(gcp_project="test_project") + assert params_default.destination == ["local"] diff --git a/pyproject.toml b/pyproject.toml index bd35519..cd446b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,16 +6,12 @@ authors = ["Mehdi Ouazza "] readme = "README.md" requires-python = "==3.12.*" dependencies = [ - "duckdb==1.2.0", + "duckdb==1.2.2", "fire>=0.5.0,<1.0.0", - "google-cloud-bigquery==3.25.0", "loguru>=0.7.2,<1.0.0", - "google-auth>=2.24.0,<3.0.0", "pydantic>=2.7.4,<3.0.0", "db-dtypes>=1.1.1,<2.0.0", - "pyarrow==17.0.0", - "google-cloud-bigquery-storage==2.26.0", - "dbt-duckdb==1.9.0", + "dbt-duckdb==1.9.3", ] [tool.pytest.ini_options] diff --git a/transform/pypi_metrics/package-lock.yml b/transform/pypi_metrics/package-lock.yml index b1c5f9f..8ed06b1 100644 --- a/transform/pypi_metrics/package-lock.yml +++ b/transform/pypi_metrics/package-lock.yml @@ -1,6 +1,6 @@ packages: -- git: https://github.com/EqualExperts/dbt-unit-testing - revision: e54329c3971c8c397f284a36de6a627832a24263 -- package: dbt-labs/dbt_utils - version: 1.1.1 -sha1_hash: 489e353aef73788c75f5b4d9348741e99d4d1bdb + - git: https://github.com/EqualExperts/dbt-unit-testing + revision: 2be719665432e73bcf3d5c50fb332709e7561bd7 + - package: dbt-labs/dbt_utils + version: 1.3.0 +sha1_hash: 0b7fb918ae209d7b023184f444071c0555ecb734 diff --git a/uv.lock b/uv.lock index 1382ba8..d5d5e3d 100644 --- a/uv.lock +++ b/uv.lock @@ -46,15 +46,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ed/20/bc79bc575ba2e2a7f70e8a1155618bb1301eaa5132a8271373a6903f73f8/babel-2.16.0-py3-none-any.whl", hash = 
"sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b", size = 9587599 }, ] -[[package]] -name = "cachetools" -version = "5.5.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c3/38/a0f315319737ecf45b4319a8cd1f3a908e29d9277b46942263292115eee7/cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a", size = 27661 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a4/07/14f8ad37f2d12a5ce41206c21820d8cb6561b728e51fad4530dff0552a67/cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292", size = 9524 }, -] - [[package]] name = "certifi" version = "2024.8.30" @@ -203,7 +194,7 @@ wheels = [ [[package]] name = "dbt-duckdb" -version = "1.9.0" +version = "1.9.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "dbt-adapters" }, @@ -211,9 +202,9 @@ dependencies = [ { name = "dbt-core" }, { name = "duckdb" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/fe/c0/a601257ce4ac4470783ac135a05be25db86a8a2f233538271e6645b3cbff/dbt_duckdb-1.9.0.tar.gz", hash = "sha256:56c82f511f96fde696ee0c86e34b784a75dfad0997b249f87faf2fc9dee7830e", size = 92955 } +sdist = { url = "https://files.pythonhosted.org/packages/aa/67/25b81b0773c13c5c15a973c4fe585f36d201b6ea4ed742bdf872a8880141/dbt_duckdb-1.9.3.tar.gz", hash = "sha256:53950de9ed9f4d3b5cf91b842054be440a940e24f17e9aa7ffaa2bf1448d65a5", size = 105788 } wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/72/fe647c056c3745c705204a8f1e2c21c21ebb1863aa1b6c95f80e76db4ad6/dbt_duckdb-1.9.0-py3-none-any.whl", hash = "sha256:f6534ad95f37f0e35bd591546cc736e85d156bc7c1509c0cb9cdcda5c50fe00c", size = 62773 }, + { url = "https://files.pythonhosted.org/packages/a8/74/6a27016687920790f45b242c67dd1c1009ba553843e7158deab32635a540/dbt_duckdb-1.9.3-py3-none-any.whl", hash = "sha256:a377b9f7370c42dacf408e7c0041e4b9c3c032f87a77c751cd34ff8c5fc3bf08", size = 70003 }, ] [[package]] @@ -273,18 +264,18 @@ wheels = [ [[package]] name = "duckdb" -version = "1.2.0" +version = "1.2.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e3/e2/5e6820ec8cc50c4ab6172debea68e2236ea6a5d9caa56297cfb42fca1fa2/duckdb-1.2.0.tar.gz", hash = "sha256:a5ce81828e6d1c3f06836d3bda38eef8355765f08ad5ce239abd6f56934dd1f8", size = 11586759 } +sdist = { url = "https://files.pythonhosted.org/packages/28/b8/0f86278684fb7a1fac7c0c869fc6d68ed005cdc91c963eb4373e0551bc0a/duckdb-1.2.2.tar.gz", hash = "sha256:1e53555dece49201df08645dbfa4510c86440339889667702f936b7d28d39e43", size = 11595514 } wheels = [ - { url = "https://files.pythonhosted.org/packages/06/0d/eb787bc4f01603db350974299eadd5439a5a4b7c93abbf2ef8570b61ae43/duckdb-1.2.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:c0427501908d3b4fe464913b0ae2418ff52d1fa24b3982d864169b1d54b6bbee", size = 15253269 }, - { url = "https://files.pythonhosted.org/packages/c6/c5/0a44aab15bee5ae5cb26d4eff62cde294ab30008e7d79c20ae3506806642/duckdb-1.2.0-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:33df2430f35e68739fd9fb6bbe1a5f86f4f46b362c9abb3a3f74a989560ef597", size = 31928134 }, - { url = "https://files.pythonhosted.org/packages/36/42/7b009c605f7e14aae2e97ae5a13ab732e48c013f66b2ef634d9eac1d106d/duckdb-1.2.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:fd8ca2910efb85f0dd0d50383eaed9b6b7e86e6cacb032c36712e84265855e58", size = 16785162 }, - { url = 
"https://files.pythonhosted.org/packages/af/37/dea4aaf505d8893873719342a69b61318188287fc63ad669031012be62f3/duckdb-1.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9747d3d2a3290c795c0343eb927dbc75ca78d0440726824c2a39d9634fba9394", size = 18708026 }, - { url = "https://files.pythonhosted.org/packages/d6/73/0f019556ea1558c3c7d376554bf77956bef9a7fef605bba1ba29b3092adb/duckdb-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91704accb74223267ae226f3470d71f7ad824549482b3f7fc91710a9fe5a1152", size = 20182548 }, - { url = "https://files.pythonhosted.org/packages/f4/96/34d14a62835983f92c0a492ad39ab28ccf79e61a71edd7c95f8f94776fb5/duckdb-1.2.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d9357737c6699b1f57e1d02b299371b2634bf08927d4e8386146ec5e4d1ebb31", size = 18378723 }, - { url = "https://files.pythonhosted.org/packages/70/6c/ddd22889033e9bb60ac216fb43c82796440491fa6d764eed34aae0496cc9/duckdb-1.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8d61ba5272dd1bf772b7a74f4964e83080602f8f6e9a46a0fa7203a4e0e05249", size = 21707163 }, - { url = "https://files.pythonhosted.org/packages/31/bc/65f19868c927285dd0ca50646861915e3e5f92a0d05e9e7d9e9faed2f16e/duckdb-1.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:f317cfa2f6ff3bc209985715669f4b8dd601faa69e46a206163e53b8db61a1d1", size = 11351204 }, + { url = "https://files.pythonhosted.org/packages/77/25/549f68e55e1b455bd2daf2e5fc912000a3139fe0395111b3d49b23a2cec1/duckdb-1.2.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f745379f44ad302560688855baaed9739c03b37a331338eda6a4ac655e4eb42f", size = 15271882 }, + { url = "https://files.pythonhosted.org/packages/f6/84/13de7bf9056dcc7a346125d9a9f0f26f76c633db6b54052738f78f828538/duckdb-1.2.2-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:087713fc5958cae5eb59097856b3deaae0def021660c8f2052ec83fa8345174a", size = 31964873 }, + { url = "https://files.pythonhosted.org/packages/0f/53/c8d2d56a801b7843ea87f8533a3634e6b38f06910098a266f8a096bd4c61/duckdb-1.2.2-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:a1f96395319c447a31b9477881bd84b4cb8323d6f86f21ceaef355d22dd90623", size = 16800653 }, + { url = "https://files.pythonhosted.org/packages/bb/36/e25791d879fb93b92a56bf481ce11949ab19109103ae2ba12d64e49355d9/duckdb-1.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6aba3bc0acf4f8d52b94f7746c3b0007b78b517676d482dc516d63f48f967baf", size = 18735524 }, + { url = "https://files.pythonhosted.org/packages/d7/46/4745aa10a1e460f4c8b473eddaffe2c783ac5280e1e5929dd84bd1a1acde/duckdb-1.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5c1556775a9ebaa49b5c8d64718f155ac3e05b34a49e9c99443cf105e8b0371", size = 20210314 }, + { url = "https://files.pythonhosted.org/packages/ff/0d/8563fc5ece36252e3d07dd3d29c7a0a034dcf62f14bed7cdc016d95adcbe/duckdb-1.2.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d625cc7d2faacfb2fc83ebbe001ae75dda175b3d8dce6a51a71c199ffac3627a", size = 18755134 }, + { url = "https://files.pythonhosted.org/packages/11/f1/b7ade7d980eee4fb3ad7469ccf23adb3668a9a28cf3989b24418392d3786/duckdb-1.2.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:73263f81545c5cb4360fbaf7b22a493e55ddf88fadbe639c43efb7bc8d7554c4", size = 22294397 }, + { url = "https://files.pythonhosted.org/packages/eb/c9/896e8ced7b408df81e015fe0c6497cda46c92d9dfc8bf84b6d13f5dad473/duckdb-1.2.2-cp312-cp312-win_amd64.whl", hash = 
"sha256:b1c0c4d737fd2ab9681e4e78b9f361e0a827916a730e84fa91e76dca451b14d5", size = 11370381 }, ] [[package]] @@ -296,156 +287,6 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/6b/b6/82c7e601d6d3c3278c40b7bd35e17e82aa227f050aa9f66cb7b7fce29471/fire-0.7.0.tar.gz", hash = "sha256:961550f07936eaf65ad1dc8360f2b2bf8408fad46abbfa4d2a3794f8d2a95cdf", size = 87189 } -[[package]] -name = "google-api-core" -version = "2.23.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-auth" }, - { name = "googleapis-common-protos" }, - { name = "proto-plus" }, - { name = "protobuf" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/fa/6b/b98553c2061c4e2186f5bbfb1aa1a6ef13fc0775c096d18595d3c99ba023/google_api_core-2.23.0.tar.gz", hash = "sha256:2ceb087315e6af43f256704b871d99326b1f12a9d6ce99beaedec99ba26a0ace", size = 160094 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/17/a4/c26886d57d90032c5f74c2e80aefdc38ec58551fc46bd4ce79fb2c9389fa/google_api_core-2.23.0-py3-none-any.whl", hash = "sha256:c20100d4c4c41070cf365f1d8ddf5365915291b5eb11b83829fbd1c999b5122f", size = 156554 }, -] - -[package.optional-dependencies] -grpc = [ - { name = "grpcio" }, - { name = "grpcio-status" }, -] - -[[package]] -name = "google-auth" -version = "2.36.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cachetools" }, - { name = "pyasn1-modules" }, - { name = "rsa" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/6a/71/4c5387d8a3e46e3526a8190ae396659484377a73b33030614dd3b28e7ded/google_auth-2.36.0.tar.gz", hash = "sha256:545e9618f2df0bcbb7dcbc45a546485b1212624716975a1ea5ae8149ce769ab1", size = 268336 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2d/9a/3d5087d27865c2f0431b942b5c4500b7d1b744dd3262fdc973a4c39d099e/google_auth-2.36.0-py2.py3-none-any.whl", hash = "sha256:51a15d47028b66fd36e5c64a82d2d57480075bccc7da37cde257fc94177a61fb", size = 209519 }, -] - -[[package]] -name = "google-cloud-bigquery" -version = "3.25.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-api-core", extra = ["grpc"] }, - { name = "google-auth" }, - { name = "google-cloud-core" }, - { name = "google-resumable-media" }, - { name = "packaging" }, - { name = "python-dateutil" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/39/07/d6f8c55f68d796a6a045cbb3c1783ed1c77ec641acbf9e6ff78b38b127a4/google-cloud-bigquery-3.25.0.tar.gz", hash = "sha256:5b2aff3205a854481117436836ae1403f11f2594e6810a98886afd57eda28509", size = 455186 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/98/2f931388614ea894640f84c1874d72d84d890c093e334a3990e363ff689e/google_cloud_bigquery-3.25.0-py2.py3-none-any.whl", hash = "sha256:7f0c371bc74d2a7fb74dacbc00ac0f90c8c2bec2289b51dd6685a275873b1ce9", size = 239012 }, -] - -[[package]] -name = "google-cloud-bigquery-storage" -version = "2.26.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-api-core", extra = ["grpc"] }, - { name = "google-auth" }, - { name = "proto-plus" }, - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b4/0f/dc6c8251f2a81a1b81ee875c3e2d9b4eb0be6eeb5e6884fc5d4d20cef622/google_cloud_bigquery_storage-2.26.0.tar.gz", hash = "sha256:840275bd0a4b207c0ac821bcc6741406ffb3b5ef940fa05089a90eb4403453fa", size = 237740 } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/0c/8f/99e7cb894e83e2a36368bb9b1a86afe9eaf43d166dcb121852fba764fb3a/google_cloud_bigquery_storage-2.26.0-py2.py3-none-any.whl", hash = "sha256:375927703afb0a057d3abcbe1a1d644a2a6852c43b864c0b5c93c18e4d3cd999", size = 239188 }, -] - -[[package]] -name = "google-cloud-core" -version = "2.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-api-core" }, - { name = "google-auth" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b8/1f/9d1e0ba6919668608570418a9a51e47070ac15aeff64261fb092d8be94c0/google-cloud-core-2.4.1.tar.gz", hash = "sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073", size = 35587 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5e/0f/2e2061e3fbcb9d535d5da3f58cc8de4947df1786fe6a1355960feb05a681/google_cloud_core-2.4.1-py2.py3-none-any.whl", hash = "sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61", size = 29233 }, -] - -[[package]] -name = "google-crc32c" -version = "1.6.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/67/72/c3298da1a3773102359c5a78f20dae8925f5ea876e37354415f68594a6fb/google_crc32c-1.6.0.tar.gz", hash = "sha256:6eceb6ad197656a1ff49ebfbbfa870678c75be4344feb35ac1edf694309413dc", size = 14472 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cf/41/65a91657d6a8123c6c12f9aac72127b6ac76dda9e2ba1834026a842eb77c/google_crc32c-1.6.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ed767bf4ba90104c1216b68111613f0d5926fb3780660ea1198fc469af410e9d", size = 30268 }, - { url = "https://files.pythonhosted.org/packages/59/d0/ee743a267c7d5c4bb8bd865f7d4c039505f1c8a4b439df047fdc17be9769/google_crc32c-1.6.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:62f6d4a29fea082ac4a3c9be5e415218255cf11684ac6ef5488eea0c9132689b", size = 30113 }, - { url = "https://files.pythonhosted.org/packages/25/53/e5e449c368dd26ade5fb2bb209e046d4309ed0623be65b13f0ce026cb520/google_crc32c-1.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c87d98c7c4a69066fd31701c4e10d178a648c2cac3452e62c6b24dc51f9fcc00", size = 32995 }, - { url = "https://files.pythonhosted.org/packages/52/12/9bf6042d5b0ac8c25afed562fb78e51b0641474097e4139e858b45de40a5/google_crc32c-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd5e7d2445d1a958c266bfa5d04c39932dc54093fa391736dbfdb0f1929c1fb3", size = 32614 }, - { url = "https://files.pythonhosted.org/packages/76/29/fc20f5ec36eac1eea0d0b2de4118c774c5f59c513f2a8630d4db6991f3e0/google_crc32c-1.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7aec8e88a3583515f9e0957fe4f5f6d8d4997e36d0f61624e70469771584c760", size = 33445 }, -] - -[[package]] -name = "google-resumable-media" -version = "2.7.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-crc32c" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/58/5a/0efdc02665dca14e0837b62c8a1a93132c264bd02054a15abb2218afe0ae/google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0", size = 2163099 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/82/35/b8d3baf8c46695858cb9d8835a53baa1eeb9906ddaf2f728a5f5b640fd1e/google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = "sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa", size = 81251 }, -] - -[[package]] -name = "googleapis-common-protos" -version = 
"1.66.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ff/a7/8e9cccdb1c49870de6faea2a2764fa23f627dd290633103540209f03524c/googleapis_common_protos-1.66.0.tar.gz", hash = "sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c", size = 114376 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a0/0f/c0713fb2b3d28af4b2fded3291df1c4d4f79a00d15c2374a9e010870016c/googleapis_common_protos-1.66.0-py2.py3-none-any.whl", hash = "sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed", size = 221682 }, -] - -[[package]] -name = "grpcio" -version = "1.68.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d5/da/132615afbfc722df4bba963844843a205aa298fd5f9a03fa2995e8dddf11/grpcio-1.68.0.tar.gz", hash = "sha256:7e7483d39b4a4fddb9906671e9ea21aaad4f031cdfc349fec76bdfa1e404543a", size = 12682655 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/30/66/79508e13feee4182e6f2ea260ad4eea96b8b396bbf81334660142a6eecab/grpcio-1.68.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:8af6137cc4ae8e421690d276e7627cfc726d4293f6607acf9ea7260bd8fc3d7d", size = 5147575 }, - { url = "https://files.pythonhosted.org/packages/41/8d/19ffe12a736f57e9860bad506c0e711dd3c9c7c9f06030cfd87fa3eb6b45/grpcio-1.68.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:4028b8e9a3bff6f377698587d642e24bd221810c06579a18420a17688e421af7", size = 11126767 }, - { url = "https://files.pythonhosted.org/packages/9c/c6/9aa8178d0fa3c893531a3ef38fa65a0e9997047ded9a8a20e3aa5706f923/grpcio-1.68.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:f60fa2adf281fd73ae3a50677572521edca34ba373a45b457b5ebe87c2d01e1d", size = 5644649 }, - { url = "https://files.pythonhosted.org/packages/36/91/e2c451a103b8b595d3e3725fc78c76242d38a96cfe22dd9a47c31faba99d/grpcio-1.68.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e18589e747c1e70b60fab6767ff99b2d0c359ea1db8a2cb524477f93cdbedf5b", size = 6292623 }, - { url = "https://files.pythonhosted.org/packages/0b/5f/cbb2c0dfb3f7b893b30d6daca0a7829067f302c55f20b9c470111f48e6e3/grpcio-1.68.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0d30f3fee9372796f54d3100b31ee70972eaadcc87314be369360248a3dcffe", size = 5905873 }, - { url = "https://files.pythonhosted.org/packages/9d/37/ddc32a46baccac6a0a3cdcabd6908d23dfa526f061a1b81211fe029489c7/grpcio-1.68.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:7e0a3e72c0e9a1acab77bef14a73a416630b7fd2cbd893c0a873edc47c42c8cd", size = 6630863 }, - { url = "https://files.pythonhosted.org/packages/45/69/4f74f67ae33be4422bd20050e09ad8b5318f8827a7eb153507de8fb78aef/grpcio-1.68.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a831dcc343440969aaa812004685ed322cdb526cd197112d0db303b0da1e8659", size = 6200368 }, - { url = "https://files.pythonhosted.org/packages/91/e9/25e51915cd972e8c66daf29644e653135f967d7411eccd2651fa347a6337/grpcio-1.68.0-cp312-cp312-win32.whl", hash = "sha256:5a180328e92b9a0050958ced34dddcb86fec5a8b332f5a229e353dafc16cd332", size = 3637786 }, - { url = "https://files.pythonhosted.org/packages/e2/1d/b1250907a727f08de6508d752f367e4b46d113d4eac9eb919ebd9da6a5d6/grpcio-1.68.0-cp312-cp312-win_amd64.whl", hash = "sha256:2bddd04a790b69f7a7385f6a112f46ea0b34c4746f361ebafe9ca0be567c78e9", size = 4390622 }, -] - -[[package]] -name = "grpcio-status" -version = "1.68.0" -source = { registry = 
"https://pypi.org/simple" } -dependencies = [ - { name = "googleapis-common-protos" }, - { name = "grpcio" }, - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/70/95/500b24fc02a98ea16097e978958dd7ab9e16db9316a3f0bdc88d5ff239df/grpcio_status-1.68.0.tar.gz", hash = "sha256:8369823de22ab6a2cddb3804669c149ae7a71819e127c2dca7c2322028d52bea", size = 13666 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/40/ba/dc535631a9dffa421b327ebfc961911af54c396aa5324efd122a94f72464/grpcio_status-1.68.0-py3-none-any.whl", hash = "sha256:0a71b15d989f02df803b4ba85c5bf1f43aeaa58ac021e5f9974b8cadc41f784d", size = 14427 }, -] - [[package]] name = "idna" version = "3.10" @@ -721,18 +562,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, ] -[[package]] -name = "proto-plus" -version = "1.25.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7e/05/74417b2061e1bf1b82776037cad97094228fa1c1b6e82d08a78d3fb6ddb6/proto_plus-1.25.0.tar.gz", hash = "sha256:fbb17f57f7bd05a68b7707e745e26528b0b3c34e378db91eef93912c54982d91", size = 56124 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dd/25/0b7cc838ae3d76d46539020ec39fc92bfc9acc29367e58fe912702c2a79e/proto_plus-1.25.0-py3-none-any.whl", hash = "sha256:c91fc4a65074ade8e458e95ef8bac34d4008daa7cce4a12d6707066fca648961", size = 50126 }, -] - [[package]] name = "protobuf" version = "5.29.0" @@ -765,27 +594,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ae/49/baafe2a964f663413be3bd1cf5c45ed98c5e42e804e2328e18f4570027c1/pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7", size = 25099235 }, ] -[[package]] -name = "pyasn1" -version = "0.6.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ba/e9/01f1a64245b89f039897cb0130016d79f77d52669aae6ee7b159a6c4c018/pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034", size = 145322 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629", size = 83135 }, -] - -[[package]] -name = "pyasn1-modules" -version = "0.4.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyasn1" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1d/67/6afbf0d507f73c32d21084a79946bfcfca5fbc62a72057e9c23797a737c9/pyasn1_modules-0.4.1.tar.gz", hash = "sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c", size = 310028 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/77/89/bc88a6711935ba795a679ea6ebee07e128050d6382eaa35a0a47c8032bdc/pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd", size = 181537 }, -] - [[package]] name = "pydantic" version = "2.10.2" @@ -834,11 +642,7 @@ dependencies = [ { name = "dbt-duckdb" }, { name = "duckdb" }, { name = "fire" }, - { name = "google-auth" }, - { name = "google-cloud-bigquery" }, - { name = "google-cloud-bigquery-storage" }, { 
name = "loguru" }, - { name = "pyarrow" }, { name = "pydantic" }, ] @@ -851,14 +655,10 @@ dev = [ [package.metadata] requires-dist = [ { name = "db-dtypes", specifier = ">=1.1.1,<2.0.0" }, - { name = "dbt-duckdb", specifier = "==1.9.0" }, - { name = "duckdb", specifier = "==1.2.0" }, + { name = "dbt-duckdb", specifier = "==1.9.3" }, + { name = "duckdb", specifier = "==1.2.2" }, { name = "fire", specifier = ">=0.5.0,<1.0.0" }, - { name = "google-auth", specifier = ">=2.24.0,<3.0.0" }, - { name = "google-cloud-bigquery", specifier = "==3.25.0" }, - { name = "google-cloud-bigquery-storage", specifier = "==2.26.0" }, { name = "loguru", specifier = ">=0.7.2,<1.0.0" }, - { name = "pyarrow", specifier = "==17.0.0" }, { name = "pydantic", specifier = ">=2.7.4,<3.0.0" }, ] @@ -991,18 +791,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e6/08/7a186847dd78881a781d2be9b42c8e49c3261c0f4a6d0289ba9a1e4cde71/rpds_py-0.21.0-cp312-none-win_amd64.whl", hash = "sha256:e860f065cc4ea6f256d6f411aba4b1251255366e48e972f8a347cf88077b24fd", size = 220735 }, ] -[[package]] -name = "rsa" -version = "4.9" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyasn1" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/aa/65/7d973b89c4d2351d7fb232c2e452547ddfa243e93131e7cfa766da627b52/rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21", size = 29711 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/49/97/fa78e3d2f65c02c8e1268b9aba606569fe97f6c8f7c2d74394553347c145/rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7", size = 34315 }, -] - [[package]] name = "ruff" version = "0.8.1"