From db85ead0df5ca593890a216e41703ec14a248e82 Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Tue, 4 Nov 2025 17:45:32 +0000 Subject: [PATCH 01/14] feat: Add support for Python 3.14 --- .github/sync-repo-settings.yaml | 39 ------------------- .github/workflows/lint.yml | 2 +- .github/workflows/unittest.yml | 4 +- .../{system-3.13.cfg => system-3.14.cfg} | 2 +- CONTRIBUTING.rst | 10 +++-- noxfile.py | 8 ++-- owlbot.py | 4 +- setup.py | 5 ++- testing/constraints-3.14.txt | 0 9 files changed, 20 insertions(+), 54 deletions(-) delete mode 100644 .github/sync-repo-settings.yaml rename .kokoro/presubmit/{system-3.13.cfg => system-3.14.cfg} (84%) create mode 100644 testing/constraints-3.14.txt diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml deleted file mode 100644 index 130dd1fe..00000000 --- a/.github/sync-repo-settings.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# https://github.com/googleapis/repo-automation-bots/tree/main/packages/sync-repo-settings -# Rules for main branch protection -branchProtectionRules: -# Identifies the protection rule pattern. Name of the branch to be protected. -# Defaults to `main` -- pattern: main - requiresCodeOwnerReviews: true - requiresStrictStatusChecks: true - requiredStatusCheckContexts: - - 'cla/google' - - 'OwlBot Post Processor' - - 'docs' - - 'lint' - - 'unit (3.9)' - - 'unit (3.10)' - - 'unit (3.11)' - - 'unit (3.12)' - - 'unit (3.13)' - - 'cover' - - 'Kokoro' - - 'Samples - Lint' - - 'Samples - Python 3.9' - - 'Samples - Python 3.10' - - 'Samples - Python 3.11' - - 'Samples - Python 3.12' - - 'Samples - Python 3.13' -permissionRules: - - team: actools-python - permission: admin - - team: actools - permission: admin - - team: api-bigquery - permission: push - - team: yoshi-python - permission: push - - team: python-samples-owners - permission: push - - team: python-samples-reviewers - permission: push diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 1051da0b..3ed755f0 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -12,7 +12,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: "3.10" + python-version: "3.14" - name: Install nox run: | python -m pip install --upgrade setuptools pip wheel diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 7137d0ad..3a22f126 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - python: ['3.9', '3.10', '3.11', '3.12', '3.13'] + python: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14'] steps: - name: Checkout uses: actions/checkout@v4 @@ -45,7 +45,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: "3.10" + python-version: "3.14" - name: Install coverage run: | python -m pip install --upgrade setuptools pip wheel diff --git a/.kokoro/presubmit/system-3.13.cfg b/.kokoro/presubmit/system-3.14.cfg similarity index 84% rename from .kokoro/presubmit/system-3.13.cfg rename to .kokoro/presubmit/system-3.14.cfg index 3ec53cf9..ac911a3d 100644 --- a/.kokoro/presubmit/system-3.13.cfg +++ b/.kokoro/presubmit/system-3.14.cfg @@ -3,5 +3,5 @@ # Only run the following session(s) env_vars: { key: "NOX_SESSION" - value: "system-3.13" + value: "system-3.14" } diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 2e8e9860..c7d230de 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -22,7 +22,7 @@ In order to add a feature: documentation. 
- The feature must work fully on the following CPython versions:
-  3.9, 3.10, 3.11, 3.12 and 3.13 on both UNIX and Windows.
+  3.9, 3.10, 3.11, 3.12, 3.13 and 3.14 on both UNIX and Windows.
 
 - The feature must not add unnecessary dependencies (where
   "unnecessary" is of course subjective, but new dependencies should
@@ -72,7 +72,7 @@ We use `nox <https://nox.thea.codes/en/latest/>`__ to instrument our tests.
 
 - To run a single unit test::
 
-      $ nox -s unit-3.13 -- -k <name of test>
+      $ nox -s unit-3.14 -- -k <name of test>
 
 
 .. note::
@@ -143,12 +143,12 @@ Running System Tests
 
    $ nox -s system
 
   # Run a single system test
-  $ nox -s system-3.13 -- -k <name of test>
+  $ nox -s system-3.14 -- -k <name of test>
 
 
 .. note::
 
-    System tests are only configured to run under Python 3.9, 3.10, 3.11, 3.12 and 3.13.
+    System tests are only configured to run under Python 3.9, 3.10, 3.11, 3.12, 3.13 and 3.14.
 
     For expediency, we do not run them in older versions of Python 3.
 
 This alone will not run the tests. You'll need to change some local
@@ -226,12 +226,14 @@ We support:
 
 -  `Python 3.11`_
 -  `Python 3.12`_
 -  `Python 3.13`_
+-  `Python 3.14`_
 
 .. _Python 3.9: https://docs.python.org/3.9/
 .. _Python 3.10: https://docs.python.org/3.10/
 .. _Python 3.11: https://docs.python.org/3.11/
 .. _Python 3.12: https://docs.python.org/3.12/
 .. _Python 3.13: https://docs.python.org/3.13/
+.. _Python 3.14: https://docs.python.org/3.14/
 
 Supported versions can be found in our ``noxfile.py`` `config`_.
 
diff --git a/noxfile.py b/noxfile.py
index e2e9f723..175eddd2 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -32,9 +32,9 @@
 ISORT_VERSION = "isort==5.10.1"
 LINT_PATHS = ["docs", "pandas_gbq", "tests", "noxfile.py", "setup.py"]
 
-DEFAULT_PYTHON_VERSION = "3.10"
+DEFAULT_PYTHON_VERSION = "3.14"
 
-UNIT_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13"]
+UNIT_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]
 UNIT_TEST_STANDARD_DEPENDENCIES = [
     "mock",
     "asyncmock",
@@ -56,7 +56,7 @@
     "3.9": [],
 }
 
-SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13"]
+SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]
 SYSTEM_TEST_STANDARD_DEPENDENCIES = [
     "mock",
     "pytest",
@@ -161,7 +161,7 @@ def format(session):
 @_calculate_duration
 def lint_setup_py(session):
     """Verify that setup.py is valid (including RST check)."""
-    session.install("docutils", "pygments")
+    session.install("docutils", "pygments", "setuptools")
     session.run("python", "setup.py", "check", "--restructuredtext", "--strict")
 
 
diff --git a/owlbot.py b/owlbot.py
index cde35a98..fd07d9a8 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -35,8 +35,8 @@
 extras = ["tqdm", "geopandas"]
 templated_files = common.py_library(
     default_python_version="3.10",
-    unit_test_python_versions=["3.9", "3.10", "3.11", "3.12", "3.13"],
-    system_test_python_versions=["3.9", "3.10", "3.11", "3.12", "3.13"],
+    unit_test_python_versions=["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"],
+    system_test_python_versions=["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"],
     cov_level=96,
     unit_test_external_dependencies=["freezegun"],
     unit_test_extras=extras,
diff --git a/setup.py b/setup.py
index 893d801b..e2a6fd5e 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,9 @@
     "db-dtypes >=1.0.4,<2.0.0",
     "numpy >=1.18.1",
     "pandas >=1.1.4",
-    "pyarrow >=4.0.0",
+    "pyarrow >= 4.0.0",
+    # See https://arrow.apache.org/release/22.0.0.html
+    "pyarrow >= 22.0.0; python_version >= '3.14'",
     "pydata-google-auth >=1.5.0",
     # Note: google-api-core and google-auth are also included via transitive
     # dependency on google-cloud-bigquery, but this library also uses them
@@ -90,6 +92,7 @@
         "Programming Language :: Python :: 
3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Operating System :: OS Independent", "Topic :: Internet", "Topic :: Scientific/Engineering", diff --git a/testing/constraints-3.14.txt b/testing/constraints-3.14.txt new file mode 100644 index 00000000..e69de29b From 1195bc3dcb758d94d244f26fa1e82428c108e535 Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Tue, 4 Nov 2025 15:05:14 -0500 Subject: [PATCH 02/14] chore(librarian): onboard to librarian (#977) Towards https://github.com/googleapis/librarian/issues/2456 --- .github/.OwlBot.lock.yaml | 17 ----------------- .github/.OwlBot.yaml | 19 ------------------- .github/release-please.yml | 2 -- .github/release-trigger.yml | 2 -- .librarian/state.yaml | 10 ++++++++++ CHANGELOG.md | 4 ++++ 6 files changed, 14 insertions(+), 40 deletions(-) delete mode 100644 .github/.OwlBot.lock.yaml delete mode 100644 .github/.OwlBot.yaml delete mode 100644 .github/release-please.yml delete mode 100644 .github/release-trigger.yml create mode 100644 .librarian/state.yaml diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml deleted file mode 100644 index 0ba69903..00000000 --- a/.github/.OwlBot.lock.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -docker: - image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:fbbc8db67afd8b7d71bf694c5081a32da0c528eba166fbcffb3b6e56ddf907d5 -# created: 2025-10-30T00:16:55.473963098Z diff --git a/.github/.OwlBot.yaml b/.github/.OwlBot.yaml deleted file mode 100644 index 33779d65..00000000 --- a/.github/.OwlBot.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -docker: - image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - -begin-after-commit-hash: 1afeb53252641dc35a421fa5acc59e2f3229ad6d - diff --git a/.github/release-please.yml b/.github/release-please.yml deleted file mode 100644 index 466597e5..00000000 --- a/.github/release-please.yml +++ /dev/null @@ -1,2 +0,0 @@ -releaseType: python -handleGHRelease: true diff --git a/.github/release-trigger.yml b/.github/release-trigger.yml deleted file mode 100644 index 6601e150..00000000 --- a/.github/release-trigger.yml +++ /dev/null @@ -1,2 +0,0 @@ -enabled: true -multiScmName: python-bigquery-pandas diff --git a/.librarian/state.yaml b/.librarian/state.yaml new file mode 100644 index 00000000..3ee4d862 --- /dev/null +++ b/.librarian/state.yaml @@ -0,0 +1,10 @@ +image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator:latest +libraries: + - id: pandas-gbq + version: 0.30.0 + apis: [] + source_roots: + - . + preserve_regex: [] + remove_regex: [] + tag_format: v{version} diff --git a/CHANGELOG.md b/CHANGELOG.md index 7549689f..afbf8c03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +[PyPI History][1] + +[1]: https://pypi.org/project/pandas-gbq/#history + ## [0.30.0](https://github.com/googleapis/python-bigquery-pandas/compare/v0.29.2...v0.30.0) (2025-10-31) From 05caa72d36367ac67e056907c8a56f5f4c28e7dc Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Tue, 11 Nov 2025 15:48:01 -0500 Subject: [PATCH 03/14] chore(librarian): update image sha and clean up unused files (#980) This PR updates the SHA for the librarian language image to match the one [here](https://github.com/googleapis/google-cloud-python/blob/2feb74032fd9c5cc7eaf6072ab03e9e8397bd434/.librarian/state.yaml#L1C1-L1C170). Owlbot related files are also cleaned up --------- Co-authored-by: ohmayr --- .github/auto-approve.yml | 3 -- .librarian/state.yaml | 2 +- owlbot.py | 95 ---------------------------------------- 3 files changed, 1 insertion(+), 99 deletions(-) delete mode 100644 .github/auto-approve.yml delete mode 100644 owlbot.py diff --git a/.github/auto-approve.yml b/.github/auto-approve.yml deleted file mode 100644 index 311ebbb8..00000000 --- a/.github/auto-approve.yml +++ /dev/null @@ -1,3 +0,0 @@ -# https://github.com/googleapis/repo-automation-bots/tree/main/packages/auto-approve -processes: - - "OwlBotTemplateChanges" diff --git a/.librarian/state.yaml b/.librarian/state.yaml index 3ee4d862..9a5bf347 100644 --- a/.librarian/state.yaml +++ b/.librarian/state.yaml @@ -1,4 +1,4 @@ -image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator:latest +image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:c8612d3fffb3f6a32353b2d1abd16b61e87811866f7ec9d65b59b02eb452a620 libraries: - id: pandas-gbq version: 0.30.0 diff --git a/owlbot.py b/owlbot.py deleted file mode 100644 index fd07d9a8..00000000 --- a/owlbot.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""This script is used to synthesize generated parts of this library.""" - -import pathlib - -import synthtool as s -from synthtool import gcp -from synthtool.languages import python - -REPO_ROOT = pathlib.Path(__file__).parent.absolute() - -common = gcp.CommonTemplates() - -# ---------------------------------------------------------------------------- -# Add templated files -# ---------------------------------------------------------------------------- - -extras_by_python = { - # Use a middle version of Python to test when no extras are installed. - "3.9": [] -} -extras = ["tqdm", "geopandas"] -templated_files = common.py_library( - default_python_version="3.10", - unit_test_python_versions=["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"], - system_test_python_versions=["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"], - cov_level=96, - unit_test_external_dependencies=["freezegun"], - unit_test_extras=extras, - unit_test_extras_by_python=extras_by_python, - system_test_extras=extras, - intersphinx_dependencies={ - "pandas": "https://pandas.pydata.org/pandas-docs/stable/", - "pydata-google-auth": "https://pydata-google-auth.readthedocs.io/en/latest/", - }, -) -s.move( - templated_files, - excludes=[ - # pandas-gbq was originally licensed BSD-3-Clause License - "LICENSE", - # Multi-processing note isn't relevant, as pandas_gbq is responsible for - # creating clients, not the end user. - "docs/multiprocessing.rst", - "noxfile.py", - "README.rst", - ".github/workflows/docs.yml", # to avoid overwriting python version - ".github/workflows/lint.yml", # to avoid overwriting python version - ".github/sync-repo-settings.yaml", - # exclude this file as we have an alternate prerelease.cfg - ".kokoro/presubmit/prerelease-deps.cfg", - ".kokoro/presubmit/presubmit.cfg", - "renovate.json", # to avoid overwriting the ignorePaths list additions: - # ".github/workflows/docs.yml AND lint.yml" specifically - # the version of python referenced in each of those files. - # Currently renovate bot wants to change 3.10 to 3.13. - ], -) - -# ---------------------------------------------------------------------------- -# Fixup files -# ---------------------------------------------------------------------------- - -s.replace( - [".github/header-checker-lint.yml"], - '"Google LLC"', - '"pandas-gbq Authors"', -) - -# ---------------------------------------------------------------------------- -# Samples templates -# ---------------------------------------------------------------------------- - -python.py_samples(skip_readmes=True) - -# ---------------------------------------------------------------------------- -# Final cleanup -# ---------------------------------------------------------------------------- - -s.shell.run(["nox", "-s", "format"], hide_output=False) -for noxfile in REPO_ROOT.glob("samples/**/noxfile.py"): - s.shell.run(["nox", "-s", "format"], cwd=noxfile.parent, hide_output=False) From 99c9082fd6245300ec5c699cbfd35c6a03c9e381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 17 Nov 2025 12:37:21 -0600 Subject: [PATCH 04/14] feat: add pandas_gbq.sample (#983) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-pandas/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes # 🦕
---
 pandas_gbq/__init__.py         |   5 +
 pandas_gbq/constants.py        |   7 +
 pandas_gbq/core/read.py        | 179 +++++++++++++++++
 pandas_gbq/core/sample.py      | 323 ++++++++++++++++++++++++++++++
 pandas_gbq/exceptions.py       |  21 ++
 pandas_gbq/gbq.py              |   7 +-
 pandas_gbq/gbq_connector.py    | 185 ++----------------
 setup.py                       |   7 +-
 testing/constraints-3.9.txt    |   7 +-
 tests/system/conftest.py       |  23 +++
 tests/system/test_sample.py    |  97 +++++++++
 tests/unit/test_core_sample.py | 348 +++++++++++++++++++++++++++++++++
 tests/unit/test_gbq.py         |   3 +-
 tests/unit/test_to_gbq.py      |  10 +-
 14 files changed, 1035 insertions(+), 187 deletions(-)
 create mode 100644 pandas_gbq/core/read.py
 create mode 100644 pandas_gbq/core/sample.py
 create mode 100644 tests/system/test_sample.py
 create mode 100644 tests/unit/test_core_sample.py

diff --git a/pandas_gbq/__init__.py b/pandas_gbq/__init__.py
index a842c81f..0c243869 100644
--- a/pandas_gbq/__init__.py
+++ b/pandas_gbq/__init__.py
@@ -2,10 +2,12 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
+import logging
 import warnings
 
 from pandas_gbq import version as pandas_gbq_version
 from pandas_gbq.contexts import Context, context
+from pandas_gbq.core.sample import sample
 
 from . import _versions_helpers
 from .gbq import read_gbq, to_gbq  # noqa
@@ -21,6 +23,8 @@
     FutureWarning,
 )
 
+logger = logging.getLogger(__name__)
+
 __version__ = pandas_gbq_version.__version__
 
 __all__ = [
@@ -29,4 +33,5 @@
     "read_gbq",
     "Context",
     "context",
+    "sample",
 ]
diff --git a/pandas_gbq/constants.py b/pandas_gbq/constants.py
index 37266b3c..498b03b5 100644
--- a/pandas_gbq/constants.py
+++ b/pandas_gbq/constants.py
@@ -2,6 +2,8 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
+import google.api_core.exceptions
+
 # BigQuery uses powers of 2 in calculating data sizes. See:
 # https://cloud.google.com/bigquery/pricing#data The documentation uses
 # GiB rather than GB to disambiguate from the alternative base 10 units.
@@ -10,3 +12,8 @@
 BYTES_IN_MIB = 1024 * BYTES_IN_KIB
 BYTES_IN_GIB = 1024 * BYTES_IN_MIB
 BYTES_TO_RECOMMEND_BIGFRAMES = BYTES_IN_GIB
+
+HTTP_ERRORS = (
+    google.api_core.exceptions.ClientError,
+    google.api_core.exceptions.GoogleAPIError,
+)
diff --git a/pandas_gbq/core/read.py b/pandas_gbq/core/read.py
new file mode 100644
index 00000000..bc089002
--- /dev/null
+++ b/pandas_gbq/core/read.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
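+
+# This module holds the DataFrame download helpers shared by
+# GbqConnector._download_results and pandas_gbq.core.sample.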
+ +from __future__ import annotations + +import typing +from typing import Any, Dict, Optional, Sequence +import warnings + +import google.cloud.bigquery +import google.cloud.bigquery.table +import numpy as np + +import pandas_gbq +import pandas_gbq.constants +import pandas_gbq.exceptions +import pandas_gbq.features +import pandas_gbq.timestamp + +# Only import at module-level at type checking time to avoid circular +# dependencies in the pandas package, which has an optional dependency on +# pandas-gbq. +if typing.TYPE_CHECKING: # pragma: NO COVER + import pandas + + +def _bqschema_to_nullsafe_dtypes(schema_fields): + """Specify explicit dtypes based on BigQuery schema. + + This function only specifies a dtype when the dtype allows nulls. + Otherwise, use pandas's default dtype choice. + + See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html + #missing-data-casting-rules-and-indexing + """ + import db_dtypes + + # If you update this mapping, also update the table at + # `docs/reading.rst`. + dtype_map = { + "FLOAT": np.dtype(float), + "INTEGER": "Int64", + "TIME": db_dtypes.TimeDtype(), + # Note: Other types such as 'datetime64[ns]' and db_types.DateDtype() + # are not included because the pandas range does not align with the + # BigQuery range. We need to attempt a conversion to those types and + # fall back to 'object' when there are out-of-range values. + } + + # Amend dtype_map with newer extension types if pandas version allows. + if pandas_gbq.features.FEATURES.pandas_has_boolean_dtype: + dtype_map["BOOLEAN"] = "boolean" + + dtypes = {} + for field in schema_fields: + name = str(field["name"]) + # Array BigQuery type is represented as an object column containing + # list objects. + if field["mode"].upper() == "REPEATED": + dtypes[name] = "object" + continue + + dtype = dtype_map.get(field["type"].upper()) + if dtype: + dtypes[name] = dtype + + return dtypes + + +def _finalize_dtypes( + df: pandas.DataFrame, schema_fields: Sequence[Dict[str, Any]] +) -> pandas.DataFrame: + """ + Attempt to change the dtypes of those columns that don't map exactly. + + For example db_dtypes.DateDtype() and datetime64[ns] cannot represent + 0001-01-01, but they can represent dates within a couple hundred years of + 1970. See: + https://github.com/googleapis/python-bigquery-pandas/issues/365 + """ + import db_dtypes + import pandas.api.types + + # If you update this mapping, also update the table at + # `docs/reading.rst`. + dtype_map = { + "DATE": db_dtypes.DateDtype(), + "DATETIME": "datetime64[ns]", + "TIMESTAMP": "datetime64[ns]", + } + + for field in schema_fields: + # This method doesn't modify ARRAY/REPEATED columns. + if field["mode"].upper() == "REPEATED": + continue + + name = str(field["name"]) + dtype = dtype_map.get(field["type"].upper()) + + # Avoid deprecated conversion to timezone-naive dtype by only casting + # object dtypes. + if dtype and pandas.api.types.is_object_dtype(df[name]): + df[name] = df[name].astype(dtype, errors="ignore") + + # Ensure any TIMESTAMP columns are tz-aware. + df = pandas_gbq.timestamp.localize_df(df, schema_fields) + + return df + + +def download_results( + results: google.cloud.bigquery.table.RowIterator, + *, + bqclient: google.cloud.bigquery.Client, + progress_bar_type: Optional[str], + warn_on_large_results: bool = True, + max_results: Optional[int], + user_dtypes: Optional[dict], + use_bqstorage_api: bool, +) -> Optional[pandas.DataFrame]: + # No results are desired, so don't bother downloading anything. 
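+    # (A max_results of None means "no cap"; only an explicit 0 skips the
+    # download entirely and returns None.)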
+ if max_results == 0: + return None + + if user_dtypes is None: + user_dtypes = {} + + create_bqstorage_client = use_bqstorage_api + if max_results is not None: + create_bqstorage_client = False + + # If we're downloading a large table, BigQuery DataFrames might be a + # better fit. Not all code paths will populate rows_iter._table, but + # if it's not populated that means we are working with a small result + # set. + if ( + warn_on_large_results + and (table_ref := getattr(results, "_table", None)) is not None + ): + table = bqclient.get_table(table_ref) + if ( + isinstance((num_bytes := table.num_bytes), int) + and num_bytes > pandas_gbq.constants.BYTES_TO_RECOMMEND_BIGFRAMES + ): + num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB + warnings.warn( + f"Recommendation: Your results are {num_gib:.1f} GiB. " + "Consider using BigQuery DataFrames (https://bit.ly/bigframes-intro)" + "to process large results with pandas compatible APIs with transparent SQL " + "pushdown to BigQuery engine. This provides an opportunity to save on costs " + "and improve performance. " + "Please reach out to bigframes-feedback@google.com with any " + "questions or concerns. To disable this message, run " + "warnings.simplefilter('ignore', category=pandas_gbq.exceptions.LargeResultsWarning)", + category=pandas_gbq.exceptions.LargeResultsWarning, + # user's code + # -> read_gbq + # -> run_query + # -> download_results + stacklevel=4, + ) + + try: + schema_fields = [field.to_api_repr() for field in results.schema] + conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields) + conversion_dtypes.update(user_dtypes) + df = results.to_dataframe( + dtypes=conversion_dtypes, + progress_bar_type=progress_bar_type, + create_bqstorage_client=create_bqstorage_client, + ) + except pandas_gbq.constants.HTTP_ERRORS as ex: + raise pandas_gbq.exceptions.translate_exception(ex) from ex + + df = _finalize_dtypes(df, schema_fields) + + pandas_gbq.logger.debug("Got {} rows.\n".format(results.total_rows)) + return df diff --git a/pandas_gbq/core/sample.py b/pandas_gbq/core/sample.py new file mode 100644 index 00000000..49eee4b5 --- /dev/null +++ b/pandas_gbq/core/sample.py @@ -0,0 +1,323 @@ +# Copyright (c) 2025 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +import typing +from typing import Optional, Sequence, cast + +import google.cloud.bigquery +import google.cloud.bigquery.table +import google.oauth2.credentials +import psutil + +import pandas_gbq.constants +import pandas_gbq.core.read +import pandas_gbq.gbq_connector + +# Only import at module-level at type checking time to avoid circular +# dependencies in the pandas package, which has an optional dependency on +# pandas-gbq. +if typing.TYPE_CHECKING: # pragma: NO COVER + import pandas + + +_READ_API_ELIGIBLE_TYPES = ("TABLE", "MATERIALIZED_VIEW", "EXTERNAL") +_TABLESAMPLE_ELIGIBLE_TYPES = ("TABLE", "EXTERNAL") + +# Base logical sizes for non-complex and non-variable types. 
+# https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes
+_TYPE_SIZES = {
+    # Fixed size types
+    "BOOL": 1,
+    "DATE": 8,
+    "DATETIME": 8,
+    "FLOAT64": 8,
+    "INT64": 8,
+    "TIME": 8,
+    "TIMESTAMP": 8,
+    "INTERVAL": 16,
+    "NUMERIC": 16,
+    "RANGE": 16,
+    "BIGNUMERIC": 32,
+    # Variable types with a fixed-size assumption
+    "STRING": pandas_gbq.constants.BYTES_IN_KIB,
+    "JSON": pandas_gbq.constants.BYTES_IN_KIB,
+    "BYTES": pandas_gbq.constants.BYTES_IN_MIB,
+    # Formula: 16 logical bytes + 24 logical bytes * num_vertices
+    # Assuming a small, fixed number of vertices (e.g., 5) for estimation:
+    "GEOGRAPHY": 16 + (24 * 5),
+}
+# TODO(tswast): Choose an estimate based on actual BigQuery stats.
+_ARRAY_LENGTH_ESTIMATE = 5
+_UNKNOWN_TYPE_SIZE_ESTIMATE = 4
+_MAX_ROW_BYTES = 100 * pandas_gbq.constants.BYTES_IN_MIB
+_MAX_AUTO_TARGET_BYTES = 1 * pandas_gbq.constants.BYTES_IN_GIB
+
+
+def _calculate_target_bytes(target_mb: Optional[int]) -> int:
+    if target_mb is not None:
+        return target_mb * pandas_gbq.constants.BYTES_IN_MIB
+
+    mem = psutil.virtual_memory()
+    return min(_MAX_AUTO_TARGET_BYTES, max(_MAX_ROW_BYTES, mem.available // 4))
+
+
+def _estimate_limit(
+    *,
+    target_bytes: int,
+    table_bytes: Optional[int],
+    table_rows: Optional[int],
+    fields: Sequence[google.cloud.bigquery.SchemaField],
+) -> int:
+    if table_bytes and table_rows:
+        proportion = target_bytes / table_bytes
+        return max(1, int(table_rows * proportion))
+
+    row_bytes_estimate = _estimate_row_bytes(fields)
+    assert row_bytes_estimate >= 0
+
+    if row_bytes_estimate == 0:
+        # Assume there's some overhead per row so we have some kind of limit.
+        return target_bytes
+
+    return max(1, target_bytes // row_bytes_estimate)
+
+
+def _estimate_field_bytes(field: google.cloud.bigquery.SchemaField) -> int:
+    """Recursive helper function to calculate the size of a single field."""
+    field_type = field.field_type
+
+    # If the field is REPEATED (ARRAY), its size is the element size times
+    # the assumed element count.
+    if field.mode == "REPEATED":
+        # Create a temporary single-element field for size calculation
+        temp_field = google.cloud.bigquery.SchemaField(
+            field.name, field.field_type, mode="NULLABLE", fields=field.fields
+        )
+        element_size = _estimate_field_bytes(temp_field)
+        return _ARRAY_LENGTH_ESTIMATE * element_size
+
+    if field_type == "STRUCT" or field_type == "RECORD":
+        # STRUCT has 0 logical bytes + the size of its contained fields.
+        return _estimate_row_bytes(field.fields)
+
+    return _TYPE_SIZES.get(field_type.upper(), _UNKNOWN_TYPE_SIZE_ESTIMATE)
+
+
+def _estimate_row_bytes(fields: Sequence[google.cloud.bigquery.SchemaField]) -> int:
+    """
+    Estimates the logical row size in bytes for a list of BigQuery SchemaField
+    objects, using the data type size chart above and assuming a fixed 1 KiB
+    for each STRING field and 1 MiB for each BYTES field. For example, one
+    INT64 column plus one STRING column is estimated at 8 + 1024 = 1,032
+    logical bytes per row.
+
+    Args:
+        fields: A list of google.cloud.bigquery.SchemaField objects
+            representing the table schema.
+
+    Returns:
+        An integer representing the estimated total row size in logical bytes.
+    """
+    total_size = min(
+        _MAX_ROW_BYTES,
+        sum(_estimate_field_bytes(field) for field in fields),
+    )
+    return total_size
+
+
+def _download_results_in_parallel(
+    rows: google.cloud.bigquery.table.RowIterator,
+    *,
+    bqclient: google.cloud.bigquery.Client,
+    progress_bar_type: Optional[str] = None,
+    use_bqstorage_api: bool = True,
+):
+    table_reference = getattr(rows, "_table", None)
+    schema = getattr(rows, "_schema", None)
+
+    # If the results are large enough to materialize a table, break the
+    # connection to the original query that contains an ORDER BY clause to allow
+    # reading with multiple streams.
+    if table_reference is not None and schema is not None:
+        rows = bqclient.list_rows(
+            table_reference,
+            selected_fields=schema,
+        )
+
+    return pandas_gbq.core.read.download_results(
+        rows,
+        bqclient=bqclient,
+        progress_bar_type=progress_bar_type,
+        warn_on_large_results=False,
+        max_results=None,
+        user_dtypes=None,
+        use_bqstorage_api=use_bqstorage_api,
+    )
+
+
+def _sample_with_tablesample(
+    table: google.cloud.bigquery.Table,
+    *,
+    bqclient: google.cloud.bigquery.Client,
+    proportion: float,
+    target_row_count: int,
+    progress_bar_type: Optional[str] = None,
+    use_bqstorage_api: bool = True,
+) -> Optional[pandas.DataFrame]:
+    query = f"""
+    SELECT *
+    FROM `{table.project}.{table.dataset_id}.{table.table_id}`
+    TABLESAMPLE SYSTEM ({float(proportion) * 100.0} PERCENT)
+    ORDER BY RAND() DESC
+    LIMIT {int(target_row_count)};
+    """
+    rows = bqclient.query_and_wait(query)
+    return _download_results_in_parallel(
+        rows,
+        bqclient=bqclient,
+        progress_bar_type=progress_bar_type,
+        use_bqstorage_api=use_bqstorage_api,
+    )
+
+
+def _sample_with_limit(
+    table: google.cloud.bigquery.Table,
+    *,
+    bqclient: google.cloud.bigquery.Client,
+    target_row_count: int,
+    progress_bar_type: Optional[str] = None,
+    use_bqstorage_api: bool = True,
+) -> Optional[pandas.DataFrame]:
+    query = f"""
+    SELECT *
+    FROM `{table.project}.{table.dataset_id}.{table.table_id}`
+    ORDER BY RAND() DESC
+    LIMIT {int(target_row_count)};
+    """
+    rows = bqclient.query_and_wait(query)
+    return _download_results_in_parallel(
+        rows,
+        bqclient=bqclient,
+        progress_bar_type=progress_bar_type,
+        use_bqstorage_api=use_bqstorage_api,
+    )
+
+
+def sample(
+    table_id: str,
+    *,
+    target_mb: Optional[int] = None,
+    credentials: Optional[google.oauth2.credentials.Credentials] = None,
+    billing_project_id: Optional[str] = None,
+    progress_bar_type: Optional[str] = None,
+    use_bqstorage_api: bool = True,
+) -> Optional[pandas.DataFrame]:
+    """Sample a BigQuery table, attempting to limit the amount of data read.
+
+    This function attempts to sample a BigQuery table to a target size in
+    memory. It prioritizes methods that minimize data scanned and downloaded.
+
+    The target size is based on an estimate of the row size, so this method
+    may return more or less data than expected. If the table metadata doesn't
+    include a size, such as with views, an estimate based on the table schema
+    is used.
+
+    Sampling is based on the `BigQuery TABLESAMPLE
+    <https://cloud.google.com/bigquery/docs/table-sampling>`_ feature,
+    which can provide a biased sample if data is not randomly distributed
+    among file blocks. For more control over sampling, use the BigQuery
+    DataFrames ``read_gbq_table`` and ``DataFrame.sample`` methods.
+
+    Specifically, the sampling strategy is as follows:
+
+    1. If the table is small enough (based on `target_mb` or available memory)
+       and eligible for the BigQuery Storage Read API, the entire table is
+       downloaded.
+    2. 
If the table is larger than the target size and eligible for + `TABLESAMPLE SYSTEM` (e.g., a regular table), a `TABLESAMPLE` query + is used to retrieve a proportion of rows, followed by `ORDER BY RAND()` + and `LIMIT` to get the `target_row_count`. + 3. If `TABLESAMPLE` is not applicable (e.g., for views) or `num_bytes` is + not available, a full table scan is performed with `ORDER BY RAND()` + and `LIMIT` to retrieve the `target_row_count`. + + Args: + table_id: The BigQuery table ID to sample, in the format + "project.dataset.table" or "dataset.table". + target_mb: Optional. The target size in megabytes for the sampled + DataFrame. If not specified, it defaults to 1/4 of available + system memory, with a minimum of 100MB and maximum of 1 GB. + credentials: Optional. The credentials to use for BigQuery access. + If not provided, `pandas_gbq` will attempt to infer them. + billing_project_id: Optional. The ID of the Google Cloud project to + bill for the BigQuery job. If not provided, `pandas_gbq` will + attempt to infer it. + progress_bar_type: Optional. Type of progress bar to display. + See `pandas_gbq.core.read.download_results` for options. + use_bqstorage_api: Optional. If `True`, use the BigQuery Storage Read + API for faster downloads. Defaults to `True`. + + Returns: + A `pandas.DataFrame` containing the sampled data, or `None` if no data + could be sampled. + """ + target_bytes = _calculate_target_bytes(target_mb) + connector = pandas_gbq.gbq_connector.GbqConnector( + project_id=billing_project_id, credentials=credentials + ) + credentials = cast(google.oauth2.credentials.Credentials, connector.credentials) + bqclient = connector.get_client() + table = bqclient.get_table(table_id) + num_rows = table.num_rows + num_bytes = table.num_bytes + table_type = table.table_type + + # Some tables such as views report 0 despite actually having rows. + if num_bytes == 0: + num_bytes = None + + # Table is small enough to download the whole thing. + if ( + table_type in _READ_API_ELIGIBLE_TYPES + and num_bytes is not None + and num_bytes <= target_bytes + ): + rows_iter = bqclient.list_rows(table) + return pandas_gbq.core.read.download_results( + rows_iter, + bqclient=bqclient, + progress_bar_type=progress_bar_type, + warn_on_large_results=False, + max_results=None, + user_dtypes=None, + use_bqstorage_api=use_bqstorage_api, + ) + + target_row_count = _estimate_limit( + target_bytes=target_bytes, + table_bytes=num_bytes, + table_rows=num_rows, + fields=table.schema, + ) + + # Table is eligible for TABLESAMPLE. + if num_bytes is not None and table_type in _TABLESAMPLE_ELIGIBLE_TYPES: + proportion = target_bytes / num_bytes + return _sample_with_tablesample( + table, + bqclient=bqclient, + proportion=proportion, + target_row_count=target_row_count, + progress_bar_type=progress_bar_type, + use_bqstorage_api=use_bqstorage_api, + ) + + # Not eligible for TABLESAMPLE or reading directly, so take a random sample + # with a full table scan. + return _sample_with_limit( + table, + bqclient=bqclient, + target_row_count=target_row_count, + progress_bar_type=progress_bar_type, + use_bqstorage_api=use_bqstorage_api, + ) diff --git a/pandas_gbq/exceptions.py b/pandas_gbq/exceptions.py index 1acec712..ab5f05b4 100644 --- a/pandas_gbq/exceptions.py +++ b/pandas_gbq/exceptions.py @@ -110,3 +110,24 @@ class QueryTimeout(ValueError): Raised when the query request exceeds the timeoutMs value specified in the BigQuery configuration. 
""" + + +def translate_exception(ex): + # See `BigQuery Troubleshooting Errors + # `__ + + message = ( + ex.message.casefold() + if hasattr(ex, "message") and ex.message is not None + else "" + ) + if "cancelled" in message: + return QueryTimeout("Reason: {0}".format(ex)) + elif "schema does not match" in message: + error_message = ex.errors[0]["message"] + return InvalidSchema(f"Reason: {error_message}") + elif "already exists: table" in message: + error_message = ex.errors[0]["message"] + return TableCreationError(f"Reason: {error_message}") + else: + return GenericGBQException("Reason: {0}".format(ex)) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 880dcef9..dcc96d49 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -25,12 +25,7 @@ from pandas_gbq.exceptions import InvalidSchema # noqa - backward compatible export from pandas_gbq.exceptions import QueryTimeout # noqa - backward compatible export from pandas_gbq.features import FEATURES -from pandas_gbq.gbq_connector import ( # noqa - backward compatible export - GbqConnector, - _bqschema_to_nullsafe_dtypes, - _finalize_dtypes, - create_user_agent, -) +from pandas_gbq.gbq_connector import GbqConnector # noqa - backward compatible export from pandas_gbq.gbq_connector import _get_client # noqa - backward compatible export import pandas_gbq.schema import pandas_gbq.schema.pandas_to_bigquery diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index 2b3b716e..81f726f6 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -2,15 +2,14 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. +from __future__ import annotations import logging import time import typing -from typing import Any, Dict, Optional, Sequence, Union +from typing import Any, Dict, Optional, Union import warnings -import numpy as np - # Only import at module-level at type checking time to avoid circular # dependencies in the pandas package, which has an optional dependency on # pandas-gbq. 
@@ -19,17 +18,12 @@ import pandas_gbq.constants from pandas_gbq.contexts import context +import pandas_gbq.core.read import pandas_gbq.environment as environment import pandas_gbq.exceptions -from pandas_gbq.exceptions import ( - GenericGBQException, - InvalidSchema, - QueryTimeout, - TableCreationError, -) +from pandas_gbq.exceptions import QueryTimeout from pandas_gbq.features import FEATURES import pandas_gbq.query -import pandas_gbq.timestamp try: import tqdm # noqa @@ -57,11 +51,9 @@ def __init__( rfc9110_delimiter=False, bigquery_client=None, ): - from google.api_core.exceptions import ClientError, GoogleAPIError - from pandas_gbq import auth - self.http_error = (ClientError, GoogleAPIError) + self.http_error = pandas_gbq.constants.HTTP_ERRORS self.project_id = project_id self.location = location self.reauth = reauth @@ -156,22 +148,7 @@ def get_client(self): def process_http_error(ex): # See `BigQuery Troubleshooting Errors # `__ - - message = ( - ex.message.casefold() - if hasattr(ex, "message") and ex.message is not None - else "" - ) - if "cancelled" in message: - raise QueryTimeout("Reason: {0}".format(ex)) - elif "schema does not match" in message: - error_message = ex.errors[0]["message"] - raise InvalidSchema(f"Reason: {error_message}") - elif "already exists: table" in message: - error_message = ex.errors[0]["message"] - raise TableCreationError(f"Reason: {error_message}") - else: - raise GenericGBQException("Reason: {0}".format(ex)) from ex + raise pandas_gbq.exceptions.translate_exception(ex) from ex def download_table( self, @@ -179,7 +156,7 @@ def download_table( max_results: Optional[int] = None, progress_bar_type: Optional[str] = None, dtypes: Optional[Dict[str, Union[str, Any]]] = None, - ) -> "pandas.DataFrame": + ) -> Optional[pandas.DataFrame]: from google.cloud import bigquery self._start_timer() @@ -274,61 +251,15 @@ def _download_results( progress_bar_type=None, user_dtypes=None, ): - # No results are desired, so don't bother downloading anything. - if max_results == 0: - return None - - if user_dtypes is None: - user_dtypes = {} - - create_bqstorage_client = self.use_bqstorage_api - if max_results is not None: - create_bqstorage_client = False - - # If we're downloading a large table, BigQuery DataFrames might be a - # better fit. Not all code paths will populate rows_iter._table, but - # if it's not populated that means we are working with a small result - # set. - if (table_ref := getattr(rows_iter, "_table", None)) is not None: - table = self.client.get_table(table_ref) - if ( - isinstance((num_bytes := table.num_bytes), int) - and num_bytes > pandas_gbq.constants.BYTES_TO_RECOMMEND_BIGFRAMES - ): - num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB - warnings.warn( - f"Recommendation: Your results are {num_gib:.1f} GiB. " - "Consider using BigQuery DataFrames (https://bit.ly/bigframes-intro)" - "to process large results with pandas compatible APIs with transparent SQL " - "pushdown to BigQuery engine. This provides an opportunity to save on costs " - "and improve performance. " - "Please reach out to bigframes-feedback@google.com with any " - "questions or concerns. 
To disable this message, run " - "warnings.simplefilter('ignore', category=pandas_gbq.exceptions.LargeResultsWarning)", - category=pandas_gbq.exceptions.LargeResultsWarning, - # user's code - # -> read_gbq - # -> run_query - # -> download_results - stacklevel=4, - ) - - try: - schema_fields = [field.to_api_repr() for field in rows_iter.schema] - conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields) - conversion_dtypes.update(user_dtypes) - df = rows_iter.to_dataframe( - dtypes=conversion_dtypes, - progress_bar_type=progress_bar_type, - create_bqstorage_client=create_bqstorage_client, - ) - except self.http_error as ex: - self.process_http_error(ex) - - df = _finalize_dtypes(df, schema_fields) - - logger.debug("Got {} rows.\n".format(rows_iter.total_rows)) - return df + return pandas_gbq.core.read.download_results( + rows_iter, + bqclient=self.get_client(), + progress_bar_type=progress_bar_type, + warn_on_large_results=True, + max_results=max_results, + user_dtypes=user_dtypes, + use_bqstorage_api=self.use_bqstorage_api, + ) def load_data( self, @@ -369,90 +300,6 @@ def load_data( self.process_http_error(ex) -def _bqschema_to_nullsafe_dtypes(schema_fields): - """Specify explicit dtypes based on BigQuery schema. - - This function only specifies a dtype when the dtype allows nulls. - Otherwise, use pandas's default dtype choice. - - See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html - #missing-data-casting-rules-and-indexing - """ - import db_dtypes - - # If you update this mapping, also update the table at - # `docs/reading.rst`. - dtype_map = { - "FLOAT": np.dtype(float), - "INTEGER": "Int64", - "TIME": db_dtypes.TimeDtype(), - # Note: Other types such as 'datetime64[ns]' and db_types.DateDtype() - # are not included because the pandas range does not align with the - # BigQuery range. We need to attempt a conversion to those types and - # fall back to 'object' when there are out-of-range values. - } - - # Amend dtype_map with newer extension types if pandas version allows. - if FEATURES.pandas_has_boolean_dtype: - dtype_map["BOOLEAN"] = "boolean" - - dtypes = {} - for field in schema_fields: - name = str(field["name"]) - # Array BigQuery type is represented as an object column containing - # list objects. - if field["mode"].upper() == "REPEATED": - dtypes[name] = "object" - continue - - dtype = dtype_map.get(field["type"].upper()) - if dtype: - dtypes[name] = dtype - - return dtypes - - -def _finalize_dtypes( - df: "pandas.DataFrame", schema_fields: Sequence[Dict[str, Any]] -) -> "pandas.DataFrame": - """ - Attempt to change the dtypes of those columns that don't map exactly. - - For example db_dtypes.DateDtype() and datetime64[ns] cannot represent - 0001-01-01, but they can represent dates within a couple hundred years of - 1970. See: - https://github.com/googleapis/python-bigquery-pandas/issues/365 - """ - import db_dtypes - import pandas.api.types - - # If you update this mapping, also update the table at - # `docs/reading.rst`. - dtype_map = { - "DATE": db_dtypes.DateDtype(), - "DATETIME": "datetime64[ns]", - "TIMESTAMP": "datetime64[ns]", - } - - for field in schema_fields: - # This method doesn't modify ARRAY/REPEATED columns. - if field["mode"].upper() == "REPEATED": - continue - - name = str(field["name"]) - dtype = dtype_map.get(field["type"].upper()) - - # Avoid deprecated conversion to timezone-naive dtype by only casting - # object dtypes. 
- if dtype and pandas.api.types.is_object_dtype(df[name]): - df[name] = df[name].astype(dtype, errors="ignore") - - # Ensure any TIMESTAMP columns are tz-aware. - df = pandas_gbq.timestamp.localize_df(df, schema_fields) - - return df - - def _get_client(user_agent, rfc9110_delimiter, project_id, credentials): import google.api_core.client_info diff --git a/setup.py b/setup.py index e2a6fd5e..f47bf04b 100644 --- a/setup.py +++ b/setup.py @@ -29,15 +29,16 @@ # See https://arrow.apache.org/release/22.0.0.html "pyarrow >= 22.0.0; python_version >= '3.14'", "pydata-google-auth >=1.5.0", + "psutil >=5.9.8", # Note: google-api-core and google-auth are also included via transitive # dependency on google-cloud-bigquery, but this library also uses them # directly. - "google-api-core >= 2.10.2, <3.0.0", - "google-auth >=2.13.0", + "google-api-core >= 2.15.0, <3.0.0", + "google-auth >=2.14.1", "google-auth-oauthlib >=0.7.0", # Please also update the minimum version in pandas_gbq/features.py to # allow pandas-gbq to detect invalid package versions at runtime. - "google-cloud-bigquery >=3.4.2,<4.0.0", + "google-cloud-bigquery >=3.20.0,<4.0.0", "packaging >=22.0.0", ] extras = { diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index db8a499a..aff46b28 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -8,12 +8,13 @@ db-dtypes==1.0.4 numpy==1.19.4 pandas==1.1.4 +psutil==5.9.8 pyarrow==4.0.0 pydata-google-auth==1.5.0 -google-api-core==2.10.2 -google-auth==2.13.0 +google-api-core==2.15.0 +google-auth==2.14.1 google-auth-oauthlib==0.7.0 -google-cloud-bigquery==3.4.2 +google-cloud-bigquery==3.20.0 packaging==22.0.0 # Extras google-cloud-bigquery-storage==2.16.2 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index cb8aadb9..c761276d 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -45,6 +45,29 @@ def project(project_id): return project_id +@pytest.fixture +def external_table(bigquery_client, random_dataset, project_id): + table_id = prefixer.create_prefix() + full_table_id = str(random_dataset.table(table_id)) + table = bigquery.Table( + full_table_id, + [ + bigquery.SchemaField("name", "STRING"), + bigquery.SchemaField("post_abbr", "STRING"), + ], + ) + external_data_configuration = bigquery.ExternalConfig("CSV") + csv_options = bigquery.CSVOptions() + csv_options.skip_leading_rows = 1 + external_data_configuration.csv_options = csv_options + external_data_configuration.source_uris = [ + "gs://cloud-samples-data/bigquery/us-states/us-states.csv" + ] + table.external_data_configuration = external_data_configuration + bigquery_client.create_table(table) + return full_table_id + + @pytest.fixture def to_gbq(credentials, project_id): import pandas_gbq diff --git a/tests/system/test_sample.py b/tests/system/test_sample.py new file mode 100644 index 00000000..7dbd89a5 --- /dev/null +++ b/tests/system/test_sample.py @@ -0,0 +1,97 @@ +# Copyright (c) 2025 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. 
+ +import google.cloud.bigquery +import google.oauth2.credentials + +import pandas_gbq + + +def test_sample_small_table( + project_id: str, + credentials: google.oauth2.credentials.Credentials, + bigquery_client: google.cloud.bigquery.Client, +): + # Arrange + table_id = "bigquery-public-data.ml_datasets.penguins" + table = bigquery_client.get_table(table_id) + num_bytes = table.num_bytes + num_rows = table.num_rows + + # Act + df = pandas_gbq.sample( + table_id, + target_mb=1_000, + credentials=credentials, + billing_project_id=project_id, + ) + + # Assert + assert num_bytes is not None and num_bytes > 0 + assert num_rows is not None and num_rows > 0 + assert df is not None and len(df.index) == num_rows + + +def test_sample_large_table( + project_id: str, + credentials: google.oauth2.credentials.Credentials, + bigquery_client: google.cloud.bigquery.Client, +): + # Arrange + table_id = "bigquery-public-data.chicago_taxi_trips.taxi_trips" + table = bigquery_client.get_table(table_id) + num_bytes = table.num_bytes + num_rows = table.num_rows + + # Act + df = pandas_gbq.sample( + table_id, target_mb=10, credentials=credentials, billing_project_id=project_id + ) + + # Assert + assert num_bytes is not None and num_bytes > 0 + assert num_rows is not None and num_rows > 0 + assert df is not None + rows_downloaded = len(df.index) + assert rows_downloaded > 0 + assert rows_downloaded < num_rows + bytes_downloaded = df.memory_usage().sum() + assert bytes_downloaded < num_bytes + + +def test_sample_small_external_table( + project_id: str, + credentials: google.oauth2.credentials.Credentials, + external_table: str, +): + # Act + df = pandas_gbq.sample( + external_table, + target_mb=1_000, + credentials=credentials, + billing_project_id=project_id, + ) + + # Assert + assert df is not None + rows_downloaded = len(df.index) + assert rows_downloaded > 0 + + +def test_sample_view( + project_id: str, + credentials: google.oauth2.credentials.Credentials, +): + # Arrange + table_id = "bigquery-public-data.ethereum_blockchain.live_contracts" + + # Act + df = pandas_gbq.sample( + table_id, target_mb=10, credentials=credentials, billing_project_id=project_id + ) + + # Assert + assert df is not None + rows_downloaded = len(df.index) + assert rows_downloaded > 0 diff --git a/tests/unit/test_core_sample.py b/tests/unit/test_core_sample.py new file mode 100644 index 00000000..5e5a15e7 --- /dev/null +++ b/tests/unit/test_core_sample.py @@ -0,0 +1,348 @@ +# Copyright (c) 2025 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. 
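+
+# The expected sizes in the parametrized cases below follow the _TYPE_SIZES
+# chart in pandas_gbq/core/sample.py: INT64=8, BOOL=1, NUMERIC=16,
+# BIGNUMERIC=32, STRING=1 KiB, BYTES=1 MiB, with 5 assumed array elements.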
+ +from typing import Sequence +from unittest import mock + +import google.cloud.bigquery +import pytest + +import pandas_gbq.constants +import pandas_gbq.core.sample + + +@pytest.mark.parametrize( + "schema, expected_size", + [ + pytest.param( + [ + google.cloud.bigquery.SchemaField("id", "INT64"), # 8 + google.cloud.bigquery.SchemaField("is_valid", "BOOL"), # 1 + google.cloud.bigquery.SchemaField("price", "NUMERIC"), # 16 + google.cloud.bigquery.SchemaField("big_value", "BIGNUMERIC"), # 32 + ], + 8 + 1 + 16 + 32, # 57 + id="Fixed_Size_Types", + ), + pytest.param( + [ + google.cloud.bigquery.SchemaField( + "coords", + "RECORD", + fields=[ + google.cloud.bigquery.SchemaField("lat", "FLOAT64"), # 8 + google.cloud.bigquery.SchemaField("lon", "FLOAT64"), # 8 + ], + ), + ], + 16, # 8 + 8 + id="Simple_Struct", + ), + pytest.param( + [ + google.cloud.bigquery.SchemaField( + "history", "TIMESTAMP", mode="REPEATED" + ), # 5 * 8 + ], + pandas_gbq.core.sample._ARRAY_LENGTH_ESTIMATE * 8, # 40 + id="Simple_Array", + ), + pytest.param( + [ + google.cloud.bigquery.SchemaField( + "addresses", + "RECORD", + mode="REPEATED", + fields=[ + google.cloud.bigquery.SchemaField("street", "STRING"), # 1KIB + google.cloud.bigquery.SchemaField("zip", "INT64"), # 8 + ], + ), + ], + pandas_gbq.core.sample._ARRAY_LENGTH_ESTIMATE + * (pandas_gbq.constants.BYTES_IN_KIB + 8), + id="Repeated_Struct", + ), + pytest.param( + [ + google.cloud.bigquery.SchemaField( + "empty_struct", "RECORD", fields=[] + ), # 0 + google.cloud.bigquery.SchemaField("simple_int", "INT64"), # 8 + ], + 8, # 0 + 8 + id="empty-struct", + ), + pytest.param( + [ + google.cloud.bigquery.SchemaField("bytes", "BYTES"), + ] + * 9_999, + pandas_gbq.core.sample._MAX_ROW_BYTES, + id="many-bytes", + ), + # Case 8: Complex Mix (Combining multiple cases) + pytest.param( + [ + google.cloud.bigquery.SchemaField("key", "INT64"), # 8 + google.cloud.bigquery.SchemaField("notes", "STRING"), # 1KIB + google.cloud.bigquery.SchemaField( + "history", "TIMESTAMP", mode="REPEATED" + ), # 40 + google.cloud.bigquery.SchemaField( + "details", + "RECORD", + fields=[ + google.cloud.bigquery.SchemaField("d1", "NUMERIC"), # 16 + google.cloud.bigquery.SchemaField("d2", "BYTES"), # 1MB + ], + ), + ], + 8 + + pandas_gbq.constants.BYTES_IN_KIB + + 40 + + (16 + pandas_gbq.constants.BYTES_IN_MIB), + id="Complex_Mix", + ), + ], +) +def test_estimate_row_size_parametrized( + schema: Sequence[google.cloud.bigquery.SchemaField], expected_size: int +): + actual_size = pandas_gbq.core.sample._estimate_row_bytes(schema) + assert actual_size == expected_size + + +def test_calculate_target_bytes_with_target_mb(): + target_mb = 200 + expected_bytes = target_mb * pandas_gbq.constants.BYTES_IN_MIB + actual_bytes = pandas_gbq.core.sample._calculate_target_bytes(target_mb) + assert actual_bytes == expected_bytes + + +@mock.patch("psutil.virtual_memory") +def test_calculate_target_bytes_with_available_memory(mock_virtual_memory): + # Mock psutil.virtual_memory to return a mock object with an 'available' attribute. 
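+    # With 2 GiB available, the helper should choose available // 4 = 512 MiB,
+    # which falls between the 100 MiB floor (_MAX_ROW_BYTES) and the 1 GiB
+    # cap (_MAX_AUTO_TARGET_BYTES).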
+ available_memory = 2 * pandas_gbq.constants.BYTES_IN_GIB # 2 GB + mock_virtual_memory.return_value = mock.Mock(available=available_memory) + + # Expected bytes is available memory / 4, as it falls between _MAX_ROW_BYTES and _MAX_AUTO_TARGET_BYTES + expected_bytes = available_memory // 4 + actual_bytes = pandas_gbq.core.sample._calculate_target_bytes(None) + assert actual_bytes == expected_bytes + + +@mock.patch("psutil.virtual_memory") +def test_calculate_target_bytes_low_memory_uses_max_row_bytes(mock_virtual_memory): + # Mock psutil.virtual_memory to return a mock object with an 'available' attribute. + # Set available memory to a low value. + available_memory = 100 # 100 bytes + mock_virtual_memory.return_value = mock.Mock(available=available_memory) + + # Expected bytes should be _MAX_ROW_BYTES because available // 4 is less. + expected_bytes = pandas_gbq.core.sample._MAX_ROW_BYTES + actual_bytes = pandas_gbq.core.sample._calculate_target_bytes(None) + assert actual_bytes == expected_bytes + + +@mock.patch("psutil.virtual_memory") +def test_calculate_target_bytes_caps_at_max_auto_target_bytes(mock_virtual_memory): + # Mock psutil.virtual_memory to return a mock object with an 'available' attribute. + # Set available memory to a high value (e.g., 8 GB) so that available // 4 > _MAX_AUTO_TARGET_BYTES. + available_memory = 8 * pandas_gbq.constants.BYTES_IN_GIB # 8 GB + mock_virtual_memory.return_value = mock.Mock(available=available_memory) + + # Expected bytes should be _MAX_AUTO_TARGET_BYTES (1 GiB) because available // 4 (2 GiB) is capped. + expected_bytes = pandas_gbq.core.sample._MAX_AUTO_TARGET_BYTES + actual_bytes = pandas_gbq.core.sample._calculate_target_bytes(None) + assert actual_bytes == expected_bytes + + +@pytest.mark.parametrize( + "target_bytes, table_bytes, table_rows, fields, expected_limit", + [ + # With table_bytes and table_rows, should use proportion + pytest.param( + 1000, 10000, 100, [], 10, id="with-stats-simple" + ), # 100 * (1000/10000) + pytest.param(1, 10000, 100, [], 1, id="with-stats-min-1"), # min is 1 + # Without stats, should estimate from schema + pytest.param( + 1000, + None, + None, + [google.cloud.bigquery.SchemaField("col1", "INT64")], # 8 bytes + 125, # 1000 // 8 + id="no-stats-simple", + ), + pytest.param( + 10, + None, + None, + [google.cloud.bigquery.SchemaField("col1", "NUMERIC")], # 16 bytes + 1, # max(1, 10 // 16) + id="no-stats-min-1", + ), + # Edge case: row_bytes_estimate is 0 + pytest.param( + 1000, + None, + None, + [], + 1000, + id="no-stats-zero-row-size", # empty schema -> 0 bytes + ), + ], +) +def test_estimate_limit(target_bytes, table_bytes, table_rows, fields, expected_limit): + limit = pandas_gbq.core.sample._estimate_limit( + target_bytes=target_bytes, + table_bytes=table_bytes, + table_rows=table_rows, + fields=fields, + ) + assert limit == expected_limit + + +@mock.patch("pandas_gbq.core.read.download_results") +def test_sample_with_tablesample(mock_download_results, mock_bigquery_client): + mock_table = mock.Mock(spec=google.cloud.bigquery.Table) + mock_table.project = "test-project" + mock_table.dataset_id = "test_dataset" + mock_table.table_id = "test_table" + + proportion = 0.1 + target_row_count = 100 + + pandas_gbq.core.sample._sample_with_tablesample( + mock_table, + bqclient=mock_bigquery_client, + proportion=proportion, + target_row_count=target_row_count, + ) + + mock_bigquery_client.query_and_wait.assert_called_once() + query = mock_bigquery_client.query_and_wait.call_args[0][0] + assert "TABLESAMPLE SYSTEM (10.0 
PERCENT)" in query + assert "LIMIT 100" in query + assert ( + f"FROM `{mock_table.project}.{mock_table.dataset_id}.{mock_table.table_id}`" + in query + ) + + mock_download_results.assert_called_once() + + +@mock.patch("pandas_gbq.core.read.download_results") +def test_sample_with_limit(mock_download_results, mock_bigquery_client): + mock_table = mock.Mock(spec=google.cloud.bigquery.Table) + mock_table.project = "test-project" + mock_table.dataset_id = "test_dataset" + mock_table.table_id = "test_table" + + target_row_count = 200 + + pandas_gbq.core.sample._sample_with_limit( + mock_table, + bqclient=mock_bigquery_client, + target_row_count=target_row_count, + ) + + mock_bigquery_client.query_and_wait.assert_called_once() + query = mock_bigquery_client.query_and_wait.call_args[0][0] + assert "TABLESAMPLE" not in query + assert "LIMIT 200" in query + assert ( + f"FROM `{mock_table.project}.{mock_table.dataset_id}.{mock_table.table_id}`" + in query + ) + + mock_download_results.assert_called_once() + + +@pytest.fixture +def mock_gbq_connector(mock_bigquery_client): + with mock.patch("pandas_gbq.gbq_connector.GbqConnector") as mock_connector_class: + mock_connector = mock_connector_class.return_value + mock_connector.get_client.return_value = mock_bigquery_client + mock_connector.credentials = mock.Mock() + yield mock_connector + + +@mock.patch("pandas_gbq.core.read.download_results") +def test_sample_small_table_downloads_all( + mock_download_results, mock_gbq_connector, mock_bigquery_client +): + mock_table = mock.Mock(spec=google.cloud.bigquery.Table) + type(mock_table).table_type = mock.PropertyMock(return_value="TABLE") + type(mock_table).num_bytes = mock.PropertyMock(return_value=1000) + type(mock_table).num_rows = mock.PropertyMock(return_value=10) + type(mock_table).schema = mock.PropertyMock(return_value=[]) + mock_bigquery_client.get_table.return_value = mock_table + + with mock.patch( + "pandas_gbq.core.sample._calculate_target_bytes", return_value=2000 + ): + pandas_gbq.core.sample.sample("my-project.my_dataset.my_table") + + mock_bigquery_client.list_rows.assert_called_once_with(mock_table) + mock_download_results.assert_called_once() + # Check that we didn't try to run a query for sampling + mock_bigquery_client.query_and_wait.assert_not_called() + + +@mock.patch("pandas_gbq.core.sample._sample_with_tablesample") +def test_sample_uses_tablesample( + mock_sample_with_tablesample, mock_gbq_connector, mock_bigquery_client +): + mock_table = mock.Mock(spec=google.cloud.bigquery.Table) + type(mock_table).table_type = mock.PropertyMock(return_value="TABLE") + type(mock_table).num_bytes = mock.PropertyMock(return_value=1_000_000_000_000) + type(mock_table).num_rows = mock.PropertyMock(return_value=1_000) + type(mock_table).schema = mock.PropertyMock( + return_value=[google.cloud.bigquery.SchemaField("col1", "INT64")] + ) + mock_bigquery_client.get_table.return_value = mock_table + + pandas_gbq.core.sample.sample("my-project.my_dataset.my_table", target_mb=1) + + mock_sample_with_tablesample.assert_called_once() + + +@mock.patch("pandas_gbq.core.sample._sample_with_limit") +def test_sample_uses_limit_fallback( + mock_sample_with_limit, mock_gbq_connector, mock_bigquery_client +): + mock_table = mock.Mock(spec=google.cloud.bigquery.Table) + mock_table.num_bytes = 10000 + mock_table.num_rows = 100 + mock_table.table_type = "VIEW" # Not eligible for TABLESAMPLE + mock_table.schema = [google.cloud.bigquery.SchemaField("col1", "INT64")] + mock_bigquery_client.get_table.return_value = 
mock_table + + with mock.patch( + "pandas_gbq.core.sample._calculate_target_bytes", return_value=1000 + ): + pandas_gbq.core.sample.sample("my-project.my_dataset.my_table") + + mock_sample_with_limit.assert_called_once() + + +@mock.patch("pandas_gbq.core.sample._sample_with_limit") +def test_sample_uses_limit_fallback_no_bytes( + mock_sample_with_limit, mock_gbq_connector, mock_bigquery_client +): + mock_table = mock.Mock(spec=google.cloud.bigquery.Table) + mock_table.num_bytes = None # num_bytes can be None + mock_table.num_rows = 100 + mock_table.table_type = "TABLE" + mock_table.schema = [google.cloud.bigquery.SchemaField("col1", "INT64")] + mock_bigquery_client.get_table.return_value = mock_table + + with mock.patch( + "pandas_gbq.core.sample._calculate_target_bytes", return_value=1000 + ): + pandas_gbq.core.sample.sample("my-project.my_dataset.my_table") + + mock_sample_with_limit.assert_called_once() diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 75574820..6eafe9e2 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -21,6 +21,7 @@ from pandas_gbq import gbq import pandas_gbq.constants +import pandas_gbq.core.read import pandas_gbq.exceptions import pandas_gbq.features from pandas_gbq.features import FEATURES @@ -104,7 +105,7 @@ def get_table(table_ref_or_id, **kwargs): ], ) def test__bqschema_to_nullsafe_dtypes(type_, expected): - result = gbq._bqschema_to_nullsafe_dtypes( + result = pandas_gbq.core.read._bqschema_to_nullsafe_dtypes( [dict(name="x", type=type_, mode="NULLABLE")] ) if not expected: diff --git a/tests/unit/test_to_gbq.py b/tests/unit/test_to_gbq.py index 681f18b8..6f00ccf8 100644 --- a/tests/unit/test_to_gbq.py +++ b/tests/unit/test_to_gbq.py @@ -206,7 +206,7 @@ def test_to_gbq_with_if_exists_unknown(): ], ) def test_create_user_agent(user_agent, rfc9110_delimiter, expected): - from pandas_gbq.gbq import create_user_agent + from pandas_gbq.gbq_connector import create_user_agent result = create_user_agent(user_agent, rfc9110_delimiter) assert result == expected @@ -214,14 +214,14 @@ def test_create_user_agent(user_agent, rfc9110_delimiter, expected): @mock.patch.dict(os.environ, {"VSCODE_PID": "1234"}, clear=True) def test_create_user_agent_vscode(): - from pandas_gbq.gbq import create_user_agent + from pandas_gbq.gbq_connector import create_user_agent assert create_user_agent() == f"pandas-{pd.__version__} vscode" @mock.patch.dict(os.environ, {"VSCODE_PID": "1234"}, clear=True) def test_create_user_agent_vscode_plugin(): - from pandas_gbq.gbq import create_user_agent + from pandas_gbq.gbq_connector import create_user_agent with tempfile.TemporaryDirectory() as tmpdir: user_home = Path(tmpdir) @@ -247,14 +247,14 @@ def test_create_user_agent_vscode_plugin(): @mock.patch.dict(os.environ, {"JPY_PARENT_PID": "1234"}, clear=True) def test_create_user_agent_jupyter(): - from pandas_gbq.gbq import create_user_agent + from pandas_gbq.gbq_connector import create_user_agent assert create_user_agent() == f"pandas-{pd.__version__} jupyter" @mock.patch.dict(os.environ, {"JPY_PARENT_PID": "1234"}, clear=True) def test_create_user_agent_jupyter_extension(): - from pandas_gbq.gbq import create_user_agent + from pandas_gbq.gbq_connector import create_user_agent def custom_import_module_side_effect(name, package=None): if name == "bigquery_jupyter_plugin": From 9e75a48aabae8ec1c94ca220ecb09ef03f71315a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 17 Nov 2025 21:04:17 +0000 Subject: [PATCH 05/14] chore: librarian 
release pull request: 20251117T201128Z (#984) PR created by the Librarian CLI to initialize a release. Merging this PR will auto trigger a release. Librarian Version: v0.6.0 Language Image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:c8612d3fffb3f6a32353b2d1abd16b61e87811866f7ec9d65b59b02eb452a620
pandas-gbq: 0.31.0 ## [0.31.0](https://github.com/googleapis/python-bigquery-pandas/compare/v0.30.0...v0.31.0) (2025-11-17) ### Features * add pandas_gbq.sample (#983) ([ac771c12](https://github.com/googleapis/python-bigquery-pandas/commit/ac771c12))
--------- Co-authored-by: Anthonios Partheniou --- .librarian/state.yaml | 3 ++- CHANGELOG.md | 7 +++++++ pandas_gbq/version.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.librarian/state.yaml b/.librarian/state.yaml index 9a5bf347..931526b2 100644 --- a/.librarian/state.yaml +++ b/.librarian/state.yaml @@ -1,7 +1,8 @@ image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:c8612d3fffb3f6a32353b2d1abd16b61e87811866f7ec9d65b59b02eb452a620 libraries: - id: pandas-gbq - version: 0.30.0 + version: 0.31.0 + last_generated_commit: "" apis: [] source_roots: - . diff --git a/CHANGELOG.md b/CHANGELOG.md index afbf8c03..ee12fc18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,13 @@ [1]: https://pypi.org/project/pandas-gbq/#history +## [0.31.0](https://github.com/googleapis/python-bigquery-pandas/compare/v0.30.0...v0.31.0) (2025-11-17) + + +### Features + +* add pandas_gbq.sample (#983) ([ac771c12f58b7378a8550515c9c656da7fa980a8](https://github.com/googleapis/python-bigquery-pandas/commit/ac771c12f58b7378a8550515c9c656da7fa980a8)) + ## [0.30.0](https://github.com/googleapis/python-bigquery-pandas/compare/v0.29.2...v0.30.0) (2025-10-31) diff --git a/pandas_gbq/version.py b/pandas_gbq/version.py index 25edafef..157b6a56 100644 --- a/pandas_gbq/version.py +++ b/pandas_gbq/version.py @@ -2,4 +2,4 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. -__version__ = "0.30.0" +__version__ = "0.31.0" From 304fd58647c366f59ca8dcba7c9eedeb7ac6e2a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 8 Dec 2025 15:23:03 -0600 Subject: [PATCH 06/14] chore: librarian release pull request: 20251208T205820Z (#997) PR created by the Librarian CLI to initialize a release. Merging this PR will auto trigger a release. Librarian Version: v0.7.0 Language Image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:c8612d3fffb3f6a32353b2d1abd16b61e87811866f7ec9d65b59b02eb452a620
pandas-gbq: 0.31.1 ## [0.31.1](https://github.com/googleapis/python-bigquery-pandas/compare/v0.31.0...v0.31.1) (2025-12-08)
--------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .librarian/state.yaml | 2 +- CHANGELOG.md | 6 ++++++ pandas_gbq/version.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.librarian/state.yaml b/.librarian/state.yaml index 931526b2..b4f59fa9 100644 --- a/.librarian/state.yaml +++ b/.librarian/state.yaml @@ -1,7 +1,7 @@ image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:c8612d3fffb3f6a32353b2d1abd16b61e87811866f7ec9d65b59b02eb452a620 libraries: - id: pandas-gbq - version: 0.31.0 + version: 0.31.1 last_generated_commit: "" apis: [] source_roots: diff --git a/CHANGELOG.md b/CHANGELOG.md index ee12fc18..0c7fb240 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ [1]: https://pypi.org/project/pandas-gbq/#history +## [0.31.1](https://github.com/googleapis/python-bigquery-pandas/compare/v0.31.0...v0.31.1) (2025-12-08) + +### Dependencies + +* Temporarily mark as incompatible with pandas 3.0 + ## [0.31.0](https://github.com/googleapis/python-bigquery-pandas/compare/v0.30.0...v0.31.0) (2025-11-17) diff --git a/pandas_gbq/version.py b/pandas_gbq/version.py index 157b6a56..e0a24dfd 100644 --- a/pandas_gbq/version.py +++ b/pandas_gbq/version.py @@ -2,4 +2,4 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. -__version__ = "0.31.0" +__version__ = "0.31.1" From a4217225a7adb3f98024f68813ca9d08103c0239 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Mon, 15 Dec 2025 06:17:38 -0500 Subject: [PATCH 07/14] remove pytz as dependency due to failing test --- tests/system/test_gbq.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 1457ec30..8e068c03 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -14,7 +14,6 @@ import pandas.api.types import pandas.testing as tm import pytest -import pytz from pandas_gbq import gbq import pandas_gbq.schema @@ -46,7 +45,7 @@ def make_mixed_dataframe_v2(test_size): ints = np.random.randint(1, 10, size=(1, test_size)) strs = np.random.randint(1, 10, size=(1, test_size)).astype(str) times = [ - datetime.datetime.now(pytz.timezone("US/Arizona")) for t in range(test_size) + datetime.datetime.now(datetime.zoneinfo.ZoneInfo("US/Arizona")) for t in range(test_size) ] return DataFrame( { @@ -894,7 +893,7 @@ def test_google_upload_errors_should_raise_exception(self, project_id): raise pytest.skip("buggy test") test_id = "5" - test_timestamp = datetime.datetime.now(pytz.timezone("US/Arizona")) + test_timestamp = datetime.datetime.now(datetime.zoneinfo.ZoneInfo("US/Arizona")) bad_df = DataFrame( { "bools": [False, False], From 739730da1cae194d5cb6a8e341ec365f01dc86bf Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Mon, 15 Dec 2025 06:45:33 -0500 Subject: [PATCH 08/14] corrects module name --- tests/system/test_gbq.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 8e068c03..5653d0b8 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -6,10 +6,10 @@ import datetime import sys +import zoneinfo import numpy as np import packaging.version -import pandas from pandas import DataFrame import pandas.api.types import pandas.testing as tm @@ -45,7 +45,7 @@ def make_mixed_dataframe_v2(test_size): ints = np.random.randint(1, 10, size=(1, test_size)) strs = np.random.randint(1, 10, size=(1, 
test_size)).astype(str) times = [ - datetime.datetime.now(datetime.zoneinfo.ZoneInfo("US/Arizona")) for t in range(test_size) + datetime.datetime.now(zoneinfo.ZoneInfo("US/Arizona")) for t in range(test_size) ] return DataFrame( { @@ -893,7 +893,7 @@ def test_google_upload_errors_should_raise_exception(self, project_id): raise pytest.skip("buggy test") test_id = "5" - test_timestamp = datetime.datetime.now(datetime.zoneinfo.ZoneInfo("US/Arizona")) + test_timestamp = datetime.datetime.now(zoneinfo.ZoneInfo("US/Arizona")) bad_df = DataFrame( { "bools": [False, False], From 8191a9dd7a10ceab3f5dca8bdd8fa0a22d6f101d Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Mon, 15 Dec 2025 07:09:25 -0500 Subject: [PATCH 09/14] swapping timezones --- tests/system/test_gbq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 5653d0b8..f396eb39 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -45,7 +45,7 @@ def make_mixed_dataframe_v2(test_size): ints = np.random.randint(1, 10, size=(1, test_size)) strs = np.random.randint(1, 10, size=(1, test_size)).astype(str) times = [ - datetime.datetime.now(zoneinfo.ZoneInfo("US/Arizona")) for t in range(test_size) + datetime.datetime.now(zoneinfo.ZoneInfo("UTC")) for t in range(test_size) ] return DataFrame( { @@ -893,7 +893,7 @@ def test_google_upload_errors_should_raise_exception(self, project_id): raise pytest.skip("buggy test") test_id = "5" - test_timestamp = datetime.datetime.now(zoneinfo.ZoneInfo("US/Arizona")) + test_timestamp = datetime.datetime.now(zoneinfo.ZoneInfo("UTC")) bad_df = DataFrame( { "bools": [False, False], From 1a253a8497895cfc1e68f4de46b533280682d0e4 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Mon, 15 Dec 2025 08:22:09 -0500 Subject: [PATCH 10/14] restore accidentally deleted module name --- tests/system/test_gbq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index f396eb39..614802cd 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -10,6 +10,7 @@ import numpy as np import packaging.version +import pandas from pandas import DataFrame import pandas.api.types import pandas.testing as tm From 3692743732cf91bb5de58d3ab8eb08686f71536b Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Mon, 15 Dec 2025 16:33:31 -0500 Subject: [PATCH 11/14] chore(python): Add support for Python 3.14 --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 175eddd2..af8d60c5 100644 --- a/noxfile.py +++ b/noxfile.py @@ -56,7 +56,7 @@ "3.9": [], } -SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] +SYSTEM_TEST_PYTHON_VERSIONS = ["3.14"] SYSTEM_TEST_STANDARD_DEPENDENCIES = [ "mock", "pytest", From b33d3a5aa37a171769d46cd7c905831cb5428ffe Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Mon, 15 Dec 2025 16:34:28 -0500 Subject: [PATCH 12/14] formatting update --- tests/system/test_gbq.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 614802cd..5b85b9ed 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -45,9 +45,7 @@ def make_mixed_dataframe_v2(test_size): flts = np.random.randn(1, test_size) ints = np.random.randint(1, 10, size=(1, test_size)) strs = np.random.randint(1, 10, size=(1, test_size)).astype(str) - times = [ - datetime.datetime.now(zoneinfo.ZoneInfo("UTC")) for t in range(test_size) - ] + times = 
[datetime.datetime.now(zoneinfo.ZoneInfo("UTC")) for t in range(test_size)] return DataFrame( { "bools": bools[0], From 3ed693206fb59d7f1d99daf686b72fad7ef3b019 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Mon, 15 Dec 2025 17:56:59 -0500 Subject: [PATCH 13/14] Removes unused sample tests for 3.7 and 3.8 --- .kokoro/samples/python3.7/common.cfg | 40 --------------------- .kokoro/samples/python3.7/continuous.cfg | 6 ---- .kokoro/samples/python3.7/periodic-head.cfg | 11 ------ .kokoro/samples/python3.7/periodic.cfg | 6 ---- .kokoro/samples/python3.7/presubmit.cfg | 6 ---- .kokoro/samples/python3.8/common.cfg | 40 --------------------- .kokoro/samples/python3.8/continuous.cfg | 6 ---- .kokoro/samples/python3.8/periodic-head.cfg | 11 ------ .kokoro/samples/python3.8/periodic.cfg | 6 ---- .kokoro/samples/python3.8/presubmit.cfg | 6 ---- 10 files changed, 138 deletions(-) delete mode 100644 .kokoro/samples/python3.7/common.cfg delete mode 100644 .kokoro/samples/python3.7/continuous.cfg delete mode 100644 .kokoro/samples/python3.7/periodic-head.cfg delete mode 100644 .kokoro/samples/python3.7/periodic.cfg delete mode 100644 .kokoro/samples/python3.7/presubmit.cfg delete mode 100644 .kokoro/samples/python3.8/common.cfg delete mode 100644 .kokoro/samples/python3.8/continuous.cfg delete mode 100644 .kokoro/samples/python3.8/periodic-head.cfg delete mode 100644 .kokoro/samples/python3.8/periodic.cfg delete mode 100644 .kokoro/samples/python3.8/presubmit.cfg diff --git a/.kokoro/samples/python3.7/common.cfg b/.kokoro/samples/python3.7/common.cfg deleted file mode 100644 index be202f33..00000000 --- a/.kokoro/samples/python3.7/common.cfg +++ /dev/null @@ -1,40 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# Build logs will be here -action { - define_artifacts { - regex: "**/*sponge_log.xml" - } -} - -# Specify which tests to run -env_vars: { - key: "RUN_TESTS_SESSION" - value: "py-3.7" -} - -# Declare build specific Cloud project. -env_vars: { - key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py37" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-pandas/.kokoro/test-samples.sh" -} - -# Configure the docker image for kokoro-trampoline. -env_vars: { - key: "TRAMPOLINE_IMAGE" - value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker" -} - -# Download secrets for samples -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" - -# Download trampoline resources. -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" - -# Use the trampoline script to run in docker. 
-build_file: "python-bigquery-pandas/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.7/continuous.cfg b/.kokoro/samples/python3.7/continuous.cfg deleted file mode 100644 index a1c8d975..00000000 --- a/.kokoro/samples/python3.7/continuous.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.7/periodic-head.cfg b/.kokoro/samples/python3.7/periodic-head.cfg deleted file mode 100644 index 98efde4d..00000000 --- a/.kokoro/samples/python3.7/periodic-head.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-pandas/.kokoro/test-samples-against-head.sh" -} diff --git a/.kokoro/samples/python3.7/periodic.cfg b/.kokoro/samples/python3.7/periodic.cfg deleted file mode 100644 index 71cd1e59..00000000 --- a/.kokoro/samples/python3.7/periodic.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "False" -} diff --git a/.kokoro/samples/python3.7/presubmit.cfg b/.kokoro/samples/python3.7/presubmit.cfg deleted file mode 100644 index a1c8d975..00000000 --- a/.kokoro/samples/python3.7/presubmit.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.8/common.cfg b/.kokoro/samples/python3.8/common.cfg deleted file mode 100644 index 7424a3b9..00000000 --- a/.kokoro/samples/python3.8/common.cfg +++ /dev/null @@ -1,40 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# Build logs will be here -action { - define_artifacts { - regex: "**/*sponge_log.xml" - } -} - -# Specify which tests to run -env_vars: { - key: "RUN_TESTS_SESSION" - value: "py-3.8" -} - -# Declare build specific Cloud project. -env_vars: { - key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py38" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-pandas/.kokoro/test-samples.sh" -} - -# Configure the docker image for kokoro-trampoline. -env_vars: { - key: "TRAMPOLINE_IMAGE" - value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker" -} - -# Download secrets for samples -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" - -# Download trampoline resources. -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" - -# Use the trampoline script to run in docker. 
-build_file: "python-bigquery-pandas/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.8/continuous.cfg b/.kokoro/samples/python3.8/continuous.cfg deleted file mode 100644 index a1c8d975..00000000 --- a/.kokoro/samples/python3.8/continuous.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.8/periodic-head.cfg b/.kokoro/samples/python3.8/periodic-head.cfg deleted file mode 100644 index 98efde4d..00000000 --- a/.kokoro/samples/python3.8/periodic-head.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-pandas/.kokoro/test-samples-against-head.sh" -} diff --git a/.kokoro/samples/python3.8/periodic.cfg b/.kokoro/samples/python3.8/periodic.cfg deleted file mode 100644 index 71cd1e59..00000000 --- a/.kokoro/samples/python3.8/periodic.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "False" -} diff --git a/.kokoro/samples/python3.8/presubmit.cfg b/.kokoro/samples/python3.8/presubmit.cfg deleted file mode 100644 index a1c8d975..00000000 --- a/.kokoro/samples/python3.8/presubmit.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file From 527f90c1f2fcda02ecf082af07cc9e34497c83ee Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Mon, 15 Dec 2025 18:25:05 -0500 Subject: [PATCH 14/14] updates system default pythons to include 3.9 --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index af8d60c5..8ed65438 100644 --- a/noxfile.py +++ b/noxfile.py @@ -56,7 +56,7 @@ "3.9": [], } -SYSTEM_TEST_PYTHON_VERSIONS = ["3.14"] +SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.14"] SYSTEM_TEST_STANDARD_DEPENDENCIES = [ "mock", "pytest",