From 6fd7db9406269b73260ebef73d9dc45b086c82db Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 23 Nov 2020 15:28:19 -0800 Subject: [PATCH 01/90] feat: tf gcs-bq-ingest module sets bq permissions ... on multiple projects fixup! --- .../gcs_ocn_bq_ingest_function/README.md | 3 ++- .../gcs_ocn_bq_ingest_function/main.tf | 16 ++++++++++++++++ .../gcs_ocn_bq_ingest_function/variables.tf | 7 ++++++- udfs/tests/.gitignore | 1 + 4 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 udfs/tests/.gitignore diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md index 1e42b1966..d5859d5cb 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md @@ -28,6 +28,7 @@ documented [here](../gcs_ocn_bq_ingest_function/README.md) | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | app\_id | Application Name | `any` | n/a | yes | +| bigquery\_project\_ids | Project IDs to grant bigquery Admin / Job user for the data ingester account | `list(string)` | `[]` | no | | cloudfunctions\_source\_bucket | GCS bucket to store Cloud Functions Source | `any` | n/a | yes | | data\_ingester\_sa | Service Account Email responsible for ingesting data to BigQuery | `any` | n/a | yes | | destination\_regex | A [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for destination `dataset`, `table`, (optional: `partition`, `batch`) | `string` | `""` | no | @@ -36,7 +37,7 @@ documented [here](../gcs_ocn_bq_ingest_function/README.md) | input\_prefix | GCS prefix to watch for new files in input\_bucket | `any` | `null` | no | | job\_prefix | Prefix for BigQuery 
Job IDs | `string` | `""` | no | | max\_batch\_bytes | Max bytes for BigQuery Load job | `string` | `""` | no | -| project\_id | GCP Project ID | `any` | n/a | yes | +| project\_id | GCP Project ID containing cloud function, and input bucket | `any` | n/a | yes | | region | GCP region in which to deploy cloud function | `string` | `"us-central1"` | no | | success\_filename | Filename to trigger a load of a prefix | `string` | `""` | no | | use\_pubsub\_notifications | Setting this to true will use Pub/Sub notifications By default we will use Cloud Functions Event direct notifications. See https://cloud.google.com/storage/docs/pubsub-notifications. | `bool` | `false` | no | diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf index 204d9bb42..cd62642ea 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf @@ -92,6 +92,22 @@ module "data_ingester_service_account" { ] } +# Grant the ingester service account permissions to run load jobs and mutate +# data in the target project +resource "google_project_iam_binding" "ingester_bq_job_user" { + for_each = toset(concat(var.bigquery_project_ids, [var.project_id])) + project = each.key + members = [module.data_ingester_service_account.iam_email] + role = "roles/bigquery.jobUser" +} + +resource "google_project_iam_binding" "ingester_bq_admin" { + for_each = toset(concat(var.bigquery_project_ids, [var.project_id])) + project = each.key + members = [module.data_ingester_service_account.iam_email] + role = "roles/bigquery.admin" +} + # Allow the GCS service account to publish notification for new objects to the # notification topic. 
resource "google_pubsub_topic_iam_binding" "gcs_publisher" { diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf index cd5e162bd..e68139b52 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. variable "project_id" { - description = "GCP Project ID" + description = "GCP Project ID containing cloud function, and input bucket" } variable "app_id" { @@ -74,3 +74,8 @@ variable "use_pubsub_notifications" { default = false } +variable "bigquery_project_ids" { + description = "Additional project IDs to grant bigquery Admin / Job user for the data ingester account" + type = list(string) + default = [] +} diff --git a/udfs/tests/.gitignore b/udfs/tests/.gitignore new file mode 100644 index 000000000..c18dd8d83 --- /dev/null +++ b/udfs/tests/.gitignore @@ -0,0 +1 @@ +__pycache__/ From d2f00ce903707828b3fc39b8ee02e62cfa7ca259 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 23 Nov 2020 15:56:00 -0800 Subject: [PATCH 02/90] fixup roles --- .../gcs_ocn_bq_ingest_function/README.md | 3 +-- .../gcs_ocn_bq_ingest_function/main.tf | 15 ++++----------- .../gcs_ocn_bq_ingest_function/outputs.tf | 5 +++++ .../gcs_ocn_bq_ingest_function/variables.tf | 3 ++- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md index d5859d5cb..f1acab548 100644 --- 
a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md @@ -6,7 +6,6 @@ for event based ingest of GCS data to BigQuery described [here](../README.md). Note that by default all environment variables for the cloud function will be empty deferring to the defaults implemented in the function and documented [here](../gcs_ocn_bq_ingest_function/README.md) - ## Requirements | Name | Version | @@ -28,7 +27,7 @@ documented [here](../gcs_ocn_bq_ingest_function/README.md) | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | app\_id | Application Name | `any` | n/a | yes | -| bigquery\_project\_ids | Project IDs to grant bigquery Admin / Job user for the data ingester account | `list(string)` | `[]` | no | +| bigquery\_project\_ids | Additional project IDs to grant bigquery Admin / Job user for the data ingester account | `list(string)` | `[]` | no | | cloudfunctions\_source\_bucket | GCS bucket to store Cloud Functions Source | `any` | n/a | yes | | data\_ingester\_sa | Service Account Email responsible for ingesting data to BigQuery | `any` | n/a | yes | | destination\_regex | A [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for destination `dataset`, `table`, (optional: `partition`, `batch`) | `string` | `""` | no | diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf index cd62642ea..faf9b3b82 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf @@ -36,6 +36,7 @@ module "bucket" { } resource 
"google_storage_notification" "notification" { + depends_on = [google_pubsub_topic_iam_binding.gcs_publisher] count = var.use_pubsub_notifications ? 1 : 0 bucket = module.bucket.bucket object_name_prefix = var.input_prefix @@ -88,24 +89,16 @@ module "data_ingester_service_account" { names = [var.data_ingester_sa, ] project_roles = [ "${var.project_id}=>roles/bigquery.jobUser", - "${var.project_id}=>roles/bigquery.dataEditor", ] } -# Grant the ingester service account permissions to run load jobs and mutate -# data in the target project -resource "google_project_iam_binding" "ingester_bq_job_user" { - for_each = toset(concat(var.bigquery_project_ids, [var.project_id])) - project = each.key - members = [module.data_ingester_service_account.iam_email] - role = "roles/bigquery.jobUser" -} - +# Grant the ingester service account permissions to mutate data in +# target project(s) resource "google_project_iam_binding" "ingester_bq_admin" { for_each = toset(concat(var.bigquery_project_ids, [var.project_id])) project = each.key members = [module.data_ingester_service_account.iam_email] - role = "roles/bigquery.admin" + role = "roles/bigquery.dataEditor" } # Allow the GCS service account to publish notification for new objects to the diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf index 8ba2f4025..e34d2d0f4 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf @@ -16,3 +16,8 @@ output "cloud-function" { value = google_cloudfunctions_function.gcs_to_bq } +output "data-ingester-sa" { + description = "data ingester service account email created as cloud function identity" + value = module.data_ingester_service_account.email +} + diff --git 
a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf index e68139b52..0452e9769 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf @@ -75,7 +75,8 @@ variable "use_pubsub_notifications" { } variable "bigquery_project_ids" { - description = "Additional project IDs to grant bigquery Admin / Job user for the data ingester account" + description = "Additional project IDs to grant bigquery Admin for the data ingester account" type = list(string) default = [] } + From 65d9515cbe75abf5c7123844aa3f5c192ff60a80 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 23 Nov 2020 16:00:30 -0800 Subject: [PATCH 03/90] fixup dockerfil ci check --- tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci b/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci index eb12bd903..5cd40aa1e 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci +++ b/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci @@ -1,4 +1,4 @@ FROM python:3.8-slim COPY requirements.txt requirements-dev.txt ./ -RUN pip3 install -r requirements-dev.txt +RUN pip3 install --no-cache-dir -r requirements-dev.txt ENTRYPOINT ["pytest"] From 7fdffd7539cfc47542fbaa48429a3d8395b2eb1a Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 24 Nov 2020 10:54:34 -0800 Subject: [PATCH 04/90] docs: add note on unicode delimiters --- .../cloud_functions/gcs_event_based_ingest/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md 
b/tools/cloud_functions/gcs_event_based_ingest/README.md index 9fda82d39..529e17939 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -133,6 +133,17 @@ The result of merging these would be: This configuration system gives us the ability to DRY up common defaults but override them at whatever level is appropriate as new cases come up. +### Note on Delimiters: Use Unicode +For CSV loads the `fieldDelimiter` in load.json to external.json should be +specified as a unicode character _not_ a hexidecimal character as hexidecimal +characters will confuse python's `json.load` function. +For example ctrl-P should be specified as: +```json +{ + "fieldDelimiter": "\u0010" +} +``` + #### Transformation SQL In some cases we may need to perform transformations on the files in GCS before they can be loaded to BigQuery. This is handled by query on an From 537f05d1ef78f14a43a8124ac91ea10777c12239 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 30 Nov 2020 12:27:09 -0800 Subject: [PATCH 05/90] fix: update nested values in configs --- .../gcs_event_based_ingest/.flake8 | 2 +- .../gcs_ocn_bq_ingest/main.py | 43 ++++++++-- .../test_gcs_ocn_bq_ingest.py | 83 +++++++++++++++++++ 3 files changed, 118 insertions(+), 10 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/.flake8 b/tools/cloud_functions/gcs_event_based_ingest/.flake8 index dafc87320..732e2a9fc 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/.flake8 +++ b/tools/cloud_functions/gcs_event_based_ingest/.flake8 @@ -1,6 +1,6 @@ [flake8] max-line-length = 110 ignore = E731,W504,I001,W503,E402 -exclude = .svn,CVS,.bzr,.hg,.git,__pycache__,.eggs,*.egg,node_modules,.venv +exclude = .svn,CVS,.bzr,.hg,.git,__pycache__,.eggs,*.egg,node_modules,.venv,.terraform # format = ${cyan}%(path)s${reset}:${yellow_bold}%(row)d${reset}:${green_bold}%(col)d${reset}: ${red_bold}%(code)s${reset} %(text)s diff --git 
a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index d05b771db..20d4d7604 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -17,6 +17,8 @@ """Background Cloud Function for loading data from GCS to BigQuery. """ import collections +import collections.abc +import copy import json import os import pathlib @@ -152,9 +154,8 @@ def main(event: Dict, context): # pylint: disable=unused-argument default_query_config = bigquery.QueryJobConfig() default_query_config.use_legacy_sql = False default_query_config.labels = labels - bq_client = bigquery.Client( - client_info=CLIENT_INFO, - default_query_job_config=default_query_config) + bq_client = bigquery.Client(client_info=CLIENT_INFO, + default_query_job_config=default_query_config) print(f"looking for {gsurl}_config/bq_transform.sql") external_query_sql = read_gcs_file_if_exists( @@ -308,10 +309,8 @@ def handle_duplicate_notification(bkt: storage.Bucket, success_created_unix_timestamp = success_blob.time_created.timestamp() claim_blob: storage.Blob = bkt.blob( - success_blob.name.replace( - SUCCESS_FILENAME, - f"_claimed_{success_created_unix_timestamp}") - ) + success_blob.name.replace(SUCCESS_FILENAME, + f"_claimed_{success_created_unix_timestamp}")) try: claim_blob.upload_from_string("", if_generation_match=0) except google.api_core.exceptions.PreconditionFailed as err: @@ -379,9 +378,9 @@ def _get_parent_config(path): config_q.append(json.loads(config)) parts.pop() - merged_config = dict() + merged_config: Dict = {} while config_q: - merged_config.update(config_q.popleft()) + recursive_update(merged_config, config_q.popleft(), in_place=True) print(f"merged_config: {merged_config}") return bigquery.LoadJobConfig.from_api_repr({"load": merged_config}) @@ -549,3 +548,29 @@ def removesuffix(in_str: str, suffix: str) 
-> str: if suffix and in_str.endswith(suffix): return in_str[:-len(suffix)] return in_str[:] + + +def recursive_update( + original: Dict, + update: Dict, + in_place: bool = False +): + """ + return a recursively updated dictionary. + + Note, lists will be completely overwritten by value in update if there is a + conflict. + + original: (dict) the base dictionary + update: (dict) the dictionary of updates to apply on original + in_place: (bool) if true then original will be mutated in place else a new + dictionary as a result of the update will be returned. + """ + out = original if in_place else copy.deepcopy(original) + + for key, value in update.items(): + if isinstance(value, dict): + out[key] = recursive_update(out.get(key, {}), value) + else: + out[key] = value + return out diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index f3e02a50b..6f983d22d 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -119,3 +119,86 @@ def test_default_destination_regex(test_input: str, ]) def test_flattend2dlist(test_input, expected): assert gcs_ocn_bq_ingest.main.flatten2dlist(test_input) == expected + + +@pytest.mark.parametrize( + "original, update, expected", + [ + # yapf: disable + ( # empty original + {}, + { + "a": 1 + }, + { + "a": 1 + } + ), + ( # empty update + { + "a": 1 + }, + {}, + { + "a": 1 + }), + ( # basic update of top-level key + { + "a": 1 + }, + { + "a": 2 + }, + { + "a": 2 + }), + ( # update of list + { + "a": [1] + }, + { + "a": [2] + }, + { + "a": [2] + }), + ( # update of nested key + { + "a": { + "b": 1 + } + }, + { + "a": { + "b": 2 + } + }, + { + "a": { + "b": 2 + } + }), + ( # don't drop keys that only appear in original + { + "a": { + 
"b": 1, + "c": 2 + }, + "d": 3 + }, + { + "a": { + "b": 4 + }, + }, + { + "a": { + "b": 4, + "c": 2 + }, + "d": 3 + }), + # yapf: enable + ]) +def test_recursive_update(original, update, expected): + assert gcs_ocn_bq_ingest.main.recursive_update(original, update) == expected From 125ca9f47661cdef47ca76298ab361cd2ecafc35 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 30 Nov 2020 13:13:01 -0800 Subject: [PATCH 06/90] chore: improve error message for wrong external table name (#200) --- .../gcs_ocn_bq_ingest/main.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index d05b771db..32316593e 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -249,8 +249,17 @@ def external_query( # pylint: disable=too-many-arguments while time.monotonic() - start_poll_for_errors < WAIT_FOR_JOB_SECONDS: job.reload() if job.errors: - raise RuntimeError( - f"query job {job.job_id} failed quickly: {job.errors}") + msg = f"query job {job.job_id} failed quickly: {job.errors}" + for err in job.errors: + # BQ gives confusing warning about missing dataset if the + # external query refers to the wrong external table name. + # In this case we can give the end user a little more context. + if "missing dataset" in err.get("message", ""): + raise RuntimeError( + "External queries must select from the external table " + "named 'temp_ext'. This error may be due to specifying" + "the wrong name for the external table. 
" + msg) + raise RuntimeError(msg) time.sleep(JOB_POLL_INTERVAL_SECONDS) From 02458b81f3c599f4925966345586fe2d646cc52b Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 25 Nov 2020 12:03:29 -0800 Subject: [PATCH 07/90] fix: external configs not found in parent dirs --- .../gcs_event_based_ingest/.flake8 | 2 +- .../gcs_ocn_bq_ingest/main.py | 40 ++++++++++--------- .../gcs_event_based_ingest/tests/conftest.py | 29 +++++++++++++- .../test_gcs_ocn_bq_ingest_it.py | 24 +++++++++++ 4 files changed, 74 insertions(+), 21 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/.flake8 b/tools/cloud_functions/gcs_event_based_ingest/.flake8 index dafc87320..732e2a9fc 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/.flake8 +++ b/tools/cloud_functions/gcs_event_based_ingest/.flake8 @@ -1,6 +1,6 @@ [flake8] max-line-length = 110 ignore = E731,W504,I001,W503,E402 -exclude = .svn,CVS,.bzr,.hg,.git,__pycache__,.eggs,*.egg,node_modules,.venv +exclude = .svn,CVS,.bzr,.hg,.git,__pycache__,.eggs,*.egg,node_modules,.venv,.terraform # format = ${cyan}%(path)s${reset}:${yellow_bold}%(row)d${reset}:${green_bold}%(col)d${reset}: ${red_bold}%(code)s${reset} %(text)s diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index d05b771db..aab5e8410 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -37,7 +37,10 @@ MAX_SOURCE_URIS_PER_LOAD = 10**4 DEFAULT_EXTERNAL_TABLE_DEFINITION = { - "sourceFormat": "CSV", + # The default must be a self describing data format + # because autodetecting CSV /JSON schemas is likely to not match + # expectations / assumptions of the transformation query. 
+ "sourceFormat": "PARQUET", } DEFAULT_JOB_LABELS = { @@ -152,18 +155,18 @@ def main(event: Dict, context): # pylint: disable=unused-argument default_query_config = bigquery.QueryJobConfig() default_query_config.use_legacy_sql = False default_query_config.labels = labels - bq_client = bigquery.Client( - client_info=CLIENT_INFO, - default_query_job_config=default_query_config) + bq_client = bigquery.Client(client_info=CLIENT_INFO, + default_query_job_config=default_query_config) - print(f"looking for {gsurl}_config/bq_transform.sql") + print("looking for bq_transform.sql") external_query_sql = read_gcs_file_if_exists( gcs_client, f"{gsurl}_config/bq_transform.sql") - print(f"external_query_sql = {external_query_sql}") if not external_query_sql: - external_query_sql = look_for_transform_sql(gcs_client, gsurl) + external_query_sql = look_for_config_in_parents(gcs_client, gsurl, + "bq_transform.sql") if external_query_sql: print("EXTERNAL QUERY") + print(f"found external query:\n{external_query_sql}") external_query(gcs_client, bq_client, gsurl, external_query_sql, dest_table_ref, create_job_id_prefix(dest_table_ref, batch_id)) @@ -217,15 +220,19 @@ def external_query( # pylint: disable=too-many-arguments """ external_table_config = read_gcs_file_if_exists( gcs_client, f"{gsurl}_config/external.json") + if not external_table_config: + external_table_config = look_for_config_in_parents( + gcs_client, gsurl, "external.json") if external_table_config: external_table_def = json.loads(external_table_config) else: print(f"Falling back to default CSV external table." 
- f" {gsurl}/_config/external.json not found.") + f" {gsurl}_config/external.json not found.") external_table_def = DEFAULT_EXTERNAL_TABLE_DEFINITION external_table_def["sourceUris"] = flatten2dlist( get_batches_for_prefix(gcs_client, gsurl)) + print(f"external table def = {json.dumps(external_table_config, indent=2)}") external_config = bigquery.ExternalConfig.from_api_repr(external_table_def) job_config = bigquery.QueryJobConfig( table_definitions={"temp_ext": external_config}, use_legacy_sql=False) @@ -308,10 +315,8 @@ def handle_duplicate_notification(bkt: storage.Bucket, success_created_unix_timestamp = success_blob.time_created.timestamp() claim_blob: storage.Blob = bkt.blob( - success_blob.name.replace( - SUCCESS_FILENAME, - f"_claimed_{success_created_unix_timestamp}") - ) + success_blob.name.replace(SUCCESS_FILENAME, + f"_claimed_{success_created_unix_timestamp}")) try: claim_blob.upload_from_string("", if_generation_match=0) except google.api_core.exceptions.PreconditionFailed as err: @@ -333,16 +338,15 @@ def _get_parent_config_file(storage_client, config_filename, bucket, path): f"gs://{bucket}/{config_path}") -def look_for_transform_sql(storage_client: storage.Client, - gsurl: str) -> Optional[str]: - """look in parent directories for _config/bq_transform.sql""" - config_filename = "bq_transform.sql" +def look_for_config_in_parents(storage_client: storage.Client, gsurl: str, + config_filename: str) -> Optional[str]: + """look in parent directories for _config/config_filename""" blob: storage.Blob = storage.Blob.from_string(gsurl) bucket_name = blob.bucket.name obj_path = blob.name parts = removesuffix(obj_path, "/").split("/") - def _get_parent_query(path): + def _get_parent_config(path): return _get_parent_config_file(storage_client, config_filename, bucket_name, path) @@ -350,7 +354,7 @@ def _get_parent_query(path): while parts: if config: return config - config = _get_parent_query("/".join(parts)) + config = _get_parent_config("/".join(parts)) 
parts.pop() return config diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index c0ae3f8ab..4121ba3fc 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -134,6 +134,30 @@ def teardown(): return data_objs[-1] +@pytest.fixture(scope="function") +@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") +def gcs_data_under_sub_dirs(request, gcs_bucket, dest_dataset, + dest_table) -> storage.blob.Blob: + data_objs = [] + for test_file in ["part-m-00000", "part-m-00001", "_SUCCESS"]: + data_obj: storage.blob.Blob = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_table.table_id, "foo", "bar", "baz", test_file + ])) + data_obj.upload_from_filename( + os.path.join(TEST_DIR, "resources", "test-data", "nation", + test_file)) + data_objs.append(data_obj) + + def teardown(): + for do in data_objs: + if do.exists: + do.delete() + + request.addfinalizer(teardown) + return data_objs[-1] + + @pytest.fixture(scope="function") @pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") def gcs_truncating_load_config(request, gcs_bucket, dest_dataset, @@ -188,7 +212,7 @@ def gcs_external_config(request, gcs_bucket, dest_dataset, dest_table) -> List[storage.blob.Blob]: config_objs = [] sql_obj = gcs_bucket.blob("/".join([ - dest_dataset.dataset_id, + f"{dest_dataset.project}.{dest_dataset.dataset_id}", dest_table.table_id, "_config", "bq_transform.sql", @@ -198,7 +222,8 @@ def gcs_external_config(request, gcs_bucket, dest_dataset, sql_obj.upload_from_string(sql) config_obj = gcs_bucket.blob("/".join([ - dest_dataset.dataset_id, dest_table.table_id, "_config", "external.json" + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_table.table_id, "_config", "external.json" ])) with open(os.path.join(TEST_DIR, "resources", diff --git 
a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py index a5a81b949..44a5e717a 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py @@ -209,6 +209,30 @@ def test_load_job_partitioned(bq, gcs_partitioned_data, bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows) +@pytest.mark.IT +def test_look_for_config_in_parents(bq, gcs_data_under_sub_dirs, + gcs_external_config, dest_dataset, + dest_table, mock_env): + """test discovery of configuration files for external query in parent + _config paths. + """ + if not all((blob.exists() for blob in gcs_external_config)): + raise google.cloud.exceptions.NotFound("config objects must exist") + if not gcs_data_under_sub_dirs.exists(): + raise google.cloud.exceptions.NotFound("test data objects must exist") + test_event = { + "attributes": { + "bucketId": gcs_data_under_sub_dirs.bucket.name, + "objectId": gcs_data_under_sub_dirs.name + } + } + gcs_ocn_bq_ingest.main.main(test_event, None) + test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation", + "part-m-00001") + expected_num_rows = sum(1 for _ in open(test_data_file)) + bq_wait_for_rows(bq, dest_table, expected_num_rows) + + def bq_wait_for_rows(bq_client: bigquery.Client, table: bigquery.Table, expected_num_rows: int): """ From 1525102768c4df9f9b48b6498a583d66fed5dc36 Mon Sep 17 00:00:00 2001 From: pdunn Date: Tue, 1 Dec 2020 20:32:41 +0000 Subject: [PATCH 08/90] time series UDFs (#198) * time series UDFs * code review changes Co-authored-by: Ryan McDowell --- udfs/community/README.md | 67 ++++++++++++++++++++++ udfs/community/linear_interpolate.sql | 30 ++++++++++ udfs/community/test_cases.yaml | 38 +++++++++++- 
udfs/community/ts_gen_keyed_timestamps.sql | 46 +++++++++++++++ udfs/community/ts_linear_interpolate.sql | 67 ++++++++++++++++++++++ udfs/community/ts_tumble.sql | 30 ++++++++++ 6 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 udfs/community/linear_interpolate.sql create mode 100644 udfs/community/ts_gen_keyed_timestamps.sql create mode 100644 udfs/community/ts_linear_interpolate.sql create mode 100644 udfs/community/ts_tumble.sql diff --git a/udfs/community/README.md b/udfs/community/README.md index a8badc8e5..3107a3b9d 100644 --- a/udfs/community/README.md +++ b/udfs/community/README.md @@ -21,6 +21,7 @@ SELECT bqutil.fn.int(1.684) * [int](#intv-any-type) * [json_typeof](#json_typeofjson-string) * [last_day](#lastdaydt-date) +* [linear_interpolate](#linear_interpolate) * [median](#medianarr-any-type) * [nlp_compromise_number](#nlp_compromise_numberstr-string) * [nlp_compromise_people](#nlp_compromise_peoplestr-string) @@ -30,6 +31,9 @@ SELECT bqutil.fn.int(1.684) * [random_int](#random_intmin-any-type-max-any-type) * [random_value](#random_valuearr-any-type) * [translate](#translateexpression-string-characters_to_replace-string-characters_to_substitute-string) +* [ts_gen_keyed_timestamps](#ts_gen_keyed_timestamps) +* [ts_linear_interpolate](#ts_linear_interpolate) +* [ts_tumble](#ts_tumble) * [typeof](#typeofinput-any-type) * [url_keys](#url_keysquery-string) * [url_param](#url_paramquery-string-p-string) @@ -230,6 +234,22 @@ results: | 1987-12-31 | 1998-09-30 | 2020-02-29 | 2019-02-28 | +### [linear_interpolate(pos INT64, prev STRUCT, next STRUCT)](linear_interpolate.sql) +Interpolate the current positions value from the preceding and folllowing coordinates + +```sql +SELECT + bqutil.fn.linear_interpolate(2, STRUCT(0 AS x, 0.0 AS y), STRUCT(10 AS x, 10.0 AS y)), + bqutil.fn.linear_interpolate(2, STRUCT(0 AS x, 0.0 AS y), STRUCT(20 AS x, 10.0 AS y)) +``` + +results: + +| f0_ | f1_ | +|-----|-----| +| 2.0 | 1.0 | + + ### [median(arr ANY 
TYPE)](median.sql) Get the median of an array of numbers. @@ -344,6 +364,53 @@ SELECT bqutil.fn.translate('mint tea', 'inea', 'osin') most tin ``` +### [ts_gen_keyed_timestamps(keys ARRAY, tumble_seconds INT64, min_ts TIMESTAMP, max_ts TIMESTAMP](ts_gen_keyed_timestamps.sql) +Generate a timestamp array associated with each key + +```sql +SELECT * +FROM + UNNEST(bqutil.fn.ts_gen_keyed_timestamps(['abc', 'def'], 60, TIMESTAMP '2020-01-01 00:30:00', TIMESTAMP '2020-01-01 00:31:00)) +``` + +| series_key | tumble_val +|------------|-------------------------| +| abc | 2020-01-01 00:30:00 UTC | +| def | 2020-01-01 00:30:00 UTC | +| abc | 2020-01-01 00:31:00 UTC | +| def | 2020-01-01 00:31:00 UTC | + + +### [ts_linear_interpolate(pos TIMESTAMP, prev STRUCT(x TIMESTAMP, y FLOAT6), next STRUCT(x TIMESTAMP, y FLOAT64))](ts_linear_interpolation.sql) +Interpolate the positions value using timestamp seconds as the x-axis + +```sql +select bqutil.fn.ts_linear_interpolate( + TIMESTAMP '2020-01-01 00:30:00', + STRUCT(TIMESTAMP '2020-01-01 00:29:00' AS x, 1.0 AS y), + STRUCT(TIMESTAMP '2020-01-01 00:31:00' AS x, 3.0 AS y) +) +``` + +| f0_ | +|-----| +| 2.0 | + + +### [ts_tumble(input_ts TIMESTAMP, tumble_seconds INT64)](ts_tumble.sql) +Calculate the [tumbling window](https://cloud.google.com/dataflow/docs/concepts/streaming-pipelines#tumbling-windows) the input_ts belongs in + +```sql +SELECT + fn.ts_tumble(TIMESTAMP '2020-01-01 00:17:30', 900) AS min_15, + fn.ts_tumble(TIMESTAMP '2020-01-01 00:17:30', 600) AS min_10, + fn.ts_tumble(TIMESTAMP '2020-01-01 00:17:30', 60) As min_1 +``` + +| min_15 | min_10 | | +|-------------------------|-------------------------|-------------------------| +| 2020-01-01 00:15:00 UTC | 2020-01-01 00:10:00 UTC | 2020-01-01 00:17:00 UTC | + ### [typeof(input ANY TYPE)](typeof.sql) diff --git a/udfs/community/linear_interpolate.sql b/udfs/community/linear_interpolate.sql new file mode 100644 index 000000000..4dc54229d --- /dev/null +++ 
b/udfs/community/linear_interpolate.sql @@ -0,0 +1,30 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +-- linear_interpolate: +-- Input: +-- pos: the position on x axis for the independent variable +-- prev: the x,y coordinate of the preceding value +-- next: the x,y coordinate of the following value +-- Output: the interpolated y value +CREATE OR REPLACE FUNCTION fn.linear_interpolate(pos INT64, prev STRUCT, next STRUCT) +RETURNS FLOAT64 AS ( + CASE + WHEN pos IS NULL OR prev IS NULL OR next IS NULL THEN NULL + ELSE + (next.y - prev.y) / (next.x - prev.x) * (pos - prev.x) + prev.y + END +); diff --git a/udfs/community/test_cases.yaml b/udfs/community/test_cases.yaml index 067a27fcc..2f5e80d64 100644 --- a/udfs/community/test_cases.yaml +++ b/udfs/community/test_cases.yaml @@ -293,4 +293,40 @@ percentage_difference: - test: input: CAST(1.0 AS FLOAT64), CAST(1000000000 AS INT64) expected_output: CAST(2.0 AS FLOAT64) - +linear_interpolate: + - test: + input: CAST(2 AS INT64), STRUCT(CAST(1 AS INT64) AS x, CAST(1.0 AS FLOAT64) AS y), STRUCT(CAST(3 AS INT64) AS x, CAST(3.0 AS FLOAT64) AS y) + expected_output: CAST(2.0 AS FLOAT64) + - test: + input: CAST(3 AS INT64), STRUCT(CAST(1 AS INT64) AS x, CAST(1.0 AS FLOAT64) AS y), STRUCT(CAST(4 AS INT64) AS x, CAST(4.0 AS FLOAT64) AS y) + expected_output: CAST(3.0 AS FLOAT64) +ts_lin_interpolate: + - test: + input: CAST('2020-01-01 00:15:00' AS TIMESTAMP), 
STRUCT(CAST('2020-01-01 00:00:00' AS TIMESTAMP) AS x, CAST(1.0 AS FLOAT64)), STRUCT(CAST('2020-01-01 00:30:00' AS TIMESTAMP) AS x, CAST(3.0 AS FLOAT64)) + expected_output: CAST(2.0 AS FLOAT64) + - test: + input: CAST('2020-01-01 00:15:00' AS TIMESTAMP), STRUCT(CAST('2020-01-01 00:00:00' AS TIMESTAMP) AS x, CAST(1.0 AS FLOAT64)), STRUCT(CAST('2020-01-01 02:30:00' AS TIMESTAMP) AS x, CAST(3.0 AS FLOAT64)) + expected_output: CAST(1.2 AS FLOAT64) +ts_tumble: + - test: + input: CAST('2020-01-01 00:17:30' AS TIMESTAMP), CAST(900 AS INT64) + expected_output: CAST('2020-01-01 00:15:00' AS TIMESTAMP) + - test: + input: CAST('2020-01-01 00:17:30' AS TIMESTAMP), CAST(600 AS INT64) + expected_output: CAST('2020-01-01 00:10:00' AS TIMESTAMP) + - test: + input: CAST('2020-01-01 00:17:30' AS TIMESTAMP), CAST(300 AS INT64) + expected_output: CAST('2020-01-01 00:15:00' AS TIMESTAMP) + - test: + input: CAST('2020-01-01 00:17:30' AS TIMESTAMP), CAST(60 AS INT64) + expected_output: CAST('2020-01-01 00:17:00' AS TIMESTAMP) + - test: + input: CAST('2020-01-01 00:17:30' AS TIMESTAMP), CAST(0 AS INT64) + expected_output: (NULL) +ts_gen_keyed_timestamps: + - test: + input: ARRAY['abc'], CAST(60 AS INT64), CAST('2020-01-01 00:30:00' AS TIMESTAMP), CAST('2020-01-01 00:31:00' AS TIMESTAMP) + expected_output: ([STRUCT(CAST('abc' AS STRING) AS series_key, CAST('2020-01-01 00:30:00' AS TIMESTAMP) AS tumble_val), STRUCT(CAST('abc' AS STRING) AS series_key, CAST('2020-01-01 00:31:00' AS TIMESTAMP) AS tumble_val)]) + - test: + input: ARRAY['abc', 'def'], CAST(60 AS INT64), CAST('2020-01-01 00:30:00' AS TIMESTAMP), CAST('2020-01-01 00:30:30' AS TIMESTAMP) + expected_output: ([STRUCT(CAST('abc' AS STRING) AS series_key, CAST('2020-01-01 00:30:00' AS TIMESTAMP) AS tumble_val), STRUCT(CAST('def' AS STRING) AS series_key, CAST('2020-01-01 00:30:00' AS TIMESTAMP) AS tumble_val)]) diff --git a/udfs/community/ts_gen_keyed_timestamps.sql b/udfs/community/ts_gen_keyed_timestamps.sql new file mode 100644 
index 000000000..45e549521 --- /dev/null +++ b/udfs/community/ts_gen_keyed_timestamps.sql @@ -0,0 +1,46 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +* Generate an array of key-timestamp structs with the specified min, max and interval timeseries +* Example Usage: +* SELECT * +* FROM UNNEST(bqutil.fn.ts_gen_keyed_timestamp(['abc'], 900, '2020-01-01', '2020-01-02') a +* LEFT JOIN dataset.table ON a.series_key = a.key AND a.tumble_val = b.timestamp +*/ + +-- ts_gen_keyed_timestamps: +-- Input: +-- keys: strings that are cross joined with the generated timestamps +-- tumble_seconds: the windowing interval for each generated timestamp +-- min_ts: the inclusive lower bound for the generated timestamps, normalized by the tumble_seconds +-- max_ts: the inclusive upper bound for the generated timestamps, normalized by the tumble_seconds +-- Output: An array of generated timestamps for each key - ARRAY> +CREATE OR REPLACE FUNCTION fn.ts_gen_keyed_timestamps(keys ARRAY, tumble_seconds INT64, min_ts TIMESTAMP, max_ts Timestamp) +RETURNS ARRAY> AS (( + SELECT ARRAY_AGG(x) + FROM ( + SELECT series_key, tumble_val + FROM UNNEST( + GENERATE_TIMESTAMP_ARRAY( + bqutil.fn.ts_tumble(min_ts, tumble_seconds), + bqutil.fn.ts_tumble(max_ts, tumble_seconds), + INTERVAL tumble_seconds SECOND + ) + ) AS tumble_val + CROSS JOIN UNNEST(keys) AS series_key + ) x +)); diff --git a/udfs/community/ts_linear_interpolate.sql 
b/udfs/community/ts_linear_interpolate.sql new file mode 100644 index 000000000..9be441166 --- /dev/null +++ b/udfs/community/ts_linear_interpolate.sql @@ -0,0 +1,67 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +* wrap fn.linear_interpolate to handle time series interpolation +* +* Example usage: use value if exists, otherwise attempt linear interpolation, else fill with zero +* +* WITH tbl AS ( +* SELECT 'abc' key, CAST('2021-01-01' AS TIMESTAMP) ts, 1 value, STRUCT(CAST('2021-01-01' AS TIMESTAMP) AS x, 1 AS y) coord +* UNION ALL +* SELECT 'abc', CAST('2021-01-02' AS TIMESTAMP), null, null +* UNION ALL +* SELECT 'abc', CAST('2021-01-03' AS TIMESTAMP), 3, STRUCT(CAST('2021-01-03' AS TIMESTAMP) AS x, 3 AS y) +* UNION ALL +* SELECT 'abc', CAST('2021-01-04' AS TIMESTAMP), null, null +* ) +* SELECT +* *, +* COALESCE(coord.y, +* fn.ts_lin_interpolate( +* ts, +* LAST_VALUE(coord IGNORE NULLS) +* OVER (PARTITION BY key +* ORDER BY unix_seconds(ts) ASC +* RANGE BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), +* FIRST_VALUE(coord IGNORE NULLS) +* OVER (PARTITION BY key +* ORDER BY unix_seconds(ts) ASC +* RANGE BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING) +* ), +* 0 +* ) AS intrp +* FROM tbl +*/ + +-- ts_linear_interpolate: +-- Input: +-- pos: the independent variable of a linear interpolation, represented as a TIMESTAMP +-- prev: the x,y coordinate of the preceding value, where the x-coordinate is a TIMESTAMP +-- 
next: the x,y coordinate of the following value, where the x-coordinate is a TIMESTAMP +-- Output: the interpolated y value +CREATE OR REPLACE FUNCTION fn.ts_linear_interpolate(pos TIMESTAMP, prev STRUCT, next STRUCT) +RETURNS FLOAT64 AS ( + CASE + WHEN pos IS NULL OR prev IS NULL OR next IS NULL THEN NULL + ELSE + bqutil.fn.linear_interpolate( + UNIX_SECONDS(pos), + STRUCT(UNIX_SECONDS(prev.x) AS x, prev.y AS y), + STRUCT(UNIX_SECONDS(next.x) AS x, next.y AS y) + ) + END +); diff --git a/udfs/community/ts_tumble.sql b/udfs/community/ts_tumble.sql new file mode 100644 index 000000000..263002202 --- /dev/null +++ b/udfs/community/ts_tumble.sql @@ -0,0 +1,30 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +-- ts_tumble: +-- Input: +-- input_ts: timestamp to be divided into a [tumble window](https://cloud.google.com/dataflow/docs/reference/sql/streaming-extensions#tumble) +-- tumble_seconds: size of the tumble window in seconds +-- Output: the starting TIMESTAMP of the tumble winow the input_ts belongs to +CREATE OR REPLACE FUNCTION fn.ts_tumble(input_ts TIMESTAMP, tumble_seconds INT64) +RETURNS TIMESTAMP +AS ( + IF ( + tumble_seconds > 0, + TIMESTAMP_SECONDS(DIV(UNIX_SECONDS(input_ts), tumble_seconds) * tumble_seconds), + NULL + ) +); From cfe8f8ce90bfa47151f60808a7a231af8c639180 Mon Sep 17 00:00:00 2001 From: Daniel De Leo Date: Tue, 1 Dec 2020 18:07:50 -0500 Subject: [PATCH 09/90] Adding helper assets for JMeter performance testing on BigQuery (#203) * Adding helper assets for JMeter performance testing on BigQuery * Adding trailing new lines and removing commented code Co-authored-by: Ryan McDowell --- performance_testing/jmeter/README.md | 128 +++++++ .../jmeter/bigquery_http_sampler.jmx | 338 ++++++++++++++++++ .../jmeter/bigquery_jdbc_sampler.jmx | 269 ++++++++++++++ .../jmeter/cancel_running_jobs.py | 38 ++ .../jmeter/http_sampler_results.sql | 32 ++ .../jmeter/jdbc_sampler_results.sql | 27 ++ .../jmeter/run_jmeter_http_sampler.sh | 44 +++ .../jmeter/run_jmeter_jdbc_sampler.sh | 44 +++ .../jmeter/test_queries/complex_selects.csv | 101 ++++++ .../jmeter/test_queries/medium_selects.csv | 101 ++++++ .../jmeter/test_queries/simple_selects.csv | 101 ++++++ 11 files changed, 1223 insertions(+) create mode 100644 performance_testing/jmeter/README.md create mode 100644 performance_testing/jmeter/bigquery_http_sampler.jmx create mode 100644 performance_testing/jmeter/bigquery_jdbc_sampler.jmx create mode 100644 performance_testing/jmeter/cancel_running_jobs.py create mode 100644 performance_testing/jmeter/http_sampler_results.sql create mode 100644 performance_testing/jmeter/jdbc_sampler_results.sql create mode 100755 
performance_testing/jmeter/run_jmeter_http_sampler.sh create mode 100755 performance_testing/jmeter/run_jmeter_jdbc_sampler.sh create mode 100644 performance_testing/jmeter/test_queries/complex_selects.csv create mode 100644 performance_testing/jmeter/test_queries/medium_selects.csv create mode 100644 performance_testing/jmeter/test_queries/simple_selects.csv diff --git a/performance_testing/jmeter/README.md b/performance_testing/jmeter/README.md new file mode 100644 index 000000000..a7f855fdf --- /dev/null +++ b/performance_testing/jmeter/README.md @@ -0,0 +1,128 @@ +# Using JMeter for BigQuery Performance Testing + +## Before You Start + +Make sure you've completed the following prerequisite steps before running the +provided JMeter test plans + +* Install + [Java 8+ Oracle JDK](https://www.oracle.com/java/technologies/javase/javase-jdk8-downloads.html) + from Oracle page +* Download the + [Simba BigQuery JDBC Driver](https://cloud.google.com/bigquery/providers/simba-drivers) +* Download the latest + [JMeter Binary](https://jmeter.apache.org/download_jmeter.cgi) + +## Which JMeter Test Plan Do I Use? + +### [bigquery_jdbc_sampler.jmx](bigquery_jdbc_sampler.jmx) (Runs queries using JDBC driver) + +#### Pros + +* **Long-running job polling** - The JDBC request sampler is necessary for + tests where queries run longer than 4 minutes and where a consistent + concurrency level must be maintained. The JDBC driver will poll the query + job until it is finished before submitting a new query, ensuring that JMeter + active threads exactly match active BigQuery query jobs. +* **Simpler query format** - The JDBC request sampler does not require you to + form a JSON configuration object to submit the query to the API. This + eliminates JSON errors as a source of problems. + * Unescaped double quotes are allowed in SQL queries - You do not have to + escape double quotes in your SQL queries as is required in the HTTP + sampler. 
+ +#### Cons + +* **JDBC overhead latency** - The JDBC driver has some overhead latency + associated with it versus directly calling the REST API. Use the + BigQuery-provided + [INFORMATION_SCHEMA.JOBS_BY*](https://cloud.google.com/bigquery/docs/information-schema-jobs) + view to exclusively measure query runtime without any other latencies like + network. +* **BigQuery job labels unsupported** - You cannot currently set labels for + jobs submitted by the JDBC driver. In order to get a similar effect to + labeling, you'll need to include something like a JSON object in a comment + in each query, that can be parsed when querying the + [INFORMATION_SCHEMA.JOBS_BY*](https://cloud.google.com/bigquery/docs/information-schema-jobs) + view. +* **Response rows must be returned** - The JDBC driver does not support an + option to return 0 results. The MaxResults JDBC config should therefore be + set to 1, since the default setting of 0 instructs the JDBC driver to return + all rows. + +### [bigquery_http_sampler.jmx](bigquery_http_sampler.jmx) (Runs queries using REST API) + +#### Pros + +* **Fully configurable job options, including job labels** - The HTTP request + sampler allows you to specify the raw JSON request body which can include + any supported BigQuery options. In particular, it's very useful to include + query labels, since these will be present in the + [jobs metadata schema](https://cloud.google.com/bigquery/docs/information-schema-jobs#schema) + in the labels field. +* **Faster Performance** - Since JMeter is making REST calls directly to the + BigQuery API, the performance is faster than having to invoke BigQuery API + via the Java JDBC driver. + +#### Cons + +* **Default 1 hour maximum lifetime for access tokens** - The HTTP request + sampler uses an access token (which you provide as a command-line parameter + at startup) to authenticate with BigQuery. The default maximum lifetime of a + Google access token is 1 hour (3,600 seconds). 
However, you can extend the + maximum lifetime to 12 hours by + [modifying the organization policy](https://cloud.google.com/resource-manager/docs/organization-policy/restricting-service-accounts#extend_oauth_ttl). + JMeter calls to BigQuery APIs will start failing if your JMeter test runs + longer than your access token’s maximum lifetime. +* **JSON body configuration** - You need to configure the API request payload + using JSON, and the JSON object configuration is easy to break. A stray + quote or a missing comma can make your query fail in ways that are hard to + troubleshoot. + * **Queries must have all double quotes escaped** - Since the SQL queries + you pass to JMeter are values inside the HTTP request JSON body, you + must escape all double quotes that appear in the SQL query with a + backslash. ( e.g. SELECT \”Hello World\” ) +* **4min Max Timeout** - If a query runs for longer than 4 minutes, it can + appear to be done. If you intend to use JMeter's data to characterize the + runtime of your queries, this is a critical consideration. The results will + be wrong if you have queries that are long-running. + +## Running the JMeter Test Plan + +The JMeter test plans provided in this repo are designed to be run with very few +modifications. You should first test-run them this way before adding in more +changes to simplify troubleshooting if any issues are encountered. + +### [run_jmeter_jdbc_sampler.sh](run_jmeter_jdbc_sampler.sh) (**Runs bigquery_jdbc_sampler.jmx**) + +1. Replace the bash script placeholders with your own values, depending on + whether you use JDBC or HTTP as shown below: + * `-Jproject_id=`*YOUR_PROJECT* + * `-Juser.classpath=`*/path/to/your/SimbaJDBCDriverforGoogleBigQuery* +1. 
Ensure proper authentication is set up for either service account or user + account authentication: + * Service account authentication: \ + `export GOOGLE_APPLICATION_CREDENTIALS=`*/path/to/your/private_key.json* + * User account authentication: \ + `gcloud auth application-default login` +1. Run the bash helper script to begin the JMeter test + * `bash run_jmeter_jdbc_sampler.sh` + +### [run_jmeter_http_sampler.sh](run_jmeter_http_sampler.sh) (**Runs bigquery_http_sampler.jmx**) + +1. Replace the bash script placeholders shown below with your own values: + * `-Jproject_id=`*YOUR_PROJECT* +1. Ensure proper authentication is set up + * Service account authentication: \ + `gcloud auth activate-service-account + --key-file=`*/path/to/your/private_key.json* + * User account authentication: \ + `gcloud auth login` +1. Run the bash helper script to begin the JMeter test + * `bash run_jmeter_http_sampler.sh` + +## Inspecting the JMeter Test Plans + +The best method of viewing and understand the JMeter test plans is to open then in JMeter's GUI mode as shown below: +* `./apache-jmeter-5.3/bin/jmeter -t bigquery_jdbc_sampler.jmx` +* `./apache-jmeter-5.3/bin/jmeter -t bigquery_http_sampler.jmx` diff --git a/performance_testing/jmeter/bigquery_http_sampler.jmx b/performance_testing/jmeter/bigquery_http_sampler.jmx new file mode 100644 index 000000000..31bfca048 --- /dev/null +++ b/performance_testing/jmeter/bigquery_http_sampler.jmx @@ -0,0 +1,338 @@ + + + + + + false + false + + + + + + + + + + Authorization + Bearer ${__P(token)} + + + Content-Type + application/json + + + + + + continue + + false + ${__P(num_loops,0)} + + ${__P(simple_num_users,1)} + ${__P(ramp_time)} + 1504864705000 + 1504864705000 + true + ${__P(thread_duration,3600)} + + true + + + + true + + + + false + { + "kind": "bigquery#QueryRequest", + "useQueryCache": false, + "useLegacySql": false, + "timeoutMs":21600000, + "query": "${simple_query}", + "labels": {"jmeter_id": "${simple_id}", "run_id": 
"${__P(run_id)}"}, + "maxResults": 1 +} + + + = + + + + bigquery.googleapis.com + + https + + /bigquery/v2/projects/${__P(project_id)}/queries + POST + true + false + true + false + + + 21600000 + + + + true + + saveConfig + + + true + true + true + + true + true + true + true + false + true + true + false + false + false + true + false + false + false + true + 0 + true + true + true + true + true + true + + + ${__P(error_csv_path)} + + + + \t + + ${__P(simple_csv_path)} + false + false + true + shareMode.all + false + + + + + + + continue + + false + ${__P(num_loops,0)} + + ${__P(medium_num_users,1)} + ${__P(ramp_time)} + 1504864705000 + 1504864705000 + true + ${__P(thread_duration,3600)} + + true + + + + true + + + + false + { + "kind": "bigquery#QueryRequest", + "useQueryCache": false, + "useLegacySql": false, + "timeoutMs":21600000, + "query": "${medium_query}", + "labels": {"jmeter_id": "${medium_id}", "run_id": "${__P(run_id)}"}, + "maxResults": 1 +} + + + = + + + + bigquery.googleapis.com + + https + + /bigquery/v2/projects/${__P(project_id)}/queries + POST + true + false + true + false + + + 21600000 + + + + true + + saveConfig + + + true + true + true + + true + true + true + true + false + true + true + false + false + false + true + false + false + false + true + 0 + true + true + true + true + true + true + + + ${__P(error_csv_path)} + + + + \t + + ${__P(medium_csv_path)} + false + false + true + shareMode.all + false + + + + + + + continue + + false + ${__P(num_loops,0)} + + ${__P(complex_num_users,1)} + ${__P(ramp_time)} + 1504864705000 + 1504864705000 + true + ${__P(thread_duration,3600)} + + true + + + + true + + + + false + { + "kind": "bigquery#QueryRequest", + "useQueryCache": false, + "useLegacySql": false, + "timeoutMs":21600000, + "query": "${complex_query}", + "labels": {"jmeter_id": "${complex_id}", "run_id": "${__P(run_id)}"}, + "maxResults": 1 +} + + + = + + + + bigquery.googleapis.com + + https + + 
/bigquery/v2/projects/${__P(project_id)}/queries + POST + true + false + true + false + + + 21600000 + + + + true + + saveConfig + + + true + true + true + + true + true + true + true + false + true + true + false + false + false + true + false + false + false + true + 0 + true + true + true + true + true + true + + + ${__P(error_csv_path)} + + + + \t + + ${__P(complex_csv_path)} + false + false + true + shareMode.all + false + + + + + + + + diff --git a/performance_testing/jmeter/bigquery_jdbc_sampler.jmx b/performance_testing/jmeter/bigquery_jdbc_sampler.jmx new file mode 100644 index 000000000..0b0e9887b --- /dev/null +++ b/performance_testing/jmeter/bigquery_jdbc_sampler.jmx @@ -0,0 +1,269 @@ + + + + + + false + false + + + + + + + + true + + 5000 + + bq_pool + jdbc:bigquery://https://www.googleapis.com/bigquery/v2:443;OAuthType=3;ProjectId=${__P(project_id)};Timeout=3600;useQueryCache=0;MaxResults=1; + com.simba.googlebigquery.jdbc42.Driver + + true + + 0 + false + 10000 + DEFAULT + 60000 + + + + + continue + + false + ${__P(num_loops,0)} + + ${__P(simple_num_users,1)} + ${__P(ramp_time)} + true + ${__P(thread_duration,3600)} + + true + + + + bq_pool + /*${__P(run_id)},${simple_id}*/ ${simple_query} + + + -1 + Select Statement + Store as String + 0 + + + + + + ${__P(simple_csv_path)} + + + false + \t + false + true + false + shareMode.all + + + + true + + saveConfig + + + true + true + true + + true + true + true + true + false + true + true + false + false + false + true + false + false + false + true + 0 + true + true + true + true + true + true + + + ${__P(error_csv_path)} + + + + + + continue + + false + ${__P(num_loops,0)} + + ${__P(medium_num_users,1)} + ${__P(ramp_time)} + true + ${__P(thread_duration,3600)} + + true + + + + bq_pool + /*${__P(run_id)},${medium_id}*/ ${medium_query} + + + -1 + Select Statement + Store as String + 0 + + + + + + ${__P(medium_csv_path)} + + + false + \t + false + true + false + shareMode.group + + + + true + + saveConfig + 
+ + true + true + true + + true + true + true + true + false + true + true + false + false + false + true + false + false + false + true + 0 + true + true + true + true + true + true + + + ${__P(error_csv_path)} + + + + + + continue + + false + ${__P(num_loops,0)} + + ${__P(complex_num_users,1)} + ${__P(ramp_time)} + true + ${__P(thread_duration,3600)} + + true + + + + bq_pool + /*${__P(run_id)},${complex_id}*/ ${complex_query} + + + -1 + Select Statement + Store as String + 0 + + + + + + ${__P(complex_csv_path)} + + + false + \t + false + true + false + shareMode.group + + + + true + + saveConfig + + + true + true + true + + true + true + true + true + false + true + true + false + false + false + true + false + false + false + true + 0 + true + true + true + true + true + true + + + ${__P(error_csv_path)} + + + + + + + diff --git a/performance_testing/jmeter/cancel_running_jobs.py b/performance_testing/jmeter/cancel_running_jobs.py new file mode 100644 index 000000000..e645fd642 --- /dev/null +++ b/performance_testing/jmeter/cancel_running_jobs.py @@ -0,0 +1,38 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from argparse import ArgumentParser +from google.cloud import bigquery + + +def cancel_jobs(client): + for job in client.list_jobs(all_users=True, state_filter="RUNNING"): + client.cancel_job(job.job_id, location='us') + + +def get_cmd_line_args(): + parser = ArgumentParser() + parser.add_argument( + '--project_id', + help='Project in which all running BigQuery jobs will be cancelled.') + return parser.parse_args() + + +def main(): + args = get_cmd_line_args() + cancel_jobs(bigquery.Client(project=args.project_id)) + + +if __name__ == '__main__': + main() diff --git a/performance_testing/jmeter/http_sampler_results.sql b/performance_testing/jmeter/http_sampler_results.sql new file mode 100644 index 000000000..fd228272f --- /dev/null +++ b/performance_testing/jmeter/http_sampler_results.sql @@ -0,0 +1,32 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +SELECT + SPLIT(labels[OFFSET(1)].value, '_')[OFFSET(0)] AS complexity, + COUNT(1) +FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT +WHERE + DATE(creation_time) = CURRENT_DATE() -- Partitioning column + AND project_id = 'YOUR_PROJECT' -- Clustering column + AND ARRAY_LENGTH(labels) > 0 + AND EXISTS ( + SELECT * + FROM UNNEST(labels) AS labels + WHERE + labels.key = 'run_id' + AND labels.value = 'jmeter_http_test' + ) +GROUP BY 1 diff --git a/performance_testing/jmeter/jdbc_sampler_results.sql b/performance_testing/jmeter/jdbc_sampler_results.sql new file mode 100644 index 000000000..5889f7b66 --- /dev/null +++ b/performance_testing/jmeter/jdbc_sampler_results.sql @@ -0,0 +1,27 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +SELECT +-- SPLIT(TRIM(SPLIT(query, '*/')[OFFSET(0)],'/*'))[OFFSET(1)] AS query_id, + SPLIT(SPLIT(TRIM(SPLIT(query, '*/')[OFFSET(0)],'/*'))[OFFSET(1)], '_')[OFFSET(0)] AS complexity, + COUNT(1) +FROM + `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT +WHERE + DATE(creation_time) = CURRENT_DATE() -- Partitioning column + AND project_id = 'YOUR_PROJECT' -- Clustering column + AND SPLIT(TRIM(SPLIT(query, '*/')[OFFSET(0)],'/*'))[OFFSET(0)] = 'jmeter_jdbc_test' +GROUP BY 1 diff --git a/performance_testing/jmeter/run_jmeter_http_sampler.sh b/performance_testing/jmeter/run_jmeter_http_sampler.sh new file mode 100755 index 000000000..30368255c --- /dev/null +++ b/performance_testing/jmeter/run_jmeter_http_sampler.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +######################################################################### +# Make sure you run the following gcloud auth command +# if you're not using a service account to authenticate: +# +# gcloud auth login +# +# If you are using a service account, run the following gcloud auth command +# after specifying the path to your service account private key. 
+# +# gcloud auth activate-service-account --key-file=/path/to/your/private_key.json +# +######################################################################### + +apache-jmeter-5.3/bin/jmeter -n \ +-t bigquery_http_sampler.jmx \ +-Jproject_id=YOUR_PROJECT \ +-Jtoken=$(gcloud auth print-access-token) \ +-Jsimple_csv_path=test_queries/simple_selects.csv \ +-Jmedium_csv_path=test_queries/medium_selects.csv \ +-Jcomplex_csv_path=test_queries/complex_selects.csv \ +-Jerror_csv_path=errors.csv \ +-Jsimple_num_users=6 \ +-Jmedium_num_users=3 \ +-Jcomplex_num_users=1 \ +-Jnum_loops=-1 \ +-Jrun_id=jmeter_http_test \ +-Jthread_duration=10 \ +-Jramp_time=0; diff --git a/performance_testing/jmeter/run_jmeter_jdbc_sampler.sh b/performance_testing/jmeter/run_jmeter_jdbc_sampler.sh new file mode 100755 index 000000000..95763442c --- /dev/null +++ b/performance_testing/jmeter/run_jmeter_jdbc_sampler.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +######################################################################### +# Make sure you run the following gcloud auth command +# if you're not using a service account to authenticate: +# +# gcloud auth application-default login +# +# If you are using a service account, uncomment the export command below +# and specify the path to your service account private key. 
+# +# export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your/private_key.json +# +######################################################################### + +apache-jmeter-5.3/bin/jmeter -n \ +-t bigquery_jdbc_sampler.jmx \ +-Jproject_id=YOUR_PROJECT \ +-Juser.classpath=/path/to/your/SimbaJDBCDriverforGoogleBigQuery \ +-Jsimple_csv_path=test_queries/simple_selects.csv \ +-Jmedium_csv_path=test_queries/medium_selects.csv \ +-Jcomplex_csv_path=test_queries/complex_selects.csv \ +-Jerror_csv_path=errors.csv \ +-Jsimple_num_users=6 \ +-Jmedium_num_users=3 \ +-Jcomplex_num_users=1 \ +-Jnum_loops=-1 \ +-Jrun_id=jmeter_jdbc_test \ +-Jthread_duration=10 \ +-Jramp_time=0; diff --git a/performance_testing/jmeter/test_queries/complex_selects.csv b/performance_testing/jmeter/test_queries/complex_selects.csv new file mode 100644 index 000000000..0f5b3f042 --- /dev/null +++ b/performance_testing/jmeter/test_queries/complex_selects.csv @@ -0,0 +1,101 @@ +complex_id complex_query +complex_00 SELECT 'some complex query 00'; +complex_01 SELECT 'some complex query 01'; +complex_02 SELECT 'some complex query 02'; +complex_03 SELECT 'some complex query 03'; +complex_04 SELECT 'some complex query 04'; +complex_05 SELECT 'some complex query 05'; +complex_06 SELECT 'some complex query 06'; +complex_07 SELECT 'some complex query 07'; +complex_08 SELECT 'some complex query 08'; +complex_09 SELECT 'some complex query 09'; +complex_10 SELECT 'some complex query 10'; +complex_11 SELECT 'some complex query 11'; +complex_12 SELECT 'some complex query 12'; +complex_13 SELECT 'some complex query 13'; +complex_14 SELECT 'some complex query 14'; +complex_15 SELECT 'some complex query 15'; +complex_16 SELECT 'some complex query 16'; +complex_17 SELECT 'some complex query 17'; +complex_18 SELECT 'some complex query 18'; +complex_19 SELECT 'some complex query 19'; +complex_20 SELECT 'some complex query 20'; +complex_21 SELECT 'some complex query 21'; +complex_22 SELECT 'some complex query 22'; 
+complex_23 SELECT 'some complex query 23'; +complex_24 SELECT 'some complex query 24'; +complex_25 SELECT 'some complex query 25'; +complex_26 SELECT 'some complex query 26'; +complex_27 SELECT 'some complex query 27'; +complex_28 SELECT 'some complex query 28'; +complex_29 SELECT 'some complex query 29'; +complex_30 SELECT 'some complex query 30'; +complex_31 SELECT 'some complex query 31'; +complex_32 SELECT 'some complex query 32'; +complex_33 SELECT 'some complex query 33'; +complex_34 SELECT 'some complex query 34'; +complex_35 SELECT 'some complex query 35'; +complex_36 SELECT 'some complex query 36'; +complex_37 SELECT 'some complex query 37'; +complex_38 SELECT 'some complex query 38'; +complex_39 SELECT 'some complex query 39'; +complex_40 SELECT 'some complex query 40'; +complex_41 SELECT 'some complex query 41'; +complex_42 SELECT 'some complex query 42'; +complex_43 SELECT 'some complex query 43'; +complex_44 SELECT 'some complex query 44'; +complex_45 SELECT 'some complex query 45'; +complex_46 SELECT 'some complex query 46'; +complex_47 SELECT 'some complex query 47'; +complex_48 SELECT 'some complex query 48'; +complex_49 SELECT 'some complex query 49'; +complex_50 SELECT 'some complex query 50'; +complex_51 SELECT 'some complex query 51'; +complex_52 SELECT 'some complex query 52'; +complex_53 SELECT 'some complex query 53'; +complex_54 SELECT 'some complex query 54'; +complex_55 SELECT 'some complex query 55'; +complex_56 SELECT 'some complex query 56'; +complex_57 SELECT 'some complex query 57'; +complex_58 SELECT 'some complex query 58'; +complex_59 SELECT 'some complex query 59'; +complex_60 SELECT 'some complex query 60'; +complex_61 SELECT 'some complex query 61'; +complex_62 SELECT 'some complex query 62'; +complex_63 SELECT 'some complex query 63'; +complex_64 SELECT 'some complex query 64'; +complex_65 SELECT 'some complex query 65'; +complex_66 SELECT 'some complex query 66'; +complex_67 SELECT 'some complex query 67'; +complex_68 SELECT 
'some complex query 68'; +complex_69 SELECT 'some complex query 69'; +complex_70 SELECT 'some complex query 70'; +complex_71 SELECT 'some complex query 71'; +complex_72 SELECT 'some complex query 72'; +complex_73 SELECT 'some complex query 73'; +complex_74 SELECT 'some complex query 74'; +complex_75 SELECT 'some complex query 75'; +complex_76 SELECT 'some complex query 76'; +complex_77 SELECT 'some complex query 77'; +complex_78 SELECT 'some complex query 78'; +complex_79 SELECT 'some complex query 79'; +complex_80 SELECT 'some complex query 80'; +complex_81 SELECT 'some complex query 81'; +complex_82 SELECT 'some complex query 82'; +complex_83 SELECT 'some complex query 83'; +complex_84 SELECT 'some complex query 84'; +complex_85 SELECT 'some complex query 85'; +complex_86 SELECT 'some complex query 86'; +complex_87 SELECT 'some complex query 87'; +complex_88 SELECT 'some complex query 88'; +complex_89 SELECT 'some complex query 89'; +complex_90 SELECT 'some complex query 90'; +complex_91 SELECT 'some complex query 91'; +complex_92 SELECT 'some complex query 92'; +complex_93 SELECT 'some complex query 93'; +complex_94 SELECT 'some complex query 94'; +complex_95 SELECT 'some complex query 95'; +complex_96 SELECT 'some complex query 96'; +complex_97 SELECT 'some complex query 97'; +complex_98 SELECT 'some complex query 98'; +complex_99 SELECT 'some complex query 99'; diff --git a/performance_testing/jmeter/test_queries/medium_selects.csv b/performance_testing/jmeter/test_queries/medium_selects.csv new file mode 100644 index 000000000..59cdabe62 --- /dev/null +++ b/performance_testing/jmeter/test_queries/medium_selects.csv @@ -0,0 +1,101 @@ +medium_id medium_query +medium_00 SELECT 'some medium complexity query 00'; +medium_01 SELECT 'some medium complexity query 01'; +medium_02 SELECT 'some medium complexity query 02'; +medium_03 SELECT 'some medium complexity query 03'; +medium_04 SELECT 'some medium complexity query 04'; +medium_05 SELECT 'some medium complexity 
query 05'; +medium_06 SELECT 'some medium complexity query 06'; +medium_07 SELECT 'some medium complexity query 07'; +medium_08 SELECT 'some medium complexity query 08'; +medium_09 SELECT 'some medium complexity query 09'; +medium_10 SELECT 'some medium complexity query 10'; +medium_11 SELECT 'some medium complexity query 11'; +medium_12 SELECT 'some medium complexity query 12'; +medium_13 SELECT 'some medium complexity query 13'; +medium_14 SELECT 'some medium complexity query 14'; +medium_15 SELECT 'some medium complexity query 15'; +medium_16 SELECT 'some medium complexity query 16'; +medium_17 SELECT 'some medium complexity query 17'; +medium_18 SELECT 'some medium complexity query 18'; +medium_19 SELECT 'some medium complexity query 19'; +medium_20 SELECT 'some medium complexity query 20'; +medium_21 SELECT 'some medium complexity query 21'; +medium_22 SELECT 'some medium complexity query 22'; +medium_23 SELECT 'some medium complexity query 23'; +medium_24 SELECT 'some medium complexity query 24'; +medium_25 SELECT 'some medium complexity query 25'; +medium_26 SELECT 'some medium complexity query 26'; +medium_27 SELECT 'some medium complexity query 27'; +medium_28 SELECT 'some medium complexity query 28'; +medium_29 SELECT 'some medium complexity query 29'; +medium_30 SELECT 'some medium complexity query 30'; +medium_31 SELECT 'some medium complexity query 31'; +medium_32 SELECT 'some medium complexity query 32'; +medium_33 SELECT 'some medium complexity query 33'; +medium_34 SELECT 'some medium complexity query 34'; +medium_35 SELECT 'some medium complexity query 35'; +medium_36 SELECT 'some medium complexity query 36'; +medium_37 SELECT 'some medium complexity query 37'; +medium_38 SELECT 'some medium complexity query 38'; +medium_39 SELECT 'some medium complexity query 39'; +medium_40 SELECT 'some medium complexity query 40'; +medium_41 SELECT 'some medium complexity query 41'; +medium_42 SELECT 'some medium complexity query 42'; +medium_43 SELECT 'some 
medium complexity query 43'; +medium_44 SELECT 'some medium complexity query 44'; +medium_45 SELECT 'some medium complexity query 45'; +medium_46 SELECT 'some medium complexity query 46'; +medium_47 SELECT 'some medium complexity query 47'; +medium_48 SELECT 'some medium complexity query 48'; +medium_49 SELECT 'some medium complexity query 49'; +medium_50 SELECT 'some medium complexity query 50'; +medium_51 SELECT 'some medium complexity query 51'; +medium_52 SELECT 'some medium complexity query 52'; +medium_53 SELECT 'some medium complexity query 53'; +medium_54 SELECT 'some medium complexity query 54'; +medium_55 SELECT 'some medium complexity query 55'; +medium_56 SELECT 'some medium complexity query 56'; +medium_57 SELECT 'some medium complexity query 57'; +medium_58 SELECT 'some medium complexity query 58'; +medium_59 SELECT 'some medium complexity query 59'; +medium_60 SELECT 'some medium complexity query 60'; +medium_61 SELECT 'some medium complexity query 61'; +medium_62 SELECT 'some medium complexity query 62'; +medium_63 SELECT 'some medium complexity query 63'; +medium_64 SELECT 'some medium complexity query 64'; +medium_65 SELECT 'some medium complexity query 65'; +medium_66 SELECT 'some medium complexity query 66'; +medium_67 SELECT 'some medium complexity query 67'; +medium_68 SELECT 'some medium complexity query 68'; +medium_69 SELECT 'some medium complexity query 69'; +medium_70 SELECT 'some medium complexity query 70'; +medium_71 SELECT 'some medium complexity query 71'; +medium_72 SELECT 'some medium complexity query 72'; +medium_73 SELECT 'some medium complexity query 73'; +medium_74 SELECT 'some medium complexity query 74'; +medium_75 SELECT 'some medium complexity query 75'; +medium_76 SELECT 'some medium complexity query 76'; +medium_77 SELECT 'some medium complexity query 77'; +medium_78 SELECT 'some medium complexity query 78'; +medium_79 SELECT 'some medium complexity query 79'; +medium_80 SELECT 'some medium complexity query 80'; 
+medium_81 SELECT 'some medium complexity query 81'; +medium_82 SELECT 'some medium complexity query 82'; +medium_83 SELECT 'some medium complexity query 83'; +medium_84 SELECT 'some medium complexity query 84'; +medium_85 SELECT 'some medium complexity query 85'; +medium_86 SELECT 'some medium complexity query 86'; +medium_87 SELECT 'some medium complexity query 87'; +medium_88 SELECT 'some medium complexity query 88'; +medium_89 SELECT 'some medium complexity query 89'; +medium_90 SELECT 'some medium complexity query 90'; +medium_91 SELECT 'some medium complexity query 91'; +medium_92 SELECT 'some medium complexity query 92'; +medium_93 SELECT 'some medium complexity query 93'; +medium_94 SELECT 'some medium complexity query 94'; +medium_95 SELECT 'some medium complexity query 95'; +medium_96 SELECT 'some medium complexity query 96'; +medium_97 SELECT 'some medium complexity query 97'; +medium_98 SELECT 'some medium complexity query 98'; +medium_99 SELECT 'some medium complexity query 99'; diff --git a/performance_testing/jmeter/test_queries/simple_selects.csv b/performance_testing/jmeter/test_queries/simple_selects.csv new file mode 100644 index 000000000..11eabe283 --- /dev/null +++ b/performance_testing/jmeter/test_queries/simple_selects.csv @@ -0,0 +1,101 @@ +simple_id simple_query +simple_00 SELECT 'some simple query 00'; +simple_01 SELECT 'some simple query 01'; +simple_02 SELECT 'some simple query 02'; +simple_03 SELECT 'some simple query 03'; +simple_04 SELECT 'some simple query 04'; +simple_05 SELECT 'some simple query 05'; +simple_06 SELECT 'some simple query 06'; +simple_07 SELECT 'some simple query 07'; +simple_08 SELECT 'some simple query 08'; +simple_09 SELECT 'some simple query 09'; +simple_10 SELECT 'some simple query 10'; +simple_11 SELECT 'some simple query 11'; +simple_12 SELECT 'some simple query 12'; +simple_13 SELECT 'some simple query 13'; +simple_14 SELECT 'some simple query 14'; +simple_15 SELECT 'some simple query 15'; +simple_16 SELECT 
'some simple query 16'; +simple_17 SELECT 'some simple query 17'; +simple_18 SELECT 'some simple query 18'; +simple_19 SELECT 'some simple query 19'; +simple_20 SELECT 'some simple query 20'; +simple_21 SELECT 'some simple query 21'; +simple_22 SELECT 'some simple query 22'; +simple_23 SELECT 'some simple query 23'; +simple_24 SELECT 'some simple query 24'; +simple_25 SELECT 'some simple query 25'; +simple_26 SELECT 'some simple query 26'; +simple_27 SELECT 'some simple query 27'; +simple_28 SELECT 'some simple query 28'; +simple_29 SELECT 'some simple query 29'; +simple_30 SELECT 'some simple query 30'; +simple_31 SELECT 'some simple query 31'; +simple_32 SELECT 'some simple query 32'; +simple_33 SELECT 'some simple query 33'; +simple_34 SELECT 'some simple query 34'; +simple_35 SELECT 'some simple query 35'; +simple_36 SELECT 'some simple query 36'; +simple_37 SELECT 'some simple query 37'; +simple_38 SELECT 'some simple query 38'; +simple_39 SELECT 'some simple query 39'; +simple_40 SELECT 'some simple query 40'; +simple_41 SELECT 'some simple query 41'; +simple_42 SELECT 'some simple query 42'; +simple_43 SELECT 'some simple query 43'; +simple_44 SELECT 'some simple query 44'; +simple_45 SELECT 'some simple query 45'; +simple_46 SELECT 'some simple query 46'; +simple_47 SELECT 'some simple query 47'; +simple_48 SELECT 'some simple query 48'; +simple_49 SELECT 'some simple query 49'; +simple_50 SELECT 'some simple query 50'; +simple_51 SELECT 'some simple query 51'; +simple_52 SELECT 'some simple query 52'; +simple_53 SELECT 'some simple query 53'; +simple_54 SELECT 'some simple query 54'; +simple_55 SELECT 'some simple query 55'; +simple_56 SELECT 'some simple query 56'; +simple_57 SELECT 'some simple query 57'; +simple_58 SELECT 'some simple query 58'; +simple_59 SELECT 'some simple query 59'; +simple_60 SELECT 'some simple query 60'; +simple_61 SELECT 'some simple query 61'; +simple_62 SELECT 'some simple query 62'; +simple_63 SELECT 'some simple query 63'; 
+simple_64 SELECT 'some simple query 64'; +simple_65 SELECT 'some simple query 65'; +simple_66 SELECT 'some simple query 66'; +simple_67 SELECT 'some simple query 67'; +simple_68 SELECT 'some simple query 68'; +simple_69 SELECT 'some simple query 69'; +simple_70 SELECT 'some simple query 70'; +simple_71 SELECT 'some simple query 71'; +simple_72 SELECT 'some simple query 72'; +simple_73 SELECT 'some simple query 73'; +simple_74 SELECT 'some simple query 74'; +simple_75 SELECT 'some simple query 75'; +simple_76 SELECT 'some simple query 76'; +simple_77 SELECT 'some simple query 77'; +simple_78 SELECT 'some simple query 78'; +simple_79 SELECT 'some simple query 79'; +simple_80 SELECT 'some simple query 80'; +simple_81 SELECT 'some simple query 81'; +simple_82 SELECT 'some simple query 82'; +simple_83 SELECT 'some simple query 83'; +simple_84 SELECT 'some simple query 84'; +simple_85 SELECT 'some simple query 85'; +simple_86 SELECT 'some simple query 86'; +simple_87 SELECT 'some simple query 87'; +simple_88 SELECT 'some simple query 88'; +simple_89 SELECT 'some simple query 89'; +simple_90 SELECT 'some simple query 90'; +simple_91 SELECT 'some simple query 91'; +simple_92 SELECT 'some simple query 92'; +simple_93 SELECT 'some simple query 93'; +simple_94 SELECT 'some simple query 94'; +simple_95 SELECT 'some simple query 95'; +simple_96 SELECT 'some simple query 96'; +simple_97 SELECT 'some simple query 97'; +simple_98 SELECT 'some simple query 98'; +simple_99 SELECT 'some simple query 99'; From d951ebfbc5a0336c861038f5247590f3936186a6 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 2 Dec 2020 14:14:33 -0800 Subject: [PATCH 10/90] feat: bq project env-var Add an environment variable to support overriding the default project for the BigQuery Client. By default this will be the project in which the cloud function is deployed. 
--- tools/cloud_functions/gcs_event_based_ingest/README.md | 6 ++++-- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md | 1 + .../gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/README.md index 9fda82d39..c976c21c7 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -37,8 +37,10 @@ better fit your naming convention on GCS. Your regex must include [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for destination `dataset`, and `table`. Note, that `dataset` can optionally, explicitly specify destination project -(i.e. `gs://${BUCKET}/project_id.dataset_id/table/....`) otherwise the default -project will be inferred from Application Default Credential (the project in +(i.e. `gs://${BUCKET}/project_id.dataset_id/table/....`) alternatively, +one can set the `BQ_PROJECT` environment variable to override the +default target project for datasets at the function level. The default behavior is to +infer the project from Application Default Credential (the project in which the Cloud Function is running, or the ADC configured in Google Cloud SDK if invoked locally). This is useful in scenarios where a single deployment of the Cloud Function is responsible for ingesting data into BigQuery tables in diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md index c86dceea4..5e30a1c4b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md @@ -31,6 +31,7 @@ following default behavior. 
| `DESTINATION_REGEX` | A [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for `dataset`, `table`, (optional: `partition` or `yyyy`, `mm`, `dd`, `hh`, `batch`) | `MAX_BATCH_BYTES` | Max bytes for BigQuery Load job | `15000000000000` ([15 TB](https://cloud.google.com/bigquery/quotas#load_jobs)| | `JOB_PREFIX` | Prefix for BigQuery Job IDs | `gcf-ingest-` | +| `BQ_PROJECT` | Default BQ project to use if not specified in dataset capturing group | Project where Cloud Function is deployed | ## Implementation notes diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index d05b771db..aa27422f9 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -112,7 +112,7 @@ def main(event: Dict, context): # pylint: disable=unused-argument prefix_to_load = removesuffix(object_id, SUCCESS_FILENAME) gsurl = f"gs://{bucket_id}/{prefix_to_load}" gcs_client = storage.Client(client_info=CLIENT_INFO) - project = gcs_client.project + project = os.getenv("BQ_PROJECT", gcs_client.project) bkt = cached_get_bucket(gcs_client, bucket_id) success_blob: storage.Blob = bkt.blob(object_id) handle_duplicate_notification(bkt, success_blob, gsurl) From cf23f1ba4e152c6a410a05153fc28002426157c7 Mon Sep 17 00:00:00 2001 From: Ryan den Otter Date: Fri, 4 Dec 2020 14:09:59 -0700 Subject: [PATCH 11/90] Move utility methods into a utils module Change the tests to use new utils module. 
--- .../gcs_ocn_bq_ingest/main.py | 462 +---------------- .../gcs_ocn_bq_ingest/utils.py | 477 ++++++++++++++++++ .../gcs_event_based_ingest/tests/conftest.py | 4 +- .../test_gcs_ocn_bq_ingest.py | 5 +- 4 files changed, 488 insertions(+), 460 deletions(-) create mode 100644 tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 6025122d1..8f4ff3d64 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -16,46 +16,19 @@ # limitations under the License. """Background Cloud Function for loading data from GCS to BigQuery. """ -import collections -import collections.abc -import copy -import json import os -import pathlib import re -import time -from typing import Any, Deque, Dict, List, Optional, Tuple +from typing import Dict -import cachetools import google.api_core.client_info -import google.api_core.exceptions import google.cloud.exceptions from google.cloud import bigquery, storage -# https://cloud.google.com/bigquery/quotas#load_jobs -# 15TB per BQ load job (soft limit). -DEFAULT_MAX_BATCH_BYTES = str(15 * 10**12) -# 10,000 GCS URIs per BQ load job. -MAX_SOURCE_URIS_PER_LOAD = 10**4 - -DEFAULT_EXTERNAL_TABLE_DEFINITION = { - # The default must be a self describing data format - # because autodetecting CSV /JSON schemas is likely to not match - # expectations / assumptions of the transformation query. 
- "sourceFormat": "PARQUET", -} - -DEFAULT_JOB_LABELS = { - "component": "event-based-gcs-ingest", - "cloud-function-name": os.getenv("FUNCTION_NAME"), -} - -BASE_LOAD_JOB_CONFIG = { - "sourceFormat": "CSV", - "fieldDelimiter": ",", - "writeDisposition": "WRITE_APPEND", - "labels": DEFAULT_JOB_LABELS, -} +from .utils import (parse_notification, SUCCESS_FILENAME, removesuffix, + cached_get_bucket, handle_duplicate_notification, + DEFAULT_JOB_LABELS, read_gcs_file_if_exists, + look_for_config_in_parents, external_query, + create_job_id_prefix, load_batches) # yapf: disable DEFAULT_DESTINATION_REGEX = ( @@ -70,28 +43,9 @@ ) # yapf: enable -# Will wait up to this polling for errors before exiting -# This is to check if job fail quickly, not to assert it succeed. -# This may not be honored if longer than cloud function timeout. -# https://cloud.google.com/functions/docs/concepts/exec#timeout -# One might consider lowering this to 1-2 seconds to lower the -# upper bound of expected execution time to stay within the free tier. -# https://cloud.google.com/functions/pricing#free_tier -WAIT_FOR_JOB_SECONDS = int(os.getenv("WAIT_FOR_JOB_SECONDS", "5")) - -# Use caution when lowering the job polling rate. -# Keep in mind that many concurrent executions of this cloud function should not -# violate the 300 concurrent requests or 100 request per second. 
-# https://cloud.google.com/bigquery/quotas#all_api_requests -JOB_POLL_INTERVAL_SECONDS = 1 - -SUCCESS_FILENAME = os.getenv("SUCCESS_FILENAME", "_SUCCESS") - CLIENT_INFO = google.api_core.client_info.ClientInfo( user_agent="google-pso-tool/bq-severless-loader") -DEFAULT_JOB_PREFIX = "gcf-ingest-" - def main(event: Dict, context): # pylint: disable=unused-argument """entry point for background cloud function for event driven GCS to @@ -177,407 +131,3 @@ def main(event: Dict, context): # pylint: disable=unused-argument print("LOAD_JOB") load_batches(gcs_client, bq_client, gsurl, dest_table_ref, create_job_id_prefix(dest_table_ref, batch_id)) - - -def create_job_id_prefix(dest_table_ref: bigquery.TableReference, - batch_id: Optional[str]): - """Create job id prefix with a consistent naming convention. - The naming conventions is as follows: - gcf-ingest----- - Parts that are not inferrable from the GCS path with have a 'None' - placeholder. This naming convention is crucial for monitoring the system. 
- Note, gcf-ingest- can be overridden with environment variable JOB_PREFIX - - Examples: - - Non-partitioned Non batched tables: - - gs://${BUCKET}/tpch/lineitem/_SUCCESS - - gcf-ingest-tpch-lineitem-None-None- - Non-partitioned batched tables: - - gs://${BUCKET}/tpch/lineitem/batch000/_SUCCESS - - gcf-ingest-tpch-lineitem-None-batch000- - Partitioned Batched tables: - - gs://${BUCKET}/tpch/lineitem/$20201031/batch000/_SUCCESS - - gcf-ingest-tpch-lineitem-20201031-batch000- - """ - table_partition = dest_table_ref.table_id.split("$") - if len(table_partition) < 2: - # If there is no partition put a None placeholder - table_partition.append("None") - return f"{os.getenv('JOB_PREFIX', DEFAULT_JOB_PREFIX)}" \ - f"{dest_table_ref.dataset_id}-" \ - f"{'-'.join(table_partition)}-" \ - f"{batch_id}-" - - -def external_query( # pylint: disable=too-many-arguments - gcs_client: storage.Client, bq_client: bigquery.Client, gsurl: str, - query: str, dest_table_ref: bigquery.TableReference, - job_id_prefix: str): - """Load from query over external table from GCS. - - This hinges on a SQL query defined in GCS at _config/bq_transform.sql and - an external table definition _config/external.json (otherwise will assume - CSV external table) - """ - external_table_config = read_gcs_file_if_exists( - gcs_client, f"{gsurl}_config/external.json") - if not external_table_config: - external_table_config = look_for_config_in_parents( - gcs_client, gsurl, "external.json") - if external_table_config: - external_table_def = json.loads(external_table_config) - else: - print(f"Falling back to default CSV external table." 
- f" {gsurl}_config/external.json not found.") - external_table_def = DEFAULT_EXTERNAL_TABLE_DEFINITION - - external_table_def["sourceUris"] = flatten2dlist( - get_batches_for_prefix(gcs_client, gsurl)) - print(f"external table def = {json.dumps(external_table_config, indent=2)}") - external_config = bigquery.ExternalConfig.from_api_repr(external_table_def) - job_config = bigquery.QueryJobConfig( - table_definitions={"temp_ext": external_config}, use_legacy_sql=False) - - # Note, dest_table might include a partition decorator. - rendered_query = query.format( - dest_dataset=dest_table_ref.dataset_id, - dest_table=dest_table_ref.table_id, - ) - - job: bigquery.QueryJob = bq_client.query( - rendered_query, - job_config=job_config, - job_id_prefix=job_id_prefix, - ) - - print(f"started asynchronous query job: {job.job_id}") - - start_poll_for_errors = time.monotonic() - # Check if job failed quickly - while time.monotonic() - start_poll_for_errors < WAIT_FOR_JOB_SECONDS: - job.reload() - if job.errors: - raise RuntimeError( - f"query job {job.job_id} failed quickly: {job.errors}") - time.sleep(JOB_POLL_INTERVAL_SECONDS) - - -def flatten2dlist(arr: List[List[Any]]) -> List[Any]: - """Flatten list of lists to flat list of elements""" - return [j for i in arr for j in i] - - -def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id_prefix): - """orchestrate 1 or more load jobs based on number of URIs and total byte - size of objects at gsurl""" - batches = get_batches_for_prefix(gcs_client, gsurl) - load_config = construct_load_job_config(gcs_client, gsurl) - load_config.labels = DEFAULT_JOB_LABELS - batch_count = len(batches) - - jobs: List[bigquery.LoadJob] = [] - for batch_num, batch in enumerate(batches): - print(load_config.to_api_repr()) - job: bigquery.LoadJob = bq_client.load_table_from_uri( - batch, - dest_table_ref, - job_config=load_config, - job_id_prefix=f"{job_id_prefix}{batch_num}-of-{batch_count}-", - ) - - print(f"started asyncronous 
bigquery load job with id: {job.job_id} for" - f" {gsurl}") - jobs.append(job) - - start_poll_for_errors = time.monotonic() - # Check if job failed quickly - while time.monotonic() - start_poll_for_errors < WAIT_FOR_JOB_SECONDS: - # Check if job failed quickly - for job in jobs: - job.reload() - if job.errors: - raise RuntimeError( - f"load job {job.job_id} failed quickly: {job.errors}") - time.sleep(JOB_POLL_INTERVAL_SECONDS) - - -def handle_duplicate_notification(bkt: storage.Bucket, - success_blob: storage.Blob, gsurl: str): - """ - Need to handle potential duplicate Pub/Sub notifications. - To achieve this we will drop an empty "claimed" file that indicates - an invocation of this cloud function has picked up the success file - with a certain creation timestamp. This will support republishing the - success file as a mechanism of re-running the ingestion while avoiding - duplicate ingestion due to multiple Pub/Sub messages for a success file - with the same creation time. - """ - success_blob.reload() - success_created_unix_timestamp = success_blob.time_created.timestamp() - - claim_blob: storage.Blob = bkt.blob( - success_blob.name.replace(SUCCESS_FILENAME, - f"_claimed_{success_created_unix_timestamp}")) - try: - claim_blob.upload_from_string("", if_generation_match=0) - except google.api_core.exceptions.PreconditionFailed as err: - raise RuntimeError( - f"The prefix {gsurl} appears to already have been claimed for " - f"{gsurl}{SUCCESS_FILENAME} with created timestamp" - f"{success_created_unix_timestamp}." - "This means that another invocation of this cloud function has" - "claimed the ingestion of this batch." 
- "This may be due to a rare duplicate delivery of the Pub/Sub " - "storage notification.") from err - - -def _get_parent_config_file(storage_client, config_filename, bucket, path): - config_dir_name = "_config" - parent_path = pathlib.Path(path).parent - config_path = parent_path / config_dir_name / config_filename - return read_gcs_file_if_exists(storage_client, - f"gs://{bucket}/{config_path}") - - -def look_for_config_in_parents(storage_client: storage.Client, gsurl: str, - config_filename: str) -> Optional[str]: - """look in parent directories for _config/config_filename""" - blob: storage.Blob = storage.Blob.from_string(gsurl) - bucket_name = blob.bucket.name - obj_path = blob.name - parts = removesuffix(obj_path, "/").split("/") - - def _get_parent_config(path): - return _get_parent_config_file(storage_client, config_filename, - bucket_name, path) - - config = None - while parts: - if config: - return config - config = _get_parent_config("/".join(parts)) - parts.pop() - return config - - -def construct_load_job_config(storage_client: storage.Client, - gsurl: str) -> bigquery.LoadJobConfig: - """ - merge dictionaries for loadjob.json configs in parent directories. - The configs closest to gsurl should take precedence. 
- """ - config_filename = "load.json" - blob: storage.Blob = storage.Blob.from_string(gsurl) - bucket_name = blob.bucket.name - obj_path = blob.name - parts = removesuffix(obj_path, "/").split("/") - - def _get_parent_config(path): - return _get_parent_config_file(storage_client, config_filename, - bucket_name, path) - - config_q: Deque[Dict[str, Any]] = collections.deque() - config_q.append(BASE_LOAD_JOB_CONFIG) - while parts: - config = _get_parent_config("/".join(parts)) - if config: - config_q.append(json.loads(config)) - parts.pop() - - merged_config: Dict = {} - while config_q: - recursive_update(merged_config, config_q.popleft(), in_place=True) - print(f"merged_config: {merged_config}") - return bigquery.LoadJobConfig.from_api_repr({"load": merged_config}) - - -def get_batches_for_prefix(gcs_client: storage.Client, - prefix_path: str, - ignore_subprefix="_config/", - ignore_file=SUCCESS_FILENAME) -> List[List[str]]: - """ - This function creates batches of GCS uris for a given prefix. - This prefix could be a table prefix or a partition prefix inside a - table prefix. - returns an Array of their batches - (one batch has an array of multiple GCS uris) - """ - batches = [] - blob: storage.Blob = storage.Blob.from_string(prefix_path) - bucket_name = blob.bucket.name - prefix_name = blob.name - - prefix_filter = f"{prefix_name}" - bucket = cached_get_bucket(gcs_client, bucket_name) - blobs = list(bucket.list_blobs(prefix=prefix_filter, delimiter="/")) - - cumulative_bytes = 0 - max_batch_size = int(os.getenv("MAX_BATCH_BYTES", DEFAULT_MAX_BATCH_BYTES)) - batch: List[str] = [] - for blob in blobs: - # API returns root prefix also. Which should be ignored. - # Similarly, the _SUCCESS file should be ignored. - # Finally, anything in the _config/ prefix should be ignored. 
- if (blob.name - not in {f"{prefix_name}/", f"{prefix_name}/{ignore_file}"} - or blob.name.startswith(f"{prefix_name}/{ignore_subprefix}")): - if blob.size == 0: # ignore empty files - print(f"ignoring empty file: gs://{bucket}/{blob.name}") - continue - cumulative_bytes += blob.size - - # keep adding until we reach threshold - if cumulative_bytes <= max_batch_size or len( - batch) > MAX_SOURCE_URIS_PER_LOAD: - batch.append(f"gs://{bucket_name}/{blob.name}") - else: - batches.append(batch.copy()) - batch.clear() - batch.append(f"gs://{bucket_name}/{blob.name}") - cumulative_bytes = blob.size - - # pick up remaining files in the final batch - if len(batch) > 0: - batches.append(batch.copy()) - batch.clear() - - if len(batches) > 1: - print(f"split into {len(batches)} load jobs.") - elif len(batches) == 1: - print("using single load job.") - else: - raise RuntimeError("No files to load!") - return batches - - -def parse_notification(notification: dict) -> Tuple[str, str]: - """valdiates notification payload - Args: - notification(dict): Pub/Sub Storage Notification - https://cloud.google.com/storage/docs/pubsub-notifications - Or Cloud Functions direct trigger - https://cloud.google.com/functions/docs/tutorials/storage - with notification schema - https://cloud.google.com/storage/docs/json_api/v1/objects#resource - Returns: - tuple of bucketId and objectId attributes - Raises: - KeyError if the input notification does not contain the expected - attributes. - """ - if notification.get("kind") == "storage#object": - # notification is GCS Object reosource from Cloud Functions trigger - # https://cloud.google.com/storage/docs/json_api/v1/objects#resource - return notification["bucket"], notification["name"] - if notification.get("attributes"): - # notification is Pub/Sub message. 
- try: - attributes = notification["attributes"] - return attributes["bucketId"], attributes["objectId"] - except KeyError: - raise RuntimeError( - "Issue with Pub/Sub message, did not contain expected" - f"attributes: 'bucketId' and 'objectId': {notification}" - ) from KeyError - raise RuntimeError( - "Cloud Function recieved unexpected trigger:\n" - f"{notification}\n" - "This function only supports direct Cloud Functions" - "Background Triggers or Pub/Sub storage notificaitons" - "as described in the following links:\n" - "https://cloud.google.com/storage/docs/pubsub-notifications\n" - "https://cloud.google.com/functions/docs/tutorials/storage") - - -# cache lookups against GCS API for 1 second as buckets / objects have update -# limit of once per second and we might do several of the same lookup during -# the functions lifetime. This should improve performance by eliminating -# unnecessary API calls. The lookups on bucket and objects in this function -# should not be changing during the function's lifetime as this would lead to -# non-deterministic results with or without this cache. 
-# https://cloud.google.com/storage/quotas -@cachetools.cached(cachetools.TTLCache(maxsize=1024, ttl=1)) -def read_gcs_file(gcs_client: storage.Client, gsurl: str) -> str: - """ - Read a GCS object as a string - - Args: - gcs_client: GCS client - gsurl: GCS URI for object to read in gs://bucket/path/to/object format - Returns: - str - """ - blob = storage.Blob.from_string(gsurl) - return blob.download_as_bytes(client=gcs_client).decode('UTF-8') - - -def read_gcs_file_if_exists(gcs_client: storage.Client, - gsurl: str) -> Optional[str]: - """return string of gcs object contents or None if the object does not exist - """ - try: - return read_gcs_file(gcs_client, gsurl) - except google.cloud.exceptions.NotFound: - return None - - -# Cache bucket lookups (see reasoning in comment above) -@cachetools.cached(cachetools.TTLCache(maxsize=1024, ttl=1)) -def cached_get_bucket( - gcs_client: storage.Client, - bucket_id: str, -) -> storage.Bucket: - """get storage.Bucket object by bucket_id string if exists or raise - google.cloud.exceptions.NotFound.""" - return gcs_client.get_bucket(bucket_id) - - -def dict_to_bq_schema(schema: List[Dict]) -> List[bigquery.SchemaField]: - """Converts a list of dicts to list of bigquery.SchemaField for use with - bigquery client library. Dicts must contain name and type keys. - The dict may optionally contain a mode key.""" - default_mode = "NULLABLE" - return [ - bigquery.SchemaField( - x["name"], - x["type"], - mode=x.get("mode") if x.get("mode") else default_mode) - for x in schema - ] - - -# To be added to built in str in python 3.9 -# https://www.python.org/dev/peps/pep-0616/ -def removesuffix(in_str: str, suffix: str) -> str: - """removes suffix from a string.""" - # suffix='' should not call self[:-0]. - if suffix and in_str.endswith(suffix): - return in_str[:-len(suffix)] - return in_str[:] - - -def recursive_update( - original: Dict, - update: Dict, - in_place: bool = False -): - """ - return a recursively updated dictionary. 
- - Note, lists will be completely overwritten by value in update if there is a - conflict. - - original: (dict) the base dictionary - update: (dict) the dictionary of updates to apply on original - in_place: (bool) if true then original will be mutated in place else a new - dictionary as a result of the update will be returned. - """ - out = original if in_place else copy.deepcopy(original) - - for key, value in update.items(): - if isinstance(value, dict): - out[key] = recursive_update(out.get(key, {}), value) - else: - out[key] = value - return out diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py new file mode 100644 index 000000000..e1e8df7f2 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -0,0 +1,477 @@ +# Copyright 2020 Google LLC. +# This software is provided as-is, without warranty or representation +# for any use or purpose. +# Your use of it is subject to your agreement with Google. + +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Contains utility methods used by the BQIngest process +""" +import os +import collections +import collections.abc +import copy +import json +import pathlib +import time +from typing import Any, Deque, Dict, List, Optional, Tuple + +import cachetools +import google.api_core.exceptions +import google.api_core.client_info +import google.cloud.exceptions +from google.cloud import bigquery, storage + +# Will wait up to this polling for errors before exiting +# This is to check if job fail quickly, not to assert it succeed. +# This may not be honored if longer than cloud function timeout. +# https://cloud.google.com/functions/docs/concepts/exec#timeout +# One might consider lowering this to 1-2 seconds to lower the +# upper bound of expected execution time to stay within the free tier. +# https://cloud.google.com/functions/pricing#free_tier +WAIT_FOR_JOB_SECONDS = int(os.getenv("WAIT_FOR_JOB_SECONDS", "5")) + +DEFAULT_EXTERNAL_TABLE_DEFINITION = { + # The default must be a self describing data format + # because autodetecting CSV /JSON schemas is likely to not match + # expectations / assumptions of the transformation query. + "sourceFormat": "PARQUET", +} + +# Use caution when lowering the job polling rate. +# Keep in mind that many concurrent executions of this cloud function should not +# violate the 300 concurrent requests or 100 request per second. +# https://cloud.google.com/bigquery/quotas#all_api_requests +JOB_POLL_INTERVAL_SECONDS = 1 + +DEFAULT_JOB_LABELS = { + "component": "event-based-gcs-ingest", + "cloud-function-name": os.getenv("FUNCTION_NAME"), +} + +BASE_LOAD_JOB_CONFIG = { + "sourceFormat": "CSV", + "fieldDelimiter": ",", + "writeDisposition": "WRITE_APPEND", + "labels": DEFAULT_JOB_LABELS, +} + +# https://cloud.google.com/bigquery/quotas#load_jobs +# 15TB per BQ load job (soft limit). +DEFAULT_MAX_BATCH_BYTES = str(15 * 10**12) + +# 10,000 GCS URIs per BQ load job. 
+MAX_SOURCE_URIS_PER_LOAD = 10**4
+
+SUCCESS_FILENAME = os.getenv("SUCCESS_FILENAME", "_SUCCESS")
+
+DEFAULT_JOB_PREFIX = "gcf-ingest-"
+
+
+def create_job_id_prefix(dest_table_ref: bigquery.TableReference,
+                         batch_id: Optional[str]):
+    """Create job id prefix with a consistent naming convention.
+    The naming convention is as follows:
+    gcf-ingest-<dataset>-<table>-<partition>-<batch>-
+    Parts that are not inferrable from the GCS path will have a 'None'
+    placeholder. This naming convention is crucial for monitoring the system.
+    Note, gcf-ingest- can be overridden with environment variable JOB_PREFIX
+
+    Examples:
+
+    Non-partitioned Non batched tables:
+    - gs://${BUCKET}/tpch/lineitem/_SUCCESS
+    - gcf-ingest-tpch-lineitem-None-None-
+    Non-partitioned batched tables:
+    - gs://${BUCKET}/tpch/lineitem/batch000/_SUCCESS
+    - gcf-ingest-tpch-lineitem-None-batch000-
+    Partitioned Batched tables:
+    - gs://${BUCKET}/tpch/lineitem/$20201031/batch000/_SUCCESS
+    - gcf-ingest-tpch-lineitem-20201031-batch000-
+    """
+    table_partition = dest_table_ref.table_id.split("$")
+    if len(table_partition) < 2:
+        # If there is no partition put a None placeholder
+        table_partition.append("None")
+    return f"{os.getenv('JOB_PREFIX', DEFAULT_JOB_PREFIX)}" \
+           f"{dest_table_ref.dataset_id}-" \
+           f"{'-'.join(table_partition)}-" \
+           f"{batch_id}-"
+
+
+def external_query(  # pylint: disable=too-many-arguments
+        gcs_client: storage.Client, bq_client: bigquery.Client, gsurl: str,
+        query: str, dest_table_ref: bigquery.TableReference,
+        job_id_prefix: str):
+    """Load from query over external table from GCS.
+ + This hinges on a SQL query defined in GCS at _config/bq_transform.sql and + an external table definition _config/external.json (otherwise will assume + CSV external table) + """ + external_table_config = read_gcs_file_if_exists( + gcs_client, f"{gsurl}_config/external.json") + if not external_table_config: + external_table_config = look_for_config_in_parents( + gcs_client, gsurl, "external.json") + if external_table_config: + external_table_def = json.loads(external_table_config) + else: + print(f"Falling back to default CSV external table." + f" {gsurl}_config/external.json not found.") + external_table_def = DEFAULT_EXTERNAL_TABLE_DEFINITION + + external_table_def["sourceUris"] = flatten2dlist( + get_batches_for_prefix(gcs_client, gsurl)) + print(f"external table def = {json.dumps(external_table_config, indent=2)}") + external_config = bigquery.ExternalConfig.from_api_repr(external_table_def) + job_config = bigquery.QueryJobConfig( + table_definitions={"temp_ext": external_config}, use_legacy_sql=False) + + # Note, dest_table might include a partition decorator. 
+ rendered_query = query.format( + dest_dataset=dest_table_ref.dataset_id, + dest_table=dest_table_ref.table_id, + ) + + job: bigquery.QueryJob = bq_client.query( + rendered_query, + job_config=job_config, + job_id_prefix=job_id_prefix, + ) + + print(f"started asynchronous query job: {job.job_id}") + + start_poll_for_errors = time.monotonic() + # Check if job failed quickly + while time.monotonic() - start_poll_for_errors < WAIT_FOR_JOB_SECONDS: + job.reload() + if job.errors: + raise RuntimeError( + f"query job {job.job_id} failed quickly: {job.errors}") + time.sleep(JOB_POLL_INTERVAL_SECONDS) + + +def flatten2dlist(arr: List[List[Any]]) -> List[Any]: + """Flatten list of lists to flat list of elements""" + return [j for i in arr for j in i] + + +def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id_prefix): + """orchestrate 1 or more load jobs based on number of URIs and total byte + size of objects at gsurl""" + batches = get_batches_for_prefix(gcs_client, gsurl) + load_config = construct_load_job_config(gcs_client, gsurl) + load_config.labels = DEFAULT_JOB_LABELS + batch_count = len(batches) + + jobs: List[bigquery.LoadJob] = [] + for batch_num, batch in enumerate(batches): + print(load_config.to_api_repr()) + job: bigquery.LoadJob = bq_client.load_table_from_uri( + batch, + dest_table_ref, + job_config=load_config, + job_id_prefix=f"{job_id_prefix}{batch_num}-of-{batch_count}-", + ) + + print(f"started asyncronous bigquery load job with id: {job.job_id} for" + f" {gsurl}") + jobs.append(job) + + start_poll_for_errors = time.monotonic() + # Check if job failed quickly + while time.monotonic() - start_poll_for_errors < WAIT_FOR_JOB_SECONDS: + # Check if job failed quickly + for job in jobs: + job.reload() + if job.errors: + raise RuntimeError( + f"load job {job.job_id} failed quickly: {job.errors}") + time.sleep(JOB_POLL_INTERVAL_SECONDS) + + +def handle_duplicate_notification(bkt: storage.Bucket, + success_blob: storage.Blob, gsurl: str): + """ 
+ Need to handle potential duplicate Pub/Sub notifications. + To achieve this we will drop an empty "claimed" file that indicates + an invocation of this cloud function has picked up the success file + with a certain creation timestamp. This will support republishing the + success file as a mechanism of re-running the ingestion while avoiding + duplicate ingestion due to multiple Pub/Sub messages for a success file + with the same creation time. + """ + success_blob.reload() + success_created_unix_timestamp = success_blob.time_created.timestamp() + + claim_blob: storage.Blob = bkt.blob( + success_blob.name.replace(SUCCESS_FILENAME, + f"_claimed_{success_created_unix_timestamp}")) + try: + claim_blob.upload_from_string("", if_generation_match=0) + except google.api_core.exceptions.PreconditionFailed as err: + raise RuntimeError( + f"The prefix {gsurl} appears to already have been claimed for " + f"{gsurl}{SUCCESS_FILENAME} with created timestamp" + f"{success_created_unix_timestamp}." + "This means that another invocation of this cloud function has" + "claimed the ingestion of this batch." 
+ "This may be due to a rare duplicate delivery of the Pub/Sub " + "storage notification.") from err + + +def _get_parent_config_file(storage_client, config_filename, bucket, path): + config_dir_name = "_config" + parent_path = pathlib.Path(path).parent + config_path = parent_path / config_dir_name / config_filename + return read_gcs_file_if_exists(storage_client, + f"gs://{bucket}/{config_path}") + + +def look_for_config_in_parents(storage_client: storage.Client, gsurl: str, + config_filename: str) -> Optional[str]: + """look in parent directories for _config/config_filename""" + blob: storage.Blob = storage.Blob.from_string(gsurl) + bucket_name = blob.bucket.name + obj_path = blob.name + parts = removesuffix(obj_path, "/").split("/") + + def _get_parent_config(path): + return _get_parent_config_file(storage_client, config_filename, + bucket_name, path) + + config = None + while parts: + if config: + return config + config = _get_parent_config("/".join(parts)) + parts.pop() + return config + + +def construct_load_job_config(storage_client: storage.Client, + gsurl: str) -> bigquery.LoadJobConfig: + """ + merge dictionaries for loadjob.json configs in parent directories. + The configs closest to gsurl should take precedence. 
+ """ + config_filename = "load.json" + blob: storage.Blob = storage.Blob.from_string(gsurl) + bucket_name = blob.bucket.name + obj_path = blob.name + parts = removesuffix(obj_path, "/").split("/") + + def _get_parent_config(path): + return _get_parent_config_file(storage_client, config_filename, + bucket_name, path) + + config_q: Deque[Dict[str, Any]] = collections.deque() + config_q.append(BASE_LOAD_JOB_CONFIG) + while parts: + config = _get_parent_config("/".join(parts)) + if config: + config_q.append(json.loads(config)) + parts.pop() + + merged_config: Dict = {} + while config_q: + recursive_update(merged_config, config_q.popleft(), in_place=True) + print(f"merged_config: {merged_config}") + return bigquery.LoadJobConfig.from_api_repr({"load": merged_config}) + + +def get_batches_for_prefix(gcs_client: storage.Client, + prefix_path: str, + ignore_subprefix="_config/", + ignore_file=SUCCESS_FILENAME) -> List[List[str]]: + """ + This function creates batches of GCS uris for a given prefix. + This prefix could be a table prefix or a partition prefix inside a + table prefix. + returns an Array of their batches + (one batch has an array of multiple GCS uris) + """ + batches = [] + blob: storage.Blob = storage.Blob.from_string(prefix_path) + bucket_name = blob.bucket.name + prefix_name = blob.name + + prefix_filter = f"{prefix_name}" + bucket = cached_get_bucket(gcs_client, bucket_name) + blobs = list(bucket.list_blobs(prefix=prefix_filter, delimiter="/")) + + cumulative_bytes = 0 + max_batch_size = int(os.getenv("MAX_BATCH_BYTES", DEFAULT_MAX_BATCH_BYTES)) + batch: List[str] = [] + for blob in blobs: + # API returns root prefix also. Which should be ignored. + # Similarly, the _SUCCESS file should be ignored. + # Finally, anything in the _config/ prefix should be ignored. 
+ if (blob.name + not in {f"{prefix_name}/", f"{prefix_name}/{ignore_file}"} + or blob.name.startswith(f"{prefix_name}/{ignore_subprefix}")): + if blob.size == 0: # ignore empty files + print(f"ignoring empty file: gs://{bucket}/{blob.name}") + continue + cumulative_bytes += blob.size + + # keep adding until we reach threshold + if cumulative_bytes <= max_batch_size or len( + batch) > MAX_SOURCE_URIS_PER_LOAD: + batch.append(f"gs://{bucket_name}/{blob.name}") + else: + batches.append(batch.copy()) + batch.clear() + batch.append(f"gs://{bucket_name}/{blob.name}") + cumulative_bytes = blob.size + + # pick up remaining files in the final batch + if len(batch) > 0: + batches.append(batch.copy()) + batch.clear() + + if len(batches) > 1: + print(f"split into {len(batches)} load jobs.") + elif len(batches) == 1: + print("using single load job.") + else: + raise RuntimeError("No files to load!") + return batches + + +def parse_notification(notification: dict) -> Tuple[str, str]: + """valdiates notification payload + Args: + notification(dict): Pub/Sub Storage Notification + https://cloud.google.com/storage/docs/pubsub-notifications + Or Cloud Functions direct trigger + https://cloud.google.com/functions/docs/tutorials/storage + with notification schema + https://cloud.google.com/storage/docs/json_api/v1/objects#resource + Returns: + tuple of bucketId and objectId attributes + Raises: + KeyError if the input notification does not contain the expected + attributes. + """ + if notification.get("kind") == "storage#object": + # notification is GCS Object reosource from Cloud Functions trigger + # https://cloud.google.com/storage/docs/json_api/v1/objects#resource + return notification["bucket"], notification["name"] + if notification.get("attributes"): + # notification is Pub/Sub message. 
+ try: + attributes = notification["attributes"] + return attributes["bucketId"], attributes["objectId"] + except KeyError: + raise RuntimeError( + "Issue with Pub/Sub message, did not contain expected" + f"attributes: 'bucketId' and 'objectId': {notification}" + ) from KeyError + raise RuntimeError( + "Cloud Function recieved unexpected trigger:\n" + f"{notification}\n" + "This function only supports direct Cloud Functions" + "Background Triggers or Pub/Sub storage notificaitons" + "as described in the following links:\n" + "https://cloud.google.com/storage/docs/pubsub-notifications\n" + "https://cloud.google.com/functions/docs/tutorials/storage") + + +# cache lookups against GCS API for 1 second as buckets / objects have update +# limit of once per second and we might do several of the same lookup during +# the functions lifetime. This should improve performance by eliminating +# unnecessary API calls. The lookups on bucket and objects in this function +# should not be changing during the function's lifetime as this would lead to +# non-deterministic results with or without this cache. 
+# https://cloud.google.com/storage/quotas +@cachetools.cached(cachetools.TTLCache(maxsize=1024, ttl=1)) +def read_gcs_file(gcs_client: storage.Client, gsurl: str) -> str: + """ + Read a GCS object as a string + + Args: + gcs_client: GCS client + gsurl: GCS URI for object to read in gs://bucket/path/to/object format + Returns: + str + """ + blob = storage.Blob.from_string(gsurl) + return blob.download_as_bytes(client=gcs_client).decode('UTF-8') + + +def read_gcs_file_if_exists(gcs_client: storage.Client, + gsurl: str) -> Optional[str]: + """return string of gcs object contents or None if the object does not exist + """ + try: + return read_gcs_file(gcs_client, gsurl) + except google.cloud.exceptions.NotFound: + return None + + +# Cache bucket lookups (see reasoning in comment above) +@cachetools.cached(cachetools.TTLCache(maxsize=1024, ttl=1)) +def cached_get_bucket( + gcs_client: storage.Client, + bucket_id: str, +) -> storage.Bucket: + """get storage.Bucket object by bucket_id string if exists or raise + google.cloud.exceptions.NotFound.""" + return gcs_client.get_bucket(bucket_id) + + +def dict_to_bq_schema(schema: List[Dict]) -> List[bigquery.SchemaField]: + """Converts a list of dicts to list of bigquery.SchemaField for use with + bigquery client library. Dicts must contain name and type keys. + The dict may optionally contain a mode key.""" + default_mode = "NULLABLE" + return [ + bigquery.SchemaField( + x["name"], + x["type"], + mode=x.get("mode") if x.get("mode") else default_mode) + for x in schema + ] + + +# To be added to built in str in python 3.9 +# https://www.python.org/dev/peps/pep-0616/ +def removesuffix(in_str: str, suffix: str) -> str: + """removes suffix from a string.""" + # suffix='' should not call self[:-0]. + if suffix and in_str.endswith(suffix): + return in_str[:-len(suffix)] + return in_str[:] + + +def recursive_update(original: Dict, update: Dict, in_place: bool = False): + """ + return a recursively updated dictionary. 
+ + Note, lists will be completely overwritten by value in update if there is a + conflict. + + original: (dict) the base dictionary + update: (dict) the dictionary of updates to apply on original + in_place: (bool) if true then original will be mutated in place else a new + dictionary as a result of the update will be returned. + """ + out = original if in_place else copy.deepcopy(original) + + for key, value in update.items(): + if isinstance(value, dict): + out[key] = recursive_update(out.get(key, {}), value) + else: + out[key] = value + return out diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index 4121ba3fc..4adf3ba43 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -21,7 +21,7 @@ import pytest from google.cloud import bigquery, storage -import gcs_ocn_bq_ingest.main +import gcs_ocn_bq_ingest.utils TEST_DIR = os.path.realpath(os.path.dirname(__file__)) LOAD_JOB_POLLING_TIMEOUT = 10 # seconds @@ -93,7 +93,7 @@ def teardown(): def dest_table(request, bq, mock_env, dest_dataset) -> bigquery.Table: with open(os.path.join(TEST_DIR, "resources", "nation_schema.json")) as schema_file: - schema = gcs_ocn_bq_ingest.main.dict_to_bq_schema( + schema = gcs_ocn_bq_ingest.utils.dict_to_bq_schema( json.load(schema_file)) table = bigquery.Table( diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index 6f983d22d..2a7e8896e 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -19,6 +19,7 @@ import pytest import gcs_ocn_bq_ingest.main +import gcs_ocn_bq_ingest.utils 
COMPILED_DEFAULT_DENTINATION_REGEX = re.compile( gcs_ocn_bq_ingest.main.DEFAULT_DESTINATION_REGEX) @@ -118,7 +119,7 @@ def test_default_destination_regex(test_input: str, ([["foo"], [], ["bar", "baz"]], ["foo", "bar", "baz"]), ]) def test_flattend2dlist(test_input, expected): - assert gcs_ocn_bq_ingest.main.flatten2dlist(test_input) == expected + assert gcs_ocn_bq_ingest.utils.flatten2dlist(test_input) == expected @pytest.mark.parametrize( @@ -201,4 +202,4 @@ def test_flattend2dlist(test_input, expected): # yapf: enable ]) def test_recursive_update(original, update, expected): - assert gcs_ocn_bq_ingest.main.recursive_update(original, update) == expected + assert gcs_ocn_bq_ingest.utils.recursive_update(original, update) == expected From b52d29123d411270f874845570da4193499434c6 Mon Sep 17 00:00:00 2001 From: Ryan den Otter Date: Fri, 4 Dec 2020 14:29:57 -0700 Subject: [PATCH 12/90] Fix sorting issues --- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py | 10 +++++----- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 8f4ff3d64..dec77b8ab 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -24,11 +24,11 @@ import google.cloud.exceptions from google.cloud import bigquery, storage -from .utils import (parse_notification, SUCCESS_FILENAME, removesuffix, - cached_get_bucket, handle_duplicate_notification, - DEFAULT_JOB_LABELS, read_gcs_file_if_exists, - look_for_config_in_parents, external_query, - create_job_id_prefix, load_batches) +from .utils import (DEFAULT_JOB_LABELS, SUCCESS_FILENAME, cached_get_bucket, + create_job_id_prefix, external_query, + handle_duplicate_notification, load_batches, + look_for_config_in_parents, 
parse_notification, + read_gcs_file_if_exists, removesuffix) # yapf: disable DEFAULT_DESTINATION_REGEX = ( diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index e1e8df7f2..db99b839b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -16,18 +16,18 @@ # limitations under the License. """Contains utility methods used by the BQIngest process """ -import os import collections import collections.abc import copy import json +import os import pathlib import time from typing import Any, Deque, Dict, List, Optional, Tuple import cachetools -import google.api_core.exceptions import google.api_core.client_info +import google.api_core.exceptions import google.cloud.exceptions from google.cloud import bigquery, storage From 3276214178f2562f0f7ef1b365d3becc42e6d941 Mon Sep 17 00:00:00 2001 From: Ryan den Otter Date: Fri, 4 Dec 2020 16:52:23 -0700 Subject: [PATCH 13/90] Move out constants into their own file Change import pattern --- .../gcs_ocn_bq_ingest/constants.py | 83 +++++++++++++++++++ .../gcs_ocn_bq_ingest/main.py | 61 +++++--------- .../gcs_ocn_bq_ingest/utils.py | 81 +++++------------- .../test_gcs_ocn_bq_ingest.py | 43 ++++------ 4 files changed, 140 insertions(+), 128 deletions(-) create mode 100644 tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py new file mode 100644 index 000000000..eefcc9f52 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py @@ -0,0 +1,83 @@ +# Copyright 2020 Google LLC. +# This software is provided as-is, without warranty or representation +# for any use or purpose. 
+# Your use of it is subject to your agreement with Google. + +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Configurations for Cloud Function for loading data from GCS to BigQuery. +""" +import os + +import google.api_core.client_info +import google.cloud.exceptions + +# Will wait up to this polling for errors before exiting +# This is to check if job fail quickly, not to assert it succeed. +# This may not be honored if longer than cloud function timeout. +# https://cloud.google.com/functions/docs/concepts/exec#timeout +# One might consider lowering this to 1-2 seconds to lower the +# upper bound of expected execution time to stay within the free tier. +# https://cloud.google.com/functions/pricing#free_tier +WAIT_FOR_JOB_SECONDS = int(os.getenv("WAIT_FOR_JOB_SECONDS", "5")) + +DEFAULT_EXTERNAL_TABLE_DEFINITION = { + # The default must be a self describing data format + # because autodetecting CSV /JSON schemas is likely to not match + # expectations / assumptions of the transformation query. + "sourceFormat": "PARQUET", +} + +# Use caution when lowering the job polling rate. +# Keep in mind that many concurrent executions of this cloud function should not +# violate the 300 concurrent requests or 100 request per second. 
+# https://cloud.google.com/bigquery/quotas#all_api_requests +JOB_POLL_INTERVAL_SECONDS = 1 + +DEFAULT_JOB_LABELS = { + "component": "event-based-gcs-ingest", + "cloud-function-name": os.getenv("FUNCTION_NAME"), +} + +BASE_LOAD_JOB_CONFIG = { + "sourceFormat": "CSV", + "fieldDelimiter": ",", + "writeDisposition": "WRITE_APPEND", + "labels": DEFAULT_JOB_LABELS, +} + +# https://cloud.google.com/bigquery/quotas#load_jobs +# 15TB per BQ load job (soft limit). +DEFAULT_MAX_BATCH_BYTES = str(15 * 10**12) + +# 10,000 GCS URIs per BQ load job. +MAX_SOURCE_URIS_PER_LOAD = 10**4 + +SUCCESS_FILENAME = os.getenv("SUCCESS_FILENAME", "_SUCCESS") + +DEFAULT_JOB_PREFIX = "gcf-ingest-" + +# yapf: disable +DEFAULT_DESTINATION_REGEX = ( + r"^(?P[\w\-\._0-9]+)/" # dataset (required) + r"(?P[\w\-_0-9]+)/?" # table name (required) + r"(?P\$[0-9]+)?/?" # partition decorator (optional) + r"(?P[0-9]{4})?/?" # partition year (yyyy) (optional) + r"(?P[0-9]{2})?/?" # partition month (mm) (optional) + r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) + r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) + r"(?P[\w\-_0-9]+)?/" # batch id (optional) +) +# yapf: enable + +CLIENT_INFO = google.api_core.client_info.ClientInfo( + user_agent="google-pso-tool/bq-severless-loader") diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index dec77b8ab..6e81e2a1d 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -20,31 +20,9 @@ import re from typing import Dict -import google.api_core.client_info -import google.cloud.exceptions from google.cloud import bigquery, storage -from .utils import (DEFAULT_JOB_LABELS, SUCCESS_FILENAME, cached_get_bucket, - create_job_id_prefix, external_query, - handle_duplicate_notification, load_batches, - look_for_config_in_parents, parse_notification, - read_gcs_file_if_exists, removesuffix) - -# yapf: disable -DEFAULT_DESTINATION_REGEX = ( - r"^(?P[\w\-\._0-9]+)/" # dataset (required) - r"(?P
[\w\-_0-9]+)/?" # table name (required) - r"(?P\$[0-9]+)?/?" # partition decorator (optional) - r"(?P[0-9]{4})?/?" # partition year (yyyy) (optional) - r"(?P[0-9]{2})?/?" # partition month (mm) (optional) - r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) - r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) - r"(?P[\w\-_0-9]+)?/" # batch id (optional) -) -# yapf: enable - -CLIENT_INFO = google.api_core.client_info.ClientInfo( - user_agent="google-pso-tool/bq-severless-loader") +from . import constants, utils def main(event: Dict, context): # pylint: disable=unused-argument @@ -54,27 +32,28 @@ def main(event: Dict, context): # pylint: disable=unused-argument # Set by Cloud Function Execution Environment # https://cloud.google.com/functions/docs/env-var destination_regex = os.getenv("DESTINATION_REGEX", - DEFAULT_DESTINATION_REGEX) + constants.DEFAULT_DESTINATION_REGEX) dest_re = re.compile(destination_regex) - bucket_id, object_id = parse_notification(event) + bucket_id, object_id = utils.parse_notification(event) # Exit eagerly if not a success file. # we can improve this with pub/sub message filtering once it supports # a hasSuffix filter function (we can filter on hasSuffix successfile name) # https://cloud.google.com/pubsub/docs/filtering - if not object_id.endswith(f"/{SUCCESS_FILENAME}"): + if not object_id.endswith(f"/{constants.SUCCESS_FILENAME}"): print( - f"No-op. This notification was not for a {SUCCESS_FILENAME} file.") + f"No-op. This notification was not for a {constants.SUCCESS_FILENAME} file." 
+ ) return - prefix_to_load = removesuffix(object_id, SUCCESS_FILENAME) + prefix_to_load = utils.removesuffix(object_id, constants.SUCCESS_FILENAME) gsurl = f"gs://{bucket_id}/{prefix_to_load}" - gcs_client = storage.Client(client_info=CLIENT_INFO) + gcs_client = storage.Client(client_info=constants.CLIENT_INFO) project = os.getenv("BQ_PROJECT", gcs_client.project) - bkt = cached_get_bucket(gcs_client, bucket_id) + bkt = utils.cached_get_bucket(gcs_client, bucket_id) success_blob: storage.Blob = bkt.blob(object_id) - handle_duplicate_notification(bkt, success_blob, gsurl) + utils.handle_duplicate_notification(bkt, success_blob, gsurl) destination_match = dest_re.match(object_id) if not destination_match: @@ -95,7 +74,7 @@ def main(event: Dict, context): # pylint: disable=unused-argument if not partition and any(part_list): partition = '$' + ''.join(part_list) batch_id = destination_details.get('batch') - labels = DEFAULT_JOB_LABELS + labels = constants.DEFAULT_JOB_LABELS labels["bucket"] = bucket_id if batch_id: @@ -111,23 +90,23 @@ def main(event: Dict, context): # pylint: disable=unused-argument default_query_config = bigquery.QueryJobConfig() default_query_config.use_legacy_sql = False default_query_config.labels = labels - bq_client = bigquery.Client(client_info=CLIENT_INFO, + bq_client = bigquery.Client(client_info=constants.CLIENT_INFO, default_query_job_config=default_query_config) print("looking for bq_transform.sql") - external_query_sql = read_gcs_file_if_exists( + external_query_sql = utils.read_gcs_file_if_exists( gcs_client, f"{gsurl}_config/bq_transform.sql") if not external_query_sql: - external_query_sql = look_for_config_in_parents(gcs_client, gsurl, - "bq_transform.sql") + external_query_sql = utils.look_for_config_in_parents( + gcs_client, gsurl, "bq_transform.sql") if external_query_sql: print("EXTERNAL QUERY") print(f"found external query:\n{external_query_sql}") - external_query(gcs_client, bq_client, gsurl, external_query_sql, - 
dest_table_ref, - create_job_id_prefix(dest_table_ref, batch_id)) + utils.external_query( + gcs_client, bq_client, gsurl, external_query_sql, dest_table_ref, + utils.create_job_id_prefix(dest_table_ref, batch_id)) return print("LOAD_JOB") - load_batches(gcs_client, bq_client, gsurl, dest_table_ref, - create_job_id_prefix(dest_table_ref, batch_id)) + utils.load_batches(gcs_client, bq_client, gsurl, dest_table_ref, + utils.create_job_id_prefix(dest_table_ref, batch_id)) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index db99b839b..5ec878c8a 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -31,50 +31,7 @@ import google.cloud.exceptions from google.cloud import bigquery, storage -# Will wait up to this polling for errors before exiting -# This is to check if job fail quickly, not to assert it succeed. -# This may not be honored if longer than cloud function timeout. -# https://cloud.google.com/functions/docs/concepts/exec#timeout -# One might consider lowering this to 1-2 seconds to lower the -# upper bound of expected execution time to stay within the free tier. -# https://cloud.google.com/functions/pricing#free_tier -WAIT_FOR_JOB_SECONDS = int(os.getenv("WAIT_FOR_JOB_SECONDS", "5")) - -DEFAULT_EXTERNAL_TABLE_DEFINITION = { - # The default must be a self describing data format - # because autodetecting CSV /JSON schemas is likely to not match - # expectations / assumptions of the transformation query. - "sourceFormat": "PARQUET", -} - -# Use caution when lowering the job polling rate. -# Keep in mind that many concurrent executions of this cloud function should not -# violate the 300 concurrent requests or 100 request per second. 
-# https://cloud.google.com/bigquery/quotas#all_api_requests -JOB_POLL_INTERVAL_SECONDS = 1 - -DEFAULT_JOB_LABELS = { - "component": "event-based-gcs-ingest", - "cloud-function-name": os.getenv("FUNCTION_NAME"), -} - -BASE_LOAD_JOB_CONFIG = { - "sourceFormat": "CSV", - "fieldDelimiter": ",", - "writeDisposition": "WRITE_APPEND", - "labels": DEFAULT_JOB_LABELS, -} - -# https://cloud.google.com/bigquery/quotas#load_jobs -# 15TB per BQ load job (soft limit). -DEFAULT_MAX_BATCH_BYTES = str(15 * 10**12) - -# 10,000 GCS URIs per BQ load job. -MAX_SOURCE_URIS_PER_LOAD = 10**4 - -SUCCESS_FILENAME = os.getenv("SUCCESS_FILENAME", "_SUCCESS") - -DEFAULT_JOB_PREFIX = "gcf-ingest-" +from . import constants def create_job_id_prefix(dest_table_ref: bigquery.TableReference, @@ -102,7 +59,7 @@ def create_job_id_prefix(dest_table_ref: bigquery.TableReference, if len(table_partition) < 2: # If there is no partition put a None placeholder table_partition.append("None") - return f"{os.getenv('JOB_PREFIX', DEFAULT_JOB_PREFIX)}" \ + return f"{os.getenv('JOB_PREFIX', constants.DEFAULT_JOB_PREFIX)}" \ f"{dest_table_ref.dataset_id}-" \ f"{'-'.join(table_partition)}-" \ f"{batch_id}-" @@ -128,7 +85,7 @@ def external_query( # pylint: disable=too-many-arguments else: print(f"Falling back to default CSV external table." 
f" {gsurl}_config/external.json not found.") - external_table_def = DEFAULT_EXTERNAL_TABLE_DEFINITION + external_table_def = constants.DEFAULT_EXTERNAL_TABLE_DEFINITION external_table_def["sourceUris"] = flatten2dlist( get_batches_for_prefix(gcs_client, gsurl)) @@ -153,12 +110,13 @@ def external_query( # pylint: disable=too-many-arguments start_poll_for_errors = time.monotonic() # Check if job failed quickly - while time.monotonic() - start_poll_for_errors < WAIT_FOR_JOB_SECONDS: + while time.monotonic( + ) - start_poll_for_errors < constants.WAIT_FOR_JOB_SECONDS: job.reload() if job.errors: raise RuntimeError( f"query job {job.job_id} failed quickly: {job.errors}") - time.sleep(JOB_POLL_INTERVAL_SECONDS) + time.sleep(constants.JOB_POLL_INTERVAL_SECONDS) def flatten2dlist(arr: List[List[Any]]) -> List[Any]: @@ -171,7 +129,7 @@ def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id_prefix): size of objects at gsurl""" batches = get_batches_for_prefix(gcs_client, gsurl) load_config = construct_load_job_config(gcs_client, gsurl) - load_config.labels = DEFAULT_JOB_LABELS + load_config.labels = constants.DEFAULT_JOB_LABELS batch_count = len(batches) jobs: List[bigquery.LoadJob] = [] @@ -190,14 +148,15 @@ def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id_prefix): start_poll_for_errors = time.monotonic() # Check if job failed quickly - while time.monotonic() - start_poll_for_errors < WAIT_FOR_JOB_SECONDS: + while time.monotonic( + ) - start_poll_for_errors < constants.WAIT_FOR_JOB_SECONDS: # Check if job failed quickly for job in jobs: job.reload() if job.errors: raise RuntimeError( f"load job {job.job_id} failed quickly: {job.errors}") - time.sleep(JOB_POLL_INTERVAL_SECONDS) + time.sleep(constants.JOB_POLL_INTERVAL_SECONDS) def handle_duplicate_notification(bkt: storage.Bucket, @@ -215,14 +174,14 @@ def handle_duplicate_notification(bkt: storage.Bucket, success_created_unix_timestamp = success_blob.time_created.timestamp() claim_blob: 
storage.Blob = bkt.blob( - success_blob.name.replace(SUCCESS_FILENAME, + success_blob.name.replace(constants.SUCCESS_FILENAME, f"_claimed_{success_created_unix_timestamp}")) try: claim_blob.upload_from_string("", if_generation_match=0) except google.api_core.exceptions.PreconditionFailed as err: raise RuntimeError( f"The prefix {gsurl} appears to already have been claimed for " - f"{gsurl}{SUCCESS_FILENAME} with created timestamp" + f"{gsurl}{constants.SUCCESS_FILENAME} with created timestamp" f"{success_created_unix_timestamp}." "This means that another invocation of this cloud function has" "claimed the ingestion of this batch." @@ -276,7 +235,7 @@ def _get_parent_config(path): bucket_name, path) config_q: Deque[Dict[str, Any]] = collections.deque() - config_q.append(BASE_LOAD_JOB_CONFIG) + config_q.append(constants.BASE_LOAD_JOB_CONFIG) while parts: config = _get_parent_config("/".join(parts)) if config: @@ -290,10 +249,11 @@ def _get_parent_config(path): return bigquery.LoadJobConfig.from_api_repr({"load": merged_config}) -def get_batches_for_prefix(gcs_client: storage.Client, - prefix_path: str, - ignore_subprefix="_config/", - ignore_file=SUCCESS_FILENAME) -> List[List[str]]: +def get_batches_for_prefix( + gcs_client: storage.Client, + prefix_path: str, + ignore_subprefix="_config/", + ignore_file=constants.SUCCESS_FILENAME) -> List[List[str]]: """ This function creates batches of GCS uris for a given prefix. This prefix could be a table prefix or a partition prefix inside a @@ -311,7 +271,8 @@ def get_batches_for_prefix(gcs_client: storage.Client, blobs = list(bucket.list_blobs(prefix=prefix_filter, delimiter="/")) cumulative_bytes = 0 - max_batch_size = int(os.getenv("MAX_BATCH_BYTES", DEFAULT_MAX_BATCH_BYTES)) + max_batch_size = int( + os.getenv("MAX_BATCH_BYTES", constants.DEFAULT_MAX_BATCH_BYTES)) batch: List[str] = [] for blob in blobs: # API returns root prefix also. Which should be ignored. 
@@ -327,7 +288,7 @@ def get_batches_for_prefix(gcs_client: storage.Client, # keep adding until we reach threshold if cumulative_bytes <= max_batch_size or len( - batch) > MAX_SOURCE_URIS_PER_LOAD: + batch) > constants.MAX_SOURCE_URIS_PER_LOAD: batch.append(f"gs://{bucket_name}/{blob.name}") else: batches.append(batch.copy()) diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index 2a7e8896e..712b380be 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -18,11 +18,12 @@ import pytest +import gcs_ocn_bq_ingest.constants import gcs_ocn_bq_ingest.main import gcs_ocn_bq_ingest.utils COMPILED_DEFAULT_DENTINATION_REGEX = re.compile( - gcs_ocn_bq_ingest.main.DEFAULT_DESTINATION_REGEX) + gcs_ocn_bq_ingest.constants.DEFAULT_DESTINATION_REGEX) @pytest.mark.parametrize( @@ -127,40 +128,31 @@ def test_flattend2dlist(test_input, expected): [ # yapf: disable ( # empty original - {}, - { + {}, { "a": 1 - }, - { + }, { "a": 1 - } - ), + }), ( # empty update { "a": 1 - }, - {}, - { + }, {}, { "a": 1 }), ( # basic update of top-level key { "a": 1 - }, - { + }, { "a": 2 - }, - { + }, { "a": 2 }), ( # update of list { "a": [1] - }, - { + }, { "a": [2] - }, - { + }, { "a": [2] }), ( # update of nested key @@ -168,13 +160,11 @@ def test_flattend2dlist(test_input, expected): "a": { "b": 1 } - }, - { + }, { "a": { "b": 2 } - }, - { + }, { "a": { "b": 2 } @@ -186,13 +176,11 @@ def test_flattend2dlist(test_input, expected): "c": 2 }, "d": 3 - }, - { + }, { "a": { "b": 4 }, - }, - { + }, { "a": { "b": 4, "c": 2 @@ -202,4 +190,5 @@ def test_flattend2dlist(test_input, expected): # yapf: enable ]) def test_recursive_update(original, update, expected): - assert 
gcs_ocn_bq_ingest.utils.recursive_update(original, update) == expected + assert gcs_ocn_bq_ingest.utils.recursive_update(original, + update) == expected From 117d91bfa51ae736496cae1f83e7c7d540a16086 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Fri, 4 Dec 2020 16:17:44 -0800 Subject: [PATCH 14/90] fixup! pylint --- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 6e81e2a1d..e6e7deaa9 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -22,7 +22,8 @@ from google.cloud import bigquery, storage -from . import constants, utils +# pylint in cloud build is being flaky about this import discovery. +from . import constants, utils # pylint: disable=no-name-in-module def main(event: Dict, context): # pylint: disable=unused-argument From 3770c84132d57ae33ceb4ee11bdace188a2a15b0 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Fri, 4 Dec 2020 16:20:47 -0800 Subject: [PATCH 15/90] fixup! fixup! gcb pylint issue --- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index 5ec878c8a..434b423c7 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -31,7 +31,8 @@ import google.cloud.exceptions from google.cloud import bigquery, storage -from . import constants +# pylint in cloud build is being flaky about this import discovery. +from . 
import constants # pylint: disable=no-name-in-module def create_job_id_prefix(dest_table_ref: bigquery.TableReference, From 81bb167a6abd4db96589a81f0856247c5ad55154 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Fri, 4 Dec 2020 11:59:00 -0800 Subject: [PATCH 16/90] feat: sequencing with backlog publisher / subscriber * Restructures code into constants and exception modules * Implements Backlog Publisher / Subscriber algorithm for ordering incrementals * Implements basic integration tests for Publisher / Subscriber --- .../gcs_ocn_bq_ingest/README.md | 1 + .../gcs_ocn_bq_ingest/constants.py | 33 +- .../gcs_ocn_bq_ingest/exceptions.py | 52 +++ .../gcs_ocn_bq_ingest/main.py | 206 +++++---- .../gcs_ocn_bq_ingest/ordering.py | 180 ++++++++ .../gcs_ocn_bq_ingest/requirements.txt | 5 +- .../gcs_ocn_bq_ingest/utils.py | 418 ++++++++++++++---- .../gcs_event_based_ingest/pytest.ini | 1 + .../requirements-dev.txt | 2 +- .../gcs_event_based_ingest/requirements.txt | 5 +- .../gcs_event_based_ingest/tests/conftest.py | 168 ++++++- .../test_gcs_ocn_bq_ingest.py | 24 + .../test_gcs_ocn_bq_ingest_it.py | 16 +- .../gcs_ocn_bq_ingest/test_ordering_it.py | 141 ++++++ .../tests/resources/ordering_schema.json | 10 + .../resources/test-data/ordering/00/_SUCCESS | 0 .../resources/test-data/ordering/00/data.csv | 1 + .../resources/test-data/ordering/01/_SUCCESS | 0 .../resources/test-data/ordering/01/data.csv | 1 + .../resources/test-data/ordering/02/_SUCCESS | 0 .../resources/test-data/ordering/02/data.csv | 1 + 21 files changed, 1069 insertions(+), 196 deletions(-) create mode 100644 tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/exceptions.py create mode 100644 tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py create mode 100644 tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py create mode 100644 tools/cloud_functions/gcs_event_based_ingest/tests/resources/ordering_schema.json create mode 100644 
tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/_SUCCESS create mode 100644 tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/data.csv create mode 100644 tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/_SUCCESS create mode 100644 tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/data.csv create mode 100644 tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/_SUCCESS create mode 100644 tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/data.csv diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md index 5e30a1c4b..1252b1dda 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md @@ -32,6 +32,7 @@ following default behavior. | `MAX_BATCH_BYTES` | Max bytes for BigQuery Load job | `15000000000000` ([15 TB](https://cloud.google.com/bigquery/quotas#load_jobs)| | `JOB_PREFIX` | Prefix for BigQuery Job IDs | `gcf-ingest-` | | `BQ_PROJECT` | Default BQ project to use if not specified in dataset capturing group | Project where Cloud Function is deployed | +| `ORDERED_PER_TABLE` | Force jobs to be executed sequentially (rather than parallel) based on the backlog. 
This is the same as having an `ORDERME` file in every config directory | `False` | ## Implementation notes diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py index eefcc9f52..a0db05425 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py @@ -16,7 +16,9 @@ # limitations under the License. """Configurations for Cloud Function for loading data from GCS to BigQuery. """ +import distutils.util import os +import re import google.api_core.client_info import google.cloud.exceptions @@ -71,13 +73,42 @@ r"^(?P[\w\-\._0-9]+)/" # dataset (required) r"(?P
[\w\-_0-9]+)/?" # table name (required) r"(?P\$[0-9]+)?/?" # partition decorator (optional) - r"(?P[0-9]{4})?/?" # partition year (yyyy) (optional) + r"(?:" # [begin] yyyy/mm/dd/hh/ group (optional) + r"(?P[0-9]{4})/?" # partition year (yyyy) (optional) r"(?P[0-9]{2})?/?" # partition month (mm) (optional) r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) + r")?" # [end]yyyy/mm/dd/hh/ group (optional) r"(?P[\w\-_0-9]+)?/" # batch id (optional) ) # yapf: enable +DESTINATION_REGEX = re.compile( + os.getenv("DESTINATION_REGEX", DEFAULT_DESTINATION_REGEX)) + CLIENT_INFO = google.api_core.client_info.ClientInfo( user_agent="google-pso-tool/bq-severless-loader") + +# Filename used to (re)start the backfill subscriber loop. +BACKFILL_FILENAME = "_BACKFILL" + +# When this file is uploaded the subscriber will start applying items in order +# off the backlog. This is meant to help scenarios where historical loads to GCS +# are parallelized but must be applied in order. One can drop a _HISTORYDONE +# file to indicate the entire history has been uploaded and it is safe to start +# applying items in the backlog in order. By default this will be empty and the +# backlog subscriber will not wait for any file and start applying the first +# items in the backlog. +START_BACKFILL_FILENAME = os.getenv("START_BACKFILL_FILENAME") + +# Filenames that cause cloud function to take action. +ACTION_FILENAMES = { + SUCCESS_FILENAME, + BACKFILL_FILENAME, + START_BACKFILL_FILENAME, +} + +RESTART_BUFFER_SECONDS = os.getenv("RESTART_BUFFER_SECONDS", 30) + +ORDER_ALL_JOBS = bool( + distutils.util.strtobool(os.getenv("ORDER_ALL_JOBS", "False"))) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/exceptions.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/exceptions.py new file mode 100644 index 000000000..908db717c --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/exceptions.py @@ -0,0 +1,52 @@ +# Copyright 2020 Google LLC. +# This software is provided as-is, without warranty or representation +# for any use or purpose. +# Your use of it is subject to your agreement with Google. 
+ +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Custom Exceptions of GCS event based ingest to BigQuery""" + + +class DuplicateNotificationException(Exception): + """Exception to indicate that the function was triggered twice for the same + event.""" + + +class BigQueryJobFailure(Exception): + """Exception to indicate that the function was triggered twice for the same + event.""" + + +class DestinationRegexMatchException(Exception): + """Exception to indicate that a success file did not match the destination + regex specified in the DESTINATION_REGEX environment variable (or the + default)""" + + +class UnexpectedTriggerException(Exception): + """Exception to indicate the cloud function was triggered with an unexpected + payload.""" + + +class BacklogException(Exception): + """Exception to indicate an issue with the backlog mechanics of this + function.""" + + +EXCEPTIONS_TO_REPORT = { + BigQueryJobFailure, + UnexpectedTriggerException, + DestinationRegexMatchException, + BacklogException, + DuplicateNotificationException, +} diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index e6e7deaa9..199ac47a8 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -17,97 +17,133 @@ """Background Cloud Function for loading data from GCS to BigQuery. 
""" import os -import re +import time from typing import Dict -from google.cloud import bigquery, storage - # pylint in cloud build is being flaky about this import discovery. -from . import constants, utils # pylint: disable=no-name-in-module +# pylint: disable=no-name-in-module +from google.cloud import bigquery, error_reporting, storage + +from . import constants, exceptions, ordering, utils +# Reuse GCP Clients across function invocations using globbals +# https://cloud.google.com/functions/docs/bestpractices/tips#use_global_variables_to_reuse_objects_in_future_invocations +# pylint: disable=global-statement +from .utils import apply + +ERROR_REPORTING_CLIENT = None + +BQ_CLIENT = None + +GCS_CLIENT = None def main(event: Dict, context): # pylint: disable=unused-argument """entry point for background cloud function for event driven GCS to BigQuery ingest.""" - # pylint: disable=too-many-locals - # Set by Cloud Function Execution Environment - # https://cloud.google.com/functions/docs/env-var - destination_regex = os.getenv("DESTINATION_REGEX", - constants.DEFAULT_DESTINATION_REGEX) - dest_re = re.compile(destination_regex) - - bucket_id, object_id = utils.parse_notification(event) - - # Exit eagerly if not a success file. - # we can improve this with pub/sub message filtering once it supports - # a hasSuffix filter function (we can filter on hasSuffix successfile name) - # https://cloud.google.com/pubsub/docs/filtering - if not object_id.endswith(f"/{constants.SUCCESS_FILENAME}"): - print( - f"No-op. This notification was not for a {constants.SUCCESS_FILENAME} file." 
- ) - return - - prefix_to_load = utils.removesuffix(object_id, constants.SUCCESS_FILENAME) - gsurl = f"gs://{bucket_id}/{prefix_to_load}" - gcs_client = storage.Client(client_info=constants.CLIENT_INFO) - project = os.getenv("BQ_PROJECT", gcs_client.project) - bkt = utils.cached_get_bucket(gcs_client, bucket_id) - success_blob: storage.Blob = bkt.blob(object_id) - utils.handle_duplicate_notification(bkt, success_blob, gsurl) - - destination_match = dest_re.match(object_id) - if not destination_match: - raise RuntimeError(f"Object ID {object_id} did not match regex:" - f" {destination_regex}") - destination_details = destination_match.groupdict() try: - dataset = destination_details['dataset'] - table = destination_details['table'] - except KeyError: - raise RuntimeError( - f"Object ID {object_id} did not match dataset and table in regex:" - f" {destination_regex}") from KeyError - partition = destination_details.get('partition') - year, month, day, hour = ( - destination_details.get(key, "") for key in ('yyyy', 'mm', 'dd', 'hh')) - part_list = (year, month, day, hour) - if not partition and any(part_list): - partition = '$' + ''.join(part_list) - batch_id = destination_details.get('batch') - labels = constants.DEFAULT_JOB_LABELS - labels["bucket"] = bucket_id - - if batch_id: - labels["batch-id"] = batch_id - - if partition: - dest_table_ref = bigquery.TableReference.from_string( - f"{dataset}.{table}{partition}", default_project=project) - else: - dest_table_ref = bigquery.TableReference.from_string( - f"{dataset}.{table}", default_project=project) - - default_query_config = bigquery.QueryJobConfig() - default_query_config.use_legacy_sql = False - default_query_config.labels = labels - bq_client = bigquery.Client(client_info=constants.CLIENT_INFO, - default_query_job_config=default_query_config) - - print("looking for bq_transform.sql") - external_query_sql = utils.read_gcs_file_if_exists( - gcs_client, f"{gsurl}_config/bq_transform.sql") - if not 
external_query_sql: - external_query_sql = utils.look_for_config_in_parents( - gcs_client, gsurl, "bq_transform.sql") - if external_query_sql: - print("EXTERNAL QUERY") - print(f"found external query:\n{external_query_sql}") - utils.external_query( - gcs_client, bq_client, gsurl, external_query_sql, dest_table_ref, - utils.create_job_id_prefix(dest_table_ref, batch_id)) - return - - print("LOAD_JOB") - utils.load_batches(gcs_client, bq_client, gsurl, dest_table_ref, - utils.create_job_id_prefix(dest_table_ref, batch_id)) + function_start_time = time.monotonic() + # pylint: disable=too-many-locals + + bucket_id, object_id = utils.parse_notification(event) + + basename_object_id = os.path.basename(object_id) + + # Exit eagerly if this is not a file to take action on. + if basename_object_id not in constants.ACTION_FILENAMES: + action_filenames = constants.ACTION_FILENAMES + if constants.START_BACKFILL_FILENAME is None: + action_filenames.remove(None) + print(f"No-op. This notification was not for a" + f"{action_filenames} file.") + return + + # Ignore success files in the backlog directory + if (basename_object_id == constants.SUCCESS_FILENAME + and "/_backlog/" in object_id): + print(f"No-op. This notification was for " + f"gs://{bucket_id}/{object_id} a" + f"{constants.SUCCESS_FILENAME} in a" + "/_backlog/ directory.") + return + + gcs_client = lazy_gcs_client() + bq_client = lazy_bq_client() + table_ref, batch = utils.gcs_path_to_table_ref_and_batch(object_id) + + enforce_ordering = (constants.ORDER_ALL_JOBS + or utils.look_for_config_in_parents( + gcs_client, f"gs://{bucket_id}/{object_id}", + "ORDERME") is not None) + + bkt: storage.Bucket = utils.cached_get_bucket(gcs_client, bucket_id) + event_blob: storage.Blob = bkt.blob(object_id) + + if enforce_ordering: + if (constants.START_BACKFILL_FILENAME and basename_object_id + == constants.START_BACKFILL_FILENAME): + # This will be the first backfill file. 
+ ordering.start_backfill_subscriber_if_not_running( + gcs_client, bkt, utils.get_table_prefix(object_id)) + return + if basename_object_id == constants.SUCCESS_FILENAME: + ordering.backlog_publisher(gcs_client, event_blob) + elif basename_object_id == constants.BACKFILL_FILENAME: + ordering.backlog_subscriber(gcs_client, bq_client, + lazy_error_reporting_client(), + event_blob, function_start_time) + else: # Default behavior submit job as soon as success file lands. + bkt = utils.cached_get_bucket(gcs_client, bucket_id) + success_blob: storage.Blob = bkt.blob(object_id) + utils.handle_duplicate_notification(success_blob) + apply( + gcs_client, + bq_client, + success_blob, + None, # None lock blob as there is no serialization required. + utils.create_job_id(table_ref, batch)) + # Unexpected exceptions will actually raise which may cause a cold restart. + except tuple(exceptions.EXCEPTIONS_TO_REPORT): + # We do this because we know these errors do not require a cold restart + # of the cloud function. + lazy_error_reporting_client().report_exception() + + +def lazy_error_reporting_client() -> error_reporting.Client: + """ + Return a error reporting client that may be shared between cloud function + invocations. + + https://cloud.google.com/functions/docs/monitoring/error-reporting + """ + global ERROR_REPORTING_CLIENT + if not ERROR_REPORTING_CLIENT: + ERROR_REPORTING_CLIENT = error_reporting.Client( + client_info=constants.CLIENT_INFO) + return ERROR_REPORTING_CLIENT + + +def lazy_bq_client() -> bigquery.Client: + """ + Return a BigQuery Client that may be shared between cloud function + invocations. 
+ """ + global BQ_CLIENT + if not BQ_CLIENT: + default_query_config = bigquery.QueryJobConfig() + default_query_config.use_legacy_sql = False + default_query_config.labels = constants.DEFAULT_JOB_LABELS + BQ_CLIENT = bigquery.Client( + client_info=constants.CLIENT_INFO, + default_query_job_config=default_query_config) + return BQ_CLIENT + + +def lazy_gcs_client() -> storage.Client: + """ + Return a BigQuery Client that may be shared between cloud function + invocations. + """ + global GCS_CLIENT + if not GCS_CLIENT: + GCS_CLIENT = storage.Client(client_info=constants.CLIENT_INFO) + return GCS_CLIENT diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py new file mode 100644 index 000000000..310eb1f52 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py @@ -0,0 +1,180 @@ +# Copyright 2020 Google LLC. +# This software is provided as-is, without warranty or representation +# for any use or purpose. +# Your use of it is subject to your agreement with Google. + +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Background Cloud Function for loading data from GCS to BigQuery. +""" +import os +import time +import traceback + +import google.api_core +import google.api_core.exceptions +# pylint in cloud build is being flaky about this import discovery. 
# pylint: disable=no-name-in-module
from google.cloud import bigquery, error_reporting, storage

from . import constants, exceptions, utils


def backlog_publisher(
    gcs_client: storage.Client,
    event_blob: storage.Blob,
):
    """Add a success file to the backlog and trigger backfill if necessary.

    Args:
        gcs_client: storage.Client to use for GCS operations.
        event_blob: storage.Blob the _SUCCESS blob whose batch should be
            queued in the backlog.
    """
    bkt = event_blob.bucket

    # Create an entry in _backlog for this table for this batch / success file
    backlog_blob = success_blob_to_backlog_blob(event_blob)
    backlog_blob.upload_from_string("", client=gcs_client)
    print(f"added gs://{backlog_blob.bucket.name}/{backlog_blob.name} "
          "to the backlog.")

    start_backfill = True
    table_prefix = utils.get_table_prefix(event_blob.name)
    # Optionally wait for a human-dropped start-backfill marker before
    # kicking off the subscriber loop for this table prefix.
    if constants.START_BACKFILL_FILENAME:
        start_backfill_blob = bkt.blob(
            f"{table_prefix}/{constants.START_BACKFILL_FILENAME}")
        start_backfill = start_backfill_blob.exists()

    if start_backfill:
        start_backfill_subscriber_if_not_running(gcs_client, bkt, table_prefix)


# pylint: disable=too-many-arguments,too-many-locals
def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client,
                       error_client: error_reporting.Client,
                       backfill_blob: storage.Blob,
                       function_start_time: float):
    """Pick up the table lock, poll BQ job id until completion and process next
    item in the backlog.

    Args:
        gcs_client: storage.Client for GCS operations.
        bq_client: bigquery.Client used to poll job state.
        error_client: error_reporting.Client for reporting failed jobs.
        backfill_blob: storage.Blob the _BACKFILL file that triggered this
            invocation.
        function_start_time: float monotonic timestamp taken at function
            entry; used to restart the loop before the CF timeout.
    """
    # We need to retrigger the backfill loop before the Cloud Functions
    # timeout.
    restart_time = function_start_time + (
        float(os.getenv("FUNCTION_TIMEOUT_SEC", "60")) -
        constants.RESTART_BUFFER_SECONDS)
    bkt = backfill_blob.bucket
    utils.handle_duplicate_notification(backfill_blob)
    table_prefix = utils.get_table_prefix(backfill_blob.name)
    last_job_done = False
    # we will poll for job completion this long in an individual iteration of
    # the while loop.
    polling_timeout = 5  # seconds
    lock_blob: storage.Blob = bkt.blob(f"{table_prefix}/_bqlock")
    if restart_time - polling_timeout < time.monotonic():
        raise EnvironmentError(
            "The Cloud Function timeout is too short for "
            "backlog subscriber to do its job. We recommend "
            "setting the timeout to 540 seconds or at least "
            "1 minute (Cloud Functions default).")
    while time.monotonic() < restart_time - polling_timeout:
        job_id = utils.read_gcs_file_if_exists(
            gcs_client, f"gs://{bkt.name}/{lock_blob.name}")
        if job_id:
            if job_id.startswith(
                    os.getenv('JOB_PREFIX', constants.DEFAULT_JOB_PREFIX)):
                try:
                    last_job_done = utils.wait_on_bq_job_id(
                        bq_client, job_id, polling_timeout)
                except (exceptions.BigQueryJobFailure,
                        google.api_core.exceptions.NotFound):
                    last_job_done = False
                    error_client.report(
                        f"previous BigQuery job: {job_id} failed or could not "
                        "be found. This will kill the backfill subscriber for "
                        f"the table prefix {table_prefix}. "
                        "Once the issue is dealt with by a human, the lock "
                        "file at: "
                        f"gs://{lock_blob.bucket.name}/{lock_blob.name} "
                        "should be manually removed and a new empty "
                        f"{constants.BACKFILL_FILENAME} "
                        "file uploaded to: "
                        f"gs://{lock_blob.bucket.name}/{table_prefix}"
                        "/_BACKFILL "
                        "to resume the backfill subscriber so it can "
                        "continue with the next item in the backlog.\n"
                        "Original Exception:\n"
                        f"{traceback.format_exc()}")
                    time.sleep(polling_timeout)
                    continue
            else:
                # A lock whose contents do not carry our job prefix was
                # placed by a human; back off until it is removed.
                print(f"sleeping for {polling_timeout} seconds because "
                      f"found manual lock gs://{bkt.name}/{lock_blob.name} "
                      "with contents:\n"
                      f"""{utils.read_gcs_file_if_exists(gcs_client,
                      f'gs://{lock_blob.bucket.name}/{lock_blob.name}')}""")
                time.sleep(polling_timeout)
                continue
        if last_job_done:
            utils.remove_oldest_backlog_item(gcs_client, bkt, table_prefix)
            last_job_done = False

        next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt,
                                                        table_prefix)
        if not next_backlog_file:
            print(f"backlog is empty for gs://{bkt.name}/{table_prefix}. "
                  "backlog subscriber exiting.")
            utils.handle_bq_lock(gcs_client, lock_blob, None)
            return
        next_success_file: storage.Blob = bkt.blob(
            next_backlog_file.name.replace("/_backlog/", "/"))
        table_ref, batch = utils.gcs_path_to_table_ref_and_batch(
            next_success_file.name)
        if not next_success_file.exists():
            raise exceptions.BacklogException(
                "backlog contains "
                f"gs://{next_backlog_file.bucket}/{next_backlog_file.name} "
                "but the corresponding success file does not exist at: "
                f"gs://{next_success_file.bucket}/{next_success_file.name}")
        utils.apply(gcs_client, bq_client, next_success_file, lock_blob,
                    utils.create_job_id(table_ref, batch))
    # retrigger the subscriber loop by reposting the _BACKFILL file
    print("ran out of time, restarting backfill subscriber loop for: "
          f"gs://{bkt.name}/{table_prefix}")
    backfill_blob = bkt.blob(f"{table_prefix}/{constants.BACKFILL_FILENAME}")
    backfill_blob.upload_from_string("")


def start_backfill_subscriber_if_not_running(gcs_client: storage.Client,
                                             bkt: storage.Bucket,
                                             table_prefix: str):
    """Start the backfill subscriber if it is not already running for this
    table prefix.

    Creates a _BACKFILL file for the table prefix if it does not already
    exist. The if_generation_match=0 precondition makes the upload an atomic
    "create if absent" so at most one subscriber is triggered.
    """
    # Create a _BACKFILL file for this table if not exists
    backfill_blob = bkt.blob(f"{table_prefix}/{constants.BACKFILL_FILENAME}")
    try:
        backfill_blob.upload_from_string("",
                                         if_generation_match=0,
                                         client=gcs_client)
        print("triggered backfill with "
              f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} "
              f"created at {backfill_blob.time_created}. exiting. ")
    except google.api_core.exceptions.PreconditionFailed:
        # Another invocation won the race; report who holds the trigger.
        backfill_blob.reload()
        print("backfill already in progress due to: "
              f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} "
              f"created at {backfill_blob.time_created}. exiting.")


def success_blob_to_backlog_blob(success_blob: storage.Blob) -> storage.Blob:
    """Create a blob object that is a pointer to the input success blob in the
    backlog.
    """
    bkt = success_blob.bucket
    table_prefix = utils.get_table_prefix(success_blob.name)
    success_file_suffix = utils.removeprefix(success_blob.name,
                                             f"{table_prefix}/")
    return bkt.blob(f"{table_prefix}/_backlog/{success_file_suffix}")
import constants # pylint: disable=no-name-in-module - +from google.cloud import bigquery, storage -def create_job_id_prefix(dest_table_ref: bigquery.TableReference, - batch_id: Optional[str]): - """Create job id prefix with a consistent naming convention. - The naming conventions is as follows: - gcf-ingest----- - Parts that are not inferrable from the GCS path with have a 'None' - placeholder. This naming convention is crucial for monitoring the system. - Note, gcf-ingest- can be overridden with environment variable JOB_PREFIX - - Examples: - - Non-partitioned Non batched tables: - - gs://${BUCKET}/tpch/lineitem/_SUCCESS - - gcf-ingest-tpch-lineitem-None-None- - Non-partitioned batched tables: - - gs://${BUCKET}/tpch/lineitem/batch000/_SUCCESS - - gcf-ingest-tpch-lineitem-None-batch000- - Partitioned Batched tables: - - gs://${BUCKET}/tpch/lineitem/$20201031/batch000/_SUCCESS - - gcf-ingest-tpch-lineitem-20201031-batch000- - """ - table_partition = dest_table_ref.table_id.split("$") - if len(table_partition) < 2: - # If there is no partition put a None placeholder - table_partition.append("None") - return f"{os.getenv('JOB_PREFIX', constants.DEFAULT_JOB_PREFIX)}" \ - f"{dest_table_ref.dataset_id}-" \ - f"{'-'.join(table_partition)}-" \ - f"{batch_id}-" +from . import constants, exceptions # pylint: disable=no-name-in-module def external_query( # pylint: disable=too-many-arguments gcs_client: storage.Client, bq_client: bigquery.Client, gsurl: str, - query: str, dest_table_ref: bigquery.TableReference, - job_id_prefix: str): + query: str, dest_table_ref: bigquery.TableReference, job_id: str): """Load from query over external table from GCS. 
This hinges on a SQL query defined in GCS at _config/bq_transform.sql and @@ -104,7 +74,7 @@ def external_query( # pylint: disable=too-many-arguments job: bigquery.QueryJob = bq_client.query( rendered_query, job_config=job_config, - job_id_prefix=job_id_prefix, + job_id=job_id, ) print(f"started asynchronous query job: {job.job_id}") @@ -115,7 +85,7 @@ def external_query( # pylint: disable=too-many-arguments ) - start_poll_for_errors < constants.WAIT_FOR_JOB_SECONDS: job.reload() if job.errors: - raise RuntimeError( + raise exceptions.BigQueryJobFailure( f"query job {job.job_id} failed quickly: {job.errors}") time.sleep(constants.JOB_POLL_INTERVAL_SECONDS) @@ -125,23 +95,18 @@ def flatten2dlist(arr: List[List[Any]]) -> List[Any]: return [j for i in arr for j in i] -def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id_prefix): +def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id): """orchestrate 1 or more load jobs based on number of URIs and total byte size of objects at gsurl""" batches = get_batches_for_prefix(gcs_client, gsurl) load_config = construct_load_job_config(gcs_client, gsurl) load_config.labels = constants.DEFAULT_JOB_LABELS - batch_count = len(batches) jobs: List[bigquery.LoadJob] = [] - for batch_num, batch in enumerate(batches): + for batch in batches: print(load_config.to_api_repr()) job: bigquery.LoadJob = bq_client.load_table_from_uri( - batch, - dest_table_ref, - job_config=load_config, - job_id_prefix=f"{job_id_prefix}{batch_num}-of-{batch_count}-", - ) + batch, dest_table_ref, job_config=load_config, job_id=job_id) print(f"started asyncronous bigquery load job with id: {job.job_id} for" f" {gsurl}") @@ -155,41 +120,11 @@ def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id_prefix): for job in jobs: job.reload() if job.errors: - raise RuntimeError( + raise exceptions.BigQueryJobFailure( f"load job {job.job_id} failed quickly: {job.errors}") time.sleep(constants.JOB_POLL_INTERVAL_SECONDS) -def 
handle_duplicate_notification(bkt: storage.Bucket, - success_blob: storage.Blob, gsurl: str): - """ - Need to handle potential duplicate Pub/Sub notifications. - To achieve this we will drop an empty "claimed" file that indicates - an invocation of this cloud function has picked up the success file - with a certain creation timestamp. This will support republishing the - success file as a mechanism of re-running the ingestion while avoiding - duplicate ingestion due to multiple Pub/Sub messages for a success file - with the same creation time. - """ - success_blob.reload() - success_created_unix_timestamp = success_blob.time_created.timestamp() - - claim_blob: storage.Blob = bkt.blob( - success_blob.name.replace(constants.SUCCESS_FILENAME, - f"_claimed_{success_created_unix_timestamp}")) - try: - claim_blob.upload_from_string("", if_generation_match=0) - except google.api_core.exceptions.PreconditionFailed as err: - raise RuntimeError( - f"The prefix {gsurl} appears to already have been claimed for " - f"{gsurl}{constants.SUCCESS_FILENAME} with created timestamp" - f"{success_created_unix_timestamp}." - "This means that another invocation of this cloud function has" - "claimed the ingestion of this batch." 
# To be added to built in str in python 3.9
# https://www.python.org/dev/peps/pep-0616/
def removeprefix(in_str: str, prefix: str) -> str:
    """Remove a string prefix if present; otherwise return the input."""
    if in_str.startswith(prefix):
        return in_str[len(prefix):]
    return in_str


def handle_duplicate_notification(blob_to_claim: storage.Blob):
    """Claim a blob so duplicate Pub/Sub notifications are processed once.

    Need to handle potential duplicate Pub/Sub notifications.
    To achieve this we will drop an empty "claimed" file that indicates
    an invocation of this cloud function has picked up the success file
    with a certain creation timestamp. This will support republishing the
    success file as a mechanism of re-running the ingestion while avoiding
    duplicate ingestion due to multiple Pub/Sub messages for a success file
    with the same creation time.

    Raises:
        exceptions.DuplicateNotificationException: if this blob's creation
            timestamp has already been claimed by another invocation.
    """
    blob_to_claim.reload()
    created_unix_timestamp = blob_to_claim.time_created.timestamp()

    basename = os.path.basename(blob_to_claim.name)
    claim_blob: storage.Blob = blob_to_claim.bucket.blob(
        blob_to_claim.name.replace(
            basename, f"_claimed_{basename}_created_at_"
            f"{created_unix_timestamp}"))
    try:
        # if_generation_match=0 makes this an atomic create-if-absent.
        claim_blob.upload_from_string("", if_generation_match=0)
    except google.api_core.exceptions.PreconditionFailed as err:
        raise exceptions.DuplicateNotificationException(
            f"gs://{blob_to_claim.bucket.name}/{blob_to_claim.name} appears "
            "to already have been claimed for created timestamp: "
            f"{created_unix_timestamp}. "
            "This means that another invocation of this cloud function has "
            "claimed the work to be done for this file. "
            "This may be due to a rare duplicate delivery of the Pub/Sub "
            "storage notification.") from err


def get_table_prefix(object_id: str) -> str:
    """Find the table prefix for an object_id based on the destination regex.

    Args:
        object_id: str object ID to parse
    Returns:
        str: table prefix
    Raises:
        exceptions.DestinationRegexMatchException: if the object id does not
            match the destination regex or its "table" capturing group.
    """
    match = constants.DESTINATION_REGEX.match(object_id)
    if not match:
        raise exceptions.DestinationRegexMatchException(
            f"could not determine table prefix for object id: {object_id} "
            "because it did not contain a match for destination_regex: "
            f"{constants.DESTINATION_REGEX.pattern}")
    table_group_index = match.re.groupindex.get("table")
    if table_group_index:
        # The end offset of the named "table" group marks the table-level
        # prefix within the object id.
        table_level_index = match.regs[table_group_index][1]
        return object_id[:table_level_index]
    raise exceptions.DestinationRegexMatchException(
        f"could not determine table prefix for object id: {object_id} "
        "because it did not contain a match for the table capturing group "
        f"in destination regex: {constants.DESTINATION_REGEX.pattern}")


def get_next_backlog_item(
    gcs_client: storage.Client,
    bkt: storage.Bucket,
    table_prefix: str,
) -> Optional[storage.Blob]:
    """
    Get next blob in the backlog if the backlog is not empty.

    Args:
        gcs_client: storage.Client
        bkt: storage.Bucket that this cloud function is ingesting data for.
        table_prefix: the prefix for the table whose backlog should be checked.

    Returns:
        storage.Blob: pointer to a SUCCESS file in the backlog, or None if
        the backlog is empty.
    """
    backlog_blobs = gcs_client.list_blobs(bkt,
                                          prefix=f"{table_prefix}/_backlog/")
    # Backlog items will be lexicographically sorted
    # https://cloud.google.com/storage/docs/json_api/v1/objects/list
    return next(iter(backlog_blobs), None)  # first item or None


def remove_oldest_backlog_item(
    gcs_client: storage.Client,
    bkt: storage.Bucket,
    table_prefix: str,
) -> bool:
    """
    Remove the oldest pointer in the backlog if the backlog is not empty.

    Args:
        gcs_client: storage.Client
        bkt: storage.Bucket that this cloud function is ingesting data for.
        table_prefix: the prefix for the table whose backlog should be checked.

    Returns:
        bool: True if we removed the oldest blob. False if the backlog was
        empty.
    """
    backlog_blobs = gcs_client.list_blobs(bkt,
                                          prefix=f"{table_prefix}/_backlog/")
    # Backlog items will be lexicographically sorted
    # https://cloud.google.com/storage/docs/json_api/v1/objects/list
    blob: storage.Blob
    for blob in backlog_blobs:
        blob.delete()
        return True  # Return after deleting first blob in the iterator
    return False


def wait_on_bq_job_id(bq_client: bigquery.Client,
                      job_id: str,
                      polling_timeout: int,
                      polling_interval: int = 1) -> bool:
    """
    Wait for a BigQuery Job ID to complete.

    Args:
        bq_client: bigquery.Client
        job_id: str the BQ job ID to wait on
        polling_timeout: int number of seconds to poll this job ID
        polling_interval: frequency to query the job state during polling
    Returns:
        bool: if the job ID has finished successfully. True if DONE without
        errors, False if RUNNING or PENDING
    Raises:
        exceptions.BigQueryJobFailure if the job failed.
        google.api_core.exceptions.NotFound if the job id cannot be found.
    """
    start_poll = time.monotonic()
    while time.monotonic() - start_poll < (polling_timeout - polling_interval):
        job: Union[bigquery.LoadJob,
                   bigquery.QueryJob] = bq_client.get_job(job_id)
        if job.state == "DONE":
            if job.errors:
                raise exceptions.BigQueryJobFailure(
                    f"BigQuery Job {job.job_id} failed during backfill with "
                    f"the following errors: {job.errors}")
            return True
        if job.state in {"RUNNING", "PENDING"}:
            print(f"waiting on BigQuery Job {job.job_id}")
            time.sleep(polling_interval)
    return False


def wait_on_gcs_blob(gcs_client: storage.Client,
                     wait_blob: storage.Blob,
                     polling_timeout: int,
                     polling_interval: int = 1) -> bool:
    """
    Wait for a GCS object to exist.

    Args:
        gcs_client: storage.Client
        wait_blob: storage.Blob the GCS object to wait on.
        polling_timeout: int number of seconds to poll for this object
        polling_interval: frequency to re-check existence during polling
    Returns:
        bool: True if the object came to exist within the polling timeout,
        False otherwise.
    """
    start_poll = time.monotonic()
    while time.monotonic() - start_poll < (polling_timeout - polling_interval):
        if wait_blob.exists(client=gcs_client):
            return True
        print(
            f"waiting on GCS file gs://{wait_blob.bucket.name}/{wait_blob.name}"
        )
        time.sleep(polling_interval)
    return False


def gcs_path_to_table_ref_and_batch(
        object_id) -> Tuple[bigquery.TableReference, Optional[str]]:
    """Extract a BigQuery table reference and batch id from a GCS object id.

    Args:
        object_id: str GCS object id to parse with the destination regex.
    Returns:
        Tuple[bigquery.TableReference, Optional[str]]: the destination table
        (with "$partition" decorator when a partition is inferable) and the
        batch id (None when the path carries no batch).
    Raises:
        RuntimeError: if the object id does not match the destination regex.
        exceptions.DestinationRegexMatchException: if dataset/table groups
            are missing from the match.
    """
    destination_match = constants.DESTINATION_REGEX.match(object_id)
    if not destination_match:
        raise RuntimeError(f"Object ID {object_id} did not match regex:"
                           f" {constants.DESTINATION_REGEX.pattern}")
    destination_details = destination_match.groupdict()
    try:
        dataset = destination_details['dataset']
        table = destination_details['table']
    except KeyError:
        raise exceptions.DestinationRegexMatchException(
            f"Object ID {object_id} did not match dataset and table in regex:"
            f" {constants.DESTINATION_REGEX.pattern}") from KeyError
    partition = destination_details.get('partition')
    # groupdict() yields None for named groups that did not participate in
    # the match; coerce to "" so ''.join below cannot fail on None.
    year, month, day, hour = (destination_details.get(key) or ""
                              for key in ('yyyy', 'mm', 'dd', 'hh'))
    part_list = (year, month, day, hour)
    if not partition and any(part_list):
        partition = '$' + ''.join(part_list)
    batch_id = destination_details.get('batch')
    # NOTE(review): this aliases (does not copy) the module-level
    # DEFAULT_JOB_LABELS, so adding "batch-id" mutates the shared constant.
    # It looks intentional (later job configs read DEFAULT_JOB_LABELS) --
    # TODO confirm before replacing with a copy.
    labels = constants.DEFAULT_JOB_LABELS

    if batch_id:
        labels["batch-id"] = batch_id

    default_project = os.getenv("BQ_PROJECT", os.getenv("GCP_PROJECT"))
    if partition:
        dest_table_ref = bigquery.TableReference.from_string(
            f"{dataset}.{table}{partition}", default_project=default_project)
    else:
        dest_table_ref = bigquery.TableReference.from_string(
            f"{dataset}.{table}", default_project=default_project)
    return dest_table_ref, batch_id


def create_job_id(dest_table_ref: bigquery.TableReference,
                  batch_id: Optional[str]):
    """Create a unique job id with a consistent naming convention.
    The naming convention is as follows:
    gcf-ingest-----
    Parts that are not inferrable from the GCS path will have a 'None'
    placeholder. This naming convention is crucial for monitoring the system.
    A trailing uuid4 guarantees uniqueness across retries.
    Note, gcf-ingest- can be overridden with environment variable JOB_PREFIX

    Examples:

    Non-partitioned Non batched tables:
      gs://${BUCKET}/tpch/lineitem/_SUCCESS
      gcf-ingest-tpch-lineitem-None-None-
    Non-partitioned batched tables:
      gs://${BUCKET}/tpch/lineitem/batch000/_SUCCESS
      gcf-ingest-tpch-lineitem-None-batch000-
    Partitioned Batched tables:
      gs://${BUCKET}/tpch/lineitem/$20201031/batch000/_SUCCESS
      gcf-ingest-tpch-lineitem-20201031-batch000-
    """
    table_partition = dest_table_ref.table_id.split("$")
    if len(table_partition) < 2:
        # If there is no partition put a None placeholder
        table_partition.append("None")
    return f"{os.getenv('JOB_PREFIX', constants.DEFAULT_JOB_PREFIX)}" \
           f"{dest_table_ref.dataset_id}-" \
           f"{'-'.join(table_partition)}-" \
           f"{batch_id}-{uuid.uuid4()}"


def handle_bq_lock(gcs_client: storage.Client, lock_blob: storage.Blob,
                   next_job_id: Optional[str]):
    """Reclaim the lock blob for the new job id (in-place) or delete the lock
    blob if next_job_id is None.

    Raises:
        exceptions.BacklogException: if the lock generation changed
            concurrently (another process touched the lock).
    """
    try:
        if next_job_id:
            if lock_blob.exists():
                lock_blob.upload_from_string(
                    next_job_id,
                    if_generation_match=lock_blob.generation,
                    client=gcs_client)
            else:  # This happens when submitting the first job in the backlog
                lock_blob.upload_from_string(next_job_id,
                                             if_generation_match=0,
                                             client=gcs_client)
        else:
            print("releasing lock at: "
                  f"gs://{lock_blob.bucket.name}/{lock_blob.name}")
            lock_blob.delete(
                if_generation_match=lock_blob.generation,
                client=gcs_client,
            )
    except google.api_core.exceptions.PreconditionFailed as err:
        raise exceptions.BacklogException(
            f"The lock at gs://{lock_blob.bucket.name}/{lock_blob.name} "
            "was changed by another process.") from err


def apply(
    gcs_client: storage.Client,
    bq_client: bigquery.Client,
    success_blob: storage.Blob,
    lock_blob: Optional[storage.Blob],
    job_id: str,
):
    """
    Apply an incremental batch to the target BigQuery table via an asynchronous
    load job or external query.

    Args:
        gcs_client: storage.Client
        bq_client: bigquery.Client
        success_blob: storage.Blob the success file whose batch should be
            applied.
        lock_blob: storage.Blob lock to reclaim for job_id (skipped if None).
        job_id: str job id to submit the load/query job under.
    """
    bkt = success_blob.bucket
    if lock_blob is not None:
        handle_bq_lock(gcs_client, lock_blob, job_id)
    dest_table_ref, _ = gcs_path_to_table_ref_and_batch(success_blob.name)
    gsurl = removesuffix(f"gs://{bkt.name}/{success_blob.name}",
                         constants.SUCCESS_FILENAME)
    print("looking for bq_transform.sql")
    # A transform SQL in _config (here or in a parent prefix) switches the
    # ingestion from a plain load job to an external query.
    external_query_sql = read_gcs_file_if_exists(
        gcs_client, f"{gsurl}_config/bq_transform.sql")
    if not external_query_sql:
        external_query_sql = look_for_config_in_parents(
            gcs_client, gsurl, "bq_transform.sql")
    if external_query_sql:
        print("EXTERNAL QUERY")
        print(f"found external query:\n{external_query_sql}")
        external_query(gcs_client, bq_client, gsurl, external_query_sql,
                       dest_table_ref, job_id)
        return

    print("LOAD_JOB")
    load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id)
integration test requiring cloud resouces (deselect with '-m "not IT"') + ORDERING: marks tests that test features related to ordering CLI: marks tests of CLI utilities addopts = --workers=auto diff --git a/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt b/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt index 7682e7da0..b86a61183 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt +++ b/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt @@ -7,4 +7,4 @@ mypy pylint pytest-parallel pytest-cov -google-cloud-pubsub +google-cloud-pubsub>=2.2.0 diff --git a/tools/cloud_functions/gcs_event_based_ingest/requirements.txt b/tools/cloud_functions/gcs_event_based_ingest/requirements.txt index c65fa4df4..7279c2550 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/requirements.txt +++ b/tools/cloud_functions/gcs_event_based_ingest/requirements.txt @@ -1,3 +1,4 @@ -google-cloud-bigquery>=2.2.0 -google-cloud-storage>=1.32.0 +google-cloud-bigquery>=2.6.0 +google-cloud-storage>=1.33.0 +google-cloud-error-reporting>=1.1.0 cachetools diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index 4adf3ba43..93a459d63 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -18,9 +18,11 @@ import uuid from typing import List +import google.api_core.exceptions import pytest -from google.cloud import bigquery, storage +from google.cloud import bigquery, error_reporting, storage +import gcs_ocn_bq_ingest.ordering import gcs_ocn_bq_ingest.utils TEST_DIR = os.path.realpath(os.path.dirname(__file__)) @@ -39,7 +41,12 @@ def gcs() -> storage.Client: return storage.Client() -@pytest.mark.usefixtures("gcs") +@pytest.fixture(scope="module") +def error() -> error_reporting.Client: + """GCS Client""" + return error_reporting.Client() + + 
@pytest.fixture def gcs_bucket(request, gcs) -> storage.bucket.Bucket: """GCS bucket for test artifacts""" @@ -60,16 +67,15 @@ def teardown(): return bucket -@pytest.mark.usefixtures("gcs_bucket") @pytest.fixture def mock_env(gcs, monkeypatch): """environment variable mocks""" # Infer project from ADC of gcs client. monkeypatch.setenv("GCP_PROJECT", gcs.project) monkeypatch.setenv("FUNCTION_NAME", "integration-test") + monkeypatch.setenv("FUNCTION_TIMEOUT_SEC", "120") -@pytest.mark.usefixtures("bq", "mock_env") @pytest.fixture def dest_dataset(request, bq, mock_env, monkeypatch): random_dataset = f"test_bq_ingest_gcf_{str(uuid.uuid4())[:8].replace('-','_')}" @@ -88,7 +94,6 @@ def teardown(): return dataset -@pytest.mark.usefixtures("bq", "mock_env", "dest_dataset") @pytest.fixture def dest_table(request, bq, mock_env, dest_dataset) -> bigquery.Table: with open(os.path.join(TEST_DIR, "resources", @@ -111,7 +116,6 @@ def teardown(): @pytest.fixture(scope="function") -@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") def gcs_data(request, gcs_bucket, dest_dataset, dest_table) -> storage.blob.Blob: data_objs = [] @@ -135,7 +139,6 @@ def teardown(): @pytest.fixture(scope="function") -@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") def gcs_data_under_sub_dirs(request, gcs_bucket, dest_dataset, dest_table) -> storage.blob.Blob: data_objs = [] @@ -151,7 +154,7 @@ def gcs_data_under_sub_dirs(request, gcs_bucket, dest_dataset, def teardown(): for do in data_objs: - if do.exists: + if do.exists(): do.delete() request.addfinalizer(teardown) @@ -159,7 +162,6 @@ def teardown(): @pytest.fixture(scope="function") -@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") def gcs_truncating_load_config(request, gcs_bucket, dest_dataset, dest_table) -> storage.blob.Blob: config_obj: storage.blob.Blob = gcs_bucket.blob("/".join([ @@ -180,7 +182,6 @@ def teardown(): @pytest.fixture(scope="function") 
-@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") def gcs_batched_data(request, gcs_bucket, dest_dataset, dest_table) -> List[storage.blob.Blob]: """ @@ -199,14 +200,13 @@ def gcs_batched_data(request, gcs_bucket, dest_dataset, def teardown(): for do in data_objs: - if do.exists: + if do.exists(): do.delete() request.addfinalizer(teardown) return [data_objs[-1], data_objs[-4]] -@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") @pytest.fixture def gcs_external_config(request, gcs_bucket, dest_dataset, dest_table) -> List[storage.blob.Blob]: @@ -249,7 +249,7 @@ def gcs_external_config(request, gcs_bucket, dest_dataset, def teardown(): for do in config_objs: - if do.exists: + if do.exists(): do.delete() request.addfinalizer(teardown) @@ -257,7 +257,6 @@ def teardown(): @pytest.fixture(scope="function") -@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_parttioned_table") def gcs_partitioned_data(request, gcs_bucket, dest_dataset, dest_partitioned_table) -> List[storage.blob.Blob]: data_objs = [] @@ -274,7 +273,8 @@ def gcs_partitioned_data(request, gcs_bucket, dest_dataset, def teardown(): for dobj in data_objs: - if dobj.exists: + # we expect some backfill files to be removed by the cloud function. + if dobj.exists(): dobj.delete() request.addfinalizer(teardown) @@ -282,7 +282,6 @@ def teardown(): @pytest.fixture(scope="function") -@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") def dest_partitioned_table(request, bq: bigquery.Client, mock_env, dest_dataset) -> bigquery.Table: public_table: bigquery.Table = bq.get_table( @@ -335,3 +334,140 @@ def bq_wait_for_rows(bq_client: bigquery.Client, table: bigquery.Table, f"{table.project}.{table.dataset_id}.{table.table_id} to " f"reach {expected_num_rows} rows." 
f"last poll returned {actual_num_rows} rows.") + + +@pytest.fixture +def dest_ordered_update_table(request, bq, mock_env, + dest_dataset) -> bigquery.Table: + with open(os.path.join(TEST_DIR, "resources", + "ordering_schema.json")) as schema_file: + schema = gcs_ocn_bq_ingest.utils.dict_to_bq_schema( + json.load(schema_file)) + + table = bigquery.Table( + f"{os.environ.get('GCP_PROJECT')}.{dest_dataset.dataset_id}" + ".cf_test_ordering", + schema=schema, + ) + + table: bigquery.Table = bq.create_table(table) + # Our test query only updates so we need to populate the first row. + bq.load_table_from_json([{"id": 1, "alpha_update": ""}], table) + + def teardown(): + bq.delete_table(table, not_found_ok=True) + + request.addfinalizer(teardown) + return table + + +@pytest.fixture(scope="function") +def gcs_ordered_update_data( + request, gcs_bucket, dest_dataset, + dest_ordered_update_table) -> List[storage.blob.Blob]: + data_objs = [] + chunks = { + "00", + "01", + "02", + } + for chunk in chunks: + for test_file in ["data.csv", "_SUCCESS"]: + data_obj: storage.blob.Blob = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, chunk, test_file + ])) + data_obj.upload_from_filename( + os.path.join(TEST_DIR, "resources", "test-data", "ordering", + chunk, test_file)) + data_objs.append(data_obj) + + def teardown(): + for dobj in data_objs: + if dobj.exists(): + dobj.delete() + + request.addfinalizer(teardown) + return list(filter(lambda do: do.name.endswith("_SUCCESS"), data_objs)) + + +@pytest.fixture(scope="function") +def gcs_backlog(request, gcs, gcs_bucket, + gcs_ordered_update_data) -> List[storage.blob.Blob]: + data_objs = [] + + for success_blob in gcs_ordered_update_data: + gcs_ocn_bq_ingest.ordering.backlog_publisher(gcs, success_blob) + backlog_blob = gcs_ocn_bq_ingest.ordering.success_blob_to_backlog_blob( + success_blob) + backlog_blob.upload_from_string("") + data_objs.append(backlog_blob) + + 
def teardown(): + for dobj in data_objs: + if dobj.exists(): + dobj.delete() + + request.addfinalizer(teardown) + return list(filter(lambda do: do.name.endswith("_SUCCESS"), data_objs)) + + +@pytest.fixture +def gcs_external_update_config(request, gcs_bucket, dest_dataset, + dest_ordered_update_table) -> storage.Blob: + config_objs = [] + sql_obj = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, + "_config", + "bq_transform.sql", + ])) + + sql = """ + UPDATE {dest_dataset}.{dest_table} dest + SET alpha_update = CONCAT(dest.alpha_update, src.alpha_update) + FROM temp_ext src + WHERE dest.id = src.id + """ + sql_obj.upload_from_string(sql) + + config_obj = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, "_config", "external.json" + ])) + + with open(os.path.join(TEST_DIR, "resources", + "ordering_schema.json")) as schema: + fields = json.load(schema) + config = { + "schema": { + "fields": fields + }, + "csvOptions": { + "allowJaggedRows": False, + "allowQuotedNewlines": False, + "encoding": "UTF-8", + "fieldDelimiter": "|", + "skipLeadingRows": 0, + }, + "sourceFormat": "CSV", + "sourceUris": ["REPLACEME"], + } + config_obj.upload_from_string(json.dumps(config)) + backfill_blob = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, + gcs_ocn_bq_ingest.constants.BACKFILL_FILENAME + ])) + backfill_blob.upload_from_string("") + config_objs.append(sql_obj) + config_objs.append(config_obj) + config_objs.append(backfill_blob) + + def teardown(): + for do in config_objs: + if do.exists(): + do.delete() + + request.addfinalizer(teardown) + return backfill_blob diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index 
712b380be..019fd848e 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -192,3 +192,27 @@ def test_flattend2dlist(test_input, expected): def test_recursive_update(original, update, expected): assert gcs_ocn_bq_ingest.utils.recursive_update(original, update) == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + "dataset/table/_SUCCESS", # flat + "dataset/table"), + ( + "dataset/table/$20201030/_SUCCESS", # partitioned + "dataset/table"), + ( + "dataset/table/$20201030/batch_id/_SUCCESS", # partitioned, batched + "dataset/table"), + ( + "dataset/table/batch_id/_SUCCESS", # batched (no partitioning) + "dataset/table"), + ("dataset/table/2020/01/02/03/batch_id/_SUCCESS", "dataset/table"), + ("project.dataset/table/2020/01/02/03/batch_id/_SUCCESS", + "project.dataset/table"), + ("dataset/table/_backlog/_BACKFILL", "dataset/table"), + ]) +def test_get_table_prefix(test_input, expected): + assert gcs_ocn_bq_ingest.utils.get_table_prefix(test_input) == expected diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py index 44a5e717a..8aadeb08b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py @@ -14,6 +14,7 @@ """integration tests for gcs_ocn_bq_ingest""" import os import time +import unittest.mock import google.cloud.exceptions import pytest @@ -22,7 +23,7 @@ import gcs_ocn_bq_ingest.main TEST_DIR = os.path.realpath(os.path.dirname(__file__) + "/..") -LOAD_JOB_POLLING_TIMEOUT = 10 # seconds +LOAD_JOB_POLLING_TIMEOUT = 20 # seconds @pytest.mark.IT @@ -67,8 +68,8 @@ def 
test_gcf_event_schema(bq, gcs_data, dest_dataset, dest_table, mock_env): @pytest.mark.IT -def test_duplicate_notification(bq, gcs_data, dest_dataset, dest_table, - mock_env): +def test_duplicate_success_notification(bq, gcs_data, dest_dataset, dest_table, + mock_env): """tests behavior with two notifications for the same success file.""" if not gcs_data.exists(): raise EnvironmentError("test data objects must exist") @@ -79,12 +80,11 @@ def test_duplicate_notification(bq, gcs_data, dest_dataset, dest_table, } } gcs_ocn_bq_ingest.main.main(test_event, None) - did_second_invocation_raise = False - try: + with unittest.mock.patch.object(google.cloud.error_reporting.Client, + "report_exception") as mock_method: gcs_ocn_bq_ingest.main.main(test_event, None) - except RuntimeError: - did_second_invocation_raise = True - assert did_second_invocation_raise + + mock_method.assert_called_once() test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation", "part-m-00001") diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py new file mode 100644 index 000000000..3608d6e08 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py @@ -0,0 +1,141 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""integration tests for the ordering behavior of backlog gcs_ocn_bq_ingest""" +import os +import queue +import time + +import pytest +from google.cloud import storage + +import gcs_ocn_bq_ingest.constants +import gcs_ocn_bq_ingest.main +import gcs_ocn_bq_ingest.ordering +import gcs_ocn_bq_ingest.utils + +TEST_DIR = os.path.realpath(os.path.dirname(__file__) + "/..") +LOAD_JOB_POLLING_TIMEOUT = 20 # seconds + + +@pytest.mark.IT +@pytest.mark.ORDERING +def test_backlog_publisher(gcs, gcs_bucket, gcs_partitioned_data, mock_env): + """Test basic functionality of backlog_publisher + Drop two success files. + Assert that both success files are added to backlog and backfill file + created. + Assert that that only one backfill file is not recreated. + """ + table_prefix = "" + # load each partition. + for gcs_data in gcs_partitioned_data: + if not gcs_data.exists(): + raise EnvironmentError("test data objects must exist") + if gcs_data.name.endswith(gcs_ocn_bq_ingest.constants.SUCCESS_FILENAME): + table_prefix = gcs_ocn_bq_ingest.utils.get_table_prefix( + gcs_data.name) + gcs_ocn_bq_ingest.ordering.backlog_publisher(gcs, gcs_data) + + expected_backlog_blobs = queue.Queue() + expected_backlog_blobs.put("/".join([ + table_prefix, "_backlog", "$2017041101", + gcs_ocn_bq_ingest.constants.SUCCESS_FILENAME + ])) + expected_backlog_blobs.put("/".join([ + table_prefix, "_backlog", "$2017041102", + gcs_ocn_bq_ingest.constants.SUCCESS_FILENAME + ])) + + for backlog_blob in gcs_bucket.list_blobs( + prefix=f"{table_prefix}/_backlog"): + assert backlog_blob.name == expected_backlog_blobs.get(block=False) + + backfill_blob: storage.Blob = gcs_bucket.blob( + f"{table_prefix}/{gcs_ocn_bq_ingest.constants.BACKFILL_FILENAME}") + assert backfill_blob.exists() + + +@pytest.mark.IT +@pytest.mark.ORDERING +def test_backlog_publisher_with_existing_backfill_file(gcs, gcs_bucket, + dest_dataset, + dest_partitioned_table, + gcs_partitioned_data, + mock_env): + """Test basic functionality of 
backlog_publisher when the backfill is + already running. It should not repost this backfill file. + """ + table_prefix = "/".join( + [dest_dataset.dataset_id, dest_partitioned_table.table_id]) + backfill_blob: storage.Blob = gcs_bucket.blob( + f"{table_prefix}/{gcs_ocn_bq_ingest.constants.BACKFILL_FILENAME}") + backfill_blob.upload_from_string("") + backfill_blob.reload() + original_backfill_blob_generation = backfill_blob.generation + table_prefix = "" + # load each partition. + for gcs_data in gcs_partitioned_data: + if not gcs_data.exists(): + raise EnvironmentError("test data objects must exist") + if gcs_data.name.endswith(gcs_ocn_bq_ingest.constants.SUCCESS_FILENAME): + table_prefix = gcs_ocn_bq_ingest.utils.get_table_prefix( + gcs_data.name) + gcs_ocn_bq_ingest.ordering.backlog_publisher(gcs, gcs_data) + + # Use of queue to test that list responses are returned in expected order. + expected_backlog_blobs = queue.Queue() + expected_backlog_blobs.put("/".join([ + table_prefix, "_backlog", "$2017041101", + gcs_ocn_bq_ingest.constants.SUCCESS_FILENAME + ])) + expected_backlog_blobs.put("/".join([ + table_prefix, "_backlog", "$2017041102", + gcs_ocn_bq_ingest.constants.SUCCESS_FILENAME + ])) + + for backlog_blob in gcs_bucket.list_blobs( + prefix=f"{table_prefix}/_backlog"): + assert backlog_blob.name == expected_backlog_blobs.get(block=False) + + backfill_blob.reload() + assert backfill_blob.generation == original_backfill_blob_generation + + +@pytest.mark.IT +@pytest.mark.ORDERING +def test_single_backlog_subscriber_in_order(bq, gcs, gcs_bucket, error, + dest_ordered_update_table, + gcs_ordered_update_data, + gcs_external_update_config, + gcs_backlog, mock_env): + """Test basic functionality of backlog subscriber. + Populate a backlog with 3 files that make updates where we can assert + that these jobs were applied in order. 
+ """ + gcs_ocn_bq_ingest.ordering.backlog_subscriber(gcs, bq, error, + gcs_external_update_config, + time.monotonic()) + backlog_blobs = gcs_bucket.list_blobs( + prefix=f"{gcs_ocn_bq_ingest.utils.get_table_prefix(gcs_external_update_config.name)}/_backlog/" + ) + assert backlog_blobs.num_results == 0, "backlog is not empty" + rows = bq.query("SELECT alpha_update FROM " + f"{dest_ordered_update_table.dataset_id}" + f".{dest_ordered_update_table.table_id}") + expected_num_rows = 1 + num_rows = 0 + for row in rows: + num_rows += 1 + assert row["alpha_update"] == "ABC", "incrementals not applied in order" + assert num_rows == expected_num_rows diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/ordering_schema.json b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/ordering_schema.json new file mode 100644 index 000000000..ea54a4eed --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/ordering_schema.json @@ -0,0 +1,10 @@ +[ + { + "name": "id", + "type": "INT64" + }, + { + "name": "alpha_update", + "type": "STRING" + } +] diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/_SUCCESS b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/data.csv b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/data.csv new file mode 100644 index 000000000..6b4f72558 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/data.csv @@ -0,0 +1 @@ +1|A diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/_SUCCESS b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git 
a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/data.csv b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/data.csv new file mode 100644 index 000000000..3b4f35bfc --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/data.csv @@ -0,0 +1 @@ +1|B diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/_SUCCESS b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/data.csv b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/data.csv new file mode 100644 index 000000000..ecf1eb9e0 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/data.csv @@ -0,0 +1 @@ +1|C From 3c798f76be98ed4fe8c7577d9911a5cac194b655 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 8 Dec 2020 18:52:02 -0800 Subject: [PATCH 17/90] fixup! 
mypy pylint --- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py index a0db05425..aa1e67f33 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py @@ -108,7 +108,7 @@ START_BACKFILL_FILENAME, } -RESTART_BUFFER_SECONDS = os.getenv("RESTART_BUFFER_SECONDS", 30) +RESTART_BUFFER_SECONDS = int(os.getenv("RESTART_BUFFER_SECONDS", "30")) ORDER_ALL_JOBS = bool( distutils.util.strtobool(os.getenv("ORDER_ALL_JOBS", "False"))) From c631150373d255ceb2666d47ed1e1613c9edd6ba Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 8 Dec 2020 18:54:18 -0800 Subject: [PATCH 18/90] fixup! flake8 --- tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index 93a459d63..a06d21b36 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -18,7 +18,6 @@ import uuid from typing import List -import google.api_core.exceptions import pytest from google.cloud import bigquery, error_reporting, storage From d5fabfa1497e64ce834725f776929d61a2adde80 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 8 Dec 2020 18:56:17 -0800 Subject: [PATCH 19/90] fixup! 
mypy tests --- tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index a06d21b36..46fe1d9ef 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -349,7 +349,7 @@ def dest_ordered_update_table(request, bq, mock_env, schema=schema, ) - table: bigquery.Table = bq.create_table(table) + table = bq.create_table(table) # Our test query only updates so we need to populate the first row. bq.load_table_from_json([{"id": 1, "alpha_update": ""}], table) From 1c26e2362a7c96e707eebb9201f02be77b0a2034 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 9 Dec 2020 11:31:15 -0800 Subject: [PATCH 20/90] support _config/*.sql for bq tranform sql --- .../gcs_event_based_ingest/.gitignore | 1 + .../gcs_ocn_bq_ingest/constants.py | 2 ++ .../gcs_ocn_bq_ingest/utils.py | 30 +++++++++++++++---- 3 files changed, 27 insertions(+), 6 deletions(-) create mode 100644 tools/cloud_functions/gcs_event_based_ingest/.gitignore diff --git a/tools/cloud_functions/gcs_event_based_ingest/.gitignore b/tools/cloud_functions/gcs_event_based_ingest/.gitignore new file mode 100644 index 000000000..8ca3bf9ba --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/.gitignore @@ -0,0 +1 @@ +prof/ diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py index aa1e67f33..32f2238b1 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py @@ -112,3 +112,5 @@ ORDER_ALL_JOBS = bool( distutils.util.strtobool(os.getenv("ORDER_ALL_JOBS", "False"))) + +BQ_TRANSFORM_SQL="*.sql" \ No newline 
at end of file diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index f38555846..7953f2358 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -19,6 +19,7 @@ import collections import collections.abc import copy +import fnmatch import json import os import pathlib @@ -42,7 +43,7 @@ def external_query( # pylint: disable=too-many-arguments query: str, dest_table_ref: bigquery.TableReference, job_id: str): """Load from query over external table from GCS. - This hinges on a SQL query defined in GCS at _config/bq_transform.sql and + This hinges on a SQL query defined in GCS at _config/*.sql and an external table definition _config/external.json (otherwise will assume CSV external table) """ @@ -126,11 +127,27 @@ def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id): def _get_parent_config_file(storage_client, config_filename, bucket, path): + bkt = storage_client.lookup_bucket(bucket) config_dir_name = "_config" parent_path = pathlib.Path(path).parent - config_path = parent_path / config_dir_name / config_filename + config_path = parent_path / config_dir_name + config_file_path = config_path / config_filename + # Handle wild card (to support bq transform sql with different names). 
+ if "*" in config_filename: + matches: List[storage.Blob] = list(filter( + lambda blob: fnmatch.fnmatch(blob.name, config_filename), + bkt.list_blobs(prefix=config_path))) + if matches: + if len(matches) > 1: + raise RuntimeError( + f"Multiple matches for gs://{bucket}/{config_file_path}" + ) + return read_gcs_file_if_exists(storage_client, + f"gs://{bucket}/{matches[0].name}") + else: + return None return read_gcs_file_if_exists(storage_client, - f"gs://{bucket}/{config_path}") + f"gs://{bucket}/{config_file_path}") def look_for_config_in_parents(storage_client: storage.Client, gsurl: str, @@ -678,12 +695,13 @@ def apply( dest_table_ref, _ = gcs_path_to_table_ref_and_batch(success_blob.name) gsurl = removesuffix(f"gs://{bkt.name}/{success_blob.name}", constants.SUCCESS_FILENAME) - print("looking for bq_transform.sql") + print( + f"looking for a transformation tranformation sql file in parent _config.") external_query_sql = read_gcs_file_if_exists( - gcs_client, f"{gsurl}_config/bq_transform.sql") + gcs_client, f"{gsurl}_config/*.sql") if not external_query_sql: external_query_sql = look_for_config_in_parents(gcs_client, gsurl, - "bq_transform.sql") + "*.sql") if external_query_sql: print("EXTERNAL QUERY") print(f"found external query:\n{external_query_sql}") From d16fb1bd5614190bfe81910c507f2e9c85bcf586 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 9 Dec 2020 12:00:17 -0800 Subject: [PATCH 21/90] improve performance of wait_on_bq_job --- .../gcs_ocn_bq_ingest/constants.py | 6 ++--- .../gcs_ocn_bq_ingest/ordering.py | 11 ++++---- .../gcs_ocn_bq_ingest/utils.py | 27 ++++++++++--------- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py index 32f2238b1..e7eb75ff5 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py +++ 
b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py @@ -23,14 +23,14 @@ import google.api_core.client_info import google.cloud.exceptions -# Will wait up to this polling for errors before exiting +# Will wait up to this long polling for errors in a bq job before exiting # This is to check if job fail quickly, not to assert it succeed. # This may not be honored if longer than cloud function timeout. # https://cloud.google.com/functions/docs/concepts/exec#timeout # One might consider lowering this to 1-2 seconds to lower the # upper bound of expected execution time to stay within the free tier. # https://cloud.google.com/functions/pricing#free_tier -WAIT_FOR_JOB_SECONDS = int(os.getenv("WAIT_FOR_JOB_SECONDS", "5")) +WAIT_FOR_JOB_SECONDS = int(os.getenv("WAIT_FOR_JOB_SECONDS", "1")) DEFAULT_EXTERNAL_TABLE_DEFINITION = { # The default must be a self describing data format @@ -113,4 +113,4 @@ ORDER_ALL_JOBS = bool( distutils.util.strtobool(os.getenv("ORDER_ALL_JOBS", "False"))) -BQ_TRANSFORM_SQL="*.sql" \ No newline at end of file +BQ_TRANSFORM_SQL = "*.sql" diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py index 310eb1f52..468915831 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py @@ -79,11 +79,12 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, "setting the timeout to 540 seconds or at least " "1 minute (Cloud Functions default).") while time.monotonic() < restart_time - polling_timeout: - job_id = utils.read_gcs_file_if_exists( + lock_contents = utils.read_gcs_file_if_exists( gcs_client, f"gs://{bkt.name}/{lock_blob.name}") - if job_id: - if job_id.startswith( + if lock_contents: + if lock_contents.startswith( os.getenv('JOB_PREFIX', constants.DEFAULT_JOB_PREFIX)): + job_id = 
lock_contents try: last_job_done = utils.wait_on_bq_job_id( bq_client, job_id, polling_timeout) @@ -110,9 +111,7 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, else: print(f"sleeping for {polling_timeout} seconds because" f"found manual lock gs://{bkt.name}/{lock_blob.name} with" - "contents:\n" - f"""{utils.read_gcs_file_if_exists(gcs_client, - f'gs://{lock_blob.bucket.name}/{lock_blob.name}')}""") + f"contents:\n {lock_contents}") time.sleep(polling_timeout) continue if last_job_done: diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index 7953f2358..ae29f4b4f 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -55,10 +55,12 @@ def external_query( # pylint: disable=too-many-arguments if external_table_config: external_table_def = json.loads(external_table_config) else: - print(f"Falling back to default CSV external table." - f" {gsurl}_config/external.json not found.") + print(f" {gsurl}_config/external.json not found in parents of {gsurl}." + "Falling back to default PARQUET external table:\n" + f"{json.dumps(constants.DEFAULT_EXTERNAL_TABLE_DEFINITION)}") external_table_def = constants.DEFAULT_EXTERNAL_TABLE_DEFINITION + # This may cause an issue if >10,000 files. 
however, we external_table_def["sourceUris"] = flatten2dlist( get_batches_for_prefix(gcs_client, gsurl)) print(f"external table def = {json.dumps(external_table_config, indent=2)}") @@ -88,6 +90,8 @@ def external_query( # pylint: disable=too-many-arguments if job.errors: raise exceptions.BigQueryJobFailure( f"query job {job.job_id} failed quickly: {job.errors}") + if job.state == "DONE": + return time.sleep(constants.JOB_POLL_INTERVAL_SECONDS) @@ -134,18 +138,16 @@ def _get_parent_config_file(storage_client, config_filename, bucket, path): config_file_path = config_path / config_filename # Handle wild card (to support bq transform sql with different names). if "*" in config_filename: - matches: List[storage.Blob] = list(filter( - lambda blob: fnmatch.fnmatch(blob.name, config_filename), - bkt.list_blobs(prefix=config_path))) + matches: List[storage.Blob] = list( + filter(lambda blob: fnmatch.fnmatch(blob.name, config_filename), + bkt.list_blobs(prefix=config_path))) if matches: if len(matches) > 1: raise RuntimeError( - f"Multiple matches for gs://{bucket}/{config_file_path}" - ) + f"Multiple matches for gs://{bucket}/{config_file_path}") return read_gcs_file_if_exists(storage_client, f"gs://{bucket}/{matches[0].name}") - else: - return None + return None return read_gcs_file_if_exists(storage_client, f"gs://{bucket}/{config_file_path}") @@ -696,9 +698,10 @@ def apply( gsurl = removesuffix(f"gs://{bkt.name}/{success_blob.name}", constants.SUCCESS_FILENAME) print( - f"looking for a transformation tranformation sql file in parent _config.") - external_query_sql = read_gcs_file_if_exists( - gcs_client, f"{gsurl}_config/*.sql") + "looking for a transformation tranformation sql file in parent _config." 
+ ) + external_query_sql = read_gcs_file_if_exists(gcs_client, + f"{gsurl}_config/*.sql") if not external_query_sql: external_query_sql = look_for_config_in_parents(gcs_client, gsurl, "*.sql") From c627af029b8062168930c20b71861fefacb9c296 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 9 Dec 2020 17:22:03 -0800 Subject: [PATCH 22/90] wip --- .../gcs_event_based_ingest/ORDERING.md | 107 ++++++++++++++++++ .../gcs_event_based_ingest/README.md | 6 + .../gcs_ocn_bq_ingest/README.md | 11 +- .../gcs_ocn_bq_ingest/constants.py | 7 +- .../gcs_ocn_bq_ingest/main.py | 57 +++++++--- .../gcs_ocn_bq_ingest/ordering.py | 91 ++++++++------- .../test_gcs_ocn_bq_ingest.py | 20 ++++ .../gcs_ocn_bq_ingest/test_ordering_it.py | 2 +- 8 files changed, 243 insertions(+), 58 deletions(-) create mode 100644 tools/cloud_functions/gcs_event_based_ingest/ORDERING.md diff --git a/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md b/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md new file mode 100644 index 000000000..24a20fcd7 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md @@ -0,0 +1,107 @@ +# Ordering Batches +There are use cases where it is important for incremental batches get +applied in order rather than as soon as they are uploaded to GCS (which is the +default behavior of this solution). +1. When using External Query that performs DML other than insert only. +(e.g. an `UPDATE` assumes that prior batches have already been committed) +1. To ensure that there are not time gaps in the data (e.g. ensure that +2020/01/02 data is not committed to BigQuery before 2020/01/01, or similarly +that 00 hour is ingested before the 01 hour, etc.) 
+ +This Cloud Function supports serializing the submission of ingestion jobs to +BigQuery by using Google Cloud Storage's consistency guarantees to provide a +pessimistic lock on a table to prevent concurrent jobs and +[GCS Object.list](https://cloud.google.com/storage/docs/json_api/v1/objects/list) +lexicographic sorting of results to providing ordering gurantees. +The solution involves a table level `_backlog/` directory to keep track +of success files whose batches have not yet been committed to BigQuery and +a table level `_bqlock` file to keep track of what job is currently ingesting to +that table. This way we can make our Cloud Function idempotent by having all the +state stored in GCS so we can safely retrigger it to skirt the Cloud Functions +timeout. + +## Assumptions +This ordering solution assumes that you want to apply batches in lexicographic +order. This is usually the case because path names usually contain some sort of +date / hour information. + +## Enabling Ordering +### Environment Variable +Ordering can be enabled at the function level by setting the `ORDER_PER_TABLE` +environment variable to `"True"`. +### Config File +Ordering can be configured at any level of your naming convention (e.g. dataset +table or some sub-path) by placing a `_config/ORDERME` file. This can be helpful +in scenarios where your historical load can be processed safely in parallel but +incrementals must be ordered. +For example: +```text +gs://${BUCKET}/${DATASET}/${TABLE}/historical/_config/load.json +gs://${BUCKET}/${DATASET}/${TABLE}/incremental/_config/external.json +gs://${BUCKET}/${DATASET}/${TABLE}/incremental/_config/bq_transform.sql +gs://${BUCKET}/${DATASET}/${TABLE}/incremental/_config/ORDERME +``` + +## Dealing With Out of Order Publishing to GCS During Historical Load +In some use cases, there is a period where incrementals that must be applied in +order are uploaded in parallel (meaning their _SUCCESS files are expected to be +out of order). 
This typically happens during some historical backfill period. +This can be solved by setting the `START_BACKFILL_FILENAME` environment +variable to a file name that indicates that the parallel upload of historical +incrementals is complete (e.g. `_HISTORYDONE`). This will cause all success +files for a table to be added to the backlog until the `_HISTORYDONE` file is +dropped at the table level. At that point the backlog subscriber will begin +processing the batches in order. + +## Batch Failure Behavior +When ordering is enabled, if the BQ job to apply a batch failed, it is not safe +to continue to ingest the next batch. The Cloud Function will leave the +`_bqlock` file and stop trying to process the backlog. The Cloud function +will report an exception like this which should be alerted on as the ingestion +process for the table will be deadlocked until there is human intervention to +address the failed batch: +```text + f"previous BigQuery job: {job_id} failed or could not " + "be found. This will kill the backfill subscriber for " + f"the table prefix {table_prefix}." + "Once the issue is dealt with by a human, the lock" + "file at: " + f"gs://{lock_blob.bucket.name}/{lock_blob.name} " + "should be manually removed and a new empty _BACKFILL" + "file uploaded to:" + f"gs://{lock_blob.bucket.name}/{table_prefix}/_BACKFILL" + f"to resume the backfill subscriber so it can " + "continue with the next item in the backlog.\n" + "Original Exception:\n" + f"{traceback.format_exc()}") +``` + +## Ordering Mechanics Explained +We've treated ordering incremental commits to table as a variation on the +[Producer-Consumer Problem](https://en.wikipedia.org/wiki/Producer%E2%80%93consumer_problem) +Where we have multiple producers (each call of Backlog Publisher) and a single +Consumer (the Backlog Subscriber which is enforced to be a singleton per table +with a claim file). Our solution is to use GCS `_backlog` directory as our queue +and `_bqlock` as a mutex. 
+ +### Backlog Publisher +The Backlog Publisher has two responsibilities: +1. add incoming success files to a +table's `_backlog` so they are not "forgotten" by the ingestion system. +1. if there is a non-empty backlog start the backfill subscriber (if one is not +already running). This is accomplished by dropping a table level `_BACKFILL` file. + +### Backlog Subscriber +The Backlog Subscriber is responsible for keeping track of BigQuery jobs running +on a table and ensure that batches are committed in order. When the backlog is +not empty for a table the backlog subscriber should be running for that table. +It will either be polling a `RUNNING` BigQuery job for completion, or submitting +the next batch in the `_backlog`. + +The state of what BigQuery job is currently running on a table is kept in a +`_bqlock` file at the table prefix. + +In order to escape the maximum nine-minute (540s) Cloud Function Timeout, the +backfill subscriber will re-trigger itself by posting a new `_BACKFILL` file +until the `_backlog` for the table prefix is empty. When a new success file +arrives it is the reponsibility of the publisher to restart the subscriber. diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/README.md index 2477cc4f5..5dcedf5c9 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -389,6 +389,12 @@ In theory, one could set up Pub/Sub notifications from multiple GCS Buckets Pub/Sub topic so that data uploaded to any of these buckets could get automatically loaded to BigQuery by a single deployment of the Cloud Function. +## Ordering Guarantees +It is possible to configure the Cloud Function to apply incrementals in order if +this is crucial to your data integrity. This naturally comes with a performance +penalty as for a given table we cannot parallelize ingestion of batches. 
+The ordering behavior and options are described in detail in [ORDERING.md](ORDERING.md) + ## Backfill There are some cases where you may have data already copied to GCS according to the naming convention / with success files before the Object Change diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md index 1252b1dda..a1f417d7b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md @@ -9,7 +9,8 @@ BigQuery Table. 1. [Pub/Sub Notification](https://cloud.google.com/storage/docs/pubsub-notifications) object finalize. 1. Cloud Function subscribes to notifications and ingests all the data into -BigQuery a directory once a `_SUCCESS` file arrives. +BigQuery from a GCS prefix once a `_SUCCESS` file arrives. The success file name +is configurable with environment variable. ## Deployment @@ -32,9 +33,13 @@ following default behavior. | `MAX_BATCH_BYTES` | Max bytes for BigQuery Load job | `15000000000000` ([15 TB](https://cloud.google.com/bigquery/quotas#load_jobs)| | `JOB_PREFIX` | Prefix for BigQuery Job IDs | `gcf-ingest-` | | `BQ_PROJECT` | Default BQ project to use if not specified in dataset capturing group | Project where Cloud Function is deployed | -| `ORDERED_PER_TABLE` | Force jobs to be executed sequentially (rather than parallel) based on the backlog. This is the same as having an `ORDERME` file in every config directory | `False` | - +| `ORDER_PER_TABLE`\* | Force jobs to be executed sequentially (rather than parallel) based on the backlog. This is the same as having an `ORDERME` file in every config directory | `False` | +| `START_BACKFILL_FILENAME`\*| Block submitting BigQuery Jobs for a table until this file is present at the table prefix. By default this will not happen. 
| `None` | +| `RESTART_BUFFER_SECONDS`\* | Buffer before Cloud Function timeout to leave before re-triggering the backfill subscriber | 30 | +\* only affect the behavior when ordering is enabled for a table. +See [ORDERING.md](../ORDERING.md) + ## Implementation notes 1. To support notifications based on a GCS prefix (rather than every object in the bucket), we chose to use manually diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py index e7eb75ff5..0936d3f14 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py @@ -72,6 +72,7 @@ DEFAULT_DESTINATION_REGEX = ( r"^(?P[\w\-\._0-9]+)/" # dataset (required) r"(?P
[\w\-_0-9]+)/?" # table name (required) + r"(?:historical|incremental)?/?" # break up hist v.s. inc to separate prefixes (optional) r"(?P\$[0-9]+)?/?" # partition decorator (optional) r"(?:" # [begin] yyyy/mm/dd/hh/ group (optional) r"(?P[0-9]{4})/?" # partition year (yyyy) (optional) @@ -110,7 +111,9 @@ RESTART_BUFFER_SECONDS = int(os.getenv("RESTART_BUFFER_SECONDS", "30")) -ORDER_ALL_JOBS = bool( - distutils.util.strtobool(os.getenv("ORDER_ALL_JOBS", "False"))) +ORDER_PER_TABLE = bool( + distutils.util.strtobool(os.getenv("ORDER_PER_TABLE", "False"))) BQ_TRANSFORM_SQL = "*.sql" + +ENSURE_SUBSCRIBER_SECONDS = 10 diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 199ac47a8..60e22c77b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -48,7 +48,8 @@ def main(event: Dict, context): # pylint: disable=unused-argument basename_object_id = os.path.basename(object_id) - # Exit eagerly if this is not a file to take action on. + # Exit eagerly if this is not a file to take action on + # (e.g. a data, config, or lock file) if basename_object_id not in constants.ACTION_FILENAMES: action_filenames = constants.ACTION_FILENAMES if constants.START_BACKFILL_FILENAME is None: @@ -57,20 +58,11 @@ def main(event: Dict, context): # pylint: disable=unused-argument f"{action_filenames} file.") return - # Ignore success files in the backlog directory - if (basename_object_id == constants.SUCCESS_FILENAME - and "/_backlog/" in object_id): - print(f"No-op. 
This notification was for " - f"gs://{bucket_id}/{object_id} a" - f"{constants.SUCCESS_FILENAME} in a" - "/_backlog/ directory.") - return - gcs_client = lazy_gcs_client() bq_client = lazy_bq_client() table_ref, batch = utils.gcs_path_to_table_ref_and_batch(object_id) - enforce_ordering = (constants.ORDER_ALL_JOBS + enforce_ordering = (constants.ORDER_PER_TABLE or utils.look_for_config_in_parents( gcs_client, f"gs://{bucket_id}/{object_id}", "ORDERME") is not None) @@ -78,6 +70,38 @@ def main(event: Dict, context): # pylint: disable=unused-argument bkt: storage.Bucket = utils.cached_get_bucket(gcs_client, bucket_id) event_blob: storage.Blob = bkt.blob(object_id) + # For SUCCESS files in a backlog directory, ensure that subscriber is + # running. + if ( + basename_object_id == constants.SUCCESS_FILENAME + and "/_backlog/" in object_id + ): + print(f"This notification was for " + f"gs://{bucket_id}/{object_id} a" + f"{constants.SUCCESS_FILENAME} in a" + "/_backlog/ directory. Ensuring that subscriber is running.") + # Handle rare race condition where: + # 1. subscriber reads an empty backlog (before it can delete the + # _BACKFILL blob...) + # 2. a new item is added to the backlog (causing a separate function + # invocation) + # 3. In this new invocation we reach this point in the code path and + # start_subscriber_if_not_running sees the old _BACKFILL and does + # not create a new one. + # 4. The subscriber deletes the _BACKFILL blob and exits without + # processing the new item on the backlog from #2. 
+ backfill_blob = ordering.start_backfill_subscriber_if_not_running( + gcs_client, bkt, utils.get_table_prefix(object_id)) + + time.sleep(constants.ENSURE_SUBSCRIBER_SECONDS) + while not utils.wait_on_gcs_blob( + gcs_client, backfill_blob, constants.ENSURE_SUBSCRIBER_SECONDS + ): + backfill_blob =\ + ordering.start_backfill_subscriber_if_not_running( + gcs_client, bkt, utils.get_table_prefix(object_id)) + return + if enforce_ordering: if (constants.START_BACKFILL_FILENAME and basename_object_id == constants.START_BACKFILL_FILENAME): @@ -89,7 +113,6 @@ def main(event: Dict, context): # pylint: disable=unused-argument ordering.backlog_publisher(gcs_client, event_blob) elif basename_object_id == constants.BACKFILL_FILENAME: ordering.backlog_subscriber(gcs_client, bq_client, - lazy_error_reporting_client(), event_blob, function_start_time) else: # Default behavior submit job as soon as success file lands. bkt = utils.cached_get_bucket(gcs_client, bucket_id) @@ -102,10 +125,16 @@ def main(event: Dict, context): # pylint: disable=unused-argument None, # None lock blob as there is no serialization required. utils.create_job_id(table_ref, batch)) # Unexpected exceptions will actually raise which may cause a cold restart. - except tuple(exceptions.EXCEPTIONS_TO_REPORT): + except tuple(exceptions.EXCEPTIONS_TO_REPORT) as original_error: # We do this because we know these errors do not require a cold restart # of the cloud function. - lazy_error_reporting_client().report_exception() + try: + lazy_error_reporting_client().report_exception() + except Exception: # pylint: disable=broad-except + # This mostly handles the case where error reporting API is not + # enabled or IAM permissions did not allow us to report errors with + # error reporting API. 
+ raise original_error def lazy_error_reporting_client() -> error_reporting.Client: diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py index 468915831..2d4ade6be 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py @@ -19,6 +19,7 @@ import os import time import traceback +from typing import Optional import google.api_core import google.api_core.exceptions @@ -42,20 +43,12 @@ def backlog_publisher( print(f"added gs://{backlog_blob.bucket.name}/{backlog_blob.name} " "to the backlog.") - start_backfill = True table_prefix = utils.get_table_prefix(event_blob.name) - if constants.START_BACKFILL_FILENAME: - start_backfill_blob = bkt.blob( - f"{table_prefix}/{constants.START_BACKFILL_FILENAME}") - start_backfill = start_backfill_blob.exists() - - if start_backfill: - start_backfill_subscriber_if_not_running(gcs_client, bkt, table_prefix) + start_backfill_subscriber_if_not_running(gcs_client, bkt, table_prefix) # pylint: disable=too-many-arguments,too-many-locals def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, - error_client: error_reporting.Client, backfill_blob: storage.Blob, function_start_time: float): """Pick up the table lock, poll BQ job id until completion and process next item in the backlog. @@ -90,40 +83,47 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, bq_client, job_id, polling_timeout) except (exceptions.BigQueryJobFailure, google.api_core.exceptions.NotFound): - last_job_done = False - error_client.report( + raise exceptions.BigQueryJobFailure( f"previous BigQuery job: {job_id} failed or could not " "be found. This will kill the backfill subscriber for " - f"the table prefix {table_prefix}." 
- "Once the issue is dealt with by a human, the lock" + f"the table prefix: {table_prefix}." + "Once the issue is dealt with by a human, the lock " "file at: " f"gs://{lock_blob.bucket.name}/{lock_blob.name} " "should be manually removed and a new empty " - f"{constants.BACKFILL_FILENAME}" - "file uploaded to:" - f"gs://{lock_blob.bucket.name}/{table_prefix}/_BACKFILL" + f"{constants.BACKFILL_FILENAME} " + "file uploaded to: " + f"gs://{backfill_blob.bucket.name}/{table_prefix}" + "/_BACKFILL " f"to resume the backfill subscriber so it can " "continue with the next item in the backlog.\n" "Original Exception:\n" f"{traceback.format_exc()}") - time.sleep(polling_timeout) - continue else: print(f"sleeping for {polling_timeout} seconds because" f"found manual lock gs://{bkt.name}/{lock_blob.name} with" - f"contents:\n {lock_contents}") + f"contents:\n {lock_contents}. This will be an infinite" + "loop until the manual lock is released.") time.sleep(polling_timeout) continue if last_job_done: utils.remove_oldest_backlog_item(gcs_client, bkt, table_prefix) last_job_done = False + check_backlog_time = time.monotonic() next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt, table_prefix) if not next_backlog_file: - print(f"backlog is empty for gs://{bkt.name}/{table_prefix}." - "baclog subscriber exiting.") + backfill_blob.delete(if_generation_match=backfill_blob.generation) + if time.monotonic() > check_backlog_time: + raise exceptions.BacklogException( + "Please check if the backlog is empty for " + f"gs://${bkt.name}/{table_prefix}/_backlog/" + "There was more than {}" + ) utils.handle_bq_lock(gcs_client, lock_blob, None) + print(f"backlog is empty for gs://{bkt.name}/{table_prefix}. 
" + "backlog subscriber exiting.") return next_success_file: storage.Blob = bkt.blob( next_backlog_file.name.replace("/_backlog/", "/")) @@ -144,28 +144,43 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, backfill_blob.upload_from_string("") -def start_backfill_subscriber_if_not_running(gcs_client: storage.Client, - bkt: storage.Bucket, - table_prefix: str): +def start_backfill_subscriber_if_not_running( + gcs_client: storage.Client, + bkt: storage.Bucket, + table_prefix: str +) -> Optional[storage.Blob]: """start the backfill subscriber if it is not already runnning for this table prefix. created a backfill file for the table prefix if not exists. """ - # Create a _BACKFILL file for this table if not exists - backfill_blob = bkt.blob(f"{table_prefix}/{constants.BACKFILL_FILENAME}") - try: - backfill_blob.upload_from_string("", - if_generation_match=0, - client=gcs_client) - print("triggered backfill with " - f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} " - f"created at {backfill_blob.time_created}. exiting. ") - except google.api_core.exceptions.PreconditionFailed: - backfill_blob.reload() - print("backfill already in progress due to: " - f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} " - f"created at {backfill_blob.time_created}. exiting.") + start_backfill = True + # Do not start subscriber until START_BACKFILL_FILENAME has been dropped + # at the table prefix. 
+ if constants.START_BACKFILL_FILENAME: + start_backfill_blob = bkt.blob( + f"{table_prefix}/{constants.START_BACKFILL_FILENAME}") + start_backfill = start_backfill_blob.exists() + + if start_backfill: + # Create a _BACKFILL file for this table if not exists + backfill_blob = bkt.blob(f"{table_prefix}/{constants.BACKFILL_FILENAME}") + try: + backfill_blob.upload_from_string("", + if_generation_match=0, + client=gcs_client) + print("triggered backfill with " + f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} " + f"created at {backfill_blob.time_created}. exiting. ") + return backfill_blob + except google.api_core.exceptions.PreconditionFailed: + backfill_blob.reload() + print("backfill already in progress due to: " + f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} " + f"created at {backfill_blob.time_created}. exiting.") + return backfill_blob + else: + return None def success_blob_to_backlog_blob(success_blob: storage.Blob) -> storage.Blob: diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index 019fd848e..49f76389f 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -97,6 +97,26 @@ "hh": "03", "batch": "batch_id" }), + ("project.dataset/table/historical/2020/01/02/03/batch_id/_SUCCESS", { + "dataset": "project.dataset", + "table": "table", + "partition": None, + "yyyy": "2020", + "mm": "01", + "dd": "02", + "hh": "03", + "batch": "batch_id" + }), + ("project.dataset/table/incremental/2020/01/02/04/batch_id/_SUCCESS", { + "dataset": "project.dataset", + "table": "table", + "partition": None, + "yyyy": "2020", + "mm": "01", + "dd": "02", + "hh": "04", + "batch": "batch_id" + }), ]) def test_default_destination_regex(test_input: str, expected: 
Dict[str, Optional[str]]): diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py index 3608d6e08..c3cb23585 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py @@ -123,7 +123,7 @@ def test_single_backlog_subscriber_in_order(bq, gcs, gcs_bucket, error, Populate a backlog with 3 files that make updates where we can assert that these jobs were applied in order. """ - gcs_ocn_bq_ingest.ordering.backlog_subscriber(gcs, bq, error, + gcs_ocn_bq_ingest.ordering.backlog_subscriber(gcs, bq, gcs_external_update_config, time.monotonic()) backlog_blobs = gcs_bucket.list_blobs( From 35e26d9a8a1f82a847efdd9b09e118e7a25e6cdf Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 9 Dec 2020 18:13:42 -0800 Subject: [PATCH 23/90] fixup! 
handle race condition --- .../gcs_ocn_bq_ingest/constants.py | 2 +- .../gcs_ocn_bq_ingest/main.py | 48 +++++---------- .../gcs_ocn_bq_ingest/ordering.py | 59 +++++++++++++++++-- 3 files changed, 72 insertions(+), 37 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py index 0936d3f14..908d0e854 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py @@ -116,4 +116,4 @@ BQ_TRANSFORM_SQL = "*.sql" -ENSURE_SUBSCRIBER_SECONDS = 10 +ENSURE_SUBSCRIBER_SECONDS = 5 diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 60e22c77b..163fa0629 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -70,39 +70,21 @@ def main(event: Dict, context): # pylint: disable=unused-argument bkt: storage.Bucket = utils.cached_get_bucket(gcs_client, bucket_id) event_blob: storage.Blob = bkt.blob(object_id) - # For SUCCESS files in a backlog directory, ensure that subscriber is - # running. - if ( - basename_object_id == constants.SUCCESS_FILENAME - and "/_backlog/" in object_id - ): - print(f"This notification was for " - f"gs://{bucket_id}/{object_id} a" - f"{constants.SUCCESS_FILENAME} in a" - "/_backlog/ directory. Ensuring that subscriber is running.") - # Handle rare race condition where: - # 1. subscriber reads an empty backlog (before it can delete the - # _BACKFILL blob...) - # 2. a new item is added to the backlog (causing a separate function - # invocation) - # 3. In this new invocation we reach this point in the code path and - # start_subscriber_if_not_running sees the old _BACKFILL and does - # not create a new one. - # 4. 
The subscriber deletes the _BACKFILL blob and exits without - # processing the new item on the backlog from #2. - backfill_blob = ordering.start_backfill_subscriber_if_not_running( - gcs_client, bkt, utils.get_table_prefix(object_id)) - - time.sleep(constants.ENSURE_SUBSCRIBER_SECONDS) - while not utils.wait_on_gcs_blob( - gcs_client, backfill_blob, constants.ENSURE_SUBSCRIBER_SECONDS - ): - backfill_blob =\ - ordering.start_backfill_subscriber_if_not_running( - gcs_client, bkt, utils.get_table_prefix(object_id)) - return - if enforce_ordering: + # For SUCCESS files in a backlog directory, ensure that subscriber + # is running. + if ( + basename_object_id == constants.SUCCESS_FILENAME + and "/_backlog/" in object_id + ): + print(f"This notification was for " + f"gs://{bucket_id}/{object_id} a" + f"{constants.SUCCESS_FILENAME} in a" + "/_backlog/ directory. " + f"Watiting {constants.ENSURE_SUBSCRIBER_SECONDS} seconds to " + "ensure that subscriber is running.") + ordering.subscriber_monitor(gcs_client, bkt, object_id) + return if (constants.START_BACKFILL_FILENAME and basename_object_id == constants.START_BACKFILL_FILENAME): # This will be the first backfill file. @@ -111,9 +93,11 @@ def main(event: Dict, context): # pylint: disable=unused-argument return if basename_object_id == constants.SUCCESS_FILENAME: ordering.backlog_publisher(gcs_client, event_blob) + return elif basename_object_id == constants.BACKFILL_FILENAME: ordering.backlog_subscriber(gcs_client, bq_client, event_blob, function_start_time) + return else: # Default behavior submit job as soon as success file lands. 
bkt = utils.cached_get_bucket(gcs_client, bucket_id) success_blob: storage.Blob = bkt.blob(object_id) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py index 2d4ade6be..cbc21d3ac 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py @@ -115,12 +115,25 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, table_prefix) if not next_backlog_file: backfill_blob.delete(if_generation_match=backfill_blob.generation) - if time.monotonic() > check_backlog_time: - raise exceptions.BacklogException( - "Please check if the backlog is empty for " + if ( + check_backlog_time + constants.ENSURE_SUBSCRIBER_SECONDS < + time.monotonic() + ): + print( + "checking if the backlog is still empty for " f"gs://${bkt.name}/{table_prefix}/_backlog/" - "There was more than {}" + f"There was more than {constants.ENSURE_SUBSCRIBER_SECONDS}" + " seconds between listing items on the backlog and " + f"attempting to delete the {constants.BACKFILL_FILENAME}. " + "This should not happen often but is meant to alleviate a " + "race condition in the event that something caused the " + "delete operation was delayed or had to be retried for a " + "long time." ) + next_backlog_file = utils.get_next_backlog_item( + gcs_client, bkt, table_prefix) + if next_backlog_file: + continue utils.handle_bq_lock(gcs_client, lock_blob, None) print(f"backlog is empty for gs://{bkt.name}/{table_prefix}. 
" "backlog subscriber exiting.") @@ -192,3 +205,41 @@ def success_blob_to_backlog_blob(success_blob: storage.Blob) -> storage.Blob: success_file_suffix = utils.removeprefix(success_blob.name, f"{table_prefix}/") return bkt.blob(f"{table_prefix}/_backlog/{success_file_suffix}") + + +def subscriber_monitor( + gcs_client: storage.Client, + bkt: storage.Bucket, + object_id: str +): + """ + Monitor to handle a rare race condition where: + + 1. subscriber reads an empty backlog (before it can delete the + _BACKFILL blob...) + 2. a new item is added to the backlog (causing a separate + function invocation) + 3. In this new invocation we reach this point in the code path + and start_subscriber_if_not_running sees the old _BACKFILL + and does not create a new one. + 4. The subscriber deletes the _BACKFILL blob and exits without + processing the new item on the backlog from #2. + + We handle this by success file added to the backlog starts this monitoring + to wait constants.ENSURE_SUBSCRIBER_SECONDS before checking that the + backfill file exists. On the subscriber side we check if there was more time + than this between list backlog items and delete backfill calls. This way + we always handle this race condition either in this monitor or in the + subscriber itself. 
+ """ + backfill_blob = start_backfill_subscriber_if_not_running( + gcs_client, bkt, utils.get_table_prefix(object_id)) + + time.sleep(constants.ENSURE_SUBSCRIBER_SECONDS) + while not utils.wait_on_gcs_blob( + gcs_client, backfill_blob, constants.ENSURE_SUBSCRIBER_SECONDS + ): + backfill_blob = \ + start_backfill_subscriber_if_not_running( + gcs_client, bkt, utils.get_table_prefix(object_id)) + From 8c97f5a7efd9097cd891a1ba47ff057ae52b9fa7 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 9 Dec 2020 19:49:06 -0800 Subject: [PATCH 24/90] ordering docs and isort single line rule --- .../gcs_event_based_ingest/.isort.cfg | 1 + .../gcs_event_based_ingest/ORDERING.md | 36 +++- .../gcs_event_based_ingest/backfill.py | 4 +- .../gcs_ocn_bq_ingest/main.py | 34 ++-- .../gcs_ocn_bq_ingest/ordering.py | 45 +++-- .../gcs_ocn_bq_ingest/utils.py | 14 +- .../gcs_event_based_ingest/img/ordering.png | Bin 0 -> 77197 bytes .../ordered_backfill.py | 179 ++++++++++++++++++ .../gcs_event_based_ingest/tests/conftest.py | 54 +++++- .../test_gcs_ocn_bq_ingest.py | 3 +- .../test_gcs_ocn_bq_ingest_it.py | 33 ++++ 11 files changed, 357 insertions(+), 46 deletions(-) create mode 100644 tools/cloud_functions/gcs_event_based_ingest/img/ordering.png create mode 100644 tools/cloud_functions/gcs_event_based_ingest/ordered_backfill.py diff --git a/tools/cloud_functions/gcs_event_based_ingest/.isort.cfg b/tools/cloud_functions/gcs_event_based_ingest/.isort.cfg index ed7944aca..7b7b2d6f3 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/.isort.cfg +++ b/tools/cloud_functions/gcs_event_based_ingest/.isort.cfg @@ -1,3 +1,4 @@ [settings] src_paths=backfill.py,gcs_ocn_bq_ingest,test skip=terraform_module +force_single_line=True diff --git a/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md b/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md index 24a20fcd7..8a3dda5d8 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md +++ 
b/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md @@ -42,7 +42,7 @@ gs://${BUCKET}/${DATASET}/${TABLE}/incremental/_config/bq_transform.sql gs://${BUCKET}/${DATASET}/${TABLE}/incremental/_config/ORDERME ``` -## Dealing With Out of Order Publishing to GCS During Historical Load +## Dealing With Out-of-Order Publishing to GCS During Historical Load In some use cases, there is a period where incrementals that must be applied in order are uploaded in parallel (meaning their _SUCCESS files are expected to be out of order). This typically happens during some historical backfill period. @@ -82,7 +82,8 @@ We've treated ordering incremental commits to table as a variation on the Where we have multiple producers (each call of Backlog Publisher) and a single Consumer (the Backlog Subscriber which is enforced to be a singleton per table with a claim file). Our solution is to use GCS `_backlog` directory as our queue -and `_bqlock` as a mutex. +and `_bqlock` as a mutex. There is still a rare corner case of a race condition +that we handle as well. ### Backlog Publisher The Backlog Publisher has two responsibilities: @@ -105,3 +106,34 @@ In order to escape the maximum nine-minute (540s) Cloud Function Timeout, the backfill subscriber will re-trigger itself by posting a new `_BACKFILL` file until the `_backlog` for the table prefix is empty. When a new success file arrives it is the reponsibility of the publisher to restart the subscriber. + + +### Note on Handling Race Condition +we use subscribe_monitor to handle a rare race condition where: + +1. subscriber reads an empty backlog (before it can delete the + _BACKFILL blob...) +2. a new item is added to the backlog (causing a separate + function invocation) +3. In this new invocation we reach this point in the code path + and start_subscriber_if_not_running sees the old _BACKFILL + and does not create a new one. +4. 
The subscriber deletes the _BACKFILL blob and exits without + processing the new item on the backlog from #2. + +We handle this by the following: + +1. When success file added to the backlog starts this monitoring +to wait 10 seconds before checking that the backfill file exists. To catch if +the backfill file disappears when it should not. This might trigger an extra +loop of the backfill subscriber but this loop will not take any action and this +wasted compute is far better than dropping a batch of data. +1. On the subscriber side we check if there was more time +than 10 seconds between list backlog items and delete backfill calls. If so the +subscriber double checks that the backlog is still empty. This way +we always handle this race condition either in this monitor or in the +subscriber itself. + + +### Visualization of Ordering Triggers in the Cloud Function +![architecture](img/ordering.png) diff --git a/tools/cloud_functions/gcs_event_based_ingest/backfill.py b/tools/cloud_functions/gcs_event_based_ingest/backfill.py index f0a2ce415..3730074ee 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/backfill.py +++ b/tools/cloud_functions/gcs_event_based_ingest/backfill.py @@ -19,7 +19,9 @@ import os import pprint import sys -from typing import Dict, Iterator, List +from typing import Dict +from typing import Iterator +from typing import List import google.api_core.client_info from google.cloud import storage diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 163fa0629..3d349eeea 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -22,9 +22,14 @@ # pylint in cloud build is being flaky about this import discovery. # pylint: disable=no-name-in-module -from google.cloud import bigquery, error_reporting, storage - -from . 
import constants, exceptions, ordering, utils +from google.cloud import bigquery +from google.cloud import error_reporting +from google.cloud import storage + +from . import constants +from . import exceptions +from . import ordering +from . import utils # Reuse GCP Clients across function invocations using globbals # https://cloud.google.com/functions/docs/bestpractices/tips#use_global_variables_to_reuse_objects_in_future_invocations # pylint: disable=global-statement @@ -73,16 +78,15 @@ def main(event: Dict, context): # pylint: disable=unused-argument if enforce_ordering: # For SUCCESS files in a backlog directory, ensure that subscriber # is running. - if ( - basename_object_id == constants.SUCCESS_FILENAME - and "/_backlog/" in object_id - ): - print(f"This notification was for " - f"gs://{bucket_id}/{object_id} a" - f"{constants.SUCCESS_FILENAME} in a" - "/_backlog/ directory. " - f"Watiting {constants.ENSURE_SUBSCRIBER_SECONDS} seconds to " - "ensure that subscriber is running.") + if (basename_object_id == constants.SUCCESS_FILENAME + and "/_backlog/" in object_id): + print( + f"This notification was for " + f"gs://{bucket_id}/{object_id} a" + f"{constants.SUCCESS_FILENAME} in a" + "/_backlog/ directory. " + f"Watiting {constants.ENSURE_SUBSCRIBER_SECONDS} seconds to " + "ensure that subscriber is running.") ordering.subscriber_monitor(gcs_client, bkt, object_id) return if (constants.START_BACKFILL_FILENAME and basename_object_id @@ -95,8 +99,8 @@ def main(event: Dict, context): # pylint: disable=unused-argument ordering.backlog_publisher(gcs_client, event_blob) return elif basename_object_id == constants.BACKFILL_FILENAME: - ordering.backlog_subscriber(gcs_client, bq_client, - event_blob, function_start_time) + ordering.backlog_subscriber(gcs_client, bq_client, event_blob, + function_start_time) return else: # Default behavior submit job as soon as success file lands. 
bkt = utils.cached_get_bucket(gcs_client, bucket_id) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py index cbc21d3ac..dea38dbec 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py @@ -25,9 +25,12 @@ import google.api_core.exceptions # pylint in cloud build is being flaky about this import discovery. # pylint: disable=no-name-in-module -from google.cloud import bigquery, error_reporting, storage +from google.cloud import bigquery +from google.cloud import storage -from . import constants, exceptions, utils +from . import constants +from . import exceptions +from . import utils def backlog_publisher( @@ -115,10 +118,8 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, table_prefix) if not next_backlog_file: backfill_blob.delete(if_generation_match=backfill_blob.generation) - if ( - check_backlog_time + constants.ENSURE_SUBSCRIBER_SECONDS < - time.monotonic() - ): + if (check_backlog_time + constants.ENSURE_SUBSCRIBER_SECONDS - 2 < + time.monotonic()): print( "checking if the backlog is still empty for " f"gs://${bkt.name}/{table_prefix}/_backlog/" @@ -128,12 +129,16 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, "This should not happen often but is meant to alleviate a " "race condition in the event that something caused the " "delete operation was delayed or had to be retried for a " - "long time." - ) + "long time.") next_backlog_file = utils.get_next_backlog_item( gcs_client, bkt, table_prefix) if next_backlog_file: - continue + # The backfill file may have been deleted but the backlog is + # not empty. Retrigger the backfill subscriber loop by + # dropping a new backfill file. 
+ start_backfill_subscriber_if_not_running( + gcs_client, bkt, table_prefix) + return utils.handle_bq_lock(gcs_client, lock_blob, None) print(f"backlog is empty for gs://{bkt.name}/{table_prefix}. " "backlog subscriber exiting.") @@ -158,10 +163,8 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, def start_backfill_subscriber_if_not_running( - gcs_client: storage.Client, - bkt: storage.Bucket, - table_prefix: str -) -> Optional[storage.Blob]: + gcs_client: storage.Client, bkt: storage.Bucket, + table_prefix: str) -> Optional[storage.Blob]: """start the backfill subscriber if it is not already runnning for this table prefix. @@ -177,7 +180,8 @@ def start_backfill_subscriber_if_not_running( if start_backfill: # Create a _BACKFILL file for this table if not exists - backfill_blob = bkt.blob(f"{table_prefix}/{constants.BACKFILL_FILENAME}") + backfill_blob = bkt.blob( + f"{table_prefix}/{constants.BACKFILL_FILENAME}") try: backfill_blob.upload_from_string("", if_generation_match=0, @@ -207,11 +211,8 @@ def success_blob_to_backlog_blob(success_blob: storage.Blob) -> storage.Blob: return bkt.blob(f"{table_prefix}/_backlog/{success_file_suffix}") -def subscriber_monitor( - gcs_client: storage.Client, - bkt: storage.Bucket, - object_id: str -): +def subscriber_monitor(gcs_client: storage.Client, bkt: storage.Bucket, + object_id: str): """ Monitor to handle a rare race condition where: @@ -236,10 +237,8 @@ def subscriber_monitor( gcs_client, bkt, utils.get_table_prefix(object_id)) time.sleep(constants.ENSURE_SUBSCRIBER_SECONDS) - while not utils.wait_on_gcs_blob( - gcs_client, backfill_blob, constants.ENSURE_SUBSCRIBER_SECONDS - ): + while not utils.wait_on_gcs_blob(gcs_client, backfill_blob, + constants.ENSURE_SUBSCRIBER_SECONDS): backfill_blob = \ start_backfill_subscriber_if_not_running( gcs_client, bkt, utils.get_table_prefix(object_id)) - diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py 
b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index ae29f4b4f..208189e39 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -25,7 +25,13 @@ import pathlib import time import uuid -from typing import Any, Deque, Dict, List, Optional, Tuple, Union +from typing import Any +from typing import Deque +from typing import Dict +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union import cachetools import google.api_core @@ -33,9 +39,11 @@ import google.api_core.exceptions import google.cloud.exceptions # pylint in cloud build is being flaky about this import discovery. -from google.cloud import bigquery, storage +from google.cloud import bigquery +from google.cloud import storage -from . import constants, exceptions # pylint: disable=no-name-in-module +from . import constants # pylint: disable=no-name-in-module +from . 
import exceptions def external_query( # pylint: disable=too-many-arguments diff --git a/tools/cloud_functions/gcs_event_based_ingest/img/ordering.png b/tools/cloud_functions/gcs_event_based_ingest/img/ordering.png new file mode 100644 index 0000000000000000000000000000000000000000..0361ac97cf74a6cc64d60db7b6f6230694b727ac GIT binary patch literal 77197 zcmeFZg;$kd*Dj0*QWDZFjkJJt3P=mm-5?+>-635{OG~MAcWgQZ1nExc*mQH&*5`e{ z=lQ+oJLCKTXN)}pgM)WNegr-# z_E4pQgQI|xk$9=yh z(KGhiE3R>N@NL{}Wuw;ZZ(;Y%!RR!1bC=3R-=Q>=)s2k}dS)6?_HmB^^QlMr$4B7pTn1cM=`0&N zz4ZTBfGBi8=zp`UGlXY8^FCpECIJL&L-y$6mbCv^!!K;5#RD$wB0lI3Bq$i!+8uRm z`twVEh3~ys7%89e_d%c2Z88qyPG&y$_hw}#%myuL_^fjjPepuUv&03Yv&8ts2kG>B z3?M#nBs}}FzkgUhjI1{8qtj~fN-fgm({B$s$8Z+l{d>5go7i72(Dhn&=V1%CL5Q7) zEz0l719NQ1H${V3e3nPj1Bul%j=#rZg*+w0h`HW>FV*Uarlzr;D&ugmK8G_Y)~Zxd zFI`|b{*6vxcVM4oi-h!7UdVc2A6WxQ&9k9E8d0JZBrrkR8$$(bW|XIE4+4uc*@sGY z#=Z_3hT$?8Q$VdWsCfzhoQUXmplayOousY!uw}iB@dnFsV|*YYBBE7gkyE7CC`T^j zK^KPI71y62C({v(*A>5Ue!RiP;jkh$UA|hj1abWRgGR5BJ29Ri?pdJoj!T?;va%e+ zr}$Jl3X!Ipi0_Dv&;2N%GYm&OmR^xr>SG2P$8NplFQ51Ob9mmqWc)8aJwj*fpDK4; zZ}nHHu${-fwW>5PRnHo^J{wE|ze%6(n}Bn1;jo<(bU)dwHyuioygIj|mWiW#V-t|- zd$6!NF6d0_b2C+8Ht=}lmb{4DHtnlEcMsxYRQw;5uKR+Y(gi+B$3J_Lr(S$C`d!IP zao!`12ey4)ya(Y1-?TA^ad5#e`{*#Sm8G?AjyRfjh1)0>Yg_guvV53p@itg!zNQfJ zwdaz^J(S$NAduN`4F)gNuW#0i#}SdioldkyUMq>H10y7KZsl5AM9Y!v;6_{a zp$M3l(G3P#J;0n`P9h84ncf+mj)OzdR4^PbAmezy#!S0})o!I5*>w<%?rD0sBx2SR z4WV(_CA`{BV5*IqtFQe2#@x|p_;lM@!-igQKgX_=cD9mh#kDVXBv#m4mTO@sJtR+~ z6k6eZ%{<+B?pAg@$>>vU&rX|dHMaST&+%>ILbHx$OQ^NXAJ=M#18j}Etye)T6|T$9 z0_W{L@Jsu$aQaipoEp95CIhOHCXOovSv}H%&NmmR&?D5G(uFc3HfI;tn~RLMU3Oce zuk$3rZXO_H3U#rG)ik=0vFOyuCpqt&Aqe;uuV|nVsH{wmeSmkJw@M@Uqoe{rH_Z(w zXTBi=Uq=1MZI~e>vd(7@4N`^S#PTKMK=?5?O7t9Q-u|$-+Qwt7N)!uvFMY7UG8n#? 
z!X+TnA1`Az_MP?WEHT`GrhEo+Zwd|z>&$xdk_#8o==v1R<&Gap!fuQ}#8V;#CkQ*h zx;Ho4HB!^zF!}Gk0`5{jmoHiks84ZwPdC^xxjY<7>XNthudmVk=Y>qd!ovMHO*XSH zw`N_ltrrCKuxUc6d2Hw6W~v)|KT+=G>DFf#G?ZWl3ThOnD$mrAQ7yLz#f9N`=1+Qi z-y#I?ybAB!ZlItE4lsQ6-T7owNsUadi%qaZnz-8aKoEL#r0k9UR9M{1V#3T5L*b^y z=azNqG6yo0e*bA>*q|=}k@^EJ1h#08Ly{;Jn0ljtV`43re-XQqx#etuSo!z$1>@38Zjyh#FmJ?#Z=Z)|I^xdnRVTT9Bn(-Jj1LL12e%sxIt4 z3Sm>}8cSXT?|bbar^=HY2@EQl803P*6=KvMs_>yCB0i)jws&O@Q7Y`KRVv?(k3WQa zE}P0-Pw3&JG)zhmZZL5EOeYo=B!s}h&wSwMDN6x{FpfO^0DOjob8&T=DxcXR&uzEB zcChFhPa}gZ9faDI=W`prt$T2^u9E;4=vwD>XokY7?`Gnw;T${(Hr>cdp5N zn4MP4YJ!=GUg3s1hFVH_g#a;QQ6>(*uOmc})_L1hR@H90LZowrz?;&^b0i!62-xf8 z*0&pMYRMN`wYIfyp;oqcZr(q?Sb~6w5=O+?;Iw^`8qb_8^67%BCun8z!=h;#KUk?8 z@IKCAX4x^gkhzEd2;vhk&lF5PI}-(VHvh(v;@>j>Y}b`#cQc2Io2|L!tkX zbnMCiz09cMOY8!A$s*Yk{Ev%Mi9rW!maxGe^^B7(X#OJu;bnn&em!F1x6*zqN!R}W zabf9JVtDniMH3wZZBFjl1}y)Zbz^}s?QnyTm5n9XO_tsx*k+u^ZD77?4`erL$n5=) z>ch%I@5;`r%^b7oAkPvRdJ(q9f+N}PRzG0PhY5CNPGy*Iq4#1XgJu|aJo&e{j+lRI z2g%c1mr2vCpOt5<3_jJB#;^ZwKL%0!hDK>-6PsOjN|aaF_b^7sd{K5T_eog)5JHR3 zS0z5sL63L4|bjn+_d~1b9P|ic9M` zZ13OAs(=`W^7j@=RAttYnQMigRE;~)ZphXLD@)&--r_LK{wT+E(L?T7F>@y$o9iJ3wW40w4MvcrpnwI;P(xUH^#SIR;85zQz(*UpT(BPm+OOa< z#K7^WsL!;!E8l1%M-Ku|0$!xD;M1&&52R@;lYEmw>M?%bfDW9-C)C(y;6m^NOgR58 zFa)Ga&d+?n#X1!WrCV3QjXZAfk;!2hGxDFK^kag3Wk@Y&KGK8C z5US&sVPN~OjPaZre3E7*J6!5mf(E{!ND1CEFxe~%Ze3Pmk*3D{*&S~@&`@#cuj8Gx z3rjo{H$kx3kBjgpH}BE@wf(1mwyy$izJUtfD{5JyZO9Cl1}#lR{!y{kETE2L z;h7o#xvwu9NjiV?{MK7w?ED)kqu@q6VqoVg;Jrt`tPZcDhYt%v&)dIBjsZ(>-`t$s z@YDMMS(7WaZgn1{@|2oQSMa@G>s>tBomL6MXGy_jR256%zSYZ?2rDos!+g3h5PxH$Z(j!cAw##DzE}g|uRIr*v znK;Sbvpss2Juwx5aD&&MHN*Z&p$~aV{AQ)GV<MErhEs}tx6;uk~Th8yOYHjtBz+Gj(%1@u{JbdtYxVS_< zm0O$JT9fhvlHw8qwKZ6VAvL@L zwU6`DpVjZUV(DJbAo4XUW(vy|Yn#OrxJ{MG(Csw4LFKn6Sm=wjoiR8fd+V~)p|h3d zB<}Zj=@Jq6dhUriQXeVT`_&Q~oKp+RzI|!Wm@L+z15xJ5)T56stKBvyMk61|=acvx zzka9hMvm+ML>W1zBcCCtJd`fLz;4nVlPeQfc-2t0;ePzJuf}F)ZD+Fd zr~{KP&v~b$pW4!Lc5Vix3q!1j0p{Bk-ECW?1V+XJ&WtR_(FeeW|K6OXCkvn^0L ztafV{3km(AYS 
z4f1KcVz%=QjL;R@H(5WCb0(sK?p$`Ko?W!Wa9JsA&AI#Wvj(qhY%mQZ4#vGsRY!@A zUlu@Z&qu?%QfiKNG3mjGHR_-nh$LD?T>VIV`VC~`v)N{O7uColB*2?Yh5G#pT?|4q z@6Ue4X|_LC9n0%bLGjJ?Y!?M2Ak|Jg+|sfMSG2tzgQ=5mp00d&6cy*R#gW{FhYj_- zy~4Acu85B|2E}sQJUW%y z`6zNFS8kj~BNm8U;jqehd3T%oBehn)JqYcaSpG~^K2xT!cWeq5NqDWR8z}n{o7|6O zLHz5%kOn84tDL7@07)0%kvmxEpj9v8eY;^(0uRa$F|YK$)=Y%LD|WVTXJ2lFM$bivcn_GLb5yG=-Q}SsIG`&E@;|wj_>8m z_~y%J{TngdT8j}GFE6iLQU7e<5Ldr`dkswLEedbcMVCs~d}D2IcT|X4JENZa-A(=- z1UiNrMoXdE6PcVZ^3eKSVy)fcr1jja#h${^6uXCJ_sva#8~RoU1kff#Ne_q`nb zZ&=GgnUcH7;B(XkID2za?=(m}00A8t=q(vMbIaxv)E1)glO zcdm&$BQNzQL~M204W{JL7_?B0R`0aJO*v;6xY-=9F?`q8sW7dnaQOALV_QIZwOh^X z=3+XQK}Fqr*8gE+FOY$1VEWK;M96kmL{hnj76^UBlmf7FH~=WC0dlYRwF^v)CyF@G zwWCP`wEQhfQn#In)e4iIM??ZHW%8e&^RyyWGd$`IuCx3lt5sndeUuwb$gaj=es{oJ zZbV|$$tH3AUFmkc6rwJ3ak%_2bW!RfxipYnJpIJs$L|@9Sr2T^x7K*?bD4stGn8syMWn>i$h@~)Y)KMU?GN|PvXg~cp@r{$ z_^Xf$Fb4>IvrtCSUZ|M)1Y5vGH=>j6ikQ3Tn{I=Xevn)$zk(UF&;5;Kr1e~#^7gn@ ziC~?Z^he|E9|vt8JWYE)?N?Y&yO=$-jTV(iHiAMgSs9dbW43-JZFjM4&r)Sau1qc* zAVh;~%TT)2vXgD7iPb=ut+k#IT;!<;O{1$_!b2n%253alp!ao+hW^-7-?1}jF*SYmIv1=-ECIvKv3bg76Z|)jO(Ou6`65k2BP}+$ojW_8M$(zFyxP zx#h4o)L_u6D2iv%(F)?kB%2X`^q4SbXl|@XOFJBwfpTy5K8bJiq96wZ-M+DW-|vpS zBDbpg{Lb4Xn;(;Dbuw2bvENyX|EXY?Ea1;znA6i=<}Q+-Imjh;!f7)Ms$M-i0*-dv z6@g`iKg zaO=|wkmN;uA_M+d^Cgf-se&lgG+wMe5k8G$!1LgXE}_?)W7eurbN`Hfa5?Vo zbA5Iz9*%!2vv|Ha@^Gy`!PoR`muoajoEqh+a6lxvka%ft4AxKQIf`g!u9UOt<~+7vnplRg^6Lo=Fs4FQ}Oo?WtKE4Vi%|FK_qINCeR3{ z`?^%ZtR6IcYgJcV>R(Rh`yUdJ@Q({LO4DSCN6JkM#*y|ntbTq8SnF349q+g_Xo}t) zdsbHIywflWxqk*+NA3+&N@Sf(^1hcN%J`X&(ZlYxCME5fqP$qs%~&nXN-I$BXZ6(l zO*x~fmL3COW1z14NsLkNv46mtGPTqhfNm0{Q(kO!T@+B$G`G7Vc(H+fKpJS<4?PjD zo@01+pQGw+Rs|%>{m?q+DY#MixP&bmCvwq=1qG#GjlR#zh z0vYDG*RU#P3pI3JUMR#+(T=MLN#_G3N*~~|6-8Q=itK(rtrzcj+^^57ahcS$vFQ~U zdt1G3E?#=`z9@MhIw5B2(c*qwAS*V~Ky0FW%&`hn>uB5-|3C zL{!hn?>V)p-_7xG9-}|r(DO`?Fl*QDXG`n`Aabf=hd)A3DLfXCUO;>j?XF&=Dc2r! 
ztd9B6kOW(w^5sNlSjXC=fmQEhse$SKT>fa85tFl;e>EtVwF9g{bb%F@$nZlHK9p1> z+&5~YUk9H>5mzhu)|wBIvK{^;4qtSGVi(MkQWyi5O%Y0*%577Qn#FzE+^%`E(k2+| zvC;$uSl;;gBZ-{(ItMOIXoN&I?^%%JfLXt)Y9AjIj$G=;+acX7>`%%!{Kn_+<*_OJ ze^M=}t`Nj^BAvAdajxB5s%*c@3`D_^$q)?6eK+3lfP-jtVHV_U{rM_VszPr3ly^Rt zc?GCKAV>wsuEDXVK2uV5Mt@WPUM=AD;71YeN^?bJ89MuBtkJ8GDY%#Pn8+T)dSVNGRZwweacs1FdckIA4r!c9iwg56MPXw7~*;}Q!U5nMU06- zksF1hsaX*QV1cY|?tiH-9ZY>6AEq15RE=z-W=9fL7I@L(&|^$S7?{56et62FGtnPG zph)X|#k|e`D*c=0WXSS4DzLMVg~MkMpIfn5S~*(eoS_V%kJFq0-*nrIKG5SeiTqv< zl2y5_QHbn#fitX{2{(|kGmQp^X{Z9PK<{J2-}q+p2Y`a!5O!NO7M=cpFr1Squ^8aH={`}xl zsA9S0#cq0cRxSpq1=H4OLTn(4BkvdnhQeP%B?d>8?wG!L={)3|rAD*F0Nx9k<*}sE z3qo{$XC?)C>d5WS)#tfESGMV8VrlTtEVMWQu2)x%2TSz;%>u(p-2W$FB6dTr=_pm= zoyLTONgh9(DN>lx)ZBb3Mrh0nihKO%FU3S%rQ8Zdul}w>5vC5VASM4NfDtu(v(V_O z7-o1wD&nKNXmxV|JvFWq^@qog#&-fMNu@Rm`QvgX{_Mn^;eTKkmaNO;L1+YPt^N;X z9~(4#sENp4oot;N^O|%=;z!>ZfQuA9sdxx0fDfMmv~H+=4)%k!DjY#D2p}fkimHN+ zU(oxYH=csDnOZ7(Q>f!3xzH#iO~luSHaG&0i4N4EkIMlNO)1&^=+9zC{w(J5-xgEY zWtaPXSuiM}XXtpA;A#FmmsaS;{n?%lpVQ{21KluoYVq+|!RtP;+J7{CaF0_lY3=RZ zwxGB||IrY8eaC!}#(+qqPx+@le*J9tdttD)wkKt>H+PRzWrw@(Jw zO#{{Od+?yJlT9Ypj|+iSuOU|~*WvBO(fYtu31F9sFk=*wSkteBx=Oinr~k6F6uj`B z=j2HM?dcE0Apk7FCXCuT#Q_NPN4UuK<0&nC9 zL!XAU1y1E5zW1#33TZzx34)U}HL!@&Y5J%cbj}F+GMejDhCg)wg~{qL{Hdje`or;N zfy$_0)5_B1YvX0tM$)`aX?3PEdMHo_O63`j`#aytT)7XQ=Vts4A zdMDJ)Gekvyp+Idw`W+^UBliM2xJtrkUh1QInVITB`Y2+x-*i+kznBCA0gvQ1iEs=P zxJUo17eKSICcnuo=FL#1FJlO1=7=J{1I_nwN)tx$6wt$461<^I4tF=4PU$xyLLPp~rNu5_PUZzu%FQ5=R}ZnihhQ~{e@ zL3=FmTx^_GzbPwIVWOzcfVUKv`E$C#TV0zNOE#Fo1Y-N^)ZVFOy8@EyT_7r+k7_o7 zFQ?`W%8#^qWe~ofz)9`Rtmj!xc1!%hk8EL#woT$_Gd;F$dts>oq;I-13BY{6-XAV8 zT%IrXcn}^U0~AZ@{l;C}>X3c?dGyjQY#*(H{$Q*9N9(hu>-{zX5InPfT=i@o*L~*S zB%Yd4#~Y=xadh6NmEH9l!cOlK`cbC_F37)2%1{Ileu}v0!&F6yBI!@rwaO zCYF8}Q#oH*M(uJeKSCDnn@pVUSA{($fP-=^qFDXWM;uri#!6R~<^7W~$s1zFCi~?E zQjxols4)N_O}Seq<>O#rj}*B9rb_4YS04!|msVGE2%34XVtMS;@fLIoR9_M#`yZ#& z!w45Y-zx@4B;m-kJ9?fO%-4PK2R`zVbkT|%d<`hDDlbpABKX<7PBz_XgAW$Gzw4g? 
z>Pw7pAz*i!Q}$>rj1%vk6>(RZK;<0=~_3s*FfE zr+aubM~ZXS`A5c|V;%xNb?88FLB*81XJh2szA1llJuTZ$vWdjjoLNj`!<1A2Z}xjLh{jMLyq>jv=hrn40ez0k=vEYaL8IkKfd5_V@-+eysuxww$38It9|< z(&= z346=jzrVYg1~A>OEVB7vO2*D>n3a;tEZyDG%bt{ISL+Qv%_dk_z-4fA>eS-t%vT{u zfa@mO1w1i|_(-~Qo&_Mp{h|Q{bRGbt<_x5EIp2sWi+pV!D@^Sc5Dm}vTrAgd{b-mf zo!?nGN%JZ3AJRS%+wJZq0lS$=!vzdTZH;L-i z`$EU&SK({#5r0{lAfr4ceEv#LalFwLEnmCZ30bESP{|-JvOWl3{JB1`U?btT!ejC2 z{6OT+_lOx-sA5Lbn}j%8xo5pJ=%vp)?b`z#Y^sQ1RHYw6)e}(;noxa-qSg+y5rBjG z*hFa!(4u(V?^EkFM8=ATC=B$*r01#T2e9&C5(A}N$^b?)@2EF#tBwnqNjd>!`jz}B z2$V63nJv;Fi>5W{9*gC&((a3ADCQTicGJI@6-9JEa zggmJHzZjy(w+;*_Qc2yB)1*98x`~S^pZW-Qw%zk_0U!NhONAE-Etlujc|k!^|HA*d zkp#2*NAc$&mJ1mF!;6jZ=aD zQ$PC?a#tyL>Kuf|PQCx6220Xli_y8mxjb5r0NjfFOpOf_5r^p~ zy+)V53+SrZ-OVL7&%SS84WNZ+A@_HUCwsFRsQ5Q*Px|}f=mF_!2HHtp0|>GSB)Y1^ zFDxhkT>o>{@&3+xe6ikfqqC<+YNacJetjS*DIAZfKaO4zfChAQHX*OSyI!B!m>pDG zO?D0p$Z@>=@eJ@BNfsm7VWp*|pyyDx#%_@nB(i`cT9MwHsm`;W9)0XN(L}wg!d}&N zPzt$Y98BSglTBa@>JJ3988x6S;(czfl6N71{8G-9equROW%)f)#BM>*{pO)pyx1{lFs3EA)j!ADpRKUC$1}j!y$W>u^J4z7wtmr5euvpZidq8 z#lYP`yC8W2fW_td;~B=26de%_7T640m=l;Z)YTNz`SV)jzse=CN~{nV=mOe?A`FLK zJ%F(NYvaO-(lp0O z%yT2LQ!{=bEae(&E%(c#JnRud9^E>7*$Z*pC9biqCeT_)r&g$*eZf`xWcP-@tJJwQ zmB)@1a(kAF?F%|U6Tl%evWK3Jfsi_WjjST*GKHWYIxw$@o905%8 zG_O`NideZoHQx}ai(pK<`W;G>>Ug0Brm9Ba#`)#(#%T5{G}_N0kBPFI6uS{KDBU8K zW4Z{;OiQ_0`G@T$OY}gy8n(Z_$osWW(444xm`9pnxjI^+ClE#^Q1h5vkhfLtv1g6# zL6D?XxB-CXggj`%uxU|MpTyu{`GeC*Kz>5i3-)6+nk#d3vZZpmaDH~CUP>Y&%CX-& zcP&$R5yayH(7kO=giPA6;fcecUxe+PZriRKJa& zb@oP;5%2bIbJcFBQXV=6=wYqG zeGAxCYQf=)7!iu`*!^h6dny$_F_a8bKgp?WbR?=_NQWPO>G=2(Fm6CMYz zVJn;rz?|WztHTWf$=WM%=Cn-alMF&p(vG>*oK^_9w}!A+972C0LjMu@<`xRk;=Dd{t7q&tkH;b^lolux88?O^a18^ zTY7(UtcYhD*}&TXH#>;uct8&*u-zr-t6WKNWFJVC2h0Qaj~`)TI~~}Z0QGhnXX9P1 zM917dv>$%nPE%L^2Bsa9zQEX?ZjG*E0#GXQWM@*FQH_!4NMR}frk)_gL9HlxxhvUV zfzvG&2O{m3*SJBj@YG~j`b`dq5mG^n;%;KdOLC2@FQe3<5j1KRoW`A@6!c+lHkv)p z84pz7)FjKg1+g8I@Ys%>N3gXbLpw>CRKI`0)?R-6ydU(jBy5I+ppy$v1#yP{e-i#n z7MpK-S!Ia@AsC%L3qHVcbBz&1y~g!4|M8jqJ;-8{g*PgK!P}47UH5z{5aM 
zKo>#QhBFO7VENWm_tH!OFq-whcJWI ze!ZVmmbzRAmh@bF1O<}o2Uy!>wQaJP%FQ-2Xd9_UC6er=6cYn>i;Uo=7xRhkdw_Kn zFmYvl)>V5mJrYNApQvle&php!l!;NpywtNV1SNTY)tmN-g%J#&Xi1z9(f0G~;D;28 z%eOvRA(%YgohldK6HgoN>L3o>Y4=2ag62@v*n*~n*!wy1&{zd=1!rR_MvauR_IpTW zO@lGB&y$(RyCuqTJYBZnH9&_i9L9XXM zB9>1ebvmR*!@XrCIQw6_M~;Y$_J5Q*Ks2T7JNSPC#cUl2b_6v@%&P+jJ39lErwcu# z+#h{!L3TNsH}DMfq^3lY&FX=;vk#R=hM{&&0EP=@H8``S2Fd1xe^tmuP|)G$6)zee z3^IP2ju1=@8L!8jJyb=626ss4#JNo-Sa`_h+$H9`CVJfJff-`RpBX9fAe#>@C=4B(KPlXZ zeP1DjYbyPUdL|q^B9KN1pAA1YA0N49XAvba&1A6bpLgc0B0m8 zB(zghjJe~@yEK*H`9t*`v0E0Sr@AB^=fA)L#+48yvSk`ccqvD^?D2gjAbw zQ0Ry6ub#1xe+8W!GAP>f*AG4M5U&TG&mMnM%vAL~>|R;MK)rg^{Z?;?L{`V z(>0EXsAz|0nDwZeTi_e-IZpt4O&Muo9YuC(VQtnhwl!x`w8luwor}Vmh}o|+mo(}S zLOPPYkj->c-)=JW%pE?53+t)!lKhMV;f;Nu&>*1!@*)+ZEyKqL-ftYYCW;Gz3VQfd zA8@$&>eP@*Lkr1>tU!gU`4VI$sBYHpL}2<>j==(rVB5jU?mdZQD?CgTdqWjZh-gf> zoXq<%bSG$uPyGEC;C2dWgc<_aejj2{#<3zi>SP_d5Q|)iU~f6h6m(afZ;M#W61wX; zpr}}m>LAcTHg%heJb2c1u(TI|+=IuBH7_mg1j>#COCCVcFq-0?y?hMuJXf&4MY;1l zTzVjQY&CyY>2Bp2YOuTWZoDAhaTnRu2h;^K)UfQCTw_;gmp-s}bzie7y0S%jF7O;) zFa(^y+n~okvCQ4juZIEApC-~H9HJ?4kDS$gHx?wmIzweDE{X4GYT07H9B_o+Q3=R# zk-}9t-VeGZx_n*48zBxtI3=vV@R~H(TcG1oiL65r1Z)A`A}{5sj``cm+3AKBS6<>R zIAXOTJ4-$gIc(j4C-*ZOO4D*5jq%(5mgzGZBA$93+4_)6kgy|Vxw5Tn=@@y0y7KLy zawgkSrTH-VDcc0DvEcXNV32bF{b`VSQIm5qjvPMzKXXHSi%^64an`zaM573hsVY|Ca^8`{nXXgEe%!bvZ(JO)07U>!2z^bfqnD_Ly&FVvd zDR=KwB-`NE3Zb2uWd#+}<*p=y6MwcjWXxCh4QrZpC68S5vY)0gb?bfWMJAz@m4M~& znS(kZH5WnBfuZdgfxR6vWN}Hrq`IHYZ!2^1Z*U zZSpvk9mLGA@37w8WUCuSR+-LlW`1rbNb^vn%x^QuTrcSre_j3a*~-Di7JvTS3v};p zQovU)!8CggofW3&4Q8n5P^xUcYym{P$3E&*_A256uHrS{e3A~Rp+S{;;SDG9{M32<$OY6IZo}yV;~~Z)XQYzjn8R9R%Ae94A)hSfj3PEs0+l3hbr+${y0#>z)*zdY$9zFInnI?Oym%4)bcry4 z*3qAyD66#%Ph-ur&jaY77>lV5sLHw{s)1uI0$t`-WFj>+IIv7c6eNI|!ia*tR`V8F zvsVB)IRbbYm~)9GWY3oE41KxSX2m2&_3Db;RO82)HBoYw- zKv03EO{Km%FlCr19YZAx3jHz9fUYxy0MC0QF5*fdD+8o(gS{fnH~<=Zz}yd~DW(ey z-zGD41ZTCdw|E6zB>>D8k^9+h!rDO6Yw)|FFZG=WXpOoZ(V_O)vnxKOzPh;R-Mm#uPxk!^dLQJrjdcf1Cc->saQ}GD|fNQGR=YYaE0@HIc 
zDKGqUWfSDX7LcDbvApczcA*jlZuLc1sf&6MfOQRpV1gv7KoWo@Iv5q8znm;aq;CpwxZw>+e>F()kw#GjN3YbJr=n0qp*CR%^dX7Wk<-cKTCw^u>{4|ut? z2#Era$jB<8ef!YsrKPrCfOu1yE;B0aAJeU}u%&f_c8I}eNP@HcrJi<7`=AFlaKY>w zT@nM#RPs&HSL7d=5PdpD{oE$=X3yGBMOwe&9;5Kaw>EozG(F$Xsc_k2b-y^wH|>j6 zKUjaAAs8cWv${m{-$oJU8UPx(Ni)4V&N2KpwG+6=uI5vrGQM%UlH8_7|jOvgN zp5zbGUWP2qH=O$2+@Aj#Ho|2})B%9Xm)Od3P{M}FYED(Gi-H!dFMt#bk_P@nnnD$p z>@n)Q*fPNcOtqzTE9iSC`{?m!s-Ea{&^GNGGn7u^jt@69L5WbJ-<;E)lEgAg^;Fm^ zcCMaN4!Uw@?jDnn7WCgf4uI)rurf|@hIGIfToOOWMEnWp*MtD-bPp8c3PikR>7C(t z>AKRbUlHsv*_xVmCic{2CSioo&`~0D@?yz!j}1SAizNIXLqzH&1;ydt(`usyqG;&k z;H!>dQn!v%%6SX2ArvrYpbi5z6>Qj|9MR!7hV&9_W-75#t#lnTvaz+N zX@Ly~0Pbag48TH$iePbPxr61*bYQjY{M;`n?C);Wh+bHv%r##FqScbwc{`r|DK;|u zgEVR>&11m+t^s~4mg<#pa}tkE&11^_kLc2^aWKyMXxnk0#?TjbMf0I2EHMA^-~~6k z%+k)J-DpmaCZGLMJeUuS1Au6FuCyr5gEX=pE^Pr9(pdZDcE&a|Ha2Wcs4UBCWUVj+ zc;Ic09utk~r;nIYJ}`l=OGGwy z-5O;%TIsTq$rR4RZl;x8ONt~?c>?B_{SsjA1)=vhI2lUd9Z0Bji*{jMDA814D0rpl zH&AJscVoSy5ML5~k=2C(vh3;QcT_Yo0v;!60MwE}SupOp89oX|-@(gn(Ih-{sr#>9 zfMlRQ7T`*AWdFNU=S0|_bEjzbh)UX-Jdka4In7nd={NnIgg97vQF8hK-2nXu}wJ;+5{`}~-(uOFa@T3dykx!UVdKujvdLoK#@Gi-dbu?euu?%a%r1~%^qqKI>}^B27TZsXGP0t0EREq<)lcLiz! 
zrfa=gv4ZYYMC{pdroBtPwN^>2Icq0dauqhS@}N__32pdiv;1Q7^>=JATdDuV&hFqI zP2ZP|t)$8rMvt+ABO0nEuL?$(gOyl~rDdra#cK;@FyU$laY6P|SW6f!(_a`g!upeuD|;{K-#ckI#?V z2U7VQi6=kP6w^Gwgw2L!p$%Kc`F#M=;zt$?K_>#d*(a#6%`DEv-EE`O=5sNCvOWEB z3@TTE>aMq^TdLY>s`EaR&rf% zJ|?+fOd<EOeAeLUt8eB@WP+Pm?%aNIzpw;}S%pR;-O4KagaH81Mv8=0F!U z%LM~LliX4}P;EUos0BYt9ZaJMbYCdG0KM8c8uRtPL6`X=hhb|xMO1rwD<0#`rmkdT z85bZ}<)caYNGu*9TkVhLmBSjCg3P6UfmF^3T&$h|SjqnwK;@?LV{Qj6uQ7!hc7?tm zKO_3Gp;?|iFqB?t^CD%#<@jU^&2rNA`&@(bA`JpfrFNA?{098W51H0MnD?I!c+XF> zv<5Dp?&WWhAp%xvBhWS}Wh$v?V3VLVAPeuavO&KRa({W1-_q37F8%qrzWUkD^q@n@ z45r}+27P^fY&vNiy^Fn!24hT7r8@mNXl&r?obU4xJX~< zkScIpJ!PE@YivoA@2x}S`ea-aRmkfM<#7iUeW>n{hhet|NH8HO5lBD)M85zm%!mS#os%h_c}4~ zz=R_0^?7Ezao3UbTwVF@^XyY`iO?k`eSM!cm$kh;lUvbUSpN4LoU~1JJX!D~*deHb zA!7$)7SIR+j;{I`nuLyGl(i9i6bt zU{7t}I+>cVg=t&X*NgG)uxja6JeZ=lGxBvyh8y(Qe*^iXYqSD<4J@w!^E=dIFa;Ce zwSanVk11)@*#`qtdEFlJkwUBTcWf}aj!c0nXk2`z!J1iRF`@)WEt0idPz%9W*96HZ zZMkF)yz1Bb;>BnLL;aW{Yd_zptO0)q`XX`pT=#Rm?(g(gdK8~QZj@g$YLpbuSS5#i z_21RB+?{#}$Um9BOo)Y_(#Bva-E2*9oY{bjMa^$8FbW$z0@Z?~ANPkx&M-PldN`xw z1OjNk`XcS>H((CtnUE(&LX$gMSiVOJ5|0vKttSDIVG#A8@E~BwzV)1admiFzpj~6l09ca* zKto%>ukGwG4J5H8?akH>0B(auA&pnL(ZvR^N|A!#AtK;%RDiNh0L&@EW-0L9AkAk0 zQ?x;M=Y+n1!v>FB;e$e1wqn&6I}K)H zgez}fHo5N2h&44eja`6;62$o4H-qQAeEkHbQ7tnzsgb$Xu`>Zx9lD#aL_XhHZp-TC5VXcU6s?$gcvcmHN)1Z8DL>6 z(xb=755KLZ?10eK>!h-&#M@5BsI>>21o&RW-Ut`1)WxNP6gEOE0|zXu>QMn$yj zi=_pRr{Slyrt~;o>%&1_$Pn_R0cyhmm~}9T){I>OdN5v~xdY0i;5BFopfzd_d=FwA zc!o?Wz-IU)R?g4%W@Xb>IP1UxVDr5gxr%O?$~KL>+3yxQr)QdI%5t|M;ch4$sJW?M2;nDFY}CDNzR#T=r&U0Bc}& zxYXuf@b2_xh4bQk(N`m56}sA!dctp&|L`G&kKu-3b{B`^Z?|MF zt2`RT4A~$w>fN)xIH{whHa*aF@uBM7_$v<6-fX&h_v3e9z%20Upu&tv=9|LpNn8&v zmI`b{836?w%yi)8YFA%*lt;bM4P?v|@s$lKTK|!(NX%nP4Q2-=7r&KM);mthu5|Xn z`@j0^c)nkpYx!$24~%68AbfNOqw7-7>AK}&X@Z!NBv;X6SonFg24r}rY5N7 zk8U*=xbDxzZ~f{4*|WJCGB9eyT!&Q`;4|d^7h7)~R%O?H3lkzG0wUchBB6vJB`A$S zcM3{NiF8RQ(j`hOAl)ShY(iQ?y+E-gwogf(`iCq*2%KcZMxHgYNGlegRFd^y#kJvgRXB=OeF;Es+$Ep+f 
z(zH*noRZl*--cF%{X&sID#gxsr==&*5zI>*g1QOY%c*ae4}y12ff>Nbm+jbOajE&U zDoMy7@QRkag|rg^cmb#zZ3fN7FPbPg3XMGW*T^$DWf6L7=|&CAXE zoz%f)VF*l}#r%t7{r1omjG4;M&o847B|BXCkXf*A=X-L(IBjy~!eCiSmdW~46VN%? zRYGwnL@9)5;pKyExaI|1)>z23P`^7L!lFrhlqUHEz?%7#h__caqe)p$lv>{lX&iC) z{)R$=LRUgD3yQ$pZCd6B@|Tx z4&m4Cg;w9r=AfjZd$Z{TfBfILX9BJ117F$=h#})Zb0N26mWWoXBZTZNn|wqa*Cnr> zM0O?HgDnYz%7CLFi~v75%Y8WKK(*?a-?W(yU};I>ZVpcYpUVT$ylSdma#9c;ATV>U zHIg`jnw0t(CD-4KXMnzc2JO%fK;G33BBY@NqVEg5hHzT#wU#t!|1$diRiBxV3!hN< z`LT3pib-Mlr*3{Nz;QCrRDBT1NoQbUQ;acq*809Ma6j&5#!!v*yIDATI|j0xhp@5U zSp@#bXmG3t48LM~#N(mF1%GT7JQ7~$zKCPoY2RHN4h1@_S*}j$*XuyN-t-4d6fO-e zDFGaxBz(msYQ;tZmAre)7dJt+C;I$MKEKA)kX7bg&LzjSq17Sk12_sh7Fv_opZ!pn zn0a?MPZs1!qF#=LQ|thLSa}@ig`xq3JE6{i4`YbGe}PhmMau*KmhY~}ghBQ1&$Ok*Q>9JwDXS)0n)^6SgR5AbTPR&_e<8* z32D>|kX1?%8j1gS$83%9<1-^D9I2SjTMeEkBw@J(BoVe2j~6x)u+3G#QcHPlW%-dA z9fO@VsV-11a}m>xvD4nABQRfqMV;w$W*ql1N3Zf_x{Rt@z15(#QdE7KZ{pr^$pi8L0yCo0*+Oc2!?6Cy+4J-ae?^W)DZEpI<5Az3#R#rrV`XkkmqDaH+ z-SYCa`Esd-i)SCWOEbGg2NK|ClI$9MKj`aX4R>s!LjatmER~vp1`ZExjwi|4hf=h> z@1CF*_766F)fcW}ji{g*N6LB?IRKM`s<|8}w)qRbG|-)>+T=x?>pgr!=-Gyi_s-y* zahzgoaQ;b~X^$@36X}?{ZYip-i0$PfQ*Vx!Ee#yVRjBg3H>UM})$+39o$=J3C!BXs?uxpGz`w?( z!sakS33d4(30f8bp9`F~WQv=?M6?`$z(FI~ZuP@4p*1X%e>a;rG4w$A;pD)1++M~F=*JugU?%y|# z4}n}+0to#soe9My*POq;*V-Xe1tE(k?7xXx^BvRL_S{Yn!iREpV*joQ8Tg<|!>%Ea zQ2=pyz;NFuy)J~$NL+nESS_v@|gmPvZ(diP31{g^sRuNe(8>2f{=8kO|X?dJCocmWtO zcdyN6qyT?cKu4K=jOb31pq1M3{;2W(U`O)3a+&ZG;nwc?v_KX76Gd-9i>x%GYg z-J5C-FUe=_<2^LIa~)9|*3`jtr8v*ANq4C1`JU3|M0osdLCuQy#)f#+xiwS=Hr!Pt zuMGm@}$hn=n zl!)ot%Il12+n{`C`(G2AoWqRxyBcYeco>&HBPC}b{_tl+b5gATZYAV;E&cSKj<0tJ z|0sOG;X{HH;3d;85&428AZJek^e^6;4B!>9-Pu`JX)e3|8U3~hMC5!2yRdt&H++fN zANPQaBJvr@7+Hthe*hKq)eOrr{|E5~9d4%3v->#qsDd_#2Fq8crMyeG5Bl!6bCP~D zX}TOQ-VvujeXk6ZKz1VubVtY^GUM^FpZk7)Euk@1n2R;9Te0%=&2Qze1V7NL$9!`F z5fGHCJd?Gjfvb5>B>K{w5RyL=l}QQ#m(LHiSmJ0>;x$`<*9Ew@gl92sv^Fj zwi6bH>^2*92u&f zq?E07v9Yxcusffij$rLbeAhUc<0zPWYEKk4aEK4{G(Fk$Dpp=dAgIhiO5u%j 
z9@3UIL}R=xeqXUR_{u6TAkH-Qf~4e{M47nhIvXC51N=@D4;!F*(bsB85C8#D`9ix~ z94$mG>-vF;xes}F_M4+7`y>s&`5Q^IikITbAe`}%wTW6Bq2QEt3rRO$%k9jq%E&yw z)U8c!9(R{J_<+fAq{7Nbv{L)Obh{8efk0_{g&PXxp0u@6$}F0IYuo@DRJM@79>TnZTOYYOpQq$Yn#{FC*6PU#&TGCn!I> z3Y>j`n3Fl8c8k+-A+rlRbfdAOD2(3+MHl}F4`+eJs4ftY$PfL01!j}t#{)m3;K$n{ z*!-xM?GZ%$uQyEMUqSG^%Qa5+1F(1knqqE$qwY|B$^=~BSAJgBPo&Kej)oDuj-(HY zqot`~u?TBI&d&{5w%Rj>Pak*q_=Q`HB8C3K4 z$yyisrW&ii2~5%t?l~^JyO#42aqk0FwJ{=M&6*{!J_Df?Z=H{e6MQfyP1wsDpc_s@ zG-TM@@h5D(=2hni1moT#B_GW2Me>hQ#NEpbYMf*tzP<_2w^%^W<1mqpP<`}PluorTt6#1Va&3dncq<3HnZVe$bp6B;#t`GSY2@fM4Pv$rq_#eIM@_v4=J%JT&kOeAHiyOVqYU06Ijdlyv8AceSnjGRx&e4M1ZJzgz?`l24_ z4o`cs$?R4X9Zu~m+fjY*gJmRgDVp`P1ESR77RVyL0x>28E5y;fJou2(mDMbA!4$9e zbgk$Pe|6EsOh{F3tHA#p`!L7TWzR#=oC5ah=J9cmp;89qGrJnwg-tV3x5{N*OS&E+ z_M4^zj#D|RtoyUUOU*b2cdB$CYyRq!0(B_zyW#7fUbPy~14^a83OL_fq?FML91@R1!5S)aSr>H?-i~4> zo3^b=j<^Z?d5Th+)FJR_bEf`#Ev0bI0Hg!tN*nm_;$&K%oL_0ful&QX1yu<ofcj^Cd1p#s!wS|=BT)uGegpyL%dwJp$S*b}w_d3c zBnYVFX@q=z!-ePEcF*%E=XFGCNmp{lcl4)>#=DfegMysCiZ&%;;zg$(+Z;6{EfQua zJfE@v&nsG_0{X|CpywJ7alb-^<+(79i0971foYY6g)!TNJeD*RVwOh!5nY}55c2<2 zhAZD0x9d>sszQpy0fgyTAT-DQ2w4oO!@glVzqxs9zU5Tp^K(J8I3UQiHlkbM*7_Xq z#mPFMmkiJx*;@rMX11W6amBST1E!F@)#LweO6L>N{0C1_B*i4M6_q`~!9e0unF+#a^ z#vFrmZc))~@(D{bEJ;bDyIcK^(}s0!hJYjBf_7azw=Qbi-1Y6Xy9LE!j{=;TFsspe zlCPE5QpvtRajFPvJ5R&h>aQW{SHYVbanh)ms?9Q$BnDm5`vL=FZtnnGt zlIJci7A|f?1>!G)5S)niM_#+XDxh!)ymjXZwmBwPidV7vOdkRuM4J}99{*={{CGFq zn+94zdb(DMgLMOlPwzl|QW*^@I-iC9@TELLQd=duMOTRHop9(X`X57$zjn3R36@zV zIyZi0YN>_pU{Ach^xrvbygff0LUsX}$KDb!oIM7$3llkSs&ibu9S=NoD!h5WFJmff zzQ?;lQs0sD_3_yvj?a9F%RZ|FJ3vU2cZ(?_{6FpRaE@3%LMTzY%(w2b4Wcu&yZ?KX zAx(KQ4(=$~(7RlFhug+XP;(~O(U@sC`?X0F^e+|w>C0D&5b-&mic|2p32-pXlaV0p z^-E=@?_7YL!VXjf=Bgl~f0izqp!V`1PQql>7IM0fPRD)QSA!Hxq-GkiL|GZn6CiV)IKtH)PHN&s?* ztMKFb#=Q~t)svHbeLo-ICRoQ+PTY3}nQO;Smdto0fSBD!sFS^S&^{r z_r^Ty`+s^)F&NATmm`fs(u^DXgaTDhZ!8V4uP8BrFunQ$6~k{lQgLU5rRkO}XvU`4 z$yyTQe-nniaA;&){R#H`BscqwFVT&t<8>&5%R(RjnQr-uFKt>|>YI)JvEY`aLCZcW 
zQDdZF6$FJ0)^329!z5Vdt3VJ#znURRIpMs0#@$a_wZI-lsA&25aK>{xkn0_etV#{R0q=|rLi^0PHTYRQ z*BH)q+Pm+GE@dYC!mn4Bzoxemh2_8JJA>~jaxXbHx$L`zYoAzOOGec6C;OT4pIi0~ z_6_5_vhT0Ie&|sBax+r~Ke%ziUSq%VXk&cCz#$pCJK+HU?x<^M+IAjO*0+-nUufsO z>h}bEHNH2|b2~|LCe&_II+jM`BOE-$i@cty2U>ne+Ok)5{Vxc(4Zf+Rx-Ik=X!RxE z(^HCHczaaqqCd?9WS*P3hTjuu11GEOtjnXn*1X_~N==P!KVJ+rdhS~|(b>U&@6Y{%m8 z4aX};-E&(Wnu6xM^dv^!ckWQuwUbYW)KpwDjG*L=yn4+i!~O5LE0;+FR-o~okVPmn zr_>RVwBXFMTR2s{EnAF#rh^!zsnGIU=a~ykL9u zEJ^h%Mugpf$ z=(;6LQS#G=11f(l_4lbnt(7ae2S-Bsj#ctFU-U7X?O#PA>^zp*JlPqHLTMA8Gq}#r8&23c|58;v5S(}4F-tiQM@*&t+{-6S83^7e7<={mA9o*)$ z@5xau{`iX~NvFXp1$v}sO@wT$tb-|Rgfk|A3gs%8H*WbEVEdbw$YI?pw45J4uWx@2 zO@;@U-Ev1nI$TByum`Tf9&jOv)}$)Nm>+cuAAcisePzrsU3}+9iqoXssBTL-XDKwN z2cey=_UW#jKev8W?>__h-R`pvkHPhyu9E@zp9s&5`y=A*HHXX)?{SZMlXJz@M5Pn9 zi!RN$7O=0q59dkWVn*$L&iMaxDRKk_hNlTruCsfP9pBE?udWT}cKwi>B)qM`6jrh_ zG!y{FOE6C5gdTybmc=+N7s*#F(57*HtL>TQzOkw)>AqX>J?q;y9bvk~i`W>g4tm$W z(0NR(X({NAyGi}^d0h&D3MQ9BE_w%$SE0=qYWgMGlWgD9A`@`*z zY~vr}e8CWJ#4z0;h6^K-2$nWR3d0*|lRu8}M{&6kk*8xxUjNmc`j*&Ni+yree0^-x z$J{Nk^3CkSDfQB4H3ssc?y3F;IV#d?PjFzRIG|lyiwPlYyDBOU8tPyy1@Xa*_GtpI z;`%C2Dxn>%4044hGfd?ce5$7bpjrx|kNYeB^-Uqr@ng?OvN+RdR3*4nzD(&E%I6VT zFQC1QqW6reyhcmA-fOhEdgma}4CVXiQCNoHa|;*`35zD->06%bOsI)xA1%$P2pThv zrkq=zPgL{)XI;&7=!brf8U`T+@-lWHCAUs)jJeDDh!%TCa~c+o)3uWlFw&f{zsa2N zefsEl(ffUOFsKpV^+1z52kUpTSxQv$`oJYaAgOm0Tb(C^Nt3{BR(49se*Y9WYw3%X zJbR&Se5Nr{ILQUA1Xmvm`mq+o`9zkK+n=AUWj5OHm145o189Yd78@DLl{KUoasCF* zN^v*Wv{_lzlxuo!KfElpm~39yXamIu9LCSXS<{WlQ7UXvf85cwCqKnAdbV2(+nwK+{T9UK#lV{;mn-OP1QH}9!DQl zFrVW544#!5YS&#)@QC8Wd6Ceal{eU&@AxA&d-Qsh2Zv0Ie z`C7C!O=g{a1_w$m#>n3B1ubUCKmGQWYJNDLi}_B%_AhI@r~GO7Tv9k;ny>>38VU?= zz)BIt@3iL5ZTxtLY$<@a4cd5S>~}w$%JAG%Fr}${)+&e7EOHm0A?pJVZEby1{H+Ay z^4Bv@a5nP&$R!XSIUzZbQ{#EwL3)hwV1ljtyxK(-Zk~~R9BW-WED7AY2E=Mpr*L9H+*iP3@5;fy_7lW%2<;7<* zK=?eC{~Mi>j&*#Kqz9RlcK7IbXYmGhmr3Dp&=39NAgz`FI!a1Pg*})jktdkV#f!m1 zpXecSaW)vg6%Bz%7unOgn`Uiw?BTSdS!}9#jdkwb(@a=QZZ2 zENjmli3f6LmGNuV6)yerZt2wMKa0CKk$NhMN^%Y+!mA^F!{pKm&U 
zW+dG`8IZNGCT_8Nu{2&jZgd)1S+X{D8s*4M+vXocAfgMO5-UF{?ahi3wY)tZZvad0 zSn5c9F7p1Lu8!zM(6fIRE9q&E z`D3hs(YH^?SI1{Fdd_}v=zG}hhgpB~X(0TCoEyeiofW*#l;U?3H7d}Vog?G@bulkXkhA`?B=`A;(h95?CJJ?%~^O}n~`2ObRYc}zG0T1 z-@&I*iD<$XtIKO$`1BJNAifs#8{mu-&*nWyA*NG^9X0k<14fkKC`Q#EZOmpq#t2JQ z@Blolc6&b8eC#Cs1n>6|U*)JYQ)aOEfD}#PWWMa5v{`~SSzl=JJ22k5jpMy%OS%ALz zeiN);BCR_{_S*25ujy+z%af3_&#@U~r=O2GUni=iodEIAC;I-3wiTcj70)Ris8F_9x-ZT<#vgR0I;DfjF zV*4Ts(NINLMbD|?QROklfENP2W*w+LW7Idw<#dT+tIlOaHpJRD92*wZm6cziKl%8h zwH3#Swm1ZgFpEcwktF;W7J-#o`DX|Cn&1qE?aDGR&#lOadOIOIZmFRaK=Gyb0)Ff~5Zom`G1<;?EfyYQhV{N>TzQ zndK|I3~8)n9t{KkZ?dNHyAE?gyFUdEWi@A0e(5$BlgUMUma>8oiLzZX3O%1)Eh-bIDUF@s+R!_w%QSxNsT5pMK*u zs?%feInt`xZaDu?8XkNJJ=G%hZk67!UnX2PzH~V?+iMWHYPftr(rUKYh4eCrzLJqV z`zwEFGzT&xiYYB!4{+Y70)>c`-wo*iAp3Ih{UHgb^+NAAH7TQdkHE&QmjnM$fg2dT zZKYe93NN{@5$mF>2tSFJW39FQOes>^rr$TSHIo865EOcD^3@@q*31VRd1&?3vZTWs zVSOp@dr?GsZ^VKcCseX*irYhYo*z&Ts&IUt!}LoLlkj)eiFyV8=!(!(Deb zqXS?G{^NSZ;d#55r9rT&^skVGUy+8X#QU;C@sK>7a!keDPWL#BI`sOHR<4?A@KtYv zMB!rlZl7P47wFahN+uMRwk5Sy(qSypW)Cy-*a!Wz`I0YK@)2|ik&-99n>cO9g*}O+ zZUrJ*a+89N3$_q}_GL;3B<`-|M=IBQ2!*|j*_W4sQ7_3@#FC;NAL;nGWeGe#Kr?J~ z61H8jp&Xa!X0o_@Dj@N-98%ZQrG@uIk~lrD_?%e1uRGn+22RIAbYu$>((hHjO8tG6 zfTWc$qu-QZN6DR%nZ%xx`PlKR2ntxjOiC-a(LU0eTj?j;oqub#g3~L1DDa6zca^2J z`=hwjWue1iZA1r5E@wyx5BbrDo9YAUA9M=I!Crc%*DV96(G#fh$u6)ECeRjN*IyPr zfv-#~N7xB~4{Qu^#f>E#A7oR~@2bh73?<6zA_+YF=iUkC&)-d<%)_qpytKE%sP~&q zl@RN?RV>a0X>OJ4TTAZ-c<^5QJ4zAQpv4a_53EkZ40#7;RQm?buF!jm9iYZaV|SZy zXKU*lU1z-m1np-UBMke8|0t$~Ugyz$v_RP2*_6_rxy=;)K=zegb43@)pD>4IN$=Y3 zDCS5dh@P$N%0G9&Y6(w1upR%`*FMkyi+^#b>)D0zdG5;y{(6h0p3Tvjopzp6*GMex z5Ig1L`Paig+l&30HmCH2k)ia>%9k^y_iDETvF5(Hc}hq=eCxvj90vWDwX0n22jjS# z&&3Oglwn47c6VOTqL*jCiEMc;tP}=si^CM;`o@ln({=>oeRj~63!fzP8Z{hM4y7PU znOaglx$7O);jXP}HG9LAC1<#N@b};T*a7<}@851)HS=#+!?)_n)6iMs!JXs}Yp@}; zDXDcsh|tG_NtERmJje_D)nP_NUCcAGjeXB+q$H1j<1XPeT%7xI@;L|wcJ-bpvI%6A zupQ)YIG1QYxzB0A|3o?MwBU@}{KbpT^1iFFID}exVKV`jiAt)PDtbv^%>;LJDe>gM z?tc^TAeI;Qa(8xfF4!wYw5)%)nC!-rZsff2cbJzM_j*x=_T1+)m!cQt2^AC0HFq}_ 
zc5mXcwZ-#l&pv<5?vN@V-T=znS^ z%uisX^`YqSULTMsmy$deX(#lW^*)QIMC&R?Tr+Fh=2e{3R;Zme5K8yqNvU{BK+zR< z>m|{O(;%;s;=S}n*TuHY-PsFCCgEAHN|=7;b@Xh^eEh{?1{R~qxMeep4#j%kl!h3O_zdJ}vZh!^+6pOid}n)ZA-yV@o_GBnkvQS*7i0A- zx6Vhaj?Qz3RO5q%`mVQ-)0MYmgRi*D6n!sx8{Jd-k{12gCZ>KFLl|(4h4t1QJs7Z` zp!{3`y%OVgZAUF}zCJbPWc2=9KS>zo>~ChaxsGEbInDn4Zku8?<}h1>3i3?nHnVJ( z#?P3S!ZH+ozfK>ABaEetsZ;Jol zI-3}dLtyV)MxX4uMyRr8G!jBQPa~5!eO#%?t6N>-_li;H@%s<&2=k9O1eVD$Mr~hd zi5l4PV)T6~f8_23phKSGy4@TQpU*XWm^SoW$I@3^eT`qF-X?DdiI2xY)6rnO9-WeNE|A?A~g_LI&awE)`ZwCVYLo z-idZgn-l)eb4My$3V#0Ue^mZW4*$&-C7fV*)@%v=h-55k>Q7ySAU9UNRMuTWc_gYF zH2+e6GsgXX=6)}5V` zK+3lhXDRE3{EMCmFnO|F+I-ing)paGQNO5h8bj(02XX=$FvOjHiyWi&c0gEG-z8bK zGo}^0EQC@v8;;8rx))M28#bG=I3fxi{Nne$Akv1H`S!!MQ4U>& zR5hA2_D#7tUE4%BB@aqXFWiW)ZuDZ#zOKKewBOcx#lBA)F12~yfhx^)>Z!k@R_>A2d_3J9}(~aAo z#9V8pocqGjiTt?j5384%uyW))E{!$l!oWRhiJO605C?D{yOsZ&KkLbmi#Y!&*u`QE zq*P3DV7pdEv924zaQ%~D7kjv9d~9u@(u%H2hs)-)m)?iR;kXGZicEJM#*s9$S7Qf)05UQ=^z1VGw7c&ehhZ*(Pl9sSt4jmI-c<~ zHbi0_YLbAA;88nqiZ~0uW1PbD$n#8}?=c_L+z*3e%REMyS5MOwj;+rzk06aF@ER>V zRX#u4{vQ7}Ifx(-olaaIc8jD(?+nMGFTY+i+h4!x5%=m7l9nC2=VfqKBP8;>Ti=bl z`tJ8d6&X%)VgUN;nBArKQ@I9|>p$A-uilv7A|?H9v}{ zshh1%{!y4uSNb-@+m7-Pl;0GhLyhK=UV5l8xXp1uSuEcezl`6hvHHQL+CYV}m9lcB zORM8})k4W%+Uu=c{+d%ja|c(zC^=sIlF155O{JtHC-tSm|$S9vKw;jy@B+75aM}z}Uh0 zOxwcfAonJJc{CpUB;0Ssc7T-NXBsgdQX9}lR_L-{7~@6ucVpuFUink5#uv&bArcMv zS)t1L-QD9#0pXz$UFVzanO=LtyC~&t1!z?MUP@s6@Z-masx{p3$|Cr#^#~%KK`1U1 zH+YoYMF;;hY~168>>F(WBsK4RVFPyVWB1P@qx`|k2uCOrBEvu=1PJUVW0nQpCz z{oTI!riA*P50j&!I60)yYfk#(vJ=X-@v;46vWz6wU;`@Hz zdb5yxV!94!CMat*!4lMolO6K5q=_8jDJbC*ID8z^zY}7~+*v`~FgzWlD z9Rinlp^tQbHt^@mfjleWnm;p_PXx_N^*)@H8ZRtNw+X%e7YiWpyK9pB1yxUJ7Ccq| zDsA}He{-`v7%#E-p1zQ0g z1eJ5k*KewyhrRmV-<9!TvY-eavBoNM`WND<{Dhc-b``K%a94L1D_BkV%|f|V`^^%k zD&?x8_p#{n|MQp9i1oDMt2G8Jii-zIt!oyU&c@3gH5t!tvyLf3!q;js_!S(Pkmk#~ z6nU@|RpJ-Dwa`*vSAwk$-L2`%TJc|67dM_&@xD$KTZg#pKq>sPMb<-m ztPAQA)J8# z=ik$9SY~b=hBX_rL`XB8oVDI363?S(rk?58?mV!@k5!U|+x@JK)Q9ITlVJO8v}qK( 
z-|uTkJw0kY&-wdmPaJ!E9EVLnK=dGW_1>R;wFIa__olfj+H$8wTT<_-oakmMRQ+hH zeNZE`;z>Xueq(26IiP{>ny{_mQp_>pJ=QFLJfUp!rS3}iwJ}{qngkY}?`a8t3^yhy+JG6@?{6zJ$ClD2IE#;A4)i=AL>A-T8i6v+mUF_R(8p!0c}EW&)GUZ_{;ml`x` z)25I`E zM7Tlf3ZtX}2MG1bzL)vKdIXu~2f*Vh0%|j$YObi%H(L_iuc<)1ms*DO9{qBskz_2DVLv~?{@P+v&seFmS^nytabL#oaBcKEI^aQ z4`c)%p>>#vCH_Wf6zHf{R5cv`p}nbHlw9izs8Wa>BB6qJ_PgP9D;9`#ouC=%H33&w zrhy!S{_knm*?2w#YQR1vRX76(QNmr6#$Jw!DS@-0F`2f`otb@x16jV$bJdD1rTU(* z@lBLH54ggB`dD{%U;(V9hw6_Nl{@2(&D586{aIY}btwJzp{Uq*tHGzevgQTch;CiP z?`*Vlc>CleLZ|v=N*LWewF6Ky6`pbkkg{L}(x1h1aQ@bkcWW>0)mNCdTtO~e87ne%g83(=OUnY8NWBqr#SS#=e*KZ5@j*kTj}H+SJdp39^Nt&qrxNx^?F7Ef zX6ughB<$g50OBa~*tf><(7|i+uAcF*Y0$DVJL~g3SLeAq*I?{>raA3-x>;@Pbio=a z&H`H2A3(G1Bqq(Y>C(QC{fvS4uVhgdrI7+%*6q5z2)!Rj97LbwX#t(Mn+nus1a4~N zM4cGwSJ{3#A^FQYYUr8*!q>^jr0tx`u1)8L_vR+&C*G?Ss+P{ZjQBMSMSgZjz$)ra z)*Rr1h@w?3x~UZAy|%AiLayx!JscyKOeE%9dlIKVq-od9AA*`MzvV9`tMRfd6Cde2 z|CrswS031KdOSy!*279gwQF3t8o4nbcrhGhK^=;rGX=*LB~k8d`G6E6p(N^L@ag@| zZ;p$*l_AKaVi{5dq^>*6>6reUbAm@fqc=G?N+P2gyidEZiN&M%P?z;?6t&=r*XcKO zm72B3Eq}A@|0T}lKjB}6$qp^gc;B;iV}J3>lEoo{sAob{Mz1C2wEJowPzZ+6VppPR z+G`IN(4?4rf+#OAs<(%|M9&UR=8=k$vNF1sD3Y)=S#7V~M=ANGq~nc+-mOpyo*3v? 
z7(%UwW6AwMvu<+W{Cq9HSP7n&BD1uM7OvYUUaML& zL9;g=vL3N|Ueqf zNxJ+;4lUF9>BmY+G5hPI`M;h9IxKhuMxQU=zfi57pLi;HEaN_ogrZ3Phs?}O&TUP9 zES$J%LemdN!h!FwDCT$u#oOJ-m;0{T^U*cS&QHtwCtN?1TFJ41RPrzj#L^|x+zUim zQP-a(p~HK*%YiD9#ap8O!tLc|~VBlCzDRZ&4x+w6C-p zl&eh9_N@=!+ekPl2f1d0Z~=n4Bs}-lvBj3lM!iV@zOrCE8^$Tl%zo`AiSg;;boN4n z>fRcz%!635?K<_ZPg1x#J3Fll`uX~z7EmX_J3pLDUvKK+gaTbb&CZ1Rl9`{yl|II{ zr?(~+uK1~x4<=B|q?{j2+T$<}xNV)o5b`eW#}Oer%%9dIwZGdF3NA6}vxd{?5%diiu%&HUZTF zM}DgTw{gUMS>V62)q5KAL4hm#$B*k#ow2?jAWT*_Jlw1%1JMC4xXL+0^QaBb-lyVsbO^}#c^yY*c(R|bGMV2ohKtgZ9JQOFhomQvSi z_gg~onJ_VW`y_o|#{o8WbP;n*9*@(=*3Y<3qjZ8u|1-WxDO9p!qM{r+&4MJQuraQR zy8t0L-Isz$51Md1HD>EJ}(NGx@b|V#fVxAqtpp2B%!@PMT-uD zgonmvLU3>Olv5}h2S*p2_eGK{82%-<5DoK7L<&FNbl6o-e>I`xdmXqqAHZ?{sQyo) z4a$1HQ2)h?uP1JAUbxNdY~Yq{Yt!0r+s;w9l%I^~+vi%)FJ_aA3s9X}q`J#lL2Zq< zPvS87o@_5da6xUY6IMGcXcqdwVgq0;wCi+X01-uyoH`t zcixHcEFCwXDdY7j*E%8Pe!ezjytLDv7Axs}&Ww9C6RIJI&dB>dp9p;rXRk9W>?T#W zzL`a3em&=D2_}jc+oofyo>~yQ#ysY8^xdbgO&#(eX>=7JXl(=&4z5GwyfYkJf+elr ziI1_7KFyx}0@r?`*Sa&7$zN0iM#qyhNkezZ9p1}-g+SbFnKjrIhvI4J<9wh@5crke zJ_F!ruIH{N-vTY(w*Aix*sawQ%}e&%d=2-J5v^i*&^feTCz!eM>-y&Ww%hE27-lb) z2m|t}+$@rQe)?&hHj*gNlBo`N$GI*W>2tRAecODd&V3hVJI7}+CFqd&JOamrTaxtx zn?eiNgo;PQHqHDw49&&T&1HJtw5{igf<_oTX>~8 z^H@?+nm1lrn{sOc{mfI0#w))t=lTy&SW_5p-Gd|+vj|Hbe&j0J=Y^vx;noSBVLA`N zwO1y0FgGzgcATGzIXq5ZoS$w$s2EF)|LI90cd+UFh*S-rvASTn4ub^ryYF3BE8iNt%C;$)R`eXe@iPXtDam3LWz93wp(PGm4TY}8U z$^|3wPO47sC;^35$y%g?#YQJi^1?;wNu&h@qp(}=-h@R#CC*dfiq^b_8^6Es7>8KR zxl@V0%Ha<}(w=+H{l!=P+$)qA_aT^`k5@d=b;S6U0E`mA1 z@OAu2HJo&5+`2JrP(fAmJ@g4qbN|Cku8N$nVa!Y8GfZTI!tu5NQtb7{C~$bVLn8a| z+kJOhZ?FppJe+aAv3xk%QRcjxV_Z3{79Vl4(xG=ns}Ua=`m>nkqZBXCPjZvr_z&EyTAYL6w%zm|LZ8yEF@W6~ zta(29Gl45^qCELDP$CqA)$7+H&b%F^6rEi&*%gtVZb=^eU_P-r#t@ZxNkF!cgGK1>8F`Juq`eqiNtgV9&@zce1IU{l=2JUa_ zUg^J2J1whuQ5Ao~e0^7NRg6Thi|VxY9Whgu-cQH7ydZg&|#o_6dtx^E2MyvjDZ+S!jkHL zzwEn=`UBi1}6jtzDe_LL`@)s=e^c6B&2_MwAlR1oJCNC=x zp6L=xHum+d+$Po`ow%a&SNtm0d^ZNCI4H1D8EK?N+FAO<)k{9|VQx>|V-e>k)#)2V%L_c+ 
z5uIBu@>$Sae%2;7yeBwJfV63=`{M3W`WTy(Qz!1n)ceCyt)I!-D5CeN$WV8RZKfgW zf9A0D_3fX>Zu0Jwe)&~YqHwEbrI6z3)!oDTKwo)D5;v!L)wbs151q5d&9~eV=g(SG z%|x!0d`pcY=`LAz;NT$JVn-!1@h9OKW8P>kKIo4~NPWFk$d`qIz|iVj)YyIVq%VO& zOJcqfdx%AlnqdeusdA%K0wv9cspp#%xcoKMHmoq2SnBXgA`-NW4nAQbbaI1u6mFB^ zp{!Glw~^chBCMouVY^cM8+)v>lXFD;cyhci%y%UF<;uosKfwgHmwWIlGvC|SJeiw5 z9X41_Wj7VO8lE?&?(Lzhk)bk=kN1gE)mQu@FBMuYhMu*KQPOjwJSG{>uSBT_6bEYi z>C)j|=HnHVr@G%fRY%A|MM8wZ$6L{eg(vA$lFePbth9H);ze`%U9h7~;)PG994+dl zE_0ceM}dJrPOwqrQ>pz+gow`0(BH?)jN0>EJbJ&)F#+wIgW4bE=x!-KNjZ64JY?_q zWa6(VW0a(vp}EkJkhM`$GA41xpUqZDbz7I_x_lnDqJ($ z+QxRv*lzG-njg&xd64aa{|}HeOtUcrNf?QFaKc1KU6Sq|iyYWbPTo-^<|D@#qgC47 zLoIk<5IsmHO~e*y;Sw;>zBNV`pvZ74ey7uno&3}nyR`4hQR`bwBN-%$r88b@jwtxX zRnOd=asbij1>MBl!u*@68&c^lMf0t(C0CkQ3g5-#LX&B6{w(vMalwJd+aC=(ZM0n! zfuuGJmVxV#u{oSh?5!6UIm}a%Bpc6SgqM(4S zBZJ3uOekqk>Fnb^6m68@X(i$7bL*OVdQQdnw!>0exe7XNRBxh??D-)gK?6^K|6#`5 zqgh}VcfBLMpNr5R-uwQ#*Yuv_O5k5_h2IZu2av{G*2o^78_jWVrL`?}!P|aa=UJKI zoqPr1Uq@LgM0n$WQS}ynQAKOGFf~d@D%~L6-6<_fND0y)(n@!ybeDvHGz{I1bSvGB zl0!FsYtFgn{_gh=%$~jWTJL(}2_mEKg3y3NHoV@9?V^iKR^Ew&dV5$Fod!4U74 zl0W$w#=|I8t}pJ{B~frMnUTM(`&}P4tWLFJU-u@HybvUWk>_ro`{f~^aTvO93w7H| zc3i}@guzIK4(AW#e@aDL!IrF!PJpO)S&)?mxD}sk{X5^A1KCj z)5Vd4#uC>^K>X=M-LD@PC;XqhmYg50Pjm|8(dgp6hzq^M-H4yzIU)y3Yb1gI!t<9= zNIC0yhz7=;70Cl7z#WPi@kup|<{qDe*^dpK3X6lq509HO59=VbPs}ZDXV%a#hV{}8 z5#i(%VzvSK+WQrVEHL2&%9C?^mSQQT!HcjsCna0 zTqzv|$*!}IAd_pO&In55fH{jY|3Znoz-!I%Gj&Xb6DcGS+)0Gk-kQi5GB?L{%d7J- zEBx-wm{r6`o-9N+!paMT_kAaKeh;GxdjTlAZYy4090Y2dK5i8-;}?7K%mmo2+$b)t(tUDpf4hG?+27=c zKv;`$EL<&@yXHoAAR$-%GLmg=nbf9r|6DVL0!FH^icmlCWJTGH|0K5`DUcxCPkC?- z8X1GmDvfZ9nBe#E9J4o@z(bidYd)(km@Ja5kNmRJI?9yKD>#YvCvm^gngyya^1-I* z)9;T6-jUdNnFv-VYMU57lTU4kbs9f^KSjzJQx)xKuO$+&`}T{X-`tImOoZR#{7wz6 zBQ-hpyhJB$Y2IFS=&$(P$^M0uS81!G1wt5PBK z@VtC}jO6BWA{GRZXGKEJuM2R+@WE2yAQ-Q*Cym7-oxmdaT7@$r)*j@*Zb$I*iJnQWlyvD9@nfqK-x{!kHa#?1+rvI37sGS09;GUHAJ2LgcTFZ(W8 zdw6r9A*MbEE#o@9)%_m-p}Zq@YiN!(A&^@%9!8q@y)_kmD-WSYOz$6<5d04k{mSzh z4t>BcgrgI3)y<&P_i|dk^%OZ^_=FIA+Wc(&73zhIAhxtS 
z9vJ&x)7_Hr=fJgY=>dlnpA-Q0A&Zs@n+LnZ9mCZ52VQo@@b}mN8 zb95riM$Xr=@$F0<6-i>lB#Ik#)A=83He3?tKWALos-!@kUoyeUwN{(gUSIRwtDzPl zwnnJG!&X{t7ei{!A|wXz?O7syF>AVe5-6A61RN27MN^Ib8vdqRl`?f|T7j!T$aDx$ z>C6Dv4ba56ujUdt3=Hj*$(YG2N?%TsZj)j5$LDqWFB5bBE7V|GcFS~X&&}0<8VvsW z6-0Vz^LK3iry!VGKluu?z83@|<=IrWKRJFYu#7a~!a&Wk0Q4Xz96Af}&GNqVhq0Jq zYDhkXY;25Sh(L|gW>^QPC$=TKJ7 zJ^ZM|!DK4|Tl)eqKw};SRDqbst8Dd3qt;rxWt|7hY4m@#FRQA)TWIm;*7&W~SYQyT z@$ElTMQja)qlg=pXBav~+|qE-!Dx`^(k7^q!Wqv8lOT0^KhG}ehV(m2_*|a!CeSMBzhCi(pV(s9R{}Vfp=UH{ITnB@*u&CK zqW>>`;{_Q177UEYDQm1ve5bXW)3397SCI_lbMso8{g!ED&!0~HXPm4QiL?gB$?0?Z z1AIwexb=dC1^&9d)zv2;y1ref326Tx7NFW?p%ya4QWl&=$dMI`5d_raahl#%oA-cG zD+xOnX(b$a5b+FUbO7)}IkIsI!8vi>8<11!xI9*Q9`u_=YI>p0cYDtb@PgQ17AYl4 z2aotqVn%$Zc6f^j>FibN7xADPGyC>#O%G~bUweozeC-hd`iT?`+Q5?9bgv(ibfy`H zCNn{>^h1++^bG%kFc=@#g?IT3ab0{TuNj<4PS zVX*wE^1M3Ia3wkelAz1vF;Q$(ppV}Xo5MWm(7#*u?J&MWt5@*~QS*nYnl<_1^P}IJ z4&ft|E!xb}qO_S%9xN~K`EU!#VJh!~IYno*HEj_&w0~^+%`%PpTMzVbwC?-Q{DfTK zBjYAyVLpX(O^E8;$}_>g0MLWno*5tpY~Z)t8vxTT?@Wl`{K!UHLtc%c^Hyj!;y=p3s5cEZ-;m~@y#7hvv9?h zu)2?K3<7jWX%4zS;E`8X7KKXw2OWZ!_Ur89{sqJsru)S5+iE0`_CU5=ZjL*BMw?LE zah85D5#k&Kzl@v?-3r^_FK6AFJEPleKZVt{Df0H(`5s5FA`tLpRV#s3pSqMnjaIDp z`@m%YZPy(;`ut26mht-6B_TlVYGW3rhpo1p$2_QWiodF8WZNj%vPM{Q1r{^tA==QD zvI?l`_cGF;b^N|bCDs`@lsD>24t^hesSQl)a{|HkCYJ$#9uQxw2aUfZs=@jYU4TX? 
zGmvdwT^9IOT54{3G?~|ycY9FE!y#n{=oWl3VA~~o4xJ0`o|5hi6OW$ zSdWZkokkny2?k?eMlg)2v)Cmw;#;3Z3>;ub0YDi6sx&vX|BLifgNHJ8evf^K8G;M2 zMPX-Rdb%XNwlqUF_eSGiR3G=gJeVtW|EjV#*K_urjtq=%lDdxl7>sgx`p27lPVrAOn;kvpK%@lvrh2OO)~~Bim4~9WfTHllt<$uX&hJr zQncyJY2ZCbIz9R7Y=>^R-8*zCljVJNs?&aQLQaYGEdSV^6H!|lzuZqc*&h#e_&r6v zt$MQ~Zxi0hzg1e5q7i&M6b>1$tk(we3pP^pC*KYxg@EGV=Qx_+%sJ=b^Kh7T+V>pl zC-7C)G17v@fQNfKnX%2ldGl>tRn4-1WTu_93;^*1fHh_3_@z}D>wsQqcdWd8B`Rz| zVEc94H4WOYof351I@*rD*-;Tzd36D6jkA^oVhNSkBFsD=S=9}U4^-rffwOeK zno?B09HwVTg`Rs5YKImD`9c9ajEYU0>4(4*3uRp`GGq9`qCgoTu0CU(m8LVpnALoBoJQ%#n9rAC=zsIRbVH}*f-Z^R zSR+RB_s|Nk!<>^6lHMzTxEB0`R=QvWa8X5DU@!`CuMu0>-mm7%S<8S}E;}G>A&`+o zjfDLSQk@FELj-0R+RoYlbtf>Qu-*n3 z`y+z5LjvOwczRQBn~`dSmf?r=m%$OCQ8(|`FUBej8QJ-&ogf?C)Pl|iUh-6_s9t#H z0C>>oAs`Nw9)Y3VZ7|N?K4wb-pMD)Lu+_E`aM{2M=b7i*uBdcFN1w*-+`)+s2cgKG zFA$rr1KT2i#k-Shqt!I}uR80oEPc0P9h$<rHLOG4x#wV`d9KjX8>JqeJl@Gvq0?xBP z=OqTMrB=TDtM=@L8WXxdX3mce9L9h-65!V1FTLF&Au7h<}JWIQJS90DYA!o%) zb4|irpFPxIyIAg1h}ctH{B&9qu#mxKmv(b%a+pyF`Is*rWJo)0zN`fRgVsMLbJ+e% z5Bm(TeumaD2jR~Pr$SGbnuhTYx__7Azx+KZ+H$S)@k=yP@uEI6O7m~bq%%jDzH9F+|YEe)mxryMz%WEA8$Q zSE-RHq`F7_rHBYY;qyfAhoG6^R+fBTI(cFU=(wbF

+@+Z~d=8P?Ly`f2tw5vlyUPGx&p22qwk! z`_=!Ywqn^j!cTn9ng;kBebF{bLsF0y*i>gtgrjR+K)y!0MZ??7y*`re1>5q@WV003 zTUxnh>@wyb%rBl5bNpqfrnCI;mG{u;t*#5hbqFBg+6y~u{f9(KHwxS;%B!m#wojX) z)vIp4W2a6kjb(3&I-7cjo=t0QjvYk7(hBaPta{XY{YM+A*@-cw<>$7K8z|!Fr4-OPzaov|Fz=~V6VC{58KczX!A3vcL2*X|fBW)2 ziyse~D64)ljuWy9iFIx#B={6jf1W2=O*3x`E1~u0%Qgdao*0tNDl1ssk4&R3-EgK; zvRCVTH#p*fq~Cy!1LK_Rr}cC`J%{DUd>+^V4XNYBUQ$45s`fj71biL7emXs^pUepC z-*7T^^>@cxrYKg2>c$lc#Zguf=v>at;Ryg}-olFC{l%lCgi7kXPKUq~i1kA@Y+1=} zd&ZIz3K4iwzF8#ueG~w3$<7)7a<2;<`>#}+7lPozS)q5OT@eA`VS6FVz}Q`8@Oh7& zu9W{4G5dTK_1k9bBKnhH^lLsQORukqrHOR<_Egup#!F=%Us>%*)oHs8CB6h$s2Xly zF~WR=V`($pxjM@D-yBH`o+bvXStV7dNI{^0I`=);|tKZ>cT|rrIzp5LORtqEZd+q?z`x-(1%T+G9&-!R4Ak!>h zJcUie>6XBVVF|MZpCvhJqcBLnp#%wop?OPHiTW=;l`B%+kLOp4-)Yi*?Gu#hzHKS1 z=sa332G~`SG-1BMHXH!dbcXZ{Sopy{9gxqI0tT~s{DlsDYfsCET{3~!^uqa8B3awT z5wG4=gE)Q!EIpTcNJ#Z0g`GZ8XNokaXDy#@4+}aE%yWb?aUNR$IwCrtBMPk-gS#u1 zNHL}K*hxozJ~NCdHZ!w4{-g2HPlNRuiyF=PPk!JQ>7iW$xl(wM0Xaa7+qlhF z4uVbfhTz+1FkE4QciAoF0{^q{Jw1bctovE=FvwHQhLiP=5W{0)h9NEH!^zc;Mw8Oa zYx4f{rn1~`36le4O}1_NESoOHm1H52?~Lv)JEp(b!O(#~>N~!v94R)->5{lpbYOL^ zTB3nwXN|f(+jwF!Is?Bw%1?6me8ABtNRi+b4>sv#@pcebgZg|b zklpz^MQz@ZtKl!vjx#8_9iNe9=1|J$jY9Lw_5gEio~8AJd$2xx3jDfs4{{uPF@_e` zrxo9dzAix1(=SK+Kl1k)h?q~H#K)DPNG_qPAm9#yPS7++MQCdS@`Oc=8-Uo=B@vFU zB1Brl=;qY<^bf85P*@yCiA@{WV|eIP-EwE>o1X*|`=a(6uh3~_7PPS-pk9-Uu$2Rh z2>X{B78z3Tq+geN#ppTY;7Ny~B8wTwJJJ7BzG(?je-A@9J(@!Eba@Uyk0$)Yi0O)u zb2Ws)a0>5pzMIqG*`&g#V00N>;HwC3fch-BGJ`o*u3e>I!X~~Q)oA*(_P@8+QRknk zylcS1rT^evMEf5p7lQ@foeM|;?*;#E=Cc(q8sd8+7#PRxunt_*2eoyIBzal^6gm@HBjjxL^-U)e{H)V{f65o3t5Q?s zCk}SX#bjI^5nOb^$p5|f&li<=_st)2$L4z5_1#>q4(4}fAf5y4-FCUaj^U+AdX+J= zkt}3qTqXJ6{jTlJQA@)QwLD=h-Z*|W#0Dh!m@HL!rY;sy#~qW4V+EPH8jF4m#TpU? 
z&=4pl!vFjYoW_34VjdBhYv-NDYHgwKAM@m*61KKZ3Wnn{z2ANu)ukCuMsyJv0?Fwz ze+N~A4~?;ZFRP@OjTj&}bSFUFY9L0UodqC2YZLy~CZP{Ze9s=NHs7E6 zqUifC^Nd~c6EC*{ttg3Sa9U9;_SffNVTFxfpD8!-Un41 zs1%n=HjAi7YZgnjbE!mlFhG0PpT%#r{NLl17_op|Du6(X!ARZd@GJe0vOImT$%RYe zKT@X~k9{*AYF0`bDt-1#_wr8&rnO$DRk|c30I(xSTR*naB7qgAr4r8Il|`go2Vw#5 zg1Kmeh_jYKySC|0qunbFqjFK5TrB9b0K82vt!)G#Is_VD<`$Pxnwjf5Dg|+I@{2!|B&(rbsUo*;!dNhyz|Tgo?zT%Mb*hoAer4 z#}^aNeN?hU+2D#!IT11VFF7XBqw6L4cYgl-#$hqU7qTdwNQsAjY`_0s*M7kj&5 zRXiSCT(XML`8S1(IG^e*SaVH464O2K7^7aUE9;zy*PVA#Xdo8yAhO-`*+c@h<1Lxy z)5~z!ey5aqN+QyF=W&%OgN8!-&b*k@pD?{?P))OpS3GIq;^NYPi!>qQUvI`V0ka(j z?k^jK0POFMtYR_MlQwkCPNEnoHIe~!s z-MU}o4ZOXOJ7+Vb0SWM5?Fo$;{7-gjGH$24z|~L=f*e?&N;fhBUNN~7AYw1Y^nmI=VYWIZ~4j7H`l)3rSjM44jcG*HtyT(!9zS{-rj*ewIA7y1K6{K`@!pzNdBF(J)xi9KKc7Z)PX~nQZDfB&Huk? z5vOSc!Wg_{Uw}qLjtq>IB{e%+GtM2(RRsed6Z-0qYG-g10<_jbDd~qRjSYue-B(Mn z-3j1lefN{`>k#hgGL)k5l4MH_iD5N8@ds~(@wzgQJfn})9HkAV&_t!m=eMv<&K zu*`+U0cprVz9b;Lyklr(5Nqqc7NKFP)YdseMGo2AaN}>w%;I2~SGunI^=X3OzY;TZ zQvN_(j85=UO2mT^=>anbB#_E2#EO>H|M{z z&HnzX!Xwq|3?i6E!Jkk;EbWF`U{?tcO=>87<8$1h=@MyBPK$5q&WjT*l|;<4hOQ>8 zx2FGxJTHG42HPJnuG24s3#ttc?oVgEbaMq7pZiCzeivM55IOcokN(ha`G|NG1tYE0 zXpKeJ(e)pfpyUAhf13nPUdP}?4O>$a9M{FshR(_5JMTZLgQ5r)g1MNu46Z>(%DkY> zM7N!K+mPS9uh2RQ6h}k4;W{JH9<2fz5MbDPumT0BsQLN%qY@k|$OHsd5BT;|^rxb` z0$WP%v>;{|uyC6E)FBqTwRqp|_3f{1jTAFZ1(FNwerLKLgb5~`5DU2WTRgaHh5(o4 z5*;K^t?mz>%{Lyfz;@a9lZp&s2R#qtMB3GUP)9Uf?VCXEfug8gUaT|sQ^@;)j;bNGy)31(f&9(f)Pb?zsFTw7Kkom;D5FH zPs``VJY*zWg8sB`0*M(XAL(s~p*>SFhhd>~J|UYf-D3S=Txu#2n1orN8;(##$>_3< zfk3=J^2PZB6tyFoRQ;WBtqpNB`HY906C1YpBX87py_jT(^vfh+0$ zq+c|RXV#rcs8F+)2Fkn0&y(BPtti%d@VwMwmYuv@PbiQ@^H25oG-h<>Wt#8}5DE=ehcTQX$EyMX-ve!wn?wG+Ch9hN^UtegY zZ@#LMz`FvRK22{h4S{qod{Ju5m5Ukz#}MB3So~~xCipAH%slPxu;^h-w&xgYUAsX( z8K1b=4}bk?BuxixEAVWp*p2pw@iO6yfuW)JsVQ6L4@}TD;nX<{0#OU%WgwkPjIKu- zABW39F*q1_^jb^H=U6M~|2>Y6RT|DNUtoOr?rg4*LJmC%U0&R*N-k@L!73+W()D8Q*_?3oB@Kf^U3a zpm_}+UMeJ)1qIQ-!!e?AAeEmB8~n^@Vm?GKlWdn>+xpM$vX3pVahSK!m>3owaD0QPcdkC^{1 
z*!3FhG(F7$jln1Tf`NfySViLDC7g=oqEV{#1|W6bf?_ENWZjYgGcWFb`?v51FqztW z1PHyMyclt2G&HnR0M3VV4b8v=Pki48oQ)p{r`QJCU&mfSZzU_~^p@CE!rL4#gsW z=5CecvB%VOe>(gFhA&H{@0HB%}5&0>9n>XQy437 zA%oKnf$tzyFvf11Lzof_QMbxOwh7QBHS{|EJ+_bmhDw)9I3Art9goRR)c4h9;Z_*$ zoBG?rjz1_=UePU|~RA_XiFaR01xBr_(5oH?Oe8UV`RF9B?Tc z21@m2fc6z2L_%yC2;g2BvrAZ}r!8`NWPcXwO4IknFjlbLpxfrh5ghG|8T_{D5gh;v zQWgtjZ*=x%p(@v-SC$k1z(IXX&+SWKP-%%#5752_b8+Cx$`u3q^Q|Tr1D_c*bxiv@ zTeLiUMkmwCgNniG&$ir^YL%;(!XzHAH|aP_^dqKQ0V7}4{WD^#pM{%#s|2@~BE~gD zN8;q?^Syr95zu-{NGZFwu&`j^umilFR=`u;>(OGdw6uitGB2osPO|Fu1>kOEttKYE zsB^&PK)0C?&+&4%5?iFc$Mfm=Mw9#K0ye*%f*&zK|0$A|Of;1+3TMctNZ|Z|Mx%vb zmbDGOxF!PTu{~8U6D%V=y@UX#ZP>o!p`g}#zq|n{?iB%t0V{08)E{X@?HG>{D2O&Z zb%=rvP^x5{Rue`v!5x2&!!rPbGQNG{jC6wb`@U?Dq5k%{-aJTImWcCb%jU}}tS52+ zSGu%fj}2_MSYvB}xBtTe#M4D60x;O08%)~~P`3Q-Yf;wf9D{j2TwDzRYMKV)n(yNo zW!(_=c#&46w7A&-PWD%Jok~e}TU@^{n^6KvAR;2EuaNp6{?bvq!t(6t9|Q-qTpIm# zGAk3Pz9p6~d_j;vAA9irv@U%dlf9ee`L^rUBHMc>#N3zCENGPsl6okVBcD#Vo#zLb zn?{|PO*_o2^<5dI_-;(VKlJ!F-iWpDzlR1R#Dk81574Qyw7YBUAmtv?$S|Da?3?}c zCmq@b?mtdH(A&%)?I4_tToNz57exH$yZIJXhmJ#STxdAqH+Qm9b~CmB-ZtJXrS zMSHSvZ3AVc??wDxAcUckA6M>%KW9MiyZ{7AW{sy6`YmK=UXLTv_=-fA;k$0iiIUig@%zM5=Y2 zaFY6SdKrL72)%)MqlKaz^}22WHm}*~)*xW~rMBSoh8Bs3i0x9ag@`>pJwGMs!LQ`G zAIoUa>Y{2b@iX+If-|msZng=#`msfk<{Gw3*V+r-WR$e&qoVy5ZMs3jy0!)7T{8tG z@ik03jQn!V!^3oKvNx2GUdpR73NT#jf&;wGuCMPvuzld)y*UYt{p9ARW&zIS($yzQ zGt{cz;ySnn`t>&NoiP`3c2lxe@3Sl)MeRXJxk~q1YdgIZAshC`WDM!ENd;`XOJ)m| z09I-L4{Q<5Nt%^AAKcSOS&0!@e*#P&U|U?Vx7gwe)qNI(nW};l74R^SO3Jmm9*Cv- z6lOAbhC}o^rY99ECXP7%OBWw$xc9`VRj{viv+(_K> z?~j!H*C|+{djqCFff|-rd#j>79E{E#3X! 
z3TI{SsLskzsF$u4=g*M_k|c>;*w@rFoB`iG<$85!HyyEC^0a`Yxmcsu+iZgDwOkN8pa^e&{Xs=E!)jk3_aE9ma`@=OKm z+>UM@E@qpCZu}$`js@GR)@~r3CM|vf_Zyy=LWHqBy)_4m4Js5~Fe!HsfE=4fmAQ`{o7BRH`bG zC?Hs#C?T&S*ZZF8J%%6Qb5?qDFF{MNpvc$A-D)uZeDla3yRNN~B(mf7pH&2*ms*|7 zuzhPYoYTl&KADgyP$#T)oi*$dI#V~JRHC8&Btu@4-tjsRkgLU5BXq=QjY##pH_v%; zf4M)ag2-I&>vq&}5n?!nJrO*MZ(vqVl}H-ig|MA^C+ZvRPN>Fr4YioUwCOmg9!}g9 znhSXBiV%0+)2NdS$JVvcLwn(aoVJMEB=`GiCzP$s#9vBJbmDe<+~A(<`wpDKwudeH zLB}uH0;uvHyu*D288nRVS;G^RYwwd@dV99Kd z#tRfGz?GO7f1Lxw;vqC8^lQyi9?+GfPc98^iC1yC=?Kq1ZZ02GMt)r1&*sWvsTx#Yz+?e z5LtRSqyurc<7Yo?lxjb&5y}@+>q6E&P|tc=gM-C`_%}DP*CN_%QL$wTtVX)LxMj2e>O^bb-}#>%>>`;YfLjc?5*_sOTfE1u!Zi9#}Oh&yg=QS5bWM413)^08A5ke&G*uO=VC>KYp4p5}%^a{i3W!4MV^YwqS%#fPHCq{~<>0sIs1 zsFk%@t`U}7-D}c$a1{gHFVn3n)YgMQ0gw!;iPwN6xjbuX(7R&d_oWenE26Lwj7mO* z#(9-h6Zhhox94_}cA}mC$f5s-=dAa+(5y?3D{_n)SnMYDy1)_xE$lAN`x@T7yV!3S z`mM6c0*KYNYXWCV`tS}S$OZJsAx}bsI@s(VtHpXmB3F%HHgt;yZvKI}+VpfvXS*7A zmo9zfJa|9bpPm$40(we<+h76lQjKEpMO$_7Q4%-);Eo5x$k`r~aOZ7&FnJl@C7)2Q z8 zc14l)O)WMb8|}Wb4heD=cUd0Ngc=bzck3?-(Lw#`wd-CIsznGt%>!qH*2@1m&1-&! 
zK@gMNSAV#FZE@(;k7R~HC?io9f4A1?9x4>a9QdNfNzrQ@>d<@rT;Ifd&T^Tv)`OM6 zEy@DB_VCb(U4OA+ct9xEXj}w~h(ax z_GhOAK*k{qMTpvfcWpx~e9l8G2qU?e?2?9bt96!S8JZCW?;<(At^{;nug3L8TC=jJ zOEeND2%mj;dU;F*sAio?k!IGNGVYu%WUwX=c zCi=375*At63D+`Vrg?Ih$C2t=*%*4`0Nj&@7QYIqi0q)*=D|gH4vnYrW)O+>-?;OIjt(mKspT(=xL zCb50I;?v$-o!D(X=*^O~iIGaZxjbB)K4D$>q6@#uBG;hHjkqf-P_yOcj;%q4<9LxH zRG(vaSovx6{R?m~YHJ;>zGFuy&n@`b3*V1zUZG<+FwR+pG@%dvgDnRSh zM>#{TZ?udsN`1{GVJ!0Gw9CkcRig-@Crj)}C_Ew4cwB-*h`R<4(hi9!?fkS~I!pZ` zRCXEo^c9=h3&)+ni+6o<(DBC!&_mD|H2dtsphbG37F^i_#3F9a?{mf7h&d;_kptR3 zTu=P8YKO*%`JzF+E{OUQ)_MpoZkQ7od^vi*(rqy%xxXwi1AQqWabZKF^=^*w9R%2qAqN>Q< z{JR`O7X4ac+oR%vEFG%wycnGTfWuH;;3P7u=L`I$A=i-n`lrw-4(nG0)f>sLen`J$ z0ppxPiAwASdbPHLZJdy~Y?HR^zVpxVB<-8fwvP(E=eXr5qNI7z!1JTjkBPJ+dK*?x@accke@WE-DI{ioGTkK#U)?xnja-m z_-p0Rt(iMD)xK_dn+(5ft8Bgb3tfKf1dA`H`uFDf`X5&gmYUw#UelQmIU4fQD7UXC zCuYvnENcj`jr%$t;#@k=TUuQtS*CI8()G(Mw|Np88yoN0Cwheuo)p~cE{fC^W(!{s zDu27?G4QTK0rEvr(!r;e9cfJDN>okvLyFX=k#TK!TQzB-o?FBWG^`w*Tk)&3afb8z zk{=htuZPa$#i4^d>c7&*a8oQoL`k>Y895Nowgv;i_ble?s>aqbGWs^e^YZ1)ciX8~ zJ1knYvfD!#I7HWf21XMJx3#g=Ovu-cPKR-Xv-~3OLxMy-+zOrlN&y;gs{hTI_IKZZ zSVfxqy#c9IZ;%|w;{TR8DKc`i{eEU163HeaLW(c2J^L7!B1RvA z{SWQ#ic)@eENU`W*7lCDo9wqN)sOoE79JM6o!Z@gV>O@hKcdQef=G$0@;2LttChw* z>X6LSBCLK{8DF86helH@SS*ZpsXq{B_!WsRwm(=(v29pQE3Kg`w0k($$Gv#;GhV7hSOXJ@^ zJlK>-*wY|7qWog%$lvoq>W|Lst+)MG60luVlPjC;;zK7|DU&yRJegO><#p|Ju$zRoq4+$y^ML_Llu56RZ?dWnQ^(AuSb-}J;t({jb0PJx zLjUO$<&49&L08Rq^X!zq_TX`+|0WM#3-lzTB%xbuX7xak7VtTc&J55VBnBh?}ki$0=+2$$8jo>+L^y`tDaXB zMqL4`u**eguG^QzS*Lg4pFDZE_-$Cj)NvaT?qww6218-iVhyB`J>7=}6i*G{pSJis zxSSt~;9cysg^GJGG-9%7O?jiL`YbbO?O1u-UkmY}vs(MEdevw&%h@eu$lO}n%iM^$ z96XbY{#N|ayw2wR(@bGLB?*O{H76wSC&I!)Qw8ibo1fA#b|e}Wx_S|O%JQSw7GNJ+ zgsrD@o8!tHcwE|q|N4>bN=nh*j`p^+l+62A+5E1Rceb*t&tCDjw=5YD;!uQa`6J5x zyDM5w(>`v{L6vaZj7pivk>U#q-v0bAq$vk7=GE7d;~s+#dYE_vc4O5u@5T`y-e8O=XD4eH#s{ z{H6zU9Mezv4Pu_hV#*i1eSjGH%wvIShK2%|f#NA7bnZ>-O}UpjIf5n6+C*_XZZyiQthLa7~${P&2fEDLCxy(#7ih`s7idjLJjqXbh zf^B8kcGcmV{AFkOygTNtQ8I~vU{?`{w 
zIscSIDno{oZ98WTRb*ORE=~p+F35E7YUmO1U}i|waid|lzpG4ov#nm}WHAI|w2adS zW|5Iv)W4Yt9RJqz?E81(i56GuIxZ3pJWkx$%*|Dw7A9d=yC4F#Jsf2c@TvzCc2a() z`vk__qZ#W;e)da^Q74;EseB(fuZkKot(oMpcx@M+f^Kzpa<@f9%pbV7=YQ?p`&ooJ>k1o;h5vvITVvb%AQ*nIZTYXg|GCaH>F4J~Z+8nv2 z$H%)IVY?i=<)Q2QvwAs8{zXNkmuoUeqyk48d7ZAuaz%VT7yH`ICVyFnCQ;sIiN>Hk zCouY5u)LS(XV9kGwYoY!2`p*_=woAJ!??{!iU>`rr6~UYZ+xyp2f(vG3M6+&GcfYx zE~Gwxj=kgS4k2l<7Qg}h*&eXiYB`PN|UZ{R;hVJ}eCU z5W17?&CvCqr}9|~?TqQ8ad2?(Eey}UYQ{>bV7|KD$DIDC4MNT@&9-wb;vwh?P4{go zo#c)4cr-{GLnm9VPyq^#s3-I)1PYDu698K z7#@B3_k_;cUs>c}{>-!1N|TUWg#Wy#zKRPx8C>(!)KcXTgT~L8@XL|@ja80DvBh#J z?vtX!07xm9>k&E#8j^~~K&8soPpc+8`X%hBh3#a;)4ureXT<5S_}UIQ%61f;|4KT% zZk{|N=EjX8)s}Ct8x_k_Lm=VX`!T~qYHMK~gGNOvjYOefv_IpUh+^30@$T?(p-9d7 z`sT*N==E}`VxfxRPZbHaE&v!-Z92_R(uj1T3GkF1ZnM$@aTMbZwYp(cNhO zVD{w@^g)N4;VE9uy)aG%Ms+e!ioG_*0x4M0Sia-z{??$Q)l`GW#aANEoWK_TzzcYI z{ESq6lA};b)-zLY>s?16$jRB+qgS52lPTng%VWXbGnAAuRi#1GF63Y~tpa)fJcbJB z#~!K`j9^f-G<q@C zfs2u$uHAPcXDq}#=f`@YAzNZ<-!}M)?AZRVm^obzK3wpNZZ|b@ORE*WrF+4gI8MF2 zK>HhDM}WMQj|}*<6OCUmJyl&+9@HHRDf`hj?NuJctcA;RA3x{7{yZ!8E01GXjC+RP0B}q-aK6z#KTLj$h zIrr7_jsFdkg9S=MRb)ZWsoq8MOp%h>I4E~35lDmzFUM|}O9DRPTtpQ>yWa6}=`##bbU!y7W;P*`M)P(cZ z+HnVVNnK|(@|RLA=R_2Xp+v-j%o}r$TG3rMOWMyzOXhTK0C7|C$RBpSGn}HgII1j* z#E(NH=0UJC^4|Dz3=vt_;T<238Dj^(6Yt7~7uXxSJgzR`n537TK;4{@#bvrAd?1M> zqoy?eqdUkmWx;=&uBCYo9RXg~ekeYH`dG6#^enP@eEO`R1L^GPg}%Dev}djj1?D z=^{x5{)Nk39^Rpe-xhtXAX%+gmXwhZ?Hq{nc4pXVM(P~`IZ7kGzwe_^nVdfmSdQ4L zVZ?ahh8{r}^QE*NI296d7-;@t=_eW-4qRzl1lg15SQ*kVw;*C#6uC_amTpV!B7Y1?7i2X zYp%6seCB7ipTfMV$jqeI_t`WbSIP-f+A^x;TgP?$u56)lk9+Nw14jvu0*( zE^^$u+g*q59f#GgaoIi_U6&UpalUOJa4Uv41*tuwLP~Ub#~im)J!Aj#D+Rt&WOwnD zm+!O$oDdH@lb$kvN@d2x%3I03!|{`od%i`gq5K}{ET6;jQXM)&i7<)T+->1 zPj9DA-uw|cX?^o~WdKvtp`bzj!e_Co^OnrEtT?HpXLm$PZ=AIq^69khso<^-#%19{ z*p{449ey#)`1lgh*LqU_s3^^2dpE>QDN@YZ1-=|&y~K+2&z}Btshv8=M(vV<;?M$r+9iyib{R0dmOx}znAnBZS{2G2aZi9<~nO& ztlu6dcZ2aGaTBP_cYcp-)^3p zcUIGt@&2>)tu*IhyV!5#KvxR4VRYiRQhLQ&u0)anjolN0BMX*O5aV9CVjJX3HD 
zVr#59MZ{rx3^uIecw^#9nHbAcg0rK1mpYMVGK#lwlfexU$S7rJoE>G?t5Hzg8P=E6 zVmuBCPs7lTZ;6=I)GcOfj2Bk%;o>(wN!G6jW2KI}N5s~sF+HGf+=*O~cz=@G@7V0< zv0H+7&#^Urexd{;2|`;Xk*Y=dnZt8?ath?-cFIpCtdObsI8%~ov>$`_#wRR&oGiC9 zOkoWpQU1&QdxCcr4j)SyvwISq$a_^z35n?Z+Q?!Fe6Xwp#3PA^q&krmMruA>j;}E~ z<}K%$UnYU~#Lu2jrs`Ap{!ML5tl{{xr{sLcKm<5XtY5OK`jnqYfsvep7iP+0!0CbK zCaY-2MWU_9xyM;zp9KJh}3pWiVENt`DjS&@#Qv$^7f z5joGUK14V6g@vROsegs<99eO&U;gYiB;bKf3eRPAA~+{U4%KIP7HA!XgDH7(b7_b3 z>H{ilYF{g-$|MhD3#m9cUDYYi^N$dF^lV{R?0ud4;93_?S&3;!`h1Vh#y*$s$h%+b zj-=<$zi{e1*!L{995(hk*bz$J_!xaetDuzY{H{B9 zX80biu8-)hGCM2pq`dXsV^vHlZ7-q~gu!-H`5qRnt*_fJbm>~rNbL;Eg!~t3(yxvF z(y;%bJ)Vmo;LQHJyxkM16>(Y}vqz26=qM`SZZS^D?}P#Z?2Vq(D7>kzw$ZJm8y;8{QRB?<+I97v+xGZ& z%tAWmc(RvUq!TrtX-gCnL8PeX83CIb)i)&Sy96W%Xg+>+X{=0ADS}(=JC^|Q-HD1W zf`HbLIit;mH&q^+Cmm)_N|sxes~`>d3v?@SnPg7C?|)?i4&P*}MgjlPu7GF$pXxM&oMLPh5FbWu-vC0P-t2Ry*e6AXACS}t7| znFkNd^=e(H0hg`L)B(m(_;G>*>E-~gk8dC9<{QV0@Xn5!JJm(uS+jx^>zxThwSu!Ro%TkBQCq0l?PUeF6tp#$TGmBCc0N%^=zVqfmGOhg27+L6j#n$fj^FXx zLU$;jMTBJd#p)q-v%&N8fUC{X7U46~iZ6?0-=xv0SPI1Kq(d0#IAfoD^&-0d~%S zUH9gIdvtqW)_a5uA>QD8+c}y=%yI4@>)?y3HUNdi$4bf1UbqlKe8ERD3eie_$I}oN zpA;>uzTLQ?wMk#s`wK(#L#tW*Vh-`HCpt!V19kuzO0RA5l}UXHrz3~ zSB)m9vYWmb&z%|pH@?60X>36<+AA_D>S=-jd*RKOx2YB4yQLpIpa`WCr4@RlcMr`$ z-;{jyVMum5_fxncpVVHLrxRI-a-%2uq}&vT<|scv#dydQ5)jh_A*9p13mA1~H4Si~ zA_4E?N0kL_*1mlEZ{HbgCvbm!twhfdZ`f~CRQ+Cc5QzEmB z^RZCZcbM)W(E1lhkJp67`GtfQ8PqG5?`)1vDsPUZI*f#)1n$4LTl!Q&4c#2_{TbTS(YU}QQc1-o_jCtzX=AH3~jVh8n*i3G^^L6T5pMqza*Qn zNMn04R(5H+J<31vhHIS1I=ioD^x<-q+xo?Zy&^Hp{_*}&vrC(m5+eteYznCDcm8`h zS|xcN!VGn4ZC|zwFo&*inyNT~S+Srw*OxBy{7jfip87+0AFhjHK2#wTJ*qqgmZOgn zEsvvI&42zts$}cg<6-$mW|a$`j^b$yv(-8vi&47M>L|RJw^)1E!89kS(uv2ts3_eFpvYg-y8aDdwF}`dp5hRWKhwa z-YC;!F6Z9S9m=Y7vuUF*LH_fQlDRsB19a>CpgQ$1N#ECYVWY!B>H!lT%xkpqb+Udi zR`YS9iEqDPY0S#&$Z7gH|Bb=6`Ahj?R@T56qA_|hm^@^NfIMjec!BI$UX!z{s~B@J zop9(NIPG4`qxyp1;?udOiX=NHPL13Y^wuWRi8j2g*63px?(yZpoc&+Qyw~Zep zG4s(RT6~raw%XD@W)>>B*`nN?NYTrDBxJ?mU=C?oa@@F#)T<`k_`d(Ns!9twpIl>c 
z(UJ90TiN2|IcsK9px6-?rRv@w_y`k3<4%qflYh&f-EFzNRNlBpAAMgh{!L&LYvbE? zWT>?_`nI&%bX8+K9;<`dW6wY#cSl>1)L-mh_kz@uhg$ z)3Rk3)sZ1@1ew`9l)=IFP;Mi$nf=Ch=c|t}J_DQ?v5RkJpJm(KKF}LQH0pGc?QrSu z5LP-gJ?xvBUS0+7?HYmOEn{rnS1;Akv4qv23i=>;uH5V`!p9J-Ih0 zxpamkIurIvrL2KdZ=o!vdUG4ir`Q=&iZZJgpq%~NR-pi!mv&J41y|$1N$PT1zRU!j z^jf2L*UP`lcO_58%F=e|@CvI+WG#?$>s?F{LTWDFhjTyAxuP`Y=L_K&Ue7I4%+{CQ z%yQF?+sP=`vC^Ih+Q*C!-?rPrz7WZ8@koFiX*oP#`8iFIMbZ5>&5j8x8ggqnnVW+1 zVeGR)^?2@LL|S*mNzhh#Rh0kl!c0hF^w+dJWCFzSWu|f%d{`&Si3zX{LDH_MD7fFxLo4V3xAfU zxvIYz3i#(@5B!kRP{6ZaY&Y#8pyWA_8MW}~O`q6mrcAt<<=gq_+%E!F4R55h)0Ww0 zN56LIa-lgg-o;~QKyyX^UIi1J7lJjtv6Hd;@*{X%t7)NUQo#7Ac6CO5mbRtc5`5akrEJ*ufF!Cv8VQJP6${c0|H!+@9 z1%#K0k*eXC;L~tci!0?{)kjaeuT7<0b6t7s5zzS{vOSS=^NMSdU|p%S50CzJWPDCL zC(WR6n_5=;&u-ax2dh(KFpb}=GgQC!Kz=vLiy~^$5MFb1^~R>BRgGy7gz4_2RJ0UeI=e-*@jq6s zj#}1NA4xXWFySYTMF-OQB7%u9t4Gc>1K)k`GQAv1QZ9_f49?2E%*s%{VMV&FtlW>^ zM--vr`vg$yX}Y%$84sQp9!xn1r}fMwrb9J{hpH!zJRo{N?H%7EIl$Biz&eQp?w*)T z@Ak-vg^0eN^Ig$Wx~6&~GjiJSR4021d0dxEQ}}TA{4wU4eE?v`d#@m_x_F`7vTw%&+N0fkjP$8qU#yHyc+T4m?my*$UCQU>$-v zbMX@`1hLd>uR?-Z>P7SSy=N5C8vi^&o!tx_cU8iEl$~2Sp&vBe?W zf;Kl*%k)j>Ce^aTbCa$Dlz|#Tt89uqv!*q#m$(>iaD|-h;%@Gk6_O4Anq%0QL^fV- z<=Yy^kpM2Z9*Y~gE{^yuYW7ukaTIaI_a5ECEla4BiM!5SQr@66zwsi^r3+huUtu#*3hUJIuul@1Ds|CH z+b-39r8f`{dG0*Y6>ysuJQ2rL!`C8v{7}>eNCUO64c=@>j{j6c!2@A=xUS>tI|Lpj;u1J}KN`N=b7I|N?T^#mLcOWRN8NPHY+ zPZz~)yLe0|!3AL}WxTtn&BDqF=@`~@=EAaipZ$|E;TKlpAHIG;m|Mz) z=ndw2774#5nVlexW+$?nH7aA=w9g-yB6D)<9G32uTRKO}_@aIwgum#uC}!O-@21MT z*`JKkbaA9mftJi~s+uCF`CwQSEqY2TiWtcVBo#mfOAKGE*+} zH&U4^>Uz}OlUy$O?um3kwBcB&tc=dAzuVor#U8nr7paxzFr7JZf!THKu`Z~d6#6)h z(I%*tDv+@&565i%SPvpoPI~{a#uP~?oDa0jU7GHX+2;DY+I0C492VtWyTlvW(^UT8wmWYDhAO4NS;mBRBt~!KXBQcc`zsa)E}p^6!=ssnV5nHz2&+_moKHN7TPMW*x+bqn1&-!tOQS>e?^&YR+2=cLJ$|ekkjdTh zV`>baC;Q(DdH!$3+A1K%SrKn_M|x+tH<%3zf9pg_O9`^P#C(3LBqXHb<9zOQ{R=sZ4v8BO<$fG-kV6dKu4B z#sqR?)6LAXkM&Tv?X(<6b#5Os%X7bxEy4K=@3go$YCGatd|C<)VaJlyF&gXFwdVqZ zC&CkNF5nn%HA4T2BU1&B0+^H-4f|k0F+S=4_8C 
zsXamBw!TQ6s=LM|=40=uact>7#I7#sWgtw>rk^FDr>DomPp0&-7-bWkWobG7j=C;V zpxcYPvooq&}LqlRbtsih0U$q_7Q=0tT*p}$^%H4iLIk_cMgd6|h#?~C{CTO!2 z7X&!u=j8k?%B)6loR+Rp4Q6?VQ5tfZ+tFCxB!aBx-#=6#@G4sCE?-7eeZ~cyYVv-C z)9zuD&|nEc;ZC(gP2L3#Ko$W;;qRX*Eo^Y>KOSK7#lUHv@BGGhV#$xM?_79+pK|Bl zZ;Ru$!8imo>c%U|V+|{bQ;XOSMhy>NT8-M4JotF|JZGY&rn89|HckKU&qW1*Txm6Q zk1NBjpLhvMhr_HpRM>i_cP zG1}qKdyF{PH2vQ{c2{BC8ASZPwUC4Qd?I6-ADU40P+c+NtguZd5fd|P-e5TP`|DT8 z;OkMZm)mUKU~25woYXhtI1XE|;8D+Qgq!<(j&Z@s-x?9=*9`eC+VI2^pZq~L=*}m7 z05ln|(+{bo&*hOl+O=>t(SbC#Ubc%+@Ep*Zv~{L<7~m4mJ>*SU*G7+)ef z{adsQIR2!N2IQPt`K;H8td&tV+)NYQ(`>hwZN%{D{Es3q@6`(cB-DG%PBYQ}S_Ij5 z41>cixEgIOt^T~nI}-4xyrw~3XJor^mjJA2MH1X?0>QCgi%8!q-n)1I8xfH2^u%Px z)@r10ju1TtjH>@}0&e5}mpa#TN%L?0smYwU&(G|KcE(XMT)C1mhWKj}Oua`PFNvE= z3QS{!(~;seL`|X4K`E!#F+25 zf?Qz$MZ)p#WsUL6285P}rRf;=?@#5#a5`;wsrd-tSE-cY`k1pmo|9i^)QG;oR+ zZbElXUht1WEx`_A<(k0Mmppg7`Vt*foV z?Nsyz)T4g`GLPpNt5t;=7VjUx$fbJvLnyU>fB#)i$_(Mg6Q?MQI-*&$psUb#&w|@- zz`H>{3Y-lyNzh+*W(0yzCIBY=1%2lknDXcAFE`G!wC^2FKP-TReerEP2mm8I5h1_{ zqQMC*Li~Xv<+WTCBKgxb2_I=EF=%q`^=tZh4zwB1`_T0G`1H4L8Ml=I_w^V^Dkvla zPTxusalcyLJb}L9fRY2~=QMQO9aeA1I47*?19;@;fL(kA#qRPRhTfa^WQ8vEV8F3k zcl=!DqM-;-=)Vs273@j3LV8tvAG|(DM@a$;0%VR9yncR^75wP*6qB4ESU-E!j5@q` zlC?@>f_bs3zOJr&BF<>sR<_Rb-`f#K?qBaHaz8rB z+PF#Ylz&Q1{CDdV0sPmfiNt(0v3({6kg_;RDT30#D%+#1I|0ZD)7=#yV~G;=bU7hB z8+#{e60rWnhyj37tCBJABxPL+32b+og--E0VP`rYwE%*kH2DcMEwIM+W>+eYCsn1k zX>ePAO>-+YAEnv#m6k)|YxCd%@w? 
z+I;~X4&H|!U!UMJ>on!HReQz~FJLv|Rd4|9#x-md3Lojr0OcwhL=I0vL)W`Dh0uLD zxq+LhyV7=2W0}+?SE(@m<|P{*&7zB$Bq7)2Ig|6z zcswrfd>L*U=E{QDAj<0aLez7sYCn);v3`x@3_*Ur=q@4>ss))?)&x3wJi$s0Hg3r?wu?e@Y%6$5`AH>%`Gcpu%K4sgCm2I_!$onJ7f4a$jijC^ zxd&bYnskZVsY~G;gUx2_71atptTrAi)pOA(&5^*)%V}#A_bH;{*~a(;zIm z9nMi=jq>f1& zOA|hetFw;GXfzcqlxzG+;+T-zcQ04<1dDuZ6Oa#uh#g|ip(puL4Y=nG`4-rC%uv!z zFo)^~XjK_*HZ=ogIoizCiCR|@RV1{izqOnAum@RJ73Cpk5N%*=r0fH8W z3M#EL;_ZjOhL8}|2h0Dm8XJyOc?Bh4>kR#T=#s!1wPDqhM%N^W-GuWW6N{kH%b z74lp%`}3rvA7gu-D`sRO1kE~QRZ8!Z5o>4c!6uYjD*e=qrsB2_OPIi5^s^-Ecr&t5 zz-{C^cdHtdTMnxyX(A2oy=GK%Hd8B%-Ra87iuK)I(Fc}F$)KL~Zy$KQcb$~HWtbIY zmJ}hjssLQ_T>_ zy7KS}fc6v!%w&_jKfJ)OPSUyoJ%fDCTz682OtP8*A1+6!C=4}|9;gV^!n{&44e*xZ zG!0akeq>oeOrJ``t(fK=mqDG|z;+CBC+?7oDTyuHy6Zt%sC8#fUUgkz++?hC8Oxb0 z!IT9S=SWI6G)u<&@7#{w%6RGqPI7uuf@T4aQ67~;jKQ3goma~ubyiD;gjw!5@dZJ3Z}#e`uG1&yUC~Iz*12}%3jJ^a!#|tLk2Elx zwi%62q%tX2PGHPtlL z%JNEyF;v@=4m9N|Q+yWp@gcPUIg!9}q)kx*q@o?IW9FDPNvZdtU@4}7Yw*6=(_$ch zTQDS5r!Gu56x3(Ee}`?j16GHhQbritBDX3J)64JZ3Jo8%k)HT#i7|#rw;A>=*JoOj zXGgDIXu7I!OQzLqxx9tf_W+aUD|eVcs;5oQ#qZVY)Uq0CE81hdsb9r$ zU=?SZ($MRbj2EbR@=po58SadAKI5J!!voqQf2EiJEGKL({5hFj7DisuG#D8w*pW~$ z7dxLFFIXq<-iuURjr*vleaY(14msO}D@K0O6KvU#gPbtkKnw;bM+aFeGiSu`jgP*- zbY8zpT}aNF6=gt8O<7IZasF16z?<+$L-!qql$#yxWsp>z=aaasmeZBMW2ky56 zoje)y9iyR_2+X1lg}>;?*t|LHGS}p@8GC`Td|0XYIDxH~kIT@&qaEKI(aPtYindzU zOd~pSZ?0l(dA}CsA{JOIaTqgy#2`z`wT|b$qz1x=nt0-ZRD3p9SQalx?<%UfsnI#Z z>mlO6c&kY^w&dA~%dIONM24Qxtx@en7Ffk^HguS^y=@!5N<0D*2rOPQ;u^do4H-tR zYSRqPD7qvXj%C-Un9i(we6!k_?aULUH??G_lomC5i1LOsN2Y?OIzL-FqB)q7ONru9 zW}jPiGYw=~mDFC*GB#GUyfZ*^l$}Mf1cZJKsq4g7=UW~d+BVBO1gzkdM_;d%GgSu` zD89SXz?kG^_$%}4x*j?{HRoCO5*c@jRmUlYL4|W%%kei^7Q@UmAC6sCc3~4rEe9m- z&kKF&P@eL)jsF0uriUeh9_qA5n8 ztz(_qmo@g;w-$x=XU7&N8ynd>HFn}?+mvcW(?#6L8uzzSuJhfN*^zd5(~F)gjSBUh#J14d_zo16@I# znB0%?yXghnaZ24Nx@yx@WQ+2G#E$bkQ{uCKWa{?;RLn#1vS=k^c&y=Ahn9?hnMWDM zR@hM^%eFI0AgjE@p{=0c#`*har(@?U5EZODsdf~TyD4hKskf*dsrZM+d_%<}HVWam(9wGI5e0dfcR~)7FRNa)foTJ{#hsAI#^R|rdET=&dnrnuOhgH!^ 
zK`q8m_>7z}tBt_;rHm*(d7JBVnpwY2l8~B^atd$Bp|tS20SV1=W-HQg%voxdZzhmY5o0Xe1SY}9%_^vqdO6US`SNB)<-Y*cZ`d{vu5F;Kjg1ZC#d>7OSs6w z!Xs(A`?i22AVn@OO%OF025ehxA_!pBdaKK&407ri--c90oy>$1jM*;+cg z)E)be4}KbA7*;uUwH79{EIAJGxfy5|@`q8txVf*nT#+JR1!u{y%D_~fV5wdoGeb&j zhXd}IpOQ?TS&NR$irI4U(zWFrA5OE5qT#xuT|^`ujM5F0r7bR}*q0u!F`eqxNfn%G zZf1jA!j*B9L{-Nh15cr3AzS=r2cH0-Xr5TMXlCg+_8@OaMLPfE%i`>V=e-~%6M%#i z%bLF5vC~+Gf9*`u_kT4h6(Bjwc+q3m!u^!^&)CMH;rUa=9k zSkWfy0Fd8OHDs zN}jy1XtyXo3f$7P#9C1+##U^U1p6~}0F$>uFIfRz+mskCa6JZu9w;YY4N^RkBKAL1 zyl`I^LKu4ljAAqKZYkqGH-LLU3O?4bmdOkoYbt`Eu`{`Qyu$Br-7Zvq4Eu&^Ny61( z@HCoRy#IO)u(?q|G-=fDmPO3%CN+p*hzgWiQ~nMJWx@;!?VnCM4$(Db5VJEGp~fE} zNhdjoay7p!mWL--V54hoVXptg7hvb~BZVu%&YD$3;hn|?lOn~-lKvPqFi#+f``u2# z7jMFoH>oK{f`12ueqy0-&o39Bfh+zAm;;RP=yEz>HCT+gZG!8P z@Zj>h#x(H8{}h*&L?2zsJVgfbN-zTOf;etCHt3Ic@B6=AI{YdP8=a!IH{}4qAA?eJ z;y-6X6#{N%lVBUHQNc|jh*+RH^ytnmAnEcOs^$=Dnqr5KHKQ!5k3M96=?lNw{}3nu ziY@o7X*gfg9{6}N)42^BE&}dZI{l`1bi*7#eWO_vq8WeBL8RY15D)PL&L=N2J6BvV z)>=AZE~(kH0rdtLnP*&WSr`9EtuRL_1G|V5E@~MbUBpssJ;vt+9CGFyA79=&{&Qit ztDM|){`aWFace*X!_rh*L}Uz)f1b_JuNCArsQV>FfoKB=lG2@#xIy`(KAa6ayS+Ht*RJ)+<)7t+wVocE!ptmTCy?&eOK2Qa<-TFAsw)+E zijsSY)W(GAPq_|2yfnopp!+e96A<8;cPBok_r<6TOG#ZTcW%0#!=C^5a`|V`;DRJ6 z#Vw-X)?1g(l9N9Jf=U7?aXw4>eO+eDqWj`e1W2wa6@)h~>``!o$c^D~Ric!XXsE1{ zDeh~gZ=~mv{1SiQwa8pT2(sdo@CF4&D2eM?BgkeR|N14x+;$1UNP4aK@EEj|gR1n- zrmhQXTU$>mNv6JiJ3%kT^DB?X_SXrwcYTu=4jvzRpKxco(oTT$F5)2Nj|CLzHwWv& z!$5HiLDtWOLpr=y`OQvDgo1S1KCtTC^T1krH#;FZl9g6QK7@hy4uLz9xark)0-whd z^6K8l>*A_{n}XYSe|a>2nTz@wj0#@DH8?C6B@GG^H^z-*Fus@`_^awL;d>QTlm+Pm zyb22g>q686S8N&~(be9CaKmS^9hs=V0Av>gc;JR8JO?UYZK+>a9r320?$P1aEb!J~&7NBd+LWgCv6 zIB1^MNk1XgxYECD+N{t78;oc2{t;(>*N>p|a1{?}m_ZROJK0_a~oE z<-UJ+hI>r3tkItFq~;y5rZ?Y6hIUaBJWpTtpQ4WB>gJbe8s}#(fZWADL0J`0)-W3P ziWIy828N8f=M|I9QTp4FhhwMRc6UPCwHexjFDCwkocN0yZY0@T6S9wI-d$doVMVbi z1}stkUe!~autNSSm75B{Gj;vmIQr~cI`Onyku}A6U#7Cp_DPz%gfBihQp#35=OE(N zXUYG3Ec8Fr)&xdE+m%oqcv8AvWM}>~YIeDQkTP;)w=>Dtm6SLVU4hKh;E*e{}PB>Hmk(a=Qe%X-@7MSIJf>15qx>d4s8g|qd(&phM}uHq 
z!-Klq8z+b&#q-b{09O97TB7f2w}U!32n9s-JD*ssk*q4lh+iKvN zIFLcs#1U;}>8fZd!ky}tBl86L!Xwby9hY=V@rncaX^D6}9GNDQk}xP+t@2UAoPtkGr64Oi&S;R-ZKrMLb=ips$zQ_{pjqgxU*!+J*3

}e)8zfAP3Q2S zrefqZrF~e0H**O(?reHs%hMLw=#<1?mv8LMiIGy=-`v_#E#)x0*?{OZfZ>e{a}UQF zT6y9Q{&5Cug0hbAf1_odTF3jz-k5eW4ZWBfWmbD++x*1VC}%$?=4O6*4t9qFR&u2=fs^-1kYd2^&qR54(nC)!)&gTMdhej=d zxALI%I6bPHIzT>dLbf@eaB6WezrlCu6C+?&vVX1A5z-SB*z#9M z=RK~REDDAE)QB;iha$+wL|yCBk$nEh@Dt!}#ZFc``bq+S+r7fw?bTqY&oFua)IHhC zeYR5?*uU=0YZhod1NOgp+SW;4wT_N zuXSCW+L<$K3pyv;U!*StR59dGL?t01fy=6_x2AP|W@e_p_&zzd^b&do`TkXoy!7b_ zHGjM)YdZi*E&%UFZUZH$JxL5}*!z(}-<5!YxM+(KH0Yp30_nd@2u*~AlE1mMs ziToBPO>3N?O0L9d-iuSK_{_n<0eh;PeGDu8ORpXF2VW}kmFj`1;+){${11OCj|Nyc zI?qCaZZ}ckEwMEMHJEILweZ?%;|)Om%>5X)I@=lNNeRhJ@I|_=_#WK3jU|0!KYd|~ zoJQ!8Kg?Pvo0<&M_3JdJC<>&3p2TvLJK}Ts1)xq_xq~Mm6UU=>?KoRsG^VY!^|fy0 z2&BO!Hy7yNjgFE4a}Zg!GHo;P#|+Drw<81XUrDs~=iRfAl9gq=lZFBM3;f;9y))2s zD?G3HV+H}R%y0n-3@1x}gWc3@ zRCIG#Lta?hZog!jQfW)01FppQzf;JD%to{;_RQGN!sDlapZ zC%bYQ$Fp-)Xc7|$R7ErB*;9!^tCz|*HWGOU<4{?#lUYhuKmjAv|E7yFc(WtsLw~NC z2}`p(8YNj~E&ybdvR)Yb{7c4v6~aOa74qrz(mQc)Q-ltSWV~Vw`4D>f8pomkrdr1; z)IVRb`8@Mspg@0*qYX+Wfr5}$LW1ulFcqm(SW63-eg2;YYKtaDLd%TDTgzfH+bhS{ zCOiewBeCW9GZXbUsKgFF252?zS4ahYhdRVaMoHG8B7Nn;cLE*uXutNHbpRvqnsrKq zK-9F-K{2{|{FT{L;j3322a+f0eP8TreT6dhmm`#}3q!I>j*liA7~8^9{NFzd`SMi^ zSY|T5{m!uc%X7fWjPvcRT`WdMTKcs{p~Hjzm(O%$QyfsP+bcFdCJ*_cy&(l44cd^6 z+)`mQo6$1&Zn!=QUR%?Jb;D9uR~Lhen2b@%M%7J1>EnGUEBaJo;+S{ApkrO$L23ZA zW2H9!ZtU&!J^7tD2{V)ybG>Ot0#JKeDyBDF)lcaHX$)q8X*`T-Q_d{3^@BL<(7q+ zs}WEQL&HKRXw3)J|7PVuBk!N_m$OeA095`)3qn@3r)CZ=OqWHriGwW=sf;(^50^I_pL>p+2}HM$5c-B$bZ-Z zr9RD|cGJtt&u+)~Kv8=9C6wz(!R>2qYld})2hW2m3r=yXX1Waos5xY+^rhL@FMYbD zp0h>=wegU3_5HRsy`Q5|cm}8mRL5Q65OJCxxVX6>Ao=X%y<*2XL;1j2H$;?S)Ym&= zHbpKk4t`{0$s0>QFx$wVs3*u58Hk1q`TlO)+SDyPR^=DzP9o8TslX8OzgjLx@eMRu z^#d(rlk*>Ze0|*E`)M^!HD%9%!ALXg}1B`o;&2G6lu&jIjd;SZssI72HwHQA}!cUSuHaIL~ zCz)9b=g95%XF_@*bErA+QG#s)iZd&9E-iig7BZLat4#!?5{nj4xmT8K8Cr$tunz|)-&~d|Z>u(h1N`QX?Ti2>|UwOF( zYU<%EBG6Xz`m3PwE57rhpuOA_D%41LMK1h)v`J*CMPJg2!~gUQ+C)mOh2BiwMZ|&Yvun1Dh7+bg+aa^y|{F_ROIN0X2SYmybIR+L|P0m zUI9xZ0M>?yd^rl}O<{d5ive?O<|$UN9eRn+&yH5tY191c8vTJr8(@Zb-XnrnnCl5b 
z3P<3sE39Au@qX*o09#5^)*EX`85Of18Ax>n?F2yNE?hy!2Ii&CA{rI4hh6k&!5(K_ODVj zVf}4U!MD~_P+br89~EU~v&x-Ft|2#)f8X*Lk(Mk(QE7RySaBr56TB|e{og)94r?@4 z(?0!gZOvFi+D(v2_>*^eN8XVeC;a9^Ks+%M&KDw}2X=9(_d)d_^h5POJ!z>ID{IBfO;zwJ~PXkW8w9CEw zg?~CVnMkCL{(1f4qn9#YuJmkQ`HwKIGu9VaH?}0Zg77Kt2Hn#)7-W%hc=2A$Z`qwFYP>+7QeZtQ0`S|VaX%b1MC|*HOB>_fU3;VG_Pte+V)g!$ zOGNE*t4ZG5cB|&6jM7>^~*I8f#j6^x)E|sYJ@Lt{yKK)AgC#KpA>2_ z2+`+z{OVdIrj{iiD{!?z5UpkQ9Esh59G(M;SUgIV`fjGWj65DUC2S)K8$@tF zNx_TZ%|5B=&aumSjc3EALT|Ry zPy}{Y<3OXrke3 z+CCF9byp0kL7q=3HY_MaXb$?z3RoFtE)Te^x?Eft4{zOHyC&5>I(|*0>O%Gi zF@f3IEu%mZ+$Y{=&jJ8Md!uf>jmp~kW>_@jgrZI>-`0@c8eaChW2+GUD*94^kB|kfqrhmTVaS4QwrdZNDd%Q~Pal`|9}!XuYyw;6;@HXG>**hd@bOg;;+W{~ zG5$MP5dhMRev|CRkN!K}`^VpK0m1*X7hvcoS?+h;TUrV}27hi#D@zqg7(MyF0EE<- AP5=M^ literal 0 HcmV?d00001 diff --git a/tools/cloud_functions/gcs_event_based_ingest/ordered_backfill.py b/tools/cloud_functions/gcs_event_based_ingest/ordered_backfill.py new file mode 100644 index 000000000..87285a891 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/ordered_backfill.py @@ -0,0 +1,179 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Command Line utility for backfilling gcs_ocn_bq_ingest cloud function when +ordering of incrementals is required +""" +import argparse +import concurrent.futures +import logging +import os +import pprint +import sys +from typing import Dict +from typing import Iterator +from typing import List + +import google.api_core.client_info +from google.cloud import storage + +import gcs_ocn_bq_ingest.main # pylint: disable=import-error + +CLIENT_INFO = google.api_core.client_info.ClientInfo( + user_agent="google-pso-tool/bq-severless-loader-cli") + +os.environ["FUNCTION_NAME"] = "backfill-cli" + + +def find_blobs_with_suffix( + gcs_client: storage.Client, + prefix: str, + suffix: str = "_SUCCESS", +) -> Iterator[storage.Blob]: + """ + Find GCS blobs with a given suffix. + + :param gcs_client: storage.Client + :param prefix: A GCS prefix to search i.e. gs://bucket/prefix/to/search + :param suffix: A suffix in blob name to match + :return: Iterable of blobs matching the suffix. + """ + prefix_blob: storage.Blob = storage.Blob.from_string(prefix) + # filter passes on scalability / laziness advantages of iterator. + return filter( + lambda blob: blob.name.endswith(suffix), + prefix_blob.bucket.list_blobs(client=gcs_client, + prefix=prefix_blob.name)) + + +def main(args: argparse.Namespace): + """main entry point for backfill CLI.""" + gcs_client: storage.Client = storage.Client(client_info=CLIENT_INFO) + pubsub_client = None + suffix = args.success_filename + if args.destination_regex: + os.environ["DESTINATION_REGEX"] = args.destination_regex + if args.mode == "NOTIFICATIONS": + if not args.pubsub_topic: + raise ValueError("when passing mode=NOTIFICATIONS" + "you must also pass pubsub_topic.") + # import is here because this utility can be used without + # google-cloud-pubsub dependency in LOCAL mode. 
+ # pylint: disable=import-outside-toplevel + from google.cloud import pubsub + pubsub_client = pubsub.PublisherClient() + + # These are all I/O bound tasks so use Thread Pool concurrency for speed. + with concurrent.futures.ThreadPoolExecutor() as executor: + future_to_gsurl = {} + for blob in find_blobs_with_suffix(gcs_client, args.gcs_path, suffix): + if pubsub_client: + # kwargs are message attributes + # https://googleapis.dev/python/pubsub/latest/publisher/index.html#publish-a-message + logging.info("sending pubsub message for: %s", + f"gs://{blob.bucket.name}/{blob.name}") + future_to_gsurl[executor.submit( + pubsub_client.publish, + args.pubsub_topic, + b'', # cloud function ignores message body + bucketId=blob.bucket.name, + objectId=blob.name, + _metaInfo="this message was submitted with " + "gcs_ocn_bq_ingest backfill.py utility" + )] = f"gs://{blob.bucket.name}/{blob.name}" + else: + logging.info("running cloud function locally for: %s", + f"gs://{blob.bucket.name}/{blob.name}") + future_to_gsurl[executor.submit( + gcs_ocn_bq_ingest.main.main, + { + "attributes": { + "bucketId": blob.bucket.name, + "objectId": blob.name + } + }, + None, + )] = f"gs://{blob.bucket.name}/{blob.name}" + exceptions: Dict[str, Exception] = dict() + for future in concurrent.futures.as_completed(future_to_gsurl): + gsurl = future_to_gsurl[future] + try: + future.result() + except Exception as err: # pylint: disable=broad-except + logging.error("Error processing %s: %s", gsurl, err) + exceptions[gsurl] = err + if exceptions: + raise RuntimeError("The following errors were encountered:\n" + + pprint.pformat(exceptions)) + + +def parse_args(args: List[str]) -> argparse.Namespace: + """argument parser for backfill CLI""" + parser = argparse.ArgumentParser( + description="utility to backfill success file notifications " + "or run the cloud function locally in concurrent threads.") + + parser.add_argument( + "--gcs-path", + "-p", + help="GCS path (e.g. 
gs://bucket/prefix/to/search/)to search for " + "existing _SUCCESS files", + required=True, + ) + + parser.add_argument( + "--mode", + "-m", + help="How to perform the backfill: LOCAL run cloud function main" + " method locally (in concurrent threads) or NOTIFICATIONS just push" + " notifications to Pub/Sub for a deployed version of the cloud function" + " to pick up. Default is NOTIFICATIONS.", + required=False, + type=str.upper, + choices=["LOCAL", "NOTIFICATIONS"], + default="NOTIFICATIONS", + ) + + parser.add_argument( + "--pubsub-topic", + "--topic", + "-t", + help="Pub/Sub notifications topic to post notifications for. " + "i.e. projects/{PROJECT_ID}/topics/{TOPIC_ID} " + "Required if using NOTIFICATIONS mode.", + required=False, + default=None, + ) + + parser.add_argument( + "--success-filename", + "-f", + help="Override the default success filename '_SUCCESS'", + required=False, + default="_SUCCESS", + ) + + parser.add_argument( + "--destination-regex", + "-r", + help="Override the default destination regex for determining BigQuery" + "destination based on information encoded in the GCS path of the" + "success file", + required=False, + default=None, + ) + return parser.parse_args(args) + + +if __name__ == "__main__": + main(parse_args(sys.argv)) diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index 46fe1d9ef..0f6bfbca2 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -19,7 +19,9 @@ from typing import List import pytest -from google.cloud import bigquery, error_reporting, storage +from google.cloud import bigquery +from google.cloud import error_reporting +from google.cloud import storage import gcs_ocn_bq_ingest.ordering import gcs_ocn_bq_ingest.utils @@ -470,3 +472,53 @@ def teardown(): request.addfinalizer(teardown) return backfill_blob + + 
+@pytest.mark.usefixtures("bq", "gcs_bucket", "dest_dataset", + "dest_partitioned_table") +@pytest.fixture +def gcs_external_partitioned_config( + request, bq, gcs_bucket, dest_dataset, + dest_partitioned_table) -> List[storage.blob.Blob]: + config_objs = [] + sql_obj = gcs_bucket.blob("/".join([ + dest_dataset.dataset_id, + dest_partitioned_table.table_id, + "_config", + "bq_transform.sql", + ])) + + sql = "INSERT {dest_dataset}.cf_test_nyc_311 SELECT * FROM temp_ext" + sql_obj.upload_from_string(sql) + + config_obj = gcs_bucket.blob("/".join([ + dest_dataset.dataset_id, dest_partitioned_table.table_id, "_config", + "external.json" + ])) + + public_table: bigquery.Table = bq.get_table( + bigquery.TableReference.from_string( + "bigquery-public-data.new_york_311.311_service_requests")) + config = { + "schema": public_table.to_api_repr()['schema'], + "csvOptions": { + "allowJaggedRows": False, + "allowQuotedNewlines": False, + "encoding": "UTF-8", + "fieldDelimiter": "|", + "skipLeadingRows": 0, + }, + "sourceFormat": "CSV", + "sourceUris": ["REPLACEME"], + } + config_obj.upload_from_string(json.dumps(config)) + config_objs.append(sql_obj) + config_objs.append(config_obj) + + def teardown(): + for do in config_objs: + if do.exists: + do.delete() + + request.addfinalizer(teardown) + return config_objs diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index 49f76389f..e312351af 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -14,7 +14,8 @@ # limitations under the License. 
"""unit tests for gcs_ocn_bq_ingest""" import re -from typing import Dict, Optional +from typing import Dict +from typing import Optional import pytest diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py index 8aadeb08b..6459a206b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py @@ -209,6 +209,39 @@ def test_load_job_partitioned(bq, gcs_partitioned_data, bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows) +@pytest.mark.IT +def test_external_query_partitioned(bq, gcs_partitioned_data, + gcs_external_partitioned_config, + dest_dataset, dest_partitioned_table, + mock_env): + """tests the basic external query ingrestion mechanics + with bq_transform.sql and external.json + """ + if not all((blob.exists() for blob in gcs_external_partitioned_config)): + raise google.cloud.exceptions.NotFound("config objects must exist") + + for blob in gcs_partitioned_data: + if not blob.exists(): + raise google.cloud.exceptions.NotFound( + "test data objects must exist") + test_event = { + "attributes": { + "bucketId": blob.bucket.name, + "objectId": blob.name + } + } + gcs_ocn_bq_ingest.main.main(test_event, None) + expected_num_rows = 0 + for part in [ + "$2017041101", + "$2017041102", + ]: + test_data_file = os.path.join(TEST_DIR, "resources", "test-data", + "nyc_311", part, "nyc_311.csv") + expected_num_rows += sum(1 for _ in open(test_data_file)) + bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows) + + @pytest.mark.IT def test_look_for_config_in_parents(bq, gcs_data_under_sub_dirs, gcs_external_config, dest_dataset, From 70d2d2b75e8d07fc13f59994b98c740d9a925a5e Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 9 Dec 2020 19:59:55 
-0800 Subject: [PATCH 25/90] docs --- .../gcs_event_based_ingest/README.md | 20 +++++++++------- .../gcs_ocn_bq_ingest/README.md | 21 ++++++++++++++++- .../gcs_ocn_bq_ingest/constants.py | 23 ++++++++++--------- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/README.md index 5dcedf5c9..51b5b3a06 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -21,14 +21,18 @@ By Default we try to read dataset, table, partition (or yyyy/mm/dd/hh) and batch id using the following python regex: ```python3 DEFAULT_DESTINATION_REGEX = ( - r"^(?P[\w\-\._0-9]+)/" # dataset (required) - r"(?P
[\w\-_0-9]+)/?" # table name (required) - r"(?P\$[0-9]+)?/?" # partition decortator (optional) - r"(?P[0-9]{4})?/?" # partition year (yyyy) (optional) - r"(?P[0-9]{2})?/?" # partition month (mm) (optional) - r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) - r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) - r"(?P[\w\-_0-9]+)?/" # batch id (optional) + r"^(?P[\w\-\._0-9]+)/" # dataset (required) + r"(?P
[\w\-_0-9]+)/?" # table name (required) + # break up historical v.s. incremental to separate prefixes (optional) + r"(?:historical|incremental)?/?" + r"(?P\$[0-9]+)?/?" # partition decorator (optional) + r"(?:" # [begin] yyyy/mm/dd/hh/ group (optional) + r"(?P[0-9]{4})/?" # partition year (yyyy) (optional) + r"(?P[0-9]{2})?/?" # partition month (mm) (optional) + r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) + r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) + r")?" # [end]yyyy/mm/dd/hh/ group (optional) + r"(?P[\w\-_0-9]+)?/" # batch id (optional) ) ``` you can see if this meets your needs in this [regex playground](https://regex101.com/r/5Y9TDh/2) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md index a1f417d7b..868d9e7bf 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md @@ -29,7 +29,7 @@ following default behavior. |-----------------------|---------------------------------------|----------------------------------------------| | `WAIT_FOR_JOB_SECONDS`| How long to wait before deciding BQ job did not fail quickly| `5` | | `SUCCESS_FILENAME` | Filename to trigger a load of a prefix| `_SUCCESS` | -| `DESTINATION_REGEX` | A [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for `dataset`, `table`, (optional: `partition` or `yyyy`, `mm`, `dd`, `hh`, `batch`) +| `DESTINATION_REGEX` | A [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for `dataset`, `table`, (optional: `partition` or `yyyy`, `mm`, `dd`, `hh`, `batch`) | (see below)| | `MAX_BATCH_BYTES` | Max bytes for BigQuery Load job | `15000000000000` ([15 TB](https://cloud.google.com/bigquery/quotas#load_jobs)| | `JOB_PREFIX` | Prefix for BigQuery Job IDs | `gcf-ingest-` | | `BQ_PROJECT` | Default BQ project to use if not specified in dataset capturing group | Project where Cloud Function is deployed | @@ -39,6 +39,24 @@ following default behavior. \* only affect the behavior when ordering is enabled for a table. 
See [ORDERING.md](../ORDERING.md) + +## Default Destination Regex +```python3 +DEFAULT_DESTINATION_REGEX = ( + r"^(?P[\w\-\._0-9]+)/" # dataset (required) + r"(?P
[\w\-_0-9]+)/?" # table name (required) + # break up historical v.s. incremental to separate prefixes (optional) + r"(?:historical|incremental)?/?" + r"(?P\$[0-9]+)?/?" # partition decorator (optional) + r"(?:" # [begin] yyyy/mm/dd/hh/ group (optional) + r"(?P[0-9]{4})/?" # partition year (yyyy) (optional) + r"(?P[0-9]{2})?/?" # partition month (mm) (optional) + r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) + r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) + r")?" # [end]yyyy/mm/dd/hh/ group (optional) + r"(?P[\w\-_0-9]+)?/" # batch id (optional) +) +``` ## Implementation notes 1. To support notifications based on a GCS prefix @@ -46,3 +64,4 @@ See [ORDERING.md](../ORDERING.md) configure Pub/Sub Notifications manually and use a Pub/Sub triggered Cloud Function. + diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py index 908d0e854..daa116dfe 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py @@ -70,17 +70,18 @@ # yapf: disable DEFAULT_DESTINATION_REGEX = ( - r"^(?P[\w\-\._0-9]+)/" # dataset (required) - r"(?P
[\w\-_0-9]+)/?" # table name (required) - r"(?:historical|incremental)?/?" # break up hist v.s. inc to separate prefixes (optional) - r"(?P\$[0-9]+)?/?" # partition decorator (optional) - r"(?:" # [begin] yyyy/mm/dd/hh/ group (optional) - r"(?P[0-9]{4})/?" # partition year (yyyy) (optional) - r"(?P[0-9]{2})?/?" # partition month (mm) (optional) - r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) - r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) - r")?" # [end]yyyy/mm/dd/hh/ group (optional) - r"(?P[\w\-_0-9]+)?/" # batch id (optional) + r"^(?P[\w\-\._0-9]+)/" # dataset (required) + r"(?P
[\w\-_0-9]+)/?" # table name (required) + # break up historical v.s. incremental to separate prefixes (optional) + r"(?:historical|incremental)?/?" + r"(?P\$[0-9]+)?/?" # partition decorator (optional) + r"(?:" # [begin] yyyy/mm/dd/hh/ group (optional) + r"(?P[0-9]{4})/?" # partition year (yyyy) (optional) + r"(?P[0-9]{2})?/?" # partition month (mm) (optional) + r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) + r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) + r")?" # [end]yyyy/mm/dd/hh/ group (optional) + r"(?P[\w\-_0-9]+)?/" # batch id (optional) ) # yapf: enable From 6ec3625144ffab3d2533db0430ef1ac06038c998 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 9 Dec 2020 20:06:25 -0800 Subject: [PATCH 26/90] fixup linters --- .../gcs_ocn_bq_ingest/README.md | 2 +- .../gcs_ocn_bq_ingest/main.py | 22 +++++++++---------- .../gcs_ocn_bq_ingest/ordering.py | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md index 868d9e7bf..e93b10056 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md @@ -15,7 +15,7 @@ is configurable with environment variable. ## Deployment The source for this Cloud Function can easily be reused to repeat this pattern -for many tables by using the accompanying terraform module (TODO). +for many tables by using the accompanying terraform module. This way we can reuse the tested source code for the Cloud Function. 
diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 3d349eeea..7a7fe8a28 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -98,20 +98,20 @@ def main(event: Dict, context): # pylint: disable=unused-argument if basename_object_id == constants.SUCCESS_FILENAME: ordering.backlog_publisher(gcs_client, event_blob) return - elif basename_object_id == constants.BACKFILL_FILENAME: + if basename_object_id == constants.BACKFILL_FILENAME: ordering.backlog_subscriber(gcs_client, bq_client, event_blob, function_start_time) return else: # Default behavior submit job as soon as success file lands. - bkt = utils.cached_get_bucket(gcs_client, bucket_id) - success_blob: storage.Blob = bkt.blob(object_id) - utils.handle_duplicate_notification(success_blob) - apply( - gcs_client, - bq_client, - success_blob, - None, # None lock blob as there is no serialization required. - utils.create_job_id(table_ref, batch)) + if basename_object_id == constants.SUCCESS_FILENAME: + utils.handle_duplicate_notification(event_blob) + apply( + gcs_client, + bq_client, + event_blob, + # None lock blob as there is no serialization required. + None, + utils.create_job_id(table_ref, batch)) # Unexpected exceptions will actually raise which may cause a cold restart. except tuple(exceptions.EXCEPTIONS_TO_REPORT) as original_error: # We do this because we know these errors do not require a cold restart @@ -122,7 +122,7 @@ def main(event: Dict, context): # pylint: disable=unused-argument # This mostly handles the case where error reporting API is not # enabled or IAM permissions did not allow us to report errors with # error reporting API. 
- raise original_error + raise original_error # pylint: disable=raise-missing-from def lazy_error_reporting_client() -> error_reporting.Client: diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py index dea38dbec..092a232da 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py @@ -85,7 +85,7 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, last_job_done = utils.wait_on_bq_job_id( bq_client, job_id, polling_timeout) except (exceptions.BigQueryJobFailure, - google.api_core.exceptions.NotFound): + google.api_core.exceptions.NotFound) as err: raise exceptions.BigQueryJobFailure( f"previous BigQuery job: {job_id} failed or could not " "be found. This will kill the backfill subscriber for " @@ -101,7 +101,7 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, f"to resume the backfill subscriber so it can " "continue with the next item in the backlog.\n" "Original Exception:\n" - f"{traceback.format_exc()}") + f"{traceback.format_exc()}") from err else: print(f"sleeping for {polling_timeout} seconds because" f"found manual lock gs://{bkt.name}/{lock_blob.name} with" From 2d0e5a8c5d8bb0c1478fefee95019fc95199cf6f Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 9 Dec 2020 20:08:55 -0800 Subject: [PATCH 27/90] fixup import style --- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 7a7fe8a28..2cf3bbd64 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ 
-33,7 +33,6 @@ # Reuse GCP Clients across function invocations using globbals # https://cloud.google.com/functions/docs/bestpractices/tips#use_global_variables_to_reuse_objects_in_future_invocations # pylint: disable=global-statement -from .utils import apply ERROR_REPORTING_CLIENT = None @@ -105,7 +104,7 @@ def main(event: Dict, context): # pylint: disable=unused-argument else: # Default behavior submit job as soon as success file lands. if basename_object_id == constants.SUCCESS_FILENAME: utils.handle_duplicate_notification(event_blob) - apply( + utils.apply( gcs_client, bq_client, event_blob, From 7cb00e46d7560e568a39f3f6114a4cb6cd1f0364 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 9 Dec 2020 20:10:50 -0800 Subject: [PATCH 28/90] typing isort single line exclusion --- tools/cloud_functions/gcs_event_based_ingest/.isort.cfg | 1 + tools/cloud_functions/gcs_event_based_ingest/backfill.py | 4 +--- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py | 1 + .../gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py | 8 +------- .../gcs_event_based_ingest/ordered_backfill.py | 4 +--- .../tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py | 3 +-- 6 files changed, 6 insertions(+), 15 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/.isort.cfg b/tools/cloud_functions/gcs_event_based_ingest/.isort.cfg index 7b7b2d6f3..6f72bca0f 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/.isort.cfg +++ b/tools/cloud_functions/gcs_event_based_ingest/.isort.cfg @@ -2,3 +2,4 @@ src_paths=backfill.py,gcs_ocn_bq_ingest,test skip=terraform_module force_single_line=True +single_line_exclusions=typing diff --git a/tools/cloud_functions/gcs_event_based_ingest/backfill.py b/tools/cloud_functions/gcs_event_based_ingest/backfill.py index 3730074ee..f0a2ce415 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/backfill.py +++ b/tools/cloud_functions/gcs_event_based_ingest/backfill.py @@ -19,9 +19,7 @@ import os import pprint import sys -from typing import 
Dict -from typing import Iterator -from typing import List +from typing import Dict, Iterator, List import google.api_core.client_info from google.cloud import storage diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 2cf3bbd64..776f61317 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -30,6 +30,7 @@ from . import exceptions from . import ordering from . import utils + # Reuse GCP Clients across function invocations using globbals # https://cloud.google.com/functions/docs/bestpractices/tips#use_global_variables_to_reuse_objects_in_future_invocations # pylint: disable=global-statement diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index 208189e39..6fdc1192c 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -25,13 +25,7 @@ import pathlib import time import uuid -from typing import Any -from typing import Deque -from typing import Dict -from typing import List -from typing import Optional -from typing import Tuple -from typing import Union +from typing import Any, Deque, Dict, List, Optional, Tuple, Union import cachetools import google.api_core diff --git a/tools/cloud_functions/gcs_event_based_ingest/ordered_backfill.py b/tools/cloud_functions/gcs_event_based_ingest/ordered_backfill.py index 87285a891..ed0b1da79 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/ordered_backfill.py +++ b/tools/cloud_functions/gcs_event_based_ingest/ordered_backfill.py @@ -20,9 +20,7 @@ import os import pprint import sys -from typing import Dict -from typing import Iterator -from typing import List +from typing import Dict, 
Iterator, List import google.api_core.client_info from google.cloud import storage diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index e312351af..49f76389f 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -14,8 +14,7 @@ # limitations under the License. """unit tests for gcs_ocn_bq_ingest""" import re -from typing import Dict -from typing import Optional +from typing import Dict, Optional import pytest From 0be46f97df9a7c4c4b4b1e6b60aed7941a3ec132 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 9 Dec 2020 20:13:59 -0800 Subject: [PATCH 29/90] fixup gcb no-name-in-module bug --- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index 6fdc1192c..8144b4bae 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -37,7 +37,7 @@ from google.cloud import storage from . import constants # pylint: disable=no-name-in-module -from . import exceptions +from . 
import exceptions # pylint: disable=no-name-in-module def external_query( # pylint: disable=too-many-arguments From 9a0ee105cb18a81c4d869178189280acb6d97fcb Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Thu, 10 Dec 2020 14:12:12 -0800 Subject: [PATCH 30/90] add test of subscriber after subscriber exit --- .../gcs_ocn_bq_ingest/main.py | 3 +- .../gcs_ocn_bq_ingest/ordering.py | 9 +++-- .../gcs_ocn_bq_ingest/utils.py | 21 +++++----- .../gcs_event_based_ingest/tests/conftest.py | 37 ++++++++++++++++-- .../gcs_ocn_bq_ingest/test_ordering_it.py | 39 ++++++++++++++++--- .../resources/test-data/ordering/00/data.csv | 1 - .../resources/test-data/ordering/01/data.csv | 2 +- .../resources/test-data/ordering/02/data.csv | 2 +- .../test-data/ordering/{00 => 03}/_SUCCESS | 0 .../resources/test-data/ordering/03/data.csv | 1 + .../resources/test-data/ordering/04/_SUCCESS | 0 .../resources/test-data/ordering/04/data.csv | 1 + 12 files changed, 87 insertions(+), 29 deletions(-) delete mode 100644 tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/data.csv rename tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/{00 => 03}/_SUCCESS (100%) create mode 100644 tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/03/data.csv create mode 100644 tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/04/_SUCCESS create mode 100644 tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/04/data.csv diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 776f61317..fc415f94b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -151,7 +151,8 @@ def lazy_bq_client() -> bigquery.Client: default_query_config.labels = 
constants.DEFAULT_JOB_LABELS BQ_CLIENT = bigquery.Client( client_info=constants.CLIENT_INFO, - default_query_job_config=default_query_config) + default_query_job_config=default_query_config, + project=os.getenv("BQ_PROJECT", os.getenv("GCP_PROJECT"))) return BQ_CLIENT diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py index 092a232da..4e188e5b0 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py @@ -36,7 +36,7 @@ def backlog_publisher( gcs_client: storage.Client, event_blob: storage.Blob, -): +) -> Optional[storage.Blob]: """add success files to the the backlog and trigger backfill if necessary""" bkt = event_blob.bucket @@ -47,7 +47,8 @@ def backlog_publisher( "to the backlog.") table_prefix = utils.get_table_prefix(event_blob.name) - start_backfill_subscriber_if_not_running(gcs_client, bkt, table_prefix) + return start_backfill_subscriber_if_not_running(gcs_client, bkt, + table_prefix) # pylint: disable=too-many-arguments,too-many-locals @@ -118,7 +119,7 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, table_prefix) if not next_backlog_file: backfill_blob.delete(if_generation_match=backfill_blob.generation) - if (check_backlog_time + constants.ENSURE_SUBSCRIBER_SECONDS - 2 < + if (check_backlog_time + constants.ENSURE_SUBSCRIBER_SECONDS < time.monotonic()): print( "checking if the backlog is still empty for " @@ -134,7 +135,7 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, gcs_client, bkt, table_prefix) if next_backlog_file: # The backfill file may have been deleted but the backlog is - # not empty. Retrigger the backfill subscriber loop by + # not empty. Re-trigger the backfill subscriber loop by # dropping a new backfill file. 
start_backfill_subscriber_if_not_running( gcs_client, bkt, table_prefix) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index 8144b4bae..f3ac5dfc5 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -72,15 +72,16 @@ def external_query( # pylint: disable=too-many-arguments # Note, dest_table might include a partition decorator. rendered_query = query.format( - dest_dataset=dest_table_ref.dataset_id, + dest_dataset=f"`{dest_table_ref.project}`.{dest_table_ref.dataset_id}", dest_table=dest_table_ref.table_id, ) - job: bigquery.QueryJob = bq_client.query( - rendered_query, - job_config=job_config, - job_id=job_id, - ) + job: bigquery.QueryJob = bq_client.query(rendered_query, + job_config=job_config, + job_id=job_id, + project=os.getenv( + "BQ_PROJECT", + bq_client.project)) print(f"started asynchronous query job: {job.job_id}") @@ -259,12 +260,10 @@ def get_batches_for_prefix( batch.clear() if len(batches) > 1: - print(f"split into {len(batches)} load jobs.") - elif len(batches) == 1: - print("using single load job.") - else: + print(f"split into {len(batches)} batches.") + elif len(batches) < 1: raise google.api_core.exceptions.NotFound( - f"No files to load at gs://{bucket_name}/{prefix_path}!") + f"No files to load at {prefix_path}!") return batches diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index 0f6bfbca2..d8e877e0b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -338,7 +338,7 @@ def bq_wait_for_rows(bq_client: bigquery.Client, table: bigquery.Table, @pytest.fixture -def dest_ordered_update_table(request, bq, mock_env, +def 
dest_ordered_update_table(request, gcs, gcs_bucket, bq, mock_env, dest_dataset) -> bigquery.Table: with open(os.path.join(TEST_DIR, "resources", "ordering_schema.json")) as schema_file: @@ -352,11 +352,31 @@ def dest_ordered_update_table(request, bq, mock_env, ) table = bq.create_table(table) - # Our test query only updates so we need to populate the first row. - bq.load_table_from_json([{"id": 1, "alpha_update": ""}], table) + + # Our test query only updates on a single row so we need to populate + # original row. + # This can be used to simulate an existing _bqlock from a prior run of the + # subscriber loop with a job that has succeeded. + job: bigquery.LoadJob = bq.load_table_from_json( + [{ + "id": 1, + "alpha_update": "" + }], + table, + job_id_prefix=gcs_ocn_bq_ingest.constants.DEFAULT_JOB_PREFIX) + + # The subscriber will be responsible for cleaning up this file. + bqlock_obj: storage.blob.Blob = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", table.table_id, + "_bqlock" + ])) + + bqlock_obj.upload_from_string(job.job_id) def teardown(): bq.delete_table(table, not_found_ok=True) + if bqlock_obj.exists(): + bqlock_obj.delete() request.addfinalizer(teardown) return table @@ -367,10 +387,17 @@ def gcs_ordered_update_data( request, gcs_bucket, dest_dataset, dest_ordered_update_table) -> List[storage.blob.Blob]: data_objs = [] + older_success_blob: storage.blob.Blob = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, "00", "_SUCCESS" + ])) + older_success_blob.upload_from_string("") + data_objs.append(older_success_blob) + chunks = { - "00", "01", "02", + "03", } for chunk in chunks: for test_file in ["data.csv", "_SUCCESS"]: @@ -397,6 +424,8 @@ def gcs_backlog(request, gcs, gcs_bucket, gcs_ordered_update_data) -> List[storage.blob.Blob]: data_objs = [] + # We will deal with the last incremental in the test itself to test the + # behavior of a new backlog 
subscriber. for success_blob in gcs_ordered_update_data: gcs_ocn_bq_ingest.ordering.backlog_publisher(gcs, success_blob) backlog_blob = gcs_ocn_bq_ingest.ordering.success_blob_to_backlog_blob( diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py index c3cb23585..aa0cc5a94 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py @@ -114,11 +114,11 @@ def test_backlog_publisher_with_existing_backfill_file(gcs, gcs_bucket, @pytest.mark.IT @pytest.mark.ORDERING -def test_single_backlog_subscriber_in_order(bq, gcs, gcs_bucket, error, - dest_ordered_update_table, - gcs_ordered_update_data, - gcs_external_update_config, - gcs_backlog, mock_env): +def test_backlog_subscriber_in_order(bq, gcs, gcs_bucket, error, dest_dataset, + dest_ordered_update_table, + gcs_ordered_update_data, + gcs_external_update_config, gcs_backlog, + mock_env): """Test basic functionality of backlog subscriber. Populate a backlog with 3 files that make updates where we can assert that these jobs were applied in order. 
@@ -130,6 +130,8 @@ def test_single_backlog_subscriber_in_order(bq, gcs, gcs_bucket, error, prefix=f"{gcs_ocn_bq_ingest.utils.get_table_prefix(gcs_external_update_config.name)}/_backlog/" ) assert backlog_blobs.num_results == 0, "backlog is not empty" + bqlock_blob: storage.Blob = gcs_bucket.blob("_bqlock") + assert not bqlock_blob.exists(), "_bqlock was not cleaned up" rows = bq.query("SELECT alpha_update FROM " f"{dest_ordered_update_table.dataset_id}" f".{dest_ordered_update_table.table_id}") @@ -137,5 +139,30 @@ def test_single_backlog_subscriber_in_order(bq, gcs, gcs_bucket, error, num_rows = 0 for row in rows: num_rows += 1 - assert row["alpha_update"] == "ABC", "incrementals not applied in order" + assert row["alpha_update"] == "ABC", "backlog not applied in order" + assert num_rows == expected_num_rows + + # Now we will test what happens when the publisher posts another batch after + # the backlog subscriber has exited. + data_obj: storage.Blob + for test_file in ["data.csv", "_SUCCESS"]: + data_obj = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, "04", test_file + ])) + data_obj.upload_from_filename( + os.path.join(TEST_DIR, "resources", "test-data", "ordering", "04", + test_file)) + backfill_blob = gcs_ocn_bq_ingest.ordering.backlog_publisher(gcs, data_obj) + gcs_ocn_bq_ingest.ordering.backlog_subscriber(gcs, bq, backfill_blob, + time.monotonic()) + + rows = bq.query("SELECT alpha_update FROM " + f"{dest_ordered_update_table.dataset_id}" + f".{dest_ordered_update_table.table_id}") + expected_num_rows = 1 + num_rows = 0 + for row in rows: + num_rows += 1 + assert row["alpha_update"] == "ABCD", "new incremental not applied" assert num_rows == expected_num_rows diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/data.csv b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/data.csv deleted file mode 100644 index 
6b4f72558..000000000 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/data.csv +++ /dev/null @@ -1 +0,0 @@ -1|A diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/data.csv b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/data.csv index 3b4f35bfc..6b4f72558 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/data.csv +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/data.csv @@ -1 +1 @@ -1|B +1|A diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/data.csv b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/data.csv index ecf1eb9e0..3b4f35bfc 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/data.csv +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/data.csv @@ -1 +1 @@ -1|C +1|B diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/_SUCCESS b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/03/_SUCCESS similarity index 100% rename from tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/00/_SUCCESS rename to tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/03/_SUCCESS diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/03/data.csv b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/03/data.csv new file mode 100644 index 000000000..ecf1eb9e0 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/03/data.csv @@ -0,0 +1 @@ +1|C diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/04/_SUCCESS 
b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/04/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/04/data.csv b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/04/data.csv new file mode 100644 index 000000000..09b72c865 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/04/data.csv @@ -0,0 +1 @@ +1|D From feb867ee1817b34821e33dae08796831d4cea280 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Thu, 10 Dec 2020 17:11:04 -0800 Subject: [PATCH 31/90] chores: tf updates, larger machine type, etc. --- .../gcs_event_based_ingest/cloudbuild.yaml | 2 +- .../gcs_ocn_bq_ingest/main.py | 1 - .../gcs_ocn_bq_ingest/ordering.py | 42 ++++++-- .../gcs_ocn_bq_ingest/utils.py | 20 ++-- .../requirements-dev.txt | 1 + .../gcs_ocn_bq_ingest_function/README.md | 9 +- .../gcs_ocn_bq_ingest_function/main.tf | 8 +- .../gcs_ocn_bq_ingest_function/variables.tf | 23 +--- .../gcs_ocn_bq_ingest/test_ordering_it.py | 102 +++++++++++++++--- 9 files changed, 144 insertions(+), 64 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index d1367b925..697e6d702 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -128,6 +128,6 @@ steps: - 'IT' id: 'integration-test' options: - machineType: 'N1_HIGHCPU_8' + machineType: 'N1_HIGHCPU_32' substitutions: '_BUILD_DIR': 'tools/cloud_functions/gcs_event_based_ingest' diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index fc415f94b..fe4efb903 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ 
b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -104,7 +104,6 @@ def main(event: Dict, context): # pylint: disable=unused-argument return else: # Default behavior submit job as soon as success file lands. if basename_object_id == constants.SUCCESS_FILENAME: - utils.handle_duplicate_notification(event_blob) utils.apply( gcs_client, bq_client, diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py index 4e188e5b0..0e88238b3 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py @@ -19,7 +19,7 @@ import os import time import traceback -from typing import Optional +from typing import Optional, Tuple import google.api_core import google.api_core.exceptions @@ -52,17 +52,19 @@ def backlog_publisher( # pylint: disable=too-many-arguments,too-many-locals -def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, +def backlog_subscriber(gcs_client: Optional[storage.Client], + bq_client: Optional[bigquery.Client], backfill_blob: storage.Blob, function_start_time: float): """Pick up the table lock, poll BQ job id until completion and process next item in the backlog. """ + gcs_client, bq_client = _get_clients_if_none(gcs_client, bq_client) # We need to retrigger the backfill loop before the Cloud Functions Timeout. 
restart_time = function_start_time + ( float(os.getenv("FUNCTION_TIMEOUT_SEC", "60")) - constants.RESTART_BUFFER_SECONDS) bkt = backfill_blob.bucket - utils.handle_duplicate_notification(backfill_blob) + utils.handle_duplicate_notification(gcs_client, backfill_blob) table_prefix = utils.get_table_prefix(backfill_blob.name) last_job_done = False # we will poll for job completion this long in an individual iteration of @@ -118,7 +120,8 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt, table_prefix) if not next_backlog_file: - backfill_blob.delete(if_generation_match=backfill_blob.generation) + backfill_blob.delete(if_generation_match=backfill_blob.generation, + client=gcs_client) if (check_backlog_time + constants.ENSURE_SUBSCRIBER_SECONDS < time.monotonic()): print( @@ -148,7 +151,7 @@ def backlog_subscriber(gcs_client: storage.Client, bq_client: bigquery.Client, next_backlog_file.name.replace("/_backlog/", "/")) table_ref, batch = utils.gcs_path_to_table_ref_and_batch( next_success_file.name) - if not next_success_file.exists(): + if not next_success_file.exists(client=gcs_client): raise exceptions.BacklogException( "backlog contains" f"gs://{next_backlog_file.bucket}/{next_backlog_file.name}" @@ -177,7 +180,7 @@ def start_backfill_subscriber_if_not_running( if constants.START_BACKFILL_FILENAME: start_backfill_blob = bkt.blob( f"{table_prefix}/{constants.START_BACKFILL_FILENAME}") - start_backfill = start_backfill_blob.exists() + start_backfill = start_backfill_blob.exists(client=gcs_client) if start_backfill: # Create a _BACKFILL file for this table if not exists @@ -192,7 +195,7 @@ def start_backfill_subscriber_if_not_running( f"created at {backfill_blob.time_created}. exiting. 
") return backfill_blob except google.api_core.exceptions.PreconditionFailed: - backfill_blob.reload() + backfill_blob.reload(client=gcs_client) print("backfill already in progress due to: " f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} " f"created at {backfill_blob.time_created}. exiting.") @@ -243,3 +246,28 @@ def subscriber_monitor(gcs_client: storage.Client, bkt: storage.Bucket, backfill_blob = \ start_backfill_subscriber_if_not_running( gcs_client, bkt, utils.get_table_prefix(object_id)) + + +def _get_clients_if_none( + gcs_client: Optional[storage.Client], + bq_client: Optional[bigquery.Client] +) -> Tuple[storage.Client, bigquery.Client]: + """method to handle case where clients are None. + + This is a workaround to be able to run the backlog subscriber in a separate + process to facilitate some of our integration tests. Though it should be + harmless. + """ + print("instantiating missing clients in backlog subscriber this should only" + "happen during integration tests.") + if not gcs_client: + gcs_client = storage.Client(client_info=constants.CLIENT_INFO) + if not bq_client: + default_query_config = bigquery.QueryJobConfig() + default_query_config.use_legacy_sql = False + default_query_config.labels = constants.DEFAULT_JOB_LABELS + bq_client = bigquery.Client( + client_info=constants.CLIENT_INFO, + default_query_job_config=default_query_config, + project=os.getenv("BQ_PROJECT", os.getenv("GCP_PROJECT"))) + return gcs_client, bq_client diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index f3ac5dfc5..c07f718cb 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -89,7 +89,7 @@ def external_query( # pylint: disable=too-many-arguments # Check if job failed quickly while time.monotonic( ) - start_poll_for_errors < 
constants.WAIT_FOR_JOB_SECONDS: - job.reload() + job.reload(client=bq_client) if job.errors: raise exceptions.BigQueryJobFailure( f"query job {job.job_id} failed quickly: {job.errors}") @@ -126,7 +126,7 @@ def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id): ) - start_poll_for_errors < constants.WAIT_FOR_JOB_SECONDS: # Check if job failed quickly for job in jobs: - job.reload() + job.reload(client=bq_client) if job.errors: raise exceptions.BigQueryJobFailure( f"load job {job.job_id} failed quickly: {job.errors}") @@ -402,7 +402,8 @@ def recursive_update(original: Dict, update: Dict, in_place: bool = False): return out -def handle_duplicate_notification(blob_to_claim: storage.Blob): +def handle_duplicate_notification(gcs_client: storage.Client, + blob_to_claim: storage.Blob): """ Need to handle potential duplicate Pub/Sub notifications. To achieve this we will drop an empty "claimed" file that indicates @@ -412,7 +413,7 @@ def handle_duplicate_notification(blob_to_claim: storage.Blob): duplicate ingestion due to multiple Pub/Sub messages for a success file with the same creation time. 
""" - blob_to_claim.reload() + blob_to_claim.reload(client=gcs_client) created_unix_timestamp = blob_to_claim.time_created.timestamp() basename = os.path.basename(blob_to_claim.name) @@ -421,7 +422,9 @@ def handle_duplicate_notification(blob_to_claim: storage.Blob): basename, f"_claimed_{basename}_created_at_" f"{created_unix_timestamp}")) try: - claim_blob.upload_from_string("", if_generation_match=0) + claim_blob.upload_from_string("", + if_generation_match=0, + client=gcs_client) except google.api_core.exceptions.PreconditionFailed as err: raise exceptions.DuplicateNotificationException( f"gs://{blob_to_claim.bucket.name}/{blob_to_claim.name} appears" @@ -504,7 +507,7 @@ def remove_oldest_backlog_item( # https://cloud.google.com/storage/docs/json_api/v1/objects/list blob: storage.Blob for blob in backlog_blobs: - blob.delete() + blob.delete(client=gcs_client) return True # Return after deleteing first blob in the iterator return False @@ -651,7 +654,7 @@ def handle_bq_lock(gcs_client: storage.Client, lock_blob: storage.Blob, blob if next_job_id is None.""" try: if next_job_id: - if lock_blob.exists(): + if lock_blob.exists(client=gcs_client): lock_blob.upload_from_string( next_job_id, if_generation_match=lock_blob.generation, @@ -669,7 +672,7 @@ def handle_bq_lock(gcs_client: storage.Client, lock_blob: storage.Blob, ) except google.api_core.exceptions.PreconditionFailed as err: raise exceptions.BacklogException( - f"The lock at gs://{lock_blob.bucket.name}/{lock_blob.name}" + f"The lock at gs://{lock_blob.bucket.name}/{lock_blob.name} " f"was changed by another process.") from err @@ -692,6 +695,7 @@ def apply( lock_blob: storage.Blob job_id: str """ + handle_duplicate_notification(gcs_client, success_blob) bkt = success_blob.bucket if lock_blob is not None: handle_bq_lock(gcs_client, lock_blob, job_id) diff --git a/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt b/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt index 
b86a61183..f250ab6ee 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt +++ b/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt @@ -8,3 +8,4 @@ pylint pytest-parallel pytest-cov google-cloud-pubsub>=2.2.0 +pytest-repeat diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md index f1acab548..b347aceeb 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md @@ -27,24 +27,21 @@ documented [here](../gcs_ocn_bq_ingest_function/README.md) | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | app\_id | Application Name | `any` | n/a | yes | -| bigquery\_project\_ids | Additional project IDs to grant bigquery Admin / Job user for the data ingester account | `list(string)` | `[]` | no | +| bigquery\_project\_ids | Additional project IDs to grant bigquery Admin for the data ingester account | `list(string)` | `[]` | no | | cloudfunctions\_source\_bucket | GCS bucket to store Cloud Functions Source | `any` | n/a | yes | | data\_ingester\_sa | Service Account Email responsible for ingesting data to BigQuery | `any` | n/a | yes | -| destination\_regex | A [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for destination `dataset`, `table`, (optional: `partition`, `batch`) | `string` | `""` | no | +| environment\_variables | Environment variables to set on the cloud function. 
| `map(string)` | `{}` | no | | function\_source\_folder | Path to Cloud Function source | `string` | `"../gcs_event_based_ingest/gcs_ocn_bq_ingest/"` | no | | input\_bucket | GCS bucket to watch for new files | `any` | n/a | yes | | input\_prefix | GCS prefix to watch for new files in input\_bucket | `any` | `null` | no | -| job\_prefix | Prefix for BigQuery Job IDs | `string` | `""` | no | -| max\_batch\_bytes | Max bytes for BigQuery Load job | `string` | `""` | no | | project\_id | GCP Project ID containing cloud function, and input bucket | `any` | n/a | yes | | region | GCP region in which to deploy cloud function | `string` | `"us-central1"` | no | -| success\_filename | Filename to trigger a load of a prefix | `string` | `""` | no | | use\_pubsub\_notifications | Setting this to true will use Pub/Sub notifications By default we will use Cloud Functions Event direct notifications. See https://cloud.google.com/storage/docs/pubsub-notifications. | `bool` | `false` | no | -| wait\_for\_job\_seconds | How long to wait before deciding BQ job did not fail quickly | `string` | `""` | no | ## Outputs | Name | Description | |------|-------------| | cloud-function | instance of cloud function deployed by this module. 
| +| data-ingester-sa | data ingester service account email created as cloud function identity | diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf index faf9b3b82..16d7ce821 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf @@ -69,13 +69,7 @@ resource "google_cloudfunctions_function" "gcs_to_bq" { source_archive_bucket = var.cloudfunctions_source_bucket source_archive_object = google_storage_bucket_object.function_zip_object.name entry_point = "main" - environment_variables = { - WAIT_FOR_JOB_SECONDS = var.wait_for_job_seconds - SUCCESS_FILENAME = var.success_filename - DESTINATION_REGEX = var.destination_regex - MAX_BATCH_BYTES = var.max_batch_bytes - JOB_PREFIX = var.job_prefix - } + environment_variables = var.environment_variables event_trigger { event_type = var.use_pubsub_notifications ? "providers/cloud.pubsub/eventTypes/topic.publish" : "google.storage.object.finalize" resource = var.use_pubsub_notifications ? 
google_pubsub_topic.notification_topic[0].id : module.bucket.name diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf index 0452e9769..d26edee2e 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf @@ -36,27 +36,12 @@ variable "data_ingester_sa" { description = "Service Account Email responsible for ingesting data to BigQuery" } -variable "wait_for_job_seconds" { - description = "How long to wait before deciding BQ job did not fail quickly" - default = "" -} -variable "success_filename" { - description = "Filename to trigger a load of a prefix" - default = "" -} -variable "destination_regex" { - description = "A [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for destination `dataset`, `table`, (optional: `partition`, `batch`)" - default = "" -} -variable "max_batch_bytes" { - description = "Max bytes for BigQuery Load job" - default = "" +variable "environment_variables" { + description = "Environment variables to set on the cloud function." 
+ type = map(string) + default = {} } -variable "job_prefix" { - description = "Prefix for BigQuery Job IDs " - default = "" -} variable "region" { description = "GCP region in which to deploy cloud function" diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py index aa0cc5a94..197d4f92d 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. """integration tests for the ordering behavior of backlog gcs_ocn_bq_ingest""" +import multiprocessing import os import queue +import random import time import pytest +from google.cloud import bigquery from google.cloud import storage import gcs_ocn_bq_ingest.constants @@ -114,14 +117,17 @@ def test_backlog_publisher_with_existing_backfill_file(gcs, gcs_bucket, @pytest.mark.IT @pytest.mark.ORDERING -def test_backlog_subscriber_in_order(bq, gcs, gcs_bucket, error, dest_dataset, - dest_ordered_update_table, - gcs_ordered_update_data, - gcs_external_update_config, gcs_backlog, - mock_env): +def test_backlog_subscriber_in_order_with_new_batch_after_exit( + bq, gcs, gcs_bucket, dest_dataset, dest_ordered_update_table, + gcs_ordered_update_data, gcs_external_update_config, gcs_backlog, + mock_env): """Test basic functionality of backlog subscriber. Populate a backlog with 3 files that make updates where we can assert that these jobs were applied in order. + + To ensure that the subscriber cleans up properly after itself before exit, + we will drop a 4th batch after the subscriber has exited and assert that it + gets applied as expected. 
""" gcs_ocn_bq_ingest.ordering.backlog_subscriber(gcs, bq, gcs_external_update_config, @@ -144,16 +150,8 @@ def test_backlog_subscriber_in_order(bq, gcs, gcs_bucket, error, dest_dataset, # Now we will test what happens when the publisher posts another batch after # the backlog subscriber has exited. - data_obj: storage.Blob - for test_file in ["data.csv", "_SUCCESS"]: - data_obj = gcs_bucket.blob("/".join([ - f"{dest_dataset.project}.{dest_dataset.dataset_id}", - dest_ordered_update_table.table_id, "04", test_file - ])) - data_obj.upload_from_filename( - os.path.join(TEST_DIR, "resources", "test-data", "ordering", "04", - test_file)) - backfill_blob = gcs_ocn_bq_ingest.ordering.backlog_publisher(gcs, data_obj) + backfill_blob = _post_a_new_batch(gcs_bucket, dest_dataset, + dest_ordered_update_table) gcs_ocn_bq_ingest.ordering.backlog_subscriber(gcs, bq, backfill_blob, time.monotonic()) @@ -166,3 +164,77 @@ def test_backlog_subscriber_in_order(bq, gcs, gcs_bucket, error, dest_dataset, num_rows += 1 assert row["alpha_update"] == "ABCD", "new incremental not applied" assert num_rows == expected_num_rows + + +@pytest.mark.IT +@pytest.mark.ORDERING +@pytest.mark.repeat(5) +def test_backlog_subscriber_in_order_with_new_batch_while_running( + bq, gcs, gcs_bucket, dest_dataset, dest_ordered_update_table, + gcs_ordered_update_data, gcs_external_update_config: storage.Blob, + gcs_backlog, mock_env): + """Test functionality of backlog subscriber when new batches are added + before the subscriber is done finishing the existing backlog. + + Populate a backlog with 3 files that make updates where we can assert + that these jobs were applied in order. + In another process populate a fourth batch, and call the publisher. + """ + # Cannot pickle clients to another process so we need to recreate some + # objects without the client property. 
+ backfill_blob = storage.Blob.from_string( + f"gs://{gcs_external_update_config.bucket.name}/" + f"{gcs_external_update_config.name}") + dataset = bigquery.Dataset.from_string( + f"{dest_dataset.project}.{dest_dataset.dataset_id}") + table = bigquery.Table.from_string( + f"{dest_dataset.project}.{dest_dataset.dataset_id}." + f"{dest_ordered_update_table.table_id}") + bkt = storage.Bucket.from_string(f"gs://{gcs_bucket.name}") + + # Run subscriber w/ backlog and publisher w/ new batch in parallel. + with multiprocessing.Pool(processes=2) as pool: + res_subscriber = pool.apply_async( + gcs_ocn_bq_ingest.ordering.backlog_subscriber, + (None, None, backfill_blob, time.monotonic())) + # We run this test multiple times and sleep a random amount to simulate + # the next batch landing at a random time during the backfill. + time.sleep(random.uniform(0, 2)) + res_backlog_publisher = pool.apply_async(_post_a_new_batch, + (bkt, dataset, table)) + + # wait on each function to complete + res_subscriber.wait() + res_backlog_publisher.wait() + + backlog_blobs = gcs_bucket.list_blobs( + prefix=f"{gcs_ocn_bq_ingest.utils.get_table_prefix(gcs_external_update_config.name)}/_backlog/" + ) + assert backlog_blobs.num_results == 0, "backlog is not empty" + bqlock_blob: storage.Blob = gcs_bucket.blob("_bqlock") + assert not bqlock_blob.exists(), "_bqlock was not cleaned up" + rows = bq.query("SELECT alpha_update FROM " + f"{dest_ordered_update_table.dataset_id}" + f".{dest_ordered_update_table.table_id}") + expected_num_rows = 1 + num_rows = 0 + for row in rows: + num_rows += 1 + assert row["alpha_update"] == "ABCD", "backlog not applied in order" + assert num_rows == expected_num_rows + + +def _post_a_new_batch(gcs_bucket, dest_dataset, dest_ordered_update_table): + # We may run this in another process and cannot pickle client objects + gcs = storage.Client() + data_obj: storage.Blob + for test_file in ["data.csv", "_SUCCESS"]: + data_obj = gcs_bucket.blob("/".join([ + 
f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, "04", test_file + ])) + data_obj.upload_from_filename(os.path.join(TEST_DIR, "resources", + "test-data", "ordering", + "04", test_file), + client=gcs) + return gcs_ocn_bq_ingest.ordering.backlog_publisher(gcs, data_obj) From 2218212d585eb078b5872e110608ad4cbf6aa5ba Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Thu, 10 Dec 2020 17:16:58 -0800 Subject: [PATCH 32/90] terraform fmt --- .../terraform_module/gcs_ocn_bq_ingest_function/variables.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf index d26edee2e..ca3073a0d 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf @@ -38,8 +38,8 @@ variable "data_ingester_sa" { variable "environment_variables" { description = "Environment variables to set on the cloud function." 
- type = map(string) - default = {} + type = map(string) + default = {} } From d528d851f390bdc71db4c4305ced0cf8c02755a4 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Fri, 11 Dec 2020 13:37:23 -0800 Subject: [PATCH 33/90] handle abandoned _BACKFILL and other review feedback --- .../gcs_event_based_ingest/ORDERING.md | 16 +- .../gcs_event_based_ingest/cloudbuild.yaml | 12 +- .../gcs_ocn_bq_ingest/exceptions.py | 1 - .../gcs_ocn_bq_ingest/main.py | 100 ++++++---- .../gcs_ocn_bq_ingest/ordering.py | 40 +++- .../gcs_ocn_bq_ingest/utils.py | 14 +- .../ordered_backfill.py | 177 ------------------ .../gcs_event_based_ingest/pytest.ini | 1 + .../requirements-dev.txt | 2 +- .../gcs_event_based_ingest/tests/conftest.py | 6 + .../test_gcs_ocn_bq_ingest.py | 48 +++++ .../test_gcs_ocn_bq_ingest_it.py | 6 - .../gcs_ocn_bq_ingest/test_ordering_it.py | 10 +- 13 files changed, 174 insertions(+), 259 deletions(-) delete mode 100644 tools/cloud_functions/gcs_event_based_ingest/ordered_backfill.py diff --git a/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md b/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md index 8a3dda5d8..c85020276 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md +++ b/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md @@ -44,8 +44,8 @@ gs://${BUCKET}/${DATASET}/${TABLE}/incremental/_config/ORDERME ## Dealing With Out-of-Order Publishing to GCS During Historical Load In some use cases, there is a period where incrementals that must be applied in -order are uploaded in parallel (meaning their _SUCCESS files are expected to be -out of order). This typically happens during some historical backfill period. +order are uploaded in parallel (meaning their `_SUCCESS` files are expected to +be out of order). This typically happens during some historical backfill period. 
This can be solved by setting the `START_BACKFILL_FILENAME` environment variable to a file name that indicates that the parallel upload of historical incrementals is complete (e.g. `_HISTORYDONE`). This will cause all success @@ -90,7 +90,8 @@ The Backlog Publisher has two responsibilities: 1. add incoming success files to a table's `_backlog` so they are not "forgotten" by the ingestion system. 1. if there is a non-empty backlog start the backfill subscriber (if one is not -already running). This is accomplished by dropping a table level `_BACKFILL` file. +already running). This is accomplished by dropping a table level `_BACKFILL` +file if it does not already exist. ### Backlog Subscriber The Backlog Subscriber is responsible for keeping track of BigQuery jobs running @@ -105,11 +106,11 @@ The state of what BigQuery job is currently running on a table is kept in a In order to escape the maximum nine-minute (540s) Cloud Function Timeout, the backfill subscriber will re-trigger itself by posting a new `_BACKFILL` file until the `_backlog` for the table prefix is empty. When a new success file -arrives it is the reponsibility of the publisher to restart the subscriber. +arrives it is the responsibility of the publisher to restart the subscriber. ### Note on Handling Race Condition -we use subscribe_monitor to handle a rare race condition where: +We use `subscribe_monitor` to handle a rare race condition where: 1. subscriber reads an empty backlog (before it can delete the _BACKFILL blob...) @@ -130,9 +131,8 @@ loop of the backfill subscriber but this loop will not take any action and this wasted compute is far better than dropping a batch of data. 1. On the subscriber side we check if there was more time than 10 seconds between list backlog items and delete backfill calls. If so the -subscriber double checks that the backlog is still empty. This way -we always handle this race condition either in this monitor or in the -subscriber itself. 
+subscriber double checks that the backlog is still empty. This way we always +handle this race condition either in this monitor or in the subscriber itself. ### Visualization of Ordering Triggers in the Cloud Function diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index 697e6d702..4a6d5b519 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -113,19 +113,21 @@ steps: - 'mypy-main' - 'mypy-tests' - 'terraform-fmt' + entrypoint: /bin/sh args: - - '-m' - - 'not IT' + - '-c' + # pip installing again to get GCB to recognize mocker from pytest-mock + - 'pip install -r requirements-dev.txt && python3 -m pytest -m "not IT"' id: 'unit-test' - name: 'gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci' dir: '${_BUILD_DIR}' waitFor: - 'build-ci-image' - 'unit-test' + entrypoint: /bin/sh args: - - '--maxfail=1' - - '-m' - - 'IT' + - '-c' + - 'pip install -r requirements-dev.txt && python3 -m pytest -m IT' id: 'integration-test' options: machineType: 'N1_HIGHCPU_32' diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/exceptions.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/exceptions.py index 908db717c..a1126c22e 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/exceptions.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/exceptions.py @@ -48,5 +48,4 @@ class BacklogException(Exception): UnexpectedTriggerException, DestinationRegexMatchException, BacklogException, - DuplicateNotificationException, } diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index fe4efb903..73b7b9657 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ 
b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -18,7 +18,8 @@ """ import os import time -from typing import Dict +import traceback +from typing import Dict, Optional # pylint in cloud build is being flaky about this import discovery. # pylint: disable=no-name-in-module @@ -65,7 +66,6 @@ def main(event: Dict, context): # pylint: disable=unused-argument gcs_client = lazy_gcs_client() bq_client = lazy_bq_client() - table_ref, batch = utils.gcs_path_to_table_ref_and_batch(object_id) enforce_ordering = (constants.ORDER_PER_TABLE or utils.look_for_config_in_parents( @@ -75,43 +75,14 @@ def main(event: Dict, context): # pylint: disable=unused-argument bkt: storage.Bucket = utils.cached_get_bucket(gcs_client, bucket_id) event_blob: storage.Blob = bkt.blob(object_id) - if enforce_ordering: - # For SUCCESS files in a backlog directory, ensure that subscriber - # is running. - if (basename_object_id == constants.SUCCESS_FILENAME - and "/_backlog/" in object_id): - print( - f"This notification was for " - f"gs://{bucket_id}/{object_id} a" - f"{constants.SUCCESS_FILENAME} in a" - "/_backlog/ directory. " - f"Watiting {constants.ENSURE_SUBSCRIBER_SECONDS} seconds to " - "ensure that subscriber is running.") - ordering.subscriber_monitor(gcs_client, bkt, object_id) - return - if (constants.START_BACKFILL_FILENAME and basename_object_id - == constants.START_BACKFILL_FILENAME): - # This will be the first backfill file. - ordering.start_backfill_subscriber_if_not_running( - gcs_client, bkt, utils.get_table_prefix(object_id)) - return - if basename_object_id == constants.SUCCESS_FILENAME: - ordering.backlog_publisher(gcs_client, event_blob) - return - if basename_object_id == constants.BACKFILL_FILENAME: - ordering.backlog_subscriber(gcs_client, bq_client, event_blob, - function_start_time) - return - else: # Default behavior submit job as soon as success file lands. 
- if basename_object_id == constants.SUCCESS_FILENAME: - utils.apply( - gcs_client, - bq_client, - event_blob, - # None lock blob as there is no serialization required. - None, - utils.create_job_id(table_ref, batch)) + triage_event(gcs_client, bq_client, event_blob, function_start_time, + enforce_ordering) + # Unexpected exceptions will actually raise which may cause a cold restart. + except exceptions.DuplicateNotificationException: + print("recieved duplicate notification. this was handled gracefully." + f"{traceback.format_exc()}") + except tuple(exceptions.EXCEPTIONS_TO_REPORT) as original_error: # We do this because we know these errors do not require a cold restart # of the cloud function. @@ -124,6 +95,59 @@ def main(event: Dict, context): # pylint: disable=unused-argument raise original_error # pylint: disable=raise-missing-from +def triage_event(gcs_client: Optional[storage.Client], + bq_client: Optional[bigquery.Client], + event_blob: storage.Blob, + function_start_time: float, + enforce_ordering: bool = False): + """call the appropriate method based on the details of the trigger event + blob.""" + bkt = event_blob.bucket + basename_object_id = os.path.basename(event_blob.name) + table_ref, batch = utils.gcs_path_to_table_ref_and_batch(event_blob.name) + if enforce_ordering: + # For SUCCESS files in a backlog directory, ensure that subscriber + # is running. + if (basename_object_id == constants.SUCCESS_FILENAME + and "/_backlog/" in event_blob.name): + print(f"This notification was for " + f"gs://{bkt.name}/{event_blob.name} a" + f"{constants.SUCCESS_FILENAME} in a" + "/_backlog/ directory. " + f"Watiting {constants.ENSURE_SUBSCRIBER_SECONDS} seconds to " + "ensure that subscriber is running.") + ordering.subscriber_monitor(gcs_client, bkt, event_blob.name) + return + if (constants.START_BACKFILL_FILENAME + and basename_object_id == constants.START_BACKFILL_FILENAME): + # This will be the first backfill file. 
+ ordering.start_backfill_subscriber_if_not_running( + gcs_client, bkt, utils.get_table_prefix(event_blob.name)) + return + if basename_object_id == constants.SUCCESS_FILENAME: + ordering.backlog_publisher(gcs_client, event_blob) + return + if basename_object_id == constants.BACKFILL_FILENAME: + if (event_blob.name != f"{utils.get_table_prefix(event_blob.name)}/" + f"{constants.BACKFILL_FILENAME}"): + raise RuntimeError( + f"recieved notification for gs://{event_blob.bucket.name}/" + f"{event_blob.name}\n{constants.BACKFILL_FILENAME} files " + "are expected only at the table prefix level.") + ordering.backlog_subscriber(gcs_client, bq_client, event_blob, + function_start_time) + return + else: # Default behavior submit job as soon as success file lands. + if basename_object_id == constants.SUCCESS_FILENAME: + utils.apply( + gcs_client, + bq_client, + event_blob, + # None lock blob as there is no serialization required. + None, + utils.create_job_id(table_ref, batch)) + + def lazy_error_reporting_client() -> error_reporting.Client: """ Return a error reporting client that may be shared between cloud function diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py index 0e88238b3..b8ea9c323 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py @@ -14,8 +14,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Background Cloud Function for loading data from GCS to BigQuery. +"""Implement function to ensure loading data from GCS to BigQuery in order. 
""" +import datetime import os import time import traceback @@ -192,7 +193,7 @@ def start_backfill_subscriber_if_not_running( client=gcs_client) print("triggered backfill with " f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} " - f"created at {backfill_blob.time_created}. exiting. ") + f"created at {backfill_blob.time_created}. exiting.") return backfill_blob except google.api_core.exceptions.PreconditionFailed: backfill_blob.reload(client=gcs_client) @@ -225,7 +226,7 @@ def subscriber_monitor(gcs_client: storage.Client, bkt: storage.Bucket, 2. a new item is added to the backlog (causing a separate function invocation) 3. In this new invocation we reach this point in the code path - and start_subscriber_if_not_running sees the old _BACKFILL + and start_backlog_subscriber_if_not_running sees the old _BACKFILL and does not create a new one. 4. The subscriber deletes the _BACKFILL blob and exits without processing the new item on the backlog from #2. @@ -240,23 +241,42 @@ def subscriber_monitor(gcs_client: storage.Client, bkt: storage.Bucket, backfill_blob = start_backfill_subscriber_if_not_running( gcs_client, bkt, utils.get_table_prefix(object_id)) - time.sleep(constants.ENSURE_SUBSCRIBER_SECONDS) - while not utils.wait_on_gcs_blob(gcs_client, backfill_blob, - constants.ENSURE_SUBSCRIBER_SECONDS): - backfill_blob = \ + # backfill blob may be none if the START_BACKFILL_FILENAME has not been + # dropped + if backfill_blob: + # Handle case where a subscriber loop was not able to repost the + # backfill file before the cloud function timeout. + if (datetime.datetime.utcnow() - backfill_blob.time_created > + datetime.timedelta( + seconds=int(os.getenv("FUNCTION_TIMEOUT_SEC", "60")))): + print( + f"backfill blob gs://{backfill_blob.bucket.name}/" + f"{backfill_blob.name} appears to be abandoned as it is older " + "than the cloud function timeout of " + f"{os.getenv('FUNCTION_TIMEOUT_SEC', '60')} seconds." 
+ "reposting this backfill blob to restart the backfill" + "subscriber for this table.") + backfill_blob.delete(client=gcs_client) start_backfill_subscriber_if_not_running( gcs_client, bkt, utils.get_table_prefix(object_id)) + return + + time.sleep(constants.ENSURE_SUBSCRIBER_SECONDS) + while not utils.wait_on_gcs_blob(gcs_client, backfill_blob, + constants.ENSURE_SUBSCRIBER_SECONDS): + backfill_blob = \ + start_backfill_subscriber_if_not_running( + gcs_client, bkt, utils.get_table_prefix(object_id)) def _get_clients_if_none( - gcs_client: Optional[storage.Client], - bq_client: Optional[bigquery.Client] + gcs_client: Optional[storage.Client], bq_client: Optional[bigquery.Client] ) -> Tuple[storage.Client, bigquery.Client]: """method to handle case where clients are None. This is a workaround to be able to run the backlog subscriber in a separate process to facilitate some of our integration tests. Though it should be - harmless. + harmless if these clients are recreated in the Cloud Function. """ print("instantiating missing clients in backlog subscriber this should only" "happen during integration tests.") diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index c07f718cb..1c83e7deb 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -306,14 +306,6 @@ def parse_notification(notification: dict) -> Tuple[str, str]: "https://cloud.google.com/functions/docs/tutorials/storage") -# cache lookups against GCS API for 1 second as buckets / objects have update -# limit of once per second and we might do several of the same lookup during -# the functions lifetime. This should improve performance by eliminating -# unnecessary API calls. 
The lookups on bucket and objects in this function -# should not be changing during the function's lifetime as this would lead to -# non-deterministic results with or without this cache. -# https://cloud.google.com/storage/quotas -@cachetools.cached(cachetools.TTLCache(maxsize=1024, ttl=1)) def read_gcs_file(gcs_client: storage.Client, gsurl: str) -> str: """ Read a GCS object as a string @@ -338,7 +330,11 @@ def read_gcs_file_if_exists(gcs_client: storage.Client, return None -# Cache bucket lookups (see reasoning in comment above) +# cache lookups against GCS API for 1 second as buckets have update +# limit of once per second and we might do several of the same lookup during +# the functions lifetime. This should improve performance by eliminating +# unnecessary API calls. +# https://cloud.google.com/storage/quotas @cachetools.cached(cachetools.TTLCache(maxsize=1024, ttl=1)) def cached_get_bucket( gcs_client: storage.Client, diff --git a/tools/cloud_functions/gcs_event_based_ingest/ordered_backfill.py b/tools/cloud_functions/gcs_event_based_ingest/ordered_backfill.py deleted file mode 100644 index ed0b1da79..000000000 --- a/tools/cloud_functions/gcs_event_based_ingest/ordered_backfill.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Command Line utility for backfilling gcs_ocn_bq_ingest cloud function when -ordering of incrementals is required -""" -import argparse -import concurrent.futures -import logging -import os -import pprint -import sys -from typing import Dict, Iterator, List - -import google.api_core.client_info -from google.cloud import storage - -import gcs_ocn_bq_ingest.main # pylint: disable=import-error - -CLIENT_INFO = google.api_core.client_info.ClientInfo( - user_agent="google-pso-tool/bq-severless-loader-cli") - -os.environ["FUNCTION_NAME"] = "backfill-cli" - - -def find_blobs_with_suffix( - gcs_client: storage.Client, - prefix: str, - suffix: str = "_SUCCESS", -) -> Iterator[storage.Blob]: - """ - Find GCS blobs with a given suffix. - - :param gcs_client: storage.Client - :param prefix: A GCS prefix to search i.e. gs://bucket/prefix/to/search - :param suffix: A suffix in blob name to match - :return: Iterable of blobs matching the suffix. - """ - prefix_blob: storage.Blob = storage.Blob.from_string(prefix) - # filter passes on scalability / laziness advantages of iterator. - return filter( - lambda blob: blob.name.endswith(suffix), - prefix_blob.bucket.list_blobs(client=gcs_client, - prefix=prefix_blob.name)) - - -def main(args: argparse.Namespace): - """main entry point for backfill CLI.""" - gcs_client: storage.Client = storage.Client(client_info=CLIENT_INFO) - pubsub_client = None - suffix = args.success_filename - if args.destination_regex: - os.environ["DESTINATION_REGEX"] = args.destination_regex - if args.mode == "NOTIFICATIONS": - if not args.pubsub_topic: - raise ValueError("when passing mode=NOTIFICATIONS" - "you must also pass pubsub_topic.") - # import is here because this utility can be used without - # google-cloud-pubsub dependency in LOCAL mode. - # pylint: disable=import-outside-toplevel - from google.cloud import pubsub - pubsub_client = pubsub.PublisherClient() - - # These are all I/O bound tasks so use Thread Pool concurrency for speed. 
- with concurrent.futures.ThreadPoolExecutor() as executor: - future_to_gsurl = {} - for blob in find_blobs_with_suffix(gcs_client, args.gcs_path, suffix): - if pubsub_client: - # kwargs are message attributes - # https://googleapis.dev/python/pubsub/latest/publisher/index.html#publish-a-message - logging.info("sending pubsub message for: %s", - f"gs://{blob.bucket.name}/{blob.name}") - future_to_gsurl[executor.submit( - pubsub_client.publish, - args.pubsub_topic, - b'', # cloud function ignores message body - bucketId=blob.bucket.name, - objectId=blob.name, - _metaInfo="this message was submitted with " - "gcs_ocn_bq_ingest backfill.py utility" - )] = f"gs://{blob.bucket.name}/{blob.name}" - else: - logging.info("running cloud function locally for: %s", - f"gs://{blob.bucket.name}/{blob.name}") - future_to_gsurl[executor.submit( - gcs_ocn_bq_ingest.main.main, - { - "attributes": { - "bucketId": blob.bucket.name, - "objectId": blob.name - } - }, - None, - )] = f"gs://{blob.bucket.name}/{blob.name}" - exceptions: Dict[str, Exception] = dict() - for future in concurrent.futures.as_completed(future_to_gsurl): - gsurl = future_to_gsurl[future] - try: - future.result() - except Exception as err: # pylint: disable=broad-except - logging.error("Error processing %s: %s", gsurl, err) - exceptions[gsurl] = err - if exceptions: - raise RuntimeError("The following errors were encountered:\n" + - pprint.pformat(exceptions)) - - -def parse_args(args: List[str]) -> argparse.Namespace: - """argument parser for backfill CLI""" - parser = argparse.ArgumentParser( - description="utility to backfill success file notifications " - "or run the cloud function locally in concurrent threads.") - - parser.add_argument( - "--gcs-path", - "-p", - help="GCS path (e.g. 
gs://bucket/prefix/to/search/)to search for " - "existing _SUCCESS files", - required=True, - ) - - parser.add_argument( - "--mode", - "-m", - help="How to perform the backfill: LOCAL run cloud function main" - " method locally (in concurrent threads) or NOTIFICATIONS just push" - " notifications to Pub/Sub for a deployed version of the cloud function" - " to pick up. Default is NOTIFICATIONS.", - required=False, - type=str.upper, - choices=["LOCAL", "NOTIFICATIONS"], - default="NOTIFICATIONS", - ) - - parser.add_argument( - "--pubsub-topic", - "--topic", - "-t", - help="Pub/Sub notifications topic to post notifications for. " - "i.e. projects/{PROJECT_ID}/topics/{TOPIC_ID} " - "Required if using NOTIFICATIONS mode.", - required=False, - default=None, - ) - - parser.add_argument( - "--success-filename", - "-f", - help="Override the default success filename '_SUCCESS'", - required=False, - default="_SUCCESS", - ) - - parser.add_argument( - "--destination-regex", - "-r", - help="Override the default destination regex for determining BigQuery" - "destination based on information encoded in the GCS path of the" - "success file", - required=False, - default=None, - ) - return parser.parse_args(args) - - -if __name__ == "__main__": - main(parse_args(sys.argv)) diff --git a/tools/cloud_functions/gcs_event_based_ingest/pytest.ini b/tools/cloud_functions/gcs_event_based_ingest/pytest.ini index 3864588b3..07bd1315d 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/pytest.ini +++ b/tools/cloud_functions/gcs_event_based_ingest/pytest.ini @@ -4,3 +4,4 @@ markers = ORDERING: marks tests that test features related to ordering CLI: marks tests of CLI utilities addopts = --workers=auto + diff --git a/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt b/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt index f250ab6ee..a9b6c99c0 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt +++ 
b/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt @@ -8,4 +8,4 @@ pylint pytest-parallel pytest-cov google-cloud-pubsub>=2.2.0 -pytest-repeat +pytest-mock diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index d8e877e0b..5b06554f3 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -77,6 +77,12 @@ def mock_env(gcs, monkeypatch): monkeypatch.setenv("FUNCTION_TIMEOUT_SEC", "120") +@pytest.fixture +def ordered_mock_env(mock_env, monkeypatch): + """environment variable mocks""" + monkeypatch.setenv("ORDER_PER_TABLE", "TRUE") + + @pytest.fixture def dest_dataset(request, bq, mock_env, monkeypatch): random_dataset = f"test_bq_ingest_gcf_{str(uuid.uuid4())[:8].replace('-','_')}" diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index 49f76389f..6dfc57dec 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -14,9 +14,11 @@ # limitations under the License. 
"""unit tests for gcs_ocn_bq_ingest""" import re +import time from typing import Dict, Optional import pytest +from google.cloud import storage import gcs_ocn_bq_ingest.constants import gcs_ocn_bq_ingest.main @@ -236,3 +238,49 @@ def test_recursive_update(original, update, expected): ]) def test_get_table_prefix(test_input, expected): assert gcs_ocn_bq_ingest.utils.get_table_prefix(test_input) == expected + + +def test_triage_event(mock_env, mocker): + test_event_blob: storage.Blob = storage.Blob.from_string( + "gs://foo/bar/baz/00/_SUCCESS") + apply_mock = mocker.patch('gcs_ocn_bq_ingest.utils.apply') + gcs_ocn_bq_ingest.main.triage_event(None, None, test_event_blob, + time.monotonic()) + apply_mock.assert_called_once() + + +def test_triage_event_ordered(ordered_mock_env, mocker): + enforce_ordering = True + test_event_blob: storage.Blob = storage.Blob.from_string( + "gs://foo/bar/baz/00/_SUCCESS") + apply_mock = mocker.patch('gcs_ocn_bq_ingest.utils.apply') + publisher_mock = mocker.patch( + 'gcs_ocn_bq_ingest.ordering.backlog_publisher') + gcs_ocn_bq_ingest.main.triage_event(None, + None, + test_event_blob, + time.monotonic(), + enforce_ordering=enforce_ordering) + publisher_mock.assert_called_once() + + test_event_blob: storage.Blob = storage.Blob.from_string( + "gs://foo/bar/baz/_BACKFILL") + subscriber_mock = mocker.patch( + 'gcs_ocn_bq_ingest.ordering.backlog_subscriber') + gcs_ocn_bq_ingest.main.triage_event(None, + None, + test_event_blob, + time.monotonic(), + enforce_ordering=enforce_ordering) + subscriber_mock.assert_called_once() + + test_event_blob: storage.Blob = storage.Blob.from_string( + "gs://foo/bar/baz/_backlog/00/_SUCCESS") + monitor_mock = mocker.patch('gcs_ocn_bq_ingest.ordering.subscriber_monitor') + gcs_ocn_bq_ingest.main.triage_event(None, + None, + test_event_blob, + time.monotonic(), + enforce_ordering=enforce_ordering) + monitor_mock.assert_called_once() + apply_mock.assert_not_called() diff --git 
a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py index 6459a206b..e1fe45b18 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py @@ -14,7 +14,6 @@ """integration tests for gcs_ocn_bq_ingest""" import os import time -import unittest.mock import google.cloud.exceptions import pytest @@ -80,11 +79,6 @@ def test_duplicate_success_notification(bq, gcs_data, dest_dataset, dest_table, } } gcs_ocn_bq_ingest.main.main(test_event, None) - with unittest.mock.patch.object(google.cloud.error_reporting.Client, - "report_exception") as mock_method: - gcs_ocn_bq_ingest.main.main(test_event, None) - - mock_method.assert_called_once() test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation", "part-m-00001") diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py index 197d4f92d..c231d498e 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py @@ -168,11 +168,13 @@ def test_backlog_subscriber_in_order_with_new_batch_after_exit( @pytest.mark.IT @pytest.mark.ORDERING -@pytest.mark.repeat(5) +@pytest.mark.parametrize('execution_number', range(5)) def test_backlog_subscriber_in_order_with_new_batch_while_running( - bq, gcs, gcs_bucket, dest_dataset, dest_ordered_update_table, - gcs_ordered_update_data, gcs_external_update_config: storage.Blob, - gcs_backlog, mock_env): + execution_number, + bq, gcs, gcs_bucket, dest_dataset, dest_ordered_update_table, + gcs_ordered_update_data, 
gcs_external_update_config: storage.Blob, + gcs_backlog, mock_env +): """Test functionality of backlog subscriber when new batches are added before the subscriber is done finishing the existing backlog. From a0114e17f7f920afccd54459ace62efadd20afc6 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Fri, 11 Dec 2020 17:32:16 -0800 Subject: [PATCH 34/90] improve tests --- .../gcs_event_based_ingest/cloudbuild.yaml | 1 + .../gcs_ocn_bq_ingest/main.py | 3 +- .../gcs_ocn_bq_ingest/ordering.py | 19 ++++--- .../gcs_ocn_bq_ingest/utils.py | 2 +- .../requirements-dev.txt | 1 + .../gcs_event_based_ingest/requirements.txt | 4 +- .../gcs_event_based_ingest/tests/conftest.py | 2 +- .../gcs_ocn_bq_ingest/test_ordering_it.py | 53 ++++++++++++++----- 8 files changed, 60 insertions(+), 25 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index 4a6d5b519..a41666c65 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -129,6 +129,7 @@ steps: - '-c' - 'pip install -r requirements-dev.txt && python3 -m pytest -m IT' id: 'integration-test' +timeout: '1200s' options: machineType: 'N1_HIGHCPU_32' substitutions: diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 73b7b9657..b172f430f 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -132,7 +132,8 @@ def triage_event(gcs_client: Optional[storage.Client], f"{constants.BACKFILL_FILENAME}"): raise RuntimeError( f"recieved notification for gs://{event_blob.bucket.name}/" - f"{event_blob.name}\n{constants.BACKFILL_FILENAME} files " + f"{event_blob.name}\n" + f"{constants.BACKFILL_FILENAME} files " "are expected only at the table prefix 
level.") ordering.backlog_subscriber(gcs_client, bq_client, event_blob, function_start_time) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py index b8ea9c323..74e490601 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py @@ -64,12 +64,14 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], restart_time = function_start_time + ( float(os.getenv("FUNCTION_TIMEOUT_SEC", "60")) - constants.RESTART_BUFFER_SECONDS) + backfill_blob_generation = backfill_blob.generation bkt = backfill_blob.bucket utils.handle_duplicate_notification(gcs_client, backfill_blob) table_prefix = utils.get_table_prefix(backfill_blob.name) last_job_done = False # we will poll for job completion this long in an individual iteration of - # the while loop. + # the while loop (before checking if we are too close to cloud function + # timeout and should retrigger). polling_timeout = 5 # seconds lock_blob: storage.Blob = bkt.blob(f"{table_prefix}/_bqlock") if restart_time - polling_timeout < time.monotonic(): @@ -82,6 +84,8 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], lock_contents = utils.read_gcs_file_if_exists( gcs_client, f"gs://{bkt.name}/{lock_blob.name}") if lock_contents: + # is this a lock placed by this cloud function. + # the else will handle a manual _bqlock if lock_contents.startswith( os.getenv('JOB_PREFIX', constants.DEFAULT_JOB_PREFIX)): job_id = lock_contents @@ -109,8 +113,9 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], else: print(f"sleeping for {polling_timeout} seconds because" f"found manual lock gs://{bkt.name}/{lock_blob.name} with" - f"contents:\n {lock_contents}. 
This will be an infinite" - "loop until the manual lock is released.") + "This will be an infinite loop until the manual lock is " + "released.\n" + f"manual lock contents:\n {lock_contents}. ") time.sleep(polling_timeout) continue if last_job_done: @@ -121,7 +126,7 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt, table_prefix) if not next_backlog_file: - backfill_blob.delete(if_generation_match=backfill_blob.generation, + backfill_blob.delete(if_generation_match=backfill_blob_generation, client=gcs_client) if (check_backlog_time + constants.ENSURE_SUBSCRIBER_SECONDS < time.monotonic()): @@ -154,9 +159,9 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], next_success_file.name) if not next_success_file.exists(client=gcs_client): raise exceptions.BacklogException( - "backlog contains" - f"gs://{next_backlog_file.bucket}/{next_backlog_file.name}" - "but the corresponding success file does not exist at:" + "backlog contains " + f"gs://{next_backlog_file.bucket}/{next_backlog_file.name} " + "but the corresponding success file does not exist at: " f"gs://{next_success_file.bucket}/{next_success_file.name}") utils.apply(gcs_client, bq_client, next_success_file, lock_blob, utils.create_job_id(table_ref, batch)) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index 1c83e7deb..7d9c85900 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -486,7 +486,7 @@ def remove_oldest_backlog_item( table_prefix: str, ) -> bool: """ - Remove the oldes pointer in the backlog if the backlog is not empty. + Remove the oldest pointer in the backlog if the backlog is not empty. 
Args: gcs_client: storage.Client diff --git a/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt b/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt index a9b6c99c0..2fe24ea9a 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt +++ b/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt @@ -9,3 +9,4 @@ pytest-parallel pytest-cov google-cloud-pubsub>=2.2.0 pytest-mock +pytest-repeat diff --git a/tools/cloud_functions/gcs_event_based_ingest/requirements.txt b/tools/cloud_functions/gcs_event_based_ingest/requirements.txt index 7279c2550..b715db130 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/requirements.txt +++ b/tools/cloud_functions/gcs_event_based_ingest/requirements.txt @@ -1,4 +1,4 @@ -google-cloud-bigquery>=2.6.0 -google-cloud-storage>=1.33.0 +google-cloud-bigquery>=2.6.1 +google-cloud-storage>=1.34.0 google-cloud-error-reporting>=1.1.0 cachetools diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index 5b06554f3..1d4043de5 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -74,7 +74,7 @@ def mock_env(gcs, monkeypatch): # Infer project from ADC of gcs client. 
monkeypatch.setenv("GCP_PROJECT", gcs.project) monkeypatch.setenv("FUNCTION_NAME", "integration-test") - monkeypatch.setenv("FUNCTION_TIMEOUT_SEC", "120") + monkeypatch.setenv("FUNCTION_TIMEOUT_SEC", "540") @pytest.fixture diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py index c231d498e..50ff1c97c 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py @@ -15,7 +15,6 @@ import multiprocessing import os import queue -import random import time import pytest @@ -30,6 +29,11 @@ TEST_DIR = os.path.realpath(os.path.dirname(__file__) + "/..") LOAD_JOB_POLLING_TIMEOUT = 20 # seconds +# Testing that the subscriber does not get choked up by a common race condition +# is crucial to ensuring this solution works. +# This parameter is for running the subscriber tests many times. +NUM_TRIES_SUBSCRIBER_TESTS = 25 + @pytest.mark.IT @pytest.mark.ORDERING @@ -117,6 +121,7 @@ def test_backlog_publisher_with_existing_backfill_file(gcs, gcs_bucket, @pytest.mark.IT @pytest.mark.ORDERING +@pytest.mark.repeat(NUM_TRIES_SUBSCRIBER_TESTS) def test_backlog_subscriber_in_order_with_new_batch_after_exit( bq, gcs, gcs_bucket, dest_dataset, dest_ordered_update_table, gcs_ordered_update_data, gcs_external_update_config, gcs_backlog, @@ -129,9 +134,11 @@ def test_backlog_subscriber_in_order_with_new_batch_after_exit( we will drop a 4th batch after the subscriber has exited and assert that it gets applied as expected. 
""" - gcs_ocn_bq_ingest.ordering.backlog_subscriber(gcs, bq, - gcs_external_update_config, - time.monotonic()) + _run_subscriber( + gcs, + bq, + gcs_external_update_config + ) backlog_blobs = gcs_bucket.list_blobs( prefix=f"{gcs_ocn_bq_ingest.utils.get_table_prefix(gcs_external_update_config.name)}/_backlog/" ) @@ -152,8 +159,7 @@ def test_backlog_subscriber_in_order_with_new_batch_after_exit( # the backlog subscriber has exited. backfill_blob = _post_a_new_batch(gcs_bucket, dest_dataset, dest_ordered_update_table) - gcs_ocn_bq_ingest.ordering.backlog_subscriber(gcs, bq, backfill_blob, - time.monotonic()) + _run_subscriber(gcs, bq, backfill_blob) rows = bq.query("SELECT alpha_update FROM " f"{dest_ordered_update_table.dataset_id}" @@ -168,9 +174,8 @@ def test_backlog_subscriber_in_order_with_new_batch_after_exit( @pytest.mark.IT @pytest.mark.ORDERING -@pytest.mark.parametrize('execution_number', range(5)) +@pytest.mark.repeat(NUM_TRIES_SUBSCRIBER_TESTS) def test_backlog_subscriber_in_order_with_new_batch_while_running( - execution_number, bq, gcs, gcs_bucket, dest_dataset, dest_ordered_update_table, gcs_ordered_update_data, gcs_external_update_config: storage.Blob, gcs_backlog, mock_env @@ -194,14 +199,19 @@ def test_backlog_subscriber_in_order_with_new_batch_while_running( f"{dest_ordered_update_table.table_id}") bkt = storage.Bucket.from_string(f"gs://{gcs_bucket.name}") + basename = os.path.basename(gcs_external_update_config.name) + claim_blob: storage.Blob = gcs_external_update_config.bucket.blob( + gcs_external_update_config.name.replace( + basename, f"_claimed_{basename}_created_at_" + f"{gcs_external_update_config.time_created.timestamp()}")) # Run subscriber w/ backlog and publisher w/ new batch in parallel. 
with multiprocessing.Pool(processes=2) as pool: res_subscriber = pool.apply_async( - gcs_ocn_bq_ingest.ordering.backlog_subscriber, - (None, None, backfill_blob, time.monotonic())) - # We run this test multiple times and sleep a random amount to simulate - # the next batch landing at a random time during the backfill. - time.sleep(random.uniform(0, 2)) + _run_subscriber, + (None, None, backfill_blob)) + # wait for existence of claim blob to ensure subscriber is running. + while not claim_blob.exists(): + pass res_backlog_publisher = pool.apply_async(_post_a_new_batch, (bkt, dataset, table)) @@ -226,6 +236,23 @@ def test_backlog_subscriber_in_order_with_new_batch_while_running( assert num_rows == expected_num_rows +def _run_subscriber( + gcs_client: storage.Client, + bq_client: bigquery.Client, + backfill_blob, +): + try: + gcs_ocn_bq_ingest.ordering.backlog_subscriber( + gcs_client, + bq_client, + backfill_blob, + time.monotonic()) + except gcs_ocn_bq_ingest.exceptions.DuplicateNotificationException: + print("ignoring potential duplicate notification exception as this is" + "not a critical error and would be ignored by the main method" + "of the cloud function.") + + def _post_a_new_batch(gcs_bucket, dest_dataset, dest_ordered_update_table): # We may run this in another process and cannot pickle client objects gcs = storage.Client() From def1ddb9fe0b2936cb2262c274039a5a33455b70 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Sun, 13 Dec 2020 16:48:28 -0800 Subject: [PATCH 35/90] fix: handle long running bq jobs --- .../gcs_event_based_ingest/.gitignore | 1 + .../gcs_ocn_bq_ingest/main.py | 9 +-- .../gcs_ocn_bq_ingest/ordering.py | 58 ++++++++++++------ .../gcs_ocn_bq_ingest/utils.py | 25 +++++--- .../gcs_event_based_ingest/pytest.ini | 6 ++ .../gcs_event_based_ingest/tests/conftest.py | 14 +++-- .../gcs_ocn_bq_ingest/test_ordering_it.py | 59 +++++++++---------- 7 files changed, 103 insertions(+), 69 deletions(-) diff --git 
a/tools/cloud_functions/gcs_event_based_ingest/.gitignore b/tools/cloud_functions/gcs_event_based_ingest/.gitignore index 8ca3bf9ba..66d580175 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/.gitignore +++ b/tools/cloud_functions/gcs_event_based_ingest/.gitignore @@ -1 +1,2 @@ prof/ +test.log diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index b172f430f..5f79f5962 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -140,13 +140,8 @@ def triage_event(gcs_client: Optional[storage.Client], return else: # Default behavior submit job as soon as success file lands. if basename_object_id == constants.SUCCESS_FILENAME: - utils.apply( - gcs_client, - bq_client, - event_blob, - # None lock blob as there is no serialization required. - None, - utils.create_job_id(table_ref, batch)) + utils.apply(gcs_client, bq_client, event_blob, None, + utils.create_job_id(table_ref, batch)) def lazy_error_reporting_client() -> error_reporting.Client: diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py index 74e490601..cd998f985 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py @@ -24,6 +24,7 @@ import google.api_core import google.api_core.exceptions +import pytz # pylint in cloud build is being flaky about this import discovery. 
# pylint: disable=no-name-in-module from google.cloud import bigquery @@ -52,7 +53,7 @@ def backlog_publisher( table_prefix) -# pylint: disable=too-many-arguments,too-many-locals +# pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-branches def backlog_subscriber(gcs_client: Optional[storage.Client], bq_client: Optional[bigquery.Client], backfill_blob: storage.Blob, function_start_time: float): @@ -80,7 +81,8 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], "backlog subscriber to do it's job. We recommend " "setting the timeout to 540 seconds or at least " "1 minute (Cloud Functions default).") - while time.monotonic() < restart_time - polling_timeout: + while time.monotonic() < restart_time - polling_timeout - 1: + first_bq_lock_claim = False lock_contents = utils.read_gcs_file_if_exists( gcs_client, f"gs://{bkt.name}/{lock_blob.name}") if lock_contents: @@ -118,14 +120,25 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], f"manual lock contents:\n {lock_contents}. ") time.sleep(polling_timeout) continue - if last_job_done: + else: # this condition handles absence of _bqlock file + first_bq_lock_claim = True + last_job_done = True # there's no running job to poll. + + if not last_job_done: + # keep polling the running job. + continue + + # if reached here, last job is done. + if not first_bq_lock_claim: + # If the BQ lock was missing we do not want to delete a backlog + # item for a job we have not yet submitted. 
utils.remove_oldest_backlog_item(gcs_client, bkt, table_prefix) - last_job_done = False check_backlog_time = time.monotonic() next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt, table_prefix) if not next_backlog_file: + print("no more files found in the backlog deleting backfill blob") backfill_blob.delete(if_generation_match=backfill_blob_generation, client=gcs_client) if (check_backlog_time + constants.ENSURE_SUBSCRIBER_SECONDS < @@ -135,7 +148,7 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], f"gs://${bkt.name}/{table_prefix}/_backlog/" f"There was more than {constants.ENSURE_SUBSCRIBER_SECONDS}" " seconds between listing items on the backlog and " - f"attempting to delete the {constants.BACKFILL_FILENAME}. " + f"deleting the {constants.BACKFILL_FILENAME}. " "This should not happen often but is meant to alleviate a " "race condition in the event that something caused the " "delete operation was delayed or had to be retried for a " @@ -143,7 +156,7 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], next_backlog_file = utils.get_next_backlog_item( gcs_client, bkt, table_prefix) if next_backlog_file: - # The backfill file may have been deleted but the backlog is + # The backfill file was deleted but the backlog is # not empty. Re-trigger the backfill subscriber loop by # dropping a new backfill file. 
start_backfill_subscriber_if_not_running( @@ -163,8 +176,11 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], f"gs://{next_backlog_file.bucket}/{next_backlog_file.name} " "but the corresponding success file does not exist at: " f"gs://{next_success_file.bucket}/{next_success_file.name}") + print("applying next batch for:" + f"gs://{next_success_file.bucket}/{next_success_file.name}") + next_job_id = utils.create_job_id(table_ref, batch) utils.apply(gcs_client, bq_client, next_success_file, lock_blob, - utils.create_job_id(table_ref, batch)) + next_job_id) # retrigger the subscriber loop by reposting the _BACKFILL file print("ran out of time, restarting backfill subscriber loop for:" f"gs://{bkt.name}/{table_prefix}") @@ -173,13 +189,15 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], def start_backfill_subscriber_if_not_running( - gcs_client: storage.Client, bkt: storage.Bucket, + gcs_client: Optional[storage.Client], bkt: storage.Bucket, table_prefix: str) -> Optional[storage.Blob]: """start the backfill subscriber if it is not already runnning for this table prefix. created a backfill file for the table prefix if not exists. """ + if not gcs_client: + gcs_client = storage.Client(client_info=constants.CLIENT_INFO) start_backfill = True # Do not start subscriber until START_BACKFILL_FILENAME has been dropped # at the table prefix. @@ -198,7 +216,7 @@ def start_backfill_subscriber_if_not_running( client=gcs_client) print("triggered backfill with " f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} " - f"created at {backfill_blob.time_created}. 
exiting.") + f"created at {backfill_blob.time_created}.") return backfill_blob except google.api_core.exceptions.PreconditionFailed: backfill_blob.reload(client=gcs_client) @@ -221,8 +239,8 @@ def success_blob_to_backlog_blob(success_blob: storage.Blob) -> storage.Blob: return bkt.blob(f"{table_prefix}/_backlog/{success_file_suffix}") -def subscriber_monitor(gcs_client: storage.Client, bkt: storage.Bucket, - object_id: str): +def subscriber_monitor(gcs_client: Optional[storage.Client], + bkt: storage.Bucket, object_id: str) -> bool: """ Monitor to handle a rare race condition where: @@ -243,6 +261,8 @@ def subscriber_monitor(gcs_client: storage.Client, bkt: storage.Bucket, we always handle this race condition either in this monitor or in the subscriber itself. """ + if not gcs_client: + gcs_client = storage.Client(client_info=constants.CLIENT_INFO) backfill_blob = start_backfill_subscriber_if_not_running( gcs_client, bkt, utils.get_table_prefix(object_id)) @@ -251,9 +271,10 @@ def subscriber_monitor(gcs_client: storage.Client, bkt: storage.Bucket, if backfill_blob: # Handle case where a subscriber loop was not able to repost the # backfill file before the cloud function timeout. 
- if (datetime.datetime.utcnow() - backfill_blob.time_created > - datetime.timedelta( - seconds=int(os.getenv("FUNCTION_TIMEOUT_SEC", "60")))): + time_created_utc = backfill_blob.time_created.replace(tzinfo=pytz.UTC) + now_utc = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC) + if (now_utc - time_created_utc > datetime.timedelta( + seconds=int(os.getenv("FUNCTION_TIMEOUT_SEC", "60")))): print( f"backfill blob gs://{backfill_blob.bucket.name}/" f"{backfill_blob.name} appears to be abandoned as it is older " @@ -264,14 +285,15 @@ def subscriber_monitor(gcs_client: storage.Client, bkt: storage.Bucket, backfill_blob.delete(client=gcs_client) start_backfill_subscriber_if_not_running( gcs_client, bkt, utils.get_table_prefix(object_id)) - return + return True time.sleep(constants.ENSURE_SUBSCRIBER_SECONDS) while not utils.wait_on_gcs_blob(gcs_client, backfill_blob, constants.ENSURE_SUBSCRIBER_SECONDS): - backfill_blob = \ - start_backfill_subscriber_if_not_running( - gcs_client, bkt, utils.get_table_prefix(object_id)) + start_backfill_subscriber_if_not_running( + gcs_client, bkt, utils.get_table_prefix(object_id)) + return True + return False def _get_clients_if_none( diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py index 7d9c85900..de49b8ca6 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py @@ -62,7 +62,7 @@ def external_query( # pylint: disable=too-many-arguments f"{json.dumps(constants.DEFAULT_EXTERNAL_TABLE_DEFINITION)}") external_table_def = constants.DEFAULT_EXTERNAL_TABLE_DEFINITION - # This may cause an issue if >10,000 files. however, we + # This may cause an issue if >10,000 files. 
external_table_def["sourceUris"] = flatten2dlist( get_batches_for_prefix(gcs_client, gsurl)) print(f"external table def = {json.dumps(external_table_config, indent=2)}") @@ -70,10 +70,12 @@ def external_query( # pylint: disable=too-many-arguments job_config = bigquery.QueryJobConfig( table_definitions={"temp_ext": external_config}, use_legacy_sql=False) - # Note, dest_table might include a partition decorator. + # drop partition decorator if present. + table_id = dest_table_ref.table_id.split("$")[0] + rendered_query = query.format( dest_dataset=f"`{dest_table_ref.project}`.{dest_table_ref.dataset_id}", - dest_table=dest_table_ref.table_id, + dest_table=table_id ) job: bigquery.QueryJob = bq_client.query(rendered_query, @@ -398,8 +400,10 @@ def recursive_update(original: Dict, update: Dict, in_place: bool = False): return out -def handle_duplicate_notification(gcs_client: storage.Client, - blob_to_claim: storage.Blob): +def handle_duplicate_notification( + gcs_client: storage.Client, + blob_to_claim: storage.Blob, +): """ Need to handle potential duplicate Pub/Sub notifications. To achieve this we will drop an empty "claimed" file that indicates @@ -422,8 +426,9 @@ def handle_duplicate_notification(gcs_client: storage.Client, if_generation_match=0, client=gcs_client) except google.api_core.exceptions.PreconditionFailed as err: + blob_to_claim.reload(client=gcs_client) raise exceptions.DuplicateNotificationException( - f"gs://{blob_to_claim.bucket.name}/{blob_to_claim.name} appears" + f"gs://{blob_to_claim.bucket.name}/{blob_to_claim.name} appears " "to already have been claimed for created timestamp: " f"{created_unix_timestamp}." 
"This means that another invocation of this cloud function has " @@ -540,6 +545,7 @@ def wait_on_bq_job_id(bq_client: bigquery.Client, if job.state in {"RUNNING", "PENDING"}: print(f"waiting on BigQuery Job {job.job_id}") time.sleep(polling_interval) + print(f"reached polling timeout waiting for bigquery job {job_id}") return False @@ -688,13 +694,13 @@ def apply( bq_client: bigquery.Client success_blob: storage.Blob the success file whose batch should be applied. - lock_blob: storage.Blob + lock_blob: storage.Blob _bqlock blob to acquire for this job. job_id: str """ handle_duplicate_notification(gcs_client, success_blob) - bkt = success_blob.bucket - if lock_blob is not None: + if lock_blob: handle_bq_lock(gcs_client, lock_blob, job_id) + bkt = success_blob.bucket dest_table_ref, _ = gcs_path_to_table_ref_and_batch(success_blob.name) gsurl = removesuffix(f"gs://{bkt.name}/{success_blob.name}", constants.SUCCESS_FILENAME) @@ -715,3 +721,4 @@ def apply( print("LOAD_JOB") load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id) + return diff --git a/tools/cloud_functions/gcs_event_based_ingest/pytest.ini b/tools/cloud_functions/gcs_event_based_ingest/pytest.ini index 07bd1315d..7602954dc 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/pytest.ini +++ b/tools/cloud_functions/gcs_event_based_ingest/pytest.ini @@ -1,4 +1,10 @@ [pytest] +log_format = %(asctime)s %(levelname)s %(message)s +log_date_format = %Y-%m-%d %H:%M:%S +log_file_format = %(asctime)s %(levelname)s %(message)s +log_file_date_format = %Y-%m-%d %H:%M:%S +log_file_level = INFO +log_file = test.log markers = IT: marks tests as slow integration test requiring cloud resouces (deselect with '-m "not IT"') ORDERING: marks tests that test features related to ordering diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index 1d4043de5..776f7b08b 100644 --- 
a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -85,7 +85,8 @@ def ordered_mock_env(mock_env, monkeypatch): @pytest.fixture def dest_dataset(request, bq, mock_env, monkeypatch): - random_dataset = f"test_bq_ingest_gcf_{str(uuid.uuid4())[:8].replace('-','_')}" + random_dataset = (f"test_bq_ingest_gcf_" + f"{str(uuid.uuid4())[:8].replace('-','_')}") dataset = bigquery.Dataset(f"{os.getenv('GCP_PROJECT')}" f".{random_dataset}") dataset.location = "US" @@ -109,7 +110,9 @@ def dest_table(request, bq, mock_env, dest_dataset) -> bigquery.Table: json.load(schema_file)) table = bigquery.Table( - f"{os.environ.get('GCP_PROJECT')}.{dest_dataset.dataset_id}.cf_test_nation", + f"{os.environ.get('GCP_PROJECT')}" + f".{dest_dataset.dataset_id}.cf_test_nation_" + f"{str(uuid.uuid4()).replace('-','_')}", schema=schema, ) @@ -298,7 +301,8 @@ def dest_partitioned_table(request, bq: bigquery.Client, mock_env, table: bigquery.Table = bigquery.Table( f"{os.environ.get('GCP_PROJECT')}" - f".{dest_dataset.dataset_id}.cf_test_nyc_311", + f".{dest_dataset.dataset_id}.cf_test_nyc_311_" + f"{str(uuid.uuid4()).replace('-','_')}", schema=schema, ) @@ -353,7 +357,7 @@ def dest_ordered_update_table(request, gcs, gcs_bucket, bq, mock_env, table = bigquery.Table( f"{os.environ.get('GCP_PROJECT')}.{dest_dataset.dataset_id}" - ".cf_test_ordering", + f".cf_test_ordering_{str(uuid.uuid4()).replace('-','_')}", schema=schema, ) @@ -523,7 +527,7 @@ def gcs_external_partitioned_config( "bq_transform.sql", ])) - sql = "INSERT {dest_dataset}.cf_test_nyc_311 SELECT * FROM temp_ext" + sql = "INSERT {dest_dataset}.{dest_table} SELECT * FROM temp_ext" sql_obj.upload_from_string(sql) config_obj = gcs_bucket.blob("/".join([ diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py index 
50ff1c97c..2230417d9 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py @@ -16,6 +16,7 @@ import os import queue import time +from typing import Optional import pytest from google.cloud import bigquery @@ -32,6 +33,8 @@ # Testing that the subscriber does not get choked up by a common race condition # is crucial to ensuring this solution works. # This parameter is for running the subscriber tests many times. +# During development it can be helpful to tweak this up or down as you are +# experimenting. NUM_TRIES_SUBSCRIBER_TESTS = 25 @@ -134,11 +137,7 @@ def test_backlog_subscriber_in_order_with_new_batch_after_exit( we will drop a 4th batch after the subscriber has exited and assert that it gets applied as expected. """ - _run_subscriber( - gcs, - bq, - gcs_external_update_config - ) + _run_subscriber(gcs, bq, gcs_external_update_config) backlog_blobs = gcs_bucket.list_blobs( prefix=f"{gcs_ocn_bq_ingest.utils.get_table_prefix(gcs_external_update_config.name)}/_backlog/" ) @@ -176,10 +175,9 @@ def test_backlog_subscriber_in_order_with_new_batch_after_exit( @pytest.mark.ORDERING @pytest.mark.repeat(NUM_TRIES_SUBSCRIBER_TESTS) def test_backlog_subscriber_in_order_with_new_batch_while_running( - bq, gcs, gcs_bucket, dest_dataset, dest_ordered_update_table, - gcs_ordered_update_data, gcs_external_update_config: storage.Blob, - gcs_backlog, mock_env -): + bq, gcs, gcs_bucket, dest_dataset, dest_ordered_update_table, + gcs_ordered_update_data, gcs_external_update_config: storage.Blob, + gcs_backlog, mock_env): """Test functionality of backlog subscriber when new batches are added before the subscriber is done finishing the existing backlog. 
@@ -203,25 +201,33 @@ def test_backlog_subscriber_in_order_with_new_batch_while_running( claim_blob: storage.Blob = gcs_external_update_config.bucket.blob( gcs_external_update_config.name.replace( basename, f"_claimed_{basename}_created_at_" - f"{gcs_external_update_config.time_created.timestamp()}")) + f"{gcs_external_update_config.time_created.timestamp()}")) # Run subscriber w/ backlog and publisher w/ new batch in parallel. - with multiprocessing.Pool(processes=2) as pool: - res_subscriber = pool.apply_async( - _run_subscriber, - (None, None, backfill_blob)) + with multiprocessing.Pool(processes=3) as pool: + res_subscriber = pool.apply_async(_run_subscriber, + (None, None, backfill_blob)) # wait for existence of claim blob to ensure subscriber is running. while not claim_blob.exists(): pass res_backlog_publisher = pool.apply_async(_post_a_new_batch, (bkt, dataset, table)) + res_backlog_publisher.wait() + res_monitor = pool.apply_async( + gcs_ocn_bq_ingest.ordering.subscriber_monitor, + (None, bkt, + f"{dataset.project}.{dataset.dataset_id}/{table.table_id}/" + f"_backlog/04/_SUCCESS")) + + if res_monitor.get(): + print("subscriber monitor had to retrigger subscriber loop") + backfill_blob.reload(client=gcs) + _run_subscriber(None, None, backfill_blob) - # wait on each function to complete res_subscriber.wait() - res_backlog_publisher.wait() backlog_blobs = gcs_bucket.list_blobs( - prefix=f"{gcs_ocn_bq_ingest.utils.get_table_prefix(gcs_external_update_config.name)}/_backlog/" - ) + prefix=f"{gcs_ocn_bq_ingest.utils.get_table_prefix(gcs_external_update_config.name)}/" + f"_backlog/") assert backlog_blobs.num_results == 0, "backlog is not empty" bqlock_blob: storage.Blob = gcs_bucket.blob("_bqlock") assert not bqlock_blob.exists(), "_bqlock was not cleaned up" @@ -237,20 +243,13 @@ def test_backlog_subscriber_in_order_with_new_batch_while_running( def _run_subscriber( - gcs_client: storage.Client, - bq_client: bigquery.Client, + gcs_client: 
Optional[storage.Client], + bq_client: Optional[bigquery.Client], backfill_blob, ): - try: - gcs_ocn_bq_ingest.ordering.backlog_subscriber( - gcs_client, - bq_client, - backfill_blob, - time.monotonic()) - except gcs_ocn_bq_ingest.exceptions.DuplicateNotificationException: - print("ignoring potential duplicate notification exception as this is" - "not a critical error and would be ignored by the main method" - "of the cloud function.") + gcs_ocn_bq_ingest.ordering.backlog_subscriber(gcs_client, + bq_client, backfill_blob, + time.monotonic()) def _post_a_new_batch(gcs_bucket, dest_dataset, dest_ordered_update_table): From ddaf280d4c504f084ec580ab53cccd1a717bba9c Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 14 Dec 2020 17:48:34 -0800 Subject: [PATCH 36/90] chore: add e2e test, fixup terraform --- tools/__init__.py | 0 tools/cloud_functions/__init__.py | 0 .../gcs_event_based_ingest/Dockerfile.ci | 2 +- .../gcs_event_based_ingest/README.md | 21 ++- .../gcs_event_based_ingest/cloudbuild.yaml | 51 ++++++- .../gcs_event_based_ingest/e2e/.gitignore | 35 +++++ .../{tests/cli => e2e}/__init__.py | 0 .../gcs_event_based_ingest/e2e/conftest.py | 93 +++++++++++++ .../gcs_event_based_ingest/e2e/e2e_test.py | 127 ++++++++++++++++++ .../gcs_event_based_ingest/e2e/main.tf | 45 +++++++ .../common}/__init__.py | 0 .../{ => common}/constants.py | 9 +- .../{ => common}/exceptions.py | 4 +- .../{ => common}/ordering.py | 5 +- .../gcs_ocn_bq_ingest/{ => common}/utils.py | 17 ++- .../gcs_ocn_bq_ingest/main.py | 32 +++-- .../gcs_event_based_ingest/pytest.ini | 1 + .../gcs_ocn_bq_ingest_function/README.md | 5 +- .../gcs_ocn_bq_ingest_function/main.tf | 49 +++++-- .../gcs_ocn_bq_ingest_function/outputs.tf | 4 + .../gcs_ocn_bq_ingest_function/variables.tf | 9 ++ .../gcs_ocn_bq_ingest_function/versions.tf | 2 +- .../gcs_event_based_ingest/tests/__init__.py | 20 --- .../gcs_event_based_ingest/tests/conftest.py | 21 +-- .../test_gcs_ocn_bq_ingest.py | 35 +++-- 
.../gcs_ocn_bq_ingest/test_ordering_it.py | 56 ++++---- 26 files changed, 533 insertions(+), 110 deletions(-) create mode 100644 tools/__init__.py create mode 100644 tools/cloud_functions/__init__.py create mode 100644 tools/cloud_functions/gcs_event_based_ingest/e2e/.gitignore rename tools/cloud_functions/gcs_event_based_ingest/{tests/cli => e2e}/__init__.py (100%) create mode 100644 tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py create mode 100644 tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py create mode 100644 tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf rename tools/cloud_functions/gcs_event_based_ingest/{tests/gcs_ocn_bq_ingest => gcs_ocn_bq_ingest/common}/__init__.py (100%) rename tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/{ => common}/constants.py (96%) rename tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/{ => common}/exceptions.py (98%) rename tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/{ => common}/ordering.py (98%) rename tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/{ => common}/utils.py (97%) delete mode 100644 tools/cloud_functions/gcs_event_based_ingest/tests/__init__.py diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/cloud_functions/__init__.py b/tools/cloud_functions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci b/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci index 5cd40aa1e..f92277062 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci +++ b/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci @@ -1,4 +1,4 @@ FROM python:3.8-slim COPY requirements.txt requirements-dev.txt ./ RUN pip3 install --no-cache-dir -r requirements-dev.txt -ENTRYPOINT ["pytest"] +ENTRYPOINT ["python3 -m pytest"] diff --git 
a/tools/cloud_functions/gcs_event_based_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/README.md index 51b5b3a06..70029831c 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -316,8 +316,10 @@ docker run --rm -it gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci #### Running on your local machine Alternatively to the local cloudbuild or using the docker container to run your tests, you can `pip3 install -r requirements-dev.txt` and select certain tests -to run with [`pytest`](https://docs.pytest.org/en/stable/usage.html). This is -mostly useful if you'd like to integrate with your IDE debugger. +to run with [`python3 -m pytest`](https://docs.pytest.org/en/stable/usage.html). +Note, this is not quite the same as calling `pytest` without the `python -m` prefix +([pytest invocation docs](https://docs.pytest.org/en/stable/usage.html#calling-pytest-through-python-m-pytest)). +This is mostly useful if you'd like to integrate with your IDE debugger. Note that integration tests will spin up / tear down cloud resources that can incur a small cost. These resources will be spun up based on your Google Cloud SDK @@ -331,16 +333,25 @@ See more info on sharing pytest fixtures in the [pytest docs](https://docs.pytes #### Running All Tests ```bash -pytest +python3 -m pytest ``` #### Running Unit Tests Only ```bash -pytest -m "not IT" +python3 -m pytest -m "not IT" ``` #### Running Integration Tests Only ```bash -pytest -m IT +python3 -m pytest -m IT +``` + +#### Running System Tests Only +The system tests assume that you have deployed the cloud function. 
+```bash +export TF_VAR_short_sha=$(git rev-parse --short=7 HEAD) +export TF_VAR_project_id=${YOUR_GCP_PROJECT_ID} +(cd e2e && terraform init && terraform apply -auto-approve) +python3 -m pytest -m SYS ``` ## Deployment diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index a41666c65..2ef218e43 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -117,7 +117,7 @@ steps: args: - '-c' # pip installing again to get GCB to recognize mocker from pytest-mock - - 'pip install -r requirements-dev.txt && python3 -m pytest -m "not IT"' + - 'pip install -r requirements-dev.txt && python3 -m pytest tests -m "not IT"' id: 'unit-test' - name: 'gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci' dir: '${_BUILD_DIR}' @@ -127,10 +127,55 @@ steps: entrypoint: /bin/sh args: - '-c' - - 'pip install -r requirements-dev.txt && python3 -m pytest -m IT' + - 'pip install -r requirements-dev.txt && python3 -m pytest tests -m IT' id: 'integration-test' -timeout: '1200s' +- name: 'hashicorp/terraform' + waitFor: + - 'integration-test' + dir: '${_BUILD_DIR}/e2e' + args: ['init'] + id: 'terraform-e2e-init' +- name: 'hashicorp/terraform' + waitFor: + - 'terraform-e2e-init' + dir: '${_BUILD_DIR}/e2e' + args: ['apply', '-auto-approve'] + env: + - 'TF_VAR_project_id=$PROJECT_ID' + - 'TF_VAR_region=$_REGION' + - 'TF_VAR_suffix=$SHORT_SHA' + id: 'terraform-e2e-apply' +- name: 'gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci' + dir: '${_BUILD_DIR}' + waitFor: + - 'build-ci-image' + - 'terraform-e2e-apply' + entrypoint: /bin/sh + args: + - '-c' + - 'python3 -m pytest e2e --tfstate=${_BUILD_DIR}/e2e/terraform.state' + id: 'e2e-test' +- name: 'hashicorp/terraform' + waitFor: + - 'e2e-test' + dir: '${_BUILD_DIR}/e2e' + # Note if the e2e test fails the resources will not be cleaned up due to + # cloud build not allowing ignored failed 
steps. + # this will allow a maintainer to evaluate what went wrong during e2e test + # because the evidence will not be destroyed. + # Maintainers of bqutil project should destroy these resources after the + # failure cause has been diagnosed. + # We do not run this e2e test unless all unit and integration tests pass. + # https://github.com/GoogleCloudPlatform/cloud-builders/issues/253 + args: ['destroy', '-auto-approve'] + env: + - 'TF_VAR_project_id=$PROJECT_ID' + - 'TF_VAR_region=$_REGION' + - 'TF_VAR_suffix=$SHORT_SHA' + id: 'terraform-e2e-destroy' +timeout: '3600s' options: machineType: 'N1_HIGHCPU_32' substitutions: '_BUILD_DIR': 'tools/cloud_functions/gcs_event_based_ingest' + '_REGION': 'us-central1' diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/.gitignore b/tools/cloud_functions/gcs_event_based_ingest/e2e/.gitignore new file mode 100644 index 000000000..9e399369c --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/.gitignore @@ -0,0 +1,35 @@ +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Crash log files +crash.log + +# Exclude all .tfvars files, which are likely to contain sensitive data, such as +# password, private keys, and other secrets. These should not be part of version +# control as they are data points which are potentially sensitive and subject +# to change depending on the environment. 
+# +*.tfvars + +# Ignore override files as they are usually used to override resources locally and so +# are not checked in +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Include override files you do wish to add to version control using negated pattern +# +# !example_override.tf + +# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan +# example: *tfplan* + +# Ignore CLI configuration files +.terraformrc +terraform.rc + diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/cli/__init__.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/__init__.py similarity index 100% rename from tools/cloud_functions/gcs_event_based_ingest/tests/cli/__init__.py rename to tools/cloud_functions/gcs_event_based_ingest/e2e/__init__.py diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py new file mode 100644 index 000000000..80b870617 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py @@ -0,0 +1,93 @@ +# Copyright 2020 Google LLC. +# This software is provided as-is, without warranty or representation +# for any use or purpose. +# Your use of it is subject to your agreement with Google. + +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""End-to-end tests for event based BigQuery ingest Cloud Function.""" +import json +import os +import uuid + +import pytest +from google.cloud import bigquery +from google.cloud import storage + + +def pytest_addoption(parser): + # if Terraform was used to deploy resources, pass the state details + parser.addoption("--tfstate", action="store", default=None) + + +@pytest.fixture(scope="module") +def bq() -> bigquery.Client: + """BigQuery Client""" + return bigquery.Client(location="US") + + +@pytest.fixture(scope="module") +def gcs() -> storage.Client: + """GCS Client""" + return storage.Client() + + +@pytest.fixture(scope='module') +def tf_state(pytestconfig): + + # if we used Terraform to create the GCP resources, use the output variables + if pytestconfig.getoption('tfstate') is not None: + tf_state_file = pytestconfig.getoption('tfstate') + with open(tf_state_file, 'r', encoding='utf-8') as fp: + return json.load(fp) + + +@pytest.fixture +def dest_dataset(request, bq, monkeypatch): + random_dataset = (f"test_bq_ingest_gcf_" + f"{str(uuid.uuid4())[:8].replace('-','_')}") + dataset = bigquery.Dataset(f"{os.getenv('TF_VAR_project_id', 'bqutil')}" + f".{random_dataset}") + dataset.location = "US" + bq.create_dataset(dataset) + monkeypatch.setenv("BQ_LOAD_STATE_TABLE", + f"{dataset.dataset_id}.serverless_bq_loads") + print(f"created dataset {dataset.dataset_id}") + + def teardown(): + bq.delete_dataset(dataset, delete_contents=True, not_found_ok=True) + + request.addfinalizer(teardown) + return dataset + + +@pytest.fixture(scope="function") +def dest_table(request, bq: bigquery.Client, dest_dataset) -> bigquery.Table: + public_table: bigquery.Table = bq.get_table( + bigquery.TableReference.from_string( + "bigquery-public-data.new_york_311.311_service_requests")) + schema = public_table.schema + + table: bigquery.Table = bigquery.Table( + f"{os.environ.get('TF_VAR_project_id', 'bqutil')}" + f".{dest_dataset.dataset_id}.cf_e2e_test_nyc_311_" + 
f"{os.getenv('SHORT_SHA', 'manual')}", + schema=schema, + ) + + table = bq.create_table(table) + + def teardown(): + bq.delete_table(table, not_found_ok=True) + + request.addfinalizer(teardown) + return table diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py new file mode 100644 index 000000000..7eaa9f7e8 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py @@ -0,0 +1,127 @@ +# Copyright 2020 Google LLC. +# This software is provided as-is, without warranty or representation +# for any use or purpose. +# Your use of it is subject to your agreement with Google. + +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import concurrent.futures
+import json
+import time
+from typing import Dict
+
+import pytest
+from google.cloud import bigquery
+from google.cloud import storage
+
+WAIT_FOR_ROWS_TIMEOUT = 180  # seconds
+
+
+@pytest.mark.SYS
+def test_gcs_ocn_bq_ingest_cloud_function(
+    gcs: storage.Client,
+    bq: bigquery.Client,
+    tf_state: Dict,
+    dest_table: bigquery.Table,
+):
+    """drop some test data and assert that the expected actions are taken by
+    the deployed cloud function"""
+    input_bucket_id = tf_state['outputs']['bucket']['value']
+    table_prefix = f"{dest_table.dataset_id}/" \
+                   f"{dest_table.table_id}"
+    extract_config = bigquery.ExtractJobConfig()
+    extract_config.destination_format = bigquery.DestinationFormat.AVRO
+    public_table: bigquery.Table = bq.get_table(
+        bigquery.TableReference.from_string(
+            "bigquery-public-data.new_york_311.311_service_requests"))
+
+    def _extract(batch: str):
+        extract_job: bigquery.ExtractJob = bq.extract_table(
+            public_table, f"gs://{input_bucket_id}/{table_prefix}/{batch}/"
+            f"data-*.avro",
+            job_config=extract_config)
+        return extract_job.result()
+
+    batches = [
+        "historical/00", "historical/01", "historical/02", "incremental/03"
+    ]
+    history_batch_nums = ["00", "01", "02"]
+    with concurrent.futures.ThreadPoolExecutor() as pool:
+        # export some data from public BQ table into historical partitions
+        extract_results = pool.map(_extract, batches)
+
+    for res in extract_results:
+        assert res.errors is None, f"extract job {res.job_id} failed"
+
+    bkt: storage.Bucket = gcs.lookup_bucket(input_bucket_id)
+    # configure load jobs for this table
+    load_config = bkt.blob(f"{table_prefix}/_config/load.json")
+    load_config.upload_from_string(
+        json.dumps({
+            "writeDisposition": "WRITE_APPEND",
+            "sourceFormat": "AVRO",
+            "useAvroLogicalTypes": "True",
+        }))
+    # add historical success files
+    for batch in history_batch_nums:
+        historical_success_blob: storage.Blob = bkt.blob(
+            f"{table_prefix}/historical/{batch}/_SUCCESS")
+        
historical_success_blob.upload_from_string("") + + # assert 0 bq rows (because _HISTORYDONE not dropped yet) + dest_table: bigquery.Table = bq.get_table(dest_table) + assert dest_table.num_rows == 0, \ + "history was ingested before _HISTORYDONE was uploaded" + + # add _HISTORYDONE + history_done_blob: storage.Blob = bkt.blob(f"{table_prefix}/_HISTORYDONE") + history_done_blob.upload_from_string("") + + # wait for bq rows to reach expected num rows + bq_wait_for_rows(bq, dest_table, + public_table.num_rows * len(history_batch_nums)) + + # add the incremental success file + incremental_success_blob: storage.Blob = bkt.blob( + f"{table_prefix}/{batches[-1]}/_SUCCESS") + incremental_success_blob.upload_from_string("") + + # wait on new expected bq rows + bq_wait_for_rows(bq, dest_table, public_table.num_rows * len(batches)) + + +def bq_wait_for_rows(bq_client: bigquery.Client, table: bigquery.Table, + expected_num_rows: int): + """ + polls tables.get API for number of rows until reaches expected value or + times out. + + This is mostly an optimization to speed up the test suite without making it + flaky. + """ + + start_poll = time.monotonic() + actual_num_rows = 0 + while time.monotonic() - start_poll < WAIT_FOR_ROWS_TIMEOUT: + bq_table: bigquery.Table = bq_client.get_table(table) + actual_num_rows = bq_table.num_rows + if actual_num_rows == expected_num_rows: + return + if actual_num_rows > expected_num_rows: + raise AssertionError( + f"{table.project}.{table.dataset_id}.{table.table_id} has" + f"{actual_num_rows} rows. expected {expected_num_rows} rows.") + raise AssertionError( + f"Timed out after {WAIT_FOR_ROWS_TIMEOUT} seconds waiting for " + f"{table.project}.{table.dataset_id}.{table.table_id} to " + f"reach {expected_num_rows} rows." 
+ f"last poll returned {actual_num_rows} rows.") diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf b/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf new file mode 100644 index 000000000..4c302663e --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf @@ -0,0 +1,45 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +variable "short_sha" {} +variable "project_id" { default = "bqutil" } +variable "region" { default = "us-central1" } +output "bucket" { + value = module.gcs_ocn_bq_ingest.input-bucket +} + +resource "google_storage_bucket" "cloud_functions_source" { + name = "gcf-source-archives-${var.short_sha}" + project = var.project_id + storage_class = "REGIONAL" + location = var.region + force_destroy = "true" +} + +module "gcs_ocn_bq_ingest" { + source = "../terraform_module/gcs_ocn_bq_ingest_function" + function_source_folder = "../gcs_ocn_bq_ingest" + app_id = "gcs-ocn-bq-ingest-e2e-test-${var.short_sha}" + cloudfunctions_source_bucket = google_storage_bucket.cloud_functions_source.name + data_ingester_sa = "data-ingester-sa-${var.short_sha}" + input_bucket = "gcs-ocn-bq-ingest-e2e-tests-${var.short_sha}" + project_id = var.project_id + environment_variables = { + START_BACKFILL_FILENAME = "_HISTORYDONE" + ORDER_PER_TABLE = "True" + } + # We'll use a shorter timeout for e2e stress subscriber re-triggering + timeout = 60 + force_destroy = "true" +} + diff --git 
a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/__init__.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/__init__.py similarity index 100% rename from tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/__init__.py rename to tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/__init__.py diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py similarity index 96% rename from tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py rename to tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py index daa116dfe..50faf6d12 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py @@ -47,16 +47,21 @@ DEFAULT_JOB_LABELS = { "component": "event-based-gcs-ingest", - "cloud-function-name": os.getenv("FUNCTION_NAME"), + "cloud-function-name": os.getenv("K_SERVICE"), } -BASE_LOAD_JOB_CONFIG = { +DEFAULT_LOAD_JOB_CONFIG = { "sourceFormat": "CSV", "fieldDelimiter": ",", "writeDisposition": "WRITE_APPEND", "labels": DEFAULT_JOB_LABELS, } +BASE_LOAD_JOB_CONFIG = { + "writeDisposition": "WRITE_APPEND", + "labels": DEFAULT_JOB_LABELS, +} + # https://cloud.google.com/bigquery/quotas#load_jobs # 15TB per BQ load job (soft limit). 
DEFAULT_MAX_BATCH_BYTES = str(15 * 10**12) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/exceptions.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py similarity index 98% rename from tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/exceptions.py rename to tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py index a1126c22e..8ab701e8d 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/exceptions.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py @@ -43,9 +43,9 @@ class BacklogException(Exception): function.""" -EXCEPTIONS_TO_REPORT = { +EXCEPTIONS_TO_REPORT = ( BigQueryJobFailure, UnexpectedTriggerException, DestinationRegexMatchException, BacklogException, -} +) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py similarity index 98% rename from tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py rename to tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index cd998f985..68e39542d 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -60,11 +60,14 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], """Pick up the table lock, poll BQ job id until completion and process next item in the backlog. """ + print(f"started backfill subscriber for gs://{backfill_blob.bucket.name}/" + f"{backfill_blob.name}") gcs_client, bq_client = _get_clients_if_none(gcs_client, bq_client) # We need to retrigger the backfill loop before the Cloud Functions Timeout. 
restart_time = function_start_time + ( float(os.getenv("FUNCTION_TIMEOUT_SEC", "60")) - constants.RESTART_BUFFER_SECONDS) + print(f"restart time is {restart_time}") backfill_blob_generation = backfill_blob.generation bkt = backfill_blob.bucket utils.handle_duplicate_notification(gcs_client, backfill_blob) @@ -169,7 +172,7 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], next_success_file: storage.Blob = bkt.blob( next_backlog_file.name.replace("/_backlog/", "/")) table_ref, batch = utils.gcs_path_to_table_ref_and_batch( - next_success_file.name) + next_success_file.name, bq_client.project) if not next_success_file.exists(client=gcs_client): raise exceptions.BacklogException( "backlog contains " diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py similarity index 97% rename from tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py rename to tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index de49b8ca6..496ec8dae 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -75,8 +75,7 @@ def external_query( # pylint: disable=too-many-arguments rendered_query = query.format( dest_dataset=f"`{dest_table_ref.project}`.{dest_table_ref.dataset_id}", - dest_table=table_id - ) + dest_table=table_id) job: bigquery.QueryJob = bq_client.query(rendered_query, job_config=job_config, @@ -199,12 +198,16 @@ def _get_parent_config(path): while parts: config = _get_parent_config("/".join(parts)) if config: + print(f"found config: {'/'.join(parts)}") config_q.append(json.loads(config)) parts.pop() merged_config: Dict = {} while config_q: recursive_update(merged_config, config_q.popleft(), in_place=True) + if merged_config == constants.BASE_LOAD_JOB_CONFIG: + print("falling back to default CSV load 
job config") + return constants.DEFAULT_LOAD_JOB_CONFIG print(f"merged_config: {merged_config}") return bigquery.LoadJobConfig.from_api_repr({"load": merged_config}) @@ -580,7 +583,8 @@ def wait_on_gcs_blob(gcs_client: storage.Client, def gcs_path_to_table_ref_and_batch( - object_id) -> Tuple[bigquery.TableReference, Optional[str]]: + object_id: str, default_project: Optional[str] +) -> Tuple[bigquery.TableReference, Optional[str]]: """extract bigquery table reference and batch id from gcs object id""" destination_match = constants.DESTINATION_REGEX.match(object_id) @@ -611,11 +615,11 @@ def gcs_path_to_table_ref_and_batch( dest_table_ref = bigquery.TableReference.from_string( f"{dataset}.{table}{partition}", - default_project=os.getenv("BQ_PROJECT", os.getenv("GCP_PROJECT"))) + default_project=os.getenv("BQ_PROJECT", default_project)) else: dest_table_ref = bigquery.TableReference.from_string( f"{dataset}.{table}", - default_project=os.getenv("BQ_PROJECT", os.getenv("GCP_PROJECT"))) + default_project=os.getenv("BQ_PROJECT", default_project)) return dest_table_ref, batch_id @@ -701,7 +705,8 @@ def apply( if lock_blob: handle_bq_lock(gcs_client, lock_blob, job_id) bkt = success_blob.bucket - dest_table_ref, _ = gcs_path_to_table_ref_and_batch(success_blob.name) + dest_table_ref, _ = gcs_path_to_table_ref_and_batch(success_blob.name, + bq_client.project) gsurl = removesuffix(f"gs://{bkt.name}/{success_blob.name}", constants.SUCCESS_FILENAME) print( diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 5f79f5962..cbd55cf7c 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -27,10 +27,16 @@ from google.cloud import error_reporting from google.cloud import storage -from . import constants -from . import exceptions -from . import ordering -from . 
import utils +try: + from common import constants + from common import exceptions + from common import ordering + from common import utils +except ModuleNotFoundError: + from .common import constants + from .common import exceptions + from .common import ordering + from .common import utils # Reuse GCP Clients across function invocations using globbals # https://cloud.google.com/functions/docs/bestpractices/tips#use_global_variables_to_reuse_objects_in_future_invocations @@ -83,7 +89,7 @@ def main(event: Dict, context): # pylint: disable=unused-argument print("recieved duplicate notification. this was handled gracefully." f"{traceback.format_exc()}") - except tuple(exceptions.EXCEPTIONS_TO_REPORT) as original_error: + except exceptions.EXCEPTIONS_TO_REPORT as original_error: # We do this because we know these errors do not require a cold restart # of the cloud function. try: @@ -104,7 +110,13 @@ def triage_event(gcs_client: Optional[storage.Client], blob.""" bkt = event_blob.bucket basename_object_id = os.path.basename(event_blob.name) - table_ref, batch = utils.gcs_path_to_table_ref_and_batch(event_blob.name) + if bq_client: + table_ref, batch = utils.gcs_path_to_table_ref_and_batch( + event_blob.name, bq_client.project) + else: + table_ref, batch = utils.gcs_path_to_table_ref_and_batch( + event_blob.name, None) + if enforce_ordering: # For SUCCESS files in a backlog directory, ensure that subscriber # is running. @@ -140,8 +152,12 @@ def triage_event(gcs_client: Optional[storage.Client], return else: # Default behavior submit job as soon as success file lands. if basename_object_id == constants.SUCCESS_FILENAME: - utils.apply(gcs_client, bq_client, event_blob, None, - utils.create_job_id(table_ref, batch)) + utils.apply( + gcs_client, + bq_client, + event_blob, + None, # no lock blob when ordering not enabled. 
+            utils.create_job_id(table_ref, batch))
 
 
 def lazy_error_reporting_client() -> error_reporting.Client:
diff --git a/tools/cloud_functions/gcs_event_based_ingest/pytest.ini b/tools/cloud_functions/gcs_event_based_ingest/pytest.ini
index 7602954dc..bf550fdcf 100644
--- a/tools/cloud_functions/gcs_event_based_ingest/pytest.ini
+++ b/tools/cloud_functions/gcs_event_based_ingest/pytest.ini
@@ -7,6 +7,7 @@ log_file_level = INFO
 log_file = test.log
 markers =
     IT: marks tests as slow integration test requiring cloud resouces (deselect with '-m "not IT"')
+    SYS: marks tests as slow system or e2e test requiring cloud resources (deselect with '-m "not SYS"')
     ORDERING: marks tests that test features related to ordering
     CLI: marks tests of CLI utilities
 addopts = --workers=auto
diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md
index b347aceeb..d4ea6dbd1 100644
--- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md
+++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md
@@ -10,7 +10,7 @@ documented [here](../gcs_ocn_bq_ingest_function/README.md)
 | Name | Version |
 |------|---------|
-| terraform | >= 0.12 |
+| terraform | >= 0.13 |
 | archive | ~> 2.0.0 |
 | google | >= 3.38.0 |
 | template | ~> 2.2.0 |
@@ -31,11 +31,13 @@ documented [here](../gcs_ocn_bq_ingest_function/README.md)
 | cloudfunctions\_source\_bucket | GCS bucket to store Cloud Functions Source | `any` | n/a | yes |
 | data\_ingester\_sa | Service Account Email responsible for ingesting data to BigQuery | `any` | n/a | yes |
 | environment\_variables | Environment variables to set on the cloud function. | `map(string)` | `{}` | no |
+| force\_destroy | force destroy resources (e.g. 
for e2e tests) | `string` | `"false"` | no | | function\_source\_folder | Path to Cloud Function source | `string` | `"../gcs_event_based_ingest/gcs_ocn_bq_ingest/"` | no | | input\_bucket | GCS bucket to watch for new files | `any` | n/a | yes | | input\_prefix | GCS prefix to watch for new files in input\_bucket | `any` | `null` | no | | project\_id | GCP Project ID containing cloud function, and input bucket | `any` | n/a | yes | | region | GCP region in which to deploy cloud function | `string` | `"us-central1"` | no | +| timeout | Cloud Functions timeout in seconds | `number` | `540` | no | | use\_pubsub\_notifications | Setting this to true will use Pub/Sub notifications By default we will use Cloud Functions Event direct notifications. See https://cloud.google.com/storage/docs/pubsub-notifications. | `bool` | `false` | no | ## Outputs @@ -44,4 +46,5 @@ documented [here](../gcs_ocn_bq_ingest_function/README.md) |------|-------------| | cloud-function | instance of cloud function deployed by this module. 
| | data-ingester-sa | data ingester service account email created as cloud function identity | +| input-bucket | n/a | diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf index 16d7ce821..80226e344 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf @@ -23,15 +23,17 @@ resource "google_pubsub_topic" "notification_topic" { } module "bucket" { - source = "terraform-google-modules/cloud-storage/google//modules/simple_bucket" - version = "~> 1.3" + depends_on = [module.data_ingester_service_account] + source = "terraform-google-modules/cloud-storage/google//modules/simple_bucket" + version = "~> 1.3" - name = var.input_bucket - project_id = var.project_id - location = var.region + name = var.input_bucket + project_id = var.project_id + location = var.region + force_destroy = var.force_destroy iam_members = [{ role = "roles/storage.objectAdmin" - member = module.data_ingester_service_account.iam_email + member = "serviceAccount:${var.data_ingester_sa}@${var.project_id}.iam.gserviceaccount.com" }] } @@ -59,20 +61,28 @@ resource "google_storage_bucket_object" "function_zip_object" { content_type = "application/zip" } +locals { + function_name = "gcs_to_bq_${var.app_id}" +} resource "google_cloudfunctions_function" "gcs_to_bq" { + depends_on = [google_storage_bucket_object.function_zip_object] project = var.project_id - name = "gcs_to_bq_${var.app_id}" + name = local.function_name region = var.region runtime = "python38" - timeout = 9 * 60 # seconds - service_account_email = var.data_ingester_sa + timeout = var.timeout + service_account_email = module.data_ingester_service_account.email source_archive_bucket = var.cloudfunctions_source_bucket 
source_archive_object = google_storage_bucket_object.function_zip_object.name entry_point = "main" - environment_variables = var.environment_variables + environment_variables = merge(var.environment_variables, { + GCP_PROJECT = var.project_id, + FUNCTION_TIMEOUT_SEC = var.timeout + FUNCTION_NAME = local.function_name + }) event_trigger { event_type = var.use_pubsub_notifications ? "providers/cloud.pubsub/eventTypes/topic.publish" : "google.storage.object.finalize" - resource = var.use_pubsub_notifications ? google_pubsub_topic.notification_topic[0].id : module.bucket.name + resource = var.use_pubsub_notifications ? "projects/${var.project_id}/${google_pubsub_topic.notification_topic[0].id}" : module.bucket.bucket.name } } @@ -83,6 +93,7 @@ module "data_ingester_service_account" { names = [var.data_ingester_sa, ] project_roles = [ "${var.project_id}=>roles/bigquery.jobUser", + "${var.project_id}=>roles/storage.admin", ] } @@ -112,3 +123,19 @@ resource "google_pubsub_topic_iam_binding" "cf_subscriber" { members = [module.data_ingester_service_account.iam_email] } +module "project-services" { + source = "terraform-google-modules/project-factory/google//modules/project_services" + version = "4.0.0" + + project_id = var.project_id + disable_services_on_destroy = "false" + + activate_apis = [ + "compute.googleapis.com", + "iam.googleapis.com", + "bigquery.googleapis.com", + "storage.googleapis.com", + "pubsub.googleapis.com", + "clouderrorreporting.googleapis.com", + ] +} diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf index e34d2d0f4..69d8017ab 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf @@ -21,3 +21,7 @@ output "data-ingester-sa" { 
value = module.data_ingester_service_account.email } +output "input-bucket" { + value = module.bucket.bucket.name +} + diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf index ca3073a0d..1783034f5 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf @@ -65,3 +65,12 @@ variable "bigquery_project_ids" { default = [] } +variable "force_destroy" { + description = "force destroy resources (e.g. for e2e tests)" + default = "false" +} + +variable "timeout" { + description = "Cloud Functions timeout in seconds" + default = 540 +} diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/versions.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/versions.tf index 68daa41d7..e4234775c 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/versions.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/versions.tf @@ -15,7 +15,7 @@ */ terraform { - required_version = ">= 0.12" + required_version = ">= 0.13" required_providers { google = ">= 3.38.0" diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/__init__.py b/tools/cloud_functions/gcs_event_based_ingest/tests/__init__.py deleted file mode 100644 index 3deceee10..000000000 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2020 Google LLC. -# This software is provided as-is, without warranty or representation -# for any use or purpose. -# Your use of it is subject to your agreement with Google. 
- -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import sys - -sys.path.append(os.path.realpath(os.path.dirname(__file__) + "/..")) diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index 776f7b08b..cfdc4323a 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -23,8 +23,8 @@ from google.cloud import error_reporting from google.cloud import storage -import gcs_ocn_bq_ingest.ordering -import gcs_ocn_bq_ingest.utils +import gcs_ocn_bq_ingest.common.ordering +import gcs_ocn_bq_ingest.common.utils TEST_DIR = os.path.realpath(os.path.dirname(__file__)) LOAD_JOB_POLLING_TIMEOUT = 10 # seconds @@ -75,6 +75,7 @@ def mock_env(gcs, monkeypatch): monkeypatch.setenv("GCP_PROJECT", gcs.project) monkeypatch.setenv("FUNCTION_NAME", "integration-test") monkeypatch.setenv("FUNCTION_TIMEOUT_SEC", "540") + monkeypatch.setenv("BQ_PROJECT", gcs.project) @pytest.fixture @@ -106,7 +107,7 @@ def teardown(): def dest_table(request, bq, mock_env, dest_dataset) -> bigquery.Table: with open(os.path.join(TEST_DIR, "resources", "nation_schema.json")) as schema_file: - schema = gcs_ocn_bq_ingest.utils.dict_to_bq_schema( + schema = gcs_ocn_bq_ingest.common.utils.dict_to_bq_schema( json.load(schema_file)) table = bigquery.Table( @@ -352,7 +353,7 @@ def dest_ordered_update_table(request, gcs, gcs_bucket, bq, 
mock_env, dest_dataset) -> bigquery.Table: with open(os.path.join(TEST_DIR, "resources", "ordering_schema.json")) as schema_file: - schema = gcs_ocn_bq_ingest.utils.dict_to_bq_schema( + schema = gcs_ocn_bq_ingest.common.utils.dict_to_bq_schema( json.load(schema_file)) table = bigquery.Table( @@ -373,7 +374,7 @@ def dest_ordered_update_table(request, gcs, gcs_bucket, bq, mock_env, "alpha_update": "" }], table, - job_id_prefix=gcs_ocn_bq_ingest.constants.DEFAULT_JOB_PREFIX) + job_id_prefix=gcs_ocn_bq_ingest.common.constants.DEFAULT_JOB_PREFIX) # The subscriber will be responsible for cleaning up this file. bqlock_obj: storage.blob.Blob = gcs_bucket.blob("/".join([ @@ -437,9 +438,11 @@ def gcs_backlog(request, gcs, gcs_bucket, # We will deal with the last incremental in the test itself to test the # behavior of a new backlog subscriber. for success_blob in gcs_ordered_update_data: - gcs_ocn_bq_ingest.ordering.backlog_publisher(gcs, success_blob) - backlog_blob = gcs_ocn_bq_ingest.ordering.success_blob_to_backlog_blob( - success_blob) + gcs_ocn_bq_ingest.common.ordering.backlog_publisher(gcs, success_blob) + backlog_blob = \ + gcs_ocn_bq_ingest.common.ordering.success_blob_to_backlog_blob( + success_blob + ) backlog_blob.upload_from_string("") data_objs.append(backlog_blob) @@ -497,7 +500,7 @@ def gcs_external_update_config(request, gcs_bucket, dest_dataset, backfill_blob = gcs_bucket.blob("/".join([ f"{dest_dataset.project}.{dest_dataset.dataset_id}", dest_ordered_update_table.table_id, - gcs_ocn_bq_ingest.constants.BACKFILL_FILENAME + gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME ])) backfill_blob.upload_from_string("") config_objs.append(sql_obj) diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index 6dfc57dec..349780f32 100644 --- 
a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -16,16 +16,17 @@ import re import time from typing import Dict, Optional +from unittest.mock import Mock import pytest from google.cloud import storage -import gcs_ocn_bq_ingest.constants +import gcs_ocn_bq_ingest.common.constants +import gcs_ocn_bq_ingest.common.utils import gcs_ocn_bq_ingest.main -import gcs_ocn_bq_ingest.utils COMPILED_DEFAULT_DENTINATION_REGEX = re.compile( - gcs_ocn_bq_ingest.constants.DEFAULT_DESTINATION_REGEX) + gcs_ocn_bq_ingest.common.constants.DEFAULT_DESTINATION_REGEX) @pytest.mark.parametrize( @@ -142,7 +143,7 @@ def test_default_destination_regex(test_input: str, ([["foo"], [], ["bar", "baz"]], ["foo", "bar", "baz"]), ]) def test_flattend2dlist(test_input, expected): - assert gcs_ocn_bq_ingest.utils.flatten2dlist(test_input) == expected + assert gcs_ocn_bq_ingest.common.utils.flatten2dlist(test_input) == expected @pytest.mark.parametrize( @@ -212,8 +213,8 @@ def test_flattend2dlist(test_input, expected): # yapf: enable ]) def test_recursive_update(original, update, expected): - assert gcs_ocn_bq_ingest.utils.recursive_update(original, - update) == expected + assert gcs_ocn_bq_ingest.common.utils.recursive_update(original, + update) == expected @pytest.mark.parametrize( @@ -237,14 +238,17 @@ def test_recursive_update(original, update, expected): ("dataset/table/_backlog/_BACKFILL", "dataset/table"), ]) def test_get_table_prefix(test_input, expected): - assert gcs_ocn_bq_ingest.utils.get_table_prefix(test_input) == expected + assert gcs_ocn_bq_ingest.common.utils.get_table_prefix( + test_input) == expected def test_triage_event(mock_env, mocker): test_event_blob: storage.Blob = storage.Blob.from_string( "gs://foo/bar/baz/00/_SUCCESS") - apply_mock = mocker.patch('gcs_ocn_bq_ingest.utils.apply') - gcs_ocn_bq_ingest.main.triage_event(None, 
None, test_event_blob, + apply_mock = mocker.patch('gcs_ocn_bq_ingest.common.utils.apply') + bq_mock = Mock() + bq_mock.project = "foo" + gcs_ocn_bq_ingest.main.triage_event(None, bq_mock, test_event_blob, time.monotonic()) apply_mock.assert_called_once() @@ -253,11 +257,13 @@ def test_triage_event_ordered(ordered_mock_env, mocker): enforce_ordering = True test_event_blob: storage.Blob = storage.Blob.from_string( "gs://foo/bar/baz/00/_SUCCESS") - apply_mock = mocker.patch('gcs_ocn_bq_ingest.utils.apply') + apply_mock = mocker.patch('gcs_ocn_bq_ingest.common.utils.apply') publisher_mock = mocker.patch( - 'gcs_ocn_bq_ingest.ordering.backlog_publisher') + 'gcs_ocn_bq_ingest.common.ordering.backlog_publisher') + bq_mock = Mock() + bq_mock.project = "foo" gcs_ocn_bq_ingest.main.triage_event(None, - None, + bq_mock, test_event_blob, time.monotonic(), enforce_ordering=enforce_ordering) @@ -266,7 +272,7 @@ def test_triage_event_ordered(ordered_mock_env, mocker): test_event_blob: storage.Blob = storage.Blob.from_string( "gs://foo/bar/baz/_BACKFILL") subscriber_mock = mocker.patch( - 'gcs_ocn_bq_ingest.ordering.backlog_subscriber') + 'gcs_ocn_bq_ingest.common.ordering.backlog_subscriber') gcs_ocn_bq_ingest.main.triage_event(None, None, test_event_blob, @@ -276,7 +282,8 @@ def test_triage_event_ordered(ordered_mock_env, mocker): test_event_blob: storage.Blob = storage.Blob.from_string( "gs://foo/bar/baz/_backlog/00/_SUCCESS") - monitor_mock = mocker.patch('gcs_ocn_bq_ingest.ordering.subscriber_monitor') + monitor_mock = mocker.patch( + 'gcs_ocn_bq_ingest.common.ordering.subscriber_monitor') gcs_ocn_bq_ingest.main.triage_event(None, None, test_event_blob, diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py index 2230417d9..9ecf236bc 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py +++ 
b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py @@ -22,10 +22,10 @@ from google.cloud import bigquery from google.cloud import storage -import gcs_ocn_bq_ingest.constants +import gcs_ocn_bq_ingest.common.constants +import gcs_ocn_bq_ingest.common.ordering +import gcs_ocn_bq_ingest.common.utils import gcs_ocn_bq_ingest.main -import gcs_ocn_bq_ingest.ordering -import gcs_ocn_bq_ingest.utils TEST_DIR = os.path.realpath(os.path.dirname(__file__) + "/..") LOAD_JOB_POLLING_TIMEOUT = 20 # seconds @@ -52,19 +52,20 @@ def test_backlog_publisher(gcs, gcs_bucket, gcs_partitioned_data, mock_env): for gcs_data in gcs_partitioned_data: if not gcs_data.exists(): raise EnvironmentError("test data objects must exist") - if gcs_data.name.endswith(gcs_ocn_bq_ingest.constants.SUCCESS_FILENAME): - table_prefix = gcs_ocn_bq_ingest.utils.get_table_prefix( + if gcs_data.name.endswith( + gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME): + table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix( gcs_data.name) - gcs_ocn_bq_ingest.ordering.backlog_publisher(gcs, gcs_data) + gcs_ocn_bq_ingest.common.ordering.backlog_publisher(gcs, gcs_data) expected_backlog_blobs = queue.Queue() expected_backlog_blobs.put("/".join([ table_prefix, "_backlog", "$2017041101", - gcs_ocn_bq_ingest.constants.SUCCESS_FILENAME + gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME ])) expected_backlog_blobs.put("/".join([ table_prefix, "_backlog", "$2017041102", - gcs_ocn_bq_ingest.constants.SUCCESS_FILENAME + gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME ])) for backlog_blob in gcs_bucket.list_blobs( @@ -72,7 +73,8 @@ def test_backlog_publisher(gcs, gcs_bucket, gcs_partitioned_data, mock_env): assert backlog_blob.name == expected_backlog_blobs.get(block=False) backfill_blob: storage.Blob = gcs_bucket.blob( - f"{table_prefix}/{gcs_ocn_bq_ingest.constants.BACKFILL_FILENAME}") + f"{table_prefix}/{gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME}" + ) assert 
backfill_blob.exists() @@ -89,7 +91,8 @@ def test_backlog_publisher_with_existing_backfill_file(gcs, gcs_bucket, table_prefix = "/".join( [dest_dataset.dataset_id, dest_partitioned_table.table_id]) backfill_blob: storage.Blob = gcs_bucket.blob( - f"{table_prefix}/{gcs_ocn_bq_ingest.constants.BACKFILL_FILENAME}") + f"{table_prefix}/{gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME}" + ) backfill_blob.upload_from_string("") backfill_blob.reload() original_backfill_blob_generation = backfill_blob.generation @@ -98,20 +101,21 @@ def test_backlog_publisher_with_existing_backfill_file(gcs, gcs_bucket, for gcs_data in gcs_partitioned_data: if not gcs_data.exists(): raise EnvironmentError("test data objects must exist") - if gcs_data.name.endswith(gcs_ocn_bq_ingest.constants.SUCCESS_FILENAME): - table_prefix = gcs_ocn_bq_ingest.utils.get_table_prefix( + if gcs_data.name.endswith( + gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME): + table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix( gcs_data.name) - gcs_ocn_bq_ingest.ordering.backlog_publisher(gcs, gcs_data) + gcs_ocn_bq_ingest.common.ordering.backlog_publisher(gcs, gcs_data) # Use of queue to test that list responses are returned in expected order. expected_backlog_blobs = queue.Queue() expected_backlog_blobs.put("/".join([ table_prefix, "_backlog", "$2017041101", - gcs_ocn_bq_ingest.constants.SUCCESS_FILENAME + gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME ])) expected_backlog_blobs.put("/".join([ table_prefix, "_backlog", "$2017041102", - gcs_ocn_bq_ingest.constants.SUCCESS_FILENAME + gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME ])) for backlog_blob in gcs_bucket.list_blobs( @@ -138,9 +142,9 @@ def test_backlog_subscriber_in_order_with_new_batch_after_exit( gets applied as expected. 
""" _run_subscriber(gcs, bq, gcs_external_update_config) - backlog_blobs = gcs_bucket.list_blobs( - prefix=f"{gcs_ocn_bq_ingest.utils.get_table_prefix(gcs_external_update_config.name)}/_backlog/" - ) + table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix( + gcs_external_update_config.name) + backlog_blobs = gcs_bucket.list_blobs(prefix=f"{table_prefix}/_backlog/") assert backlog_blobs.num_results == 0, "backlog is not empty" bqlock_blob: storage.Blob = gcs_bucket.blob("_bqlock") assert not bqlock_blob.exists(), "_bqlock was not cleaned up" @@ -213,7 +217,7 @@ def test_backlog_subscriber_in_order_with_new_batch_while_running( (bkt, dataset, table)) res_backlog_publisher.wait() res_monitor = pool.apply_async( - gcs_ocn_bq_ingest.ordering.subscriber_monitor, + gcs_ocn_bq_ingest.common.ordering.subscriber_monitor, (None, bkt, f"{dataset.project}.{dataset.dataset_id}/{table.table_id}/" f"_backlog/04/_SUCCESS")) @@ -225,9 +229,10 @@ def test_backlog_subscriber_in_order_with_new_batch_while_running( res_subscriber.wait() - backlog_blobs = gcs_bucket.list_blobs( - prefix=f"{gcs_ocn_bq_ingest.utils.get_table_prefix(gcs_external_update_config.name)}/" - f"_backlog/") + table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix( + gcs_external_update_config.name) + backlog_blobs = gcs_bucket.list_blobs(prefix=f"{table_prefix}/" + f"_backlog/") assert backlog_blobs.num_results == 0, "backlog is not empty" bqlock_blob: storage.Blob = gcs_bucket.blob("_bqlock") assert not bqlock_blob.exists(), "_bqlock was not cleaned up" @@ -247,9 +252,8 @@ def _run_subscriber( bq_client: Optional[bigquery.Client], backfill_blob, ): - gcs_ocn_bq_ingest.ordering.backlog_subscriber(gcs_client, - bq_client, backfill_blob, - time.monotonic()) + gcs_ocn_bq_ingest.common.ordering.backlog_subscriber( + gcs_client, bq_client, backfill_blob, time.monotonic()) def _post_a_new_batch(gcs_bucket, dest_dataset, dest_ordered_update_table): @@ -265,4 +269,4 @@ def _post_a_new_batch(gcs_bucket, 
dest_dataset, dest_ordered_update_table): "test-data", "ordering", "04", test_file), client=gcs) - return gcs_ocn_bq_ingest.ordering.backlog_publisher(gcs, data_obj) + return gcs_ocn_bq_ingest.common.ordering.backlog_publisher(gcs, data_obj) From c18e5e9787cd4398751dcf4ee2e16c523702004e Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 14 Dec 2020 18:36:28 -0800 Subject: [PATCH 37/90] ignore pylint redherring import errors --- .../gcs_ocn_bq_ingest/common/ordering.py | 6 +++--- .../gcs_ocn_bq_ingest/common/utils.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index 68e39542d..28d7e203d 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -30,9 +30,9 @@ from google.cloud import bigquery from google.cloud import storage -from . import constants -from . import exceptions -from . import utils +from . import constants # pylint: disable=no-name-in-module,import-error +from . import exceptions # pylint: disable=no-name-in-module,import-error +from . import utils # pylint: disable=no-name-in-module,import-error def backlog_publisher( diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 496ec8dae..764aec0a0 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -36,8 +36,8 @@ from google.cloud import bigquery from google.cloud import storage -from . import constants # pylint: disable=no-name-in-module -from . import exceptions # pylint: disable=no-name-in-module +from . 
import constants # pylint: disable=no-name-in-module,import-error +from . import exceptions # pylint: disable=no-name-in-module,import-error def external_query( # pylint: disable=too-many-arguments From 2c4376a0070cc4a432e88c8f10ba2fa6fdde23f1 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 14 Dec 2020 18:59:46 -0800 Subject: [PATCH 38/90] fixup! e2e tf to support builds where short_sha is set to empty string. --- tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf b/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf index 4c302663e..53d1adc07 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf @@ -19,7 +19,7 @@ output "bucket" { } resource "google_storage_bucket" "cloud_functions_source" { - name = "gcf-source-archives-${var.short_sha}" + name = "gcf-source-archives${var.short_sha}" project = var.project_id storage_class = "REGIONAL" location = var.region @@ -29,10 +29,10 @@ resource "google_storage_bucket" "cloud_functions_source" { module "gcs_ocn_bq_ingest" { source = "../terraform_module/gcs_ocn_bq_ingest_function" function_source_folder = "../gcs_ocn_bq_ingest" - app_id = "gcs-ocn-bq-ingest-e2e-test-${var.short_sha}" + app_id = "gcs-ocn-bq-ingest-e2e-test${var.short_sha}" cloudfunctions_source_bucket = google_storage_bucket.cloud_functions_source.name - data_ingester_sa = "data-ingester-sa-${var.short_sha}" - input_bucket = "gcs-ocn-bq-ingest-e2e-tests-${var.short_sha}" + data_ingester_sa = "data-ingester-sa${var.short_sha}" + input_bucket = "gcs-ocn-bq-ingest-e2e-tests${var.short_sha}" project_id = var.project_id environment_variables = { START_BACKFILL_FILENAME = "_HISTORYDONE" From b6690afb882e251874d1c8ba99a15b8923d246e5 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 14 Dec 2020 20:32:57 -0800 Subject: [PATCH 39/90] 
fix TF_VAR env var --- tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index 2ef218e43..56b7854be 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -143,7 +143,7 @@ steps: env: - 'TF_VAR_project_id=$PROJECT_ID' - 'TF_VAR_region=$_REGION' - - 'TF_VAR_suffix=$SHORT_SHA' + - 'TF_VAR_short_sha=$SHORT_SHA' id: 'terraform-e2e-apply' - name: 'gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci' dir: '${_BUILD_DIR}' @@ -171,7 +171,7 @@ steps: env: - 'TF_VAR_project_id=$PROJECT_ID' - 'TF_VAR_region=$_REGION' - - 'TF_VAR_suffix=$SHORT_SHA' + - 'TF_VAR_short_sha=$SHORT_SHA' id: 'terraform-e2e-destroy' timeout: '3600s' options: From 36be628502f8386e7186076b4cedac409dc91926 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 14 Dec 2020 20:45:23 -0800 Subject: [PATCH 40/90] enable resource manager api --- .../terraform_module/gcs_ocn_bq_ingest_function/main.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf index 80226e344..e52c55775 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf @@ -137,5 +137,6 @@ module "project-services" { "storage.googleapis.com", "pubsub.googleapis.com", "clouderrorreporting.googleapis.com", + "cloudresourcemanager.googleapis.com", ] } From 610374396e0bd193990f34f9cb37c4f496fd179e Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 14 Dec 2020 21:02:31 -0800 Subject: [PATCH 41/90] enable cloud 
functions api... --- .../terraform_module/gcs_ocn_bq_ingest_function/main.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf index e52c55775..6094881c3 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf @@ -138,5 +138,6 @@ module "project-services" { "pubsub.googleapis.com", "clouderrorreporting.googleapis.com", "cloudresourcemanager.googleapis.com", + "cloudfunctions.googleapis.com", ] } From edcdae553577d6f6894dd28b2b2b34fea4ad6088 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 14 Dec 2020 21:27:14 -0800 Subject: [PATCH 42/90] add unit test timeout --- tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index 56b7854be..c23d78d15 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -118,6 +118,7 @@ steps: - '-c' # pip installing again to get GCB to recognize mocker from pytest-mock - 'pip install -r requirements-dev.txt && python3 -m pytest tests -m "not IT"' + timeout: 15s id: 'unit-test' - name: 'gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci' dir: '${_BUILD_DIR}' From 63f480dbf8b50aca036c95040b062517073f8086 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 14 Dec 2020 21:41:49 -0800 Subject: [PATCH 43/90] explicit local backend --- .../cloud_functions/gcs_event_based_ingest/cloudbuild.yaml | 2 +- tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git 
a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index c23d78d15..b44ba6df8 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -154,7 +154,7 @@ steps: entrypoint: /bin/sh args: - '-c' - - 'python3 -m pytest e2e --tfstate=${_BUILD_DIR}/e2e/terraform.state' + - 'python3 -m pytest e2e --tfstate=/workspace/${_BUILD_DIR}/e2e/terraform.state' id: 'e2e-test' - name: 'hashicorp/terraform' waitFor: diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf b/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf index 53d1adc07..aed37b488 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf @@ -43,3 +43,9 @@ module "gcs_ocn_bq_ingest" { force_destroy = "true" } +terraform { + backend "local" { + path = "terraform.tfstate" + } +} + From 03d9b795f36375cfee48fcf0431cba16f93505d1 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 14 Dec 2020 21:50:52 -0800 Subject: [PATCH 44/90] debug missing state file --- tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index b44ba6df8..21163fb4f 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -154,7 +154,8 @@ steps: entrypoint: /bin/sh args: - '-c' - - 'python3 -m pytest e2e --tfstate=/workspace/${_BUILD_DIR}/e2e/terraform.state' +# - 'python3 -m pytest e2e --tfstate=/workspace/${_BUILD_DIR}/e2e/terraform.state' + - 'ls -R /workspace' id: 'e2e-test' - name: 'hashicorp/terraform' waitFor: From fa82f12f7ed813fdd252f5884aa179c698e01eff Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: 
Mon, 14 Dec 2020 22:05:11 -0800 Subject: [PATCH 45/90] debug --- tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index 21163fb4f..85bda8250 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -154,8 +154,8 @@ steps: entrypoint: /bin/sh args: - '-c' -# - 'python3 -m pytest e2e --tfstate=/workspace/${_BUILD_DIR}/e2e/terraform.state' - 'ls -R /workspace' +# - 'python3 -m pytest e2e --tfstate=/workspace/${_BUILD_DIR}/e2e/terraform.state' id: 'e2e-test' - name: 'hashicorp/terraform' waitFor: From d1acf9e933254c2d2055f6c0e4c8946ea54b060c Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 14 Dec 2020 22:18:58 -0800 Subject: [PATCH 46/90] relative state path --- tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index 85bda8250..cd1573d4e 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -154,8 +154,7 @@ steps: entrypoint: /bin/sh args: - '-c' - - 'ls -R /workspace' -# - 'python3 -m pytest e2e --tfstate=/workspace/${_BUILD_DIR}/e2e/terraform.state' + - 'python3 -m pytest e2e --tfstate=e2e/terraform.state' id: 'e2e-test' - name: 'hashicorp/terraform' waitFor: From b9e741c395b6d7f66a710326735a756ad5b1ccae Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 14 Dec 2020 22:27:23 -0800 Subject: [PATCH 47/90] typo .[tf]state --- tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index cd1573d4e..32cf50f14 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -154,7 +154,7 @@ steps: entrypoint: /bin/sh args: - '-c' - - 'python3 -m pytest e2e --tfstate=e2e/terraform.state' + - 'python3 -m pytest e2e --tfstate=e2e/terraform.tfstate' id: 'e2e-test' - name: 'hashicorp/terraform' waitFor: From dadacaa95eadb4f0c7f0ab032ea4e87bb06bc543 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 14 Dec 2020 22:48:27 -0800 Subject: [PATCH 48/90] fixup docs --- tools/cloud_functions/gcs_event_based_ingest/README.md | 7 ++++--- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/README.md index 70029831c..99d47ab60 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -252,6 +252,7 @@ SELECT total_slot_ms, destination_table state, + error_result, (SELECT value FROM UNNEST(labels) WHERE key = "component") as component, (SELECT value FROM UNNEST(labels) WHERE key = "cloud-function-name") as cloud_function_name, (SELECT value FROM UNNEST(labels) WHERE key = "batch-id") as batch_id, @@ -351,7 +352,7 @@ The system tests assume that you have deployed the cloud function. 
export TF_VAR_short_sha=$(git rev-parse --short=7 HEAD) export TF_VAR_project_id=${YOUR_GCP_PROJECT_ID} (cd e2e && terraform init && terraform apply -auto-approve) -python3 -m pytest -m SYS +python3 -m pytest e2e --tfstate e2e/terraform.tfstate ``` ## Deployment @@ -379,7 +380,7 @@ gcloud functions deploy test-gcs-bq-ingest \ --trigger-topic=${PUBSUB_TOPIC} \ --service-account=${SERVICE_ACCOUNT_EMAIL} \ --timeout=540 \ - --set-env-vars='DESTINATION_REGEX=^(?:[\w\-0-9]+)/(?P[\w\-_0-9]+)/(?P
[\w\-_0-9]+)/?(?:incremental|history)?/?(?P[0-9]{4})?/?(?P[0-9]{2})?/?(?P
[0-9]{2})?/?(?P[0-9]{2})?/?(?P[0-9]+)?/?' + --set-env-vars='DESTINATION_REGEX=^(?:[\w\-0-9]+)/(?P[\w\-_0-9]+)/(?P
[\w\-_0-9]+)/?(?:incremental|history)?/?(?P[0-9]{4})?/?(?P[0-9]{2})?/?(?P
[0-9]{2})?/?(?P[0-9]{2})?/?(?P[0-9]+)?/?,FUNCTION_TIMEOUT_SEC=540' ``` #### Cloud Functions Events @@ -396,7 +397,7 @@ gcloud functions deploy test-gcs-bq-ingest \ --trigger-event google.storage.object.finalize --service-account=${SERVICE_ACCOUNT_EMAIL} \ --timeout=540 \ - --set-env-vars='DESTINATION_REGEX=^(?:[\w\-0-9]+)/(?P[\w\-_0-9]+)/(?P
[\w\-_0-9]+)/?(?:incremental|history)?/?(?P[0-9]{4})?/?(?P[0-9]{2})?/?(?P
[0-9]{2})?/?(?P[0-9]{2})?/?(?P[0-9]+)?/?' + --set-env-vars='DESTINATION_REGEX=^(?:[\w\-0-9]+)/(?P[\w\-_0-9]+)/(?P
[\w\-_0-9]+)/?(?:incremental|history)?/?(?P[0-9]{4})?/?(?P[0-9]{2})?/?(?P
[0-9]{2})?/?(?P[0-9]{2})?/?(?P[0-9]+)?/?,FUNCTION_TIMEOUT_SEC=540' ``` In theory, one could set up Pub/Sub notifications from multiple GCS Buckets diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md index e93b10056..cd701cd09 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md @@ -33,6 +33,7 @@ following default behavior. | `MAX_BATCH_BYTES` | Max bytes for BigQuery Load job | `15000000000000` ([15 TB](https://cloud.google.com/bigquery/quotas#load_jobs)| | `JOB_PREFIX` | Prefix for BigQuery Job IDs | `gcf-ingest-` | | `BQ_PROJECT` | Default BQ project to use if not specified in dataset capturing group | Project where Cloud Function is deployed | +| `FUNCTION_TIMEOUT_SEC`| Number of seconds set for this deployment of Cloud Function (no longer part of python38 runtime) | 60 | | `ORDER_PER_TABLE`\* | Force jobs to be executed sequentially (rather than parallel) based on the backlog. This is the same as having an `ORDERME` file in every config directory | `False` | | `START_BACKFILL_FILENAME`\*| Block submitting BigQuery Jobs for a table until this file is present at the table prefix. By default this will not happen. 
| `None` | | `RESTART_BUFFER_SECONDS`\* | Buffer before Cloud Function timeout to leave before re-triggering the backfill subscriber | 30 | From 41f04aedd7efe7e62030e9e68c4a43dcd8f6c65a Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 11:00:12 -0800 Subject: [PATCH 49/90] chore: clean up subscriber --- .../gcs_event_based_ingest/e2e/e2e_test.py | 21 ++- .../gcs_event_based_ingest/e2e/main.tf | 1 - .../gcs_ocn_bq_ingest/common/ordering.py | 160 +++++++++++------- .../gcs_ocn_bq_ingest/common/utils.py | 1 + 4 files changed, 115 insertions(+), 68 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py index 7eaa9f7e8..8e66658f3 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py @@ -14,6 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""End-to-end test for GCS event based ingest to BigQuery Cloud Function""" import concurrent.futures import json import time @@ -27,14 +28,20 @@ @pytest.mark.SYS -def test_gcs_ocn_bq_ingest_cloud_function( +def test_gcs_ocn_bq_ingest_cloud_function_long_runnning_bq_jobs_with_orderme( gcs: storage.Client, bq: bigquery.Client, tf_state: Dict, dest_table: bigquery.Table, ): - """drop some test data and assert that the excpected actions are taken by - the deployed cloud function""" + """This test assumes the cloud function has been deployed with the + accompanying terraform module which configures a 1 min timeout. + It exports some larger data from a public BigQuery table and then reloads + them to test table to test the cloud function behavior with longer running + BigQuery jobs which are likely to require the backlog subscriber to restart + itself by reposting a _BACKFILL file. 
The ordering behavior is controlled + with the ORDERME blob. + """ input_bucket_id = tf_state['outputs']['bucket']['value'] table_prefix = f"{dest_table.dataset_id}/" \ f"{dest_table.table_id}" @@ -64,13 +71,15 @@ def _extract(batch: str): bkt: storage.Bucket = gcs.lookup_bucket(input_bucket_id) # configure load jobs for this table - load_config = bkt.blob(f"{table_prefix}/_config/load.json") - load_config.upload_from_string( + load_config_blob = bkt.blob(f"{table_prefix}/_config/load.json") + load_config_blob.upload_from_string( json.dumps({ "writeDisposition": "WRITE_APPEND", "sourceFormat": "AVRO", "useAvroLogicalTypes": "True", })) + orderme_blob = bkt.blob(f"{table_prefix}/_config/ORDERME") + orderme_blob.upload_from_string("") # add historical success files for batch in history_batch_nums: historical_success_blob: storage.Blob = bkt.blob( @@ -78,7 +87,7 @@ def _extract(batch: str): historical_success_blob.upload_from_string("") # assert 0 bq rows (because _HISTORYDONE not dropped yet) - dest_table: bigquery.Table = bq.get_table(dest_table) + dest_table = bq.get_table(dest_table) assert dest_table.num_rows == 0, \ "history was ingested before _HISTORYDONE was uploaded" diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf b/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf index aed37b488..af45d7eed 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf @@ -36,7 +36,6 @@ module "gcs_ocn_bq_ingest" { project_id = var.project_id environment_variables = { START_BACKFILL_FILENAME = "_HISTORYDONE" - ORDER_PER_TABLE = "True" } # We'll use a shorter timeout for e2e stress subscriber re-triggering timeout = 60 diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index 28d7e203d..1b9bfeddf 100644 --- 
a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -53,7 +53,6 @@ def backlog_publisher( table_prefix) -# pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-branches def backlog_subscriber(gcs_client: Optional[storage.Client], bq_client: Optional[bigquery.Client], backfill_blob: storage.Blob, function_start_time: float): @@ -68,7 +67,6 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], float(os.getenv("FUNCTION_TIMEOUT_SEC", "60")) - constants.RESTART_BUFFER_SECONDS) print(f"restart time is {restart_time}") - backfill_blob_generation = backfill_blob.generation bkt = backfill_blob.bucket utils.handle_duplicate_notification(gcs_client, backfill_blob) table_prefix = utils.get_table_prefix(backfill_blob.name) @@ -93,28 +91,9 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], # the else will handle a manual _bqlock if lock_contents.startswith( os.getenv('JOB_PREFIX', constants.DEFAULT_JOB_PREFIX)): - job_id = lock_contents - try: - last_job_done = utils.wait_on_bq_job_id( - bq_client, job_id, polling_timeout) - except (exceptions.BigQueryJobFailure, - google.api_core.exceptions.NotFound) as err: - raise exceptions.BigQueryJobFailure( - f"previous BigQuery job: {job_id} failed or could not " - "be found. This will kill the backfill subscriber for " - f"the table prefix: {table_prefix}." 
- "Once the issue is dealt with by a human, the lock " - "file at: " - f"gs://{lock_blob.bucket.name}/{lock_blob.name} " - "should be manually removed and a new empty " - f"{constants.BACKFILL_FILENAME} " - "file uploaded to: " - f"gs://{backfill_blob.bucket.name}/{table_prefix}" - "/_BACKFILL " - f"to resume the backfill subscriber so it can " - "continue with the next item in the backlog.\n" - "Original Exception:\n" - f"{traceback.format_exc()}") from err + last_job_done = wait_on_last_job(bq_client, lock_blob, + backfill_blob, lock_contents, + polling_timeout) else: print(f"sleeping for {polling_timeout} seconds because" f"found manual lock gs://{bkt.name}/{lock_blob.name} with" @@ -136,39 +115,75 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], # If the BQ lock was missing we do not want to delete a backlog # item for a job we have not yet submitted. utils.remove_oldest_backlog_item(gcs_client, bkt, table_prefix) - - check_backlog_time = time.monotonic() - next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt, - table_prefix) - if not next_backlog_file: - print("no more files found in the backlog deleteing backfill blob") - backfill_blob.delete(if_generation_match=backfill_blob_generation, - client=gcs_client) - if (check_backlog_time + constants.ENSURE_SUBSCRIBER_SECONDS < - time.monotonic()): - print( - "checking if the backlog is still empty for " - f"gs://${bkt.name}/{table_prefix}/_backlog/" - f"There was more than {constants.ENSURE_SUBSCRIBER_SECONDS}" - " seconds between listing items on the backlog and " - f"deleting the {constants.BACKFILL_FILENAME}. " - "This should not happen often but is meant to alleviate a " - "race condition in the event that something caused the " - "delete operation was delayed or had to be retried for a " - "long time.") - next_backlog_file = utils.get_next_backlog_item( - gcs_client, bkt, table_prefix) - if next_backlog_file: - # The backfill file was deleted but the backlog is - # not empty. 
Re-trigger the backfill subscriber loop by - # dropping a new backfill file. - start_backfill_subscriber_if_not_running( - gcs_client, bkt, table_prefix) - return - utils.handle_bq_lock(gcs_client, lock_blob, None) - print(f"backlog is empty for gs://{bkt.name}/{table_prefix}. " - "backlog subscriber exiting.") + should_subscriber_exit = handle_backlog(gcs_client, bq_client, bkt, + lock_blob, backfill_blob) + if should_subscriber_exit: return + # retrigger the subscriber loop by reposting the _BACKFILL file + print("ran out of time, restarting backfill subscriber loop for:" + f"gs://{bkt.name}/{table_prefix}") + backfill_blob = bkt.blob(f"{table_prefix}/{constants.BACKFILL_FILENAME}") + backfill_blob.upload_from_string("") + + +def wait_on_last_job(bq_client: bigquery.Client, lock_blob: storage.Blob, + backfill_blob: storage.blob, job_id: str, + polling_timeout: int): + """wait on a bigquery job or raise informative exception. + + Args: + bq_client: bigquery.Client + lock_blob: storage.Blob _bqlock blob + backfill_blob: storage.blob _BACKFILL blob + job_id: str BigQuery job ID to wait on (read from _bqlock file) + polling_timeout: int seconds to poll before returning. + """ + try: + return utils.wait_on_bq_job_id(bq_client, job_id, polling_timeout) + except (exceptions.BigQueryJobFailure, + google.api_core.exceptions.NotFound) as err: + table_prefix = utils.get_table_prefix(backfill_blob.name) + raise exceptions.BigQueryJobFailure( + f"previous BigQuery job: {job_id} failed or could not " + "be found. This will kill the backfill subscriber for " + f"the table prefix: {table_prefix}." 
+ "Once the issue is dealt with by a human, the lock " + "file at: " + f"gs://{lock_blob.bucket.name}/{lock_blob.name} " + "should be manually removed and a new empty " + f"{constants.BACKFILL_FILENAME} " + "file uploaded to: " + f"gs://{backfill_blob.bucket.name}/{table_prefix}" + "/_BACKFILL " + f"to resume the backfill subscriber so it can " + "continue with the next item in the backlog.\n" + "Original Exception:\n" + f"{traceback.format_exc()}") from err + + +def handle_backlog( + gcs_client: storage.Client, + bq_client: bigquery.Client, + bkt: storage.Bucket, + lock_blob: storage.Blob, + backfill_blob: storage.Blob, +): + """submit the next item in the _backlog if it is non-empty or clean up the + _BACKFILL and _bqlock files. + Args: + gcs_client: storage.Client + bq_client: bigquery.Client + bkt: storage.Bucket + lock_blob: storage.Blob _bqlock blob + backfill_blob: storage.blob _BACKFILL blob + Returns: + bool: should this backlog subscriber exit + """ + table_prefix = utils.get_table_prefix(backfill_blob.name) + check_backlog_time = time.monotonic() + next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt, + table_prefix) + if next_backlog_file: next_success_file: storage.Blob = bkt.blob( next_backlog_file.name.replace("/_backlog/", "/")) table_ref, batch = utils.gcs_path_to_table_ref_and_batch( @@ -184,11 +199,34 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], next_job_id = utils.create_job_id(table_ref, batch) utils.apply(gcs_client, bq_client, next_success_file, lock_blob, next_job_id) - # retrigger the subscriber loop by reposting the _BACKFILL file - print("ran out of time, restarting backfill subscriber loop for:" - f"gs://{bkt.name}/{table_prefix}") - backfill_blob = bkt.blob(f"{table_prefix}/{constants.BACKFILL_FILENAME}") - backfill_blob.upload_from_string("") + return False # BQ job running + print("no more files found in the backlog deleteing backfill blob") + 
backfill_blob.delete(if_generation_match=backfill_blob.generation, + client=gcs_client) + if (check_backlog_time + constants.ENSURE_SUBSCRIBER_SECONDS < + time.monotonic()): + print("checking if the backlog is still empty for " + f"gs://${bkt.name}/{table_prefix}/_backlog/" + f"There was more than {constants.ENSURE_SUBSCRIBER_SECONDS}" + " seconds between listing items on the backlog and " + f"deleting the {constants.BACKFILL_FILENAME}. " + "This should not happen often but is meant to alleviate a " + "race condition in the event that something caused the " + "delete operation was delayed or had to be retried for a " + "long time.") + next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt, + table_prefix) + if next_backlog_file: + # The backfill file was deleted but the backlog is + # not empty. Re-trigger the backfill subscriber loop by + # dropping a new backfill file. + start_backfill_subscriber_if_not_running(gcs_client, bkt, + table_prefix) + return True # we are re-triggering a new backlog subscriber + utils.handle_bq_lock(gcs_client, lock_blob, None) + print(f"backlog is empty for gs://{bkt.name}/{table_prefix}. " + "backlog subscriber exiting.") + return True # the backlog is empty def start_backfill_subscriber_if_not_running( diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 764aec0a0..9c41bb0fc 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -440,6 +440,7 @@ def handle_duplicate_notification( "storage notification.") from err +@cachetools.cached(cachetools.LRUCache(maxsize=1024)) def get_table_prefix(object_id: str) -> str: """Find the table prefix for a object_id based on the destination regex. 
Args: From d8ae3cfec50d3004961f34b385193588beea57cf Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 12:03:23 -0800 Subject: [PATCH 50/90] fix: don't try to regex match _backlog/* items --- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index cbd55cf7c..228c1e509 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -110,7 +110,10 @@ def triage_event(gcs_client: Optional[storage.Client], blob.""" bkt = event_blob.bucket basename_object_id = os.path.basename(event_blob.name) - if bq_client: + # the _backlog/ directory is likely to mess up the regex matching + # in gcs_path_to_table_ref_and_batch and we won't use the variables in that + # code path anyway. 
+ if bq_client and '_backlog' not in event_blob.name: table_ref, batch = utils.gcs_path_to_table_ref_and_batch( event_blob.name, bq_client.project) else: From d9f34823850001d174b455b3799af74c3ced970b Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 12:07:21 -0800 Subject: [PATCH 51/90] don't regex match in triage if ordering enabled (this happens later) --- .../gcs_ocn_bq_ingest/main.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 228c1e509..466108042 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -110,15 +110,6 @@ def triage_event(gcs_client: Optional[storage.Client], blob.""" bkt = event_blob.bucket basename_object_id = os.path.basename(event_blob.name) - # the _backlog/ directory is likely to mess up the regex matching - # in gcs_path_to_table_ref_and_batch and we won't use the variables in that - # code path anyway. - if bq_client and '_backlog' not in event_blob.name: - table_ref, batch = utils.gcs_path_to_table_ref_and_batch( - event_blob.name, bq_client.project) - else: - table_ref, batch = utils.gcs_path_to_table_ref_and_batch( - event_blob.name, None) if enforce_ordering: # For SUCCESS files in a backlog directory, ensure that subscriber @@ -155,6 +146,12 @@ def triage_event(gcs_client: Optional[storage.Client], return else: # Default behavior submit job as soon as success file lands. 
if basename_object_id == constants.SUCCESS_FILENAME: + if bq_client: + table_ref, batch = utils.gcs_path_to_table_ref_and_batch( + event_blob.name, bq_client.project) + else: + table_ref, batch = utils.gcs_path_to_table_ref_and_batch( + event_blob.name, None) utils.apply( gcs_client, bq_client, From 7d2f28f3bea84e0f19b02b2d2f6041d13c24d0b6 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 12:28:54 -0800 Subject: [PATCH 52/90] fix: subscriber monitor get table prefix --- .../gcs_ocn_bq_ingest/common/ordering.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index 1b9bfeddf..bfc39c535 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -305,7 +305,8 @@ def subscriber_monitor(gcs_client: Optional[storage.Client], if not gcs_client: gcs_client = storage.Client(client_info=constants.CLIENT_INFO) backfill_blob = start_backfill_subscriber_if_not_running( - gcs_client, bkt, utils.get_table_prefix(object_id)) + gcs_client, bkt, utils.get_table_prefix( + object_id.replace("_backlog/", ""))) # backfill blob may be none if the START_BACKFILL_FILENAME has not been # dropped From 35fe6e3a66b27bcc6588c6be9835489d8eabf699 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 12:53:12 -0800 Subject: [PATCH 53/90] fix: get_table_prefix issues w/ backlog, backfill and historydone --- .../gcs_event_based_ingest/README.md | 5 +++-- .../gcs_ocn_bq_ingest/common/ordering.py | 19 ++++++++++++------- .../gcs_ocn_bq_ingest/common/utils.py | 11 ++++++++++- .../gcs_ocn_bq_ingest/main.py | 7 ++++--- 4 files changed, 29 insertions(+), 13 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md 
b/tools/cloud_functions/gcs_event_based_ingest/README.md index 99d47ab60..f53917936 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -349,10 +349,11 @@ python3 -m pytest -m IT #### Running System Tests Only The system tests assume that you have deployed the cloud function. ```bash -export TF_VAR_short_sha=$(git rev-parse --short=7 HEAD) -export TF_VAR_project_id=${YOUR_GCP_PROJECT_ID} +export TF_VAR_short_sha=$(git rev-parse --short=10 HEAD) +export TF_VAR_project_id=jferriero-pp-dev (cd e2e && terraform init && terraform apply -auto-approve) python3 -m pytest e2e --tfstate e2e/terraform.tfstate +(cd e2e && terraform destroy -auto-approve) ``` ## Deployment diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index bfc39c535..78f820de9 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -69,7 +69,8 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], print(f"restart time is {restart_time}") bkt = backfill_blob.bucket utils.handle_duplicate_notification(gcs_client, backfill_blob) - table_prefix = utils.get_table_prefix(backfill_blob.name) + table_prefix = utils.removesuffix(backfill_blob.name, + constants.BACKFILL_FILENAME) last_job_done = False # we will poll for job completion this long in an individual iteration of # the while loop (before checking if we are too close to cloud function @@ -142,7 +143,8 @@ def wait_on_last_job(bq_client: bigquery.Client, lock_blob: storage.Blob, return utils.wait_on_bq_job_id(bq_client, job_id, polling_timeout) except (exceptions.BigQueryJobFailure, google.api_core.exceptions.NotFound) as err: - table_prefix = utils.get_table_prefix(backfill_blob.name) + table_prefix = 
utils.removesuffix(backfill_blob.name, + constants.BACKFILL_FILENAME) raise exceptions.BigQueryJobFailure( f"previous BigQuery job: {job_id} failed or could not " "be found. This will kill the backfill subscriber for " @@ -179,7 +181,8 @@ def handle_backlog( Returns: bool: should this backlog subscriber exit """ - table_prefix = utils.get_table_prefix(backfill_blob.name) + table_prefix = utils.removesuffix(backfill_blob.name, + constants.BACKFILL_FILENAME) check_backlog_time = time.monotonic() next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt, table_prefix) @@ -305,8 +308,8 @@ def subscriber_monitor(gcs_client: Optional[storage.Client], if not gcs_client: gcs_client = storage.Client(client_info=constants.CLIENT_INFO) backfill_blob = start_backfill_subscriber_if_not_running( - gcs_client, bkt, utils.get_table_prefix( - object_id.replace("_backlog/", ""))) + gcs_client, bkt, + utils.get_table_prefix(object_id)) # backfill blob may be none if the START_BACKFILL_FILENAME has not been # dropped @@ -326,14 +329,16 @@ def subscriber_monitor(gcs_client: Optional[storage.Client], "subscriber for this table.") backfill_blob.delete(client=gcs_client) start_backfill_subscriber_if_not_running( - gcs_client, bkt, utils.get_table_prefix(object_id)) + gcs_client, bkt, + utils.get_table_prefix(object_id)) return True time.sleep(constants.ENSURE_SUBSCRIBER_SECONDS) while not utils.wait_on_gcs_blob(gcs_client, backfill_blob, constants.ENSURE_SUBSCRIBER_SECONDS): start_backfill_subscriber_if_not_running( - gcs_client, bkt, utils.get_table_prefix(object_id)) + gcs_client, bkt, + utils.get_table_prefix(object_id)) return True return False diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 9c41bb0fc..3016f76ac 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ 
b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -448,7 +448,16 @@ def get_table_prefix(object_id: str) -> str: Returns: str: table prefix """ - match = constants.DESTINATION_REGEX.match(object_id) + basename = os.path.basename(object_id) + if basename in { + constants.BACKFILL_FILENAME, + constants.START_BACKFILL_FILENAME + }: + # These files will not match the regex and always should appear at the + # table level. + return removesuffix(object_id, basename) + match = constants.DESTINATION_REGEX.match( + object_id.replace("_backlog/", "")) if not match: raise exceptions.DestinationRegexMatchException( f"could not determine table prefix for object id: {object_id}" diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 466108042..6425fb506 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -128,7 +128,9 @@ def triage_event(gcs_client: Optional[storage.Client], and basename_object_id == constants.START_BACKFILL_FILENAME): # This will be the first backfill file. 
ordering.start_backfill_subscriber_if_not_running( - gcs_client, bkt, utils.get_table_prefix(event_blob.name)) + gcs_client, bkt, + utils.removesuffix(event_blob.name, + constants.START_BACKFILL_FILENAME)) return if basename_object_id == constants.SUCCESS_FILENAME: ordering.backlog_publisher(gcs_client, event_blob) @@ -169,8 +171,7 @@ def lazy_error_reporting_client() -> error_reporting.Client: """ global ERROR_REPORTING_CLIENT if not ERROR_REPORTING_CLIENT: - ERROR_REPORTING_CLIENT = error_reporting.Client( - client_info=constants.CLIENT_INFO) + ERROR_REPORTING_CLIENT = error_reporting.Client() return ERROR_REPORTING_CLIENT From d93a2c9df9325b49c94c90309ed0650f228a206a Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 13:50:42 -0800 Subject: [PATCH 54/90] fix: look_for_config_in_parents should return empty string for empty file --- tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py | 2 +- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py index 8e66658f3..39be223b6 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py @@ -32,7 +32,7 @@ def test_gcs_ocn_bq_ingest_cloud_function_long_runnning_bq_jobs_with_orderme( gcs: storage.Client, bq: bigquery.Client, tf_state: Dict, - dest_table: bigquery.Table, + dest_table: bigquery.Table ): """This test assumes the cloud function has been deployed with the accompanying terraform module which configures a 1 min timeout. 
diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 3016f76ac..5c05ab3ab 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -170,7 +170,7 @@ def _get_parent_config(path): config = None while parts: - if config: + if config is not None: return config config = _get_parent_config("/".join(parts)) parts.pop() From d50fefc49d87891ee62c3d248f21ce77066cccb2 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 14:02:43 -0800 Subject: [PATCH 55/90] fix table prefix w/ trailing slash --- tools/.gitignore | 1 + .../gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py | 2 +- .../tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 tools/.gitignore diff --git a/tools/.gitignore b/tools/.gitignore new file mode 100644 index 000000000..c18dd8d83 --- /dev/null +++ b/tools/.gitignore @@ -0,0 +1 @@ +__pycache__/ diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 5c05ab3ab..097f6a3dd 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -455,7 +455,7 @@ def get_table_prefix(object_id: str) -> str: }: # These files will not match the regex and always should appear at the # table level. 
- return removesuffix(object_id, basename) + return removesuffix(object_id, f"/{basename}") match = constants.DESTINATION_REGEX.match( object_id.replace("_backlog/", "")) if not match: diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index 349780f32..ba6d95bf2 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -235,7 +235,6 @@ def test_recursive_update(original, update, expected): ("dataset/table/2020/01/02/03/batch_id/_SUCCESS", "dataset/table"), ("project.dataset/table/2020/01/02/03/batch_id/_SUCCESS", "project.dataset/table"), - ("dataset/table/_backlog/_BACKFILL", "dataset/table"), ]) def test_get_table_prefix(test_input, expected): assert gcs_ocn_bq_ingest.common.utils.get_table_prefix( From b16a8b0cc755d4a47b8b7e97f38426bba8762747 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 14:28:10 -0800 Subject: [PATCH 56/90] use get_table_prefix instead of removesuffix --- .../gcs_event_based_ingest/e2e/e2e_test.py | 7 ++----- .../gcs_ocn_bq_ingest/common/ordering.py | 18 ++++++------------ .../gcs_ocn_bq_ingest/common/utils.py | 5 ++--- .../test_gcs_ocn_bq_ingest.py | 3 +++ 4 files changed, 13 insertions(+), 20 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py index 39be223b6..4d35fab39 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py @@ -29,11 +29,8 @@ @pytest.mark.SYS def test_gcs_ocn_bq_ingest_cloud_function_long_runnning_bq_jobs_with_orderme( - gcs: storage.Client, - bq: bigquery.Client, - tf_state: Dict, - dest_table: bigquery.Table -): + gcs: 
storage.Client, bq: bigquery.Client, tf_state: Dict, + dest_table: bigquery.Table): """This test assumes the cloud function has been deployed with the accompanying terraform module which configures a 1 min timeout. It exports some larger data from a public BigQuery table and then reloads diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index 78f820de9..1b9bfeddf 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -69,8 +69,7 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], print(f"restart time is {restart_time}") bkt = backfill_blob.bucket utils.handle_duplicate_notification(gcs_client, backfill_blob) - table_prefix = utils.removesuffix(backfill_blob.name, - constants.BACKFILL_FILENAME) + table_prefix = utils.get_table_prefix(backfill_blob.name) last_job_done = False # we will poll for job completion this long in an individual iteration of # the while loop (before checking if we are too close to cloud function @@ -143,8 +142,7 @@ def wait_on_last_job(bq_client: bigquery.Client, lock_blob: storage.Blob, return utils.wait_on_bq_job_id(bq_client, job_id, polling_timeout) except (exceptions.BigQueryJobFailure, google.api_core.exceptions.NotFound) as err: - table_prefix = utils.removesuffix(backfill_blob.name, - constants.BACKFILL_FILENAME) + table_prefix = utils.get_table_prefix(backfill_blob.name) raise exceptions.BigQueryJobFailure( f"previous BigQuery job: {job_id} failed or could not " "be found. 
This will kill the backfill subscriber for " @@ -181,8 +179,7 @@ def handle_backlog( Returns: bool: should this backlog subscriber exit """ - table_prefix = utils.removesuffix(backfill_blob.name, - constants.BACKFILL_FILENAME) + table_prefix = utils.get_table_prefix(backfill_blob.name) check_backlog_time = time.monotonic() next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt, table_prefix) @@ -308,8 +305,7 @@ def subscriber_monitor(gcs_client: Optional[storage.Client], if not gcs_client: gcs_client = storage.Client(client_info=constants.CLIENT_INFO) backfill_blob = start_backfill_subscriber_if_not_running( - gcs_client, bkt, - utils.get_table_prefix(object_id)) + gcs_client, bkt, utils.get_table_prefix(object_id)) # backfill blob may be none if the START_BACKFILL_FILENAME has not been # dropped @@ -329,16 +325,14 @@ def subscriber_monitor(gcs_client: Optional[storage.Client], "subscriber for this table.") backfill_blob.delete(client=gcs_client) start_backfill_subscriber_if_not_running( - gcs_client, bkt, - utils.get_table_prefix(object_id)) + gcs_client, bkt, utils.get_table_prefix(object_id)) return True time.sleep(constants.ENSURE_SUBSCRIBER_SECONDS) while not utils.wait_on_gcs_blob(gcs_client, backfill_blob, constants.ENSURE_SUBSCRIBER_SECONDS): start_backfill_subscriber_if_not_running( - gcs_client, bkt, - utils.get_table_prefix(object_id)) + gcs_client, bkt, utils.get_table_prefix(object_id)) return True return False diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 097f6a3dd..98c583899 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -450,14 +450,13 @@ def get_table_prefix(object_id: str) -> str: """ basename = os.path.basename(object_id) if basename in { - constants.BACKFILL_FILENAME, - 
constants.START_BACKFILL_FILENAME + constants.BACKFILL_FILENAME, constants.START_BACKFILL_FILENAME }: # These files will not match the regex and always should appear at the # table level. return removesuffix(object_id, f"/{basename}") match = constants.DESTINATION_REGEX.match( - object_id.replace("_backlog/", "")) + object_id.replace("/_backlog/", "/")) if not match: raise exceptions.DestinationRegexMatchException( f"could not determine table prefix for object id: {object_id}" diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index ba6d95bf2..877ac0104 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -235,6 +235,9 @@ def test_recursive_update(original, update, expected): ("dataset/table/2020/01/02/03/batch_id/_SUCCESS", "dataset/table"), ("project.dataset/table/2020/01/02/03/batch_id/_SUCCESS", "project.dataset/table"), + ("dataset/table/_BACKFILL", "dataset/table"), + ("dataset/table/_bqlock", "dataset/table"), + ("dataset/table/_backlog/2020/01/02/03/_SUCCESS", "dataset/table"), ]) def test_get_table_prefix(test_input, expected): assert gcs_ocn_bq_ingest.common.utils.get_table_prefix( From f685511ea08a5fa6da0e516a5077dc6de1773bda Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 16:10:10 -0800 Subject: [PATCH 57/90] chore: refactor terraform into pytest fixture to always clean up --- .../gcs_event_based_ingest/.hadolint.yaml | 2 + .../gcs_event_based_ingest/Dockerfile.ci | 13 +++++- .../gcs_event_based_ingest/README.md | 4 +- .../gcs_event_based_ingest/cloudbuild.yaml | 40 +++---------------- .../gcs_event_based_ingest/e2e/conftest.py | 33 ++++++++++++--- .../gcs_event_based_ingest/e2e/e2e_test.py | 10 +++-- 
.../gcs_ocn_bq_ingest/common/utils.py | 4 +- .../scripts/install_terraform.sh | 27 +++++++++++++ 8 files changed, 84 insertions(+), 49 deletions(-) create mode 100644 tools/cloud_functions/gcs_event_based_ingest/.hadolint.yaml create mode 100755 tools/cloud_functions/gcs_event_based_ingest/scripts/install_terraform.sh diff --git a/tools/cloud_functions/gcs_event_based_ingest/.hadolint.yaml b/tools/cloud_functions/gcs_event_based_ingest/.hadolint.yaml new file mode 100644 index 000000000..8f7e23e45 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/.hadolint.yaml @@ -0,0 +1,2 @@ +ignored: + - DL3008 diff --git a/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci b/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci index f92277062..d383e7563 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci +++ b/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci @@ -1,4 +1,15 @@ -FROM python:3.8-slim +FROM python:3.8 +RUN apt-get update \ + && apt-get install --no-install-recommends -y \ + apt-transport-https \ + ca-certificates \ + curl \ + sudo \ + unzip \ + && apt-get autoremove -yqq --purge \ + && apt-get clean && rm -rf /var/lib/apt/lists/* COPY requirements.txt requirements-dev.txt ./ +COPY scripts/install_terraform.sh ./ +RUN ./install_terraform.sh RUN pip3 install --no-cache-dir -r requirements-dev.txt ENTRYPOINT ["python3 -m pytest"] diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/README.md index f53917936..87515fb1c 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -351,9 +351,7 @@ The system tests assume that you have deployed the cloud function. 
```bash export TF_VAR_short_sha=$(git rev-parse --short=10 HEAD) export TF_VAR_project_id=jferriero-pp-dev -(cd e2e && terraform init && terraform apply -auto-approve) -python3 -m pytest e2e --tfstate e2e/terraform.tfstate -(cd e2e && terraform destroy -auto-approve) +python3 -m pytest -vvv e2e ``` ## Deployment diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index 32cf50f14..fb9ff10de 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -20,6 +20,8 @@ steps: dir: '${_BUILD_DIR}' entrypoint: '/bin/hadolint' args: + - '--config' + - '.hadolint.yaml' - 'Dockerfile.ci' id: 'lint-ci-docker-image' - name: 'gcr.io/kaniko-project/executor:latest' @@ -130,50 +132,20 @@ steps: - '-c' - 'pip install -r requirements-dev.txt && python3 -m pytest tests -m IT' id: 'integration-test' -- name: 'hashicorp/terraform' - waitFor: - - 'integration-test' - dir: '${_BUILD_DIR}/e2e' - args: ['init'] - id: 'terraform-e2e-init' -- name: 'hashicorp/terraform' - waitFor: - - 'terraform-e2e-init' - dir: '${_BUILD_DIR}/e2e' - args: ['apply', '-auto-approve'] +- name: 'gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci' + dir: '${_BUILD_DIR}' env: - 'TF_VAR_project_id=$PROJECT_ID' - 'TF_VAR_region=$_REGION' - 'TF_VAR_short_sha=$SHORT_SHA' - id: 'terraform-e2e-apply' -- name: 'gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci' - dir: '${_BUILD_DIR}' waitFor: + - 'integration-test' - 'build-ci-image' - - 'terraform-e2e-apply' entrypoint: /bin/sh args: - '-c' - - 'python3 -m pytest e2e --tfstate=e2e/terraform.tfstate' + - 'python3 -m pytest -vvv e2e' id: 'e2e-test' -- name: 'hashicorp/terraform' - waitFor: - - 'e2e-test' - dir: '${_BUILD_DIR}/e2e' - # Note if the e2e test fails the resources will not be cleaned up due to - # cloud build not allowing ignored failed steps. 
- # this will allow maintainer to evaluate what went wrong during e2e test - # because the evidence will not be destroyed. - # Maintainers of bqutil project should destroy these resources after the - # failure cause has been diagnosed. - # We do not run this e2e test unless all unit and integration tests pass. - # https://github.com/GoogleCloudPlatform/cloud-builders/issues/253 - args: ['destroy', '-auto-approve'] - env: - - 'TF_VAR_project_id=$PROJECT_ID' - - 'TF_VAR_region=$_REGION' - - 'TF_VAR_short_sha=$SHORT_SHA' - id: 'terraform-e2e-destroy' timeout: '3600s' options: machineType: 'N1_HIGHCPU_32' diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py index 80b870617..bce25d00a 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py @@ -17,6 +17,8 @@ """End-to-end tests for event based BigQuery ingest Cloud Function.""" import json import os +import shlex +import subprocess import uuid import pytest @@ -24,6 +26,9 @@ from google.cloud import storage +TEST_DIR = os.path.realpath(os.path.dirname(__file__)) + + def pytest_addoption(parser): # if Terraform was used to deploy resources, pass the state details parser.addoption("--tfstate", action="store", default=None) @@ -42,13 +47,29 @@ def gcs() -> storage.Client: @pytest.fixture(scope='module') -def tf_state(pytestconfig): +def terraform_infra(request): + def _run(cmd): + print( + subprocess.check_output( + cmd, + stderr=subprocess.STDOUT, + cwd=TEST_DIR + ) + ) + + init = shlex.split("terraform init") + apply = shlex.split("terraform apply -auto-approve") + destroy = shlex.split("terraform destroy -auto-approve") + + _run(init) + _run(apply) - # if we used Terraform to create the GCP resources, use the output variables - if pytestconfig.getoption('tfstate') is not None: - tf_state_file = pytestconfig.getoption('tfstate') - with open(tf_state_file, 
'r', encoding='utf-8') as fp: - return json.load(fp) + def teardown(): + _run(destroy) + + request.addfinalizer(teardown) + with open(os.path.join(TEST_DIR, "terraform.tfstate")) as tf_state_file: + return json.load(tf_state_file) @pytest.fixture diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py index 4d35fab39..e5057a903 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py @@ -28,9 +28,11 @@ @pytest.mark.SYS -def test_gcs_ocn_bq_ingest_cloud_function_long_runnning_bq_jobs_with_orderme( - gcs: storage.Client, bq: bigquery.Client, tf_state: Dict, - dest_table: bigquery.Table): +def test_cloud_function_long_runnning_bq_jobs_with_orderme( + gcs: storage.Client, bq: bigquery.Client, + dest_table: bigquery.Table, + terraform_infra: Dict +): """This test assumes the cloud function has been deployed with the accompanying terraform module which configures a 1 min timeout. It exports some larger data from a public BigQuery table and then reloads @@ -39,7 +41,7 @@ def test_gcs_ocn_bq_ingest_cloud_function_long_runnning_bq_jobs_with_orderme( itself by reposting a _BACKFILL file. The ordering behavior is controlled with the ORDERME blob. 
""" - input_bucket_id = tf_state['outputs']['bucket']['value'] + input_bucket_id = terraform_infra['outputs']['bucket']['value'] table_prefix = f"{dest_table.dataset_id}/" \ f"{dest_table.table_id}" extract_config = bigquery.ExtractJobConfig() diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 98c583899..cd6ef936b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -450,7 +450,9 @@ def get_table_prefix(object_id: str) -> str: """ basename = os.path.basename(object_id) if basename in { - constants.BACKFILL_FILENAME, constants.START_BACKFILL_FILENAME + constants.BACKFILL_FILENAME, + constants.START_BACKFILL_FILENAME, + "_bqlock", }: # These files will not match the regex and always should appear at the # table level. diff --git a/tools/cloud_functions/gcs_event_based_ingest/scripts/install_terraform.sh b/tools/cloud_functions/gcs_event_based_ingest/scripts/install_terraform.sh new file mode 100755 index 000000000..70f9cb521 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/scripts/install_terraform.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# Copyright 2020 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# +# This software is provided as-is, +# without warranty or representation for any use or purpose. +# Your use of it is subject to your agreement with Google. +set -eao pipefail + +TERRAFORM_VERSION="0.14.2" +TERRAFORM_BASE_URL="https://releases.hashicorp.com/terraform" +TERRAFORM_ZIP="terraform_${TERRAFORM_VERSION}_$(uname | tr '[:upper:]' '[:lower:]')_amd64.zip" +echo "Downloading from ${TERRAFORM_BASE_URL}/${TERRAFORM_VERSION}/${TERRAFORM_ZIP}" +curl -Lo /tmp/terraform.zip "${TERRAFORM_BASE_URL}/${TERRAFORM_VERSION}/${TERRAFORM_ZIP}" +sudo unzip /tmp/terraform.zip -d /bin From 905949dcbeb80dabbad2be43aa55c5885372b323 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 17:40:51 -0800 Subject: [PATCH 58/90] fix don't removesuffix for start backfill file --- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 6425fb506..6d5dbad73 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -66,7 +66,7 @@ def main(event: Dict, context): # pylint: disable=unused-argument action_filenames = constants.ACTION_FILENAMES if constants.START_BACKFILL_FILENAME is None: action_filenames.remove(None) - print(f"No-op. This notification was not for a" + print(f"No-op. This notification was not for a " f"{action_filenames} file.") return @@ -86,7 +86,7 @@ def main(event: Dict, context): # pylint: disable=unused-argument # Unexpected exceptions will actually raise which may cause a cold restart. except exceptions.DuplicateNotificationException: - print("recieved duplicate notification. this was handled gracefully." + print("recieved duplicate notification. 
this was handled gracefully.\n " f"{traceback.format_exc()}") except exceptions.EXCEPTIONS_TO_REPORT as original_error: @@ -117,8 +117,8 @@ def triage_event(gcs_client: Optional[storage.Client], if (basename_object_id == constants.SUCCESS_FILENAME and "/_backlog/" in event_blob.name): print(f"This notification was for " - f"gs://{bkt.name}/{event_blob.name} a" - f"{constants.SUCCESS_FILENAME} in a" + f"gs://{bkt.name}/{event_blob.name} a " + f"{constants.SUCCESS_FILENAME} in a " "/_backlog/ directory. " f"Watiting {constants.ENSURE_SUBSCRIBER_SECONDS} seconds to " "ensure that subscriber is running.") @@ -129,8 +129,8 @@ def triage_event(gcs_client: Optional[storage.Client], # This will be the first backfill file. ordering.start_backfill_subscriber_if_not_running( gcs_client, bkt, - utils.removesuffix(event_blob.name, - constants.START_BACKFILL_FILENAME)) + utils.get_table_prefix(event_blob.name) + ) return if basename_object_id == constants.SUCCESS_FILENAME: ordering.backlog_publisher(gcs_client, event_blob) From 675c756d9ef581e9f99bdf1b824d8532d502779d Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 17:43:18 -0800 Subject: [PATCH 59/90] fixup isort --- tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py index bce25d00a..69aaea108 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py @@ -25,7 +25,6 @@ from google.cloud import bigquery from google.cloud import storage - TEST_DIR = os.path.realpath(os.path.dirname(__file__)) From f0ebcd04dcc08d3c754db97ed5de1d51e060e0ac Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 17:51:55 -0800 Subject: [PATCH 60/90] more logging statements fail on untriageable event --- .../gcs_ocn_bq_ingest/common/ordering.py | 4 ++++ 
.../gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index 1b9bfeddf..488292f39 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -246,6 +246,10 @@ def start_backfill_subscriber_if_not_running( start_backfill_blob = bkt.blob( f"{table_prefix}/{constants.START_BACKFILL_FILENAME}") start_backfill = start_backfill_blob.exists(client=gcs_client) + if not start_backfill: + print("note triggering backfill because" + f"gs://{start_backfill_blob.bucket.name}/" + f"{start_backfill_blob.name} was not found.") if start_backfill: # Create a _BACKFILL file for this table if not exists diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 6d5dbad73..652efb165 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -126,6 +126,9 @@ def triage_event(gcs_client: Optional[storage.Client], return if (constants.START_BACKFILL_FILENAME and basename_object_id == constants.START_BACKFILL_FILENAME): + print( + f"notification for gs://{event_blob.bucket.name}/" + f"{event_blob.name}") # This will be the first backfill file. ordering.start_backfill_subscriber_if_not_running( gcs_client, bkt, @@ -146,6 +149,9 @@ def triage_event(gcs_client: Optional[storage.Client], ordering.backlog_subscriber(gcs_client, bq_client, event_blob, function_start_time) return + raise RuntimeError( + f"gs://{event_blob.bucket.name}/" + f"{event_blob.name} could not be triaged.") else: # Default behavior submit job as soon as success file lands. 
if basename_object_id == constants.SUCCESS_FILENAME: if bq_client: From b83fee87f9db4c76b49db42b71c16bb4748bac0e Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 17:59:42 -0800 Subject: [PATCH 61/90] fix pylint --- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 652efb165..f5a4e123c 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -111,6 +111,7 @@ def triage_event(gcs_client: Optional[storage.Client], bkt = event_blob.bucket basename_object_id = os.path.basename(event_blob.name) + # pylint: disable=no-else-raise if enforce_ordering: # For SUCCESS files in a backlog directory, ensure that subscriber # is running. From eae687fad22367b687958b3f1637dcc254bb169d Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 18:48:26 -0800 Subject: [PATCH 62/90] feat: env-var t numDmlRowsAffected = 0 as a failure --- .../gcs_event_based_ingest/README.md | 27 ++++++++++++++--- .../gcs_event_based_ingest/e2e/conftest.py | 9 ++---- .../gcs_event_based_ingest/e2e/e2e_test.py | 6 ++-- .../gcs_ocn_bq_ingest/README.md | 1 + .../gcs_ocn_bq_ingest/common/constants.py | 11 +++++++ .../gcs_ocn_bq_ingest/common/utils.py | 29 +++++++++++++++---- .../gcs_ocn_bq_ingest/main.py | 14 ++++----- 7 files changed, 68 insertions(+), 29 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/README.md index 87515fb1c..8cf9a18d7 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -156,14 +156,33 @@ before they can be loaded to BigQuery. 
This is handled by query on an temporary external table over the GCS objects as a proxy for load job. `gs://${INGESTION_BUCKET}/${BQ_DATASET}/${BQ_TABLE_NAME}/_config/bq_transform.sql` -Note, external queries will consume query slots from this project's reservation -or count towards your on-demand billing. They will _not_ use free tie load slots. +By default, if a query job finishes of statement type +`INSERT`,`UPDATE`,`DELETE`, or `MERGE` and `numDmlRowsAffected = 0` this will be +treated as a failure ([See Query Job Statistics API docs](https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobstatistics2)). +This is usually due to a bad query / configuration with bad DML predicate. +For example running the following query on an empty table: +```sql +UPDATE foo.bar dest ... FROM temp_ext src WHERE src.id = dest.id +``` + +By failing on this condition we keep the backlog intact when we run a query job +that unexpectedly did no affect any rows. +This can be disabled by setting the environment variable +`FAIL_ON_ZERO_DML_ROWS_AFFECTED=False`. + +A `CREATE OR REPLACE TABLE` is not DML and will not be subject to this behavior. + +##### Cost Note +External queries will consume query slots from this project's reservation +or count towards your on-demand billing. +They will _not_ use free tier load slots. + +##### External Table Name: `temp_ext` Note, that the query should select from a `temp_ext` which will be a temporary external table configured on the fly by the Cloud Function. The query must handle the logic for inserting into the destination table. -This means it should use BigQuery DML to either `INSERT` or `MERGE` into the -destination table. +This means it should use BigQuery DML to mutate the destination table. 
For example: ```sql INSERT {dest_dataset}.{dest_table} diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py index 69aaea108..58af67496 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py @@ -47,14 +47,11 @@ def gcs() -> storage.Client: @pytest.fixture(scope='module') def terraform_infra(request): + def _run(cmd): print( - subprocess.check_output( - cmd, - stderr=subprocess.STDOUT, - cwd=TEST_DIR - ) - ) + subprocess.check_output(cmd, stderr=subprocess.STDOUT, + cwd=TEST_DIR)) init = shlex.split("terraform init") apply = shlex.split("terraform apply -auto-approve") diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py index e5057a903..b8542631c 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py @@ -29,10 +29,8 @@ @pytest.mark.SYS def test_cloud_function_long_runnning_bq_jobs_with_orderme( - gcs: storage.Client, bq: bigquery.Client, - dest_table: bigquery.Table, - terraform_infra: Dict -): + gcs: storage.Client, bq: bigquery.Client, dest_table: bigquery.Table, + terraform_infra: Dict): """This test assumes the cloud function has been deployed with the accompanying terraform module which configures a 1 min timeout. It exports some larger data from a public BigQuery table and then reloads diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md index cd701cd09..5ffea5c17 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md @@ -34,6 +34,7 @@ following default behavior. 
| `JOB_PREFIX` | Prefix for BigQuery Job IDs | `gcf-ingest-` | | `BQ_PROJECT` | Default BQ project to use if not specified in dataset capturing group | Project where Cloud Function is deployed | | `FUNCTION_TIMEOUT_SEC`| Number of seconds set for this deployment of Cloud Function (no longer part of python38 runtime) | 60 | +| `FAIL_ON_ZERO_DML_ROWS_AFFECTED` | Treat External Queries that result in `numDmlAffectedRows = 0` as failures | True | | `ORDER_PER_TABLE`\* | Force jobs to be executed sequentially (rather than parallel) based on the backlog. This is the same as having an `ORDERME` file in every config directory | `False` | | `START_BACKFILL_FILENAME`\*| Block submitting BigQuery Jobs for a table until this file is present at the table prefix. By default this will not happen. | `None` | | `RESTART_BUFFER_SECONDS`\* | Buffer before Cloud Function timeout to leave before re-triggering the backfill subscriber | 30 | diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py index 50faf6d12..61931ec31 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py @@ -123,3 +123,14 @@ BQ_TRANSFORM_SQL = "*.sql" ENSURE_SUBSCRIBER_SECONDS = 5 + +FAIL_ON_ZERO_DML_ROWS_AFFECTED = bool( + distutils.util.strtobool(os.getenv("FAIL_ON_ZERO_DML_ROWS_AFFECTED", + "True"))) + +BQ_DML_STATEMENT_TYPES = { + "INSERT", + "UPDATE", + "DELETE", + "MERGE", +} diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index cd6ef936b..81d549cdb 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -23,6 
+23,7 @@ import json import os import pathlib +import pprint import time import uuid from typing import Any, Deque, Dict, List, Optional, Tuple, Union @@ -93,8 +94,15 @@ def external_query( # pylint: disable=too-many-arguments job.reload(client=bq_client) if job.errors: raise exceptions.BigQueryJobFailure( - f"query job {job.job_id} failed quickly: {job.errors}") + f"query job {job.job_id} failed quickly: {job.errors}." + f"\n{pprint.pformat(job.to_api_repr())}") if job.state == "DONE": + if (constants.FAIL_ON_ZERO_DML_ROWS_AFFECTED + and job.statement_type in constants.BQ_DML_STATEMENT_TYPES + and job.num_dml_affected_rows < 1): + raise exceptions.BigQueryJobFailure( + f"query job {job.job_id} ran successfully but did not affect" + f"any rows.\n {pprint.pformat(job.to_api_repr())}") return time.sleep(constants.JOB_POLL_INTERVAL_SECONDS) @@ -130,7 +138,8 @@ def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id): job.reload(client=bq_client) if job.errors: raise exceptions.BigQueryJobFailure( - f"load job {job.job_id} failed quickly: {job.errors}") + f"load job {job.job_id} failed quickly: {job.errors}\n" + f"{pprint.pformat(job.to_api_repr())}") time.sleep(constants.JOB_POLL_INTERVAL_SECONDS) @@ -450,9 +459,9 @@ def get_table_prefix(object_id: str) -> str: """ basename = os.path.basename(object_id) if basename in { - constants.BACKFILL_FILENAME, - constants.START_BACKFILL_FILENAME, - "_bqlock", + constants.BACKFILL_FILENAME, + constants.START_BACKFILL_FILENAME, + "_bqlock", }: # These files will not match the regex and always should appear at the # table level. 
@@ -554,7 +563,15 @@ def wait_on_bq_job_id(bq_client: bigquery.Client, if job.errors: raise exceptions.BigQueryJobFailure( f"BigQuery Job {job.job_id} failed during backfill with the" - f"following errors: {job.errors}") + f"following errors: {job.errors}\n" + f"{pprint.pformat(job.to_api_repr())}") + if (isinstance(job, bigquery.QueryJob) + and constants.FAIL_ON_ZERO_DML_ROWS_AFFECTED + and job.statement_type in constants.BQ_DML_STATEMENT_TYPES + and job.num_dml_affected_rows < 1): + raise exceptions.BigQueryJobFailure( + f"query job {job.job_id} ran successfully but did not" + f"affect any rows.\n {pprint.pformat(job.to_api_repr())}") return True if job.state in {"RUNNING", "PENDING"}: print(f"waiting on BigQuery Job {job.job_id}") diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index f5a4e123c..10d2eb3b3 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -127,14 +127,11 @@ def triage_event(gcs_client: Optional[storage.Client], return if (constants.START_BACKFILL_FILENAME and basename_object_id == constants.START_BACKFILL_FILENAME): - print( - f"notification for gs://{event_blob.bucket.name}/" - f"{event_blob.name}") + print(f"notification for gs://{event_blob.bucket.name}/" + f"{event_blob.name}") # This will be the first backfill file. 
ordering.start_backfill_subscriber_if_not_running( - gcs_client, bkt, - utils.get_table_prefix(event_blob.name) - ) + gcs_client, bkt, utils.get_table_prefix(event_blob.name)) return if basename_object_id == constants.SUCCESS_FILENAME: ordering.backlog_publisher(gcs_client, event_blob) @@ -150,9 +147,8 @@ def triage_event(gcs_client: Optional[storage.Client], ordering.backlog_subscriber(gcs_client, bq_client, event_blob, function_start_time) return - raise RuntimeError( - f"gs://{event_blob.bucket.name}/" - f"{event_blob.name} could not be triaged.") + raise RuntimeError(f"gs://{event_blob.bucket.name}/" + f"{event_blob.name} could not be triaged.") else: # Default behavior submit job as soon as success file lands. if basename_object_id == constants.SUCCESS_FILENAME: if bq_client: From 94136b623a14b9f89be307bdc8667750a89a5334 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 19:36:21 -0800 Subject: [PATCH 63/90] [skip ci] add comment to cloudbuild.yaml --- tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index fb9ff10de..6ed07b7de 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -120,6 +120,7 @@ steps: - '-c' # pip installing again to get GCB to recognize mocker from pytest-mock - 'pip install -r requirements-dev.txt && python3 -m pytest tests -m "not IT"' + # GCB can sometimes get hung on this step for no reason but is doomed to not recover. 
timeout: 15s id: 'unit-test' - name: 'gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci' From 790abb1ec1656cce0fed09417fd83bc29fb43060 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 19:38:11 -0800 Subject: [PATCH 64/90] [skip ci] update comment in cloudbuild.yaml --- tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index 6ed07b7de..32d39e742 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -120,7 +120,9 @@ steps: - '-c' # pip installing again to get GCB to recognize mocker from pytest-mock - 'pip install -r requirements-dev.txt && python3 -m pytest tests -m "not IT"' - # GCB can sometimes get hung on this step for no reason but is doomed to not recover. + # GCB sometimes get stuck on this step and is doomed to not recover. + # This is usually remedied by just re-running the build. + # adding this unit-test step level timeout so we can fail sooner and retry. 
timeout: 15s id: 'unit-test' - name: 'gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci' From 94ca2f6891e55acc9cfde24525b375c8737ebce2 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 15 Dec 2020 19:44:01 -0800 Subject: [PATCH 65/90] chore: clean up unused fixture, init files --- tools/__init__.py | 0 tools/cloud_functions/__init__.py | 0 tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py | 5 ----- 3 files changed, 5 deletions(-) delete mode 100644 tools/__init__.py delete mode 100644 tools/cloud_functions/__init__.py diff --git a/tools/__init__.py b/tools/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tools/cloud_functions/__init__.py b/tools/cloud_functions/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py index 58af67496..f3cd060cd 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py @@ -28,11 +28,6 @@ TEST_DIR = os.path.realpath(os.path.dirname(__file__)) -def pytest_addoption(parser): - # if Terraform was used to deploy resources, pass the state details - parser.addoption("--tfstate", action="store", default=None) - - @pytest.fixture(scope="module") def bq() -> bigquery.Client: """BigQuery Client""" From b216d8886836ba170f0500e42578892a00d4320b Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 16 Dec 2020 11:44:14 -0800 Subject: [PATCH 66/90] chore: improve terraform printint in pytest fixture --- .../gcs_event_based_ingest/e2e/conftest.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py index f3cd060cd..bd64a2660 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py +++ 
b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py @@ -17,6 +17,7 @@ """End-to-end tests for event based BigQuery ingest Cloud Function.""" import json import os +import re import shlex import subprocess import uuid @@ -27,6 +28,8 @@ TEST_DIR = os.path.realpath(os.path.dirname(__file__)) +ANSI_ESCAPE_PATTERN = re.compile(r'\x1B\[[0-?]*[ -/]*[@-~]') + @pytest.fixture(scope="module") def bq() -> bigquery.Client: @@ -45,8 +48,15 @@ def terraform_infra(request): def _run(cmd): print( - subprocess.check_output(cmd, stderr=subprocess.STDOUT, - cwd=TEST_DIR)) + ANSI_ESCAPE_PATTERN.sub( + '', + subprocess.check_output( + cmd, + stderr=subprocess.STDOUT, + cwd=TEST_DIR + ).decode('UTF-8') + ) + ) init = shlex.split("terraform init") apply = shlex.split("terraform apply -auto-approve") From d5fe02bfc88e1cd554731f7cd0b1f76f996d146a Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 16 Dec 2020 13:47:03 -0800 Subject: [PATCH 67/90] better bq job ids --- .../gcs_event_based_ingest/README.md | 7 ++- .../gcs_ocn_bq_ingest/common/constants.py | 3 ++ .../gcs_ocn_bq_ingest/common/ordering.py | 2 +- .../gcs_ocn_bq_ingest/common/utils.py | 43 +++++++------------ .../gcs_ocn_bq_ingest/main.py | 2 +- 5 files changed, 27 insertions(+), 30 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/README.md index 8cf9a18d7..25ab421aa 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -278,8 +278,13 @@ SELECT FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT WHERE - (SELECT value FROM UNNEST(labels) WHERE key = "component") = "gcf-ingest-" + (SELECT value FROM UNNEST(labels) WHERE key = "component") = "event-based-gcs-ingest" ``` +If your external queries have mutliple sql statements only the parent job will +follow the `gcf-ingest-*` naming convention. Children jobs (for each statement) +begin with prefix _script_job. 
These jobs will still be labelled with +`component` and `cloud-function-name`. +For more information see [Scripting in Standard SQL](https://cloud.google.com/bigquery/docs/reference/standard-sql/scripting) ## Triggers GCS Object Finalize triggers can communicate with Cloud Functions directly or diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py index 61931ec31..bd13fef44 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py @@ -134,3 +134,6 @@ "DELETE", "MERGE", } + +# https://cloud.google.com/bigquery/docs/running-jobs#generate-jobid +NON_BQ_JOB_ID_REGEX = re.compile('[^0-9a-zA-Z_\-]+') diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index 488292f39..5195ba248 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -196,7 +196,7 @@ def handle_backlog( f"gs://{next_success_file.bucket}/{next_success_file.name}") print("applying next batch for:" f"gs://{next_success_file.bucket}/{next_success_file.name}") - next_job_id = utils.create_job_id(table_ref, batch) + next_job_id = utils.create_job_id(next_success_file.name) utils.apply(gcs_client, bq_client, next_success_file, lock_blob, next_job_id) return False # BQ job running diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 81d549cdb..78c2e6419 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ 
b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -651,35 +651,24 @@ def gcs_path_to_table_ref_and_batch( return dest_table_ref, batch_id -def create_job_id(dest_table_ref: bigquery.TableReference, - batch_id: Optional[str]): - """Create job id prefix with a consistent naming convention. - The naming conventions is as follows: - gcf-ingest----- - Parts that are not inferrable from the GCS path with have a 'None' - placeholder. This naming convention is crucial for monitoring the system. +def create_job_id(success_file_path): + """Create job id prefix with a consistent naming convention based on the + success file path to give context of what caused this job to be submitted. + the rules for success file name -> job id are: + 1. slashes to dashes + 2. all non-alphanumeric dash or underscore will be replaced with underscore Note, gcf-ingest- can be overridden with environment variable JOB_PREFIX - - Examples: - - Non-partitioned Non batched tables: - - gs://${BUCKET}/tpch/lineitem/_SUCCESS - - gcf-ingest-tpch-lineitem-None-None- - Non-partitioned batched tables: - - gs://${BUCKET}/tpch/lineitem/batch000/_SUCCESS - - gcf-ingest-tpch-lineitem-None-batch000- - Partitioned Batched tables: - - gs://${BUCKET}/tpch/lineitem/$20201031/batch000/_SUCCESS - - gcf-ingest-tpch-lineitem-20201031-batch000- + 3. 
uuid for uniqueness """ - table_partition = dest_table_ref.table_id.split("$") - if len(table_partition) < 2: - # If there is no partition put a None placeholder - table_partition.append("None") - return f"{os.getenv('JOB_PREFIX', constants.DEFAULT_JOB_PREFIX)}" \ - f"{dest_table_ref.dataset_id}-" \ - f"{'-'.join(table_partition)}-" \ - f"{batch_id}-{uuid.uuid4()}" + clean_job_id = os.getenv('JOB_PREFIX', constants.DEFAULT_JOB_PREFIX) + clean_job_id += constants.NON_BQ_JOB_ID_REGEX.sub( + '_', + success_file_path.replace('/', '-') + ) + # add uniqueness in case we have to "re-process" a success file that is + # republished or handle multiple load jobs. + clean_job_id += str(uuid.uuid4()) + return clean_job_id[:1024] # make sure job id isn't too long def handle_bq_lock(gcs_client: storage.Client, lock_blob: storage.Blob, diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 10d2eb3b3..418963313 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -162,7 +162,7 @@ def triage_event(gcs_client: Optional[storage.Client], bq_client, event_blob, None, # no lock blob when ordering not enabled. 
- utils.create_job_id(table_ref, batch)) + utils.create_job_id(event_blob.name)) def lazy_error_reporting_client() -> error_reporting.Client: From fcb88a0b49fa56f340622e1a5ca373a9a5236a5f Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 16 Dec 2020 13:53:54 -0800 Subject: [PATCH 68/90] fixup regex escaping --- .../gcs_ocn_bq_ingest/common/constants.py | 2 +- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py index bd13fef44..1a5a1defa 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py @@ -136,4 +136,4 @@ } # https://cloud.google.com/bigquery/docs/running-jobs#generate-jobid -NON_BQ_JOB_ID_REGEX = re.compile('[^0-9a-zA-Z_\-]+') +NON_BQ_JOB_ID_REGEX = re.compile(r'[^0-9a-zA-Z_\-]+') diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 78c2e6419..31a7cb589 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -666,7 +666,8 @@ def create_job_id(success_file_path): success_file_path.replace('/', '-') ) # add uniqueness in case we have to "re-process" a success file that is - # republished or handle multiple load jobs. + # republished (e.g. to fix a bad batch of data) or handle multiple load jobs + # for a single success file. 
clean_job_id += str(uuid.uuid4()) return clean_job_id[:1024] # make sure job id isn't too long From 85cea34a25181996d88278f1adb1924d6a283938 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 16 Dec 2020 14:14:57 -0800 Subject: [PATCH 69/90] make pylint happy --- .../gcs_ocn_bq_ingest/common/ordering.py | 2 -- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py | 6 ------ 2 files changed, 8 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index 5195ba248..c6362755a 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -186,8 +186,6 @@ def handle_backlog( if next_backlog_file: next_success_file: storage.Blob = bkt.blob( next_backlog_file.name.replace("/_backlog/", "/")) - table_ref, batch = utils.gcs_path_to_table_ref_and_batch( - next_success_file.name, bq_client.project) if not next_success_file.exists(client=gcs_client): raise exceptions.BacklogException( "backlog contains " diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 418963313..1d2ef71eb 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -151,12 +151,6 @@ def triage_event(gcs_client: Optional[storage.Client], f"{event_blob.name} could not be triaged.") else: # Default behavior submit job as soon as success file lands. 
if basename_object_id == constants.SUCCESS_FILENAME: - if bq_client: - table_ref, batch = utils.gcs_path_to_table_ref_and_batch( - event_blob.name, bq_client.project) - else: - table_ref, batch = utils.gcs_path_to_table_ref_and_batch( - event_blob.name, None) utils.apply( gcs_client, bq_client, From f7af0fb0d1847229682f83763fb4975e0d8d3df9 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 16 Dec 2020 15:50:21 -0800 Subject: [PATCH 70/90] [skip ci] more docs --- .../gcs_event_based_ingest/ORDERING.md | 69 ++++++++++++++++++- .../gcs_event_based_ingest/README.md | 15 ++++ 2 files changed, 81 insertions(+), 3 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md b/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md index c85020276..4ae20dd0f 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md +++ b/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md @@ -75,6 +75,12 @@ address the failed batch: "Original Exception:\n" f"{traceback.format_exc()}") ``` +Note that once the `_bqlock` is removed and `_BACKFILL` is reposted, the Cloud +Function will proceed by applying the next batch in the `_backlog`. This means, +if you have applied the batch manually you should remove this object from the +`_backlog`. However, if you have patched the data on GCS for the failed batch +and would like the cloud function to apply it, then you leave this object in the +`_backlog`. ## Ordering Mechanics Explained We've treated ordering incremental commits to table as a variation on the @@ -90,13 +96,14 @@ The Backlog Publisher has two responsibilities: 1. add incoming success files to a table's `_backlog` so they are not "forgotten" by the ingestion system. 1. if there is a non-empty backlog start the backfill subscriber (if one is not -already running). This is accomplished by dropping a table level `_BACKFILL` +already running). This is accomplished by uploading a table level `_BACKFILL` file if it does not already exist. 
### Backlog Subscriber The Backlog Subscriber is responsible for keeping track of BigQuery jobs running on a table and ensure that batches are committed in order. When the backlog is -not empty for a table the backlog subscriber should be running for that table. +not empty for a table the backlog subscriber should be running for that table +unless a job has failed. It will either be polling a `RUNNING` BigQuery job for completion, or submitting the next batch in the `_backlog`. @@ -106,7 +113,63 @@ The state of what BigQuery job is currently running on a table is kept in a In order to escape the maximum nine-minute (540s) Cloud Function Timeout, the backfill subscriber will re-trigger itself by posting a new `_BACKFILL` file until the `_backlog` for the table prefix is empty. When a new success file -arrives it is the responsibility of the publisher to restart the subscriber. +arrives it is the responsibility of the publisher to restart the subscriber if +one is not already running. + +### Example: Life of a Table +The following process explains the triggers (GCS files) and actions of the +Cloud Function for a single table prefix. + +1. Source data uploaded to GCS prefix for the destination dataset / table, etc. + - `gs://ingestion-bucket/dataset/table/historical/2020/01/02/03/foo-data-00.csv` + - `gs://ingestion-bucket/dataset/table/historical/2020/01/02/03/foo-data-01.csv` + - `gs://ingestion-bucket/dataset/table/historical/2020/01/02/04/foo-data-00.csv` + - `gs://ingestion-bucket/dataset/table/incremental/2020/01/02/05/foo-data-01.csv` +1. Success file uploaded to GCS (to indicate this atomic batch is ready to be +applied). + - `gs://ingestion-bucket/dataset/table/historical/2020/01/02/03/_SUCCESS` + - `gs://ingestion-bucket/dataset/table/historical/2020/01/02/04/_SUCCESS` + - `gs://ingestion-bucket/dataset/table/incremental/2020/01/02/05/_SUCCESS` +1. Backlog Publisher adds a pointer to each success file in the backlog for the +table. 
+ - `gs://ingestion-bucket/dataset/table/_backlog/historical/2020/01/02/03/_SUCCESS` + - `gs://ingestion-bucket/dataset/table/_backlog/historical/2020/01/02/04/_SUCCESS` + - `gs://ingestion-bucket/dataset/table/_backlog/incremental/2020/01/02/05/_SUCCESS` +1. If the `START_BACKFILL_FILENAME` is set and the file exists at the table prefix, After adding each item the backlog, the Backlog Publisher will start the +Backfill Subscriber if it is not already running (as indicated by a `_BACKFILL` +file). If the `START_BACKFILL_FILENAME` is not present the backlog subscriber +will not be started until this file is uploaded. + - `gs://ingestion-bucket/dataset/table/_BACKFILL` +1. The Backlog Subscriber will look at the backlog and apply the batches in +order (lexicographic). This process looks like this: + 1. Claim this backfill file: + - `gs://ingestion-bucket/dataset/table/_claimed__BACKFILL_created_at_...` + 1. Claim first batch in backlog (ensure no duplicate processing): + - `gs://ingestion-bucket/dataset/table/historical/2020/01/02/03/_claimed__SUCCESS_created_at_...` + 1. Submit the BigQuery Job for this batch (load job or external query based on the `_config/*` files) + - Ingest the data at the `gs://ingestion-bucket/dataset/table/historical/2020/01/02/03/*` prefix + - Store the job ID in `gs://ingestion-bucket/dataset/table/_bqlock` + 1. Wait for this Job to complete successfully and remove this item from the backlog. + - If job is `DONE` with errors: + - Raise exception (do not continue to process any more batches) + - If job is `DONE` without errors remove the pointer from the backlog: + - DELETE `gs://ingestion-bucket/dataset/table/_backlog/historical/2020/01/02/03/_SUCCESS` + 1. Repeat from Backlog Subscriber step 2 + - Where the first item in the backlog is now + - `gs://ingestion-bucket/dataset/table/_backlog/historical/2020/01/02/04/_SUCCESS` + - And on the next loop: + - `gs://ingestion-bucket/dataset/table/_backlog/incremental/2020/01/02/05/_SUCCESS` +1. 
Backlog Subscriber sees the `_backlog/` is empty for the table. In other words +The BigQuery table is caught up with the data on GCS. + - DELETE `gs://ingestion-bucket/dataset/table/_BACKFILL` and exit +1. The next day a new incremental arrives + - `gs://ingestion-bucket/dataset/table/_backlog/incremental/2020/01/02/05/_SUCCESS` +1. The Backlog Publisher adds this item to the backlog and wakes up the +Backfill Subscriber by posting a new `_BACKFILL` file. + - `gs://ingestion-bucket/dataset/table/_backlog/incremental/2020/01/02/05/_SUCCESS` + - `gs://ingestion-bucket/dataset/table/_BACKFILL` +1. Backlog Subscriber will handle the backlog of just one item +(See Backlog Subscriber step #5 and #6 above) ### Note on Handling Race Condition diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/README.md index 25ab421aa..d75b826eb 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -443,6 +443,21 @@ files. The utility supports either invoking the Cloud Function main method locally (in concurrent threads) or publishing notifications for the success files (for a deployed Cloud Function to pick up). +### Backfill and Ordering +If you use the ordering feature on a table (or function wide) you should use the +`NOTIFICATIONS` mode to repost notifications to a pub/sub topic that your +deployed Cloud Function is listening to. The `LOCAL` mode does not support +ordering because this feature relies on (re)posting files like `_bqlock`, +`_BACKFILL` and various claim files and getting re-triggered by object +notifications for these. +The script will publish the notifications for success files and the Cloud +Function will add these to the appropriate table's backlog. +Once the script completes you can drop the `START_BACKFILL_FILENAME` +(e.g. `_HISTORYDONE`) for each table you want to trigger the backfill for. 
+In general, it would not be safe for this utility to drop a `_HISTORYDONE` for +every table because the parallel historical loads might still be in progress. + + ### Usage ``` python3 -m backfill -h From 7971bc39a09a9a89a6d2623e16a3ef3cecf123e8 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Thu, 17 Dec 2020 10:03:09 -0800 Subject: [PATCH 71/90] fix default load config return type --- .../gcs_event_based_ingest/e2e/conftest.py | 10 +++------- .../gcs_ocn_bq_ingest/common/utils.py | 10 +++++----- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py index bd64a2660..b8f12a14c 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py @@ -50,13 +50,9 @@ def _run(cmd): print( ANSI_ESCAPE_PATTERN.sub( '', - subprocess.check_output( - cmd, - stderr=subprocess.STDOUT, - cwd=TEST_DIR - ).decode('UTF-8') - ) - ) + subprocess.check_output(cmd, + stderr=subprocess.STDOUT, + cwd=TEST_DIR).decode('UTF-8'))) init = shlex.split("terraform init") apply = shlex.split("terraform apply -auto-approve") diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 31a7cb589..5fcd41045 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -215,8 +215,10 @@ def _get_parent_config(path): while config_q: recursive_update(merged_config, config_q.popleft(), in_place=True) if merged_config == constants.BASE_LOAD_JOB_CONFIG: - print("falling back to default CSV load job config") - return constants.DEFAULT_LOAD_JOB_CONFIG + print("falling back to default CSV load job config. 
" + "Did you forget load.json?") + return bigquery.LoadJobConfig.from_api_repr( + constants.DEFAULT_LOAD_JOB_CONFIG) print(f"merged_config: {merged_config}") return bigquery.LoadJobConfig.from_api_repr({"load": merged_config}) @@ -662,9 +664,7 @@ def create_job_id(success_file_path): """ clean_job_id = os.getenv('JOB_PREFIX', constants.DEFAULT_JOB_PREFIX) clean_job_id += constants.NON_BQ_JOB_ID_REGEX.sub( - '_', - success_file_path.replace('/', '-') - ) + '_', success_file_path.replace('/', '-')) # add uniqueness in case we have to "re-process" a success file that is # republished (e.g. to fix a bad batch of data) or handle multiple load jobs # for a single success file. From de19c9878d10dc85098e68b615eaea1ead2507f5 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Thu, 7 Jan 2021 12:34:17 -0800 Subject: [PATCH 72/90] fix: fail on failure of children jobs During multi-statement BQ jobs, child jobs are submitted. If any of these fail we should consider the job a failure. --- .../gcs_ocn_bq_ingest/common/utils.py | 65 ++++++++++++------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 5fcd41045..db343a177 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -58,7 +58,7 @@ def external_query( # pylint: disable=too-many-arguments if external_table_config: external_table_def = json.loads(external_table_config) else: - print(f" {gsurl}_config/external.json not found in parents of {gsurl}." + print(f" {gsurl}_config/external.json not found in parents of {gsurl}. 
" "Falling back to default PARQUET external table:\n" f"{json.dumps(constants.DEFAULT_EXTERNAL_TABLE_DEFINITION)}") external_table_def = constants.DEFAULT_EXTERNAL_TABLE_DEFINITION @@ -101,8 +101,8 @@ def external_query( # pylint: disable=too-many-arguments and job.statement_type in constants.BQ_DML_STATEMENT_TYPES and job.num_dml_affected_rows < 1): raise exceptions.BigQueryJobFailure( - f"query job {job.job_id} ran successfully but did not affect" - f"any rows.\n {pprint.pformat(job.to_api_repr())}") + f"query job {job.job_id} ran successfully but did not " + f"affect any rows.\n {pprint.pformat(job.to_api_repr())}") return time.sleep(constants.JOB_POLL_INTERVAL_SECONDS) @@ -136,10 +136,7 @@ def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id): # Check if job failed quickly for job in jobs: job.reload(client=bq_client) - if job.errors: - raise exceptions.BigQueryJobFailure( - f"load job {job.job_id} failed quickly: {job.errors}\n" - f"{pprint.pformat(job.to_api_repr())}") + check_for_bq_job_and_children_errors(bq_client, job) time.sleep(constants.JOB_POLL_INTERVAL_SECONDS) @@ -240,9 +237,8 @@ def get_batches_for_prefix( bucket_name = blob.bucket.name prefix_name = blob.name - prefix_filter = f"{prefix_name}" bucket = cached_get_bucket(gcs_client, bucket_name) - blobs = list(bucket.list_blobs(prefix=prefix_filter, delimiter="/")) + blobs = list(bucket.list_blobs(prefix=prefix_name, delimiter="/")) cumulative_bytes = 0 max_batch_size = int( @@ -309,14 +305,14 @@ def parse_notification(notification: dict) -> Tuple[str, str]: return attributes["bucketId"], attributes["objectId"] except KeyError: raise exceptions.UnexpectedTriggerException( - "Issue with Pub/Sub message, did not contain expected" + "Issue with Pub/Sub message, did not contain expected " f"attributes: 'bucketId' and 'objectId': {notification}" ) from KeyError raise exceptions.UnexpectedTriggerException( "Cloud Function received unexpected trigger:\n" f"{notification}\n" - "This 
function only supports direct Cloud Functions" - "Background Triggers or Pub/Sub storage notificaitons" + "This function only supports direct Cloud Functions " + "Background Triggers or Pub/Sub storage notificaitons " "as described in the following links:\n" "https://cloud.google.com/storage/docs/pubsub-notifications\n" "https://cloud.google.com/functions/docs/tutorials/storage") @@ -538,6 +534,38 @@ def remove_oldest_backlog_item( return False +def check_for_bq_job_and_children_errors(bq_client: bigquery.Client, + job: Union[bigquery.LoadJob, + bigquery.QueryJob]): + """checks if BigQuery job (or children jobs in case of multi-statement sql) + should be considered failed because there were errors or the query affected + no rows while FAIL_ON_ZERO_DML_ROWS_AFFECTED env var is set to True + (this is the default). + + Args: + bq_client: bigquery.Client + job: Union[bigquery.LoadJob, bigquery.QueryJob] job to check for errors. + Raises: + exceptions.BigQueryJobFailure + """ + if job.state != "DONE": + wait_on_bq_job_id(bq_client, job.job_id, 5) + if job.errors: + raise exceptions.BigQueryJobFailure( + f"BigQuery Job {job.job_id} failed during backfill with the " + f"following errors: {job.errors}\n" + f"{pprint.pformat(job.to_api_repr())}") + if isinstance(job, bigquery.QueryJob): + if (constants.FAIL_ON_ZERO_DML_ROWS_AFFECTED + and job.statement_type in constants.BQ_DML_STATEMENT_TYPES + and job.num_dml_affected_rows < 1): + raise exceptions.BigQueryJobFailure( + f"query job {job.job_id} ran successfully but did not " + f"affect any rows.\n {pprint.pformat(job.to_api_repr())}") + for child_job in bq_client.list_jobs(parent_job=job): + check_for_bq_job_and_children_errors(bq_client, child_job) + + def wait_on_bq_job_id(bq_client: bigquery.Client, job_id: str, polling_timeout: int, @@ -562,18 +590,7 @@ def wait_on_bq_job_id(bq_client: bigquery.Client, job: Union[bigquery.LoadJob, bigquery.QueryJob] = bq_client.get_job(job_id) if job.state == "DONE": - if job.errors: - 
raise exceptions.BigQueryJobFailure( - f"BigQuery Job {job.job_id} failed during backfill with the" - f"following errors: {job.errors}\n" - f"{pprint.pformat(job.to_api_repr())}") - if (isinstance(job, bigquery.QueryJob) - and constants.FAIL_ON_ZERO_DML_ROWS_AFFECTED - and job.statement_type in constants.BQ_DML_STATEMENT_TYPES - and job.num_dml_affected_rows < 1): - raise exceptions.BigQueryJobFailure( - f"query job {job.job_id} ran successfully but did not" - f"affect any rows.\n {pprint.pformat(job.to_api_repr())}") + check_for_bq_job_and_children_errors(bq_client, job) return True if job.state in {"RUNNING", "PENDING"}: print(f"waiting on BigQuery Job {job.job_id}") From 61d2c14e8a1ef0ea2d06e1bc71aa0066ee082e58 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Thu, 7 Jan 2021 18:28:28 -0800 Subject: [PATCH 73/90] chore: add test for child job failing behavior --- .../gcs_ocn_bq_ingest/README.md | 1 + .../gcs_ocn_bq_ingest/common/constants.py | 3 +- .../gcs_ocn_bq_ingest/common/utils.py | 19 ++---- .../gcs_ocn_bq_ingest/main.py | 21 ++++--- .../gcs_event_based_ingest/tests/conftest.py | 58 ++++++++++++++++++- .../test_gcs_ocn_bq_ingest_it.py | 31 +++++++++- 6 files changed, 107 insertions(+), 26 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md index 5ffea5c17..02fea45ff 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md @@ -38,6 +38,7 @@ following default behavior. | `ORDER_PER_TABLE`\* | Force jobs to be executed sequentially (rather than parallel) based on the backlog. This is the same as having an `ORDERME` file in every config directory | `False` | | `START_BACKFILL_FILENAME`\*| Block submitting BigQuery Jobs for a table until this file is present at the table prefix. By default this will not happen. 
| `None` | | `RESTART_BUFFER_SECONDS`\* | Buffer before Cloud Function timeout to leave before re-triggering the backfill subscriber | 30 | +| `USE_ERROR_REPORTING_API` | Should errors be reported using error reporting api to avoid cold restart (optimization) | True | \* only affect the behavior when ordering is enabled for a table. See [ORDERING.md](../ORDERING.md) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py index 1a5a1defa..acb6a4b24 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py @@ -30,7 +30,7 @@ # One might consider lowering this to 1-2 seconds to lower the # upper bound of expected execution time to stay within the free tier. # https://cloud.google.com/functions/pricing#free_tier -WAIT_FOR_JOB_SECONDS = int(os.getenv("WAIT_FOR_JOB_SECONDS", "1")) +WAIT_FOR_JOB_SECONDS = int(os.getenv("WAIT_FOR_JOB_SECONDS", "5")) DEFAULT_EXTERNAL_TABLE_DEFINITION = { # The default must be a self describing data format @@ -137,3 +137,4 @@ # https://cloud.google.com/bigquery/docs/running-jobs#generate-jobid NON_BQ_JOB_ID_REGEX = re.compile(r'[^0-9a-zA-Z_\-]+') + diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index db343a177..fdf078673 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -92,17 +92,8 @@ def external_query( # pylint: disable=too-many-arguments while time.monotonic( ) - start_poll_for_errors < constants.WAIT_FOR_JOB_SECONDS: job.reload(client=bq_client) - if job.errors: - raise exceptions.BigQueryJobFailure( - f"query job {job.job_id} failed 
quickly: {job.errors}." - f"\n{pprint.pformat(job.to_api_repr())}") if job.state == "DONE": - if (constants.FAIL_ON_ZERO_DML_ROWS_AFFECTED - and job.statement_type in constants.BQ_DML_STATEMENT_TYPES - and job.num_dml_affected_rows < 1): - raise exceptions.BigQueryJobFailure( - f"query job {job.job_id} ran successfully but did not " - f"affect any rows.\n {pprint.pformat(job.to_api_repr())}") + check_for_bq_job_and_children_errors(bq_client, job) return time.sleep(constants.JOB_POLL_INTERVAL_SECONDS) @@ -747,11 +738,9 @@ def apply( print( "looking for a transformation tranformation sql file in parent _config." ) - external_query_sql = read_gcs_file_if_exists(gcs_client, - f"{gsurl}_config/*.sql") - if not external_query_sql: - external_query_sql = look_for_config_in_parents(gcs_client, gsurl, - "*.sql") + external_query_sql = look_for_config_in_parents( + gcs_client, f"gs://{bkt.name}/{success_blob.name}", '*.sql') + if external_query_sql: print("EXTERNAL QUERY") print(f"found external query:\n{external_query_sql}") diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 1d2ef71eb..934c8a052 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -16,6 +16,7 @@ # limitations under the License. """Background Cloud Function for loading data from GCS to BigQuery. """ +import distutils.util import os import time import traceback @@ -92,13 +93,19 @@ def main(event: Dict, context): # pylint: disable=unused-argument except exceptions.EXCEPTIONS_TO_REPORT as original_error: # We do this because we know these errors do not require a cold restart # of the cloud function. 
- try: - lazy_error_reporting_client().report_exception() - except Exception: # pylint: disable=broad-except - # This mostly handles the case where error reporting API is not - # enabled or IAM permissions did not allow us to report errors with - # error reporting API. - raise original_error # pylint: disable=raise-missing-from + if ( + distutils.util.strtobool( + os.getenv("USE_ERROR_REPORTING_API", "True")) + ): + try: + lazy_error_reporting_client().report_exception() + except Exception: # pylint: disable=broad-except + # This mostly handles the case where error reporting API is not + # enabled or IAM permissions did not allow us to report errors + # with error reporting API. + raise original_error # pylint: disable=raise-missing-from + else: + raise original_error def triage_event(gcs_client: Optional[storage.Client], diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index cfdc4323a..239cf98ab 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -530,7 +530,7 @@ def gcs_external_partitioned_config( "bq_transform.sql", ])) - sql = "INSERT {dest_dataset}.{dest_table} SELECT * FROM temp_ext" + sql = "INSERT {dest_dataset}.{dest_table} SELECT * FROM temp_ext;" sql_obj.upload_from_string(sql) config_obj = gcs_bucket.blob("/".join([ @@ -564,3 +564,59 @@ def teardown(): request.addfinalizer(teardown) return config_objs + + +@pytest.fixture +def no_use_error_reporting(monkeypatch): + monkeypatch.setenv("USE_ERROR_REPORTING_API", "False") + + +@pytest.fixture +def gcs_external_config_bad_statement( + request, gcs_bucket, dest_dataset, dest_table, no_use_error_reporting +) -> List[storage.blob.Blob]: + config_objs = [] + sql_obj = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_table.table_id, + "_config", + "bq_transform.sql", + ])) + + sql = 
("INSERT {dest_dataset}.{dest_table} SELECT * FROM temp_ext;\n" + "INSERT {dest_dataset}.{dest_table} SELECT 1/0;") + sql_obj.upload_from_string(sql) + + config_obj = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_table.table_id, "_config", "external.json" + ])) + + with open(os.path.join(TEST_DIR, "resources", + "nation_schema.json")) as schema: + fields = json.load(schema) + config = { + "schema": { + "fields": fields + }, + "csvOptions": { + "allowJaggedRows": False, + "allowQuotedNewlines": False, + "encoding": "UTF-8", + "fieldDelimiter": "|", + "skipLeadingRows": 0, + }, + "sourceFormat": "CSV", + "sourceUris": ["REPLACEME"], + } + config_obj.upload_from_string(json.dumps(config)) + config_objs.append(sql_obj) + config_objs.append(config_obj) + + def teardown(): + for do in config_objs: + if do.exists(): + do.delete() + + request.addfinalizer(teardown) + return config_objs diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py index e1fe45b18..81709a5b2 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py @@ -143,8 +143,8 @@ def test_load_job_appending_batches(bq, gcs_batched_data, dest_dataset, @pytest.mark.IT -def test_external_query(bq, gcs_data, gcs_external_config, dest_dataset, - dest_table, mock_env): +def test_external_query_pure(bq, gcs_data, gcs_external_config, dest_dataset, + dest_table, mock_env): """tests the basic external query ingrestion mechanics with bq_transform.sql and external.json """ @@ -286,3 +286,30 @@ def bq_wait_for_rows(bq_client: bigquery.Client, table: bigquery.Table, f"{table.project}.{table.dataset_id}.{table.table_id} to " f"reach {expected_num_rows} rows." 
f"last poll returned {actual_num_rows} rows.") + + +@pytest.mark.IT +def test_external_query_with_bad_statement(bq, gcs_data, + gcs_external_config_bad_statement, + dest_dataset, dest_table, mock_env): + """tests the basic external query ingrestion mechanics + with bq_transform.sql and external.json + """ + if not gcs_data.exists(): + raise google.cloud.exceptions.NotFound("test data objects must exist") + if not all((blob.exists() for blob in gcs_external_config_bad_statement)): + raise google.cloud.exceptions.NotFound("config objects must exist") + + test_event = { + "attributes": { + "bucketId": gcs_data.bucket.name, + "objectId": gcs_data.name + } + } + raised = False + try: + gcs_ocn_bq_ingest.main.main(test_event, None) + except gcs_ocn_bq_ingest.common.exceptions.BigQueryJobFailure: + raised = True + + assert raised, "bad statement did not raise BigQueryJobFailure" From fb69a6a280c0c27015939ef3261d928370dd6837 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Thu, 7 Jan 2021 18:56:27 -0800 Subject: [PATCH 74/90] fixup flake8 --- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py | 6 ++---- .../gcs_event_based_ingest/tests/conftest.py | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 934c8a052..0141ed5b4 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -93,10 +93,8 @@ def main(event: Dict, context): # pylint: disable=unused-argument except exceptions.EXCEPTIONS_TO_REPORT as original_error: # We do this because we know these errors do not require a cold restart # of the cloud function. 
- if ( - distutils.util.strtobool( - os.getenv("USE_ERROR_REPORTING_API", "True")) - ): + if (distutils.util.strtobool( + os.getenv("USE_ERROR_REPORTING_API", "True"))): try: lazy_error_reporting_client().report_exception() except Exception: # pylint: disable=broad-except diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index 239cf98ab..5dc4c7fa1 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -573,8 +573,8 @@ def no_use_error_reporting(monkeypatch): @pytest.fixture def gcs_external_config_bad_statement( - request, gcs_bucket, dest_dataset, dest_table, no_use_error_reporting -) -> List[storage.blob.Blob]: + request, gcs_bucket, dest_dataset, dest_table, + no_use_error_reporting) -> List[storage.blob.Blob]: config_objs = [] sql_obj = gcs_bucket.blob("/".join([ f"{dest_dataset.project}.{dest_dataset.dataset_id}", From 1aec908e42716a068605c1774dc40323b9faefe4 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Thu, 7 Jan 2021 19:00:01 -0800 Subject: [PATCH 75/90] fixup flake8 --- .../gcs_ocn_bq_ingest/common/constants.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py index acb6a4b24..c9a1e8323 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py @@ -136,5 +136,4 @@ } # https://cloud.google.com/bigquery/docs/running-jobs#generate-jobid -NON_BQ_JOB_ID_REGEX = re.compile(r'[^0-9a-zA-Z_\-]+') - +NON_BQ_JOB_ID_REGEX = re.compile(r'[^0-9a-zA-Z_\-]+') \ No newline at end of file From 0490217309274aeccac07f1fe3b95b84692f92df Mon Sep 17 00:00:00 2001 From: Jacob Ferriero 
Date: Thu, 7 Jan 2021 19:27:30 -0800 Subject: [PATCH 76/90] fixup flake8 --- .../gcs_ocn_bq_ingest/common/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py index c9a1e8323..90689c1ae 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py @@ -136,4 +136,4 @@ } # https://cloud.google.com/bigquery/docs/running-jobs#generate-jobid -NON_BQ_JOB_ID_REGEX = re.compile(r'[^0-9a-zA-Z_\-]+') \ No newline at end of file +NON_BQ_JOB_ID_REGEX = re.compile(r'[^0-9a-zA-Z_\-]+') From 3c3bd3dcdd5160f5a2e9f27b5ce2f831ca4729a5 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Mon, 11 Jan 2021 13:01:34 -0800 Subject: [PATCH 77/90] feat: separate bq storage and compute project env vars --- tools/cloud_functions/gcs_event_based_ingest/README.md | 7 ++++++- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md | 3 ++- .../gcs_ocn_bq_ingest/common/ordering.py | 2 +- .../gcs_ocn_bq_ingest/common/utils.py | 9 +++------ 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/README.md index d75b826eb..372590064 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -42,7 +42,7 @@ better fit your naming convention on GCS. Your regex must include for destination `dataset`, and `table`. Note, that `dataset` can optionally, explicitly specify destination project (i.e. 
`gs://${BUCKET}/project_id.dataset_id/table/....`) alternatively, -one can set the `BQ_PROJECT` environment variable to set to override the +one can set the `BQ_STORAGE_PROJECT` environment variable to set to override the default target project for datasets at the function level. The default behavior is to infer the project from Application Default Credential (the project in which the Cloud Function is running, or the ADC configured in Google Cloud SDK @@ -234,6 +234,11 @@ at any parent folders `_config` prefix. This allows you dictate "for this table any new batch should `WRITE_TRUNCATE` it's parent partition/table" or "for that table any new batch should `WRITE_APPEND` to it's parent partition/table". +## Controlling BigQuery Compute Project +By default BigQuery jobs will be submitted in the project where the Cloud Function +is deployed. To submit jobs in another BigQuery project set the `BQ_PROJECT` +environment variable. + ## Monitoring Monitoring what data has been loaded by this solution should be done with the BigQuery [`INFORMATION_SCHEMA` jobs metadata](https://cloud.google.com/bigquery/docs/information-schema-jobs) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md index 02fea45ff..20c023825 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md @@ -32,7 +32,8 @@ following default behavior. 
| `DESTINATION_REGEX` | A [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for `dataset`, `table`, (optional: `partition` or `yyyy`, `mm`, `dd`, `hh`, `batch`) | (see below)| | `MAX_BATCH_BYTES` | Max bytes for BigQuery Load job | `15000000000000` ([15 TB](https://cloud.google.com/bigquery/quotas#load_jobs)| | `JOB_PREFIX` | Prefix for BigQuery Job IDs | `gcf-ingest-` | -| `BQ_PROJECT` | Default BQ project to use if not specified in dataset capturing group | Project where Cloud Function is deployed | +| `BQ_PROJECT` | Default BQ project to use to submit load / query jobs | Project where Cloud Function is deployed | +| `BQ_STORAGE_PROJECT` | Default BQ project to use for target table references if not specified in dataset capturing group | Project where Cloud Function is deployed | | `FUNCTION_TIMEOUT_SEC`| Number of seconds set for this deployment of Cloud Function (no longer part of python38 runtime) | 60 | | `FAIL_ON_ZERO_DML_ROWS_AFFECTED` | Treat External Queries that result in `numDmlAffectedRows = 0` as failures | True | | `ORDER_PER_TABLE`\* | Force jobs to be executed sequentially (rather than parallel) based on the backlog. This is the same as having an `ORDERME` file in every config directory | `False` | diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index c6362755a..a53e83d1a 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -349,7 +349,7 @@ def _get_clients_if_none( harmless if these clients are recreated in the Cloud Function. 
""" print("instantiating missing clients in backlog subscriber this should only" - "happen during integration tests.") + " happen during integration tests.") if not gcs_client: gcs_client = storage.Client(client_info=constants.CLIENT_INFO) if not bq_client: diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index fdf078673..f7e6365cf 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -80,10 +80,7 @@ def external_query( # pylint: disable=too-many-arguments job: bigquery.QueryJob = bq_client.query(rendered_query, job_config=job_config, - job_id=job_id, - project=os.getenv( - "BQ_PROJECT", - bq_client.project)) + job_id=job_id) print(f"started asynchronous query job: {job.job_id}") @@ -653,11 +650,11 @@ def gcs_path_to_table_ref_and_batch( dest_table_ref = bigquery.TableReference.from_string( f"{dataset}.{table}{partition}", - default_project=os.getenv("BQ_PROJECT", default_project)) + default_project=os.getenv("BQ_STORAGE_PROJECT", default_project)) else: dest_table_ref = bigquery.TableReference.from_string( f"{dataset}.{table}", - default_project=os.getenv("BQ_PROJECT", default_project)) + default_project=os.getenv("BQ_STORAGE_PROJECT", default_project)) return dest_table_ref, batch_id From 9e8e52f7650a682d47cec160999cf84b06582bea Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Fri, 22 Jan 2021 12:19:30 -0800 Subject: [PATCH 78/90] fix: don't require escaping braces in sql, still support {dest_dataset} {dest_table} rendering --- .../gcs_ocn_bq_ingest/common/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 
f7e6365cf..5865cb5c2 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -74,9 +74,13 @@ def external_query( # pylint: disable=too-many-arguments # drop partition decorator if present. table_id = dest_table_ref.table_id.split("$")[0] - rendered_query = query.format( - dest_dataset=f"`{dest_table_ref.project}`.{dest_table_ref.dataset_id}", - dest_table=table_id) + # similar syntax to str.format but doesn't require escaping braces + # elsewhere in query (e.g. in a regex) + rendered_query = query\ + .replace( + "{dest_dataset}", + f"`{dest_table_ref.project}`.{dest_table_ref.dataset_id}")\ + .replace("{dest_table}", table_id) job: bigquery.QueryJob = bq_client.query(rendered_query, job_config=job_config, From 854aa68aeefa9969a1aad5a506142279e7413716 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Fri, 22 Jan 2021 15:10:14 -0800 Subject: [PATCH 79/90] happy newyear! 
copyright 2020 -> 2021 --- tools/cloud_functions/gcs_event_based_ingest/__init__.py | 2 +- tools/cloud_functions/gcs_event_based_ingest/backfill.py | 2 +- tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml | 2 +- tools/cloud_functions/gcs_event_based_ingest/e2e/__init__.py | 2 +- tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py | 2 +- tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py | 2 +- tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf | 2 +- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/__init__.py | 2 +- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/common/__init__.py | 2 +- .../gcs_ocn_bq_ingest/common/constants.py | 2 +- .../gcs_ocn_bq_ingest/common/exceptions.py | 2 +- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py | 2 +- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py | 2 +- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py | 2 +- .../gcs_event_based_ingest/scripts/install_terraform.sh | 2 +- .../terraform_module/gcs_ocn_bq_ingest_function/main.tf | 2 +- .../terraform_module/gcs_ocn_bq_ingest_function/outputs.tf | 2 +- .../terraform_module/gcs_ocn_bq_ingest_function/variables.tf | 2 +- .../terraform_module/gcs_ocn_bq_ingest_function/versions.tf | 2 +- .../gcs_event_based_ingest/tests/cli/test_backfill.py | 2 +- tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py | 2 +- .../tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py | 2 +- .../tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py | 2 +- .../tests/gcs_ocn_bq_ingest/test_ordering_it.py | 2 +- 24 files changed, 24 insertions(+), 24 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/__init__.py b/tools/cloud_functions/gcs_event_based_ingest/__init__.py index 7a3efb203..42ed0a407 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/__init__.py +++ b/tools/cloud_functions/gcs_event_based_ingest/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. 
# This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. diff --git a/tools/cloud_functions/gcs_event_based_ingest/backfill.py b/tools/cloud_functions/gcs_event_based_ingest/backfill.py index f0a2ce415..105397553 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/backfill.py +++ b/tools/cloud_functions/gcs_event_based_ingest/backfill.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index 32d39e742..0ae2de0ae 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/__init__.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/__init__.py index 7a3efb203..42ed0a407 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/__init__.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. 
diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py index b8f12a14c..2aa9684e1 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py index b8542631c..8ffa44c2f 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf b/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf index af45d7eed..64e3973d3 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/__init__.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/__init__.py index 7a3efb203..42ed0a407 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/__init__.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. 
+# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/__init__.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/__init__.py index 7a3efb203..42ed0a407 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/__init__.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py index 90689c1ae..27e104586 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py index 8ab701e8d..7f7b0e04b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. 
# Your use of it is subject to your agreement with Google. diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index a53e83d1a..95fb99195 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 5865cb5c2..44b8367ee 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 0141ed5b4..5b536ff25 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. 
diff --git a/tools/cloud_functions/gcs_event_based_ingest/scripts/install_terraform.sh b/tools/cloud_functions/gcs_event_based_ingest/scripts/install_terraform.sh index 70f9cb521..4c1cd6f50 100755 --- a/tools/cloud_functions/gcs_event_based_ingest/scripts/install_terraform.sh +++ b/tools/cloud_functions/gcs_event_based_ingest/scripts/install_terraform.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Copyright 2020 Google Inc. +# Copyright 2021 Google Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf index 6094881c3..9899db2d1 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf index 69d8017ab..5ad0d2b9b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf index 1783034f5..78b1a1991 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/versions.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/versions.tf index e4234775c..3085198f2 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/versions.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/versions.tf @@ -1,5 +1,5 @@ /** - * Copyright 2020 Google LLC + * Copyright 2021 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/cli/test_backfill.py b/tools/cloud_functions/gcs_event_based_ingest/tests/cli/test_backfill.py index 5e9c20cb1..ac3419706 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/cli/test_backfill.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/cli/test_backfill.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index 5dc4c7fa1..f1400ffc4 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index 877ac0104..be36a397e 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -1,5 +1,5 @@ # dataset/table/_SUCCESS -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py index 81709a5b2..02dbeb318 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py index 9ecf236bc..7fe82d200 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 09daa9dd21219814acdb2a0c8202e7310322f051 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Thu, 18 Feb 2021 12:27:55 -0800 Subject: [PATCH 80/90] clean up newlines in logs / error messages --- .../gcs_ocn_bq_ingest/common/ordering.py | 8 ++++---- .../gcs_ocn_bq_ingest/common/utils.py | 16 ++++++++-------- .../gcs_ocn_bq_ingest/main.py | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index 95fb99195..83027589d 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -98,8 +98,8 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], print(f"sleeping for {polling_timeout} seconds because" f"found manual lock gs://{bkt.name}/{lock_blob.name} with" "This will be an infinite loop until the manual lock is " - "released.\n" - f"manual lock contents:\n {lock_contents}. ") + "released. " + f"manual lock contents: {lock_contents}. 
") time.sleep(polling_timeout) continue else: # this condition handles absence of _bqlock file @@ -156,8 +156,8 @@ def wait_on_last_job(bq_client: bigquery.Client, lock_blob: storage.Blob, f"gs://{backfill_blob.bucket.name}/{table_prefix}" "/_BACKFILL " f"to resume the backfill subscriber so it can " - "continue with the next item in the backlog.\n" - "Original Exception:\n" + "continue with the next item in the backlog." + "Original Exception:" f"{traceback.format_exc()}") from err diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 44b8367ee..cf8676f43 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -59,7 +59,7 @@ def external_query( # pylint: disable=too-many-arguments external_table_def = json.loads(external_table_config) else: print(f" {gsurl}_config/external.json not found in parents of {gsurl}. 
" - "Falling back to default PARQUET external table:\n" + "Falling back to default PARQUET external table: " f"{json.dumps(constants.DEFAULT_EXTERNAL_TABLE_DEFINITION)}") external_table_def = constants.DEFAULT_EXTERNAL_TABLE_DEFINITION @@ -301,12 +301,12 @@ def parse_notification(notification: dict) -> Tuple[str, str]: f"attributes: 'bucketId' and 'objectId': {notification}" ) from KeyError raise exceptions.UnexpectedTriggerException( - "Cloud Function received unexpected trigger:\n" - f"{notification}\n" + "Cloud Function received unexpected trigger: " + f"{notification} " "This function only supports direct Cloud Functions " "Background Triggers or Pub/Sub storage notificaitons " - "as described in the following links:\n" - "https://cloud.google.com/storage/docs/pubsub-notifications\n" + "as described in the following links: " + "https://cloud.google.com/storage/docs/pubsub-notifications " "https://cloud.google.com/functions/docs/tutorials/storage") @@ -545,7 +545,7 @@ def check_for_bq_job_and_children_errors(bq_client: bigquery.Client, if job.errors: raise exceptions.BigQueryJobFailure( f"BigQuery Job {job.job_id} failed during backfill with the " - f"following errors: {job.errors}\n" + f"following errors: {job.errors} " f"{pprint.pformat(job.to_api_repr())}") if isinstance(job, bigquery.QueryJob): if (constants.FAIL_ON_ZERO_DML_ROWS_AFFECTED @@ -553,7 +553,7 @@ def check_for_bq_job_and_children_errors(bq_client: bigquery.Client, and job.num_dml_affected_rows < 1): raise exceptions.BigQueryJobFailure( f"query job {job.job_id} ran successfully but did not " - f"affect any rows.\n {pprint.pformat(job.to_api_repr())}") + f"affect any rows. 
{pprint.pformat(job.to_api_repr())}") for child_job in bq_client.list_jobs(parent_job=job): check_for_bq_job_and_children_errors(bq_client, child_job) @@ -744,7 +744,7 @@ def apply( if external_query_sql: print("EXTERNAL QUERY") - print(f"found external query:\n{external_query_sql}") + print(f"found external query: {external_query_sql}") external_query(gcs_client, bq_client, gsurl, external_query_sql, dest_table_ref, job_id) return diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 5b536ff25..bf2ccebad 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -87,7 +87,7 @@ def main(event: Dict, context): # pylint: disable=unused-argument # Unexpected exceptions will actually raise which may cause a cold restart. except exceptions.DuplicateNotificationException: - print("recieved duplicate notification. this was handled gracefully.\n " + print("recieved duplicate notification. this was handled gracefully. 
" f"{traceback.format_exc()}") except exceptions.EXCEPTIONS_TO_REPORT as original_error: @@ -146,7 +146,7 @@ def triage_event(gcs_client: Optional[storage.Client], f"{constants.BACKFILL_FILENAME}"): raise RuntimeError( f"recieved notification for gs://{event_blob.bucket.name}/" - f"{event_blob.name}\n" + f"{event_blob.name} " f"{constants.BACKFILL_FILENAME} files " "are expected only at the table prefix level.") ordering.backlog_subscriber(gcs_client, bq_client, event_blob, From 8821dc090f29878b5797540fc2a90496c67906f9 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Tue, 2 Mar 2021 10:32:33 -0800 Subject: [PATCH 81/90] improve logging in bq failures improved error logs and tests with object versioning fixup: logging spaces fixup ci dickerfile workdir REVERT ME THIS INTENTIONALLY BREAKS E2E TEST Revert "REVERT ME THIS INTENTIONALLY BREAKS E2E TEST" This reverts commit eeffc0fd47184c96d29fc0d8ef07662185076962. fixup catch client errors during apply Revert "Revert "REVERT ME THIS INTENTIONALLY BREAKS E2E TEST"" This reverts commit 5824cf372c08ff3432af6ddb1e94852dd78d0853. simpler one line message exceptions Revert "Revert "Revert "REVERT ME THIS INTENTIONALLY BREAKS E2E TEST""" This reverts commit ad3e1da9bc5436b3ac6540da8b7886d377e11b58. 
--- .../gcs_event_based_ingest/Dockerfile.ci | 4 ++- .../gcs_event_based_ingest/e2e/conftest.py | 23 +++++++++---- .../gcs_ocn_bq_ingest/common/exceptions.py | 17 +++++++--- .../gcs_ocn_bq_ingest/common/ordering.py | 2 +- .../gcs_ocn_bq_ingest/common/utils.py | 33 ++++++++++++++----- .../gcs_ocn_bq_ingest_function/main.tf | 2 +- .../gcs_event_based_ingest/tests/conftest.py | 6 ++++ 7 files changed, 65 insertions(+), 22 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci b/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci index d383e7563..2c656ef94 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci +++ b/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci @@ -8,8 +8,10 @@ RUN apt-get update \ unzip \ && apt-get autoremove -yqq --purge \ && apt-get clean && rm -rf /var/lib/apt/lists/* +WORKDIR /ci COPY requirements.txt requirements-dev.txt ./ COPY scripts/install_terraform.sh ./ RUN ./install_terraform.sh RUN pip3 install --no-cache-dir -r requirements-dev.txt -ENTRYPOINT ["python3 -m pytest"] +ENTRYPOINT ["python3", "-m", "pytest"] + diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py index 2aa9684e1..c0f91da4a 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py @@ -46,13 +46,24 @@ def gcs() -> storage.Client: @pytest.fixture(scope='module') def terraform_infra(request): + def _escape(in_str): + if in_str is not None: + return ANSI_ESCAPE_PATTERN.sub('', in_str.decode('UTF-8')) + return None + def _run(cmd): - print( - ANSI_ESCAPE_PATTERN.sub( - '', - subprocess.check_output(cmd, - stderr=subprocess.STDOUT, - cwd=TEST_DIR).decode('UTF-8'))) + result = subprocess.run(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=TEST_DIR) + print(_escape(result.stdout)) + if result.returncode == 0: + return + raise 
subprocess.CalledProcessError( + returncode=result.returncode, + cmd=result.args, + output=_escape(result.stdout), + stderr=_escape(result.stderr)) init = shlex.split("terraform init") apply = shlex.split("terraform apply -auto-approve") diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py index 7f7b0e04b..7a921696f 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py @@ -17,28 +17,35 @@ """Custom Exceptions of GCS event based ingest to BigQuery""" -class DuplicateNotificationException(Exception): +class OneLineException(Exception): + """base class for exceptions whose messages will be displayed on a single + line for better readability in Cloud Function Logs""" + def __init__(self, msg): + super().__init__(msg.replace('\n', ' ').replace('\r', '')) + + +class DuplicateNotificationException(OneLineException): """Exception to indicate that the function was triggered twice for the same event.""" -class BigQueryJobFailure(Exception): +class BigQueryJobFailure(OneLineException): """Exception to indicate that the function was triggered twice for the same event.""" -class DestinationRegexMatchException(Exception): +class DestinationRegexMatchException(OneLineException): """Exception to indicate that a success file did not match the destination regex specified in the DESTINATION_REGEX environment variable (or the default)""" -class UnexpectedTriggerException(Exception): +class UnexpectedTriggerException(OneLineException): """Exception to indicate the cloud function was triggered with an unexpected payload.""" -class BacklogException(Exception): +class BacklogException(OneLineException): """Exception to indicate an issue with the backlog mechanics of this function.""" diff --git 
a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index 83027589d..e30684eae 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -157,7 +157,7 @@ def wait_on_last_job(bq_client: bigquery.Client, lock_blob: storage.Blob, "/_BACKFILL " f"to resume the backfill subscriber so it can " "continue with the next item in the backlog." - "Original Exception:" + "Original Exception: " f"{traceback.format_exc()}") from err diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index cf8676f43..580f933de 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -24,7 +24,9 @@ import os import pathlib import pprint +import sys import time +import traceback import uuid from typing import Any, Deque, Dict, List, Optional, Tuple, Union @@ -716,6 +718,7 @@ def apply( lock_blob: Optional[storage.Blob], job_id: str, ): + # pylint: disable=too-many-locals """ Apply an incremental batch to the target BigQuery table via an asynchronous load job or external query. 
@@ -741,14 +744,28 @@ def apply( ) external_query_sql = look_for_config_in_parents( gcs_client, f"gs://{bkt.name}/{success_blob.name}", '*.sql') + try: + + if external_query_sql: + print("EXTERNAL QUERY") + print(f"found external query: {external_query_sql}") + external_query(gcs_client, bq_client, gsurl, external_query_sql, + dest_table_ref, job_id) + return - if external_query_sql: - print("EXTERNAL QUERY") - print(f"found external query: {external_query_sql}") - external_query(gcs_client, bq_client, gsurl, external_query_sql, - dest_table_ref, job_id) + print("LOAD_JOB") + load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id) return - print("LOAD_JOB") - load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id) - return + except (google.api_core.exceptions.GoogleAPIError, + google.api_core.exceptions.ClientError) as err: + etype, value, err_tb = sys.exc_info() + msg = (f"failed to submit job {job_id} for {gsurl}: " + f"{etype.__name__}: {value}") + blob = storage.Blob.from_string(gsurl) + table_prefix = get_table_prefix(blob.name) + bqlock = storage.Blob.from_string( + f"gs://{blob.bucket.name}/{table_prefix}/_bqlock") + # Write this error message to avoid confusion. 
+ handle_bq_lock(gcs_client, bqlock, msg) + raise exceptions.BigQueryJobFailure(msg) from err diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf index 9899db2d1..6651c645b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf @@ -103,7 +103,7 @@ resource "google_project_iam_binding" "ingester_bq_admin" { for_each = toset(concat(var.bigquery_project_ids, [var.project_id])) project = each.key members = [module.data_ingester_service_account.iam_email] - role = "roles/bigquery.dataEditor" + role = "roles/bigquery.admin" } # Allow the GCS service account to publish notification for new objects to the diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index f1400ffc4..fea69c20d 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -52,6 +52,8 @@ def error() -> error_reporting.Client: def gcs_bucket(request, gcs) -> storage.bucket.Bucket: """GCS bucket for test artifacts""" bucket = gcs.create_bucket(str(uuid.uuid4())) + bucket.versioning_enabled = True + bucket.patch() # overide default field delimiter at bucket level load_config_json = { "fieldDelimiter": "|", @@ -61,6 +63,10 @@ def gcs_bucket(request, gcs) -> storage.bucket.Bucket: def teardown(): load_json_blob.delete() + bucket.versioning_enabled = False + bucket.patch() + for obj in gcs.list_blobs(bucket_or_name=bucket, versions=True): + obj.delete() bucket.delete(force=True) request.addfinalizer(teardown) From 33ae3291b5e81b79c49c296d093dc7b5eb6c3ba3 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 3 Mar 2021 
14:54:04 -0800 Subject: [PATCH 82/90] fixup flake8 --- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 580f933de..89b151b39 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -26,7 +26,6 @@ import pprint import sys import time -import traceback import uuid from typing import Any, Deque, Dict, List, Optional, Tuple, Union From 57443e0f559f251f7b8f7dd69946fff1811ee23a Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 3 Mar 2021 15:00:50 -0800 Subject: [PATCH 83/90] fixup mypy --- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 89b151b39..bbd8d88c3 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -760,7 +760,7 @@ def apply( google.api_core.exceptions.ClientError) as err: etype, value, err_tb = sys.exc_info() msg = (f"failed to submit job {job_id} for {gsurl}: " - f"{etype.__name__}: {value}") + f"{etype.__class__.__name__}: {value}") blob = storage.Blob.from_string(gsurl) table_prefix = get_table_prefix(blob.name) bqlock = storage.Blob.from_string( From 97f48a73fbb484203b797a6dec613f6e7288122f Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 3 Mar 2021 15:04:44 -0800 Subject: [PATCH 84/90] fixup pylint --- tools/cloud_functions/gcs_event_based_ingest/README.md | 4 ++++ .../gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py | 2 
+- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/README.md index 372590064..4c0cac057 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -329,6 +329,10 @@ It's better for us to make a conscious decision to adopt new features or adjust CI configs or pin older version depending on the type for failure. This CI should be run on all new PRs and nightly. +Note, all functionality of the cloud function (including ordering) is +integration tested against buckets with object versioning enabled to ensure this +solution works for buckets using this feature. + ### Just Running the Tests #### Running in Docker ```bash diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index bbd8d88c3..1c5ad6642 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -758,7 +758,7 @@ def apply( except (google.api_core.exceptions.GoogleAPIError, google.api_core.exceptions.ClientError) as err: - etype, value, err_tb = sys.exc_info() + etype, value, _ = sys.exc_info() msg = (f"failed to submit job {job_id} for {gsurl}: " f"{etype.__class__.__name__}: {value}") blob = storage.Blob.from_string(gsurl) From 7293f32c0e2444f2d51e4c8f3e59d2f8460c2900 Mon Sep 17 00:00:00 2001 From: Jacob Ferriero Date: Wed, 3 Mar 2021 15:40:36 -0800 Subject: [PATCH 85/90] fixup BigQueryJobFailure docstring --- .../gcs_event_based_ingest/e2e/conftest.py | 9 ++++----- .../gcs_ocn_bq_ingest/common/exceptions.py | 7 +++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py 
b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py index c0f91da4a..7f3c73205 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/conftest.py @@ -59,11 +59,10 @@ def _run(cmd): print(_escape(result.stdout)) if result.returncode == 0: return - raise subprocess.CalledProcessError( - returncode=result.returncode, - cmd=result.args, - output=_escape(result.stdout), - stderr=_escape(result.stderr)) + raise subprocess.CalledProcessError(returncode=result.returncode, + cmd=result.args, + output=_escape(result.stdout), + stderr=_escape(result.stderr)) init = shlex.split("terraform init") apply = shlex.split("terraform apply -auto-approve") diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py index 7a921696f..a9eb9bab5 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py @@ -20,6 +20,7 @@ class OneLineException(Exception): """base class for exceptions whose messages will be displayed on a single line for better readability in Cloud Function Logs""" + def __init__(self, msg): super().__init__(msg.replace('\n', ' ').replace('\r', '')) @@ -30,8 +31,10 @@ class DuplicateNotificationException(OneLineException): class BigQueryJobFailure(OneLineException): - """Exception to indicate that the function was triggered twice for the same - event.""" + """Exception to indicate that there was an issue with a BigQuery job. This + might include client errors (e.g. 
bad request which can happen if a _SUCCESS + file is dropped but there are not data files at the GCS prefix) or server + side errors like a job that fails to execute successfully.""" class DestinationRegexMatchException(OneLineException): From 91dd8af64d5e6ac6db736434dc5f45eb38f6719c Mon Sep 17 00:00:00 2001 From: Ryan den Otter Date: Tue, 23 Mar 2021 13:51:41 -0400 Subject: [PATCH 86/90] FEATURE: Snapshot the table once a chunk has successfully loaded --- .../gcs_ocn_bq_ingest/common/constants.py | 2 + .../gcs_ocn_bq_ingest/common/ordering.py | 6 +++ .../gcs_ocn_bq_ingest/common/utils.py | 40 +++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py index 27e104586..58d85ff65 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py @@ -137,3 +137,5 @@ # https://cloud.google.com/bigquery/docs/running-jobs#generate-jobid NON_BQ_JOB_ID_REGEX = re.compile(r'[^0-9a-zA-Z_\-]+') + +SNAPSHOTTING = True diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index e30684eae..e2b5287a9 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -94,6 +94,12 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], last_job_done = wait_on_last_job(bq_client, lock_blob, backfill_blob, lock_contents, polling_timeout) + # if last_job_done = True, this means that a job just completed + # We need to check if SNAPSHOTTING is enabled + if last_job_done and constants.SNAPSHOTTING: + print("Snapshotting is enabled. 
Taking a snapshot") + utils.take_table_snapshot(bq_client, job_id=lock_contents, + lock_blob_name=lock_blob.name) else: print(f"sleeping for {polling_timeout} seconds because" f"found manual lock gs://{bkt.name}/{lock_blob.name} with" diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 1c5ad6642..7ffd8a0aa 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -768,3 +768,43 @@ def apply( # Write this error message to avoid confusion. handle_bq_lock(gcs_client, bqlock, msg) raise exceptions.BigQueryJobFailure(msg) from err + + +def take_table_snapshot( + bq_client: bigquery.Client, + job_id: str, + lock_blob_name: str): + """ + Take a snapshot of the table. + We are creating the snapshot in a dataset named _snapshot + + Args: + bq_client: bigquery.Client + job_id: str + lock_blob_name: str the name of the lock_blob + """ + # Create the job + # NOTE: This feature is in ALPHA. 
We have to access the _properties + job_config = bigquery.CopyJobConfig() + job_config._properties["copy"]["operationType"] = "SNAPSHOT" # pylint: disable=W0212 + print(f"take_table_snapshot: extracting dataset and table name from {lock_blob_name}") + destination_match = constants.DESTINATION_REGEX.match(lock_blob_name) + if not destination_match: + raise RuntimeError(f"Lock Blob Name {lock_blob_name} did not match regex:" + f" {constants.DESTINATION_REGEX.pattern}") + destination_details = destination_match.groupdict() + try: + dataset = destination_details['dataset'] + table = destination_details['table'] + except KeyError: + raise exceptions.DestinationRegexMatchException( + f"Lock Blob Name {lock_blob_name} did not match dataset and table in regex:" + f" {constants.DESTINATION_REGEX.pattern}") from KeyError + + source_name = f"{dataset}.{table}" + # We need to name the snapshot based off of the job id. + snapshot_name = f"{dataset}_snapshot.{job_id}" + print(f"Creating snapshot: {snapshot_name}") + + job = bq_client.copy_table(source_name, snapshot_name, job_config=job_config) + job.result() # Wait for the job to complete. From 29b2412966794e8a1de6b6383490dafb8d7e0218 Mon Sep 17 00:00:00 2001 From: Ryan den Otter Date: Wed, 24 Mar 2021 16:09:24 -0400 Subject: [PATCH 87/90] Changing to a copy until the snapshotting feature is enabled --- .../gcs_ocn_bq_ingest/common/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 7ffd8a0aa..8b5541c00 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -785,8 +785,8 @@ def take_table_snapshot( """ # Create the job # NOTE: This feature is in ALPHA. 
We have to access the _properties - job_config = bigquery.CopyJobConfig() - job_config._properties["copy"]["operationType"] = "SNAPSHOT" # pylint: disable=W0212 + # job_config = bigquery.CopyJobConfig() + # job_config._properties["copy"]["operationType"] = "SNAPSHOT" # pylint: disable=W0212 print(f"take_table_snapshot: extracting dataset and table name from {lock_blob_name}") destination_match = constants.DESTINATION_REGEX.match(lock_blob_name) if not destination_match: @@ -806,5 +806,5 @@ def take_table_snapshot( snapshot_name = f"{dataset}_snapshot.{job_id}" print(f"Creating snapshot: {snapshot_name}") - job = bq_client.copy_table(source_name, snapshot_name, job_config=job_config) + job = bq_client.copy_table(source_name, snapshot_name) job.result() # Wait for the job to complete. From db6a98e37853c28c02a5e74eb2730b6708a4598d Mon Sep 17 00:00:00 2001 From: Ryan den Otter Date: Wed, 24 Mar 2021 17:34:54 -0400 Subject: [PATCH 88/90] Make the SNAPSHOT_DATSET and ENABLE_SNAPSHOTTING constants environment variables --- .../gcs_ocn_bq_ingest/common/constants.py | 4 +++- .../gcs_ocn_bq_ingest/common/ordering.py | 2 +- .../gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py index 58d85ff65..656173289 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py @@ -138,4 +138,6 @@ # https://cloud.google.com/bigquery/docs/running-jobs#generate-jobid NON_BQ_JOB_ID_REGEX = re.compile(r'[^0-9a-zA-Z_\-]+') -SNAPSHOTTING = True +ENABLE_SNAPSHOTTING = bool( + distutils.util.strtobool(os.getenv("ENABLE_SNAPSHOTTING", "False"))) +SNAPSHOT_DATASET = str(os.getenv("SNAPSHOT_DATASET", 'snapshots')) diff --git 
a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py index e2b5287a9..28477e201 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -96,7 +96,7 @@ def backlog_subscriber(gcs_client: Optional[storage.Client], polling_timeout) # if last_job_done = True, this means that a job just completed # We need to check if SNAPSHOTTING is enabled - if last_job_done and constants.SNAPSHOTTING: + if last_job_done and constants.ENABLE_SNAPSHOTTING: print("Snapshotting is enabled. Taking a snapshot") utils.take_table_snapshot(bq_client, job_id=lock_contents, lock_blob_name=lock_blob.name) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py index 8b5541c00..32e40cec2 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -803,7 +803,7 @@ def take_table_snapshot( source_name = f"{dataset}.{table}" # We need to name the snapshot based off of the job id. 
- snapshot_name = f"{dataset}_snapshot.{job_id}" + snapshot_name = f"{constants.SNAPSHOT_DATASET}.{job_id}" print(f"Creating snapshot: {snapshot_name}") job = bq_client.copy_table(source_name, snapshot_name) From 145c2af39c2b74dd8ee882d961252e958bf29a1f Mon Sep 17 00:00:00 2001 From: Ryan den Otter Date: Wed, 24 Mar 2021 19:15:35 -0400 Subject: [PATCH 89/90] force enable snapshotting --- tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index fea69c20d..c7e85f213 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -88,6 +88,7 @@ def mock_env(gcs, monkeypatch): def ordered_mock_env(mock_env, monkeypatch): """environment variable mocks""" monkeypatch.setenv("ORDER_PER_TABLE", "TRUE") + monkeypatch.setenv("ENABLE_SNAPSHOTTING", "TRUE") @pytest.fixture From 5f69020ff65a1628fb32891a4d1fe0233f987a1f Mon Sep 17 00:00:00 2001 From: Ryan den Otter Date: Thu, 25 Mar 2021 15:25:12 -0400 Subject: [PATCH 90/90] setting the snapshotting as enabled and included in tests by default --- .../gcs_ocn_bq_ingest/common/constants.py | 4 +- .../gcs_event_based_ingest/tests/conftest.py | 20 ++++++++ .../gcs_ocn_bq_ingest/test_ordering_it.py | 48 +++++++++++++++++++ 3 files changed, 70 insertions(+), 2 deletions(-) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py index 656173289..00cabfda1 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py @@ -139,5 +139,5 @@ NON_BQ_JOB_ID_REGEX = re.compile(r'[^0-9a-zA-Z_\-]+') ENABLE_SNAPSHOTTING = bool( - 
distutils.util.strtobool(os.getenv("ENABLE_SNAPSHOTTING", "False"))) -SNAPSHOT_DATASET = str(os.getenv("SNAPSHOT_DATASET", 'snapshots')) + distutils.util.strtobool(os.getenv("ENABLE_SNAPSHOTTING", "FALSE"))) +SNAPSHOT_DATASET = str(os.getenv("SNAPSHOT_DATASET", "snapshots")) diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index c7e85f213..2bfb910e7 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -88,6 +88,11 @@ def mock_env(gcs, monkeypatch): def ordered_mock_env(mock_env, monkeypatch): """environment variable mocks""" monkeypatch.setenv("ORDER_PER_TABLE", "TRUE") + + +@pytest.fixture +def snapshotting_env(mock_env, monkeypatch): + """environment variable mocks""" monkeypatch.setenv("ENABLE_SNAPSHOTTING", "TRUE") @@ -110,6 +115,21 @@ def teardown(): return dataset +@pytest.fixture +def snapshot_dataset(request, bq, mock_env): + dataset = bigquery.Dataset(f"{os.getenv('GCP_PROJECT')}" + f".{os.getenv('SNAPSHOT_DATASET')}") + dataset.location = "US" + bq.create_dataset(dataset) + print(f"created dataset {dataset.dataset_id}") + + def teardown(): + bq.delete_dataset(dataset, delete_contents=True, not_found_ok=True) + + request.addfinalizer(teardown) + return dataset + + @pytest.fixture def dest_table(request, bq, mock_env, dest_dataset) -> bigquery.Table: with open(os.path.join(TEST_DIR, "resources", diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py index 7fe82d200..55ff73b8a 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py @@ -175,6 +175,54 @@ def 
test_backlog_subscriber_in_order_with_new_batch_after_exit( assert num_rows == expected_num_rows +@pytest.mark.IT +@pytest.mark.ORDERING +def test_snapshotting( + bq, gcs, gcs_bucket, dest_dataset, dest_ordered_update_table, + gcs_ordered_update_data, gcs_external_update_config, gcs_backlog, + snapshotting_env, snapshot_dataset): + """Test basic functionality of backlog subscriber. + Populate a backlog with 3 files that make updates where we can assert + that these jobs were applied in order. + + To ensure that the subscriber cleans up properly after itself before exit, + we will drop a 4th batch after the subscriber has exited and assert that it + gets applied as expected. + """ + _run_subscriber(gcs, bq, gcs_external_update_config) + table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix( + gcs_external_update_config.name) + backlog_blobs = gcs_bucket.list_blobs(prefix=f"{table_prefix}/_backlog/") + assert backlog_blobs.num_results == 0, "backlog is not empty" + bqlock_blob: storage.Blob = gcs_bucket.blob("_bqlock") + assert not bqlock_blob.exists(), "_bqlock was not cleaned up" + rows = bq.query("SELECT alpha_update FROM " + f"{dest_ordered_update_table.dataset_id}" + f".{dest_ordered_update_table.table_id}") + expected_num_rows = 1 + num_rows = 0 + for row in rows: + num_rows += 1 + assert row["alpha_update"] == "ABC", "backlog not applied in order" + assert num_rows == expected_num_rows + + # Now we will test what happens when the publisher posts another batch after + # the backlog subscriber has exited. 
+ backfill_blob = _post_a_new_batch(gcs_bucket, dest_dataset, + dest_ordered_update_table) + _run_subscriber(gcs, bq, backfill_blob) + + rows = bq.query("SELECT alpha_update FROM " + f"{dest_ordered_update_table.dataset_id}" + f".{dest_ordered_update_table.table_id}") + expected_num_rows = 1 + num_rows = 0 + for row in rows: + num_rows += 1 + assert row["alpha_update"] == "ABCD", "new incremental not applied" + assert num_rows == expected_num_rows + + @pytest.mark.IT @pytest.mark.ORDERING @pytest.mark.repeat(NUM_TRIES_SUBSCRIBER_TESTS)