diff --git a/tools/.gitignore b/tools/.gitignore new file mode 100644 index 000000000..c18dd8d83 --- /dev/null +++ b/tools/.gitignore @@ -0,0 +1 @@ +__pycache__/ diff --git a/tools/cloud_functions/gcs_event_based_ingest/.flake8 b/tools/cloud_functions/gcs_event_based_ingest/.flake8 index dafc87320..732e2a9fc 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/.flake8 +++ b/tools/cloud_functions/gcs_event_based_ingest/.flake8 @@ -1,6 +1,6 @@ [flake8] max-line-length = 110 ignore = E731,W504,I001,W503,E402 -exclude = .svn,CVS,.bzr,.hg,.git,__pycache__,.eggs,*.egg,node_modules,.venv +exclude = .svn,CVS,.bzr,.hg,.git,__pycache__,.eggs,*.egg,node_modules,.venv,.terraform # format = ${cyan}%(path)s${reset}:${yellow_bold}%(row)d${reset}:${green_bold}%(col)d${reset}: ${red_bold}%(code)s${reset} %(text)s diff --git a/tools/cloud_functions/gcs_event_based_ingest/.gitignore b/tools/cloud_functions/gcs_event_based_ingest/.gitignore new file mode 100644 index 000000000..66d580175 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/.gitignore @@ -0,0 +1,2 @@ +prof/ +test.log diff --git a/tools/cloud_functions/gcs_event_based_ingest/.hadolint.yaml b/tools/cloud_functions/gcs_event_based_ingest/.hadolint.yaml new file mode 100644 index 000000000..8f7e23e45 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/.hadolint.yaml @@ -0,0 +1,2 @@ +ignored: + - DL3008 diff --git a/tools/cloud_functions/gcs_event_based_ingest/.isort.cfg b/tools/cloud_functions/gcs_event_based_ingest/.isort.cfg index ed7944aca..6f72bca0f 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/.isort.cfg +++ b/tools/cloud_functions/gcs_event_based_ingest/.isort.cfg @@ -1,3 +1,5 @@ [settings] src_paths=backfill.py,gcs_ocn_bq_ingest,test skip=terraform_module +force_single_line=True +single_line_exclusions=typing diff --git a/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci b/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci index 5cd40aa1e..d383e7563 100644 
--- a/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci +++ b/tools/cloud_functions/gcs_event_based_ingest/Dockerfile.ci @@ -1,4 +1,15 @@ -FROM python:3.8-slim +FROM python:3.8 +RUN apt-get update \ + && apt-get install --no-install-recommends -y \ + apt-transport-https \ + ca-certificates \ + curl \ + sudo \ + unzip \ + && apt-get autoremove -yqq --purge \ + && apt-get clean && rm -rf /var/lib/apt/lists/* COPY requirements.txt requirements-dev.txt ./ +COPY scripts/install_terraform.sh ./ +RUN ./install_terraform.sh RUN pip3 install --no-cache-dir -r requirements-dev.txt -ENTRYPOINT ["pytest"] +ENTRYPOINT ["python3", "-m", "pytest"] diff --git a/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md b/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md new file mode 100644 index 000000000..4ae20dd0f --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/ORDERING.md @@ -0,0 +1,202 @@ +# Ordering Batches +There are use cases where it is important for incremental batches to get +applied in order rather than as soon as they are uploaded to GCS (which is the +default behavior of this solution). +1. When using External Query that performs DML other than insert only. +(e.g. an `UPDATE` assumes that prior batches have already been committed) +1. To ensure that there are not time gaps in the data (e.g. ensure that +2020/01/02 data is not committed to BigQuery before 2020/01/01, or similarly +that 00 hour is ingested before the 01 hour, etc.) + +This Cloud Function supports serializing the submission of ingestion jobs to +BigQuery by using Google Cloud Storage's consistency guarantees to provide a +pessimistic lock on a table to prevent concurrent jobs and +[GCS Object.list](https://cloud.google.com/storage/docs/json_api/v1/objects/list) +lexicographic sorting of results to provide ordering guarantees. 
+The solution involves a table level `_backlog/` directory to keep track +of success files whose batches have not yet been committed to BigQuery and +a table level `_bqlock` file to keep track of what job is currently ingesting to +that table. This way we can make our Cloud Function idempotent by having all the +state stored in GCS so we can safely retrigger it to skirt the Cloud Functions +timeout. + +## Assumptions +This ordering solution assumes that you want to apply batches in lexicographic +order. This is usually the case because path names usually contain some sort of +date / hour information. + +## Enabling Ordering +### Environment Variable +Ordering can be enabled at the function level by setting the `ORDER_PER_TABLE` +environment variable to `"True"`. +### Config File +Ordering can be configured at any level of your naming convention (e.g. dataset +table or some sub-path) by placing a `_config/ORDERME` file. This can be helpful +in scenarios where your historical load can be processed safely in parallel but +incrementals must be ordered. +For example: +```text +gs://${BUCKET}/${DATASET}/${TABLE}/historical/_config/load.json +gs://${BUCKET}/${DATASET}/${TABLE}/incremental/_config/external.json +gs://${BUCKET}/${DATASET}/${TABLE}/incremental/_config/bq_transform.sql +gs://${BUCKET}/${DATASET}/${TABLE}/incremental/_config/ORDERME +``` + +## Dealing With Out-of-Order Publishing to GCS During Historical Load +In some use cases, there is a period where incrementals that must be applied in +order are uploaded in parallel (meaning their `_SUCCESS` files are expected to +be out of order). This typically happens during some historical backfill period. +This can be solved by setting the `START_BACKFILL_FILENAME` environment +variable to a file name that indicates that the parallel upload of historical +incrementals is complete (e.g. `_HISTORYDONE`). 
This will cause all success +files for a table to be added to the backlog until the `_HISTORYDONE` file is +dropped at the table level. At that point the backlog subscriber will begin +processing the batches in order. + +## Batch Failure Behavior +When ordering is enabled, if the BQ job to apply a batch failed, it is not safe +to continue to ingest the next batch. The Cloud Function will leave the +`_bqlock` file and stop trying to process the backlog. The Cloud function +will report an exception like this which should be alerted on as the ingestion +process for the table will be deadlocked until there is human intervention to +address the failed batch: +```text + f"previous BigQuery job: {job_id} failed or could not " + "be found. This will kill the backfill subscriber for " + f"the table prefix {table_prefix}." + "Once the issue is dealt with by a human, the lock" + "file at: " + f"gs://{lock_blob.bucket.name}/{lock_blob.name} " + "should be manually removed and a new empty _BACKFILL" + "file uploaded to:" + f"gs://{lock_blob.bucket.name}/{table_prefix}/_BACKFILL" + f"to resume the backfill subscriber so it can " + "continue with the next item in the backlog.\n" + "Original Exception:\n" + f"{traceback.format_exc()}") +``` +Note that once the `_bqlock` is removed and `_BACKFILL` is reposted, the Cloud +Function will proceed by applying the next batch in the `_backlog`. This means, +if you have applied the batch manually you should remove this object from the +`_backlog`. However, if you have patched the data on GCS for the failed batch +and would like the cloud function to apply it, then you leave this object in the +`_backlog`. 
+ +## Ordering Mechanics Explained +We've treated ordering incremental commits to table as a variation on the +[Producer-Consumer Problem](https://en.wikipedia.org/wiki/Producer%E2%80%93consumer_problem) +Where we have multiple producers (each call of Backlog Publisher) and a single +Consumer (the Backlog Subscriber which is enforced to be a singleton per table +with a claim file). Our solution is to use GCS `_backlog` directory as our queue +and `_bqlock` as a mutex. There is still a rare corner case of a race condition +that we handle as well. + +### Backlog Publisher +The Backlog Publisher has two responsibilities: +1. add incoming success files to a +table's `_backlog` so they are not "forgotten" by the ingestion system. +1. if there is a non-empty backlog start the backfill subscriber (if one is not +already running). This is accomplished by uploading a table level `_BACKFILL` +file if it does not already exist. + +### Backlog Subscriber +The Backlog Subscriber is responsible for keeping track of BigQuery jobs running +on a table and ensure that batches are committed in order. When the backlog is +not empty for a table the backlog subscriber should be running for that table +unless a job has failed. +It will either be polling a `RUNNING` BigQuery job for completion, or submitting +the next batch in the `_backlog`. + +The state of what BigQuery job is currently running on a table is kept in a +`_bqlock` file at the table prefix. + +In order to escape the maximum nine-minute (540s) Cloud Function Timeout, the +backfill subscriber will re-trigger itself by posting a new `_BACKFILL` file +until the `_backlog` for the table prefix is empty. When a new success file +arrives it is the responsibility of the publisher to restart the subscriber if +one is not already running. + +### Example: Life of a Table +The following process explains the triggers (GCS files) and actions of the +Cloud Function for a single table prefix. + +1. 
Source data uploaded to GCS prefix for the destination dataset / table, etc. + - `gs://ingestion-bucket/dataset/table/historical/2020/01/02/03/foo-data-00.csv` + - `gs://ingestion-bucket/dataset/table/historical/2020/01/02/03/foo-data-01.csv` + - `gs://ingestion-bucket/dataset/table/historical/2020/01/02/04/foo-data-00.csv` + - `gs://ingestion-bucket/dataset/table/incremental/2020/01/02/05/foo-data-01.csv` +1. Success file uploaded to GCS (to indicate this atomic batch is ready to be +applied). + - `gs://ingestion-bucket/dataset/table/historical/2020/01/02/03/_SUCCESS` + - `gs://ingestion-bucket/dataset/table/historical/2020/01/02/04/_SUCCESS` + - `gs://ingestion-bucket/dataset/table/incremental/2020/01/02/05/_SUCCESS` +1. Backlog Publisher adds a pointer to each success file in the backlog for the +table. + - `gs://ingestion-bucket/dataset/table/_backlog/historical/2020/01/02/03/_SUCCESS` + - `gs://ingestion-bucket/dataset/table/_backlog/historical/2020/01/02/04/_SUCCESS` + - `gs://ingestion-bucket/dataset/table/_backlog/incremental/2020/01/02/05/_SUCCESS` +1. If the `START_BACKFILL_FILENAME` is set and the file exists at the table prefix, after adding each item to the backlog, the Backlog Publisher will start the +Backfill Subscriber if it is not already running (as indicated by a `_BACKFILL` +file). If the `START_BACKFILL_FILENAME` is not present the backlog subscriber +will not be started until this file is uploaded. + - `gs://ingestion-bucket/dataset/table/_BACKFILL` +1. The Backlog Subscriber will look at the backlog and apply the batches in +order (lexicographic). This process looks like this: + 1. Claim this backfill file: + - `gs://ingestion-bucket/dataset/table/_claimed__BACKFILL_created_at_...` + 1. Claim first batch in backlog (ensure no duplicate processing): + - `gs://ingestion-bucket/dataset/table/historical/2020/01/02/03/_claimed__SUCCESS_created_at_...` + 1. 
Submit the BigQuery Job for this batch (load job or external query based on the `_config/*` files) + - Ingest the data at the `gs://ingestion-bucket/dataset/table/historical/2020/01/02/03/*` prefix + - Store the job ID in `gs://ingestion-bucket/dataset/table/_bqlock` + 1. Wait for this Job to complete successfully and remove this item from the backlog. + - If job is `DONE` with errors: + - Raise exception (do not continue to process any more batches) + - If job is `DONE` without errors remove the pointer from the backlog: + - DELETE `gs://ingestion-bucket/dataset/table/_backlog/historical/2020/01/02/03/_SUCCESS` + 1. Repeat from Backlog Subscriber step 2 + - Where the first item in the backlog is now + - `gs://ingestion-bucket/dataset/table/_backlog/historical/2020/01/02/04/_SUCCESS` + - And on the next loop: + - `gs://ingestion-bucket/dataset/table/_backlog/incremental/2020/01/02/05/_SUCCESS` +1. Backlog Subscriber sees the `_backlog/` is empty for the table. In other words +The BigQuery table is caught up with the data on GCS. + - DELETE `gs://ingestion-bucket/dataset/table/_BACKFILL` and exit +1. The next day a new incremental arrives + - `gs://ingestion-bucket/dataset/table/_backlog/incremental/2020/01/02/05/_SUCCESS` +1. The Backlog Publisher adds this item to the backlog and wakes up the +Backfill Subscriber by posting a new `_BACKFILL` file. + - `gs://ingestion-bucket/dataset/table/_backlog/incremental/2020/01/02/05/_SUCCESS` + - `gs://ingestion-bucket/dataset/table/_BACKFILL` +1. Backlog Subscriber will handle the backlog of just one item +(See Backlog Subscriber step #5 and #6 above) + + +### Note on Handling Race Condition +We use `subscribe_monitor` to handle a rare race condition where: + +1. subscriber reads an empty backlog (before it can delete the + _BACKFILL blob...) +2. a new item is added to the backlog (causing a separate + function invocation) +3. 
In this new invocation we reach this point in the code path + and start_subscriber_if_not_running sees the old _BACKFILL + and does not create a new one. +4. The subscriber deletes the _BACKFILL blob and exits without + processing the new item on the backlog from #2. + +We handle this by the following: + +1. When success file added to the backlog starts this monitoring +to wait 10 seconds before checking that the backfill file exists. To catch if +the backfill file disappears when it should not. This might trigger an extra +loop of the backfill subscriber but this loop will not take any action and this +wasted compute is far better than dropping a batch of data. +1. On the subscriber side we check if there was more time +than 10 seconds between list backlog items and delete backfill calls. If so the +subscriber double checks that the backlog is still empty. This way we always +handle this race condition either in this monitor or in the subscriber itself. + + +### Visualization of Ordering Triggers in the Cloud Function +![architecture](img/ordering.png) diff --git a/tools/cloud_functions/gcs_event_based_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/README.md index 9fda82d39..372590064 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/README.md @@ -21,14 +21,18 @@ By Default we try to read dataset, table, partition (or yyyy/mm/dd/hh) and batch id using the following python regex: ```python3 DEFAULT_DESTINATION_REGEX = ( - r"^(?P[\w\-\._0-9]+)/" # dataset (required) - r"(?P[\w\-_0-9]+)/?" # table name (required) - r"(?P\$[0-9]+)?/?" # partition decortator (optional) - r"(?P[0-9]{4})?/?" # partition year (yyyy) (optional) - r"(?P[0-9]{2})?/?" # partition month (mm) (optional) - r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) - r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) - r"(?P[\w\-_0-9]+)?/" # batch id (optional) + r"^(?P[\w\-\._0-9]+)/" # dataset (required) + r"(?P
[\w\-_0-9]+)/?" # table name (required) + # break up historical v.s. incremental to separate prefixes (optional) + r"(?:historical|incremental)?/?" + r"(?P\$[0-9]+)?/?" # partition decorator (optional) + r"(?:" # [begin] yyyy/mm/dd/hh/ group (optional) + r"(?P[0-9]{4})/?" # partition year (yyyy) (optional) + r"(?P[0-9]{2})?/?" # partition month (mm) (optional) + r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) + r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) + r")?" # [end]yyyy/mm/dd/hh/ group (optional) + r"(?P[\w\-_0-9]+)?/" # batch id (optional) ) ``` you can see if this meets your needs in this [regex playground](https://regex101.com/r/5Y9TDh/2) @@ -37,8 +41,10 @@ better fit your naming convention on GCS. Your regex must include [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for destination `dataset`, and `table`. Note, that `dataset` can optionally, explicitly specify destination project -(i.e. `gs://${BUCKET}/project_id.dataset_id/table/....`) otherwise the default -project will be inferred from Application Default Credential (the project in +(i.e. `gs://${BUCKET}/project_id.dataset_id/table/....`) alternatively, +one can set the `BQ_STORAGE_PROJECT` environment variable to set to override the +default target project for datasets at the function level. The default behavior is to +infer the project from Application Default Credential (the project in which the Cloud Function is running, or the ADC configured in Google Cloud SDK if invoked locally). This is useful in scenarios where a single deployment of the Cloud Function is responsible for ingesting data into BigQuery tables in @@ -133,20 +139,50 @@ The result of merging these would be: This configuration system gives us the ability to DRY up common defaults but override them at whatever level is appropriate as new cases come up. +### Note on Delimiters: Use Unicode +For CSV loads the `fieldDelimiter` in load.json to external.json should be +specified as a unicode character _not_ a hexidecimal character as hexidecimal +characters will confuse python's `json.load` function. 
+For example ctrl-P should be specified as: +```json +{ + "fieldDelimiter": "\u0010" +} +``` + #### Transformation SQL In some cases we may need to perform transformations on the files in GCS before they can be loaded to BigQuery. This is handled by query on an temporary external table over the GCS objects as a proxy for load job. `gs://${INGESTION_BUCKET}/${BQ_DATASET}/${BQ_TABLE_NAME}/_config/bq_transform.sql` -Note, external queries will consume query slots from this project's reservation -or count towards your on-demand billing. They will _not_ use free tie load slots. +By default, if a query job finishes with statement type +`INSERT`,`UPDATE`,`DELETE`, or `MERGE` and `numDmlRowsAffected = 0` this will be +treated as a failure ([See Query Job Statistics API docs](https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobstatistics2)). +This is usually due to a bad query / configuration with a bad DML predicate. +For example running the following query on an empty table: + +```sql +UPDATE foo.bar dest ... FROM temp_ext src WHERE src.id = dest.id +``` + +By failing on this condition we keep the backlog intact when we run a query job +that unexpectedly did not affect any rows. +This can be disabled by setting the environment variable +`FAIL_ON_ZERO_DML_ROWS_AFFECTED=False`. +A `CREATE OR REPLACE TABLE` is not DML and will not be subject to this behavior. + +##### Cost Note +External queries will consume query slots from this project's reservation +or count towards your on-demand billing. +They will _not_ use free tier load slots. + +##### External Table Name: `temp_ext` Note, that the query should select from a `temp_ext` which will be a temporary external table configured on the fly by the Cloud Function. The query must handle the logic for inserting into the destination table. -This means it should use BigQuery DML to either `INSERT` or `MERGE` into the -destination table. +This means it should use BigQuery DML to mutate the destination table. 
For example: ```sql INSERT {dest_dataset}.{dest_table} @@ -198,6 +234,11 @@ at any parent folders `_config` prefix. This allows you dictate "for this table any new batch should `WRITE_TRUNCATE` it's parent partition/table" or "for that table any new batch should `WRITE_APPEND` to it's parent partition/table". +## Controlling BigQuery Compute Project +By default BigQuery jobs will be submitted in the project where the Cloud Function +is deployed. To submit jobs in another BigQuery project set the `BQ_PROJECT` +environment variable. + ## Monitoring Monitoring what data has been loaded by this solution should be done with the BigQuery [`INFORMATION_SCHEMA` jobs metadata](https://cloud.google.com/bigquery/docs/information-schema-jobs) @@ -235,14 +276,20 @@ SELECT total_slot_ms, destination_table state, + error_result, (SELECT value FROM UNNEST(labels) WHERE key = "component") as component, (SELECT value FROM UNNEST(labels) WHERE key = "cloud-function-name") as cloud_function_name, (SELECT value FROM UNNEST(labels) WHERE key = "batch-id") as batch_id, FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT WHERE - (SELECT value FROM UNNEST(labels) WHERE key = "component") = "gcf-ingest-" + (SELECT value FROM UNNEST(labels) WHERE key = "component") = "event-based-gcs-ingest" ``` +If your external queries have multiple sql statements only the parent job will +follow the `gcf-ingest-*` naming convention. Child jobs (for each statement) +begin with prefix _script_job. These jobs will still be labelled with +`component` and `cloud-function-name`. 
+For more information see [Scripting in Standard SQL](https://cloud.google.com/bigquery/docs/reference/standard-sql/scripting) ## Triggers GCS Object Finalize triggers can communicate with Cloud Functions directly or @@ -299,8 +346,10 @@ docker run --rm -it gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci #### Running on your local machine Alternatively to the local cloudbuild or using the docker container to run your tests, you can `pip3 install -r requirements-dev.txt` and select certain tests -to run with [`pytest`](https://docs.pytest.org/en/stable/usage.html). This is -mostly useful if you'd like to integrate with your IDE debugger. +to run with [`python3 -m pytest`](https://docs.pytest.org/en/stable/usage.html). +Note, this is not quite the same as calling `pytest` without the `python -m` prefix +([pytest invocation docs](https://docs.pytest.org/en/stable/usage.html#calling-pytest-through-python-m-pytest)). +This is mostly useful if you'd like to integrate with your IDE debugger. Note that integration tests will spin up / tear down cloud resources that can incur a small cost. These resources will be spun up based on your Google Cloud SDK @@ -314,16 +363,24 @@ See more info on sharing pytest fixtures in the [pytest docs](https://docs.pytes #### Running All Tests ```bash -pytest +python3 -m pytest ``` #### Running Unit Tests Only ```bash -pytest -m "not IT" +python3 -m pytest -m "not IT" ``` #### Running Integration Tests Only ```bash -pytest -m IT +python3 -m pytest -m IT +``` + +#### Running System Tests Only +The system tests assume that you have deployed the cloud function. +```bash +export TF_VAR_short_sha=$(git rev-parse --short=10 HEAD) +export TF_VAR_project_id=${PROJECT_ID} +python3 -m pytest -vvv e2e +``` ## Deployment @@ -351,7 +408,7 @@ gcloud functions deploy test-gcs-bq-ingest \ --trigger-topic=${PUBSUB_TOPIC} \ --service-account=${SERVICE_ACCOUNT_EMAIL} \ --timeout=540 \ - --set-env-vars='DESTINATION_REGEX=^(?:[\w\-0-9]+)/(?P[\w\-_0-9]+)/(?P
[\w\-_0-9]+)/?(?:incremental|history)?/?(?P[0-9]{4})?/?(?P[0-9]{2})?/?(?P
[0-9]{2})?/?(?P[0-9]{2})?/?(?P[0-9]+)?/?' + --set-env-vars='DESTINATION_REGEX=^(?:[\w\-0-9]+)/(?P[\w\-_0-9]+)/(?P
[\w\-_0-9]+)/?(?:incremental|history)?/?(?P[0-9]{4})?/?(?P[0-9]{2})?/?(?P
[0-9]{2})?/?(?P[0-9]{2})?/?(?P[0-9]+)?/?,FUNCTION_TIMEOUT_SEC=540' ``` #### Cloud Functions Events @@ -368,7 +425,7 @@ gcloud functions deploy test-gcs-bq-ingest \ --trigger-event google.storage.object.finalize --service-account=${SERVICE_ACCOUNT_EMAIL} \ --timeout=540 \ - --set-env-vars='DESTINATION_REGEX=^(?:[\w\-0-9]+)/(?P[\w\-_0-9]+)/(?P
[\w\-_0-9]+)/?(?:incremental|history)?/?(?P[0-9]{4})?/?(?P[0-9]{2})?/?(?P
[0-9]{2})?/?(?P[0-9]{2})?/?(?P[0-9]+)?/?' + --set-env-vars='DESTINATION_REGEX=^(?:[\w\-0-9]+)/(?P[\w\-_0-9]+)/(?P
[\w\-_0-9]+)/?(?:incremental|history)?/?(?P[0-9]{4})?/?(?P[0-9]{2})?/?(?P
[0-9]{2})?/?(?P[0-9]{2})?/?(?P[0-9]+)?/?,FUNCTION_TIMEOUT_SEC=540' ``` In theory, one could set up Pub/Sub notifications from multiple GCS Buckets @@ -376,6 +433,12 @@ In theory, one could set up Pub/Sub notifications from multiple GCS Buckets Pub/Sub topic so that data uploaded to any of these buckets could get automatically loaded to BigQuery by a single deployment of the Cloud Function. +## Ordering Guarantees +It is possible to configure the Cloud Function to apply incrementals in order if +this is crucial to your data integrity. This naturally comes with a performance +penalty as for a given table we cannot parallelize ingestion of batches. +The ordering behavior and options are described in detail in [ORDERING.md](ORDERING.md) + ## Backfill There are some cases where you may have data already copied to GCS according to the naming convention / with success files before the Object Change @@ -385,6 +448,21 @@ files. The utility supports either invoking the Cloud Function main method locally (in concurrent threads) or publishing notifications for the success files (for a deployed Cloud Function to pick up). +### Backfill and Ordering +If you use the ordering feature on a table (or function wide) you should use the +`NOTIFICATIONS` mode to repost notifications to a pub/sub topic that your +deployed Cloud Function is listening to. The `LOCAL` mode does not support +ordering because this feature relies on (re)posting files like `_bqlock`, +`_BACKFILL` and various claim files and getting re-triggered by object +notifications for these. +The script will publish the notifications for success files and the Cloud +Function will add these to the appropriate table's backlog. +Once the script completes you can drop the `START_BACKFILL_FILENAME` +(e.g. `_HISTORYDONE`) for each table you want to trigger the backfill for. 
+In general, it would not be safe for this utility to drop a `_HISTORYDONE` for +every table because the parallel historical loads might still be in progress. + + ### Usage ``` python3 -m backfill -h diff --git a/tools/cloud_functions/gcs_event_based_ingest/__init__.py b/tools/cloud_functions/gcs_event_based_ingest/__init__.py index 7a3efb203..42ed0a407 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/__init__.py +++ b/tools/cloud_functions/gcs_event_based_ingest/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. diff --git a/tools/cloud_functions/gcs_event_based_ingest/backfill.py b/tools/cloud_functions/gcs_event_based_ingest/backfill.py index f0a2ce415..105397553 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/backfill.py +++ b/tools/cloud_functions/gcs_event_based_ingest/backfill.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml index d1367b925..0ae2de0ae 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml +++ b/tools/cloud_functions/gcs_event_based_ingest/cloudbuild.yaml @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. 
@@ -20,6 +20,8 @@ steps: dir: '${_BUILD_DIR}' entrypoint: '/bin/hadolint' args: + - '--config' + - '.hadolint.yaml' - 'Dockerfile.ci' id: 'lint-ci-docker-image' - name: 'gcr.io/kaniko-project/executor:latest' @@ -113,21 +115,43 @@ steps: - 'mypy-main' - 'mypy-tests' - 'terraform-fmt' + entrypoint: /bin/sh args: - - '-m' - - 'not IT' + - '-c' + # pip installing again to get GCB to recognize mocker from pytest-mock + - 'pip install -r requirements-dev.txt && python3 -m pytest tests -m "not IT"' + # GCB sometimes get stuck on this step and is doomed to not recover. + # This is usually remedied by just re-running the build. + # adding this unit-test step level timeout so we can fail sooner and retry. + timeout: 15s id: 'unit-test' - name: 'gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci' dir: '${_BUILD_DIR}' waitFor: - 'build-ci-image' - 'unit-test' + entrypoint: /bin/sh args: - - '--maxfail=1' - - '-m' - - 'IT' + - '-c' + - 'pip install -r requirements-dev.txt && python3 -m pytest tests -m IT' id: 'integration-test' +- name: 'gcr.io/$PROJECT_ID/gcs_event_based_ingest_ci' + dir: '${_BUILD_DIR}' + env: + - 'TF_VAR_project_id=$PROJECT_ID' + - 'TF_VAR_region=$_REGION' + - 'TF_VAR_short_sha=$SHORT_SHA' + waitFor: + - 'integration-test' + - 'build-ci-image' + entrypoint: /bin/sh + args: + - '-c' + - 'python3 -m pytest -vvv e2e' + id: 'e2e-test' +timeout: '3600s' options: - machineType: 'N1_HIGHCPU_8' + machineType: 'N1_HIGHCPU_32' substitutions: '_BUILD_DIR': 'tools/cloud_functions/gcs_event_based_ingest' + '_REGION': 'us-central1' diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/.gitignore b/tools/cloud_functions/gcs_event_based_ingest/e2e/.gitignore new file mode 100644 index 000000000..9e399369c --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/.gitignore @@ -0,0 +1,35 @@ +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Crash log files +crash.log + +# Exclude all .tfvars files, which are likely 
to contain sensitive data
+ +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""End-to-end tests for event based BigQuery ingest Cloud Function.""" +import json +import os +import re +import shlex +import subprocess +import uuid + +import pytest +from google.cloud import bigquery +from google.cloud import storage + +TEST_DIR = os.path.realpath(os.path.dirname(__file__)) + +ANSI_ESCAPE_PATTERN = re.compile(r'\x1B\[[0-?]*[ -/]*[@-~]') + + +@pytest.fixture(scope="module") +def bq() -> bigquery.Client: + """BigQuery Client""" + return bigquery.Client(location="US") + + +@pytest.fixture(scope="module") +def gcs() -> storage.Client: + """GCS Client""" + return storage.Client() + + +@pytest.fixture(scope='module') +def terraform_infra(request): + + def _run(cmd): + print( + ANSI_ESCAPE_PATTERN.sub( + '', + subprocess.check_output(cmd, + stderr=subprocess.STDOUT, + cwd=TEST_DIR).decode('UTF-8'))) + + init = shlex.split("terraform init") + apply = shlex.split("terraform apply -auto-approve") + destroy = shlex.split("terraform destroy -auto-approve") + + _run(init) + _run(apply) + + def teardown(): + _run(destroy) + + request.addfinalizer(teardown) + with open(os.path.join(TEST_DIR, "terraform.tfstate")) as tf_state_file: + return json.load(tf_state_file) + + +@pytest.fixture +def dest_dataset(request, bq, monkeypatch): + random_dataset = (f"test_bq_ingest_gcf_" + f"{str(uuid.uuid4())[:8].replace('-','_')}") + dataset = bigquery.Dataset(f"{os.getenv('TF_VAR_project_id', 'bqutil')}" + f".{random_dataset}") + 
dataset.location = "US" + bq.create_dataset(dataset) + monkeypatch.setenv("BQ_LOAD_STATE_TABLE", + f"{dataset.dataset_id}.serverless_bq_loads") + print(f"created dataset {dataset.dataset_id}") + + def teardown(): + bq.delete_dataset(dataset, delete_contents=True, not_found_ok=True) + + request.addfinalizer(teardown) + return dataset + + +@pytest.fixture(scope="function") +def dest_table(request, bq: bigquery.Client, dest_dataset) -> bigquery.Table: + public_table: bigquery.Table = bq.get_table( + bigquery.TableReference.from_string( + "bigquery-public-data.new_york_311.311_service_requests")) + schema = public_table.schema + + table: bigquery.Table = bigquery.Table( + f"{os.environ.get('TF_VAR_project_id', 'bqutil')}" + f".{dest_dataset.dataset_id}.cf_e2e_test_nyc_311_" + f"{os.getenv('SHORT_SHA', 'manual')}", + schema=schema, + ) + + table = bq.create_table(table) + + def teardown(): + bq.delete_table(table, not_found_ok=True) + + request.addfinalizer(teardown) + return table diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py new file mode 100644 index 000000000..8ffa44c2f --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/e2e_test.py @@ -0,0 +1,133 @@ +# Copyright 2021 Google LLC. +# This software is provided as-is, without warranty or representation +# for any use or purpose. +# Your use of it is subject to your agreement with Google. + +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""End-to-end test for GCS event based ingest to BigQuery Cloud Function""" +import concurrent.futures +import json +import time +from typing import Dict + +import pytest +from google.cloud import bigquery +from google.cloud import storage + +WAIT_FOR_ROWS_TIMEOUT = 180 # seconds + + +@pytest.mark.SYS +def test_cloud_function_long_runnning_bq_jobs_with_orderme( + gcs: storage.Client, bq: bigquery.Client, dest_table: bigquery.Table, + terraform_infra: Dict): + """This test assumes the cloud function has been deployed with the + accompanying terraform module which configures a 1 min timeout. + It exports some larger data from a public BigQuery table and then reloads + them to test table to test the cloud function behavior with longer running + BigQuery jobs which are likely to require the backlog subscriber to restart + itself by reposting a _BACKFILL file. The ordering behavior is controlled + with the ORDERME blob. 
+ """ + input_bucket_id = terraform_infra['outputs']['bucket']['value'] + table_prefix = f"{dest_table.dataset_id}/" \ + f"{dest_table.table_id}" + extract_config = bigquery.ExtractJobConfig() + extract_config.destination_format = bigquery.DestinationFormat.AVRO + public_table: bigquery.Table = bq.get_table( + bigquery.TableReference.from_string( + "bigquery-public-data.new_york_311.311_service_requests")) + + def _extract(batch: str): + extract_job: bigquery.ExtractJob = bq.extract_table( + public_table, f"gs://{input_bucket_id}/{table_prefix}/{batch}/" + f"data-*.avro", + job_config=extract_config) + return extract_job.result() + + batches = [ + "historical/00", "historical/01", "historical/02", "incremental/03" + ] + history_batch_nums = ["00", "01", "02"] + with concurrent.futures.ThreadPoolExecutor() as pool: + # export some data from public BQ table into a historical partitions + extract_results = pool.map(_extract, batches) + + for res in extract_results: + assert res.errors is None, f"extract job {res.job_id} failed" + + bkt: storage.Bucket = gcs.lookup_bucket(input_bucket_id) + # configure load jobs for this table + load_config_blob = bkt.blob(f"{table_prefix}/_config/load.json") + load_config_blob.upload_from_string( + json.dumps({ + "writeDisposition": "WRITE_APPEND", + "sourceFormat": "AVRO", + "useAvroLogicalTypes": "True", + })) + orderme_blob = bkt.blob(f"{table_prefix}/_config/ORDERME") + orderme_blob.upload_from_string("") + # add historical success files + for batch in history_batch_nums: + historical_success_blob: storage.Blob = bkt.blob( + f"{table_prefix}/historical/{batch}/_SUCCESS") + historical_success_blob.upload_from_string("") + + # assert 0 bq rows (because _HISTORYDONE not dropped yet) + dest_table = bq.get_table(dest_table) + assert dest_table.num_rows == 0, \ + "history was ingested before _HISTORYDONE was uploaded" + + # add _HISTORYDONE + history_done_blob: storage.Blob = bkt.blob(f"{table_prefix}/_HISTORYDONE") + 
history_done_blob.upload_from_string("") + + # wait for bq rows to reach expected num rows + bq_wait_for_rows(bq, dest_table, + public_table.num_rows * len(history_batch_nums)) + + # add the incremental success file + incremental_success_blob: storage.Blob = bkt.blob( + f"{table_prefix}/{batches[-1]}/_SUCCESS") + incremental_success_blob.upload_from_string("") + + # wait on new expected bq rows + bq_wait_for_rows(bq, dest_table, public_table.num_rows * len(batches)) + + +def bq_wait_for_rows(bq_client: bigquery.Client, table: bigquery.Table, + expected_num_rows: int): + """ + polls tables.get API for number of rows until reaches expected value or + times out. + + This is mostly an optimization to speed up the test suite without making it + flaky. + """ + + start_poll = time.monotonic() + actual_num_rows = 0 + while time.monotonic() - start_poll < WAIT_FOR_ROWS_TIMEOUT: + bq_table: bigquery.Table = bq_client.get_table(table) + actual_num_rows = bq_table.num_rows + if actual_num_rows == expected_num_rows: + return + if actual_num_rows > expected_num_rows: + raise AssertionError( + f"{table.project}.{table.dataset_id}.{table.table_id} has" + f"{actual_num_rows} rows. expected {expected_num_rows} rows.") + raise AssertionError( + f"Timed out after {WAIT_FOR_ROWS_TIMEOUT} seconds waiting for " + f"{table.project}.{table.dataset_id}.{table.table_id} to " + f"reach {expected_num_rows} rows." + f"last poll returned {actual_num_rows} rows.") diff --git a/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf b/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf new file mode 100644 index 000000000..64e3973d3 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/e2e/main.tf @@ -0,0 +1,50 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +variable "short_sha" {} +variable "project_id" { default = "bqutil" } +variable "region" { default = "us-central1" } +output "bucket" { + value = module.gcs_ocn_bq_ingest.input-bucket +} + +resource "google_storage_bucket" "cloud_functions_source" { + name = "gcf-source-archives${var.short_sha}" + project = var.project_id + storage_class = "REGIONAL" + location = var.region + force_destroy = "true" +} + +module "gcs_ocn_bq_ingest" { + source = "../terraform_module/gcs_ocn_bq_ingest_function" + function_source_folder = "../gcs_ocn_bq_ingest" + app_id = "gcs-ocn-bq-ingest-e2e-test${var.short_sha}" + cloudfunctions_source_bucket = google_storage_bucket.cloud_functions_source.name + data_ingester_sa = "data-ingester-sa${var.short_sha}" + input_bucket = "gcs-ocn-bq-ingest-e2e-tests${var.short_sha}" + project_id = var.project_id + environment_variables = { + START_BACKFILL_FILENAME = "_HISTORYDONE" + } + # We'll use a shorter timeout for e2e stress subscriber re-triggering + timeout = 60 + force_destroy = "true" +} + +terraform { + backend "local" { + path = "terraform.tfstate" + } +} + diff --git a/tools/cloud_functions/gcs_event_based_ingest/external_query.py b/tools/cloud_functions/gcs_event_based_ingest/external_query.py new file mode 100644 index 000000000..4282e09e6 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/external_query.py @@ -0,0 +1,103 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Command Line utility for dry running BigQuery queries that reference +temporary external tables over data in GCS. +""" +import argparse +import json +import logging +import sys +from typing import List + +from google.cloud import bigquery +from google.cloud import storage + +import gcs_ocn_bq_ingest.common.utils # pylint: disable=import-error + + +def main(args: argparse.Namespace): + """main entry point for dry run external CLI.""" + bq_client: bigquery.Client = bigquery.Client() + gcs_client: storage.Client = storage.Client() + gsurl = None + if args.external_config.startswith("gs://"): + gsurl = args.external_config + external_config = bigquery.ExternalConfig.from_api_repr( + gcs_ocn_bq_ingest.common.utils.read_gcs_file(gcs_client, gsurl)) + else: + with open(args.external_config, 'r') as external_config_file: + external_config = bigquery.ExternalConfig.from_api_repr( + json.load(external_config_file)) + + if (not external_config.source_uris + or external_config.source_uris == ["REPLACEME"]): + if gsurl: + parent_gsurl = "/".join(gsurl.split("/")[:-1]) + external_config.source_uris = f"{parent_gsurl}/*" + else: + # need a source uri that expands to some files so use public uri + external_config.source_uris = [ + "gs://gcp-public-data-landsat/LC08/PRE/063/046/" + "LC80630462016136LGN00/*" + ] + job_config: bigquery.QueryJobConfig = bigquery.QueryJobConfig() + job_config.table_definitions = {'temp_ext': external_config} + job_config.dry_run = args.dry_run + job: bigquery.QueryJob + if args.query.startswith("gs://"): + gsurl = args.query + query = 
gcs_ocn_bq_ingest.common.utils.read_gcs_file(gcs_client, gsurl) + job = bq_client.query(query, job_config=job_config) + else: + with open(args.query, 'r') as query_file: + job = bq_client.query(query_file.read(), job_config=job_config) + if not args.dry_run: + job.result() + print(f"query job {job.job_id} complete") + print(job.to_api_repr()) + else: + logging.info("successful dry run of %s with temp_ext = %s", + args.query, args.external_config) + + +def parse_args(args: List[str]) -> argparse.Namespace: + """argument parser for backfill CLI""" + parser = argparse.ArgumentParser( + description="utility to dry run external queries.") + + parser.add_argument( + "--query", + "-q", + help="path to file containing the query", + required=True, + ) + + parser.add_argument( + "--external-config", + "-e", + help="path to file containing external table definition", + required=True, + ) + + parser.add_argument("--dry-run", + "-d", + help="perform a dry run of the query", + action='store_true', + default=False) + + return parser.parse_args(args) + + +if __name__ == "__main__": + main(parse_args(sys.argv)) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md index c86dceea4..20c023825 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/README.md @@ -9,12 +9,13 @@ BigQuery Table. 1. [Pub/Sub Notification](https://cloud.google.com/storage/docs/pubsub-notifications) object finalize. 1. Cloud Function subscribes to notifications and ingests all the data into -BigQuery a directory once a `_SUCCESS` file arrives. +BigQuery from a GCS prefix once a `_SUCCESS` file arrives. The success file name +is configurable with environment variable. 
## Deployment The source for this Cloud Function can easily be reused to repeat this pattern -for many tables by using the accompanying terraform module (TODO). +for many tables by using the accompanying terraform module. This way we can reuse the tested source code for the Cloud Function. @@ -28,14 +29,43 @@ following default behavior. |-----------------------|---------------------------------------|----------------------------------------------| | `WAIT_FOR_JOB_SECONDS`| How long to wait before deciding BQ job did not fail quickly| `5` | | `SUCCESS_FILENAME` | Filename to trigger a load of a prefix| `_SUCCESS` | -| `DESTINATION_REGEX` | A [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for `dataset`, `table`, (optional: `partition` or `yyyy`, `mm`, `dd`, `hh`, `batch`) +| `DESTINATION_REGEX` | A [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for `dataset`, `table`, (optional: `partition` or `yyyy`, `mm`, `dd`, `hh`, `batch`) | (see below)| | `MAX_BATCH_BYTES` | Max bytes for BigQuery Load job | `15000000000000` ([15 TB](https://cloud.google.com/bigquery/quotas#load_jobs)| | `JOB_PREFIX` | Prefix for BigQuery Job IDs | `gcf-ingest-` | +| `BQ_PROJECT` | Default BQ project to use to submit load / query jobs | Project where Cloud Function is deployed | +| `BQ_STORAGE_PROJECT` | Default BQ project to use for target table references if not specified in dataset capturing group | Project where Cloud Function is deployed | +| `FUNCTION_TIMEOUT_SEC`| Number of seconds set for this deployment of Cloud Function (no longer part of python38 runtime) | 60 | +| `FAIL_ON_ZERO_DML_ROWS_AFFECTED` | Treat External Queries that result in `numDmlAffectedRows = 0` as failures | True | +| `ORDER_PER_TABLE`\* | Force jobs to be executed sequentially (rather than parallel) based on the backlog. 
This is the same as having an `ORDERME` file in every config directory | `False` | +| `START_BACKFILL_FILENAME`\*| Block submitting BigQuery Jobs for a table until this file is present at the table prefix. By default this will not happen. | `None` | +| `RESTART_BUFFER_SECONDS`\* | Buffer before Cloud Function timeout to leave before re-triggering the backfill subscriber | 30 | +| `USE_ERROR_REPORTING_API` | Should errors be reported using error reporting api to avoid cold restart (optimization) | True | +\* only affect the behavior when ordering is enabled for a table. +See [ORDERING.md](../ORDERING.md) +## Default Destination Regex +```python3 +DEFAULT_DESTINATION_REGEX = ( + r"^(?P[\w\-\._0-9]+)/" # dataset (required) + r"(?P
[\w\-_0-9]+)/?" # table name (required) + # break up historical v.s. incremental to separate prefixes (optional) + r"(?:historical|incremental)?/?" + r"(?P\$[0-9]+)?/?" # partition decorator (optional) + r"(?:" # [begin] yyyy/mm/dd/hh/ group (optional) + r"(?P[0-9]{4})/?" # partition year (yyyy) (optional) + r"(?P[0-9]{2})?/?" # partition month (mm) (optional) + r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) + r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) + r")?" # [end]yyyy/mm/dd/hh/ group (optional) + r"(?P[\w\-_0-9]+)?/" # batch id (optional) +) +` + ## Implementation notes 1. To support notifications based on a GCS prefix (rather than every object in the bucket), we chose to use manually configure Pub/Sub Notifications manually and use a Pub/Sub triggered Cloud Function. + diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/__init__.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/__init__.py index 7a3efb203..42ed0a407 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/__init__.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/__init__.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/__init__.py similarity index 96% rename from tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/__init__.py rename to tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/__init__.py index 7a3efb203..42ed0a407 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/__init__.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. 
diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py new file mode 100644 index 000000000..27e104586 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/constants.py @@ -0,0 +1,139 @@ +# Copyright 2021 Google LLC. +# This software is provided as-is, without warranty or representation +# for any use or purpose. +# Your use of it is subject to your agreement with Google. + +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Configurations for Cloud Function for loading data from GCS to BigQuery. +""" +import distutils.util +import os +import re + +import google.api_core.client_info +import google.cloud.exceptions + +# Will wait up to this long polling for errors in a bq job before exiting +# This is to check if job fail quickly, not to assert it succeed. +# This may not be honored if longer than cloud function timeout. +# https://cloud.google.com/functions/docs/concepts/exec#timeout +# One might consider lowering this to 1-2 seconds to lower the +# upper bound of expected execution time to stay within the free tier. 
+# https://cloud.google.com/functions/pricing#free_tier +WAIT_FOR_JOB_SECONDS = int(os.getenv("WAIT_FOR_JOB_SECONDS", "5")) + +DEFAULT_EXTERNAL_TABLE_DEFINITION = { + # The default must be a self describing data format + # because autodetecting CSV /JSON schemas is likely to not match + # expectations / assumptions of the transformation query. + "sourceFormat": "PARQUET", +} + +# Use caution when lowering the job polling rate. +# Keep in mind that many concurrent executions of this cloud function should not +# violate the 300 concurrent requests or 100 request per second. +# https://cloud.google.com/bigquery/quotas#all_api_requests +JOB_POLL_INTERVAL_SECONDS = 1 + +DEFAULT_JOB_LABELS = { + "component": "event-based-gcs-ingest", + "cloud-function-name": os.getenv("K_SERVICE"), +} + +DEFAULT_LOAD_JOB_CONFIG = { + "sourceFormat": "CSV", + "fieldDelimiter": ",", + "writeDisposition": "WRITE_APPEND", + "labels": DEFAULT_JOB_LABELS, +} + +BASE_LOAD_JOB_CONFIG = { + "writeDisposition": "WRITE_APPEND", + "labels": DEFAULT_JOB_LABELS, +} + +# https://cloud.google.com/bigquery/quotas#load_jobs +# 15TB per BQ load job (soft limit). +DEFAULT_MAX_BATCH_BYTES = str(15 * 10**12) + +# 10,000 GCS URIs per BQ load job. +MAX_SOURCE_URIS_PER_LOAD = 10**4 + +SUCCESS_FILENAME = os.getenv("SUCCESS_FILENAME", "_SUCCESS") + +DEFAULT_JOB_PREFIX = "gcf-ingest-" + +# yapf: disable +DEFAULT_DESTINATION_REGEX = ( + r"^(?P[\w\-\._0-9]+)/" # dataset (required) + r"(?P
[\w\-_0-9]+)/?" # table name (required) + # break up historical v.s. incremental to separate prefixes (optional) + r"(?:historical|incremental)?/?" + r"(?P\$[0-9]+)?/?" # partition decorator (optional) + r"(?:" # [begin] yyyy/mm/dd/hh/ group (optional) + r"(?P[0-9]{4})/?" # partition year (yyyy) (optional) + r"(?P[0-9]{2})?/?" # partition month (mm) (optional) + r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) + r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) + r")?" # [end]yyyy/mm/dd/hh/ group (optional) + r"(?P[\w\-_0-9]+)?/" # batch id (optional) +) +# yapf: enable + +DESTINATION_REGEX = re.compile( + os.getenv("DESTINATION_REGEX", DEFAULT_DESTINATION_REGEX)) + +CLIENT_INFO = google.api_core.client_info.ClientInfo( + user_agent="google-pso-tool/bq-severless-loader") + +# Filename used to (re)start the backfill subscriber loop. +BACKFILL_FILENAME = "_BACKFILL" + +# When this file is uploaded the subscriber will start applying items in order +# off the backlog. This is meant to help scenarios where historical loads to GCS +# are parallelized but must be applied in order. One can drop a _HISTORYDONE +# file to indicate the entire history has been uploaded and it is safe to start +# applying items in the backlog in order. By default this will be empty and the +# backlog subscriber will not wait for any file and start applying the first +# items in the backlog. +START_BACKFILL_FILENAME = os.getenv("START_BACKFILL_FILENAME") + +# Filenames that cause cloud function to take action. 
+ACTION_FILENAMES = { + SUCCESS_FILENAME, + BACKFILL_FILENAME, + START_BACKFILL_FILENAME, +} + +RESTART_BUFFER_SECONDS = int(os.getenv("RESTART_BUFFER_SECONDS", "30")) + +ORDER_PER_TABLE = bool( + distutils.util.strtobool(os.getenv("ORDER_PER_TABLE", "False"))) + +BQ_TRANSFORM_SQL = "*.sql" + +ENSURE_SUBSCRIBER_SECONDS = 5 + +FAIL_ON_ZERO_DML_ROWS_AFFECTED = bool( + distutils.util.strtobool(os.getenv("FAIL_ON_ZERO_DML_ROWS_AFFECTED", + "True"))) + +BQ_DML_STATEMENT_TYPES = { + "INSERT", + "UPDATE", + "DELETE", + "MERGE", +} + +# https://cloud.google.com/bigquery/docs/running-jobs#generate-jobid +NON_BQ_JOB_ID_REGEX = re.compile(r'[^0-9a-zA-Z_\-]+') diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py new file mode 100644 index 000000000..7f7b0e04b --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/exceptions.py @@ -0,0 +1,51 @@ +# Copyright 2021 Google LLC. +# This software is provided as-is, without warranty or representation +# for any use or purpose. +# Your use of it is subject to your agreement with Google. + +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Custom Exceptions of GCS event based ingest to BigQuery""" + + +class DuplicateNotificationException(Exception): + """Exception to indicate that the function was triggered twice for the same + event.""" + + +class BigQueryJobFailure(Exception): + """Exception to indicate that the function was triggered twice for the same + event.""" + + +class DestinationRegexMatchException(Exception): + """Exception to indicate that a success file did not match the destination + regex specified in the DESTINATION_REGEX environment variable (or the + default)""" + + +class UnexpectedTriggerException(Exception): + """Exception to indicate the cloud function was triggered with an unexpected + payload.""" + + +class BacklogException(Exception): + """Exception to indicate an issue with the backlog mechanics of this + function.""" + + +EXCEPTIONS_TO_REPORT = ( + BigQueryJobFailure, + UnexpectedTriggerException, + DestinationRegexMatchException, + BacklogException, +) diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py new file mode 100644 index 000000000..95fb99195 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/ordering.py @@ -0,0 +1,363 @@ +# Copyright 2021 Google LLC. +# This software is provided as-is, without warranty or representation +# for any use or purpose. +# Your use of it is subject to your agreement with Google. + +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Implement function to ensure loading data from GCS to BigQuery in order. +""" +import datetime +import os +import time +import traceback +from typing import Optional, Tuple + +import google.api_core +import google.api_core.exceptions +import pytz +# pylint in cloud build is being flaky about this import discovery. +# pylint: disable=no-name-in-module +from google.cloud import bigquery +from google.cloud import storage + +from . import constants # pylint: disable=no-name-in-module,import-error +from . import exceptions # pylint: disable=no-name-in-module,import-error +from . import utils # pylint: disable=no-name-in-module,import-error + + +def backlog_publisher( + gcs_client: storage.Client, + event_blob: storage.Blob, +) -> Optional[storage.Blob]: + """add success files to the the backlog and trigger backfill if necessary""" + bkt = event_blob.bucket + + # Create an entry in _backlog for this table for this batch / success file + backlog_blob = success_blob_to_backlog_blob(event_blob) + backlog_blob.upload_from_string("", client=gcs_client) + print(f"added gs://{backlog_blob.bucket.name}/{backlog_blob.name} " + "to the backlog.") + + table_prefix = utils.get_table_prefix(event_blob.name) + return start_backfill_subscriber_if_not_running(gcs_client, bkt, + table_prefix) + + +def backlog_subscriber(gcs_client: Optional[storage.Client], + bq_client: Optional[bigquery.Client], + backfill_blob: storage.Blob, function_start_time: float): + """Pick up the table lock, poll BQ job id until completion and process next + item in the backlog. + """ + print(f"started backfill subscriber for gs://{backfill_blob.bucket.name}/" + f"{backfill_blob.name}") + gcs_client, bq_client = _get_clients_if_none(gcs_client, bq_client) + # We need to retrigger the backfill loop before the Cloud Functions Timeout. 
+    restart_time = function_start_time + (
+        float(os.getenv("FUNCTION_TIMEOUT_SEC", "60")) -
+        constants.RESTART_BUFFER_SECONDS)
+    print(f"restart time is {restart_time}")
+    bkt = backfill_blob.bucket
+    utils.handle_duplicate_notification(gcs_client, backfill_blob)
+    table_prefix = utils.get_table_prefix(backfill_blob.name)
+    last_job_done = False
+    # we will poll for job completion this long in an individual iteration of
+    # the while loop (before checking if we are too close to cloud function
+    # timeout and should retrigger).
+    polling_timeout = 5  # seconds
+    lock_blob: storage.Blob = bkt.blob(f"{table_prefix}/_bqlock")
+    if restart_time - polling_timeout < time.monotonic():
+        raise EnvironmentError(
+            "The Cloud Function timeout is too short for "
+            "backlog subscriber to do its job. We recommend "
+            "setting the timeout to 540 seconds or at least "
+            "1 minute (Cloud Functions default).")
+    while time.monotonic() < restart_time - polling_timeout - 1:
+        first_bq_lock_claim = False
+        lock_contents = utils.read_gcs_file_if_exists(
+            gcs_client, f"gs://{bkt.name}/{lock_blob.name}")
+        if lock_contents:
+            # is this a lock placed by this cloud function.
+            # the else will handle a manual _bqlock
+            if lock_contents.startswith(
+                    os.getenv('JOB_PREFIX', constants.DEFAULT_JOB_PREFIX)):
+                last_job_done = wait_on_last_job(bq_client, lock_blob,
+                                                 backfill_blob, lock_contents,
+                                                 polling_timeout)
+            else:
+                print(f"sleeping for {polling_timeout} seconds because we "
+                      f"found manual lock gs://{bkt.name}/{lock_blob.name}. "
+                      "This will be an infinite loop until the manual lock is "
+                      "released.\n"
+                      f"manual lock contents:\n {lock_contents}. ")
+                time.sleep(polling_timeout)
+                continue
+        else:  # this condition handles absence of _bqlock file
+            first_bq_lock_claim = True
+            last_job_done = True  # there's no running job to poll.
+
+        if not last_job_done:
+            # keep polling the running job.
+            continue
+
+        # if reached here, last job is done.
+ if not first_bq_lock_claim: + # If the BQ lock was missing we do not want to delete a backlog + # item for a job we have not yet submitted. + utils.remove_oldest_backlog_item(gcs_client, bkt, table_prefix) + should_subscriber_exit = handle_backlog(gcs_client, bq_client, bkt, + lock_blob, backfill_blob) + if should_subscriber_exit: + return + # retrigger the subscriber loop by reposting the _BACKFILL file + print("ran out of time, restarting backfill subscriber loop for:" + f"gs://{bkt.name}/{table_prefix}") + backfill_blob = bkt.blob(f"{table_prefix}/{constants.BACKFILL_FILENAME}") + backfill_blob.upload_from_string("") + + +def wait_on_last_job(bq_client: bigquery.Client, lock_blob: storage.Blob, + backfill_blob: storage.blob, job_id: str, + polling_timeout: int): + """wait on a bigquery job or raise informative exception. + + Args: + bq_client: bigquery.Client + lock_blob: storage.Blob _bqlock blob + backfill_blob: storage.blob _BACKFILL blob + job_id: str BigQuery job ID to wait on (read from _bqlock file) + polling_timeout: int seconds to poll before returning. + """ + try: + return utils.wait_on_bq_job_id(bq_client, job_id, polling_timeout) + except (exceptions.BigQueryJobFailure, + google.api_core.exceptions.NotFound) as err: + table_prefix = utils.get_table_prefix(backfill_blob.name) + raise exceptions.BigQueryJobFailure( + f"previous BigQuery job: {job_id} failed or could not " + "be found. This will kill the backfill subscriber for " + f"the table prefix: {table_prefix}." 
+ "Once the issue is dealt with by a human, the lock " + "file at: " + f"gs://{lock_blob.bucket.name}/{lock_blob.name} " + "should be manually removed and a new empty " + f"{constants.BACKFILL_FILENAME} " + "file uploaded to: " + f"gs://{backfill_blob.bucket.name}/{table_prefix}" + "/_BACKFILL " + f"to resume the backfill subscriber so it can " + "continue with the next item in the backlog.\n" + "Original Exception:\n" + f"{traceback.format_exc()}") from err + + +def handle_backlog( + gcs_client: storage.Client, + bq_client: bigquery.Client, + bkt: storage.Bucket, + lock_blob: storage.Blob, + backfill_blob: storage.Blob, +): + """submit the next item in the _backlog if it is non-empty or clean up the + _BACKFILL and _bqlock files. + Args: + gcs_client: storage.Client + bq_client: bigquery.Client + bkt: storage.Bucket + lock_blob: storage.Blob _bqlock blob + backfill_blob: storage.blob _BACKFILL blob + Returns: + bool: should this backlog subscriber exit + """ + table_prefix = utils.get_table_prefix(backfill_blob.name) + check_backlog_time = time.monotonic() + next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt, + table_prefix) + if next_backlog_file: + next_success_file: storage.Blob = bkt.blob( + next_backlog_file.name.replace("/_backlog/", "/")) + if not next_success_file.exists(client=gcs_client): + raise exceptions.BacklogException( + "backlog contains " + f"gs://{next_backlog_file.bucket}/{next_backlog_file.name} " + "but the corresponding success file does not exist at: " + f"gs://{next_success_file.bucket}/{next_success_file.name}") + print("applying next batch for:" + f"gs://{next_success_file.bucket}/{next_success_file.name}") + next_job_id = utils.create_job_id(next_success_file.name) + utils.apply(gcs_client, bq_client, next_success_file, lock_blob, + next_job_id) + return False # BQ job running + print("no more files found in the backlog deleteing backfill blob") + backfill_blob.delete(if_generation_match=backfill_blob.generation, + 
client=gcs_client) + if (check_backlog_time + constants.ENSURE_SUBSCRIBER_SECONDS < + time.monotonic()): + print("checking if the backlog is still empty for " + f"gs://${bkt.name}/{table_prefix}/_backlog/" + f"There was more than {constants.ENSURE_SUBSCRIBER_SECONDS}" + " seconds between listing items on the backlog and " + f"deleting the {constants.BACKFILL_FILENAME}. " + "This should not happen often but is meant to alleviate a " + "race condition in the event that something caused the " + "delete operation was delayed or had to be retried for a " + "long time.") + next_backlog_file = utils.get_next_backlog_item(gcs_client, bkt, + table_prefix) + if next_backlog_file: + # The backfill file was deleted but the backlog is + # not empty. Re-trigger the backfill subscriber loop by + # dropping a new backfill file. + start_backfill_subscriber_if_not_running(gcs_client, bkt, + table_prefix) + return True # we are re-triggering a new backlog subscriber + utils.handle_bq_lock(gcs_client, lock_blob, None) + print(f"backlog is empty for gs://{bkt.name}/{table_prefix}. " + "backlog subscriber exiting.") + return True # the backlog is empty + + +def start_backfill_subscriber_if_not_running( + gcs_client: Optional[storage.Client], bkt: storage.Bucket, + table_prefix: str) -> Optional[storage.Blob]: + """start the backfill subscriber if it is not already runnning for this + table prefix. + + created a backfill file for the table prefix if not exists. + """ + if not gcs_client: + gcs_client = storage.Client(client_info=constants.CLIENT_INFO) + start_backfill = True + # Do not start subscriber until START_BACKFILL_FILENAME has been dropped + # at the table prefix. 
+ if constants.START_BACKFILL_FILENAME: + start_backfill_blob = bkt.blob( + f"{table_prefix}/{constants.START_BACKFILL_FILENAME}") + start_backfill = start_backfill_blob.exists(client=gcs_client) + if not start_backfill: + print("note triggering backfill because" + f"gs://{start_backfill_blob.bucket.name}/" + f"{start_backfill_blob.name} was not found.") + + if start_backfill: + # Create a _BACKFILL file for this table if not exists + backfill_blob = bkt.blob( + f"{table_prefix}/{constants.BACKFILL_FILENAME}") + try: + backfill_blob.upload_from_string("", + if_generation_match=0, + client=gcs_client) + print("triggered backfill with " + f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} " + f"created at {backfill_blob.time_created}.") + return backfill_blob + except google.api_core.exceptions.PreconditionFailed: + backfill_blob.reload(client=gcs_client) + print("backfill already in progress due to: " + f"gs://{backfill_blob.bucket.name}/{backfill_blob.name} " + f"created at {backfill_blob.time_created}. exiting.") + return backfill_blob + else: + return None + + +def success_blob_to_backlog_blob(success_blob: storage.Blob) -> storage.Blob: + """create a blob object that is a pointer to the input success blob in the + backlog + """ + bkt = success_blob.bucket + table_prefix = utils.get_table_prefix(success_blob.name) + success_file_suffix = utils.removeprefix(success_blob.name, + f"{table_prefix}/") + return bkt.blob(f"{table_prefix}/_backlog/{success_file_suffix}") + + +def subscriber_monitor(gcs_client: Optional[storage.Client], + bkt: storage.Bucket, object_id: str) -> bool: + """ + Monitor to handle a rare race condition where: + + 1. subscriber reads an empty backlog (before it can delete the + _BACKFILL blob...) + 2. a new item is added to the backlog (causing a separate + function invocation) + 3. 
In this new invocation we reach this point in the code path + and start_backlog_subscriber_if_not_running sees the old _BACKFILL + and does not create a new one. + 4. The subscriber deletes the _BACKFILL blob and exits without + processing the new item on the backlog from #2. + + We handle this by success file added to the backlog starts this monitoring + to wait constants.ENSURE_SUBSCRIBER_SECONDS before checking that the + backfill file exists. On the subscriber side we check if there was more time + than this between list backlog items and delete backfill calls. This way + we always handle this race condition either in this monitor or in the + subscriber itself. + """ + if not gcs_client: + gcs_client = storage.Client(client_info=constants.CLIENT_INFO) + backfill_blob = start_backfill_subscriber_if_not_running( + gcs_client, bkt, utils.get_table_prefix(object_id)) + + # backfill blob may be none if the START_BACKFILL_FILENAME has not been + # dropped + if backfill_blob: + # Handle case where a subscriber loop was not able to repost the + # backfill file before the cloud function timeout. + time_created_utc = backfill_blob.time_created.replace(tzinfo=pytz.UTC) + now_utc = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC) + if (now_utc - time_created_utc > datetime.timedelta( + seconds=int(os.getenv("FUNCTION_TIMEOUT_SEC", "60")))): + print( + f"backfill blob gs://{backfill_blob.bucket.name}/" + f"{backfill_blob.name} appears to be abandoned as it is older " + "than the cloud function timeout of " + f"{os.getenv('FUNCTION_TIMEOUT_SEC', '60')} seconds." 
+ "reposting this backfill blob to restart the backfill" + "subscriber for this table.") + backfill_blob.delete(client=gcs_client) + start_backfill_subscriber_if_not_running( + gcs_client, bkt, utils.get_table_prefix(object_id)) + return True + + time.sleep(constants.ENSURE_SUBSCRIBER_SECONDS) + while not utils.wait_on_gcs_blob(gcs_client, backfill_blob, + constants.ENSURE_SUBSCRIBER_SECONDS): + start_backfill_subscriber_if_not_running( + gcs_client, bkt, utils.get_table_prefix(object_id)) + return True + return False + + +def _get_clients_if_none( + gcs_client: Optional[storage.Client], bq_client: Optional[bigquery.Client] +) -> Tuple[storage.Client, bigquery.Client]: + """method to handle case where clients are None. + + This is a workaround to be able to run the backlog subscriber in a separate + process to facilitate some of our integration tests. Though it should be + harmless if these clients are recreated in the Cloud Function. + """ + print("instantiating missing clients in backlog subscriber this should only" + " happen during integration tests.") + if not gcs_client: + gcs_client = storage.Client(client_info=constants.CLIENT_INFO) + if not bq_client: + default_query_config = bigquery.QueryJobConfig() + default_query_config.use_legacy_sql = False + default_query_config.labels = constants.DEFAULT_JOB_LABELS + bq_client = bigquery.Client( + client_info=constants.CLIENT_INFO, + default_query_job_config=default_query_config, + project=os.getenv("BQ_PROJECT", os.getenv("GCP_PROJECT"))) + return gcs_client, bq_client diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py new file mode 100644 index 000000000..44b8367ee --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/common/utils.py @@ -0,0 +1,754 @@ +# Copyright 2021 Google LLC. 
+# This software is provided as-is, without warranty or representation +# for any use or purpose. +# Your use of it is subject to your agreement with Google. + +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains utility methods used by the BQIngest process +""" +import collections +import collections.abc +import copy +import fnmatch +import json +import os +import pathlib +import pprint +import time +import uuid +from typing import Any, Deque, Dict, List, Optional, Tuple, Union + +import cachetools +import google.api_core +import google.api_core.client_info +import google.api_core.exceptions +import google.cloud.exceptions +# pylint in cloud build is being flaky about this import discovery. +from google.cloud import bigquery +from google.cloud import storage + +from . import constants # pylint: disable=no-name-in-module,import-error +from . import exceptions # pylint: disable=no-name-in-module,import-error + + +def external_query( # pylint: disable=too-many-arguments + gcs_client: storage.Client, bq_client: bigquery.Client, gsurl: str, + query: str, dest_table_ref: bigquery.TableReference, job_id: str): + """Load from query over external table from GCS. 
+ + This hinges on a SQL query defined in GCS at _config/*.sql and + an external table definition _config/external.json (otherwise will assume + CSV external table) + """ + external_table_config = read_gcs_file_if_exists( + gcs_client, f"{gsurl}_config/external.json") + if not external_table_config: + external_table_config = look_for_config_in_parents( + gcs_client, gsurl, "external.json") + if external_table_config: + external_table_def = json.loads(external_table_config) + else: + print(f" {gsurl}_config/external.json not found in parents of {gsurl}. " + "Falling back to default PARQUET external table:\n" + f"{json.dumps(constants.DEFAULT_EXTERNAL_TABLE_DEFINITION)}") + external_table_def = constants.DEFAULT_EXTERNAL_TABLE_DEFINITION + + # This may cause an issue if >10,000 files. + external_table_def["sourceUris"] = flatten2dlist( + get_batches_for_prefix(gcs_client, gsurl)) + print(f"external table def = {json.dumps(external_table_config, indent=2)}") + external_config = bigquery.ExternalConfig.from_api_repr(external_table_def) + job_config = bigquery.QueryJobConfig( + table_definitions={"temp_ext": external_config}, use_legacy_sql=False) + + # drop partition decorator if present. + table_id = dest_table_ref.table_id.split("$")[0] + + # similar syntax to str.format but doesn't require escaping braces + # elsewhere in query (e.g. 
in a regex) + rendered_query = query\ + .replace( + "{dest_dataset}", + f"`{dest_table_ref.project}`.{dest_table_ref.dataset_id}")\ + .replace("{dest_table}", table_id) + + job: bigquery.QueryJob = bq_client.query(rendered_query, + job_config=job_config, + job_id=job_id) + + print(f"started asynchronous query job: {job.job_id}") + + start_poll_for_errors = time.monotonic() + # Check if job failed quickly + while time.monotonic( + ) - start_poll_for_errors < constants.WAIT_FOR_JOB_SECONDS: + job.reload(client=bq_client) + if job.state == "DONE": + check_for_bq_job_and_children_errors(bq_client, job) + return + time.sleep(constants.JOB_POLL_INTERVAL_SECONDS) + + +def flatten2dlist(arr: List[List[Any]]) -> List[Any]: + """Flatten list of lists to flat list of elements""" + return [j for i in arr for j in i] + + +def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id): + """orchestrate 1 or more load jobs based on number of URIs and total byte + size of objects at gsurl""" + batches = get_batches_for_prefix(gcs_client, gsurl) + load_config = construct_load_job_config(gcs_client, gsurl) + load_config.labels = constants.DEFAULT_JOB_LABELS + + jobs: List[bigquery.LoadJob] = [] + for batch in batches: + print(load_config.to_api_repr()) + job: bigquery.LoadJob = bq_client.load_table_from_uri( + batch, dest_table_ref, job_config=load_config, job_id=job_id) + + print(f"started asyncronous bigquery load job with id: {job.job_id} for" + f" {gsurl}") + jobs.append(job) + + start_poll_for_errors = time.monotonic() + # Check if job failed quickly + while time.monotonic( + ) - start_poll_for_errors < constants.WAIT_FOR_JOB_SECONDS: + # Check if job failed quickly + for job in jobs: + job.reload(client=bq_client) + check_for_bq_job_and_children_errors(bq_client, job) + time.sleep(constants.JOB_POLL_INTERVAL_SECONDS) + + +def _get_parent_config_file(storage_client, config_filename, bucket, path): + bkt = storage_client.lookup_bucket(bucket) + config_dir_name = 
"_config" + parent_path = pathlib.Path(path).parent + config_path = parent_path / config_dir_name + config_file_path = config_path / config_filename + # Handle wild card (to support bq transform sql with different names). + if "*" in config_filename: + matches: List[storage.Blob] = list( + filter(lambda blob: fnmatch.fnmatch(blob.name, config_filename), + bkt.list_blobs(prefix=config_path))) + if matches: + if len(matches) > 1: + raise RuntimeError( + f"Multiple matches for gs://{bucket}/{config_file_path}") + return read_gcs_file_if_exists(storage_client, + f"gs://{bucket}/{matches[0].name}") + return None + return read_gcs_file_if_exists(storage_client, + f"gs://{bucket}/{config_file_path}") + + +def look_for_config_in_parents(storage_client: storage.Client, gsurl: str, + config_filename: str) -> Optional[str]: + """look in parent directories for _config/config_filename""" + blob: storage.Blob = storage.Blob.from_string(gsurl) + bucket_name = blob.bucket.name + obj_path = blob.name + parts = removesuffix(obj_path, "/").split("/") + + def _get_parent_config(path): + return _get_parent_config_file(storage_client, config_filename, + bucket_name, path) + + config = None + while parts: + if config is not None: + return config + config = _get_parent_config("/".join(parts)) + parts.pop() + return config + + +def construct_load_job_config(storage_client: storage.Client, + gsurl: str) -> bigquery.LoadJobConfig: + """ + merge dictionaries for loadjob.json configs in parent directories. + The configs closest to gsurl should take precedence. 
+ """ + config_filename = "load.json" + blob: storage.Blob = storage.Blob.from_string(gsurl) + bucket_name = blob.bucket.name + obj_path = blob.name + parts = removesuffix(obj_path, "/").split("/") + + def _get_parent_config(path): + return _get_parent_config_file(storage_client, config_filename, + bucket_name, path) + + config_q: Deque[Dict[str, Any]] = collections.deque() + config_q.append(constants.BASE_LOAD_JOB_CONFIG) + while parts: + config = _get_parent_config("/".join(parts)) + if config: + print(f"found config: {'/'.join(parts)}") + config_q.append(json.loads(config)) + parts.pop() + + merged_config: Dict = {} + while config_q: + recursive_update(merged_config, config_q.popleft(), in_place=True) + if merged_config == constants.BASE_LOAD_JOB_CONFIG: + print("falling back to default CSV load job config. " + "Did you forget load.json?") + return bigquery.LoadJobConfig.from_api_repr( + constants.DEFAULT_LOAD_JOB_CONFIG) + print(f"merged_config: {merged_config}") + return bigquery.LoadJobConfig.from_api_repr({"load": merged_config}) + + +def get_batches_for_prefix( + gcs_client: storage.Client, + prefix_path: str, + ignore_subprefix="_config/", + ignore_file=constants.SUCCESS_FILENAME) -> List[List[str]]: + """ + This function creates batches of GCS uris for a given prefix. + This prefix could be a table prefix or a partition prefix inside a + table prefix. + returns an Array of their batches + (one batch has an array of multiple GCS uris) + """ + batches = [] + blob: storage.Blob = storage.Blob.from_string(prefix_path) + bucket_name = blob.bucket.name + prefix_name = blob.name + + bucket = cached_get_bucket(gcs_client, bucket_name) + blobs = list(bucket.list_blobs(prefix=prefix_name, delimiter="/")) + + cumulative_bytes = 0 + max_batch_size = int( + os.getenv("MAX_BATCH_BYTES", constants.DEFAULT_MAX_BATCH_BYTES)) + batch: List[str] = [] + for blob in blobs: + # API returns root prefix also. Which should be ignored. 
+        # Similarly, the _SUCCESS file should be ignored.
+        # Finally, anything in the _config/ prefix should be ignored.
+        if (blob.name
+                not in {f"{prefix_name}/", f"{prefix_name}/{ignore_file}"}
+                or blob.name.startswith(f"{prefix_name}/{ignore_subprefix}")):
+            if blob.size == 0:  # ignore empty files
+                print(f"ignoring empty file: gs://{bucket}/{blob.name}")
+                continue
+            cumulative_bytes += blob.size
+
+            # keep adding until we reach threshold
+            if cumulative_bytes <= max_batch_size or len(
+                    batch) > constants.MAX_SOURCE_URIS_PER_LOAD:
+                batch.append(f"gs://{bucket_name}/{blob.name}")
+            else:
+                batches.append(batch.copy())
+                batch.clear()
+                batch.append(f"gs://{bucket_name}/{blob.name}")
+                cumulative_bytes = blob.size
+
+    # pick up remaining files in the final batch
+    if len(batch) > 0:
+        batches.append(batch.copy())
+        batch.clear()
+
+    if len(batches) > 1:
+        print(f"split into {len(batches)} batches.")
+    elif len(batches) < 1:
+        raise google.api_core.exceptions.NotFound(
+            f"No files to load at {prefix_path}!")
+    return batches
+
+
+def parse_notification(notification: dict) -> Tuple[str, str]:
+    """validates notification payload
+    Args:
+        notification(dict): Pub/Sub Storage Notification
+        https://cloud.google.com/storage/docs/pubsub-notifications
+        Or Cloud Functions direct trigger
+        https://cloud.google.com/functions/docs/tutorials/storage
+        with notification schema
+        https://cloud.google.com/storage/docs/json_api/v1/objects#resource
+    Returns:
+        tuple of bucketId and objectId attributes
+    Raises:
+        exceptions.UnexpectedTriggerException if the input notification does
+        not contain the expected attributes.
+    """
+    if notification.get("kind") == "storage#object":
+        # notification is GCS Object resource from Cloud Functions trigger
+        # https://cloud.google.com/storage/docs/json_api/v1/objects#resource
+        return notification["bucket"], notification["name"]
+    if notification.get("attributes"):
+        # notification is Pub/Sub message.
+ try: + attributes = notification["attributes"] + return attributes["bucketId"], attributes["objectId"] + except KeyError: + raise exceptions.UnexpectedTriggerException( + "Issue with Pub/Sub message, did not contain expected " + f"attributes: 'bucketId' and 'objectId': {notification}" + ) from KeyError + raise exceptions.UnexpectedTriggerException( + "Cloud Function received unexpected trigger:\n" + f"{notification}\n" + "This function only supports direct Cloud Functions " + "Background Triggers or Pub/Sub storage notificaitons " + "as described in the following links:\n" + "https://cloud.google.com/storage/docs/pubsub-notifications\n" + "https://cloud.google.com/functions/docs/tutorials/storage") + + +def read_gcs_file(gcs_client: storage.Client, gsurl: str) -> str: + """ + Read a GCS object as a string + + Args: + gcs_client: GCS client + gsurl: GCS URI for object to read in gs://bucket/path/to/object format + Returns: + str + """ + blob = storage.Blob.from_string(gsurl) + return blob.download_as_bytes(client=gcs_client).decode('UTF-8') + + +def read_gcs_file_if_exists(gcs_client: storage.Client, + gsurl: str) -> Optional[str]: + """return string of gcs object contents or None if the object does not exist + """ + try: + return read_gcs_file(gcs_client, gsurl) + except google.cloud.exceptions.NotFound: + return None + + +# cache lookups against GCS API for 1 second as buckets have update +# limit of once per second and we might do several of the same lookup during +# the functions lifetime. This should improve performance by eliminating +# unnecessary API calls. 
+# https://cloud.google.com/storage/quotas +@cachetools.cached(cachetools.TTLCache(maxsize=1024, ttl=1)) +def cached_get_bucket( + gcs_client: storage.Client, + bucket_id: str, +) -> storage.Bucket: + """get storage.Bucket object by bucket_id string if exists or raise + google.cloud.exceptions.NotFound.""" + return gcs_client.get_bucket(bucket_id) + + +def dict_to_bq_schema(schema: List[Dict]) -> List[bigquery.SchemaField]: + """Converts a list of dicts to list of bigquery.SchemaField for use with + bigquery client library. Dicts must contain name and type keys. + The dict may optionally contain a mode key.""" + default_mode = "NULLABLE" + return [ + bigquery.SchemaField( + x["name"], + x["type"], + mode=x.get("mode") if x.get("mode") else default_mode) + for x in schema + ] + + +# To be added to built in str in python 3.9 +# https://www.python.org/dev/peps/pep-0616/ +def removeprefix(in_str: str, prefix: str) -> str: + """remove string prefix""" + if in_str.startswith(prefix): + return in_str[len(prefix):] + return in_str[:] + + +def removesuffix(in_str: str, suffix: str) -> str: + """removes suffix from a string.""" + # suffix='' should not call self[:-0]. + if suffix and in_str.endswith(suffix): + return in_str[:-len(suffix)] + return in_str[:] + + +def recursive_update(original: Dict, update: Dict, in_place: bool = False): + """ + return a recursively updated dictionary. + + Note, lists will be completely overwritten by value in update if there is a + conflict. + + original: (dict) the base dictionary + update: (dict) the dictionary of updates to apply on original + in_place: (bool) if true then original will be mutated in place else a new + dictionary as a result of the update will be returned. 
+ """ + out = original if in_place else copy.deepcopy(original) + + for key, value in update.items(): + if isinstance(value, dict): + out[key] = recursive_update(out.get(key, {}), value) + else: + out[key] = value + return out + + +def handle_duplicate_notification( + gcs_client: storage.Client, + blob_to_claim: storage.Blob, +): + """ + Need to handle potential duplicate Pub/Sub notifications. + To achieve this we will drop an empty "claimed" file that indicates + an invocation of this cloud function has picked up the success file + with a certain creation timestamp. This will support republishing the + success file as a mechanism of re-running the ingestion while avoiding + duplicate ingestion due to multiple Pub/Sub messages for a success file + with the same creation time. + """ + blob_to_claim.reload(client=gcs_client) + created_unix_timestamp = blob_to_claim.time_created.timestamp() + + basename = os.path.basename(blob_to_claim.name) + claim_blob: storage.Blob = blob_to_claim.bucket.blob( + blob_to_claim.name.replace( + basename, f"_claimed_{basename}_created_at_" + f"{created_unix_timestamp}")) + try: + claim_blob.upload_from_string("", + if_generation_match=0, + client=gcs_client) + except google.api_core.exceptions.PreconditionFailed as err: + blob_to_claim.reload(client=gcs_client) + raise exceptions.DuplicateNotificationException( + f"gs://{blob_to_claim.bucket.name}/{blob_to_claim.name} appears " + "to already have been claimed for created timestamp: " + f"{created_unix_timestamp}." + "This means that another invocation of this cloud function has " + "claimed the work to be one for this file. " + "This may be due to a rare duplicate delivery of the Pub/Sub " + "storage notification.") from err + + +@cachetools.cached(cachetools.LRUCache(maxsize=1024)) +def get_table_prefix(object_id: str) -> str: + """Find the table prefix for a object_id based on the destination regex. 
+ Args: + object_id: str object ID to parse + Returns: + str: table prefix + """ + basename = os.path.basename(object_id) + if basename in { + constants.BACKFILL_FILENAME, + constants.START_BACKFILL_FILENAME, + "_bqlock", + }: + # These files will not match the regex and always should appear at the + # table level. + return removesuffix(object_id, f"/{basename}") + match = constants.DESTINATION_REGEX.match( + object_id.replace("/_backlog/", "/")) + if not match: + raise exceptions.DestinationRegexMatchException( + f"could not determine table prefix for object id: {object_id}" + "because it did not contain a match for destination_regex: " + f"{constants.DESTINATION_REGEX.pattern}") + table_group_index = match.re.groupindex.get("table") + if table_group_index: + table_level_index = match.regs[table_group_index][1] + return object_id[:table_level_index] + raise exceptions.DestinationRegexMatchException( + f"could not determine table prefix for object id: {object_id}" + "because it did not contain a match for the table capturing group " + f"in destination regex: {constants.DESTINATION_REGEX.pattern}") + + +def get_next_backlog_item( + gcs_client: storage.Client, + bkt: storage.Bucket, + table_prefix: str, +) -> Optional[storage.Blob]: + """ + Get next blob in the backlog if the backlog is not empty. + + Args: + gcs_client: storage.Client + bkt: storage.Bucket that this cloud functions is ingesting data for. + table_prefix: the prefix for the table whose backlog should be checked. 
+
+    Returns:
+        storage.Blob: pointer to a SUCCESS file in the backlog
+    """
+    backlog_blobs = gcs_client.list_blobs(bkt,
+                                          prefix=f"{table_prefix}/_backlog/")
+    # Backlog items will be lexicographically sorted
+    # https://cloud.google.com/storage/docs/json_api/v1/objects/list
+    for blob in backlog_blobs:
+        return blob  # Return first item in iterator
+    return None
+
+
+def remove_oldest_backlog_item(
+    gcs_client: storage.Client,
+    bkt: storage.Bucket,
+    table_prefix: str,
+) -> bool:
+    """
+    Remove the oldest pointer in the backlog if the backlog is not empty.
+
+    Args:
+        gcs_client: storage.Client
+        bkt: storage.Bucket that this cloud function is ingesting data for.
+        table_prefix: the prefix for the table whose backlog should be checked.
+
+    Returns:
+        bool: True if we removed the oldest blob. False if the backlog was
+            empty.
+    """
+    backlog_blobs = gcs_client.list_blobs(bkt,
+                                          prefix=f"{table_prefix}/_backlog/")
+    # Backlog items will be lexicographically sorted
+    # https://cloud.google.com/storage/docs/json_api/v1/objects/list
+    blob: storage.Blob
+    for blob in backlog_blobs:
+        blob.delete(client=gcs_client)
+        return True  # Return after deleting first blob in the iterator
+    return False
+
+
+def check_for_bq_job_and_children_errors(bq_client: bigquery.Client,
+                                         job: Union[bigquery.LoadJob,
+                                                    bigquery.QueryJob]):
+    """checks if BigQuery job (or children jobs in case of multi-statement sql)
+    should be considered failed because there were errors or the query affected
+    no rows while FAIL_ON_ZERO_DML_ROWS_AFFECTED env var is set to True
+    (this is the default).
+
+    Args:
+        bq_client: bigquery.Client
+        job: Union[bigquery.LoadJob, bigquery.QueryJob] job to check for errors.
+ Raises: + exceptions.BigQueryJobFailure + """ + if job.state != "DONE": + wait_on_bq_job_id(bq_client, job.job_id, 5) + if job.errors: + raise exceptions.BigQueryJobFailure( + f"BigQuery Job {job.job_id} failed during backfill with the " + f"following errors: {job.errors}\n" + f"{pprint.pformat(job.to_api_repr())}") + if isinstance(job, bigquery.QueryJob): + if (constants.FAIL_ON_ZERO_DML_ROWS_AFFECTED + and job.statement_type in constants.BQ_DML_STATEMENT_TYPES + and job.num_dml_affected_rows < 1): + raise exceptions.BigQueryJobFailure( + f"query job {job.job_id} ran successfully but did not " + f"affect any rows.\n {pprint.pformat(job.to_api_repr())}") + for child_job in bq_client.list_jobs(parent_job=job): + check_for_bq_job_and_children_errors(bq_client, child_job) + + +def wait_on_bq_job_id(bq_client: bigquery.Client, + job_id: str, + polling_timeout: int, + polling_interval: int = 1) -> bool: + """" + Wait for a BigQuery Job ID to complete. + + Args: + bq_client: bigquery.Client + job_id: str the BQ job ID to wait on + polling_timeout: int number of seconds to poll this job ID + polling_interval: frequency to query the job state during polling + Returns: + bool: if the job ID has finished successfully. True if DONE without + errors, False if RUNNING or PENDING + Raises: + exceptions.BigQueryJobFailure if the job failed. + google.api_core.exceptions.NotFound if the job id cannot be found. 
+ """ + start_poll = time.monotonic() + while time.monotonic() - start_poll < (polling_timeout - polling_interval): + job: Union[bigquery.LoadJob, + bigquery.QueryJob] = bq_client.get_job(job_id) + if job.state == "DONE": + check_for_bq_job_and_children_errors(bq_client, job) + return True + if job.state in {"RUNNING", "PENDING"}: + print(f"waiting on BigQuery Job {job.job_id}") + time.sleep(polling_interval) + print(f"reached polling timeout waiting for bigquery job {job_id}") + return False + + +def wait_on_gcs_blob(gcs_client: storage.Client, + wait_blob: storage.Blob, + polling_timeout: int, + polling_interval: int = 1) -> bool: + """" + Wait for a GCS Object to exists. + + Args: + gcs_client: storage.Client + wait_blob: storage.Bllob the GCS to wait on. + polling_timeout: int number of seconds to poll this job ID + polling_interval: frequency to query the job state during polling + Returns: + bool: if the job ID has finished successfully. True if DONE without + errors, False if RUNNING or PENDING + Raises: + exceptions.BigQueryJobFailure if the job failed. + google.api_core.exceptions.NotFound if the job id cannot be found. 
+ """ + start_poll = time.monotonic() + while time.monotonic() - start_poll < (polling_timeout - polling_interval): + if wait_blob.exists(client=gcs_client): + return True + print( + f"waiting on GCS file gs://{wait_blob.bucket.name}/{wait_blob.name}" + ) + time.sleep(polling_interval) + return False + + +def gcs_path_to_table_ref_and_batch( + object_id: str, default_project: Optional[str] +) -> Tuple[bigquery.TableReference, Optional[str]]: + """extract bigquery table reference and batch id from gcs object id""" + + destination_match = constants.DESTINATION_REGEX.match(object_id) + if not destination_match: + raise RuntimeError(f"Object ID {object_id} did not match regex:" + f" {constants.DESTINATION_REGEX.pattern}") + destination_details = destination_match.groupdict() + try: + dataset = destination_details['dataset'] + table = destination_details['table'] + except KeyError: + raise exceptions.DestinationRegexMatchException( + f"Object ID {object_id} did not match dataset and table in regex:" + f" {constants.DESTINATION_REGEX.pattern}") from KeyError + partition = destination_details.get('partition') + year, month, day, hour = ( + destination_details.get(key, "") for key in ('yyyy', 'mm', 'dd', 'hh')) + part_list = (year, month, day, hour) + if not partition and any(part_list): + partition = '$' + ''.join(part_list) + batch_id = destination_details.get('batch') + labels = constants.DEFAULT_JOB_LABELS + + if batch_id: + labels["batch-id"] = batch_id + + if partition: + + dest_table_ref = bigquery.TableReference.from_string( + f"{dataset}.{table}{partition}", + default_project=os.getenv("BQ_STORAGE_PROJECT", default_project)) + else: + dest_table_ref = bigquery.TableReference.from_string( + f"{dataset}.{table}", + default_project=os.getenv("BQ_STORAGE_PROJECT", default_project)) + return dest_table_ref, batch_id + + +def create_job_id(success_file_path): + """Create job id prefix with a consistent naming convention based on the + success file path to give context 
of what caused this job to be submitted. + the rules for success file name -> job id are: + 1. slashes to dashes + 2. all non-alphanumeric dash or underscore will be replaced with underscore + Note, gcf-ingest- can be overridden with environment variable JOB_PREFIX + 3. uuid for uniqueness + """ + clean_job_id = os.getenv('JOB_PREFIX', constants.DEFAULT_JOB_PREFIX) + clean_job_id += constants.NON_BQ_JOB_ID_REGEX.sub( + '_', success_file_path.replace('/', '-')) + # add uniqueness in case we have to "re-process" a success file that is + # republished (e.g. to fix a bad batch of data) or handle multiple load jobs + # for a single success file. + clean_job_id += str(uuid.uuid4()) + return clean_job_id[:1024] # make sure job id isn't too long + + +def handle_bq_lock(gcs_client: storage.Client, lock_blob: storage.Blob, + next_job_id: Optional[str]): + """Reclaim the lock blob for the new job id (in-place) or delete the lock + blob if next_job_id is None.""" + try: + if next_job_id: + if lock_blob.exists(client=gcs_client): + lock_blob.upload_from_string( + next_job_id, + if_generation_match=lock_blob.generation, + client=gcs_client) + else: # This happens when submitting the first job in the backlog + lock_blob.upload_from_string(next_job_id, + if_generation_match=0, + client=gcs_client) + else: + print("releasing lock at: " + f"gs://{lock_blob.bucket.name}/{lock_blob.name}") + lock_blob.delete( + if_generation_match=lock_blob.generation, + client=gcs_client, + ) + except google.api_core.exceptions.PreconditionFailed as err: + raise exceptions.BacklogException( + f"The lock at gs://{lock_blob.bucket.name}/{lock_blob.name} " + f"was changed by another process.") from err + + +def apply( + gcs_client: storage.Client, + bq_client: bigquery.Client, + success_blob: storage.Blob, + lock_blob: Optional[storage.Blob], + job_id: str, +): + """ + Apply an incremental batch to the target BigQuery table via an asynchronous + load job or external query. 
+ + Args: + gcs_client: storage.Client + bq_client: bigquery.Client + success_blob: storage.Blob the success file whose batch should be + applied. + lock_blob: storage.Blob _bqlock blob to acquire for this job. + job_id: str + """ + handle_duplicate_notification(gcs_client, success_blob) + if lock_blob: + handle_bq_lock(gcs_client, lock_blob, job_id) + bkt = success_blob.bucket + dest_table_ref, _ = gcs_path_to_table_ref_and_batch(success_blob.name, + bq_client.project) + gsurl = removesuffix(f"gs://{bkt.name}/{success_blob.name}", + constants.SUCCESS_FILENAME) + print( + "looking for a transformation tranformation sql file in parent _config." + ) + external_query_sql = look_for_config_in_parents( + gcs_client, f"gs://{bkt.name}/{success_blob.name}", '*.sql') + + if external_query_sql: + print("EXTERNAL QUERY") + print(f"found external query:\n{external_query_sql}") + external_query(gcs_client, bq_client, gsurl, external_query_sql, + dest_table_ref, job_id) + return + + print("LOAD_JOB") + load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id) + return diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py index 32316593e..5b536ff25 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/main.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC. +# Copyright 2021 Google LLC. # This software is provided as-is, without warranty or representation # for any use or purpose. # Your use of it is subject to your agreement with Google. @@ -16,545 +16,190 @@ # limitations under the License. """Background Cloud Function for loading data from GCS to BigQuery. 
""" -import collections -import json +import distutils.util import os -import pathlib -import re import time -from typing import Any, Deque, Dict, List, Optional, Tuple +import traceback +from typing import Dict, Optional -import cachetools -import google.api_core.client_info -import google.api_core.exceptions -import google.cloud.exceptions -from google.cloud import bigquery, storage +# pylint in cloud build is being flaky about this import discovery. +# pylint: disable=no-name-in-module +from google.cloud import bigquery +from google.cloud import error_reporting +from google.cloud import storage -# https://cloud.google.com/bigquery/quotas#load_jobs -# 15TB per BQ load job (soft limit). -DEFAULT_MAX_BATCH_BYTES = str(15 * 10**12) -# 10,000 GCS URIs per BQ load job. -MAX_SOURCE_URIS_PER_LOAD = 10**4 +try: + from common import constants + from common import exceptions + from common import ordering + from common import utils +except ModuleNotFoundError: + from .common import constants + from .common import exceptions + from .common import ordering + from .common import utils -DEFAULT_EXTERNAL_TABLE_DEFINITION = { - "sourceFormat": "CSV", -} +# Reuse GCP Clients across function invocations using globbals +# https://cloud.google.com/functions/docs/bestpractices/tips#use_global_variables_to_reuse_objects_in_future_invocations +# pylint: disable=global-statement -DEFAULT_JOB_LABELS = { - "component": "event-based-gcs-ingest", - "cloud-function-name": os.getenv("FUNCTION_NAME"), -} +ERROR_REPORTING_CLIENT = None -BASE_LOAD_JOB_CONFIG = { - "sourceFormat": "CSV", - "fieldDelimiter": ",", - "writeDisposition": "WRITE_APPEND", - "labels": DEFAULT_JOB_LABELS, -} +BQ_CLIENT = None -# yapf: disable -DEFAULT_DESTINATION_REGEX = ( - r"^(?P[\w\-\._0-9]+)/" # dataset (required) - r"(?P
[\w\-_0-9]+)/?" # table name (required) - r"(?P\$[0-9]+)?/?" # partition decorator (optional) - r"(?P[0-9]{4})?/?" # partition year (yyyy) (optional) - r"(?P[0-9]{2})?/?" # partition month (mm) (optional) - r"(?P
[0-9]{2})?/?" # partition day (dd) (optional) - r"(?P[0-9]{2})?/?" # partition hour (hh) (optional) - r"(?P[\w\-_0-9]+)?/" # batch id (optional) -) -# yapf: enable - -# Will wait up to this polling for errors before exiting -# This is to check if job fail quickly, not to assert it succeed. -# This may not be honored if longer than cloud function timeout. -# https://cloud.google.com/functions/docs/concepts/exec#timeout -# One might consider lowering this to 1-2 seconds to lower the -# upper bound of expected execution time to stay within the free tier. -# https://cloud.google.com/functions/pricing#free_tier -WAIT_FOR_JOB_SECONDS = int(os.getenv("WAIT_FOR_JOB_SECONDS", "5")) - -# Use caution when lowering the job polling rate. -# Keep in mind that many concurrent executions of this cloud function should not -# violate the 300 concurrent requests or 100 request per second. -# https://cloud.google.com/bigquery/quotas#all_api_requests -JOB_POLL_INTERVAL_SECONDS = 1 - -SUCCESS_FILENAME = os.getenv("SUCCESS_FILENAME", "_SUCCESS") - -CLIENT_INFO = google.api_core.client_info.ClientInfo( - user_agent="google-pso-tool/bq-severless-loader") - -DEFAULT_JOB_PREFIX = "gcf-ingest-" +GCS_CLIENT = None def main(event: Dict, context): # pylint: disable=unused-argument """entry point for background cloud function for event driven GCS to BigQuery ingest.""" - # pylint: disable=too-many-locals - # Set by Cloud Function Execution Environment - # https://cloud.google.com/functions/docs/env-var - destination_regex = os.getenv("DESTINATION_REGEX", - DEFAULT_DESTINATION_REGEX) - dest_re = re.compile(destination_regex) - - bucket_id, object_id = parse_notification(event) - - # Exit eagerly if not a success file. - # we can improve this with pub/sub message filtering once it supports - # a hasSuffix filter function (we can filter on hasSuffix successfile name) - # https://cloud.google.com/pubsub/docs/filtering - if not object_id.endswith(f"/{SUCCESS_FILENAME}"): - print( - f"No-op. 
This notification was not for a {SUCCESS_FILENAME} file.") - return - - prefix_to_load = removesuffix(object_id, SUCCESS_FILENAME) - gsurl = f"gs://{bucket_id}/{prefix_to_load}" - gcs_client = storage.Client(client_info=CLIENT_INFO) - project = gcs_client.project - bkt = cached_get_bucket(gcs_client, bucket_id) - success_blob: storage.Blob = bkt.blob(object_id) - handle_duplicate_notification(bkt, success_blob, gsurl) - - destination_match = dest_re.match(object_id) - if not destination_match: - raise RuntimeError(f"Object ID {object_id} did not match regex:" - f" {destination_regex}") - destination_details = destination_match.groupdict() try: - dataset = destination_details['dataset'] - table = destination_details['table'] - except KeyError: - raise RuntimeError( - f"Object ID {object_id} did not match dataset and table in regex:" - f" {destination_regex}") from KeyError - partition = destination_details.get('partition') - year, month, day, hour = ( - destination_details.get(key, "") for key in ('yyyy', 'mm', 'dd', 'hh')) - part_list = (year, month, day, hour) - if not partition and any(part_list): - partition = '$' + ''.join(part_list) - batch_id = destination_details.get('batch') - labels = DEFAULT_JOB_LABELS - labels["bucket"] = bucket_id - - if batch_id: - labels["batch-id"] = batch_id - - if partition: - dest_table_ref = bigquery.TableReference.from_string( - f"{dataset}.{table}{partition}", default_project=project) - else: - dest_table_ref = bigquery.TableReference.from_string( - f"{dataset}.{table}", default_project=project) - - default_query_config = bigquery.QueryJobConfig() - default_query_config.use_legacy_sql = False - default_query_config.labels = labels - bq_client = bigquery.Client( - client_info=CLIENT_INFO, - default_query_job_config=default_query_config) - - print(f"looking for {gsurl}_config/bq_transform.sql") - external_query_sql = read_gcs_file_if_exists( - gcs_client, f"{gsurl}_config/bq_transform.sql") - print(f"external_query_sql = 
{external_query_sql}") - if not external_query_sql: - external_query_sql = look_for_transform_sql(gcs_client, gsurl) - if external_query_sql: - print("EXTERNAL QUERY") - external_query(gcs_client, bq_client, gsurl, external_query_sql, - dest_table_ref, - create_job_id_prefix(dest_table_ref, batch_id)) - return - - print("LOAD_JOB") - load_batches(gcs_client, bq_client, gsurl, dest_table_ref, - create_job_id_prefix(dest_table_ref, batch_id)) - - -def create_job_id_prefix(dest_table_ref: bigquery.TableReference, - batch_id: Optional[str]): - """Create job id prefix with a consistent naming convention. - The naming conventions is as follows: - gcf-ingest----- - Parts that are not inferrable from the GCS path with have a 'None' - placeholder. This naming convention is crucial for monitoring the system. - Note, gcf-ingest- can be overridden with environment variable JOB_PREFIX - - Examples: - - Non-partitioned Non batched tables: - - gs://${BUCKET}/tpch/lineitem/_SUCCESS - - gcf-ingest-tpch-lineitem-None-None- - Non-partitioned batched tables: - - gs://${BUCKET}/tpch/lineitem/batch000/_SUCCESS - - gcf-ingest-tpch-lineitem-None-batch000- - Partitioned Batched tables: - - gs://${BUCKET}/tpch/lineitem/$20201031/batch000/_SUCCESS - - gcf-ingest-tpch-lineitem-20201031-batch000- - """ - table_partition = dest_table_ref.table_id.split("$") - if len(table_partition) < 2: - # If there is no partition put a None placeholder - table_partition.append("None") - return f"{os.getenv('JOB_PREFIX', DEFAULT_JOB_PREFIX)}" \ - f"{dest_table_ref.dataset_id}-" \ - f"{'-'.join(table_partition)}-" \ - f"{batch_id}-" - - -def external_query( # pylint: disable=too-many-arguments - gcs_client: storage.Client, bq_client: bigquery.Client, gsurl: str, - query: str, dest_table_ref: bigquery.TableReference, - job_id_prefix: str): - """Load from query over external table from GCS. 
- - This hinges on a SQL query defined in GCS at _config/bq_transform.sql and - an external table definition _config/external.json (otherwise will assume - CSV external table) - """ - external_table_config = read_gcs_file_if_exists( - gcs_client, f"{gsurl}_config/external.json") - if external_table_config: - external_table_def = json.loads(external_table_config) - else: - print(f"Falling back to default CSV external table." - f" {gsurl}/_config/external.json not found.") - external_table_def = DEFAULT_EXTERNAL_TABLE_DEFINITION - - external_table_def["sourceUris"] = flatten2dlist( - get_batches_for_prefix(gcs_client, gsurl)) - external_config = bigquery.ExternalConfig.from_api_repr(external_table_def) - job_config = bigquery.QueryJobConfig( - table_definitions={"temp_ext": external_config}, use_legacy_sql=False) - - # Note, dest_table might include a partition decorator. - rendered_query = query.format( - dest_dataset=dest_table_ref.dataset_id, - dest_table=dest_table_ref.table_id, - ) - - job: bigquery.QueryJob = bq_client.query( - rendered_query, - job_config=job_config, - job_id_prefix=job_id_prefix, - ) - - print(f"started asynchronous query job: {job.job_id}") - - start_poll_for_errors = time.monotonic() - # Check if job failed quickly - while time.monotonic() - start_poll_for_errors < WAIT_FOR_JOB_SECONDS: - job.reload() - if job.errors: - msg = f"query job {job.job_id} failed quickly: {job.errors}" - for err in job.errors: - # BQ gives confusing warning about missing dataset if the - # external query refers to the wrong external table name. - # In this case we can give the end user a little more context. - if "missing dataset" in err.get("message", ""): - raise RuntimeError( - "External queries must select from the external table " - "named 'temp_ext'. This error may be due to specifying" - "the wrong name for the external table. 
" + msg) - raise RuntimeError(msg) - time.sleep(JOB_POLL_INTERVAL_SECONDS) - - -def flatten2dlist(arr: List[List[Any]]) -> List[Any]: - """Flatten list of lists to flat list of elements""" - return [j for i in arr for j in i] - - -def load_batches(gcs_client, bq_client, gsurl, dest_table_ref, job_id_prefix): - """orchestrate 1 or more load jobs based on number of URIs and total byte - size of objects at gsurl""" - batches = get_batches_for_prefix(gcs_client, gsurl) - load_config = construct_load_job_config(gcs_client, gsurl) - load_config.labels = DEFAULT_JOB_LABELS - batch_count = len(batches) - - jobs: List[bigquery.LoadJob] = [] - for batch_num, batch in enumerate(batches): - print(load_config.to_api_repr()) - job: bigquery.LoadJob = bq_client.load_table_from_uri( - batch, - dest_table_ref, - job_config=load_config, - job_id_prefix=f"{job_id_prefix}{batch_num}-of-{batch_count}-", - ) - - print(f"started asyncronous bigquery load job with id: {job.job_id} for" - f" {gsurl}") - jobs.append(job) - - start_poll_for_errors = time.monotonic() - # Check if job failed quickly - while time.monotonic() - start_poll_for_errors < WAIT_FOR_JOB_SECONDS: - # Check if job failed quickly - for job in jobs: - job.reload() - if job.errors: + function_start_time = time.monotonic() + # pylint: disable=too-many-locals + + bucket_id, object_id = utils.parse_notification(event) + + basename_object_id = os.path.basename(object_id) + + # Exit eagerly if this is not a file to take action on + # (e.g. a data, config, or lock file) + if basename_object_id not in constants.ACTION_FILENAMES: + action_filenames = constants.ACTION_FILENAMES + if constants.START_BACKFILL_FILENAME is None: + action_filenames.remove(None) + print(f"No-op. 
This notification was not for a " + f"{action_filenames} file.") + return + + gcs_client = lazy_gcs_client() + bq_client = lazy_bq_client() + + enforce_ordering = (constants.ORDER_PER_TABLE + or utils.look_for_config_in_parents( + gcs_client, f"gs://{bucket_id}/{object_id}", + "ORDERME") is not None) + + bkt: storage.Bucket = utils.cached_get_bucket(gcs_client, bucket_id) + event_blob: storage.Blob = bkt.blob(object_id) + + triage_event(gcs_client, bq_client, event_blob, function_start_time, + enforce_ordering) + + # Unexpected exceptions will actually raise which may cause a cold restart. + except exceptions.DuplicateNotificationException: + print("recieved duplicate notification. this was handled gracefully.\n " + f"{traceback.format_exc()}") + + except exceptions.EXCEPTIONS_TO_REPORT as original_error: + # We do this because we know these errors do not require a cold restart + # of the cloud function. + if (distutils.util.strtobool( + os.getenv("USE_ERROR_REPORTING_API", "True"))): + try: + lazy_error_reporting_client().report_exception() + except Exception: # pylint: disable=broad-except + # This mostly handles the case where error reporting API is not + # enabled or IAM permissions did not allow us to report errors + # with error reporting API. + raise original_error # pylint: disable=raise-missing-from + else: + raise original_error + + +def triage_event(gcs_client: Optional[storage.Client], + bq_client: Optional[bigquery.Client], + event_blob: storage.Blob, + function_start_time: float, + enforce_ordering: bool = False): + """call the appropriate method based on the details of the trigger event + blob.""" + bkt = event_blob.bucket + basename_object_id = os.path.basename(event_blob.name) + + # pylint: disable=no-else-raise + if enforce_ordering: + # For SUCCESS files in a backlog directory, ensure that subscriber + # is running. 
+ if (basename_object_id == constants.SUCCESS_FILENAME + and "/_backlog/" in event_blob.name): + print(f"This notification was for " + f"gs://{bkt.name}/{event_blob.name} a " + f"{constants.SUCCESS_FILENAME} in a " + "/_backlog/ directory. " + f"Watiting {constants.ENSURE_SUBSCRIBER_SECONDS} seconds to " + "ensure that subscriber is running.") + ordering.subscriber_monitor(gcs_client, bkt, event_blob.name) + return + if (constants.START_BACKFILL_FILENAME + and basename_object_id == constants.START_BACKFILL_FILENAME): + print(f"notification for gs://{event_blob.bucket.name}/" + f"{event_blob.name}") + # This will be the first backfill file. + ordering.start_backfill_subscriber_if_not_running( + gcs_client, bkt, utils.get_table_prefix(event_blob.name)) + return + if basename_object_id == constants.SUCCESS_FILENAME: + ordering.backlog_publisher(gcs_client, event_blob) + return + if basename_object_id == constants.BACKFILL_FILENAME: + if (event_blob.name != f"{utils.get_table_prefix(event_blob.name)}/" + f"{constants.BACKFILL_FILENAME}"): raise RuntimeError( - f"load job {job.job_id} failed quickly: {job.errors}") - time.sleep(JOB_POLL_INTERVAL_SECONDS) - - -def handle_duplicate_notification(bkt: storage.Bucket, - success_blob: storage.Blob, gsurl: str): - """ - Need to handle potential duplicate Pub/Sub notifications. - To achieve this we will drop an empty "claimed" file that indicates - an invocation of this cloud function has picked up the success file - with a certain creation timestamp. This will support republishing the - success file as a mechanism of re-running the ingestion while avoiding - duplicate ingestion due to multiple Pub/Sub messages for a success file - with the same creation time. 
- """ - success_blob.reload() - success_created_unix_timestamp = success_blob.time_created.timestamp() - - claim_blob: storage.Blob = bkt.blob( - success_blob.name.replace( - SUCCESS_FILENAME, - f"_claimed_{success_created_unix_timestamp}") - ) - try: - claim_blob.upload_from_string("", if_generation_match=0) - except google.api_core.exceptions.PreconditionFailed as err: - raise RuntimeError( - f"The prefix {gsurl} appears to already have been claimed for " - f"{gsurl}{SUCCESS_FILENAME} with created timestamp" - f"{success_created_unix_timestamp}." - "This means that another invocation of this cloud function has" - "claimed the ingestion of this batch." - "This may be due to a rare duplicate delivery of the Pub/Sub " - "storage notification.") from err - - -def _get_parent_config_file(storage_client, config_filename, bucket, path): - config_dir_name = "_config" - parent_path = pathlib.Path(path).parent - config_path = parent_path / config_dir_name / config_filename - return read_gcs_file_if_exists(storage_client, - f"gs://{bucket}/{config_path}") - - -def look_for_transform_sql(storage_client: storage.Client, - gsurl: str) -> Optional[str]: - """look in parent directories for _config/bq_transform.sql""" - config_filename = "bq_transform.sql" - blob: storage.Blob = storage.Blob.from_string(gsurl) - bucket_name = blob.bucket.name - obj_path = blob.name - parts = removesuffix(obj_path, "/").split("/") - - def _get_parent_query(path): - return _get_parent_config_file(storage_client, config_filename, - bucket_name, path) - - config = None - while parts: - if config: - return config - config = _get_parent_query("/".join(parts)) - parts.pop() - return config - - -def construct_load_job_config(storage_client: storage.Client, - gsurl: str) -> bigquery.LoadJobConfig: - """ - merge dictionaries for loadjob.json configs in parent directories. - The configs closest to gsurl should take precedence. 
+ f"recieved notification for gs://{event_blob.bucket.name}/" + f"{event_blob.name}\n" + f"{constants.BACKFILL_FILENAME} files " + "are expected only at the table prefix level.") + ordering.backlog_subscriber(gcs_client, bq_client, event_blob, + function_start_time) + return + raise RuntimeError(f"gs://{event_blob.bucket.name}/" + f"{event_blob.name} could not be triaged.") + else: # Default behavior submit job as soon as success file lands. + if basename_object_id == constants.SUCCESS_FILENAME: + utils.apply( + gcs_client, + bq_client, + event_blob, + None, # no lock blob when ordering not enabled. + utils.create_job_id(event_blob.name)) + + +def lazy_error_reporting_client() -> error_reporting.Client: """ - config_filename = "load.json" - blob: storage.Blob = storage.Blob.from_string(gsurl) - bucket_name = blob.bucket.name - obj_path = blob.name - parts = removesuffix(obj_path, "/").split("/") - - def _get_parent_config(path): - return _get_parent_config_file(storage_client, config_filename, - bucket_name, path) - - config_q: Deque[Dict[str, Any]] = collections.deque() - config_q.append(BASE_LOAD_JOB_CONFIG) - while parts: - config = _get_parent_config("/".join(parts)) - if config: - config_q.append(json.loads(config)) - parts.pop() - - merged_config = dict() - while config_q: - merged_config.update(config_q.popleft()) - print(f"merged_config: {merged_config}") - return bigquery.LoadJobConfig.from_api_repr({"load": merged_config}) - + Return a error reporting client that may be shared between cloud function + invocations. -def get_batches_for_prefix(gcs_client: storage.Client, - prefix_path: str, - ignore_subprefix="_config/", - ignore_file=SUCCESS_FILENAME) -> List[List[str]]: + https://cloud.google.com/functions/docs/monitoring/error-reporting """ - This function creates batches of GCS uris for a given prefix. - This prefix could be a table prefix or a partition prefix inside a - table prefix. 
- returns an Array of their batches - (one batch has an array of multiple GCS uris) - """ - batches = [] - blob: storage.Blob = storage.Blob.from_string(prefix_path) - bucket_name = blob.bucket.name - prefix_name = blob.name - - prefix_filter = f"{prefix_name}" - bucket = cached_get_bucket(gcs_client, bucket_name) - blobs = list(bucket.list_blobs(prefix=prefix_filter, delimiter="/")) - - cumulative_bytes = 0 - max_batch_size = int(os.getenv("MAX_BATCH_BYTES", DEFAULT_MAX_BATCH_BYTES)) - batch: List[str] = [] - for blob in blobs: - # API returns root prefix also. Which should be ignored. - # Similarly, the _SUCCESS file should be ignored. - # Finally, anything in the _config/ prefix should be ignored. - if (blob.name - not in {f"{prefix_name}/", f"{prefix_name}/{ignore_file}"} - or blob.name.startswith(f"{prefix_name}/{ignore_subprefix}")): - if blob.size == 0: # ignore empty files - print(f"ignoring empty file: gs://{bucket}/{blob.name}") - continue - cumulative_bytes += blob.size - - # keep adding until we reach threshold - if cumulative_bytes <= max_batch_size or len( - batch) > MAX_SOURCE_URIS_PER_LOAD: - batch.append(f"gs://{bucket_name}/{blob.name}") - else: - batches.append(batch.copy()) - batch.clear() - batch.append(f"gs://{bucket_name}/{blob.name}") - cumulative_bytes = blob.size - - # pick up remaining files in the final batch - if len(batch) > 0: - batches.append(batch.copy()) - batch.clear() + global ERROR_REPORTING_CLIENT + if not ERROR_REPORTING_CLIENT: + ERROR_REPORTING_CLIENT = error_reporting.Client() + return ERROR_REPORTING_CLIENT - if len(batches) > 1: - print(f"split into {len(batches)} load jobs.") - elif len(batches) == 1: - print("using single load job.") - else: - raise RuntimeError("No files to load!") - return batches - -def parse_notification(notification: dict) -> Tuple[str, str]: - """valdiates notification payload - Args: - notification(dict): Pub/Sub Storage Notification - https://cloud.google.com/storage/docs/pubsub-notifications - 
Or Cloud Functions direct trigger - https://cloud.google.com/functions/docs/tutorials/storage - with notification schema - https://cloud.google.com/storage/docs/json_api/v1/objects#resource - Returns: - tuple of bucketId and objectId attributes - Raises: - KeyError if the input notification does not contain the expected - attributes. +def lazy_bq_client() -> bigquery.Client: """ - if notification.get("kind") == "storage#object": - # notification is GCS Object reosource from Cloud Functions trigger - # https://cloud.google.com/storage/docs/json_api/v1/objects#resource - return notification["bucket"], notification["name"] - if notification.get("attributes"): - # notification is Pub/Sub message. - try: - attributes = notification["attributes"] - return attributes["bucketId"], attributes["objectId"] - except KeyError: - raise RuntimeError( - "Issue with Pub/Sub message, did not contain expected" - f"attributes: 'bucketId' and 'objectId': {notification}" - ) from KeyError - raise RuntimeError( - "Cloud Function recieved unexpected trigger:\n" - f"{notification}\n" - "This function only supports direct Cloud Functions" - "Background Triggers or Pub/Sub storage notificaitons" - "as described in the following links:\n" - "https://cloud.google.com/storage/docs/pubsub-notifications\n" - "https://cloud.google.com/functions/docs/tutorials/storage") - - -# cache lookups against GCS API for 1 second as buckets / objects have update -# limit of once per second and we might do several of the same lookup during -# the functions lifetime. This should improve performance by eliminating -# unnecessary API calls. The lookups on bucket and objects in this function -# should not be changing during the function's lifetime as this would lead to -# non-deterministic results with or without this cache. 
-# https://cloud.google.com/storage/quotas -@cachetools.cached(cachetools.TTLCache(maxsize=1024, ttl=1)) -def read_gcs_file(gcs_client: storage.Client, gsurl: str) -> str: + Return a BigQuery Client that may be shared between cloud function + invocations. """ - Read a GCS object as a string - - Args: - gcs_client: GCS client - gsurl: GCS URI for object to read in gs://bucket/path/to/object format - Returns: - str + global BQ_CLIENT + if not BQ_CLIENT: + default_query_config = bigquery.QueryJobConfig() + default_query_config.use_legacy_sql = False + default_query_config.labels = constants.DEFAULT_JOB_LABELS + BQ_CLIENT = bigquery.Client( + client_info=constants.CLIENT_INFO, + default_query_job_config=default_query_config, + project=os.getenv("BQ_PROJECT", os.getenv("GCP_PROJECT"))) + return BQ_CLIENT + + +def lazy_gcs_client() -> storage.Client: """ - blob = storage.Blob.from_string(gsurl) - return blob.download_as_bytes(client=gcs_client).decode('UTF-8') - - -def read_gcs_file_if_exists(gcs_client: storage.Client, - gsurl: str) -> Optional[str]: - """return string of gcs object contents or None if the object does not exist + Return a BigQuery Client that may be shared between cloud function + invocations. """ - try: - return read_gcs_file(gcs_client, gsurl) - except google.cloud.exceptions.NotFound: - return None - - -# Cache bucket lookups (see reasoning in comment above) -@cachetools.cached(cachetools.TTLCache(maxsize=1024, ttl=1)) -def cached_get_bucket( - gcs_client: storage.Client, - bucket_id: str, -) -> storage.Bucket: - """get storage.Bucket object by bucket_id string if exists or raise - google.cloud.exceptions.NotFound.""" - return gcs_client.get_bucket(bucket_id) - - -def dict_to_bq_schema(schema: List[Dict]) -> List[bigquery.SchemaField]: - """Converts a list of dicts to list of bigquery.SchemaField for use with - bigquery client library. Dicts must contain name and type keys. 
- The dict may optionally contain a mode key.""" - default_mode = "NULLABLE" - return [ - bigquery.SchemaField( - x["name"], - x["type"], - mode=x.get("mode") if x.get("mode") else default_mode) - for x in schema - ] - - -# To be added to built in str in python 3.9 -# https://www.python.org/dev/peps/pep-0616/ -def removesuffix(in_str: str, suffix: str) -> str: - """removes suffix from a string.""" - # suffix='' should not call self[:-0]. - if suffix and in_str.endswith(suffix): - return in_str[:-len(suffix)] - return in_str[:] + global GCS_CLIENT + if not GCS_CLIENT: + GCS_CLIENT = storage.Client(client_info=constants.CLIENT_INFO) + return GCS_CLIENT diff --git a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/requirements.txt b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/requirements.txt index ccba892ee..f2112fdcc 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/requirements.txt +++ b/tools/cloud_functions/gcs_event_based_ingest/gcs_ocn_bq_ingest/requirements.txt @@ -1,2 +1,3 @@ -google-cloud-bigquery==2.2.0 -google-cloud-storage==1.32.0 +google-cloud-bigquery==2.6.0 +google-cloud-storage==1.33.0 +google-cloud-error-reporting==1.1.0 diff --git a/tools/cloud_functions/gcs_event_based_ingest/img/ordering.png b/tools/cloud_functions/gcs_event_based_ingest/img/ordering.png new file mode 100644 index 000000000..0361ac97c Binary files /dev/null and b/tools/cloud_functions/gcs_event_based_ingest/img/ordering.png differ diff --git a/tools/cloud_functions/gcs_event_based_ingest/pytest.ini b/tools/cloud_functions/gcs_event_based_ingest/pytest.ini index 990ea2ca2..bf550fdcf 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/pytest.ini +++ b/tools/cloud_functions/gcs_event_based_ingest/pytest.ini @@ -1,5 +1,14 @@ [pytest] +log_format = %(asctime)s %(levelname)s %(message)s +log_date_format = %Y-%m-%d %H:%M:%S +log_file_format = %(asctime)s %(levelname)s %(message)s +log_file_date_format = %Y-%m-%d %H:%M:%S 
+log_file_level = INFO +log_file = test.log markers = IT: marks tests as slow integration test requiring cloud resouces (deselect with '-m "not IT"') + SYS: marks tests as slow system or e2e test requiring cloud resources (deselect with '-m "not SYS"') + ORDERING: marks tests that test features related to ordering CLI: marks tests of CLI utilities addopts = --workers=auto + diff --git a/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt b/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt index 7682e7da0..2fe24ea9a 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt +++ b/tools/cloud_functions/gcs_event_based_ingest/requirements-dev.txt @@ -7,4 +7,6 @@ mypy pylint pytest-parallel pytest-cov -google-cloud-pubsub +google-cloud-pubsub>=2.2.0 +pytest-mock +pytest-repeat diff --git a/tools/cloud_functions/gcs_event_based_ingest/requirements.txt b/tools/cloud_functions/gcs_event_based_ingest/requirements.txt index c65fa4df4..b715db130 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/requirements.txt +++ b/tools/cloud_functions/gcs_event_based_ingest/requirements.txt @@ -1,3 +1,4 @@ -google-cloud-bigquery>=2.2.0 -google-cloud-storage>=1.32.0 +google-cloud-bigquery>=2.6.1 +google-cloud-storage>=1.34.0 +google-cloud-error-reporting>=1.1.0 cachetools diff --git a/tools/cloud_functions/gcs_event_based_ingest/scripts/install_terraform.sh b/tools/cloud_functions/gcs_event_based_ingest/scripts/install_terraform.sh new file mode 100755 index 000000000..4c1cd6f50 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/scripts/install_terraform.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# Copyright 2021 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# This software is provided as-is, +# without warranty or representation for any use or purpose. +# Your use of it is subject to your agreement with Google. +set -eao pipefail + +TERRAFORM_VERSION="0.14.2" +TERRAFORM_BASE_URL="https://releases.hashicorp.com/terraform" +TERRAFORM_ZIP="terraform_${TERRAFORM_VERSION}_$(uname | tr '[:upper:]' '[:lower:]')_amd64.zip" +echo "Downloading from ${TERRAFORM_BASE_URL}/${TERRAFORM_VERSION}/${TERRAFORM_ZIP}" +curl -Lo /tmp/terraform.zip "${TERRAFORM_BASE_URL}/${TERRAFORM_VERSION}/${TERRAFORM_ZIP}" +sudo unzip /tmp/terraform.zip -d /bin diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md index 1e42b1966..d4ea6dbd1 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/README.md @@ -6,12 +6,11 @@ for event based ingest of GCS data to BigQuery described [here](../README.md). 
Note that by default all environment variables for the cloud function will be empty deferring to the defaults implemented in the function and documented [here](../gcs_ocn_bq_ingest_function/README.md) - ## Requirements | Name | Version | |------|---------| -| terraform | >= 0.12 | +| terraform | >= 0.13 | | archive | ~> 2.0.0 | | google | >= 3.38.0 | | template | ~> 2.2.0 | @@ -28,23 +27,24 @@ documented [here](../gcs_ocn_bq_ingest_function/README.md) | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | app\_id | Application Name | `any` | n/a | yes | +| bigquery\_project\_ids | Additional project IDs to grant bigquery Admin for the data ingester account | `list(string)` | `[]` | no | | cloudfunctions\_source\_bucket | GCS bucket to store Cloud Functions Source | `any` | n/a | yes | | data\_ingester\_sa | Service Account Email responsible for ingesting data to BigQuery | `any` | n/a | yes | -| destination\_regex | A [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for destination `dataset`, `table`, (optional: `partition`, `batch`) | `string` | `""` | no | +| environment\_variables | Environment variables to set on the cloud function. | `map(string)` | `{}` | no | +| force\_destroy | force destroy resources (e.g. 
for e2e tests) | `string` | `"false"` | no | | function\_source\_folder | Path to Cloud Function source | `string` | `"../gcs_event_based_ingest/gcs_ocn_bq_ingest/"` | no | | input\_bucket | GCS bucket to watch for new files | `any` | n/a | yes | | input\_prefix | GCS prefix to watch for new files in input\_bucket | `any` | `null` | no | -| job\_prefix | Prefix for BigQuery Job IDs | `string` | `""` | no | -| max\_batch\_bytes | Max bytes for BigQuery Load job | `string` | `""` | no | -| project\_id | GCP Project ID | `any` | n/a | yes | +| project\_id | GCP Project ID containing cloud function, and input bucket | `any` | n/a | yes | | region | GCP region in which to deploy cloud function | `string` | `"us-central1"` | no | -| success\_filename | Filename to trigger a load of a prefix | `string` | `""` | no | +| timeout | Cloud Functions timeout in seconds | `number` | `540` | no | | use\_pubsub\_notifications | Setting this to true will use Pub/Sub notifications By default we will use Cloud Functions Event direct notifications. See https://cloud.google.com/storage/docs/pubsub-notifications. | `bool` | `false` | no | -| wait\_for\_job\_seconds | How long to wait before deciding BQ job did not fail quickly | `string` | `""` | no | ## Outputs | Name | Description | |------|-------------| | cloud-function | instance of cloud function deployed by this module. 
| +| data-ingester-sa | data ingester service account email created as cloud function identity | +| input-bucket | n/a | diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf index 204d9bb42..9899db2d1 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/main.tf @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,19 +23,22 @@ resource "google_pubsub_topic" "notification_topic" { } module "bucket" { - source = "terraform-google-modules/cloud-storage/google//modules/simple_bucket" - version = "~> 1.3" + depends_on = [module.data_ingester_service_account] + source = "terraform-google-modules/cloud-storage/google//modules/simple_bucket" + version = "~> 1.3" - name = var.input_bucket - project_id = var.project_id - location = var.region + name = var.input_bucket + project_id = var.project_id + location = var.region + force_destroy = var.force_destroy iam_members = [{ role = "roles/storage.objectAdmin" - member = module.data_ingester_service_account.iam_email + member = "serviceAccount:${var.data_ingester_sa}@${var.project_id}.iam.gserviceaccount.com" }] } resource "google_storage_notification" "notification" { + depends_on = [google_pubsub_topic_iam_binding.gcs_publisher] count = var.use_pubsub_notifications ? 
1 : 0 bucket = module.bucket.bucket object_name_prefix = var.input_prefix @@ -58,26 +61,28 @@ resource "google_storage_bucket_object" "function_zip_object" { content_type = "application/zip" } +locals { + function_name = "gcs_to_bq_${var.app_id}" +} resource "google_cloudfunctions_function" "gcs_to_bq" { + depends_on = [google_storage_bucket_object.function_zip_object] project = var.project_id - name = "gcs_to_bq_${var.app_id}" + name = local.function_name region = var.region runtime = "python38" - timeout = 9 * 60 # seconds - service_account_email = var.data_ingester_sa + timeout = var.timeout + service_account_email = module.data_ingester_service_account.email source_archive_bucket = var.cloudfunctions_source_bucket source_archive_object = google_storage_bucket_object.function_zip_object.name entry_point = "main" - environment_variables = { - WAIT_FOR_JOB_SECONDS = var.wait_for_job_seconds - SUCCESS_FILENAME = var.success_filename - DESTINATION_REGEX = var.destination_regex - MAX_BATCH_BYTES = var.max_batch_bytes - JOB_PREFIX = var.job_prefix - } + environment_variables = merge(var.environment_variables, { + GCP_PROJECT = var.project_id, + FUNCTION_TIMEOUT_SEC = var.timeout + FUNCTION_NAME = local.function_name + }) event_trigger { event_type = var.use_pubsub_notifications ? "providers/cloud.pubsub/eventTypes/topic.publish" : "google.storage.object.finalize" - resource = var.use_pubsub_notifications ? google_pubsub_topic.notification_topic[0].id : module.bucket.name + resource = var.use_pubsub_notifications ? 
"projects/${var.project_id}/${google_pubsub_topic.notification_topic[0].id}" : module.bucket.bucket.name } } @@ -88,10 +93,19 @@ module "data_ingester_service_account" { names = [var.data_ingester_sa, ] project_roles = [ "${var.project_id}=>roles/bigquery.jobUser", - "${var.project_id}=>roles/bigquery.dataEditor", + "${var.project_id}=>roles/storage.admin", ] } +# Grant the ingester service account permissions to mutate data in +# target project(s) +resource "google_project_iam_binding" "ingester_bq_admin" { + for_each = toset(concat(var.bigquery_project_ids, [var.project_id])) + project = each.key + members = [module.data_ingester_service_account.iam_email] + role = "roles/bigquery.dataEditor" +} + # Allow the GCS service account to publish notification for new objects to the # notification topic. resource "google_pubsub_topic_iam_binding" "gcs_publisher" { @@ -109,3 +123,21 @@ resource "google_pubsub_topic_iam_binding" "cf_subscriber" { members = [module.data_ingester_service_account.iam_email] } +module "project-services" { + source = "terraform-google-modules/project-factory/google//modules/project_services" + version = "4.0.0" + + project_id = var.project_id + disable_services_on_destroy = "false" + + activate_apis = [ + "compute.googleapis.com", + "iam.googleapis.com", + "bigquery.googleapis.com", + "storage.googleapis.com", + "pubsub.googleapis.com", + "clouderrorreporting.googleapis.com", + "cloudresourcemanager.googleapis.com", + "cloudfunctions.googleapis.com", + ] +} diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf index 8ba2f4025..5ad0d2b9b 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/outputs.tf @@ -1,4 +1,4 @@ -# Copyright 2020 
Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,3 +16,12 @@ output "cloud-function" { value = google_cloudfunctions_function.gcs_to_bq } +output "data-ingester-sa" { + description = "data ingester service account email created as cloud function identity" + value = module.data_ingester_service_account.email +} + +output "input-bucket" { + value = module.bucket.bucket.name +} + diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf index cd5e162bd..78b1a1991 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/variables.tf @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
variable "project_id" { - description = "GCP Project ID" + description = "GCP Project ID containing cloud function, and input bucket" } variable "app_id" { @@ -36,27 +36,12 @@ variable "data_ingester_sa" { description = "Service Account Email responsible for ingesting data to BigQuery" } -variable "wait_for_job_seconds" { - description = "How long to wait before deciding BQ job did not fail quickly" - default = "" -} -variable "success_filename" { - description = "Filename to trigger a load of a prefix" - default = "" -} -variable "destination_regex" { - description = "A [Python Regex with named capturing groups](https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups) for destination `dataset`, `table`, (optional: `partition`, `batch`)" - default = "" -} -variable "max_batch_bytes" { - description = "Max bytes for BigQuery Load job" - default = "" +variable "environment_variables" { + description = "Environment variables to set on the cloud function." + type = map(string) + default = {} } -variable "job_prefix" { - description = "Prefix for BigQuery Job IDs " - default = "" -} variable "region" { description = "GCP region in which to deploy cloud function" @@ -74,3 +59,18 @@ variable "use_pubsub_notifications" { default = false } +variable "bigquery_project_ids" { + description = "Additional project IDs to grant bigquery Admin for the data ingester account" + type = list(string) + default = [] +} + +variable "force_destroy" { + description = "force destroy resources (e.g. 
for e2e tests)" + default = "false" +} + +variable "timeout" { + description = "Cloud Functions timeout in seconds" + default = 540 +} diff --git a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/versions.tf b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/versions.tf index 68daa41d7..3085198f2 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/versions.tf +++ b/tools/cloud_functions/gcs_event_based_ingest/terraform_module/gcs_ocn_bq_ingest_function/versions.tf @@ -1,5 +1,5 @@ /** - * Copyright 2020 Google LLC + * Copyright 2021 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ */ terraform { - required_version = ">= 0.12" + required_version = ">= 0.13" required_providers { google = ">= 3.38.0" diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/__init__.py b/tools/cloud_functions/gcs_event_based_ingest/tests/__init__.py deleted file mode 100644 index 3deceee10..000000000 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2020 Google LLC. -# This software is provided as-is, without warranty or representation -# for any use or purpose. -# Your use of it is subject to your agreement with Google. - -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import sys - -sys.path.append(os.path.realpath(os.path.dirname(__file__) + "/..")) diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/cli/test_backfill.py b/tools/cloud_functions/gcs_event_based_ingest/tests/cli/test_backfill.py index 5e9c20cb1..ac3419706 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/cli/test_backfill.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/cli/test_backfill.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/cli/test_external_query.py b/tools/cloud_functions/gcs_event_based_ingest/tests/cli/test_external_query.py new file mode 100644 index 000000000..8339724bc --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/cli/test_external_query.py @@ -0,0 +1,93 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""integration tests for gcs_ocn_bq_ingest""" +import json +import os + +import external_query +import pytest + +TEST_DIR = os.path.realpath(os.path.dirname(__file__) + "/..") + + +@pytest.mark.IT +@pytest.mark.CLI +def test_dry_run_external(tmp_path): +    """ +    Test basic functionality of dry running and external query. 
+ """ + query_path = tmp_path / "test.sql" + query_path.write_text("SELECT * FROM temp_ext") + + with open(os.path.join(TEST_DIR, "resources", + "nation_schema.json")) as schema: + fields = json.load(schema) + config = { + "schema": { + "fields": fields + }, + "csvOptions": { + "allowJaggedRows": False, + "allowQuotedNewlines": False, + "encoding": "UTF-8", + "fieldDelimiter": "|", + "skipLeadingRows": 0, + }, + "sourceFormat": "CSV", + } + external_path = tmp_path / "external.json" + external_path.write_text(json.dumps(config)) + + args = external_query.parse_args( + [f"-q={query_path}", f"-e={external_path}", "--dry-run"]) + external_query.main(args) + + +@pytest.mark.IT +@pytest.mark.CLI +def test_failed_dry_run_external(tmp_path): + """ + Test failed dry run. + """ + query_path = tmp_path / "test.sql" + # foo is not in the nation_schema + query_path.write_text("SELECT foo FROM temp_ext") + + with open(os.path.join(TEST_DIR, "resources", + "nation_schema.json")) as schema: + fields = json.load(schema) + config = { + "schema": { + "fields": fields + }, + "csvOptions": { + "allowJaggedRows": False, + "allowQuotedNewlines": False, + "encoding": "UTF-8", + "fieldDelimiter": "|", + "skipLeadingRows": 0, + }, + "sourceFormat": "CSV", + } + external_path = tmp_path / "external.json" + external_path.write_text(json.dumps(config)) + + args = external_query.parse_args( + [f"-q={query_path}", f"-e={external_path}", "--dry-run"]) + raised = False + try: + external_query.main(args) + except Exception: + raised = True + assert raised diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py index c0ae3f8ab..f1400ffc4 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the 
"License"); # you may not use this file except in compliance with the License. @@ -19,9 +19,12 @@ from typing import List import pytest -from google.cloud import bigquery, storage +from google.cloud import bigquery +from google.cloud import error_reporting +from google.cloud import storage -import gcs_ocn_bq_ingest.main +import gcs_ocn_bq_ingest.common.ordering +import gcs_ocn_bq_ingest.common.utils TEST_DIR = os.path.realpath(os.path.dirname(__file__)) LOAD_JOB_POLLING_TIMEOUT = 10 # seconds @@ -39,7 +42,12 @@ def gcs() -> storage.Client: return storage.Client() -@pytest.mark.usefixtures("gcs") +@pytest.fixture(scope="module") +def error() -> error_reporting.Client: + """GCS Client""" + return error_reporting.Client() + + @pytest.fixture def gcs_bucket(request, gcs) -> storage.bucket.Bucket: """GCS bucket for test artifacts""" @@ -60,19 +68,26 @@ def teardown(): return bucket -@pytest.mark.usefixtures("gcs_bucket") @pytest.fixture def mock_env(gcs, monkeypatch): """environment variable mocks""" # Infer project from ADC of gcs client. 
monkeypatch.setenv("GCP_PROJECT", gcs.project) monkeypatch.setenv("FUNCTION_NAME", "integration-test") + monkeypatch.setenv("FUNCTION_TIMEOUT_SEC", "540") + monkeypatch.setenv("BQ_PROJECT", gcs.project) + + +@pytest.fixture +def ordered_mock_env(mock_env, monkeypatch): + """environment variable mocks""" + monkeypatch.setenv("ORDER_PER_TABLE", "TRUE") -@pytest.mark.usefixtures("bq", "mock_env") @pytest.fixture def dest_dataset(request, bq, mock_env, monkeypatch): - random_dataset = f"test_bq_ingest_gcf_{str(uuid.uuid4())[:8].replace('-','_')}" + random_dataset = (f"test_bq_ingest_gcf_" + f"{str(uuid.uuid4())[:8].replace('-','_')}") dataset = bigquery.Dataset(f"{os.getenv('GCP_PROJECT')}" f".{random_dataset}") dataset.location = "US" @@ -88,16 +103,17 @@ def teardown(): return dataset -@pytest.mark.usefixtures("bq", "mock_env", "dest_dataset") @pytest.fixture def dest_table(request, bq, mock_env, dest_dataset) -> bigquery.Table: with open(os.path.join(TEST_DIR, "resources", "nation_schema.json")) as schema_file: - schema = gcs_ocn_bq_ingest.main.dict_to_bq_schema( + schema = gcs_ocn_bq_ingest.common.utils.dict_to_bq_schema( json.load(schema_file)) table = bigquery.Table( - f"{os.environ.get('GCP_PROJECT')}.{dest_dataset.dataset_id}.cf_test_nation", + f"{os.environ.get('GCP_PROJECT')}" + f".{dest_dataset.dataset_id}.cf_test_nation_" + f"{str(uuid.uuid4()).replace('-','_')}", schema=schema, ) @@ -111,7 +127,6 @@ def teardown(): @pytest.fixture(scope="function") -@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") def gcs_data(request, gcs_bucket, dest_dataset, dest_table) -> storage.blob.Blob: data_objs = [] @@ -135,7 +150,29 @@ def teardown(): @pytest.fixture(scope="function") -@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") +def gcs_data_under_sub_dirs(request, gcs_bucket, dest_dataset, + dest_table) -> storage.blob.Blob: + data_objs = [] + for test_file in ["part-m-00000", "part-m-00001", "_SUCCESS"]: + data_obj: 
storage.blob.Blob = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_table.table_id, "foo", "bar", "baz", test_file + ])) + data_obj.upload_from_filename( + os.path.join(TEST_DIR, "resources", "test-data", "nation", + test_file)) + data_objs.append(data_obj) + + def teardown(): + for do in data_objs: + if do.exists(): + do.delete() + + request.addfinalizer(teardown) + return data_objs[-1] + + +@pytest.fixture(scope="function") def gcs_truncating_load_config(request, gcs_bucket, dest_dataset, dest_table) -> storage.blob.Blob: config_obj: storage.blob.Blob = gcs_bucket.blob("/".join([ @@ -156,7 +193,6 @@ def teardown(): @pytest.fixture(scope="function") -@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") def gcs_batched_data(request, gcs_bucket, dest_dataset, dest_table) -> List[storage.blob.Blob]: """ @@ -175,20 +211,19 @@ def gcs_batched_data(request, gcs_bucket, dest_dataset, def teardown(): for do in data_objs: - if do.exists: + if do.exists(): do.delete() request.addfinalizer(teardown) return [data_objs[-1], data_objs[-4]] -@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") @pytest.fixture def gcs_external_config(request, gcs_bucket, dest_dataset, dest_table) -> List[storage.blob.Blob]: config_objs = [] sql_obj = gcs_bucket.blob("/".join([ - dest_dataset.dataset_id, + f"{dest_dataset.project}.{dest_dataset.dataset_id}", dest_table.table_id, "_config", "bq_transform.sql", @@ -198,7 +233,8 @@ def gcs_external_config(request, gcs_bucket, dest_dataset, sql_obj.upload_from_string(sql) config_obj = gcs_bucket.blob("/".join([ - dest_dataset.dataset_id, dest_table.table_id, "_config", "external.json" + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_table.table_id, "_config", "external.json" ])) with open(os.path.join(TEST_DIR, "resources", @@ -224,7 +260,7 @@ def gcs_external_config(request, gcs_bucket, dest_dataset, def teardown(): for do in config_objs: - if do.exists: + if 
do.exists(): do.delete() request.addfinalizer(teardown) @@ -232,7 +268,6 @@ def teardown(): @pytest.fixture(scope="function") -@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_parttioned_table") def gcs_partitioned_data(request, gcs_bucket, dest_dataset, dest_partitioned_table) -> List[storage.blob.Blob]: data_objs = [] @@ -249,7 +284,8 @@ def gcs_partitioned_data(request, gcs_bucket, dest_dataset, def teardown(): for dobj in data_objs: - if dobj.exists: + # we expect some backfill files to be removed by the cloud function. + if dobj.exists(): dobj.delete() request.addfinalizer(teardown) @@ -257,7 +293,6 @@ def teardown(): @pytest.fixture(scope="function") -@pytest.mark.usefixtures("gcs_bucket", "dest_dataset", "dest_table") def dest_partitioned_table(request, bq: bigquery.Client, mock_env, dest_dataset) -> bigquery.Table: public_table: bigquery.Table = bq.get_table( @@ -267,7 +302,8 @@ def dest_partitioned_table(request, bq: bigquery.Client, mock_env, table: bigquery.Table = bigquery.Table( f"{os.environ.get('GCP_PROJECT')}" - f".{dest_dataset.dataset_id}.cf_test_nyc_311", + f".{dest_dataset.dataset_id}.cf_test_nyc_311_" + f"{str(uuid.uuid4()).replace('-','_')}", schema=schema, ) @@ -310,3 +346,277 @@ def bq_wait_for_rows(bq_client: bigquery.Client, table: bigquery.Table, f"{table.project}.{table.dataset_id}.{table.table_id} to " f"reach {expected_num_rows} rows." 
f"last poll returned {actual_num_rows} rows.") + + +@pytest.fixture +def dest_ordered_update_table(request, gcs, gcs_bucket, bq, mock_env, + dest_dataset) -> bigquery.Table: + with open(os.path.join(TEST_DIR, "resources", + "ordering_schema.json")) as schema_file: + schema = gcs_ocn_bq_ingest.common.utils.dict_to_bq_schema( + json.load(schema_file)) + + table = bigquery.Table( + f"{os.environ.get('GCP_PROJECT')}.{dest_dataset.dataset_id}" + f".cf_test_ordering_{str(uuid.uuid4()).replace('-','_')}", + schema=schema, + ) + + table = bq.create_table(table) + + # Our test query only updates on a single row so we need to populate + # original row. + # This can be used to simulate an existing _bqlock from a prior run of the + # subscriber loop with a job that has succeeded. + job: bigquery.LoadJob = bq.load_table_from_json( + [{ + "id": 1, + "alpha_update": "" + }], + table, + job_id_prefix=gcs_ocn_bq_ingest.common.constants.DEFAULT_JOB_PREFIX) + + # The subscriber will be responsible for cleaning up this file. 
+ bqlock_obj: storage.blob.Blob = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", table.table_id, + "_bqlock" + ])) + + bqlock_obj.upload_from_string(job.job_id) + + def teardown(): + bq.delete_table(table, not_found_ok=True) + if bqlock_obj.exists(): + bqlock_obj.delete() + + request.addfinalizer(teardown) + return table + + +@pytest.fixture(scope="function") +def gcs_ordered_update_data( + request, gcs_bucket, dest_dataset, + dest_ordered_update_table) -> List[storage.blob.Blob]: + data_objs = [] + older_success_blob: storage.blob.Blob = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, "00", "_SUCCESS" + ])) + older_success_blob.upload_from_string("") + data_objs.append(older_success_blob) + + chunks = { + "01", + "02", + "03", + } + for chunk in chunks: + for test_file in ["data.csv", "_SUCCESS"]: + data_obj: storage.blob.Blob = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, chunk, test_file + ])) + data_obj.upload_from_filename( + os.path.join(TEST_DIR, "resources", "test-data", "ordering", + chunk, test_file)) + data_objs.append(data_obj) + + def teardown(): + for dobj in data_objs: + if dobj.exists(): + dobj.delete() + + request.addfinalizer(teardown) + return list(filter(lambda do: do.name.endswith("_SUCCESS"), data_objs)) + + +@pytest.fixture(scope="function") +def gcs_backlog(request, gcs, gcs_bucket, + gcs_ordered_update_data) -> List[storage.blob.Blob]: + data_objs = [] + + # We will deal with the last incremental in the test itself to test the + # behavior of a new backlog subscriber. 
+ for success_blob in gcs_ordered_update_data: + gcs_ocn_bq_ingest.common.ordering.backlog_publisher(gcs, success_blob) + backlog_blob = \ + gcs_ocn_bq_ingest.common.ordering.success_blob_to_backlog_blob( + success_blob + ) + backlog_blob.upload_from_string("") + data_objs.append(backlog_blob) + + def teardown(): + for dobj in data_objs: + if dobj.exists(): + dobj.delete() + + request.addfinalizer(teardown) + return list(filter(lambda do: do.name.endswith("_SUCCESS"), data_objs)) + + +@pytest.fixture +def gcs_external_update_config(request, gcs_bucket, dest_dataset, + dest_ordered_update_table) -> storage.Blob: + config_objs = [] + sql_obj = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, + "_config", + "bq_transform.sql", + ])) + + sql = """ + UPDATE {dest_dataset}.{dest_table} dest + SET alpha_update = CONCAT(dest.alpha_update, src.alpha_update) + FROM temp_ext src + WHERE dest.id = src.id + """ + sql_obj.upload_from_string(sql) + + config_obj = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, "_config", "external.json" + ])) + + with open(os.path.join(TEST_DIR, "resources", + "ordering_schema.json")) as schema: + fields = json.load(schema) + config = { + "schema": { + "fields": fields + }, + "csvOptions": { + "allowJaggedRows": False, + "allowQuotedNewlines": False, + "encoding": "UTF-8", + "fieldDelimiter": "|", + "skipLeadingRows": 0, + }, + "sourceFormat": "CSV", + "sourceUris": ["REPLACEME"], + } + config_obj.upload_from_string(json.dumps(config)) + backfill_blob = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, + gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME + ])) + backfill_blob.upload_from_string("") + config_objs.append(sql_obj) + config_objs.append(config_obj) + config_objs.append(backfill_blob) + + def teardown(): + for do in 
config_objs: + if do.exists(): + do.delete() + + request.addfinalizer(teardown) + return backfill_blob + + +@pytest.mark.usefixtures("bq", "gcs_bucket", "dest_dataset", + "dest_partitioned_table") +@pytest.fixture +def gcs_external_partitioned_config( + request, bq, gcs_bucket, dest_dataset, + dest_partitioned_table) -> List[storage.blob.Blob]: + config_objs = [] + sql_obj = gcs_bucket.blob("/".join([ + dest_dataset.dataset_id, + dest_partitioned_table.table_id, + "_config", + "bq_transform.sql", + ])) + + sql = "INSERT {dest_dataset}.{dest_table} SELECT * FROM temp_ext;" + sql_obj.upload_from_string(sql) + + config_obj = gcs_bucket.blob("/".join([ + dest_dataset.dataset_id, dest_partitioned_table.table_id, "_config", + "external.json" + ])) + + public_table: bigquery.Table = bq.get_table( + bigquery.TableReference.from_string( + "bigquery-public-data.new_york_311.311_service_requests")) + config = { + "schema": public_table.to_api_repr()['schema'], + "csvOptions": { + "allowJaggedRows": False, + "allowQuotedNewlines": False, + "encoding": "UTF-8", + "fieldDelimiter": "|", + "skipLeadingRows": 0, + }, + "sourceFormat": "CSV", + "sourceUris": ["REPLACEME"], + } + config_obj.upload_from_string(json.dumps(config)) + config_objs.append(sql_obj) + config_objs.append(config_obj) + + def teardown(): + for do in config_objs: + if do.exists(): + do.delete() + + request.addfinalizer(teardown) + return config_objs + + +@pytest.fixture +def no_use_error_reporting(monkeypatch): + monkeypatch.setenv("USE_ERROR_REPORTING_API", "False") + + +@pytest.fixture +def gcs_external_config_bad_statement( + request, gcs_bucket, dest_dataset, dest_table, + no_use_error_reporting) -> List[storage.blob.Blob]: + config_objs = [] + sql_obj = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_table.table_id, + "_config", + "bq_transform.sql", + ])) + + sql = ("INSERT {dest_dataset}.{dest_table} SELECT * FROM temp_ext;\n" + "INSERT {dest_dataset}.{dest_table} 
SELECT 1/0;") + sql_obj.upload_from_string(sql) + + config_obj = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_table.table_id, "_config", "external.json" + ])) + + with open(os.path.join(TEST_DIR, "resources", + "nation_schema.json")) as schema: + fields = json.load(schema) + config = { + "schema": { + "fields": fields + }, + "csvOptions": { + "allowJaggedRows": False, + "allowQuotedNewlines": False, + "encoding": "UTF-8", + "fieldDelimiter": "|", + "skipLeadingRows": 0, + }, + "sourceFormat": "CSV", + "sourceUris": ["REPLACEME"], + } + config_obj.upload_from_string(json.dumps(config)) + config_objs.append(sql_obj) + config_objs.append(config_obj) + + def teardown(): + for do in config_objs: + if do.exists(): + do.delete() + + request.addfinalizer(teardown) + return config_objs diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py index f3e02a50b..be36a397e 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest.py @@ -1,5 +1,5 @@ # dataset/table/_SUCCESS -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,14 +14,19 @@ # limitations under the License. 
"""unit tests for gcs_ocn_bq_ingest""" import re +import time from typing import Dict, Optional +from unittest.mock import Mock import pytest +from google.cloud import storage +import gcs_ocn_bq_ingest.common.constants +import gcs_ocn_bq_ingest.common.utils import gcs_ocn_bq_ingest.main COMPILED_DEFAULT_DENTINATION_REGEX = re.compile( - gcs_ocn_bq_ingest.main.DEFAULT_DESTINATION_REGEX) + gcs_ocn_bq_ingest.common.constants.DEFAULT_DESTINATION_REGEX) @pytest.mark.parametrize( @@ -95,6 +100,26 @@ "hh": "03", "batch": "batch_id" }), + ("project.dataset/table/historical/2020/01/02/03/batch_id/_SUCCESS", { + "dataset": "project.dataset", + "table": "table", + "partition": None, + "yyyy": "2020", + "mm": "01", + "dd": "02", + "hh": "03", + "batch": "batch_id" + }), + ("project.dataset/table/incremental/2020/01/02/04/batch_id/_SUCCESS", { + "dataset": "project.dataset", + "table": "table", + "partition": None, + "yyyy": "2020", + "mm": "01", + "dd": "02", + "hh": "04", + "batch": "batch_id" + }), ]) def test_default_destination_regex(test_input: str, expected: Dict[str, Optional[str]]): @@ -118,4 +143,153 @@ def test_default_destination_regex(test_input: str, ([["foo"], [], ["bar", "baz"]], ["foo", "bar", "baz"]), ]) def test_flattend2dlist(test_input, expected): - assert gcs_ocn_bq_ingest.main.flatten2dlist(test_input) == expected + assert gcs_ocn_bq_ingest.common.utils.flatten2dlist(test_input) == expected + + +@pytest.mark.parametrize( + "original, update, expected", + [ + # yapf: disable + ( # empty original + {}, { + "a": 1 + }, { + "a": 1 + }), + ( # empty update + { + "a": 1 + }, {}, { + "a": 1 + }), + ( # basic update of top-level key + { + "a": 1 + }, { + "a": 2 + }, { + "a": 2 + }), + ( # update of list + { + "a": [1] + }, { + "a": [2] + }, { + "a": [2] + }), + ( # update of nested key + { + "a": { + "b": 1 + } + }, { + "a": { + "b": 2 + } + }, { + "a": { + "b": 2 + } + }), + ( # don't drop keys that only appear in original + { + "a": { + "b": 1, + "c": 2 + }, + 
"d": 3 + }, { + "a": { + "b": 4 + }, + }, { + "a": { + "b": 4, + "c": 2 + }, + "d": 3 + }), + # yapf: enable + ]) +def test_recursive_update(original, update, expected): + assert gcs_ocn_bq_ingest.common.utils.recursive_update(original, + update) == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + "dataset/table/_SUCCESS", # flat + "dataset/table"), + ( + "dataset/table/$20201030/_SUCCESS", # partitioned + "dataset/table"), + ( + "dataset/table/$20201030/batch_id/_SUCCESS", # partitioned, batched + "dataset/table"), + ( + "dataset/table/batch_id/_SUCCESS", # batched (no partitioning) + "dataset/table"), + ("dataset/table/2020/01/02/03/batch_id/_SUCCESS", "dataset/table"), + ("project.dataset/table/2020/01/02/03/batch_id/_SUCCESS", + "project.dataset/table"), + ("dataset/table/_BACKFILL", "dataset/table"), + ("dataset/table/_bqlock", "dataset/table"), + ("dataset/table/_backlog/2020/01/02/03/_SUCCESS", "dataset/table"), + ]) +def test_get_table_prefix(test_input, expected): + assert gcs_ocn_bq_ingest.common.utils.get_table_prefix( + test_input) == expected + + +def test_triage_event(mock_env, mocker): + test_event_blob: storage.Blob = storage.Blob.from_string( + "gs://foo/bar/baz/00/_SUCCESS") + apply_mock = mocker.patch('gcs_ocn_bq_ingest.common.utils.apply') + bq_mock = Mock() + bq_mock.project = "foo" + gcs_ocn_bq_ingest.main.triage_event(None, bq_mock, test_event_blob, + time.monotonic()) + apply_mock.assert_called_once() + + +def test_triage_event_ordered(ordered_mock_env, mocker): + enforce_ordering = True + test_event_blob: storage.Blob = storage.Blob.from_string( + "gs://foo/bar/baz/00/_SUCCESS") + apply_mock = mocker.patch('gcs_ocn_bq_ingest.common.utils.apply') + publisher_mock = mocker.patch( + 'gcs_ocn_bq_ingest.common.ordering.backlog_publisher') + bq_mock = Mock() + bq_mock.project = "foo" + gcs_ocn_bq_ingest.main.triage_event(None, + bq_mock, + test_event_blob, + time.monotonic(), + enforce_ordering=enforce_ordering) + 
publisher_mock.assert_called_once() + + test_event_blob: storage.Blob = storage.Blob.from_string( + "gs://foo/bar/baz/_BACKFILL") + subscriber_mock = mocker.patch( + 'gcs_ocn_bq_ingest.common.ordering.backlog_subscriber') + gcs_ocn_bq_ingest.main.triage_event(None, + None, + test_event_blob, + time.monotonic(), + enforce_ordering=enforce_ordering) + subscriber_mock.assert_called_once() + + test_event_blob: storage.Blob = storage.Blob.from_string( + "gs://foo/bar/baz/_backlog/00/_SUCCESS") + monitor_mock = mocker.patch( + 'gcs_ocn_bq_ingest.common.ordering.subscriber_monitor') + gcs_ocn_bq_ingest.main.triage_event(None, + None, + test_event_blob, + time.monotonic(), + enforce_ordering=enforce_ordering) + monitor_mock.assert_called_once() + apply_mock.assert_not_called() diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py index a5a81b949..02dbeb318 100644 --- a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_gcs_ocn_bq_ingest_it.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -22,7 +22,7 @@ import gcs_ocn_bq_ingest.main TEST_DIR = os.path.realpath(os.path.dirname(__file__) + "/..") -LOAD_JOB_POLLING_TIMEOUT = 10 # seconds +LOAD_JOB_POLLING_TIMEOUT = 20 # seconds @pytest.mark.IT @@ -67,8 +67,8 @@ def test_gcf_event_schema(bq, gcs_data, dest_dataset, dest_table, mock_env): @pytest.mark.IT -def test_duplicate_notification(bq, gcs_data, dest_dataset, dest_table, - mock_env): +def test_duplicate_success_notification(bq, gcs_data, dest_dataset, dest_table, + mock_env): """tests behavior with two notifications for the same success file.""" if not gcs_data.exists(): raise EnvironmentError("test data objects must exist") @@ -79,12 +79,6 @@ def test_duplicate_notification(bq, gcs_data, dest_dataset, dest_table, } } gcs_ocn_bq_ingest.main.main(test_event, None) - did_second_invocation_raise = False - try: - gcs_ocn_bq_ingest.main.main(test_event, None) - except RuntimeError: - did_second_invocation_raise = True - assert did_second_invocation_raise test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation", "part-m-00001") @@ -149,8 +143,8 @@ def test_load_job_appending_batches(bq, gcs_batched_data, dest_dataset, @pytest.mark.IT -def test_external_query(bq, gcs_data, gcs_external_config, dest_dataset, - dest_table, mock_env): +def test_external_query_pure(bq, gcs_data, gcs_external_config, dest_dataset, + dest_table, mock_env): """tests the basic external query ingrestion mechanics with bq_transform.sql and external.json """ @@ -209,6 +203,63 @@ def test_load_job_partitioned(bq, gcs_partitioned_data, bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows) +@pytest.mark.IT +def test_external_query_partitioned(bq, gcs_partitioned_data, + gcs_external_partitioned_config, + dest_dataset, dest_partitioned_table, + mock_env): + """tests the basic external query ingrestion mechanics + with bq_transform.sql and external.json + """ + if not all((blob.exists() for blob in gcs_external_partitioned_config)): + raise 
google.cloud.exceptions.NotFound("config objects must exist") + + for blob in gcs_partitioned_data: + if not blob.exists(): + raise google.cloud.exceptions.NotFound( + "test data objects must exist") + test_event = { + "attributes": { + "bucketId": blob.bucket.name, + "objectId": blob.name + } + } + gcs_ocn_bq_ingest.main.main(test_event, None) + expected_num_rows = 0 + for part in [ + "$2017041101", + "$2017041102", + ]: + test_data_file = os.path.join(TEST_DIR, "resources", "test-data", + "nyc_311", part, "nyc_311.csv") + expected_num_rows += sum(1 for _ in open(test_data_file)) + bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows) + + +@pytest.mark.IT +def test_look_for_config_in_parents(bq, gcs_data_under_sub_dirs, + gcs_external_config, dest_dataset, + dest_table, mock_env): + """test discovery of configuration files for external query in parent + _config paths. + """ + if not all((blob.exists() for blob in gcs_external_config)): + raise google.cloud.exceptions.NotFound("config objects must exist") + if not gcs_data_under_sub_dirs.exists(): + raise google.cloud.exceptions.NotFound("test data objects must exist") + test_event = { + "attributes": { + "bucketId": gcs_data_under_sub_dirs.bucket.name, + "objectId": gcs_data_under_sub_dirs.name + } + } + gcs_ocn_bq_ingest.main.main(test_event, None) + test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation", + "part-m-00001") + expected_num_rows = sum(1 for _ in open(test_data_file)) + bq_wait_for_rows(bq, dest_table, expected_num_rows) + + def bq_wait_for_rows(bq_client: bigquery.Client, table: bigquery.Table, expected_num_rows: int): """ @@ -235,3 +286,30 @@ def bq_wait_for_rows(bq_client: bigquery.Client, table: bigquery.Table, f"{table.project}.{table.dataset_id}.{table.table_id} to " f"reach {expected_num_rows} rows." 
f"last poll returned {actual_num_rows} rows.")
+
+
+@pytest.mark.IT
+def test_external_query_with_bad_statement(bq, gcs_data,
+                                           gcs_external_config_bad_statement,
+                                           dest_dataset, dest_table, mock_env):
+    """tests that external query ingestion with a bad statement in
+    bq_transform.sql raises BigQueryJobFailure
+    """
+    if not gcs_data.exists():
+        raise google.cloud.exceptions.NotFound("test data objects must exist")
+    if not all((blob.exists() for blob in gcs_external_config_bad_statement)):
+        raise google.cloud.exceptions.NotFound("config objects must exist")
+
+    test_event = {
+        "attributes": {
+            "bucketId": gcs_data.bucket.name,
+            "objectId": gcs_data.name
+        }
+    }
+    raised = False
+    try:
+        gcs_ocn_bq_ingest.main.main(test_event, None)
+    except gcs_ocn_bq_ingest.common.exceptions.BigQueryJobFailure:
+        raised = True
+
+    assert raised, "bad statement did not raise BigQueryJobFailure"
diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py
new file mode 100644
index 000000000..7fe82d200
--- /dev/null
+++ b/tools/cloud_functions/gcs_event_based_ingest/tests/gcs_ocn_bq_ingest/test_ordering_it.py
@@ -0,0 +1,272 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""integration tests for the ordering behavior of backlog gcs_ocn_bq_ingest""" +import multiprocessing +import os +import queue +import time +from typing import Optional + +import pytest +from google.cloud import bigquery +from google.cloud import storage + +import gcs_ocn_bq_ingest.common.constants +import gcs_ocn_bq_ingest.common.ordering +import gcs_ocn_bq_ingest.common.utils +import gcs_ocn_bq_ingest.main + +TEST_DIR = os.path.realpath(os.path.dirname(__file__) + "/..") +LOAD_JOB_POLLING_TIMEOUT = 20 # seconds + +# Testing that the subscriber does not get choked up by a common race condition +# is crucial to ensuring this solution works. +# This parameter is for running the subscriber tests many times. +# During development it can be helpful to tweak this up or down as you are +# experimenting. +NUM_TRIES_SUBSCRIBER_TESTS = 25 + + +@pytest.mark.IT +@pytest.mark.ORDERING +def test_backlog_publisher(gcs, gcs_bucket, gcs_partitioned_data, mock_env): + """Test basic functionality of backlog_publisher + Drop two success files. + Assert that both success files are added to backlog and backfill file + created. + Assert that that only one backfill file is not recreated. + """ + table_prefix = "" + # load each partition. 
+ for gcs_data in gcs_partitioned_data: + if not gcs_data.exists(): + raise EnvironmentError("test data objects must exist") + if gcs_data.name.endswith( + gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME): + table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix( + gcs_data.name) + gcs_ocn_bq_ingest.common.ordering.backlog_publisher(gcs, gcs_data) + + expected_backlog_blobs = queue.Queue() + expected_backlog_blobs.put("/".join([ + table_prefix, "_backlog", "$2017041101", + gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME + ])) + expected_backlog_blobs.put("/".join([ + table_prefix, "_backlog", "$2017041102", + gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME + ])) + + for backlog_blob in gcs_bucket.list_blobs( + prefix=f"{table_prefix}/_backlog"): + assert backlog_blob.name == expected_backlog_blobs.get(block=False) + + backfill_blob: storage.Blob = gcs_bucket.blob( + f"{table_prefix}/{gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME}" + ) + assert backfill_blob.exists() + + +@pytest.mark.IT +@pytest.mark.ORDERING +def test_backlog_publisher_with_existing_backfill_file(gcs, gcs_bucket, + dest_dataset, + dest_partitioned_table, + gcs_partitioned_data, + mock_env): + """Test basic functionality of backlog_publisher when the backfill is + already running. It should not repost this backfill file. + """ + table_prefix = "/".join( + [dest_dataset.dataset_id, dest_partitioned_table.table_id]) + backfill_blob: storage.Blob = gcs_bucket.blob( + f"{table_prefix}/{gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME}" + ) + backfill_blob.upload_from_string("") + backfill_blob.reload() + original_backfill_blob_generation = backfill_blob.generation + table_prefix = "" + # load each partition. 
+ for gcs_data in gcs_partitioned_data: + if not gcs_data.exists(): + raise EnvironmentError("test data objects must exist") + if gcs_data.name.endswith( + gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME): + table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix( + gcs_data.name) + gcs_ocn_bq_ingest.common.ordering.backlog_publisher(gcs, gcs_data) + + # Use of queue to test that list responses are returned in expected order. + expected_backlog_blobs = queue.Queue() + expected_backlog_blobs.put("/".join([ + table_prefix, "_backlog", "$2017041101", + gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME + ])) + expected_backlog_blobs.put("/".join([ + table_prefix, "_backlog", "$2017041102", + gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME + ])) + + for backlog_blob in gcs_bucket.list_blobs( + prefix=f"{table_prefix}/_backlog"): + assert backlog_blob.name == expected_backlog_blobs.get(block=False) + + backfill_blob.reload() + assert backfill_blob.generation == original_backfill_blob_generation + + +@pytest.mark.IT +@pytest.mark.ORDERING +@pytest.mark.repeat(NUM_TRIES_SUBSCRIBER_TESTS) +def test_backlog_subscriber_in_order_with_new_batch_after_exit( + bq, gcs, gcs_bucket, dest_dataset, dest_ordered_update_table, + gcs_ordered_update_data, gcs_external_update_config, gcs_backlog, + mock_env): + """Test basic functionality of backlog subscriber. + Populate a backlog with 3 files that make updates where we can assert + that these jobs were applied in order. + + To ensure that the subscriber cleans up properly after itself before exit, + we will drop a 4th batch after the subscriber has exited and assert that it + gets applied as expected. 
+ """ + _run_subscriber(gcs, bq, gcs_external_update_config) + table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix( + gcs_external_update_config.name) + backlog_blobs = gcs_bucket.list_blobs(prefix=f"{table_prefix}/_backlog/") + assert backlog_blobs.num_results == 0, "backlog is not empty" + bqlock_blob: storage.Blob = gcs_bucket.blob("_bqlock") + assert not bqlock_blob.exists(), "_bqlock was not cleaned up" + rows = bq.query("SELECT alpha_update FROM " + f"{dest_ordered_update_table.dataset_id}" + f".{dest_ordered_update_table.table_id}") + expected_num_rows = 1 + num_rows = 0 + for row in rows: + num_rows += 1 + assert row["alpha_update"] == "ABC", "backlog not applied in order" + assert num_rows == expected_num_rows + + # Now we will test what happens when the publisher posts another batch after + # the backlog subscriber has exited. + backfill_blob = _post_a_new_batch(gcs_bucket, dest_dataset, + dest_ordered_update_table) + _run_subscriber(gcs, bq, backfill_blob) + + rows = bq.query("SELECT alpha_update FROM " + f"{dest_ordered_update_table.dataset_id}" + f".{dest_ordered_update_table.table_id}") + expected_num_rows = 1 + num_rows = 0 + for row in rows: + num_rows += 1 + assert row["alpha_update"] == "ABCD", "new incremental not applied" + assert num_rows == expected_num_rows + + +@pytest.mark.IT +@pytest.mark.ORDERING +@pytest.mark.repeat(NUM_TRIES_SUBSCRIBER_TESTS) +def test_backlog_subscriber_in_order_with_new_batch_while_running( + bq, gcs, gcs_bucket, dest_dataset, dest_ordered_update_table, + gcs_ordered_update_data, gcs_external_update_config: storage.Blob, + gcs_backlog, mock_env): + """Test functionality of backlog subscriber when new batches are added + before the subscriber is done finishing the existing backlog. + + Populate a backlog with 3 files that make updates where we can assert + that these jobs were applied in order. + In another process populate a fourth batch, and call the publisher. 
+ """ + # Cannot pickle clients to another process so we need to recreate some + # objects without the client property. + backfill_blob = storage.Blob.from_string( + f"gs://{gcs_external_update_config.bucket.name}/" + f"{gcs_external_update_config.name}") + dataset = bigquery.Dataset.from_string( + f"{dest_dataset.project}.{dest_dataset.dataset_id}") + table = bigquery.Table.from_string( + f"{dest_dataset.project}.{dest_dataset.dataset_id}." + f"{dest_ordered_update_table.table_id}") + bkt = storage.Bucket.from_string(f"gs://{gcs_bucket.name}") + + basename = os.path.basename(gcs_external_update_config.name) + claim_blob: storage.Blob = gcs_external_update_config.bucket.blob( + gcs_external_update_config.name.replace( + basename, f"_claimed_{basename}_created_at_" + f"{gcs_external_update_config.time_created.timestamp()}")) + # Run subscriber w/ backlog and publisher w/ new batch in parallel. + with multiprocessing.Pool(processes=3) as pool: + res_subscriber = pool.apply_async(_run_subscriber, + (None, None, backfill_blob)) + # wait for existence of claim blob to ensure subscriber is running. 
+ while not claim_blob.exists(): + pass + res_backlog_publisher = pool.apply_async(_post_a_new_batch, + (bkt, dataset, table)) + res_backlog_publisher.wait() + res_monitor = pool.apply_async( + gcs_ocn_bq_ingest.common.ordering.subscriber_monitor, + (None, bkt, + f"{dataset.project}.{dataset.dataset_id}/{table.table_id}/" + f"_backlog/04/_SUCCESS")) + + if res_monitor.get(): + print("subscriber monitor had to retrigger subscriber loop") + backfill_blob.reload(client=gcs) + _run_subscriber(None, None, backfill_blob) + + res_subscriber.wait() + + table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix( + gcs_external_update_config.name) + backlog_blobs = gcs_bucket.list_blobs(prefix=f"{table_prefix}/" + f"_backlog/") + assert backlog_blobs.num_results == 0, "backlog is not empty" + bqlock_blob: storage.Blob = gcs_bucket.blob("_bqlock") + assert not bqlock_blob.exists(), "_bqlock was not cleaned up" + rows = bq.query("SELECT alpha_update FROM " + f"{dest_ordered_update_table.dataset_id}" + f".{dest_ordered_update_table.table_id}") + expected_num_rows = 1 + num_rows = 0 + for row in rows: + num_rows += 1 + assert row["alpha_update"] == "ABCD", "backlog not applied in order" + assert num_rows == expected_num_rows + + +def _run_subscriber( + gcs_client: Optional[storage.Client], + bq_client: Optional[bigquery.Client], + backfill_blob, +): + gcs_ocn_bq_ingest.common.ordering.backlog_subscriber( + gcs_client, bq_client, backfill_blob, time.monotonic()) + + +def _post_a_new_batch(gcs_bucket, dest_dataset, dest_ordered_update_table): + # We may run this in another process and cannot pickle client objects + gcs = storage.Client() + data_obj: storage.Blob + for test_file in ["data.csv", "_SUCCESS"]: + data_obj = gcs_bucket.blob("/".join([ + f"{dest_dataset.project}.{dest_dataset.dataset_id}", + dest_ordered_update_table.table_id, "04", test_file + ])) + data_obj.upload_from_filename(os.path.join(TEST_DIR, "resources", + "test-data", "ordering", + "04", test_file), + 
client=gcs) + return gcs_ocn_bq_ingest.common.ordering.backlog_publisher(gcs, data_obj) diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/ordering_schema.json b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/ordering_schema.json new file mode 100644 index 000000000..ea54a4eed --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/ordering_schema.json @@ -0,0 +1,10 @@ +[ + { + "name": "id", + "type": "INT64" + }, + { + "name": "alpha_update", + "type": "STRING" + } +] diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/_SUCCESS b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/data.csv b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/data.csv new file mode 100644 index 000000000..6b4f72558 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/01/data.csv @@ -0,0 +1 @@ +1|A diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/_SUCCESS b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/data.csv b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/data.csv new file mode 100644 index 000000000..3b4f35bfc --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/02/data.csv @@ -0,0 +1 @@ +1|B diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/03/_SUCCESS b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/03/_SUCCESS new file mode 
100644 index 000000000..e69de29bb diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/03/data.csv b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/03/data.csv new file mode 100644 index 000000000..ecf1eb9e0 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/03/data.csv @@ -0,0 +1 @@ +1|C diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/04/_SUCCESS b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/04/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/04/data.csv b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/04/data.csv new file mode 100644 index 000000000..09b72c865 --- /dev/null +++ b/tools/cloud_functions/gcs_event_based_ingest/tests/resources/test-data/ordering/04/data.csv @@ -0,0 +1 @@ +1|D