diff --git a/Databricks/ACTIVE TEST AUTOMATION/BRONZE_PARQUET_TEST.ipynb b/Databricks/ACTIVE TEST AUTOMATION/BRONZE_PARQUET_TEST.ipynb new file mode 100644 index 000000000..de175e2cd --- /dev/null +++ b/Databricks/ACTIVE TEST AUTOMATION/BRONZE_PARQUET_TEST.ipynb @@ -0,0 +1,910 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3b9497a0-9c70-45ef-8ef8-eb69ced4b163", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from functools import reduce\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import col, to_date, coalesce, greatest, lit, explode, date_add, current_date, count\n", + "from pyspark.sql.types import (\n", + " StructType,\n", + " StructField,\n", + " StringType,\n", + " IntegerType,\n", + " DateType,\n", + " ArrayType,\n", + ")\n", + "from docx import Document\n", + "from docx.shared import Inches\n", + "\n", + "bronze_mnt = \"/mnt/bronze/ARIADM/ACTIVE/APPEALS\"" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "67ada86a-98b3-4547-9ddf-da02ae27c6c2", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Paths printed successfully\n" + ] + } + ], + "source": [
+ "# Locations of the bronze outputs under test and of the reference CSV extracts they are compared with.\n",
+ "try:\n",
+ "    m1_path = f\"{bronze_mnt}/bronze_appealcase_crep_rep_floc_cspon_cfs\"\n",
+ "    csv_m1_path = f\"{bronze_mnt}/APPEALS_LLD_SQL_RESULTS/M1.csv\"\n",
+ "    m2_path = f\"{bronze_mnt}/bronze_appealcase_caseappellant_appellant\"\n",
+ "    csv_m2_path = f\"{bronze_mnt}/APPEALS_LLD_SQL_RESULTS/M2.csv\"\n",
+ "    m3_path = f\"{bronze_mnt}/bronze_status_htype_clist_list_ltype_court_lsitting_adj\"\n",
+ "    csv_m3_path = f\"{bronze_mnt}/APPEALS_LLD_SQL_RESULTS/M3.csv\"\n",
+ "    m4_path = f\"{bronze_mnt}/bronze_appealcase_transaction_transactiontype\"\n",
+ "    csv_m4_path = f\"{bronze_mnt}/APPEALS_LLD_SQL_RESULTS/M4.csv\"\n",
+ "    m5_path = f\"{bronze_mnt}/bronze_appealcase_link_linkdetail\"\n",
+ "    csv_m5_path = f\"{bronze_mnt}/APPEALS_LLD_SQL_RESULTS/M5.csv\"\n",
+ "    m6_path = f\"{bronze_mnt}/bronze_caseadjudicator_adjudicator\"\n",
+ "    csv_m6_path = f\"{bronze_mnt}/APPEALS_LLD_SQL_RESULTS/M6.csv\"\n",
+ "    c_path = f\"{bronze_mnt}/bronze_appealcategory\"\n",
+ "    csv_c_path = f\"{bronze_mnt}/APPEALS_LLD_SQL_RESULTS/C.csv\"\n",
+ "    d_path = f\"{bronze_mnt}/bronze_documentsreceived\"\n",
+ "    csv_d_path = f\"{bronze_mnt}/APPEALS_LLD_SQL_RESULTS/D.csv\"\n",
+ "    h_path = f\"{bronze_mnt}/bronze_history\"\n",
+ "    csv_h_path = f\"{bronze_mnt}/APPEALS_LLD_SQL_RESULTS/H.csv\"\n",
+ "\n",
+ "    print('Paths printed successfully')\n",
+ "except Exception as e:\n",
+ "    # The exception must be bound to a name: a bare `except:` left `e` undefined, so this handler itself raised NameError.\n",
+ "    print(f\"Error during fetch: {str(e)}\")\n",
+ "    raise"
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "dd50a77e-cff8-4379-b87e-2a4b922b0df2", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# Schema comparison\n", + "\n", + "In this test, columns from the bronze parquets are read in and tested against manually defined expected columns. This tests that the bronze transformations conform to the correct schema. The expected schema is defined manually using information from the Active Appeals LLD. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1ba02895-a3b2-4b90-b64f-b61569f363eb", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "expected_m1_columns = [\n", + " \"CaseNo\",\n", + " \"CasePrefix\",\n", + " \"OutOfTimeIssue\",\n", + " \"DateLodged\",\n", + " \"DateAppealReceived\",\n", + " \"CentreId\",\n", + " \"NationalityId\",\n", + " \"AppealTypeId\",\n", + " \"DeportationDate\",\n", + " \"RemovalDate\",\n", + " \"VisitVisaType\",\n", + " \"DateOfApplicationDecision\",\n", + " \"HORef\",\n", + " \"InCamera\",\n", + " \"CourtPreference\",\n", + " \"LanguageId\",\n", + " \"Interpreter\",\n", + " \"RepresentativeId\",\n", + " \"CaseRepName\",\n", + " \"CaseRepAddress1\",\n", + " \"CaseRepAddress2\",\n", + " \"CaseRepAddress3\",\n", + " \"CaseRepAddress4\",\n", + " \"CaseRepAddress5\",\n", + " \"CaseRepPostcode\",\n", + " \"Contact\",\n", + " \"CaseRepEmail\",\n", + " \"FileSpecificEmail\",\n", + " \"RepName\",\n", + " \"RepAddress1\",\n", + " \"RepAddress2\",\n", + " \"RepAddress3\",\n", + " \"RepAddress4\",\n", + " \"RepAddress5\",\n", + " \"RepPostcode\",\n", + " \"RepEmail\",\n", + " \"SponsorName\",\n", + " \"SponsorForenames\",\n", + " \"SponsorAddress1\",\n", + " \"SponsorAddress2\",\n", + " \"SponsorAddress3\",\n", + " \"SponsorAddress4\",\n", + " \"SponsorAddress5\",\n", + " \"SponsorPostcode\",\n", + " \"SponsorEmail\",\n", + " \"SponsorTelephone\",\n", + " \"SponsorAuthorisation\",\n", + " \"MainRespondentId\",\n", + " \"DeptId\",\n", + " \"PaymentRemissionRequested\",\n", + " \"PaymentRemissionReason\",\n", + " \"PaymentRemissionGranted\",\n", + " \"PaymentRemissionReasonNote\",\n", + " \"LSCReference\", \n", + " \"ASFReferenceNo\", \n", + " \"DateCorrectFeeReceived\"\n", + "]\n", + "\n", + "expected_m2_columns = 
[\n",
+ "    \"CaseNo\",\n",
+ "    \"AppellantName\",\n",
+ "    \"AppellantForenames\",\n",
+ "    \"BirthDate\",\n",
+ "    \"AppellantEmail\",\n",
+ "    \"AppellantTelephone\",\n",
+ "    \"AppellantAddress1\",\n",
+ "    \"AppellantAddress2\",\n",
+ "    \"AppellantAddress3\",\n",
+ "    \"AppellantAddress4\",\n",
+ "    \"AppellantAddress5\",\n",
+ "    \"AppellantPostcode\",\n",
+ "    \"AppellantCountryId\",\n",
+ "    \"FCONumber\"\n",
+ "]\n",
+ "\n",
+ "expected_m3_columns = [\n",
+ "    \"StatusId\",\n",
+ "    \"CaseNo\",\n",
+ "    \"CaseStatus\",\n",
+ "    \"Outcome\",\n",
+ "    \"HearingDate\",\n",
+ "    \"CentreId\",\n",
+ "    \"DecisionDate\",\n",
+ "    \"Party\",\n",
+ "    \"DateReceived\",\n",
+ "    \"OutOfTime\",\n",
+ "    \"DecisionReserved\",\n",
+ "    \"AdjudicatorId\",\n",
+ "    \"AdjSurname\",\n",
+ "    \"AdjForenames\",\n",
+ "    \"AdjTitle\",\n",
+ "    \"DateOfService\",\n",
+ "    \"AdditionalLanguageId\",\n",
+ "    \"HearingCentre\",\n",
+ "    \"CourtName\",\n",
+ "    \"ListName\",\n",
+ "    \"ListType\",\n",
+ "    \"HearingType\",\n",
+ "    \"StartTime\",\n",
+ "    \"TimeEstimate\",\n",
+ "    \"Judge1FTSurname\",\n",
+ "    \"Judge1FTForenames\",\n",
+ "    \"Judge1FTTitle\",\n",
+ "    \"Judge2FTSurname\",\n",
+ "    \"Judge2FTForenames\",\n",
+ "    \"Judge2FTTitle\",\n",
+ "    \"Judge3FTSurname\",\n",
+ "    \"Judge3FTForenames\",\n",
+ "    \"Judge3FTTitle\",\n",
+ "    \"CourtClerkSurname\",\n",
+ "    \"CourtClerkForenames\",\n",
+ "    \"CourtClerkTitle\",\n",
+ "    \"Notes\"\n",
+ "]\n",
+ "\n",
+ "expected_m4_columns = [\n",
+ "    \"CaseNo\",\n",
+ "    \"TransactionId\",\n",
+ "    \"TransactionTypeId\",\n",
+ "    \"ReferringTransactionId\",\n",
+ "    \"Amount\",\n",
+ "    \"TransactionDate\",\n",
+ "    \"Status\",\n",
+ "    \"SumBalance\",\n",
+ "    \"SumTotalFee\",\n",
+ "    \"SumTotalPay\"\n",
+ "]\n",
+ "\n",
+ "expected_m5_columns = [\n",
+ "    \"CaseNo\",\n",
+ "    \"LinkNo\",\n",
+ "    \"ReasonLinkId\"\n",
+ "]\n",
+ "\n",
+ "expected_m6_columns = [\n",
+ "    \"CaseNo\",\n",
+ "    \"Required\",\n",
+ "    \"JudgeSurname\",\n",
+ "    \"JudgeForenames\",\n",
+ "    \"JudgeTitle\"\n",
+ "]\n",
+ "\n",
+ "expected_c_columns = [\n",
+ "    \"CaseNo\",\n",
+ "    \"CategoryId\"\n",
+ "]\n",
+ "\n",
+ "expected_d_columns = [\n",
+ "    \"CaseNo\",\n",
+ "    \"ReceivedDocumentId\",\n",
+ "    \"DateReceived\"\n",
+ "]\n",
+ "\n",
+ "expected_h_columns = [\n",
+ "    \"HistoryId\",\n",
+ "    \"CaseNo\",\n",
+ "    \"HistType\",\n",
+ "    \"Comment\"\n",
+ "]"
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fe86c484-3b6b-4b58-bd91-a44524c0a987", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "M1 test passed\nM2 test passed\nM3 test passed\nM4 test passed\nM5 test passed\nM6 test passed\nC test passed\nD test passed\nH test passed\n" + ] + } + ], + "source": [
+ "def check_expected_columns(label, table_path, expected_columns):\n",
+ "    \"\"\"Report whether the Delta table at table_path exposes every expected column.\n",
+ "\n",
+ "    Args:\n",
+ "        label: Short table name used in printed and returned messages, e.g. \"M1\".\n",
+ "        table_path: Location of the bronze Delta table to read.\n",
+ "        expected_columns: Column names that must be present in the table.\n",
+ "\n",
+ "    Returns:\n",
+ "        A (passed, diff) tuple. passed is True when nothing is missing; diff is an\n",
+ "        empty string on success and a description of the problem otherwise.\n",
+ "    \"\"\"\n",
+ "    try:\n",
+ "        actual_columns = spark.read.format(\"delta\").load(table_path).columns\n",
+ "    except Exception as e:\n",
+ "        # Fail only this table so the remaining tables are still checked and every\n",
+ "        # *_test / *_diff variable read by the report cell is always defined.\n",
+ "        print(f\"Error during fetch: {str(e)}\")\n",
+ "        return False, f\"Could not read {label} output: {str(e)}\"\n",
+ "\n",
+ "    missing_columns = set(expected_columns) - set(actual_columns)\n",
+ "    if missing_columns:\n",
+ "        diff = f\"Missing columns in {label} output: {missing_columns}\"\n",
+ "        print(diff)\n",
+ "        return False, diff\n",
+ "\n",
+ "    print(f\"{label} test passed\")\n",
+ "    return True, \"\"\n",
+ "\n",
+ "\n",
+ "m1_test, m1_diff = check_expected_columns(\"M1\", m1_path, expected_m1_columns)\n",
+ "m2_test, m2_diff = check_expected_columns(\"M2\", m2_path, expected_m2_columns)\n",
+ "m3_test, m3_diff = check_expected_columns(\"M3\", m3_path, expected_m3_columns)\n",
+ "m4_test, m4_diff = check_expected_columns(\"M4\", m4_path, expected_m4_columns)\n",
+ "m5_test, m5_diff = check_expected_columns(\"M5\", m5_path, expected_m5_columns)\n",
+ "m6_test, m6_diff = check_expected_columns(\"M6\", m6_path, expected_m6_columns)\n",
+ "c_test, c_diff = check_expected_columns(\"C\", c_path, expected_c_columns)\n",
+ "d_test, d_diff = check_expected_columns(\"D\", d_path, expected_d_columns)\n",
+ "h_test, h_diff = check_expected_columns(\"H\", h_path, expected_h_columns)"
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "82908ee8-0b55-4dff-8702-46ca8899184c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "\n", + "# `validate content`\n", + "\n", + " Inputs: \n", + "- `csv_path` : path to Mx CSV of ARIA_NLE_BAK data to compare against Mx generated bronze parquet file\n", + "- `mx_path` : path to Mx generated bronze parquet file to compare against Mx CSV of ARIA_NLE_BAK data\n", + "- `identifier_column` : manually specified column of data to perform joins around\n", + "\n", + "Outputs:\n", + "- List of columns that do not match between the parquet and CSV:\n", + "- If `extra_in_csv` > 0 : list of columns contained in the CSV that are not in the parquet\n", + "- If `missing_in_csv` > 0 : list of columns contained in the parquet that are not in the CSV\n", + "\n", + "\n", + "This function is to compare the counts of data in the CSV and parquet files. We are testing that the parquet file contains the same key content from the original CSVs. The CSVs that the parquets are being tested against contain data straight from the ARIA_NLE_BAK database. 
Therefore, by checking that all of the content in the CSV file is present in the parquet, the transformation is tested to ensure that it does not drop data, and any records that are dropped by the transformation are recorded.\n", + "\n", + "| Join Type | Description |\n", + "| -------------- | --------------------------------------------------------------------------- |\n", + "| **inner** | Only rows with matching keys in both dataframes |\n", + "| **left** | All rows from the left dataframe, and matching rows from the right |\n", + "| **right** | All rows from the right dataframe, and matching rows from the left |\n", + "| **full** | All rows from both dataframes, with `null` where there's no match |\n", + "| **left\\_semi** | Returns rows from the left dataframe where a match **exists** in the right |\n", + "| **left\\_anti** | Returns rows from the left dataframe where **no match** exists in the right |\n", + "\n", + " A left anti join is used because we want to return counts and lists of entries that are in the left dataframe but not in the right dataframe. \n", + "\n", + " For example, the left anti join of CSV and parquet will return everything that is in the CSV but not in the parquet (`extra_in_csv`).\n", + "\n", + " Conversely, the left anti join of parquet and CSV will return everything that is in the parquet but not in the CSV (`missing_in_csv`)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c4ec7844-89c5-4c19-b5b9-acb503287057", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [
+ "from pyspark.sql.functions import col\n",
+ "from pyspark.sql.functions import trim, lower\n",
+ "\n",
+ "\n",
+ "def validate_content(csv_path, mx_path, identifier_column):\n",
+ "    \"\"\"Compare the distinct values of one column between a reference CSV and a bronze Delta table.\n",
+ "\n",
+ "    Args:\n",
+ "        csv_path: Path to the header-less reference CSV.\n",
+ "        mx_path: Path to the bronze Delta table under test.\n",
+ "        identifier_column: Name of the column whose distinct values are compared.\n",
+ "\n",
+ "    Returns:\n",
+ "        A summary string, always. It contains the word \"missing\" when CSV values are\n",
+ "        absent from the table or when the comparison could not be run; the mismatch\n",
+ "        cell below uses that word to detect failures.\n",
+ "    \"\"\"\n",
+ "    try:\n",
+ "        df_csv_raw = spark.read.option(\"header\", \"false\").csv(csv_path)\n",
+ "        df_parquet = spark.read.format(\"delta\").load(mx_path)\n",
+ "\n",
+ "        # The CSV is read without a header, so the table's column names are applied to it by position.\n",
+ "        df_csv = df_csv_raw.toDF(*df_parquet.columns)\n",
+ "\n",
+ "        parquet_ids = df_parquet.select(identifier_column).distinct()\n",
+ "        csv_ids = df_csv.select(identifier_column).distinct()\n",
+ "\n",
+ "        # Values that are in the CSV but not in the table (CSV -> parquet).\n",
+ "        extra_in_csv = csv_ids.join(parquet_ids, on=identifier_column, how=\"left_anti\")\n",
+ "        extra_count = extra_in_csv.count()\n",
+ "\n",
+ "        counts = (\n",
+ "            f\"Distinct {identifier_column}s in parquet: {parquet_ids.count()} \\n\"\n",
+ "            f\"Distinct {identifier_column}s in CSV: {csv_ids.count()} \\n\"\n",
+ "        )\n",
+ "\n",
+ "        if extra_count == 0:\n",
+ "            return f\"All {identifier_column}s in the parquet output match the CSV exactly. \\n\" + counts\n",
+ "\n",
+ "        # Only the CSV -> parquet direction is checked, so the summary no longer claims\n",
+ "        # that every parquet value is present in the CSV.\n",
+ "        extra_list = [row[identifier_column] for row in extra_in_csv.collect()]\n",
+ "        return (\n",
+ "            f\"{extra_count} {identifier_column}(s) exist in CSV and are missing from parquet.\\n\"\n",
+ "            f\"Extra {identifier_column}s: {extra_list} \\n\" + counts\n",
+ "        )\n",
+ "\n",
+ "    except Exception as e:\n",
+ "        # Return the message instead of None so callers can still call .lower() on it and flag the column.\n",
+ "        message = f\"Validation results missing for {identifier_column}, error during validation: {str(e)}\"\n",
+ "        print(message)\n",
+ "        return message\n"
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "38554207-a12b-4760-b517-db2c3a3f7ea0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Content validation results completed\n" + ] + } + ], + "source": [
+ "m1_parquet_result = validate_content(csv_m1_path, m1_path, \"CaseNo\")\n",
+ "m2_parquet_result = validate_content(csv_m2_path, m2_path, \"CaseNo\")\n",
+ "m3_parquet_result = validate_content(csv_m3_path, m3_path, \"StatusId\")\n",
+ "m4_parquet_result = validate_content(csv_m4_path, m4_path, \"CaseNo\")\n",
+ "m5_parquet_result = validate_content(csv_m5_path, m5_path, \"CaseNo\")\n",
+ "m6_parquet_result = validate_content(csv_m6_path, m6_path, \"CaseNo\")\n",
+ "c_parquet_result = validate_content(csv_c_path, c_path, \"CaseNo\")\n",
+ "d_parquet_result = validate_content(csv_d_path, d_path, \"CaseNo\")\n",
+ "h_parquet_result = validate_content(csv_h_path, h_path, \"HistoryId\")\n",
+ "\n",
+ "print(\"Content validation results completed\")\n"
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e6c84c1e-75e4-453a-bdd9-3363cda2e5e2", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "No mismatches found for M1\n***************************************************\nNo mismatches found for M2\n***************************************************\nNo mismatches found for M3\n***************************************************\nNo mismatches found for M4\n***************************************************\nNo mismatches found for M5\n***************************************************\nNo mismatches found for M6\n***************************************************\nNo mismatches found for C\n***************************************************\nNo mismatches found for D\n***************************************************\nNo mismatches found for H\n***************************************************\n" + ] + } + ], + "source": [
+ "def find_mismatches(label, csv_path, table_path, columns):\n",
+ "    \"\"\"Validate every listed column of one table and print the columns that lost data.\n",
+ "\n",
+ "    Args:\n",
+ "        label: Short table name used in the printed summary, e.g. \"M1\".\n",
+ "        csv_path: Path to the reference CSV for the table.\n",
+ "        table_path: Path to the bronze Delta table.\n",
+ "        columns: Column names to validate one by one.\n",
+ "\n",
+ "    Returns:\n",
+ "        A list of (column, message) tuples, one per column that was flagged.\n",
+ "    \"\"\"\n",
+ "    mismatches = []\n",
+ "    for column in columns:\n",
+ "        result = validate_content(csv_path, table_path, column)\n",
+ "        # validate_content marks lost data and failed comparisons with the word \"missing\".\n",
+ "        if \"missing\" in result.lower():\n",
+ "            mismatches.append((column, result))\n",
+ "\n",
+ "    if not mismatches:\n",
+ "        print(f\"No mismatches found for {label}\")\n",
+ "    else:\n",
+ "        print(f\"Mismatched Columns for {label}:\")\n",
+ "        for column, message in mismatches:\n",
+ "            print(f\"\\nColumn: {column}\\n{message}\")\n",
+ "\n",
+ "    print(\"***************************************************\")\n",
+ "    return mismatches\n",
+ "\n",
+ "\n",
+ "m1_mismatches = find_mismatches(\"M1\", csv_m1_path, m1_path, expected_m1_columns)\n",
+ "m2_mismatches = find_mismatches(\"M2\", csv_m2_path, m2_path, expected_m2_columns)\n",
+ "m3_mismatches = find_mismatches(\"M3\", csv_m3_path, m3_path, expected_m3_columns)\n",
+ "m4_mismatches = find_mismatches(\"M4\", csv_m4_path, m4_path, expected_m4_columns)\n",
+ "m5_mismatches = find_mismatches(\"M5\", csv_m5_path, m5_path, expected_m5_columns)\n",
+ "m6_mismatches = find_mismatches(\"M6\", csv_m6_path, m6_path, expected_m6_columns)\n",
+ "c_mismatches = find_mismatches(\"C\", csv_c_path, c_path, expected_c_columns)\n",
+ "d_mismatches = find_mismatches(\"D\", csv_d_path, d_path, expected_d_columns)\n",
+ "h_mismatches = find_mismatches(\"H\", csv_h_path, h_path, expected_h_columns)"
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "df230e6d-9779-40f8-98d6-c8e7f8c3cc45", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [
+ "document = Document()\n",
+ "\n",
+ "# Add title\n",
+ "document.add_heading(\"Active Appeals Bronze Transformation Tests\", 0)\n",
+ "\n",
+ "# One report section per table: (notebook variable prefix, heading label, label used in the failure sentence).\n",
+ "report_sections = [\n",
+ "    (\"m1\", \"M1\", \"M1 transformation\"),\n",
+ "    (\"m2\", \"M2\", \"M2 transformation\"),\n",
+ "    (\"m3\", \"M3\", \"M3 transformation\"),\n",
+ "    (\"m4\", \"M4\", \"M4 transformation\"),\n",
+ "    (\"m5\", \"M5\", \"M5 transformation\"),\n",
+ "    (\"m6\", \"M6\", \"M6 transformation\"),\n",
+ "    (\"c\", \"'C' AppealCategory\", \"'C' AppealCategory\"),\n",
+ "    (\"d\", \"'D' DocumentsReceived\", \"'D' DocumentsReceived\"),\n",
+ "    (\"h\", \"'H' History\", \"'H' History\"),\n",
+ "]\n",
+ "\n",
+ "# The per-table results live in notebook variables named <prefix>_path, <prefix>_test, and so on.\n",
+ "notebook_vars = globals()\n",
+ "\n",
+ "for prefix, title, failed_title in report_sections:\n",
+ "    table_path = notebook_vars[f\"{prefix}_path\"]\n",
+ "\n",
+ "    document.add_heading(f\"{title} transformation test\", 2)\n",
+ "    document.add_paragraph(f\"Expected columns in {table_path} tested.\")\n",
+ "    if notebook_vars[f\"{prefix}_test\"]:\n",
+ "        document.add_paragraph(f\"{title} transformation test passed - all expected columns present in outputs.\")\n",
+ "    else:\n",
+ "        document.add_paragraph(f\"{failed_title} test failed - some expected columns missing in outputs.\")\n",
+ "        # Looked up with a default because the diff variable may not exist for every table.\n",
+ "        document.add_paragraph(notebook_vars.get(f\"{prefix}_diff\", \"\"))\n",
+ "\n",
+ "    document.add_paragraph(notebook_vars[f\"{prefix}_parquet_result\"])\n",
+ "    if not notebook_vars[f\"{prefix}_mismatches\"]:\n",
+ "        document.add_paragraph(\"No mismatched columns found - all rows in required columns tested in the parquet, all content in the CSV is found in the parquet.\")\n",
+ "    else:\n",
+ "        document.add_paragraph(\"Mismatched columns found - all rows in required columns tested in the parquet, there is missing data in the parquet file, some data from the CSV has been lost.\")\n",
+ "\n",
+ "# Save document\n",
+ "document.save(\"appeals_bronze_transformations.docx\")"
+ ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "2" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "BRONZE_PARQUET_TEST", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}