From d780d76a53b3f47a9184d43060b6bdc60169d3ff Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 6 Jun 2024 17:59:40 +0000 Subject: [PATCH 01/96] First draft of sales script --- .gitignore | 2 + .../reporting.sales_comprehensive.py | 148 ++++++++++++++++++ dbt/models/reporting/sales_comprehensive.sql | 93 +++++++++++ 3 files changed, 243 insertions(+) create mode 100644 dbt/models/reporting/reporting.sales_comprehensive.py create mode 100644 dbt/models/reporting/sales_comprehensive.sql diff --git a/.gitignore b/.gitignore index 396c5dfcf..92ca547c2 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ .Rproj.user .Ruserdata package*.json +settings.json # Directories logs/ @@ -17,3 +18,4 @@ venv/ # Ignore most CSVs, except those that are used as dbt seeds *.csv !dbt/seeds/**/*.csv +*.parquet.gzip diff --git a/dbt/models/reporting/reporting.sales_comprehensive.py b/dbt/models/reporting/reporting.sales_comprehensive.py new file mode 100644 index 000000000..6431a0753 --- /dev/null +++ b/dbt/models/reporting/reporting.sales_comprehensive.py @@ -0,0 +1,148 @@ +# This script generates aggregated summary stats on sales data across a number +# of geographies, class combinations, and time. 
+ +import os.path +import statistics as stats + +# Import libraries +import awswrangler as wr +import numpy as np +import pandas as pd + +# Ingest data if it is not already available +if os.path.isfile("sales.parquet.gzip"): + df = pd.read_parquet("sales.parquet.gzip") + +else: + sql = open("sales_comprehensive.sql").read() + df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False) + df.to_parquet("sales.parquet.gzip", compression="gzip") + +# Declare geographic groups and their associated data years +geos = { + "census_data_year": [ + "census_place", + "census_tract", + "census_congressional_district", + "census_zcta", + ], + "cook_board_of_review_district_data_year": [ + "cook_board_of_review_district" + ], + "cook_commissioner_district_data_year": ["cook_commissioner_district"], + "cook_judicial_district_data_year": ["cook_judicial_district"], + "ward_data_year": ["ward_num"], + "community_area_data_year": ["community_area"], + "police_district_data_year": ["police_district"], + "central_business_district_data_year": ["central_business_district"], + "school_data_year": [ + "school_elementary_district", + "school_secondary_district", + "school_unified_district", + ], + "tax_data_year": [ + "tax_municipality", + "tax_park_district", + "tax_library_district", + "tax_fire_protection_district", + "tax_community_college_district", + "tax_sanitation_district", + "tax_special_service_area", + "tax_tif_district", + ], +} +# Declare class groupings +groups = ["no_group", "class", "major_class", "modeling_group"] + + +# Define aggregation functions +def q10(x): + return x.quantile(0.1) + + +def q25(x): + return x.quantile(0.25) + + +def q75(x): + return x.quantile(0.75) + + +def q90(x): + return x.quantile(0.9) + + +def first(x): + return x.iloc[0] + + +agg_func_math = { + "sale_price": [ + "size", + "count", + "min", + q10, + q25, + "median", + q75, + q90, + "max", + "mean", + "sum", + ], + "price_per_sf": [ + "min", + q10, + q25, + "median", + q75, + 
q90, + "max", + "mean", + "sum", + ], + "char_bldg_sf": ["median"], + "char_land_sf": ["median"], + "char_yrblt": ["median"], + "class": [stats.multimode], + "data_year": [first], +} + +# Create an empty dataframe to fill with output +output = pd.DataFrame() + +# Loop through group combinations and stack output +for x in np.concatenate(list(geos.values())): + for y in geos.keys(): + if x in geos[y]: + df["data_year"] = df[y] + + for z in groups: + group = [x, z, "year"] + summary = df.groupby(group).agg(agg_func_math).round(2) + summary["geography_type"] = x + summary["group_type"] = z + summary.index.names = ["geography_id", "group_id", "year"] + summary = summary.reset_index().set_index( + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "year", + ] + ) + + output = pd.concat([output, summary]) + +# Clean combined output and export +output["sale_price", "sum"] = output["sale_price", "sum"].replace(0, np.NaN) +output["price_per_sf", "sum"] = output["price_per_sf", "sum"].replace( + 0, np.NaN +) + +for i in ["median", "mean", "sum"]: + output["sale_price", "delta" + i] = output["sale_price", i].diff() + output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff() + +output.to_csv("output.csv") diff --git a/dbt/models/reporting/sales_comprehensive.sql b/dbt/models/reporting/sales_comprehensive.sql new file mode 100644 index 000000000..c2f565739 --- /dev/null +++ b/dbt/models/reporting/sales_comprehensive.sql @@ -0,0 +1,93 @@ +-- Gather parcel-level land and yrblt +WITH sf AS ( + SELECT + pin, + year, + SUM(char_bldg_sf) AS char_bldg_sf, + SUM(char_land_sf) AS char_land_sf, + ARBITRARY(char_yrblt) AS char_yrblt + FROM default.vw_card_res_char + GROUP BY pin, year +) + +-- Gather parcel-level geographies and join land, sales, and class groupings +SELECT + sales.doc_no, + sales.sale_price, + CASE WHEN sf.char_bldg_sf > 0 + THEN + CAST(sales.sale_price / sf.char_bldg_sf AS DOUBLE) + END AS price_per_sf, + CAST(sf.char_bldg_sf AS INT) AS 
char_bldg_sf, + CAST(sf.char_land_sf AS INT) AS char_land_sf, + CAST(sf.char_yrblt AS INT) AS char_yrblt, + CAST(hist.oneyr_pri_mailed_bldg AS DOUBLE) AS oneyr_pri_mailed_bldg, + CAST(hist.oneyr_pri_mailed_land AS DOUBLE) AS oneyr_pri_mailed_land, + CAST(hist.oneyr_pri_mailed_tot AS DOUBLE) AS oneyr_pri_mailed_tot, + uni.year, + uni.class, + 'Cook' AS county, + uni.triad_name AS triad, + uni.township_name AS township, + uni.nbhd_code AS nbhd, + uni.tax_code, + uni.zip_code, + uni.chicago_community_area_name AS community_area, + uni.census_place_geoid AS census_place, + uni.census_tract_geoid AS census_tract, + uni.census_congressional_district_geoid + AS + census_congressional_district, + uni.census_zcta_geoid AS census_zcta, + uni.cook_board_of_review_district_num AS cook_board_of_review_district, + uni.cook_commissioner_district_num AS cook_commissioner_district, + uni.cook_judicial_district_num AS cook_judicial_district, + uni.ward_num, + uni.chicago_police_district_num AS police_district, + uni.school_elementary_district_geoid AS school_elementary_district, + uni.school_secondary_district_geoid AS school_secondary_district, + uni.school_unified_district_geoid AS school_unified_district, + uni.tax_municipality_name AS tax_municipality, + uni.tax_park_district_name AS tax_park_district, + uni.tax_library_district_name AS tax_library_district, + uni.tax_fire_protection_district_name AS tax_fire_protection_district, + uni.tax_community_college_district_name + AS + tax_community_college_district, + uni.tax_sanitation_district_name AS tax_sanitation_district, + uni.tax_special_service_area_name AS tax_special_service_area, + uni.tax_tif_district_name AS tax_tif_district, + uni.econ_central_business_district_num AS central_business_district, + uni.census_data_year, + uni.cook_board_of_review_district_data_year, + uni.cook_commissioner_district_data_year, + uni.cook_judicial_district_data_year, + COALESCE( + uni.ward_chicago_data_year, uni.ward_evanston_data_year) AS + 
ward_data_year, + uni.chicago_community_area_data_year AS community_area_data_year, + uni.chicago_police_district_data_year AS police_district_data_year, + uni.econ_central_business_district_data_year + AS + central_business_district_data_year, + uni.school_data_year, + uni.tax_data_year, + 'no_group' AS no_group, + class_dict.major_class_type AS major_class, + class_dict.modeling_group +FROM vw_pin_universe AS uni +LEFT JOIN sf + ON uni.pin = sf.pin + AND uni.year = sf.year +LEFT JOIN ccao.class_dict + ON uni.class = class_dict.class_code +LEFT JOIN default.vw_pin_history AS hist + ON uni.pin = hist.pin + AND uni.year = hist.year +LEFT JOIN vw_pin_sale AS sales + ON uni.pin = sales.pin + AND uni.year = sales.year + AND NOT sales.is_multisale + AND NOT sales.sale_filter_deed_type + AND NOT sales.sale_filter_less_than_10k + AND NOT sales.sale_filter_same_sale_within_365 From 00909fd2d203c1cb1445afdb57056320ac64d41e Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 6 Jun 2024 19:28:54 +0000 Subject: [PATCH 02/96] File renaming --- ...g.sales_comprehensive.py => reporting.sot_sales.py} | 10 +++++----- ...sales_comprehensive.sql => reporting.sot_sales.sql} | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) rename dbt/models/reporting/{reporting.sales_comprehensive.py => reporting.sot_sales.py} (93%) rename dbt/models/reporting/{sales_comprehensive.sql => reporting.sot_sales.sql} (97%) diff --git a/dbt/models/reporting/reporting.sales_comprehensive.py b/dbt/models/reporting/reporting.sot_sales.py similarity index 93% rename from dbt/models/reporting/reporting.sales_comprehensive.py rename to dbt/models/reporting/reporting.sot_sales.py index 6431a0753..5bc192ed6 100644 --- a/dbt/models/reporting/reporting.sales_comprehensive.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -10,13 +10,13 @@ import pandas as pd # Ingest data if it is not already available -if os.path.isfile("sales.parquet.gzip"): - df = pd.read_parquet("sales.parquet.gzip") +if 
os.path.isfile("sot_sales.parquet.gzip"): + df = pd.read_parquet("sot_sales.parquet.gzip") else: - sql = open("sales_comprehensive.sql").read() + sql = open("sot_sales.sql").read() df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False) - df.to_parquet("sales.parquet.gzip", compression="gzip") + df.to_parquet("sot_sales.parquet.gzip", compression="gzip") # Declare geographic groups and their associated data years geos = { @@ -145,4 +145,4 @@ def first(x): output["sale_price", "delta" + i] = output["sale_price", i].diff() output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff() -output.to_csv("output.csv") +output.to_csv("sot_sales.csv") diff --git a/dbt/models/reporting/sales_comprehensive.sql b/dbt/models/reporting/reporting.sot_sales.sql similarity index 97% rename from dbt/models/reporting/sales_comprehensive.sql rename to dbt/models/reporting/reporting.sot_sales.sql index c2f565739..2be8e4dc5 100644 --- a/dbt/models/reporting/sales_comprehensive.sql +++ b/dbt/models/reporting/reporting.sot_sales.sql @@ -75,7 +75,7 @@ SELECT 'no_group' AS no_group, class_dict.major_class_type AS major_class, class_dict.modeling_group -FROM vw_pin_universe AS uni +FROM default.vw_pin_universe AS uni LEFT JOIN sf ON uni.pin = sf.pin AND uni.year = sf.year @@ -84,7 +84,7 @@ LEFT JOIN ccao.class_dict LEFT JOIN default.vw_pin_history AS hist ON uni.pin = hist.pin AND uni.year = hist.year -LEFT JOIN vw_pin_sale AS sales +LEFT JOIN default.vw_pin_sale AS sales ON uni.pin = sales.pin AND uni.year = sales.year AND NOT sales.is_multisale From 2ac598244e25e6fbb99854a40cca136a5ff59492 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 6 Jun 2024 20:42:06 +0000 Subject: [PATCH 03/96] Cleaner for loop --- dbt/models/reporting/reporting.sot_sales.py | 45 ++++++++++----------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index 5bc192ed6..3f262e2d2 
100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -14,7 +14,7 @@ df = pd.read_parquet("sot_sales.parquet.gzip") else: - sql = open("sot_sales.sql").read() + sql = open("reporting.sot_sales.sql").read() df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False) df.to_parquet("sot_sales.parquet.gzip", compression="gzip") @@ -112,28 +112,27 @@ def first(x): output = pd.DataFrame() # Loop through group combinations and stack output -for x in np.concatenate(list(geos.values())): - for y in geos.keys(): - if x in geos[y]: - df["data_year"] = df[y] - - for z in groups: - group = [x, z, "year"] - summary = df.groupby(group).agg(agg_func_math).round(2) - summary["geography_type"] = x - summary["group_type"] = z - summary.index.names = ["geography_id", "group_id", "year"] - summary = summary.reset_index().set_index( - [ - "geography_type", - "geography_id", - "group_type", - "group_id", - "year", - ] - ) - - output = pd.concat([output, summary]) +for key, value in geos.items(): + df["data_year"] = df[key] + + for x in value: + for z in groups: + group = [x, z, "year"] + summary = df.groupby(group).agg(agg_func_math).round(2) + summary["geography_type"] = x + summary["group_type"] = z + summary.index.names = ["geography_id", "group_id", "year"] + summary = summary.reset_index().set_index( + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "year", + ] + ) + + output = pd.concat([output, summary]) # Clean combined output and export output["sale_price", "sum"] = output["sale_price", "sum"].replace(0, np.NaN) From 2107d2a33892c8229cffd77c1762774794f80e8d Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 11 Jun 2024 17:34:41 +0000 Subject: [PATCH 04/96] First draft taxes and exemptions table --- dbt/models/reporting/reporting.sot_sales.py | 38 ++--- .../reporting.sot_taxes_exemptions.py | 141 ++++++++++++++++++ .../reporting.sot_taxes_exemptions.sql | 93 ++++++++++++ 3 
files changed, 248 insertions(+), 24 deletions(-) create mode 100644 dbt/models/reporting/reporting.sot_taxes_exemptions.py create mode 100644 dbt/models/reporting/reporting.sot_taxes_exemptions.sql diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index 3f262e2d2..120a0bd5b 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -76,31 +76,21 @@ def first(x): return x.iloc[0] +more_stats = [ + "min", + q10, + q25, + "median", + q75, + q90, + "max", + "mean", + "sum", +] + agg_func_math = { - "sale_price": [ - "size", - "count", - "min", - q10, - q25, - "median", - q75, - q90, - "max", - "mean", - "sum", - ], - "price_per_sf": [ - "min", - q10, - q25, - "median", - q75, - q90, - "max", - "mean", - "sum", - ], + "sale_price": ["size", "count"] + more_stats, + "price_per_sf": more_stats, "char_bldg_sf": ["median"], "char_land_sf": ["median"], "char_yrblt": ["median"], diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py new file mode 100644 index 000000000..7e236b820 --- /dev/null +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -0,0 +1,141 @@ +# This script generates aggregated summary stats on taxes and exemptions data +# across a number of geographies, class combinations, and time. 
+# %% +import os.path + +# Import libraries +import awswrangler as wr +import pandas as pd + +# Ingest data if it is not already available +if os.path.isfile("sot_taxes_exemptions.parquet.gzip"): + df = pd.read_parquet("sot_taxes_exemptions.parquet.gzip") + +else: + sql = open("reporting.sot_taxes_exemptions.sql").read() + df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False) + df.to_parquet("sot_taxes_exemptions.parquet.gzip", compression="gzip") + +# Declare geographic groups and their associated data years +geos = { + "census_data_year": [ + "census_place", + "census_tract", + "census_congressional_district", + "census_zcta", + ], + "cook_board_of_review_district_data_year": [ + "cook_board_of_review_district" + ], + "cook_commissioner_district_data_year": ["cook_commissioner_district"], + "cook_judicial_district_data_year": ["cook_judicial_district"], + "ward_data_year": ["ward_num"], + "community_area_data_year": ["community_area"], + "police_district_data_year": ["police_district"], + "central_business_district_data_year": ["central_business_district"], + "school_data_year": [ + "school_elementary_district", + "school_secondary_district", + "school_unified_district", + ], + "tax_data_year": [ + "tax_municipality", + "tax_park_district", + "tax_library_district", + "tax_fire_protection_district", + "tax_community_college_district", + "tax_sanitation_district", + "tax_special_service_area", + "tax_tif_district", + ], +} +# Declare class groupings +groups = ["no_group", "class", "major_class", "modeling_group"] + + +# %% +# Define aggregation functions +def q10(x): + return x.quantile(0.1) + + +def q25(x): + return x.quantile(0.25) + + +def q75(x): + return x.quantile(0.75) + + +def q90(x): + return x.quantile(0.9) + + +def first(x): + return x.iloc[0] + + +more_stats = [ + "min", + q10, + q25, + "median", + q75, + q90, + "max", + "mean", + "sum", +] + +less_stats = ["count", "sum"] + +agg_func_math = { + "eq_factor_final": ["size", first], 
+ "eq_factor_tentative": [first], + "tax_bill_total": more_stats, + "tax_code_rate": more_stats, + "av_clerk": more_stats, + "exe_homeowner": less_stats, + "exe_senior": less_stats, + "exe_freeze": less_stats, + "exe_longtime_homeowner": less_stats, + "exe_disabled": less_stats, + "exe_vet_returning": less_stats, + "exe_vet_dis_lt50": less_stats, + "exe_vet_dis_50_69": less_stats, + "exe_vet_dis_ge70": less_stats, + "exe_abate": less_stats, +} + +# Create an empty dataframe to fill with output +output = pd.DataFrame() +# %% +# Loop through group combinations and stack output +for key, value in geos.items(): + df["data_year"] = df[key] + + for x in value: + for z in groups: + group = [x, z, "year"] + summary = df.groupby(group).agg(agg_func_math).round(2) + summary["geography_type"] = x + summary["group_type"] = z + summary.index.names = ["geography_id", "group_id", "year"] + summary = summary.reset_index().set_index( + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "year", + ] + ) + + output = pd.concat([output, summary]) + +# Clean combined output and export +for i in ["median", "mean", "sum"]: + output["tax_bill_total", "delta" + i] = output["tax_bill_total", i].diff() + +output.to_csv("sot_taxes_exemptions.csv") +# %% diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions.sql new file mode 100644 index 000000000..f96bdc5d4 --- /dev/null +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.sql @@ -0,0 +1,93 @@ +-- Gather parcel-level geographies and join taxes, exemptions, and class +-- groupings +SELECT + tax.year, + tax.av_clerk, + tax.tax_bill_total, + CASE WHEN tax.exe_homeowner = 0 THEN NULL ELSE tax.exe_homeowner END + AS exe_homeowner, + CASE WHEN tax.exe_senior = 0 THEN NULL ELSE tax.exe_senior END + AS exe_senior, + CASE WHEN tax.exe_freeze = 0 THEN NULL ELSE tax.exe_freeze END + AS exe_freeze, + CASE + WHEN tax.exe_longtime_homeowner = 0 THEN NULL ELSE + 
tax.exe_longtime_homeowner + END AS exe_longtime_homeowner, + CASE WHEN tax.exe_disabled = 0 THEN NULL ELSE tax.exe_disabled END + AS exe_disabled, + CASE + WHEN tax.exe_vet_returning = 0 THEN NULL ELSE tax.exe_vet_returning + END AS exe_vet_returning, + CASE WHEN tax.exe_vet_dis_lt50 = 0 THEN NULL ELSE tax.exe_vet_dis_lt50 END + AS exe_vet_dis_lt50, + CASE + WHEN tax.exe_vet_dis_50_69 = 0 THEN NULL ELSE tax.exe_vet_dis_50_69 + END AS exe_vet_dis_50_69, + CASE WHEN tax.exe_vet_dis_ge70 = 0 THEN NULL ELSE tax.exe_vet_dis_ge70 END + AS exe_vet_dis_ge70, + CASE WHEN tax.exe_abate = 0 THEN NULL ELSE tax.exe_abate END AS exe_abate, + tcd.tax_code_rate, + eqf.eq_factor_tentative, + eqf.eq_factor_final, + uni.class, + 'Cook' AS county, + uni.triad_name AS triad, + uni.township_name AS township, + uni.nbhd_code AS nbhd, + uni.tax_code, + uni.zip_code, + uni.chicago_community_area_name AS community_area, + uni.census_place_geoid AS census_place, + uni.census_tract_geoid AS census_tract, + uni.census_congressional_district_geoid + AS + census_congressional_district, + uni.census_zcta_geoid AS census_zcta, + uni.cook_board_of_review_district_num AS cook_board_of_review_district, + uni.cook_commissioner_district_num AS cook_commissioner_district, + uni.cook_judicial_district_num AS cook_judicial_district, + uni.ward_num, + uni.chicago_police_district_num AS police_district, + uni.school_elementary_district_geoid AS school_elementary_district, + uni.school_secondary_district_geoid AS school_secondary_district, + uni.school_unified_district_geoid AS school_unified_district, + uni.tax_municipality_name AS tax_municipality, + uni.tax_park_district_name AS tax_park_district, + uni.tax_library_district_name AS tax_library_district, + uni.tax_fire_protection_district_name AS tax_fire_protection_district, + uni.tax_community_college_district_name + AS + tax_community_college_district, + uni.tax_sanitation_district_name AS tax_sanitation_district, + uni.tax_special_service_area_name AS 
tax_special_service_area, + uni.tax_tif_district_name AS tax_tif_district, + uni.econ_central_business_district_num AS central_business_district, + uni.census_data_year, + uni.cook_board_of_review_district_data_year, + uni.cook_commissioner_district_data_year, + uni.cook_judicial_district_data_year, + COALESCE( + uni.ward_chicago_data_year, uni.ward_evanston_data_year) AS + ward_data_year, + uni.chicago_community_area_data_year AS community_area_data_year, + uni.chicago_police_district_data_year AS police_district_data_year, + uni.econ_central_business_district_data_year + AS + central_business_district_data_year, + uni.school_data_year, + uni.tax_data_year, + 'no_group' AS no_group, + class_dict.major_class_type AS major_class, + class_dict.modeling_group +FROM default.vw_pin_universe AS uni +INNER JOIN tax.pin AS tax + ON uni.pin = tax.pin + AND uni.year = tax.year +INNER JOIN tax.eq_factor AS eqf + ON uni.year = eqf.year +INNER JOIN tax.tax_code AS tcd + ON tax.tax_code_num = tcd.tax_code_num + AND tax.year = tcd.year +INNER JOIN ccao.class_dict + ON uni.class = class_dict.class_code From c56aaafbd3761f46fa5673fd8c841ab6a46a4dec Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 12 Jun 2024 22:07:26 +0000 Subject: [PATCH 05/96] Wrap assessment_roll --- .../reporting.sot_assessment_roll.py | 136 ++++++++++++++++++ .../reporting.sot_assessment_roll.sql | 89 ++++++++++++ .../reporting.sot_taxes_exemptions.py | 4 - 3 files changed, 225 insertions(+), 4 deletions(-) create mode 100644 dbt/models/reporting/reporting.sot_assessment_roll.py create mode 100644 dbt/models/reporting/reporting.sot_assessment_roll.sql diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py new file mode 100644 index 000000000..65dc3229b --- /dev/null +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -0,0 +1,136 @@ +# This script generates aggregated summary stats on sales data across a number +# of 
geographies, class combinations, and time. + +import os.path + +# Import libraries +import awswrangler as wr +import pandas as pd + +# Ingest data if it is not already available +if os.path.isfile("sot_assessment_roll.parquet.gzip"): + df = pd.read_parquet("sot_assessment_roll.parquet.gzip") + +else: + sql = open("reporting.sot_assessment_roll.sql").read() + df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False) + df.to_parquet("sot_assessment_roll.parquet.gzip", compression="gzip") + +# Declare geographic groups and their associated data years +geos = { + "census_data_year": [ + "census_place", + "census_tract", + "census_congressional_district", + "census_zcta", + ], + "cook_board_of_review_district_data_year": [ + "cook_board_of_review_district" + ], + "cook_commissioner_district_data_year": ["cook_commissioner_district"], + "cook_judicial_district_data_year": ["cook_judicial_district"], + "ward_data_year": ["ward_num"], + "community_area_data_year": ["community_area"], + "police_district_data_year": ["police_district"], + "central_business_district_data_year": ["central_business_district"], + "school_data_year": [ + "school_elementary_district", + "school_secondary_district", + "school_unified_district", + ], + "tax_data_year": [ + "tax_municipality", + "tax_park_district", + "tax_library_district", + "tax_fire_protection_district", + "tax_community_college_district", + "tax_sanitation_district", + "tax_special_service_area", + "tax_tif_district", + ], +} +# Declare class groupings +groups = ["no_group", "class", "major_class", "modeling_group", "stage_name"] + + +# Define aggregation functions +def q10(x): + return x.quantile(0.1) + + +def q25(x): + return x.quantile(0.25) + + +def q75(x): + return x.quantile(0.75) + + +def q90(x): + return x.quantile(0.9) + + +def first(x): + return x.iloc[0] + + +def aggregrate(data, geography_type, group_type): + print(geography_type, group_type) + + group = [geography_type, group_type, "year"] + 
summary = data.groupby(group).agg(stats).round(2) + summary["geography_type"] = geography_type + summary["group_type"] = group_type + summary.index.names = ["geography_id", "group_id", "year"] + summary = summary.reset_index().set_index( + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "year", + ] + ) + + return summary + + +more_stats = [ + "min", + q10, + q25, + "median", + q75, + q90, + "max", + "mean", + "sum", +] + +stats = { + "size": first, + "tot": ["count"] + more_stats, + "bldg": more_stats, + "land": more_stats, +} + +# Create an empty dataframe to fill with output +output = pd.DataFrame() + +# Loop through group combinations and stack output +for key, value in geos.items(): + df["data_year"] = df[key] + + for x in value: + for z in groups: + output = pd.concat([output, aggregrate(df, x, z)]) + +# Clean combined output and export +for i in ["median", "mean", "sum"]: + output["tot", "delta" + i] = output["tot", i].diff() + output["bldg", "delta" + i] = output["bldg", i].diff() + output["land", "delta" + i] = output["land", i].diff() + +output["tot", "pct_w_value"] = output["tot", "count"] / output["size", "first"] + +output.to_csv("sot_assessment_roll.csv") diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.sql b/dbt/models/reporting/reporting.sot_assessment_roll.sql new file mode 100644 index 000000000..e50aa10a7 --- /dev/null +++ b/dbt/models/reporting/reporting.sot_assessment_roll.sql @@ -0,0 +1,89 @@ +-- Gather parcel-level geographies and join land, sales, and class groupings +WITH counts AS ( + SELECT + year, + COUNT(*) AS size + FROM default.vw_pin_universe + GROUP BY year +) + +SELECT + CAST(vals.tot AS INT) AS tot, + CAST(vals.bldg AS INT) AS bldg, + CAST(vals.land AS INT) AS land, + CASE + WHEN + MOD(CAST(vals.year AS INT), 3) = 0 + AND uni.triad_name = 'North' + THEN TRUE + WHEN + MOD(CAST(vals.year AS INT), 3) = 1 + AND uni.triad_name = 'South' + THEN TRUE + WHEN + MOD(CAST(vals.year AS INT), 3) = 2 + AND 
uni.triad_name = 'City' + THEN TRUE + ELSE FALSE + END AS reassessment_year, + vals.class, + vals.stage_name, + vals.year, + 'Cook' AS county, + uni.triad_name AS triad, + uni.township_name AS township, + uni.nbhd_code AS nbhd, + uni.tax_code, + uni.zip_code, + uni.chicago_community_area_name AS community_area, + uni.census_place_geoid AS census_place, + uni.census_tract_geoid AS census_tract, + uni.census_congressional_district_geoid + AS + census_congressional_district, + uni.census_zcta_geoid AS census_zcta, + uni.cook_board_of_review_district_num AS cook_board_of_review_district, + uni.cook_commissioner_district_num AS cook_commissioner_district, + uni.cook_judicial_district_num AS cook_judicial_district, + uni.ward_num, + uni.chicago_police_district_num AS police_district, + uni.school_elementary_district_geoid AS school_elementary_district, + uni.school_secondary_district_geoid AS school_secondary_district, + uni.school_unified_district_geoid AS school_unified_district, + uni.tax_municipality_name AS tax_municipality, + uni.tax_park_district_name AS tax_park_district, + uni.tax_library_district_name AS tax_library_district, + uni.tax_fire_protection_district_name AS tax_fire_protection_district, + uni.tax_community_college_district_name + AS + tax_community_college_district, + uni.tax_sanitation_district_name AS tax_sanitation_district, + uni.tax_special_service_area_name AS tax_special_service_area, + uni.tax_tif_district_name AS tax_tif_district, + uni.econ_central_business_district_num AS central_business_district, + uni.census_data_year, + uni.cook_board_of_review_district_data_year, + uni.cook_commissioner_district_data_year, + uni.cook_judicial_district_data_year, + COALESCE( + uni.ward_chicago_data_year, uni.ward_evanston_data_year) AS + ward_data_year, + uni.chicago_community_area_data_year AS community_area_data_year, + uni.chicago_police_district_data_year AS police_district_data_year, + uni.econ_central_business_district_data_year + AS + 
central_business_district_data_year, + uni.school_data_year, + uni.tax_data_year, + 'no_group' AS no_group, + class_dict.major_class_type AS major_class, + class_dict.modeling_group, + counts.size +FROM default.vw_pin_universe AS uni +LEFT JOIN reporting.vw_pin_value_long AS vals + ON uni.pin = vals.pin + AND uni.year = vals.year +LEFT JOIN ccao.class_dict + ON vals.class = class_dict.class_code +LEFT JOIN counts + ON uni.year = counts.year diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index 7e236b820..b26e55a68 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -1,6 +1,5 @@ # This script generates aggregated summary stats on taxes and exemptions data # across a number of geographies, class combinations, and time. -# %% import os.path # Import libraries @@ -53,7 +52,6 @@ groups = ["no_group", "class", "major_class", "modeling_group"] -# %% # Define aggregation functions def q10(x): return x.quantile(0.1) @@ -109,7 +107,6 @@ def first(x): # Create an empty dataframe to fill with output output = pd.DataFrame() -# %% # Loop through group combinations and stack output for key, value in geos.items(): df["data_year"] = df[key] @@ -138,4 +135,3 @@ def first(x): output["tax_bill_total", "delta" + i] = output["tax_bill_total", i].diff() output.to_csv("sot_taxes_exemptions.csv") -# %% From 6c813082a0312e2f5bb070c2d48e962149178030 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 13 Jun 2024 15:39:14 +0000 Subject: [PATCH 06/96] Correct size, count calculations --- .../reporting.sot_assessment_roll.py | 7 +-- .../reporting.sot_assessment_roll.sql | 48 ++++++++++++------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 65dc3229b..7bae830b5 100644 --- 
a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -108,8 +108,7 @@ def aggregrate(data, geography_type, group_type): ] stats = { - "size": first, - "tot": ["count"] + more_stats, + "tot": ["size", "count"] + more_stats, "bldg": more_stats, "land": more_stats, } @@ -131,6 +130,8 @@ def aggregrate(data, geography_type, group_type): output["bldg", "delta" + i] = output["bldg", i].diff() output["land", "delta" + i] = output["land", i].diff() -output["tot", "pct_w_value"] = output["tot", "count"] / output["size", "first"] +output["tot", "pct_w_value"] = output["tot", "count"] / output["tot", "size"] output.to_csv("sot_assessment_roll.csv") + +# %% diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.sql b/dbt/models/reporting/reporting.sot_assessment_roll.sql index e50aa10a7..aa615a533 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll.sql @@ -1,34 +1,51 @@ -- Gather parcel-level geographies and join land, sales, and class groupings -WITH counts AS ( + +/* Ensure every municipality/class/year has a row for every stage through +cross-joining. This is to make sure that combinations that do not yet +exist in iasworld.asmt_all for the current year will exist in the view, but have +largely empty columns. For example: even if no class 4s in the City of Chicago +have been mailed yet for the current assessment year, we would still like an +empty City of Chicago/class 4 row to exist for the mailed stage. 
*/ +WITH stages AS ( + + SELECT 'MAILED' AS stage_name + UNION + SELECT 'ASSESSOR CERTIFIED' AS stage_name + UNION + SELECT 'BOR CERTIFIED' AS stage_name + +), + +uni AS ( SELECT - year, - COUNT(*) AS size + vw_pin_universe.*, + stages.* FROM default.vw_pin_universe - GROUP BY year + CROSS JOIN stages ) SELECT + uni.year, + uni.stage_name, + uni.class, CAST(vals.tot AS INT) AS tot, CAST(vals.bldg AS INT) AS bldg, CAST(vals.land AS INT) AS land, CASE WHEN - MOD(CAST(vals.year AS INT), 3) = 0 + MOD(CAST(uni.year AS INT), 3) = 0 AND uni.triad_name = 'North' THEN TRUE WHEN - MOD(CAST(vals.year AS INT), 3) = 1 + MOD(CAST(uni.year AS INT), 3) = 1 AND uni.triad_name = 'South' THEN TRUE WHEN - MOD(CAST(vals.year AS INT), 3) = 2 + MOD(CAST(uni.year AS INT), 3) = 2 AND uni.triad_name = 'City' THEN TRUE ELSE FALSE END AS reassessment_year, - vals.class, - vals.stage_name, - vals.year, 'Cook' AS county, uni.triad_name AS triad, uni.township_name AS township, @@ -77,13 +94,12 @@ SELECT uni.tax_data_year, 'no_group' AS no_group, class_dict.major_class_type AS major_class, - class_dict.modeling_group, - counts.size -FROM default.vw_pin_universe AS uni + class_dict.modeling_group +FROM uni LEFT JOIN reporting.vw_pin_value_long AS vals ON uni.pin = vals.pin AND uni.year = vals.year + AND uni.stage_name = vals.stage_name LEFT JOIN ccao.class_dict - ON vals.class = class_dict.class_code -LEFT JOIN counts - ON uni.year = counts.year + ON uni.class = class_dict.class_code +LIMIT 10000 From 1bf9b9c7a9b1f1714eee6757769b3023e74f036b Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 18 Jun 2024 00:35:38 +0000 Subject: [PATCH 07/96] Wrap sales table --- .../reporting/reporting.sot_ratio_stats.py | 162 ++++++++++++++++++ .../reporting/reporting.sot_ratio_stats.sql | 115 +++++++++++++ 2 files changed, 277 insertions(+) create mode 100644 dbt/models/reporting/reporting.sot_ratio_stats.py create mode 100644 dbt/models/reporting/reporting.sot_ratio_stats.sql diff --git 
a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py new file mode 100644 index 000000000..e54e42203 --- /dev/null +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -0,0 +1,162 @@ +# This script generates aggregated summary stats on sales data across a number +# of geographies, class combinations, and time. + +import os.path + +# Import libraries +import assesspy as ass +import awswrangler as wr +import pandas as pd + +# Ingest data if it is not already available +if os.path.isfile("sot_ratio_stats.parquet.gzip"): + df = pd.read_parquet("sot_ratio_stats.parquet.gzip") + +else: + sql = open("reporting.sot_ratio_stats.sql").read() + df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False) + df.to_parquet("sot_ratio_stats.parquet.gzip", compression="gzip") + +# Declare geographic groups and their associated data years +geos = { + "census_data_year": [ + "census_place", + "census_tract", + "census_congressional_district", + "census_zcta", + ], + "cook_board_of_review_district_data_year": [ + "cook_board_of_review_district" + ], + "cook_commissioner_district_data_year": ["cook_commissioner_district"], + "cook_judicial_district_data_year": ["cook_judicial_district"], + "ward_data_year": ["ward_num"], + "community_area_data_year": ["community_area"], + "police_district_data_year": ["police_district"], + "central_business_district_data_year": ["central_business_district"], + "school_data_year": [ + "school_elementary_district", + "school_secondary_district", + "school_unified_district", + ], + "tax_data_year": [ + "tax_municipality", + "tax_park_district", + "tax_library_district", + "tax_fire_protection_district", + "tax_community_college_district", + "tax_sanitation_district", + "tax_special_service_area", + "tax_tif_district", + ], +} +# Declare class groupings +groups = ["no_group", "class", "major_class", "modeling_group", "stage_name"] + + +# %% +# Define aggregation functions +def 
aggregrate(data, geography_type, group_type): + print(geography_type, group_type) + + group = [geography_type, group_type, "year"] + data["size"] = data.groupby(group)["tot_mv"].transform("size") + data["sale_count"] = data.groupby(group)["sale_price"].transform("count") + data["mv_count"] = data.groupby(group)["tot_mv"].transform("count") + data["ratio_count"] = data.groupby(group)["ratio"].transform("count") + data = data[data["ratio_count"] > 1] + + summary = ( + data.dropna(subset=["ratio"]) + .groupby(group) + .apply( + lambda x: pd.Series( + { + "size": x["size"].iloc[0], + "mv_count": x["mv_count"].iloc[0], + "sale_count": x["sale_count"].iloc[0], + "mv_min": x["tot_mv"].min(), + "mv_q10": x["tot_mv"].quantile(0.1), + "mv_q25": x["tot_mv"].quantile(0.25), + "mv_median": x["tot_mv"].median(), + "mv_q75": x["tot_mv"].quantile(0.75), + "mv_q90": x["tot_mv"].quantile(0.90), + "mv_max": x["tot_mv"].max(), + "mv_mean": x["tot_mv"].mean(), + "mv_sum": x["tot_mv"].sum(), + "ratio_min": x["ratio"].min(), + "ratio_q10": x["ratio"].quantile(0.1), + "ratio_q25": x["ratio"].quantile(0.25), + "ratio_median": x["ratio"].median(), + "ratio_q75": x["ratio"].quantile(0.75), + "ratio_q90": x["ratio"].quantile(0.90), + "ratio_max": x["ratio"].max(), + "ratio_mean": x["ratio"].mean(), + "cod": ass.cod(ratio=x["ratio"]), + "prd": ass.prd(x["tot_mv"], x["sale_price"]), + "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"], + # "mki": ass.mki(x["tot_mv"], x["sale_price"]), + } + ), + include_groups=False, + ) + ) + summary["geography_type"] = geography_type + summary["group_type"] = group_type + + return summary + + +# Create an empty dataframe to fill with output +output = pd.DataFrame() + +# Loop through group combinations and stack output +for key, value in geos.items(): + df["data_year"] = df[key] + + for x in value: + for z in groups: + output = pd.concat([output, aggregrate(df, x, z)]) + +output.index.names = ["geography_id", "group_id", "year"] + +output = 
output.reset_index().set_index( + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "year", + ] +) + +# Clean combined output and export +output["mv_delta_pct_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .mv_median.diff() +) +output["mv_delta_pct_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .mv_mean.diff() +) +output["mv_delta_pct_sum"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .mv_sum.diff() +) + +output["mv_delta_pct_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .mv_median.pct_change() +) +output["mv_delta_pct_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .mv_mean.pct_change() +) + +output.dropna(how="all", axis=1, inplace=True) +output.to_csv("sot_ratio_stats.csv") diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.sql b/dbt/models/reporting/reporting.sot_ratio_stats.sql new file mode 100644 index 000000000..267029dd7 --- /dev/null +++ b/dbt/models/reporting/reporting.sot_ratio_stats.sql @@ -0,0 +1,115 @@ +-- Gather parcel-level geographies and join land, sales, and class groupings + +/* Ensure every municipality/class/year has a row for every stage through +cross-joining. This is to make sure that combinations that do not yet +exist in iasworld.asmt_all for the current year will exist in the view, but have +largely empty columns. For example: even if no class 4s in the City of Chicago +have been mailed yet for the current assessment year, we would still like an +empty City of Chicago/class 4 row to exist for the mailed stage. 
*/ +WITH stages AS ( + + SELECT 'MAILED' AS stage_name + UNION + SELECT 'ASSESSOR CERTIFIED' AS stage_name + UNION + SELECT 'BOR CERTIFIED' AS stage_name + +), + +uni AS ( + SELECT + vw_pin_universe.*, + stages.* + FROM default.vw_pin_universe + CROSS JOIN stages +) + +SELECT + CAST(sales.sale_price AS DOUBLE) AS sale_price, + uni.year, + uni.stage_name, + uni.class, + CAST(vals.tot_mv AS DOUBLE) AS tot_mv, + CAST(vals.tot_mv AS DOUBLE) / CAST(sales.sale_price AS DOUBLE) AS ratio, + CASE + WHEN + MOD(CAST(uni.year AS INT), 3) = 0 + AND uni.triad_name = 'North' + THEN TRUE + WHEN + MOD(CAST(uni.year AS INT), 3) = 1 + AND uni.triad_name = 'South' + THEN TRUE + WHEN + MOD(CAST(uni.year AS INT), 3) = 2 + AND uni.triad_name = 'City' + THEN TRUE + ELSE FALSE + END AS reassessment_year, + 'Cook' AS county, + uni.triad_name AS triad, + uni.township_name AS township, + uni.nbhd_code AS nbhd, + uni.tax_code, + uni.zip_code, + uni.chicago_community_area_name AS community_area, + uni.census_place_geoid AS census_place, + uni.census_tract_geoid AS census_tract, + uni.census_congressional_district_geoid + AS + census_congressional_district, + uni.census_zcta_geoid AS census_zcta, + uni.cook_board_of_review_district_num AS cook_board_of_review_district, + uni.cook_commissioner_district_num AS cook_commissioner_district, + uni.cook_judicial_district_num AS cook_judicial_district, + uni.ward_num, + uni.chicago_police_district_num AS police_district, + uni.school_elementary_district_geoid AS school_elementary_district, + uni.school_secondary_district_geoid AS school_secondary_district, + uni.school_unified_district_geoid AS school_unified_district, + uni.tax_municipality_name AS tax_municipality, + uni.tax_park_district_name AS tax_park_district, + uni.tax_library_district_name AS tax_library_district, + uni.tax_fire_protection_district_name AS tax_fire_protection_district, + uni.tax_community_college_district_name + AS + tax_community_college_district, + 
uni.tax_sanitation_district_name AS tax_sanitation_district, + uni.tax_special_service_area_name AS tax_special_service_area, + uni.tax_tif_district_name AS tax_tif_district, + uni.econ_central_business_district_num AS central_business_district, + uni.census_data_year, + uni.cook_board_of_review_district_data_year, + uni.cook_commissioner_district_data_year, + uni.cook_judicial_district_data_year, + COALESCE( + uni.ward_chicago_data_year, uni.ward_evanston_data_year) AS + ward_data_year, + uni.chicago_community_area_data_year AS community_area_data_year, + uni.chicago_police_district_data_year AS police_district_data_year, + uni.econ_central_business_district_data_year + AS + central_business_district_data_year, + uni.school_data_year, + uni.tax_data_year, + 'no_group' AS no_group, + class_dict.major_class_type AS major_class, + class_dict.modeling_group +FROM uni +LEFT JOIN + z_ci_508_add_mv_to_reportingvw_pin_value_long_reporting.vw_pin_value_long + AS vals + ON uni.pin = vals.pin + AND uni.year = vals.year + AND uni.stage_name = vals.stage_name +LEFT JOIN ccao.class_dict + ON uni.class = class_dict.class_code +LEFT JOIN default.vw_pin_sale AS sales + ON uni.pin = sales.pin + AND uni.year = sales.year + AND NOT sales.is_multisale + AND NOT sales.sale_filter_deed_type + AND NOT sales.sale_filter_less_than_10k + AND NOT sales.sale_filter_same_sale_within_365 +WHERE uni.year >= '2020' + AND (vals.tot_mv > 0 OR vals.tot_mv IS NULL) From 0a9e1f3f124141c079cb5c7711192e631674ece0 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 18 Jun 2024 18:38:53 +0000 Subject: [PATCH 08/96] Correct stage grouping, counting --- .../reporting/reporting.sot_ratio_stats.py | 26 ++++++++++++------- .../reporting/reporting.sot_ratio_stats.sql | 1 - 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index e54e42203..a37cb2261 100644 --- 
a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -1,6 +1,5 @@ # This script generates aggregated summary stats on sales data across a number # of geographies, class combinations, and time. - import os.path # Import libraries @@ -51,19 +50,23 @@ ], } # Declare class groupings -groups = ["no_group", "class", "major_class", "modeling_group", "stage_name"] +groups = ["no_group", "class", "major_class", "modeling_group"] -# %% # Define aggregation functions def aggregrate(data, geography_type, group_type): print(geography_type, group_type) - group = [geography_type, group_type, "year"] + group = [geography_type, group_type, "year", "stage_name"] data["size"] = data.groupby(group)["tot_mv"].transform("size") data["sale_count"] = data.groupby(group)["sale_price"].transform("count") data["mv_count"] = data.groupby(group)["tot_mv"].transform("count") + + # Remove parcels with FMVs of 0 since they screw up ratios + data = data[data["tot_mv"] > 0].reset_index() data["ratio_count"] = data.groupby(group)["ratio"].transform("count") + + # Remove groups that only have one sale since we can't calculate stats data = data[data["ratio_count"] > 1] summary = ( @@ -118,7 +121,7 @@ def aggregrate(data, geography_type, group_type): for z in groups: output = pd.concat([output, aggregrate(df, x, z)]) -output.index.names = ["geography_id", "group_id", "year"] +output.index.names = ["geography_id", "group_id", "year", "stage_name"] output = output.reset_index().set_index( [ @@ -127,36 +130,39 @@ def aggregrate(data, geography_type, group_type): "group_type", "group_id", "year", + "stage_name", ] ) # Clean combined output and export output["mv_delta_pct_median"] = ( output.sort_values("year") - .groupby(["geography_id", "group_id"]) + .groupby(["geography_id", "group_id", "stage_name"]) .mv_median.diff() ) output["mv_delta_pct_mean"] = ( output.sort_values("year") - .groupby(["geography_id", "group_id"]) + .groupby(["geography_id", 
"group_id", "stage_name"]) .mv_mean.diff() ) output["mv_delta_pct_sum"] = ( output.sort_values("year") - .groupby(["geography_id", "group_id"]) + .groupby(["geography_id", "group_id", "stage_name"]) .mv_sum.diff() ) output["mv_delta_pct_median"] = ( output.sort_values("year") - .groupby(["geography_id", "group_id"]) + .groupby(["geography_id", "group_id", "stage_name"]) .mv_median.pct_change() ) output["mv_delta_pct_mean"] = ( output.sort_values("year") - .groupby(["geography_id", "group_id"]) + .groupby(["geography_id", "group_id", "stage_name"]) .mv_mean.pct_change() ) output.dropna(how="all", axis=1, inplace=True) output.to_csv("sot_ratio_stats.csv") + +# %% diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.sql b/dbt/models/reporting/reporting.sot_ratio_stats.sql index 267029dd7..dc6cf73e3 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stats.sql @@ -112,4 +112,3 @@ LEFT JOIN default.vw_pin_sale AS sales AND NOT sales.sale_filter_less_than_10k AND NOT sales.sale_filter_same_sale_within_365 WHERE uni.year >= '2020' - AND (vals.tot_mv > 0 OR vals.tot_mv IS NULL) From 030a7c54e72ebbda37fb154e51246e53812ba192 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 18 Jun 2024 19:10:05 +0000 Subject: [PATCH 09/96] Fix assessment roll stage grouping --- dbt/models/reporting/reporting.sot_assessment_roll.py | 9 ++++----- dbt/models/reporting/reporting.sot_assessment_roll.sql | 1 - dbt/models/reporting/reporting.sot_ratio_stats.sql | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 7bae830b5..a31adf5b2 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -50,7 +50,7 @@ ], } # Declare class groupings -groups = ["no_group", "class", "major_class", "modeling_group", "stage_name"] +groups = 
["no_group", "class", "major_class", "modeling_group"] # Define aggregation functions @@ -77,11 +77,11 @@ def first(x): def aggregrate(data, geography_type, group_type): print(geography_type, group_type) - group = [geography_type, group_type, "year"] + group = [geography_type, group_type, "year", "stage_name"] summary = data.groupby(group).agg(stats).round(2) summary["geography_type"] = geography_type summary["group_type"] = group_type - summary.index.names = ["geography_id", "group_id", "year"] + summary.index.names = ["geography_id", "group_id", "year", "stage_name"] summary = summary.reset_index().set_index( [ "geography_type", @@ -89,6 +89,7 @@ def aggregrate(data, geography_type, group_type): "group_type", "group_id", "year", + "stage_name", ] ) @@ -133,5 +134,3 @@ def aggregrate(data, geography_type, group_type): output["tot", "pct_w_value"] = output["tot", "count"] / output["tot", "size"] output.to_csv("sot_assessment_roll.csv") - -# %% diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.sql b/dbt/models/reporting/reporting.sot_assessment_roll.sql index aa615a533..0d582c814 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll.sql @@ -102,4 +102,3 @@ LEFT JOIN reporting.vw_pin_value_long AS vals AND uni.stage_name = vals.stage_name LEFT JOIN ccao.class_dict ON uni.class = class_dict.class_code -LIMIT 10000 diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.sql b/dbt/models/reporting/reporting.sot_ratio_stats.sql index dc6cf73e3..b142f537a 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stats.sql @@ -97,8 +97,7 @@ SELECT class_dict.modeling_group FROM uni LEFT JOIN - z_ci_508_add_mv_to_reportingvw_pin_value_long_reporting.vw_pin_value_long - AS vals + reporting.vw_pin_value_long AS vals ON uni.pin = vals.pin AND uni.year = vals.year AND uni.stage_name = vals.stage_name From 
1c2adaecb32567c6e9050b1049392b6a9b22d12d Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 18 Jun 2024 19:49:18 +0000 Subject: [PATCH 10/96] Clean output before writing --- dbt/models/reporting/reporting.sot_assessment_roll.py | 3 +++ dbt/models/reporting/reporting.sot_sales.py | 3 +++ dbt/models/reporting/reporting.sot_taxes_exemptions.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index a31adf5b2..cfacb1b4b 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -133,4 +133,7 @@ def aggregrate(data, geography_type, group_type): output["tot", "pct_w_value"] = output["tot", "count"] / output["tot", "size"] +output.columns = ["_".join(col) for col in output.columns] +output.reset_index() + output.to_csv("sot_assessment_roll.csv") diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index 120a0bd5b..c2034f873 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -134,4 +134,7 @@ def first(x): output["sale_price", "delta" + i] = output["sale_price", i].diff() output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff() +output.columns = ["_".join(col) for col in output.columns] +output.reset_index() + output.to_csv("sot_sales.csv") diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index b26e55a68..9ef77b278 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -134,4 +134,7 @@ def first(x): for i in ["median", "mean", "sum"]: output["tax_bill_total", "delta" + i] = output["tax_bill_total", i].diff() +output.columns = ["_".join(col) for col in output.columns] +output.reset_index() + 
output.to_csv("sot_taxes_exemptions.csv") From 672bd1ebec6156294ebdd35bf924b764e41f4ef2 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 18 Jun 2024 20:31:17 +0000 Subject: [PATCH 11/96] Begin dbt building --- dbt/models/reporting/docs.md | 36 +++++++++++++++++++ ...> reporting.sot_assessment_roll_input.sql} | 11 ++++-- ...ql => reporting.sot_ratio_stats_input.sql} | 13 ++++--- ...ales.sql => reporting.sot_sales_input.sql} | 16 ++++++--- ... reporting.sot_taxes_exemptions_input.sql} | 16 ++++++--- dbt/models/reporting/schema.yml | 24 +++++++++++++ 6 files changed, 99 insertions(+), 17 deletions(-) rename dbt/models/reporting/{reporting.sot_assessment_roll.sql => reporting.sot_assessment_roll_input.sql} (94%) rename dbt/models/reporting/{reporting.sot_ratio_stats.sql => reporting.sot_ratio_stats_input.sql} (94%) rename dbt/models/reporting/{reporting.sot_sales.sql => reporting.sot_sales_input.sql} (92%) rename dbt/models/reporting/{reporting.sot_taxes_exemptions.sql => reporting.sot_taxes_exemptions_input.sql} (92%) diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md index 2ce0bcb92..baf60aba9 100644 --- a/dbt/models/reporting/docs.md +++ b/dbt/models/reporting/docs.md @@ -30,6 +30,42 @@ Materialized to speed up queries for Tableau. `property_group` {% enddocs %} +# sot_assessment_roll_input + +{% docs table_sot_assessment_roll_input %} +Table to feed the Python dbt job that creates the +`reporting.sot_assessment_roll` table. Feeds public reporting assets. + +**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id` +{% enddocs %} + +# sot_ratio_stats_input + +{% docs table_sot_ratio_stats_input %} +Table to feed the Python dbt job that creates the +`reporting.sot_ratio_stats` table. Feeds public reporting assets. 
+ +**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id` +{% enddocs %} + +# sot_sales_input + +{% docs table_sot_sales_input %} +Table to feed the Python dbt job that creates the +`reporting.sot_sales` table. Feeds public reporting assets. + +**Primary Key**: `year`, `geography_id`, `group_id` +{% enddocs %} + +# sot_taxes_exemptions_input + +{% docs table_sot_taxes_exemptions_input %} +Table to feed the Python dbt job that creates the +`reporting.sot_taxes_exemptions` table. Feeds public reporting assets. + +**Primary Key**: `year`, `geography_id`, `group_id` +{% enddocs %} + # vw_assessment_roll {% docs view_vw_assessment_roll %} diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql similarity index 94% rename from dbt/models/reporting/reporting.sot_assessment_roll.sql rename to dbt/models/reporting/reporting.sot_assessment_roll_input.sql index 0d582c814..395337357 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -1,4 +1,9 @@ -- Gather parcel-level geographies and join land, sales, and class groupings +{{ + config( + materialized='table' + ) +}} /* Ensure every municipality/class/year has a row for every stage through cross-joining. 
This is to make sure that combinations that do not yet @@ -20,7 +25,7 @@ uni AS ( SELECT vw_pin_universe.*, stages.* - FROM default.vw_pin_universe + FROM {{ ref('default.vw_pin_universe') }} CROSS JOIN stages ) @@ -96,9 +101,9 @@ SELECT class_dict.major_class_type AS major_class, class_dict.modeling_group FROM uni -LEFT JOIN reporting.vw_pin_value_long AS vals +LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals ON uni.pin = vals.pin AND uni.year = vals.year AND uni.stage_name = vals.stage_name -LEFT JOIN ccao.class_dict +LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql similarity index 94% rename from dbt/models/reporting/reporting.sot_ratio_stats.sql rename to dbt/models/reporting/reporting.sot_ratio_stats_input.sql index b142f537a..f723c10ef 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql @@ -1,4 +1,9 @@ -- Gather parcel-level geographies and join land, sales, and class groupings +{{ + config( + materialized='table' + ) +}} /* Ensure every municipality/class/year has a row for every stage through cross-joining. 
This is to make sure that combinations that do not yet @@ -20,7 +25,7 @@ uni AS ( SELECT vw_pin_universe.*, stages.* - FROM default.vw_pin_universe + FROM {{ ref('default.vw_pin_universe') }} CROSS JOIN stages ) @@ -97,13 +102,13 @@ SELECT class_dict.modeling_group FROM uni LEFT JOIN - reporting.vw_pin_value_long AS vals + {{ ref('reporting.vw_pin_value_long') }} AS vals ON uni.pin = vals.pin AND uni.year = vals.year AND uni.stage_name = vals.stage_name -LEFT JOIN ccao.class_dict +LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code -LEFT JOIN default.vw_pin_sale AS sales +LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales ON uni.pin = sales.pin AND uni.year = sales.year AND NOT sales.is_multisale diff --git a/dbt/models/reporting/reporting.sot_sales.sql b/dbt/models/reporting/reporting.sot_sales_input.sql similarity index 92% rename from dbt/models/reporting/reporting.sot_sales.sql rename to dbt/models/reporting/reporting.sot_sales_input.sql index 2be8e4dc5..98dcbec23 100644 --- a/dbt/models/reporting/reporting.sot_sales.sql +++ b/dbt/models/reporting/reporting.sot_sales_input.sql @@ -1,3 +1,9 @@ +{{ + config( + materialized='table' + ) +}} + -- Gather parcel-level land and yrblt WITH sf AS ( SELECT @@ -6,7 +12,7 @@ WITH sf AS ( SUM(char_bldg_sf) AS char_bldg_sf, SUM(char_land_sf) AS char_land_sf, ARBITRARY(char_yrblt) AS char_yrblt - FROM default.vw_card_res_char + FROM {{ ref('default.vw_card_res_char') }} GROUP BY pin, year ) @@ -75,16 +81,16 @@ SELECT 'no_group' AS no_group, class_dict.major_class_type AS major_class, class_dict.modeling_group -FROM default.vw_pin_universe AS uni +FROM {{ ref('default.vw_pin_universe') }} AS uni LEFT JOIN sf ON uni.pin = sf.pin AND uni.year = sf.year -LEFT JOIN ccao.class_dict +LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code -LEFT JOIN default.vw_pin_history AS hist +LEFT JOIN {{ ref('default.vw_pin_history') }} AS hist ON uni.pin = hist.pin AND uni.year = hist.year -LEFT 
JOIN default.vw_pin_sale AS sales +LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales ON uni.pin = sales.pin AND uni.year = sales.year AND NOT sales.is_multisale diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql similarity index 92% rename from dbt/models/reporting/reporting.sot_taxes_exemptions.sql rename to dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql index f96bdc5d4..80bee99bc 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.sql +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql @@ -1,3 +1,9 @@ +{{ + config( + materialized='table' + ) +}} + -- Gather parcel-level geographies and join taxes, exemptions, and class -- groupings SELECT @@ -80,14 +86,14 @@ SELECT 'no_group' AS no_group, class_dict.major_class_type AS major_class, class_dict.modeling_group -FROM default.vw_pin_universe AS uni -INNER JOIN tax.pin AS tax +FROM {{ ref('default.vw_pin_universe') }} AS uni +INNER JOIN {{ source('tax', 'pin') }} AS tax ON uni.pin = tax.pin AND uni.year = tax.year -INNER JOIN tax.eq_factor AS eqf +INNER JOIN {{ source('tax', 'eq_factor') }} AS eqf ON uni.year = eqf.year -INNER JOIN tax.tax_code AS tcd +INNER JOIN {{ source('tax', 'tax_code') }} AS tcd ON tax.tax_code_num = tcd.tax_code_num AND tax.year = tcd.year -INNER JOIN ccao.class_dict +INNER JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code diff --git a/dbt/models/reporting/schema.yml b/dbt/models/reporting/schema.yml index daf546e4f..7afa63d71 100644 --- a/dbt/models/reporting/schema.yml +++ b/dbt/models/reporting/schema.yml @@ -34,6 +34,30 @@ models: within_20_pct >= within_10_pct AND within_10_pct >= within_05_pct + - name: reporting.sot_assessment_roll_input + description: '{{ doc("table_sot_assessment_roll_input") }}' + config: + tags: + - daily + + - name: reporting.sot_ratio_stats_input + description: '{{ doc("table_sot_ratio_stats_input") }}' + config: + 
tags: + - daily + + - name: reporting.sot_sales_input + description: '{{ doc("table_sot_sales_input") }}' + config: + tags: + - daily + + - name: reporting.sot_taxes_exemptions_input + description: '{{ doc("table_sot_taxes_exemptions_input") }}' + config: + tags: + - daily + - name: reporting.ratio_stats_input description: '{{ doc("table_ratio_stats_input") }}' config: From 3f60a77407337b1c41b247190f494d0af985135e Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 20 Jun 2024 18:33:28 +0000 Subject: [PATCH 12/96] Attempt to build assessment_roll table --- dbt/models/reporting/docs.md | 9 + .../reporting.sot_assessment_roll.py | 101 ++++++----- .../reporting.sot_assessment_roll_input.sql | 19 +- .../reporting/reporting.sot_ratio_stats.py | 168 ------------------ dbt/models/reporting/reporting.sot_sales.py | 140 --------------- .../reporting.sot_taxes_exemptions.py | 140 --------------- dbt/models/reporting/schema.yml | 6 + 7 files changed, 82 insertions(+), 501 deletions(-) delete mode 100644 dbt/models/reporting/reporting.sot_ratio_stats.py delete mode 100644 dbt/models/reporting/reporting.sot_sales.py delete mode 100644 dbt/models/reporting/reporting.sot_taxes_exemptions.py diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md index a94a0be67..dd0f7bf68 100644 --- a/dbt/models/reporting/docs.md +++ b/dbt/models/reporting/docs.md @@ -30,6 +30,15 @@ Materialized to speed up queries for Tableau. `property_group` {% enddocs %} +# sot_assessment_roll + +{% docs table_sot_assessment_roll %} +Table to feed the Python dbt job that creates the +`reporting.sot_assessment_roll` table. Feeds public reporting assets. 
+ +**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id` +{% enddocs %} + # sot_assessment_roll_input {% docs table_sot_assessment_roll_input %} diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index cfacb1b4b..2f99c35bc 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -1,21 +1,12 @@ +# pylint: skip-file +# type: ignore + # This script generates aggregated summary stats on sales data across a number # of geographies, class combinations, and time. -import os.path - # Import libraries -import awswrangler as wr import pandas as pd -# Ingest data if it is not already available -if os.path.isfile("sot_assessment_roll.parquet.gzip"): - df = pd.read_parquet("sot_assessment_roll.parquet.gzip") - -else: - sql = open("reporting.sot_assessment_roll.sql").read() - df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False) - df.to_parquet("sot_assessment_roll.parquet.gzip", compression="gzip") - # Declare geographic groups and their associated data years geos = { "census_data_year": [ @@ -74,6 +65,25 @@ def first(x): return x.iloc[0] +more_stats = [ + "min", + q10, + q25, + "median", + q75, + q90, + "max", + "mean", + "sum", +] + +stats = { + "tot": ["size", "count"] + more_stats, + "bldg": more_stats, + "land": more_stats, +} + + def aggregrate(data, geography_type, group_type): print(geography_type, group_type) @@ -96,44 +106,45 @@ def aggregrate(data, geography_type, group_type): return summary -more_stats = [ - "min", - q10, - q25, - "median", - q75, - q90, - "max", - "mean", - "sum", -] +def assemble(df, geos, groups): + # Create an empty dataframe to fill with output + output = pd.DataFrame() -stats = { - "tot": ["size", "count"] + more_stats, - "bldg": more_stats, - "land": more_stats, -} + # Loop through group combinations and stack output + for key, value in geos.items(): + 
df["data_year"] = df[key] + + for x in value: + for z in groups: + output = pd.concat([output, aggregrate(df, x, z)]) + + # Clean combined output and export + for i in ["median", "mean", "sum"]: + output["tot", "delta" + i] = output["tot", i].diff() + output["bldg", "delta" + i] = output["bldg", i].diff() + output["land", "delta" + i] = output["land", i].diff() + + output["tot", "pct_w_value"] = ( + output["tot", "count"] / output["tot", "size"] + ) + + output.columns = ["_".join(col) for col in output.columns] + output.reset_index() + + return output -# Create an empty dataframe to fill with output -output = pd.DataFrame() -# Loop through group combinations and stack output -for key, value in geos.items(): - df["data_year"] = df[key] +def model(dbt, spark_session): + dbt.config(materialized="table") - for x in value: - for z in groups: - output = pd.concat([output, aggregrate(df, x, z)]) + input = dbt.ref("reporting.sot_assessment_roll_input") -# Clean combined output and export -for i in ["median", "mean", "sum"]: - output["tot", "delta" + i] = output["tot", i].diff() - output["bldg", "delta" + i] = output["bldg", i].diff() - output["land", "delta" + i] = output["land", i].diff() + # Convert the Spark input dataframe to Pandas for + # compatibility with assesspy functions + input = input.toPandas() -output["tot", "pct_w_value"] = output["tot", "count"] / output["tot", "size"] + df = assemble(input, geos=geos, groups=groups) -output.columns = ["_".join(col) for col in output.columns] -output.reset_index() + spark_df = spark_session.createDataFrame(df) -output.to_csv("sot_assessment_roll.csv") + return spark_df diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index 395337357..e29d3c037 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -72,16 +72,19 @@ SELECT 
uni.school_elementary_district_geoid AS school_elementary_district, uni.school_secondary_district_geoid AS school_secondary_district, uni.school_unified_district_geoid AS school_unified_district, - uni.tax_municipality_name AS tax_municipality, - uni.tax_park_district_name AS tax_park_district, - uni.tax_library_district_name AS tax_library_district, - uni.tax_fire_protection_district_name AS tax_fire_protection_district, - uni.tax_community_college_district_name + ARRAY_JOIN(uni.tax_municipality_name, ', ') AS tax_municipality, + ARRAY_JOIN(uni.tax_park_district_name, ', ') AS tax_park_district, + ARRAY_JOIN(uni.tax_library_district_name, ', ') AS tax_library_district, + ARRAY_JOIN(uni.tax_fire_protection_district_name, ', ') + AS tax_fire_protection_district, + ARRAY_JOIN(uni.tax_community_college_district_name, ', ') AS tax_community_college_district, - uni.tax_sanitation_district_name AS tax_sanitation_district, - uni.tax_special_service_area_name AS tax_special_service_area, - uni.tax_tif_district_name AS tax_tif_district, + ARRAY_JOIN(uni.tax_sanitation_district_name, ', ') + AS tax_sanitation_district, + ARRAY_JOIN(uni.tax_special_service_area_name, ', ') + AS tax_special_service_area, + ARRAY_JOIN(uni.tax_tif_district_name, ', ') AS tax_tif_district, uni.econ_central_business_district_num AS central_business_district, uni.census_data_year, uni.cook_board_of_review_district_data_year, diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py deleted file mode 100644 index a37cb2261..000000000 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ /dev/null @@ -1,168 +0,0 @@ -# This script generates aggregated summary stats on sales data across a number -# of geographies, class combinations, and time. 
-import os.path - -# Import libraries -import assesspy as ass -import awswrangler as wr -import pandas as pd - -# Ingest data if it is not already available -if os.path.isfile("sot_ratio_stats.parquet.gzip"): - df = pd.read_parquet("sot_ratio_stats.parquet.gzip") - -else: - sql = open("reporting.sot_ratio_stats.sql").read() - df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False) - df.to_parquet("sot_ratio_stats.parquet.gzip", compression="gzip") - -# Declare geographic groups and their associated data years -geos = { - "census_data_year": [ - "census_place", - "census_tract", - "census_congressional_district", - "census_zcta", - ], - "cook_board_of_review_district_data_year": [ - "cook_board_of_review_district" - ], - "cook_commissioner_district_data_year": ["cook_commissioner_district"], - "cook_judicial_district_data_year": ["cook_judicial_district"], - "ward_data_year": ["ward_num"], - "community_area_data_year": ["community_area"], - "police_district_data_year": ["police_district"], - "central_business_district_data_year": ["central_business_district"], - "school_data_year": [ - "school_elementary_district", - "school_secondary_district", - "school_unified_district", - ], - "tax_data_year": [ - "tax_municipality", - "tax_park_district", - "tax_library_district", - "tax_fire_protection_district", - "tax_community_college_district", - "tax_sanitation_district", - "tax_special_service_area", - "tax_tif_district", - ], -} -# Declare class groupings -groups = ["no_group", "class", "major_class", "modeling_group"] - - -# Define aggregation functions -def aggregrate(data, geography_type, group_type): - print(geography_type, group_type) - - group = [geography_type, group_type, "year", "stage_name"] - data["size"] = data.groupby(group)["tot_mv"].transform("size") - data["sale_count"] = data.groupby(group)["sale_price"].transform("count") - data["mv_count"] = data.groupby(group)["tot_mv"].transform("count") - - # Remove parcels with FMVs of 0 since 
they screw up ratios - data = data[data["tot_mv"] > 0].reset_index() - data["ratio_count"] = data.groupby(group)["ratio"].transform("count") - - # Remove groups that only have one sale since we can't calculate stats - data = data[data["ratio_count"] > 1] - - summary = ( - data.dropna(subset=["ratio"]) - .groupby(group) - .apply( - lambda x: pd.Series( - { - "size": x["size"].iloc[0], - "mv_count": x["mv_count"].iloc[0], - "sale_count": x["sale_count"].iloc[0], - "mv_min": x["tot_mv"].min(), - "mv_q10": x["tot_mv"].quantile(0.1), - "mv_q25": x["tot_mv"].quantile(0.25), - "mv_median": x["tot_mv"].median(), - "mv_q75": x["tot_mv"].quantile(0.75), - "mv_q90": x["tot_mv"].quantile(0.90), - "mv_max": x["tot_mv"].max(), - "mv_mean": x["tot_mv"].mean(), - "mv_sum": x["tot_mv"].sum(), - "ratio_min": x["ratio"].min(), - "ratio_q10": x["ratio"].quantile(0.1), - "ratio_q25": x["ratio"].quantile(0.25), - "ratio_median": x["ratio"].median(), - "ratio_q75": x["ratio"].quantile(0.75), - "ratio_q90": x["ratio"].quantile(0.90), - "ratio_max": x["ratio"].max(), - "ratio_mean": x["ratio"].mean(), - "cod": ass.cod(ratio=x["ratio"]), - "prd": ass.prd(x["tot_mv"], x["sale_price"]), - "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"], - # "mki": ass.mki(x["tot_mv"], x["sale_price"]), - } - ), - include_groups=False, - ) - ) - summary["geography_type"] = geography_type - summary["group_type"] = group_type - - return summary - - -# Create an empty dataframe to fill with output -output = pd.DataFrame() - -# Loop through group combinations and stack output -for key, value in geos.items(): - df["data_year"] = df[key] - - for x in value: - for z in groups: - output = pd.concat([output, aggregrate(df, x, z)]) - -output.index.names = ["geography_id", "group_id", "year", "stage_name"] - -output = output.reset_index().set_index( - [ - "geography_type", - "geography_id", - "group_type", - "group_id", - "year", - "stage_name", - ] -) - -# Clean combined output and export 
-output["mv_delta_pct_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_median.diff() -) -output["mv_delta_pct_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_mean.diff() -) -output["mv_delta_pct_sum"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_sum.diff() -) - -output["mv_delta_pct_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_median.pct_change() -) -output["mv_delta_pct_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_mean.pct_change() -) - -output.dropna(how="all", axis=1, inplace=True) -output.to_csv("sot_ratio_stats.csv") - -# %% diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py deleted file mode 100644 index c2034f873..000000000 --- a/dbt/models/reporting/reporting.sot_sales.py +++ /dev/null @@ -1,140 +0,0 @@ -# This script generates aggregated summary stats on sales data across a number -# of geographies, class combinations, and time. 
- -import os.path -import statistics as stats - -# Import libraries -import awswrangler as wr -import numpy as np -import pandas as pd - -# Ingest data if it is not already available -if os.path.isfile("sot_sales.parquet.gzip"): - df = pd.read_parquet("sot_sales.parquet.gzip") - -else: - sql = open("reporting.sot_sales.sql").read() - df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False) - df.to_parquet("sot_sales.parquet.gzip", compression="gzip") - -# Declare geographic groups and their associated data years -geos = { - "census_data_year": [ - "census_place", - "census_tract", - "census_congressional_district", - "census_zcta", - ], - "cook_board_of_review_district_data_year": [ - "cook_board_of_review_district" - ], - "cook_commissioner_district_data_year": ["cook_commissioner_district"], - "cook_judicial_district_data_year": ["cook_judicial_district"], - "ward_data_year": ["ward_num"], - "community_area_data_year": ["community_area"], - "police_district_data_year": ["police_district"], - "central_business_district_data_year": ["central_business_district"], - "school_data_year": [ - "school_elementary_district", - "school_secondary_district", - "school_unified_district", - ], - "tax_data_year": [ - "tax_municipality", - "tax_park_district", - "tax_library_district", - "tax_fire_protection_district", - "tax_community_college_district", - "tax_sanitation_district", - "tax_special_service_area", - "tax_tif_district", - ], -} -# Declare class groupings -groups = ["no_group", "class", "major_class", "modeling_group"] - - -# Define aggregation functions -def q10(x): - return x.quantile(0.1) - - -def q25(x): - return x.quantile(0.25) - - -def q75(x): - return x.quantile(0.75) - - -def q90(x): - return x.quantile(0.9) - - -def first(x): - return x.iloc[0] - - -more_stats = [ - "min", - q10, - q25, - "median", - q75, - q90, - "max", - "mean", - "sum", -] - -agg_func_math = { - "sale_price": ["size", "count"] + more_stats, - "price_per_sf": 
more_stats, - "char_bldg_sf": ["median"], - "char_land_sf": ["median"], - "char_yrblt": ["median"], - "class": [stats.multimode], - "data_year": [first], -} - -# Create an empty dataframe to fill with output -output = pd.DataFrame() - -# Loop through group combinations and stack output -for key, value in geos.items(): - df["data_year"] = df[key] - - for x in value: - for z in groups: - group = [x, z, "year"] - summary = df.groupby(group).agg(agg_func_math).round(2) - summary["geography_type"] = x - summary["group_type"] = z - summary.index.names = ["geography_id", "group_id", "year"] - summary = summary.reset_index().set_index( - [ - "geography_type", - "geography_id", - "group_type", - "group_id", - "year", - ] - ) - - output = pd.concat([output, summary]) - -# Clean combined output and export -output["sale_price", "sum"] = output["sale_price", "sum"].replace(0, np.NaN) -output["price_per_sf", "sum"] = output["price_per_sf", "sum"].replace( - 0, np.NaN -) - -for i in ["median", "mean", "sum"]: - output["sale_price", "delta" + i] = output["sale_price", i].diff() - output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff() - -output.columns = ["_".join(col) for col in output.columns] -output.reset_index() - -output.to_csv("sot_sales.csv") diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py deleted file mode 100644 index 9ef77b278..000000000 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ /dev/null @@ -1,140 +0,0 @@ -# This script generates aggregated summary stats on taxes and exemptions data -# across a number of geographies, class combinations, and time. 
-import os.path - -# Import libraries -import awswrangler as wr -import pandas as pd - -# Ingest data if it is not already available -if os.path.isfile("sot_taxes_exemptions.parquet.gzip"): - df = pd.read_parquet("sot_taxes_exemptions.parquet.gzip") - -else: - sql = open("reporting.sot_taxes_exemptions.sql").read() - df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False) - df.to_parquet("sot_taxes_exemptions.parquet.gzip", compression="gzip") - -# Declare geographic groups and their associated data years -geos = { - "census_data_year": [ - "census_place", - "census_tract", - "census_congressional_district", - "census_zcta", - ], - "cook_board_of_review_district_data_year": [ - "cook_board_of_review_district" - ], - "cook_commissioner_district_data_year": ["cook_commissioner_district"], - "cook_judicial_district_data_year": ["cook_judicial_district"], - "ward_data_year": ["ward_num"], - "community_area_data_year": ["community_area"], - "police_district_data_year": ["police_district"], - "central_business_district_data_year": ["central_business_district"], - "school_data_year": [ - "school_elementary_district", - "school_secondary_district", - "school_unified_district", - ], - "tax_data_year": [ - "tax_municipality", - "tax_park_district", - "tax_library_district", - "tax_fire_protection_district", - "tax_community_college_district", - "tax_sanitation_district", - "tax_special_service_area", - "tax_tif_district", - ], -} -# Declare class groupings -groups = ["no_group", "class", "major_class", "modeling_group"] - - -# Define aggregation functions -def q10(x): - return x.quantile(0.1) - - -def q25(x): - return x.quantile(0.25) - - -def q75(x): - return x.quantile(0.75) - - -def q90(x): - return x.quantile(0.9) - - -def first(x): - return x.iloc[0] - - -more_stats = [ - "min", - q10, - q25, - "median", - q75, - q90, - "max", - "mean", - "sum", -] - -less_stats = ["count", "sum"] - -agg_func_math = { - "eq_factor_final": ["size", first], - 
"eq_factor_tentative": [first], - "tax_bill_total": more_stats, - "tax_code_rate": more_stats, - "av_clerk": more_stats, - "exe_homeowner": less_stats, - "exe_senior": less_stats, - "exe_freeze": less_stats, - "exe_longtime_homeowner": less_stats, - "exe_disabled": less_stats, - "exe_vet_returning": less_stats, - "exe_vet_dis_lt50": less_stats, - "exe_vet_dis_50_69": less_stats, - "exe_vet_dis_ge70": less_stats, - "exe_abate": less_stats, -} - -# Create an empty dataframe to fill with output -output = pd.DataFrame() -# Loop through group combinations and stack output -for key, value in geos.items(): - df["data_year"] = df[key] - - for x in value: - for z in groups: - group = [x, z, "year"] - summary = df.groupby(group).agg(agg_func_math).round(2) - summary["geography_type"] = x - summary["group_type"] = z - summary.index.names = ["geography_id", "group_id", "year"] - summary = summary.reset_index().set_index( - [ - "geography_type", - "geography_id", - "group_type", - "group_id", - "year", - ] - ) - - output = pd.concat([output, summary]) - -# Clean combined output and export -for i in ["median", "mean", "sum"]: - output["tax_bill_total", "delta" + i] = output["tax_bill_total", i].diff() - -output.columns = ["_".join(col) for col in output.columns] -output.reset_index() - -output.to_csv("sot_taxes_exemptions.csv") diff --git a/dbt/models/reporting/schema.yml b/dbt/models/reporting/schema.yml index eac6c31a7..9b5aefacb 100644 --- a/dbt/models/reporting/schema.yml +++ b/dbt/models/reporting/schema.yml @@ -34,6 +34,12 @@ models: within_20_pct >= within_10_pct AND within_10_pct >= within_05_pct + - name: reporting.sot_assessment_roll + description: '{{ doc("table_sot_assessment_roll") }}' + config: + tags: + - daily + - name: reporting.sot_assessment_roll_input description: '{{ doc("table_sot_assessment_roll_input") }}' config: From fdff4570ff7ec94e5fb08d372f6ec411a0abab02 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 20 Jun 2024 18:48:55 +0000 Subject: 
[PATCH 13/96] Testing build on smaller input --- dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index e29d3c037..a5c23e2ff 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -110,3 +110,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals AND uni.stage_name = vals.stage_name LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code +WHERE uni.year IN ('2022', '2023') From 6abd07402b02916267fea92dc873ab35f874cba6 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 20 Jun 2024 18:59:19 +0000 Subject: [PATCH 14/96] Trying to build on limited sample --- dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index a5c23e2ff..8f3f7c3f2 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -110,4 +110,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals AND uni.stage_name = vals.stage_name LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code -WHERE uni.year IN ('2022', '2023') +WHERE uni.stage_name = 'MAILED' AND uni.class = '278' From fd342b6ce1fb4ab5f011c56344f678feb7a1b752 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 20 Jun 2024 19:29:05 +0000 Subject: [PATCH 15/96] Try to build sales table --- dbt/models/reporting/reporting.sot_sales.py | 149 ++++++++++++++++++ .../reporting/reporting.sot_sales_input.sql | 20 ++- 2 files changed, 161 insertions(+), 8 deletions(-) create mode 100644 dbt/models/reporting/reporting.sot_sales.py diff 
--git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py new file mode 100644 index 000000000..9709e77b3 --- /dev/null +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -0,0 +1,149 @@ +# This script generates aggregated summary stats on sales data across a number +# of geographies, class combinations, and time. + +import statistics as stats + +# Import libraries +import numpy as np +import pandas as pd + +# Declare geographic groups and their associated data years +geos = { + "census_data_year": [ + "census_place", + "census_tract", + "census_congressional_district", + "census_zcta", + ], + "cook_board_of_review_district_data_year": [ + "cook_board_of_review_district" + ], + "cook_commissioner_district_data_year": ["cook_commissioner_district"], + "cook_judicial_district_data_year": ["cook_judicial_district"], + "ward_data_year": ["ward_num"], + "community_area_data_year": ["community_area"], + "police_district_data_year": ["police_district"], + "central_business_district_data_year": ["central_business_district"], + "school_data_year": [ + "school_elementary_district", + "school_secondary_district", + "school_unified_district", + ], + "tax_data_year": [ + "tax_municipality", + "tax_park_district", + "tax_library_district", + "tax_fire_protection_district", + "tax_community_college_district", + "tax_sanitation_district", + "tax_special_service_area", + "tax_tif_district", + ], +} +# Declare class groupings +groups = ["no_group", "class", "major_class", "modeling_group"] + + +# Define aggregation functions +def q10(x): + return x.quantile(0.1) + + +def q25(x): + return x.quantile(0.25) + + +def q75(x): + return x.quantile(0.75) + + +def q90(x): + return x.quantile(0.9) + + +def first(x): + return x.iloc[0] + + +more_stats = [ + "min", + q10, + q25, + "median", + q75, + q90, + "max", + "mean", + "sum", +] + +agg_func_math = { + "sale_price": ["size", "count"] + more_stats, + "price_per_sf": more_stats, + "char_bldg_sf": 
["median"], + "char_land_sf": ["median"], + "char_yrblt": ["median"], + "class": [stats.multimode], + "data_year": [first], +} + + +def assemble(df, geos, groups): + # Create an empty dataframe to fill with output + output = pd.DataFrame() + + # Loop through group combinations and stack output + for key, value in geos.items(): + df["data_year"] = df[key] + + for x in value: + for z in groups: + group = [x, z, "year"] + summary = df.groupby(group).agg(agg_func_math).round(2) + summary["geography_type"] = x + summary["group_type"] = z + summary.index.names = ["geography_id", "group_id", "year"] + summary = summary.reset_index().set_index( + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "year", + ] + ) + + output = pd.concat([output, summary]) + + # Clean combined output and export + output["sale_price", "sum"] = output["sale_price", "sum"].replace( + 0, np.NaN + ) + output["price_per_sf", "sum"] = output["price_per_sf", "sum"].replace( + 0, np.NaN + ) + + for i in ["median", "mean", "sum"]: + output["sale_price", "delta" + i] = output["sale_price", i].diff() + output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff() + + output.columns = ["_".join(col) for col in output.columns] + output.reset_index() + + return output + + +def model(dbt, spark_session): + dbt.config(materialized="table") + + input = dbt.ref("reporting.sot_sales_input") + + # Convert the Spark input dataframe to Pandas for + # compatibility with assesspy functions + input = input.toPandas() + + df = assemble(input, geos=geos, groups=groups) + + spark_df = spark_session.createDataFrame(df) + + return spark_df diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sales_input.sql index 98dcbec23..e6ca7afd4 100644 --- a/dbt/models/reporting/reporting.sot_sales_input.sql +++ b/dbt/models/reporting/reporting.sot_sales_input.sql @@ -53,16 +53,19 @@ SELECT uni.school_elementary_district_geoid AS school_elementary_district, 
uni.school_secondary_district_geoid AS school_secondary_district, uni.school_unified_district_geoid AS school_unified_district, - uni.tax_municipality_name AS tax_municipality, - uni.tax_park_district_name AS tax_park_district, - uni.tax_library_district_name AS tax_library_district, - uni.tax_fire_protection_district_name AS tax_fire_protection_district, - uni.tax_community_college_district_name + ARRAY_JOIN(uni.tax_municipality_name, ', ') AS tax_municipality, + ARRAY_JOIN(uni.tax_park_district_name, ', ') AS tax_park_district, + ARRAY_JOIN(uni.tax_library_district_name, ', ') AS tax_library_district, + ARRAY_JOIN(uni.tax_fire_protection_district_name, ', ') + AS tax_fire_protection_district, + ARRAY_JOIN(uni.tax_community_college_district_name, ', ') AS tax_community_college_district, - uni.tax_sanitation_district_name AS tax_sanitation_district, - uni.tax_special_service_area_name AS tax_special_service_area, - uni.tax_tif_district_name AS tax_tif_district, + ARRAY_JOIN(uni.tax_sanitation_district_name, ', ') + AS tax_sanitation_district, + ARRAY_JOIN(uni.tax_special_service_area_name, ', ') + AS tax_special_service_area, + ARRAY_JOIN(uni.tax_tif_district_name, ', ') AS tax_tif_district, uni.econ_central_business_district_num AS central_business_district, uni.census_data_year, uni.cook_board_of_review_district_data_year, @@ -97,3 +100,4 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales AND NOT sales.sale_filter_deed_type AND NOT sales.sale_filter_less_than_10k AND NOT sales.sale_filter_same_sale_within_365 +WHERE uni.year = '2023' From cccf8e1ed8dc7a726cc8d850906a1a5dbdd08980 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 20 Jun 2024 19:40:53 +0000 Subject: [PATCH 16/96] Try to build taxes and exemptions table --- .../reporting.sot_ratio_stats_input.sql | 19 ++- .../reporting.sot_taxes_exemptions.py | 149 ++++++++++++++++++ .../reporting.sot_taxes_exemptions_input.sql | 20 ++- 3 files changed, 172 insertions(+), 16 deletions(-) create mode 
100644 dbt/models/reporting/reporting.sot_taxes_exemptions.py diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql index f723c10ef..994f1d192 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql @@ -72,16 +72,19 @@ SELECT uni.school_elementary_district_geoid AS school_elementary_district, uni.school_secondary_district_geoid AS school_secondary_district, uni.school_unified_district_geoid AS school_unified_district, - uni.tax_municipality_name AS tax_municipality, - uni.tax_park_district_name AS tax_park_district, - uni.tax_library_district_name AS tax_library_district, - uni.tax_fire_protection_district_name AS tax_fire_protection_district, - uni.tax_community_college_district_name + ARRAY_JOIN(uni.tax_municipality_name, ', ') AS tax_municipality, + ARRAY_JOIN(uni.tax_park_district_name, ', ') AS tax_park_district, + ARRAY_JOIN(uni.tax_library_district_name, ', ') AS tax_library_district, + ARRAY_JOIN(uni.tax_fire_protection_district_name, ', ') + AS tax_fire_protection_district, + ARRAY_JOIN(uni.tax_community_college_district_name, ', ') AS tax_community_college_district, - uni.tax_sanitation_district_name AS tax_sanitation_district, - uni.tax_special_service_area_name AS tax_special_service_area, - uni.tax_tif_district_name AS tax_tif_district, + ARRAY_JOIN(uni.tax_sanitation_district_name, ', ') + AS tax_sanitation_district, + ARRAY_JOIN(uni.tax_special_service_area_name, ', ') + AS tax_special_service_area, + ARRAY_JOIN(uni.tax_tif_district_name, ', ') AS tax_tif_district, uni.econ_central_business_district_num AS central_business_district, uni.census_data_year, uni.cook_board_of_review_district_data_year, diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py new file mode 100644 index 000000000..4877b2d51 --- /dev/null +++ 
b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -0,0 +1,149 @@ +# This script generates aggregated summary stats on taxes and exemptions data +# across a number of geographies, class combinations, and time. + +# Import libraries +import pandas as pd + +# Declare geographic groups and their associated data years +geos = { + "census_data_year": [ + "census_place", + "census_tract", + "census_congressional_district", + "census_zcta", + ], + "cook_board_of_review_district_data_year": [ + "cook_board_of_review_district" + ], + "cook_commissioner_district_data_year": ["cook_commissioner_district"], + "cook_judicial_district_data_year": ["cook_judicial_district"], + "ward_data_year": ["ward_num"], + "community_area_data_year": ["community_area"], + "police_district_data_year": ["police_district"], + "central_business_district_data_year": ["central_business_district"], + "school_data_year": [ + "school_elementary_district", + "school_secondary_district", + "school_unified_district", + ], + "tax_data_year": [ + "tax_municipality", + "tax_park_district", + "tax_library_district", + "tax_fire_protection_district", + "tax_community_college_district", + "tax_sanitation_district", + "tax_special_service_area", + "tax_tif_district", + ], +} +# Declare class groupings +groups = ["no_group", "class", "major_class", "modeling_group"] + + +# Define aggregation functions +def q10(x): + return x.quantile(0.1) + + +def q25(x): + return x.quantile(0.25) + + +def q75(x): + return x.quantile(0.75) + + +def q90(x): + return x.quantile(0.9) + + +def first(x): + return x.iloc[0] + + +more_stats = [ + "min", + q10, + q25, + "median", + q75, + q90, + "max", + "mean", + "sum", +] + +less_stats = ["count", "sum"] + +agg_func_math = { + "eq_factor_final": ["size", first], + "eq_factor_tentative": [first], + "tax_bill_total": more_stats, + "tax_code_rate": more_stats, + "av_clerk": more_stats, + "exe_homeowner": less_stats, + "exe_senior": less_stats, + "exe_freeze": less_stats, + 
"exe_longtime_homeowner": less_stats, + "exe_disabled": less_stats, + "exe_vet_returning": less_stats, + "exe_vet_dis_lt50": less_stats, + "exe_vet_dis_50_69": less_stats, + "exe_vet_dis_ge70": less_stats, + "exe_abate": less_stats, +} + + +def assemble(df, geos, groups): + # Create an empty dataframe to fill with output + output = pd.DataFrame() + # Loop through group combinations and stack output + for key, value in geos.items(): + df["data_year"] = df[key] + + for x in value: + for z in groups: + group = [x, z, "year"] + summary = df.groupby(group).agg(agg_func_math).round(2) + summary["geography_type"] = x + summary["group_type"] = z + summary.index.names = ["geography_id", "group_id", "year"] + summary = summary.reset_index().set_index( + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "year", + ] + ) + + output = pd.concat([output, summary]) + + # Clean combined output and export + for i in ["median", "mean", "sum"]: + output["tax_bill_total", "delta" + i] = output[ + "tax_bill_total", i + ].diff() + + output.columns = ["_".join(col) for col in output.columns] + output.reset_index() + + return output + + +def model(dbt, spark_session): + dbt.config(materialized="table") + + input = dbt.ref("reporting.sot_taxes_exemptions_input") + + # Convert the Spark input dataframe to Pandas for + # compatibility with assesspy functions + input = input.toPandas() + + df = assemble(input, geos=geos, groups=groups) + + spark_df = spark_session.createDataFrame(df) + + return spark_df diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql index 80bee99bc..0bf3872b7 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql @@ -58,16 +58,19 @@ SELECT uni.school_elementary_district_geoid AS school_elementary_district, uni.school_secondary_district_geoid AS school_secondary_district, 
uni.school_unified_district_geoid AS school_unified_district, - uni.tax_municipality_name AS tax_municipality, - uni.tax_park_district_name AS tax_park_district, - uni.tax_library_district_name AS tax_library_district, - uni.tax_fire_protection_district_name AS tax_fire_protection_district, - uni.tax_community_college_district_name + ARRAY_JOIN(uni.tax_municipality_name, ', ') AS tax_municipality, + ARRAY_JOIN(uni.tax_park_district_name, ', ') AS tax_park_district, + ARRAY_JOIN(uni.tax_library_district_name, ', ') AS tax_library_district, + ARRAY_JOIN(uni.tax_fire_protection_district_name, ', ') + AS tax_fire_protection_district, + ARRAY_JOIN(uni.tax_community_college_district_name, ', ') AS tax_community_college_district, - uni.tax_sanitation_district_name AS tax_sanitation_district, - uni.tax_special_service_area_name AS tax_special_service_area, - uni.tax_tif_district_name AS tax_tif_district, + ARRAY_JOIN(uni.tax_sanitation_district_name, ', ') + AS tax_sanitation_district, + ARRAY_JOIN(uni.tax_special_service_area_name, ', ') + AS tax_special_service_area, + ARRAY_JOIN(uni.tax_tif_district_name, ', ') AS tax_tif_district, uni.econ_central_business_district_num AS central_business_district, uni.census_data_year, uni.cook_board_of_review_district_data_year, @@ -97,3 +100,4 @@ INNER JOIN {{ source('tax', 'tax_code') }} AS tcd AND tax.year = tcd.year INNER JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code +WHERE uni.class = '278' From 365696459ce27a5f3023c13dfdac46ef4c8adba4 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 20 Jun 2024 19:53:27 +0000 Subject: [PATCH 17/96] Try to build taxes and exemptions table --- temp/reporting.sot_ratio_stats.py | 168 ++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 temp/reporting.sot_ratio_stats.py diff --git a/temp/reporting.sot_ratio_stats.py b/temp/reporting.sot_ratio_stats.py new file mode 100644 index 000000000..a37cb2261 --- /dev/null +++ 
b/temp/reporting.sot_ratio_stats.py @@ -0,0 +1,168 @@ +# This script generates aggregated summary stats on sales data across a number +# of geographies, class combinations, and time. +import os.path + +# Import libraries +import assesspy as ass +import awswrangler as wr +import pandas as pd + +# Ingest data if it is not already available +if os.path.isfile("sot_ratio_stats.parquet.gzip"): + df = pd.read_parquet("sot_ratio_stats.parquet.gzip") + +else: + sql = open("reporting.sot_ratio_stats.sql").read() + df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False) + df.to_parquet("sot_ratio_stats.parquet.gzip", compression="gzip") + +# Declare geographic groups and their associated data years +geos = { + "census_data_year": [ + "census_place", + "census_tract", + "census_congressional_district", + "census_zcta", + ], + "cook_board_of_review_district_data_year": [ + "cook_board_of_review_district" + ], + "cook_commissioner_district_data_year": ["cook_commissioner_district"], + "cook_judicial_district_data_year": ["cook_judicial_district"], + "ward_data_year": ["ward_num"], + "community_area_data_year": ["community_area"], + "police_district_data_year": ["police_district"], + "central_business_district_data_year": ["central_business_district"], + "school_data_year": [ + "school_elementary_district", + "school_secondary_district", + "school_unified_district", + ], + "tax_data_year": [ + "tax_municipality", + "tax_park_district", + "tax_library_district", + "tax_fire_protection_district", + "tax_community_college_district", + "tax_sanitation_district", + "tax_special_service_area", + "tax_tif_district", + ], +} +# Declare class groupings +groups = ["no_group", "class", "major_class", "modeling_group"] + + +# Define aggregation functions +def aggregrate(data, geography_type, group_type): + print(geography_type, group_type) + + group = [geography_type, group_type, "year", "stage_name"] + data["size"] = data.groupby(group)["tot_mv"].transform("size") + 
data["sale_count"] = data.groupby(group)["sale_price"].transform("count") + data["mv_count"] = data.groupby(group)["tot_mv"].transform("count") + + # Remove parcels with FMVs of 0 since they screw up ratios + data = data[data["tot_mv"] > 0].reset_index() + data["ratio_count"] = data.groupby(group)["ratio"].transform("count") + + # Remove groups that only have one sale since we can't calculate stats + data = data[data["ratio_count"] > 1] + + summary = ( + data.dropna(subset=["ratio"]) + .groupby(group) + .apply( + lambda x: pd.Series( + { + "size": x["size"].iloc[0], + "mv_count": x["mv_count"].iloc[0], + "sale_count": x["sale_count"].iloc[0], + "mv_min": x["tot_mv"].min(), + "mv_q10": x["tot_mv"].quantile(0.1), + "mv_q25": x["tot_mv"].quantile(0.25), + "mv_median": x["tot_mv"].median(), + "mv_q75": x["tot_mv"].quantile(0.75), + "mv_q90": x["tot_mv"].quantile(0.90), + "mv_max": x["tot_mv"].max(), + "mv_mean": x["tot_mv"].mean(), + "mv_sum": x["tot_mv"].sum(), + "ratio_min": x["ratio"].min(), + "ratio_q10": x["ratio"].quantile(0.1), + "ratio_q25": x["ratio"].quantile(0.25), + "ratio_median": x["ratio"].median(), + "ratio_q75": x["ratio"].quantile(0.75), + "ratio_q90": x["ratio"].quantile(0.90), + "ratio_max": x["ratio"].max(), + "ratio_mean": x["ratio"].mean(), + "cod": ass.cod(ratio=x["ratio"]), + "prd": ass.prd(x["tot_mv"], x["sale_price"]), + "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"], + # "mki": ass.mki(x["tot_mv"], x["sale_price"]), + } + ), + include_groups=False, + ) + ) + summary["geography_type"] = geography_type + summary["group_type"] = group_type + + return summary + + +# Create an empty dataframe to fill with output +output = pd.DataFrame() + +# Loop through group combinations and stack output +for key, value in geos.items(): + df["data_year"] = df[key] + + for x in value: + for z in groups: + output = pd.concat([output, aggregrate(df, x, z)]) + +output.index.names = ["geography_id", "group_id", "year", "stage_name"] + +output = 
output.reset_index().set_index( + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "year", + "stage_name", + ] +) + +# Clean combined output and export +output["mv_delta_pct_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .mv_median.diff() +) +output["mv_delta_pct_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .mv_mean.diff() +) +output["mv_delta_pct_sum"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .mv_sum.diff() +) + +output["mv_delta_pct_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .mv_median.pct_change() +) +output["mv_delta_pct_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .mv_mean.pct_change() +) + +output.dropna(how="all", axis=1, inplace=True) +output.to_csv("sot_ratio_stats.csv") + +# %% From 8b0f95f41792d46e0a40b005fb8f871167ec03ed Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 20 Jun 2024 21:08:33 +0000 Subject: [PATCH 18/96] Try to build taxes table --- dbt/models/reporting/docs.md | 16 ++ .../reporting.sot_ratio_stats_input.sql | 1 + .../reporting.sot_taxes_exemptions.py | 4 + .../reporting.sot_taxes_exemptions_input.sql | 13 +- dbt/models/reporting/schema.yml | 12 ++ temp/reporting.sot_ratio_stats.py | 168 ------------------ 6 files changed, 44 insertions(+), 170 deletions(-) delete mode 100644 temp/reporting.sot_ratio_stats.py diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md index dd0f7bf68..3854dbd79 100644 --- a/dbt/models/reporting/docs.md +++ b/dbt/models/reporting/docs.md @@ -57,6 +57,14 @@ Table to feed the Python dbt job that creates the **Primary Key**: `year`, `stage_name`, `geography_id`, `group_id` {% enddocs %} +# sot_sales + +{% docs table_sot_sales %} +Feeds public reporting assets. 
+ +**Primary Key**: `year`, `geography_id`, `group_id` +{% enddocs %} + # sot_sales_input {% docs table_sot_sales_input %} @@ -66,6 +74,14 @@ Table to feed the Python dbt job that creates the **Primary Key**: `year`, `geography_id`, `group_id` {% enddocs %} +# sot_taxes_exemptions + +{% docs table_sot_taxes_exemptions %} +Feeds public reporting assets. + +**Primary Key**: `year`, `geography_id`, `group_id` +{% enddocs %} + # sot_taxes_exemptions_input {% docs table_sot_taxes_exemptions_input %} diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql index 994f1d192..6cd258c2a 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql @@ -119,3 +119,4 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales AND NOT sales.sale_filter_less_than_10k AND NOT sales.sale_filter_same_sale_within_365 WHERE uni.year >= '2020' + AND uni.year = '2023' AND uni.class = '278' diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index 4877b2d51..cb257a49e 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -1,3 +1,6 @@ +# pylint: skip-file +# type: ignore + # This script generates aggregated summary stats on taxes and exemptions data # across a number of geographies, class combinations, and time. 
@@ -98,6 +101,7 @@ def first(x): def assemble(df, geos, groups): # Create an empty dataframe to fill with output output = pd.DataFrame() + # Loop through group combinations and stack output for key, value in geos.items(): df["data_year"] = df[key] diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql index 0bf3872b7..d0213ff5a 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql @@ -4,9 +4,18 @@ ) }} +WITH tcd AS ( + SELECT DISTINCT + tax_code_num, + tax_code_rate, + year + FROM {{ source('tax', 'tax_code') }} +) + -- Gather parcel-level geographies and join taxes, exemptions, and class -- groupings SELECT + uni.pin, tax.year, tax.av_clerk, tax.tax_bill_total, @@ -95,9 +104,9 @@ INNER JOIN {{ source('tax', 'pin') }} AS tax AND uni.year = tax.year INNER JOIN {{ source('tax', 'eq_factor') }} AS eqf ON uni.year = eqf.year -INNER JOIN {{ source('tax', 'tax_code') }} AS tcd +INNER JOIN tcd ON tax.tax_code_num = tcd.tax_code_num AND tax.year = tcd.year INNER JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code -WHERE uni.class = '278' +WHERE uni.class = '206' diff --git a/dbt/models/reporting/schema.yml b/dbt/models/reporting/schema.yml index 9b5aefacb..5072ae8cc 100644 --- a/dbt/models/reporting/schema.yml +++ b/dbt/models/reporting/schema.yml @@ -52,12 +52,24 @@ models: tags: - daily + - name: reporting.sot_sales + description: '{{ doc("table_sot_sales") }}' + config: + tags: + - daily + - name: reporting.sot_sales_input description: '{{ doc("table_sot_sales_input") }}' config: tags: - daily + - name: reporting.sot_taxes_exemptions + description: '{{ doc("table_sot_taxes_exemptions") }}' + config: + tags: + - daily + - name: reporting.sot_taxes_exemptions_input description: '{{ doc("table_sot_taxes_exemptions_input") }}' config: diff --git a/temp/reporting.sot_ratio_stats.py 
b/temp/reporting.sot_ratio_stats.py deleted file mode 100644 index a37cb2261..000000000 --- a/temp/reporting.sot_ratio_stats.py +++ /dev/null @@ -1,168 +0,0 @@ -# This script generates aggregated summary stats on sales data across a number -# of geographies, class combinations, and time. -import os.path - -# Import libraries -import assesspy as ass -import awswrangler as wr -import pandas as pd - -# Ingest data if it is not already available -if os.path.isfile("sot_ratio_stats.parquet.gzip"): - df = pd.read_parquet("sot_ratio_stats.parquet.gzip") - -else: - sql = open("reporting.sot_ratio_stats.sql").read() - df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False) - df.to_parquet("sot_ratio_stats.parquet.gzip", compression="gzip") - -# Declare geographic groups and their associated data years -geos = { - "census_data_year": [ - "census_place", - "census_tract", - "census_congressional_district", - "census_zcta", - ], - "cook_board_of_review_district_data_year": [ - "cook_board_of_review_district" - ], - "cook_commissioner_district_data_year": ["cook_commissioner_district"], - "cook_judicial_district_data_year": ["cook_judicial_district"], - "ward_data_year": ["ward_num"], - "community_area_data_year": ["community_area"], - "police_district_data_year": ["police_district"], - "central_business_district_data_year": ["central_business_district"], - "school_data_year": [ - "school_elementary_district", - "school_secondary_district", - "school_unified_district", - ], - "tax_data_year": [ - "tax_municipality", - "tax_park_district", - "tax_library_district", - "tax_fire_protection_district", - "tax_community_college_district", - "tax_sanitation_district", - "tax_special_service_area", - "tax_tif_district", - ], -} -# Declare class groupings -groups = ["no_group", "class", "major_class", "modeling_group"] - - -# Define aggregation functions -def aggregrate(data, geography_type, group_type): - print(geography_type, group_type) - - group = 
[geography_type, group_type, "year", "stage_name"] - data["size"] = data.groupby(group)["tot_mv"].transform("size") - data["sale_count"] = data.groupby(group)["sale_price"].transform("count") - data["mv_count"] = data.groupby(group)["tot_mv"].transform("count") - - # Remove parcels with FMVs of 0 since they screw up ratios - data = data[data["tot_mv"] > 0].reset_index() - data["ratio_count"] = data.groupby(group)["ratio"].transform("count") - - # Remove groups that only have one sale since we can't calculate stats - data = data[data["ratio_count"] > 1] - - summary = ( - data.dropna(subset=["ratio"]) - .groupby(group) - .apply( - lambda x: pd.Series( - { - "size": x["size"].iloc[0], - "mv_count": x["mv_count"].iloc[0], - "sale_count": x["sale_count"].iloc[0], - "mv_min": x["tot_mv"].min(), - "mv_q10": x["tot_mv"].quantile(0.1), - "mv_q25": x["tot_mv"].quantile(0.25), - "mv_median": x["tot_mv"].median(), - "mv_q75": x["tot_mv"].quantile(0.75), - "mv_q90": x["tot_mv"].quantile(0.90), - "mv_max": x["tot_mv"].max(), - "mv_mean": x["tot_mv"].mean(), - "mv_sum": x["tot_mv"].sum(), - "ratio_min": x["ratio"].min(), - "ratio_q10": x["ratio"].quantile(0.1), - "ratio_q25": x["ratio"].quantile(0.25), - "ratio_median": x["ratio"].median(), - "ratio_q75": x["ratio"].quantile(0.75), - "ratio_q90": x["ratio"].quantile(0.90), - "ratio_max": x["ratio"].max(), - "ratio_mean": x["ratio"].mean(), - "cod": ass.cod(ratio=x["ratio"]), - "prd": ass.prd(x["tot_mv"], x["sale_price"]), - "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"], - # "mki": ass.mki(x["tot_mv"], x["sale_price"]), - } - ), - include_groups=False, - ) - ) - summary["geography_type"] = geography_type - summary["group_type"] = group_type - - return summary - - -# Create an empty dataframe to fill with output -output = pd.DataFrame() - -# Loop through group combinations and stack output -for key, value in geos.items(): - df["data_year"] = df[key] - - for x in value: - for z in groups: - output = pd.concat([output, 
aggregrate(df, x, z)]) - -output.index.names = ["geography_id", "group_id", "year", "stage_name"] - -output = output.reset_index().set_index( - [ - "geography_type", - "geography_id", - "group_type", - "group_id", - "year", - "stage_name", - ] -) - -# Clean combined output and export -output["mv_delta_pct_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_median.diff() -) -output["mv_delta_pct_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_mean.diff() -) -output["mv_delta_pct_sum"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_sum.diff() -) - -output["mv_delta_pct_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_median.pct_change() -) -output["mv_delta_pct_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_mean.pct_change() -) - -output.dropna(how="all", axis=1, inplace=True) -output.to_csv("sot_ratio_stats.csv") - -# %% From 9383bdc7794e1091781a8c49c1da862a9efd2277 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 20 Jun 2024 21:42:53 +0000 Subject: [PATCH 19/96] Try to build ratio stats table --- dbt/models/reporting/docs.md | 8 + .../reporting/reporting.sot_ratio_stats.py | 176 ++++++++++++++++++ dbt/models/reporting/schema.yml | 6 + 3 files changed, 190 insertions(+) create mode 100644 dbt/models/reporting/reporting.sot_ratio_stats.py diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md index 3854dbd79..80261b340 100644 --- a/dbt/models/reporting/docs.md +++ b/dbt/models/reporting/docs.md @@ -48,6 +48,14 @@ Table to feed the Python dbt job that creates the **Primary Key**: `year`, `stage_name`, `geography_id`, `group_id` {% enddocs %} +# sot_ratio_stats + +{% docs table_sot_ratio_stats %} +Feeds public reporting assets. 
+ +**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id` +{% enddocs %} + # sot_ratio_stats_input {% docs table_sot_ratio_stats_input %} diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py new file mode 100644 index 000000000..2328dc48f --- /dev/null +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -0,0 +1,176 @@ +# pylint: skip-file +# type: ignore + +# This script generates aggregated summary stats on sales data across a number +# of geographies, class combinations, and time. + +# Import libraries +import assesspy as ass +import pandas as pd + +# Declare geographic groups and their associated data years +geos = { + "census_data_year": [ + "census_place", + "census_tract", + "census_congressional_district", + "census_zcta", + ], + "cook_board_of_review_district_data_year": [ + "cook_board_of_review_district" + ], + "cook_commissioner_district_data_year": ["cook_commissioner_district"], + "cook_judicial_district_data_year": ["cook_judicial_district"], + "ward_data_year": ["ward_num"], + "community_area_data_year": ["community_area"], + "police_district_data_year": ["police_district"], + "central_business_district_data_year": ["central_business_district"], + "school_data_year": [ + "school_elementary_district", + "school_secondary_district", + "school_unified_district", + ], + "tax_data_year": [ + "tax_municipality", + "tax_park_district", + "tax_library_district", + "tax_fire_protection_district", + "tax_community_college_district", + "tax_sanitation_district", + "tax_special_service_area", + "tax_tif_district", + ], +} +# Declare class groupings +groups = ["no_group", "class", "major_class", "modeling_group"] + + +# Define aggregation functions +def aggregrate(data, geography_type, group_type): + print(geography_type, group_type) + + group = [geography_type, group_type, "year", "stage_name"] + data["size"] = data.groupby(group)["tot_mv"].transform("size") + data["sale_count"] = 
data.groupby(group)["sale_price"].transform("count") + data["mv_count"] = data.groupby(group)["tot_mv"].transform("count") + + # Remove parcels with FMVs of 0 since they screw up ratios + data = data[data["tot_mv"] > 0].reset_index() + data["ratio_count"] = data.groupby(group)["ratio"].transform("count") + + # Remove groups that only have one sale since we can't calculate stats + data = data[data["ratio_count"] >= 30] + + summary = ( + data.dropna(subset=["ratio"]) + .groupby(group) + .apply( + lambda x: pd.Series( + { + "size": x["size"].iloc[0], + "mv_count": x["mv_count"].iloc[0], + "sale_count": x["sale_count"].iloc[0], + "mv_min": x["tot_mv"].min(), + "mv_q10": x["tot_mv"].quantile(0.1), + "mv_q25": x["tot_mv"].quantile(0.25), + "mv_median": x["tot_mv"].median(), + "mv_q75": x["tot_mv"].quantile(0.75), + "mv_q90": x["tot_mv"].quantile(0.90), + "mv_max": x["tot_mv"].max(), + "mv_mean": x["tot_mv"].mean(), + "mv_sum": x["tot_mv"].sum(), + "ratio_min": x["ratio"].min(), + "ratio_q10": x["ratio"].quantile(0.1), + "ratio_q25": x["ratio"].quantile(0.25), + "ratio_median": x["ratio"].median(), + "ratio_q75": x["ratio"].quantile(0.75), + "ratio_q90": x["ratio"].quantile(0.90), + "ratio_max": x["ratio"].max(), + "ratio_mean": x["ratio"].mean(), + "cod": ass.cod(ratio=x["ratio"]), + "prd": ass.prd(x["tot_mv"], x["sale_price"]), + "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"], + "mki": ass.mki(x["tot_mv"], x["sale_price"]), + } + ), + include_groups=False, + ) + ) + summary["geography_type"] = geography_type + summary["group_type"] = group_type + + return summary + + +def assemble(df, geos, groups): + # Create an empty dataframe to fill with output + output = pd.DataFrame() + + # Loop through group combinations and stack output + for key, value in geos.items(): + df["data_year"] = df[key] + + for x in value: + for z in groups: + output = pd.concat([output, aggregrate(df, x, z)]) + + output.index.names = ["geography_id", "group_id", "year", "stage_name"] + + 
output = output.reset_index().set_index( + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "year", + "stage_name", + ] + ) + + # Clean combined output and export + output["mv_delta_pct_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .mv_median.diff() + ) + output["mv_delta_pct_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .mv_mean.diff() + ) + output["mv_delta_pct_sum"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .mv_sum.diff() + ) + + output["mv_delta_pct_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .mv_median.pct_change() + ) + output["mv_delta_pct_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .mv_mean.pct_change() + ) + + output.dropna(how="all", axis=1, inplace=True) + + return output + + +def model(dbt, spark_session): + dbt.config(materialized="table") + + input = dbt.ref("reporting.sot_ratio_stats_input") + + # Convert the Spark input dataframe to Pandas for + # compatibility with assesspy functions + input = input.toPandas() + + df = assemble(input, geos=geos, groups=groups) + + spark_df = spark_session.createDataFrame(df) + + return spark_df diff --git a/dbt/models/reporting/schema.yml b/dbt/models/reporting/schema.yml index 5072ae8cc..18c0e148d 100644 --- a/dbt/models/reporting/schema.yml +++ b/dbt/models/reporting/schema.yml @@ -46,6 +46,12 @@ models: tags: - daily + - name: reporting.sot_ratio_stats + description: '{{ doc("table_sot_ratio_stats") }}' + config: + tags: + - daily + - name: reporting.sot_ratio_stats_input description: '{{ doc("table_sot_ratio_stats_input") }}' config: From 08d3bd60537cc475e1d2eab9288f8ebbfcda3e6b Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 20 Jun 2024 21:52:31 +0000 Subject: [PATCH 20/96] Add assesspy to ratio_stats table --- 
dbt/models/reporting/reporting.sot_ratio_stats.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index 2328dc48f..474672db0 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -1,12 +1,15 @@ # pylint: skip-file # type: ignore +sc.addPyFile( # noqa: F821 + "s3://ccao-athena-dependencies-us-east-1/assesspy==1.1.0.zip" +) # This script generates aggregated summary stats on sales data across a number # of geographies, class combinations, and time. # Import libraries -import assesspy as ass -import pandas as pd +import assesspy as ass # noqa: E402 +import pandas as pd # noqa: E402 # Declare geographic groups and their associated data years geos = { From d2cac224d1ada98ecb551a23c5a084a188181c35 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Mon, 24 Jun 2024 19:56:46 +0000 Subject: [PATCH 21/96] ratio_stats builds in dbt, excluding assesspy funcs --- .../reporting/reporting.sot_ratio_stats.py | 142 +++++++++++------- .../reporting.sot_ratio_stats_input.sql | 2 +- 2 files changed, 92 insertions(+), 52 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index 474672db0..56b2e281e 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -8,7 +8,8 @@ # of geographies, class combinations, and time. 
# Import libraries -import assesspy as ass # noqa: E402 +import assesspy as ass # noqa: E402, F401 +import numpy as np # noqa: E402 import pandas as pd # noqa: E402 # Declare geographic groups and their associated data years @@ -58,45 +59,42 @@ def aggregrate(data, geography_type, group_type): data["mv_count"] = data.groupby(group)["tot_mv"].transform("count") # Remove parcels with FMVs of 0 since they screw up ratios - data = data[data["tot_mv"] > 0].reset_index() + data = data[data["tot_mv"] > 0] data["ratio_count"] = data.groupby(group)["ratio"].transform("count") # Remove groups that only have one sale since we can't calculate stats data = data[data["ratio_count"] >= 30] - summary = ( - data.dropna(subset=["ratio"]) - .groupby(group) - .apply( - lambda x: pd.Series( - { - "size": x["size"].iloc[0], - "mv_count": x["mv_count"].iloc[0], - "sale_count": x["sale_count"].iloc[0], - "mv_min": x["tot_mv"].min(), - "mv_q10": x["tot_mv"].quantile(0.1), - "mv_q25": x["tot_mv"].quantile(0.25), - "mv_median": x["tot_mv"].median(), - "mv_q75": x["tot_mv"].quantile(0.75), - "mv_q90": x["tot_mv"].quantile(0.90), - "mv_max": x["tot_mv"].max(), - "mv_mean": x["tot_mv"].mean(), - "mv_sum": x["tot_mv"].sum(), - "ratio_min": x["ratio"].min(), - "ratio_q10": x["ratio"].quantile(0.1), - "ratio_q25": x["ratio"].quantile(0.25), - "ratio_median": x["ratio"].median(), - "ratio_q75": x["ratio"].quantile(0.75), - "ratio_q90": x["ratio"].quantile(0.90), - "ratio_max": x["ratio"].max(), - "ratio_mean": x["ratio"].mean(), - "cod": ass.cod(ratio=x["ratio"]), - "prd": ass.prd(x["tot_mv"], x["sale_price"]), - "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"], - "mki": ass.mki(x["tot_mv"], x["sale_price"]), - } - ), - include_groups=False, + data = data.dropna(subset=["ratio"]) + + summary = data.groupby(group).apply( + lambda x: pd.Series( + { + "size": x["size"].min(), + "mv_count": x["mv_count"].min(), + "sale_count": x["sale_count"].min(), + "mv_min": x["tot_mv"].min(), + "mv_q10": 
x["tot_mv"].quantile(0.1), + "mv_q25": x["tot_mv"].quantile(0.25), + "mv_median": x["tot_mv"].median(), + "mv_q75": x["tot_mv"].quantile(0.75), + "mv_q90": x["tot_mv"].quantile(0.90), + "mv_max": x["tot_mv"].max(), + "mv_mean": x["tot_mv"].mean(), + "mv_sum": x["tot_mv"].sum(), + "ratio_min": x["ratio"].min(), + "ratio_q10": x["ratio"].quantile(0.1), + "ratio_q25": x["ratio"].quantile(0.25), + "ratio_median": x["ratio"].median(), + "ratio_q75": x["ratio"].quantile(0.75), + "ratio_q90": x["ratio"].quantile(0.90), + "ratio_max": x["ratio"].max(), + "ratio_mean": x["ratio"].mean(), + # "cod": ass.cod(ratio=x["ratio"]), + # "prd": ass.prd(x["tot_mv"], x["sale_price"]), + # "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"], + # "mki": ass.mki(x["tot_mv"], x["sale_price"]), + } ) ) summary["geography_type"] = geography_type @@ -117,9 +115,15 @@ def assemble(df, geos, groups): for z in groups: output = pd.concat([output, aggregrate(df, x, z)]) - output.index.names = ["geography_id", "group_id", "year", "stage_name"] + output.dropna(how="all", axis=1, inplace=True) + + return output + + +def clean(dirty): + dirty.index.names = ["geography_id", "group_id", "year", "stage_name"] - output = output.reset_index().set_index( + dirty = dirty.reset_index().set_index( [ "geography_type", "geography_id", @@ -130,37 +134,57 @@ def assemble(df, geos, groups): ] ) - # Clean combined output and export - output["mv_delta_pct_median"] = ( - output.sort_values("year") + # Clean combined dirty and export + dirty["mv_delta_pct_median"] = ( + dirty.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_median.diff() ) - output["mv_delta_pct_mean"] = ( - output.sort_values("year") + dirty["mv_delta_pct_mean"] = ( + dirty.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_mean.diff() ) - output["mv_delta_pct_sum"] = ( - output.sort_values("year") + dirty["mv_delta_pct_sum"] = ( + dirty.sort_values("year") .groupby(["geography_id", "group_id", 
"stage_name"]) .mv_sum.diff() ) - output["mv_delta_pct_median"] = ( - output.sort_values("year") + dirty["mv_delta_pct_median"] = ( + dirty.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_median.pct_change() ) - output["mv_delta_pct_mean"] = ( - output.sort_values("year") + dirty["mv_delta_pct_mean"] = ( + dirty.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_mean.pct_change() ) - output.dropna(how="all", axis=1, inplace=True) + dirty = dirty.reset_index() + + dirty = dirty.astype( + { + "group_id": "str", + "year": "str", + "stage_name": "str", + "size": np.int64, + "mv_count": np.int64, + "sale_count": np.int64, + "mv_min": np.int64, + "mv_q10": np.int64, + "mv_q25": np.int64, + "mv_median": np.int64, + "mv_q75": np.int64, + "mv_q90": np.int64, + "mv_max": np.int64, + "mv_mean": np.int64, + "mv_sum": np.int64, + } + ) - return output + return dirty def model(dbt, spark_session): @@ -174,6 +198,22 @@ def model(dbt, spark_session): df = assemble(input, geos=geos, groups=groups) - spark_df = spark_session.createDataFrame(df) + df = clean(df) + + schema = ( + "geography_type: string, geography_id: string, " + + "group_type: string, group_id: string, year: string, " + + "stage_name: string, size: bigint, mv_count: bigint, " + + "sale_count: bigint, mv_min: bigint, mv_q10: bigint, " + + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, " + + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, " + + "mv_sum: bigint, ratio_min: double, ratio_q10: double, " + + "ratio_q25: double, ratio_median: double, ratio_q75: double, " + + "ratio_q90: double, ratio_max: double, ratio_mean: double, " + + "mv_delta_pct_median: double, mv_delta_pct_mean: double, " + + "mv_delta_pct_sum: double" + ) + + spark_df = spark_session.createDataFrame(df, schema=schema) return spark_df diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql index 6cd258c2a..767d65990 
100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql @@ -119,4 +119,4 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales AND NOT sales.sale_filter_less_than_10k AND NOT sales.sale_filter_same_sale_within_365 WHERE uni.year >= '2020' - AND uni.year = '2023' AND uni.class = '278' + AND uni.year IN ('2022', '2023') AND uni.class = '278' From f55975314760c9bebc43bf18f2436a46c81f243d Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 26 Jun 2024 16:31:18 +0000 Subject: [PATCH 22/96] sot_ratio_stats table building in dbt --- .../reporting/reporting.sot_ratio_stats.py | 64 ++++++++++++++++--- .../reporting.sot_ratio_stats_input.sql | 2 +- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index 56b2e281e..e749a5563 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -8,7 +8,7 @@ # of geographies, class combinations, and time. 
# Import libraries -import assesspy as ass # noqa: E402, F401 +import assesspy as ass # noqa: E402 import numpy as np # noqa: E402 import pandas as pd # noqa: E402 @@ -49,6 +49,44 @@ groups = ["no_group", "class", "major_class", "modeling_group"] +def cod_safe(ratio): + if len(ratio) >= 1: + output = ass.cod(ratio) + else: + output = None + + return output + + +def prd_safe(assessed, sale_price): + if len(sale_price) >= 1: + output = ass.prd(assessed=assessed, sale_price=sale_price) + else: + output = None + + return output + + +def prb_safe(assessed, sale_price): + if len(sale_price) >= 1: + output = ass.prb(assessed=assessed, sale_price=sale_price, round=3)[ + "prb" + ] + else: + output = None + + return output + + +def mki_safe(assessed, sale_price): + if len(sale_price) >= 1: + output = ass.mki(assessed=assessed, sale_price=sale_price) + else: + output = None + + return output + + # Define aggregation functions def aggregrate(data, geography_type, group_type): print(geography_type, group_type) @@ -60,17 +98,15 @@ def aggregrate(data, geography_type, group_type): # Remove parcels with FMVs of 0 since they screw up ratios data = data[data["tot_mv"] > 0] - data["ratio_count"] = data.groupby(group)["ratio"].transform("count") # Remove groups that only have one sale since we can't calculate stats - data = data[data["ratio_count"] >= 30] - - data = data.dropna(subset=["ratio"]) + data = data.dropna(subset=["sale_price"]) + data = data[data["sale_count"] >= 20] summary = data.groupby(group).apply( lambda x: pd.Series( { - "size": x["size"].min(), + "size": np.size(x["ratio"]), "mv_count": x["mv_count"].min(), "sale_count": x["sale_count"].min(), "mv_min": x["tot_mv"].min(), @@ -90,10 +126,17 @@ def aggregrate(data, geography_type, group_type): "ratio_q90": x["ratio"].quantile(0.90), "ratio_max": x["ratio"].max(), "ratio_mean": x["ratio"].mean(), - # "cod": ass.cod(ratio=x["ratio"]), - # "prd": ass.prd(x["tot_mv"], x["sale_price"]), - # "prb": ass.prb(x["tot_mv"], 
x["sale_price"], 3)["prb"], - # "mki": ass.mki(x["tot_mv"], x["sale_price"]), + # "cod": ' '.join(x['ratio'].astype(str).values), + "cod": cod_safe(ratio=x["ratio"]), + "prd": prd_safe( + assessed=x["tot_mv"], sale_price=x["sale_price"] + ), + "prb": prb_safe( + assessed=x["tot_mv"], sale_price=x["sale_price"] + ), + "mki": mki_safe( + assessed=x["tot_mv"], sale_price=x["sale_price"] + ), } ) ) @@ -210,6 +253,7 @@ def model(dbt, spark_session): + "mv_sum: bigint, ratio_min: double, ratio_q10: double, " + "ratio_q25: double, ratio_median: double, ratio_q75: double, " + "ratio_q90: double, ratio_max: double, ratio_mean: double, " + + "cod: double, prd: double, prb: double, mki: double, " + "mv_delta_pct_median: double, mv_delta_pct_mean: double, " + "mv_delta_pct_sum: double" ) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql index 767d65990..0228ee4d8 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql @@ -119,4 +119,4 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales AND NOT sales.sale_filter_less_than_10k AND NOT sales.sale_filter_same_sale_within_365 WHERE uni.year >= '2020' - AND uni.year IN ('2022', '2023') AND uni.class = '278' + AND uni.year IN ('2022', '2023') AND uni.class IN ('278', '597') From 1f8ad1f1933ff46b42b5e32ddc84b98100e7ef56 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 26 Jun 2024 19:44:16 +0000 Subject: [PATCH 23/96] Add res_other group --- dbt/models/reporting/reporting.sot_assessment_roll.py | 2 +- dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 4 +++- dbt/models/reporting/reporting.sot_ratio_stats.py | 2 +- dbt/models/reporting/reporting.sot_ratio_stats_input.sql | 4 +++- dbt/models/reporting/reporting.sot_sales.py | 2 +- dbt/models/reporting/reporting.sot_sales_input.sql | 4 +++- dbt/models/reporting/reporting.sot_taxes_exemptions.py | 2 +- 
dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql | 4 +++- 8 files changed, 16 insertions(+), 8 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 2f99c35bc..6357ef79b 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -41,7 +41,7 @@ ], } # Declare class groupings -groups = ["no_group", "class", "major_class", "modeling_group"] +groups = ["no_group", "class", "major_class", "modeling_group", "res_other"] # Define aggregation functions diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index 8f3f7c3f2..76f58cc50 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -102,7 +102,9 @@ SELECT uni.tax_data_year, 'no_group' AS no_group, class_dict.major_class_type AS major_class, - class_dict.modeling_group + class_dict.modeling_group, + CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END + AS res_other FROM uni LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals ON uni.pin = vals.pin diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index e749a5563..d7bd2fc31 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -46,7 +46,7 @@ ], } # Declare class groupings -groups = ["no_group", "class", "major_class", "modeling_group"] +groups = ["no_group", "class", "major_class", "modeling_group", "res_other"] def cod_safe(ratio): diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql index 0228ee4d8..29a28ff92 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql +++ 
b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql @@ -102,7 +102,9 @@ SELECT uni.tax_data_year, 'no_group' AS no_group, class_dict.major_class_type AS major_class, - class_dict.modeling_group + class_dict.modeling_group, + CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END + AS res_other FROM uni LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index 9709e77b3..b00d76f84 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -41,7 +41,7 @@ ], } # Declare class groupings -groups = ["no_group", "class", "major_class", "modeling_group"] +groups = ["no_group", "class", "major_class", "modeling_group", "res_other"] # Define aggregation functions diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sales_input.sql index e6ca7afd4..dcd6fd085 100644 --- a/dbt/models/reporting/reporting.sot_sales_input.sql +++ b/dbt/models/reporting/reporting.sot_sales_input.sql @@ -83,7 +83,9 @@ SELECT uni.tax_data_year, 'no_group' AS no_group, class_dict.major_class_type AS major_class, - class_dict.modeling_group + class_dict.modeling_group, + CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END + AS res_other FROM {{ ref('default.vw_pin_universe') }} AS uni LEFT JOIN sf ON uni.pin = sf.pin diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index cb257a49e..5deccbd8c 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -41,7 +41,7 @@ ], } # Declare class groupings -groups = ["no_group", "class", "major_class", "modeling_group"] +groups = ["no_group", "class", "major_class", "modeling_group", "res_other"] # Define aggregation functions diff --git 
a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql index d0213ff5a..6bed59fdf 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql @@ -97,7 +97,9 @@ SELECT uni.tax_data_year, 'no_group' AS no_group, class_dict.major_class_type AS major_class, - class_dict.modeling_group + class_dict.modeling_group, + CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END + AS res_other FROM {{ ref('default.vw_pin_universe') }} AS uni INNER JOIN {{ source('tax', 'pin') }} AS tax ON uni.pin = tax.pin From 063591c109b7151a57b636287440b7d216748896 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 27 Jun 2024 15:54:09 +0000 Subject: [PATCH 24/96] Add reassessment year indicator for assessment roll --- .../reporting.sot_assessment_roll.py | 43 ++++++++++++++++++- .../reporting.sot_assessment_roll_input.sql | 15 ------- .../reporting.sot_ratio_stats_input.sql | 15 ------- 3 files changed, 42 insertions(+), 31 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 6357ef79b..225e15210 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -9,6 +9,14 @@ # Declare geographic groups and their associated data years geos = { + "year": [ + "county", + "triad", + "township", + "nbhd", + "tax_code", + "zip_code", + ], "census_data_year": [ "census_place", "census_tract", @@ -62,7 +70,12 @@ def q90(x): def first(x): - return x.iloc[0] + if len(x) >= 1: + output = x.iloc[0] + else: + output = None + + return output more_stats = [ @@ -81,6 +94,7 @@ def first(x): "tot": ["size", "count"] + more_stats, "bldg": more_stats, "land": more_stats, + "triad": [first], } @@ -131,6 +145,33 @@ def assemble(df, geos, groups): output.columns = ["_".join(col) for col 
in output.columns] output.reset_index() + output["year"] = output["year"].astype(int) + output["temp"] = output["geography_type"].isin( + ["triad", "township", "nbhd"] + ) + output["reassessment_year"] = None + output["reassessment_year"] = output["reassessment_year"].astype("boolean") + output.loc[(output["temp"] is True), "reassessment_year"] = False + output.loc[ + (output["year"] % 3 == 0) + & (output["triad"] == "North") + & (output["temp"] is True), + "reassessment_year", + ] = True + output.loc[ + (output["year"] % 3 == 1) + & (output["triad"] == "South") + & (output["temp"] is True), + "reassessment_year", + ] = True + output.loc[ + (output["year"] % 3 == 2) + & (output["triad"] == "City") + & (output["temp"] is True), + "reassessment_year", + ] = True + output.drop(["temp", "triad"], axis=1) + return output diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index 76f58cc50..31e6d7fb5 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -36,21 +36,6 @@ SELECT CAST(vals.tot AS INT) AS tot, CAST(vals.bldg AS INT) AS bldg, CAST(vals.land AS INT) AS land, - CASE - WHEN - MOD(CAST(uni.year AS INT), 3) = 0 - AND uni.triad_name = 'North' - THEN TRUE - WHEN - MOD(CAST(uni.year AS INT), 3) = 1 - AND uni.triad_name = 'South' - THEN TRUE - WHEN - MOD(CAST(uni.year AS INT), 3) = 2 - AND uni.triad_name = 'City' - THEN TRUE - ELSE FALSE - END AS reassessment_year, 'Cook' AS county, uni.triad_name AS triad, uni.township_name AS township, diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql index 29a28ff92..266024e0a 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql @@ -36,21 +36,6 @@ SELECT uni.class, CAST(vals.tot_mv AS DOUBLE) AS tot_mv, 
CAST(vals.tot_mv AS DOUBLE) / CAST(sales.sale_price AS DOUBLE) AS ratio, - CASE - WHEN - MOD(CAST(uni.year AS INT), 3) = 0 - AND uni.triad_name = 'North' - THEN TRUE - WHEN - MOD(CAST(uni.year AS INT), 3) = 1 - AND uni.triad_name = 'South' - THEN TRUE - WHEN - MOD(CAST(uni.year AS INT), 3) = 2 - AND uni.triad_name = 'City' - THEN TRUE - ELSE FALSE - END AS reassessment_year, 'Cook' AS county, uni.triad_name AS triad, uni.township_name AS township, From a9ffc648e8f712c9df21b2cc20e684d0dd296073 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 27 Jun 2024 19:02:35 +0000 Subject: [PATCH 25/96] Retry assessment_year indicator --- .../reporting.sot_assessment_roll.py | 23 ++++---- .../reporting/reporting.sot_ratio_stats.py | 56 +++++++++++++++++-- 2 files changed, 64 insertions(+), 15 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 225e15210..6916ab7bd 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -146,31 +146,32 @@ def assemble(df, geos, groups): output.reset_index() output["year"] = output["year"].astype(int) - output["temp"] = output["geography_type"].isin( + output["triennial"] = output["geography_type"].isin( ["triad", "township", "nbhd"] ) - output["reassessment_year"] = None - output["reassessment_year"] = output["reassessment_year"].astype("boolean") - output.loc[(output["temp"] is True), "reassessment_year"] = False + output["reassessment_year"] = "" + output.loc[ + (output["triennial"] == True), "reassessment_year" # noqa: E712 + ] = "No" output.loc[ (output["year"] % 3 == 0) & (output["triad"] == "North") - & (output["temp"] is True), + & (output["triennial"] == True), # noqa: E712 "reassessment_year", - ] = True + ] = "Yes" output.loc[ (output["year"] % 3 == 1) & (output["triad"] == "South") - & (output["temp"] is True), + & (output["triennial"] == True), # noqa: E712 
"reassessment_year", - ] = True + ] = "Yes" output.loc[ (output["year"] % 3 == 2) & (output["triad"] == "City") - & (output["temp"] is True), + & (output["triennial"] == True), # noqa: E712 "reassessment_year", - ] = True - output.drop(["temp", "triad"], axis=1) + ] = "Yes" + output = output.drop(["triennial", "triad"], axis=1) return output diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index d7bd2fc31..d0be7e91f 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -14,6 +14,14 @@ # Declare geographic groups and their associated data years geos = { + "year": [ + "county", + "triad", + "township", + "nbhd", + "tax_code", + "zip_code", + ], "census_data_year": [ "census_place", "census_tract", @@ -87,6 +95,15 @@ def mki_safe(assessed, sale_price): return output +def first(x): + if len(x) >= 1: + output = x.iloc[0] + else: + output = None + + return output + + # Define aggregation functions def aggregrate(data, geography_type, group_type): print(geography_type, group_type) @@ -106,6 +123,7 @@ def aggregrate(data, geography_type, group_type): summary = data.groupby(group).apply( lambda x: pd.Series( { + "triad": first(x["triad"]), "size": np.size(x["ratio"]), "mv_count": x["mv_count"].min(), "sale_count": x["sale_count"].min(), @@ -207,11 +225,40 @@ def clean(dirty): dirty = dirty.reset_index() + dirty["year"] = dirty["year"].astype(int) + dirty["triennial"] = dirty["geography_type"].isin( + ["triad", "township", "nbhd"] + ) + dirty["reassessment_year"] = "" + dirty.loc[ + (dirty["triennial"] == True), "reassessment_year" # noqa: E712 + ] = "No" + dirty.loc[ + (dirty["year"] % 3 == 0) + & (dirty["triad"] == "North") + & (dirty["triennial"] == True), # noqa: E712 + "reassessment_year", + ] = "Yes" + dirty.loc[ + (dirty["year"] % 3 == 1) + & (dirty["triad"] == "South") + & (dirty["triennial"] == True), # noqa: E712 + 
"reassessment_year", + ] = "Yes" + dirty.loc[ + (dirty["year"] % 3 == 2) + & (dirty["triad"] == "City") + & (dirty["triennial"] == True), # noqa: E712 + "reassessment_year", + ] = "Yes" + dirty = dirty.drop(["triennial", "triad"], axis=1) + dirty = dirty.astype( { "group_id": "str", - "year": "str", + "year": np.int64, "stage_name": "str", + "reassessment_year": "str", "size": np.int64, "mv_count": np.int64, "sale_count": np.int64, @@ -245,8 +292,9 @@ def model(dbt, spark_session): schema = ( "geography_type: string, geography_id: string, " - + "group_type: string, group_id: string, year: string, " - + "stage_name: string, size: bigint, mv_count: bigint, " + + "group_type: string, group_id: string, year: bigint, " + + "stage_name: string, size: bigint, " + + "mv_count: bigint, " + "sale_count: bigint, mv_min: bigint, mv_q10: bigint, " + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, " + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, " @@ -255,7 +303,7 @@ def model(dbt, spark_session): + "ratio_q90: double, ratio_max: double, ratio_mean: double, " + "cod: double, prd: double, prb: double, mki: double, " + "mv_delta_pct_median: double, mv_delta_pct_mean: double, " - + "mv_delta_pct_sum: double" + + "mv_delta_pct_sum: double, reassessment_year: string" ) spark_df = spark_session.createDataFrame(df, schema=schema) From 62dd68ee5733075a114adfee7fbbc2f8da7f3357 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Fri, 28 Jun 2024 14:59:12 +0000 Subject: [PATCH 26/96] Assessment_roll should run with reassessment year indicator --- dbt/models/reporting/reporting.sot_assessment_roll.py | 3 ++- dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 6916ab7bd..eb8ed43d7 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ 
b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -143,7 +143,8 @@ def assemble(df, geos, groups): ) output.columns = ["_".join(col) for col in output.columns] - output.reset_index() + output = output.reset_index() + output = output.rename(columns={"triad_first": "triad"}) output["year"] = output["year"].astype(int) output["triennial"] = output["geography_type"].isin( diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index 31e6d7fb5..470a01d27 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -97,4 +97,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals AND uni.stage_name = vals.stage_name LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code -WHERE uni.stage_name = 'MAILED' AND uni.class = '278' +WHERE uni.stage_name = 'MAILED' AND uni.class = '278' AND uni.year >= '2018' From c185e8102a84a186990e4a85402d8687c1bb88e6 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Fri, 28 Jun 2024 16:49:56 +0000 Subject: [PATCH 27/96] Add schema to assessment_roll table --- .../reporting.sot_assessment_roll.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index eb8ed43d7..5a51c704c 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -188,6 +188,26 @@ def model(dbt, spark_session): df = assemble(input, geos=geos, groups=groups) - spark_df = spark_session.createDataFrame(df) + schema = ( + "geography_type: string, geography_id: string, group_type: string, " + + "group_id: string, year: bigint, stage_name: string, " + + "tot_size: bigint, tot_count: bigint, tot_min: double, " + + "tot_q10: double, tot_q25: double, tot_median: double, 
" + + "tot_q75: double, tot_q90: double, tot_max: double, " + + "tot_mean: double, tot_sum: double, bldg_min: double, " + + "bldg_q10: double, bldg_q25: double, bldg_median: double, " + + "bldg_q75: double, bldg_q90: double, bldg_max: double, " + + "bldg_mean: double, bldg_sum: double, land_min: double, " + + "land_q10: double, land_q25: double, land_median: double, " + + "land_q75: double, land_q90: double, land_max: double, " + + "land_mean: double, land_sum: double, tot_deltamedian: double, " + + "bldg_deltamedian: double, land_deltamedian: double, " + + "tot_deltamean: double, bldg_deltamean: double, " + + "land_deltamean: double, tot_deltasum: double, " + + "bldg_deltasum: double, land_deltasum: double, " + + "tot_pct_w_value: double, reassessment_year: string" + ) + + spark_df = spark_session.createDataFrame(df, schema=schema) return spark_df From d08bc3d0695a0b81e8898913816c1ccbd2433110 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Fri, 28 Jun 2024 20:21:05 +0000 Subject: [PATCH 28/96] Correct output from sales and taxes tables --- dbt/models/reporting/reporting.sot_sales.py | 10 +++++++++- dbt/models/reporting/reporting.sot_taxes_exemptions.py | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index b00d76f84..899f895c7 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -9,6 +9,14 @@ # Declare geographic groups and their associated data years geos = { + "year": [ + "county", + "triad", + "township", + "nbhd", + "tax_code", + "zip_code", + ], "census_data_year": [ "census_place", "census_tract", @@ -128,7 +136,7 @@ def assemble(df, geos, groups): output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff() output.columns = ["_".join(col) for col in output.columns] - output.reset_index() + output = output.reset_index() return output diff --git 
a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index 5deccbd8c..4e1c89c9c 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -9,6 +9,14 @@ # Declare geographic groups and their associated data years geos = { + "year": [ + "county", + "triad", + "township", + "nbhd", + "tax_code", + "zip_code", + ], "census_data_year": [ "census_place", "census_tract", @@ -132,7 +140,7 @@ def assemble(df, geos, groups): ].diff() output.columns = ["_".join(col) for col in output.columns] - output.reset_index() + output = output.reset_index() return output From 4808aa4f774e06fba0ef9e60be01c3bd753b212f Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Fri, 28 Jun 2024 21:22:25 +0000 Subject: [PATCH 29/96] Add table schemas --- dbt/models/reporting/reporting.sot_sales.py | 60 +++++++++++----- .../reporting.sot_taxes_exemptions.py | 70 ++++++++++++++----- 2 files changed, 96 insertions(+), 34 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index 899f895c7..e606109b7 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -96,6 +96,27 @@ def first(x): } +def aggregrate(data, geography_type, group_type): + print(geography_type, group_type) + + group = [geography_type, group_type, "year"] + summary = data.groupby(group).agg(agg_func_math).round(2) + summary["geography_type"] = geography_type + summary["group_type"] = group_type + summary.index.names = ["geography_id", "group_id", "year"] + summary = summary.reset_index().set_index( + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "year", + ] + ) + + return summary + + def assemble(df, geos, groups): # Create an empty dataframe to fill with output output = pd.DataFrame() @@ -106,22 +127,7 @@ def assemble(df, geos, groups): for x in value: for z in 
groups: - group = [x, z, "year"] - summary = df.groupby(group).agg(agg_func_math).round(2) - summary["geography_type"] = x - summary["group_type"] = z - summary.index.names = ["geography_id", "group_id", "year"] - summary = summary.reset_index().set_index( - [ - "geography_type", - "geography_id", - "group_type", - "group_id", - "year", - ] - ) - - output = pd.concat([output, summary]) + output = pd.concat([output, aggregrate(df, x, z)]) # Clean combined output and export output["sale_price", "sum"] = output["sale_price", "sum"].replace( @@ -152,6 +158,26 @@ def model(dbt, spark_session): df = assemble(input, geos=geos, groups=groups) - spark_df = spark_session.createDataFrame(df) + schema = ( + "geography_type: string, geography_id: string, group_type: string, " + + "group_id: string, year: bigint, sale_price_size: double, " + + "sale_price_count: double, sale_price_min: double, " + + "sale_price_q10: double, sale_price_q25: double, " + + "sale_price_median: double, sale_price_q75: double, " + + "sale_price_q90: double, sale_price_max: double, " + + "sale_price_mean: double, sale_price_sum: double, " + + "price_per_sf_min: double, price_per_sf_q10: double, " + + "price_per_sf_q25: double, price_per_sf_median: double, " + + "price_per_sf_q75: double, price_per_sf_q90: double, " + + "price_per_sf_max: double, price_per_sf_mean: double, " + + "price_per_sf_sum: double, char_bldg_sf_median: double, " + + "char_land_sf_median: double, char_yrblt_median: double, " + + "class_multimode: array, data_year_first: bigint," + + "sale_price_deltamedian: double, price_per_sf_deltamedian: double, " + + "sale_price_deltamean: double, price_per_sf_deltamean: double, " + + "sale_price_deltasum: double, price_per_sf_deltasum: double" + ) + + spark_df = spark_session.createDataFrame(df, schema=schema) return spark_df diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index 4e1c89c9c..af4f4ca9f 100644 --- 
a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -106,6 +106,27 @@ def first(x): } +def aggregrate(data, geography_type, group_type): + print(geography_type, group_type) + + group = [geography_type, group_type, "year"] + summary = data.groupby(group).agg(agg_func_math).round(2) + summary["geography_type"] = geography_type + summary["group_type"] = group_type + summary.index.names = ["geography_id", "group_id", "year"] + summary = summary.reset_index().set_index( + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "year", + ] + ) + + return summary + + def assemble(df, geos, groups): # Create an empty dataframe to fill with output output = pd.DataFrame() @@ -116,22 +137,7 @@ def assemble(df, geos, groups): for x in value: for z in groups: - group = [x, z, "year"] - summary = df.groupby(group).agg(agg_func_math).round(2) - summary["geography_type"] = x - summary["group_type"] = z - summary.index.names = ["geography_id", "group_id", "year"] - summary = summary.reset_index().set_index( - [ - "geography_type", - "geography_id", - "group_type", - "group_id", - "year", - ] - ) - - output = pd.concat([output, summary]) + output = pd.concat([output, aggregrate(df, x, z)]) # Clean combined output and export for i in ["median", "mean", "sum"]: @@ -156,6 +162,36 @@ def model(dbt, spark_session): df = assemble(input, geos=geos, groups=groups) - spark_df = spark_session.createDataFrame(df) + schema = ( + "geography_type: string, geography_id: string, group_type: string, " + + "group_id: string, year: bigint, eq_factor_final_size: bigint, " + + "eq_factor_final_first: double, eq_factor_tentative_first: double, " + + "tax_bill_total_min: double, tax_bill_total_q10: double, " + + "tax_bill_total_q25: double, tax_bill_total_median: double, " + + "tax_bill_total_q75: double, tax_bill_total_q90: double, " + + "tax_bill_total_max: double, tax_bill_total_mean: double, " + + "tax_bill_total_sum: 
double, tax_code_rate_min: double, " + + "tax_code_rate_q10: double, tax_code_rate_q25: double, " + + "tax_code_rate_median: double, tax_code_rate_q75: double, " + + "tax_code_rate_q90: double, tax_code_rate_max: double, " + + "tax_code_rate_mean: double, tax_code_rate_sum: double, " + + "av_clerk_min: int, av_clerk_q10: double, av_clerk_q25: double, " + + "av_clerk_median: double, av_clerk_q75: double, " + + "av_clerk_q90: double, av_clerk_max: int, av_clerk_mean: double, " + + "av_clerk_sum: double, exe_homeowner_count: bigint, " + + "exe_homeowner_sum: double, exe_senior_count: bigint, " + + "exe_senior_sum: double, exe_freeze_count: bigint, " + + "exe_freeze_sum: double, exe_longtime_homeowner_count: bigint, " + + "exe_longtime_homeowner_sum: double, exe_disabled_count: bigint, " + + "exe_disabled_sum: double, exe_vet_returning_count: bigint, " + + "exe_vet_returning_sum: double, exe_vet_dis_lt50_count: bigint, " + + "exe_vet_dis_lt50_sum: double, exe_vet_dis_50_69_count: bigint, " + + "exe_vet_dis_50_69_sum: double, exe_vet_dis_ge70_count: bigint, " + + "exe_vet_dis_ge70_sum: double, exe_abate_count: bigint, " + + "exe_abate_sum: double, tax_bill_total_deltamedian: double, " + + "tax_bill_total_deltamean: double, tax_bill_total_deltasum: double" + ) + + spark_df = spark_session.createDataFrame(df, schema=schema) return spark_df From 08c8d53d258d1fd2459d1cbf4e0d13fbb8bcec60 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Fri, 28 Jun 2024 21:40:49 +0000 Subject: [PATCH 30/96] Fix schemas --- dbt/models/reporting/reporting.sot_sales.py | 4 ++-- dbt/models/reporting/reporting.sot_taxes_exemptions.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index e606109b7..d35e949be 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -160,7 +160,7 @@ def model(dbt, spark_session): schema = ( 
"geography_type: string, geography_id: string, group_type: string, " - + "group_id: string, year: bigint, sale_price_size: double, " + + "group_id: string, year: string, sale_price_size: double, " + "sale_price_count: double, sale_price_min: double, " + "sale_price_q10: double, sale_price_q25: double, " + "sale_price_median: double, sale_price_q75: double, " @@ -172,7 +172,7 @@ def model(dbt, spark_session): + "price_per_sf_max: double, price_per_sf_mean: double, " + "price_per_sf_sum: double, char_bldg_sf_median: double, " + "char_land_sf_median: double, char_yrblt_median: double, " - + "class_multimode: array, data_year_first: bigint," + + "class_multimode: array, data_year_first: string," + "sale_price_deltamedian: double, price_per_sf_deltamedian: double, " + "sale_price_deltamean: double, price_per_sf_deltamean: double, " + "sale_price_deltasum: double, price_per_sf_deltasum: double" diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index af4f4ca9f..07691ec5b 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -164,7 +164,7 @@ def model(dbt, spark_session): schema = ( "geography_type: string, geography_id: string, group_type: string, " - + "group_id: string, year: bigint, eq_factor_final_size: bigint, " + + "group_id: string, year: string, eq_factor_final_size: bigint, " + "eq_factor_final_first: double, eq_factor_tentative_first: double, " + "tax_bill_total_min: double, tax_bill_total_q10: double, " + "tax_bill_total_q25: double, tax_bill_total_median: double, " From 2f8dc3dca5be1dffbf2e0f05dc54f084232e19b8 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Mon, 1 Jul 2024 20:33:16 +0000 Subject: [PATCH 31/96] Resolve sales table column type issues --- dbt/models/reporting/reporting.sot_sales.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index d35e949be..65889953b 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -160,8 +160,8 @@ def model(dbt, spark_session): schema = ( "geography_type: string, geography_id: string, group_type: string, " - + "group_id: string, year: string, sale_price_size: double, " - + "sale_price_count: double, sale_price_min: double, " + + "group_id: string, year: string, sale_price_size: bigint, " + + "sale_price_count: int, sale_price_min: double, " + "sale_price_q10: double, sale_price_q25: double, " + "sale_price_median: double, sale_price_q75: double, " + "sale_price_q90: double, sale_price_max: double, " From 88ce0496dd50092dcd45c2ee7c24c6f682cde1ec Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 2 Jul 2024 15:45:39 +0000 Subject: [PATCH 32/96] Add exe_total to exemptions table --- .../reporting/reporting.sot_taxes_exemptions.py | 4 +++- .../reporting.sot_taxes_exemptions_input.sql | 11 +++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index 07691ec5b..044f59a64 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -103,6 +103,7 @@ def first(x): "exe_vet_dis_50_69": less_stats, "exe_vet_dis_ge70": less_stats, "exe_abate": less_stats, + "exe_total": less_stats, } @@ -188,7 +189,8 @@ def model(dbt, spark_session): + "exe_vet_dis_lt50_sum: double, exe_vet_dis_50_69_count: bigint, " + "exe_vet_dis_50_69_sum: double, exe_vet_dis_ge70_count: bigint, " + "exe_vet_dis_ge70_sum: double, exe_abate_count: bigint, " - + "exe_abate_sum: double, tax_bill_total_deltamedian: double, " + + "exe_abate_sum: double, exe_total_count: bigint, " + + "exe_total_sum: double, tax_bill_total_deltamedian: double, " + 
"tax_bill_total_deltamean: double, tax_bill_total_deltasum: double" ) diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql index 6bed59fdf..281b69267 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql @@ -42,6 +42,17 @@ SELECT CASE WHEN tax.exe_vet_dis_ge70 = 0 THEN NULL ELSE tax.exe_vet_dis_ge70 END AS exe_vet_dis_ge70, CASE WHEN tax.exe_abate = 0 THEN NULL ELSE tax.exe_abate END AS exe_abate, + CASE + WHEN tax.exe_homeowner + tax.exe_senior + tax.exe_freeze + + tax.exe_longtime_homeowner + tax.exe_disabled + + tax.exe_vet_returning + tax.exe_vet_dis_lt50 + + tax.exe_vet_dis_50_69 + tax.exe_vet_dis_ge70 + tax.exe_abate = 0 + THEN NULL ELSE + tax.exe_homeowner + tax.exe_senior + tax.exe_freeze + + tax.exe_longtime_homeowner + tax.exe_disabled + + tax.exe_vet_returning + tax.exe_vet_dis_lt50 + + tax.exe_vet_dis_50_69 + tax.exe_vet_dis_ge70 + tax.exe_abate + END AS exe_total, tcd.tax_code_rate, eqf.eq_factor_tentative, eqf.eq_factor_final, From 271576d839c8cde9cb0359b1bff54d4205d22efd Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 2 Jul 2024 16:34:05 +0000 Subject: [PATCH 33/96] Add more ratio stats --- .../reporting/reporting.sot_ratio_stats.py | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index d0be7e91f..c226dc532 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -104,6 +104,14 @@ def first(x): return output +def met(x, lower_limit, upper_limit): + return np.logical_and(lower_limit <= x, x <= upper_limit) + + +def within(x, limit): + return np.logical_and(1 - limit < x, x < 1 + limit) + + # Define aggregation functions def aggregrate(data, geography_type, group_type): 
print(geography_type, group_type) @@ -253,6 +261,16 @@ def clean(dirty): ] = "Yes" dirty = dirty.drop(["triennial", "triad"], axis=1) + dirty["cod_met"] = met(dirty["cod"], 5, 15) + dirty["prd_met"] = met(dirty["prd"], 0.98, 1.03) + dirty["prb_met"] = met(dirty["prb"], -0.05, 0.05) + dirty["mki_met"] = met(dirty["mki"], 0.95, 1.05) + + dirty["within_05_pct"] = within(dirty["ratio_mean"], 0.05) + dirty["within_10_pct"] = within(dirty["ratio_mean"], 0.1) + dirty["within_15_pct"] = within(dirty["ratio_mean"], 0.15) + dirty["within_20_pct"] = within(dirty["ratio_mean"], 0.2) + dirty = dirty.astype( { "group_id": "str", @@ -303,7 +321,11 @@ def model(dbt, spark_session): + "ratio_q90: double, ratio_max: double, ratio_mean: double, " + "cod: double, prd: double, prb: double, mki: double, " + "mv_delta_pct_median: double, mv_delta_pct_mean: double, " - + "mv_delta_pct_sum: double, reassessment_year: string" + + "mv_delta_pct_sum: double, reassessment_year: string, " + + "cod_met: boolean, prd_met: boolean, prb_met: boolean, " + + "mki_met: boolean, within_05_pct: boolean, " + + "within_10_pct: boolean, within_15_pct: boolean, " + + "within_20_pct: boolean" ) spark_df = spark_session.createDataFrame(df, schema=schema) From c39a2d82c6178cf176758812c5f4a7ba7a5c3a57 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 2 Jul 2024 22:00:49 +0000 Subject: [PATCH 34/96] Clean sales table columns --- dbt/models/reporting/reporting.sot_sales.py | 110 ++++++++++++++---- .../reporting/reporting.sot_sales_input.sql | 14 +-- 2 files changed, 90 insertions(+), 34 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index 65889953b..aa8229b87 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -87,10 +87,10 @@ def first(x): agg_func_math = { "sale_price": ["size", "count"] + more_stats, - "price_per_sf": more_stats, - "char_bldg_sf": ["median"], - "char_land_sf": 
["median"], - "char_yrblt": ["median"], + "sale_price_per_sf": more_stats, + "sale_char_bldg_sf": ["median"], + "sale_char_land_sf": ["median"], + "sale_char_yrblt": ["median"], "class": [stats.multimode], "data_year": [first], } @@ -133,17 +133,76 @@ def assemble(df, geos, groups): output["sale_price", "sum"] = output["sale_price", "sum"].replace( 0, np.NaN ) - output["price_per_sf", "sum"] = output["price_per_sf", "sum"].replace( - 0, np.NaN - ) + output["sale_price_per_sf", "sum"] = output[ + "sale_price_per_sf", "sum" + ].replace(0, np.NaN) for i in ["median", "mean", "sum"]: output["sale_price", "delta" + i] = output["sale_price", i].diff() - output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff() + output["sale_price_per_sf", "delta" + i] = output[ + "sale_price_per_sf", i + ].diff() output.columns = ["_".join(col) for col in output.columns] output = output.reset_index() + output = clean_names(output) + + return output + + +def clean_names(x): + output = x.rename( + columns={ + "sale_price_size": "pin_n_tot", + "year": "sale_year", + "sale_price_count": "sale_n_tot", + "class_multimode": "sale_class_mode", + "data_year_first": "data_year", + } + ) + + output = output[ + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "sale_year", + "pin_n_tot", + "sale_n_tot", + "sale_price_min", + "sale_price_q10", + "sale_price_q25", + "sale_price_median", + "sale_price_q75", + "sale_price_q90", + "sale_price_max", + "sale_price_mean", + "sale_price_sum", + "sale_price_deltamedian", + "sale_price_deltamean", + "sale_price_deltasum", + "sale_price_per_sf_min", + "sale_price_per_sf_q10", + "sale_price_per_sf_q25", + "sale_price_per_sf_median", + "sale_price_per_sf_q75", + "sale_price_per_sf_q90", + "sale_price_per_sf_max", + "sale_price_per_sf_mean", + "sale_price_per_sf_sum", + "sale_price_per_sf_deltamedian", + "sale_price_per_sf_deltamean", + "sale_price_per_sf_deltasum", + "sale_char_bldg_sf_median", + "sale_char_land_sf_median", + 
"sale_char_yrblt_median", + "sale_class_mode", + "data_year", + ] + ] + return output @@ -160,22 +219,25 @@ def model(dbt, spark_session): schema = ( "geography_type: string, geography_id: string, group_type: string, " - + "group_id: string, year: string, sale_price_size: bigint, " - + "sale_price_count: int, sale_price_min: double, " - + "sale_price_q10: double, sale_price_q25: double, " - + "sale_price_median: double, sale_price_q75: double, " - + "sale_price_q90: double, sale_price_max: double, " - + "sale_price_mean: double, sale_price_sum: double, " - + "price_per_sf_min: double, price_per_sf_q10: double, " - + "price_per_sf_q25: double, price_per_sf_median: double, " - + "price_per_sf_q75: double, price_per_sf_q90: double, " - + "price_per_sf_max: double, price_per_sf_mean: double, " - + "price_per_sf_sum: double, char_bldg_sf_median: double, " - + "char_land_sf_median: double, char_yrblt_median: double, " - + "class_multimode: array, data_year_first: string," - + "sale_price_deltamedian: double, price_per_sf_deltamedian: double, " - + "sale_price_deltamean: double, price_per_sf_deltamean: double, " - + "sale_price_deltasum: double, price_per_sf_deltasum: double" + + "group_id: string, sale_year: string, pin_n_tot: bigint, " + + "sale_n_tot: int, sale_price_min: double, sale_price_q10: double, " + + "sale_price_q25: double, sale_price_median: double, " + + "sale_price_q75: double, sale_price_q90: double, " + + "sale_price_max: double, sale_price_mean: double, " + + "sale_price_sum: double, sale_price_deltamedian: double, " + + "sale_price_deltamean: double, sale_price_deltasum: double, " + + "sale_price_per_sf_min: double, sale_price_per_sf_q10: double, " + + "sale_price_per_sf_q25: double, sale_price_per_sf_median: double, " + + "sale_price_per_sf_q75: double, sale_price_per_sf_q90: double, " + + "sale_price_per_sf_max: double, sale_price_per_sf_mean: double, " + + "sale_price_per_sf_sum: double, " + + "sale_price_per_sf_deltamedian: double, " + + 
"sale_price_per_sf_deltamean: double, " + + "sale_price_per_sf_deltasum: double, " + + "sale_char_bldg_sf_median: double, " + + "sale_char_land_sf_median: double, " + + "sale_char_yrblt_median: double, sale_class_mode: array, " + + "data_year: string" ) spark_df = spark_session.createDataFrame(df, schema=schema) diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sales_input.sql index dcd6fd085..575bb8aed 100644 --- a/dbt/models/reporting/reporting.sot_sales_input.sql +++ b/dbt/models/reporting/reporting.sot_sales_input.sql @@ -23,13 +23,10 @@ SELECT CASE WHEN sf.char_bldg_sf > 0 THEN CAST(sales.sale_price / sf.char_bldg_sf AS DOUBLE) - END AS price_per_sf, - CAST(sf.char_bldg_sf AS INT) AS char_bldg_sf, - CAST(sf.char_land_sf AS INT) AS char_land_sf, - CAST(sf.char_yrblt AS INT) AS char_yrblt, - CAST(hist.oneyr_pri_mailed_bldg AS DOUBLE) AS oneyr_pri_mailed_bldg, - CAST(hist.oneyr_pri_mailed_land AS DOUBLE) AS oneyr_pri_mailed_land, - CAST(hist.oneyr_pri_mailed_tot AS DOUBLE) AS oneyr_pri_mailed_tot, + END AS sale_price_per_sf, + CAST(sf.char_bldg_sf AS INT) AS sale_char_bldg_sf, + CAST(sf.char_land_sf AS INT) AS sale_char_land_sf, + CAST(sf.char_yrblt AS INT) AS sale_char_yrblt, uni.year, uni.class, 'Cook' AS county, @@ -92,9 +89,6 @@ LEFT JOIN sf AND uni.year = sf.year LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code -LEFT JOIN {{ ref('default.vw_pin_history') }} AS hist - ON uni.pin = hist.pin - AND uni.year = hist.year LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales ON uni.pin = sales.pin AND uni.year = sales.year From 20c9bd6a59120077991aa14f20caeae60f00f608 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 3 Jul 2024 16:53:36 +0000 Subject: [PATCH 35/96] Clean taxes table columns --- dbt/models/reporting/reporting.sot_sales.py | 26 +-- .../reporting.sot_taxes_exemptions.py | 170 ++++++++++++++---- .../reporting.sot_taxes_exemptions_input.sql | 31 ++-- 3 files changed, 
160 insertions(+), 67 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index aa8229b87..6d9c8f563 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -138,8 +138,8 @@ def assemble(df, geos, groups): ].replace(0, np.NaN) for i in ["median", "mean", "sum"]: - output["sale_price", "delta" + i] = output["sale_price", i].diff() - output["sale_price_per_sf", "delta" + i] = output[ + output["sale_price", "delta_" + i] = output["sale_price", i].diff() + output["sale_price_per_sf", "delta_" + i] = output[ "sale_price_per_sf", i ].diff() @@ -180,9 +180,9 @@ def clean_names(x): "sale_price_max", "sale_price_mean", "sale_price_sum", - "sale_price_deltamedian", - "sale_price_deltamean", - "sale_price_deltasum", + "sale_price_delta_median", + "sale_price_delta_mean", + "sale_price_delta_sum", "sale_price_per_sf_min", "sale_price_per_sf_q10", "sale_price_per_sf_q25", @@ -192,9 +192,9 @@ def clean_names(x): "sale_price_per_sf_max", "sale_price_per_sf_mean", "sale_price_per_sf_sum", - "sale_price_per_sf_deltamedian", - "sale_price_per_sf_deltamean", - "sale_price_per_sf_deltasum", + "sale_price_per_sf_delta_median", + "sale_price_per_sf_delta_mean", + "sale_price_per_sf_delta_sum", "sale_char_bldg_sf_median", "sale_char_land_sf_median", "sale_char_yrblt_median", @@ -224,16 +224,16 @@ def model(dbt, spark_session): + "sale_price_q25: double, sale_price_median: double, " + "sale_price_q75: double, sale_price_q90: double, " + "sale_price_max: double, sale_price_mean: double, " - + "sale_price_sum: double, sale_price_deltamedian: double, " - + "sale_price_deltamean: double, sale_price_deltasum: double, " + + "sale_price_sum: double, sale_price_delta_median: double, " + + "sale_price_delta_mean: double, sale_price_delta_sum: double, " + "sale_price_per_sf_min: double, sale_price_per_sf_q10: double, " + "sale_price_per_sf_q25: double, sale_price_per_sf_median: 
double, " + "sale_price_per_sf_q75: double, sale_price_per_sf_q90: double, " + "sale_price_per_sf_max: double, sale_price_per_sf_mean: double, " + "sale_price_per_sf_sum: double, " - + "sale_price_per_sf_deltamedian: double, " - + "sale_price_per_sf_deltamean: double, " - + "sale_price_per_sf_deltasum: double, " + + "sale_price_per_sf_delta_median: double, " + + "sale_price_per_sf_delta_mean: double, " + + "sale_price_per_sf_delta_sum: double, " + "sale_char_bldg_sf_median: double, " + "sale_char_land_sf_median: double, " + "sale_char_yrblt_median: double, sale_class_mode: array, " diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index 044f59a64..52a085aab 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -88,22 +88,22 @@ def first(x): less_stats = ["count", "sum"] agg_func_math = { - "eq_factor_final": ["size", first], - "eq_factor_tentative": [first], + "tax_eq_factor_final": ["size", first], + "tax_eq_factor_tentative": [first], "tax_bill_total": more_stats, - "tax_code_rate": more_stats, - "av_clerk": more_stats, - "exe_homeowner": less_stats, - "exe_senior": less_stats, - "exe_freeze": less_stats, - "exe_longtime_homeowner": less_stats, - "exe_disabled": less_stats, - "exe_vet_returning": less_stats, - "exe_vet_dis_lt50": less_stats, - "exe_vet_dis_50_69": less_stats, - "exe_vet_dis_ge70": less_stats, - "exe_abate": less_stats, - "exe_total": less_stats, + "tax_rate": more_stats, + "tax_av": more_stats, + "tax_exe_homeowner": less_stats, + "tax_exe_senior": less_stats, + "tax_exe_freeze": less_stats, + "tax_exe_longtime_homeowner": less_stats, + "tax_exe_disabled": less_stats, + "tax_exe_vet_returning": less_stats, + "tax_exe_vet_dis_lt50": less_stats, + "tax_exe_vet_dis_50_69": less_stats, + "tax_exe_vet_dis_ge70": less_stats, + "tax_exe_abate": less_stats, + "tax_exe_total": less_stats, } @@ -142,13 
+142,103 @@ def assemble(df, geos, groups): # Clean combined output and export for i in ["median", "mean", "sum"]: - output["tax_bill_total", "delta" + i] = output[ + output["tax_bill_total", "delta_" + i] = output[ "tax_bill_total", i ].diff() output.columns = ["_".join(col) for col in output.columns] output = output.reset_index() + output = clean_names(output) + + return output + + +def clean_names(x): + output = x.rename( + columns={ + "tax_eq_factor_final_size": "pin_n_tot", + "year": "tax_year", + "tax_exe_homeowner_count": "tax_exe_n_homeowner", + "tax_exe_senior_count": "tax_exe_n_senior", + "tax_exe_freeze_count": "tax_exe_n_freeze", + "tax_exe_longtime_homeowner_count": "tax_exe_n_longtime_homeowner", + "tax_exe_disabled_count": "tax_exe_n_disabled", + "tax_exe_vet_returning_count": "tax_exe_n_vet_returning", + "tax_exe_vet_dis_lt50_count": "tax_exe_n_vet_dis_lt50", + "tax_exe_vet_dis_50_69_count": "tax_exe_n_vet_dis_50_69", + "tax_exe_vet_dis_ge70_count": "tax_exe_n_vet_dis_ge70", + "tax_exe_abate_count": "tax_exe_n_abate", + "tax_exe_total_count": "tax_exe_n_total", + "tax_eq_factor_final_first": "tax_eq_factor_final", + "tax_eq_factor_tentative_first": "tax_eq_factor_tentative", + } + ) + + output = output[ + [ + "geography_type", + "geography_id", + "group_type", + "group_id", + "tax_year", + "pin_n_tot", + "tax_eq_factor_final", + "tax_eq_factor_tentative", + "tax_bill_total_min", + "tax_bill_total_q10", + "tax_bill_total_q25", + "tax_bill_total_median", + "tax_bill_total_q75", + "tax_bill_total_q90", + "tax_bill_total_max", + "tax_bill_total_mean", + "tax_bill_total_sum", + "tax_bill_total_delta_median", + "tax_bill_total_delta_mean", + "tax_bill_total_delta_sum", + "tax_rate_min", + "tax_rate_q10", + "tax_rate_q25", + "tax_rate_median", + "tax_rate_q75", + "tax_rate_q90", + "tax_rate_max", + "tax_rate_mean", + "tax_rate_sum", + "tax_av_min", + "tax_av_q10", + "tax_av_q25", + "tax_av_median", + "tax_av_q75", + "tax_av_q90", + "tax_av_max", + 
"tax_av_mean", + "tax_av_sum", + "tax_exe_n_homeowner", + "tax_exe_homeowner_sum", + "tax_exe_n_senior", + "tax_exe_senior_sum", + "tax_exe_n_freeze", + "tax_exe_freeze_sum", + "tax_exe_n_longtime_homeowner", + "tax_exe_longtime_homeowner_sum", + "tax_exe_n_disabled", + "tax_exe_disabled_sum", + "tax_exe_n_vet_returning", + "tax_exe_vet_returning_sum", + "tax_exe_n_vet_dis_lt50", + "tax_exe_vet_dis_lt50_sum", + "tax_exe_n_vet_dis_50_69", + "tax_exe_vet_dis_50_69_sum", + "tax_exe_n_vet_dis_ge70", + "tax_exe_vet_dis_ge70_sum", + "tax_exe_n_abate", + "tax_exe_abate_sum", + "tax_exe_n_total", + "tax_exe_total_sum", + ] + ] return output @@ -165,33 +255,35 @@ def model(dbt, spark_session): schema = ( "geography_type: string, geography_id: string, group_type: string, " - + "group_id: string, year: string, eq_factor_final_size: bigint, " - + "eq_factor_final_first: double, eq_factor_tentative_first: double, " + + "group_id: string, tax_year: string, pin_n_tot: bigint, " + + "tax_eq_factor_final: double, tax_eq_factor_tentative: double, " + "tax_bill_total_min: double, tax_bill_total_q10: double, " + "tax_bill_total_q25: double, tax_bill_total_median: double, " + "tax_bill_total_q75: double, tax_bill_total_q90: double, " + "tax_bill_total_max: double, tax_bill_total_mean: double, " - + "tax_bill_total_sum: double, tax_code_rate_min: double, " - + "tax_code_rate_q10: double, tax_code_rate_q25: double, " - + "tax_code_rate_median: double, tax_code_rate_q75: double, " - + "tax_code_rate_q90: double, tax_code_rate_max: double, " - + "tax_code_rate_mean: double, tax_code_rate_sum: double, " - + "av_clerk_min: int, av_clerk_q10: double, av_clerk_q25: double, " - + "av_clerk_median: double, av_clerk_q75: double, " - + "av_clerk_q90: double, av_clerk_max: int, av_clerk_mean: double, " - + "av_clerk_sum: double, exe_homeowner_count: bigint, " - + "exe_homeowner_sum: double, exe_senior_count: bigint, " - + "exe_senior_sum: double, exe_freeze_count: bigint, " - + "exe_freeze_sum: 
double, exe_longtime_homeowner_count: bigint, " - + "exe_longtime_homeowner_sum: double, exe_disabled_count: bigint, " - + "exe_disabled_sum: double, exe_vet_returning_count: bigint, " - + "exe_vet_returning_sum: double, exe_vet_dis_lt50_count: bigint, " - + "exe_vet_dis_lt50_sum: double, exe_vet_dis_50_69_count: bigint, " - + "exe_vet_dis_50_69_sum: double, exe_vet_dis_ge70_count: bigint, " - + "exe_vet_dis_ge70_sum: double, exe_abate_count: bigint, " - + "exe_abate_sum: double, exe_total_count: bigint, " - + "exe_total_sum: double, tax_bill_total_deltamedian: double, " - + "tax_bill_total_deltamean: double, tax_bill_total_deltasum: double" + + "tax_bill_total_sum: double, tax_bill_total_delta_median: double, " + + "tax_bill_total_delta_mean: double, " + + "tax_bill_total_delta_sum: double , tax_rate_min: double, " + + "tax_rate_q10: double, tax_rate_q25: double, " + + "tax_rate_median: double, tax_rate_q75: double, " + + "tax_rate_q90: double, tax_rate_max: double, " + + "tax_rate_mean: double, tax_rate_sum: double, " + + "tax_av_min: int, tax_av_q10: double, tax_av_q25: double, " + + "tax_av_median: double, tax_av_q75: double, " + + "tax_av_q90: double, tax_av_max: int, tax_av_mean: double, " + + "tax_av_sum: double, tax_exe_n_homeowner: bigint, " + + "tax_exe_homeowner_sum: double, tax_exe_n_senior: bigint, " + + "tax_exe_senior_sum: double, tax_exe_n_freeze: bigint, " + + "tax_exe_freeze_sum: double, tax_exe_n_longtime_homeowner: bigint, " + + "tax_exe_longtime_homeowner_sum: double, " + + "tax_exe_n_disabled: bigint, tax_exe_disabled_sum: double, " + + "tax_exe_n_vet_returning: bigint, " + + "tax_exe_vet_returning_sum: double, tax_exe_n_vet_dis_lt50: bigint, " + + "tax_exe_vet_dis_lt50_sum: double, tax_exe_n_vet_dis_50_69: bigint, " + + "tax_exe_vet_dis_50_69_sum: double, tax_exe_n_vet_dis_ge70: bigint, " + + "tax_exe_vet_dis_ge70_sum: double, tax_exe_n_abate: bigint, " + + "tax_exe_abate_sum: double, tax_exe_n_total: bigint, " + + "tax_exe_total_sum: double" 
) spark_df = spark_session.createDataFrame(df, schema=schema) diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql index 281b69267..97cfff982 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql @@ -17,31 +17,32 @@ WITH tcd AS ( SELECT uni.pin, tax.year, - tax.av_clerk, + tax.av_clerk AS tax_av, tax.tax_bill_total, CASE WHEN tax.exe_homeowner = 0 THEN NULL ELSE tax.exe_homeowner END - AS exe_homeowner, + AS tax_exe_homeowner, CASE WHEN tax.exe_senior = 0 THEN NULL ELSE tax.exe_senior END - AS exe_senior, + AS tax_exe_senior, CASE WHEN tax.exe_freeze = 0 THEN NULL ELSE tax.exe_freeze END - AS exe_freeze, + AS tax_exe_freeze, CASE WHEN tax.exe_longtime_homeowner = 0 THEN NULL ELSE tax.exe_longtime_homeowner - END AS exe_longtime_homeowner, + END AS tax_exe_longtime_homeowner, CASE WHEN tax.exe_disabled = 0 THEN NULL ELSE tax.exe_disabled END - AS exe_disabled, + AS tax_exe_disabled, CASE WHEN tax.exe_vet_returning = 0 THEN NULL ELSE tax.exe_vet_returning - END AS exe_vet_returning, + END AS tax_exe_vet_returning, CASE WHEN tax.exe_vet_dis_lt50 = 0 THEN NULL ELSE tax.exe_vet_dis_lt50 END - AS exe_vet_dis_lt50, + AS tax_exe_vet_dis_lt50, CASE WHEN tax.exe_vet_dis_50_69 = 0 THEN NULL ELSE tax.exe_vet_dis_50_69 - END AS exe_vet_dis_50_69, + END AS tax_exe_vet_dis_50_69, CASE WHEN tax.exe_vet_dis_ge70 = 0 THEN NULL ELSE tax.exe_vet_dis_ge70 END - AS exe_vet_dis_ge70, - CASE WHEN tax.exe_abate = 0 THEN NULL ELSE tax.exe_abate END AS exe_abate, + AS tax_exe_vet_dis_ge70, + CASE WHEN tax.exe_abate = 0 THEN NULL ELSE tax.exe_abate END + AS tax_exe_abate, CASE WHEN tax.exe_homeowner + tax.exe_senior + tax.exe_freeze + tax.exe_longtime_homeowner + tax.exe_disabled @@ -52,10 +53,10 @@ SELECT + tax.exe_longtime_homeowner + tax.exe_disabled + tax.exe_vet_returning + tax.exe_vet_dis_lt50 + 
tax.exe_vet_dis_50_69 + tax.exe_vet_dis_ge70 + tax.exe_abate - END AS exe_total, - tcd.tax_code_rate, - eqf.eq_factor_tentative, - eqf.eq_factor_final, + END AS tax_exe_total, + tcd.tax_code_rate AS tax_rate, + eqf.eq_factor_tentative AS tax_eq_factor_tentative, + eqf.eq_factor_final AS tax_eq_factor_final, uni.class, 'Cook' AS county, uni.triad_name AS triad, From adc16eaac42f805036ec8ee12ed0f34dbd39b23a Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Sun, 7 Jul 2024 13:37:03 +0000 Subject: [PATCH 36/96] Clean assessment_roll columns --- .../reporting.sot_assessment_roll.py | 122 ++++++++++++++---- .../reporting.sot_assessment_roll_input.sql | 6 +- dbt/models/reporting/reporting.sot_sales.py | 14 +- .../reporting.sot_taxes_exemptions.py | 9 +- 4 files changed, 112 insertions(+), 39 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 5a51c704c..51a1c6f8d 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -91,10 +91,11 @@ def first(x): ] stats = { - "tot": ["size", "count"] + more_stats, - "bldg": more_stats, - "land": more_stats, + "av_tot": ["size", "count"] + more_stats, + "av_bldg": more_stats, + "av_land": more_stats, "triad": [first], + "geography_data_year": [first], } @@ -126,7 +127,7 @@ def assemble(df, geos, groups): # Loop through group combinations and stack output for key, value in geos.items(): - df["data_year"] = df[key] + df["geography_data_year"] = df[key] for x in value: for z in groups: @@ -134,12 +135,12 @@ def assemble(df, geos, groups): # Clean combined output and export for i in ["median", "mean", "sum"]: - output["tot", "delta" + i] = output["tot", i].diff() - output["bldg", "delta" + i] = output["bldg", i].diff() - output["land", "delta" + i] = output["land", i].diff() + output["av_tot", "delta_" + i] = output["av_tot", i].diff() + output["av_bldg", "delta_" + i] = 
output["av_bldg", i].diff() + output["av_land", "delta_" + i] = output["av_land", i].diff() - output["tot", "pct_w_value"] = ( - output["tot", "count"] / output["tot", "size"] + output["av_tot", "pct_w_value"] = ( + output["av_tot", "count"] / output["av_tot", "size"] ) output.columns = ["_".join(col) for col in output.columns] @@ -174,6 +175,73 @@ def assemble(df, geos, groups): ] = "Yes" output = output.drop(["triennial", "triad"], axis=1) + output = clean_names(output) + + return output + + +def clean_names(x): + output = x.rename( + columns={ + "av_tot_size": "pin_n_tot", + "av_tot_count": "pin_n_w_value", + "av_tot_pct_w_value": "pin_pct_w_value", + "geography_data_year_first": "geography_data_year", + } + ) + + output = output[ + [ + "geography_type", + "geography_id", + "geography_data_year", + "group_type", + "group_id", + "year", + "reassessment_year", + "stage_name", + "pin_n_tot", + "pin_n_w_value", + "pin_pct_w_value", + "av_tot_min", + "av_tot_q10", + "av_tot_q25", + "av_tot_median", + "av_tot_q75", + "av_tot_q90", + "av_tot_max", + "av_tot_mean", + "av_tot_sum", + "av_tot_delta_median", + "av_tot_delta_mean", + "av_tot_delta_sum", + "av_bldg_min", + "av_bldg_q10", + "av_bldg_q25", + "av_bldg_median", + "av_bldg_q75", + "av_bldg_q90", + "av_bldg_max", + "av_bldg_mean", + "av_bldg_sum", + "av_bldg_delta_median", + "av_bldg_delta_mean", + "av_bldg_delta_sum", + "av_land_min", + "av_land_q10", + "av_land_q25", + "av_land_median", + "av_land_q75", + "av_land_q90", + "av_land_max", + "av_land_mean", + "av_land_sum", + "av_land_delta_median", + "av_land_delta_mean", + "av_land_delta_sum", + ] + ] + return output @@ -189,23 +257,25 @@ def model(dbt, spark_session): df = assemble(input, geos=geos, groups=groups) schema = ( - "geography_type: string, geography_id: string, group_type: string, " - + "group_id: string, year: bigint, stage_name: string, " - + "tot_size: bigint, tot_count: bigint, tot_min: double, " - + "tot_q10: double, tot_q25: double, tot_median: 
double, " - + "tot_q75: double, tot_q90: double, tot_max: double, " - + "tot_mean: double, tot_sum: double, bldg_min: double, " - + "bldg_q10: double, bldg_q25: double, bldg_median: double, " - + "bldg_q75: double, bldg_q90: double, bldg_max: double, " - + "bldg_mean: double, bldg_sum: double, land_min: double, " - + "land_q10: double, land_q25: double, land_median: double, " - + "land_q75: double, land_q90: double, land_max: double, " - + "land_mean: double, land_sum: double, tot_deltamedian: double, " - + "bldg_deltamedian: double, land_deltamedian: double, " - + "tot_deltamean: double, bldg_deltamean: double, " - + "land_deltamean: double, tot_deltasum: double, " - + "bldg_deltasum: double, land_deltasum: double, " - + "tot_pct_w_value: double, reassessment_year: string" + "geography_type: string, geography_id: string, " + + "geography_data_year: string, group_type: string, group_id: string, " + + "year: string, reassessment_year: string, stage_name: string, " + + "pin_n_tot: bigint, pin_n_w_value: bigint, pin_pct_w_value: double, " + + "av_tot_min: double, av_tot_q10: double, av_tot_q25: double, " + + "av_tot_median: double, av_tot_q75: double, av_tot_q90: double, " + + "av_tot_max: double, av_tot_mean: double, av_tot_sum: double, " + + "av_tot_delta_median: double, av_tot_delta_mean: double, " + + "av_tot_delta_sum: double, av_bldg_min: double, " + + "av_bldg_q10: double, av_bldg_q25: double, av_bldg_median: double, " + + "av_bldg_q75: double, av_bldg_q90: double, av_bldg_max: double, " + + "av_bldg_mean: double, av_bldg_sum: double, " + + "av_bldg_delta_median: double, av_bldg_delta_mean: double, " + + "av_bldg_delta_sum: double, av_land_min: double, " + + "av_land_q10: double, av_land_q25: double, av_land_median: double, " + + "av_land_q75: double, av_land_q90: double, av_land_max: double, " + + "av_land_mean: double, av_land_sum: double, " + + "av_land_delta_median: double, av_land_delta_mean: double, " + + "av_land_delta_sum: double" ) spark_df = 
spark_session.createDataFrame(df, schema=schema) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index 470a01d27..91397a3f3 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -33,9 +33,9 @@ SELECT uni.year, uni.stage_name, uni.class, - CAST(vals.tot AS INT) AS tot, - CAST(vals.bldg AS INT) AS bldg, - CAST(vals.land AS INT) AS land, + CAST(vals.tot AS INT) AS av_tot, + CAST(vals.bldg AS INT) AS av_bldg, + CAST(vals.land AS INT) AS av_land, 'Cook' AS county, uni.triad_name AS triad, uni.township_name AS township, diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index 6d9c8f563..81d42b872 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -92,7 +92,7 @@ def first(x): "sale_char_land_sf": ["median"], "sale_char_yrblt": ["median"], "class": [stats.multimode], - "data_year": [first], + "geography_data_year": [first], } @@ -123,7 +123,7 @@ def assemble(df, geos, groups): # Loop through group combinations and stack output for key, value in geos.items(): - df["data_year"] = df[key] + df["geography_data_year"] = df[key] for x in value: for z in groups: @@ -158,7 +158,7 @@ def clean_names(x): "year": "sale_year", "sale_price_count": "sale_n_tot", "class_multimode": "sale_class_mode", - "data_year_first": "data_year", + "geography_data_year_first": "geography_data_year", } ) @@ -166,6 +166,7 @@ def clean_names(x): [ "geography_type", "geography_id", + "geography_data_year", "group_type", "group_id", "sale_year", @@ -199,7 +200,6 @@ def clean_names(x): "sale_char_land_sf_median", "sale_char_yrblt_median", "sale_class_mode", - "data_year", ] ] @@ -218,7 +218,8 @@ def model(dbt, spark_session): df = assemble(input, geos=geos, groups=groups) schema = ( - "geography_type: string, 
geography_id: string, group_type: string, " + "geography_type: string, geography_id: string, " + + "geography_data_year: string, group_type: string, " + "group_id: string, sale_year: string, pin_n_tot: bigint, " + "sale_n_tot: int, sale_price_min: double, sale_price_q10: double, " + "sale_price_q25: double, sale_price_median: double, " @@ -236,8 +237,7 @@ def model(dbt, spark_session): + "sale_price_per_sf_delta_sum: double, " + "sale_char_bldg_sf_median: double, " + "sale_char_land_sf_median: double, " - + "sale_char_yrblt_median: double, sale_class_mode: array, " - + "data_year: string" + + "sale_char_yrblt_median: double, sale_class_mode: array" ) spark_df = spark_session.createDataFrame(df, schema=schema) diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index 52a085aab..879dc1cf3 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -104,6 +104,7 @@ def first(x): "tax_exe_vet_dis_ge70": less_stats, "tax_exe_abate": less_stats, "tax_exe_total": less_stats, + "geography_data_year": [first], } @@ -134,7 +135,7 @@ def assemble(df, geos, groups): # Loop through group combinations and stack output for key, value in geos.items(): - df["data_year"] = df[key] + df["geography_data_year"] = df[key] for x in value: for z in groups: @@ -172,6 +173,7 @@ def clean_names(x): "tax_exe_total_count": "tax_exe_n_total", "tax_eq_factor_final_first": "tax_eq_factor_final", "tax_eq_factor_tentative_first": "tax_eq_factor_tentative", + "geography_data_year_first": "geography_data_year", } ) @@ -179,7 +181,7 @@ def clean_names(x): [ "geography_type", "geography_id", - "group_type", + "geography_data_year" "group_type", "group_id", "tax_year", "pin_n_tot", @@ -254,7 +256,8 @@ def model(dbt, spark_session): df = assemble(input, geos=geos, groups=groups) schema = ( - "geography_type: string, geography_id: string, group_type: string, " + 
"geography_type: string, geography_id: string, " + + "geography_data_year: string, group_type: string, " + "group_id: string, tax_year: string, pin_n_tot: bigint, " + "tax_eq_factor_final: double, tax_eq_factor_tentative: double, " + "tax_bill_total_min: double, tax_bill_total_q10: double, " From f8b87abfd0e0c42cd9fa95a4e8cd7115b6b42bd9 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Sun, 7 Jul 2024 14:29:12 +0000 Subject: [PATCH 37/96] Fix delta columns --- .../reporting.sot_assessment_roll.py | 151 +++++++++++++++--- dbt/models/reporting/reporting.sot_sales.py | 50 ++++-- .../reporting.sot_taxes_exemptions.py | 28 +++- 3 files changed, 193 insertions(+), 36 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 51a1c6f8d..d0a579b04 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -133,19 +133,122 @@ def assemble(df, geos, groups): for z in groups: output = pd.concat([output, aggregrate(df, x, z)]) + output.columns = ["_".join(col) for col in output.columns] + output = output.reset_index() + output = output.rename(columns={"triad_first": "triad"}) + # Clean combined output and export - for i in ["median", "mean", "sum"]: - output["av_tot", "delta_" + i] = output["av_tot", i].diff() - output["av_bldg", "delta_" + i] = output["av_bldg", i].diff() - output["av_land", "delta_" + i] = output["av_land", i].diff() + output["av_tot_pct_w_value"] = ( + output["av_tot_count"] / output["av_tot_size"] + ) - output["av_tot", "pct_w_value"] = ( - output["av_tot", "count"] / output["av_tot", "size"] + output["av_tot_delta_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_tot_median.diff() ) - output.columns = ["_".join(col) for col in output.columns] - output = output.reset_index() - output = output.rename(columns={"triad_first": "triad"}) + 
output["av_tot_delta_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_tot_mean.diff() + ) + + output["av_tot_delta_sum"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_tot_sum.diff() + ) + + output["av_bldg_delta_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_bldg_median.diff() + ) + + output["av_bldg_delta_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_bldg_mean.diff() + ) + + output["av_bldg_delta_sum"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_bldg_sum.diff() + ) + + output["av_land_delta_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_land_median.diff() + ) + + output["av_land_delta_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_land_mean.diff() + ) + + output["av_land_delta_sum"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_land_sum.diff() + ) + + output["av_tot_delta_pct_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_tot_median.pct_change() + ) + + output["av_tot_delta_pct_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_tot_mean.pct_change() + ) + + output["av_tot_delta_pct_sum"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_tot_sum.pct_change() + ) + + output["av_bldg_delta_pct_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_bldg_median.pct_change() + ) + + output["av_bldg_delta_pct_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_bldg_mean.pct_change() + ) + + 
output["av_bldg_delta_pct_sum"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_bldg_sum.pct_change() + ) + + output["av_land_delta_pct_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_land_median.pct_change() + ) + + output["av_land_delta_pct_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_land_mean.pct_change() + ) + + output["av_land_delta_pct_sum"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id", "stage_name"]) + .av_land_sum.pct_change() + ) output["year"] = output["year"].astype(int) output["triennial"] = output["geography_type"].isin( @@ -215,6 +318,9 @@ def clean_names(x): "av_tot_delta_median", "av_tot_delta_mean", "av_tot_delta_sum", + "av_tot_delta_pct_median", + "av_tot_delta_pct_mean", + "av_tot_delta_pct_sum", "av_bldg_min", "av_bldg_q10", "av_bldg_q25", @@ -227,6 +333,9 @@ def clean_names(x): "av_bldg_delta_median", "av_bldg_delta_mean", "av_bldg_delta_sum", + "av_bldg_delta_pct_median", + "av_bldg_delta_pct_mean", + "av_bldg_delta_pct_sum", "av_land_min", "av_land_q10", "av_land_q25", @@ -239,6 +348,9 @@ def clean_names(x): "av_land_delta_median", "av_land_delta_mean", "av_land_delta_sum", + "av_land_delta_pct_median", + "av_land_delta_pct_mean", + "av_land_delta_pct_sum", ] ] @@ -265,17 +377,20 @@ def model(dbt, spark_session): + "av_tot_median: double, av_tot_q75: double, av_tot_q90: double, " + "av_tot_max: double, av_tot_mean: double, av_tot_sum: double, " + "av_tot_delta_median: double, av_tot_delta_mean: double, " - + "av_tot_delta_sum: double, av_bldg_min: double, " - + "av_bldg_q10: double, av_bldg_q25: double, av_bldg_median: double, " - + "av_bldg_q75: double, av_bldg_q90: double, av_bldg_max: double, " - + "av_bldg_mean: double, av_bldg_sum: double, " + + "av_tot_delta_sum: double, av_tot_delta_pct_median: double, " + + "av_tot_delta_pct_mean: double, 
av_tot_delta_pct_sum: double, " + + "av_bldg_min: double, av_bldg_q10: double, av_bldg_q25: double, " + + "av_bldg_median: double, av_bldg_q75: double, av_bldg_q90: double, " + + "av_bldg_max: double, av_bldg_mean: double, av_bldg_sum: double, " + "av_bldg_delta_median: double, av_bldg_delta_mean: double, " - + "av_bldg_delta_sum: double, av_land_min: double, " - + "av_land_q10: double, av_land_q25: double, av_land_median: double, " - + "av_land_q75: double, av_land_q90: double, av_land_max: double, " - + "av_land_mean: double, av_land_sum: double, " + + "av_bldg_delta_sum: double, av_bldg_delta_pct_median: double, " + + "av_bldg_delta_pct_mean: double, av_bldg_delta_pct_sum: double, " + + "av_land_min: double, av_land_q10: double, av_land_q25: double, " + + "av_land_median: double, av_land_q75: double, av_land_q90: double, " + + "av_land_max: double, av_land_mean: double, av_land_sum: double, " + "av_land_delta_median: double, av_land_delta_mean: double, " - + "av_land_delta_sum: double" + + "av_land_delta_sum: double, av_land_delta_pct_median: double, " + + "av_land_delta_pct_mean: double, av_land_delta_pct_sum: double" ) spark_df = spark_session.createDataFrame(df, schema=schema) diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index 81d42b872..b2d55f7ae 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -129,22 +129,50 @@ def assemble(df, geos, groups): for z in groups: output = pd.concat([output, aggregrate(df, x, z)]) + output.columns = ["_".join(col) for col in output.columns] + output = output.reset_index() + # Clean combined output and export - output["sale_price", "sum"] = output["sale_price", "sum"].replace( + output["sale_price_sum"] = output["sale_price_sum"].replace(0, np.NaN) + output["sale_price_per_sf_sum"] = output["sale_price_per_sf_sum"].replace( 0, np.NaN ) - output["sale_price_per_sf", "sum"] = output[ - "sale_price_per_sf", "sum" - 
].replace(0, np.NaN) - for i in ["median", "mean", "sum"]: - output["sale_price", "delta_" + i] = output["sale_price", i].diff() - output["sale_price_per_sf", "delta_" + i] = output[ - "sale_price_per_sf", i - ].diff() + output["sale_price_delta_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .sale_price_median.diff() + ) + + output["sale_price_delta_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .sale_price_mean.diff() + ) - output.columns = ["_".join(col) for col in output.columns] - output = output.reset_index() + output["sale_price_delta_sum"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .sale_price_sum.diff() + ) + + output["sale_price_per_sf_delta_median"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .sale_price_per_sf_median.diff() + ) + + output["sale_price_per_sf_delta_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .sale_price_per_sf_mean.diff() + ) + + output["sale_price_per_sf_delta_sum"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .sale_price_per_sf_sum.diff() + ) output = clean_names(output) diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index 879dc1cf3..fc8eb9cc2 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -141,15 +141,28 @@ def assemble(df, geos, groups): for z in groups: output = pd.concat([output, aggregrate(df, x, z)]) - # Clean combined output and export - for i in ["median", "mean", "sum"]: - output["tax_bill_total", "delta_" + i] = output[ - "tax_bill_total", i - ].diff() - output.columns = ["_".join(col) for col in output.columns] output = output.reset_index() + # Clean combined output and export + output["tax_bill_total_delta_median"] = ( + output.sort_values("year") + 
.groupby(["geography_id", "group_id"]) + .tax_bill_total_median.diff() + ) + + output["tax_bill_total_delta_mean"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .tax_bill_total_mean.diff() + ) + + output["tax_bill_total_delta_sum"] = ( + output.sort_values("year") + .groupby(["geography_id", "group_id"]) + .tax_bill_total_sum.diff() + ) + output = clean_names(output) return output @@ -181,7 +194,8 @@ def clean_names(x): [ "geography_type", "geography_id", - "geography_data_year" "group_type", + "geography_data_year", + "group_type", "group_id", "tax_year", "pin_n_tot", From 54ebab83646464af5003a00e1f931009c610ec20 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Sun, 7 Jul 2024 16:49:07 +0000 Subject: [PATCH 38/96] Clean ratio table columns --- .../reporting/reporting.sot_ratio_stats.py | 116 +++++++++++++----- 1 file changed, 88 insertions(+), 28 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index c226dc532..f3b373583 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -117,24 +117,24 @@ def aggregrate(data, geography_type, group_type): print(geography_type, group_type) group = [geography_type, group_type, "year", "stage_name"] - data["size"] = data.groupby(group)["tot_mv"].transform("size") - data["sale_count"] = data.groupby(group)["sale_price"].transform("count") - data["mv_count"] = data.groupby(group)["tot_mv"].transform("count") + data["pin_n_tot"] = data.groupby(group)["tot_mv"].transform("size") + data["sale_n_tot"] = data.groupby(group)["sale_price"].transform("count") + data["pin_n_w_value"] = data.groupby(group)["tot_mv"].transform("count") # Remove parcels with FMVs of 0 since they screw up ratios data = data[data["tot_mv"] > 0] # Remove groups that only have one sale since we can't calculate stats data = data.dropna(subset=["sale_price"]) - data = 
data[data["sale_count"] >= 20] + data = data[data["sale_n_tot"] >= 20] summary = data.groupby(group).apply( lambda x: pd.Series( { "triad": first(x["triad"]), - "size": np.size(x["ratio"]), - "mv_count": x["mv_count"].min(), - "sale_count": x["sale_count"].min(), + "pin_n_tot": np.size(x["ratio"]), + "pin_n_w_value": x["pin_n_w_value"].min(), + "sale_n_tot": x["sale_n_tot"].min(), "mv_min": x["tot_mv"].min(), "mv_q10": x["tot_mv"].quantile(0.1), "mv_q25": x["tot_mv"].quantile(0.25), @@ -152,7 +152,6 @@ def aggregrate(data, geography_type, group_type): "ratio_q90": x["ratio"].quantile(0.90), "ratio_max": x["ratio"].max(), "ratio_mean": x["ratio"].mean(), - # "cod": ' '.join(x['ratio'].astype(str).values), "cod": cod_safe(ratio=x["ratio"]), "prd": prd_safe( assessed=x["tot_mv"], sale_price=x["sale_price"] @@ -163,6 +162,7 @@ def aggregrate(data, geography_type, group_type): "mki": mki_safe( assessed=x["tot_mv"], sale_price=x["sale_price"] ), + "geography_data_year": first(x["data_year"]), } ) ) @@ -203,18 +203,20 @@ def clean(dirty): ] ) + dirty["pin_pct_w_value"] = dirty["pin_n_w_value"] / dirty["pin_n_tot"] + # Clean combined dirty and export - dirty["mv_delta_pct_median"] = ( + dirty["mv_delta_median"] = ( dirty.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_median.diff() ) - dirty["mv_delta_pct_mean"] = ( + dirty["mv_delta_mean"] = ( dirty.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_mean.diff() ) - dirty["mv_delta_pct_sum"] = ( + dirty["mv_delta_sum"] = ( dirty.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_sum.diff() @@ -225,12 +227,19 @@ def clean(dirty): .groupby(["geography_id", "group_id", "stage_name"]) .mv_median.pct_change() ) + dirty["mv_delta_pct_mean"] = ( dirty.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_mean.pct_change() ) + dirty["mv_delta_pct_sum"] = ( + dirty.sort_values("year") + .groupby(["geography_id", "group_id", 
"stage_name"]) + .mv_sum.pct_change() + ) + dirty = dirty.reset_index() dirty["year"] = dirty["year"].astype(int) @@ -274,12 +283,11 @@ def clean(dirty): dirty = dirty.astype( { "group_id": "str", - "year": np.int64, + "year": "str", "stage_name": "str", "reassessment_year": "str", - "size": np.int64, - "mv_count": np.int64, - "sale_count": np.int64, + "pin_n_w_value": np.int64, + "sale_n_tot": np.int64, "mv_min": np.int64, "mv_q10": np.int64, "mv_q25": np.int64, @@ -292,6 +300,58 @@ def clean(dirty): } ) + dirty = dirty[ + [ + "geography_type", + "geography_id", + "geography_data_year", + "group_type", + "group_id", + "year", + "reassessment_year", + "stage_name", + "pin_n_tot", + "pin_n_w_value", + "pin_pct_w_value", + "sale_n_tot", + "mv_min", + "mv_q10", + "mv_q25", + "mv_median", + "mv_q75", + "mv_q90", + "mv_max", + "mv_mean", + "mv_sum", + "mv_delta_median", + "mv_delta_mean", + "mv_delta_sum", + "mv_delta_pct_median", + "mv_delta_pct_mean", + "mv_delta_pct_sum", + "ratio_min", + "ratio_q10", + "ratio_q25", + "ratio_median", + "ratio_q75", + "ratio_q90", + "ratio_max", + "ratio_mean", + "cod", + "prd", + "prb", + "mki", + "cod_met", + "prd_met", + "prb_met", + "mki_met", + "within_05_pct", + "within_10_pct", + "within_15_pct", + "within_20_pct", + ] + ] + return dirty @@ -310,20 +370,20 @@ def model(dbt, spark_session): schema = ( "geography_type: string, geography_id: string, " - + "group_type: string, group_id: string, year: bigint, " - + "stage_name: string, size: bigint, " - + "mv_count: bigint, " - + "sale_count: bigint, mv_min: bigint, mv_q10: bigint, " + + "geography_data_year: string, group_type: string, group_id: string, " + + "year: string, reassessment_year: string, stage_name: string, " + + "pin_n_tot: bigint, pin_n_w_value: bigint, pin_pct_w_value: double, " + + "sale_n_tot: bigint, mv_min: bigint, mv_q10: bigint, " + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, " - + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, " - + "mv_sum: 
bigint, ratio_min: double, ratio_q10: double, " - + "ratio_q25: double, ratio_median: double, ratio_q75: double, " - + "ratio_q90: double, ratio_max: double, ratio_mean: double, " - + "cod: double, prd: double, prb: double, mki: double, " - + "mv_delta_pct_median: double, mv_delta_pct_mean: double, " - + "mv_delta_pct_sum: double, reassessment_year: string, " - + "cod_met: boolean, prd_met: boolean, prb_met: boolean, " - + "mki_met: boolean, within_05_pct: boolean, " + + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, mv_sum: bigint, " + + "mv_delta_median: bigint, mv_delta_mean: bigint, " + + "mv_delta_sum: bigint, mv_delta_pct_median: double, " + + "mv_delta_pct_mean: double, mv_delta_pct_sum: double, " + + "ratio_min: double, ratio_q10: double, ratio_q25: double, " + + "ratio_median: double, ratio_q75: double, ratio_q90: double, " + + "ratio_max: double, ratio_mean: double, cod: double, prd: double, " + + "prb: double, mki: double, cod_met: boolean, prd_met: boolean, " + + "prb_met: boolean, mki_met: boolean, within_05_pct: boolean, " + "within_10_pct: boolean, within_15_pct: boolean, " + "within_20_pct: boolean" ) From d2dddab9693eabab606bc0763d47e65439e3723e Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Sun, 7 Jul 2024 17:35:01 +0000 Subject: [PATCH 39/96] Attempt to fix pin_n_tot type error that doesn't trigger locally --- dbt/models/reporting/reporting.sot_ratio_stats.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index f3b373583..3be65774c 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -287,6 +287,7 @@ def clean(dirty): "stage_name": "str", "reassessment_year": "str", "pin_n_w_value": np.int64, + "pin_n_tot": np.int64, "sale_n_tot": np.int64, "mv_min": np.int64, "mv_q10": np.int64, From 00e790cc1729b69620210d1f8dcf444fd330a7a7 Mon Sep 17 00:00:00 2001 From: Sweaty 
Handshake Date: Sun, 7 Jul 2024 18:56:46 +0000 Subject: [PATCH 40/96] Try again to fix pin_n_tot --- .../reporting/reporting.sot_ratio_stats.py | 35 +------------------ 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index 3be65774c..22500e3ef 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -18,39 +18,6 @@ "county", "triad", "township", - "nbhd", - "tax_code", - "zip_code", - ], - "census_data_year": [ - "census_place", - "census_tract", - "census_congressional_district", - "census_zcta", - ], - "cook_board_of_review_district_data_year": [ - "cook_board_of_review_district" - ], - "cook_commissioner_district_data_year": ["cook_commissioner_district"], - "cook_judicial_district_data_year": ["cook_judicial_district"], - "ward_data_year": ["ward_num"], - "community_area_data_year": ["community_area"], - "police_district_data_year": ["police_district"], - "central_business_district_data_year": ["central_business_district"], - "school_data_year": [ - "school_elementary_district", - "school_secondary_district", - "school_unified_district", - ], - "tax_data_year": [ - "tax_municipality", - "tax_park_district", - "tax_library_district", - "tax_fire_protection_district", - "tax_community_college_district", - "tax_sanitation_district", - "tax_special_service_area", - "tax_tif_district", ], } # Declare class groupings @@ -373,7 +340,7 @@ def model(dbt, spark_session): "geography_type: string, geography_id: string, " + "geography_data_year: string, group_type: string, group_id: string, " + "year: string, reassessment_year: string, stage_name: string, " - + "pin_n_tot: bigint, pin_n_w_value: bigint, pin_pct_w_value: double, " + + "pin_n_tot: double, pin_n_w_value: bigint, pin_pct_w_value: double, " + "sale_n_tot: bigint, mv_min: bigint, mv_q10: bigint, " + "mv_q25: bigint, mv_median: bigint, 
mv_q75: bigint, " + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, mv_sum: bigint, " From 408de564d9e6ec68ffee98c84f95384b9b85e471 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Sun, 7 Jul 2024 20:03:26 +0000 Subject: [PATCH 41/96] Change ass roll sample to be able to compare across stages --- dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index 91397a3f3..a71b66ad2 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -97,4 +97,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals AND uni.stage_name = vals.stage_name LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code -WHERE uni.stage_name = 'MAILED' AND uni.class = '278' AND uni.year >= '2018' +WHERE uni.class = '278' AND uni.year IN ('2019', '2020', '2021') From fd95fcb8f878c0f40d11ac45e0cdc4c1e9c9965e Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Sun, 7 Jul 2024 23:23:36 +0000 Subject: [PATCH 42/96] Add commenting for input tables, try to partion assessment_roll table --- .../reporting.sot_assessment_roll_input.sql | 14 ++++++++++---- .../reporting/reporting.sot_ratio_stats_input.sql | 8 +++++++- dbt/models/reporting/reporting.sot_sales_input.sql | 6 +++++- .../reporting.sot_taxes_exemptions_input.sql | 9 +++++++-- 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index a71b66ad2..89724635c 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -1,7 +1,10 @@ --- Gather parcel-level geographies and join land, sales, and class groupings +-- This script 
gathers parcel-level geographies and joins them to values and +-- class groupings. Its sole purpose is to feed reporting.sot_assessment_roll, +-- and should not be used otherwise. {{ config( - materialized='table' + materialized='table', + partitioned_by=['year'] ) }} @@ -21,6 +24,8 @@ WITH stages AS ( ), +-- Universe of all parcels as defined by iasworld.pardat, expanded with +-- assessment stages. uni AS ( SELECT vw_pin_universe.*, @@ -30,7 +35,6 @@ uni AS ( ) SELECT - uni.year, uni.stage_name, uni.class, CAST(vals.tot AS INT) AS av_tot, @@ -89,7 +93,8 @@ SELECT class_dict.major_class_type AS major_class, class_dict.modeling_group, CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END - AS res_other + AS res_other, + uni.year FROM uni LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals ON uni.pin = vals.pin @@ -97,4 +102,5 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals AND uni.stage_name = vals.stage_name LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code +-- Temporary limit on feeder table to avoid GitHub runner memory issues. WHERE uni.class = '278' AND uni.year IN ('2019', '2020', '2021') diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql index 266024e0a..35d86e976 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql @@ -1,4 +1,7 @@ --- Gather parcel-level geographies and join land, sales, and class groupings +/* This script gathers parcel-level geographies and joins them to values and +sale prices, and class groupings in order to construct sales ratios. Its sole +purpose is to feed reporting.sot_ratio_stats, and should not be used +otherwise. */ {{ config( materialized='table' @@ -21,6 +24,8 @@ WITH stages AS ( ), +-- Universe of all parcels as defined by iasworld.pardat, expanded with +-- assessment stages. 
uni AS ( SELECT vw_pin_universe.*, @@ -105,5 +110,6 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales AND NOT sales.sale_filter_deed_type AND NOT sales.sale_filter_less_than_10k AND NOT sales.sale_filter_same_sale_within_365 +-- Temporary limit on feeder table to avoid GitHub runner memory issues. WHERE uni.year >= '2020' AND uni.year IN ('2022', '2023') AND uni.class IN ('278', '597') diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sales_input.sql index 575bb8aed..72c4ba852 100644 --- a/dbt/models/reporting/reporting.sot_sales_input.sql +++ b/dbt/models/reporting/reporting.sot_sales_input.sql @@ -1,3 +1,7 @@ +-- This script gathers parcel-level geographies and joins them to sales and +-- class groupings. Its sole purpose is to feed reporting.sot_sales, +-- and should not be used otherwise. + {{ config( materialized='table' @@ -16,7 +20,6 @@ WITH sf AS ( GROUP BY pin, year ) --- Gather parcel-level geographies and join land, sales, and class groupings SELECT sales.doc_no, sales.sale_price, @@ -96,4 +99,5 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales AND NOT sales.sale_filter_deed_type AND NOT sales.sale_filter_less_than_10k AND NOT sales.sale_filter_same_sale_within_365 +-- Temporary limit on feeder table to avoid GitHub runner memory issues. WHERE uni.year = '2023' diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql index 97cfff982..89d44c49c 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql @@ -1,9 +1,13 @@ +-- This script gathers parcel-level geographies and joins them to values, tax +-- amounts, exemptions and class groupings. Its sole purpose is to feed +-- reporting.sot_taxes_and_exemptions, and should not be used otherwise. 
{{ config( materialized='table' ) }} +-- Gather unique tax codes and rates WITH tcd AS ( SELECT DISTINCT tax_code_num, @@ -12,13 +16,13 @@ WITH tcd AS ( FROM {{ source('tax', 'tax_code') }} ) --- Gather parcel-level geographies and join taxes, exemptions, and class --- groupings SELECT uni.pin, tax.year, tax.av_clerk AS tax_av, tax.tax_bill_total, + -- Setting exemptions with values of 0 allows us to count the number of + -- exemptions more easily and doesn't skew stats. CASE WHEN tax.exe_homeowner = 0 THEN NULL ELSE tax.exe_homeowner END AS tax_exe_homeowner, CASE WHEN tax.exe_senior = 0 THEN NULL ELSE tax.exe_senior END @@ -123,4 +127,5 @@ INNER JOIN tcd AND tax.year = tcd.year INNER JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code +-- Temporary limit on feeder table to avoid GitHub runner memory issues. WHERE uni.class = '206' From f296292ae400c5a213a541d2dea45ffa86864898 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Sun, 7 Jul 2024 23:53:36 +0000 Subject: [PATCH 43/96] Comment python scripts --- .../reporting.sot_assessment_roll.py | 28 +++++++++++++++---- dbt/models/reporting/reporting.sot_sales.py | 21 ++++++++++++-- .../reporting.sot_taxes_exemptions.py | 15 +++++++--- 3 files changed, 51 insertions(+), 13 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index d0a579b04..576414d16 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -1,8 +1,5 @@ -# pylint: skip-file -# type: ignore - -# This script generates aggregated summary stats on sales data across a number -# of geographies, class combinations, and time. +# This script generates aggregated summary stats on assessed values across a +# number of geographies, class combinations, and time. 
# Import libraries import pandas as pd @@ -100,6 +97,11 @@ def first(x): def aggregrate(data, geography_type, group_type): + """ + Function to group a dataframe by whichever geography and group types it is + passed and output aggregate stats for that only for that grouping. + """ + print(geography_type, group_type) group = [geography_type, group_type, "year", "stage_name"] @@ -122,6 +124,12 @@ def aggregrate(data, geography_type, group_type): def assemble(df, geos, groups): + """ + Function that loops over predefined geography and class groups and passes + them to the aggregate function. Outputs stacked aggegrated output from the + aggregate function. + """ + # Create an empty dataframe to fill with output output = pd.DataFrame() @@ -133,11 +141,12 @@ def assemble(df, geos, groups): for z in groups: output = pd.concat([output, aggregrate(df, x, z)]) + # Flatten multi-index output.columns = ["_".join(col) for col in output.columns] output = output.reset_index() output = output.rename(columns={"triad_first": "triad"}) - # Clean combined output and export + # Create additional stat columns post-aggregation output["av_tot_pct_w_value"] = ( output["av_tot_count"] / output["av_tot_size"] ) @@ -254,6 +263,9 @@ def assemble(df, geos, groups): output["triennial"] = output["geography_type"].isin( ["triad", "township", "nbhd"] ) + + # Reassessment year is constructed as a string rather than a boolean to + # avoid PySpark errors with nullable booleans that can likely be resolved. output["reassessment_year"] = "" output.loc[ (output["triennial"] == True), "reassessment_year" # noqa: E712 @@ -284,6 +296,10 @@ def assemble(df, geos, groups): def clean_names(x): + """ + Function to rename and reorder columns. 
+ """ + output = x.rename( columns={ "av_tot_size": "pin_n_tot", diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index b2d55f7ae..781625ce7 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -1,5 +1,5 @@ -# This script generates aggregated summary stats on sales data across a number -# of geographies, class combinations, and time. +# This script generates aggregated summary stats on sales across a number of +# geographies, class combinations, and time. import statistics as stats @@ -97,6 +97,10 @@ def first(x): def aggregrate(data, geography_type, group_type): + """ + Function to group a dataframe by whichever geography and group types it is + passed and output aggregate stats for that only for that grouping. + """ print(geography_type, group_type) group = [geography_type, group_type, "year"] @@ -118,6 +122,12 @@ def aggregrate(data, geography_type, group_type): def assemble(df, geos, groups): + """ + Function that loops over predefined geography and class groups and passes + them to the aggregate function. Outputs stacked aggegrated output from the + aggregate function. + """ + # Create an empty dataframe to fill with output output = pd.DataFrame() @@ -129,10 +139,11 @@ def assemble(df, geos, groups): for z in groups: output = pd.concat([output, aggregrate(df, x, z)]) + # Flatten multi-index output.columns = ["_".join(col) for col in output.columns] output = output.reset_index() - # Clean combined output and export + # Create additional stat columns post-aggregation output["sale_price_sum"] = output["sale_price_sum"].replace(0, np.NaN) output["sale_price_per_sf_sum"] = output["sale_price_per_sf_sum"].replace( 0, np.NaN @@ -180,6 +191,10 @@ def assemble(df, geos, groups): def clean_names(x): + """ + Function to rename and reorder columns. 
+ """ + output = x.rename( columns={ "sale_price_size": "pin_n_tot", diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index fc8eb9cc2..bf4c49c72 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -1,6 +1,3 @@ -# pylint: skip-file -# type: ignore - # This script generates aggregated summary stats on taxes and exemptions data # across a number of geographies, class combinations, and time. @@ -109,6 +106,11 @@ def first(x): def aggregrate(data, geography_type, group_type): + """ + Function to group a dataframe by whichever geography and group types it is + passed and output aggregate stats for that only for that grouping. + """ + print(geography_type, group_type) group = [geography_type, group_type, "year"] @@ -141,10 +143,11 @@ def assemble(df, geos, groups): for z in groups: output = pd.concat([output, aggregrate(df, x, z)]) + # Flatten multi-index output.columns = ["_".join(col) for col in output.columns] output = output.reset_index() - # Clean combined output and export + # Create additional stat columns post-aggregation output["tax_bill_total_delta_median"] = ( output.sort_values("year") .groupby(["geography_id", "group_id"]) @@ -169,6 +172,10 @@ def assemble(df, geos, groups): def clean_names(x): + """ + Function to rename and reorder columns. 
+ """ + output = x.rename( columns={ "tax_eq_factor_final_size": "pin_n_tot", From a23ff728bd9205d92cc112f7338c1feffe41dc56 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Mon, 8 Jul 2024 00:58:48 +0000 Subject: [PATCH 44/96] Clean up ratio_stats script --- .../reporting/reporting.sot_ratio_stats.py | 158 ++++++++++++------ 1 file changed, 104 insertions(+), 54 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index 22500e3ef..a08ad513d 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -4,8 +4,8 @@ "s3://ccao-athena-dependencies-us-east-1/assesspy==1.1.0.zip" ) -# This script generates aggregated summary stats on sales data across a number -# of geographies, class combinations, and time. +# This script generates aggregated summary stats on sales ratios across a +# number of geographies, class combinations, and time. # Import libraries import assesspy as ass # noqa: E402 @@ -18,12 +18,46 @@ "county", "triad", "township", + "nbhd", + "tax_code", + "zip_code", + ], + "census_data_year": [ + "census_place", + "census_tract", + "census_congressional_district", + "census_zcta", + ], + "cook_board_of_review_district_data_year": [ + "cook_board_of_review_district" + ], + "cook_commissioner_district_data_year": ["cook_commissioner_district"], + "cook_judicial_district_data_year": ["cook_judicial_district"], + "ward_data_year": ["ward_num"], + "community_area_data_year": ["community_area"], + "police_district_data_year": ["police_district"], + "central_business_district_data_year": ["central_business_district"], + "school_data_year": [ + "school_elementary_district", + "school_secondary_district", + "school_unified_district", + ], + "tax_data_year": [ + "tax_municipality", + "tax_park_district", + "tax_library_district", + "tax_fire_protection_district", + "tax_community_college_district", + "tax_sanitation_district", + 
"tax_special_service_area", + "tax_tif_district", ], } # Declare class groupings groups = ["no_group", "class", "major_class", "modeling_group", "res_other"] +# Wrap assesspy functions to avoid GitHub runner errors for length 0 groupings def cod_safe(ratio): if len(ratio) >= 1: output = ass.cod(ratio) @@ -62,6 +96,7 @@ def mki_safe(assessed, sale_price): return output +# Define aggregation functions def first(x): if len(x) >= 1: output = x.iloc[0] @@ -79,8 +114,14 @@ def within(x, limit): return np.logical_and(1 - limit < x, x < 1 + limit) -# Define aggregation functions def aggregrate(data, geography_type, group_type): + """ + Function to group a dataframe by whichever geography and group types it is + passed and output aggregate stats for that only for that grouping. Works + differently than in other SoT scripts since assesspy functions need + multiple inputs. + """ + print(geography_type, group_type) group = [geography_type, group_type, "year", "stage_name"] @@ -88,7 +129,7 @@ def aggregrate(data, geography_type, group_type): data["sale_n_tot"] = data.groupby(group)["sale_price"].transform("count") data["pin_n_w_value"] = data.groupby(group)["tot_mv"].transform("count") - # Remove parcels with FMVs of 0 since they screw up ratios + # Remove parcels with MVs of 0 since they screw up ratios data = data[data["tot_mv"] > 0] # Remove groups that only have one sale since we can't calculate stats @@ -140,6 +181,12 @@ def aggregrate(data, geography_type, group_type): def assemble(df, geos, groups): + """ + Function that loops over predefined geography and class groups and passes + them to the aggregate function. Outputs stacked aggegrated output from the + aggregate function. 
+ """ + # Create an empty dataframe to fill with output output = pd.DataFrame() @@ -153,13 +200,8 @@ def assemble(df, geos, groups): output.dropna(how="all", axis=1, inplace=True) - return output - - -def clean(dirty): - dirty.index.names = ["geography_id", "group_id", "year", "stage_name"] - - dirty = dirty.reset_index().set_index( + output.index.names = ["geography_id", "group_id", "year", "stage_name"] + output = output.reset_index().set_index( [ "geography_type", "geography_id", @@ -170,82 +212,92 @@ def clean(dirty): ] ) - dirty["pin_pct_w_value"] = dirty["pin_n_w_value"] / dirty["pin_n_tot"] + # Create additional stat columns post-aggregation + output["pin_pct_w_value"] = output["pin_n_w_value"] / output["pin_n_tot"] - # Clean combined dirty and export - dirty["mv_delta_median"] = ( - dirty.sort_values("year") + output["mv_delta_median"] = ( + output.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_median.diff() ) - dirty["mv_delta_mean"] = ( - dirty.sort_values("year") + output["mv_delta_mean"] = ( + output.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_mean.diff() ) - dirty["mv_delta_sum"] = ( - dirty.sort_values("year") + output["mv_delta_sum"] = ( + output.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_sum.diff() ) - dirty["mv_delta_pct_median"] = ( - dirty.sort_values("year") + output["mv_delta_pct_median"] = ( + output.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_median.pct_change() ) - dirty["mv_delta_pct_mean"] = ( - dirty.sort_values("year") + output["mv_delta_pct_mean"] = ( + output.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_mean.pct_change() ) - dirty["mv_delta_pct_sum"] = ( - dirty.sort_values("year") + output["mv_delta_pct_sum"] = ( + output.sort_values("year") .groupby(["geography_id", "group_id", "stage_name"]) .mv_sum.pct_change() ) - dirty = dirty.reset_index() + output = 
output.reset_index() - dirty["year"] = dirty["year"].astype(int) - dirty["triennial"] = dirty["geography_type"].isin( + output["year"] = output["year"].astype(int) + output["triennial"] = output["geography_type"].isin( ["triad", "township", "nbhd"] ) - dirty["reassessment_year"] = "" - dirty.loc[ - (dirty["triennial"] == True), "reassessment_year" # noqa: E712 + output["reassessment_year"] = "" + output.loc[ + (output["triennial"] == True), "reassessment_year" # noqa: E712 ] = "No" - dirty.loc[ - (dirty["year"] % 3 == 0) - & (dirty["triad"] == "North") - & (dirty["triennial"] == True), # noqa: E712 + output.loc[ + (output["year"] % 3 == 0) + & (output["triad"] == "North") + & (output["triennial"] == True), # noqa: E712 "reassessment_year", ] = "Yes" - dirty.loc[ - (dirty["year"] % 3 == 1) - & (dirty["triad"] == "South") - & (dirty["triennial"] == True), # noqa: E712 + output.loc[ + (output["year"] % 3 == 1) + & (output["triad"] == "South") + & (output["triennial"] == True), # noqa: E712 "reassessment_year", ] = "Yes" - dirty.loc[ - (dirty["year"] % 3 == 2) - & (dirty["triad"] == "City") - & (dirty["triennial"] == True), # noqa: E712 + output.loc[ + (output["year"] % 3 == 2) + & (output["triad"] == "City") + & (output["triennial"] == True), # noqa: E712 "reassessment_year", ] = "Yes" - dirty = dirty.drop(["triennial", "triad"], axis=1) + output = output.drop(["triennial", "triad"], axis=1) + + output["cod_met"] = met(output["cod"], 5, 15) + output["prd_met"] = met(output["prd"], 0.98, 1.03) + output["prb_met"] = met(output["prb"], -0.05, 0.05) + output["mki_met"] = met(output["mki"], 0.95, 1.05) + + output["within_05_pct"] = within(output["ratio_mean"], 0.05) + output["within_10_pct"] = within(output["ratio_mean"], 0.1) + output["within_15_pct"] = within(output["ratio_mean"], 0.15) + output["within_20_pct"] = within(output["ratio_mean"], 0.2) - dirty["cod_met"] = met(dirty["cod"], 5, 15) - dirty["prd_met"] = met(dirty["prd"], 0.98, 1.03) - dirty["prb_met"] = 
met(dirty["prb"], -0.05, 0.05) - dirty["mki_met"] = met(dirty["mki"], 0.95, 1.05) + output = clean(output) + + return output - dirty["within_05_pct"] = within(dirty["ratio_mean"], 0.05) - dirty["within_10_pct"] = within(dirty["ratio_mean"], 0.1) - dirty["within_15_pct"] = within(dirty["ratio_mean"], 0.15) - dirty["within_20_pct"] = within(dirty["ratio_mean"], 0.2) + +def clean(dirty): + """ + Function to change column types and reorder them. + """ dirty = dirty.astype( { @@ -334,8 +386,6 @@ def model(dbt, spark_session): df = assemble(input, geos=geos, groups=groups) - df = clean(df) - schema = ( "geography_type: string, geography_id: string, " + "geography_data_year: string, group_type: string, group_id: string, " From 07f6dfefe9beb6ef8e8db4af31251dccced3a33d Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Mon, 8 Jul 2024 01:40:20 +0000 Subject: [PATCH 45/96] Back to fixing pin_n_tot --- dbt/models/reporting/reporting.sot_ratio_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index a08ad513d..fd106f53d 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -390,7 +390,7 @@ def model(dbt, spark_session): "geography_type: string, geography_id: string, " + "geography_data_year: string, group_type: string, group_id: string, " + "year: string, reassessment_year: string, stage_name: string, " - + "pin_n_tot: double, pin_n_w_value: bigint, pin_pct_w_value: double, " + + "pin_n_tot: int, pin_n_w_value: bigint, pin_pct_w_value: double, " + "sale_n_tot: bigint, mv_min: bigint, mv_q10: bigint, " + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, " + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, mv_sum: bigint, " From b78a072f6e6ffc8a814e6ba0375ab3c68bee3b25 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Mon, 8 Jul 2024 02:43:44 +0000 Subject: [PATCH 46/96] Replace 
nan with None --- dbt/models/reporting/reporting.sot_ratio_stats.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index fd106f53d..cf8979f56 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -289,6 +289,8 @@ def assemble(df, geos, groups): output["within_15_pct"] = within(output["ratio_mean"], 0.15) output["within_20_pct"] = within(output["ratio_mean"], 0.2) + output = output.replace(np.nan, None) + output = clean(output) return output From 337954e086bfc73c85852bb4d032a43a27de94f3 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Mon, 8 Jul 2024 02:44:53 +0000 Subject: [PATCH 47/96] Partition input tables by year --- dbt/models/reporting/reporting.sot_ratio_stats_input.sql | 3 ++- dbt/models/reporting/reporting.sot_sales_input.sql | 3 ++- dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql index 35d86e976..e38a17da6 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql @@ -4,7 +4,8 @@ purpose is to feed reporting.sot_ratio_stats, and should not be used otherwise. 
*/ {{ config( - materialized='table' + materialized='table', + partitioned_by=['year'] ) }} diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sales_input.sql index 72c4ba852..b6a5d64fb 100644 --- a/dbt/models/reporting/reporting.sot_sales_input.sql +++ b/dbt/models/reporting/reporting.sot_sales_input.sql @@ -4,7 +4,8 @@ {{ config( - materialized='table' + materialized='table', + partitioned_by=['year'] ) }} diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql index 89d44c49c..9dd5f9e04 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql @@ -3,7 +3,8 @@ -- reporting.sot_taxes_and_exemptions, and should not be used otherwise. {{ config( - materialized='table' + materialized='table', + partitioned_by=['year'] ) }} From 1031144ad60a485cf304a71b32005c8fdf6c4646 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Mon, 8 Jul 2024 03:26:34 +0000 Subject: [PATCH 48/96] Fix year partitioning --- dbt/models/reporting/reporting.sot_ratio_stats_input.sql | 4 ++-- dbt/models/reporting/reporting.sot_sales_input.sql | 4 ++-- dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql index e38a17da6..17ed6ef83 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql @@ -37,7 +37,6 @@ uni AS ( SELECT CAST(sales.sale_price AS DOUBLE) AS sale_price, - uni.year, uni.stage_name, uni.class, CAST(vals.tot_mv AS DOUBLE) AS tot_mv, @@ -95,7 +94,8 @@ SELECT class_dict.major_class_type AS major_class, class_dict.modeling_group, CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END - AS 
res_other + AS res_other, + uni.year FROM uni LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sales_input.sql index b6a5d64fb..18d0960aa 100644 --- a/dbt/models/reporting/reporting.sot_sales_input.sql +++ b/dbt/models/reporting/reporting.sot_sales_input.sql @@ -31,7 +31,6 @@ SELECT CAST(sf.char_bldg_sf AS INT) AS sale_char_bldg_sf, CAST(sf.char_land_sf AS INT) AS sale_char_land_sf, CAST(sf.char_yrblt AS INT) AS sale_char_yrblt, - uni.year, uni.class, 'Cook' AS county, uni.triad_name AS triad, @@ -86,7 +85,8 @@ SELECT class_dict.major_class_type AS major_class, class_dict.modeling_group, CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END - AS res_other + AS res_other, + uni.year FROM {{ ref('default.vw_pin_universe') }} AS uni LEFT JOIN sf ON uni.pin = sf.pin diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql index 9dd5f9e04..95718dca9 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql @@ -19,7 +19,6 @@ WITH tcd AS ( SELECT uni.pin, - tax.year, tax.av_clerk AS tax_av, tax.tax_bill_total, -- Setting exemptions with values of 0 allows us to count the number of @@ -116,7 +115,8 @@ SELECT class_dict.major_class_type AS major_class, class_dict.modeling_group, CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END - AS res_other + AS res_other, + tax.year FROM {{ ref('default.vw_pin_universe') }} AS uni INNER JOIN {{ source('tax', 'pin') }} AS tax ON uni.pin = tax.pin From 45ea3054fe2210aab01817bdac3e15bf687cb5bc Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Mon, 8 Jul 2024 12:39:59 +0000 Subject: [PATCH 49/96] Use double for nullable columns --- dbt/models/reporting/reporting.sot_ratio_stats.py | 5 +++-- 1 file changed, 3 insertions(+), 2 
deletions(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index cf8979f56..42da9363d 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -289,6 +289,7 @@ def assemble(df, geos, groups): output["within_15_pct"] = within(output["ratio_mean"], 0.15) output["within_20_pct"] = within(output["ratio_mean"], 0.2) + # PySpark rejects nan, convert them to None output = output.replace(np.nan, None) output = clean(output) @@ -396,8 +397,8 @@ def model(dbt, spark_session): + "sale_n_tot: bigint, mv_min: bigint, mv_q10: bigint, " + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, " + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, mv_sum: bigint, " - + "mv_delta_median: bigint, mv_delta_mean: bigint, " - + "mv_delta_sum: bigint, mv_delta_pct_median: double, " + + "mv_delta_median: double, mv_delta_mean: double, " + + "mv_delta_sum: double, mv_delta_pct_median: double, " + "mv_delta_pct_mean: double, mv_delta_pct_sum: double, " + "ratio_min: double, ratio_q10: double, ratio_q25: double, " + "ratio_median: double, ratio_q75: double, ratio_q90: double, " From ca139f3b862f9b54dcfe8146c996dba36a0568e2 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 9 Jul 2024 17:11:23 +0000 Subject: [PATCH 50/96] Move data year specification to dbt seed --- dbt/dbt_project.yml | 2 + .../reporting.sot_assessment_roll.py | 59 ++++++------------- .../reporting/reporting.sot_ratio_stats.py | 58 ++++++------------ dbt/models/reporting/reporting.sot_sales.py | 58 ++++++------------ .../reporting.sot_taxes_exemptions.py | 58 ++++++------------ dbt/seeds/reporting/docs.md | 6 ++ .../reporting/reporting.sot_data_years.csv | 9 +++ dbt/seeds/reporting/schema.yml | 6 ++ 8 files changed, 92 insertions(+), 164 deletions(-) create mode 100644 dbt/seeds/reporting/docs.md create mode 100644 dbt/seeds/reporting/reporting.sot_data_years.csv create mode 100644 
dbt/seeds/reporting/schema.yml diff --git a/dbt/dbt_project.yml b/dbt/dbt_project.yml index a94530b30..40740836a 100644 --- a/dbt/dbt_project.yml +++ b/dbt/dbt_project.yml @@ -71,3 +71,5 @@ seeds: +schema: location model: +schema: model + reporting: + +schema: reporting diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 576414d16..e62b042f3 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -4,47 +4,6 @@ # Import libraries import pandas as pd -# Declare geographic groups and their associated data years -geos = { - "year": [ - "county", - "triad", - "township", - "nbhd", - "tax_code", - "zip_code", - ], - "census_data_year": [ - "census_place", - "census_tract", - "census_congressional_district", - "census_zcta", - ], - "cook_board_of_review_district_data_year": [ - "cook_board_of_review_district" - ], - "cook_commissioner_district_data_year": ["cook_commissioner_district"], - "cook_judicial_district_data_year": ["cook_judicial_district"], - "ward_data_year": ["ward_num"], - "community_area_data_year": ["community_area"], - "police_district_data_year": ["police_district"], - "central_business_district_data_year": ["central_business_district"], - "school_data_year": [ - "school_elementary_district", - "school_secondary_district", - "school_unified_district", - ], - "tax_data_year": [ - "tax_municipality", - "tax_park_district", - "tax_library_district", - "tax_fire_protection_district", - "tax_community_college_district", - "tax_sanitation_district", - "tax_special_service_area", - "tax_tif_district", - ], -} # Declare class groupings groups = ["no_group", "class", "major_class", "modeling_group", "res_other"] @@ -373,9 +332,27 @@ def clean_names(x): return output +def ingest_geos(geos): + """ + Function to convert dbt seed into a dictionary that can be iterated over. 
+ """ + + geos = geos.toPandas() + + output = { + k: list(geos[k].unique()[pd.notnull(geos[k].unique())]) + for k in geos.columns + } + + return output + + def model(dbt, spark_session): dbt.config(materialized="table") + # Ingest geographies and their associated data years + geos = ingest_geos(dbt.ref("reporting.sot_data_years")) + input = dbt.ref("reporting.sot_assessment_roll_input") # Convert the Spark input dataframe to Pandas for diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index 42da9363d..406d646c4 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -12,47 +12,6 @@ import numpy as np # noqa: E402 import pandas as pd # noqa: E402 -# Declare geographic groups and their associated data years -geos = { - "year": [ - "county", - "triad", - "township", - "nbhd", - "tax_code", - "zip_code", - ], - "census_data_year": [ - "census_place", - "census_tract", - "census_congressional_district", - "census_zcta", - ], - "cook_board_of_review_district_data_year": [ - "cook_board_of_review_district" - ], - "cook_commissioner_district_data_year": ["cook_commissioner_district"], - "cook_judicial_district_data_year": ["cook_judicial_district"], - "ward_data_year": ["ward_num"], - "community_area_data_year": ["community_area"], - "police_district_data_year": ["police_district"], - "central_business_district_data_year": ["central_business_district"], - "school_data_year": [ - "school_elementary_district", - "school_secondary_district", - "school_unified_district", - ], - "tax_data_year": [ - "tax_municipality", - "tax_park_district", - "tax_library_district", - "tax_fire_protection_district", - "tax_community_college_district", - "tax_sanitation_district", - "tax_special_service_area", - "tax_tif_district", - ], -} # Declare class groupings groups = ["no_group", "class", "major_class", "modeling_group", "res_other"] @@ -378,9 +337,26 @@ def 
clean(dirty): return dirty +def ingest_geos(geos): + """ + Function to convert dbt seed into a dictionary that can be iterated over. + """ + + geos = geos.toPandas() + output = { + k: list(geos[k].unique()[pd.notnull(geos[k].unique())]) + for k in geos.columns + } + + return output + + def model(dbt, spark_session): dbt.config(materialized="table") + # Ingest geographies and their associated data years + geos = ingest_geos(dbt.ref("reporting.sot_data_years")) + input = dbt.ref("reporting.sot_ratio_stats_input") # Convert the Spark input dataframe to Pandas for diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index 781625ce7..fee655679 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -7,47 +7,6 @@ import numpy as np import pandas as pd -# Declare geographic groups and their associated data years -geos = { - "year": [ - "county", - "triad", - "township", - "nbhd", - "tax_code", - "zip_code", - ], - "census_data_year": [ - "census_place", - "census_tract", - "census_congressional_district", - "census_zcta", - ], - "cook_board_of_review_district_data_year": [ - "cook_board_of_review_district" - ], - "cook_commissioner_district_data_year": ["cook_commissioner_district"], - "cook_judicial_district_data_year": ["cook_judicial_district"], - "ward_data_year": ["ward_num"], - "community_area_data_year": ["community_area"], - "police_district_data_year": ["police_district"], - "central_business_district_data_year": ["central_business_district"], - "school_data_year": [ - "school_elementary_district", - "school_secondary_district", - "school_unified_district", - ], - "tax_data_year": [ - "tax_municipality", - "tax_park_district", - "tax_library_district", - "tax_fire_protection_district", - "tax_community_college_district", - "tax_sanitation_district", - "tax_special_service_area", - "tax_tif_district", - ], -} # Declare class groupings groups = ["no_group", 
"class", "major_class", "modeling_group", "res_other"] @@ -249,9 +208,26 @@ def clean_names(x): return output +def ingest_geos(geos): + """ + Function to convert dbt seed into a dictionary that can be iterated over. + """ + + geos = geos.toPandas() + output = { + k: list(geos[k].unique()[pd.notnull(geos[k].unique())]) + for k in geos.columns + } + + return output + + def model(dbt, spark_session): dbt.config(materialized="table") + # Ingest geographies and their associated data years + geos = ingest_geos(dbt.ref("reporting.sot_data_years")) + input = dbt.ref("reporting.sot_sales_input") # Convert the Spark input dataframe to Pandas for diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index bf4c49c72..817dab3ac 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -4,47 +4,6 @@ # Import libraries import pandas as pd -# Declare geographic groups and their associated data years -geos = { - "year": [ - "county", - "triad", - "township", - "nbhd", - "tax_code", - "zip_code", - ], - "census_data_year": [ - "census_place", - "census_tract", - "census_congressional_district", - "census_zcta", - ], - "cook_board_of_review_district_data_year": [ - "cook_board_of_review_district" - ], - "cook_commissioner_district_data_year": ["cook_commissioner_district"], - "cook_judicial_district_data_year": ["cook_judicial_district"], - "ward_data_year": ["ward_num"], - "community_area_data_year": ["community_area"], - "police_district_data_year": ["police_district"], - "central_business_district_data_year": ["central_business_district"], - "school_data_year": [ - "school_elementary_district", - "school_secondary_district", - "school_unified_district", - ], - "tax_data_year": [ - "tax_municipality", - "tax_park_district", - "tax_library_district", - "tax_fire_protection_district", - "tax_community_college_district", - 
"tax_sanitation_district", - "tax_special_service_area", - "tax_tif_district", - ], -} # Declare class groupings groups = ["no_group", "class", "major_class", "modeling_group", "res_other"] @@ -265,9 +224,26 @@ def clean_names(x): return output +def ingest_geos(geos): + """ + Function to convert dbt seed into a dictionary that can be iterated over. + """ + + geos = geos.toPandas() + output = { + k: list(geos[k].unique()[pd.notnull(geos[k].unique())]) + for k in geos.columns + } + + return output + + def model(dbt, spark_session): dbt.config(materialized="table") + # Ingest geographies and their associated data years + geos = ingest_geos(dbt.ref("reporting.sot_data_years")) + input = dbt.ref("reporting.sot_taxes_exemptions_input") # Convert the Spark input dataframe to Pandas for diff --git a/dbt/seeds/reporting/docs.md b/dbt/seeds/reporting/docs.md new file mode 100644 index 000000000..ac26fa1a4 --- /dev/null +++ b/dbt/seeds/reporting/docs.md @@ -0,0 +1,6 @@ +# sot_data_years + +{% docs seed_sot_data_years %} +A table containing reporting geographies and their associated data year identifiers. 
+ +{% enddocs %} diff --git a/dbt/seeds/reporting/reporting.sot_data_years.csv b/dbt/seeds/reporting/reporting.sot_data_years.csv new file mode 100644 index 000000000..eed8df432 --- /dev/null +++ b/dbt/seeds/reporting/reporting.sot_data_years.csv @@ -0,0 +1,9 @@ +year,census_data_year,cook_board_of_review_district_data_year,cook_commissioner_district_data_year,cook_judicial_district_data_year,ward_data_year,community_area_data_year,police_district_data_year,central_business_district_data_year,school_data_year,tax_data_year +county,census_place,cook_board_of_review_district,cook_commissioner_district,cook_judicial_district,ward_num,community_area,police_district,central_business_district,school_elementary_district,tax_municipality +triad,census_tract,,,,,,,,school_secondary_district,tax_park_district +township,census_congressional_district,,,,,,,,school_unified_district,tax_library_district +nbhd,census_zcta,,,,,,,,,tax_fire_protection_district +tax_code,,,,,,,,,,tax_community_college_district +zip_code,,,,,,,,,,tax_sanitation_district +,,,,,,,,,,tax_special_service_area +,,,,,,,,,,tax_tif_district diff --git a/dbt/seeds/reporting/schema.yml b/dbt/seeds/reporting/schema.yml new file mode 100644 index 000000000..3dca39299 --- /dev/null +++ b/dbt/seeds/reporting/schema.yml @@ -0,0 +1,6 @@ +seeds: + - name: reporting.sot_data_years + description: '{{ doc("seed_sot_data_years") }}' + config: + column_types: + year: string From 788f97156d3983e166cda1d58b91cc55c81980e7 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 9 Jul 2024 17:12:03 +0000 Subject: [PATCH 51/96] Formatting --- dbt/models/reporting/reporting.sot_assessment_roll.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index e62b042f3..03cc4ff4e 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -338,7 +338,6 @@ 
def ingest_geos(geos): """ geos = geos.toPandas() - output = { k: list(geos[k].unique()[pd.notnull(geos[k].unique())]) for k in geos.columns From 5449d8c134be1d352b33ee5a680b6e84b77d4ba8 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 9 Jul 2024 19:52:03 +0000 Subject: [PATCH 52/96] Improve diff and pct_change syntax --- .../reporting.sot_assessment_roll.py | 147 ++++++------------ .../reporting/reporting.sot_ratio_stats.py | 57 +++---- dbt/models/reporting/reporting.sot_sales.py | 56 +++---- .../reporting.sot_taxes_exemptions.py | 32 ++-- 4 files changed, 115 insertions(+), 177 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 03cc4ff4e..60588a573 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -110,112 +110,57 @@ def assemble(df, geos, groups): output["av_tot_count"] / output["av_tot_size"] ) - output["av_tot_delta_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_tot_median.diff() - ) - - output["av_tot_delta_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_tot_mean.diff() - ) - - output["av_tot_delta_sum"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_tot_sum.diff() - ) - - output["av_bldg_delta_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_bldg_median.diff() - ) - - output["av_bldg_delta_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_bldg_mean.diff() - ) - - output["av_bldg_delta_sum"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_bldg_sum.diff() - ) - - output["av_land_delta_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - 
.av_land_median.diff() - ) - - output["av_land_delta_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_land_mean.diff() - ) - - output["av_land_delta_sum"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_land_sum.diff() - ) - - output["av_tot_delta_pct_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_tot_median.pct_change() - ) - - output["av_tot_delta_pct_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_tot_mean.pct_change() - ) - - output["av_tot_delta_pct_sum"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_tot_sum.pct_change() - ) - - output["av_bldg_delta_pct_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_bldg_median.pct_change() - ) - - output["av_bldg_delta_pct_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_bldg_mean.pct_change() - ) - - output["av_bldg_delta_pct_sum"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_bldg_sum.pct_change() - ) - - output["av_land_delta_pct_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .av_land_median.pct_change() - ) + output = output.sort_values("year") + + diff_cols = [ + "geography_id", + "group_id", + "stage_name", + "av_tot_median", + "av_tot_mean", + "av_tot_sum", + "av_bldg_median", + "av_bldg_mean", + "av_bldg_sum", + "av_land_median", + "av_land_mean", + "av_land_sum", + ] - output["av_land_delta_pct_mean"] = ( - output.sort_values("year") + output[ + [ + "av_tot_delta_median", + "av_tot_delta_mean", + "av_tot_delta_sum", + "av_bldg_delta_median", + "av_bldg_delta_mean", + "av_bldg_delta_sum", + "av_land_delta_median", + "av_land_delta_mean", + "av_land_delta_sum", + 
] + ] = ( + output[diff_cols] .groupby(["geography_id", "group_id", "stage_name"]) - .av_land_mean.pct_change() + .diff() ) - output["av_land_delta_pct_sum"] = ( - output.sort_values("year") + output[ + [ + "av_tot_delta_pct_median", + "av_tot_delta_pct_mean", + "av_tot_delta_pct_sum", + "av_bldg_delta_pct_median", + "av_bldg_delta_pct_mean", + "av_bldg_delta_pct_sum", + "av_land_delta_pct_median", + "av_land_delta_pct_mean", + "av_land_delta_pct_sum", + ] + ] = ( + output[diff_cols] .groupby(["geography_id", "group_id", "stage_name"]) - .av_land_sum.pct_change() + .pct_change() ) output["year"] = output["year"].astype(int) diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index 406d646c4..21ea0ea34 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -171,45 +171,46 @@ def assemble(df, geos, groups): ] ) + output = output.reset_index() + # Create additional stat columns post-aggregation output["pin_pct_w_value"] = output["pin_n_w_value"] / output["pin_n_tot"] - output["mv_delta_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_median.diff() - ) - output["mv_delta_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_mean.diff() - ) - output["mv_delta_sum"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_sum.diff() - ) + output = output.sort_values("year") - output["mv_delta_pct_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id", "stage_name"]) - .mv_median.pct_change() - ) + diff_cols = [ + "geography_id", + "group_id", + "stage_name", + "mv_median", + "mv_mean", + "mv_sum", + ] - output["mv_delta_pct_mean"] = ( - output.sort_values("year") + output[ + [ + "mv_delta_median", + "mv_delta_mean", + "mv_delta_sum", + ] + ] = ( + output[diff_cols] 
.groupby(["geography_id", "group_id", "stage_name"]) - .mv_mean.pct_change() + .diff() ) - output["mv_delta_pct_sum"] = ( - output.sort_values("year") + output[ + [ + "mv_delta_pct_median", + "mv_delta_pct_mean", + "mv_delta_pct_sum", + ] + ] = ( + output[diff_cols] .groupby(["geography_id", "group_id", "stage_name"]) - .mv_sum.pct_change() + .pct_change() ) - output = output.reset_index() - output["year"] = output["year"].astype(int) output["triennial"] = output["geography_type"].isin( ["triad", "township", "nbhd"] diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index fee655679..f387d2320 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -108,40 +108,30 @@ def assemble(df, geos, groups): 0, np.NaN ) - output["sale_price_delta_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id"]) - .sale_price_median.diff() - ) - - output["sale_price_delta_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id"]) - .sale_price_mean.diff() - ) - - output["sale_price_delta_sum"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id"]) - .sale_price_sum.diff() - ) - - output["sale_price_per_sf_delta_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id"]) - .sale_price_per_sf_median.diff() - ) - - output["sale_price_per_sf_delta_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id"]) - .sale_price_per_sf_mean.diff() - ) + output = output.sort_values("year") + + diff_cols = [ + "geography_id", + "group_id", + "sale_price_median", + "sale_price_mean", + "sale_price_sum", + "sale_price_per_sf_median", + "sale_price_per_sf_mean", + "sale_price_per_sf_sum", + ] - output["sale_price_per_sf_delta_sum"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id"]) - .sale_price_per_sf_sum.diff() + output[ + [ + "sale_price_delta_median", + 
"sale_price_delta_mean", + "sale_price_delta_sum", + "sale_price_per_sf_delta_median", + "sale_price_per_sf_delta_mean", + "sale_price_per_sf_delta_sum", + ] + ] = ( + output[diff_cols].groupby(["geography_id", "group_id"]).diff() ) output = clean_names(output) diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index 817dab3ac..9c803ee33 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -107,22 +107,24 @@ def assemble(df, geos, groups): output = output.reset_index() # Create additional stat columns post-aggregation - output["tax_bill_total_delta_median"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id"]) - .tax_bill_total_median.diff() - ) - - output["tax_bill_total_delta_mean"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id"]) - .tax_bill_total_mean.diff() - ) + output = output.sort_values("year") + + diff_cols = [ + "geography_id", + "group_id", + "tax_bill_total_median", + "tax_bill_total_mean", + "tax_bill_total_sum", + ] - output["tax_bill_total_delta_sum"] = ( - output.sort_values("year") - .groupby(["geography_id", "group_id"]) - .tax_bill_total_sum.diff() + output[ + [ + "tax_bill_total_delta_median", + "tax_bill_total_delta_mean", + "tax_bill_total_delta_sum", + ] + ] = ( + output[diff_cols].groupby(["geography_id", "group_id"]).diff() ) output = clean_names(output) From c87713fb79c40518954d911b9ff3f8d74bb93258 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 9 Jul 2024 21:19:01 +0000 Subject: [PATCH 53/96] Simplify reassessment year syntax --- .../reporting/reporting.sot_assessment_roll.py | 17 +++-------------- .../reporting/reporting.sot_ratio_stats.py | 17 +++-------------- 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py 
index 60588a573..f5fbba742 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -175,20 +175,9 @@ def assemble(df, geos, groups): (output["triennial"] == True), "reassessment_year" # noqa: E712 ] = "No" output.loc[ - (output["year"] % 3 == 0) - & (output["triad"] == "North") - & (output["triennial"] == True), # noqa: E712 - "reassessment_year", - ] = "Yes" - output.loc[ - (output["year"] % 3 == 1) - & (output["triad"] == "South") - & (output["triennial"] == True), # noqa: E712 - "reassessment_year", - ] = "Yes" - output.loc[ - (output["year"] % 3 == 2) - & (output["triad"] == "City") + ((output["year"] % 3 == 0) & (output["triad"] == "North")) + | ((output["year"] % 3 == 1) & (output["triad"] == "South")) + | ((output["year"] % 3 == 2) & (output["triad"] == "City")) & (output["triennial"] == True), # noqa: E712 "reassessment_year", ] = "Yes" diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index 21ea0ea34..615e56c67 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -220,20 +220,9 @@ def assemble(df, geos, groups): (output["triennial"] == True), "reassessment_year" # noqa: E712 ] = "No" output.loc[ - (output["year"] % 3 == 0) - & (output["triad"] == "North") - & (output["triennial"] == True), # noqa: E712 - "reassessment_year", - ] = "Yes" - output.loc[ - (output["year"] % 3 == 1) - & (output["triad"] == "South") - & (output["triennial"] == True), # noqa: E712 - "reassessment_year", - ] = "Yes" - output.loc[ - (output["year"] % 3 == 2) - & (output["triad"] == "City") + ((output["year"] % 3 == 0) & (output["triad"] == "North")) + | ((output["year"] % 3 == 1) & (output["triad"] == "South")) + | ((output["year"] % 3 == 2) & (output["triad"] == "City")) & (output["triennial"] == True), # noqa: E712 "reassessment_year", ] = "Yes" From 
d1079f016be7b52af368e57637053be7fc3c49a2 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 10 Jul 2024 15:05:45 +0000 Subject: [PATCH 54/96] More commenting --- .../reporting/reporting.sot_assessment_roll.py | 12 ++++++++---- dbt/models/reporting/reporting.sot_ratio_stats.py | 9 ++++++--- dbt/models/reporting/reporting.sot_sales.py | 12 ++++++++---- .../reporting/reporting.sot_taxes_exemptions.py | 13 +++++++++++-- 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index f5fbba742..eef930eb3 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -8,7 +8,8 @@ groups = ["no_group", "class", "major_class", "modeling_group", "res_other"] -# Define aggregation functions +# Define aggregation functions. These are just wrappers for basic python +# functions that make using them easier to use with pandas.agg(). def q10(x): return x.quantile(0.1) @@ -58,7 +59,7 @@ def first(x): def aggregrate(data, geography_type, group_type): """ Function to group a dataframe by whichever geography and group types it is - passed and output aggregate stats for that only for that grouping. + passed and output aggregate stats for that grouping. """ print(geography_type, group_type) @@ -85,8 +86,8 @@ def aggregrate(data, geography_type, group_type): def assemble(df, geos, groups): """ Function that loops over predefined geography and class groups and passes - them to the aggregate function. Outputs stacked aggegrated output from the - aggregate function. + them to the aggregate function. Returns stacked output from the aggregate + function. """ # Create an empty dataframe to fill with output @@ -281,6 +282,9 @@ def ingest_geos(geos): def model(dbt, spark_session): + """ + Function to build a dbt python model using PySpark. 
+ """ dbt.config(materialized="table") # Ingest geographies and their associated data years diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py index 615e56c67..a2677b98f 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stats.py @@ -76,7 +76,7 @@ def within(x, limit): def aggregrate(data, geography_type, group_type): """ Function to group a dataframe by whichever geography and group types it is - passed and output aggregate stats for that only for that grouping. Works + passed and output aggregate stats for that grouping. Works differently than in other SoT scripts since assesspy functions need multiple inputs. """ @@ -142,8 +142,8 @@ def aggregrate(data, geography_type, group_type): def assemble(df, geos, groups): """ Function that loops over predefined geography and class groups and passes - them to the aggregate function. Outputs stacked aggegrated output from the - aggregate function. + them to the aggregate function. Returns stacked output from the aggregate + function. """ # Create an empty dataframe to fill with output @@ -342,6 +342,9 @@ def ingest_geos(geos): def model(dbt, spark_session): + """ + Function to build a dbt python model using PySpark. + """ dbt.config(materialized="table") # Ingest geographies and their associated data years diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index f387d2320..b3d496172 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -11,7 +11,8 @@ groups = ["no_group", "class", "major_class", "modeling_group", "res_other"] -# Define aggregation functions +# Define aggregation functions. These are just wrappers for basic python +# functions that make using them easier to use with pandas.agg(). 
def q10(x): return x.quantile(0.1) @@ -58,7 +59,7 @@ def first(x): def aggregrate(data, geography_type, group_type): """ Function to group a dataframe by whichever geography and group types it is - passed and output aggregate stats for that only for that grouping. + passed and output aggregate stats for that grouping. """ print(geography_type, group_type) @@ -83,8 +84,8 @@ def aggregrate(data, geography_type, group_type): def assemble(df, geos, groups): """ Function that loops over predefined geography and class groups and passes - them to the aggregate function. Outputs stacked aggegrated output from the - aggregate function. + them to the aggregate function. Returns stacked output from the aggregate + function. """ # Create an empty dataframe to fill with output @@ -213,6 +214,9 @@ def ingest_geos(geos): def model(dbt, spark_session): + """ + Function to build a dbt python model using PySpark. + """ dbt.config(materialized="table") # Ingest geographies and their associated data years diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index 9c803ee33..2aa0cef5a 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -8,7 +8,8 @@ groups = ["no_group", "class", "major_class", "modeling_group", "res_other"] -# Define aggregation functions +# Define aggregation functions. These are just wrappers for basic python +# functions that make using them easier to use with pandas.agg(). def q10(x): return x.quantile(0.1) @@ -67,7 +68,7 @@ def first(x): def aggregrate(data, geography_type, group_type): """ Function to group a dataframe by whichever geography and group types it is - passed and output aggregate stats for that only for that grouping. + passed and output aggregate stats for that grouping. 
""" print(geography_type, group_type) @@ -91,6 +92,11 @@ def aggregrate(data, geography_type, group_type): def assemble(df, geos, groups): + """ + Function that loops over predefined geography and class groups and passes + them to the aggregate function. Returns stacked output from the aggregate + function. + """ # Create an empty dataframe to fill with output output = pd.DataFrame() @@ -241,6 +247,9 @@ def ingest_geos(geos): def model(dbt, spark_session): + """ + Function to build a dbt python model using PySpark. + """ dbt.config(materialized="table") # Ingest geographies and their associated data years From 28ba90c70733ad3fa055c7e151a90d7902e7e48f Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 18 Mar 2025 17:01:29 +0000 Subject: [PATCH 55/96] Lint --- dbt/models/reporting/docs.md | 32 +++++++++++++++++++ dbt/models/reporting/reporting.sot_sales.py | 4 +-- .../reporting.sot_taxes_exemptions.py | 4 +-- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md index 7862e9891..5cd579f5c 100644 --- a/dbt/models/reporting/docs.md +++ b/dbt/models/reporting/docs.md @@ -80,6 +80,38 @@ for every possible geography and reporting group combination. 
**Primary Key**: `pin`, `year` {% enddocs %} +# sot_assessment_roll +{% docs table_sot_assessment_roll %} +{% enddocs %} + +# sot_assessment_roll_input +{% docs table_sot_assessment_roll_input %} +{% enddocs %} + +# sot_ratio_stats +{% docs table_sot_ratio_stats %} +{% enddocs %} + +# sot_ratio_stats_input +{% docs table_sot_ratio_stats_input %} +{% enddocs %} + +# sot_sales +{% docs table_sot_sales %} +{% enddocs %} + +# sot_sales_input +{% docs table_sot_sales_input %} +{% enddocs %} + +# sot_taxes_exemptions +{% docs table_sot_taxes_exemptions %} +{% enddocs %} + +# sot_taxes_exemptions_input +{% docs table_sot_taxes_exemptions_input %} +{% enddocs %} + # vw_assessment_roll {% docs view_vw_assessment_roll %} diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py index b3d496172..bf8666809 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sales.py @@ -131,9 +131,7 @@ def assemble(df, geos, groups): "sale_price_per_sf_delta_mean", "sale_price_per_sf_delta_sum", ] - ] = ( - output[diff_cols].groupby(["geography_id", "group_id"]).diff() - ) + ] = output[diff_cols].groupby(["geography_id", "group_id"]).diff() output = clean_names(output) diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index 2aa0cef5a..1cbd83bf6 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -129,9 +129,7 @@ def assemble(df, geos, groups): "tax_bill_total_delta_mean", "tax_bill_total_delta_sum", ] - ] = ( - output[diff_cols].groupby(["geography_id", "group_id"]).diff() - ) + ] = output[diff_cols].groupby(["geography_id", "group_id"]).diff() output = clean_names(output) From cb50a51b28f447d84531f835d41e580fa68907f8 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 18 Mar 2025 17:15:27 +0000 Subject: [PATCH 56/96] Clean up --- 
dbt/models/reporting/docs.md | 16 ++++++++-------- .../reporting/reporting.sot_assessment_roll.py | 4 ++-- ...atio_stats.py => reporting.sot_ratio_stat.py} | 2 +- ...ut.sql => reporting.sot_ratio_stat_input.sql} | 6 ++---- ...orting.sot_sales.py => reporting.sot_sale.py} | 2 +- ...es_input.sql => reporting.sot_sale_input.sql} | 2 +- dbt/models/reporting/schema.yml | 16 ++++++++-------- 7 files changed, 23 insertions(+), 25 deletions(-) rename dbt/models/reporting/{reporting.sot_ratio_stats.py => reporting.sot_ratio_stat.py} (99%) rename dbt/models/reporting/{reporting.sot_ratio_stats_input.sql => reporting.sot_ratio_stat_input.sql} (96%) rename dbt/models/reporting/{reporting.sot_sales.py => reporting.sot_sale.py} (99%) rename dbt/models/reporting/{reporting.sot_sales_input.sql => reporting.sot_sale_input.sql} (99%) diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md index 5cd579f5c..e0969b11b 100644 --- a/dbt/models/reporting/docs.md +++ b/dbt/models/reporting/docs.md @@ -88,20 +88,20 @@ for every possible geography and reporting group combination. 
{% docs table_sot_assessment_roll_input %} {% enddocs %} -# sot_ratio_stats -{% docs table_sot_ratio_stats %} +# sot_ratio_stat +{% docs table_sot_ratio_stat %} {% enddocs %} -# sot_ratio_stats_input -{% docs table_sot_ratio_stats_input %} +# sot_ratio_stat_input +{% docs table_sot_ratio_stat_input %} {% enddocs %} -# sot_sales -{% docs table_sot_sales %} +# sot_sale +{% docs table_sot_sale %} {% enddocs %} -# sot_sales_input -{% docs table_sot_sales_input %} +# sot_sale_input +{% docs table_sot_sale_input %} {% enddocs %} # sot_taxes_exemptions diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index eef930eb3..0b2cda394 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -56,7 +56,7 @@ def first(x): } -def aggregrate(data, geography_type, group_type): +def aggregrate(data, geography_type, group_type, stats): """ Function to group a dataframe by whichever geography and group types it is passed and output aggregate stats for that grouping. 
@@ -99,7 +99,7 @@ def assemble(df, geos, groups): for x in value: for z in groups: - output = pd.concat([output, aggregrate(df, x, z)]) + output = pd.concat([output, aggregrate(df, x, z, stats=stats)]) # Flatten multi-index output.columns = ["_".join(col) for col in output.columns] diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stat.py similarity index 99% rename from dbt/models/reporting/reporting.sot_ratio_stats.py rename to dbt/models/reporting/reporting.sot_ratio_stat.py index a2677b98f..f9e436c02 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats.py +++ b/dbt/models/reporting/reporting.sot_ratio_stat.py @@ -350,7 +350,7 @@ def model(dbt, spark_session): # Ingest geographies and their associated data years geos = ingest_geos(dbt.ref("reporting.sot_data_years")) - input = dbt.ref("reporting.sot_ratio_stats_input") + input = dbt.ref("reporting.sot_ratio_stat_input") # Convert the Spark input dataframe to Pandas for # compatibility with assesspy functions diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stat_input.sql similarity index 96% rename from dbt/models/reporting/reporting.sot_ratio_stats_input.sql rename to dbt/models/reporting/reporting.sot_ratio_stat_input.sql index 17ed6ef83..b00b93404 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stat_input.sql @@ -1,6 +1,6 @@ /* This script gathers parcel-level geographies and joins them to values and sale prices, and class groupings in order to construct sales ratios. Its sole -purpose is to feed reporting.sot_ratio_stats, and should not be used +purpose is to feed reporting.sot_ratio_stat, and should not be used otherwise. 
*/ {{ config( @@ -50,9 +50,7 @@ SELECT uni.chicago_community_area_name AS community_area, uni.census_place_geoid AS census_place, uni.census_tract_geoid AS census_tract, - uni.census_congressional_district_geoid - AS - census_congressional_district, + uni.census_congressional_district_geoid AS census_congressional_district, uni.census_zcta_geoid AS census_zcta, uni.cook_board_of_review_district_num AS cook_board_of_review_district, uni.cook_commissioner_district_num AS cook_commissioner_district, diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sale.py similarity index 99% rename from dbt/models/reporting/reporting.sot_sales.py rename to dbt/models/reporting/reporting.sot_sale.py index bf8666809..eb407e2cd 100644 --- a/dbt/models/reporting/reporting.sot_sales.py +++ b/dbt/models/reporting/reporting.sot_sale.py @@ -220,7 +220,7 @@ def model(dbt, spark_session): # Ingest geographies and their associated data years geos = ingest_geos(dbt.ref("reporting.sot_data_years")) - input = dbt.ref("reporting.sot_sales_input") + input = dbt.ref("reporting.sot_sale_input") # Convert the Spark input dataframe to Pandas for # compatibility with assesspy functions diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sale_input.sql similarity index 99% rename from dbt/models/reporting/reporting.sot_sales_input.sql rename to dbt/models/reporting/reporting.sot_sale_input.sql index 18d0960aa..f21e4aee1 100644 --- a/dbt/models/reporting/reporting.sot_sales_input.sql +++ b/dbt/models/reporting/reporting.sot_sale_input.sql @@ -1,5 +1,5 @@ -- This script gathers parcel-level geographies and joins them to sales and --- class groupings. Its sole purpose is to feed reporting.sot_sales, +-- class groupings. Its sole purpose is to feed reporting.sot_sale, -- and should not be used otherwise. 
{{ diff --git a/dbt/models/reporting/schema.yml b/dbt/models/reporting/schema.yml index bf540a752..759da8528 100644 --- a/dbt/models/reporting/schema.yml +++ b/dbt/models/reporting/schema.yml @@ -77,26 +77,26 @@ models: tags: - daily - - name: reporting.sot_ratio_stats - description: '{{ doc("table_sot_ratio_stats") }}' + - name: reporting.sot_ratio_stat + description: '{{ doc("table_sot_ratio_stat") }}' config: tags: - daily - - name: reporting.sot_ratio_stats_input - description: '{{ doc("table_sot_ratio_stats_input") }}' + - name: reporting.sot_ratio_stat_input + description: '{{ doc("table_sot_ratio_stat_input") }}' config: tags: - daily - - name: reporting.sot_sales - description: '{{ doc("table_sot_sales") }}' + - name: reporting.sot_sale + description: '{{ doc("table_sot_sale") }}' config: tags: - daily - - name: reporting.sot_sales_input - description: '{{ doc("table_sot_sales_input") }}' + - name: reporting.sot_sale_input + description: '{{ doc("table_sot_sale_input") }}' config: tags: - daily From 978ad93126920231117c5a0ca19f1109b4c6dece Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 18 Mar 2025 19:19:48 +0000 Subject: [PATCH 57/96] Use new assesspy inputs --- .../reporting/reporting.sot_ratio_stat.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stat.py b/dbt/models/reporting/reporting.sot_ratio_stat.py index f9e436c02..89874cc52 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stat.py +++ b/dbt/models/reporting/reporting.sot_ratio_stat.py @@ -17,9 +17,9 @@ # Wrap assesspy functions to avoid GitHub runner errors for length 0 groupings -def cod_safe(ratio): - if len(ratio) >= 1: - output = ass.cod(ratio) +def cod_safe(assessed, sale_price): + if len(sale_price) >= 1: + output = ass.cod(estimate=assessed, sale_price=sale_price) else: output = None @@ -28,7 +28,7 @@ def cod_safe(ratio): def prd_safe(assessed, sale_price): if len(sale_price) >= 1: - output = 
ass.prd(assessed=assessed, sale_price=sale_price) + output = ass.prd(estimate=assessed, sale_price=sale_price) else: output = None @@ -37,9 +37,7 @@ def prd_safe(assessed, sale_price): def prb_safe(assessed, sale_price): if len(sale_price) >= 1: - output = ass.prb(assessed=assessed, sale_price=sale_price, round=3)[ - "prb" - ] + output = ass.prb(estimate=assessed, sale_price=sale_price) else: output = None @@ -48,7 +46,7 @@ def prb_safe(assessed, sale_price): def mki_safe(assessed, sale_price): if len(sale_price) >= 1: - output = ass.mki(assessed=assessed, sale_price=sale_price) + output = ass.mki(estimate=assessed, sale_price=sale_price) else: output = None @@ -119,7 +117,9 @@ def aggregrate(data, geography_type, group_type): "ratio_q90": x["ratio"].quantile(0.90), "ratio_max": x["ratio"].max(), "ratio_mean": x["ratio"].mean(), - "cod": cod_safe(ratio=x["ratio"]), + "cod": cod_safe( + assessed=x["tot_mv"], sale_price=x["sale_price"] + ), "prd": prd_safe( assessed=x["tot_mv"], sale_price=x["sale_price"] ), From a471a420e2e3bc0b36e55018f8ec0159635c8e3c Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 18 Mar 2025 19:54:38 +0000 Subject: [PATCH 58/96] Update assesspy version --- dbt/models/reporting/reporting.sot_ratio_stat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stat.py b/dbt/models/reporting/reporting.sot_ratio_stat.py index 89874cc52..4763cc6c2 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stat.py +++ b/dbt/models/reporting/reporting.sot_ratio_stat.py @@ -1,7 +1,7 @@ # pylint: skip-file # type: ignore sc.addPyFile( # noqa: F821 - "s3://ccao-athena-dependencies-us-east-1/assesspy==1.1.0.zip" + "s3://ccao-athena-dependencies-us-east-1/assesspy==2.0.2.zip" ) # This script generates aggregated summary stats on sales ratios across a From d28f02c5dd6e4edc7730c3246a657a5b3c352edc Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 18 Mar 2025 20:54:09 +0000 Subject: [PATCH 
59/96] Add back documentation --- dbt/models/reporting/docs.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md index e0969b11b..adb3034ba 100644 --- a/dbt/models/reporting/docs.md +++ b/dbt/models/reporting/docs.md @@ -82,34 +82,63 @@ for every possible geography and reporting group combination. # sot_assessment_roll {% docs table_sot_assessment_roll %} +Table to feed the Python dbt job that creates the +`reporting.sot_assessment_roll` table. Feeds public reporting assets. + +**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id` {% enddocs %} # sot_assessment_roll_input {% docs table_sot_assessment_roll_input %} +Table to feed the Python dbt job that creates the +`reporting.sot_assessment_roll` table. Feeds public reporting assets. + +**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id` {% enddocs %} # sot_ratio_stat {% docs table_sot_ratio_stat %} +Feeds public reporting assets. + +**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id` {% enddocs %} # sot_ratio_stat_input {% docs table_sot_ratio_stat_input %} +Table to feed the Python dbt job that creates the +`reporting.sot_ratio_stat` table. Feeds public reporting assets. + +**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id` {% enddocs %} # sot_sale {% docs table_sot_sale %} +Feeds public reporting assets. + +**Primary Key**: `year`, `geography_id`, `group_id` {% enddocs %} # sot_sale_input {% docs table_sot_sale_input %} +Table to feed the Python dbt job that creates the +`reporting.sot_sale` table. Feeds public reporting assets. + +**Primary Key**: `year`, `geography_id`, `group_id` {% enddocs %} # sot_taxes_exemptions {% docs table_sot_taxes_exemptions %} +Feeds public reporting assets. 
+ +**Primary Key**: `year`, `geography_id`, `group_id` {% enddocs %} # sot_taxes_exemptions_input {% docs table_sot_taxes_exemptions_input %} +Table to feed the Python dbt job that creates the +`reporting.sot_taxes_exemptions` table. Feeds public reporting assets. + +**Primary Key**: `year`, `geography_id`, `group_id` {% enddocs %} # vw_assessment_roll From f8258cf183d4f6514055d51647dc713a2f6b1718 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 19 Mar 2025 14:27:50 +0000 Subject: [PATCH 60/96] Improve documentation --- dbt/models/reporting/docs.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md index adb3034ba..3b91b6396 100644 --- a/dbt/models/reporting/docs.md +++ b/dbt/models/reporting/docs.md @@ -82,8 +82,8 @@ for every possible geography and reporting group combination. # sot_assessment_roll {% docs table_sot_assessment_roll %} -Table to feed the Python dbt job that creates the -`reporting.sot_assessment_roll` table. Feeds public reporting assets. +Aggregated summary stats of assessed values across a number of geographies, +class combinations, and time. **Primary Key**: `year`, `stage_name`, `geography_id`, `group_id` {% enddocs %} @@ -98,7 +98,8 @@ Table to feed the Python dbt job that creates the # sot_ratio_stat {% docs table_sot_ratio_stat %} -Feeds public reporting assets. +Aggregated summary stats of sales ratios across a number of geographies, class +combinations, and time. **Primary Key**: `year`, `stage_name`, `geography_id`, `group_id` {% enddocs %} @@ -113,7 +114,8 @@ Table to feed the Python dbt job that creates the # sot_sale {% docs table_sot_sale %} -Feeds public reporting assets. +Aggregated summary stats of sales across a number of geographies, class +combinations, and time. 
**Primary Key**: `year`, `geography_id`, `group_id` {% enddocs %} @@ -128,7 +130,8 @@ Table to feed the Python dbt job that creates the # sot_taxes_exemptions {% docs table_sot_taxes_exemptions %} -Feeds public reporting assets. +Aggregated summary stats of taxes and exemptions data across a number of +geographies, class combinations, and time. **Primary Key**: `year`, `geography_id`, `group_id` {% enddocs %} From e213a32c8ae867425da59fe98d4e8dc8dae182d8 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 19 Mar 2025 16:06:47 +0000 Subject: [PATCH 61/96] Add outlier sales filtering --- dbt/models/reporting/reporting.sot_ratio_stat.py | 2 +- dbt/models/reporting/reporting.sot_ratio_stat_input.sql | 1 + dbt/models/reporting/reporting.sot_sale_input.sql | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stat.py b/dbt/models/reporting/reporting.sot_ratio_stat.py index 4763cc6c2..0b2088e7f 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stat.py +++ b/dbt/models/reporting/reporting.sot_ratio_stat.py @@ -89,7 +89,7 @@ def aggregrate(data, geography_type, group_type): # Remove parcels with MVs of 0 since they screw up ratios data = data[data["tot_mv"] > 0] - # Remove groups that only have one sale since we can't calculate stats + # Remove groups with fewer than 20 sales data = data.dropna(subset=["sale_price"]) data = data[data["sale_n_tot"] >= 20] diff --git a/dbt/models/reporting/reporting.sot_ratio_stat_input.sql b/dbt/models/reporting/reporting.sot_ratio_stat_input.sql index b00b93404..11e497874 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stat_input.sql +++ b/dbt/models/reporting/reporting.sot_ratio_stat_input.sql @@ -109,6 +109,7 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales AND NOT sales.sale_filter_deed_type AND NOT sales.sale_filter_less_than_10k AND NOT sales.sale_filter_same_sale_within_365 + AND COALESCE(sales.sv_is_outlier, FALSE) = FALSE -- Temporary limit on feeder table to
avoid GitHub runner memory issues. WHERE uni.year >= '2020' AND uni.year IN ('2022', '2023') AND uni.class IN ('278', '597') diff --git a/dbt/models/reporting/reporting.sot_sale_input.sql b/dbt/models/reporting/reporting.sot_sale_input.sql index f21e4aee1..e7ab33899 100644 --- a/dbt/models/reporting/reporting.sot_sale_input.sql +++ b/dbt/models/reporting/reporting.sot_sale_input.sql @@ -100,5 +100,6 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales AND NOT sales.sale_filter_deed_type AND NOT sales.sale_filter_less_than_10k AND NOT sales.sale_filter_same_sale_within_365 + AND COALESCE(sales.sv_is_outlier, FALSE) = FALSE -- Temporary limit on feeder table to avoid GitHub runner memory issues. WHERE uni.year = '2023' From 1c8f1b36a04b550eada03bb8c5ff907a16e7d478 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 19 Mar 2025 17:09:07 +0000 Subject: [PATCH 62/96] Count outlier sales --- dbt/models/reporting/reporting.sot_sale.py | 6 +++++- dbt/models/reporting/reporting.sot_sale_input.sql | 7 +++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_sale.py b/dbt/models/reporting/reporting.sot_sale.py index eb407e2cd..b6809e084 100644 --- a/dbt/models/reporting/reporting.sot_sale.py +++ b/dbt/models/reporting/reporting.sot_sale.py @@ -51,6 +51,7 @@ def first(x): "sale_char_bldg_sf": ["median"], "sale_char_land_sf": ["median"], "sale_char_yrblt": ["median"], + "sale_is_outlier": ["sum"], "class": [stats.multimode], "geography_data_year": [first], } @@ -148,6 +149,7 @@ def clean_names(x): "sale_price_size": "pin_n_tot", "year": "sale_year", "sale_price_count": "sale_n_tot", + "sale_is_outlier_sum": "sale_n_outlier_excluded", "class_multimode": "sale_class_mode", "geography_data_year_first": "geography_data_year", } @@ -191,6 +193,7 @@ def clean_names(x): "sale_char_land_sf_median", "sale_char_yrblt_median", "sale_class_mode", + "sale_n_outlier_excluded", ] ] @@ -248,7 +251,8 @@ def model(dbt, spark_session): + 
"sale_price_per_sf_delta_sum: double, " + "sale_char_bldg_sf_median: double, " + "sale_char_land_sf_median: double, " - + "sale_char_yrblt_median: double, sale_class_mode: array" + + "sale_char_yrblt_median: double, sale_class_mode: array, " + + "sale_n_outlier_excluded: bigint" ) spark_df = spark_session.createDataFrame(df, schema=schema) diff --git a/dbt/models/reporting/reporting.sot_sale_input.sql b/dbt/models/reporting/reporting.sot_sale_input.sql index e7ab33899..93918dce3 100644 --- a/dbt/models/reporting/reporting.sot_sale_input.sql +++ b/dbt/models/reporting/reporting.sot_sale_input.sql @@ -23,7 +23,11 @@ WITH sf AS ( SELECT sales.doc_no, - sales.sale_price, + -- Code outlier sale prices as NULL so they won't be part of aggregated sale + -- stats, but we can count the number of outliers + CASE WHEN sales.sv_is_outlier THEN NULL ELSE sales.sale_price END + AS sale_price, + COALESCE(sales.sv_is_outlier, FALSE) AS sale_is_outlier, CASE WHEN sf.char_bldg_sf > 0 THEN CAST(sales.sale_price / sf.char_bldg_sf AS DOUBLE) @@ -100,6 +104,5 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales AND NOT sales.sale_filter_deed_type AND NOT sales.sale_filter_less_than_10k AND NOT sales.sale_filter_same_sale_within_365 - AND COALESCE(sales.sv_is_outlier, FALSE) = FALSE -- Temporary limit on feeder table to avoid GitHub runner memory issues. 
WHERE uni.year = '2023' From d26fed05615ac168bfaff2388f21e33ddd6a4389 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 19 Mar 2025 18:35:50 +0000 Subject: [PATCH 63/96] Exclude outliers from sales char stats --- dbt/models/reporting/reporting.sot_sale_input.sql | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_sale_input.sql b/dbt/models/reporting/reporting.sot_sale_input.sql index 93918dce3..5b9bd319f 100644 --- a/dbt/models/reporting/reporting.sot_sale_input.sql +++ b/dbt/models/reporting/reporting.sot_sale_input.sql @@ -92,11 +92,6 @@ SELECT AS res_other, uni.year FROM {{ ref('default.vw_pin_universe') }} AS uni -LEFT JOIN sf - ON uni.pin = sf.pin - AND uni.year = sf.year -LEFT JOIN {{ ref('ccao.class_dict') }} - ON uni.class = class_dict.class_code LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales ON uni.pin = sales.pin AND uni.year = sales.year @@ -104,5 +99,12 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales AND NOT sales.sale_filter_deed_type AND NOT sales.sale_filter_less_than_10k AND NOT sales.sale_filter_same_sale_within_365 +LEFT JOIN sf + ON uni.pin = sf.pin + AND uni.year = sf.year + -- Don't join characteristics onto outliers + AND NOT COALESCE(sales.sv_is_outlier, FALSE) +LEFT JOIN {{ ref('ccao.class_dict') }} + ON uni.class = class_dict.class_code -- Temporary limit on feeder table to avoid GitHub runner memory issues. 
WHERE uni.year = '2023' From 91c5040efe30e107d722491db4ab31c886f9f9b7 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 19 Mar 2025 18:44:05 +0000 Subject: [PATCH 64/96] Clarify bldg and land sf --- dbt/models/reporting/reporting.sot_sale.py | 12 ++++++------ dbt/models/reporting/reporting.sot_sale_input.sql | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_sale.py b/dbt/models/reporting/reporting.sot_sale.py index b6809e084..60b9f9ee3 100644 --- a/dbt/models/reporting/reporting.sot_sale.py +++ b/dbt/models/reporting/reporting.sot_sale.py @@ -48,8 +48,8 @@ def first(x): agg_func_math = { "sale_price": ["size", "count"] + more_stats, "sale_price_per_sf": more_stats, - "sale_char_bldg_sf": ["median"], - "sale_char_land_sf": ["median"], + "sale_char_tot_bldg_sf": ["median"], + "sale_char_tot_land_sf": ["median"], "sale_char_yrblt": ["median"], "sale_is_outlier": ["sum"], "class": [stats.multimode], @@ -189,8 +189,8 @@ def clean_names(x): "sale_price_per_sf_delta_median", "sale_price_per_sf_delta_mean", "sale_price_per_sf_delta_sum", - "sale_char_bldg_sf_median", - "sale_char_land_sf_median", + "sale_char_tot_bldg_sf_median", + "sale_char_tot_land_sf_median", "sale_char_yrblt_median", "sale_class_mode", "sale_n_outlier_excluded", @@ -249,8 +249,8 @@ def model(dbt, spark_session): + "sale_price_per_sf_delta_median: double, " + "sale_price_per_sf_delta_mean: double, " + "sale_price_per_sf_delta_sum: double, " - + "sale_char_bldg_sf_median: double, " - + "sale_char_land_sf_median: double, " + + "sale_char_tot_bldg_sf_median: double, " + + "sale_char_tot_land_sf_median: double, " + "sale_char_yrblt_median: double, sale_class_mode: array, " + "sale_n_outlier_excluded: bigint" ) diff --git a/dbt/models/reporting/reporting.sot_sale_input.sql b/dbt/models/reporting/reporting.sot_sale_input.sql index 5b9bd319f..c5d2b78c3 100644 --- a/dbt/models/reporting/reporting.sot_sale_input.sql +++ 
b/dbt/models/reporting/reporting.sot_sale_input.sql @@ -32,8 +32,8 @@ SELECT THEN CAST(sales.sale_price / sf.char_bldg_sf AS DOUBLE) END AS sale_price_per_sf, - CAST(sf.char_bldg_sf AS INT) AS sale_char_bldg_sf, - CAST(sf.char_land_sf AS INT) AS sale_char_land_sf, + CAST(sf.char_bldg_sf AS INT) AS sale_char_tot_bldg_sf, + CAST(sf.char_land_sf AS INT) AS sale_char_tot_land_sf, CAST(sf.char_yrblt AS INT) AS sale_char_yrblt, uni.class, 'Cook' AS county, From c3cc7ba6a3da5428768e41ac33e024e73cfe4b04 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 19 Mar 2025 20:02:55 +0000 Subject: [PATCH 65/96] Improve schema declaration --- .../reporting.sot_assessment_roll.py | 88 +++++++++++++------ 1 file changed, 62 insertions(+), 26 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 0b2cda394..6eda0abf5 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -297,32 +297,68 @@ def model(dbt, spark_session): input = input.toPandas() df = assemble(input, geos=geos, groups=groups) - - schema = ( - "geography_type: string, geography_id: string, " - + "geography_data_year: string, group_type: string, group_id: string, " - + "year: string, reassessment_year: string, stage_name: string, " - + "pin_n_tot: bigint, pin_n_w_value: bigint, pin_pct_w_value: double, " - + "av_tot_min: double, av_tot_q10: double, av_tot_q25: double, " - + "av_tot_median: double, av_tot_q75: double, av_tot_q90: double, " - + "av_tot_max: double, av_tot_mean: double, av_tot_sum: double, " - + "av_tot_delta_median: double, av_tot_delta_mean: double, " - + "av_tot_delta_sum: double, av_tot_delta_pct_median: double, " - + "av_tot_delta_pct_mean: double, av_tot_delta_pct_sum: double, " - + "av_bldg_min: double, av_bldg_q10: double, av_bldg_q25: double, " - + "av_bldg_median: double, av_bldg_q75: double, av_bldg_q90: double, " - + "av_bldg_max: 
double, av_bldg_mean: double, av_bldg_sum: double, " - + "av_bldg_delta_median: double, av_bldg_delta_mean: double, " - + "av_bldg_delta_sum: double, av_bldg_delta_pct_median: double, " - + "av_bldg_delta_pct_mean: double, av_bldg_delta_pct_sum: double, " - + "av_land_min: double, av_land_q10: double, av_land_q25: double, " - + "av_land_median: double, av_land_q75: double, av_land_q90: double, " - + "av_land_max: double, av_land_mean: double, av_land_sum: double, " - + "av_land_delta_median: double, av_land_delta_mean: double, " - + "av_land_delta_sum: double, av_land_delta_pct_median: double, " - + "av_land_delta_pct_mean: double, av_land_delta_pct_sum: double" + # %% + schema = { + "geography_type": "string", + "geography_id": "string", + "geography_data_year": "string", + "group_type": "string", + "group_id": "string", + "year": "string", + "reassessment_year": "string", + "stage_name": "string", + "pin_n_tot": "bigint", + "pin_n_w_value": "bigint", + "pin_pct_w_value": "double", + "av_tot_min": "double", + "av_tot_q10": "double", + "av_tot_q25": "double", + "av_tot_median": "double", + "av_tot_q75": "double", + "av_tot_q90": "double", + "av_tot_max": "double", + "av_tot_mean": "double", + "av_tot_sum": "double", + "av_tot_delta_median": "double", + "av_tot_delta_mean": "double", + "av_tot_delta_sum": "double", + "av_tot_delta_pct_median": "double", + "av_tot_delta_pct_mean": "double", + "av_tot_delta_pct_sum": "double", + "av_bldg_min": "double", + "av_bldg_q10": "double", + "av_bldg_q25": "double", + "av_bldg_median": "double", + "av_bldg_q75": "double", + "av_bldg_q90": "double", + "av_bldg_max": "double", + "av_bldg_mean": "double", + "av_bldg_sum": "double", + "av_bldg_delta_median": "double", + "av_bldg_delta_mean": "double", + "av_bldg_delta_sum": "double", + "av_bldg_delta_pct_median": "double", + "av_bldg_delta_pct_mean": "double", + "av_bldg_delta_pct_sum": "double", + "av_land_min": "double", + "av_land_q10": "double", + "av_land_q25": "double", + 
"av_land_median": "double", + "av_land_q75": "double", + "av_land_q90": "double", + "av_land_max": "double", + "av_land_mean": "double", + "av_land_sum": "double", + "av_land_delta_median": "double", + "av_land_delta_mean": "double", + "av_land_delta_sum": "double", + "av_land_delta_pct_median": "double", + "av_land_delta_pct_mean": "double", + "av_land_delta_pct_sum": "double", + } + # %% + spark_df = spark_session.createDataFrame( + df, schema=", ".join(f"{key}: {val}" for key, val in schema.items()) ) - spark_df = spark_session.createDataFrame(df, schema=schema) - return spark_df From c8230ea2c13e76958ba7e52bfe57f93407f28bd3 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 19 Mar 2025 20:35:31 +0000 Subject: [PATCH 66/96] Update schema declarations --- .../reporting/reporting.sot_ratio_stat.py | 72 +++++++++---- dbt/models/reporting/reporting.sot_sale.py | 66 +++++++----- .../reporting.sot_taxes_exemptions.py | 100 ++++++++++++------ 3 files changed, 160 insertions(+), 78 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_ratio_stat.py b/dbt/models/reporting/reporting.sot_ratio_stat.py index 0b2088e7f..f4ebb98b7 100644 --- a/dbt/models/reporting/reporting.sot_ratio_stat.py +++ b/dbt/models/reporting/reporting.sot_ratio_stat.py @@ -358,26 +358,58 @@ def model(dbt, spark_session): df = assemble(input, geos=geos, groups=groups) - schema = ( - "geography_type: string, geography_id: string, " - + "geography_data_year: string, group_type: string, group_id: string, " - + "year: string, reassessment_year: string, stage_name: string, " - + "pin_n_tot: int, pin_n_w_value: bigint, pin_pct_w_value: double, " - + "sale_n_tot: bigint, mv_min: bigint, mv_q10: bigint, " - + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, " - + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, mv_sum: bigint, " - + "mv_delta_median: double, mv_delta_mean: double, " - + "mv_delta_sum: double, mv_delta_pct_median: double, " - + "mv_delta_pct_mean: double, mv_delta_pct_sum: 
double, " - + "ratio_min: double, ratio_q10: double, ratio_q25: double, " - + "ratio_median: double, ratio_q75: double, ratio_q90: double, " - + "ratio_max: double, ratio_mean: double, cod: double, prd: double, " - + "prb: double, mki: double, cod_met: boolean, prd_met: boolean, " - + "prb_met: boolean, mki_met: boolean, within_05_pct: boolean, " - + "within_10_pct: boolean, within_15_pct: boolean, " - + "within_20_pct: boolean" - ) + schema = { + "geography_type": "string", + "geography_id": "string", + "geography_data_year": "string", + "group_type": "string", + "group_id": "string", + "year": "string", + "reassessment_year": "string", + "stage_name": "string", + "pin_n_tot": "int", + "pin_n_w_value": "bigint", + "pin_pct_w_value": "double", + "sale_n_tot": "bigint", + "mv_min": "bigint", + "mv_q10": "bigint", + "mv_q25": "bigint", + "mv_median": "bigint", + "mv_q75": "bigint", + "mv_q90": "bigint", + "mv_max": "bigint", + "mv_mean": "bigint", + "mv_sum": "bigint", + "mv_delta_median": "double", + "mv_delta_mean": "double", + "mv_delta_sum": "double", + "mv_delta_pct_median": "double", + "mv_delta_pct_mean": "double", + "mv_delta_pct_sum": "double", + "ratio_min": "double", + "ratio_q10": "double", + "ratio_q25": "double", + "ratio_median": "double", + "ratio_q75": "double", + "ratio_q90": "double", + "ratio_max": "double", + "ratio_mean": "double", + "cod": "double", + "prd": "double", + "prb": "double", + "mki": "double", + "cod_met": "boolean", + "prd_met": "boolean", + "prb_met": "boolean", + "mki_met": "boolean", + "within_05_pct": "boolean", + "within_10_pct": "boolean", + "within_15_pct": "boolean", + "within_20_pct": "boolean", + } - spark_df = spark_session.createDataFrame(df, schema=schema) + spark_df = spark_session.createDataFrame( + df, schema=", ".join(f"{key}: {val}" for key, val in schema.items()) + ) return spark_df diff --git a/dbt/models/reporting/reporting.sot_sale.py b/dbt/models/reporting/reporting.sot_sale.py index 60b9f9ee3..212707ab9 
100644 --- a/dbt/models/reporting/reporting.sot_sale.py +++ b/dbt/models/reporting/reporting.sot_sale.py @@ -231,30 +231,48 @@ def model(dbt, spark_session): df = assemble(input, geos=geos, groups=groups) - schema = ( - "geography_type: string, geography_id: string, " - + "geography_data_year: string, group_type: string, " - + "group_id: string, sale_year: string, pin_n_tot: bigint, " - + "sale_n_tot: int, sale_price_min: double, sale_price_q10: double, " - + "sale_price_q25: double, sale_price_median: double, " - + "sale_price_q75: double, sale_price_q90: double, " - + "sale_price_max: double, sale_price_mean: double, " - + "sale_price_sum: double, sale_price_delta_median: double, " - + "sale_price_delta_mean: double, sale_price_delta_sum: double, " - + "sale_price_per_sf_min: double, sale_price_per_sf_q10: double, " - + "sale_price_per_sf_q25: double, sale_price_per_sf_median: double, " - + "sale_price_per_sf_q75: double, sale_price_per_sf_q90: double, " - + "sale_price_per_sf_max: double, sale_price_per_sf_mean: double, " - + "sale_price_per_sf_sum: double, " - + "sale_price_per_sf_delta_median: double, " - + "sale_price_per_sf_delta_mean: double, " - + "sale_price_per_sf_delta_sum: double, " - + "sale_char_tot_bldg_sf_median: double, " - + "sale_char_tot_land_sf_median: double, " - + "sale_char_yrblt_median: double, sale_class_mode: array, " - + "sale_n_outlier_excluded: bigint" - ) + schema = { + "geography_type": "string", + "geography_id": "string", + "geography_data_year": "string", + "group_type": "string", + "group_id": "string", + "sale_year": "string", + "pin_n_tot": "bigint", + "sale_n_tot": "int", + "sale_price_min": "double", + "sale_price_q10": "double", + "sale_price_q25": "double", + "sale_price_median": "double", + "sale_price_q75": "double", + "sale_price_q90": "double", + "sale_price_max": "double", + "sale_price_mean": "double", + "sale_price_sum": "double", + "sale_price_delta_median": "double", + "sale_price_delta_mean": "double", + 
"sale_price_delta_sum": "double", + "sale_price_per_sf_min": "double", + "sale_price_per_sf_q10": "double", + "sale_price_per_sf_q25": "double", + "sale_price_per_sf_median": "double", + "sale_price_per_sf_q75": "double", + "sale_price_per_sf_q90": "double", + "sale_price_per_sf_max": "double", + "sale_price_per_sf_mean": "double", + "sale_price_per_sf_sum": "double", + "sale_price_per_sf_delta_median": "double", + "sale_price_per_sf_delta_mean": "double", + "sale_price_per_sf_delta_sum": "double", + "sale_char_tot_bldg_sf_median": "double", + "sale_char_tot_land_sf_median": "double", + "sale_char_yrblt_median": "double", + "sale_class_mode": "array", + "sale_n_outlier_excluded": "bigint", + } - spark_df = spark_session.createDataFrame(df, schema=schema) + spark_df = spark_session.createDataFrame( + df, schema=", ".join(f"{key}: {val}" for key, val in schema.items()) + ) return spark_df diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py index 1cbd83bf6..86a1f3951 100644 --- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py +++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py @@ -261,40 +261,72 @@ def model(dbt, spark_session): df = assemble(input, geos=geos, groups=groups) - schema = ( - "geography_type: string, geography_id: string, " - + "geography_data_year: string, group_type: string, " - + "group_id: string, tax_year: string, pin_n_tot: bigint, " - + "tax_eq_factor_final: double, tax_eq_factor_tentative: double, " - + "tax_bill_total_min: double, tax_bill_total_q10: double, " - + "tax_bill_total_q25: double, tax_bill_total_median: double, " - + "tax_bill_total_q75: double, tax_bill_total_q90: double, " - + "tax_bill_total_max: double, tax_bill_total_mean: double, " - + "tax_bill_total_sum: double, tax_bill_total_delta_median: double, " - + "tax_bill_total_delta_mean: double, " - + "tax_bill_total_delta_sum: double , tax_rate_min: double, " - + "tax_rate_q10: double, 
tax_rate_q25: double, " - + "tax_rate_median: double, tax_rate_q75: double, " - + "tax_rate_q90: double, tax_rate_max: double, " - + "tax_rate_mean: double, tax_rate_sum: double, " - + "tax_av_min: int, tax_av_q10: double, tax_av_q25: double, " - + "tax_av_median: double, tax_av_q75: double, " - + "tax_av_q90: double, tax_av_max: int, tax_av_mean: double, " - + "tax_av_sum: double, tax_exe_n_homeowner: bigint, " - + "tax_exe_homeowner_sum: double, tax_exe_n_senior: bigint, " - + "tax_exe_senior_sum: double, tax_exe_n_freeze: bigint, " - + "tax_exe_freeze_sum: double, tax_exe_n_longtime_homeowner: bigint, " - + "tax_exe_longtime_homeowner_sum: double, " - + "tax_exe_n_disabled: bigint, tax_exe_disabled_sum: double, " - + "tax_exe_n_vet_returning: bigint, " - + "tax_exe_vet_returning_sum: double, tax_exe_n_vet_dis_lt50: bigint, " - + "tax_exe_vet_dis_lt50_sum: double, tax_exe_n_vet_dis_50_69: bigint, " - + "tax_exe_vet_dis_50_69_sum: double, tax_exe_n_vet_dis_ge70: bigint, " - + "tax_exe_vet_dis_ge70_sum: double, tax_exe_n_abate: bigint, " - + "tax_exe_abate_sum: double, tax_exe_n_total: bigint, " - + "tax_exe_total_sum: double" - ) + schema = { + "geography_type": "string", + "geography_id": "string", + "geography_data_year": "string", + "group_type": "string", + "group_id": "string", + "tax_year": "string", + "pin_n_tot": "bigint", + "tax_eq_factor_final": "double", + "tax_eq_factor_tentative": "double", + "tax_bill_total_min": "double", + "tax_bill_total_q10": "double", + "tax_bill_total_q25": "double", + "tax_bill_total_median": "double", + "tax_bill_total_q75": "double", + "tax_bill_total_q90": "double", + "tax_bill_total_max": "double", + "tax_bill_total_mean": "double", + "tax_bill_total_sum": "double", + "tax_bill_total_delta_median": "double", + "tax_bill_total_delta_mean": "double", + "tax_bill_total_delta_sum": "double ", + "tax_rate_min": "double", + "tax_rate_q10": "double", + "tax_rate_q25": "double", + "tax_rate_median": "double", + "tax_rate_q75": 
"double", + "tax_rate_q90": "double", + "tax_rate_max": "double", + "tax_rate_mean": "double", + "tax_rate_sum": "double", + "tax_av_min": "int", + "tax_av_q10": "double", + "tax_av_q25": "double", + "tax_av_median": "double", + "tax_av_q75": "double", + "tax_av_q90": "double", + "tax_av_max": "int", + "tax_av_mean": "double", + "tax_av_sum": "double", + "tax_exe_n_homeowner": "bigint", + "tax_exe_homeowner_sum": "double", + "tax_exe_n_senior": "bigint", + "tax_exe_senior_sum": "double", + "tax_exe_n_freeze": "bigint", + "tax_exe_freeze_sum": "double", + "tax_exe_n_longtime_homeowner": "bigint", + "tax_exe_longtime_homeowner_sum": "double", + "tax_exe_n_disabled": "bigint", + "tax_exe_disabled_sum": "double", + "tax_exe_n_vet_returning": "bigint", + "tax_exe_vet_returning_sum": "double", + "tax_exe_n_vet_dis_lt50": "bigint", + "tax_exe_vet_dis_lt50_sum": "double", + "tax_exe_n_vet_dis_50_69": "bigint", + "tax_exe_vet_dis_50_69_sum": "double", + "tax_exe_n_vet_dis_ge70": "bigint", + "tax_exe_vet_dis_ge70_sum": "double", + "tax_exe_n_abate": "bigint", + "tax_exe_abate_sum": "double", + "tax_exe_n_total": "bigint", + "tax_exe_total_sum": "double", + } - spark_df = spark_session.createDataFrame(df, schema=schema) + spark_df = spark_session.createDataFrame( + df, schema=", ".join(f"{key}: {val}" for key, val in schema.items()) + ) return spark_df From 023c341b43796e4e69680bb660f24db0cfaf77da Mon Sep 17 00:00:00 2001 From: William Ridgeway Date: Wed, 26 Mar 2025 16:26:30 -0500 Subject: [PATCH 67/96] Store testing --- test.py | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 test.py diff --git a/test.py b/test.py new file mode 100644 index 000000000..520018f96 --- /dev/null +++ b/test.py @@ -0,0 +1,150 @@ +# This script generates aggregated summary stats on assessed values across a +# number of geographies, class combinations, and time. 
+#%% +# Import libraries +import pandas as pd +from pyathena import connect +from pyathena.pandas.cursor import PandasCursor +from pyspark.sql import SparkSession +import pyspark.sql.functions as F +from pyspark.sql.functions import pandas_udf + +# Connect to Athena +cursor = connect( + s3_staging_dir="s3://ccao-athena-results-us-east-1/", + region_name="us-east-1", + cursor_class=PandasCursor, +).cursor(unload=True) + +data = cursor.execute("select * from z_ci_387_reporting_sot_reporting.sot_assessment_roll_input").as_pandas() + +spark = SparkSession.builder.appName("SparkByExamples.com").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate() +#%% +schema = {'stage_name': 'string', +'class': 'string', +'av_tot': 'double', +'av_bldg': 'double', +'av_land': 'double', +'county': 'string', +'triad': 'string', +'township': 'string', +'nbhd': 'string', +'tax_code': 'string', +'zip_code': 'string', +'community_area': 'string', +'census_place': 'string', +'census_tract': 'string', +'census_congressional_district': 'string', +'census_zcta': 'string', +'cook_board_of_review_district': 'string', +'cook_commissioner_district': 'string', +'cook_judicial_district': 'string', +'ward_num': 'string', +'police_district': 'string', +'school_elementary_district': 'string', +'school_secondary_district': 'string', +'school_unified_district': 'string', +'tax_municipality': 'string', +'tax_park_district': 'string', +'tax_library_district': 'string', +'tax_fire_protection_district': 'string', +'tax_community_college_district': 'string', +'tax_sanitation_district': 'string', +'tax_special_service_area': 'string', +'tax_tif_district': 'string', +'central_business_district': 'string', +'census_data_year': 'string', +'cook_board_of_review_district_data_year': 'string', +'cook_commissioner_district_data_year': 'string', +'cook_judicial_district_data_year': 'string', +'ward_data_year': 'string', +'community_area_data_year': 'string', +'police_district_data_year': 
'string', +'central_business_district_data_year': 'string', +'school_data_year': 'string', +'tax_data_year': 'string', +'no_group': 'string', +'major_class': 'string', +'modeling_group': 'string', +'res_other': 'string', +'year': 'string',} +schema = ", ".join(f"{key}: {val}" for key, val in schema.items()) +spark_df = spark.createDataFrame(data, schema=schema) +#%% + +# Define aggregation functions. These are just wrappers for basic python +# functions that make using them easier to use with pandas.agg(). +def q10(x): + return x.quantile(0.1) + + +def q25(x): + return x.quantile(0.25) + + +def q75(x): + return x.quantile(0.75) + + +def q90(x): + return x.quantile(0.9) + + +def first(x): + if len(x) >= 1: + output = x.iloc[0] + else: + output = None + + return output + +more_stats = [ + "min", + q10, + q25, + "median", + q75, + q90, + "max", + "mean", + "sum", +] + +stats = { + "av_tot": ["size", "count"] + more_stats, + "av_bldg": more_stats, + "av_land": more_stats, + "triad": [first], + "geography_data_year": [first], +} + +#%% +schema = {'stage_name': 'string', +'av_tot': 'double','av_bldg': 'double','av_land': 'double',} +schema = ", ".join(f"{key}: {val}" for key, val in schema.items()) +spark_df = spark.createDataFrame(data[['stage_name', 'av_tot', 'av_bldg', 'av_land']], schema=schema) +#%% +def aggregate(key, pdf): + + columns = ['av_tot', 'av_bldg', 'av_land'] + + out = () + for column in ['av_tot', 'av_bldg', 'av_land']: + out += ( + pdf[column].min(), + q10(pdf[column]), + q25(pdf[column]), + pdf[column].median(), + q75(pdf[column]), + q90(pdf[column]), + pdf[column].max(), + pdf[column].mean(), + pdf[column].sum(), + ) + + return pd.DataFrame([ + key + out + ]) + +spark_df.groupby("stage_name").applyInPandas(aggregate, schema="stage_name string, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double").show() +# %% \ No newline at end 
of file From 57e4cc34fcb49b843c20a8e85b91ead182ca97aa Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 30 Apr 2025 19:17:27 +0000 Subject: [PATCH 68/96] Remove test script --- .gitignore | 1 + test.py | 150 ----------------------------------------------------- 2 files changed, 1 insertion(+), 150 deletions(-) delete mode 100644 test.py diff --git a/.gitignore b/.gitignore index 9a2206a6f..4f1fdf134 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ .Ruserdata package*.json settings.json +test.py # Directories *.egg-info/ diff --git a/test.py b/test.py deleted file mode 100644 index 520018f96..000000000 --- a/test.py +++ /dev/null @@ -1,150 +0,0 @@ -# This script generates aggregated summary stats on assessed values across a -# number of geographies, class combinations, and time. -#%% -# Import libraries -import pandas as pd -from pyathena import connect -from pyathena.pandas.cursor import PandasCursor -from pyspark.sql import SparkSession -import pyspark.sql.functions as F -from pyspark.sql.functions import pandas_udf - -# Connect to Athena -cursor = connect( - s3_staging_dir="s3://ccao-athena-results-us-east-1/", - region_name="us-east-1", - cursor_class=PandasCursor, -).cursor(unload=True) - -data = cursor.execute("select * from z_ci_387_reporting_sot_reporting.sot_assessment_roll_input").as_pandas() - -spark = SparkSession.builder.appName("SparkByExamples.com").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate() -#%% -schema = {'stage_name': 'string', -'class': 'string', -'av_tot': 'double', -'av_bldg': 'double', -'av_land': 'double', -'county': 'string', -'triad': 'string', -'township': 'string', -'nbhd': 'string', -'tax_code': 'string', -'zip_code': 'string', -'community_area': 'string', -'census_place': 'string', -'census_tract': 'string', -'census_congressional_district': 'string', -'census_zcta': 'string', -'cook_board_of_review_district': 'string', -'cook_commissioner_district': 'string', 
-'cook_judicial_district': 'string', -'ward_num': 'string', -'police_district': 'string', -'school_elementary_district': 'string', -'school_secondary_district': 'string', -'school_unified_district': 'string', -'tax_municipality': 'string', -'tax_park_district': 'string', -'tax_library_district': 'string', -'tax_fire_protection_district': 'string', -'tax_community_college_district': 'string', -'tax_sanitation_district': 'string', -'tax_special_service_area': 'string', -'tax_tif_district': 'string', -'central_business_district': 'string', -'census_data_year': 'string', -'cook_board_of_review_district_data_year': 'string', -'cook_commissioner_district_data_year': 'string', -'cook_judicial_district_data_year': 'string', -'ward_data_year': 'string', -'community_area_data_year': 'string', -'police_district_data_year': 'string', -'central_business_district_data_year': 'string', -'school_data_year': 'string', -'tax_data_year': 'string', -'no_group': 'string', -'major_class': 'string', -'modeling_group': 'string', -'res_other': 'string', -'year': 'string',} -schema = ", ".join(f"{key}: {val}" for key, val in schema.items()) -spark_df = spark.createDataFrame(data, schema=schema) -#%% - -# Define aggregation functions. These are just wrappers for basic python -# functions that make using them easier to use with pandas.agg(). 
-def q10(x): - return x.quantile(0.1) - - -def q25(x): - return x.quantile(0.25) - - -def q75(x): - return x.quantile(0.75) - - -def q90(x): - return x.quantile(0.9) - - -def first(x): - if len(x) >= 1: - output = x.iloc[0] - else: - output = None - - return output - -more_stats = [ - "min", - q10, - q25, - "median", - q75, - q90, - "max", - "mean", - "sum", -] - -stats = { - "av_tot": ["size", "count"] + more_stats, - "av_bldg": more_stats, - "av_land": more_stats, - "triad": [first], - "geography_data_year": [first], -} - -#%% -schema = {'stage_name': 'string', -'av_tot': 'double','av_bldg': 'double','av_land': 'double',} -schema = ", ".join(f"{key}: {val}" for key, val in schema.items()) -spark_df = spark.createDataFrame(data[['stage_name', 'av_tot', 'av_bldg', 'av_land']], schema=schema) -#%% -def aggregate(key, pdf): - - columns = ['av_tot', 'av_bldg', 'av_land'] - - out = () - for column in ['av_tot', 'av_bldg', 'av_land']: - out += ( - pdf[column].min(), - q10(pdf[column]), - q25(pdf[column]), - pdf[column].median(), - q75(pdf[column]), - q90(pdf[column]), - pdf[column].max(), - pdf[column].mean(), - pdf[column].sum(), - ) - - return pd.DataFrame([ - key + out - ]) - -spark_df.groupby("stage_name").applyInPandas(aggregate, schema="stage_name string, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double").show() -# %% \ No newline at end of file From b0d9ad7ec6eacdfa7758dcad59cbaa53be4d2041 Mon Sep 17 00:00:00 2001 From: William Ridgeway Date: Thu, 1 May 2025 11:23:45 -0500 Subject: [PATCH 69/96] Test script back to working --- .gitignore | 1 - test.py | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 test.py diff --git a/.gitignore b/.gitignore index 4f1fdf134..9a2206a6f 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,6 @@ .Ruserdata package*.json 
settings.json -test.py # Directories *.egg-info/ diff --git a/test.py b/test.py new file mode 100644 index 000000000..47f092988 --- /dev/null +++ b/test.py @@ -0,0 +1,169 @@ +# This script generates aggregated summary stats on assessed values across a +# number of geographies, class combinations, and time. +# %% +# Import libraries +import pandas as pd +from pyathena import connect +from pyathena.pandas.cursor import PandasCursor +from pyspark.sql import SparkSession + +# Connect to Athena +cursor = connect( + s3_staging_dir="s3://ccao-athena-results-us-east-1/", + region_name="us-east-1", + cursor_class=PandasCursor, +).cursor(unload=True) + +data = cursor.execute( + "select * from z_ci_387_reporting_sot_reporting.sot_assessment_roll_input" +).as_pandas() + +spark = ( + SparkSession.builder.appName("SparkByExamples.com") + .master("local[*]") + .config("spark.driver.bindAddress", "127.0.0.1") + .getOrCreate() +) +# %% +schema = { + "stage_name": "string", + "class": "string", + "av_tot": "double", + "av_bldg": "double", + "av_land": "double", + "county": "string", + "triad": "string", + "township": "string", + "nbhd": "string", + "tax_code": "string", + "zip_code": "string", + "community_area": "string", + "census_place": "string", + "census_tract": "string", + "census_congressional_district": "string", + "census_zcta": "string", + "cook_board_of_review_district": "string", + "cook_commissioner_district": "string", + "cook_judicial_district": "string", + "ward_num": "string", + "police_district": "string", + "school_elementary_district": "string", + "school_secondary_district": "string", + "school_unified_district": "string", + "tax_municipality": "string", + "tax_park_district": "string", + "tax_library_district": "string", + "tax_fire_protection_district": "string", + "tax_community_college_district": "string", + "tax_sanitation_district": "string", + "tax_special_service_area": "string", + "tax_tif_district": "string", + "central_business_district": "string", + 
"census_data_year": "string", + "cook_board_of_review_district_data_year": "string", + "cook_commissioner_district_data_year": "string", + "cook_judicial_district_data_year": "string", + "ward_data_year": "string", + "community_area_data_year": "string", + "police_district_data_year": "string", + "central_business_district_data_year": "string", + "school_data_year": "string", + "tax_data_year": "string", + "no_group": "string", + "major_class": "string", + "modeling_group": "string", + "res_other": "string", + "year": "string", +} +schema = ", ".join(f"{key}: {val}" for key, val in schema.items()) +spark_df = spark.createDataFrame(data, schema=schema) + + +# %% +# Define aggregation functions. These are just wrappers for basic python +# functions that make using them easier to use with pandas.agg(). +def q10(x): + return x.quantile(0.1) + + +def q25(x): + return x.quantile(0.25) + + +def q75(x): + return x.quantile(0.75) + + +def q90(x): + return x.quantile(0.9) + + +def first(x): + if len(x) >= 1: + output = x.iloc[0] + else: + output = None + + return output + + +more_stats = [ + "min", + q10, + q25, + "median", + q75, + q90, + "max", + "mean", + "sum", +] + +stats = { + "av_tot": ["size", "count"] + more_stats, + "av_bldg": more_stats, + "av_land": more_stats, + "triad": [first], + "geography_data_year": [first], +} + +# %% +schema = { + "stage_name": "string", + "av_tot": "double", + "av_bldg": "double", + "av_land": "double", +} +schema = ", ".join(f"{key}: {val}" for key, val in schema.items()) +spark_df = spark.createDataFrame( + data[["stage_name", "av_tot", "av_bldg", "av_land"]], schema=schema +) + + +# %% +def aggregate(key, pdf): + columns = ["av_tot", "av_bldg", "av_land"] + + out = () + for column in columns: + out += ( + pdf[column].min(), + q10(pdf[column]), + q25(pdf[column]), + pdf[column].median(), + q75(pdf[column]), + q90(pdf[column]), + pdf[column].max(), + pdf[column].mean(), + pdf[column].sum(), + ) + + return pd.DataFrame([key + out]) + + 
+# %% +spark_df.groupby("stage_name").applyInPandas( + aggregate, + schema="stage_name string, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double", +).show() +# %% From f05683512f58d7e10f9be9d7d5c3e13b0f28e796 Mon Sep 17 00:00:00 2001 From: William Ridgeway Date: Tue, 17 Jun 2025 14:50:59 -0500 Subject: [PATCH 70/96] Temp changes --- test.py | 113 ++++++++++++++++++++++++++------------------------------ 1 file changed, 52 insertions(+), 61 deletions(-) diff --git a/test.py b/test.py index 47f092988..70cbaf22f 100644 --- a/test.py +++ b/test.py @@ -24,59 +24,6 @@ .config("spark.driver.bindAddress", "127.0.0.1") .getOrCreate() ) -# %% -schema = { - "stage_name": "string", - "class": "string", - "av_tot": "double", - "av_bldg": "double", - "av_land": "double", - "county": "string", - "triad": "string", - "township": "string", - "nbhd": "string", - "tax_code": "string", - "zip_code": "string", - "community_area": "string", - "census_place": "string", - "census_tract": "string", - "census_congressional_district": "string", - "census_zcta": "string", - "cook_board_of_review_district": "string", - "cook_commissioner_district": "string", - "cook_judicial_district": "string", - "ward_num": "string", - "police_district": "string", - "school_elementary_district": "string", - "school_secondary_district": "string", - "school_unified_district": "string", - "tax_municipality": "string", - "tax_park_district": "string", - "tax_library_district": "string", - "tax_fire_protection_district": "string", 
- "tax_community_college_district": "string", - "tax_sanitation_district": "string", - "tax_special_service_area": "string", - "tax_tif_district": "string", - "central_business_district": "string", - "census_data_year": "string", - "cook_board_of_review_district_data_year": "string", - "cook_commissioner_district_data_year": "string", - "cook_judicial_district_data_year": "string", - "ward_data_year": "string", - "community_area_data_year": "string", - "police_district_data_year": "string", - "central_business_district_data_year": "string", - "school_data_year": "string", - "tax_data_year": "string", - "no_group": "string", - "major_class": "string", - "modeling_group": "string", - "res_other": "string", - "year": "string", -} -schema = ", ".join(f"{key}: {val}" for key, val in schema.items()) -spark_df = spark.createDataFrame(data, schema=schema) # %% @@ -128,16 +75,58 @@ def first(x): } # %% +groups = { + "res_other": "string", + "major_class": "string", + "no_group": "string", + "class": "string", + "modeling_group": "string", +} + +geographies = { + "county": "string", + "triad": "string", + "township": "string", + "nbhd": "string", + "tax_code": "string", + "zip_code": "string", + "community_area": "string", + "census_place": "string", + "census_tract": "string", + "census_congressional_district": "string", + "census_zcta": "string", + "cook_board_of_review_district": "string", + "cook_commissioner_district": "string", + "cook_judicial_district": "string", + "ward_num": "string", + "police_district": "string", + "school_elementary_district": "string", + "school_secondary_district": "string", + "school_unified_district": "string", + "tax_municipality": "string", + "tax_park_district": "string", + "tax_library_district": "string", + "tax_fire_protection_district": "string", + "tax_community_college_district": "string", + "tax_sanitation_district": "string", + "tax_special_service_area": "string", + "tax_tif_district": "string", + "central_business_district": 
"string", +} + schema = { + "year": "string", "stage_name": "string", "av_tot": "double", "av_bldg": "double", "av_land": "double", } -schema = ", ".join(f"{key}: {val}" for key, val in schema.items()) -spark_df = spark.createDataFrame( - data[["stage_name", "av_tot", "av_bldg", "av_land"]], schema=schema -) + +schema = schema | groups | geographies + +cols = list(schema.keys()) +schema = ", ".join(f"{key} {val}" for key, val in schema.items()) +spark_df = spark.createDataFrame(data[cols], schema=schema) # %% @@ -162,8 +151,10 @@ def aggregate(key, pdf): # %% -spark_df.groupby("stage_name").applyInPandas( - aggregate, - schema="stage_name string, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double", -).show() +for group in list(groups.keys()): + for geography in list(geographies.keys()): + spark_df.groupby(["stage_name", group, geography]).applyInPandas( + aggregate, + schema="stage_name string, group_id string, geography string, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double", + ).show() # %% From 
744ddd656d45a248f33afada4dc0a404e78c201f Mon Sep 17 00:00:00 2001 From: William Ridgeway Date: Wed, 18 Jun 2025 14:39:58 -0500 Subject: [PATCH 71/96] Everything but delta cols --- test.py | 100 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 33 deletions(-) diff --git a/test.py b/test.py index 70cbaf22f..2b9025654 100644 --- a/test.py +++ b/test.py @@ -6,6 +6,7 @@ from pyathena import connect from pyathena.pandas.cursor import PandasCursor from pyspark.sql import SparkSession +from pyspark.sql.functions import lit # Connect to Athena cursor = connect( @@ -83,37 +84,39 @@ def first(x): "modeling_group": "string", } -geographies = { - "county": "string", - "triad": "string", - "township": "string", - "nbhd": "string", - "tax_code": "string", - "zip_code": "string", - "community_area": "string", - "census_place": "string", - "census_tract": "string", - "census_congressional_district": "string", - "census_zcta": "string", - "cook_board_of_review_district": "string", - "cook_commissioner_district": "string", - "cook_judicial_district": "string", - "ward_num": "string", - "police_district": "string", - "school_elementary_district": "string", - "school_secondary_district": "string", - "school_unified_district": "string", - "tax_municipality": "string", - "tax_park_district": "string", - "tax_library_district": "string", - "tax_fire_protection_district": "string", - "tax_community_college_district": "string", - "tax_sanitation_district": "string", - "tax_special_service_area": "string", - "tax_tif_district": "string", - "central_business_district": "string", +years = { + "county": "year", + "triad": "year", + "township": "year", + "nbhd": "year", + "tax_code": "year", + "zip_code": "year", + "community_area": "community_area_data_year", + "census_place": "census_data_year", + "census_tract": "census_data_year", + "census_congressional_district": "census_data_year", + "census_zcta": "census_data_year", + 
"cook_board_of_review_district": "cook_board_of_review_district_data_year", + "cook_commissioner_district": "cook_commissioner_district_data_year", + "cook_judicial_district": "cook_judicial_district_data_year", + "ward_num": "ward_data_year", + "police_district": "police_district_data_year", + "school_elementary_district": "school_data_year", + "school_secondary_district": "school_data_year", + "school_unified_district": "school_data_year", + "tax_municipality": "tax_data_year", + "tax_park_district": "tax_data_year", + "tax_library_district": "tax_data_yearg", + "tax_fire_protection_district": "tax_data_year", + "tax_community_college_district": "tax_data_year", + "tax_sanitation_district": "tax_data_year", + "tax_special_service_area": "tax_data_year", + "tax_tif_district": "tax_data_year", + "central_business_district": "central_business_district_data_year", } +geographies = dict.fromkeys(list(years.keys()), "string") + schema = { "year": "string", "stage_name": "string", @@ -130,10 +133,35 @@ def first(x): # %% +def reassessment_year(year, geography, triad): + if geography in ["triad", "township", "nbhd"]: + year = int(year) % 3 + + if ( + ((year == 0) & (triad == "North")) + | ((year == 1) & (triad == "South")) + | ((year == 2) & (triad == "City")) + ): + out = "Yes" + else: + out = "No" + else: + out = "" + + return out + + def aggregate(key, pdf): columns = ["av_tot", "av_bldg", "av_land"] out = () + out += ( + reassessment_year(pdf["year"][0], geography, pdf["triad"][0]), + first(pdf[years[geography]]), + len(pdf["av_tot"]), + pdf["av_tot"].count(), + pdf["av_tot"].count() / len(pdf["av_tot"]), + ) for column in columns: out += ( pdf[column].min(), @@ -151,10 +179,16 @@ def aggregate(key, pdf): # %% -for group in list(groups.keys()): - for geography in list(geographies.keys()): - spark_df.groupby(["stage_name", group, geography]).applyInPandas( +for group in [list(groups.keys())[1]]: + for geography in [list(geographies.keys())[1]]: + spark_df.groupby( + 
["stage_name", group, geography, "year"] + ).applyInPandas( aggregate, - schema="stage_name string, group_id string, geography string, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double", + schema="stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double", + ).select( + "*", + lit(group).alias("group_type"), + lit(geography).alias("geography_type"), ).show() # %% From ae79bc3d5afb7d663b783c0c75393a1a39c5448e Mon Sep 17 00:00:00 2001 From: William Ridgeway Date: Wed, 18 Jun 2025 14:43:35 -0500 Subject: [PATCH 72/96] Remove vestigial objects --- test.py | 110 +++++++++++++++++++++++--------------------------------- 1 file changed, 44 insertions(+), 66 deletions(-) diff --git a/test.py b/test.py index 2b9025654..7c2199f8f 100644 --- a/test.py +++ b/test.py @@ -55,25 +55,50 @@ def first(x): return output -more_stats = [ - "min", - 
q10, - q25, - "median", - q75, - q90, - "max", - "mean", - "sum", -] - -stats = { - "av_tot": ["size", "count"] + more_stats, - "av_bldg": more_stats, - "av_land": more_stats, - "triad": [first], - "geography_data_year": [first], -} +def reassessment_year(year, geography, triad): + if geography in ["triad", "township", "nbhd"]: + year = int(year) % 3 + + if ( + ((year == 0) & (triad == "North")) + | ((year == 1) & (triad == "South")) + | ((year == 2) & (triad == "City")) + ): + out = "Yes" + else: + out = "No" + else: + out = "" + + return out + + +def aggregate(key, pdf): + columns = ["av_tot", "av_bldg", "av_land"] + + out = () + out += ( + reassessment_year(pdf["year"][0], geography, pdf["triad"][0]), + first(pdf[years[geography]]), + len(pdf["av_tot"]), + pdf["av_tot"].count(), + pdf["av_tot"].count() / len(pdf["av_tot"]), + ) + for column in columns: + out += ( + pdf[column].min(), + q10(pdf[column]), + q25(pdf[column]), + pdf[column].median(), + q75(pdf[column]), + q90(pdf[column]), + pdf[column].max(), + pdf[column].mean(), + pdf[column].sum(), + ) + + return pd.DataFrame([key + out]) + # %% groups = { @@ -131,53 +156,6 @@ def first(x): schema = ", ".join(f"{key} {val}" for key, val in schema.items()) spark_df = spark.createDataFrame(data[cols], schema=schema) - -# %% -def reassessment_year(year, geography, triad): - if geography in ["triad", "township", "nbhd"]: - year = int(year) % 3 - - if ( - ((year == 0) & (triad == "North")) - | ((year == 1) & (triad == "South")) - | ((year == 2) & (triad == "City")) - ): - out = "Yes" - else: - out = "No" - else: - out = "" - - return out - - -def aggregate(key, pdf): - columns = ["av_tot", "av_bldg", "av_land"] - - out = () - out += ( - reassessment_year(pdf["year"][0], geography, pdf["triad"][0]), - first(pdf[years[geography]]), - len(pdf["av_tot"]), - pdf["av_tot"].count(), - pdf["av_tot"].count() / len(pdf["av_tot"]), - ) - for column in columns: - out += ( - pdf[column].min(), - q10(pdf[column]), - 
q25(pdf[column]), - pdf[column].median(), - q75(pdf[column]), - q90(pdf[column]), - pdf[column].max(), - pdf[column].mean(), - pdf[column].sum(), - ) - - return pd.DataFrame([key + out]) - - # %% for group in [list(groups.keys())[1]]: for geography in [list(geographies.keys())[1]]: From 080d5b79265b5a9d12fb82048106eaefd01b767c Mon Sep 17 00:00:00 2001 From: William Ridgeway Date: Wed, 18 Jun 2025 15:40:10 -0500 Subject: [PATCH 73/96] Simplify schema creation --- test.py | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/test.py b/test.py index 7c2199f8f..e7cf99d08 100644 --- a/test.py +++ b/test.py @@ -101,13 +101,13 @@ def aggregate(key, pdf): # %% -groups = { - "res_other": "string", - "major_class": "string", - "no_group": "string", - "class": "string", - "modeling_group": "string", -} +groups = [ + "res_other", + "major_class", + "no_group", + "class", + "modeling_group", +] years = { "county": "year", @@ -131,7 +131,7 @@ def aggregate(key, pdf): "school_unified_district": "school_data_year", "tax_municipality": "tax_data_year", "tax_park_district": "tax_data_year", - "tax_library_district": "tax_data_yearg", + "tax_library_district": "tax_data_year", "tax_fire_protection_district": "tax_data_year", "tax_community_college_district": "tax_data_year", "tax_sanitation_district": "tax_data_year", @@ -140,25 +140,17 @@ def aggregate(key, pdf): "central_business_district": "central_business_district_data_year", } -geographies = dict.fromkeys(list(years.keys()), "string") - -schema = { - "year": "string", - "stage_name": "string", - "av_tot": "double", - "av_bldg": "double", - "av_land": "double", -} +geographies = list(years.keys()) -schema = schema | groups | geographies - -cols = list(schema.keys()) +schema = dict.fromkeys(data.columns, "string") +schema |= dict.fromkeys(["av_tot", "av_bldg", "av_land"], "double") schema = ", ".join(f"{key} {val}" for key, val in schema.items()) -spark_df = 
spark.createDataFrame(data[cols], schema=schema) + +spark_df = spark.createDataFrame(data, schema=schema) # %% -for group in [list(groups.keys())[1]]: - for geography in [list(geographies.keys())[1]]: +for group in groups: + for geography in geographies: spark_df.groupby( ["stage_name", group, geography, "year"] ).applyInPandas( From de27eb8776304bc9cb091a96ae76a031951332ac Mon Sep 17 00:00:00 2001 From: William Ridgeway Date: Wed, 18 Jun 2025 16:16:50 -0500 Subject: [PATCH 74/96] Aggregate spark dfs --- test.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/test.py b/test.py index e7cf99d08..085ac71ca 100644 --- a/test.py +++ b/test.py @@ -2,10 +2,12 @@ # number of geographies, class combinations, and time. # %% # Import libraries +from functools import reduce + import pandas as pd from pyathena import connect from pyathena.pandas.cursor import PandasCursor -from pyspark.sql import SparkSession +from pyspark.sql import DataFrame, SparkSession from pyspark.sql.functions import lit # Connect to Athena @@ -82,7 +84,7 @@ def aggregate(key, pdf): first(pdf[years[geography]]), len(pdf["av_tot"]), pdf["av_tot"].count(), - pdf["av_tot"].count() / len(pdf["av_tot"]), + pdf["av_tot"].count() / pdf["av_tot"].size, ) for column in columns: out += ( @@ -148,17 +150,24 @@ def aggregate(key, pdf): spark_df = spark.createDataFrame(data, schema=schema) +output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, 
q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double" + # %% +output = [] for group in groups: for geography in geographies: - spark_df.groupby( - ["stage_name", group, geography, "year"] - ).applyInPandas( - aggregate, - schema="stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double", - ).select( - "*", - lit(group).alias("group_type"), - lit(geography).alias("geography_type"), - ).show() + output += [ + spark_df.groupby(["stage_name", group, geography, "year"]) + .applyInPandas( + aggregate, + schema=output_schema, + ) + .select( + "*", + lit(group).alias("group_type"), + lit(geography).alias("geography_type"), + ) + ] + +outputs = reduce(DataFrame.unionByName, output) # %% From 214ec7aab765cd3f2b110ae0754bb26261a20c83 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 18 Jun 2025 21:25:54 +0000 Subject: [PATCH 75/96] Remove temp limit on ass roll table --- dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 2 -- 1 file changed, 2 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index 89724635c..1cab79d90 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ 
b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -102,5 +102,3 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals AND uni.stage_name = vals.stage_name LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code --- Temporary limit on feeder table to avoid GitHub runner memory issues. -WHERE uni.class = '278' AND uni.year IN ('2019', '2020', '2021') From 26f00e30b0b1ec448f0cd72d94fe38e95a9a4ca6 Mon Sep 17 00:00:00 2001 From: William Ridgeway Date: Wed, 18 Jun 2025 17:02:42 -0500 Subject: [PATCH 76/96] Try table build with spark --- .../models/reporting/reporting.spark_test.py | 88 ++++++++----------- 1 file changed, 38 insertions(+), 50 deletions(-) rename test.py => dbt/models/reporting/reporting.spark_test.py (73%) diff --git a/test.py b/dbt/models/reporting/reporting.spark_test.py similarity index 73% rename from test.py rename to dbt/models/reporting/reporting.spark_test.py index 085ac71ca..0a143ca78 100644 --- a/test.py +++ b/dbt/models/reporting/reporting.spark_test.py @@ -1,35 +1,14 @@ # This script generates aggregated summary stats on assessed values across a # number of geographies, class combinations, and time. -# %% + # Import libraries from functools import reduce import pandas as pd -from pyathena import connect -from pyathena.pandas.cursor import PandasCursor -from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import DataFrame from pyspark.sql.functions import lit -# Connect to Athena -cursor = connect( - s3_staging_dir="s3://ccao-athena-results-us-east-1/", - region_name="us-east-1", - cursor_class=PandasCursor, -).cursor(unload=True) - -data = cursor.execute( - "select * from z_ci_387_reporting_sot_reporting.sot_assessment_roll_input" -).as_pandas() - -spark = ( - SparkSession.builder.appName("SparkByExamples.com") - .master("local[*]") - .config("spark.driver.bindAddress", "127.0.0.1") - .getOrCreate() -) - -# %% # Define aggregation functions. 
These are just wrappers for basic python # functions that make using them easier to use with pandas.agg(). def q10(x): @@ -80,8 +59,8 @@ def aggregate(key, pdf): out = () out += ( - reassessment_year(pdf["year"][0], geography, pdf["triad"][0]), - first(pdf[years[geography]]), + reassessment_year(pdf["year"][0], geography, pdf["triad"][0]), # noqa: F821 + first(pdf[years[geography]]), # noqa: F821 len(pdf["av_tot"]), pdf["av_tot"].count(), pdf["av_tot"].count() / pdf["av_tot"].size, @@ -102,7 +81,6 @@ def aggregate(key, pdf): return pd.DataFrame([key + out]) -# %% groups = [ "res_other", "major_class", @@ -144,30 +122,40 @@ def aggregate(key, pdf): geographies = list(years.keys()) -schema = dict.fromkeys(data.columns, "string") -schema |= dict.fromkeys(["av_tot", "av_bldg", "av_land"], "double") -schema = ", ".join(f"{key} {val}" for key, val in schema.items()) +output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double" -spark_df = spark.createDataFrame(data, schema=schema) -output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot 
double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double" +def model(dbt, spark_session): + dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40}) + athena_user_logger.info("Loading assessment roll input table") + + input = dbt.ref("reporting.sot_assessment_roll_input") + spark_schema = dict.fromkeys(input.columns, "string") + spark_schema |= dict.fromkeys(["av_tot", "av_bldg", "av_land"], "double") + spark_schema = ", ".join( + f"{key} {val}" for key, val in spark_schema.items() + ) -# %% -output = [] -for group in groups: - for geography in geographies: - output += [ - spark_df.groupby(["stage_name", group, geography, "year"]) - .applyInPandas( - aggregate, - schema=output_schema, - ) - .select( - "*", - lit(group).alias("group_type"), - lit(geography).alias("geography_type"), - ) - ] - -outputs = reduce(DataFrame.unionByName, output) -# %% + spark_df = spark_session.createDataFrame(input, schema=spark_schema) + + athena_user_logger.info("Dope stuff is happening... 
maybe?") + + output = [] + for group in groups: + for geography in geographies: + output += [ + spark_df.groupby(["stage_name", group, geography, "year"]) + .applyInPandas( + aggregate, + schema=output_schema, + ) + .select( + "*", + lit(group).alias("group_type"), + lit(geography).alias("geography_type"), + ) + ] + + df = reduce(DataFrame.unionByName, output) + + return df From 2509e404c78f7e07fa3b6fa1cca15d0622b2eb65 Mon Sep 17 00:00:00 2001 From: sweatyhandshake Date: Mon, 23 Jun 2025 10:31:52 -0500 Subject: [PATCH 77/96] Remove old table, rerun build to gen error log --- .../reporting.sot_assessment_roll.py | 425 +++++------------- dbt/models/reporting/reporting.spark_test.py | 161 ------- 2 files changed, 111 insertions(+), 475 deletions(-) delete mode 100644 dbt/models/reporting/reporting.spark_test.py diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 6eda0abf5..0a143ca78 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -2,10 +2,11 @@ # number of geographies, class combinations, and time. # Import libraries -import pandas as pd +from functools import reduce -# Declare class groupings -groups = ["no_group", "class", "major_class", "modeling_group", "res_other"] +import pandas as pd +from pyspark.sql import DataFrame +from pyspark.sql.functions import lit # Define aggregation functions. 
These are just wrappers for basic python @@ -35,330 +36,126 @@ def first(x): return output -more_stats = [ - "min", - q10, - q25, - "median", - q75, - q90, - "max", - "mean", - "sum", -] - -stats = { - "av_tot": ["size", "count"] + more_stats, - "av_bldg": more_stats, - "av_land": more_stats, - "triad": [first], - "geography_data_year": [first], -} - - -def aggregrate(data, geography_type, group_type, stats): - """ - Function to group a dataframe by whichever geography and group types it is - passed and output aggregate stats for that grouping. - """ - - print(geography_type, group_type) - - group = [geography_type, group_type, "year", "stage_name"] - summary = data.groupby(group).agg(stats).round(2) - summary["geography_type"] = geography_type - summary["group_type"] = group_type - summary.index.names = ["geography_id", "group_id", "year", "stage_name"] - summary = summary.reset_index().set_index( - [ - "geography_type", - "geography_id", - "group_type", - "group_id", - "year", - "stage_name", - ] - ) - - return summary - - -def assemble(df, geos, groups): - """ - Function that loops over predefined geography and class groups and passes - them to the aggregate function. Returns stacked output from the aggregate - function. 
- """ - - # Create an empty dataframe to fill with output - output = pd.DataFrame() - - # Loop through group combinations and stack output - for key, value in geos.items(): - df["geography_data_year"] = df[key] - - for x in value: - for z in groups: - output = pd.concat([output, aggregrate(df, x, z, stats=stats)]) - - # Flatten multi-index - output.columns = ["_".join(col) for col in output.columns] - output = output.reset_index() - output = output.rename(columns={"triad_first": "triad"}) - - # Create additional stat columns post-aggregation - output["av_tot_pct_w_value"] = ( - output["av_tot_count"] / output["av_tot_size"] - ) - - output = output.sort_values("year") - - diff_cols = [ - "geography_id", - "group_id", - "stage_name", - "av_tot_median", - "av_tot_mean", - "av_tot_sum", - "av_bldg_median", - "av_bldg_mean", - "av_bldg_sum", - "av_land_median", - "av_land_mean", - "av_land_sum", - ] - - output[ - [ - "av_tot_delta_median", - "av_tot_delta_mean", - "av_tot_delta_sum", - "av_bldg_delta_median", - "av_bldg_delta_mean", - "av_bldg_delta_sum", - "av_land_delta_median", - "av_land_delta_mean", - "av_land_delta_sum", - ] - ] = ( - output[diff_cols] - .groupby(["geography_id", "group_id", "stage_name"]) - .diff() - ) +def reassessment_year(year, geography, triad): + if geography in ["triad", "township", "nbhd"]: + year = int(year) % 3 - output[ - [ - "av_tot_delta_pct_median", - "av_tot_delta_pct_mean", - "av_tot_delta_pct_sum", - "av_bldg_delta_pct_median", - "av_bldg_delta_pct_mean", - "av_bldg_delta_pct_sum", - "av_land_delta_pct_median", - "av_land_delta_pct_mean", - "av_land_delta_pct_sum", - ] - ] = ( - output[diff_cols] - .groupby(["geography_id", "group_id", "stage_name"]) - .pct_change() - ) - - output["year"] = output["year"].astype(int) - output["triennial"] = output["geography_type"].isin( - ["triad", "township", "nbhd"] - ) - - # Reassessment year is constructed as a string rather than a boolean to - # avoid PySpark errors with nullable booleans 
that can likely be resolved. - output["reassessment_year"] = "" - output.loc[ - (output["triennial"] == True), "reassessment_year" # noqa: E712 - ] = "No" - output.loc[ - ((output["year"] % 3 == 0) & (output["triad"] == "North")) - | ((output["year"] % 3 == 1) & (output["triad"] == "South")) - | ((output["year"] % 3 == 2) & (output["triad"] == "City")) - & (output["triennial"] == True), # noqa: E712 - "reassessment_year", - ] = "Yes" - output = output.drop(["triennial", "triad"], axis=1) - - output = clean_names(output) + if ( + ((year == 0) & (triad == "North")) + | ((year == 1) & (triad == "South")) + | ((year == 2) & (triad == "City")) + ): + out = "Yes" + else: + out = "No" + else: + out = "" - return output + return out -def clean_names(x): - """ - Function to rename and reorder columns. - """ +def aggregate(key, pdf): + columns = ["av_tot", "av_bldg", "av_land"] - output = x.rename( - columns={ - "av_tot_size": "pin_n_tot", - "av_tot_count": "pin_n_w_value", - "av_tot_pct_w_value": "pin_pct_w_value", - "geography_data_year_first": "geography_data_year", - } + out = () + out += ( + reassessment_year(pdf["year"][0], geography, pdf["triad"][0]), # noqa: F821 + first(pdf[years[geography]]), # noqa: F821 + len(pdf["av_tot"]), + pdf["av_tot"].count(), + pdf["av_tot"].count() / pdf["av_tot"].size, ) + for column in columns: + out += ( + pdf[column].min(), + q10(pdf[column]), + q25(pdf[column]), + pdf[column].median(), + q75(pdf[column]), + q90(pdf[column]), + pdf[column].max(), + pdf[column].mean(), + pdf[column].sum(), + ) + + return pd.DataFrame([key + out]) + + +groups = [ + "res_other", + "major_class", + "no_group", + "class", + "modeling_group", +] - output = output[ - [ - "geography_type", - "geography_id", - "geography_data_year", - "group_type", - "group_id", - "year", - "reassessment_year", - "stage_name", - "pin_n_tot", - "pin_n_w_value", - "pin_pct_w_value", - "av_tot_min", - "av_tot_q10", - "av_tot_q25", - "av_tot_median", - "av_tot_q75", - 
"av_tot_q90", - "av_tot_max", - "av_tot_mean", - "av_tot_sum", - "av_tot_delta_median", - "av_tot_delta_mean", - "av_tot_delta_sum", - "av_tot_delta_pct_median", - "av_tot_delta_pct_mean", - "av_tot_delta_pct_sum", - "av_bldg_min", - "av_bldg_q10", - "av_bldg_q25", - "av_bldg_median", - "av_bldg_q75", - "av_bldg_q90", - "av_bldg_max", - "av_bldg_mean", - "av_bldg_sum", - "av_bldg_delta_median", - "av_bldg_delta_mean", - "av_bldg_delta_sum", - "av_bldg_delta_pct_median", - "av_bldg_delta_pct_mean", - "av_bldg_delta_pct_sum", - "av_land_min", - "av_land_q10", - "av_land_q25", - "av_land_median", - "av_land_q75", - "av_land_q90", - "av_land_max", - "av_land_mean", - "av_land_sum", - "av_land_delta_median", - "av_land_delta_mean", - "av_land_delta_sum", - "av_land_delta_pct_median", - "av_land_delta_pct_mean", - "av_land_delta_pct_sum", - ] - ] - - return output - - -def ingest_geos(geos): - """ - Function to convert dbt seed into a dictionary that can be iterated over. - """ +years = { + "county": "year", + "triad": "year", + "township": "year", + "nbhd": "year", + "tax_code": "year", + "zip_code": "year", + "community_area": "community_area_data_year", + "census_place": "census_data_year", + "census_tract": "census_data_year", + "census_congressional_district": "census_data_year", + "census_zcta": "census_data_year", + "cook_board_of_review_district": "cook_board_of_review_district_data_year", + "cook_commissioner_district": "cook_commissioner_district_data_year", + "cook_judicial_district": "cook_judicial_district_data_year", + "ward_num": "ward_data_year", + "police_district": "police_district_data_year", + "school_elementary_district": "school_data_year", + "school_secondary_district": "school_data_year", + "school_unified_district": "school_data_year", + "tax_municipality": "tax_data_year", + "tax_park_district": "tax_data_year", + "tax_library_district": "tax_data_year", + "tax_fire_protection_district": "tax_data_year", + "tax_community_college_district": 
"tax_data_year", + "tax_sanitation_district": "tax_data_year", + "tax_special_service_area": "tax_data_year", + "tax_tif_district": "tax_data_year", + "central_business_district": "central_business_district_data_year", +} - geos = geos.toPandas() - output = { - k: list(geos[k].unique()[pd.notnull(geos[k].unique())]) - for k in geos.columns - } +geographies = list(years.keys()) - return output +output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double" def model(dbt, spark_session): - """ - Function to build a dbt python model using PySpark. 
- """ - dbt.config(materialized="table") - - # Ingest geographies and their associated data years - geos = ingest_geos(dbt.ref("reporting.sot_data_years")) + dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40}) + athena_user_logger.info("Loading assessment roll input table") input = dbt.ref("reporting.sot_assessment_roll_input") - - # Convert the Spark input dataframe to Pandas for - # compatibility with assesspy functions - input = input.toPandas() - - df = assemble(input, geos=geos, groups=groups) - # %% - schema = { - "geography_type": "string", - "geography_id": "string", - "geography_data_year": "string", - "group_type": "string", - "group_id": "string", - "year": "string", - "reassessment_year": "string", - "stage_name": "string", - "pin_n_tot": "bigint", - "pin_n_w_value": "bigint", - "pin_pct_w_value": "double", - "av_tot_min": "double", - "av_tot_q10": "double", - "av_tot_q25": "double", - "av_tot_median": "double", - "av_tot_q75": "double", - "av_tot_q90": "double", - "av_tot_max": "double", - "av_tot_mean": "double", - "av_tot_sum": "double", - "av_tot_delta_median": "double", - "av_tot_delta_mean": "double", - "av_tot_delta_sum": "double", - "av_tot_delta_pct_median": "double", - "av_tot_delta_pct_mean": "double", - "av_tot_delta_pct_sum": "double", - "av_bldg_min": "double", - "av_bldg_q10": "double", - "av_bldg_q25": "double", - "av_bldg_median": "double", - "av_bldg_q75": "double", - "av_bldg_q90": "double", - "av_bldg_max": "double", - "av_bldg_mean": "double", - "av_bldg_sum": "double", - "av_bldg_delta_median": "double", - "av_bldg_delta_mean": "double", - "av_bldg_delta_sum": "double", - "av_bldg_delta_pct_median": "double", - "av_bldg_delta_pct_mean": "double", - "av_bldg_delta_pct_sum": "double", - "av_land_min": "double", - "av_land_q10": "double", - "av_land_q25": "double", - "av_land_median": "double", - "av_land_q75": "double", - "av_land_q90": "double", - "av_land_max": "double", - "av_land_mean": "double", - 
"av_land_sum": "double", - "av_land_delta_median": "double", - "av_land_delta_mean": "double", - "av_land_delta_sum": "double", - "av_land_delta_pct_median": "double", - "av_land_delta_pct_mean": "double", - "av_land_delta_pct_sum": "double", - } - # %% - spark_df = spark_session.createDataFrame( - df, schema=", ".join(f"{key}: {val}" for key, val in schema.items()) + spark_schema = dict.fromkeys(input.columns, "string") + spark_schema |= dict.fromkeys(["av_tot", "av_bldg", "av_land"], "double") + spark_schema = ", ".join( + f"{key} {val}" for key, val in spark_schema.items() ) - return spark_df + spark_df = spark_session.createDataFrame(input, schema=spark_schema) + + athena_user_logger.info("Dope stuff is happening... maybe?") + + output = [] + for group in groups: + for geography in geographies: + output += [ + spark_df.groupby(["stage_name", group, geography, "year"]) + .applyInPandas( + aggregate, + schema=output_schema, + ) + .select( + "*", + lit(group).alias("group_type"), + lit(geography).alias("geography_type"), + ) + ] + + df = reduce(DataFrame.unionByName, output) + + return df diff --git a/dbt/models/reporting/reporting.spark_test.py b/dbt/models/reporting/reporting.spark_test.py deleted file mode 100644 index 0a143ca78..000000000 --- a/dbt/models/reporting/reporting.spark_test.py +++ /dev/null @@ -1,161 +0,0 @@ -# This script generates aggregated summary stats on assessed values across a -# number of geographies, class combinations, and time. - -# Import libraries -from functools import reduce - -import pandas as pd -from pyspark.sql import DataFrame -from pyspark.sql.functions import lit - - -# Define aggregation functions. These are just wrappers for basic python -# functions that make using them easier to use with pandas.agg(). 
-def q10(x): - return x.quantile(0.1) - - -def q25(x): - return x.quantile(0.25) - - -def q75(x): - return x.quantile(0.75) - - -def q90(x): - return x.quantile(0.9) - - -def first(x): - if len(x) >= 1: - output = x.iloc[0] - else: - output = None - - return output - - -def reassessment_year(year, geography, triad): - if geography in ["triad", "township", "nbhd"]: - year = int(year) % 3 - - if ( - ((year == 0) & (triad == "North")) - | ((year == 1) & (triad == "South")) - | ((year == 2) & (triad == "City")) - ): - out = "Yes" - else: - out = "No" - else: - out = "" - - return out - - -def aggregate(key, pdf): - columns = ["av_tot", "av_bldg", "av_land"] - - out = () - out += ( - reassessment_year(pdf["year"][0], geography, pdf["triad"][0]), # noqa: F821 - first(pdf[years[geography]]), # noqa: F821 - len(pdf["av_tot"]), - pdf["av_tot"].count(), - pdf["av_tot"].count() / pdf["av_tot"].size, - ) - for column in columns: - out += ( - pdf[column].min(), - q10(pdf[column]), - q25(pdf[column]), - pdf[column].median(), - q75(pdf[column]), - q90(pdf[column]), - pdf[column].max(), - pdf[column].mean(), - pdf[column].sum(), - ) - - return pd.DataFrame([key + out]) - - -groups = [ - "res_other", - "major_class", - "no_group", - "class", - "modeling_group", -] - -years = { - "county": "year", - "triad": "year", - "township": "year", - "nbhd": "year", - "tax_code": "year", - "zip_code": "year", - "community_area": "community_area_data_year", - "census_place": "census_data_year", - "census_tract": "census_data_year", - "census_congressional_district": "census_data_year", - "census_zcta": "census_data_year", - "cook_board_of_review_district": "cook_board_of_review_district_data_year", - "cook_commissioner_district": "cook_commissioner_district_data_year", - "cook_judicial_district": "cook_judicial_district_data_year", - "ward_num": "ward_data_year", - "police_district": "police_district_data_year", - "school_elementary_district": "school_data_year", - "school_secondary_district": 
"school_data_year", - "school_unified_district": "school_data_year", - "tax_municipality": "tax_data_year", - "tax_park_district": "tax_data_year", - "tax_library_district": "tax_data_year", - "tax_fire_protection_district": "tax_data_year", - "tax_community_college_district": "tax_data_year", - "tax_sanitation_district": "tax_data_year", - "tax_special_service_area": "tax_data_year", - "tax_tif_district": "tax_data_year", - "central_business_district": "central_business_district_data_year", -} - -geographies = list(years.keys()) - -output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double" - - -def model(dbt, spark_session): - dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40}) - athena_user_logger.info("Loading assessment roll input table") - - input = dbt.ref("reporting.sot_assessment_roll_input") - spark_schema = dict.fromkeys(input.columns, "string") - spark_schema |= dict.fromkeys(["av_tot", "av_bldg", "av_land"], "double") - spark_schema = ", ".join( - f"{key} {val}" for key, val in spark_schema.items() - ) - - spark_df = spark_session.createDataFrame(input, schema=spark_schema) - - athena_user_logger.info("Dope stuff is happening... 
maybe?") - - output = [] - for group in groups: - for geography in geographies: - output += [ - spark_df.groupby(["stage_name", group, geography, "year"]) - .applyInPandas( - aggregate, - schema=output_schema, - ) - .select( - "*", - lit(group).alias("group_type"), - lit(geography).alias("geography_type"), - ) - ] - - df = reduce(DataFrame.unionByName, output) - - return df From 23c6fb8e326e6c7bc829ebe1484b710a51273955 Mon Sep 17 00:00:00 2001 From: sweatyhandshake Date: Mon, 23 Jun 2025 11:20:47 -0500 Subject: [PATCH 78/96] Debugging input pyspark dataframe --- dbt/models/reporting/reporting.sot_assessment_roll.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 0a143ca78..fa1e0a8b2 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -130,21 +130,14 @@ def model(dbt, spark_session): athena_user_logger.info("Loading assessment roll input table") input = dbt.ref("reporting.sot_assessment_roll_input") - spark_schema = dict.fromkeys(input.columns, "string") - spark_schema |= dict.fromkeys(["av_tot", "av_bldg", "av_land"], "double") - spark_schema = ", ".join( - f"{key} {val}" for key, val in spark_schema.items() - ) - - spark_df = spark_session.createDataFrame(input, schema=spark_schema) - + athena_user_logger.info("Dope stuff is happening... 
maybe?") output = [] for group in groups: for geography in geographies: output += [ - spark_df.groupby(["stage_name", group, geography, "year"]) + input.groupby(["stage_name", group, geography, "year"]) .applyInPandas( aggregate, schema=output_schema, From 763915e55659f8bc2b3bd2c41730d4b7bd8b68b9 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Mon, 23 Jun 2025 20:43:54 +0000 Subject: [PATCH 79/96] Pass geography to aggregate --- .../reporting.sot_assessment_roll.py | 51 ++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index fa1e0a8b2..0b3d9902a 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -54,31 +54,34 @@ def reassessment_year(year, geography, triad): return out -def aggregate(key, pdf): - columns = ["av_tot", "av_bldg", "av_land"] - - out = () - out += ( - reassessment_year(pdf["year"][0], geography, pdf["triad"][0]), # noqa: F821 - first(pdf[years[geography]]), # noqa: F821 - len(pdf["av_tot"]), - pdf["av_tot"].count(), - pdf["av_tot"].count() / pdf["av_tot"].size, - ) - for column in columns: +def aggregate_geography(geography): + def aggregate(key, pdf): + columns = ["av_tot", "av_bldg", "av_land"] + + out = () out += ( - pdf[column].min(), - q10(pdf[column]), - q25(pdf[column]), - pdf[column].median(), - q75(pdf[column]), - q90(pdf[column]), - pdf[column].max(), - pdf[column].mean(), - pdf[column].sum(), + reassessment_year(pdf["year"][0], geography, pdf["triad"][0]), # noqa: F821 + first(pdf[years[geography]]), # noqa: F821 + len(pdf["av_tot"]), + pdf["av_tot"].count(), + pdf["av_tot"].count() / pdf["av_tot"].size, ) + for column in columns: + out += ( + pdf[column].min(), + q10(pdf[column]), + q25(pdf[column]), + pdf[column].median(), + q75(pdf[column]), + q90(pdf[column]), + pdf[column].max(), + pdf[column].mean(), + 
pdf[column].sum(), + ) + + return pd.DataFrame([key + out]) - return pd.DataFrame([key + out]) + return aggregate groups = [ @@ -130,7 +133,7 @@ def model(dbt, spark_session): athena_user_logger.info("Loading assessment roll input table") input = dbt.ref("reporting.sot_assessment_roll_input") - + athena_user_logger.info("Dope stuff is happening... maybe?") output = [] @@ -139,7 +142,7 @@ def model(dbt, spark_session): output += [ input.groupby(["stage_name", group, geography, "year"]) .applyInPandas( - aggregate, + aggregate_geography(geography), schema=output_schema, ) .select( From 29c15a1117596a3bac9f47c7012468e67cdbfd43 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 24 Jun 2025 15:32:50 +0000 Subject: [PATCH 80/96] Reduce input size to test runner memory limits --- dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index 1cab79d90..b987c2d24 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -102,3 +102,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals AND uni.stage_name = vals.stage_name LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code +WHERE uni.year >= '2021' From b99dd241123c49589fb78f6ea7df22dc2476b835 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 24 Jun 2025 16:24:31 +0000 Subject: [PATCH 81/96] Really reduce input size --- dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index b987c2d24..cb1c8d819 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ 
b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -102,4 +102,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals AND uni.stage_name = vals.stage_name LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code -WHERE uni.year >= '2021' +WHERE uni.year = '2025' From 76a7df5349091c4b055033e4eae929505baf0d8f Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 24 Jun 2025 18:00:29 +0000 Subject: [PATCH 82/96] Further reduce input size --- dbt/models/reporting/reporting.sot_assessment_roll.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 0b3d9902a..1923955c3 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -133,6 +133,7 @@ def model(dbt, spark_session): athena_user_logger.info("Loading assessment roll input table") input = dbt.ref("reporting.sot_assessment_roll_input") + input = input.filter(input.stage_name == "BOR CERTIFIED") athena_user_logger.info("Dope stuff is happening... 
maybe?") From 8fa0e4ea6e394d48ce6b46224a0a3c6b43330512 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 24 Jun 2025 18:12:47 +0000 Subject: [PATCH 83/96] Try a really small input --- dbt/models/reporting/reporting.sot_assessment_roll.py | 1 - dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 1923955c3..0b3d9902a 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -133,7 +133,6 @@ def model(dbt, spark_session): athena_user_logger.info("Loading assessment roll input table") input = dbt.ref("reporting.sot_assessment_roll_input") - input = input.filter(input.stage_name == "BOR CERTIFIED") athena_user_logger.info("Dope stuff is happening... maybe?") diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index cb1c8d819..00f7c7afa 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -102,4 +102,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals AND uni.stage_name = vals.stage_name LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code -WHERE uni.year = '2025' +LIMIT 100000 From 8f1ee19742c18ccd74380483dbb492fc035d67fd Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 25 Jun 2025 16:24:43 +0000 Subject: [PATCH 84/96] Change int type for pyarrow --- dbt/models/reporting/reporting.sot_assessment_roll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 0b3d9902a..fc8738516 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ 
b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -125,7 +125,7 @@ def aggregate(key, pdf): geographies = list(years.keys()) -output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double" +output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot bigint, pin_n_w_value bigint, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double" def model(dbt, spark_session): From b8bdf39e507c6e969ffc457324581f8c38301908 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 25 Jun 2025 18:44:24 +0000 Subject: [PATCH 85/96] Try coercing expected string columns --- .../reporting/reporting.sot_assessment_roll.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git 
a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index fc8738516..5c16a9698 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -6,7 +6,7 @@ import pandas as pd from pyspark.sql import DataFrame -from pyspark.sql.functions import lit +from pyspark.sql.functions import col, lit # Define aggregation functions. These are just wrappers for basic python @@ -60,8 +60,8 @@ def aggregate(key, pdf): out = () out += ( - reassessment_year(pdf["year"][0], geography, pdf["triad"][0]), # noqa: F821 - first(pdf[years[geography]]), # noqa: F821 + reassessment_year(pdf["year"][0], geography, pdf["triad"][0]), + first(pdf[years[geography]]), len(pdf["av_tot"]), pdf["av_tot"].count(), pdf["av_tot"].count() / pdf["av_tot"].size, @@ -154,4 +154,16 @@ def model(dbt, spark_session): df = reduce(DataFrame.unionByName, output) + for column in [ + "stage_name", + "group_id", + "geography_id", + "year", + "reassessment_year", + "geography_data_year", + "group_type", + "geography_type", + ]: + df = df.withColumn(column, col(column).cast("string")) + return df From f33a2e4c3fda473d2cb0853e8177f5d27fddbd57 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 26 Jun 2025 14:28:04 +0000 Subject: [PATCH 86/96] Remove string coercion for output table --- .../reporting/reporting.sot_assessment_roll.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 5c16a9698..8ee95a106 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -6,7 +6,7 @@ import pandas as pd from pyspark.sql import DataFrame -from pyspark.sql.functions import col, lit +from pyspark.sql.functions import lit # Define aggregation functions.
These are just wrappers for basic python @@ -154,16 +154,4 @@ def model(dbt, spark_session): df = reduce(DataFrame.unionByName, output) - for column in [ - "stage_name", - "group_id", - "geography_id", - "year", - "reassessment_year", - "geography_data_year", - "group_type", - "geography_type", - ]: - df = df.withColumn(column, col(column).cast("string")) - return df From 2264e1d29ad1fe4eb0070f09bbe47a1538d40cc7 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 26 Jun 2025 15:50:21 +0000 Subject: [PATCH 87/96] Try to increase max driver result for spark session --- dbt/models/reporting/reporting.sot_assessment_roll.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 8ee95a106..bb1de2514 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -130,6 +130,7 @@ def aggregate(key, pdf): def model(dbt, spark_session): dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40}) + spark_session.builder.config("spark.driver.maxResultSize", "0") athena_user_logger.info("Loading assessment roll input table") input = dbt.ref("reporting.sot_assessment_roll_input") From dfb7d1db6e543feb9552a45332b4232191dfbf45 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 26 Jun 2025 16:56:27 +0000 Subject: [PATCH 88/96] Change spark driver config access --- dbt/models/reporting/reporting.sot_assessment_roll.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index bb1de2514..cdf1a80a1 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -8,6 +8,8 @@ from pyspark.sql import DataFrame from pyspark.sql.functions import lit +spark.driver.maxResultSize = 0 # noqa:F821 + # 
Define aggregation functions. These are just wrappers for basic python # functions that make using them easier to use with pandas.agg(). @@ -130,7 +132,6 @@ def aggregate(key, pdf): def model(dbt, spark_session): dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40}) - spark_session.builder.config("spark.driver.maxResultSize", "0") athena_user_logger.info("Loading assessment roll input table") input = dbt.ref("reporting.sot_assessment_roll_input") From 2323aeaea10f77a87f27530be419bb8fd6f69c87 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 26 Jun 2025 17:05:25 +0000 Subject: [PATCH 89/96] One more driver attempt --- dbt/models/reporting/reporting.sot_assessment_roll.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index cdf1a80a1..f27042390 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -8,8 +8,6 @@ from pyspark.sql import DataFrame from pyspark.sql.functions import lit -spark.driver.maxResultSize = 0 # noqa:F821 - # Define aggregation functions. These are just wrappers for basic python # functions that make using them easier to use with pandas.agg(). 
@@ -132,6 +130,7 @@ def aggregate(key, pdf): def model(dbt, spark_session): dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40}) + spark_session.driver.maxResultSize = 0 athena_user_logger.info("Loading assessment roll input table") input = dbt.ref("reporting.sot_assessment_roll_input") From 8cfc71394bfb67d03a4e037111bc70aff0784f43 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 26 Jun 2025 19:20:29 +0000 Subject: [PATCH 90/96] Try new engine config --- dbt/models/reporting/reporting.sot_assessment_roll.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index f27042390..8fe07f776 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -129,8 +129,14 @@ def aggregate(key, pdf): def model(dbt, spark_session): - dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40}) - spark_session.driver.maxResultSize = 0 + dbt.config( + materialized="table", + engine_config={ + "MaxConcurrentDpus": 40, + "SparkProperties": {"spark.driver.maxResultSize": "4g"}, + }, + ) + athena_user_logger.info("Loading assessment roll input table") input = dbt.ref("reporting.sot_assessment_roll_input") From db21c18914acc8d93be54a4c84d73143da5e116e Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 26 Jun 2025 21:05:44 +0000 Subject: [PATCH 91/96] Test smaller amount of collection --- dbt/models/reporting/reporting.sot_assessment_roll.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 8fe07f776..b8cf2e4f8 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -124,6 +124,9 @@ def aggregate(key, pdf): } geographies = 
list(years.keys()) +geographies = [ + geographies[0] +] # For testing purposes, only use the first geography output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot bigint, pin_n_w_value bigint, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double" @@ -144,6 +147,7 @@ def model(dbt, spark_session): athena_user_logger.info("Dope stuff is happening... maybe?") output = [] + for group in groups: for geography in geographies: output += [ @@ -159,6 +163,6 @@ def model(dbt, spark_session): ) ] - df = reduce(DataFrame.unionByName, output) + df = reduce(DataFrame.unionByName, output) return df From e6681fedb68967d080eb4ab511b572d4dd70f245 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 26 Jun 2025 21:08:27 +0000 Subject: [PATCH 92/96] Remove config without permission --- dbt/models/reporting/reporting.sot_assessment_roll.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index b8cf2e4f8..1eed9ae74 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -136,7 +136,6 @@ def model(dbt, spark_session): materialized="table", engine_config={ "MaxConcurrentDpus": 40, - "SparkProperties": {"spark.driver.maxResultSize": "4g"}, }, ) From 34fa863379cb2af47ff31732ffe54cdf33a1e388 Mon Sep 17 
00:00:00 2001 From: Sweaty Handshake Date: Fri, 27 Jun 2025 16:03:56 +0000 Subject: [PATCH 93/96] Test using entire input --- dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index 00f7c7afa..1cab79d90 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -102,4 +102,3 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals AND uni.stage_name = vals.stage_name LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code -LIMIT 100000 From 9b5d48f045f6c474629c90d789ae95c0aba1f592 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Fri, 27 Jun 2025 19:18:47 +0000 Subject: [PATCH 94/96] Revert for now --- dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index 1cab79d90..00f7c7afa 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -102,3 +102,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals AND uni.stage_name = vals.stage_name LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code +LIMIT 100000 From e4aeb0e32da9dc582b62f6aba460dd1417ce871c Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Sat, 28 Jun 2025 18:23:13 +0000 Subject: [PATCH 95/96] Remove limit again for testing --- dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql index 00f7c7afa..1cab79d90 100644 --- 
a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql +++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql @@ -102,4 +102,3 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals AND uni.stage_name = vals.stage_name LEFT JOIN {{ ref('ccao.class_dict') }} ON uni.class = class_dict.class_code -LIMIT 100000 From e1460669c146db89c4c3ea28fcde4a96ad1628f4 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Sat, 28 Jun 2025 20:36:26 +0000 Subject: [PATCH 96/96] Attempt to collect more often --- dbt/models/reporting/reporting.sot_assessment_roll.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py index 1eed9ae74..f1331d847 100644 --- a/dbt/models/reporting/reporting.sot_assessment_roll.py +++ b/dbt/models/reporting/reporting.sot_assessment_roll.py @@ -2,10 +2,8 @@ # number of geographies, class combinations, and time. # Import libraries -from functools import reduce import pandas as pd -from pyspark.sql import DataFrame from pyspark.sql.functions import lit @@ -146,7 +144,6 @@ def model(dbt, spark_session): athena_user_logger.info("Dope stuff is happening... maybe?") output = [] - for group in groups: for geography in geographies: output += [ @@ -160,8 +157,9 @@ def model(dbt, spark_session): lit(group).alias("group_type"), lit(geography).alias("geography_type"), ) + .toPandas() ] - df = reduce(DataFrame.unionByName, output) + df = pd.concat(output) return df