From d780d76a53b3f47a9184d43060b6bdc60169d3ff Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 6 Jun 2024 17:59:40 +0000
Subject: [PATCH 01/96] First draft of sales script

---
 .gitignore                                    |   2 +
 .../reporting.sales_comprehensive.py          | 148 ++++++++++++++++++
 dbt/models/reporting/sales_comprehensive.sql  |  93 +++++++++++
 3 files changed, 243 insertions(+)
 create mode 100644 dbt/models/reporting/reporting.sales_comprehensive.py
 create mode 100644 dbt/models/reporting/sales_comprehensive.sql

diff --git a/.gitignore b/.gitignore
index 396c5dfcf..92ca547c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@
 .Rproj.user
 .Ruserdata
 package*.json
+settings.json
 
 # Directories
 logs/
@@ -17,3 +18,4 @@ venv/
 # Ignore most CSVs, except those that are used as dbt seeds
 *.csv
 !dbt/seeds/**/*.csv
+*.parquet.gzip
diff --git a/dbt/models/reporting/reporting.sales_comprehensive.py b/dbt/models/reporting/reporting.sales_comprehensive.py
new file mode 100644
index 000000000..6431a0753
--- /dev/null
+++ b/dbt/models/reporting/reporting.sales_comprehensive.py
@@ -0,0 +1,148 @@
+# This script generates aggregated summary stats on sales data across a number
+# of geographies, class combinations, and time.
+
+import os.path
+import statistics as stats
+
+# Import libraries
+import awswrangler as wr
+import numpy as np
+import pandas as pd
+
+# Ingest data if it is not already available
+if os.path.isfile("sales.parquet.gzip"):
+    df = pd.read_parquet("sales.parquet.gzip")
+
+else:
+    sql = open("sales_comprehensive.sql").read()
+    df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False)
+    df.to_parquet("sales.parquet.gzip", compression="gzip")
+
+# Declare geographic groups and their associated data years
+geos = {
+    "census_data_year": [
+        "census_place",
+        "census_tract",
+        "census_congressional_district",
+        "census_zcta",
+    ],
+    "cook_board_of_review_district_data_year": [
+        "cook_board_of_review_district"
+    ],
+    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
+    "cook_judicial_district_data_year": ["cook_judicial_district"],
+    "ward_data_year": ["ward_num"],
+    "community_area_data_year": ["community_area"],
+    "police_district_data_year": ["police_district"],
+    "central_business_district_data_year": ["central_business_district"],
+    "school_data_year": [
+        "school_elementary_district",
+        "school_secondary_district",
+        "school_unified_district",
+    ],
+    "tax_data_year": [
+        "tax_municipality",
+        "tax_park_district",
+        "tax_library_district",
+        "tax_fire_protection_district",
+        "tax_community_college_district",
+        "tax_sanitation_district",
+        "tax_special_service_area",
+        "tax_tif_district",
+    ],
+}
+# Declare class groupings
+groups = ["no_group", "class", "major_class", "modeling_group"]
+
+
+# Define aggregation functions
+def q10(x):
+    return x.quantile(0.1)
+
+
+def q25(x):
+    return x.quantile(0.25)
+
+
+def q75(x):
+    return x.quantile(0.75)
+
+
+def q90(x):
+    return x.quantile(0.9)
+
+
+def first(x):
+    return x.iloc[0]
+
+
+agg_func_math = {
+    "sale_price": [
+        "size",
+        "count",
+        "min",
+        q10,
+        q25,
+        "median",
+        q75,
+        q90,
+        "max",
+        "mean",
+        "sum",
+    ],
+    "price_per_sf": [
+        "min",
+        q10,
+        q25,
+        "median",
+        q75,
+        q90,
+        "max",
+        "mean",
+        "sum",
+    ],
+    "char_bldg_sf": ["median"],
+    "char_land_sf": ["median"],
+    "char_yrblt": ["median"],
+    "class": [stats.multimode],
+    "data_year": [first],
+}
+
+# Create an empty dataframe to fill with output
+output = pd.DataFrame()
+
+# Loop through group combinations and stack output
+for x in np.concatenate(list(geos.values())):
+    for y in geos.keys():
+        if x in geos[y]:
+            df["data_year"] = df[y]
+
+    for z in groups:
+        group = [x, z, "year"]
+        summary = df.groupby(group).agg(agg_func_math).round(2)
+        summary["geography_type"] = x
+        summary["group_type"] = z
+        summary.index.names = ["geography_id", "group_id", "year"]
+        summary = summary.reset_index().set_index(
+            [
+                "geography_type",
+                "geography_id",
+                "group_type",
+                "group_id",
+                "year",
+            ]
+        )
+
+        output = pd.concat([output, summary])
+
+# Clean combined output and export
+output["sale_price", "sum"] = output["sale_price", "sum"].replace(0, np.NaN)
+output["price_per_sf", "sum"] = output["price_per_sf", "sum"].replace(
+    0, np.NaN
+)
+
+for i in ["median", "mean", "sum"]:
+    output["sale_price", "delta" + i] = output["sale_price", i].diff()
+    output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff()
+
+output.to_csv("output.csv")
diff --git a/dbt/models/reporting/sales_comprehensive.sql b/dbt/models/reporting/sales_comprehensive.sql
new file mode 100644
index 000000000..c2f565739
--- /dev/null
+++ b/dbt/models/reporting/sales_comprehensive.sql
@@ -0,0 +1,93 @@
+-- Gather parcel-level building and land square footage and yrblt
+WITH sf AS (
+    SELECT
+        pin,
+        year,
+        SUM(char_bldg_sf) AS char_bldg_sf,
+        SUM(char_land_sf) AS char_land_sf,
+        ARBITRARY(char_yrblt) AS char_yrblt
+    FROM default.vw_card_res_char
+    GROUP BY pin, year
+)
+
+-- Gather parcel-level geographies and join land, sales, and class groupings
+SELECT
+    sales.doc_no,
+    sales.sale_price,
+    CASE WHEN sf.char_bldg_sf > 0
+            THEN
+            CAST(sales.sale_price / sf.char_bldg_sf AS DOUBLE)
+    END AS price_per_sf,
+    CAST(sf.char_bldg_sf AS INT) AS char_bldg_sf,
+    CAST(sf.char_land_sf AS INT) AS char_land_sf,
+    CAST(sf.char_yrblt AS INT) AS char_yrblt,
+    CAST(hist.oneyr_pri_mailed_bldg AS DOUBLE) AS oneyr_pri_mailed_bldg,
+    CAST(hist.oneyr_pri_mailed_land AS DOUBLE) AS oneyr_pri_mailed_land,
+    CAST(hist.oneyr_pri_mailed_tot AS DOUBLE) AS oneyr_pri_mailed_tot,
+    uni.year,
+    uni.class,
+    'Cook' AS county,
+    uni.triad_name AS triad,
+    uni.township_name AS township,
+    uni.nbhd_code AS nbhd,
+    uni.tax_code,
+    uni.zip_code,
+    uni.chicago_community_area_name AS community_area,
+    uni.census_place_geoid AS census_place,
+    uni.census_tract_geoid AS census_tract,
+    uni.census_congressional_district_geoid
+        AS
+        census_congressional_district,
+    uni.census_zcta_geoid AS census_zcta,
+    uni.cook_board_of_review_district_num AS cook_board_of_review_district,
+    uni.cook_commissioner_district_num AS cook_commissioner_district,
+    uni.cook_judicial_district_num AS cook_judicial_district,
+    uni.ward_num,
+    uni.chicago_police_district_num AS police_district,
+    uni.school_elementary_district_geoid AS school_elementary_district,
+    uni.school_secondary_district_geoid AS school_secondary_district,
+    uni.school_unified_district_geoid AS school_unified_district,
+    uni.tax_municipality_name AS tax_municipality,
+    uni.tax_park_district_name AS tax_park_district,
+    uni.tax_library_district_name AS tax_library_district,
+    uni.tax_fire_protection_district_name AS tax_fire_protection_district,
+    uni.tax_community_college_district_name
+        AS
+        tax_community_college_district,
+    uni.tax_sanitation_district_name AS tax_sanitation_district,
+    uni.tax_special_service_area_name AS tax_special_service_area,
+    uni.tax_tif_district_name AS tax_tif_district,
+    uni.econ_central_business_district_num AS central_business_district,
+    uni.census_data_year,
+    uni.cook_board_of_review_district_data_year,
+    uni.cook_commissioner_district_data_year,
+    uni.cook_judicial_district_data_year,
+    COALESCE(
+        uni.ward_chicago_data_year, uni.ward_evanston_data_year) AS
+    ward_data_year,
+    uni.chicago_community_area_data_year AS community_area_data_year,
+    uni.chicago_police_district_data_year AS police_district_data_year,
+    uni.econ_central_business_district_data_year
+        AS
+        central_business_district_data_year,
+    uni.school_data_year,
+    uni.tax_data_year,
+    'no_group' AS no_group,
+    class_dict.major_class_type AS major_class,
+    class_dict.modeling_group
+FROM vw_pin_universe AS uni
+LEFT JOIN sf
+    ON uni.pin = sf.pin
+    AND uni.year = sf.year
+LEFT JOIN ccao.class_dict
+    ON uni.class = class_dict.class_code
+LEFT JOIN default.vw_pin_history AS hist
+    ON uni.pin = hist.pin
+    AND uni.year = hist.year
+LEFT JOIN vw_pin_sale AS sales
+    ON uni.pin = sales.pin
+    AND uni.year = sales.year
+    AND NOT sales.is_multisale
+    AND NOT sales.sale_filter_deed_type
+    AND NOT sales.sale_filter_less_than_10k
+    AND NOT sales.sale_filter_same_sale_within_365

From 00909fd2d203c1cb1445afdb57056320ac64d41e Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 6 Jun 2024 19:28:54 +0000
Subject: [PATCH 02/96] File renaming

---
 ...g.sales_comprehensive.py => reporting.sot_sales.py} | 10 +++++-----
 ...sales_comprehensive.sql => reporting.sot_sales.sql} |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)
 rename dbt/models/reporting/{reporting.sales_comprehensive.py => reporting.sot_sales.py} (93%)
 rename dbt/models/reporting/{sales_comprehensive.sql => reporting.sot_sales.sql} (97%)

diff --git a/dbt/models/reporting/reporting.sales_comprehensive.py b/dbt/models/reporting/reporting.sot_sales.py
similarity index 93%
rename from dbt/models/reporting/reporting.sales_comprehensive.py
rename to dbt/models/reporting/reporting.sot_sales.py
index 6431a0753..5bc192ed6 100644
--- a/dbt/models/reporting/reporting.sales_comprehensive.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -10,13 +10,13 @@
 import pandas as pd
 
 # Ingest data if it is not already available
-if os.path.isfile("sales.parquet.gzip"):
-    df = pd.read_parquet("sales.parquet.gzip")
+if os.path.isfile("sot_sales.parquet.gzip"):
+    df = pd.read_parquet("sot_sales.parquet.gzip")
 
 else:
-    sql = open("sales_comprehensive.sql").read()
+    sql = open("sot_sales.sql").read()
     df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False)
-    df.to_parquet("sales.parquet.gzip", compression="gzip")
+    df.to_parquet("sot_sales.parquet.gzip", compression="gzip")
 
 # Declare geographic groups and their associated data years
 geos = {
@@ -145,4 +145,4 @@ def first(x):
     output["sale_price", "delta" + i] = output["sale_price", i].diff()
     output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff()
 
-output.to_csv("output.csv")
+output.to_csv("sot_sales.csv")
diff --git a/dbt/models/reporting/sales_comprehensive.sql b/dbt/models/reporting/reporting.sot_sales.sql
similarity index 97%
rename from dbt/models/reporting/sales_comprehensive.sql
rename to dbt/models/reporting/reporting.sot_sales.sql
index c2f565739..2be8e4dc5 100644
--- a/dbt/models/reporting/sales_comprehensive.sql
+++ b/dbt/models/reporting/reporting.sot_sales.sql
@@ -75,7 +75,7 @@ SELECT
     'no_group' AS no_group,
     class_dict.major_class_type AS major_class,
     class_dict.modeling_group
-FROM vw_pin_universe AS uni
+FROM default.vw_pin_universe AS uni
 LEFT JOIN sf
     ON uni.pin = sf.pin
     AND uni.year = sf.year
@@ -84,7 +84,7 @@ LEFT JOIN ccao.class_dict
 LEFT JOIN default.vw_pin_history AS hist
     ON uni.pin = hist.pin
     AND uni.year = hist.year
-LEFT JOIN vw_pin_sale AS sales
+LEFT JOIN default.vw_pin_sale AS sales
     ON uni.pin = sales.pin
     AND uni.year = sales.year
     AND NOT sales.is_multisale

From 2ac598244e25e6fbb99854a40cca136a5ff59492 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 6 Jun 2024 20:42:06 +0000
Subject: [PATCH 03/96] Cleaner for loop

---
 dbt/models/reporting/reporting.sot_sales.py | 45 ++++++++++-----------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index 5bc192ed6..3f262e2d2 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -14,7 +14,7 @@
     df = pd.read_parquet("sot_sales.parquet.gzip")
 
 else:
-    sql = open("sot_sales.sql").read()
+    sql = open("reporting.sot_sales.sql").read()
     df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False)
     df.to_parquet("sot_sales.parquet.gzip", compression="gzip")
 
@@ -112,28 +112,27 @@ def first(x):
 output = pd.DataFrame()
 
 # Loop through group combinations and stack output
-for x in np.concatenate(list(geos.values())):
-    for y in geos.keys():
-        if x in geos[y]:
-            df["data_year"] = df[y]
-
-    for z in groups:
-        group = [x, z, "year"]
-        summary = df.groupby(group).agg(agg_func_math).round(2)
-        summary["geography_type"] = x
-        summary["group_type"] = z
-        summary.index.names = ["geography_id", "group_id", "year"]
-        summary = summary.reset_index().set_index(
-            [
-                "geography_type",
-                "geography_id",
-                "group_type",
-                "group_id",
-                "year",
-            ]
-        )
-
-        output = pd.concat([output, summary])
+for key, value in geos.items():
+    df["data_year"] = df[key]
+
+    for x in value:
+        for z in groups:
+            group = [x, z, "year"]
+            summary = df.groupby(group).agg(agg_func_math).round(2)
+            summary["geography_type"] = x
+            summary["group_type"] = z
+            summary.index.names = ["geography_id", "group_id", "year"]
+            summary = summary.reset_index().set_index(
+                [
+                    "geography_type",
+                    "geography_id",
+                    "group_type",
+                    "group_id",
+                    "year",
+                ]
+            )
+
+            output = pd.concat([output, summary])
 
 # Clean combined output and export
 output["sale_price", "sum"] = output["sale_price", "sum"].replace(0, np.NaN)

From 2107d2a33892c8229cffd77c1762774794f80e8d Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 11 Jun 2024 17:34:41 +0000
Subject: [PATCH 04/96] First draft taxes and exemptions table

---
 dbt/models/reporting/reporting.sot_sales.py   |  38 ++---
 .../reporting.sot_taxes_exemptions.py         | 141 ++++++++++++++++++
 .../reporting.sot_taxes_exemptions.sql        |  93 ++++++++++++
 3 files changed, 248 insertions(+), 24 deletions(-)
 create mode 100644 dbt/models/reporting/reporting.sot_taxes_exemptions.py
 create mode 100644 dbt/models/reporting/reporting.sot_taxes_exemptions.sql

diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index 3f262e2d2..120a0bd5b 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -76,31 +76,21 @@ def first(x):
     return x.iloc[0]
 
 
+more_stats = [
+    "min",
+    q10,
+    q25,
+    "median",
+    q75,
+    q90,
+    "max",
+    "mean",
+    "sum",
+]
+
 agg_func_math = {
-    "sale_price": [
-        "size",
-        "count",
-        "min",
-        q10,
-        q25,
-        "median",
-        q75,
-        q90,
-        "max",
-        "mean",
-        "sum",
-    ],
-    "price_per_sf": [
-        "min",
-        q10,
-        q25,
-        "median",
-        q75,
-        q90,
-        "max",
-        "mean",
-        "sum",
-    ],
+    "sale_price": ["size", "count"] + more_stats,
+    "price_per_sf": more_stats,
     "char_bldg_sf": ["median"],
     "char_land_sf": ["median"],
     "char_yrblt": ["median"],
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
new file mode 100644
index 000000000..7e236b820
--- /dev/null
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -0,0 +1,141 @@
+# This script generates aggregated summary stats on taxes and exemptions data
+# across a number of geographies, class combinations, and time.
+# %%
+import os.path
+
+# Import libraries
+import awswrangler as wr
+import pandas as pd
+
+# Ingest data if it is not already available
+if os.path.isfile("sot_taxes_exemptions.parquet.gzip"):
+    df = pd.read_parquet("sot_taxes_exemptions.parquet.gzip")
+
+else:
+    sql = open("reporting.sot_taxes_exemptions.sql").read()
+    df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False)
+    df.to_parquet("sot_taxes_exemptions.parquet.gzip", compression="gzip")
+
+# Declare geographic groups and their associated data years
+geos = {
+    "census_data_year": [
+        "census_place",
+        "census_tract",
+        "census_congressional_district",
+        "census_zcta",
+    ],
+    "cook_board_of_review_district_data_year": [
+        "cook_board_of_review_district"
+    ],
+    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
+    "cook_judicial_district_data_year": ["cook_judicial_district"],
+    "ward_data_year": ["ward_num"],
+    "community_area_data_year": ["community_area"],
+    "police_district_data_year": ["police_district"],
+    "central_business_district_data_year": ["central_business_district"],
+    "school_data_year": [
+        "school_elementary_district",
+        "school_secondary_district",
+        "school_unified_district",
+    ],
+    "tax_data_year": [
+        "tax_municipality",
+        "tax_park_district",
+        "tax_library_district",
+        "tax_fire_protection_district",
+        "tax_community_college_district",
+        "tax_sanitation_district",
+        "tax_special_service_area",
+        "tax_tif_district",
+    ],
+}
+# Declare class groupings
+groups = ["no_group", "class", "major_class", "modeling_group"]
+
+
+# %%
+# Define aggregation functions
+def q10(x):
+    return x.quantile(0.1)
+
+
+def q25(x):
+    return x.quantile(0.25)
+
+
+def q75(x):
+    return x.quantile(0.75)
+
+
+def q90(x):
+    return x.quantile(0.9)
+
+
+def first(x):
+    return x.iloc[0]
+
+
+more_stats = [
+    "min",
+    q10,
+    q25,
+    "median",
+    q75,
+    q90,
+    "max",
+    "mean",
+    "sum",
+]
+
+less_stats = ["count", "sum"]
+
+agg_func_math = {
+    "eq_factor_final": ["size", first],
+    "eq_factor_tentative": [first],
+    "tax_bill_total": more_stats,
+    "tax_code_rate": more_stats,
+    "av_clerk": more_stats,
+    "exe_homeowner": less_stats,
+    "exe_senior": less_stats,
+    "exe_freeze": less_stats,
+    "exe_longtime_homeowner": less_stats,
+    "exe_disabled": less_stats,
+    "exe_vet_returning": less_stats,
+    "exe_vet_dis_lt50": less_stats,
+    "exe_vet_dis_50_69": less_stats,
+    "exe_vet_dis_ge70": less_stats,
+    "exe_abate": less_stats,
+}
+
+# Create an empty dataframe to fill with output
+output = pd.DataFrame()
+# %%
+# Loop through group combinations and stack output
+for key, value in geos.items():
+    df["data_year"] = df[key]
+
+    for x in value:
+        for z in groups:
+            group = [x, z, "year"]
+            summary = df.groupby(group).agg(agg_func_math).round(2)
+            summary["geography_type"] = x
+            summary["group_type"] = z
+            summary.index.names = ["geography_id", "group_id", "year"]
+            summary = summary.reset_index().set_index(
+                [
+                    "geography_type",
+                    "geography_id",
+                    "group_type",
+                    "group_id",
+                    "year",
+                ]
+            )
+
+            output = pd.concat([output, summary])
+
+# Clean combined output and export
+for i in ["median", "mean", "sum"]:
+    output["tax_bill_total", "delta" + i] = output["tax_bill_total", i].diff()
+
+output.to_csv("sot_taxes_exemptions.csv")
+# %%
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions.sql
new file mode 100644
index 000000000..f96bdc5d4
--- /dev/null
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.sql
@@ -0,0 +1,93 @@
+-- Gather parcel-level geographies and join taxes, exemptions, and class
+-- groupings
+SELECT
+    tax.year,
+    tax.av_clerk,
+    tax.tax_bill_total,
+    CASE WHEN tax.exe_homeowner = 0 THEN NULL ELSE tax.exe_homeowner END
+        AS exe_homeowner,
+    CASE WHEN tax.exe_senior = 0 THEN NULL ELSE tax.exe_senior END
+        AS exe_senior,
+    CASE WHEN tax.exe_freeze = 0 THEN NULL ELSE tax.exe_freeze END
+        AS exe_freeze,
+    CASE
+        WHEN tax.exe_longtime_homeowner = 0 THEN NULL ELSE
+            tax.exe_longtime_homeowner
+    END AS exe_longtime_homeowner,
+    CASE WHEN tax.exe_disabled = 0 THEN NULL ELSE tax.exe_disabled END
+        AS exe_disabled,
+    CASE
+        WHEN tax.exe_vet_returning = 0 THEN NULL ELSE tax.exe_vet_returning
+    END AS exe_vet_returning,
+    CASE WHEN tax.exe_vet_dis_lt50 = 0 THEN NULL ELSE tax.exe_vet_dis_lt50 END
+        AS exe_vet_dis_lt50,
+    CASE
+        WHEN tax.exe_vet_dis_50_69 = 0 THEN NULL ELSE tax.exe_vet_dis_50_69
+    END AS exe_vet_dis_50_69,
+    CASE WHEN tax.exe_vet_dis_ge70 = 0 THEN NULL ELSE tax.exe_vet_dis_ge70 END
+        AS exe_vet_dis_ge70,
+    CASE WHEN tax.exe_abate = 0 THEN NULL ELSE tax.exe_abate END AS exe_abate,
+    tcd.tax_code_rate,
+    eqf.eq_factor_tentative,
+    eqf.eq_factor_final,
+    uni.class,
+    'Cook' AS county,
+    uni.triad_name AS triad,
+    uni.township_name AS township,
+    uni.nbhd_code AS nbhd,
+    uni.tax_code,
+    uni.zip_code,
+    uni.chicago_community_area_name AS community_area,
+    uni.census_place_geoid AS census_place,
+    uni.census_tract_geoid AS census_tract,
+    uni.census_congressional_district_geoid
+        AS
+        census_congressional_district,
+    uni.census_zcta_geoid AS census_zcta,
+    uni.cook_board_of_review_district_num AS cook_board_of_review_district,
+    uni.cook_commissioner_district_num AS cook_commissioner_district,
+    uni.cook_judicial_district_num AS cook_judicial_district,
+    uni.ward_num,
+    uni.chicago_police_district_num AS police_district,
+    uni.school_elementary_district_geoid AS school_elementary_district,
+    uni.school_secondary_district_geoid AS school_secondary_district,
+    uni.school_unified_district_geoid AS school_unified_district,
+    uni.tax_municipality_name AS tax_municipality,
+    uni.tax_park_district_name AS tax_park_district,
+    uni.tax_library_district_name AS tax_library_district,
+    uni.tax_fire_protection_district_name AS tax_fire_protection_district,
+    uni.tax_community_college_district_name
+        AS
+        tax_community_college_district,
+    uni.tax_sanitation_district_name AS tax_sanitation_district,
+    uni.tax_special_service_area_name AS tax_special_service_area,
+    uni.tax_tif_district_name AS tax_tif_district,
+    uni.econ_central_business_district_num AS central_business_district,
+    uni.census_data_year,
+    uni.cook_board_of_review_district_data_year,
+    uni.cook_commissioner_district_data_year,
+    uni.cook_judicial_district_data_year,
+    COALESCE(
+        uni.ward_chicago_data_year, uni.ward_evanston_data_year) AS
+    ward_data_year,
+    uni.chicago_community_area_data_year AS community_area_data_year,
+    uni.chicago_police_district_data_year AS police_district_data_year,
+    uni.econ_central_business_district_data_year
+        AS
+        central_business_district_data_year,
+    uni.school_data_year,
+    uni.tax_data_year,
+    'no_group' AS no_group,
+    class_dict.major_class_type AS major_class,
+    class_dict.modeling_group
+FROM default.vw_pin_universe AS uni
+INNER JOIN tax.pin AS tax
+    ON uni.pin = tax.pin
+    AND uni.year = tax.year
+INNER JOIN tax.eq_factor AS eqf
+    ON uni.year = eqf.year
+INNER JOIN tax.tax_code AS tcd
+    ON tax.tax_code_num = tcd.tax_code_num
+    AND tax.year = tcd.year
+INNER JOIN ccao.class_dict
+    ON uni.class = class_dict.class_code

From c56aaafbd3761f46fa5673fd8c841ab6a46a4dec Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 12 Jun 2024 22:07:26 +0000
Subject: [PATCH 05/96] Wrap assessment_roll

---
 .../reporting.sot_assessment_roll.py          | 136 ++++++++++++++++++
 .../reporting.sot_assessment_roll.sql         |  89 ++++++++++++
 .../reporting.sot_taxes_exemptions.py         |   4 -
 3 files changed, 225 insertions(+), 4 deletions(-)
 create mode 100644 dbt/models/reporting/reporting.sot_assessment_roll.py
 create mode 100644 dbt/models/reporting/reporting.sot_assessment_roll.sql

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
new file mode 100644
index 000000000..65dc3229b
--- /dev/null
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -0,0 +1,136 @@
+# This script generates aggregated summary stats on assessment roll values
+# across a number of geographies, class combinations, and time.
+
+import os.path
+
+# Import libraries
+import awswrangler as wr
+import pandas as pd
+
+# Ingest data if it is not already available
+if os.path.isfile("sot_assessment_roll.parquet.gzip"):
+    df = pd.read_parquet("sot_assessment_roll.parquet.gzip")
+
+else:
+    sql = open("reporting.sot_assessment_roll.sql").read()
+    df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False)
+    df.to_parquet("sot_assessment_roll.parquet.gzip", compression="gzip")
+
+# Declare geographic groups and their associated data years
+geos = {
+    "census_data_year": [
+        "census_place",
+        "census_tract",
+        "census_congressional_district",
+        "census_zcta",
+    ],
+    "cook_board_of_review_district_data_year": [
+        "cook_board_of_review_district"
+    ],
+    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
+    "cook_judicial_district_data_year": ["cook_judicial_district"],
+    "ward_data_year": ["ward_num"],
+    "community_area_data_year": ["community_area"],
+    "police_district_data_year": ["police_district"],
+    "central_business_district_data_year": ["central_business_district"],
+    "school_data_year": [
+        "school_elementary_district",
+        "school_secondary_district",
+        "school_unified_district",
+    ],
+    "tax_data_year": [
+        "tax_municipality",
+        "tax_park_district",
+        "tax_library_district",
+        "tax_fire_protection_district",
+        "tax_community_college_district",
+        "tax_sanitation_district",
+        "tax_special_service_area",
+        "tax_tif_district",
+    ],
+}
+# Declare class groupings
+groups = ["no_group", "class", "major_class", "modeling_group", "stage_name"]
+
+
+# Define aggregation functions
+def q10(x):
+    return x.quantile(0.1)
+
+
+def q25(x):
+    return x.quantile(0.25)
+
+
+def q75(x):
+    return x.quantile(0.75)
+
+
+def q90(x):
+    return x.quantile(0.9)
+
+
+def first(x):
+    return x.iloc[0]
+
+
+def aggregrate(data, geography_type, group_type):
+    print(geography_type, group_type)
+
+    group = [geography_type, group_type, "year"]
+    summary = data.groupby(group).agg(stats).round(2)
+    summary["geography_type"] = geography_type
+    summary["group_type"] = group_type
+    summary.index.names = ["geography_id", "group_id", "year"]
+    summary = summary.reset_index().set_index(
+        [
+            "geography_type",
+            "geography_id",
+            "group_type",
+            "group_id",
+            "year",
+        ]
+    )
+
+    return summary
+
+
+more_stats = [
+    "min",
+    q10,
+    q25,
+    "median",
+    q75,
+    q90,
+    "max",
+    "mean",
+    "sum",
+]
+
+stats = {
+    "size": first,
+    "tot": ["count"] + more_stats,
+    "bldg": more_stats,
+    "land": more_stats,
+}
+
+# Create an empty dataframe to fill with output
+output = pd.DataFrame()
+
+# Loop through group combinations and stack output
+for key, value in geos.items():
+    df["data_year"] = df[key]
+
+    for x in value:
+        for z in groups:
+            output = pd.concat([output, aggregrate(df, x, z)])
+
+# Clean combined output and export
+for i in ["median", "mean", "sum"]:
+    output["tot", "delta" + i] = output["tot", i].diff()
+    output["bldg", "delta" + i] = output["bldg", i].diff()
+    output["land", "delta" + i] = output["land", i].diff()
+
+output["tot", "pct_w_value"] = output["tot", "count"] / output["size", "first"]
+
+output.to_csv("sot_assessment_roll.csv")
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.sql b/dbt/models/reporting/reporting.sot_assessment_roll.sql
new file mode 100644
index 000000000..e50aa10a7
--- /dev/null
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.sql
@@ -0,0 +1,89 @@
+-- Gather parcel-level geographies and join land, sales, and class groupings
+WITH counts AS (
+    SELECT
+        year,
+        COUNT(*) AS size
+    FROM default.vw_pin_universe
+    GROUP BY year
+)
+
+SELECT
+    CAST(vals.tot AS INT) AS tot,
+    CAST(vals.bldg AS INT) AS bldg,
+    CAST(vals.land AS INT) AS land,
+    CASE
+        WHEN
+            MOD(CAST(vals.year AS INT), 3) = 0
+            AND uni.triad_name = 'North'
+            THEN TRUE
+        WHEN
+            MOD(CAST(vals.year AS INT), 3) = 1
+            AND uni.triad_name = 'South'
+            THEN TRUE
+        WHEN
+            MOD(CAST(vals.year AS INT), 3) = 2
+            AND uni.triad_name = 'City'
+            THEN TRUE
+        ELSE FALSE
+    END AS reassessment_year,
+    vals.class,
+    vals.stage_name,
+    vals.year,
+    'Cook' AS county,
+    uni.triad_name AS triad,
+    uni.township_name AS township,
+    uni.nbhd_code AS nbhd,
+    uni.tax_code,
+    uni.zip_code,
+    uni.chicago_community_area_name AS community_area,
+    uni.census_place_geoid AS census_place,
+    uni.census_tract_geoid AS census_tract,
+    uni.census_congressional_district_geoid
+        AS
+        census_congressional_district,
+    uni.census_zcta_geoid AS census_zcta,
+    uni.cook_board_of_review_district_num AS cook_board_of_review_district,
+    uni.cook_commissioner_district_num AS cook_commissioner_district,
+    uni.cook_judicial_district_num AS cook_judicial_district,
+    uni.ward_num,
+    uni.chicago_police_district_num AS police_district,
+    uni.school_elementary_district_geoid AS school_elementary_district,
+    uni.school_secondary_district_geoid AS school_secondary_district,
+    uni.school_unified_district_geoid AS school_unified_district,
+    uni.tax_municipality_name AS tax_municipality,
+    uni.tax_park_district_name AS tax_park_district,
+    uni.tax_library_district_name AS tax_library_district,
+    uni.tax_fire_protection_district_name AS tax_fire_protection_district,
+    uni.tax_community_college_district_name
+        AS
+        tax_community_college_district,
+    uni.tax_sanitation_district_name AS tax_sanitation_district,
+    uni.tax_special_service_area_name AS tax_special_service_area,
+    uni.tax_tif_district_name AS tax_tif_district,
+    uni.econ_central_business_district_num AS central_business_district,
+    uni.census_data_year,
+    uni.cook_board_of_review_district_data_year,
+    uni.cook_commissioner_district_data_year,
+    uni.cook_judicial_district_data_year,
+    COALESCE(
+        uni.ward_chicago_data_year, uni.ward_evanston_data_year) AS
+    ward_data_year,
+    uni.chicago_community_area_data_year AS community_area_data_year,
+    uni.chicago_police_district_data_year AS police_district_data_year,
+    uni.econ_central_business_district_data_year
+        AS
+        central_business_district_data_year,
+    uni.school_data_year,
+    uni.tax_data_year,
+    'no_group' AS no_group,
+    class_dict.major_class_type AS major_class,
+    class_dict.modeling_group,
+    counts.size
+FROM default.vw_pin_universe AS uni
+LEFT JOIN reporting.vw_pin_value_long AS vals
+    ON uni.pin = vals.pin
+    AND uni.year = vals.year
+LEFT JOIN ccao.class_dict
+    ON vals.class = class_dict.class_code
+LEFT JOIN counts
+    ON uni.year = counts.year
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index 7e236b820..b26e55a68 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -1,6 +1,5 @@
 # This script generates aggregated summary stats on taxes and exemptions data
 # across a number of geographies, class combinations, and time.
-# %%
 import os.path
 
 # Import libraries
@@ -53,7 +52,6 @@
 groups = ["no_group", "class", "major_class", "modeling_group"]
 
 
-# %%
 # Define aggregation functions
 def q10(x):
     return x.quantile(0.1)
@@ -109,7 +107,6 @@ def first(x):
 
 # Create an empty dataframe to fill with output
 output = pd.DataFrame()
-# %%
 # Loop through group combinations and stack output
 for key, value in geos.items():
     df["data_year"] = df[key]
@@ -138,4 +135,3 @@ def first(x):
     output["tax_bill_total", "delta" + i] = output["tax_bill_total", i].diff()
 
 output.to_csv("sot_taxes_exemptions.csv")
-# %%

From 6c813082a0312e2f5bb070c2d48e962149178030 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 13 Jun 2024 15:39:14 +0000
Subject: [PATCH 06/96] Correct size, count calculations

---
 .../reporting.sot_assessment_roll.py          |  7 +--
 .../reporting.sot_assessment_roll.sql         | 48 ++++++++++++-------
 2 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 65dc3229b..7bae830b5 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -108,8 +108,7 @@ def aggregrate(data, geography_type, group_type):
 ]
 
 stats = {
-    "size": first,
-    "tot": ["count"] + more_stats,
+    "tot": ["size", "count"] + more_stats,
     "bldg": more_stats,
     "land": more_stats,
 }
@@ -131,6 +130,8 @@ def aggregrate(data, geography_type, group_type):
     output["bldg", "delta" + i] = output["bldg", i].diff()
     output["land", "delta" + i] = output["land", i].diff()
 
-output["tot", "pct_w_value"] = output["tot", "count"] / output["size", "first"]
+output["tot", "pct_w_value"] = output["tot", "count"] / output["tot", "size"]
 
 output.to_csv("sot_assessment_roll.csv")
+
+# %%
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.sql b/dbt/models/reporting/reporting.sot_assessment_roll.sql
index e50aa10a7..aa615a533 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.sql
@@ -1,34 +1,51 @@
 -- Gather parcel-level geographies and join land, sales, and class groupings
-WITH counts AS (
+
+/* Ensure every municipality/class/year has a row for every stage through
+cross-joining. This is to make sure that combinations that do not yet
+exist in iasworld.asmt_all for the current year will exist in the view, but have
+largely empty columns. For example: even if no class 4s in the City of Chicago
+have been mailed yet for the current assessment year, we would still like an
+empty City of Chicago/class 4 row to exist for the mailed stage. */
+WITH stages AS (
+
+    SELECT 'MAILED' AS stage_name
+    UNION
+    SELECT 'ASSESSOR CERTIFIED' AS stage_name
+    UNION
+    SELECT 'BOR CERTIFIED' AS stage_name
+
+),
+
+uni AS (
     SELECT
-        year,
-        COUNT(*) AS size
+        vw_pin_universe.*,
+        stages.*
     FROM default.vw_pin_universe
-    GROUP BY year
+    CROSS JOIN stages
 )
 
 SELECT
+    uni.year,
+    uni.stage_name,
+    uni.class,
     CAST(vals.tot AS INT) AS tot,
     CAST(vals.bldg AS INT) AS bldg,
     CAST(vals.land AS INT) AS land,
     CASE
         WHEN
-            MOD(CAST(vals.year AS INT), 3) = 0
+            MOD(CAST(uni.year AS INT), 3) = 0
             AND uni.triad_name = 'North'
             THEN TRUE
         WHEN
-            MOD(CAST(vals.year AS INT), 3) = 1
+            MOD(CAST(uni.year AS INT), 3) = 1
             AND uni.triad_name = 'South'
             THEN TRUE
         WHEN
-            MOD(CAST(vals.year AS INT), 3) = 2
+            MOD(CAST(uni.year AS INT), 3) = 2
             AND uni.triad_name = 'City'
             THEN TRUE
         ELSE FALSE
     END AS reassessment_year,
-    vals.class,
-    vals.stage_name,
-    vals.year,
     'Cook' AS county,
     uni.triad_name AS triad,
     uni.township_name AS township,
@@ -77,13 +94,12 @@ SELECT
     uni.tax_data_year,
     'no_group' AS no_group,
     class_dict.major_class_type AS major_class,
-    class_dict.modeling_group,
-    counts.size
-FROM default.vw_pin_universe AS uni
+    class_dict.modeling_group
+FROM uni
 LEFT JOIN reporting.vw_pin_value_long AS vals
     ON uni.pin = vals.pin
     AND uni.year = vals.year
+    AND uni.stage_name = vals.stage_name
 LEFT JOIN ccao.class_dict
-    ON vals.class = class_dict.class_code
-LEFT JOIN counts
-    ON uni.year = counts.year
+    ON uni.class = class_dict.class_code
+LIMIT 10000

From 1bf9b9c7a9b1f1714eee6757769b3023e74f036b Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 18 Jun 2024 00:35:38 +0000
Subject: [PATCH 07/96] Wrap sales table

---
 .../reporting/reporting.sot_ratio_stats.py    | 162 ++++++++++++++++++
 .../reporting/reporting.sot_ratio_stats.sql   | 115 +++++++++++++
 2 files changed, 277 insertions(+)
 create mode 100644 dbt/models/reporting/reporting.sot_ratio_stats.py
 create mode 100644 dbt/models/reporting/reporting.sot_ratio_stats.sql

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
new file mode 100644
index 000000000..e54e42203
--- /dev/null
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -0,0 +1,162 @@
+# This script generates aggregated summary stats on sales data across a number
+# of geographies, class combinations, and time.
+
+import os.path
+
+# Import libraries
+import assesspy as ass
+import awswrangler as wr
+import pandas as pd
+
+# Ingest data if it is not already available
+if os.path.isfile("sot_ratio_stats.parquet.gzip"):
+    df = pd.read_parquet("sot_ratio_stats.parquet.gzip")
+
+else:
+    sql = open("reporting.sot_ratio_stats.sql").read()
+    df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False)
+    df.to_parquet("sot_ratio_stats.parquet.gzip", compression="gzip")
+
+# Declare geographic groups and their associated data years
+geos = {
+    "census_data_year": [
+        "census_place",
+        "census_tract",
+        "census_congressional_district",
+        "census_zcta",
+    ],
+    "cook_board_of_review_district_data_year": [
+        "cook_board_of_review_district"
+    ],
+    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
+    "cook_judicial_district_data_year": ["cook_judicial_district"],
+    "ward_data_year": ["ward_num"],
+    "community_area_data_year": ["community_area"],
+    "police_district_data_year": ["police_district"],
+    "central_business_district_data_year": ["central_business_district"],
+    "school_data_year": [
+        "school_elementary_district",
+        "school_secondary_district",
+        "school_unified_district",
+    ],
+    "tax_data_year": [
+        "tax_municipality",
+        "tax_park_district",
+        "tax_library_district",
+        "tax_fire_protection_district",
+        "tax_community_college_district",
+        "tax_sanitation_district",
+        "tax_special_service_area",
+        "tax_tif_district",
+    ],
+}
+# Declare class groupings
+groups = ["no_group", "class", "major_class", "modeling_group", "stage_name"]
+
+
+# %%
+# Define aggregation functions
+def aggregrate(data, geography_type, group_type):
+    print(geography_type, group_type)
+
+    group = [geography_type, group_type, "year"]
+    data["size"] = data.groupby(group)["tot_mv"].transform("size")
+    data["sale_count"] = data.groupby(group)["sale_price"].transform("count")
+    data["mv_count"] = data.groupby(group)["tot_mv"].transform("count")
+    data["ratio_count"] = data.groupby(group)["ratio"].transform("count")
+    data = data[data["ratio_count"] > 1]
+
+    summary = (
+        data.dropna(subset=["ratio"])
+        .groupby(group)
+        .apply(
+            lambda x: pd.Series(
+                {
+                    "size": x["size"].iloc[0],
+                    "mv_count": x["mv_count"].iloc[0],
+                    "sale_count": x["sale_count"].iloc[0],
+                    "mv_min": x["tot_mv"].min(),
+                    "mv_q10": x["tot_mv"].quantile(0.1),
+                    "mv_q25": x["tot_mv"].quantile(0.25),
+                    "mv_median": x["tot_mv"].median(),
+                    "mv_q75": x["tot_mv"].quantile(0.75),
+                    "mv_q90": x["tot_mv"].quantile(0.90),
+                    "mv_max": x["tot_mv"].max(),
+                    "mv_mean": x["tot_mv"].mean(),
+                    "mv_sum": x["tot_mv"].sum(),
+                    "ratio_min": x["ratio"].min(),
+                    "ratio_q10": x["ratio"].quantile(0.1),
+                    "ratio_q25": x["ratio"].quantile(0.25),
+                    "ratio_median": x["ratio"].median(),
+                    "ratio_q75": x["ratio"].quantile(0.75),
+                    "ratio_q90": x["ratio"].quantile(0.90),
+                    "ratio_max": x["ratio"].max(),
+                    "ratio_mean": x["ratio"].mean(),
+                    "cod": ass.cod(ratio=x["ratio"]),
+                    "prd": ass.prd(x["tot_mv"], x["sale_price"]),
+                    "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"],
+                    # "mki": ass.mki(x["tot_mv"], x["sale_price"]),
+                }
+            ),
+            include_groups=False,
+        )
+    )
+    summary["geography_type"] = geography_type
+    summary["group_type"] = group_type
+
+    return summary
+
+
+# Create an empty dataframe to fill with output
+output = pd.DataFrame()
+
+# Loop through group combinations and stack output
+for key, value in geos.items():
+    df["data_year"] = df[key]
+
+    for x in value:
+        for z in groups:
+            output = pd.concat([output, aggregrate(df, x, z)])
+
+output.index.names = ["geography_id", "group_id", "year"]
+
+output = output.reset_index().set_index(
+    [
+        "geography_type",
+        "geography_id",
+        "group_type",
+        "group_id",
+        "year",
+    ]
+)
+
+# Clean combined output and export
+output["mv_delta_pct_median"] = (
+    output.sort_values("year")
+    .groupby(["geography_id", "group_id"])
+    .mv_median.diff()
+)
+output["mv_delta_pct_mean"] = (
+    output.sort_values("year")
+    .groupby(["geography_id", "group_id"])
+    .mv_mean.diff()
+)
+output["mv_delta_pct_sum"] = (
+    output.sort_values("year")
+    .groupby(["geography_id", "group_id"])
+    .mv_sum.diff()
+)
+
+output["mv_delta_pct_median"] = (
+    output.sort_values("year")
+    .groupby(["geography_id", "group_id"])
+    .mv_median.pct_change()
+)
+output["mv_delta_pct_mean"] = (
+    output.sort_values("year")
+    .groupby(["geography_id", "group_id"])
+    .mv_mean.pct_change()
+)
+
+output.dropna(how="all", axis=1, inplace=True)
+output.to_csv("sot_ratio_stats.csv")
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.sql b/dbt/models/reporting/reporting.sot_ratio_stats.sql
new file mode 100644
index 000000000..267029dd7
--- /dev/null
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.sql
@@ -0,0 +1,115 @@
+-- Gather parcel-level geographies and join land, sales, and class groupings
+
+/* Ensure every municipality/class/year has a row for every stage through
+cross-joining. This is to make sure that combinations that do not yet
+exist in iasworld.asmt_all for the current year will exist in the view, but have
+largely empty columns. For example: even if no class 4s in the City of Chicago
+have been mailed yet for the current assessment year, we would still like an
+empty City of Chicago/class 4 row to exist for the mailed stage. */
+WITH stages AS (
+
+    SELECT 'MAILED' AS stage_name
+    UNION
+    SELECT 'ASSESSOR CERTIFIED' AS stage_name
+    UNION
+    SELECT 'BOR CERTIFIED' AS stage_name
+
+),
+
+uni AS (
+    SELECT
+        vw_pin_universe.*,
+        stages.*
+    FROM default.vw_pin_universe
+    CROSS JOIN stages
+)
+
+SELECT
+    CAST(sales.sale_price AS DOUBLE) AS sale_price,
+    uni.year,
+    uni.stage_name,
+    uni.class,
+    CAST(vals.tot_mv AS DOUBLE) AS tot_mv,
+    CAST(vals.tot_mv AS DOUBLE) / CAST(sales.sale_price AS DOUBLE) AS ratio,
+    CASE
+        WHEN
+            MOD(CAST(uni.year AS INT), 3) = 0
+            AND uni.triad_name = 'North'
+            THEN TRUE
+        WHEN
+            MOD(CAST(uni.year AS INT), 3) = 1
+            AND uni.triad_name = 'South'
+            THEN TRUE
+        WHEN
+            MOD(CAST(uni.year AS INT), 3) = 2
+            AND uni.triad_name = 'City'
+            THEN TRUE
+        ELSE FALSE
+    END AS reassessment_year,
+    'Cook' AS county,
+    uni.triad_name AS triad,
+    uni.township_name AS township,
+    uni.nbhd_code AS nbhd,
+    uni.tax_code,
+    uni.zip_code,
+    uni.chicago_community_area_name AS community_area,
+    uni.census_place_geoid AS census_place,
+    uni.census_tract_geoid AS census_tract,
+    uni.census_congressional_district_geoid
+        AS
+        census_congressional_district,
+    uni.census_zcta_geoid AS census_zcta,
+    uni.cook_board_of_review_district_num AS cook_board_of_review_district,
+    uni.cook_commissioner_district_num AS cook_commissioner_district,
+    uni.cook_judicial_district_num AS cook_judicial_district,
+    uni.ward_num,
+    uni.chicago_police_district_num AS police_district,
+    uni.school_elementary_district_geoid AS school_elementary_district,
+    uni.school_secondary_district_geoid AS school_secondary_district,
+    uni.school_unified_district_geoid AS school_unified_district,
+    uni.tax_municipality_name AS tax_municipality,
+    uni.tax_park_district_name AS tax_park_district,
+    uni.tax_library_district_name AS tax_library_district,
+    uni.tax_fire_protection_district_name AS tax_fire_protection_district,
+    uni.tax_community_college_district_name
+        AS
+        tax_community_college_district,
+    uni.tax_sanitation_district_name AS tax_sanitation_district,
+    uni.tax_special_service_area_name AS tax_special_service_area,
+    uni.tax_tif_district_name AS tax_tif_district,
+    uni.econ_central_business_district_num AS central_business_district,
+    uni.census_data_year,
+    uni.cook_board_of_review_district_data_year,
+    uni.cook_commissioner_district_data_year,
+    uni.cook_judicial_district_data_year,
+    COALESCE(
+        uni.ward_chicago_data_year, uni.ward_evanston_data_year) AS
+    ward_data_year,
+    uni.chicago_community_area_data_year AS community_area_data_year,
+    uni.chicago_police_district_data_year AS police_district_data_year,
+    uni.econ_central_business_district_data_year
+        AS
+        central_business_district_data_year,
+    uni.school_data_year,
+    uni.tax_data_year,
+    'no_group' AS no_group,
+    class_dict.major_class_type AS major_class,
+    class_dict.modeling_group
+FROM uni
+LEFT JOIN
+    z_ci_508_add_mv_to_reportingvw_pin_value_long_reporting.vw_pin_value_long
+        AS vals
+    ON uni.pin = vals.pin
+    AND uni.year = vals.year
+    AND uni.stage_name = vals.stage_name
+LEFT JOIN ccao.class_dict
+    ON uni.class = class_dict.class_code
+LEFT JOIN default.vw_pin_sale AS sales
+    ON uni.pin = sales.pin
+    AND uni.year = sales.year
+    AND NOT sales.is_multisale
+    AND NOT sales.sale_filter_deed_type
+    AND NOT sales.sale_filter_less_than_10k
+    AND NOT sales.sale_filter_same_sale_within_365
+WHERE uni.year >= '2020'
+    AND (vals.tot_mv > 0 OR vals.tot_mv IS NULL)

From 0a9e1f3f124141c079cb5c7711192e631674ece0 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 18 Jun 2024 18:38:53 +0000
Subject: [PATCH 08/96] Correct stage grouping, counting

---
 .../reporting/reporting.sot_ratio_stats.py    | 26 ++++++++++++-------
 .../reporting/reporting.sot_ratio_stats.sql   |  1 -
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index e54e42203..a37cb2261 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -1,6 +1,5 @@
 # This script generates aggregated summary stats on sales data across a number
 # of geographies, class combinations, and time.
-
 import os.path
 
 # Import libraries
@@ -51,19 +50,23 @@
     ],
 }
 # Declare class groupings
-groups = ["no_group", "class", "major_class", "modeling_group", "stage_name"]
+groups = ["no_group", "class", "major_class", "modeling_group"]
 
 
-# %%
 # Define aggregation functions
 def aggregrate(data, geography_type, group_type):
     print(geography_type, group_type)
 
-    group = [geography_type, group_type, "year"]
+    group = [geography_type, group_type, "year", "stage_name"]
     data["size"] = data.groupby(group)["tot_mv"].transform("size")
     data["sale_count"] = data.groupby(group)["sale_price"].transform("count")
     data["mv_count"] = data.groupby(group)["tot_mv"].transform("count")
+
+    # Remove parcels with FMVs of 0 since they screw up ratios
+    data = data[data["tot_mv"] > 0].reset_index()
     data["ratio_count"] = data.groupby(group)["ratio"].transform("count")
+
+    # Remove groups that only have one sale since we can't calculate stats
     data = data[data["ratio_count"] > 1]
 
     summary = (
@@ -118,7 +121,7 @@ def aggregrate(data, geography_type, group_type):
         for z in groups:
             output = pd.concat([output, aggregrate(df, x, z)])
 
-output.index.names = ["geography_id", "group_id", "year"]
+output.index.names = ["geography_id", "group_id", "year", "stage_name"]
 
 output = output.reset_index().set_index(
     [
@@ -127,36 +130,27 @@ def aggregrate(data, geography_type, group_type):
         "group_type",
         "group_id",
         "year",
+        "stage_name",
     ]
 )
 
 # Clean combined output and export
-output["mv_delta_pct_median"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id"])
-    .mv_median.diff()
-)
-output["mv_delta_pct_mean"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id"])
-    .mv_mean.diff()
-)
-output["mv_delta_pct_sum"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id"])
-    .mv_sum.diff()
-)
-
-output["mv_delta_pct_median"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id"])
-    .mv_median.pct_change()
-)
-output["mv_delta_pct_mean"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id"])
-    .mv_mean.pct_change()
-)
+# Year-over-year changes are only meaningful within one series, so group on
+# every index level except year. The type levels are part of the key because
+# ids from different geography types can coincide (several are numbers).
+series_keys = [
+    "geography_type",
+    "geography_id",
+    "group_type",
+    "group_id",
+    "stage_name",
+]
+by_series = output.sort_values("year").groupby(series_keys)
+for stat in ["median", "mean", "sum"]:
+    output["mv_delta_" + stat] = by_series["mv_" + stat].diff()
+    output["mv_delta_pct_" + stat] = by_series["mv_" + stat].pct_change()
 
 output.dropna(how="all", axis=1, inplace=True)
 output.to_csv("sot_ratio_stats.csv")
+
+# %%
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.sql b/dbt/models/reporting/reporting.sot_ratio_stats.sql
index 267029dd7..dc6cf73e3 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.sql
@@ -112,4 +112,3 @@ LEFT JOIN default.vw_pin_sale AS sales
     AND NOT sales.sale_filter_less_than_10k
     AND NOT sales.sale_filter_same_sale_within_365
 WHERE uni.year >= '2020'
-    AND (vals.tot_mv > 0 OR vals.tot_mv IS NULL)

From 030a7c54e72ebbda37fb154e51246e53812ba192 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 18 Jun 2024 19:10:05 +0000
Subject: [PATCH 09/96] Fix assessment roll stage grouping

---
 dbt/models/reporting/reporting.sot_assessment_roll.py  | 9 ++++-----
 dbt/models/reporting/reporting.sot_assessment_roll.sql | 1 -
 dbt/models/reporting/reporting.sot_ratio_stats.sql     | 3 +--
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 7bae830b5..a31adf5b2 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -50,7 +50,7 @@
     ],
 }
 # Declare class groupings
-groups = ["no_group", "class", "major_class", "modeling_group", "stage_name"]
+groups = ["no_group", "class", "major_class", "modeling_group"]
 
 
 # Define aggregation functions
@@ -77,11 +77,11 @@ def first(x):
 def aggregrate(data, geography_type, group_type):
     print(geography_type, group_type)
 
-    group = [geography_type, group_type, "year"]
+    group = [geography_type, group_type, "year", "stage_name"]
     summary = data.groupby(group).agg(stats).round(2)
     summary["geography_type"] = geography_type
     summary["group_type"] = group_type
-    summary.index.names = ["geography_id", "group_id", "year"]
+    summary.index.names = ["geography_id", "group_id", "year", "stage_name"]
     summary = summary.reset_index().set_index(
         [
             "geography_type",
@@ -89,6 +89,7 @@ def aggregrate(data, geography_type, group_type):
             "group_type",
             "group_id",
             "year",
+            "stage_name",
         ]
     )
 
@@ -133,5 +134,3 @@ def aggregrate(data, geography_type, group_type):
 output["tot", "pct_w_value"] = output["tot", "count"] / output["tot", "size"]
 
 output.to_csv("sot_assessment_roll.csv")
-
-# %%
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.sql b/dbt/models/reporting/reporting.sot_assessment_roll.sql
index aa615a533..0d582c814 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.sql
@@ -102,4 +102,3 @@ LEFT JOIN reporting.vw_pin_value_long AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN ccao.class_dict
     ON uni.class = class_dict.class_code
-LIMIT 10000
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.sql b/dbt/models/reporting/reporting.sot_ratio_stats.sql
index dc6cf73e3..b142f537a 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.sql
@@ -97,8 +97,7 @@ SELECT
     class_dict.modeling_group
 FROM uni
 LEFT JOIN
-    z_ci_508_add_mv_to_reportingvw_pin_value_long_reporting.vw_pin_value_long
-        AS vals
+    reporting.vw_pin_value_long AS vals
     ON uni.pin = vals.pin
     AND uni.year = vals.year
     AND uni.stage_name = vals.stage_name

From 1c2adaecb32567c6e9050b1049392b6a9b22d12d Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 18 Jun 2024 19:49:18 +0000
Subject: [PATCH 10/96] Clean output before writing

---
 dbt/models/reporting/reporting.sot_assessment_roll.py  | 3 +++
 dbt/models/reporting/reporting.sot_sales.py            | 3 +++
 dbt/models/reporting/reporting.sot_taxes_exemptions.py | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index a31adf5b2..cfacb1b4b 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -133,4 +133,9 @@ def aggregrate(data, geography_type, group_type):
 
 output["tot", "pct_w_value"] = output["tot", "count"] / output["tot", "size"]
 
+output.columns = ["_".join(col) for col in output.columns]
+# reset_index returns a new frame, so keep the result; the index levels become
+# ordinary columns and index=False skips the leftover integer index.
+output = output.reset_index()
+
-output.to_csv("sot_assessment_roll.csv")
+output.to_csv("sot_assessment_roll.csv", index=False)
diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index 120a0bd5b..c2034f873 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -134,4 +134,9 @@ def first(x):
     output["sale_price", "delta" + i] = output["sale_price", i].diff()
     output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff()
 
+output.columns = ["_".join(col) for col in output.columns]
+# reset_index returns a new frame, so keep the result; the index levels become
+# ordinary columns and index=False skips the leftover integer index.
+output = output.reset_index()
+
-output.to_csv("sot_sales.csv")
+output.to_csv("sot_sales.csv", index=False)
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index b26e55a68..9ef77b278 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -134,4 +134,9 @@ def first(x):
 for i in ["median", "mean", "sum"]:
     output["tax_bill_total", "delta" + i] = output["tax_bill_total", i].diff()
 
+output.columns = ["_".join(col) for col in output.columns]
+# reset_index returns a new frame, so keep the result; the index levels become
+# ordinary columns and index=False skips the leftover integer index.
+output = output.reset_index()
+
-output.to_csv("sot_taxes_exemptions.csv")
+output.to_csv("sot_taxes_exemptions.csv", index=False)

From 672bd1ebec6156294ebdd35bf924b764e41f4ef2 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 18 Jun 2024 20:31:17 +0000
Subject: [PATCH 11/96] Begin dbt building

---
 dbt/models/reporting/docs.md                  | 36 +++++++++++++++++++
 ...> reporting.sot_assessment_roll_input.sql} | 11 ++++--
 ...ql => reporting.sot_ratio_stats_input.sql} | 13 ++++---
 ...ales.sql => reporting.sot_sales_input.sql} | 16 ++++++---
 ... reporting.sot_taxes_exemptions_input.sql} | 16 ++++++---
 dbt/models/reporting/schema.yml               | 24 +++++++++++++
 6 files changed, 99 insertions(+), 17 deletions(-)
 rename dbt/models/reporting/{reporting.sot_assessment_roll.sql => reporting.sot_assessment_roll_input.sql} (94%)
 rename dbt/models/reporting/{reporting.sot_ratio_stats.sql => reporting.sot_ratio_stats_input.sql} (94%)
 rename dbt/models/reporting/{reporting.sot_sales.sql => reporting.sot_sales_input.sql} (92%)
 rename dbt/models/reporting/{reporting.sot_taxes_exemptions.sql => reporting.sot_taxes_exemptions_input.sql} (92%)

diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md
index 2ce0bcb92..baf60aba9 100644
--- a/dbt/models/reporting/docs.md
+++ b/dbt/models/reporting/docs.md
@@ -30,6 +30,42 @@ Materialized to speed up queries for Tableau.
 `property_group`
 {% enddocs %}
 
+# sot_assessment_roll_input
+
+{% docs table_sot_assessment_roll_input %}
+Table to feed the Python dbt job that creates the
+`reporting.sot_assessment_roll` table. Feeds public reporting assets.
+
+**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id`
+{% enddocs %}
+
+# sot_ratio_stats_input
+
+{% docs table_sot_ratio_stats_input %}
+Table to feed the Python dbt job that creates the
+`reporting.sot_ratio_stats` table. Feeds public reporting assets.
+
+**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id`
+{% enddocs %}
+
+# sot_sales_input
+
+{% docs table_sot_sales_input %}
+Table to feed the Python dbt job that creates the
+`reporting.sot_sales` table. Feeds public reporting assets.
+
+**Primary Key**: `year`, `geography_id`, `group_id`
+{% enddocs %}
+
+# sot_taxes_exemptions_input
+
+{% docs table_sot_taxes_exemptions_input %}
+Table to feed the Python dbt job that creates the
+`reporting.sot_taxes_exemptions` table. Feeds public reporting assets.
+
+**Primary Key**: `year`, `geography_id`, `group_id`
+{% enddocs %}
+
 # vw_assessment_roll
 
 {% docs view_vw_assessment_roll %}
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
similarity index 94%
rename from dbt/models/reporting/reporting.sot_assessment_roll.sql
rename to dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index 0d582c814..395337357 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -1,4 +1,9 @@
 -- Gather parcel-level geographies and join land, sales, and class groupings
+{{
+    config(
+        materialized='table'
+    )
+}}
 
 /* Ensure every municipality/class/year has a row for every stage through
 cross-joining. This is to make sure that combinations that do not yet
@@ -20,7 +25,7 @@ uni AS (
     SELECT
         vw_pin_universe.*,
         stages.*
-    FROM default.vw_pin_universe
+    FROM {{ ref('default.vw_pin_universe') }}
     CROSS JOIN stages
 )
 
@@ -96,9 +101,9 @@ SELECT
     class_dict.major_class_type AS major_class,
     class_dict.modeling_group
 FROM uni
-LEFT JOIN reporting.vw_pin_value_long AS vals
+LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     ON uni.pin = vals.pin
     AND uni.year = vals.year
     AND uni.stage_name = vals.stage_name
-LEFT JOIN ccao.class_dict
+LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
similarity index 94%
rename from dbt/models/reporting/reporting.sot_ratio_stats.sql
rename to dbt/models/reporting/reporting.sot_ratio_stats_input.sql
index b142f537a..f723c10ef 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
@@ -1,4 +1,9 @@
 -- Gather parcel-level geographies and join land, sales, and class groupings
+{{
+    config(
+        materialized='table'
+    )
+}}
 
 /* Ensure every municipality/class/year has a row for every stage through
 cross-joining. This is to make sure that combinations that do not yet
@@ -20,7 +25,7 @@ uni AS (
     SELECT
         vw_pin_universe.*,
         stages.*
-    FROM default.vw_pin_universe
+    FROM {{ ref('default.vw_pin_universe') }}
     CROSS JOIN stages
 )
 
@@ -97,13 +102,13 @@ SELECT
     class_dict.modeling_group
 FROM uni
 LEFT JOIN
-    reporting.vw_pin_value_long AS vals
+    {{ ref('reporting.vw_pin_value_long') }} AS vals
     ON uni.pin = vals.pin
     AND uni.year = vals.year
     AND uni.stage_name = vals.stage_name
-LEFT JOIN ccao.class_dict
+LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
-LEFT JOIN default.vw_pin_sale AS sales
+LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     ON uni.pin = sales.pin
     AND uni.year = sales.year
     AND NOT sales.is_multisale
diff --git a/dbt/models/reporting/reporting.sot_sales.sql b/dbt/models/reporting/reporting.sot_sales_input.sql
similarity index 92%
rename from dbt/models/reporting/reporting.sot_sales.sql
rename to dbt/models/reporting/reporting.sot_sales_input.sql
index 2be8e4dc5..98dcbec23 100644
--- a/dbt/models/reporting/reporting.sot_sales.sql
+++ b/dbt/models/reporting/reporting.sot_sales_input.sql
@@ -1,3 +1,9 @@
+{{
+    config(
+        materialized='table'
+    )
+}}
+
 -- Gather parcel-level land and yrblt
 WITH sf AS (
     SELECT
@@ -6,7 +12,7 @@ WITH sf AS (
         SUM(char_bldg_sf) AS char_bldg_sf,
         SUM(char_land_sf) AS char_land_sf,
         ARBITRARY(char_yrblt) AS char_yrblt
-    FROM default.vw_card_res_char
+    FROM {{ ref('default.vw_card_res_char') }}
     GROUP BY pin, year
 )
 
@@ -75,16 +81,16 @@ SELECT
     'no_group' AS no_group,
     class_dict.major_class_type AS major_class,
     class_dict.modeling_group
-FROM default.vw_pin_universe AS uni
+FROM {{ ref('default.vw_pin_universe') }} AS uni
 LEFT JOIN sf
     ON uni.pin = sf.pin
     AND uni.year = sf.year
-LEFT JOIN ccao.class_dict
+LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
-LEFT JOIN default.vw_pin_history AS hist
+LEFT JOIN {{ ref('default.vw_pin_history') }} AS hist
     ON uni.pin = hist.pin
     AND uni.year = hist.year
-LEFT JOIN default.vw_pin_sale AS sales
+LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     ON uni.pin = sales.pin
     AND uni.year = sales.year
     AND NOT sales.is_multisale
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
similarity index 92%
rename from dbt/models/reporting/reporting.sot_taxes_exemptions.sql
rename to dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
index f96bdc5d4..80bee99bc 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.sql
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
@@ -1,3 +1,9 @@
+{{
+    config(
+        materialized='table'
+    )
+}}
+
 -- Gather parcel-level geographies and join taxes, exemptions, and class
 -- groupings
 SELECT
@@ -80,14 +86,14 @@ SELECT
     'no_group' AS no_group,
     class_dict.major_class_type AS major_class,
     class_dict.modeling_group
-FROM default.vw_pin_universe AS uni
-INNER JOIN tax.pin AS tax
+FROM {{ ref('default.vw_pin_universe') }} AS uni
+INNER JOIN {{ source('tax', 'pin') }} AS tax
     ON uni.pin = tax.pin
     AND uni.year = tax.year
-INNER JOIN tax.eq_factor AS eqf
+INNER JOIN {{ source('tax', 'eq_factor') }} AS eqf
     ON uni.year = eqf.year
-INNER JOIN tax.tax_code AS tcd
+INNER JOIN {{ source('tax', 'tax_code') }} AS tcd
     ON tax.tax_code_num = tcd.tax_code_num
     AND tax.year = tcd.year
-INNER JOIN ccao.class_dict
+INNER JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
diff --git a/dbt/models/reporting/schema.yml b/dbt/models/reporting/schema.yml
index daf546e4f..7afa63d71 100644
--- a/dbt/models/reporting/schema.yml
+++ b/dbt/models/reporting/schema.yml
@@ -34,6 +34,30 @@ models:
             within_20_pct >= within_10_pct
             AND within_10_pct >= within_05_pct
 
+  - name: reporting.sot_assessment_roll_input
+    description: '{{ doc("table_sot_assessment_roll_input") }}'
+    config:
+      tags:
+        - daily
+
+  - name: reporting.sot_ratio_stats_input
+    description: '{{ doc("table_sot_ratio_stats_input") }}'
+    config:
+      tags:
+        - daily
+
+  - name: reporting.sot_sales_input
+    description: '{{ doc("table_sot_sales_input") }}'
+    config:
+      tags:
+        - daily
+
+  - name: reporting.sot_taxes_exemptions_input
+    description: '{{ doc("table_sot_taxes_exemptions_input") }}'
+    config:
+      tags:
+        - daily
+
   - name: reporting.ratio_stats_input
     description: '{{ doc("table_ratio_stats_input") }}'
     config:

From 3f60a77407337b1c41b247190f494d0af985135e Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 20 Jun 2024 18:33:28 +0000
Subject: [PATCH 12/96] Attempt to build assessment_roll table

---
 dbt/models/reporting/docs.md                  |   9 +
 .../reporting.sot_assessment_roll.py          | 101 ++++++-----
 .../reporting.sot_assessment_roll_input.sql   |  19 +-
 .../reporting/reporting.sot_ratio_stats.py    | 168 ------------------
 dbt/models/reporting/reporting.sot_sales.py   | 140 ---------------
 .../reporting.sot_taxes_exemptions.py         | 140 ---------------
 dbt/models/reporting/schema.yml               |   6 +
 7 files changed, 82 insertions(+), 501 deletions(-)
 delete mode 100644 dbt/models/reporting/reporting.sot_ratio_stats.py
 delete mode 100644 dbt/models/reporting/reporting.sot_sales.py
 delete mode 100644 dbt/models/reporting/reporting.sot_taxes_exemptions.py

diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md
index a94a0be67..dd0f7bf68 100644
--- a/dbt/models/reporting/docs.md
+++ b/dbt/models/reporting/docs.md
@@ -30,6 +30,15 @@ Materialized to speed up queries for Tableau.
 `property_group`
 {% enddocs %}
 
+# sot_assessment_roll
+
+{% docs table_sot_assessment_roll %}
+Summary statistics aggregated from `reporting.sot_assessment_roll_input` by
+the Python dbt job of the same name. Feeds public reporting assets.
+
+**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id`
+{% enddocs %}
+
 # sot_assessment_roll_input
 
 {% docs table_sot_assessment_roll_input %}
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index cfacb1b4b..2f99c35bc 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -1,21 +1,12 @@
+# pylint: skip-file
+# type: ignore
+
 # This script generates aggregated summary stats on sales data across a number
 # of geographies, class combinations, and time.
 
-import os.path
-
 # Import libraries
-import awswrangler as wr
 import pandas as pd
 
-# Ingest data if it is not already available
-if os.path.isfile("sot_assessment_roll.parquet.gzip"):
-    df = pd.read_parquet("sot_assessment_roll.parquet.gzip")
-
-else:
-    sql = open("reporting.sot_assessment_roll.sql").read()
-    df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False)
-    df.to_parquet("sot_assessment_roll.parquet.gzip", compression="gzip")
-
 # Declare geographic groups and their associated data years
 geos = {
     "census_data_year": [
@@ -74,6 +65,25 @@ def first(x):
     return x.iloc[0]
 
 
+more_stats = [
+    "min",
+    q10,
+    q25,
+    "median",
+    q75,
+    q90,
+    "max",
+    "mean",
+    "sum",
+]
+
+stats = {
+    "tot": ["size", "count"] + more_stats,
+    "bldg": more_stats,
+    "land": more_stats,
+}
+
+
 def aggregrate(data, geography_type, group_type):
     print(geography_type, group_type)
 
@@ -96,44 +106,45 @@ def aggregrate(data, geography_type, group_type):
     return summary
 
 
-more_stats = [
-    "min",
-    q10,
-    q25,
-    "median",
-    q75,
-    q90,
-    "max",
-    "mean",
-    "sum",
-]
+def assemble(df, geos, groups):
+    """Stack assessment roll summary stats across geographies and groups."""
+    output = pd.DataFrame()
 
-stats = {
-    "tot": ["size", "count"] + more_stats,
-    "bldg": more_stats,
-    "land": more_stats,
-}
+    # Loop through group combinations and stack output
+    for key, value in geos.items():
+        df["data_year"] = df[key]
+
+        for x in value:
+            for z in groups:
+                output = pd.concat([output, aggregrate(df, x, z)])
+
+    # Clean combined output before returning it
+    for i in ["median", "mean", "sum"]:
+        output["tot", "delta" + i] = output["tot", i].diff()
+        output["bldg", "delta" + i] = output["bldg", i].diff()
+        output["land", "delta" + i] = output["land", i].diff()
+
+    output["tot", "pct_w_value"] = (
+        output["tot", "count"] / output["tot", "size"]
+    )
+
+    output.columns = ["_".join(col) for col in output.columns]
+    # reset_index() returns a new frame, so the result must be reassigned
+    output = output.reset_index()
+    return output
 
-# Create an empty dataframe to fill with output
-output = pd.DataFrame()
 
-# Loop through group combinations and stack output
-for key, value in geos.items():
-    df["data_year"] = df[key]
+def model(dbt, spark_session):
+    dbt.config(materialized="table")
 
-    for x in value:
-        for z in groups:
-            output = pd.concat([output, aggregrate(df, x, z)])
+    input_df = dbt.ref("reporting.sot_assessment_roll_input")
 
-# Clean combined output and export
-for i in ["median", "mean", "sum"]:
-    output["tot", "delta" + i] = output["tot", i].diff()
-    output["bldg", "delta" + i] = output["bldg", i].diff()
-    output["land", "delta" + i] = output["land", i].diff()
+    # Convert the Spark input dataframe to pandas, since assemble()
+    # is written against the pandas API
+    input_df = input_df.toPandas()
 
-output["tot", "pct_w_value"] = output["tot", "count"] / output["tot", "size"]
+    df = assemble(input_df, geos=geos, groups=groups)
 
-output.columns = ["_".join(col) for col in output.columns]
-output.reset_index()
+    spark_df = spark_session.createDataFrame(df)
 
-output.to_csv("sot_assessment_roll.csv")
+    return spark_df
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index 395337357..e29d3c037 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -72,16 +72,19 @@ SELECT
     uni.school_elementary_district_geoid AS school_elementary_district,
     uni.school_secondary_district_geoid AS school_secondary_district,
     uni.school_unified_district_geoid AS school_unified_district,
-    uni.tax_municipality_name AS tax_municipality,
-    uni.tax_park_district_name AS tax_park_district,
-    uni.tax_library_district_name AS tax_library_district,
-    uni.tax_fire_protection_district_name AS tax_fire_protection_district,
-    uni.tax_community_college_district_name
+    ARRAY_JOIN(uni.tax_municipality_name, ', ') AS tax_municipality,
+    ARRAY_JOIN(uni.tax_park_district_name, ', ') AS tax_park_district,
+    ARRAY_JOIN(uni.tax_library_district_name, ', ') AS tax_library_district,
+    ARRAY_JOIN(uni.tax_fire_protection_district_name, ', ')
+        AS tax_fire_protection_district,
+    ARRAY_JOIN(uni.tax_community_college_district_name, ', ')
         AS
         tax_community_college_district,
-    uni.tax_sanitation_district_name AS tax_sanitation_district,
-    uni.tax_special_service_area_name AS tax_special_service_area,
-    uni.tax_tif_district_name AS tax_tif_district,
+    ARRAY_JOIN(uni.tax_sanitation_district_name, ', ')
+        AS tax_sanitation_district,
+    ARRAY_JOIN(uni.tax_special_service_area_name, ', ')
+        AS tax_special_service_area,
+    ARRAY_JOIN(uni.tax_tif_district_name, ', ') AS tax_tif_district,
     uni.econ_central_business_district_num AS central_business_district,
     uni.census_data_year,
     uni.cook_board_of_review_district_data_year,
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
deleted file mode 100644
index a37cb2261..000000000
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# This script generates aggregated summary stats on sales data across a number
-# of geographies, class combinations, and time.
-import os.path
-
-# Import libraries
-import assesspy as ass
-import awswrangler as wr
-import pandas as pd
-
-# Ingest data if it is not already available
-if os.path.isfile("sot_ratio_stats.parquet.gzip"):
-    df = pd.read_parquet("sot_ratio_stats.parquet.gzip")
-
-else:
-    sql = open("reporting.sot_ratio_stats.sql").read()
-    df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False)
-    df.to_parquet("sot_ratio_stats.parquet.gzip", compression="gzip")
-
-# Declare geographic groups and their associated data years
-geos = {
-    "census_data_year": [
-        "census_place",
-        "census_tract",
-        "census_congressional_district",
-        "census_zcta",
-    ],
-    "cook_board_of_review_district_data_year": [
-        "cook_board_of_review_district"
-    ],
-    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
-    "cook_judicial_district_data_year": ["cook_judicial_district"],
-    "ward_data_year": ["ward_num"],
-    "community_area_data_year": ["community_area"],
-    "police_district_data_year": ["police_district"],
-    "central_business_district_data_year": ["central_business_district"],
-    "school_data_year": [
-        "school_elementary_district",
-        "school_secondary_district",
-        "school_unified_district",
-    ],
-    "tax_data_year": [
-        "tax_municipality",
-        "tax_park_district",
-        "tax_library_district",
-        "tax_fire_protection_district",
-        "tax_community_college_district",
-        "tax_sanitation_district",
-        "tax_special_service_area",
-        "tax_tif_district",
-    ],
-}
-# Declare class groupings
-groups = ["no_group", "class", "major_class", "modeling_group"]
-
-
-# Define aggregation functions
-def aggregrate(data, geography_type, group_type):
-    print(geography_type, group_type)
-
-    group = [geography_type, group_type, "year", "stage_name"]
-    data["size"] = data.groupby(group)["tot_mv"].transform("size")
-    data["sale_count"] = data.groupby(group)["sale_price"].transform("count")
-    data["mv_count"] = data.groupby(group)["tot_mv"].transform("count")
-
-    # Remove parcels with FMVs of 0 since they screw up ratios
-    data = data[data["tot_mv"] > 0].reset_index()
-    data["ratio_count"] = data.groupby(group)["ratio"].transform("count")
-
-    # Remove groups that only have one sale since we can't calculate stats
-    data = data[data["ratio_count"] > 1]
-
-    summary = (
-        data.dropna(subset=["ratio"])
-        .groupby(group)
-        .apply(
-            lambda x: pd.Series(
-                {
-                    "size": x["size"].iloc[0],
-                    "mv_count": x["mv_count"].iloc[0],
-                    "sale_count": x["sale_count"].iloc[0],
-                    "mv_min": x["tot_mv"].min(),
-                    "mv_q10": x["tot_mv"].quantile(0.1),
-                    "mv_q25": x["tot_mv"].quantile(0.25),
-                    "mv_median": x["tot_mv"].median(),
-                    "mv_q75": x["tot_mv"].quantile(0.75),
-                    "mv_q90": x["tot_mv"].quantile(0.90),
-                    "mv_max": x["tot_mv"].max(),
-                    "mv_mean": x["tot_mv"].mean(),
-                    "mv_sum": x["tot_mv"].sum(),
-                    "ratio_min": x["ratio"].min(),
-                    "ratio_q10": x["ratio"].quantile(0.1),
-                    "ratio_q25": x["ratio"].quantile(0.25),
-                    "ratio_median": x["ratio"].median(),
-                    "ratio_q75": x["ratio"].quantile(0.75),
-                    "ratio_q90": x["ratio"].quantile(0.90),
-                    "ratio_max": x["ratio"].max(),
-                    "ratio_mean": x["ratio"].mean(),
-                    "cod": ass.cod(ratio=x["ratio"]),
-                    "prd": ass.prd(x["tot_mv"], x["sale_price"]),
-                    "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"],
-                    # "mki": ass.mki(x["tot_mv"], x["sale_price"]),
-                }
-            ),
-            include_groups=False,
-        )
-    )
-    summary["geography_type"] = geography_type
-    summary["group_type"] = group_type
-
-    return summary
-
-
-# Create an empty dataframe to fill with output
-output = pd.DataFrame()
-
-# Loop through group combinations and stack output
-for key, value in geos.items():
-    df["data_year"] = df[key]
-
-    for x in value:
-        for z in groups:
-            output = pd.concat([output, aggregrate(df, x, z)])
-
-output.index.names = ["geography_id", "group_id", "year", "stage_name"]
-
-output = output.reset_index().set_index(
-    [
-        "geography_type",
-        "geography_id",
-        "group_type",
-        "group_id",
-        "year",
-        "stage_name",
-    ]
-)
-
-# Clean combined output and export
-output["mv_delta_pct_median"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id", "stage_name"])
-    .mv_median.diff()
-)
-output["mv_delta_pct_mean"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id", "stage_name"])
-    .mv_mean.diff()
-)
-output["mv_delta_pct_sum"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id", "stage_name"])
-    .mv_sum.diff()
-)
-
-output["mv_delta_pct_median"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id", "stage_name"])
-    .mv_median.pct_change()
-)
-output["mv_delta_pct_mean"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id", "stage_name"])
-    .mv_mean.pct_change()
-)
-
-output.dropna(how="all", axis=1, inplace=True)
-output.to_csv("sot_ratio_stats.csv")
-
-# %%
diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
deleted file mode 100644
index c2034f873..000000000
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# This script generates aggregated summary stats on sales data across a number
-# of geographies, class combinations, and time.
-
-import os.path
-import statistics as stats
-
-# Import libraries
-import awswrangler as wr
-import numpy as np
-import pandas as pd
-
-# Ingest data if it is not already available
-if os.path.isfile("sot_sales.parquet.gzip"):
-    df = pd.read_parquet("sot_sales.parquet.gzip")
-
-else:
-    sql = open("reporting.sot_sales.sql").read()
-    df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False)
-    df.to_parquet("sot_sales.parquet.gzip", compression="gzip")
-
-# Declare geographic groups and their associated data years
-geos = {
-    "census_data_year": [
-        "census_place",
-        "census_tract",
-        "census_congressional_district",
-        "census_zcta",
-    ],
-    "cook_board_of_review_district_data_year": [
-        "cook_board_of_review_district"
-    ],
-    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
-    "cook_judicial_district_data_year": ["cook_judicial_district"],
-    "ward_data_year": ["ward_num"],
-    "community_area_data_year": ["community_area"],
-    "police_district_data_year": ["police_district"],
-    "central_business_district_data_year": ["central_business_district"],
-    "school_data_year": [
-        "school_elementary_district",
-        "school_secondary_district",
-        "school_unified_district",
-    ],
-    "tax_data_year": [
-        "tax_municipality",
-        "tax_park_district",
-        "tax_library_district",
-        "tax_fire_protection_district",
-        "tax_community_college_district",
-        "tax_sanitation_district",
-        "tax_special_service_area",
-        "tax_tif_district",
-    ],
-}
-# Declare class groupings
-groups = ["no_group", "class", "major_class", "modeling_group"]
-
-
-# Define aggregation functions
-def q10(x):
-    return x.quantile(0.1)
-
-
-def q25(x):
-    return x.quantile(0.25)
-
-
-def q75(x):
-    return x.quantile(0.75)
-
-
-def q90(x):
-    return x.quantile(0.9)
-
-
-def first(x):
-    return x.iloc[0]
-
-
-more_stats = [
-    "min",
-    q10,
-    q25,
-    "median",
-    q75,
-    q90,
-    "max",
-    "mean",
-    "sum",
-]
-
-agg_func_math = {
-    "sale_price": ["size", "count"] + more_stats,
-    "price_per_sf": more_stats,
-    "char_bldg_sf": ["median"],
-    "char_land_sf": ["median"],
-    "char_yrblt": ["median"],
-    "class": [stats.multimode],
-    "data_year": [first],
-}
-
-# Create an empty dataframe to fill with output
-output = pd.DataFrame()
-
-# Loop through group combinations and stack output
-for key, value in geos.items():
-    df["data_year"] = df[key]
-
-    for x in value:
-        for z in groups:
-            group = [x, z, "year"]
-            summary = df.groupby(group).agg(agg_func_math).round(2)
-            summary["geography_type"] = x
-            summary["group_type"] = z
-            summary.index.names = ["geography_id", "group_id", "year"]
-            summary = summary.reset_index().set_index(
-                [
-                    "geography_type",
-                    "geography_id",
-                    "group_type",
-                    "group_id",
-                    "year",
-                ]
-            )
-
-            output = pd.concat([output, summary])
-
-# Clean combined output and export
-output["sale_price", "sum"] = output["sale_price", "sum"].replace(0, np.NaN)
-output["price_per_sf", "sum"] = output["price_per_sf", "sum"].replace(
-    0, np.NaN
-)
-
-for i in ["median", "mean", "sum"]:
-    output["sale_price", "delta" + i] = output["sale_price", i].diff()
-    output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff()
-
-output.columns = ["_".join(col) for col in output.columns]
-output.reset_index()
-
-output.to_csv("sot_sales.csv")
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
deleted file mode 100644
index 9ef77b278..000000000
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# This script generates aggregated summary stats on taxes and exemptions data
-# across a number of geographies, class combinations, and time.
-import os.path
-
-# Import libraries
-import awswrangler as wr
-import pandas as pd
-
-# Ingest data if it is not already available
-if os.path.isfile("sot_taxes_exemptions.parquet.gzip"):
-    df = pd.read_parquet("sot_taxes_exemptions.parquet.gzip")
-
-else:
-    sql = open("reporting.sot_taxes_exemptions.sql").read()
-    df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False)
-    df.to_parquet("sot_taxes_exemptions.parquet.gzip", compression="gzip")
-
-# Declare geographic groups and their associated data years
-geos = {
-    "census_data_year": [
-        "census_place",
-        "census_tract",
-        "census_congressional_district",
-        "census_zcta",
-    ],
-    "cook_board_of_review_district_data_year": [
-        "cook_board_of_review_district"
-    ],
-    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
-    "cook_judicial_district_data_year": ["cook_judicial_district"],
-    "ward_data_year": ["ward_num"],
-    "community_area_data_year": ["community_area"],
-    "police_district_data_year": ["police_district"],
-    "central_business_district_data_year": ["central_business_district"],
-    "school_data_year": [
-        "school_elementary_district",
-        "school_secondary_district",
-        "school_unified_district",
-    ],
-    "tax_data_year": [
-        "tax_municipality",
-        "tax_park_district",
-        "tax_library_district",
-        "tax_fire_protection_district",
-        "tax_community_college_district",
-        "tax_sanitation_district",
-        "tax_special_service_area",
-        "tax_tif_district",
-    ],
-}
-# Declare class groupings
-groups = ["no_group", "class", "major_class", "modeling_group"]
-
-
-# Define aggregation functions
-def q10(x):
-    return x.quantile(0.1)
-
-
-def q25(x):
-    return x.quantile(0.25)
-
-
-def q75(x):
-    return x.quantile(0.75)
-
-
-def q90(x):
-    return x.quantile(0.9)
-
-
-def first(x):
-    return x.iloc[0]
-
-
-more_stats = [
-    "min",
-    q10,
-    q25,
-    "median",
-    q75,
-    q90,
-    "max",
-    "mean",
-    "sum",
-]
-
-less_stats = ["count", "sum"]
-
-agg_func_math = {
-    "eq_factor_final": ["size", first],
-    "eq_factor_tentative": [first],
-    "tax_bill_total": more_stats,
-    "tax_code_rate": more_stats,
-    "av_clerk": more_stats,
-    "exe_homeowner": less_stats,
-    "exe_senior": less_stats,
-    "exe_freeze": less_stats,
-    "exe_longtime_homeowner": less_stats,
-    "exe_disabled": less_stats,
-    "exe_vet_returning": less_stats,
-    "exe_vet_dis_lt50": less_stats,
-    "exe_vet_dis_50_69": less_stats,
-    "exe_vet_dis_ge70": less_stats,
-    "exe_abate": less_stats,
-}
-
-# Create an empty dataframe to fill with output
-output = pd.DataFrame()
-# Loop through group combinations and stack output
-for key, value in geos.items():
-    df["data_year"] = df[key]
-
-    for x in value:
-        for z in groups:
-            group = [x, z, "year"]
-            summary = df.groupby(group).agg(agg_func_math).round(2)
-            summary["geography_type"] = x
-            summary["group_type"] = z
-            summary.index.names = ["geography_id", "group_id", "year"]
-            summary = summary.reset_index().set_index(
-                [
-                    "geography_type",
-                    "geography_id",
-                    "group_type",
-                    "group_id",
-                    "year",
-                ]
-            )
-
-            output = pd.concat([output, summary])
-
-# Clean combined output and export
-for i in ["median", "mean", "sum"]:
-    output["tax_bill_total", "delta" + i] = output["tax_bill_total", i].diff()
-
-output.columns = ["_".join(col) for col in output.columns]
-output.reset_index()
-
-output.to_csv("sot_taxes_exemptions.csv")
diff --git a/dbt/models/reporting/schema.yml b/dbt/models/reporting/schema.yml
index eac6c31a7..9b5aefacb 100644
--- a/dbt/models/reporting/schema.yml
+++ b/dbt/models/reporting/schema.yml
@@ -34,6 +34,12 @@ models:
             within_20_pct >= within_10_pct
             AND within_10_pct >= within_05_pct
 
+  - name: reporting.sot_assessment_roll
+    description: '{{ doc("table_sot_assessment_roll") }}'
+    config:
+      tags:
+        - daily
+
   - name: reporting.sot_assessment_roll_input
     description: '{{ doc("table_sot_assessment_roll_input") }}'
     config:

From fdff4570ff7ec94e5fb08d372f6ec411a0abab02 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 20 Jun 2024 18:48:55 +0000
Subject: [PATCH 13/96] Testing build on smaller input

---
 dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index e29d3c037..a5c23e2ff 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -110,3 +110,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
+WHERE uni.year IN ('2022', '2023')

From 6abd07402b02916267fea92dc873ab35f874cba6 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 20 Jun 2024 18:59:19 +0000
Subject: [PATCH 14/96] Trying to build on limited sample

---
 dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index a5c23e2ff..8f3f7c3f2 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -110,4 +110,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
-WHERE uni.year IN ('2022', '2023')
+WHERE uni.stage_name = 'MAILED' AND uni.class = '278'

From fd342b6ce1fb4ab5f011c56344f678feb7a1b752 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 20 Jun 2024 19:29:05 +0000
Subject: [PATCH 15/96] Try to build sales table

---
 dbt/models/reporting/reporting.sot_sales.py   | 149 ++++++++++++++++++
 .../reporting/reporting.sot_sales_input.sql   |  20 ++-
 2 files changed, 161 insertions(+), 8 deletions(-)
 create mode 100644 dbt/models/reporting/reporting.sot_sales.py

diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
new file mode 100644
index 000000000..9709e77b3
--- /dev/null
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -0,0 +1,149 @@
+# This script generates aggregated summary stats on sales data across a number
+# of geographies, class combinations, and time.
+
+import statistics as stats
+
+# Import libraries
+import numpy as np
+import pandas as pd
+
+# Declare geographic groups and their associated data years
+geos = {
+    "census_data_year": [
+        "census_place",
+        "census_tract",
+        "census_congressional_district",
+        "census_zcta",
+    ],
+    "cook_board_of_review_district_data_year": [
+        "cook_board_of_review_district"
+    ],
+    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
+    "cook_judicial_district_data_year": ["cook_judicial_district"],
+    "ward_data_year": ["ward_num"],
+    "community_area_data_year": ["community_area"],
+    "police_district_data_year": ["police_district"],
+    "central_business_district_data_year": ["central_business_district"],
+    "school_data_year": [
+        "school_elementary_district",
+        "school_secondary_district",
+        "school_unified_district",
+    ],
+    "tax_data_year": [
+        "tax_municipality",
+        "tax_park_district",
+        "tax_library_district",
+        "tax_fire_protection_district",
+        "tax_community_college_district",
+        "tax_sanitation_district",
+        "tax_special_service_area",
+        "tax_tif_district",
+    ],
+}
+# Declare class groupings
+groups = ["no_group", "class", "major_class", "modeling_group"]
+
+
+# Define aggregation functions
+def q10(x):
+    return x.quantile(0.1)
+
+
+def q25(x):
+    return x.quantile(0.25)
+
+
+def q75(x):
+    return x.quantile(0.75)
+
+
+def q90(x):
+    return x.quantile(0.9)
+
+
+def first(x):
+    return x.iloc[0]
+
+
+more_stats = [
+    "min",
+    q10,
+    q25,
+    "median",
+    q75,
+    q90,
+    "max",
+    "mean",
+    "sum",
+]
+
+agg_func_math = {
+    "sale_price": ["size", "count"] + more_stats,
+    "price_per_sf": more_stats,
+    "char_bldg_sf": ["median"],
+    "char_land_sf": ["median"],
+    "char_yrblt": ["median"],
+    "class": [stats.multimode],
+    "data_year": [first],
+}
+
+
+def assemble(df, geos, groups):
+    """Stack sales summary stats for every geography/class-group pair."""
+    output = pd.DataFrame()
+
+    # Loop through group combinations and stack output
+    for key, value in geos.items():
+        df["data_year"] = df[key]
+
+        for x in value:
+            for z in groups:
+                group = [x, z, "year"]
+                summary = df.groupby(group).agg(agg_func_math).round(2)
+                summary["geography_type"] = x
+                summary["group_type"] = z
+                summary.index.names = ["geography_id", "group_id", "year"]
+                summary = summary.reset_index().set_index(
+                    [
+                        "geography_type",
+                        "geography_id",
+                        "group_type",
+                        "group_id",
+                        "year",
+                    ]
+                )
+
+                output = pd.concat([output, summary])
+
+    # Clean combined output: blank out zero sums. np.nan is used because the
+    # np.NaN alias was removed in NumPy 2.0
+    for col in ["sale_price", "price_per_sf"]:
+        output[col, "sum"] = output[col, "sum"].replace(0, np.nan)
+
+    for i in ["median", "mean", "sum"]:
+        output["sale_price", "delta" + i] = output["sale_price", i].diff()
+        output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff()
+
+    # Flatten the two-level column names. reset_index() returns a new frame,
+    # so reassign it or the geography/group/year keys stay in the index
+    output.columns = ["_".join(col) for col in output.columns]
+    output = output.reset_index()
+
+    return output
+
+
+def model(dbt, spark_session):
+    """dbt Python model: summarize the sales input table with assemble()."""
+    dbt.config(materialized="table")
+
+    # The input arrives as a Spark dataframe; assemble() is written against
+    # the pandas API, so convert before summarizing
+    input_df = dbt.ref("reporting.sot_sales_input").toPandas()
+
+    summary = assemble(input_df, geos=geos, groups=groups)
+
+    # Hand a Spark dataframe back to dbt
+    spark_df = spark_session.createDataFrame(summary)
+
+    return spark_df
diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sales_input.sql
index 98dcbec23..e6ca7afd4 100644
--- a/dbt/models/reporting/reporting.sot_sales_input.sql
+++ b/dbt/models/reporting/reporting.sot_sales_input.sql
@@ -53,16 +53,19 @@ SELECT
     uni.school_elementary_district_geoid AS school_elementary_district,
     uni.school_secondary_district_geoid AS school_secondary_district,
     uni.school_unified_district_geoid AS school_unified_district,
-    uni.tax_municipality_name AS tax_municipality,
-    uni.tax_park_district_name AS tax_park_district,
-    uni.tax_library_district_name AS tax_library_district,
-    uni.tax_fire_protection_district_name AS tax_fire_protection_district,
-    uni.tax_community_college_district_name
+    ARRAY_JOIN(uni.tax_municipality_name, ', ') AS tax_municipality,
+    ARRAY_JOIN(uni.tax_park_district_name, ', ') AS tax_park_district,
+    ARRAY_JOIN(uni.tax_library_district_name, ', ') AS tax_library_district,
+    ARRAY_JOIN(uni.tax_fire_protection_district_name, ', ')
+        AS tax_fire_protection_district,
+    ARRAY_JOIN(uni.tax_community_college_district_name, ', ')
         AS
         tax_community_college_district,
-    uni.tax_sanitation_district_name AS tax_sanitation_district,
-    uni.tax_special_service_area_name AS tax_special_service_area,
-    uni.tax_tif_district_name AS tax_tif_district,
+    ARRAY_JOIN(uni.tax_sanitation_district_name, ', ')
+        AS tax_sanitation_district,
+    ARRAY_JOIN(uni.tax_special_service_area_name, ', ')
+        AS tax_special_service_area,
+    ARRAY_JOIN(uni.tax_tif_district_name, ', ') AS tax_tif_district,
     uni.econ_central_business_district_num AS central_business_district,
     uni.census_data_year,
     uni.cook_board_of_review_district_data_year,
@@ -97,3 +100,4 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     AND NOT sales.sale_filter_deed_type
     AND NOT sales.sale_filter_less_than_10k
     AND NOT sales.sale_filter_same_sale_within_365
+WHERE uni.year = '2023'

From cccf8e1ed8dc7a726cc8d850906a1a5dbdd08980 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 20 Jun 2024 19:40:53 +0000
Subject: [PATCH 16/96] Try to build taxes and exemptions table

---
 .../reporting.sot_ratio_stats_input.sql       |  19 ++-
 .../reporting.sot_taxes_exemptions.py         | 149 ++++++++++++++++++
 .../reporting.sot_taxes_exemptions_input.sql  |  20 ++-
 3 files changed, 172 insertions(+), 16 deletions(-)
 create mode 100644 dbt/models/reporting/reporting.sot_taxes_exemptions.py

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
index f723c10ef..994f1d192 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
@@ -72,16 +72,19 @@ SELECT
     uni.school_elementary_district_geoid AS school_elementary_district,
     uni.school_secondary_district_geoid AS school_secondary_district,
     uni.school_unified_district_geoid AS school_unified_district,
-    uni.tax_municipality_name AS tax_municipality,
-    uni.tax_park_district_name AS tax_park_district,
-    uni.tax_library_district_name AS tax_library_district,
-    uni.tax_fire_protection_district_name AS tax_fire_protection_district,
-    uni.tax_community_college_district_name
+    ARRAY_JOIN(uni.tax_municipality_name, ', ') AS tax_municipality,
+    ARRAY_JOIN(uni.tax_park_district_name, ', ') AS tax_park_district,
+    ARRAY_JOIN(uni.tax_library_district_name, ', ') AS tax_library_district,
+    ARRAY_JOIN(uni.tax_fire_protection_district_name, ', ')
+        AS tax_fire_protection_district,
+    ARRAY_JOIN(uni.tax_community_college_district_name, ', ')
         AS
         tax_community_college_district,
-    uni.tax_sanitation_district_name AS tax_sanitation_district,
-    uni.tax_special_service_area_name AS tax_special_service_area,
-    uni.tax_tif_district_name AS tax_tif_district,
+    ARRAY_JOIN(uni.tax_sanitation_district_name, ', ')
+        AS tax_sanitation_district,
+    ARRAY_JOIN(uni.tax_special_service_area_name, ', ')
+        AS tax_special_service_area,
+    ARRAY_JOIN(uni.tax_tif_district_name, ', ') AS tax_tif_district,
     uni.econ_central_business_district_num AS central_business_district,
     uni.census_data_year,
     uni.cook_board_of_review_district_data_year,
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
new file mode 100644
index 000000000..4877b2d51
--- /dev/null
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -0,0 +1,173 @@
+# This script generates aggregated summary stats on taxes and exemptions data
+# across a number of geographies, class combinations, and time.
+
+# Import libraries
+import pandas as pd
+
+# Declare geographic groups and their associated data years
+# Each key is a column of the input holding a data year; each value lists the
+# geography columns that assemble() summarizes while processing that key.
+geos = {
+    "census_data_year": [
+        "census_place",
+        "census_tract",
+        "census_congressional_district",
+        "census_zcta",
+    ],
+    "cook_board_of_review_district_data_year": [
+        "cook_board_of_review_district"
+    ],
+    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
+    "cook_judicial_district_data_year": ["cook_judicial_district"],
+    "ward_data_year": ["ward_num"],
+    "community_area_data_year": ["community_area"],
+    "police_district_data_year": ["police_district"],
+    "central_business_district_data_year": ["central_business_district"],
+    "school_data_year": [
+        "school_elementary_district",
+        "school_secondary_district",
+        "school_unified_district",
+    ],
+    "tax_data_year": [
+        "tax_municipality",
+        "tax_park_district",
+        "tax_library_district",
+        "tax_fire_protection_district",
+        "tax_community_college_district",
+        "tax_sanitation_district",
+        "tax_special_service_area",
+        "tax_tif_district",
+    ],
+}
+# Declare class groupings
+groups = ["no_group", "class", "major_class", "modeling_group"]
+
+
+# Define aggregation functions
+def q10(x):
+    """Return the 10th percentile of ``x`` (``x.quantile(0.1)``)."""
+    return x.quantile(0.1)
+
+
+def q25(x):
+    """Return the 25th percentile of ``x`` (``x.quantile(0.25)``)."""
+    return x.quantile(0.25)
+
+
+def q75(x):
+    """Return the 75th percentile of ``x`` (``x.quantile(0.75)``)."""
+    return x.quantile(0.75)
+
+
+def q90(x):
+    """Return the 90th percentile of ``x`` (``x.quantile(0.9)``)."""
+    return x.quantile(0.9)
+
+
+def first(x):
+    """Return the first value of ``x`` by position (``x.iloc[0]``)."""
+    return x.iloc[0]
+
+
+# Full distribution summary, used for the bill, rate and AV columns below
+more_stats = [
+    "min",
+    q10,
+    q25,
+    "median",
+    q75,
+    q90,
+    "max",
+    "mean",
+    "sum",
+]
+
+# Reduced summary, used for the exemption columns below
+less_stats = ["count", "sum"]
+
+# Map each input column to the list of statistics computed for it
+agg_func_math = {
+    "eq_factor_final": ["size", first],
+    "eq_factor_tentative": [first],
+    "tax_bill_total": more_stats,
+    "tax_code_rate": more_stats,
+    "av_clerk": more_stats,
+    "exe_homeowner": less_stats,
+    "exe_senior": less_stats,
+    "exe_freeze": less_stats,
+    "exe_longtime_homeowner": less_stats,
+    "exe_disabled": less_stats,
+    "exe_vet_returning": less_stats,
+    "exe_vet_dis_lt50": less_stats,
+    "exe_vet_dis_50_69": less_stats,
+    "exe_vet_dis_ge70": less_stats,
+    "exe_abate": less_stats,
+}
+
+
+# Build the stacked summary table: one set of rows per geography column and
+# class grouping, keyed by geography_type / geography_id / group_type /
+# group_id / year, with one flattened "<column>_<statistic>" column per
+# statistic. Returns a DataFrame with the key fields as ordinary columns.
+# The input frame is modified in place (a "data_year" column is written to it).
+def assemble(df, geos, groups):
+    # Create an empty dataframe to fill with output
+    output = pd.DataFrame()
+    # Loop through group combinations and stack output
+    for key, value in geos.items():
+        df["data_year"] = df[key]
+
+        for x in value:
+            for z in groups:
+                group = [x, z, "year"]
+                summary = df.groupby(group).agg(agg_func_math).round(2)
+                summary["geography_type"] = x
+                summary["group_type"] = z
+                summary.index.names = ["geography_id", "group_id", "year"]
+                summary = summary.reset_index().set_index(
+                    [
+                        "geography_type",
+                        "geography_id",
+                        "group_type",
+                        "group_id",
+                        "year",
+                    ]
+                )
+
+                output = pd.concat([output, summary])
+
+    # Add change-from-previous-row columns for the tax bill summaries.
+    # NOTE(review): diff() runs over the whole stacked table in row order, so
+    # values are also differenced across geography/group boundaries - confirm
+    # whether a per-geography, per-group diff was intended.
+    for i in ["median", "mean", "sum"]:
+        output["tax_bill_total", "delta" + i] = output[
+            "tax_bill_total", i
+        ].diff()
+
+    # Flatten the (column, statistic) header into "<column>_<statistic>"
+    output.columns = ["_".join(col) for col in output.columns]
+
+    # reset_index() returns a new frame, so the result must be kept; otherwise
+    # the key fields stay in the index and are lost when the frame is handed
+    # to Spark.
+    output = output.reset_index()
+
+    return output
+
+
+def model(dbt, spark_session):
+    """Summarize the taxes and exemptions input table as a dbt table."""
+    dbt.config(materialized="table")
+
+    input = dbt.ref("reporting.sot_taxes_exemptions_input")
+
+    # Convert the Spark input dataframe to Pandas, since assemble() is
+    # written against the pandas API
+    input = input.toPandas()
+
+    df = assemble(input, geos=geos, groups=groups)
+
+    spark_df = spark_session.createDataFrame(df)
+
+    return spark_df
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
index 80bee99bc..0bf3872b7 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
@@ -58,16 +58,19 @@ SELECT
     uni.school_elementary_district_geoid AS school_elementary_district,
     uni.school_secondary_district_geoid AS school_secondary_district,
     uni.school_unified_district_geoid AS school_unified_district,
-    uni.tax_municipality_name AS tax_municipality,
-    uni.tax_park_district_name AS tax_park_district,
-    uni.tax_library_district_name AS tax_library_district,
-    uni.tax_fire_protection_district_name AS tax_fire_protection_district,
-    uni.tax_community_college_district_name
+    ARRAY_JOIN(uni.tax_municipality_name, ', ') AS tax_municipality,
+    ARRAY_JOIN(uni.tax_park_district_name, ', ') AS tax_park_district,
+    ARRAY_JOIN(uni.tax_library_district_name, ', ') AS tax_library_district,
+    ARRAY_JOIN(uni.tax_fire_protection_district_name, ', ')
+        AS tax_fire_protection_district,
+    ARRAY_JOIN(uni.tax_community_college_district_name, ', ')
         AS
         tax_community_college_district,
-    uni.tax_sanitation_district_name AS tax_sanitation_district,
-    uni.tax_special_service_area_name AS tax_special_service_area,
-    uni.tax_tif_district_name AS tax_tif_district,
+    ARRAY_JOIN(uni.tax_sanitation_district_name, ', ')
+        AS tax_sanitation_district,
+    ARRAY_JOIN(uni.tax_special_service_area_name, ', ')
+        AS tax_special_service_area,
+    ARRAY_JOIN(uni.tax_tif_district_name, ', ') AS tax_tif_district,
     uni.econ_central_business_district_num AS central_business_district,
     uni.census_data_year,
     uni.cook_board_of_review_district_data_year,
@@ -97,3 +100,4 @@ INNER JOIN {{ source('tax', 'tax_code') }} AS tcd
     AND tax.year = tcd.year
 INNER JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
+WHERE uni.class = '278'

From 365696459ce27a5f3023c13dfdac46ef4c8adba4 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 20 Jun 2024 19:53:27 +0000
Subject: [PATCH 17/96] Try to build taxes and exemptions table

---
 temp/reporting.sot_ratio_stats.py | 168 ++++++++++++++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 temp/reporting.sot_ratio_stats.py

diff --git a/temp/reporting.sot_ratio_stats.py b/temp/reporting.sot_ratio_stats.py
new file mode 100644
index 000000000..a37cb2261
--- /dev/null
+++ b/temp/reporting.sot_ratio_stats.py
@@ -0,0 +1,168 @@
+# This script generates aggregated summary stats on sales data across a number
+# of geographies, class combinations, and time.
+import os.path
+
+# Import libraries
+import assesspy as ass
+import awswrangler as wr
+import pandas as pd
+
+# Ingest data if it is not already available
+if os.path.isfile("sot_ratio_stats.parquet.gzip"):
+    df = pd.read_parquet("sot_ratio_stats.parquet.gzip")
+
+else:
+    sql = open("reporting.sot_ratio_stats.sql").read()
+    df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False)
+    df.to_parquet("sot_ratio_stats.parquet.gzip", compression="gzip")
+
+# Declare geographic groups and their associated data years
+geos = {
+    "census_data_year": [
+        "census_place",
+        "census_tract",
+        "census_congressional_district",
+        "census_zcta",
+    ],
+    "cook_board_of_review_district_data_year": [
+        "cook_board_of_review_district"
+    ],
+    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
+    "cook_judicial_district_data_year": ["cook_judicial_district"],
+    "ward_data_year": ["ward_num"],
+    "community_area_data_year": ["community_area"],
+    "police_district_data_year": ["police_district"],
+    "central_business_district_data_year": ["central_business_district"],
+    "school_data_year": [
+        "school_elementary_district",
+        "school_secondary_district",
+        "school_unified_district",
+    ],
+    "tax_data_year": [
+        "tax_municipality",
+        "tax_park_district",
+        "tax_library_district",
+        "tax_fire_protection_district",
+        "tax_community_college_district",
+        "tax_sanitation_district",
+        "tax_special_service_area",
+        "tax_tif_district",
+    ],
+}
+# Declare class groupings
+groups = ["no_group", "class", "major_class", "modeling_group"]
+
+
+# Define aggregation functions
+def aggregrate(data, geography_type, group_type):
+    print(geography_type, group_type)
+
+    group = [geography_type, group_type, "year", "stage_name"]
+    data["size"] = data.groupby(group)["tot_mv"].transform("size")
+    data["sale_count"] = data.groupby(group)["sale_price"].transform("count")
+    data["mv_count"] = data.groupby(group)["tot_mv"].transform("count")
+
+    # Remove parcels with FMVs of 0 since they screw up ratios
+    data = data[data["tot_mv"] > 0].reset_index()
+    data["ratio_count"] = data.groupby(group)["ratio"].transform("count")
+
+    # Remove groups that only have one sale since we can't calculate stats
+    data = data[data["ratio_count"] > 1]
+
+    summary = (
+        data.dropna(subset=["ratio"])
+        .groupby(group)
+        .apply(
+            lambda x: pd.Series(
+                {
+                    "size": x["size"].iloc[0],
+                    "mv_count": x["mv_count"].iloc[0],
+                    "sale_count": x["sale_count"].iloc[0],
+                    "mv_min": x["tot_mv"].min(),
+                    "mv_q10": x["tot_mv"].quantile(0.1),
+                    "mv_q25": x["tot_mv"].quantile(0.25),
+                    "mv_median": x["tot_mv"].median(),
+                    "mv_q75": x["tot_mv"].quantile(0.75),
+                    "mv_q90": x["tot_mv"].quantile(0.90),
+                    "mv_max": x["tot_mv"].max(),
+                    "mv_mean": x["tot_mv"].mean(),
+                    "mv_sum": x["tot_mv"].sum(),
+                    "ratio_min": x["ratio"].min(),
+                    "ratio_q10": x["ratio"].quantile(0.1),
+                    "ratio_q25": x["ratio"].quantile(0.25),
+                    "ratio_median": x["ratio"].median(),
+                    "ratio_q75": x["ratio"].quantile(0.75),
+                    "ratio_q90": x["ratio"].quantile(0.90),
+                    "ratio_max": x["ratio"].max(),
+                    "ratio_mean": x["ratio"].mean(),
+                    "cod": ass.cod(ratio=x["ratio"]),
+                    "prd": ass.prd(x["tot_mv"], x["sale_price"]),
+                    "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"],
+                    # "mki": ass.mki(x["tot_mv"], x["sale_price"]),
+                }
+            ),
+            include_groups=False,
+        )
+    )
+    summary["geography_type"] = geography_type
+    summary["group_type"] = group_type
+
+    return summary
+
+
+# Create an empty dataframe to fill with output
+output = pd.DataFrame()
+
+# Loop through group combinations and stack output
+for key, value in geos.items():
+    df["data_year"] = df[key]
+
+    for x in value:
+        for z in groups:
+            output = pd.concat([output, aggregrate(df, x, z)])
+
+output.index.names = ["geography_id", "group_id", "year", "stage_name"]
+
+output = output.reset_index().set_index(
+    [
+        "geography_type",
+        "geography_id",
+        "group_type",
+        "group_id",
+        "year",
+        "stage_name",
+    ]
+)
+
+# Clean combined output and export
+output["mv_delta_pct_median"] = (
+    output.sort_values("year")
+    .groupby(["geography_id", "group_id", "stage_name"])
+    .mv_median.diff()
+)
+output["mv_delta_pct_mean"] = (
+    output.sort_values("year")
+    .groupby(["geography_id", "group_id", "stage_name"])
+    .mv_mean.diff()
+)
+output["mv_delta_pct_sum"] = (
+    output.sort_values("year")
+    .groupby(["geography_id", "group_id", "stage_name"])
+    .mv_sum.diff()
+)
+
+output["mv_delta_pct_median"] = (
+    output.sort_values("year")
+    .groupby(["geography_id", "group_id", "stage_name"])
+    .mv_median.pct_change()
+)
+output["mv_delta_pct_mean"] = (
+    output.sort_values("year")
+    .groupby(["geography_id", "group_id", "stage_name"])
+    .mv_mean.pct_change()
+)
+
+output.dropna(how="all", axis=1, inplace=True)
+output.to_csv("sot_ratio_stats.csv")
+
+# %%

From 8b0f95f41792d46e0a40b005fb8f871167ec03ed Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 20 Jun 2024 21:08:33 +0000
Subject: [PATCH 18/96] Try to build taxes table

---
 dbt/models/reporting/docs.md                  |  16 ++
 .../reporting.sot_ratio_stats_input.sql       |   1 +
 .../reporting.sot_taxes_exemptions.py         |   4 +
 .../reporting.sot_taxes_exemptions_input.sql  |  13 +-
 dbt/models/reporting/schema.yml               |  12 ++
 temp/reporting.sot_ratio_stats.py             | 168 ------------------
 6 files changed, 44 insertions(+), 170 deletions(-)
 delete mode 100644 temp/reporting.sot_ratio_stats.py

diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md
index dd0f7bf68..3854dbd79 100644
--- a/dbt/models/reporting/docs.md
+++ b/dbt/models/reporting/docs.md
@@ -57,6 +57,14 @@ Table to feed the Python dbt job that creates the
 **Primary Key**: `year`, `stage_name`, `geography_id`, `group_id`
 {% enddocs %}
 
+# sot_sales
+
+{% docs table_sot_sales %}
+Feeds public reporting assets.
+
+**Primary Key**: `year`, `geography_id`, `group_id`
+{% enddocs %}
+
 # sot_sales_input
 
 {% docs table_sot_sales_input %}
@@ -66,6 +74,14 @@ Table to feed the Python dbt job that creates the
 **Primary Key**: `year`, `geography_id`, `group_id`
 {% enddocs %}
 
+# sot_taxes_exemptions
+
+{% docs table_sot_taxes_exemptions %}
+Feeds public reporting assets.
+
+**Primary Key**: `year`, `geography_id`, `group_id`
+{% enddocs %}
+
 # sot_taxes_exemptions_input
 
 {% docs table_sot_taxes_exemptions_input %}
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
index 994f1d192..6cd258c2a 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
@@ -119,3 +119,4 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     AND NOT sales.sale_filter_less_than_10k
     AND NOT sales.sale_filter_same_sale_within_365
 WHERE uni.year >= '2020'
+    AND uni.year = '2023' AND uni.class = '278'
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index 4877b2d51..cb257a49e 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -1,3 +1,6 @@
+# pylint: skip-file
+# type: ignore
+
 # This script generates aggregated summary stats on taxes and exemptions data
 # across a number of geographies, class combinations, and time.
 
@@ -98,6 +101,7 @@ def first(x):
 def assemble(df, geos, groups):
     # Create an empty dataframe to fill with output
     output = pd.DataFrame()
+
     # Loop through group combinations and stack output
     for key, value in geos.items():
         df["data_year"] = df[key]
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
index 0bf3872b7..d0213ff5a 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
@@ -4,9 +4,18 @@
     )
 }}
 
+WITH tcd AS (
+    SELECT DISTINCT
+        tax_code_num,
+        tax_code_rate,
+        year
+    FROM {{ source('tax', 'tax_code') }}
+)
+
 -- Gather parcel-level geographies and join taxes, exemptions, and class
 -- groupings
 SELECT
+    uni.pin,
     tax.year,
     tax.av_clerk,
     tax.tax_bill_total,
@@ -95,9 +104,9 @@ INNER JOIN {{ source('tax', 'pin') }} AS tax
     AND uni.year = tax.year
 INNER JOIN {{ source('tax', 'eq_factor') }} AS eqf
     ON uni.year = eqf.year
-INNER JOIN {{ source('tax', 'tax_code') }} AS tcd
+INNER JOIN tcd
     ON tax.tax_code_num = tcd.tax_code_num
     AND tax.year = tcd.year
 INNER JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
-WHERE uni.class = '278'
+WHERE uni.class = '206'
diff --git a/dbt/models/reporting/schema.yml b/dbt/models/reporting/schema.yml
index 9b5aefacb..5072ae8cc 100644
--- a/dbt/models/reporting/schema.yml
+++ b/dbt/models/reporting/schema.yml
@@ -52,12 +52,24 @@ models:
       tags:
         - daily
 
+  - name: reporting.sot_sales
+    description: '{{ doc("table_sot_sales") }}'
+    config:
+      tags:
+        - daily
+
   - name: reporting.sot_sales_input
     description: '{{ doc("table_sot_sales_input") }}'
     config:
       tags:
         - daily
 
+  - name: reporting.sot_taxes_exemptions
+    description: '{{ doc("table_sot_taxes_exemptions") }}'
+    config:
+      tags:
+        - daily
+
   - name: reporting.sot_taxes_exemptions_input
     description: '{{ doc("table_sot_taxes_exemptions_input") }}'
     config:
diff --git a/temp/reporting.sot_ratio_stats.py b/temp/reporting.sot_ratio_stats.py
deleted file mode 100644
index a37cb2261..000000000
--- a/temp/reporting.sot_ratio_stats.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# This script generates aggregated summary stats on sales data across a number
-# of geographies, class combinations, and time.
-import os.path
-
-# Import libraries
-import assesspy as ass
-import awswrangler as wr
-import pandas as pd
-
-# Ingest data if it is not already available
-if os.path.isfile("sot_ratio_stats.parquet.gzip"):
-    df = pd.read_parquet("sot_ratio_stats.parquet.gzip")
-
-else:
-    sql = open("reporting.sot_ratio_stats.sql").read()
-    df = wr.athena.read_sql_query(sql, database="default", ctas_approach=False)
-    df.to_parquet("sot_ratio_stats.parquet.gzip", compression="gzip")
-
-# Declare geographic groups and their associated data years
-geos = {
-    "census_data_year": [
-        "census_place",
-        "census_tract",
-        "census_congressional_district",
-        "census_zcta",
-    ],
-    "cook_board_of_review_district_data_year": [
-        "cook_board_of_review_district"
-    ],
-    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
-    "cook_judicial_district_data_year": ["cook_judicial_district"],
-    "ward_data_year": ["ward_num"],
-    "community_area_data_year": ["community_area"],
-    "police_district_data_year": ["police_district"],
-    "central_business_district_data_year": ["central_business_district"],
-    "school_data_year": [
-        "school_elementary_district",
-        "school_secondary_district",
-        "school_unified_district",
-    ],
-    "tax_data_year": [
-        "tax_municipality",
-        "tax_park_district",
-        "tax_library_district",
-        "tax_fire_protection_district",
-        "tax_community_college_district",
-        "tax_sanitation_district",
-        "tax_special_service_area",
-        "tax_tif_district",
-    ],
-}
-# Declare class groupings
-groups = ["no_group", "class", "major_class", "modeling_group"]
-
-
-# Define aggregation functions
-def aggregrate(data, geography_type, group_type):
-    print(geography_type, group_type)
-
-    group = [geography_type, group_type, "year", "stage_name"]
-    data["size"] = data.groupby(group)["tot_mv"].transform("size")
-    data["sale_count"] = data.groupby(group)["sale_price"].transform("count")
-    data["mv_count"] = data.groupby(group)["tot_mv"].transform("count")
-
-    # Remove parcels with FMVs of 0 since they screw up ratios
-    data = data[data["tot_mv"] > 0].reset_index()
-    data["ratio_count"] = data.groupby(group)["ratio"].transform("count")
-
-    # Remove groups that only have one sale since we can't calculate stats
-    data = data[data["ratio_count"] > 1]
-
-    summary = (
-        data.dropna(subset=["ratio"])
-        .groupby(group)
-        .apply(
-            lambda x: pd.Series(
-                {
-                    "size": x["size"].iloc[0],
-                    "mv_count": x["mv_count"].iloc[0],
-                    "sale_count": x["sale_count"].iloc[0],
-                    "mv_min": x["tot_mv"].min(),
-                    "mv_q10": x["tot_mv"].quantile(0.1),
-                    "mv_q25": x["tot_mv"].quantile(0.25),
-                    "mv_median": x["tot_mv"].median(),
-                    "mv_q75": x["tot_mv"].quantile(0.75),
-                    "mv_q90": x["tot_mv"].quantile(0.90),
-                    "mv_max": x["tot_mv"].max(),
-                    "mv_mean": x["tot_mv"].mean(),
-                    "mv_sum": x["tot_mv"].sum(),
-                    "ratio_min": x["ratio"].min(),
-                    "ratio_q10": x["ratio"].quantile(0.1),
-                    "ratio_q25": x["ratio"].quantile(0.25),
-                    "ratio_median": x["ratio"].median(),
-                    "ratio_q75": x["ratio"].quantile(0.75),
-                    "ratio_q90": x["ratio"].quantile(0.90),
-                    "ratio_max": x["ratio"].max(),
-                    "ratio_mean": x["ratio"].mean(),
-                    "cod": ass.cod(ratio=x["ratio"]),
-                    "prd": ass.prd(x["tot_mv"], x["sale_price"]),
-                    "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"],
-                    # "mki": ass.mki(x["tot_mv"], x["sale_price"]),
-                }
-            ),
-            include_groups=False,
-        )
-    )
-    summary["geography_type"] = geography_type
-    summary["group_type"] = group_type
-
-    return summary
-
-
-# Create an empty dataframe to fill with output
-output = pd.DataFrame()
-
-# Loop through group combinations and stack output
-for key, value in geos.items():
-    df["data_year"] = df[key]
-
-    for x in value:
-        for z in groups:
-            output = pd.concat([output, aggregrate(df, x, z)])
-
-output.index.names = ["geography_id", "group_id", "year", "stage_name"]
-
-output = output.reset_index().set_index(
-    [
-        "geography_type",
-        "geography_id",
-        "group_type",
-        "group_id",
-        "year",
-        "stage_name",
-    ]
-)
-
-# Clean combined output and export
-output["mv_delta_pct_median"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id", "stage_name"])
-    .mv_median.diff()
-)
-output["mv_delta_pct_mean"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id", "stage_name"])
-    .mv_mean.diff()
-)
-output["mv_delta_pct_sum"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id", "stage_name"])
-    .mv_sum.diff()
-)
-
-output["mv_delta_pct_median"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id", "stage_name"])
-    .mv_median.pct_change()
-)
-output["mv_delta_pct_mean"] = (
-    output.sort_values("year")
-    .groupby(["geography_id", "group_id", "stage_name"])
-    .mv_mean.pct_change()
-)
-
-output.dropna(how="all", axis=1, inplace=True)
-output.to_csv("sot_ratio_stats.csv")
-
-# %%

From 9383bdc7794e1091781a8c49c1da862a9efd2277 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 20 Jun 2024 21:42:53 +0000
Subject: [PATCH 19/96] Try to build ratio stats table

---
 dbt/models/reporting/docs.md                  |   8 +
 .../reporting/reporting.sot_ratio_stats.py    | 176 ++++++++++++++++++
 dbt/models/reporting/schema.yml               |   6 +
 3 files changed, 190 insertions(+)
 create mode 100644 dbt/models/reporting/reporting.sot_ratio_stats.py

diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md
index 3854dbd79..80261b340 100644
--- a/dbt/models/reporting/docs.md
+++ b/dbt/models/reporting/docs.md
@@ -48,6 +48,14 @@ Table to feed the Python dbt job that creates the
 **Primary Key**: `year`, `stage_name`, `geography_id`, `group_id`
 {% enddocs %}
 
+# sot_ratio_stats
+
+{% docs table_sot_ratio_stats %}
+Feeds public reporting assets.
+
+**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id`
+{% enddocs %}
+
 # sot_ratio_stats_input
 
 {% docs table_sot_ratio_stats_input %}
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
new file mode 100644
index 000000000..2328dc48f
--- /dev/null
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -0,0 +1,185 @@
+# pylint: skip-file
+# type: ignore
+
+# This script generates aggregated summary stats on sales data across a number
+# of geographies, class combinations, and time.
+
+# Import libraries
+import assesspy as ass
+import pandas as pd
+
+# Declare geographic groups and their associated data years
+geos = {
+    "census_data_year": [
+        "census_place",
+        "census_tract",
+        "census_congressional_district",
+        "census_zcta",
+    ],
+    "cook_board_of_review_district_data_year": [
+        "cook_board_of_review_district"
+    ],
+    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
+    "cook_judicial_district_data_year": ["cook_judicial_district"],
+    "ward_data_year": ["ward_num"],
+    "community_area_data_year": ["community_area"],
+    "police_district_data_year": ["police_district"],
+    "central_business_district_data_year": ["central_business_district"],
+    "school_data_year": [
+        "school_elementary_district",
+        "school_secondary_district",
+        "school_unified_district",
+    ],
+    "tax_data_year": [
+        "tax_municipality",
+        "tax_park_district",
+        "tax_library_district",
+        "tax_fire_protection_district",
+        "tax_community_college_district",
+        "tax_sanitation_district",
+        "tax_special_service_area",
+        "tax_tif_district",
+    ],
+}
+# Declare class groupings
+groups = ["no_group", "class", "major_class", "modeling_group"]
+
+
+# Define aggregation functions
+def aggregrate(data, geography_type, group_type):
+    """Summarize market values and sales ratios for one grouping.
+
+    Rows are grouped by the ``geography_type`` and ``group_type`` columns
+    together with ``year`` and ``stage_name``. Only rows with a positive
+    ``tot_mv`` are used, and groups left with fewer than 30 ratios are dropped.
+
+    The helper columns ``size``, ``sale_count`` and ``mv_count`` are written
+    to ``data`` itself, so the caller's frame is modified.
+
+    Returns a DataFrame indexed by the four grouping columns, with one column
+    per statistic plus ``geography_type`` and ``group_type`` label columns.
+    """
+    print(geography_type, group_type)
+
+    group = [geography_type, group_type, "year", "stage_name"]
+    data["size"] = data.groupby(group)["tot_mv"].transform("size")
+    data["sale_count"] = data.groupby(group)["sale_price"].transform("count")
+    data["mv_count"] = data.groupby(group)["tot_mv"].transform("count")
+
+    # Remove parcels with FMVs of 0 since they screw up ratios
+    data = data[data["tot_mv"] > 0].reset_index()
+    data["ratio_count"] = data.groupby(group)["ratio"].transform("count")
+
+    # Remove groups with fewer than 30 ratios before calculating stats
+    data = data[data["ratio_count"] >= 30]
+
+    summary = (
+        data.dropna(subset=["ratio"])
+        .groupby(group)
+        .apply(
+            lambda x: pd.Series(
+                {
+                    "size": x["size"].iloc[0],
+                    "mv_count": x["mv_count"].iloc[0],
+                    "sale_count": x["sale_count"].iloc[0],
+                    "mv_min": x["tot_mv"].min(),
+                    "mv_q10": x["tot_mv"].quantile(0.1),
+                    "mv_q25": x["tot_mv"].quantile(0.25),
+                    "mv_median": x["tot_mv"].median(),
+                    "mv_q75": x["tot_mv"].quantile(0.75),
+                    "mv_q90": x["tot_mv"].quantile(0.90),
+                    "mv_max": x["tot_mv"].max(),
+                    "mv_mean": x["tot_mv"].mean(),
+                    "mv_sum": x["tot_mv"].sum(),
+                    "ratio_min": x["ratio"].min(),
+                    "ratio_q10": x["ratio"].quantile(0.1),
+                    "ratio_q25": x["ratio"].quantile(0.25),
+                    "ratio_median": x["ratio"].median(),
+                    "ratio_q75": x["ratio"].quantile(0.75),
+                    "ratio_q90": x["ratio"].quantile(0.90),
+                    "ratio_max": x["ratio"].max(),
+                    "ratio_mean": x["ratio"].mean(),
+                    "cod": ass.cod(ratio=x["ratio"]),
+                    "prd": ass.prd(x["tot_mv"], x["sale_price"]),
+                    "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"],
+                    "mki": ass.mki(x["tot_mv"], x["sale_price"]),
+                }
+            ),
+            include_groups=False,
+        )
+    )
+    summary["geography_type"] = geography_type
+    summary["group_type"] = group_type
+
+    return summary
+
+
+def assemble(df, geos, groups):
+    """Stack aggregrate() output for every geography/class grouping.
+
+    Adds year-over-year absolute (``mv_delta_*``) and relative
+    (``mv_delta_pct_*``) changes of the market value median, mean and sum,
+    and returns the result with the key fields as ordinary columns.
+    """
+    # Create an empty dataframe to fill with output
+    output = pd.DataFrame()
+
+    # Loop through group combinations and stack output
+    for key, value in geos.items():
+        df["data_year"] = df[key]
+
+        for x in value:
+            for z in groups:
+                output = pd.concat([output, aggregrate(df, x, z)])
+
+    output.index.names = ["geography_id", "group_id", "year", "stage_name"]
+
+    output = output.reset_index().set_index(
+        [
+            "geography_type",
+            "geography_id",
+            "group_type",
+            "group_id",
+            "year",
+            "stage_name",
+        ]
+    )
+
+    # Year-over-year changes are calculated within a single series: one
+    # geography and class grouping at one stage, ordered by year. The type
+    # columns are part of the key so equal IDs of different types never mix.
+    series_keys = [
+        "geography_type",
+        "geography_id",
+        "group_type",
+        "group_id",
+        "stage_name",
+    ]
+    by_year = output.sort_values("year")
+    for stat in ["median", "mean", "sum"]:
+        values = by_year.groupby(series_keys)["mv_" + stat]
+        output["mv_delta_" + stat] = values.diff()
+        output["mv_delta_pct_" + stat] = values.pct_change()
+
+    output.dropna(how="all", axis=1, inplace=True)
+
+    # Move the key fields out of the index; the index is not carried over
+    # when the frame is converted to a Spark DataFrame
+    return output.reset_index()
+
+
+def model(dbt, spark_session):
+    """Summarize the ratio stats input table as a dbt table."""
+    dbt.config(materialized="table")
+
+    input = dbt.ref("reporting.sot_ratio_stats_input")
+
+    # Convert the Spark input dataframe to Pandas for
+    # compatibility with assesspy functions
+    input = input.toPandas()
+
+    df = assemble(input, geos=geos, groups=groups)
+
+    spark_df = spark_session.createDataFrame(df)
+
+    return spark_df
diff --git a/dbt/models/reporting/schema.yml b/dbt/models/reporting/schema.yml
index 5072ae8cc..18c0e148d 100644
--- a/dbt/models/reporting/schema.yml
+++ b/dbt/models/reporting/schema.yml
@@ -46,6 +46,12 @@ models:
       tags:
         - daily
 
+  - name: reporting.sot_ratio_stats
+    description: '{{ doc("table_sot_ratio_stats") }}'
+    config:
+      tags:
+        - daily
+
   - name: reporting.sot_ratio_stats_input
     description: '{{ doc("table_sot_ratio_stats_input") }}'
     config:

From 08d3bd60537cc475e1d2eab9288f8ebbfcda3e6b Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 20 Jun 2024 21:52:31 +0000
Subject: [PATCH 20/96] Add assesspy to ratio_stats table

---
 dbt/models/reporting/reporting.sot_ratio_stats.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index 2328dc48f..474672db0 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -1,12 +1,15 @@
 # pylint: skip-file
 # type: ignore
+sc.addPyFile(  # noqa: F821
+    "s3://ccao-athena-dependencies-us-east-1/assesspy==1.1.0.zip"
+)
 
 # This script generates aggregated summary stats on sales data across a number
 # of geographies, class combinations, and time.
 
 # Import libraries
-import assesspy as ass
-import pandas as pd
+import assesspy as ass  # noqa: E402
+import pandas as pd  # noqa: E402
 
 # Declare geographic groups and their associated data years
 geos = {

From d2cac224d1ada98ecb551a23c5a084a188181c35 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Mon, 24 Jun 2024 19:56:46 +0000
Subject: [PATCH 21/96] ratio_stats builds in dbt, excluding assesspy funcs

---
 .../reporting/reporting.sot_ratio_stats.py    | 142 +++++++++++-------
 .../reporting.sot_ratio_stats_input.sql       |   2 +-
 2 files changed, 92 insertions(+), 52 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index 474672db0..56b2e281e 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -8,7 +8,8 @@
 # of geographies, class combinations, and time.
 
 # Import libraries
-import assesspy as ass  # noqa: E402
+import assesspy as ass  # noqa: E402, F401
+import numpy as np  # noqa: E402
 import pandas as pd  # noqa: E402
 
 # Declare geographic groups and their associated data years
@@ -58,45 +59,42 @@ def aggregrate(data, geography_type, group_type):
     data["mv_count"] = data.groupby(group)["tot_mv"].transform("count")
 
     # Remove parcels with FMVs of 0 since they screw up ratios
-    data = data[data["tot_mv"] > 0].reset_index()
+    data = data[data["tot_mv"] > 0]
     data["ratio_count"] = data.groupby(group)["ratio"].transform("count")
 
     # Remove groups that only have one sale since we can't calculate stats
     data = data[data["ratio_count"] >= 30]
 
-    summary = (
-        data.dropna(subset=["ratio"])
-        .groupby(group)
-        .apply(
-            lambda x: pd.Series(
-                {
-                    "size": x["size"].iloc[0],
-                    "mv_count": x["mv_count"].iloc[0],
-                    "sale_count": x["sale_count"].iloc[0],
-                    "mv_min": x["tot_mv"].min(),
-                    "mv_q10": x["tot_mv"].quantile(0.1),
-                    "mv_q25": x["tot_mv"].quantile(0.25),
-                    "mv_median": x["tot_mv"].median(),
-                    "mv_q75": x["tot_mv"].quantile(0.75),
-                    "mv_q90": x["tot_mv"].quantile(0.90),
-                    "mv_max": x["tot_mv"].max(),
-                    "mv_mean": x["tot_mv"].mean(),
-                    "mv_sum": x["tot_mv"].sum(),
-                    "ratio_min": x["ratio"].min(),
-                    "ratio_q10": x["ratio"].quantile(0.1),
-                    "ratio_q25": x["ratio"].quantile(0.25),
-                    "ratio_median": x["ratio"].median(),
-                    "ratio_q75": x["ratio"].quantile(0.75),
-                    "ratio_q90": x["ratio"].quantile(0.90),
-                    "ratio_max": x["ratio"].max(),
-                    "ratio_mean": x["ratio"].mean(),
-                    "cod": ass.cod(ratio=x["ratio"]),
-                    "prd": ass.prd(x["tot_mv"], x["sale_price"]),
-                    "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"],
-                    "mki": ass.mki(x["tot_mv"], x["sale_price"]),
-                }
-            ),
-            include_groups=False,
+    data = data.dropna(subset=["ratio"])
+
+    summary = data.groupby(group).apply(
+        lambda x: pd.Series(
+            {
+                "size": x["size"].min(),
+                "mv_count": x["mv_count"].min(),
+                "sale_count": x["sale_count"].min(),
+                "mv_min": x["tot_mv"].min(),
+                "mv_q10": x["tot_mv"].quantile(0.1),
+                "mv_q25": x["tot_mv"].quantile(0.25),
+                "mv_median": x["tot_mv"].median(),
+                "mv_q75": x["tot_mv"].quantile(0.75),
+                "mv_q90": x["tot_mv"].quantile(0.90),
+                "mv_max": x["tot_mv"].max(),
+                "mv_mean": x["tot_mv"].mean(),
+                "mv_sum": x["tot_mv"].sum(),
+                "ratio_min": x["ratio"].min(),
+                "ratio_q10": x["ratio"].quantile(0.1),
+                "ratio_q25": x["ratio"].quantile(0.25),
+                "ratio_median": x["ratio"].median(),
+                "ratio_q75": x["ratio"].quantile(0.75),
+                "ratio_q90": x["ratio"].quantile(0.90),
+                "ratio_max": x["ratio"].max(),
+                "ratio_mean": x["ratio"].mean(),
+                # "cod": ass.cod(ratio=x["ratio"]),
+                # "prd": ass.prd(x["tot_mv"], x["sale_price"]),
+                # "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"],
+                # "mki": ass.mki(x["tot_mv"], x["sale_price"]),
+            }
         )
     )
     summary["geography_type"] = geography_type
@@ -117,9 +115,15 @@ def assemble(df, geos, groups):
             for z in groups:
                 output = pd.concat([output, aggregrate(df, x, z)])
 
-    output.index.names = ["geography_id", "group_id", "year", "stage_name"]
+    output.dropna(how="all", axis=1, inplace=True)
+
+    return output
+
+
+def clean(dirty):
+    dirty.index.names = ["geography_id", "group_id", "year", "stage_name"]
 
-    output = output.reset_index().set_index(
+    dirty = dirty.reset_index().set_index(
         [
             "geography_type",
             "geography_id",
@@ -130,37 +134,57 @@ def assemble(df, geos, groups):
         ]
     )
 
-    # Clean combined output and export
-    output["mv_delta_pct_median"] = (
-        output.sort_values("year")
+    # Clean combined dirty and export
+    dirty["mv_delta_pct_median"] = (
+        dirty.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_median.diff()
     )
-    output["mv_delta_pct_mean"] = (
-        output.sort_values("year")
+    dirty["mv_delta_pct_mean"] = (
+        dirty.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_mean.diff()
     )
-    output["mv_delta_pct_sum"] = (
-        output.sort_values("year")
+    dirty["mv_delta_pct_sum"] = (
+        dirty.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_sum.diff()
     )
 
-    output["mv_delta_pct_median"] = (
-        output.sort_values("year")
+    dirty["mv_delta_pct_median"] = (
+        dirty.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_median.pct_change()
     )
-    output["mv_delta_pct_mean"] = (
-        output.sort_values("year")
+    dirty["mv_delta_pct_mean"] = (
+        dirty.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_mean.pct_change()
     )
 
-    output.dropna(how="all", axis=1, inplace=True)
+    dirty = dirty.reset_index()
+
+    dirty = dirty.astype(
+        {
+            "group_id": "str",
+            "year": "str",
+            "stage_name": "str",
+            "size": np.int64,
+            "mv_count": np.int64,
+            "sale_count": np.int64,
+            "mv_min": np.int64,
+            "mv_q10": np.int64,
+            "mv_q25": np.int64,
+            "mv_median": np.int64,
+            "mv_q75": np.int64,
+            "mv_q90": np.int64,
+            "mv_max": np.int64,
+            "mv_mean": np.int64,
+            "mv_sum": np.int64,
+        }
+    )
 
-    return output
+    return dirty
 
 
 def model(dbt, spark_session):
@@ -174,6 +198,22 @@ def model(dbt, spark_session):
 
     df = assemble(input, geos=geos, groups=groups)
 
-    spark_df = spark_session.createDataFrame(df)
+    df = clean(df)
+
+    schema = (
+        "geography_type: string, geography_id: string, "
+        + "group_type: string, group_id: string, year: string, "
+        + "stage_name: string, size: bigint, mv_count: bigint, "
+        + "sale_count: bigint, mv_min: bigint, mv_q10: bigint, "
+        + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, "
+        + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, "
+        + "mv_sum: bigint, ratio_min: double, ratio_q10: double, "
+        + "ratio_q25: double, ratio_median: double, ratio_q75: double, "
+        + "ratio_q90: double, ratio_max: double, ratio_mean: double, "
+        + "mv_delta_pct_median: double, mv_delta_pct_mean: double, "
+        + "mv_delta_pct_sum: double"
+    )
+
+    spark_df = spark_session.createDataFrame(df, schema=schema)
 
     return spark_df
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
index 6cd258c2a..767d65990 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
@@ -119,4 +119,4 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     AND NOT sales.sale_filter_less_than_10k
     AND NOT sales.sale_filter_same_sale_within_365
 WHERE uni.year >= '2020'
-    AND uni.year = '2023' AND uni.class = '278'
+    AND uni.year IN ('2022', '2023') AND uni.class = '278'

From f55975314760c9bebc43bf18f2436a46c81f243d Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 26 Jun 2024 16:31:18 +0000
Subject: [PATCH 22/96] sot_ratio_stats table building in dbt

---
 .../reporting/reporting.sot_ratio_stats.py    | 64 ++++++++++++++++---
 .../reporting.sot_ratio_stats_input.sql       |  2 +-
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index 56b2e281e..e749a5563 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -8,7 +8,7 @@
 # of geographies, class combinations, and time.
 
 # Import libraries
-import assesspy as ass  # noqa: E402, F401
+import assesspy as ass  # noqa: E402
 import numpy as np  # noqa: E402
 import pandas as pd  # noqa: E402
 
@@ -49,6 +49,44 @@
 groups = ["no_group", "class", "major_class", "modeling_group"]
 
 
+def cod_safe(ratio):
+    if len(ratio) >= 1:
+        output = ass.cod(ratio)
+    else:
+        output = None
+
+    return output
+
+
+def prd_safe(assessed, sale_price):
+    if len(sale_price) >= 1:
+        output = ass.prd(assessed=assessed, sale_price=sale_price)
+    else:
+        output = None
+
+    return output
+
+
+def prb_safe(assessed, sale_price):
+    if len(sale_price) >= 1:
+        output = ass.prb(assessed=assessed, sale_price=sale_price, round=3)[
+            "prb"
+        ]
+    else:
+        output = None
+
+    return output
+
+
+def mki_safe(assessed, sale_price):
+    if len(sale_price) >= 1:
+        output = ass.mki(assessed=assessed, sale_price=sale_price)
+    else:
+        output = None
+
+    return output
+
+
 # Define aggregation functions
 def aggregrate(data, geography_type, group_type):
     print(geography_type, group_type)
@@ -60,17 +98,15 @@ def aggregrate(data, geography_type, group_type):
 
     # Remove parcels with FMVs of 0 since they screw up ratios
     data = data[data["tot_mv"] > 0]
-    data["ratio_count"] = data.groupby(group)["ratio"].transform("count")
 
     # Remove groups that only have one sale since we can't calculate stats
-    data = data[data["ratio_count"] >= 30]
-
-    data = data.dropna(subset=["ratio"])
+    data = data.dropna(subset=["sale_price"])
+    data = data[data["sale_count"] >= 20]
 
     summary = data.groupby(group).apply(
         lambda x: pd.Series(
             {
-                "size": x["size"].min(),
+                "size": np.size(x["ratio"]),
                 "mv_count": x["mv_count"].min(),
                 "sale_count": x["sale_count"].min(),
                 "mv_min": x["tot_mv"].min(),
@@ -90,10 +126,17 @@ def aggregrate(data, geography_type, group_type):
                 "ratio_q90": x["ratio"].quantile(0.90),
                 "ratio_max": x["ratio"].max(),
                 "ratio_mean": x["ratio"].mean(),
-                # "cod": ass.cod(ratio=x["ratio"]),
-                # "prd": ass.prd(x["tot_mv"], x["sale_price"]),
-                # "prb": ass.prb(x["tot_mv"], x["sale_price"], 3)["prb"],
-                # "mki": ass.mki(x["tot_mv"], x["sale_price"]),
+                # "cod": ' '.join(x['ratio'].astype(str).values),
+                "cod": cod_safe(ratio=x["ratio"]),
+                "prd": prd_safe(
+                    assessed=x["tot_mv"], sale_price=x["sale_price"]
+                ),
+                "prb": prb_safe(
+                    assessed=x["tot_mv"], sale_price=x["sale_price"]
+                ),
+                "mki": mki_safe(
+                    assessed=x["tot_mv"], sale_price=x["sale_price"]
+                ),
             }
         )
     )
@@ -210,6 +253,7 @@ def model(dbt, spark_session):
         + "mv_sum: bigint, ratio_min: double, ratio_q10: double, "
         + "ratio_q25: double, ratio_median: double, ratio_q75: double, "
         + "ratio_q90: double, ratio_max: double, ratio_mean: double, "
+        + "cod: double, prd: double, prb: double, mki: double, "
         + "mv_delta_pct_median: double, mv_delta_pct_mean: double, "
         + "mv_delta_pct_sum: double"
     )
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
index 767d65990..0228ee4d8 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
@@ -119,4 +119,4 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     AND NOT sales.sale_filter_less_than_10k
     AND NOT sales.sale_filter_same_sale_within_365
 WHERE uni.year >= '2020'
-    AND uni.year IN ('2022', '2023') AND uni.class = '278'
+    AND uni.year IN ('2022', '2023') AND uni.class IN ('278', '597')

From 1f8ad1f1933ff46b42b5e32ddc84b98100e7ef56 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 26 Jun 2024 19:44:16 +0000
Subject: [PATCH 23/96] Add res_other group

---
 dbt/models/reporting/reporting.sot_assessment_roll.py         | 2 +-
 dbt/models/reporting/reporting.sot_assessment_roll_input.sql  | 4 +++-
 dbt/models/reporting/reporting.sot_ratio_stats.py             | 2 +-
 dbt/models/reporting/reporting.sot_ratio_stats_input.sql      | 4 +++-
 dbt/models/reporting/reporting.sot_sales.py                   | 2 +-
 dbt/models/reporting/reporting.sot_sales_input.sql            | 4 +++-
 dbt/models/reporting/reporting.sot_taxes_exemptions.py        | 2 +-
 dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql | 4 +++-
 8 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 2f99c35bc..6357ef79b 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -41,7 +41,7 @@
     ],
 }
 # Declare class groupings
-groups = ["no_group", "class", "major_class", "modeling_group"]
+groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
 
 
 # Define aggregation functions
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index 8f3f7c3f2..76f58cc50 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -102,7 +102,9 @@ SELECT
     uni.tax_data_year,
     'no_group' AS no_group,
     class_dict.major_class_type AS major_class,
-    class_dict.modeling_group
+    class_dict.modeling_group,
+    CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END
+        AS res_other
 FROM uni
 LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     ON uni.pin = vals.pin
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index e749a5563..d7bd2fc31 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -46,7 +46,7 @@
     ],
 }
 # Declare class groupings
-groups = ["no_group", "class", "major_class", "modeling_group"]
+groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
 
 
 def cod_safe(ratio):
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
index 0228ee4d8..29a28ff92 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
@@ -102,7 +102,9 @@ SELECT
     uni.tax_data_year,
     'no_group' AS no_group,
     class_dict.major_class_type AS major_class,
-    class_dict.modeling_group
+    class_dict.modeling_group,
+    CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END
+        AS res_other
 FROM uni
 LEFT JOIN
     {{ ref('reporting.vw_pin_value_long') }} AS vals
diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index 9709e77b3..b00d76f84 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -41,7 +41,7 @@
     ],
 }
 # Declare class groupings
-groups = ["no_group", "class", "major_class", "modeling_group"]
+groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
 
 
 # Define aggregation functions
diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sales_input.sql
index e6ca7afd4..dcd6fd085 100644
--- a/dbt/models/reporting/reporting.sot_sales_input.sql
+++ b/dbt/models/reporting/reporting.sot_sales_input.sql
@@ -83,7 +83,9 @@ SELECT
     uni.tax_data_year,
     'no_group' AS no_group,
     class_dict.major_class_type AS major_class,
-    class_dict.modeling_group
+    class_dict.modeling_group,
+    CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END
+        AS res_other
 FROM {{ ref('default.vw_pin_universe') }} AS uni
 LEFT JOIN sf
     ON uni.pin = sf.pin
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index cb257a49e..5deccbd8c 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -41,7 +41,7 @@
     ],
 }
 # Declare class groupings
-groups = ["no_group", "class", "major_class", "modeling_group"]
+groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
 
 
 # Define aggregation functions
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
index d0213ff5a..6bed59fdf 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
@@ -97,7 +97,9 @@ SELECT
     uni.tax_data_year,
     'no_group' AS no_group,
     class_dict.major_class_type AS major_class,
-    class_dict.modeling_group
+    class_dict.modeling_group,
+    CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END
+        AS res_other
 FROM {{ ref('default.vw_pin_universe') }} AS uni
 INNER JOIN {{ source('tax', 'pin') }} AS tax
     ON uni.pin = tax.pin

From 063591c109b7151a57b636287440b7d216748896 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 27 Jun 2024 15:54:09 +0000
Subject: [PATCH 24/96] Add reassessment year indicator for assessment roll

---
 .../reporting.sot_assessment_roll.py          | 43 ++++++++++++++++++-
 .../reporting.sot_assessment_roll_input.sql   | 15 -------
 .../reporting.sot_ratio_stats_input.sql       | 15 -------
 3 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 6357ef79b..225e15210 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -9,6 +9,14 @@
 
 # Declare geographic groups and their associated data years
 geos = {
+    "year": [
+        "county",
+        "triad",
+        "township",
+        "nbhd",
+        "tax_code",
+        "zip_code",
+    ],
     "census_data_year": [
         "census_place",
         "census_tract",
@@ -62,7 +70,12 @@ def q90(x):
 
 
 def first(x):
-    return x.iloc[0]
+    if len(x) >= 1:
+        output = x.iloc[0]
+    else:
+        output = None
+
+    return output
 
 
 more_stats = [
@@ -81,6 +94,7 @@ def first(x):
     "tot": ["size", "count"] + more_stats,
     "bldg": more_stats,
     "land": more_stats,
+    "triad": [first],
 }
 
 
@@ -131,6 +145,33 @@ def assemble(df, geos, groups):
     output.columns = ["_".join(col) for col in output.columns]
     output.reset_index()
 
+    output["year"] = output["year"].astype(int)
+    output["temp"] = output["geography_type"].isin(
+        ["triad", "township", "nbhd"]
+    )
+    output["reassessment_year"] = None
+    output["reassessment_year"] = output["reassessment_year"].astype("boolean")
+    output.loc[(output["temp"] is True), "reassessment_year"] = False
+    output.loc[
+        (output["year"] % 3 == 0)
+        & (output["triad"] == "North")
+        & (output["temp"] is True),
+        "reassessment_year",
+    ] = True
+    output.loc[
+        (output["year"] % 3 == 1)
+        & (output["triad"] == "South")
+        & (output["temp"] is True),
+        "reassessment_year",
+    ] = True
+    output.loc[
+        (output["year"] % 3 == 2)
+        & (output["triad"] == "City")
+        & (output["temp"] is True),
+        "reassessment_year",
+    ] = True
+    output.drop(["temp", "triad"], axis=1)
+
     return output
 
 
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index 76f58cc50..31e6d7fb5 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -36,21 +36,6 @@ SELECT
     CAST(vals.tot AS INT) AS tot,
     CAST(vals.bldg AS INT) AS bldg,
     CAST(vals.land AS INT) AS land,
-    CASE
-        WHEN
-            MOD(CAST(uni.year AS INT), 3) = 0
-            AND uni.triad_name = 'North'
-            THEN TRUE
-        WHEN
-            MOD(CAST(uni.year AS INT), 3) = 1
-            AND uni.triad_name = 'South'
-            THEN TRUE
-        WHEN
-            MOD(CAST(uni.year AS INT), 3) = 2
-            AND uni.triad_name = 'City'
-            THEN TRUE
-        ELSE FALSE
-    END AS reassessment_year,
     'Cook' AS county,
     uni.triad_name AS triad,
     uni.township_name AS township,
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
index 29a28ff92..266024e0a 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
@@ -36,21 +36,6 @@ SELECT
     uni.class,
     CAST(vals.tot_mv AS DOUBLE) AS tot_mv,
     CAST(vals.tot_mv AS DOUBLE) / CAST(sales.sale_price AS DOUBLE) AS ratio,
-    CASE
-        WHEN
-            MOD(CAST(uni.year AS INT), 3) = 0
-            AND uni.triad_name = 'North'
-            THEN TRUE
-        WHEN
-            MOD(CAST(uni.year AS INT), 3) = 1
-            AND uni.triad_name = 'South'
-            THEN TRUE
-        WHEN
-            MOD(CAST(uni.year AS INT), 3) = 2
-            AND uni.triad_name = 'City'
-            THEN TRUE
-        ELSE FALSE
-    END AS reassessment_year,
     'Cook' AS county,
     uni.triad_name AS triad,
     uni.township_name AS township,

From a9ffc648e8f712c9df21b2cc20e684d0dd296073 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 27 Jun 2024 19:02:35 +0000
Subject: [PATCH 25/96] Retry assessment_year indicator

---
 .../reporting.sot_assessment_roll.py          | 23 ++++----
 .../reporting/reporting.sot_ratio_stats.py    | 56 +++++++++++++++++--
 2 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 225e15210..6916ab7bd 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -146,31 +146,32 @@ def assemble(df, geos, groups):
     output.reset_index()
 
     output["year"] = output["year"].astype(int)
-    output["temp"] = output["geography_type"].isin(
+    output["triennial"] = output["geography_type"].isin(
         ["triad", "township", "nbhd"]
     )
-    output["reassessment_year"] = None
-    output["reassessment_year"] = output["reassessment_year"].astype("boolean")
-    output.loc[(output["temp"] is True), "reassessment_year"] = False
+    output["reassessment_year"] = ""
+    output.loc[
+        (output["triennial"] == True), "reassessment_year"  # noqa: E712
+    ] = "No"
     output.loc[
         (output["year"] % 3 == 0)
         & (output["triad"] == "North")
-        & (output["temp"] is True),
+        & (output["triennial"] == True),  # noqa: E712
         "reassessment_year",
-    ] = True
+    ] = "Yes"
     output.loc[
         (output["year"] % 3 == 1)
         & (output["triad"] == "South")
-        & (output["temp"] is True),
+        & (output["triennial"] == True),  # noqa: E712
         "reassessment_year",
-    ] = True
+    ] = "Yes"
     output.loc[
         (output["year"] % 3 == 2)
         & (output["triad"] == "City")
-        & (output["temp"] is True),
+        & (output["triennial"] == True),  # noqa: E712
         "reassessment_year",
-    ] = True
-    output.drop(["temp", "triad"], axis=1)
+    ] = "Yes"
+    output = output.drop(["triennial", "triad"], axis=1)
 
     return output
 
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index d7bd2fc31..d0be7e91f 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -14,6 +14,14 @@
 
 # Declare geographic groups and their associated data years
 geos = {
+    "year": [
+        "county",
+        "triad",
+        "township",
+        "nbhd",
+        "tax_code",
+        "zip_code",
+    ],
     "census_data_year": [
         "census_place",
         "census_tract",
@@ -87,6 +95,15 @@ def mki_safe(assessed, sale_price):
     return output
 
 
+def first(x):
+    if len(x) >= 1:
+        output = x.iloc[0]
+    else:
+        output = None
+
+    return output
+
+
 # Define aggregation functions
 def aggregrate(data, geography_type, group_type):
     print(geography_type, group_type)
@@ -106,6 +123,7 @@ def aggregrate(data, geography_type, group_type):
     summary = data.groupby(group).apply(
         lambda x: pd.Series(
             {
+                "triad": first(x["triad"]),
                 "size": np.size(x["ratio"]),
                 "mv_count": x["mv_count"].min(),
                 "sale_count": x["sale_count"].min(),
@@ -207,11 +225,40 @@ def clean(dirty):
 
     dirty = dirty.reset_index()
 
+    dirty["year"] = dirty["year"].astype(int)
+    dirty["triennial"] = dirty["geography_type"].isin(
+        ["triad", "township", "nbhd"]
+    )
+    dirty["reassessment_year"] = ""
+    dirty.loc[
+        (dirty["triennial"] == True), "reassessment_year"  # noqa: E712
+    ] = "No"
+    dirty.loc[
+        (dirty["year"] % 3 == 0)
+        & (dirty["triad"] == "North")
+        & (dirty["triennial"] == True),  # noqa: E712
+        "reassessment_year",
+    ] = "Yes"
+    dirty.loc[
+        (dirty["year"] % 3 == 1)
+        & (dirty["triad"] == "South")
+        & (dirty["triennial"] == True),  # noqa: E712
+        "reassessment_year",
+    ] = "Yes"
+    dirty.loc[
+        (dirty["year"] % 3 == 2)
+        & (dirty["triad"] == "City")
+        & (dirty["triennial"] == True),  # noqa: E712
+        "reassessment_year",
+    ] = "Yes"
+    dirty = dirty.drop(["triennial", "triad"], axis=1)
+
     dirty = dirty.astype(
         {
             "group_id": "str",
-            "year": "str",
+            "year": np.int64,
             "stage_name": "str",
+            "reassessment_year": "str",
             "size": np.int64,
             "mv_count": np.int64,
             "sale_count": np.int64,
@@ -245,8 +292,9 @@ def model(dbt, spark_session):
 
     schema = (
         "geography_type: string, geography_id: string, "
-        + "group_type: string, group_id: string, year: string, "
-        + "stage_name: string, size: bigint, mv_count: bigint, "
+        + "group_type: string, group_id: string, year: bigint, "
+        + "stage_name: string, size: bigint, "
+        + "mv_count: bigint, "
         + "sale_count: bigint, mv_min: bigint, mv_q10: bigint, "
         + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, "
         + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, "
@@ -255,7 +303,7 @@ def model(dbt, spark_session):
         + "ratio_q90: double, ratio_max: double, ratio_mean: double, "
         + "cod: double, prd: double, prb: double, mki: double, "
         + "mv_delta_pct_median: double, mv_delta_pct_mean: double, "
-        + "mv_delta_pct_sum: double"
+        + "mv_delta_pct_sum: double, reassessment_year: string"
     )
 
     spark_df = spark_session.createDataFrame(df, schema=schema)

From 62dd68ee5733075a114adfee7fbbc2f8da7f3357 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Fri, 28 Jun 2024 14:59:12 +0000
Subject: [PATCH 26/96] Assessment_roll should run with reassessment year
 indicator

---
 dbt/models/reporting/reporting.sot_assessment_roll.py        | 3 ++-
 dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 6916ab7bd..eb8ed43d7 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -143,7 +143,8 @@ def assemble(df, geos, groups):
     )
 
     output.columns = ["_".join(col) for col in output.columns]
-    output.reset_index()
+    output = output.reset_index()
+    output = output.rename(columns={"triad_first": "triad"})
 
     output["year"] = output["year"].astype(int)
     output["triennial"] = output["geography_type"].isin(
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index 31e6d7fb5..470a01d27 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -97,4 +97,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
-WHERE uni.stage_name = 'MAILED' AND uni.class = '278'
+WHERE uni.stage_name = 'MAILED' AND uni.class = '278' AND uni.year >= '2018'

From c185e8102a84a186990e4a85402d8687c1bb88e6 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Fri, 28 Jun 2024 16:49:56 +0000
Subject: [PATCH 27/96] Add schema to assessment_roll table

---
 .../reporting.sot_assessment_roll.py          | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index eb8ed43d7..5a51c704c 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -188,6 +188,26 @@ def model(dbt, spark_session):
 
     df = assemble(input, geos=geos, groups=groups)
 
-    spark_df = spark_session.createDataFrame(df)
+    schema = (
+        "geography_type: string, geography_id: string, group_type: string, "
+        + "group_id: string, year: bigint, stage_name: string, "
+        + "tot_size: bigint, tot_count: bigint, tot_min: double, "
+        + "tot_q10: double, tot_q25: double, tot_median: double, "
+        + "tot_q75: double, tot_q90: double, tot_max: double, "
+        + "tot_mean: double, tot_sum: double, bldg_min: double, "
+        + "bldg_q10: double, bldg_q25: double, bldg_median: double, "
+        + "bldg_q75: double, bldg_q90: double, bldg_max: double, "
+        + "bldg_mean: double, bldg_sum: double, land_min: double, "
+        + "land_q10: double, land_q25: double, land_median: double, "
+        + "land_q75: double, land_q90: double, land_max: double, "
+        + "land_mean: double, land_sum: double, tot_deltamedian: double, "
+        + "bldg_deltamedian: double, land_deltamedian: double, "
+        + "tot_deltamean: double, bldg_deltamean: double, "
+        + "land_deltamean: double, tot_deltasum: double, "
+        + "bldg_deltasum: double, land_deltasum: double, "
+        + "tot_pct_w_value: double, reassessment_year: string"
+    )
+
+    spark_df = spark_session.createDataFrame(df, schema=schema)
 
     return spark_df

From d08bc3d0695a0b81e8898913816c1ccbd2433110 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Fri, 28 Jun 2024 20:21:05 +0000
Subject: [PATCH 28/96] Correct output from sales and taxes tables

---
 dbt/models/reporting/reporting.sot_sales.py            | 10 +++++++++-
 dbt/models/reporting/reporting.sot_taxes_exemptions.py | 10 +++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index b00d76f84..899f895c7 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -9,6 +9,14 @@
 
 # Declare geographic groups and their associated data years
 geos = {
+    "year": [
+        "county",
+        "triad",
+        "township",
+        "nbhd",
+        "tax_code",
+        "zip_code",
+    ],
     "census_data_year": [
         "census_place",
         "census_tract",
@@ -128,7 +136,7 @@ def assemble(df, geos, groups):
         output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff()
 
     output.columns = ["_".join(col) for col in output.columns]
-    output.reset_index()
+    output = output.reset_index()
 
     return output
 
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index 5deccbd8c..4e1c89c9c 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -9,6 +9,14 @@
 
 # Declare geographic groups and their associated data years
 geos = {
+    "year": [
+        "county",
+        "triad",
+        "township",
+        "nbhd",
+        "tax_code",
+        "zip_code",
+    ],
     "census_data_year": [
         "census_place",
         "census_tract",
@@ -132,7 +140,7 @@ def assemble(df, geos, groups):
         ].diff()
 
     output.columns = ["_".join(col) for col in output.columns]
-    output.reset_index()
+    output = output.reset_index()
 
     return output
 

From 4808aa4f774e06fba0ef9e60be01c3bd753b212f Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Fri, 28 Jun 2024 21:22:25 +0000
Subject: [PATCH 29/96] Add table schemas

---
 dbt/models/reporting/reporting.sot_sales.py   | 60 +++++++++++-----
 .../reporting.sot_taxes_exemptions.py         | 70 ++++++++++++++-----
 2 files changed, 96 insertions(+), 34 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index 899f895c7..e606109b7 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -96,6 +96,27 @@ def first(x):
 }
 
 
+def aggregrate(data, geography_type, group_type):
+    print(geography_type, group_type)
+
+    group = [geography_type, group_type, "year"]
+    summary = data.groupby(group).agg(agg_func_math).round(2)
+    summary["geography_type"] = geography_type
+    summary["group_type"] = group_type
+    summary.index.names = ["geography_id", "group_id", "year"]
+    summary = summary.reset_index().set_index(
+        [
+            "geography_type",
+            "geography_id",
+            "group_type",
+            "group_id",
+            "year",
+        ]
+    )
+
+    return summary
+
+
 def assemble(df, geos, groups):
     # Create an empty dataframe to fill with output
     output = pd.DataFrame()
@@ -106,22 +127,7 @@ def assemble(df, geos, groups):
 
         for x in value:
             for z in groups:
-                group = [x, z, "year"]
-                summary = df.groupby(group).agg(agg_func_math).round(2)
-                summary["geography_type"] = x
-                summary["group_type"] = z
-                summary.index.names = ["geography_id", "group_id", "year"]
-                summary = summary.reset_index().set_index(
-                    [
-                        "geography_type",
-                        "geography_id",
-                        "group_type",
-                        "group_id",
-                        "year",
-                    ]
-                )
-
-                output = pd.concat([output, summary])
+                output = pd.concat([output, aggregrate(df, x, z)])
 
     # Clean combined output and export
     output["sale_price", "sum"] = output["sale_price", "sum"].replace(
@@ -152,6 +158,26 @@ def model(dbt, spark_session):
 
     df = assemble(input, geos=geos, groups=groups)
 
-    spark_df = spark_session.createDataFrame(df)
+    schema = (
+        "geography_type: string, geography_id: string, group_type: string, "
+        + "group_id: string, year: bigint, sale_price_size: double, "
+        + "sale_price_count: double, sale_price_min: double, "
+        + "sale_price_q10: double, sale_price_q25: double, "
+        + "sale_price_median: double, sale_price_q75: double, "
+        + "sale_price_q90: double, sale_price_max: double, "
+        + "sale_price_mean: double, sale_price_sum: double, "
+        + "price_per_sf_min: double, price_per_sf_q10: double, "
+        + "price_per_sf_q25: double, price_per_sf_median: double, "
+        + "price_per_sf_q75: double, price_per_sf_q90: double, "
+        + "price_per_sf_max: double, price_per_sf_mean: double, "
+        + "price_per_sf_sum: double, char_bldg_sf_median: double, "
+        + "char_land_sf_median: double, char_yrblt_median: double, "
+        + "class_multimode: array<string>, data_year_first: bigint,"
+        + "sale_price_deltamedian: double, price_per_sf_deltamedian: double, "
+        + "sale_price_deltamean: double, price_per_sf_deltamean: double, "
+        + "sale_price_deltasum: double, price_per_sf_deltasum: double"
+    )
+
+    spark_df = spark_session.createDataFrame(df, schema=schema)
 
     return spark_df
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index 4e1c89c9c..af4f4ca9f 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -106,6 +106,27 @@ def first(x):
 }
 
 
+def aggregrate(data, geography_type, group_type):
+    print(geography_type, group_type)
+
+    group = [geography_type, group_type, "year"]
+    summary = data.groupby(group).agg(agg_func_math).round(2)
+    summary["geography_type"] = geography_type
+    summary["group_type"] = group_type
+    summary.index.names = ["geography_id", "group_id", "year"]
+    summary = summary.reset_index().set_index(
+        [
+            "geography_type",
+            "geography_id",
+            "group_type",
+            "group_id",
+            "year",
+        ]
+    )
+
+    return summary
+
+
 def assemble(df, geos, groups):
     # Create an empty dataframe to fill with output
     output = pd.DataFrame()
@@ -116,22 +137,7 @@ def assemble(df, geos, groups):
 
         for x in value:
             for z in groups:
-                group = [x, z, "year"]
-                summary = df.groupby(group).agg(agg_func_math).round(2)
-                summary["geography_type"] = x
-                summary["group_type"] = z
-                summary.index.names = ["geography_id", "group_id", "year"]
-                summary = summary.reset_index().set_index(
-                    [
-                        "geography_type",
-                        "geography_id",
-                        "group_type",
-                        "group_id",
-                        "year",
-                    ]
-                )
-
-                output = pd.concat([output, summary])
+                output = pd.concat([output, aggregrate(df, x, z)])
 
     # Clean combined output and export
     for i in ["median", "mean", "sum"]:
@@ -156,6 +162,36 @@ def model(dbt, spark_session):
 
     df = assemble(input, geos=geos, groups=groups)
 
-    spark_df = spark_session.createDataFrame(df)
+    schema = (
+        "geography_type: string, geography_id: string, group_type: string, "
+        + "group_id: string, year: bigint, eq_factor_final_size: bigint, "
+        + "eq_factor_final_first: double, eq_factor_tentative_first: double, "
+        + "tax_bill_total_min: double, tax_bill_total_q10: double, "
+        + "tax_bill_total_q25: double, tax_bill_total_median: double, "
+        + "tax_bill_total_q75: double, tax_bill_total_q90: double, "
+        + "tax_bill_total_max: double, tax_bill_total_mean: double, "
+        + "tax_bill_total_sum: double, tax_code_rate_min: double, "
+        + "tax_code_rate_q10: double, tax_code_rate_q25: double, "
+        + "tax_code_rate_median: double, tax_code_rate_q75: double, "
+        + "tax_code_rate_q90: double, tax_code_rate_max: double, "
+        + "tax_code_rate_mean: double, tax_code_rate_sum: double, "
+        + "av_clerk_min: int, av_clerk_q10: double, av_clerk_q25: double, "
+        + "av_clerk_median: double, av_clerk_q75: double, "
+        + "av_clerk_q90: double, av_clerk_max: int, av_clerk_mean: double, "
+        + "av_clerk_sum: double, exe_homeowner_count: bigint, "
+        + "exe_homeowner_sum: double, exe_senior_count: bigint, "
+        + "exe_senior_sum: double, exe_freeze_count: bigint, "
+        + "exe_freeze_sum: double, exe_longtime_homeowner_count: bigint, "
+        + "exe_longtime_homeowner_sum: double, exe_disabled_count: bigint, "
+        + "exe_disabled_sum: double, exe_vet_returning_count: bigint, "
+        + "exe_vet_returning_sum: double, exe_vet_dis_lt50_count: bigint, "
+        + "exe_vet_dis_lt50_sum: double, exe_vet_dis_50_69_count: bigint, "
+        + "exe_vet_dis_50_69_sum: double, exe_vet_dis_ge70_count: bigint, "
+        + "exe_vet_dis_ge70_sum: double, exe_abate_count: bigint, "
+        + "exe_abate_sum: double, tax_bill_total_deltamedian: double, "
+        + "tax_bill_total_deltamean: double, tax_bill_total_deltasum: double"
+    )
+
+    spark_df = spark_session.createDataFrame(df, schema=schema)
 
     return spark_df

From 08c8d53d258d1fd2459d1cbf4e0d13fbb8bcec60 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Fri, 28 Jun 2024 21:40:49 +0000
Subject: [PATCH 30/96] Fix schemas: store year columns as string

---
 dbt/models/reporting/reporting.sot_sales.py            | 4 ++--
 dbt/models/reporting/reporting.sot_taxes_exemptions.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index e606109b7..d35e949be 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -160,7 +160,7 @@ def model(dbt, spark_session):
 
     schema = (
         "geography_type: string, geography_id: string, group_type: string, "
-        + "group_id: string, year: bigint, sale_price_size: double, "
+        + "group_id: string, year: string, sale_price_size: double, "
         + "sale_price_count: double, sale_price_min: double, "
         + "sale_price_q10: double, sale_price_q25: double, "
         + "sale_price_median: double, sale_price_q75: double, "
@@ -172,7 +172,7 @@ def model(dbt, spark_session):
         + "price_per_sf_max: double, price_per_sf_mean: double, "
         + "price_per_sf_sum: double, char_bldg_sf_median: double, "
         + "char_land_sf_median: double, char_yrblt_median: double, "
-        + "class_multimode: array<string>, data_year_first: bigint,"
+        + "class_multimode: array<string>, data_year_first: string,"
         + "sale_price_deltamedian: double, price_per_sf_deltamedian: double, "
         + "sale_price_deltamean: double, price_per_sf_deltamean: double, "
         + "sale_price_deltasum: double, price_per_sf_deltasum: double"
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index af4f4ca9f..07691ec5b 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -164,7 +164,7 @@ def model(dbt, spark_session):
 
     schema = (
         "geography_type: string, geography_id: string, group_type: string, "
-        + "group_id: string, year: bigint, eq_factor_final_size: bigint, "
+        + "group_id: string, year: string, eq_factor_final_size: bigint, "
         + "eq_factor_final_first: double, eq_factor_tentative_first: double, "
         + "tax_bill_total_min: double, tax_bill_total_q10: double, "
         + "tax_bill_total_q25: double, tax_bill_total_median: double, "

From 2f8dc3dca5be1dffbf2e0f05dc54f084232e19b8 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Mon, 1 Jul 2024 20:33:16 +0000
Subject: [PATCH 31/96] Resolve sales table column type issues

---
 dbt/models/reporting/reporting.sot_sales.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index d35e949be..65889953b 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -160,8 +160,8 @@ def model(dbt, spark_session):
 
     schema = (
         "geography_type: string, geography_id: string, group_type: string, "
-        + "group_id: string, year: string, sale_price_size: double, "
-        + "sale_price_count: double, sale_price_min: double, "
+        + "group_id: string, year: string, sale_price_size: bigint, "
+        + "sale_price_count: int, sale_price_min: double, "
         + "sale_price_q10: double, sale_price_q25: double, "
         + "sale_price_median: double, sale_price_q75: double, "
         + "sale_price_q90: double, sale_price_max: double, "

From 88ce0496dd50092dcd45c2ee7c24c6f682cde1ec Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 2 Jul 2024 15:45:39 +0000
Subject: [PATCH 32/96] Add exe_total to exemptions table

---
 .../reporting/reporting.sot_taxes_exemptions.py       |  4 +++-
 .../reporting.sot_taxes_exemptions_input.sql          | 11 +++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index 07691ec5b..044f59a64 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -103,6 +103,7 @@ def first(x):
     "exe_vet_dis_50_69": less_stats,
     "exe_vet_dis_ge70": less_stats,
     "exe_abate": less_stats,
+    "exe_total": less_stats,
 }
 
 
@@ -188,7 +189,8 @@ def model(dbt, spark_session):
         + "exe_vet_dis_lt50_sum: double, exe_vet_dis_50_69_count: bigint, "
         + "exe_vet_dis_50_69_sum: double, exe_vet_dis_ge70_count: bigint, "
         + "exe_vet_dis_ge70_sum: double, exe_abate_count: bigint, "
-        + "exe_abate_sum: double, tax_bill_total_deltamedian: double, "
+        + "exe_abate_sum: double, exe_total_count: bigint, "
+        + "exe_total_sum: double, tax_bill_total_deltamedian: double, "
         + "tax_bill_total_deltamean: double, tax_bill_total_deltasum: double"
     )
 
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
index 6bed59fdf..281b69267 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
@@ -42,6 +42,17 @@ SELECT
     CASE WHEN tax.exe_vet_dis_ge70 = 0 THEN NULL ELSE tax.exe_vet_dis_ge70 END
         AS exe_vet_dis_ge70,
     CASE WHEN tax.exe_abate = 0 THEN NULL ELSE tax.exe_abate END AS exe_abate,
+    CASE
+        WHEN tax.exe_homeowner + tax.exe_senior + tax.exe_freeze
+            + tax.exe_longtime_homeowner + tax.exe_disabled
+            + tax.exe_vet_returning + tax.exe_vet_dis_lt50
+            + tax.exe_vet_dis_50_69 + tax.exe_vet_dis_ge70 + tax.exe_abate = 0
+            THEN NULL ELSE
+            tax.exe_homeowner + tax.exe_senior + tax.exe_freeze
+            + tax.exe_longtime_homeowner + tax.exe_disabled
+            + tax.exe_vet_returning + tax.exe_vet_dis_lt50
+            + tax.exe_vet_dis_50_69 + tax.exe_vet_dis_ge70 + tax.exe_abate
+    END AS exe_total,
     tcd.tax_code_rate,
     eqf.eq_factor_tentative,
     eqf.eq_factor_final,

From 271576d839c8cde9cb0359b1bff54d4205d22efd Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 2 Jul 2024 16:34:05 +0000
Subject: [PATCH 33/96] Add more ratio stats

---
 .../reporting/reporting.sot_ratio_stats.py    | 24 ++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index d0be7e91f..c226dc532 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -104,6 +104,14 @@ def first(x):
     return output
 
 
+def met(x, lower_limit, upper_limit):
+    return np.logical_and(lower_limit <= x, x <= upper_limit)
+
+
+def within(x, limit):
+    return np.logical_and(1 - limit < x, x < 1 + limit)
+
+
 # Define aggregation functions
 def aggregrate(data, geography_type, group_type):
     print(geography_type, group_type)
@@ -253,6 +261,16 @@ def clean(dirty):
     ] = "Yes"
     dirty = dirty.drop(["triennial", "triad"], axis=1)
 
+    dirty["cod_met"] = met(dirty["cod"], 5, 15)
+    dirty["prd_met"] = met(dirty["prd"], 0.98, 1.03)
+    dirty["prb_met"] = met(dirty["prb"], -0.05, 0.05)
+    dirty["mki_met"] = met(dirty["mki"], 0.95, 1.05)
+
+    dirty["within_05_pct"] = within(dirty["ratio_mean"], 0.05)
+    dirty["within_10_pct"] = within(dirty["ratio_mean"], 0.1)
+    dirty["within_15_pct"] = within(dirty["ratio_mean"], 0.15)
+    dirty["within_20_pct"] = within(dirty["ratio_mean"], 0.2)
+
     dirty = dirty.astype(
         {
             "group_id": "str",
@@ -303,7 +321,11 @@ def model(dbt, spark_session):
         + "ratio_q90: double, ratio_max: double, ratio_mean: double, "
         + "cod: double, prd: double, prb: double, mki: double, "
         + "mv_delta_pct_median: double, mv_delta_pct_mean: double, "
-        + "mv_delta_pct_sum: double, reassessment_year: string"
+        + "mv_delta_pct_sum: double, reassessment_year: string, "
+        + "cod_met: boolean, prd_met: boolean, prb_met: boolean, "
+        + "mki_met: boolean, within_05_pct: boolean, "
+        + "within_10_pct: boolean, within_15_pct: boolean, "
+        + "within_20_pct: boolean"
     )
 
     spark_df = spark_session.createDataFrame(df, schema=schema)

From c39a2d82c6178cf176758812c5f4a7ba7a5c3a57 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 2 Jul 2024 22:00:49 +0000
Subject: [PATCH 34/96] Clean sales table columns

---
 dbt/models/reporting/reporting.sot_sales.py   | 110 ++++++++++++++----
 .../reporting/reporting.sot_sales_input.sql   |  14 +--
 2 files changed, 90 insertions(+), 34 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index 65889953b..aa8229b87 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -87,10 +87,10 @@ def first(x):
 
 agg_func_math = {
     "sale_price": ["size", "count"] + more_stats,
-    "price_per_sf": more_stats,
-    "char_bldg_sf": ["median"],
-    "char_land_sf": ["median"],
-    "char_yrblt": ["median"],
+    "sale_price_per_sf": more_stats,
+    "sale_char_bldg_sf": ["median"],
+    "sale_char_land_sf": ["median"],
+    "sale_char_yrblt": ["median"],
     "class": [stats.multimode],
     "data_year": [first],
 }
@@ -133,17 +133,76 @@ def assemble(df, geos, groups):
     output["sale_price", "sum"] = output["sale_price", "sum"].replace(
         0, np.NaN
     )
-    output["price_per_sf", "sum"] = output["price_per_sf", "sum"].replace(
-        0, np.NaN
-    )
+    output["sale_price_per_sf", "sum"] = output[
+        "sale_price_per_sf", "sum"
+    ].replace(0, np.NaN)
 
     for i in ["median", "mean", "sum"]:
         output["sale_price", "delta" + i] = output["sale_price", i].diff()
-        output["price_per_sf", "delta" + i] = output["price_per_sf", i].diff()
+        output["sale_price_per_sf", "delta" + i] = output[
+            "sale_price_per_sf", i
+        ].diff()
 
     output.columns = ["_".join(col) for col in output.columns]
     output = output.reset_index()
 
+    output = clean_names(output)
+
+    return output
+
+
+def clean_names(x):
+    output = x.rename(
+        columns={
+            "sale_price_size": "pin_n_tot",
+            "year": "sale_year",
+            "sale_price_count": "sale_n_tot",
+            "class_multimode": "sale_class_mode",
+            "data_year_first": "data_year",
+        }
+    )
+
+    output = output[
+        [
+            "geography_type",
+            "geography_id",
+            "group_type",
+            "group_id",
+            "sale_year",
+            "pin_n_tot",
+            "sale_n_tot",
+            "sale_price_min",
+            "sale_price_q10",
+            "sale_price_q25",
+            "sale_price_median",
+            "sale_price_q75",
+            "sale_price_q90",
+            "sale_price_max",
+            "sale_price_mean",
+            "sale_price_sum",
+            "sale_price_deltamedian",
+            "sale_price_deltamean",
+            "sale_price_deltasum",
+            "sale_price_per_sf_min",
+            "sale_price_per_sf_q10",
+            "sale_price_per_sf_q25",
+            "sale_price_per_sf_median",
+            "sale_price_per_sf_q75",
+            "sale_price_per_sf_q90",
+            "sale_price_per_sf_max",
+            "sale_price_per_sf_mean",
+            "sale_price_per_sf_sum",
+            "sale_price_per_sf_deltamedian",
+            "sale_price_per_sf_deltamean",
+            "sale_price_per_sf_deltasum",
+            "sale_char_bldg_sf_median",
+            "sale_char_land_sf_median",
+            "sale_char_yrblt_median",
+            "sale_class_mode",
+            "data_year",
+        ]
+    ]
+
     return output
 
 
@@ -160,22 +219,25 @@ def model(dbt, spark_session):
 
     schema = (
         "geography_type: string, geography_id: string, group_type: string, "
-        + "group_id: string, year: string, sale_price_size: bigint, "
-        + "sale_price_count: int, sale_price_min: double, "
-        + "sale_price_q10: double, sale_price_q25: double, "
-        + "sale_price_median: double, sale_price_q75: double, "
-        + "sale_price_q90: double, sale_price_max: double, "
-        + "sale_price_mean: double, sale_price_sum: double, "
-        + "price_per_sf_min: double, price_per_sf_q10: double, "
-        + "price_per_sf_q25: double, price_per_sf_median: double, "
-        + "price_per_sf_q75: double, price_per_sf_q90: double, "
-        + "price_per_sf_max: double, price_per_sf_mean: double, "
-        + "price_per_sf_sum: double, char_bldg_sf_median: double, "
-        + "char_land_sf_median: double, char_yrblt_median: double, "
-        + "class_multimode: array<string>, data_year_first: string,"
-        + "sale_price_deltamedian: double, price_per_sf_deltamedian: double, "
-        + "sale_price_deltamean: double, price_per_sf_deltamean: double, "
-        + "sale_price_deltasum: double, price_per_sf_deltasum: double"
+        + "group_id: string, sale_year: string, pin_n_tot: bigint, "
+        + "sale_n_tot: int, sale_price_min: double, sale_price_q10: double, "
+        + "sale_price_q25: double, sale_price_median: double, "
+        + "sale_price_q75: double, sale_price_q90: double, "
+        + "sale_price_max: double, sale_price_mean: double, "
+        + "sale_price_sum: double, sale_price_deltamedian: double, "
+        + "sale_price_deltamean: double, sale_price_deltasum: double, "
+        + "sale_price_per_sf_min: double, sale_price_per_sf_q10: double, "
+        + "sale_price_per_sf_q25: double, sale_price_per_sf_median: double, "
+        + "sale_price_per_sf_q75: double, sale_price_per_sf_q90: double, "
+        + "sale_price_per_sf_max: double, sale_price_per_sf_mean: double, "
+        + "sale_price_per_sf_sum: double, "
+        + "sale_price_per_sf_deltamedian: double, "
+        + "sale_price_per_sf_deltamean: double, "
+        + "sale_price_per_sf_deltasum: double, "
+        + "sale_char_bldg_sf_median: double, "
+        + "sale_char_land_sf_median: double, "
+        + "sale_char_yrblt_median: double, sale_class_mode: array<string>, "
+        + "data_year: string"
     )
 
     spark_df = spark_session.createDataFrame(df, schema=schema)
diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sales_input.sql
index dcd6fd085..575bb8aed 100644
--- a/dbt/models/reporting/reporting.sot_sales_input.sql
+++ b/dbt/models/reporting/reporting.sot_sales_input.sql
@@ -23,13 +23,10 @@ SELECT
     CASE WHEN sf.char_bldg_sf > 0
             THEN
             CAST(sales.sale_price / sf.char_bldg_sf AS DOUBLE)
-    END AS price_per_sf,
-    CAST(sf.char_bldg_sf AS INT) AS char_bldg_sf,
-    CAST(sf.char_land_sf AS INT) AS char_land_sf,
-    CAST(sf.char_yrblt AS INT) AS char_yrblt,
-    CAST(hist.oneyr_pri_mailed_bldg AS DOUBLE) AS oneyr_pri_mailed_bldg,
-    CAST(hist.oneyr_pri_mailed_land AS DOUBLE) AS oneyr_pri_mailed_land,
-    CAST(hist.oneyr_pri_mailed_tot AS DOUBLE) AS oneyr_pri_mailed_tot,
+    END AS sale_price_per_sf,
+    CAST(sf.char_bldg_sf AS INT) AS sale_char_bldg_sf,
+    CAST(sf.char_land_sf AS INT) AS sale_char_land_sf,
+    CAST(sf.char_yrblt AS INT) AS sale_char_yrblt,
     uni.year,
     uni.class,
     'Cook' AS county,
@@ -92,9 +89,6 @@ LEFT JOIN sf
     AND uni.year = sf.year
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
-LEFT JOIN {{ ref('default.vw_pin_history') }} AS hist
-    ON uni.pin = hist.pin
-    AND uni.year = hist.year
 LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     ON uni.pin = sales.pin
     AND uni.year = sales.year

From 20c9bd6a59120077991aa14f20caeae60f00f608 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 3 Jul 2024 16:53:36 +0000
Subject: [PATCH 35/96] Clean taxes table columns

---
 dbt/models/reporting/reporting.sot_sales.py   |  26 +--
 .../reporting.sot_taxes_exemptions.py         | 170 ++++++++++++++----
 .../reporting.sot_taxes_exemptions_input.sql  |  31 ++--
 3 files changed, 160 insertions(+), 67 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index aa8229b87..6d9c8f563 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -138,8 +138,8 @@ def assemble(df, geos, groups):
     ].replace(0, np.NaN)
 
     for i in ["median", "mean", "sum"]:
-        output["sale_price", "delta" + i] = output["sale_price", i].diff()
-        output["sale_price_per_sf", "delta" + i] = output[
+        output["sale_price", "delta_" + i] = output["sale_price", i].diff()
+        output["sale_price_per_sf", "delta_" + i] = output[
             "sale_price_per_sf", i
         ].diff()
 
@@ -180,9 +180,9 @@ def clean_names(x):
             "sale_price_max",
             "sale_price_mean",
             "sale_price_sum",
-            "sale_price_deltamedian",
-            "sale_price_deltamean",
-            "sale_price_deltasum",
+            "sale_price_delta_median",
+            "sale_price_delta_mean",
+            "sale_price_delta_sum",
             "sale_price_per_sf_min",
             "sale_price_per_sf_q10",
             "sale_price_per_sf_q25",
@@ -192,9 +192,9 @@ def clean_names(x):
             "sale_price_per_sf_max",
             "sale_price_per_sf_mean",
             "sale_price_per_sf_sum",
-            "sale_price_per_sf_deltamedian",
-            "sale_price_per_sf_deltamean",
-            "sale_price_per_sf_deltasum",
+            "sale_price_per_sf_delta_median",
+            "sale_price_per_sf_delta_mean",
+            "sale_price_per_sf_delta_sum",
             "sale_char_bldg_sf_median",
             "sale_char_land_sf_median",
             "sale_char_yrblt_median",
@@ -224,16 +224,16 @@ def model(dbt, spark_session):
         + "sale_price_q25: double, sale_price_median: double, "
         + "sale_price_q75: double, sale_price_q90: double, "
         + "sale_price_max: double, sale_price_mean: double, "
-        + "sale_price_sum: double, sale_price_deltamedian: double, "
-        + "sale_price_deltamean: double, sale_price_deltasum: double, "
+        + "sale_price_sum: double, sale_price_delta_median: double, "
+        + "sale_price_delta_mean: double, sale_price_delta_sum: double, "
         + "sale_price_per_sf_min: double, sale_price_per_sf_q10: double, "
         + "sale_price_per_sf_q25: double, sale_price_per_sf_median: double, "
         + "sale_price_per_sf_q75: double, sale_price_per_sf_q90: double, "
         + "sale_price_per_sf_max: double, sale_price_per_sf_mean: double, "
         + "sale_price_per_sf_sum: double, "
-        + "sale_price_per_sf_deltamedian: double, "
-        + "sale_price_per_sf_deltamean: double, "
-        + "sale_price_per_sf_deltasum: double, "
+        + "sale_price_per_sf_delta_median: double, "
+        + "sale_price_per_sf_delta_mean: double, "
+        + "sale_price_per_sf_delta_sum: double, "
         + "sale_char_bldg_sf_median: double, "
         + "sale_char_land_sf_median: double, "
         + "sale_char_yrblt_median: double, sale_class_mode: array<string>, "
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index 044f59a64..52a085aab 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -88,22 +88,22 @@ def first(x):
 less_stats = ["count", "sum"]
 
 agg_func_math = {
-    "eq_factor_final": ["size", first],
-    "eq_factor_tentative": [first],
+    "tax_eq_factor_final": ["size", first],
+    "tax_eq_factor_tentative": [first],
     "tax_bill_total": more_stats,
-    "tax_code_rate": more_stats,
-    "av_clerk": more_stats,
-    "exe_homeowner": less_stats,
-    "exe_senior": less_stats,
-    "exe_freeze": less_stats,
-    "exe_longtime_homeowner": less_stats,
-    "exe_disabled": less_stats,
-    "exe_vet_returning": less_stats,
-    "exe_vet_dis_lt50": less_stats,
-    "exe_vet_dis_50_69": less_stats,
-    "exe_vet_dis_ge70": less_stats,
-    "exe_abate": less_stats,
-    "exe_total": less_stats,
+    "tax_rate": more_stats,
+    "tax_av": more_stats,
+    "tax_exe_homeowner": less_stats,
+    "tax_exe_senior": less_stats,
+    "tax_exe_freeze": less_stats,
+    "tax_exe_longtime_homeowner": less_stats,
+    "tax_exe_disabled": less_stats,
+    "tax_exe_vet_returning": less_stats,
+    "tax_exe_vet_dis_lt50": less_stats,
+    "tax_exe_vet_dis_50_69": less_stats,
+    "tax_exe_vet_dis_ge70": less_stats,
+    "tax_exe_abate": less_stats,
+    "tax_exe_total": less_stats,
 }
 
 
@@ -142,13 +142,103 @@ def assemble(df, geos, groups):
 
     # Clean combined output and export
     for i in ["median", "mean", "sum"]:
-        output["tax_bill_total", "delta" + i] = output[
+        output["tax_bill_total", "delta_" + i] = output[
             "tax_bill_total", i
         ].diff()
 
     output.columns = ["_".join(col) for col in output.columns]
     output = output.reset_index()
 
+    output = clean_names(output)
+
+    return output
+
+
+def clean_names(x):
+    output = x.rename(
+        columns={
+            "tax_eq_factor_final_size": "pin_n_tot",
+            "year": "tax_year",
+            "tax_exe_homeowner_count": "tax_exe_n_homeowner",
+            "tax_exe_senior_count": "tax_exe_n_senior",
+            "tax_exe_freeze_count": "tax_exe_n_freeze",
+            "tax_exe_longtime_homeowner_count": "tax_exe_n_longtime_homeowner",
+            "tax_exe_disabled_count": "tax_exe_n_disabled",
+            "tax_exe_vet_returning_count": "tax_exe_n_vet_returning",
+            "tax_exe_vet_dis_lt50_count": "tax_exe_n_vet_dis_lt50",
+            "tax_exe_vet_dis_50_69_count": "tax_exe_n_vet_dis_50_69",
+            "tax_exe_vet_dis_ge70_count": "tax_exe_n_vet_dis_ge70",
+            "tax_exe_abate_count": "tax_exe_n_abate",
+            "tax_exe_total_count": "tax_exe_n_total",
+            "tax_eq_factor_final_first": "tax_eq_factor_final",
+            "tax_eq_factor_tentative_first": "tax_eq_factor_tentative",
+        }
+    )
+
+    output = output[
+        [
+            "geography_type",
+            "geography_id",
+            "group_type",
+            "group_id",
+            "tax_year",
+            "pin_n_tot",
+            "tax_eq_factor_final",
+            "tax_eq_factor_tentative",
+            "tax_bill_total_min",
+            "tax_bill_total_q10",
+            "tax_bill_total_q25",
+            "tax_bill_total_median",
+            "tax_bill_total_q75",
+            "tax_bill_total_q90",
+            "tax_bill_total_max",
+            "tax_bill_total_mean",
+            "tax_bill_total_sum",
+            "tax_bill_total_delta_median",
+            "tax_bill_total_delta_mean",
+            "tax_bill_total_delta_sum",
+            "tax_rate_min",
+            "tax_rate_q10",
+            "tax_rate_q25",
+            "tax_rate_median",
+            "tax_rate_q75",
+            "tax_rate_q90",
+            "tax_rate_max",
+            "tax_rate_mean",
+            "tax_rate_sum",
+            "tax_av_min",
+            "tax_av_q10",
+            "tax_av_q25",
+            "tax_av_median",
+            "tax_av_q75",
+            "tax_av_q90",
+            "tax_av_max",
+            "tax_av_mean",
+            "tax_av_sum",
+            "tax_exe_n_homeowner",
+            "tax_exe_homeowner_sum",
+            "tax_exe_n_senior",
+            "tax_exe_senior_sum",
+            "tax_exe_n_freeze",
+            "tax_exe_freeze_sum",
+            "tax_exe_n_longtime_homeowner",
+            "tax_exe_longtime_homeowner_sum",
+            "tax_exe_n_disabled",
+            "tax_exe_disabled_sum",
+            "tax_exe_n_vet_returning",
+            "tax_exe_vet_returning_sum",
+            "tax_exe_n_vet_dis_lt50",
+            "tax_exe_vet_dis_lt50_sum",
+            "tax_exe_n_vet_dis_50_69",
+            "tax_exe_vet_dis_50_69_sum",
+            "tax_exe_n_vet_dis_ge70",
+            "tax_exe_vet_dis_ge70_sum",
+            "tax_exe_n_abate",
+            "tax_exe_abate_sum",
+            "tax_exe_n_total",
+            "tax_exe_total_sum",
+        ]
+    ]
     return output
 
 
@@ -165,33 +255,35 @@ def model(dbt, spark_session):
 
     schema = (
         "geography_type: string, geography_id: string, group_type: string, "
-        + "group_id: string, year: string, eq_factor_final_size: bigint, "
-        + "eq_factor_final_first: double, eq_factor_tentative_first: double, "
+        + "group_id: string, tax_year: string, pin_n_tot: bigint, "
+        + "tax_eq_factor_final: double, tax_eq_factor_tentative: double, "
         + "tax_bill_total_min: double, tax_bill_total_q10: double, "
         + "tax_bill_total_q25: double, tax_bill_total_median: double, "
         + "tax_bill_total_q75: double, tax_bill_total_q90: double, "
         + "tax_bill_total_max: double, tax_bill_total_mean: double, "
-        + "tax_bill_total_sum: double, tax_code_rate_min: double, "
-        + "tax_code_rate_q10: double, tax_code_rate_q25: double, "
-        + "tax_code_rate_median: double, tax_code_rate_q75: double, "
-        + "tax_code_rate_q90: double, tax_code_rate_max: double, "
-        + "tax_code_rate_mean: double, tax_code_rate_sum: double, "
-        + "av_clerk_min: int, av_clerk_q10: double, av_clerk_q25: double, "
-        + "av_clerk_median: double, av_clerk_q75: double, "
-        + "av_clerk_q90: double, av_clerk_max: int, av_clerk_mean: double, "
-        + "av_clerk_sum: double, exe_homeowner_count: bigint, "
-        + "exe_homeowner_sum: double, exe_senior_count: bigint, "
-        + "exe_senior_sum: double, exe_freeze_count: bigint, "
-        + "exe_freeze_sum: double, exe_longtime_homeowner_count: bigint, "
-        + "exe_longtime_homeowner_sum: double, exe_disabled_count: bigint, "
-        + "exe_disabled_sum: double, exe_vet_returning_count: bigint, "
-        + "exe_vet_returning_sum: double, exe_vet_dis_lt50_count: bigint, "
-        + "exe_vet_dis_lt50_sum: double, exe_vet_dis_50_69_count: bigint, "
-        + "exe_vet_dis_50_69_sum: double, exe_vet_dis_ge70_count: bigint, "
-        + "exe_vet_dis_ge70_sum: double, exe_abate_count: bigint, "
-        + "exe_abate_sum: double, exe_total_count: bigint, "
-        + "exe_total_sum: double, tax_bill_total_deltamedian: double, "
-        + "tax_bill_total_deltamean: double, tax_bill_total_deltasum: double"
+        + "tax_bill_total_sum: double, tax_bill_total_delta_median: double, "
+        + "tax_bill_total_delta_mean: double, "
+        + "tax_bill_total_delta_sum: double , tax_rate_min: double, "
+        + "tax_rate_q10: double, tax_rate_q25: double, "
+        + "tax_rate_median: double, tax_rate_q75: double, "
+        + "tax_rate_q90: double, tax_rate_max: double, "
+        + "tax_rate_mean: double, tax_rate_sum: double, "
+        + "tax_av_min: int, tax_av_q10: double, tax_av_q25: double, "
+        + "tax_av_median: double, tax_av_q75: double, "
+        + "tax_av_q90: double, tax_av_max: int, tax_av_mean: double, "
+        + "tax_av_sum: double, tax_exe_n_homeowner: bigint, "
+        + "tax_exe_homeowner_sum: double, tax_exe_n_senior: bigint, "
+        + "tax_exe_senior_sum: double, tax_exe_n_freeze: bigint, "
+        + "tax_exe_freeze_sum: double, tax_exe_n_longtime_homeowner: bigint, "
+        + "tax_exe_longtime_homeowner_sum: double, "
+        + "tax_exe_n_disabled: bigint, tax_exe_disabled_sum: double, "
+        + "tax_exe_n_vet_returning: bigint, "
+        + "tax_exe_vet_returning_sum: double, tax_exe_n_vet_dis_lt50: bigint, "
+        + "tax_exe_vet_dis_lt50_sum: double, tax_exe_n_vet_dis_50_69: bigint, "
+        + "tax_exe_vet_dis_50_69_sum: double, tax_exe_n_vet_dis_ge70: bigint, "
+        + "tax_exe_vet_dis_ge70_sum: double, tax_exe_n_abate: bigint, "
+        + "tax_exe_abate_sum: double, tax_exe_n_total: bigint, "
+        + "tax_exe_total_sum: double"
     )
 
     spark_df = spark_session.createDataFrame(df, schema=schema)
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
index 281b69267..97cfff982 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
@@ -17,31 +17,32 @@ WITH tcd AS (
 SELECT
     uni.pin,
     tax.year,
-    tax.av_clerk,
+    tax.av_clerk AS tax_av,
     tax.tax_bill_total,
     CASE WHEN tax.exe_homeowner = 0 THEN NULL ELSE tax.exe_homeowner END
-        AS exe_homeowner,
+        AS tax_exe_homeowner,
     CASE WHEN tax.exe_senior = 0 THEN NULL ELSE tax.exe_senior END
-        AS exe_senior,
+        AS tax_exe_senior,
     CASE WHEN tax.exe_freeze = 0 THEN NULL ELSE tax.exe_freeze END
-        AS exe_freeze,
+        AS tax_exe_freeze,
     CASE
         WHEN tax.exe_longtime_homeowner = 0 THEN NULL ELSE
             tax.exe_longtime_homeowner
-    END AS exe_longtime_homeowner,
+    END AS tax_exe_longtime_homeowner,
     CASE WHEN tax.exe_disabled = 0 THEN NULL ELSE tax.exe_disabled END
-        AS exe_disabled,
+        AS tax_exe_disabled,
     CASE
         WHEN tax.exe_vet_returning = 0 THEN NULL ELSE tax.exe_vet_returning
-    END AS exe_vet_returning,
+    END AS tax_exe_vet_returning,
     CASE WHEN tax.exe_vet_dis_lt50 = 0 THEN NULL ELSE tax.exe_vet_dis_lt50 END
-        AS exe_vet_dis_lt50,
+        AS tax_exe_vet_dis_lt50,
     CASE
         WHEN tax.exe_vet_dis_50_69 = 0 THEN NULL ELSE tax.exe_vet_dis_50_69
-    END AS exe_vet_dis_50_69,
+    END AS tax_exe_vet_dis_50_69,
     CASE WHEN tax.exe_vet_dis_ge70 = 0 THEN NULL ELSE tax.exe_vet_dis_ge70 END
-        AS exe_vet_dis_ge70,
-    CASE WHEN tax.exe_abate = 0 THEN NULL ELSE tax.exe_abate END AS exe_abate,
+        AS tax_exe_vet_dis_ge70,
+    CASE WHEN tax.exe_abate = 0 THEN NULL ELSE tax.exe_abate END
+        AS tax_exe_abate,
     CASE
         WHEN tax.exe_homeowner + tax.exe_senior + tax.exe_freeze
             + tax.exe_longtime_homeowner + tax.exe_disabled
@@ -52,10 +53,10 @@ SELECT
             + tax.exe_longtime_homeowner + tax.exe_disabled
             + tax.exe_vet_returning + tax.exe_vet_dis_lt50
             + tax.exe_vet_dis_50_69 + tax.exe_vet_dis_ge70 + tax.exe_abate
-    END AS exe_total,
-    tcd.tax_code_rate,
-    eqf.eq_factor_tentative,
-    eqf.eq_factor_final,
+    END AS tax_exe_total,
+    tcd.tax_code_rate AS tax_rate,
+    eqf.eq_factor_tentative AS tax_eq_factor_tentative,
+    eqf.eq_factor_final AS tax_eq_factor_final,
     uni.class,
     'Cook' AS county,
     uni.triad_name AS triad,

From adc16eaac42f805036ec8ee12ed0f34dbd39b23a Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Sun, 7 Jul 2024 13:37:03 +0000
Subject: [PATCH 36/96] Clean assessment_roll columns

---
 .../reporting.sot_assessment_roll.py          | 122 ++++++++++++++----
 .../reporting.sot_assessment_roll_input.sql   |   6 +-
 dbt/models/reporting/reporting.sot_sales.py   |  14 +-
 .../reporting.sot_taxes_exemptions.py         |   9 +-
 4 files changed, 112 insertions(+), 39 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 5a51c704c..51a1c6f8d 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -91,10 +91,11 @@ def first(x):
 ]
 
 stats = {
-    "tot": ["size", "count"] + more_stats,
-    "bldg": more_stats,
-    "land": more_stats,
+    "av_tot": ["size", "count"] + more_stats,
+    "av_bldg": more_stats,
+    "av_land": more_stats,
     "triad": [first],
+    "geography_data_year": [first],
 }
 
 
@@ -126,7 +127,7 @@ def assemble(df, geos, groups):
 
     # Loop through group combinations and stack output
     for key, value in geos.items():
-        df["data_year"] = df[key]
+        df["geography_data_year"] = df[key]
 
         for x in value:
             for z in groups:
@@ -134,12 +135,12 @@ def assemble(df, geos, groups):
 
     # Clean combined output and export
     for i in ["median", "mean", "sum"]:
-        output["tot", "delta" + i] = output["tot", i].diff()
-        output["bldg", "delta" + i] = output["bldg", i].diff()
-        output["land", "delta" + i] = output["land", i].diff()
+        output["av_tot", "delta_" + i] = output["av_tot", i].diff()
+        output["av_bldg", "delta_" + i] = output["av_bldg", i].diff()
+        output["av_land", "delta_" + i] = output["av_land", i].diff()
 
-    output["tot", "pct_w_value"] = (
-        output["tot", "count"] / output["tot", "size"]
+    output["av_tot", "pct_w_value"] = (
+        output["av_tot", "count"] / output["av_tot", "size"]
     )
 
     output.columns = ["_".join(col) for col in output.columns]
@@ -174,6 +175,73 @@ def assemble(df, geos, groups):
     ] = "Yes"
     output = output.drop(["triennial", "triad"], axis=1)
 
+    output = clean_names(output)
+
+    return output
+
+
+def clean_names(x):
+    output = x.rename(
+        columns={
+            "av_tot_size": "pin_n_tot",
+            "av_tot_count": "pin_n_w_value",
+            "av_tot_pct_w_value": "pin_pct_w_value",
+            "geography_data_year_first": "geography_data_year",
+        }
+    )
+
+    output = output[
+        [
+            "geography_type",
+            "geography_id",
+            "geography_data_year",
+            "group_type",
+            "group_id",
+            "year",
+            "reassessment_year",
+            "stage_name",
+            "pin_n_tot",
+            "pin_n_w_value",
+            "pin_pct_w_value",
+            "av_tot_min",
+            "av_tot_q10",
+            "av_tot_q25",
+            "av_tot_median",
+            "av_tot_q75",
+            "av_tot_q90",
+            "av_tot_max",
+            "av_tot_mean",
+            "av_tot_sum",
+            "av_tot_delta_median",
+            "av_tot_delta_mean",
+            "av_tot_delta_sum",
+            "av_bldg_min",
+            "av_bldg_q10",
+            "av_bldg_q25",
+            "av_bldg_median",
+            "av_bldg_q75",
+            "av_bldg_q90",
+            "av_bldg_max",
+            "av_bldg_mean",
+            "av_bldg_sum",
+            "av_bldg_delta_median",
+            "av_bldg_delta_mean",
+            "av_bldg_delta_sum",
+            "av_land_min",
+            "av_land_q10",
+            "av_land_q25",
+            "av_land_median",
+            "av_land_q75",
+            "av_land_q90",
+            "av_land_max",
+            "av_land_mean",
+            "av_land_sum",
+            "av_land_delta_median",
+            "av_land_delta_mean",
+            "av_land_delta_sum",
+        ]
+    ]
+
     return output
 
 
@@ -189,23 +257,25 @@ def model(dbt, spark_session):
     df = assemble(input, geos=geos, groups=groups)
 
     schema = (
-        "geography_type: string, geography_id: string, group_type: string, "
-        + "group_id: string, year: bigint, stage_name: string, "
-        + "tot_size: bigint, tot_count: bigint, tot_min: double, "
-        + "tot_q10: double, tot_q25: double, tot_median: double, "
-        + "tot_q75: double, tot_q90: double, tot_max: double, "
-        + "tot_mean: double, tot_sum: double, bldg_min: double, "
-        + "bldg_q10: double, bldg_q25: double, bldg_median: double, "
-        + "bldg_q75: double, bldg_q90: double, bldg_max: double, "
-        + "bldg_mean: double, bldg_sum: double, land_min: double, "
-        + "land_q10: double, land_q25: double, land_median: double, "
-        + "land_q75: double, land_q90: double, land_max: double, "
-        + "land_mean: double, land_sum: double, tot_deltamedian: double, "
-        + "bldg_deltamedian: double, land_deltamedian: double, "
-        + "tot_deltamean: double, bldg_deltamean: double, "
-        + "land_deltamean: double, tot_deltasum: double, "
-        + "bldg_deltasum: double, land_deltasum: double, "
-        + "tot_pct_w_value: double, reassessment_year: string"
+        "geography_type: string, geography_id: string, "
+        + "geography_data_year: string, group_type: string, group_id: string, "
+        + "year: string, reassessment_year: string, stage_name: string, "
+        + "pin_n_tot: bigint, pin_n_w_value: bigint, pin_pct_w_value: double, "
+        + "av_tot_min: double, av_tot_q10: double, av_tot_q25: double, "
+        + "av_tot_median: double, av_tot_q75: double, av_tot_q90: double, "
+        + "av_tot_max: double, av_tot_mean: double, av_tot_sum: double, "
+        + "av_tot_delta_median: double, av_tot_delta_mean: double, "
+        + "av_tot_delta_sum: double, av_bldg_min: double, "
+        + "av_bldg_q10: double, av_bldg_q25: double, av_bldg_median: double, "
+        + "av_bldg_q75: double, av_bldg_q90: double, av_bldg_max: double, "
+        + "av_bldg_mean: double, av_bldg_sum: double, "
+        + "av_bldg_delta_median: double, av_bldg_delta_mean: double, "
+        + "av_bldg_delta_sum: double, av_land_min: double, "
+        + "av_land_q10: double, av_land_q25: double, av_land_median: double, "
+        + "av_land_q75: double, av_land_q90: double, av_land_max: double, "
+        + "av_land_mean: double, av_land_sum: double, "
+        + "av_land_delta_median: double, av_land_delta_mean: double, "
+        + "av_land_delta_sum: double"
     )
 
     spark_df = spark_session.createDataFrame(df, schema=schema)
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index 470a01d27..91397a3f3 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -33,9 +33,9 @@ SELECT
     uni.year,
     uni.stage_name,
     uni.class,
-    CAST(vals.tot AS INT) AS tot,
-    CAST(vals.bldg AS INT) AS bldg,
-    CAST(vals.land AS INT) AS land,
+    CAST(vals.tot AS INT) AS av_tot,
+    CAST(vals.bldg AS INT) AS av_bldg,
+    CAST(vals.land AS INT) AS av_land,
     'Cook' AS county,
     uni.triad_name AS triad,
     uni.township_name AS township,
diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index 6d9c8f563..81d42b872 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -92,7 +92,7 @@ def first(x):
     "sale_char_land_sf": ["median"],
     "sale_char_yrblt": ["median"],
     "class": [stats.multimode],
-    "data_year": [first],
+    "geography_data_year": [first],
 }
 
 
@@ -123,7 +123,7 @@ def assemble(df, geos, groups):
 
     # Loop through group combinations and stack output
     for key, value in geos.items():
-        df["data_year"] = df[key]
+        df["geography_data_year"] = df[key]
 
         for x in value:
             for z in groups:
@@ -158,7 +158,7 @@ def clean_names(x):
             "year": "sale_year",
             "sale_price_count": "sale_n_tot",
             "class_multimode": "sale_class_mode",
-            "data_year_first": "data_year",
+            "geography_data_year_first": "geography_data_year",
         }
     )
 
@@ -166,6 +166,7 @@ def clean_names(x):
         [
             "geography_type",
             "geography_id",
+            "geography_data_year",
             "group_type",
             "group_id",
             "sale_year",
@@ -199,7 +200,6 @@ def clean_names(x):
             "sale_char_land_sf_median",
             "sale_char_yrblt_median",
             "sale_class_mode",
-            "data_year",
         ]
     ]
 
@@ -218,7 +218,8 @@ def model(dbt, spark_session):
     df = assemble(input, geos=geos, groups=groups)
 
     schema = (
-        "geography_type: string, geography_id: string, group_type: string, "
+        "geography_type: string, geography_id: string, "
+        + "geography_data_year: string, group_type: string, "
         + "group_id: string, sale_year: string, pin_n_tot: bigint, "
         + "sale_n_tot: int, sale_price_min: double, sale_price_q10: double, "
         + "sale_price_q25: double, sale_price_median: double, "
@@ -236,8 +237,7 @@ def model(dbt, spark_session):
         + "sale_price_per_sf_delta_sum: double, "
         + "sale_char_bldg_sf_median: double, "
         + "sale_char_land_sf_median: double, "
-        + "sale_char_yrblt_median: double, sale_class_mode: array<string>, "
-        + "data_year: string"
+        + "sale_char_yrblt_median: double, sale_class_mode: array<string>"
     )
 
     spark_df = spark_session.createDataFrame(df, schema=schema)
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index 52a085aab..879dc1cf3 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -104,6 +104,7 @@ def first(x):
     "tax_exe_vet_dis_ge70": less_stats,
     "tax_exe_abate": less_stats,
     "tax_exe_total": less_stats,
+    "geography_data_year": [first],
 }
 
 
@@ -134,7 +135,7 @@ def assemble(df, geos, groups):
 
     # Loop through group combinations and stack output
     for key, value in geos.items():
-        df["data_year"] = df[key]
+        df["geography_data_year"] = df[key]
 
         for x in value:
             for z in groups:
@@ -172,6 +173,7 @@ def clean_names(x):
             "tax_exe_total_count": "tax_exe_n_total",
             "tax_eq_factor_final_first": "tax_eq_factor_final",
             "tax_eq_factor_tentative_first": "tax_eq_factor_tentative",
+            "geography_data_year_first": "geography_data_year",
         }
     )
 
@@ -179,7 +181,7 @@ def clean_names(x):
         [
             "geography_type",
             "geography_id",
-            "group_type",
+            "geography_data_year", "group_type",
             "group_id",
             "tax_year",
             "pin_n_tot",
@@ -254,7 +256,8 @@ def model(dbt, spark_session):
     df = assemble(input, geos=geos, groups=groups)
 
     schema = (
-        "geography_type: string, geography_id: string, group_type: string, "
+        "geography_type: string, geography_id: string, "
+        + "geography_data_year: string, group_type: string, "
         + "group_id: string, tax_year: string, pin_n_tot: bigint, "
         + "tax_eq_factor_final: double, tax_eq_factor_tentative: double, "
         + "tax_bill_total_min: double, tax_bill_total_q10: double, "

From f8b87abfd0e0c42cd9fa95a4e8cd7115b6b42bd9 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Sun, 7 Jul 2024 14:29:12 +0000
Subject: [PATCH 37/96] Fix delta columns

---
 .../reporting.sot_assessment_roll.py          | 151 +++++++++++++++---
 dbt/models/reporting/reporting.sot_sales.py   |  50 ++++--
 .../reporting.sot_taxes_exemptions.py         |  28 +++-
 3 files changed, 193 insertions(+), 36 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 51a1c6f8d..d0a579b04 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -133,19 +133,122 @@ def assemble(df, geos, groups):
             for z in groups:
                 output = pd.concat([output, aggregrate(df, x, z)])
 
+    output.columns = ["_".join(col) for col in output.columns]
+    output = output.reset_index()
+    output = output.rename(columns={"triad_first": "triad"})
+
     # Clean combined output and export
-    for i in ["median", "mean", "sum"]:
-        output["av_tot", "delta_" + i] = output["av_tot", i].diff()
-        output["av_bldg", "delta_" + i] = output["av_bldg", i].diff()
-        output["av_land", "delta_" + i] = output["av_land", i].diff()
+    output["av_tot_pct_w_value"] = (
+        output["av_tot_count"] / output["av_tot_size"]
+    )
 
-    output["av_tot", "pct_w_value"] = (
-        output["av_tot", "count"] / output["av_tot", "size"]
+    output["av_tot_delta_median"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_tot_median.diff()
     )
 
-    output.columns = ["_".join(col) for col in output.columns]
-    output = output.reset_index()
-    output = output.rename(columns={"triad_first": "triad"})
+    output["av_tot_delta_mean"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_tot_mean.diff()
+    )
+
+    output["av_tot_delta_sum"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_tot_sum.diff()
+    )
+
+    output["av_bldg_delta_median"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_bldg_median.diff()
+    )
+
+    output["av_bldg_delta_mean"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_bldg_mean.diff()
+    )
+
+    output["av_bldg_delta_sum"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_bldg_sum.diff()
+    )
+
+    output["av_land_delta_median"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_land_median.diff()
+    )
+
+    output["av_land_delta_mean"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_land_mean.diff()
+    )
+
+    output["av_land_delta_sum"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_land_sum.diff()
+    )
+
+    output["av_tot_delta_pct_median"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_tot_median.pct_change()
+    )
+
+    output["av_tot_delta_pct_mean"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_tot_mean.pct_change()
+    )
+
+    output["av_tot_delta_pct_sum"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_tot_sum.pct_change()
+    )
+
+    output["av_bldg_delta_pct_median"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_bldg_median.pct_change()
+    )
+
+    output["av_bldg_delta_pct_mean"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_bldg_mean.pct_change()
+    )
+
+    output["av_bldg_delta_pct_sum"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_bldg_sum.pct_change()
+    )
+
+    output["av_land_delta_pct_median"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_land_median.pct_change()
+    )
+
+    output["av_land_delta_pct_mean"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_land_mean.pct_change()
+    )
+
+    output["av_land_delta_pct_sum"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .av_land_sum.pct_change()
+    )
 
     output["year"] = output["year"].astype(int)
     output["triennial"] = output["geography_type"].isin(
@@ -215,6 +318,9 @@ def clean_names(x):
             "av_tot_delta_median",
             "av_tot_delta_mean",
             "av_tot_delta_sum",
+            "av_tot_delta_pct_median",
+            "av_tot_delta_pct_mean",
+            "av_tot_delta_pct_sum",
             "av_bldg_min",
             "av_bldg_q10",
             "av_bldg_q25",
@@ -227,6 +333,9 @@ def clean_names(x):
             "av_bldg_delta_median",
             "av_bldg_delta_mean",
             "av_bldg_delta_sum",
+            "av_bldg_delta_pct_median",
+            "av_bldg_delta_pct_mean",
+            "av_bldg_delta_pct_sum",
             "av_land_min",
             "av_land_q10",
             "av_land_q25",
@@ -239,6 +348,9 @@ def clean_names(x):
             "av_land_delta_median",
             "av_land_delta_mean",
             "av_land_delta_sum",
+            "av_land_delta_pct_median",
+            "av_land_delta_pct_mean",
+            "av_land_delta_pct_sum",
         ]
     ]
 
@@ -265,17 +377,20 @@ def model(dbt, spark_session):
         + "av_tot_median: double, av_tot_q75: double, av_tot_q90: double, "
         + "av_tot_max: double, av_tot_mean: double, av_tot_sum: double, "
         + "av_tot_delta_median: double, av_tot_delta_mean: double, "
-        + "av_tot_delta_sum: double, av_bldg_min: double, "
-        + "av_bldg_q10: double, av_bldg_q25: double, av_bldg_median: double, "
-        + "av_bldg_q75: double, av_bldg_q90: double, av_bldg_max: double, "
-        + "av_bldg_mean: double, av_bldg_sum: double, "
+        + "av_tot_delta_sum: double, av_tot_delta_pct_median: double, "
+        + "av_tot_delta_pct_mean: double, av_tot_delta_pct_sum: double, "
+        + "av_bldg_min: double, av_bldg_q10: double, av_bldg_q25: double, "
+        + "av_bldg_median: double, av_bldg_q75: double, av_bldg_q90: double, "
+        + "av_bldg_max: double, av_bldg_mean: double, av_bldg_sum: double, "
         + "av_bldg_delta_median: double, av_bldg_delta_mean: double, "
-        + "av_bldg_delta_sum: double, av_land_min: double, "
-        + "av_land_q10: double, av_land_q25: double, av_land_median: double, "
-        + "av_land_q75: double, av_land_q90: double, av_land_max: double, "
-        + "av_land_mean: double, av_land_sum: double, "
+        + "av_bldg_delta_sum: double, av_bldg_delta_pct_median: double, "
+        + "av_bldg_delta_pct_mean: double, av_bldg_delta_pct_sum: double, "
+        + "av_land_min: double, av_land_q10: double, av_land_q25: double, "
+        + "av_land_median: double, av_land_q75: double, av_land_q90: double, "
+        + "av_land_max: double, av_land_mean: double, av_land_sum: double, "
         + "av_land_delta_median: double, av_land_delta_mean: double, "
-        + "av_land_delta_sum: double"
+        + "av_land_delta_sum: double, av_land_delta_pct_median: double, "
+        + "av_land_delta_pct_mean: double, av_land_delta_pct_sum: double"
     )
 
     spark_df = spark_session.createDataFrame(df, schema=schema)
diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index 81d42b872..b2d55f7ae 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -129,22 +129,50 @@ def assemble(df, geos, groups):
             for z in groups:
                 output = pd.concat([output, aggregrate(df, x, z)])
 
+    output.columns = ["_".join(col) for col in output.columns]
+    output = output.reset_index()
+
     # Clean combined output and export
-    output["sale_price", "sum"] = output["sale_price", "sum"].replace(
+    output["sale_price_sum"] = output["sale_price_sum"].replace(0, np.NaN)
+    output["sale_price_per_sf_sum"] = output["sale_price_per_sf_sum"].replace(
         0, np.NaN
     )
-    output["sale_price_per_sf", "sum"] = output[
-        "sale_price_per_sf", "sum"
-    ].replace(0, np.NaN)
 
-    for i in ["median", "mean", "sum"]:
-        output["sale_price", "delta_" + i] = output["sale_price", i].diff()
-        output["sale_price_per_sf", "delta_" + i] = output[
-            "sale_price_per_sf", i
-        ].diff()
+    output["sale_price_delta_median"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id"])
+        .sale_price_median.diff()
+    )
+
+    output["sale_price_delta_mean"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id"])
+        .sale_price_mean.diff()
+    )
 
-    output.columns = ["_".join(col) for col in output.columns]
-    output = output.reset_index()
+    output["sale_price_delta_sum"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id"])
+        .sale_price_sum.diff()
+    )
+
+    output["sale_price_per_sf_delta_median"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id"])
+        .sale_price_per_sf_median.diff()
+    )
+
+    output["sale_price_per_sf_delta_mean"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id"])
+        .sale_price_per_sf_mean.diff()
+    )
+
+    output["sale_price_per_sf_delta_sum"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id"])
+        .sale_price_per_sf_sum.diff()
+    )
 
     output = clean_names(output)
 
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index 879dc1cf3..fc8eb9cc2 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -141,15 +141,28 @@ def assemble(df, geos, groups):
             for z in groups:
                 output = pd.concat([output, aggregrate(df, x, z)])
 
-    # Clean combined output and export
-    for i in ["median", "mean", "sum"]:
-        output["tax_bill_total", "delta_" + i] = output[
-            "tax_bill_total", i
-        ].diff()
-
     output.columns = ["_".join(col) for col in output.columns]
     output = output.reset_index()
 
+    # Clean combined output and export
+    output["tax_bill_total_delta_median"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id"])
+        .tax_bill_total_median.diff()
+    )
+
+    output["tax_bill_total_delta_mean"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id"])
+        .tax_bill_total_mean.diff()
+    )
+
+    output["tax_bill_total_delta_sum"] = (
+        output.sort_values("year")
+        .groupby(["geography_id", "group_id"])
+        .tax_bill_total_sum.diff()
+    )
+
     output = clean_names(output)
 
     return output
@@ -181,7 +194,8 @@ def clean_names(x):
         [
             "geography_type",
             "geography_id",
-            "geography_data_year" "group_type",
+            "geography_data_year",
+            "group_type",
             "group_id",
             "tax_year",
             "pin_n_tot",

From 54ebab83646464af5003a00e1f931009c610ec20 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Sun, 7 Jul 2024 16:49:07 +0000
Subject: [PATCH 38/96] Clean ratio table columns

---
 .../reporting/reporting.sot_ratio_stats.py    | 116 +++++++++++++-----
 1 file changed, 88 insertions(+), 28 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index c226dc532..f3b373583 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -117,24 +117,24 @@ def aggregrate(data, geography_type, group_type):
     print(geography_type, group_type)
 
     group = [geography_type, group_type, "year", "stage_name"]
-    data["size"] = data.groupby(group)["tot_mv"].transform("size")
-    data["sale_count"] = data.groupby(group)["sale_price"].transform("count")
-    data["mv_count"] = data.groupby(group)["tot_mv"].transform("count")
+    data["pin_n_tot"] = data.groupby(group)["tot_mv"].transform("size")
+    data["sale_n_tot"] = data.groupby(group)["sale_price"].transform("count")
+    data["pin_n_w_value"] = data.groupby(group)["tot_mv"].transform("count")
 
     # Remove parcels with FMVs of 0 since they screw up ratios
     data = data[data["tot_mv"] > 0]
 
     # Remove groups that only have one sale since we can't calculate stats
     data = data.dropna(subset=["sale_price"])
-    data = data[data["sale_count"] >= 20]
+    data = data[data["sale_n_tot"] >= 20]
 
     summary = data.groupby(group).apply(
         lambda x: pd.Series(
             {
                 "triad": first(x["triad"]),
-                "size": np.size(x["ratio"]),
-                "mv_count": x["mv_count"].min(),
-                "sale_count": x["sale_count"].min(),
+                "pin_n_tot": np.size(x["ratio"]),
+                "pin_n_w_value": x["pin_n_w_value"].min(),
+                "sale_n_tot": x["sale_n_tot"].min(),
                 "mv_min": x["tot_mv"].min(),
                 "mv_q10": x["tot_mv"].quantile(0.1),
                 "mv_q25": x["tot_mv"].quantile(0.25),
@@ -152,7 +152,6 @@ def aggregrate(data, geography_type, group_type):
                 "ratio_q90": x["ratio"].quantile(0.90),
                 "ratio_max": x["ratio"].max(),
                 "ratio_mean": x["ratio"].mean(),
-                # "cod": ' '.join(x['ratio'].astype(str).values),
                 "cod": cod_safe(ratio=x["ratio"]),
                 "prd": prd_safe(
                     assessed=x["tot_mv"], sale_price=x["sale_price"]
@@ -163,6 +162,7 @@ def aggregrate(data, geography_type, group_type):
                 "mki": mki_safe(
                     assessed=x["tot_mv"], sale_price=x["sale_price"]
                 ),
+                "geography_data_year": first(x["data_year"]),
             }
         )
     )
@@ -203,18 +203,20 @@ def clean(dirty):
         ]
     )
 
+    dirty["pin_pct_w_value"] = dirty["pin_n_w_value"] / dirty["pin_n_tot"]
+
     # Clean combined dirty and export
-    dirty["mv_delta_pct_median"] = (
+    dirty["mv_delta_median"] = (
         dirty.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_median.diff()
     )
-    dirty["mv_delta_pct_mean"] = (
+    dirty["mv_delta_mean"] = (
         dirty.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_mean.diff()
     )
-    dirty["mv_delta_pct_sum"] = (
+    dirty["mv_delta_sum"] = (
         dirty.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_sum.diff()
@@ -225,12 +227,19 @@ def clean(dirty):
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_median.pct_change()
     )
+
     dirty["mv_delta_pct_mean"] = (
         dirty.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_mean.pct_change()
     )
 
+    dirty["mv_delta_pct_sum"] = (
+        dirty.sort_values("year")
+        .groupby(["geography_id", "group_id", "stage_name"])
+        .mv_sum.pct_change()
+    )
+
     dirty = dirty.reset_index()
 
     dirty["year"] = dirty["year"].astype(int)
@@ -274,12 +283,11 @@ def clean(dirty):
     dirty = dirty.astype(
         {
             "group_id": "str",
-            "year": np.int64,
+            "year": "str",
             "stage_name": "str",
             "reassessment_year": "str",
-            "size": np.int64,
-            "mv_count": np.int64,
-            "sale_count": np.int64,
+            "pin_n_w_value": np.int64,
+            "sale_n_tot": np.int64,
             "mv_min": np.int64,
             "mv_q10": np.int64,
             "mv_q25": np.int64,
@@ -292,6 +300,58 @@ def clean(dirty):
         }
     )
 
+    dirty = dirty[
+        [
+            "geography_type",
+            "geography_id",
+            "geography_data_year",
+            "group_type",
+            "group_id",
+            "year",
+            "reassessment_year",
+            "stage_name",
+            "pin_n_tot",
+            "pin_n_w_value",
+            "pin_pct_w_value",
+            "sale_n_tot",
+            "mv_min",
+            "mv_q10",
+            "mv_q25",
+            "mv_median",
+            "mv_q75",
+            "mv_q90",
+            "mv_max",
+            "mv_mean",
+            "mv_sum",
+            "mv_delta_median",
+            "mv_delta_mean",
+            "mv_delta_sum",
+            "mv_delta_pct_median",
+            "mv_delta_pct_mean",
+            "mv_delta_pct_sum",
+            "ratio_min",
+            "ratio_q10",
+            "ratio_q25",
+            "ratio_median",
+            "ratio_q75",
+            "ratio_q90",
+            "ratio_max",
+            "ratio_mean",
+            "cod",
+            "prd",
+            "prb",
+            "mki",
+            "cod_met",
+            "prd_met",
+            "prb_met",
+            "mki_met",
+            "within_05_pct",
+            "within_10_pct",
+            "within_15_pct",
+            "within_20_pct",
+        ]
+    ]
+
     return dirty
 
 
@@ -310,20 +370,20 @@ def model(dbt, spark_session):
 
     schema = (
         "geography_type: string, geography_id: string, "
-        + "group_type: string, group_id: string, year: bigint, "
-        + "stage_name: string, size: bigint, "
-        + "mv_count: bigint, "
-        + "sale_count: bigint, mv_min: bigint, mv_q10: bigint, "
+        + "geography_data_year: string, group_type: string, group_id: string, "
+        + "year: string, reassessment_year: string, stage_name: string, "
+        + "pin_n_tot: bigint, pin_n_w_value: bigint, pin_pct_w_value: double, "
+        + "sale_n_tot: bigint, mv_min: bigint, mv_q10: bigint, "
         + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, "
-        + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, "
-        + "mv_sum: bigint, ratio_min: double, ratio_q10: double, "
-        + "ratio_q25: double, ratio_median: double, ratio_q75: double, "
-        + "ratio_q90: double, ratio_max: double, ratio_mean: double, "
-        + "cod: double, prd: double, prb: double, mki: double, "
-        + "mv_delta_pct_median: double, mv_delta_pct_mean: double, "
-        + "mv_delta_pct_sum: double, reassessment_year: string, "
-        + "cod_met: boolean, prd_met: boolean, prb_met: boolean, "
-        + "mki_met: boolean, within_05_pct: boolean, "
+        + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, mv_sum: bigint, "
+        + "mv_delta_median: bigint, mv_delta_mean: bigint, "
+        + "mv_delta_sum: bigint, mv_delta_pct_median: double, "
+        + "mv_delta_pct_mean: double, mv_delta_pct_sum: double, "
+        + "ratio_min: double, ratio_q10: double, ratio_q25: double, "
+        + "ratio_median: double, ratio_q75: double, ratio_q90: double, "
+        + "ratio_max: double, ratio_mean: double, cod: double, prd: double, "
+        + "prb: double, mki: double, cod_met: boolean, prd_met: boolean, "
+        + "prb_met: boolean, mki_met: boolean, within_05_pct: boolean, "
         + "within_10_pct: boolean, within_15_pct: boolean, "
         + "within_20_pct: boolean"
     )

From d2dddab9693eabab606bc0763d47e65439e3723e Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Sun, 7 Jul 2024 17:35:01 +0000
Subject: [PATCH 39/96] Attempt to fix pin_n_tot type error that doesn't
 trigger locally

---
 dbt/models/reporting/reporting.sot_ratio_stats.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index f3b373583..3be65774c 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -287,6 +287,7 @@ def clean(dirty):
             "stage_name": "str",
             "reassessment_year": "str",
             "pin_n_w_value": np.int64,
+            "pin_n_tot": np.int64,
             "sale_n_tot": np.int64,
             "mv_min": np.int64,
             "mv_q10": np.int64,

From 00e790cc1729b69620210d1f8dcf444fd330a7a7 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Sun, 7 Jul 2024 18:56:46 +0000
Subject: [PATCH 40/96] Try again to fix pin_n_tot

---
 .../reporting/reporting.sot_ratio_stats.py    | 35 +------------------
 1 file changed, 1 insertion(+), 34 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index 3be65774c..22500e3ef 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -18,39 +18,6 @@
         "county",
         "triad",
         "township",
-        "nbhd",
-        "tax_code",
-        "zip_code",
-    ],
-    "census_data_year": [
-        "census_place",
-        "census_tract",
-        "census_congressional_district",
-        "census_zcta",
-    ],
-    "cook_board_of_review_district_data_year": [
-        "cook_board_of_review_district"
-    ],
-    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
-    "cook_judicial_district_data_year": ["cook_judicial_district"],
-    "ward_data_year": ["ward_num"],
-    "community_area_data_year": ["community_area"],
-    "police_district_data_year": ["police_district"],
-    "central_business_district_data_year": ["central_business_district"],
-    "school_data_year": [
-        "school_elementary_district",
-        "school_secondary_district",
-        "school_unified_district",
-    ],
-    "tax_data_year": [
-        "tax_municipality",
-        "tax_park_district",
-        "tax_library_district",
-        "tax_fire_protection_district",
-        "tax_community_college_district",
-        "tax_sanitation_district",
-        "tax_special_service_area",
-        "tax_tif_district",
     ],
 }
 # Declare class groupings
@@ -373,7 +340,7 @@ def model(dbt, spark_session):
         "geography_type: string, geography_id: string, "
         + "geography_data_year: string, group_type: string, group_id: string, "
         + "year: string, reassessment_year: string, stage_name: string, "
-        + "pin_n_tot: bigint, pin_n_w_value: bigint, pin_pct_w_value: double, "
+        + "pin_n_tot: double, pin_n_w_value: bigint, pin_pct_w_value: double, "
         + "sale_n_tot: bigint, mv_min: bigint, mv_q10: bigint, "
         + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, "
         + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, mv_sum: bigint, "

From 408de564d9e6ec68ffee98c84f95384b9b85e471 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Sun, 7 Jul 2024 20:03:26 +0000
Subject: [PATCH 41/96] Change ass roll sample to be able to compare across
 stages

---
 dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index 91397a3f3..a71b66ad2 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -97,4 +97,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
-WHERE uni.stage_name = 'MAILED' AND uni.class = '278' AND uni.year >= '2018'
+WHERE uni.class = '278' AND uni.year IN ('2019', '2020', '2021')

From fd95fcb8f878c0f40d11ac45e0cdc4c1e9c9965e Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Sun, 7 Jul 2024 23:23:36 +0000
Subject: [PATCH 42/96] Add commenting for input tables, try to partition
 assessment_roll table

---
 .../reporting.sot_assessment_roll_input.sql        | 14 ++++++++++----
 .../reporting/reporting.sot_ratio_stats_input.sql  |  8 +++++++-
 dbt/models/reporting/reporting.sot_sales_input.sql |  6 +++++-
 .../reporting.sot_taxes_exemptions_input.sql       |  9 +++++++--
 4 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index a71b66ad2..89724635c 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -1,7 +1,10 @@
--- Gather parcel-level geographies and join land, sales, and class groupings
+-- This script gathers parcel-level geographies and joins them to values and
+-- class groupings. Its sole purpose is to feed reporting.sot_assessment_roll,
+-- and should not be used otherwise.
 {{
     config(
-        materialized='table'
+        materialized='table',
+        partitioned_by=['year']
     )
 }}
 
@@ -21,6 +24,8 @@ WITH stages AS (
 
 ),
 
+-- Universe of all parcels as defined by iasworld.pardat, expanded with
+-- assessment stages.
 uni AS (
     SELECT
         vw_pin_universe.*,
@@ -30,7 +35,6 @@ uni AS (
 )
 
 SELECT
-    uni.year,
     uni.stage_name,
     uni.class,
     CAST(vals.tot AS INT) AS av_tot,
@@ -89,7 +93,8 @@ SELECT
     class_dict.major_class_type AS major_class,
     class_dict.modeling_group,
     CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END
-        AS res_other
+        AS res_other,
+    uni.year
 FROM uni
 LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     ON uni.pin = vals.pin
@@ -97,4 +102,5 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
+-- Temporary limit on feeder table to avoid GitHub runner memory issues.
 WHERE uni.class = '278' AND uni.year IN ('2019', '2020', '2021')
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
index 266024e0a..35d86e976 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
@@ -1,4 +1,7 @@
--- Gather parcel-level geographies and join land, sales, and class groupings
+/* This script gathers parcel-level geographies and joins them to values,
+sale prices, and class groupings in order to construct sales ratios. Its sole
+purpose is to feed reporting.sot_ratio_stats, and it should not be used
+otherwise. */
 {{
     config(
         materialized='table'
@@ -21,6 +24,8 @@ WITH stages AS (
 
 ),
 
+-- Universe of all parcels as defined by iasworld.pardat, expanded with
+-- assessment stages.
 uni AS (
     SELECT
         vw_pin_universe.*,
@@ -105,5 +110,6 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     AND NOT sales.sale_filter_deed_type
     AND NOT sales.sale_filter_less_than_10k
     AND NOT sales.sale_filter_same_sale_within_365
+-- Temporary limit on feeder table to avoid GitHub runner memory issues.
 WHERE uni.year >= '2020'
     AND uni.year IN ('2022', '2023') AND uni.class IN ('278', '597')
diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sales_input.sql
index 575bb8aed..72c4ba852 100644
--- a/dbt/models/reporting/reporting.sot_sales_input.sql
+++ b/dbt/models/reporting/reporting.sot_sales_input.sql
@@ -1,3 +1,7 @@
+-- This script gathers parcel-level geographies and joins them to sales and
+-- class groupings. Its sole purpose is to feed reporting.sot_sales,
+-- and should not be used otherwise.
+
 {{
     config(
         materialized='table'
@@ -16,7 +20,6 @@ WITH sf AS (
     GROUP BY pin, year
 )
 
--- Gather parcel-level geographies and join land, sales, and class groupings
 SELECT
     sales.doc_no,
     sales.sale_price,
@@ -96,4 +99,5 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     AND NOT sales.sale_filter_deed_type
     AND NOT sales.sale_filter_less_than_10k
     AND NOT sales.sale_filter_same_sale_within_365
+-- Temporary limit on feeder table to avoid GitHub runner memory issues.
 WHERE uni.year = '2023'
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
index 97cfff982..89d44c49c 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
@@ -1,9 +1,13 @@
+-- This script gathers parcel-level geographies and joins them to values, tax
+-- amounts, exemptions and class groupings. Its sole purpose is to feed
+-- reporting.sot_taxes_exemptions, and should not be used otherwise.
 {{
     config(
         materialized='table'
     )
 }}
 
+-- Gather unique tax codes and rates
 WITH tcd AS (
     SELECT DISTINCT
         tax_code_num,
@@ -12,13 +16,13 @@ WITH tcd AS (
     FROM {{ source('tax', 'tax_code') }}
 )
 
--- Gather parcel-level geographies and join taxes, exemptions, and class
--- groupings
 SELECT
     uni.pin,
     tax.year,
     tax.av_clerk AS tax_av,
     tax.tax_bill_total,
+    -- Setting exemptions with values of 0 to NULL allows us to count the
+    -- number of exemptions more easily and doesn't skew stats.
     CASE WHEN tax.exe_homeowner = 0 THEN NULL ELSE tax.exe_homeowner END
         AS tax_exe_homeowner,
     CASE WHEN tax.exe_senior = 0 THEN NULL ELSE tax.exe_senior END
@@ -123,4 +127,5 @@ INNER JOIN tcd
     AND tax.year = tcd.year
 INNER JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
+-- Temporary limit on feeder table to avoid GitHub runner memory issues.
 WHERE uni.class = '206'

From f296292ae400c5a213a541d2dea45ffa86864898 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Sun, 7 Jul 2024 23:53:36 +0000
Subject: [PATCH 43/96] Comment python scripts

---
 .../reporting.sot_assessment_roll.py          | 28 +++++++++++++++----
 dbt/models/reporting/reporting.sot_sales.py   | 21 ++++++++++++--
 .../reporting.sot_taxes_exemptions.py         | 15 +++++++---
 3 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index d0a579b04..576414d16 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -1,8 +1,5 @@
-# pylint: skip-file
-# type: ignore
-
-# This script generates aggregated summary stats on sales data across a number
-# of geographies, class combinations, and time.
+# This script generates aggregated summary stats on assessed values across a
+# number of geographies, class combinations, and time.
 
 # Import libraries
 import pandas as pd
@@ -100,6 +97,11 @@ def first(x):
 
 
 def aggregrate(data, geography_type, group_type):
+    """
+    Function to group a dataframe by whichever geography and group types it is
+    passed and output aggregate stats for only that grouping.
+    """
+
     print(geography_type, group_type)
 
     group = [geography_type, group_type, "year", "stage_name"]
@@ -122,6 +124,12 @@ def aggregrate(data, geography_type, group_type):
 
 
 def assemble(df, geos, groups):
+    """
+    Function that loops over predefined geography and class groups and passes
+    them to the aggregate function. Outputs stacked aggregated output from the
+    aggregate function.
+    """
+
     # Create an empty dataframe to fill with output
     output = pd.DataFrame()
 
@@ -133,11 +141,12 @@ def assemble(df, geos, groups):
             for z in groups:
                 output = pd.concat([output, aggregrate(df, x, z)])
 
+    # Flatten multi-index
     output.columns = ["_".join(col) for col in output.columns]
     output = output.reset_index()
     output = output.rename(columns={"triad_first": "triad"})
 
-    # Clean combined output and export
+    # Create additional stat columns post-aggregation
     output["av_tot_pct_w_value"] = (
         output["av_tot_count"] / output["av_tot_size"]
     )
@@ -254,6 +263,9 @@ def assemble(df, geos, groups):
     output["triennial"] = output["geography_type"].isin(
         ["triad", "township", "nbhd"]
     )
+
+    # Reassessment year is constructed as a string rather than a boolean to
+    # avoid PySpark errors with nullable booleans that can likely be resolved.
     output["reassessment_year"] = ""
     output.loc[
         (output["triennial"] == True), "reassessment_year"  # noqa: E712
@@ -284,6 +296,10 @@ def assemble(df, geos, groups):
 
 
 def clean_names(x):
+    """
+    Function to rename and reorder columns.
+    """
+
     output = x.rename(
         columns={
             "av_tot_size": "pin_n_tot",
diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index b2d55f7ae..781625ce7 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -1,5 +1,5 @@
-# This script generates aggregated summary stats on sales data across a number
-# of geographies, class combinations, and time.
+# This script generates aggregated summary stats on sales across a number of
+# geographies, class combinations, and time.
 
 import statistics as stats
 
@@ -97,6 +97,10 @@ def first(x):
 
 
 def aggregrate(data, geography_type, group_type):
+    """
+    Function to group a dataframe by whichever geography and group types it is
+    passed and output aggregate stats for only that grouping.
+    """
     print(geography_type, group_type)
 
     group = [geography_type, group_type, "year"]
@@ -118,6 +122,12 @@ def aggregrate(data, geography_type, group_type):
 
 
 def assemble(df, geos, groups):
+    """
+    Function that loops over predefined geography and class groups and passes
+    them to the aggregate function. Outputs stacked aggregated output from the
+    aggregate function.
+    """
+
     # Create an empty dataframe to fill with output
     output = pd.DataFrame()
 
@@ -129,10 +139,11 @@ def assemble(df, geos, groups):
             for z in groups:
                 output = pd.concat([output, aggregrate(df, x, z)])
 
+    # Flatten multi-index
     output.columns = ["_".join(col) for col in output.columns]
     output = output.reset_index()
 
-    # Clean combined output and export
+    # Create additional stat columns post-aggregation
     output["sale_price_sum"] = output["sale_price_sum"].replace(0, np.NaN)
     output["sale_price_per_sf_sum"] = output["sale_price_per_sf_sum"].replace(
         0, np.NaN
@@ -180,6 +191,10 @@ def assemble(df, geos, groups):
 
 
 def clean_names(x):
+    """
+    Function to rename and reorder columns.
+    """
+
     output = x.rename(
         columns={
             "sale_price_size": "pin_n_tot",
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index fc8eb9cc2..bf4c49c72 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -1,6 +1,3 @@
-# pylint: skip-file
-# type: ignore
-
 # This script generates aggregated summary stats on taxes and exemptions data
 # across a number of geographies, class combinations, and time.
 
@@ -109,6 +106,11 @@ def first(x):
 
 
 def aggregrate(data, geography_type, group_type):
+    """
+    Function to group a dataframe by whichever geography and group types it is
+    passed and output aggregate stats for only that grouping.
+    """
+
     print(geography_type, group_type)
 
     group = [geography_type, group_type, "year"]
@@ -141,10 +143,11 @@ def assemble(df, geos, groups):
             for z in groups:
                 output = pd.concat([output, aggregrate(df, x, z)])
 
+    # Flatten multi-index
     output.columns = ["_".join(col) for col in output.columns]
     output = output.reset_index()
 
-    # Clean combined output and export
+    # Create additional stat columns post-aggregation
     output["tax_bill_total_delta_median"] = (
         output.sort_values("year")
         .groupby(["geography_id", "group_id"])
@@ -169,6 +172,10 @@ def assemble(df, geos, groups):
 
 
 def clean_names(x):
+    """
+    Function to rename and reorder columns.
+    """
+
     output = x.rename(
         columns={
             "tax_eq_factor_final_size": "pin_n_tot",

From a23ff728bd9205d92cc112f7338c1feffe41dc56 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Mon, 8 Jul 2024 00:58:48 +0000
Subject: [PATCH 44/96] Clean up ratio_stats script

---
 .../reporting/reporting.sot_ratio_stats.py    | 158 ++++++++++++------
 1 file changed, 104 insertions(+), 54 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index 22500e3ef..a08ad513d 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -4,8 +4,8 @@
     "s3://ccao-athena-dependencies-us-east-1/assesspy==1.1.0.zip"
 )
 
-# This script generates aggregated summary stats on sales data across a number
-# of geographies, class combinations, and time.
+# This script generates aggregated summary stats on sales ratios across a
+# number of geographies, class combinations, and time.
 
 # Import libraries
 import assesspy as ass  # noqa: E402
@@ -18,12 +18,46 @@
         "county",
         "triad",
         "township",
+        "nbhd",
+        "tax_code",
+        "zip_code",
+    ],
+    "census_data_year": [
+        "census_place",
+        "census_tract",
+        "census_congressional_district",
+        "census_zcta",
+    ],
+    "cook_board_of_review_district_data_year": [
+        "cook_board_of_review_district"
+    ],
+    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
+    "cook_judicial_district_data_year": ["cook_judicial_district"],
+    "ward_data_year": ["ward_num"],
+    "community_area_data_year": ["community_area"],
+    "police_district_data_year": ["police_district"],
+    "central_business_district_data_year": ["central_business_district"],
+    "school_data_year": [
+        "school_elementary_district",
+        "school_secondary_district",
+        "school_unified_district",
+    ],
+    "tax_data_year": [
+        "tax_municipality",
+        "tax_park_district",
+        "tax_library_district",
+        "tax_fire_protection_district",
+        "tax_community_college_district",
+        "tax_sanitation_district",
+        "tax_special_service_area",
+        "tax_tif_district",
     ],
 }
 # Declare class groupings
 groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
 
 
+# Wrap assesspy functions to avoid GitHub runner errors for length 0 groupings
 def cod_safe(ratio):
     if len(ratio) >= 1:
         output = ass.cod(ratio)
@@ -62,6 +96,7 @@ def mki_safe(assessed, sale_price):
     return output
 
 
+# Define aggregation functions
 def first(x):
     if len(x) >= 1:
         output = x.iloc[0]
@@ -79,8 +114,14 @@ def within(x, limit):
     return np.logical_and(1 - limit < x, x < 1 + limit)
 
 
-# Define aggregation functions
 def aggregrate(data, geography_type, group_type):
+    """
+    Function to group a dataframe by whichever geography and group types it is
+    passed and output aggregate stats for only that grouping. Works
+    differently than in other SoT scripts since assesspy functions need
+    multiple inputs.
+    """
+
     print(geography_type, group_type)
 
     group = [geography_type, group_type, "year", "stage_name"]
@@ -88,7 +129,7 @@ def aggregrate(data, geography_type, group_type):
     data["sale_n_tot"] = data.groupby(group)["sale_price"].transform("count")
     data["pin_n_w_value"] = data.groupby(group)["tot_mv"].transform("count")
 
-    # Remove parcels with FMVs of 0 since they screw up ratios
+    # Remove parcels with MVs of 0 since they screw up ratios
     data = data[data["tot_mv"] > 0]
 
     # Remove groups that only have one sale since we can't calculate stats
@@ -140,6 +181,12 @@ def aggregrate(data, geography_type, group_type):
 
 
 def assemble(df, geos, groups):
+    """
+    Function that loops over predefined geography and class groups and passes
+    them to the aggregate function. Outputs stacked aggregated output from the
+    aggregate function.
+    """
+
     # Create an empty dataframe to fill with output
     output = pd.DataFrame()
 
@@ -153,13 +200,8 @@ def assemble(df, geos, groups):
 
     output.dropna(how="all", axis=1, inplace=True)
 
-    return output
-
-
-def clean(dirty):
-    dirty.index.names = ["geography_id", "group_id", "year", "stage_name"]
-
-    dirty = dirty.reset_index().set_index(
+    output.index.names = ["geography_id", "group_id", "year", "stage_name"]
+    output = output.reset_index().set_index(
         [
             "geography_type",
             "geography_id",
@@ -170,82 +212,92 @@ def clean(dirty):
         ]
     )
 
-    dirty["pin_pct_w_value"] = dirty["pin_n_w_value"] / dirty["pin_n_tot"]
+    # Create additional stat columns post-aggregation
+    output["pin_pct_w_value"] = output["pin_n_w_value"] / output["pin_n_tot"]
 
-    # Clean combined dirty and export
-    dirty["mv_delta_median"] = (
-        dirty.sort_values("year")
+    output["mv_delta_median"] = (
+        output.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_median.diff()
     )
-    dirty["mv_delta_mean"] = (
-        dirty.sort_values("year")
+    output["mv_delta_mean"] = (
+        output.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_mean.diff()
     )
-    dirty["mv_delta_sum"] = (
-        dirty.sort_values("year")
+    output["mv_delta_sum"] = (
+        output.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_sum.diff()
     )
 
-    dirty["mv_delta_pct_median"] = (
-        dirty.sort_values("year")
+    output["mv_delta_pct_median"] = (
+        output.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_median.pct_change()
     )
 
-    dirty["mv_delta_pct_mean"] = (
-        dirty.sort_values("year")
+    output["mv_delta_pct_mean"] = (
+        output.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_mean.pct_change()
     )
 
-    dirty["mv_delta_pct_sum"] = (
-        dirty.sort_values("year")
+    output["mv_delta_pct_sum"] = (
+        output.sort_values("year")
         .groupby(["geography_id", "group_id", "stage_name"])
         .mv_sum.pct_change()
     )
 
-    dirty = dirty.reset_index()
+    output = output.reset_index()
 
-    dirty["year"] = dirty["year"].astype(int)
-    dirty["triennial"] = dirty["geography_type"].isin(
+    output["year"] = output["year"].astype(int)
+    output["triennial"] = output["geography_type"].isin(
         ["triad", "township", "nbhd"]
     )
-    dirty["reassessment_year"] = ""
-    dirty.loc[
-        (dirty["triennial"] == True), "reassessment_year"  # noqa: E712
+    output["reassessment_year"] = ""
+    output.loc[
+        (output["triennial"] == True), "reassessment_year"  # noqa: E712
     ] = "No"
-    dirty.loc[
-        (dirty["year"] % 3 == 0)
-        & (dirty["triad"] == "North")
-        & (dirty["triennial"] == True),  # noqa: E712
+    output.loc[
+        (output["year"] % 3 == 0)
+        & (output["triad"] == "North")
+        & (output["triennial"] == True),  # noqa: E712
         "reassessment_year",
     ] = "Yes"
-    dirty.loc[
-        (dirty["year"] % 3 == 1)
-        & (dirty["triad"] == "South")
-        & (dirty["triennial"] == True),  # noqa: E712
+    output.loc[
+        (output["year"] % 3 == 1)
+        & (output["triad"] == "South")
+        & (output["triennial"] == True),  # noqa: E712
         "reassessment_year",
     ] = "Yes"
-    dirty.loc[
-        (dirty["year"] % 3 == 2)
-        & (dirty["triad"] == "City")
-        & (dirty["triennial"] == True),  # noqa: E712
+    output.loc[
+        (output["year"] % 3 == 2)
+        & (output["triad"] == "City")
+        & (output["triennial"] == True),  # noqa: E712
         "reassessment_year",
     ] = "Yes"
-    dirty = dirty.drop(["triennial", "triad"], axis=1)
+    output = output.drop(["triennial", "triad"], axis=1)
+
+    output["cod_met"] = met(output["cod"], 5, 15)
+    output["prd_met"] = met(output["prd"], 0.98, 1.03)
+    output["prb_met"] = met(output["prb"], -0.05, 0.05)
+    output["mki_met"] = met(output["mki"], 0.95, 1.05)
+
+    output["within_05_pct"] = within(output["ratio_mean"], 0.05)
+    output["within_10_pct"] = within(output["ratio_mean"], 0.1)
+    output["within_15_pct"] = within(output["ratio_mean"], 0.15)
+    output["within_20_pct"] = within(output["ratio_mean"], 0.2)
 
-    dirty["cod_met"] = met(dirty["cod"], 5, 15)
-    dirty["prd_met"] = met(dirty["prd"], 0.98, 1.03)
-    dirty["prb_met"] = met(dirty["prb"], -0.05, 0.05)
-    dirty["mki_met"] = met(dirty["mki"], 0.95, 1.05)
+    output = clean(output)
+
+    return output
 
-    dirty["within_05_pct"] = within(dirty["ratio_mean"], 0.05)
-    dirty["within_10_pct"] = within(dirty["ratio_mean"], 0.1)
-    dirty["within_15_pct"] = within(dirty["ratio_mean"], 0.15)
-    dirty["within_20_pct"] = within(dirty["ratio_mean"], 0.2)
+
+def clean(dirty):
+    """
+    Function to change column types and reorder them.
+    """
 
     dirty = dirty.astype(
         {
@@ -334,8 +386,6 @@ def model(dbt, spark_session):
 
     df = assemble(input, geos=geos, groups=groups)
 
-    df = clean(df)
-
     schema = (
         "geography_type: string, geography_id: string, "
         + "geography_data_year: string, group_type: string, group_id: string, "

From 07f6dfefe9beb6ef8e8db4af31251dccced3a33d Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Mon, 8 Jul 2024 01:40:20 +0000
Subject: [PATCH 45/96] Back to fixing pin_n_tot

---
 dbt/models/reporting/reporting.sot_ratio_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index a08ad513d..fd106f53d 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -390,7 +390,7 @@ def model(dbt, spark_session):
         "geography_type: string, geography_id: string, "
         + "geography_data_year: string, group_type: string, group_id: string, "
         + "year: string, reassessment_year: string, stage_name: string, "
-        + "pin_n_tot: double, pin_n_w_value: bigint, pin_pct_w_value: double, "
+        + "pin_n_tot: int, pin_n_w_value: bigint, pin_pct_w_value: double, "
         + "sale_n_tot: bigint, mv_min: bigint, mv_q10: bigint, "
         + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, "
         + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, mv_sum: bigint, "

From b78a072f6e6ffc8a814e6ba0375ab3c68bee3b25 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Mon, 8 Jul 2024 02:43:44 +0000
Subject: [PATCH 46/96] Replace nan with None

---
 dbt/models/reporting/reporting.sot_ratio_stats.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index fd106f53d..cf8979f56 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -289,6 +289,8 @@ def assemble(df, geos, groups):
     output["within_15_pct"] = within(output["ratio_mean"], 0.15)
     output["within_20_pct"] = within(output["ratio_mean"], 0.2)
 
+    output = output.replace(np.nan, None)
+
     output = clean(output)
 
     return output

From 337954e086bfc73c85852bb4d032a43a27de94f3 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Mon, 8 Jul 2024 02:44:53 +0000
Subject: [PATCH 47/96] Partition input tables by year

---
 dbt/models/reporting/reporting.sot_ratio_stats_input.sql      | 3 ++-
 dbt/models/reporting/reporting.sot_sales_input.sql            | 3 ++-
 dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
index 35d86e976..e38a17da6 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
@@ -4,7 +4,8 @@ purpose is to feed reporting.sot_ratio_stats, and should not be used
 otherwise. */
 {{
     config(
-        materialized='table'
+        materialized='table',
+        partitioned_by=['year']
     )
 }}
 
diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sales_input.sql
index 72c4ba852..b6a5d64fb 100644
--- a/dbt/models/reporting/reporting.sot_sales_input.sql
+++ b/dbt/models/reporting/reporting.sot_sales_input.sql
@@ -4,7 +4,8 @@
 
 {{
     config(
-        materialized='table'
+        materialized='table',
+        partitioned_by=['year']
     )
 }}
 
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
index 89d44c49c..9dd5f9e04 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
@@ -3,7 +3,8 @@
 -- reporting.sot_taxes_and_exemptions, and should not be used otherwise.
 {{
     config(
-        materialized='table'
+        materialized='table',
+        partitioned_by=['year']
     )
 }}
 

From 1031144ad60a485cf304a71b32005c8fdf6c4646 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Mon, 8 Jul 2024 03:26:34 +0000
Subject: [PATCH 48/96] Fix year partitioning

---
 dbt/models/reporting/reporting.sot_ratio_stats_input.sql      | 4 ++--
 dbt/models/reporting/reporting.sot_sales_input.sql            | 4 ++--
 dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
index e38a17da6..17ed6ef83 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
@@ -37,7 +37,6 @@ uni AS (
 
 SELECT
     CAST(sales.sale_price AS DOUBLE) AS sale_price,
-    uni.year,
     uni.stage_name,
     uni.class,
     CAST(vals.tot_mv AS DOUBLE) AS tot_mv,
@@ -95,7 +94,8 @@ SELECT
     class_dict.major_class_type AS major_class,
     class_dict.modeling_group,
     CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END
-        AS res_other
+        AS res_other,
+    uni.year
 FROM uni
 LEFT JOIN
     {{ ref('reporting.vw_pin_value_long') }} AS vals
diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sales_input.sql
index b6a5d64fb..18d0960aa 100644
--- a/dbt/models/reporting/reporting.sot_sales_input.sql
+++ b/dbt/models/reporting/reporting.sot_sales_input.sql
@@ -31,7 +31,6 @@ SELECT
     CAST(sf.char_bldg_sf AS INT) AS sale_char_bldg_sf,
     CAST(sf.char_land_sf AS INT) AS sale_char_land_sf,
     CAST(sf.char_yrblt AS INT) AS sale_char_yrblt,
-    uni.year,
     uni.class,
     'Cook' AS county,
     uni.triad_name AS triad,
@@ -86,7 +85,8 @@ SELECT
     class_dict.major_class_type AS major_class,
     class_dict.modeling_group,
     CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END
-        AS res_other
+        AS res_other,
+    uni.year
 FROM {{ ref('default.vw_pin_universe') }} AS uni
 LEFT JOIN sf
     ON uni.pin = sf.pin
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
index 9dd5f9e04..95718dca9 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions_input.sql
@@ -19,7 +19,6 @@ WITH tcd AS (
 
 SELECT
     uni.pin,
-    tax.year,
     tax.av_clerk AS tax_av,
     tax.tax_bill_total,
     -- Setting exemptions with values of 0 allows us to count the number of
@@ -116,7 +115,8 @@ SELECT
     class_dict.major_class_type AS major_class,
     class_dict.modeling_group,
     CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END
-        AS res_other
+        AS res_other,
+    tax.year
 FROM {{ ref('default.vw_pin_universe') }} AS uni
 INNER JOIN {{ source('tax', 'pin') }} AS tax
     ON uni.pin = tax.pin

From 45ea3054fe2210aab01817bdac3e15bf687cb5bc Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Mon, 8 Jul 2024 12:39:59 +0000
Subject: [PATCH 49/96] Use double for nullable columns

---
 dbt/models/reporting/reporting.sot_ratio_stats.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index cf8979f56..42da9363d 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -289,6 +289,7 @@ def assemble(df, geos, groups):
     output["within_15_pct"] = within(output["ratio_mean"], 0.15)
     output["within_20_pct"] = within(output["ratio_mean"], 0.2)
 
+    # PySpark rejects nan, convert them to None
     output = output.replace(np.nan, None)
 
     output = clean(output)
@@ -396,8 +397,8 @@ def model(dbt, spark_session):
         + "sale_n_tot: bigint, mv_min: bigint, mv_q10: bigint, "
         + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, "
         + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, mv_sum: bigint, "
-        + "mv_delta_median: bigint, mv_delta_mean: bigint, "
-        + "mv_delta_sum: bigint, mv_delta_pct_median: double, "
+        + "mv_delta_median: double, mv_delta_mean: double, "
+        + "mv_delta_sum: double, mv_delta_pct_median: double, "
         + "mv_delta_pct_mean: double, mv_delta_pct_sum: double, "
         + "ratio_min: double, ratio_q10: double, ratio_q25: double, "
         + "ratio_median: double, ratio_q75: double, ratio_q90: double, "

From ca139f3b862f9b54dcfe8146c996dba36a0568e2 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 9 Jul 2024 17:11:23 +0000
Subject: [PATCH 50/96] Move data year specification to dbt seed

---
 dbt/dbt_project.yml                           |  2 +
 .../reporting.sot_assessment_roll.py          | 59 ++++++-------------
 .../reporting/reporting.sot_ratio_stats.py    | 58 ++++++------------
 dbt/models/reporting/reporting.sot_sales.py   | 58 ++++++------------
 .../reporting.sot_taxes_exemptions.py         | 58 ++++++------------
 dbt/seeds/reporting/docs.md                   |  6 ++
 .../reporting/reporting.sot_data_years.csv    |  9 +++
 dbt/seeds/reporting/schema.yml                |  6 ++
 8 files changed, 92 insertions(+), 164 deletions(-)
 create mode 100644 dbt/seeds/reporting/docs.md
 create mode 100644 dbt/seeds/reporting/reporting.sot_data_years.csv
 create mode 100644 dbt/seeds/reporting/schema.yml

diff --git a/dbt/dbt_project.yml b/dbt/dbt_project.yml
index a94530b30..40740836a 100644
--- a/dbt/dbt_project.yml
+++ b/dbt/dbt_project.yml
@@ -71,3 +71,5 @@ seeds:
       +schema: location
     model:
       +schema: model
+    reporting:
+      +schema: reporting
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 576414d16..e62b042f3 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -4,47 +4,6 @@
 # Import libraries
 import pandas as pd
 
-# Declare geographic groups and their associated data years
-geos = {
-    "year": [
-        "county",
-        "triad",
-        "township",
-        "nbhd",
-        "tax_code",
-        "zip_code",
-    ],
-    "census_data_year": [
-        "census_place",
-        "census_tract",
-        "census_congressional_district",
-        "census_zcta",
-    ],
-    "cook_board_of_review_district_data_year": [
-        "cook_board_of_review_district"
-    ],
-    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
-    "cook_judicial_district_data_year": ["cook_judicial_district"],
-    "ward_data_year": ["ward_num"],
-    "community_area_data_year": ["community_area"],
-    "police_district_data_year": ["police_district"],
-    "central_business_district_data_year": ["central_business_district"],
-    "school_data_year": [
-        "school_elementary_district",
-        "school_secondary_district",
-        "school_unified_district",
-    ],
-    "tax_data_year": [
-        "tax_municipality",
-        "tax_park_district",
-        "tax_library_district",
-        "tax_fire_protection_district",
-        "tax_community_college_district",
-        "tax_sanitation_district",
-        "tax_special_service_area",
-        "tax_tif_district",
-    ],
-}
 # Declare class groupings
 groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
 
@@ -373,9 +332,27 @@ def clean_names(x):
     return output
 
 
+def ingest_geos(geos):
+    """
+    Function to convert dbt seed into a dictionary that can be iterated over.
+    """
+
+    geos = geos.toPandas()
+
+    output = {
+        k: list(geos[k].unique()[pd.notnull(geos[k].unique())])
+        for k in geos.columns
+    }
+
+    return output
+
+
 def model(dbt, spark_session):
     dbt.config(materialized="table")
 
+    # Ingest geographies and their associated data years
+    geos = ingest_geos(dbt.ref("reporting.sot_data_years"))
+
     input = dbt.ref("reporting.sot_assessment_roll_input")
 
     # Convert the Spark input dataframe to Pandas for
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index 42da9363d..406d646c4 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -12,47 +12,6 @@
 import numpy as np  # noqa: E402
 import pandas as pd  # noqa: E402
 
-# Declare geographic groups and their associated data years
-geos = {
-    "year": [
-        "county",
-        "triad",
-        "township",
-        "nbhd",
-        "tax_code",
-        "zip_code",
-    ],
-    "census_data_year": [
-        "census_place",
-        "census_tract",
-        "census_congressional_district",
-        "census_zcta",
-    ],
-    "cook_board_of_review_district_data_year": [
-        "cook_board_of_review_district"
-    ],
-    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
-    "cook_judicial_district_data_year": ["cook_judicial_district"],
-    "ward_data_year": ["ward_num"],
-    "community_area_data_year": ["community_area"],
-    "police_district_data_year": ["police_district"],
-    "central_business_district_data_year": ["central_business_district"],
-    "school_data_year": [
-        "school_elementary_district",
-        "school_secondary_district",
-        "school_unified_district",
-    ],
-    "tax_data_year": [
-        "tax_municipality",
-        "tax_park_district",
-        "tax_library_district",
-        "tax_fire_protection_district",
-        "tax_community_college_district",
-        "tax_sanitation_district",
-        "tax_special_service_area",
-        "tax_tif_district",
-    ],
-}
 # Declare class groupings
 groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
 
@@ -378,9 +337,26 @@ def clean(dirty):
     return dirty
 
 
+def ingest_geos(geos):
+    """
+    Function to convert dbt seed into a dictionary that can be iterated over.
+    """
+
+    geos = geos.toPandas()
+    output = {
+        k: list(geos[k].unique()[pd.notnull(geos[k].unique())])
+        for k in geos.columns
+    }
+
+    return output
+
+
 def model(dbt, spark_session):
     dbt.config(materialized="table")
 
+    # Ingest geographies and their associated data years
+    geos = ingest_geos(dbt.ref("reporting.sot_data_years"))
+
     input = dbt.ref("reporting.sot_ratio_stats_input")
 
     # Convert the Spark input dataframe to Pandas for
diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index 781625ce7..fee655679 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -7,47 +7,6 @@
 import numpy as np
 import pandas as pd
 
-# Declare geographic groups and their associated data years
-geos = {
-    "year": [
-        "county",
-        "triad",
-        "township",
-        "nbhd",
-        "tax_code",
-        "zip_code",
-    ],
-    "census_data_year": [
-        "census_place",
-        "census_tract",
-        "census_congressional_district",
-        "census_zcta",
-    ],
-    "cook_board_of_review_district_data_year": [
-        "cook_board_of_review_district"
-    ],
-    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
-    "cook_judicial_district_data_year": ["cook_judicial_district"],
-    "ward_data_year": ["ward_num"],
-    "community_area_data_year": ["community_area"],
-    "police_district_data_year": ["police_district"],
-    "central_business_district_data_year": ["central_business_district"],
-    "school_data_year": [
-        "school_elementary_district",
-        "school_secondary_district",
-        "school_unified_district",
-    ],
-    "tax_data_year": [
-        "tax_municipality",
-        "tax_park_district",
-        "tax_library_district",
-        "tax_fire_protection_district",
-        "tax_community_college_district",
-        "tax_sanitation_district",
-        "tax_special_service_area",
-        "tax_tif_district",
-    ],
-}
 # Declare class groupings
 groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
 
@@ -249,9 +208,26 @@ def clean_names(x):
     return output
 
 
+def ingest_geos(geos):
+    """
+    Function to convert dbt seed into a dictionary that can be iterated over.
+    """
+
+    geos = geos.toPandas()
+    output = {
+        k: list(geos[k].unique()[pd.notnull(geos[k].unique())])
+        for k in geos.columns
+    }
+
+    return output
+
+
 def model(dbt, spark_session):
     dbt.config(materialized="table")
 
+    # Ingest geographies and their associated data years
+    geos = ingest_geos(dbt.ref("reporting.sot_data_years"))
+
     input = dbt.ref("reporting.sot_sales_input")
 
     # Convert the Spark input dataframe to Pandas for
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index bf4c49c72..817dab3ac 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -4,47 +4,6 @@
 # Import libraries
 import pandas as pd
 
-# Declare geographic groups and their associated data years
-geos = {
-    "year": [
-        "county",
-        "triad",
-        "township",
-        "nbhd",
-        "tax_code",
-        "zip_code",
-    ],
-    "census_data_year": [
-        "census_place",
-        "census_tract",
-        "census_congressional_district",
-        "census_zcta",
-    ],
-    "cook_board_of_review_district_data_year": [
-        "cook_board_of_review_district"
-    ],
-    "cook_commissioner_district_data_year": ["cook_commissioner_district"],
-    "cook_judicial_district_data_year": ["cook_judicial_district"],
-    "ward_data_year": ["ward_num"],
-    "community_area_data_year": ["community_area"],
-    "police_district_data_year": ["police_district"],
-    "central_business_district_data_year": ["central_business_district"],
-    "school_data_year": [
-        "school_elementary_district",
-        "school_secondary_district",
-        "school_unified_district",
-    ],
-    "tax_data_year": [
-        "tax_municipality",
-        "tax_park_district",
-        "tax_library_district",
-        "tax_fire_protection_district",
-        "tax_community_college_district",
-        "tax_sanitation_district",
-        "tax_special_service_area",
-        "tax_tif_district",
-    ],
-}
 # Declare class groupings
 groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
 
@@ -265,9 +224,26 @@ def clean_names(x):
     return output
 
 
+def ingest_geos(geos):
+    """
+    Function to convert dbt seed into a dictionary that can be iterated over.
+    """
+
+    geos = geos.toPandas()
+    output = {
+        k: list(geos[k].unique()[pd.notnull(geos[k].unique())])
+        for k in geos.columns
+    }
+
+    return output
+
+
 def model(dbt, spark_session):
     dbt.config(materialized="table")
 
+    # Ingest geographies and their associated data years
+    geos = ingest_geos(dbt.ref("reporting.sot_data_years"))
+
     input = dbt.ref("reporting.sot_taxes_exemptions_input")
 
     # Convert the Spark input dataframe to Pandas for
diff --git a/dbt/seeds/reporting/docs.md b/dbt/seeds/reporting/docs.md
new file mode 100644
index 000000000..ac26fa1a4
--- /dev/null
+++ b/dbt/seeds/reporting/docs.md
@@ -0,0 +1,6 @@
+# sot_data_years
+
+{% docs seed_sot_data_years %}
+A table containing reporting geographies and their associated data year identifiers.
+
+{% enddocs %}
diff --git a/dbt/seeds/reporting/reporting.sot_data_years.csv b/dbt/seeds/reporting/reporting.sot_data_years.csv
new file mode 100644
index 000000000..eed8df432
--- /dev/null
+++ b/dbt/seeds/reporting/reporting.sot_data_years.csv
@@ -0,0 +1,9 @@
+year,census_data_year,cook_board_of_review_district_data_year,cook_commissioner_district_data_year,cook_judicial_district_data_year,ward_data_year,community_area_data_year,police_district_data_year,central_business_district_data_year,school_data_year,tax_data_year
+county,census_place,cook_board_of_review_district,cook_commissioner_district,cook_judicial_district,ward_num,community_area,police_district,central_business_district,school_elementary_district,tax_municipality
+triad,census_tract,,,,,,,,school_secondary_district,tax_park_district
+township,census_congressional_district,,,,,,,,school_unified_district,tax_library_district
+nbhd,census_zcta,,,,,,,,,tax_fire_protection_district
+tax_code,,,,,,,,,,tax_community_college_district
+zip_code,,,,,,,,,,tax_sanitation_district
+,,,,,,,,,,tax_special_service_area
+,,,,,,,,,,tax_tif_district
diff --git a/dbt/seeds/reporting/schema.yml b/dbt/seeds/reporting/schema.yml
new file mode 100644
index 000000000..3dca39299
--- /dev/null
+++ b/dbt/seeds/reporting/schema.yml
@@ -0,0 +1,6 @@
+seeds:
+  - name: reporting.sot_data_years
+    description: '{{ doc("seed_sot_data_years") }}'
+    config:
+      column_types:
+        year: string

From 788f97156d3983e166cda1d58b91cc55c81980e7 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 9 Jul 2024 17:12:03 +0000
Subject: [PATCH 51/96] Formatting

---
 dbt/models/reporting/reporting.sot_assessment_roll.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index e62b042f3..03cc4ff4e 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -338,7 +338,6 @@ def ingest_geos(geos):
     """
 
     geos = geos.toPandas()
-
     output = {
         k: list(geos[k].unique()[pd.notnull(geos[k].unique())])
         for k in geos.columns

From 5449d8c134be1d352b33ee5a680b6e84b77d4ba8 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 9 Jul 2024 19:52:03 +0000
Subject: [PATCH 52/96] Improve diff and pct_change syntax

---
 .../reporting.sot_assessment_roll.py          | 147 ++++++------------
 .../reporting/reporting.sot_ratio_stats.py    |  57 +++----
 dbt/models/reporting/reporting.sot_sales.py   |  56 +++----
 .../reporting.sot_taxes_exemptions.py         |  32 ++--
 4 files changed, 115 insertions(+), 177 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 03cc4ff4e..60588a573 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -110,112 +110,57 @@ def assemble(df, geos, groups):
         output["av_tot_count"] / output["av_tot_size"]
     )
 
-    output["av_tot_delta_median"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_tot_median.diff()
-    )
-
-    output["av_tot_delta_mean"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_tot_mean.diff()
-    )
-
-    output["av_tot_delta_sum"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_tot_sum.diff()
-    )
-
-    output["av_bldg_delta_median"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_bldg_median.diff()
-    )
-
-    output["av_bldg_delta_mean"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_bldg_mean.diff()
-    )
-
-    output["av_bldg_delta_sum"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_bldg_sum.diff()
-    )
-
-    output["av_land_delta_median"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_land_median.diff()
-    )
-
-    output["av_land_delta_mean"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_land_mean.diff()
-    )
-
-    output["av_land_delta_sum"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_land_sum.diff()
-    )
-
-    output["av_tot_delta_pct_median"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_tot_median.pct_change()
-    )
-
-    output["av_tot_delta_pct_mean"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_tot_mean.pct_change()
-    )
-
-    output["av_tot_delta_pct_sum"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_tot_sum.pct_change()
-    )
-
-    output["av_bldg_delta_pct_median"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_bldg_median.pct_change()
-    )
-
-    output["av_bldg_delta_pct_mean"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_bldg_mean.pct_change()
-    )
-
-    output["av_bldg_delta_pct_sum"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_bldg_sum.pct_change()
-    )
-
-    output["av_land_delta_pct_median"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .av_land_median.pct_change()
-    )
+    output = output.sort_values("year")
+
+    diff_cols = [
+        "geography_id",
+        "group_id",
+        "stage_name",
+        "av_tot_median",
+        "av_tot_mean",
+        "av_tot_sum",
+        "av_bldg_median",
+        "av_bldg_mean",
+        "av_bldg_sum",
+        "av_land_median",
+        "av_land_mean",
+        "av_land_sum",
+    ]
 
-    output["av_land_delta_pct_mean"] = (
-        output.sort_values("year")
+    output[
+        [
+            "av_tot_delta_median",
+            "av_tot_delta_mean",
+            "av_tot_delta_sum",
+            "av_bldg_delta_median",
+            "av_bldg_delta_mean",
+            "av_bldg_delta_sum",
+            "av_land_delta_median",
+            "av_land_delta_mean",
+            "av_land_delta_sum",
+        ]
+    ] = (
+        output[diff_cols]
         .groupby(["geography_id", "group_id", "stage_name"])
-        .av_land_mean.pct_change()
+        .diff()
     )
 
-    output["av_land_delta_pct_sum"] = (
-        output.sort_values("year")
+    output[
+        [
+            "av_tot_delta_pct_median",
+            "av_tot_delta_pct_mean",
+            "av_tot_delta_pct_sum",
+            "av_bldg_delta_pct_median",
+            "av_bldg_delta_pct_mean",
+            "av_bldg_delta_pct_sum",
+            "av_land_delta_pct_median",
+            "av_land_delta_pct_mean",
+            "av_land_delta_pct_sum",
+        ]
+    ] = (
+        output[diff_cols]
         .groupby(["geography_id", "group_id", "stage_name"])
-        .av_land_sum.pct_change()
+        .pct_change()
     )
 
     output["year"] = output["year"].astype(int)
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index 406d646c4..21ea0ea34 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -171,45 +171,46 @@ def assemble(df, geos, groups):
         ]
     )
 
+    output = output.reset_index()
+
     # Create additional stat columns post-aggregation
     output["pin_pct_w_value"] = output["pin_n_w_value"] / output["pin_n_tot"]
 
-    output["mv_delta_median"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .mv_median.diff()
-    )
-    output["mv_delta_mean"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .mv_mean.diff()
-    )
-    output["mv_delta_sum"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .mv_sum.diff()
-    )
+    output = output.sort_values("year")
 
-    output["mv_delta_pct_median"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .mv_median.pct_change()
-    )
+    diff_cols = [
+        "geography_id",
+        "group_id",
+        "stage_name",
+        "mv_median",
+        "mv_mean",
+        "mv_sum",
+    ]
 
-    output["mv_delta_pct_mean"] = (
-        output.sort_values("year")
+    output[
+        [
+            "mv_delta_median",
+            "mv_delta_mean",
+            "mv_delta_sum",
+        ]
+    ] = (
+        output[diff_cols]
         .groupby(["geography_id", "group_id", "stage_name"])
-        .mv_mean.pct_change()
+        .diff()
     )
 
-    output["mv_delta_pct_sum"] = (
-        output.sort_values("year")
+    output[
+        [
+            "mv_delta_pct_median",
+            "mv_delta_pct_mean",
+            "mv_delta_pct_sum",
+        ]
+    ] = (
+        output[diff_cols]
         .groupby(["geography_id", "group_id", "stage_name"])
-        .mv_sum.pct_change()
+        .pct_change()
     )
 
-    output = output.reset_index()
-
     output["year"] = output["year"].astype(int)
     output["triennial"] = output["geography_type"].isin(
         ["triad", "township", "nbhd"]
diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index fee655679..f387d2320 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -108,40 +108,30 @@ def assemble(df, geos, groups):
         0, np.NaN
     )
 
-    output["sale_price_delta_median"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id"])
-        .sale_price_median.diff()
-    )
-
-    output["sale_price_delta_mean"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id"])
-        .sale_price_mean.diff()
-    )
-
-    output["sale_price_delta_sum"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id"])
-        .sale_price_sum.diff()
-    )
-
-    output["sale_price_per_sf_delta_median"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id"])
-        .sale_price_per_sf_median.diff()
-    )
-
-    output["sale_price_per_sf_delta_mean"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id"])
-        .sale_price_per_sf_mean.diff()
-    )
+    output = output.sort_values("year")
+
+    diff_cols = [
+        "geography_id",
+        "group_id",
+        "sale_price_median",
+        "sale_price_mean",
+        "sale_price_sum",
+        "sale_price_per_sf_median",
+        "sale_price_per_sf_mean",
+        "sale_price_per_sf_sum",
+    ]
 
-    output["sale_price_per_sf_delta_sum"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id"])
-        .sale_price_per_sf_sum.diff()
+    output[
+        [
+            "sale_price_delta_median",
+            "sale_price_delta_mean",
+            "sale_price_delta_sum",
+            "sale_price_per_sf_delta_median",
+            "sale_price_per_sf_delta_mean",
+            "sale_price_per_sf_delta_sum",
+        ]
+    ] = (
+        output[diff_cols].groupby(["geography_id", "group_id"]).diff()
     )
 
     output = clean_names(output)
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index 817dab3ac..9c803ee33 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -107,22 +107,24 @@ def assemble(df, geos, groups):
     output = output.reset_index()
 
     # Create additional stat columns post-aggregation
-    output["tax_bill_total_delta_median"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id"])
-        .tax_bill_total_median.diff()
-    )
-
-    output["tax_bill_total_delta_mean"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id"])
-        .tax_bill_total_mean.diff()
-    )
+    output = output.sort_values("year")
+
+    diff_cols = [
+        "geography_id",
+        "group_id",
+        "tax_bill_total_median",
+        "tax_bill_total_mean",
+        "tax_bill_total_sum",
+    ]
 
-    output["tax_bill_total_delta_sum"] = (
-        output.sort_values("year")
-        .groupby(["geography_id", "group_id"])
-        .tax_bill_total_sum.diff()
+    output[
+        [
+            "tax_bill_total_delta_median",
+            "tax_bill_total_delta_mean",
+            "tax_bill_total_delta_sum",
+        ]
+    ] = (
+        output[diff_cols].groupby(["geography_id", "group_id"]).diff()
     )
 
     output = clean_names(output)

From c87713fb79c40518954d911b9ff3f8d74bb93258 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 9 Jul 2024 21:19:01 +0000
Subject: [PATCH 53/96] Simplify reassessment year syntax

---
 .../reporting/reporting.sot_assessment_roll.py  | 17 +++--------------
 .../reporting/reporting.sot_ratio_stats.py      | 17 +++--------------
 2 files changed, 6 insertions(+), 28 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 60588a573..f5fbba742 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -175,20 +175,9 @@ def assemble(df, geos, groups):
         (output["triennial"] == True), "reassessment_year"  # noqa: E712
     ] = "No"
     output.loc[
-        (output["year"] % 3 == 0)
-        & (output["triad"] == "North")
-        & (output["triennial"] == True),  # noqa: E712
-        "reassessment_year",
-    ] = "Yes"
-    output.loc[
-        (output["year"] % 3 == 1)
-        & (output["triad"] == "South")
-        & (output["triennial"] == True),  # noqa: E712
-        "reassessment_year",
-    ] = "Yes"
-    output.loc[
-        (output["year"] % 3 == 2)
-        & (output["triad"] == "City")
+        (((output["year"] % 3 == 0) & (output["triad"] == "North"))
+        | ((output["year"] % 3 == 1) & (output["triad"] == "South"))
+        | ((output["year"] % 3 == 2) & (output["triad"] == "City")))
         & (output["triennial"] == True),  # noqa: E712
         "reassessment_year",
     ] = "Yes"
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index 21ea0ea34..615e56c67 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -220,20 +220,9 @@ def assemble(df, geos, groups):
         (output["triennial"] == True), "reassessment_year"  # noqa: E712
     ] = "No"
     output.loc[
-        (output["year"] % 3 == 0)
-        & (output["triad"] == "North")
-        & (output["triennial"] == True),  # noqa: E712
-        "reassessment_year",
-    ] = "Yes"
-    output.loc[
-        (output["year"] % 3 == 1)
-        & (output["triad"] == "South")
-        & (output["triennial"] == True),  # noqa: E712
-        "reassessment_year",
-    ] = "Yes"
-    output.loc[
-        (output["year"] % 3 == 2)
-        & (output["triad"] == "City")
+        (((output["year"] % 3 == 0) & (output["triad"] == "North"))
+        | ((output["year"] % 3 == 1) & (output["triad"] == "South"))
+        | ((output["year"] % 3 == 2) & (output["triad"] == "City")))
         & (output["triennial"] == True),  # noqa: E712
         "reassessment_year",
     ] = "Yes"

From d1079f016be7b52af368e57637053be7fc3c49a2 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 10 Jul 2024 15:05:45 +0000
Subject: [PATCH 54/96] More commenting

---
 .../reporting/reporting.sot_assessment_roll.py      | 12 ++++++++----
 dbt/models/reporting/reporting.sot_ratio_stats.py   |  9 ++++++---
 dbt/models/reporting/reporting.sot_sales.py         | 12 ++++++++----
 .../reporting/reporting.sot_taxes_exemptions.py     | 13 +++++++++++--
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index f5fbba742..eef930eb3 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -8,7 +8,8 @@
 groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
 
 
-# Define aggregation functions
+# Define aggregation functions. These are just wrappers for basic python
+# functions that make them easier to use with pandas.agg().
 def q10(x):
     return x.quantile(0.1)
 
@@ -58,7 +59,7 @@ def first(x):
 def aggregrate(data, geography_type, group_type):
     """
     Function to group a dataframe by whichever geography and group types it is
-    passed and output aggregate stats for that only for that grouping.
+    passed and output aggregate stats for that grouping.
     """
 
     print(geography_type, group_type)
@@ -85,8 +86,8 @@ def aggregrate(data, geography_type, group_type):
 def assemble(df, geos, groups):
     """
     Function that loops over predefined geography and class groups and passes
-    them to the aggregate function. Outputs stacked aggegrated output from the
-    aggregate function.
+    them to the aggregate function. Returns stacked output from the aggregate
+    function.
     """
 
     # Create an empty dataframe to fill with output
@@ -281,6 +282,9 @@ def ingest_geos(geos):
 
 
 def model(dbt, spark_session):
+    """
+    Function to build a dbt python model using PySpark.
+    """
     dbt.config(materialized="table")
 
     # Ingest geographies and their associated data years
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stats.py
index 615e56c67..a2677b98f 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stats.py
@@ -76,7 +76,7 @@ def within(x, limit):
 def aggregrate(data, geography_type, group_type):
     """
     Function to group a dataframe by whichever geography and group types it is
-    passed and output aggregate stats for that only for that grouping. Works
+    passed and output aggregate stats for that grouping. Works
     differently than in other SoT scripts since assesspy functions need
     multiple inputs.
     """
@@ -142,8 +142,8 @@ def aggregrate(data, geography_type, group_type):
 def assemble(df, geos, groups):
     """
     Function that loops over predefined geography and class groups and passes
-    them to the aggregate function. Outputs stacked aggegrated output from the
-    aggregate function.
+    them to the aggregate function. Returns stacked output from the aggregate
+    function.
     """
 
     # Create an empty dataframe to fill with output
@@ -342,6 +342,9 @@ def ingest_geos(geos):
 
 
 def model(dbt, spark_session):
+    """
+    Function to build a dbt python model using PySpark.
+    """
     dbt.config(materialized="table")
 
     # Ingest geographies and their associated data years
diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index f387d2320..b3d496172 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -11,7 +11,8 @@
 groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
 
 
-# Define aggregation functions
+# Define aggregation functions. These are just wrappers for basic python
+# functions that make them easier to use with pandas.agg().
 def q10(x):
     return x.quantile(0.1)
 
@@ -58,7 +59,7 @@ def first(x):
 def aggregrate(data, geography_type, group_type):
     """
     Function to group a dataframe by whichever geography and group types it is
-    passed and output aggregate stats for that only for that grouping.
+    passed and output aggregate stats for that grouping.
     """
     print(geography_type, group_type)
 
@@ -83,8 +84,8 @@ def aggregrate(data, geography_type, group_type):
 def assemble(df, geos, groups):
     """
     Function that loops over predefined geography and class groups and passes
-    them to the aggregate function. Outputs stacked aggegrated output from the
-    aggregate function.
+    them to the aggregate function. Returns stacked output from the aggregate
+    function.
     """
 
     # Create an empty dataframe to fill with output
@@ -213,6 +214,9 @@ def ingest_geos(geos):
 
 
 def model(dbt, spark_session):
+    """
+    Function to build a dbt python model using PySpark.
+    """
     dbt.config(materialized="table")
 
     # Ingest geographies and their associated data years
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index 9c803ee33..2aa0cef5a 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -8,7 +8,8 @@
 groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
 
 
-# Define aggregation functions
+# Define aggregation functions. These are just wrappers for basic python
+# functions that make them easier to use with pandas.agg().
 def q10(x):
     return x.quantile(0.1)
 
@@ -67,7 +68,7 @@ def first(x):
 def aggregrate(data, geography_type, group_type):
     """
     Function to group a dataframe by whichever geography and group types it is
-    passed and output aggregate stats for that only for that grouping.
+    passed and output aggregate stats for that grouping.
     """
 
     print(geography_type, group_type)
@@ -91,6 +92,11 @@ def aggregrate(data, geography_type, group_type):
 
 
 def assemble(df, geos, groups):
+    """
+    Function that loops over predefined geography and class groups and passes
+    them to the aggregate function. Returns stacked output from the aggregate
+    function.
+    """
     # Create an empty dataframe to fill with output
     output = pd.DataFrame()
 
@@ -241,6 +247,9 @@ def ingest_geos(geos):
 
 
 def model(dbt, spark_session):
+    """
+    Function to build a dbt python model using PySpark.
+    """
     dbt.config(materialized="table")
 
     # Ingest geographies and their associated data years

From 28ba90c70733ad3fa055c7e151a90d7902e7e48f Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 18 Mar 2025 17:01:29 +0000
Subject: [PATCH 55/96] Lint

---
 dbt/models/reporting/docs.md                  | 32 +++++++++++++++++++
 dbt/models/reporting/reporting.sot_sales.py   |  4 +--
 .../reporting.sot_taxes_exemptions.py         |  4 +--
 3 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md
index 7862e9891..5cd579f5c 100644
--- a/dbt/models/reporting/docs.md
+++ b/dbt/models/reporting/docs.md
@@ -80,6 +80,38 @@ for every possible geography and reporting group combination.
 **Primary Key**: `pin`, `year`
 {% enddocs %}
 
+# sot_assessment_roll
+{% docs table_sot_assessment_roll %}
+{% enddocs %}
+
+# sot_assessment_roll_input
+{% docs table_sot_assessment_roll_input %}
+{% enddocs %}
+
+# sot_ratio_stats
+{% docs table_sot_ratio_stats %}
+{% enddocs %}
+
+# sot_ratio_stats_input
+{% docs table_sot_ratio_stats_input %}
+{% enddocs %}
+
+# sot_sales
+{% docs table_sot_sales %}
+{% enddocs %}
+
+# sot_sales_input
+{% docs table_sot_sales_input %}
+{% enddocs %}
+
+# sot_taxes_exemptions
+{% docs table_sot_taxes_exemptions %}
+{% enddocs %}
+
+# sot_taxes_exemptions_input
+{% docs table_sot_taxes_exemptions_input %}
+{% enddocs %}
+
 # vw_assessment_roll
 
 {% docs view_vw_assessment_roll %}
diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sales.py
index b3d496172..bf8666809 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sales.py
@@ -131,9 +131,7 @@ def assemble(df, geos, groups):
             "sale_price_per_sf_delta_mean",
             "sale_price_per_sf_delta_sum",
         ]
-    ] = (
-        output[diff_cols].groupby(["geography_id", "group_id"]).diff()
-    )
+    ] = output[diff_cols].groupby(["geography_id", "group_id"]).diff()
 
     output = clean_names(output)
 
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index 2aa0cef5a..1cbd83bf6 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -129,9 +129,7 @@ def assemble(df, geos, groups):
             "tax_bill_total_delta_mean",
             "tax_bill_total_delta_sum",
         ]
-    ] = (
-        output[diff_cols].groupby(["geography_id", "group_id"]).diff()
-    )
+    ] = output[diff_cols].groupby(["geography_id", "group_id"]).diff()
 
     output = clean_names(output)
 

From cb50a51b28f447d84531f835d41e580fa68907f8 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 18 Mar 2025 17:15:27 +0000
Subject: [PATCH 56/96] Clean up

---
 dbt/models/reporting/docs.md                     | 16 ++++++++--------
 .../reporting/reporting.sot_assessment_roll.py   |  4 ++--
 ...atio_stats.py => reporting.sot_ratio_stat.py} |  2 +-
 ...ut.sql => reporting.sot_ratio_stat_input.sql} |  6 ++----
 ...orting.sot_sales.py => reporting.sot_sale.py} |  2 +-
 ...es_input.sql => reporting.sot_sale_input.sql} |  2 +-
 dbt/models/reporting/schema.yml                  | 16 ++++++++--------
 7 files changed, 23 insertions(+), 25 deletions(-)
 rename dbt/models/reporting/{reporting.sot_ratio_stats.py => reporting.sot_ratio_stat.py} (99%)
 rename dbt/models/reporting/{reporting.sot_ratio_stats_input.sql => reporting.sot_ratio_stat_input.sql} (96%)
 rename dbt/models/reporting/{reporting.sot_sales.py => reporting.sot_sale.py} (99%)
 rename dbt/models/reporting/{reporting.sot_sales_input.sql => reporting.sot_sale_input.sql} (99%)

diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md
index 5cd579f5c..e0969b11b 100644
--- a/dbt/models/reporting/docs.md
+++ b/dbt/models/reporting/docs.md
@@ -88,20 +88,20 @@ for every possible geography and reporting group combination.
 {% docs table_sot_assessment_roll_input %}
 {% enddocs %}
 
-# sot_ratio_stats
-{% docs table_sot_ratio_stats %}
+# sot_ratio_stat
+{% docs table_sot_ratio_stat %}
 {% enddocs %}
 
-# sot_ratio_stats_input
-{% docs table_sot_ratio_stats_input %}
+# sot_ratio_stat_input
+{% docs table_sot_ratio_stat_input %}
 {% enddocs %}
 
-# sot_sales
-{% docs table_sot_sales %}
+# sot_sale
+{% docs table_sot_sale %}
 {% enddocs %}
 
-# sot_sales_input
-{% docs table_sot_sales_input %}
+# sot_sale_input
+{% docs table_sot_sale_input %}
 {% enddocs %}
 
 # sot_taxes_exemptions
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index eef930eb3..0b2cda394 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -56,7 +56,7 @@ def first(x):
 }
 
 
-def aggregrate(data, geography_type, group_type):
+def aggregrate(data, geography_type, group_type, stats):
     """
     Function to group a dataframe by whichever geography and group types it is
     passed and output aggregate stats for that grouping.
@@ -99,7 +99,7 @@ def assemble(df, geos, groups):
 
         for x in value:
             for z in groups:
-                output = pd.concat([output, aggregrate(df, x, z)])
+                output = pd.concat([output, aggregrate(df, x, z, stats=stats)])
 
     # Flatten multi-index
     output.columns = ["_".join(col) for col in output.columns]
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats.py b/dbt/models/reporting/reporting.sot_ratio_stat.py
similarity index 99%
rename from dbt/models/reporting/reporting.sot_ratio_stats.py
rename to dbt/models/reporting/reporting.sot_ratio_stat.py
index a2677b98f..f9e436c02 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stat.py
@@ -350,7 +350,7 @@ def model(dbt, spark_session):
     # Ingest geographies and their associated data years
     geos = ingest_geos(dbt.ref("reporting.sot_data_years"))
 
-    input = dbt.ref("reporting.sot_ratio_stats_input")
+    input = dbt.ref("reporting.sot_ratio_stat_input")
 
     # Convert the Spark input dataframe to Pandas for
     # compatibility with assesspy functions
diff --git a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql b/dbt/models/reporting/reporting.sot_ratio_stat_input.sql
similarity index 96%
rename from dbt/models/reporting/reporting.sot_ratio_stats_input.sql
rename to dbt/models/reporting/reporting.sot_ratio_stat_input.sql
index 17ed6ef83..b00b93404 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stats_input.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stat_input.sql
@@ -1,6 +1,6 @@
 /* This script gathers parcel-level geographies and joins them to values and
 sale prices, and class groupings in order to construct sales ratios. Its sole
-purpose is to feed reporting.sot_ratio_stats, and should not be used
+purpose is to feed reporting.sot_ratio_stat, and should not be used
 otherwise. */
 {{
     config(
@@ -50,9 +50,7 @@ SELECT
     uni.chicago_community_area_name AS community_area,
     uni.census_place_geoid AS census_place,
     uni.census_tract_geoid AS census_tract,
-    uni.census_congressional_district_geoid
-        AS
-        census_congressional_district,
+    uni.census_congressional_district_geoid AS census_congressional_district,
     uni.census_zcta_geoid AS census_zcta,
     uni.cook_board_of_review_district_num AS cook_board_of_review_district,
     uni.cook_commissioner_district_num AS cook_commissioner_district,
diff --git a/dbt/models/reporting/reporting.sot_sales.py b/dbt/models/reporting/reporting.sot_sale.py
similarity index 99%
rename from dbt/models/reporting/reporting.sot_sales.py
rename to dbt/models/reporting/reporting.sot_sale.py
index bf8666809..eb407e2cd 100644
--- a/dbt/models/reporting/reporting.sot_sales.py
+++ b/dbt/models/reporting/reporting.sot_sale.py
@@ -220,7 +220,7 @@ def model(dbt, spark_session):
     # Ingest geographies and their associated data years
     geos = ingest_geos(dbt.ref("reporting.sot_data_years"))
 
-    input = dbt.ref("reporting.sot_sales_input")
+    input = dbt.ref("reporting.sot_sale_input")
 
     # Convert the Spark input dataframe to Pandas for
     # compatibility with assesspy functions
diff --git a/dbt/models/reporting/reporting.sot_sales_input.sql b/dbt/models/reporting/reporting.sot_sale_input.sql
similarity index 99%
rename from dbt/models/reporting/reporting.sot_sales_input.sql
rename to dbt/models/reporting/reporting.sot_sale_input.sql
index 18d0960aa..f21e4aee1 100644
--- a/dbt/models/reporting/reporting.sot_sales_input.sql
+++ b/dbt/models/reporting/reporting.sot_sale_input.sql
@@ -1,5 +1,5 @@
 -- This script gathers parcel-level geographies and joins them to sales and
--- class groupings. Its sole purpose is to feed reporting.sot_sales,
+-- class groupings. Its sole purpose is to feed reporting.sot_sale,
 -- and should not be used otherwise.
 
 {{
diff --git a/dbt/models/reporting/schema.yml b/dbt/models/reporting/schema.yml
index bf540a752..759da8528 100644
--- a/dbt/models/reporting/schema.yml
+++ b/dbt/models/reporting/schema.yml
@@ -77,26 +77,26 @@ models:
       tags:
         - daily
 
-  - name: reporting.sot_ratio_stats
-    description: '{{ doc("table_sot_ratio_stats") }}'
+  - name: reporting.sot_ratio_stat
+    description: '{{ doc("table_sot_ratio_stat") }}'
     config:
       tags:
         - daily
 
-  - name: reporting.sot_ratio_stats_input
-    description: '{{ doc("table_sot_ratio_stats_input") }}'
+  - name: reporting.sot_ratio_stat_input
+    description: '{{ doc("table_sot_ratio_stat_input") }}'
     config:
       tags:
         - daily
 
-  - name: reporting.sot_sales
-    description: '{{ doc("table_sot_sales") }}'
+  - name: reporting.sot_sale
+    description: '{{ doc("table_sot_sale") }}'
     config:
       tags:
         - daily
 
-  - name: reporting.sot_sales_input
-    description: '{{ doc("table_sot_sales_input") }}'
+  - name: reporting.sot_sale_input
+    description: '{{ doc("table_sot_sale_input") }}'
     config:
       tags:
         - daily

From 978ad93126920231117c5a0ca19f1109b4c6dece Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 18 Mar 2025 19:19:48 +0000
Subject: [PATCH 57/96] Use new assesspy inputs

---
 .../reporting/reporting.sot_ratio_stat.py      | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stat.py b/dbt/models/reporting/reporting.sot_ratio_stat.py
index f9e436c02..89874cc52 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stat.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stat.py
@@ -17,9 +17,9 @@
 
 
 # Wrap assesspy functions to avoid GitHub runner errors for length 0 groupings
-def cod_safe(ratio):
-    if len(ratio) >= 1:
-        output = ass.cod(ratio)
+def cod_safe(assessed, sale_price):
+    if len(sale_price) >= 1:
+        output = ass.cod(estimate=assessed, sale_price=sale_price)
     else:
         output = None
 
@@ -28,7 +28,7 @@ def cod_safe(ratio):
 
 def prd_safe(assessed, sale_price):
     if len(sale_price) >= 1:
-        output = ass.prd(assessed=assessed, sale_price=sale_price)
+        output = ass.prd(estimate=assessed, sale_price=sale_price)
     else:
         output = None
 
@@ -37,9 +37,7 @@ def prd_safe(assessed, sale_price):
 
 def prb_safe(assessed, sale_price):
     if len(sale_price) >= 1:
-        output = ass.prb(assessed=assessed, sale_price=sale_price, round=3)[
-            "prb"
-        ]
+        output = ass.prb(estimate=assessed, sale_price=sale_price)
     else:
         output = None
 
@@ -48,7 +46,7 @@ def prb_safe(assessed, sale_price):
 
 def mki_safe(assessed, sale_price):
     if len(sale_price) >= 1:
-        output = ass.mki(assessed=assessed, sale_price=sale_price)
+        output = ass.mki(estimate=assessed, sale_price=sale_price)
     else:
         output = None
 
@@ -119,7 +117,9 @@ def aggregrate(data, geography_type, group_type):
                 "ratio_q90": x["ratio"].quantile(0.90),
                 "ratio_max": x["ratio"].max(),
                 "ratio_mean": x["ratio"].mean(),
-                "cod": cod_safe(ratio=x["ratio"]),
+                "cod": cod_safe(
+                    assessed=x["tot_mv"], sale_price=x["sale_price"]
+                ),
                 "prd": prd_safe(
                     assessed=x["tot_mv"], sale_price=x["sale_price"]
                 ),

From a471a420e2e3bc0b36e55018f8ec0159635c8e3c Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 18 Mar 2025 19:54:38 +0000
Subject: [PATCH 58/96] Update assesspy version

---
 dbt/models/reporting/reporting.sot_ratio_stat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stat.py b/dbt/models/reporting/reporting.sot_ratio_stat.py
index 89874cc52..4763cc6c2 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stat.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stat.py
@@ -1,7 +1,7 @@
 # pylint: skip-file
 # type: ignore
 sc.addPyFile(  # noqa: F821
-    "s3://ccao-athena-dependencies-us-east-1/assesspy==1.1.0.zip"
+    "s3://ccao-athena-dependencies-us-east-1/assesspy==2.0.2.zip"
 )
 
 # This script generates aggregated summary stats on sales ratios across a

From d28f02c5dd6e4edc7730c3246a657a5b3c352edc Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 18 Mar 2025 20:54:09 +0000
Subject: [PATCH 59/96] Add back documentation

---
 dbt/models/reporting/docs.md | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md
index e0969b11b..adb3034ba 100644
--- a/dbt/models/reporting/docs.md
+++ b/dbt/models/reporting/docs.md
@@ -82,34 +82,63 @@ for every possible geography and reporting group combination.
 
 # sot_assessment_roll
 {% docs table_sot_assessment_roll %}
+Table to feed the Python dbt job that creates the
+`reporting.sot_assessment_roll` table. Feeds public reporting assets.
+
+**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id`
 {% enddocs %}
 
 # sot_assessment_roll_input
 {% docs table_sot_assessment_roll_input %}
+Table to feed the Python dbt job that creates the
+`reporting.sot_assessment_roll` table. Feeds public reporting assets.
+
+**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id`
 {% enddocs %}
 
 # sot_ratio_stat
 {% docs table_sot_ratio_stat %}
+Feeds public reporting assets.
+
+**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id`
 {% enddocs %}
 
 # sot_ratio_stat_input
 {% docs table_sot_ratio_stat_input %}
+Table to feed the Python dbt job that creates the
+`reporting.sot_ratio_stat` table. Feeds public reporting assets.
+
+**Primary Key**: `year`, `stage_name`, `geography_id`, `group_id`
 {% enddocs %}
 
 # sot_sale
 {% docs table_sot_sale %}
+Feeds public reporting assets.
+
+**Primary Key**: `year`, `geography_id`, `group_id`
 {% enddocs %}
 
 # sot_sale_input
 {% docs table_sot_sale_input %}
+Table to feed the Python dbt job that creates the
+`reporting.sot_sale` table. Feeds public reporting assets.
+
+**Primary Key**: `year`, `geography_id`, `group_id`
 {% enddocs %}
 
 # sot_taxes_exemptions
 {% docs table_sot_taxes_exemptions %}
+Feeds public reporting assets.
+
+**Primary Key**: `year`, `geography_id`, `group_id`
 {% enddocs %}
 
 # sot_taxes_exemptions_input
 {% docs table_sot_taxes_exemptions_input %}
+Table to feed the Python dbt job that creates the
+`reporting.sot_taxes_exemptions` table. Feeds public reporting assets.
+
+**Primary Key**: `year`, `geography_id`, `group_id`
 {% enddocs %}
 
 # vw_assessment_roll

From f8258cf183d4f6514055d51647dc713a2f6b1718 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 19 Mar 2025 14:27:50 +0000
Subject: [PATCH 60/96] Improve documentation

---
 dbt/models/reporting/docs.md | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/dbt/models/reporting/docs.md b/dbt/models/reporting/docs.md
index adb3034ba..3b91b6396 100644
--- a/dbt/models/reporting/docs.md
+++ b/dbt/models/reporting/docs.md
@@ -82,8 +82,8 @@ for every possible geography and reporting group combination.
 
 # sot_assessment_roll
 {% docs table_sot_assessment_roll %}
-Table to feed the Python dbt job that creates the
-`reporting.sot_assessment_roll` table. Feeds public reporting assets.
+Aggregated summary stats of assessed values across a number of geographies,
+class combinations, and time.
 
 **Primary Key**: `year`, `stage_name`, `geography_id`, `group_id`
 {% enddocs %}
@@ -98,7 +98,8 @@ Table to feed the Python dbt job that creates the
 
 # sot_ratio_stat
 {% docs table_sot_ratio_stat %}
-Feeds public reporting assets.
+Aggregated summary stats of sales ratios across a number of geographies, class
+combinations, and time.
 
 **Primary Key**: `year`, `stage_name`, `geography_id`, `group_id`
 {% enddocs %}
@@ -113,7 +114,8 @@ Table to feed the Python dbt job that creates the
 
 # sot_sale
 {% docs table_sot_sale %}
-Feeds public reporting assets.
+Aggregated summary stats of sales across a number of geographies, class
+combinations, and time.
 
 **Primary Key**: `year`, `geography_id`, `group_id`
 {% enddocs %}
@@ -128,7 +130,8 @@ Table to feed the Python dbt job that creates the
 
 # sot_taxes_exemptions
 {% docs table_sot_taxes_exemptions %}
-Feeds public reporting assets.
+Aggregated summary stats of taxes and exemptions data across a number of
+geographies, class combinations, and time.
 
 **Primary Key**: `year`, `geography_id`, `group_id`
 {% enddocs %}

From e213a32c8ae867425da59fe98d4e8dc8dae182d8 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 19 Mar 2025 16:06:47 +0000
Subject: [PATCH 61/96] Add outlier sales filtering

---
 dbt/models/reporting/reporting.sot_ratio_stat.py        | 2 +-
 dbt/models/reporting/reporting.sot_ratio_stat_input.sql | 1 +
 dbt/models/reporting/reporting.sot_sale_input.sql       | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stat.py b/dbt/models/reporting/reporting.sot_ratio_stat.py
index 4763cc6c2..0b2088e7f 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stat.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stat.py
@@ -89,7 +89,7 @@ def aggregrate(data, geography_type, group_type):
     # Remove parcels with MVs of 0 since they screw up ratios
     data = data[data["tot_mv"] > 0]
 
-    # Remove groups that only have one sale since we can't calculate stats
+    # Remove groups with fewer than 20 sales
     data = data.dropna(subset=["sale_price"])
     data = data[data["sale_n_tot"] >= 20]
 
diff --git a/dbt/models/reporting/reporting.sot_ratio_stat_input.sql b/dbt/models/reporting/reporting.sot_ratio_stat_input.sql
index b00b93404..11e497874 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stat_input.sql
+++ b/dbt/models/reporting/reporting.sot_ratio_stat_input.sql
@@ -109,6 +109,7 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     AND NOT sales.sale_filter_deed_type
     AND NOT sales.sale_filter_less_than_10k
     AND NOT sales.sale_filter_same_sale_within_365
+    AND COALESCE(sales.sv_is_outlier, FALSE) = FALSE
 -- Temporary limit on feeder table to avoid GitHub runner memory issues.
 WHERE uni.year >= '2020'
     AND uni.year IN ('2022', '2023') AND uni.class IN ('278', '597')
diff --git a/dbt/models/reporting/reporting.sot_sale_input.sql b/dbt/models/reporting/reporting.sot_sale_input.sql
index f21e4aee1..e7ab33899 100644
--- a/dbt/models/reporting/reporting.sot_sale_input.sql
+++ b/dbt/models/reporting/reporting.sot_sale_input.sql
@@ -100,5 +100,6 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     AND NOT sales.sale_filter_deed_type
     AND NOT sales.sale_filter_less_than_10k
     AND NOT sales.sale_filter_same_sale_within_365
+    AND COALESCE(sales.sv_is_outlier, FALSE) = FALSE
 -- Temporary limit on feeder table to avoid GitHub runner memory issues.
 WHERE uni.year = '2023'

From 1c8f1b36a04b550eada03bb8c5ff907a16e7d478 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 19 Mar 2025 17:09:07 +0000
Subject: [PATCH 62/96] Count outlier sales

---
 dbt/models/reporting/reporting.sot_sale.py        | 6 +++++-
 dbt/models/reporting/reporting.sot_sale_input.sql | 7 +++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_sale.py b/dbt/models/reporting/reporting.sot_sale.py
index eb407e2cd..b6809e084 100644
--- a/dbt/models/reporting/reporting.sot_sale.py
+++ b/dbt/models/reporting/reporting.sot_sale.py
@@ -51,6 +51,7 @@ def first(x):
     "sale_char_bldg_sf": ["median"],
     "sale_char_land_sf": ["median"],
     "sale_char_yrblt": ["median"],
+    "sale_is_outlier": ["sum"],
     "class": [stats.multimode],
     "geography_data_year": [first],
 }
@@ -148,6 +149,7 @@ def clean_names(x):
             "sale_price_size": "pin_n_tot",
             "year": "sale_year",
             "sale_price_count": "sale_n_tot",
+            "sale_is_outlier_sum": "sale_n_outlier_excluded",
             "class_multimode": "sale_class_mode",
             "geography_data_year_first": "geography_data_year",
         }
@@ -191,6 +193,7 @@ def clean_names(x):
             "sale_char_land_sf_median",
             "sale_char_yrblt_median",
             "sale_class_mode",
+            "sale_n_outlier_excluded",
         ]
     ]
 
@@ -248,7 +251,8 @@ def model(dbt, spark_session):
         + "sale_price_per_sf_delta_sum: double, "
         + "sale_char_bldg_sf_median: double, "
         + "sale_char_land_sf_median: double, "
-        + "sale_char_yrblt_median: double, sale_class_mode: array<string>"
+        + "sale_char_yrblt_median: double, sale_class_mode: array<string>, "
+        + "sale_n_outlier_excluded: bigint"
     )
 
     spark_df = spark_session.createDataFrame(df, schema=schema)
diff --git a/dbt/models/reporting/reporting.sot_sale_input.sql b/dbt/models/reporting/reporting.sot_sale_input.sql
index e7ab33899..93918dce3 100644
--- a/dbt/models/reporting/reporting.sot_sale_input.sql
+++ b/dbt/models/reporting/reporting.sot_sale_input.sql
@@ -23,7 +23,11 @@ WITH sf AS (
 
 SELECT
     sales.doc_no,
-    sales.sale_price,
+    -- Code outlier sale prices as NULL so they won't be part of aggregated sale
+    -- stats, but we can count the number of outliers
+    CASE WHEN sales.sv_is_outlier THEN NULL ELSE sales.sale_price END
+        AS sale_price,
+    COALESCE(sales.sv_is_outlier, FALSE) AS sale_is_outlier,
     CASE WHEN sf.char_bldg_sf > 0
             THEN
             CAST(sales.sale_price / sf.char_bldg_sf AS DOUBLE)
@@ -100,6 +104,5 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     AND NOT sales.sale_filter_deed_type
     AND NOT sales.sale_filter_less_than_10k
     AND NOT sales.sale_filter_same_sale_within_365
-    AND COALESCE(sales.sv_is_outlier, FALSE) = FALSE
 -- Temporary limit on feeder table to avoid GitHub runner memory issues.
 WHERE uni.year = '2023'

From d26fed05615ac168bfaff2388f21e33ddd6a4389 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 19 Mar 2025 18:35:50 +0000
Subject: [PATCH 63/96] Exclude outliers from sales char stats

---
 dbt/models/reporting/reporting.sot_sale_input.sql | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_sale_input.sql b/dbt/models/reporting/reporting.sot_sale_input.sql
index 93918dce3..5b9bd319f 100644
--- a/dbt/models/reporting/reporting.sot_sale_input.sql
+++ b/dbt/models/reporting/reporting.sot_sale_input.sql
@@ -92,11 +92,6 @@ SELECT
         AS res_other,
     uni.year
 FROM {{ ref('default.vw_pin_universe') }} AS uni
-LEFT JOIN sf
-    ON uni.pin = sf.pin
-    AND uni.year = sf.year
-LEFT JOIN {{ ref('ccao.class_dict') }}
-    ON uni.class = class_dict.class_code
 LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     ON uni.pin = sales.pin
     AND uni.year = sales.year
@@ -104,5 +99,12 @@ LEFT JOIN {{ ref('default.vw_pin_sale') }} AS sales
     AND NOT sales.sale_filter_deed_type
     AND NOT sales.sale_filter_less_than_10k
     AND NOT sales.sale_filter_same_sale_within_365
+LEFT JOIN sf
+    ON uni.pin = sf.pin
+    AND uni.year = sf.year
+    -- Don't join characteristics onto outliers
+    AND NOT COALESCE(sales.sv_is_outlier, FALSE)
+LEFT JOIN {{ ref('ccao.class_dict') }}
+    ON uni.class = class_dict.class_code
 -- Temporary limit on feeder table to avoid GitHub runner memory issues.
 WHERE uni.year = '2023'

From 91c5040efe30e107d722491db4ab31c886f9f9b7 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 19 Mar 2025 18:44:05 +0000
Subject: [PATCH 64/96] Clarify bldg and land sf

---
 dbt/models/reporting/reporting.sot_sale.py        | 12 ++++++------
 dbt/models/reporting/reporting.sot_sale_input.sql |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_sale.py b/dbt/models/reporting/reporting.sot_sale.py
index b6809e084..60b9f9ee3 100644
--- a/dbt/models/reporting/reporting.sot_sale.py
+++ b/dbt/models/reporting/reporting.sot_sale.py
@@ -48,8 +48,8 @@ def first(x):
 agg_func_math = {
     "sale_price": ["size", "count"] + more_stats,
     "sale_price_per_sf": more_stats,
-    "sale_char_bldg_sf": ["median"],
-    "sale_char_land_sf": ["median"],
+    "sale_char_tot_bldg_sf": ["median"],
+    "sale_char_tot_land_sf": ["median"],
     "sale_char_yrblt": ["median"],
     "sale_is_outlier": ["sum"],
     "class": [stats.multimode],
@@ -189,8 +189,8 @@ def clean_names(x):
             "sale_price_per_sf_delta_median",
             "sale_price_per_sf_delta_mean",
             "sale_price_per_sf_delta_sum",
-            "sale_char_bldg_sf_median",
-            "sale_char_land_sf_median",
+            "sale_char_tot_bldg_sf_median",
+            "sale_char_tot_land_sf_median",
             "sale_char_yrblt_median",
             "sale_class_mode",
             "sale_n_outlier_excluded",
@@ -249,8 +249,8 @@ def model(dbt, spark_session):
         + "sale_price_per_sf_delta_median: double, "
         + "sale_price_per_sf_delta_mean: double, "
         + "sale_price_per_sf_delta_sum: double, "
-        + "sale_char_bldg_sf_median: double, "
-        + "sale_char_land_sf_median: double, "
+        + "sale_char_tot_bldg_sf_median: double, "
+        + "sale_char_tot_land_sf_median: double, "
         + "sale_char_yrblt_median: double, sale_class_mode: array<string>, "
         + "sale_n_outlier_excluded: bigint"
     )
diff --git a/dbt/models/reporting/reporting.sot_sale_input.sql b/dbt/models/reporting/reporting.sot_sale_input.sql
index 5b9bd319f..c5d2b78c3 100644
--- a/dbt/models/reporting/reporting.sot_sale_input.sql
+++ b/dbt/models/reporting/reporting.sot_sale_input.sql
@@ -32,8 +32,8 @@ SELECT
             THEN
             CAST(sales.sale_price / sf.char_bldg_sf AS DOUBLE)
     END AS sale_price_per_sf,
-    CAST(sf.char_bldg_sf AS INT) AS sale_char_bldg_sf,
-    CAST(sf.char_land_sf AS INT) AS sale_char_land_sf,
+    CAST(sf.char_bldg_sf AS INT) AS sale_char_tot_bldg_sf,
+    CAST(sf.char_land_sf AS INT) AS sale_char_tot_land_sf,
     CAST(sf.char_yrblt AS INT) AS sale_char_yrblt,
     uni.class,
     'Cook' AS county,

From c3cc7ba6a3da5428768e41ac33e024e73cfe4b04 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 19 Mar 2025 20:02:55 +0000
Subject: [PATCH 65/96] Improve schema declaration

---
 .../reporting.sot_assessment_roll.py          | 88 +++++++++++++------
 1 file changed, 62 insertions(+), 26 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 0b2cda394..6eda0abf5 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -297,32 +297,68 @@ def model(dbt, spark_session):
     input = input.toPandas()
 
     df = assemble(input, geos=geos, groups=groups)
-
-    schema = (
-        "geography_type: string, geography_id: string, "
-        + "geography_data_year: string, group_type: string, group_id: string, "
-        + "year: string, reassessment_year: string, stage_name: string, "
-        + "pin_n_tot: bigint, pin_n_w_value: bigint, pin_pct_w_value: double, "
-        + "av_tot_min: double, av_tot_q10: double, av_tot_q25: double, "
-        + "av_tot_median: double, av_tot_q75: double, av_tot_q90: double, "
-        + "av_tot_max: double, av_tot_mean: double, av_tot_sum: double, "
-        + "av_tot_delta_median: double, av_tot_delta_mean: double, "
-        + "av_tot_delta_sum: double, av_tot_delta_pct_median: double, "
-        + "av_tot_delta_pct_mean: double, av_tot_delta_pct_sum: double, "
-        + "av_bldg_min: double, av_bldg_q10: double, av_bldg_q25: double, "
-        + "av_bldg_median: double, av_bldg_q75: double, av_bldg_q90: double, "
-        + "av_bldg_max: double, av_bldg_mean: double, av_bldg_sum: double, "
-        + "av_bldg_delta_median: double, av_bldg_delta_mean: double, "
-        + "av_bldg_delta_sum: double, av_bldg_delta_pct_median: double, "
-        + "av_bldg_delta_pct_mean: double, av_bldg_delta_pct_sum: double, "
-        + "av_land_min: double, av_land_q10: double, av_land_q25: double, "
-        + "av_land_median: double, av_land_q75: double, av_land_q90: double, "
-        + "av_land_max: double, av_land_mean: double, av_land_sum: double, "
-        + "av_land_delta_median: double, av_land_delta_mean: double, "
-        + "av_land_delta_sum: double, av_land_delta_pct_median: double, "
-        + "av_land_delta_pct_mean: double, av_land_delta_pct_sum: double"
+    # %%
+    schema = {
+        "geography_type": "string",
+        "geography_id": "string",
+        "geography_data_year": "string",
+        "group_type": "string",
+        "group_id": "string",
+        "year": "string",
+        "reassessment_year": "string",
+        "stage_name": "string",
+        "pin_n_tot": "bigint",
+        "pin_n_w_value": "bigint",
+        "pin_pct_w_value": "double",
+        "av_tot_min": "double",
+        "av_tot_q10": "double",
+        "av_tot_q25": "double",
+        "av_tot_median": "double",
+        "av_tot_q75": "double",
+        "av_tot_q90": "double",
+        "av_tot_max": "double",
+        "av_tot_mean": "double",
+        "av_tot_sum": "double",
+        "av_tot_delta_median": "double",
+        "av_tot_delta_mean": "double",
+        "av_tot_delta_sum": "double",
+        "av_tot_delta_pct_median": "double",
+        "av_tot_delta_pct_mean": "double",
+        "av_tot_delta_pct_sum": "double",
+        "av_bldg_min": "double",
+        "av_bldg_q10": "double",
+        "av_bldg_q25": "double",
+        "av_bldg_median": "double",
+        "av_bldg_q75": "double",
+        "av_bldg_q90": "double",
+        "av_bldg_max": "double",
+        "av_bldg_mean": "double",
+        "av_bldg_sum": "double",
+        "av_bldg_delta_median": "double",
+        "av_bldg_delta_mean": "double",
+        "av_bldg_delta_sum": "double",
+        "av_bldg_delta_pct_median": "double",
+        "av_bldg_delta_pct_mean": "double",
+        "av_bldg_delta_pct_sum": "double",
+        "av_land_min": "double",
+        "av_land_q10": "double",
+        "av_land_q25": "double",
+        "av_land_median": "double",
+        "av_land_q75": "double",
+        "av_land_q90": "double",
+        "av_land_max": "double",
+        "av_land_mean": "double",
+        "av_land_sum": "double",
+        "av_land_delta_median": "double",
+        "av_land_delta_mean": "double",
+        "av_land_delta_sum": "double",
+        "av_land_delta_pct_median": "double",
+        "av_land_delta_pct_mean": "double",
+        "av_land_delta_pct_sum": "double",
+    }
+    # %%
+    spark_df = spark_session.createDataFrame(
+        df, schema=", ".join(f"{key}: {val}" for key, val in schema.items())
     )
 
-    spark_df = spark_session.createDataFrame(df, schema=schema)
-
     return spark_df

From c8230ea2c13e76958ba7e52bfe57f93407f28bd3 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 19 Mar 2025 20:35:31 +0000
Subject: [PATCH 66/96] Update schema declarations

---
 .../reporting/reporting.sot_ratio_stat.py     |  72 +++++++++----
 dbt/models/reporting/reporting.sot_sale.py    |  66 +++++++-----
 .../reporting.sot_taxes_exemptions.py         | 100 ++++++++++++------
 3 files changed, 160 insertions(+), 78 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_ratio_stat.py b/dbt/models/reporting/reporting.sot_ratio_stat.py
index 0b2088e7f..f4ebb98b7 100644
--- a/dbt/models/reporting/reporting.sot_ratio_stat.py
+++ b/dbt/models/reporting/reporting.sot_ratio_stat.py
@@ -358,26 +358,58 @@ def model(dbt, spark_session):
 
     df = assemble(input, geos=geos, groups=groups)
 
-    schema = (
-        "geography_type: string, geography_id: string, "
-        + "geography_data_year: string, group_type: string, group_id: string, "
-        + "year: string, reassessment_year: string, stage_name: string, "
-        + "pin_n_tot: int, pin_n_w_value: bigint, pin_pct_w_value: double, "
-        + "sale_n_tot: bigint, mv_min: bigint, mv_q10: bigint, "
-        + "mv_q25: bigint, mv_median: bigint, mv_q75: bigint, "
-        + "mv_q90: bigint, mv_max: bigint, mv_mean: bigint, mv_sum: bigint, "
-        + "mv_delta_median: double, mv_delta_mean: double, "
-        + "mv_delta_sum: double, mv_delta_pct_median: double, "
-        + "mv_delta_pct_mean: double, mv_delta_pct_sum: double, "
-        + "ratio_min: double, ratio_q10: double, ratio_q25: double, "
-        + "ratio_median: double, ratio_q75: double, ratio_q90: double, "
-        + "ratio_max: double, ratio_mean: double, cod: double, prd: double, "
-        + "prb: double, mki: double, cod_met: boolean, prd_met: boolean, "
-        + "prb_met: boolean, mki_met: boolean, within_05_pct: boolean, "
-        + "within_10_pct: boolean, within_15_pct: boolean, "
-        + "within_20_pct: boolean"
-    )
+    schema = {
+        "geography_type": "string",
+        "geography_id": "string",
+        "geography_data_year": "string",
+        "group_type": "string",
+        "group_id": "string",
+        "year": "string",
+        "reassessment_year": "string",
+        "stage_name": "string",
+        "pin_n_tot": "int",
+        "pin_n_w_value": "bigint",
+        "pin_pct_w_value": "double",
+        "sale_n_tot": "bigint",
+        "mv_min": "bigint",
+        "mv_q10": "bigint",
+        "mv_q25": "bigint",
+        "mv_median": "bigint",
+        "mv_q75": "bigint",
+        "mv_q90": "bigint",
+        "mv_max": "bigint",
+        "mv_mean": "bigint",
+        "mv_sum": "bigint",
+        "mv_delta_median": "double",
+        "mv_delta_mean": "double",
+        "mv_delta_sum": "double",
+        "mv_delta_pct_median": "double",
+        "mv_delta_pct_mean": "double",
+        "mv_delta_pct_sum": "double",
+        "ratio_min": "double",
+        "ratio_q10": "double",
+        "ratio_q25": "double",
+        "ratio_median": "double",
+        "ratio_q75": "double",
+        "ratio_q90": "double",
+        "ratio_max": "double",
+        "ratio_mean": "double",
+        "cod": "double",
+        "prd": "double",
+        "prb": "double",
+        "mki": "double",
+        "cod_met": "boolean",
+        "prd_met": "boolean",
+        "prb_met": "boolean",
+        "mki_met": "boolean",
+        "within_05_pct": "boolean",
+        "within_10_pct": "boolean",
+        "within_15_pct": "boolean",
+        "within_20_pct": "boolean",
+    }
 
-    spark_df = spark_session.createDataFrame(df, schema=schema)
+    spark_df = spark_session.createDataFrame(
+        df, schema=", ".join(f"{key}: {val}" for key, val in schema.items())
+    )
 
     return spark_df
diff --git a/dbt/models/reporting/reporting.sot_sale.py b/dbt/models/reporting/reporting.sot_sale.py
index 60b9f9ee3..212707ab9 100644
--- a/dbt/models/reporting/reporting.sot_sale.py
+++ b/dbt/models/reporting/reporting.sot_sale.py
@@ -231,30 +231,48 @@ def model(dbt, spark_session):
 
     df = assemble(input, geos=geos, groups=groups)
 
-    schema = (
-        "geography_type: string, geography_id: string, "
-        + "geography_data_year: string, group_type: string, "
-        + "group_id: string, sale_year: string, pin_n_tot: bigint, "
-        + "sale_n_tot: int, sale_price_min: double, sale_price_q10: double, "
-        + "sale_price_q25: double, sale_price_median: double, "
-        + "sale_price_q75: double, sale_price_q90: double, "
-        + "sale_price_max: double, sale_price_mean: double, "
-        + "sale_price_sum: double, sale_price_delta_median: double, "
-        + "sale_price_delta_mean: double, sale_price_delta_sum: double, "
-        + "sale_price_per_sf_min: double, sale_price_per_sf_q10: double, "
-        + "sale_price_per_sf_q25: double, sale_price_per_sf_median: double, "
-        + "sale_price_per_sf_q75: double, sale_price_per_sf_q90: double, "
-        + "sale_price_per_sf_max: double, sale_price_per_sf_mean: double, "
-        + "sale_price_per_sf_sum: double, "
-        + "sale_price_per_sf_delta_median: double, "
-        + "sale_price_per_sf_delta_mean: double, "
-        + "sale_price_per_sf_delta_sum: double, "
-        + "sale_char_tot_bldg_sf_median: double, "
-        + "sale_char_tot_land_sf_median: double, "
-        + "sale_char_yrblt_median: double, sale_class_mode: array<string>, "
-        + "sale_n_outlier_excluded: bigint"
-    )
+    schema = {
+        "geography_type": "string",
+        "geography_id": "string",
+        "geography_data_year": "string",
+        "group_type": "string",
+        "group_id": "string",
+        "sale_year": "string",
+        "pin_n_tot": "bigint",
+        "sale_n_tot": "int",
+        "sale_price_min": "double",
+        "sale_price_q10": "double",
+        "sale_price_q25": "double",
+        "sale_price_median": "double",
+        "sale_price_q75": "double",
+        "sale_price_q90": "double",
+        "sale_price_max": "double",
+        "sale_price_mean": "double",
+        "sale_price_sum": "double",
+        "sale_price_delta_median": "double",
+        "sale_price_delta_mean": "double",
+        "sale_price_delta_sum": "double",
+        "sale_price_per_sf_min": "double",
+        "sale_price_per_sf_q10": "double",
+        "sale_price_per_sf_q25": "double",
+        "sale_price_per_sf_median": "double",
+        "sale_price_per_sf_q75": "double",
+        "sale_price_per_sf_q90": "double",
+        "sale_price_per_sf_max": "double",
+        "sale_price_per_sf_mean": "double",
+        "sale_price_per_sf_sum": "double",
+        "sale_price_per_sf_delta_median": "double",
+        "sale_price_per_sf_delta_mean": "double",
+        "sale_price_per_sf_delta_sum": "double",
+        "sale_char_tot_bldg_sf_median": "double",
+        "sale_char_tot_land_sf_median": "double",
+        "sale_char_yrblt_median": "double",
+        "sale_class_mode": "array<string>",
+        "sale_n_outlier_excluded": "bigint",
+    }
 
-    spark_df = spark_session.createDataFrame(df, schema=schema)
+    spark_df = spark_session.createDataFrame(
+        df, schema=", ".join(f"{key}: {val}" for key, val in schema.items())
+    )
 
     return spark_df
diff --git a/dbt/models/reporting/reporting.sot_taxes_exemptions.py b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
index 1cbd83bf6..86a1f3951 100644
--- a/dbt/models/reporting/reporting.sot_taxes_exemptions.py
+++ b/dbt/models/reporting/reporting.sot_taxes_exemptions.py
@@ -261,40 +261,72 @@ def model(dbt, spark_session):
 
     df = assemble(input, geos=geos, groups=groups)
 
-    schema = (
-        "geography_type: string, geography_id: string, "
-        + "geography_data_year: string, group_type: string, "
-        + "group_id: string, tax_year: string, pin_n_tot: bigint, "
-        + "tax_eq_factor_final: double, tax_eq_factor_tentative: double, "
-        + "tax_bill_total_min: double, tax_bill_total_q10: double, "
-        + "tax_bill_total_q25: double, tax_bill_total_median: double, "
-        + "tax_bill_total_q75: double, tax_bill_total_q90: double, "
-        + "tax_bill_total_max: double, tax_bill_total_mean: double, "
-        + "tax_bill_total_sum: double, tax_bill_total_delta_median: double, "
-        + "tax_bill_total_delta_mean: double, "
-        + "tax_bill_total_delta_sum: double , tax_rate_min: double, "
-        + "tax_rate_q10: double, tax_rate_q25: double, "
-        + "tax_rate_median: double, tax_rate_q75: double, "
-        + "tax_rate_q90: double, tax_rate_max: double, "
-        + "tax_rate_mean: double, tax_rate_sum: double, "
-        + "tax_av_min: int, tax_av_q10: double, tax_av_q25: double, "
-        + "tax_av_median: double, tax_av_q75: double, "
-        + "tax_av_q90: double, tax_av_max: int, tax_av_mean: double, "
-        + "tax_av_sum: double, tax_exe_n_homeowner: bigint, "
-        + "tax_exe_homeowner_sum: double, tax_exe_n_senior: bigint, "
-        + "tax_exe_senior_sum: double, tax_exe_n_freeze: bigint, "
-        + "tax_exe_freeze_sum: double, tax_exe_n_longtime_homeowner: bigint, "
-        + "tax_exe_longtime_homeowner_sum: double, "
-        + "tax_exe_n_disabled: bigint, tax_exe_disabled_sum: double, "
-        + "tax_exe_n_vet_returning: bigint, "
-        + "tax_exe_vet_returning_sum: double, tax_exe_n_vet_dis_lt50: bigint, "
-        + "tax_exe_vet_dis_lt50_sum: double, tax_exe_n_vet_dis_50_69: bigint, "
-        + "tax_exe_vet_dis_50_69_sum: double, tax_exe_n_vet_dis_ge70: bigint, "
-        + "tax_exe_vet_dis_ge70_sum: double, tax_exe_n_abate: bigint, "
-        + "tax_exe_abate_sum: double, tax_exe_n_total: bigint, "
-        + "tax_exe_total_sum: double"
-    )
+    schema = {
+        "geography_type": "string",
+        "geography_id": "string",
+        "geography_data_year": "string",
+        "group_type": "string",
+        "group_id": "string",
+        "tax_year": "string",
+        "pin_n_tot": "bigint",
+        "tax_eq_factor_final": "double",
+        "tax_eq_factor_tentative": "double",
+        "tax_bill_total_min": "double",
+        "tax_bill_total_q10": "double",
+        "tax_bill_total_q25": "double",
+        "tax_bill_total_median": "double",
+        "tax_bill_total_q75": "double",
+        "tax_bill_total_q90": "double",
+        "tax_bill_total_max": "double",
+        "tax_bill_total_mean": "double",
+        "tax_bill_total_sum": "double",
+        "tax_bill_total_delta_median": "double",
+        "tax_bill_total_delta_mean": "double",
+        "tax_bill_total_delta_sum": "double ",
+        "tax_rate_min": "double",
+        "tax_rate_q10": "double",
+        "tax_rate_q25": "double",
+        "tax_rate_median": "double",
+        "tax_rate_q75": "double",
+        "tax_rate_q90": "double",
+        "tax_rate_max": "double",
+        "tax_rate_mean": "double",
+        "tax_rate_sum": "double",
+        "tax_av_min": "int",
+        "tax_av_q10": "double",
+        "tax_av_q25": "double",
+        "tax_av_median": "double",
+        "tax_av_q75": "double",
+        "tax_av_q90": "double",
+        "tax_av_max": "int",
+        "tax_av_mean": "double",
+        "tax_av_sum": "double",
+        "tax_exe_n_homeowner": "bigint",
+        "tax_exe_homeowner_sum": "double",
+        "tax_exe_n_senior": "bigint",
+        "tax_exe_senior_sum": "double",
+        "tax_exe_n_freeze": "bigint",
+        "tax_exe_freeze_sum": "double",
+        "tax_exe_n_longtime_homeowner": "bigint",
+        "tax_exe_longtime_homeowner_sum": "double",
+        "tax_exe_n_disabled": "bigint",
+        "tax_exe_disabled_sum": "double",
+        "tax_exe_n_vet_returning": "bigint",
+        "tax_exe_vet_returning_sum": "double",
+        "tax_exe_n_vet_dis_lt50": "bigint",
+        "tax_exe_vet_dis_lt50_sum": "double",
+        "tax_exe_n_vet_dis_50_69": "bigint",
+        "tax_exe_vet_dis_50_69_sum": "double",
+        "tax_exe_n_vet_dis_ge70": "bigint",
+        "tax_exe_vet_dis_ge70_sum": "double",
+        "tax_exe_n_abate": "bigint",
+        "tax_exe_abate_sum": "double",
+        "tax_exe_n_total": "bigint",
+        "tax_exe_total_sum": "double",
+    }
 
-    spark_df = spark_session.createDataFrame(df, schema=schema)
+    spark_df = spark_session.createDataFrame(
+        df, schema=", ".join(f"{key}: {val}" for key, val in schema.items())
+    )
 
     return spark_df

From 023c341b43796e4e69680bb660f24db0cfaf77da Mon Sep 17 00:00:00 2001
From: William Ridgeway <william.ridgeway@cookcountyil.gov>
Date: Wed, 26 Mar 2025 16:26:30 -0500
Subject: [PATCH 67/96] Store testing

---
 test.py | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 test.py

diff --git a/test.py b/test.py
new file mode 100644
index 000000000..520018f96
--- /dev/null
+++ b/test.py
@@ -0,0 +1,150 @@
+# This script generates aggregated summary stats on assessed values across a
+# number of geographies, class combinations, and time.
+#%%
+# Import libraries
+import pandas as pd
+from pyathena import connect
+from pyathena.pandas.cursor import PandasCursor
+from pyspark.sql import SparkSession
+import pyspark.sql.functions as F
+from pyspark.sql.functions import pandas_udf
+
+# Connect to Athena
+cursor = connect(
+    s3_staging_dir="s3://ccao-athena-results-us-east-1/",
+    region_name="us-east-1",
+    cursor_class=PandasCursor,
+).cursor(unload=True)
+
+data = cursor.execute("select * from z_ci_387_reporting_sot_reporting.sot_assessment_roll_input").as_pandas()
+
+spark = SparkSession.builder.appName("SparkByExamples.com").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()
+#%%
+schema = {'stage_name': 'string',
+'class': 'string',
+'av_tot': 'double',
+'av_bldg': 'double',
+'av_land': 'double',
+'county': 'string',
+'triad': 'string',
+'township': 'string',
+'nbhd': 'string',
+'tax_code': 'string',
+'zip_code': 'string',
+'community_area': 'string',
+'census_place': 'string',
+'census_tract': 'string',
+'census_congressional_district': 'string',
+'census_zcta': 'string',
+'cook_board_of_review_district': 'string',
+'cook_commissioner_district': 'string',
+'cook_judicial_district': 'string',
+'ward_num': 'string',
+'police_district': 'string',
+'school_elementary_district': 'string',
+'school_secondary_district': 'string',
+'school_unified_district': 'string',
+'tax_municipality': 'string',
+'tax_park_district': 'string',
+'tax_library_district': 'string',
+'tax_fire_protection_district': 'string',
+'tax_community_college_district': 'string',
+'tax_sanitation_district': 'string',
+'tax_special_service_area': 'string',
+'tax_tif_district': 'string',
+'central_business_district': 'string',
+'census_data_year': 'string',
+'cook_board_of_review_district_data_year': 'string',
+'cook_commissioner_district_data_year': 'string',
+'cook_judicial_district_data_year': 'string',
+'ward_data_year': 'string',
+'community_area_data_year': 'string',
+'police_district_data_year': 'string',
+'central_business_district_data_year': 'string',
+'school_data_year': 'string',
+'tax_data_year': 'string',
+'no_group': 'string',
+'major_class': 'string',
+'modeling_group': 'string',
+'res_other': 'string',
+'year': 'string',}
+schema = ", ".join(f"{key}: {val}" for key, val in schema.items())
+spark_df = spark.createDataFrame(data, schema=schema)
+#%%
+
+# Define aggregation functions. These are just wrappers for basic python
+# functions that make using them easier to use with pandas.agg().
+def q10(x):
+    return x.quantile(0.1)
+
+
+def q25(x):
+    return x.quantile(0.25)
+
+
+def q75(x):
+    return x.quantile(0.75)
+
+
+def q90(x):
+    return x.quantile(0.9)
+
+
+def first(x):
+    if len(x) >= 1:
+        output = x.iloc[0]
+    else:
+        output = None
+
+    return output
+
+more_stats = [
+    "min",
+    q10,
+    q25,
+    "median",
+    q75,
+    q90,
+    "max",
+    "mean",
+    "sum",
+]
+
+stats = {
+    "av_tot": ["size", "count"] + more_stats,
+    "av_bldg": more_stats,
+    "av_land": more_stats,
+    "triad": [first],
+    "geography_data_year": [first],
+}
+
+#%%
+schema = {'stage_name': 'string',
+'av_tot': 'double','av_bldg': 'double','av_land': 'double',}
+schema = ", ".join(f"{key}: {val}" for key, val in schema.items())
+spark_df = spark.createDataFrame(data[['stage_name', 'av_tot', 'av_bldg', 'av_land']], schema=schema)
+#%%
+def aggregate(key, pdf):
+
+    columns = ['av_tot', 'av_bldg', 'av_land']
+
+    out = ()
+    for column in ['av_tot', 'av_bldg', 'av_land']:
+        out += (
+                pdf[column].min(),
+                q10(pdf[column]),
+                q25(pdf[column]),
+                pdf[column].median(),
+                q75(pdf[column]),
+                q90(pdf[column]),
+                pdf[column].max(),
+                pdf[column].mean(),
+                pdf[column].sum(),
+            )
+
+    return pd.DataFrame([
+        key + out
+    ])
+
+spark_df.groupby("stage_name").applyInPandas(aggregate, schema="stage_name string, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double").show()
+# %%
\ No newline at end of file

From 57e4cc34fcb49b843c20a8e85b91ead182ca97aa Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 30 Apr 2025 19:17:27 +0000
Subject: [PATCH 68/96] Remove test script

---
 .gitignore |   1 +
 test.py    | 150 -----------------------------------------------------
 2 files changed, 1 insertion(+), 150 deletions(-)
 delete mode 100644 test.py

diff --git a/.gitignore b/.gitignore
index 9a2206a6f..4f1fdf134 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@
 .Ruserdata
 package*.json
 settings.json
+test.py
 
 # Directories
 *.egg-info/
diff --git a/test.py b/test.py
deleted file mode 100644
index 520018f96..000000000
--- a/test.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# This script generates aggregated summary stats on assessed values across a
-# number of geographies, class combinations, and time.
-#%%
-# Import libraries
-import pandas as pd
-from pyathena import connect
-from pyathena.pandas.cursor import PandasCursor
-from pyspark.sql import SparkSession
-import pyspark.sql.functions as F
-from pyspark.sql.functions import pandas_udf
-
-# Connect to Athena
-cursor = connect(
-    s3_staging_dir="s3://ccao-athena-results-us-east-1/",
-    region_name="us-east-1",
-    cursor_class=PandasCursor,
-).cursor(unload=True)
-
-data = cursor.execute("select * from z_ci_387_reporting_sot_reporting.sot_assessment_roll_input").as_pandas()
-
-spark = SparkSession.builder.appName("SparkByExamples.com").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()
-#%%
-schema = {'stage_name': 'string',
-'class': 'string',
-'av_tot': 'double',
-'av_bldg': 'double',
-'av_land': 'double',
-'county': 'string',
-'triad': 'string',
-'township': 'string',
-'nbhd': 'string',
-'tax_code': 'string',
-'zip_code': 'string',
-'community_area': 'string',
-'census_place': 'string',
-'census_tract': 'string',
-'census_congressional_district': 'string',
-'census_zcta': 'string',
-'cook_board_of_review_district': 'string',
-'cook_commissioner_district': 'string',
-'cook_judicial_district': 'string',
-'ward_num': 'string',
-'police_district': 'string',
-'school_elementary_district': 'string',
-'school_secondary_district': 'string',
-'school_unified_district': 'string',
-'tax_municipality': 'string',
-'tax_park_district': 'string',
-'tax_library_district': 'string',
-'tax_fire_protection_district': 'string',
-'tax_community_college_district': 'string',
-'tax_sanitation_district': 'string',
-'tax_special_service_area': 'string',
-'tax_tif_district': 'string',
-'central_business_district': 'string',
-'census_data_year': 'string',
-'cook_board_of_review_district_data_year': 'string',
-'cook_commissioner_district_data_year': 'string',
-'cook_judicial_district_data_year': 'string',
-'ward_data_year': 'string',
-'community_area_data_year': 'string',
-'police_district_data_year': 'string',
-'central_business_district_data_year': 'string',
-'school_data_year': 'string',
-'tax_data_year': 'string',
-'no_group': 'string',
-'major_class': 'string',
-'modeling_group': 'string',
-'res_other': 'string',
-'year': 'string',}
-schema = ", ".join(f"{key}: {val}" for key, val in schema.items())
-spark_df = spark.createDataFrame(data, schema=schema)
-#%%
-
-# Define aggregation functions. These are just wrappers for basic python
-# functions that make using them easier to use with pandas.agg().
-def q10(x):
-    return x.quantile(0.1)
-
-
-def q25(x):
-    return x.quantile(0.25)
-
-
-def q75(x):
-    return x.quantile(0.75)
-
-
-def q90(x):
-    return x.quantile(0.9)
-
-
-def first(x):
-    if len(x) >= 1:
-        output = x.iloc[0]
-    else:
-        output = None
-
-    return output
-
-more_stats = [
-    "min",
-    q10,
-    q25,
-    "median",
-    q75,
-    q90,
-    "max",
-    "mean",
-    "sum",
-]
-
-stats = {
-    "av_tot": ["size", "count"] + more_stats,
-    "av_bldg": more_stats,
-    "av_land": more_stats,
-    "triad": [first],
-    "geography_data_year": [first],
-}
-
-#%%
-schema = {'stage_name': 'string',
-'av_tot': 'double','av_bldg': 'double','av_land': 'double',}
-schema = ", ".join(f"{key}: {val}" for key, val in schema.items())
-spark_df = spark.createDataFrame(data[['stage_name', 'av_tot', 'av_bldg', 'av_land']], schema=schema)
-#%%
-def aggregate(key, pdf):
-
-    columns = ['av_tot', 'av_bldg', 'av_land']
-
-    out = ()
-    for column in ['av_tot', 'av_bldg', 'av_land']:
-        out += (
-                pdf[column].min(),
-                q10(pdf[column]),
-                q25(pdf[column]),
-                pdf[column].median(),
-                q75(pdf[column]),
-                q90(pdf[column]),
-                pdf[column].max(),
-                pdf[column].mean(),
-                pdf[column].sum(),
-            )
-
-    return pd.DataFrame([
-        key + out
-    ])
-
-spark_df.groupby("stage_name").applyInPandas(aggregate, schema="stage_name string, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double").show()
-# %%
\ No newline at end of file

From b0d9ad7ec6eacdfa7758dcad59cbaa53be4d2041 Mon Sep 17 00:00:00 2001
From: William Ridgeway <william.ridgeway@cookcountyil.gov>
Date: Thu, 1 May 2025 11:23:45 -0500
Subject: [PATCH 69/96] Test script back to working

---
 .gitignore |   1 -
 test.py    | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 169 insertions(+), 1 deletion(-)
 create mode 100644 test.py

diff --git a/.gitignore b/.gitignore
index 4f1fdf134..9a2206a6f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,6 @@
 .Ruserdata
 package*.json
 settings.json
-test.py
 
 # Directories
 *.egg-info/
diff --git a/test.py b/test.py
new file mode 100644
index 000000000..47f092988
--- /dev/null
+++ b/test.py
@@ -0,0 +1,169 @@
+# This script generates aggregated summary stats on assessed values across a
+# number of geographies, class combinations, and time.
+# %%
+# Import libraries
+import pandas as pd
+from pyathena import connect
+from pyathena.pandas.cursor import PandasCursor
+from pyspark.sql import SparkSession
+
+# Connect to Athena
+cursor = connect(
+    s3_staging_dir="s3://ccao-athena-results-us-east-1/",
+    region_name="us-east-1",
+    cursor_class=PandasCursor,
+).cursor(unload=True)
+
+data = cursor.execute(
+    "select * from z_ci_387_reporting_sot_reporting.sot_assessment_roll_input"
+).as_pandas()
+
+spark = (
+    SparkSession.builder.appName("SparkByExamples.com")
+    .master("local[*]")
+    .config("spark.driver.bindAddress", "127.0.0.1")
+    .getOrCreate()
+)
+# %%
+schema = {
+    "stage_name": "string",
+    "class": "string",
+    "av_tot": "double",
+    "av_bldg": "double",
+    "av_land": "double",
+    "county": "string",
+    "triad": "string",
+    "township": "string",
+    "nbhd": "string",
+    "tax_code": "string",
+    "zip_code": "string",
+    "community_area": "string",
+    "census_place": "string",
+    "census_tract": "string",
+    "census_congressional_district": "string",
+    "census_zcta": "string",
+    "cook_board_of_review_district": "string",
+    "cook_commissioner_district": "string",
+    "cook_judicial_district": "string",
+    "ward_num": "string",
+    "police_district": "string",
+    "school_elementary_district": "string",
+    "school_secondary_district": "string",
+    "school_unified_district": "string",
+    "tax_municipality": "string",
+    "tax_park_district": "string",
+    "tax_library_district": "string",
+    "tax_fire_protection_district": "string",
+    "tax_community_college_district": "string",
+    "tax_sanitation_district": "string",
+    "tax_special_service_area": "string",
+    "tax_tif_district": "string",
+    "central_business_district": "string",
+    "census_data_year": "string",
+    "cook_board_of_review_district_data_year": "string",
+    "cook_commissioner_district_data_year": "string",
+    "cook_judicial_district_data_year": "string",
+    "ward_data_year": "string",
+    "community_area_data_year": "string",
+    "police_district_data_year": "string",
+    "central_business_district_data_year": "string",
+    "school_data_year": "string",
+    "tax_data_year": "string",
+    "no_group": "string",
+    "major_class": "string",
+    "modeling_group": "string",
+    "res_other": "string",
+    "year": "string",
+}
+schema = ", ".join(f"{key}: {val}" for key, val in schema.items())
+spark_df = spark.createDataFrame(data, schema=schema)
+
+
+# %%
+# Define aggregation functions. These are just wrappers around basic Python
+# functions that make them easier to use with pandas.agg().
+def q10(x):
+    return x.quantile(0.1)
+
+
+def q25(x):
+    return x.quantile(0.25)
+
+
+def q75(x):
+    return x.quantile(0.75)
+
+
+def q90(x):
+    return x.quantile(0.9)
+
+
+def first(x):
+    if len(x) >= 1:
+        output = x.iloc[0]
+    else:
+        output = None
+
+    return output
+
+
+more_stats = [
+    "min",
+    q10,
+    q25,
+    "median",
+    q75,
+    q90,
+    "max",
+    "mean",
+    "sum",
+]
+
+stats = {
+    "av_tot": ["size", "count"] + more_stats,
+    "av_bldg": more_stats,
+    "av_land": more_stats,
+    "triad": [first],
+    "geography_data_year": [first],
+}
+
+# %%
+schema = {
+    "stage_name": "string",
+    "av_tot": "double",
+    "av_bldg": "double",
+    "av_land": "double",
+}
+schema = ", ".join(f"{key}: {val}" for key, val in schema.items())
+spark_df = spark.createDataFrame(
+    data[["stage_name", "av_tot", "av_bldg", "av_land"]], schema=schema
+)
+
+
+# %%
+def aggregate(key, pdf):
+    columns = ["av_tot", "av_bldg", "av_land"]
+
+    out = ()
+    for column in columns:
+        out += (
+            pdf[column].min(),
+            q10(pdf[column]),
+            q25(pdf[column]),
+            pdf[column].median(),
+            q75(pdf[column]),
+            q90(pdf[column]),
+            pdf[column].max(),
+            pdf[column].mean(),
+            pdf[column].sum(),
+        )
+
+    return pd.DataFrame([key + out])
+
+
+# %%
+spark_df.groupby("stage_name").applyInPandas(
+    aggregate,
+    schema="stage_name string, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double",
+).show()
+# %%

From f05683512f58d7e10f9be9d7d5c3e13b0f28e796 Mon Sep 17 00:00:00 2001
From: William Ridgeway <william.ridgeway@cookcountyil.gov>
Date: Tue, 17 Jun 2025 14:50:59 -0500
Subject: [PATCH 70/96] Temp changes

---
 test.py | 113 ++++++++++++++++++++++++++------------------------------
 1 file changed, 52 insertions(+), 61 deletions(-)

diff --git a/test.py b/test.py
index 47f092988..70cbaf22f 100644
--- a/test.py
+++ b/test.py
@@ -24,59 +24,6 @@
     .config("spark.driver.bindAddress", "127.0.0.1")
     .getOrCreate()
 )
-# %%
-schema = {
-    "stage_name": "string",
-    "class": "string",
-    "av_tot": "double",
-    "av_bldg": "double",
-    "av_land": "double",
-    "county": "string",
-    "triad": "string",
-    "township": "string",
-    "nbhd": "string",
-    "tax_code": "string",
-    "zip_code": "string",
-    "community_area": "string",
-    "census_place": "string",
-    "census_tract": "string",
-    "census_congressional_district": "string",
-    "census_zcta": "string",
-    "cook_board_of_review_district": "string",
-    "cook_commissioner_district": "string",
-    "cook_judicial_district": "string",
-    "ward_num": "string",
-    "police_district": "string",
-    "school_elementary_district": "string",
-    "school_secondary_district": "string",
-    "school_unified_district": "string",
-    "tax_municipality": "string",
-    "tax_park_district": "string",
-    "tax_library_district": "string",
-    "tax_fire_protection_district": "string",
-    "tax_community_college_district": "string",
-    "tax_sanitation_district": "string",
-    "tax_special_service_area": "string",
-    "tax_tif_district": "string",
-    "central_business_district": "string",
-    "census_data_year": "string",
-    "cook_board_of_review_district_data_year": "string",
-    "cook_commissioner_district_data_year": "string",
-    "cook_judicial_district_data_year": "string",
-    "ward_data_year": "string",
-    "community_area_data_year": "string",
-    "police_district_data_year": "string",
-    "central_business_district_data_year": "string",
-    "school_data_year": "string",
-    "tax_data_year": "string",
-    "no_group": "string",
-    "major_class": "string",
-    "modeling_group": "string",
-    "res_other": "string",
-    "year": "string",
-}
-schema = ", ".join(f"{key}: {val}" for key, val in schema.items())
-spark_df = spark.createDataFrame(data, schema=schema)
 
 
 # %%
@@ -128,16 +75,58 @@ def first(x):
 }
 
 # %%
+groups = {
+    "res_other": "string",
+    "major_class": "string",
+    "no_group": "string",
+    "class": "string",
+    "modeling_group": "string",
+}
+
+geographies = {
+    "county": "string",
+    "triad": "string",
+    "township": "string",
+    "nbhd": "string",
+    "tax_code": "string",
+    "zip_code": "string",
+    "community_area": "string",
+    "census_place": "string",
+    "census_tract": "string",
+    "census_congressional_district": "string",
+    "census_zcta": "string",
+    "cook_board_of_review_district": "string",
+    "cook_commissioner_district": "string",
+    "cook_judicial_district": "string",
+    "ward_num": "string",
+    "police_district": "string",
+    "school_elementary_district": "string",
+    "school_secondary_district": "string",
+    "school_unified_district": "string",
+    "tax_municipality": "string",
+    "tax_park_district": "string",
+    "tax_library_district": "string",
+    "tax_fire_protection_district": "string",
+    "tax_community_college_district": "string",
+    "tax_sanitation_district": "string",
+    "tax_special_service_area": "string",
+    "tax_tif_district": "string",
+    "central_business_district": "string",
+}
+
 schema = {
+    "year": "string",
     "stage_name": "string",
     "av_tot": "double",
     "av_bldg": "double",
     "av_land": "double",
 }
-schema = ", ".join(f"{key}: {val}" for key, val in schema.items())
-spark_df = spark.createDataFrame(
-    data[["stage_name", "av_tot", "av_bldg", "av_land"]], schema=schema
-)
+
+schema = schema | groups | geographies
+
+cols = list(schema.keys())
+schema = ", ".join(f"{key} {val}" for key, val in schema.items())
+spark_df = spark.createDataFrame(data[cols], schema=schema)
 
 
 # %%
@@ -162,8 +151,10 @@ def aggregate(key, pdf):
 
 
 # %%
-spark_df.groupby("stage_name").applyInPandas(
-    aggregate,
-    schema="stage_name string, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double",
-).show()
+for group in list(groups.keys()):
+    for geography in list(geographies.keys()):
+        spark_df.groupby(["stage_name", group, geography]).applyInPandas(
+            aggregate,
+            schema="stage_name string, group_id string, geography string, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double",
+        ).show()
 # %%

From 744ddd656d45a248f33afada4dc0a404e78c201f Mon Sep 17 00:00:00 2001
From: William Ridgeway <william.ridgeway@cookcountyil.gov>
Date: Wed, 18 Jun 2025 14:39:58 -0500
Subject: [PATCH 71/96] Everything but delta cols

---
 test.py | 100 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 67 insertions(+), 33 deletions(-)

diff --git a/test.py b/test.py
index 70cbaf22f..2b9025654 100644
--- a/test.py
+++ b/test.py
@@ -6,6 +6,7 @@
 from pyathena import connect
 from pyathena.pandas.cursor import PandasCursor
 from pyspark.sql import SparkSession
+from pyspark.sql.functions import lit
 
 # Connect to Athena
 cursor = connect(
@@ -83,37 +84,39 @@ def first(x):
     "modeling_group": "string",
 }
 
-geographies = {
-    "county": "string",
-    "triad": "string",
-    "township": "string",
-    "nbhd": "string",
-    "tax_code": "string",
-    "zip_code": "string",
-    "community_area": "string",
-    "census_place": "string",
-    "census_tract": "string",
-    "census_congressional_district": "string",
-    "census_zcta": "string",
-    "cook_board_of_review_district": "string",
-    "cook_commissioner_district": "string",
-    "cook_judicial_district": "string",
-    "ward_num": "string",
-    "police_district": "string",
-    "school_elementary_district": "string",
-    "school_secondary_district": "string",
-    "school_unified_district": "string",
-    "tax_municipality": "string",
-    "tax_park_district": "string",
-    "tax_library_district": "string",
-    "tax_fire_protection_district": "string",
-    "tax_community_college_district": "string",
-    "tax_sanitation_district": "string",
-    "tax_special_service_area": "string",
-    "tax_tif_district": "string",
-    "central_business_district": "string",
+years = {
+    "county": "year",
+    "triad": "year",
+    "township": "year",
+    "nbhd": "year",
+    "tax_code": "year",
+    "zip_code": "year",
+    "community_area": "community_area_data_year",
+    "census_place": "census_data_year",
+    "census_tract": "census_data_year",
+    "census_congressional_district": "census_data_year",
+    "census_zcta": "census_data_year",
+    "cook_board_of_review_district": "cook_board_of_review_district_data_year",
+    "cook_commissioner_district": "cook_commissioner_district_data_year",
+    "cook_judicial_district": "cook_judicial_district_data_year",
+    "ward_num": "ward_data_year",
+    "police_district": "police_district_data_year",
+    "school_elementary_district": "school_data_year",
+    "school_secondary_district": "school_data_year",
+    "school_unified_district": "school_data_year",
+    "tax_municipality": "tax_data_year",
+    "tax_park_district": "tax_data_year",
+    "tax_library_district": "tax_data_yearg",
+    "tax_fire_protection_district": "tax_data_year",
+    "tax_community_college_district": "tax_data_year",
+    "tax_sanitation_district": "tax_data_year",
+    "tax_special_service_area": "tax_data_year",
+    "tax_tif_district": "tax_data_year",
+    "central_business_district": "central_business_district_data_year",
 }
 
+geographies = dict.fromkeys(list(years.keys()), "string")
+
 schema = {
     "year": "string",
     "stage_name": "string",
@@ -130,10 +133,35 @@ def first(x):
 
 
 # %%
+def reassessment_year(year, geography, triad):
+    if geography in ["triad", "township", "nbhd"]:
+        year = int(year) % 3
+
+        if (
+            ((year == 0) & (triad == "North"))
+            | ((year == 1) & (triad == "South"))
+            | ((year == 2) & (triad == "City"))
+        ):
+            out = "Yes"
+        else:
+            out = "No"
+    else:
+        out = ""
+
+    return out
+
+
 def aggregate(key, pdf):
     columns = ["av_tot", "av_bldg", "av_land"]
 
     out = ()
+    out += (
+        reassessment_year(pdf["year"][0], geography, pdf["triad"][0]),
+        first(pdf[years[geography]]),
+        len(pdf["av_tot"]),
+        pdf["av_tot"].count(),
+        pdf["av_tot"].count() / len(pdf["av_tot"]),
+    )
     for column in columns:
         out += (
             pdf[column].min(),
@@ -151,10 +179,16 @@ def aggregate(key, pdf):
 
 
 # %%
-for group in list(groups.keys()):
-    for geography in list(geographies.keys()):
-        spark_df.groupby(["stage_name", group, geography]).applyInPandas(
+for group in [list(groups.keys())[1]]:
+    for geography in [list(geographies.keys())[1]]:
+        spark_df.groupby(
+            ["stage_name", group, geography, "year"]
+        ).applyInPandas(
             aggregate,
-            schema="stage_name string, group_id string, geography string, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double",
+            schema="stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double",
+        ).select(
+            "*",
+            lit(group).alias("group_type"),
+            lit(geography).alias("geography_type"),
         ).show()
 # %%

From ae79bc3d5afb7d663b783c0c75393a1a39c5448e Mon Sep 17 00:00:00 2001
From: William Ridgeway <william.ridgeway@cookcountyil.gov>
Date: Wed, 18 Jun 2025 14:43:35 -0500
Subject: [PATCH 72/96] Remove vestigial objects

---
 test.py | 110 +++++++++++++++++++++++---------------------------------
 1 file changed, 44 insertions(+), 66 deletions(-)

diff --git a/test.py b/test.py
index 2b9025654..7c2199f8f 100644
--- a/test.py
+++ b/test.py
@@ -55,25 +55,50 @@ def first(x):
     return output
 
 
-more_stats = [
-    "min",
-    q10,
-    q25,
-    "median",
-    q75,
-    q90,
-    "max",
-    "mean",
-    "sum",
-]
-
-stats = {
-    "av_tot": ["size", "count"] + more_stats,
-    "av_bldg": more_stats,
-    "av_land": more_stats,
-    "triad": [first],
-    "geography_data_year": [first],
-}
+def reassessment_year(year, geography, triad):
+    if geography in ["triad", "township", "nbhd"]:
+        year = int(year) % 3
+
+        if (
+            ((year == 0) & (triad == "North"))
+            | ((year == 1) & (triad == "South"))
+            | ((year == 2) & (triad == "City"))
+        ):
+            out = "Yes"
+        else:
+            out = "No"
+    else:
+        out = ""
+
+    return out
+
+
+def aggregate(key, pdf):
+    columns = ["av_tot", "av_bldg", "av_land"]
+
+    out = ()
+    out += (
+        reassessment_year(pdf["year"][0], geography, pdf["triad"][0]),
+        first(pdf[years[geography]]),
+        len(pdf["av_tot"]),
+        pdf["av_tot"].count(),
+        pdf["av_tot"].count() / len(pdf["av_tot"]),
+    )
+    for column in columns:
+        out += (
+            pdf[column].min(),
+            q10(pdf[column]),
+            q25(pdf[column]),
+            pdf[column].median(),
+            q75(pdf[column]),
+            q90(pdf[column]),
+            pdf[column].max(),
+            pdf[column].mean(),
+            pdf[column].sum(),
+        )
+
+    return pd.DataFrame([key + out])
+
 
 # %%
 groups = {
@@ -131,53 +156,6 @@ def first(x):
 schema = ", ".join(f"{key} {val}" for key, val in schema.items())
 spark_df = spark.createDataFrame(data[cols], schema=schema)
 
-
-# %%
-def reassessment_year(year, geography, triad):
-    if geography in ["triad", "township", "nbhd"]:
-        year = int(year) % 3
-
-        if (
-            ((year == 0) & (triad == "North"))
-            | ((year == 1) & (triad == "South"))
-            | ((year == 2) & (triad == "City"))
-        ):
-            out = "Yes"
-        else:
-            out = "No"
-    else:
-        out = ""
-
-    return out
-
-
-def aggregate(key, pdf):
-    columns = ["av_tot", "av_bldg", "av_land"]
-
-    out = ()
-    out += (
-        reassessment_year(pdf["year"][0], geography, pdf["triad"][0]),
-        first(pdf[years[geography]]),
-        len(pdf["av_tot"]),
-        pdf["av_tot"].count(),
-        pdf["av_tot"].count() / len(pdf["av_tot"]),
-    )
-    for column in columns:
-        out += (
-            pdf[column].min(),
-            q10(pdf[column]),
-            q25(pdf[column]),
-            pdf[column].median(),
-            q75(pdf[column]),
-            q90(pdf[column]),
-            pdf[column].max(),
-            pdf[column].mean(),
-            pdf[column].sum(),
-        )
-
-    return pd.DataFrame([key + out])
-
-
 # %%
 for group in [list(groups.keys())[1]]:
     for geography in [list(geographies.keys())[1]]:

From 080d5b79265b5a9d12fb82048106eaefd01b767c Mon Sep 17 00:00:00 2001
From: William Ridgeway <william.ridgeway@cookcountyil.gov>
Date: Wed, 18 Jun 2025 15:40:10 -0500
Subject: [PATCH 73/96] Simplify schema creation

---
 test.py | 38 +++++++++++++++-----------------------
 1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/test.py b/test.py
index 7c2199f8f..e7cf99d08 100644
--- a/test.py
+++ b/test.py
@@ -101,13 +101,13 @@ def aggregate(key, pdf):
 
 
 # %%
-groups = {
-    "res_other": "string",
-    "major_class": "string",
-    "no_group": "string",
-    "class": "string",
-    "modeling_group": "string",
-}
+groups = [
+    "res_other",
+    "major_class",
+    "no_group",
+    "class",
+    "modeling_group",
+]
 
 years = {
     "county": "year",
@@ -131,7 +131,7 @@ def aggregate(key, pdf):
     "school_unified_district": "school_data_year",
     "tax_municipality": "tax_data_year",
     "tax_park_district": "tax_data_year",
-    "tax_library_district": "tax_data_yearg",
+    "tax_library_district": "tax_data_year",
     "tax_fire_protection_district": "tax_data_year",
     "tax_community_college_district": "tax_data_year",
     "tax_sanitation_district": "tax_data_year",
@@ -140,25 +140,17 @@ def aggregate(key, pdf):
     "central_business_district": "central_business_district_data_year",
 }
 
-geographies = dict.fromkeys(list(years.keys()), "string")
-
-schema = {
-    "year": "string",
-    "stage_name": "string",
-    "av_tot": "double",
-    "av_bldg": "double",
-    "av_land": "double",
-}
+geographies = list(years.keys())
 
-schema = schema | groups | geographies
-
-cols = list(schema.keys())
+schema = dict.fromkeys(data.columns, "string")
+schema |= dict.fromkeys(["av_tot", "av_bldg", "av_land"], "double")
 schema = ", ".join(f"{key} {val}" for key, val in schema.items())
-spark_df = spark.createDataFrame(data[cols], schema=schema)
+
+spark_df = spark.createDataFrame(data, schema=schema)
 
 # %%
-for group in [list(groups.keys())[1]]:
-    for geography in [list(geographies.keys())[1]]:
+for group in groups:
+    for geography in geographies:
         spark_df.groupby(
             ["stage_name", group, geography, "year"]
         ).applyInPandas(

From de27eb8776304bc9cb091a96ae76a031951332ac Mon Sep 17 00:00:00 2001
From: William Ridgeway <william.ridgeway@cookcountyil.gov>
Date: Wed, 18 Jun 2025 16:16:50 -0500
Subject: [PATCH 74/96] Aggregate spark dfs

---
 test.py | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/test.py b/test.py
index e7cf99d08..085ac71ca 100644
--- a/test.py
+++ b/test.py
@@ -2,10 +2,12 @@
 # number of geographies, class combinations, and time.
 # %%
 # Import libraries
+from functools import reduce
+
 import pandas as pd
 from pyathena import connect
 from pyathena.pandas.cursor import PandasCursor
-from pyspark.sql import SparkSession
+from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import lit
 
 # Connect to Athena
@@ -82,7 +84,7 @@ def aggregate(key, pdf):
         first(pdf[years[geography]]),
         len(pdf["av_tot"]),
         pdf["av_tot"].count(),
-        pdf["av_tot"].count() / len(pdf["av_tot"]),
+        pdf["av_tot"].count() / pdf["av_tot"].size,
     )
     for column in columns:
         out += (
@@ -148,17 +150,24 @@ def aggregate(key, pdf):
 
 spark_df = spark.createDataFrame(data, schema=schema)
 
+output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double"
+
 # %%
+output = []
 for group in groups:
     for geography in geographies:
-        spark_df.groupby(
-            ["stage_name", group, geography, "year"]
-        ).applyInPandas(
-            aggregate,
-            schema="stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double",
-        ).select(
-            "*",
-            lit(group).alias("group_type"),
-            lit(geography).alias("geography_type"),
-        ).show()
+        output += [
+            spark_df.groupby(["stage_name", group, geography, "year"])
+            .applyInPandas(
+                aggregate,
+                schema=output_schema,
+            )
+            .select(
+                "*",
+                lit(group).alias("group_type"),
+                lit(geography).alias("geography_type"),
+            )
+        ]
+
+outputs = reduce(DataFrame.unionByName, output)
 # %%

From 214ec7aab765cd3f2b110ae0754bb26261a20c83 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 18 Jun 2025 21:25:54 +0000
Subject: [PATCH 75/96] Remove temporary limit on assessment roll table

---
 dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index 89724635c..1cab79d90 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -102,5 +102,3 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
--- Temporary limit on feeder table to avoid GitHub runner memory issues.
-WHERE uni.class = '278' AND uni.year IN ('2019', '2020', '2021')

From 26f00e30b0b1ec448f0cd72d94fe38e95a9a4ca6 Mon Sep 17 00:00:00 2001
From: William Ridgeway <william.ridgeway@cookcountyil.gov>
Date: Wed, 18 Jun 2025 17:02:42 -0500
Subject: [PATCH 76/96] Try table build with spark

---
 .../models/reporting/reporting.spark_test.py  | 88 ++++++++-----------
 1 file changed, 38 insertions(+), 50 deletions(-)
 rename test.py => dbt/models/reporting/reporting.spark_test.py (73%)

diff --git a/test.py b/dbt/models/reporting/reporting.spark_test.py
similarity index 73%
rename from test.py
rename to dbt/models/reporting/reporting.spark_test.py
index 085ac71ca..0a143ca78 100644
--- a/test.py
+++ b/dbt/models/reporting/reporting.spark_test.py
@@ -1,35 +1,14 @@
 # This script generates aggregated summary stats on assessed values across a
 # number of geographies, class combinations, and time.
-# %%
+
 # Import libraries
 from functools import reduce
 
 import pandas as pd
-from pyathena import connect
-from pyathena.pandas.cursor import PandasCursor
-from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql import DataFrame
 from pyspark.sql.functions import lit
 
-# Connect to Athena
-cursor = connect(
-    s3_staging_dir="s3://ccao-athena-results-us-east-1/",
-    region_name="us-east-1",
-    cursor_class=PandasCursor,
-).cursor(unload=True)
-
-data = cursor.execute(
-    "select * from z_ci_387_reporting_sot_reporting.sot_assessment_roll_input"
-).as_pandas()
-
-spark = (
-    SparkSession.builder.appName("SparkByExamples.com")
-    .master("local[*]")
-    .config("spark.driver.bindAddress", "127.0.0.1")
-    .getOrCreate()
-)
-
 
-# %%
 # Define aggregation functions. These are just wrappers for basic python
 # functions that make using them easier to use with pandas.agg().
 def q10(x):
@@ -80,8 +59,8 @@ def aggregate(key, pdf):
 
     out = ()
     out += (
-        reassessment_year(pdf["year"][0], geography, pdf["triad"][0]),
-        first(pdf[years[geography]]),
+        reassessment_year(pdf["year"][0], geography, pdf["triad"][0]),  # noqa: F821
+        first(pdf[years[geography]]),  # noqa: F821
         len(pdf["av_tot"]),
         pdf["av_tot"].count(),
         pdf["av_tot"].count() / pdf["av_tot"].size,
@@ -102,7 +81,6 @@ def aggregate(key, pdf):
     return pd.DataFrame([key + out])
 
 
-# %%
 groups = [
     "res_other",
     "major_class",
@@ -144,30 +122,40 @@ def aggregate(key, pdf):
 
 geographies = list(years.keys())
 
-schema = dict.fromkeys(data.columns, "string")
-schema |= dict.fromkeys(["av_tot", "av_bldg", "av_land"], "double")
-schema = ", ".join(f"{key} {val}" for key, val in schema.items())
+output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double"
 
-spark_df = spark.createDataFrame(data, schema=schema)
 
-output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double"
+def model(dbt, spark_session):
+    dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40})
+    athena_user_logger.info("Loading assessment roll input table")
+
+    input = dbt.ref("reporting.sot_assessment_roll_input")
+    spark_schema = dict.fromkeys(input.columns, "string")
+    spark_schema |= dict.fromkeys(["av_tot", "av_bldg", "av_land"], "double")
+    spark_schema = ", ".join(
+        f"{key} {val}" for key, val in spark_schema.items()
+    )
 
-# %%
-output = []
-for group in groups:
-    for geography in geographies:
-        output += [
-            spark_df.groupby(["stage_name", group, geography, "year"])
-            .applyInPandas(
-                aggregate,
-                schema=output_schema,
-            )
-            .select(
-                "*",
-                lit(group).alias("group_type"),
-                lit(geography).alias("geography_type"),
-            )
-        ]
-
-outputs = reduce(DataFrame.unionByName, output)
-# %%
+    spark_df = spark_session.createDataFrame(input, schema=spark_schema)
+
+    athena_user_logger.info("Dope stuff is happening... maybe?")
+
+    output = []
+    for group in groups:
+        for geography in geographies:
+            output += [
+                spark_df.groupby(["stage_name", group, geography, "year"])
+                .applyInPandas(
+                    aggregate,
+                    schema=output_schema,
+                )
+                .select(
+                    "*",
+                    lit(group).alias("group_type"),
+                    lit(geography).alias("geography_type"),
+                )
+            ]
+
+    df = reduce(DataFrame.unionByName, output)
+
+    return df

From 2509e404c78f7e07fa3b6fa1cca15d0622b2eb65 Mon Sep 17 00:00:00 2001
From: sweatyhandshake <wridgeway@cookcountyassessor.com>
Date: Mon, 23 Jun 2025 10:31:52 -0500
Subject: [PATCH 77/96] Remove old table, rerun build to generate error log

---
 .../reporting.sot_assessment_roll.py          | 425 +++++-------------
 dbt/models/reporting/reporting.spark_test.py  | 161 -------
 2 files changed, 111 insertions(+), 475 deletions(-)
 delete mode 100644 dbt/models/reporting/reporting.spark_test.py

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 6eda0abf5..0a143ca78 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -2,10 +2,11 @@
 # number of geographies, class combinations, and time.
 
 # Import libraries
-import pandas as pd
+from functools import reduce
 
-# Declare class groupings
-groups = ["no_group", "class", "major_class", "modeling_group", "res_other"]
+import pandas as pd
+from pyspark.sql import DataFrame
+from pyspark.sql.functions import lit
 
 
 # Define aggregation functions. These are just wrappers for basic python
@@ -35,330 +36,126 @@ def first(x):
     return output
 
 
-more_stats = [
-    "min",
-    q10,
-    q25,
-    "median",
-    q75,
-    q90,
-    "max",
-    "mean",
-    "sum",
-]
-
-stats = {
-    "av_tot": ["size", "count"] + more_stats,
-    "av_bldg": more_stats,
-    "av_land": more_stats,
-    "triad": [first],
-    "geography_data_year": [first],
-}
-
-
-def aggregrate(data, geography_type, group_type, stats):
-    """
-    Function to group a dataframe by whichever geography and group types it is
-    passed and output aggregate stats for that grouping.
-    """
-
-    print(geography_type, group_type)
-
-    group = [geography_type, group_type, "year", "stage_name"]
-    summary = data.groupby(group).agg(stats).round(2)
-    summary["geography_type"] = geography_type
-    summary["group_type"] = group_type
-    summary.index.names = ["geography_id", "group_id", "year", "stage_name"]
-    summary = summary.reset_index().set_index(
-        [
-            "geography_type",
-            "geography_id",
-            "group_type",
-            "group_id",
-            "year",
-            "stage_name",
-        ]
-    )
-
-    return summary
-
-
-def assemble(df, geos, groups):
-    """
-    Function that loops over predefined geography and class groups and passes
-    them to the aggregate function. Returns stacked output from the aggregate
-    function.
-    """
-
-    # Create an empty dataframe to fill with output
-    output = pd.DataFrame()
-
-    # Loop through group combinations and stack output
-    for key, value in geos.items():
-        df["geography_data_year"] = df[key]
-
-        for x in value:
-            for z in groups:
-                output = pd.concat([output, aggregrate(df, x, z, stats=stats)])
-
-    # Flatten multi-index
-    output.columns = ["_".join(col) for col in output.columns]
-    output = output.reset_index()
-    output = output.rename(columns={"triad_first": "triad"})
-
-    # Create additional stat columns post-aggregation
-    output["av_tot_pct_w_value"] = (
-        output["av_tot_count"] / output["av_tot_size"]
-    )
-
-    output = output.sort_values("year")
-
-    diff_cols = [
-        "geography_id",
-        "group_id",
-        "stage_name",
-        "av_tot_median",
-        "av_tot_mean",
-        "av_tot_sum",
-        "av_bldg_median",
-        "av_bldg_mean",
-        "av_bldg_sum",
-        "av_land_median",
-        "av_land_mean",
-        "av_land_sum",
-    ]
-
-    output[
-        [
-            "av_tot_delta_median",
-            "av_tot_delta_mean",
-            "av_tot_delta_sum",
-            "av_bldg_delta_median",
-            "av_bldg_delta_mean",
-            "av_bldg_delta_sum",
-            "av_land_delta_median",
-            "av_land_delta_mean",
-            "av_land_delta_sum",
-        ]
-    ] = (
-        output[diff_cols]
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .diff()
-    )
+def reassessment_year(year, geography, triad):
+    if geography in ["triad", "township", "nbhd"]:
+        year = int(year) % 3
 
-    output[
-        [
-            "av_tot_delta_pct_median",
-            "av_tot_delta_pct_mean",
-            "av_tot_delta_pct_sum",
-            "av_bldg_delta_pct_median",
-            "av_bldg_delta_pct_mean",
-            "av_bldg_delta_pct_sum",
-            "av_land_delta_pct_median",
-            "av_land_delta_pct_mean",
-            "av_land_delta_pct_sum",
-        ]
-    ] = (
-        output[diff_cols]
-        .groupby(["geography_id", "group_id", "stage_name"])
-        .pct_change()
-    )
-
-    output["year"] = output["year"].astype(int)
-    output["triennial"] = output["geography_type"].isin(
-        ["triad", "township", "nbhd"]
-    )
-
-    # Reassessment year is constructed as a string rather than a boolean to
-    # avoid PySpark errors with nullable booleans that can likely be resolved.
-    output["reassessment_year"] = ""
-    output.loc[
-        (output["triennial"] == True), "reassessment_year"  # noqa: E712
-    ] = "No"
-    output.loc[
-        ((output["year"] % 3 == 0) & (output["triad"] == "North"))
-        | ((output["year"] % 3 == 1) & (output["triad"] == "South"))
-        | ((output["year"] % 3 == 2) & (output["triad"] == "City"))
-        & (output["triennial"] == True),  # noqa: E712
-        "reassessment_year",
-    ] = "Yes"
-    output = output.drop(["triennial", "triad"], axis=1)
-
-    output = clean_names(output)
+        if (
+            ((year == 0) & (triad == "North"))
+            | ((year == 1) & (triad == "South"))
+            | ((year == 2) & (triad == "City"))
+        ):
+            out = "Yes"
+        else:
+            out = "No"
+    else:
+        out = ""
 
-    return output
+    return out
 
 
-def clean_names(x):
-    """
-    Function to rename and reorder columns.
-    """
+def aggregate(key, pdf):
+    columns = ["av_tot", "av_bldg", "av_land"]
 
-    output = x.rename(
-        columns={
-            "av_tot_size": "pin_n_tot",
-            "av_tot_count": "pin_n_w_value",
-            "av_tot_pct_w_value": "pin_pct_w_value",
-            "geography_data_year_first": "geography_data_year",
-        }
+    out = ()
+    out += (
+        reassessment_year(pdf["year"][0], geography, pdf["triad"][0]),  # noqa: F821
+        first(pdf[years[geography]]),  # noqa: F821
+        len(pdf["av_tot"]),
+        pdf["av_tot"].count(),
+        pdf["av_tot"].count() / pdf["av_tot"].size,
     )
+    for column in columns:
+        out += (
+            pdf[column].min(),
+            q10(pdf[column]),
+            q25(pdf[column]),
+            pdf[column].median(),
+            q75(pdf[column]),
+            q90(pdf[column]),
+            pdf[column].max(),
+            pdf[column].mean(),
+            pdf[column].sum(),
+        )
+
+    return pd.DataFrame([key + out])
+
+
+groups = [
+    "res_other",
+    "major_class",
+    "no_group",
+    "class",
+    "modeling_group",
+]
 
-    output = output[
-        [
-            "geography_type",
-            "geography_id",
-            "geography_data_year",
-            "group_type",
-            "group_id",
-            "year",
-            "reassessment_year",
-            "stage_name",
-            "pin_n_tot",
-            "pin_n_w_value",
-            "pin_pct_w_value",
-            "av_tot_min",
-            "av_tot_q10",
-            "av_tot_q25",
-            "av_tot_median",
-            "av_tot_q75",
-            "av_tot_q90",
-            "av_tot_max",
-            "av_tot_mean",
-            "av_tot_sum",
-            "av_tot_delta_median",
-            "av_tot_delta_mean",
-            "av_tot_delta_sum",
-            "av_tot_delta_pct_median",
-            "av_tot_delta_pct_mean",
-            "av_tot_delta_pct_sum",
-            "av_bldg_min",
-            "av_bldg_q10",
-            "av_bldg_q25",
-            "av_bldg_median",
-            "av_bldg_q75",
-            "av_bldg_q90",
-            "av_bldg_max",
-            "av_bldg_mean",
-            "av_bldg_sum",
-            "av_bldg_delta_median",
-            "av_bldg_delta_mean",
-            "av_bldg_delta_sum",
-            "av_bldg_delta_pct_median",
-            "av_bldg_delta_pct_mean",
-            "av_bldg_delta_pct_sum",
-            "av_land_min",
-            "av_land_q10",
-            "av_land_q25",
-            "av_land_median",
-            "av_land_q75",
-            "av_land_q90",
-            "av_land_max",
-            "av_land_mean",
-            "av_land_sum",
-            "av_land_delta_median",
-            "av_land_delta_mean",
-            "av_land_delta_sum",
-            "av_land_delta_pct_median",
-            "av_land_delta_pct_mean",
-            "av_land_delta_pct_sum",
-        ]
-    ]
-
-    return output
-
-
-def ingest_geos(geos):
-    """
-    Function to convert dbt seed into a dictionary that can be iterated over.
-    """
+years = {
+    "county": "year",
+    "triad": "year",
+    "township": "year",
+    "nbhd": "year",
+    "tax_code": "year",
+    "zip_code": "year",
+    "community_area": "community_area_data_year",
+    "census_place": "census_data_year",
+    "census_tract": "census_data_year",
+    "census_congressional_district": "census_data_year",
+    "census_zcta": "census_data_year",
+    "cook_board_of_review_district": "cook_board_of_review_district_data_year",
+    "cook_commissioner_district": "cook_commissioner_district_data_year",
+    "cook_judicial_district": "cook_judicial_district_data_year",
+    "ward_num": "ward_data_year",
+    "police_district": "police_district_data_year",
+    "school_elementary_district": "school_data_year",
+    "school_secondary_district": "school_data_year",
+    "school_unified_district": "school_data_year",
+    "tax_municipality": "tax_data_year",
+    "tax_park_district": "tax_data_year",
+    "tax_library_district": "tax_data_year",
+    "tax_fire_protection_district": "tax_data_year",
+    "tax_community_college_district": "tax_data_year",
+    "tax_sanitation_district": "tax_data_year",
+    "tax_special_service_area": "tax_data_year",
+    "tax_tif_district": "tax_data_year",
+    "central_business_district": "central_business_district_data_year",
+}
 
-    geos = geos.toPandas()
-    output = {
-        k: list(geos[k].unique()[pd.notnull(geos[k].unique())])
-        for k in geos.columns
-    }
+geographies = list(years.keys())
 
-    return output
+output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double"
 
 
 def model(dbt, spark_session):
-    """
-    Function to build a dbt python model using PySpark.
-    """
-    dbt.config(materialized="table")
-
-    # Ingest geographies and their associated data years
-    geos = ingest_geos(dbt.ref("reporting.sot_data_years"))
+    dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40})
+    athena_user_logger.info("Loading assessment roll input table")
 
     input = dbt.ref("reporting.sot_assessment_roll_input")
-
-    # Convert the Spark input dataframe to Pandas for
-    # compatibility with assesspy functions
-    input = input.toPandas()
-
-    df = assemble(input, geos=geos, groups=groups)
-    # %%
-    schema = {
-        "geography_type": "string",
-        "geography_id": "string",
-        "geography_data_year": "string",
-        "group_type": "string",
-        "group_id": "string",
-        "year": "string",
-        "reassessment_year": "string",
-        "stage_name": "string",
-        "pin_n_tot": "bigint",
-        "pin_n_w_value": "bigint",
-        "pin_pct_w_value": "double",
-        "av_tot_min": "double",
-        "av_tot_q10": "double",
-        "av_tot_q25": "double",
-        "av_tot_median": "double",
-        "av_tot_q75": "double",
-        "av_tot_q90": "double",
-        "av_tot_max": "double",
-        "av_tot_mean": "double",
-        "av_tot_sum": "double",
-        "av_tot_delta_median": "double",
-        "av_tot_delta_mean": "double",
-        "av_tot_delta_sum": "double",
-        "av_tot_delta_pct_median": "double",
-        "av_tot_delta_pct_mean": "double",
-        "av_tot_delta_pct_sum": "double",
-        "av_bldg_min": "double",
-        "av_bldg_q10": "double",
-        "av_bldg_q25": "double",
-        "av_bldg_median": "double",
-        "av_bldg_q75": "double",
-        "av_bldg_q90": "double",
-        "av_bldg_max": "double",
-        "av_bldg_mean": "double",
-        "av_bldg_sum": "double",
-        "av_bldg_delta_median": "double",
-        "av_bldg_delta_mean": "double",
-        "av_bldg_delta_sum": "double",
-        "av_bldg_delta_pct_median": "double",
-        "av_bldg_delta_pct_mean": "double",
-        "av_bldg_delta_pct_sum": "double",
-        "av_land_min": "double",
-        "av_land_q10": "double",
-        "av_land_q25": "double",
-        "av_land_median": "double",
-        "av_land_q75": "double",
-        "av_land_q90": "double",
-        "av_land_max": "double",
-        "av_land_mean": "double",
-        "av_land_sum": "double",
-        "av_land_delta_median": "double",
-        "av_land_delta_mean": "double",
-        "av_land_delta_sum": "double",
-        "av_land_delta_pct_median": "double",
-        "av_land_delta_pct_mean": "double",
-        "av_land_delta_pct_sum": "double",
-    }
-    # %%
-    spark_df = spark_session.createDataFrame(
-        df, schema=", ".join(f"{key}: {val}" for key, val in schema.items())
+    spark_schema = dict.fromkeys(input.columns, "string")
+    spark_schema |= dict.fromkeys(["av_tot", "av_bldg", "av_land"], "double")
+    spark_schema = ", ".join(
+        f"{key} {val}" for key, val in spark_schema.items()
     )
 
-    return spark_df
+    spark_df = spark_session.createDataFrame(input, schema=spark_schema)
+
+    athena_user_logger.info("Dope stuff is happening... maybe?")
+
+    output = []
+    for group in groups:
+        for geography in geographies:
+            output += [
+                spark_df.groupby(["stage_name", group, geography, "year"])
+                .applyInPandas(
+                    aggregate,
+                    schema=output_schema,
+                )
+                .select(
+                    "*",
+                    lit(group).alias("group_type"),
+                    lit(geography).alias("geography_type"),
+                )
+            ]
+
+    df = reduce(DataFrame.unionByName, output)
+
+    return df
diff --git a/dbt/models/reporting/reporting.spark_test.py b/dbt/models/reporting/reporting.spark_test.py
deleted file mode 100644
index 0a143ca78..000000000
--- a/dbt/models/reporting/reporting.spark_test.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# This script generates aggregated summary stats on assessed values across a
-# number of geographies, class combinations, and time.
-
-# Import libraries
-from functools import reduce
-
-import pandas as pd
-from pyspark.sql import DataFrame
-from pyspark.sql.functions import lit
-
-
-# Define aggregation functions. These are just wrappers for basic python
-# functions that make using them easier to use with pandas.agg().
-def q10(x):
-    return x.quantile(0.1)
-
-
-def q25(x):
-    return x.quantile(0.25)
-
-
-def q75(x):
-    return x.quantile(0.75)
-
-
-def q90(x):
-    return x.quantile(0.9)
-
-
-def first(x):
-    if len(x) >= 1:
-        output = x.iloc[0]
-    else:
-        output = None
-
-    return output
-
-
-def reassessment_year(year, geography, triad):
-    if geography in ["triad", "township", "nbhd"]:
-        year = int(year) % 3
-
-        if (
-            ((year == 0) & (triad == "North"))
-            | ((year == 1) & (triad == "South"))
-            | ((year == 2) & (triad == "City"))
-        ):
-            out = "Yes"
-        else:
-            out = "No"
-    else:
-        out = ""
-
-    return out
-
-
-def aggregate(key, pdf):
-    columns = ["av_tot", "av_bldg", "av_land"]
-
-    out = ()
-    out += (
-        reassessment_year(pdf["year"][0], geography, pdf["triad"][0]),  # noqa: F821
-        first(pdf[years[geography]]),  # noqa: F821
-        len(pdf["av_tot"]),
-        pdf["av_tot"].count(),
-        pdf["av_tot"].count() / pdf["av_tot"].size,
-    )
-    for column in columns:
-        out += (
-            pdf[column].min(),
-            q10(pdf[column]),
-            q25(pdf[column]),
-            pdf[column].median(),
-            q75(pdf[column]),
-            q90(pdf[column]),
-            pdf[column].max(),
-            pdf[column].mean(),
-            pdf[column].sum(),
-        )
-
-    return pd.DataFrame([key + out])
-
-
-groups = [
-    "res_other",
-    "major_class",
-    "no_group",
-    "class",
-    "modeling_group",
-]
-
-years = {
-    "county": "year",
-    "triad": "year",
-    "township": "year",
-    "nbhd": "year",
-    "tax_code": "year",
-    "zip_code": "year",
-    "community_area": "community_area_data_year",
-    "census_place": "census_data_year",
-    "census_tract": "census_data_year",
-    "census_congressional_district": "census_data_year",
-    "census_zcta": "census_data_year",
-    "cook_board_of_review_district": "cook_board_of_review_district_data_year",
-    "cook_commissioner_district": "cook_commissioner_district_data_year",
-    "cook_judicial_district": "cook_judicial_district_data_year",
-    "ward_num": "ward_data_year",
-    "police_district": "police_district_data_year",
-    "school_elementary_district": "school_data_year",
-    "school_secondary_district": "school_data_year",
-    "school_unified_district": "school_data_year",
-    "tax_municipality": "tax_data_year",
-    "tax_park_district": "tax_data_year",
-    "tax_library_district": "tax_data_year",
-    "tax_fire_protection_district": "tax_data_year",
-    "tax_community_college_district": "tax_data_year",
-    "tax_sanitation_district": "tax_data_year",
-    "tax_special_service_area": "tax_data_year",
-    "tax_tif_district": "tax_data_year",
-    "central_business_district": "central_business_district_data_year",
-}
-
-geographies = list(years.keys())
-
-output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double"
-
-
-def model(dbt, spark_session):
-    dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40})
-    athena_user_logger.info("Loading assessment roll input table")
-
-    input = dbt.ref("reporting.sot_assessment_roll_input")
-    spark_schema = dict.fromkeys(input.columns, "string")
-    spark_schema |= dict.fromkeys(["av_tot", "av_bldg", "av_land"], "double")
-    spark_schema = ", ".join(
-        f"{key} {val}" for key, val in spark_schema.items()
-    )
-
-    spark_df = spark_session.createDataFrame(input, schema=spark_schema)
-
-    athena_user_logger.info("Dope stuff is happening... maybe?")
-
-    output = []
-    for group in groups:
-        for geography in geographies:
-            output += [
-                spark_df.groupby(["stage_name", group, geography, "year"])
-                .applyInPandas(
-                    aggregate,
-                    schema=output_schema,
-                )
-                .select(
-                    "*",
-                    lit(group).alias("group_type"),
-                    lit(geography).alias("geography_type"),
-                )
-            ]
-
-    df = reduce(DataFrame.unionByName, output)
-
-    return df

From 23c6fb8e326e6c7bc829ebe1484b710a51273955 Mon Sep 17 00:00:00 2001
From: sweatyhandshake <wridgeway@cookcountyassessor.com>
Date: Mon, 23 Jun 2025 11:20:47 -0500
Subject: [PATCH 78/96] Debugging input pyspark dataframe

---
 dbt/models/reporting/reporting.sot_assessment_roll.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 0a143ca78..fa1e0a8b2 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -130,21 +130,14 @@ def model(dbt, spark_session):
     athena_user_logger.info("Loading assessment roll input table")
 
     input = dbt.ref("reporting.sot_assessment_roll_input")
-    spark_schema = dict.fromkeys(input.columns, "string")
-    spark_schema |= dict.fromkeys(["av_tot", "av_bldg", "av_land"], "double")
-    spark_schema = ", ".join(
-        f"{key} {val}" for key, val in spark_schema.items()
-    )
-
-    spark_df = spark_session.createDataFrame(input, schema=spark_schema)
-
+    
     athena_user_logger.info("Dope stuff is happening... maybe?")
 
     output = []
     for group in groups:
         for geography in geographies:
             output += [
-                spark_df.groupby(["stage_name", group, geography, "year"])
+                input.groupby(["stage_name", group, geography, "year"])
                 .applyInPandas(
                     aggregate,
                     schema=output_schema,

From 763915e55659f8bc2b3bd2c41730d4b7bd8b68b9 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Mon, 23 Jun 2025 20:43:54 +0000
Subject: [PATCH 79/96] Pass geography to aggregate

---
 .../reporting.sot_assessment_roll.py          | 51 ++++++++++---------
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index fa1e0a8b2..0b3d9902a 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -54,31 +54,34 @@ def reassessment_year(year, geography, triad):
     return out
 
 
-def aggregate(key, pdf):
-    columns = ["av_tot", "av_bldg", "av_land"]
-
-    out = ()
-    out += (
-        reassessment_year(pdf["year"][0], geography, pdf["triad"][0]),  # noqa: F821
-        first(pdf[years[geography]]),  # noqa: F821
-        len(pdf["av_tot"]),
-        pdf["av_tot"].count(),
-        pdf["av_tot"].count() / pdf["av_tot"].size,
-    )
-    for column in columns:
+def aggregate_geography(geography):
+    def aggregate(key, pdf):
+        columns = ["av_tot", "av_bldg", "av_land"]
+
+        out = ()
         out += (
-            pdf[column].min(),
-            q10(pdf[column]),
-            q25(pdf[column]),
-            pdf[column].median(),
-            q75(pdf[column]),
-            q90(pdf[column]),
-            pdf[column].max(),
-            pdf[column].mean(),
-            pdf[column].sum(),
+            reassessment_year(pdf["year"][0], geography, pdf["triad"][0]),  # noqa: F821
+            first(pdf[years[geography]]),  # noqa: F821
+            len(pdf["av_tot"]),
+            pdf["av_tot"].count(),
+            pdf["av_tot"].count() / pdf["av_tot"].size,
         )
+        for column in columns:
+            out += (
+                pdf[column].min(),
+                q10(pdf[column]),
+                q25(pdf[column]),
+                pdf[column].median(),
+                q75(pdf[column]),
+                q90(pdf[column]),
+                pdf[column].max(),
+                pdf[column].mean(),
+                pdf[column].sum(),
+            )
+
+        return pd.DataFrame([key + out])
 
-    return pd.DataFrame([key + out])
+    return aggregate
 
 
 groups = [
@@ -130,7 +133,7 @@ def model(dbt, spark_session):
     athena_user_logger.info("Loading assessment roll input table")
 
     input = dbt.ref("reporting.sot_assessment_roll_input")
-    
+
     athena_user_logger.info("Dope stuff is happening... maybe?")
 
     output = []
@@ -139,7 +142,7 @@ def model(dbt, spark_session):
             output += [
                 input.groupby(["stage_name", group, geography, "year"])
                 .applyInPandas(
-                    aggregate,
+                    aggregate_geography(geography),
                     schema=output_schema,
                 )
                 .select(

From 29c15a1117596a3bac9f47c7012468e67cdbfd43 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 24 Jun 2025 15:32:50 +0000
Subject: [PATCH 80/96] Reduce input size to test runner memory limits

---
 dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index 1cab79d90..b987c2d24 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -102,3 +102,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
+WHERE uni.year >= '2021'

From b99dd241123c49589fb78f6ea7df22dc2476b835 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 24 Jun 2025 16:24:31 +0000
Subject: [PATCH 81/96] Really reduce input size

---
 dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index b987c2d24..cb1c8d819 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -102,4 +102,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
-WHERE uni.year >= '2021'
+WHERE uni.year = '2025'

From 76a7df5349091c4b055033e4eae929505baf0d8f Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 24 Jun 2025 18:00:29 +0000
Subject: [PATCH 82/96] Further reduce input size

---
 dbt/models/reporting/reporting.sot_assessment_roll.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 0b3d9902a..1923955c3 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -133,6 +133,7 @@ def model(dbt, spark_session):
     athena_user_logger.info("Loading assessment roll input table")
 
     input = dbt.ref("reporting.sot_assessment_roll_input")
+    input = input.filter(input.stage_name == "BOR CERTIFIED")
 
     athena_user_logger.info("Dope stuff is happening... maybe?")
 

From 8fa0e4ea6e394d48ce6b46224a0a3c6b43330512 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Tue, 24 Jun 2025 18:12:47 +0000
Subject: [PATCH 83/96] Try a really small input

---
 dbt/models/reporting/reporting.sot_assessment_roll.py        | 1 -
 dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 1923955c3..0b3d9902a 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -133,7 +133,6 @@ def model(dbt, spark_session):
     athena_user_logger.info("Loading assessment roll input table")
 
     input = dbt.ref("reporting.sot_assessment_roll_input")
-    input = input.filter(input.stage_name == "BOR CERTIFIED")
 
     athena_user_logger.info("Dope stuff is happening... maybe?")
 
diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index cb1c8d819..00f7c7afa 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -102,4 +102,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
-WHERE uni.year = '2025'
+LIMIT 100000

From 8f1ee19742c18ccd74380483dbb492fc035d67fd Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 25 Jun 2025 16:24:43 +0000
Subject: [PATCH 84/96] Change int type for pyarrow

---
 dbt/models/reporting/reporting.sot_assessment_roll.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 0b3d9902a..fc8738516 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -125,7 +125,7 @@ def aggregate(key, pdf):
 
 geographies = list(years.keys())
 
-output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot int, pin_n_w_value int, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double"
+output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot bigint, pin_n_w_value bigint, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double"
 
 
 def model(dbt, spark_session):

From b8bdf39e507c6e969ffc457324581f8c38301908 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Wed, 25 Jun 2025 18:44:24 +0000
Subject: [PATCH 85/96] Try coercing expected string columns

---
 .../reporting/reporting.sot_assessment_roll.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index fc8738516..5c16a9698 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -6,7 +6,7 @@
 
 import pandas as pd
 from pyspark.sql import DataFrame
-from pyspark.sql.functions import lit
+from pyspark.sql.functions import col, lit
 
 
 # Define aggregation functions. These are just wrappers for basic python
@@ -60,8 +60,8 @@ def aggregate(key, pdf):
 
         out = ()
         out += (
-            reassessment_year(pdf["year"][0], geography, pdf["triad"][0]),  # noqa: F821
-            first(pdf[years[geography]]),  # noqa: F821
+            reassessment_year(pdf["year"][0], geography, pdf["triad"][0]),
+            first(pdf[years[geography]]),
             len(pdf["av_tot"]),
             pdf["av_tot"].count(),
             pdf["av_tot"].count() / pdf["av_tot"].size,
@@ -154,4 +154,16 @@ def model(dbt, spark_session):
 
     df = reduce(DataFrame.unionByName, output)
 
+    for column in [
+        "stage_name",
+        "group_id",
+        "geography_id",
+        "year",
+        "reassessment_year",
+        "geography_data_year",
+        "group_type",
+        "geography_type",
+    ]:
+        df = df.withColumn(column, col(column).cast("string"))
+
     return df

From f33a2e4c3fda473d2cb0853e8177f5d27fddbd57 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 26 Jun 2025 14:28:04 +0000
Subject: [PATCH 86/96] Remove string coercion for output table

---
 .../reporting/reporting.sot_assessment_roll.py     | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 5c16a9698..8ee95a106 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -6,7 +6,7 @@
 
 import pandas as pd
 from pyspark.sql import DataFrame
-from pyspark.sql.functions import col, lit
+from pyspark.sql.functions import lit
 
 
 # Define aggregation functions. These are just wrappers for basic python
@@ -154,16 +154,4 @@ def model(dbt, spark_session):
 
     df = reduce(DataFrame.unionByName, output)
 
-    for column in [
-        "stage_name",
-        "group_id",
-        "geography_id",
-        "year",
-        "reassessment_year",
-        "geography_data_year",
-        "group_type",
-        "geography_type",
-    ]:
-        df = df.withColumn(column, col(column).cast("string"))
-
     return df

From 2264e1d29ad1fe4eb0070f09bbe47a1538d40cc7 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 26 Jun 2025 15:50:21 +0000
Subject: [PATCH 87/96] Try to increase max driver result for spark session

---
 dbt/models/reporting/reporting.sot_assessment_roll.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 8ee95a106..bb1de2514 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -130,6 +130,7 @@ def aggregate(key, pdf):
 
 def model(dbt, spark_session):
     dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40})
+    spark_session.builder.config("spark.driver.maxResultSize", "0")
     athena_user_logger.info("Loading assessment roll input table")
 
     input = dbt.ref("reporting.sot_assessment_roll_input")

From dfb7d1db6e543feb9552a45332b4232191dfbf45 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 26 Jun 2025 16:56:27 +0000
Subject: [PATCH 88/96] Change spark driver config access

---
 dbt/models/reporting/reporting.sot_assessment_roll.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index bb1de2514..cdf1a80a1 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -8,6 +8,8 @@
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import lit
 
+spark.driver.maxResultSize = 0  # noqa:F821
+
 
 # Define aggregation functions. These are just wrappers for basic python
 # functions that make using them easier to use with pandas.agg().
@@ -130,7 +132,6 @@ def aggregate(key, pdf):
 
 def model(dbt, spark_session):
     dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40})
-    spark_session.builder.config("spark.driver.maxResultSize", "0")
     athena_user_logger.info("Loading assessment roll input table")
 
     input = dbt.ref("reporting.sot_assessment_roll_input")

From 2323aeaea10f77a87f27530be419bb8fd6f69c87 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 26 Jun 2025 17:05:25 +0000
Subject: [PATCH 89/96] One more driver attempt

---
 dbt/models/reporting/reporting.sot_assessment_roll.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index cdf1a80a1..f27042390 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -8,8 +8,6 @@
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import lit
 
-spark.driver.maxResultSize = 0  # noqa:F821
-
 
 # Define aggregation functions. These are just wrappers for basic python
 # functions that make using them easier to use with pandas.agg().
@@ -132,6 +130,7 @@ def aggregate(key, pdf):
 
 def model(dbt, spark_session):
     dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40})
+    spark_session.driver.maxResultSize = 0
     athena_user_logger.info("Loading assessment roll input table")
 
     input = dbt.ref("reporting.sot_assessment_roll_input")

From 8cfc71394bfb67d03a4e037111bc70aff0784f43 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 26 Jun 2025 19:20:29 +0000
Subject: [PATCH 90/96] Try new engine config

---
 dbt/models/reporting/reporting.sot_assessment_roll.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index f27042390..8fe07f776 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -129,8 +129,14 @@ def aggregate(key, pdf):
 
 
 def model(dbt, spark_session):
-    dbt.config(materialized="table", engine_config={"MaxConcurrentDpus": 40})
-    spark_session.driver.maxResultSize = 0
+    dbt.config(
+        materialized="table",
+        engine_config={
+            "MaxConcurrentDpus": 40,
+            "SparkProperties": {"spark.driver.maxResultSize": "4g"},
+        },
+    )
+
     athena_user_logger.info("Loading assessment roll input table")
 
     input = dbt.ref("reporting.sot_assessment_roll_input")

From db21c18914acc8d93be54a4c84d73143da5e116e Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 26 Jun 2025 21:05:44 +0000
Subject: [PATCH 91/96] Test smaller amount of collection

---
 dbt/models/reporting/reporting.sot_assessment_roll.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 8fe07f776..b8cf2e4f8 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -124,6 +124,9 @@ def aggregate(key, pdf):
 }
 
 geographies = list(years.keys())
+geographies = [
+    geographies[0]
+]  # For testing purposes, only use the first geography
 
 output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot bigint, pin_n_w_value bigint, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double"
 
@@ -144,6 +147,7 @@ def model(dbt, spark_session):
     athena_user_logger.info("Dope stuff is happening... maybe?")
 
     output = []
+
     for group in groups:
         for geography in geographies:
             output += [
@@ -159,6 +163,6 @@ def model(dbt, spark_session):
                 )
             ]
 
-    df = reduce(DataFrame.unionByName, output)
+        df = reduce(DataFrame.unionByName, output)
 
     return df

From e6681fedb68967d080eb4ab511b572d4dd70f245 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Thu, 26 Jun 2025 21:08:27 +0000
Subject: [PATCH 92/96] Remove config without permission

---
 dbt/models/reporting/reporting.sot_assessment_roll.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index b8cf2e4f8..1eed9ae74 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -136,7 +136,6 @@ def model(dbt, spark_session):
         materialized="table",
         engine_config={
             "MaxConcurrentDpus": 40,
-            "SparkProperties": {"spark.driver.maxResultSize": "4g"},
         },
     )
 

From 34fa863379cb2af47ff31732ffe54cdf33a1e388 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Fri, 27 Jun 2025 16:03:56 +0000
Subject: [PATCH 93/96] Test using entire input

---
 dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index 00f7c7afa..1cab79d90 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -102,4 +102,3 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
-LIMIT 100000

From 9b5d48f045f6c474629c90d789ae95c0aba1f592 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Fri, 27 Jun 2025 19:18:47 +0000
Subject: [PATCH 94/96] Revert for now

---
 dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index 1cab79d90..00f7c7afa 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -102,3 +102,4 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
+LIMIT 100000

From e4aeb0e32da9dc582b62f6aba460dd1417ce871c Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Sat, 28 Jun 2025 18:23:13 +0000
Subject: [PATCH 95/96] Remove limit again for testing

---
 dbt/models/reporting/reporting.sot_assessment_roll_input.sql | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
index 00f7c7afa..1cab79d90 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
+++ b/dbt/models/reporting/reporting.sot_assessment_roll_input.sql
@@ -102,4 +102,3 @@ LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
     AND uni.stage_name = vals.stage_name
 LEFT JOIN {{ ref('ccao.class_dict') }}
     ON uni.class = class_dict.class_code
-LIMIT 100000

From e1460669c146db89c4c3ea28fcde4a96ad1628f4 Mon Sep 17 00:00:00 2001
From: Sweaty Handshake <william.ridgeway@cookcountyil.gov>
Date: Sat, 28 Jun 2025 20:36:26 +0000
Subject: [PATCH 96/96] Attempt to collect more often

---
 dbt/models/reporting/reporting.sot_assessment_roll.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/dbt/models/reporting/reporting.sot_assessment_roll.py b/dbt/models/reporting/reporting.sot_assessment_roll.py
index 1eed9ae74..f1331d847 100644
--- a/dbt/models/reporting/reporting.sot_assessment_roll.py
+++ b/dbt/models/reporting/reporting.sot_assessment_roll.py
@@ -2,10 +2,8 @@
 # number of geographies, class combinations, and time.
 
 # Import libraries
-from functools import reduce
 
 import pandas as pd
-from pyspark.sql import DataFrame
 from pyspark.sql.functions import lit
 
 
@@ -146,7 +144,6 @@ def model(dbt, spark_session):
     athena_user_logger.info("Dope stuff is happening... maybe?")
 
     output = []
-
     for group in groups:
         for geography in geographies:
             output += [
@@ -160,8 +157,9 @@ def model(dbt, spark_session):
                     lit(group).alias("group_type"),
                     lit(geography).alias("geography_type"),
                 )
+                .toPandas()
             ]
 
-        df = reduce(DataFrame.unionByName, output)
+    df = pd.concat(output)
 
     return df