First draft of Reporting Source of Truth™ #496
Draft: wrridgeway wants to merge 103 commits into master from 387-reporting-sot.

Changes from all 103 commits:
- d780d76 First draft of sales script
- 00909fd File renaming
- 2ac5982 Cleaner for loop
- 2107d2a First draft taxes and exemptions table
- c56aaaf Wrap assessment_roll
- 6c81308 Correct size, count calculations
- 1bf9b9c Wrap sales table
- 0a9e1f3 Correct stage grouping, counting
- 030a7c5 Fix assessment roll stage grouping
- 1c2adae Clean output before writing
- 672bd1e Begin dbt building
- 0c42e23 Merge branch 'master' into 387-reporting-sot
- 3f60a77 Attempt to build assessment_roll table
- fdff457 Testing build on smaller input
- 6abd074 Trying to build on limited sample
- fd342b6 Try to build sales table
- cccf8e1 Try to build taxes and exemptions table
- 3656964 Try to build taxes and exemptions table
- 8b0f95f Try to build taxes table
- 9383bdc Try to build ratio stats table
- 08d3bd6 Add assesspy to ratio_stats table
- d2cac22 ratio_stats builds in dbt, excluding assesspy funcs
- f559753 sot_ratio_stats table building in dbt
- 1f8ad1f Add res_other group
- 063591c Add reassessment year indicator for assessment roll
- a9ffc64 Retry assessment_year indicator
- 62dd68e Assessment_roll should run with reassessment year indicator
- c185e81 Add schema to assessment_roll table
- d08bc3d Correct output from sales and taxes tables
- 4808aa4 Add table schemas
- 08c8d53 Fix schemas
- 2f8dc3d Resolve sales table column type issues
- 88ce049 Add exe_total to exemptions table
- 271576d Add more ratio stats
- c39a2d8 Clean sales table columns
- 20c9bd6 Clean taxes table columns
- adc16ea Clean assessment_roll columns
- f8b87ab Fix delta columns
- 54ebab8 Clean ratio table columns
- d2dddab Attempt to fix pin_n_tot type error that doesn't trigger locally
- 00e790c Try again to fix pin_n_tot
- 408de56 Change ass roll sample to be able to compare across stages
- fd95fcb Add commenting for input tables, try to partion assessment_roll table
- f296292 Comment python scripts
- a23ff72 Clean up ratio_stats script
- 07f6dfe Back to fixing pin_n_tot
- b78a072 Replace nan with None
- 337954e Partition input tables by year
- 1031144 Fix year partitioning
- 45ea305 Use double for nullable columns
- ca139f3 Move data year specification to dbt seed
- 788f971 Formatting
- 4ea6718 Merge branch 'master' into 387-reporting-sot
- 5449d8c Improve diff and pct_change syntax
- c87713f Simplify reassessment year syntax
- d1079f0 More commenting
- b4316b2 Merge branch 'master' into 387-reporting-sot
- 28ba90c Lint
- cb50a51 Clean up
- 978ad93 Use new assesspy inputs
- a471a42 Update assesspy version
- d28f02c Add back documentation
- f8258cf Improve documentation
- e213a32 Add outlier sales filtering
- 1c8f1b3 Count outlier sales
- d26fed0 Exclude outliers from sales char stats
- 91c5040 Clarify bldg and land sf
- c3cc7ba Improve schema declaration
- c8230ea Update schema declarations
- 34010b8 Merge branch 'master' into 387-reporting-sot
- 023c341 Store testing
- 57e4cc3 Remove test script
- 68d8c2d Merge branch 'master' into 387-reporting-sot
- b0d9ad7 Test script back to working
- b7ee5bd Merge branch 'master' into 387-reporting-sot
- f056835 Temp changes
- 744ddd6 Everything but delta cols
- ae79bc3 Remove vestigial objects
- 080d5b7 Simplify schema creation
- de27eb8 Aggregate spark dfs
- 98f1bed Merge branch 'master' into 387-reporting-sot
- 214ec7a Remove temp limit on ass roll table
- 26f00e3 Try table build with spark
- 2509e40 Remove old table, rerun build to gen error log
- 23c6fb8 Debugging input pyspark dataframe
- 763915e Pass geography to aggregate
- 29c15a1 Reduce input size to test runner memory limits
- b99dd24 Really reduce input size
- 76a7df5 Further reduce input size
- 8fa0e4e Try a really small input
- 8f1ee19 Change int type for pyarrow
- b8bdf39 Try coercing expected string columns
- f33a2e4 Remove string coersion for output table
- 2264e1d Try to increase max driver result for spark session
- dfb7d1d Change spark driver config access
- 2323aea One more driver attempt
- 8cfc713 Try new engine config
- db21c18 Test smaller amount of collection
- e6681fe Remove config without permission
- 34fa863 Test using entire input
- 9b5d48f Revert for now
- e4aeb0e Remove limit again for testing
- e146066 Attempt to collect more often
Changed hunk in the project config (file name not captured here), adding custom schemas for seed groups (`+schema` is dbt config syntax, not a diff marker):

```
@@ -78,5 +78,7 @@ seeds:
      +schema: location
    model:
      +schema: model
    reporting:
      +schema: reporting
    spatial:
      +schema: spatial
```
New file (165 additions), a dbt Python model:

```python
# This script generates aggregated summary stats on assessed values across a
# number of geographies, class combinations, and time.

# Import libraries
import pandas as pd
from pyspark.sql.functions import lit


# Define aggregation functions. These are just wrappers for basic Python
# functions that make them easier to use with pandas.agg().
def q10(x):
    return x.quantile(0.1)


def q25(x):
    return x.quantile(0.25)


def q75(x):
    return x.quantile(0.75)


def q90(x):
    return x.quantile(0.9)


def first(x):
    if len(x) >= 1:
        output = x.iloc[0]
    else:
        output = None

    return output


def reassessment_year(year, geography, triad):
    if geography in ["triad", "township", "nbhd"]:
        year = int(year) % 3

        if (
            ((year == 0) & (triad == "North"))
            | ((year == 1) & (triad == "South"))
            | ((year == 2) & (triad == "City"))
        ):
            out = "Yes"
        else:
            out = "No"
    else:
        out = ""

    return out


def aggregate_geography(geography):
    def aggregate(key, pdf):
        columns = ["av_tot", "av_bldg", "av_land"]

        out = ()
        out += (
            reassessment_year(pdf["year"][0], geography, pdf["triad"][0]),
            first(pdf[years[geography]]),
            len(pdf["av_tot"]),
            pdf["av_tot"].count(),
            pdf["av_tot"].count() / pdf["av_tot"].size,
        )
        for column in columns:
            out += (
                pdf[column].min(),
                q10(pdf[column]),
                q25(pdf[column]),
                pdf[column].median(),
                q75(pdf[column]),
                q90(pdf[column]),
                pdf[column].max(),
                pdf[column].mean(),
                pdf[column].sum(),
            )

        return pd.DataFrame([key + out])

    return aggregate


groups = [
    "res_other",
    "major_class",
    "no_group",
    "class",
    "modeling_group",
]

years = {
    "county": "year",
    "triad": "year",
    "township": "year",
    "nbhd": "year",
    "tax_code": "year",
    "zip_code": "year",
    "community_area": "community_area_data_year",
    "census_place": "census_data_year",
    "census_tract": "census_data_year",
    "census_congressional_district": "census_data_year",
    "census_zcta": "census_data_year",
    "cook_board_of_review_district": "cook_board_of_review_district_data_year",
    "cook_commissioner_district": "cook_commissioner_district_data_year",
    "cook_judicial_district": "cook_judicial_district_data_year",
    "ward_num": "ward_data_year",
    "police_district": "police_district_data_year",
    "school_elementary_district": "school_data_year",
    "school_secondary_district": "school_data_year",
    "school_unified_district": "school_data_year",
    "tax_municipality": "tax_data_year",
    "tax_park_district": "tax_data_year",
    "tax_library_district": "tax_data_year",
    "tax_fire_protection_district": "tax_data_year",
    "tax_community_college_district": "tax_data_year",
    "tax_sanitation_district": "tax_data_year",
    "tax_special_service_area": "tax_data_year",
    "tax_tif_district": "tax_data_year",
    "central_business_district": "central_business_district_data_year",
}

geographies = list(years.keys())
geographies = [
    geographies[0]
]  # For testing purposes, only use the first geography

output_schema = "stage_name string, group_id string, geography_id string, year string, reassessment_year string, geography_data_year string, pin_n_tot bigint, pin_n_w_value bigint, pin_pct_w_value double, min_av_tot double, q10_av_tot double, q25_av_tot double, median_av_tot double, q75_av_tot double, q90_av_tot double, max_av_tot double, mean_av_tot double, sum_av_tot double, min_av_bldg double, q10_av_bldg double, q25_av_bldg double, median_av_bldg double, q75_av_bldg double, q90_av_bldg double, max_av_bldg double, mean_av_bldg double, sum_av_bldg double, min_av_land double, q10_av_land double, q25_av_land double, median_av_land double, q75_av_land double, q90_av_land double, max_av_land double, mean_av_land double, sum_av_land double"


def model(dbt, spark_session):
    dbt.config(
        materialized="table",
        engine_config={
            "MaxConcurrentDpus": 40,
        },
    )

    # athena_user_logger is provided by the Athena PySpark runtime
    athena_user_logger.info("Loading assessment roll input table")

    input = dbt.ref("reporting.sot_assessment_roll_input")

    athena_user_logger.info("Dope stuff is happening... maybe?")

    output = []
    for group in groups:
        for geography in geographies:
            output += [
                input.groupby(["stage_name", group, geography, "year"])
                .applyInPandas(
                    aggregate_geography(geography),
                    schema=output_schema,
                )
                .select(
                    "*",
                    lit(group).alias("group_type"),
                    lit(geography).alias("geography_type"),
                )
                .toPandas()
            ]

    df = pd.concat(output)

    return df
```
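For context beyond the diff: the small named helpers (q10, q25, and so on) follow a common pandas pattern, where a wrapper function's `__name__` becomes the aggregate's column label. A minimal pure-pandas sketch of that pattern, with hypothetical toy data and no Spark dependency:

```python
# Sketch only: toy data, not the real assessment tables.
import pandas as pd


def q10(x):
    return x.quantile(0.1)


def q90(x):
    return x.quantile(0.9)


df = pd.DataFrame(
    {
        "township": ["Evanston", "Evanston", "Lyons", "Lyons"],
        "av_tot": [10_000, 30_000, 20_000, 40_000],
    }
)

# Mixing string names and callables: each callable's __name__ becomes the
# column label, which is why the model wraps Series.quantile in named funcs.
stats = df.groupby("township")["av_tot"].agg(["min", q10, "median", q90, "max"])
print(stats)
```

The model applies the same idea per Spark group via `applyInPandas`, building one output row per (stage, group, geography, year) combination.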
New file (104 additions): dbt/models/reporting/reporting.sot_assessment_roll_input.sql
```sql
-- This script gathers parcel-level geographies and joins them to values and
-- class groupings. Its sole purpose is to feed reporting.sot_assessment_roll;
-- it should not be used otherwise.
{{
    config(
        materialized='table',
        partitioned_by=['year']
    )
}}

/* Ensure every municipality/class/year has a row for every stage through
cross-joining. This is to make sure that combinations that do not yet exist in
iasworld.asmt_all for the current year will exist in the view, but have
largely empty columns. For example: even if no class 4s in the City of Chicago
have been mailed yet for the current assessment year, we would still like an
empty City of Chicago/class 4 row to exist for the mailed stage. */
WITH stages AS (
    SELECT 'MAILED' AS stage_name
    UNION
    SELECT 'ASSESSOR CERTIFIED' AS stage_name
    UNION
    SELECT 'BOR CERTIFIED' AS stage_name
),

-- Universe of all parcels as defined by iasworld.pardat, expanded with
-- assessment stages.
uni AS (
    SELECT
        vw_pin_universe.*,
        stages.*
    FROM {{ ref('default.vw_pin_universe') }}
    CROSS JOIN stages
)

SELECT
    uni.stage_name,
    uni.class,
    CAST(vals.tot AS INT) AS av_tot,
    CAST(vals.bldg AS INT) AS av_bldg,
    CAST(vals.land AS INT) AS av_land,
    'Cook' AS county,
    uni.triad_name AS triad,
    uni.township_name AS township,
    uni.nbhd_code AS nbhd,
    uni.tax_code,
    uni.zip_code,
    uni.chicago_community_area_name AS community_area,
    uni.census_place_geoid AS census_place,
    uni.census_tract_geoid AS census_tract,
    uni.census_congressional_district_geoid AS census_congressional_district,
    uni.census_zcta_geoid AS census_zcta,
    uni.cook_board_of_review_district_num AS cook_board_of_review_district,
    uni.cook_commissioner_district_num AS cook_commissioner_district,
    uni.cook_judicial_district_num AS cook_judicial_district,
    uni.ward_num,
    uni.chicago_police_district_num AS police_district,
    uni.school_elementary_district_geoid AS school_elementary_district,
    uni.school_secondary_district_geoid AS school_secondary_district,
    uni.school_unified_district_geoid AS school_unified_district,
    ARRAY_JOIN(uni.tax_municipality_name, ', ') AS tax_municipality,
    ARRAY_JOIN(uni.tax_park_district_name, ', ') AS tax_park_district,
    ARRAY_JOIN(uni.tax_library_district_name, ', ') AS tax_library_district,
    ARRAY_JOIN(uni.tax_fire_protection_district_name, ', ')
        AS tax_fire_protection_district,
    ARRAY_JOIN(uni.tax_community_college_district_name, ', ')
        AS tax_community_college_district,
    ARRAY_JOIN(uni.tax_sanitation_district_name, ', ')
        AS tax_sanitation_district,
    ARRAY_JOIN(uni.tax_special_service_area_name, ', ')
        AS tax_special_service_area,
    ARRAY_JOIN(uni.tax_tif_district_name, ', ') AS tax_tif_district,
    uni.econ_central_business_district_num AS central_business_district,
    uni.census_data_year,
    uni.cook_board_of_review_district_data_year,
    uni.cook_commissioner_district_data_year,
    uni.cook_judicial_district_data_year,
    COALESCE(uni.ward_chicago_data_year, uni.ward_evanston_data_year)
        AS ward_data_year,
    uni.chicago_community_area_data_year AS community_area_data_year,
    uni.chicago_police_district_data_year AS police_district_data_year,
    uni.econ_central_business_district_data_year
        AS central_business_district_data_year,
    uni.school_data_year,
    uni.tax_data_year,
    'no_group' AS no_group,
    class_dict.major_class_type AS major_class,
    class_dict.modeling_group,
    CASE WHEN class_dict.major_class_code = '2' THEN 'RES' ELSE 'OTHER' END
        AS res_other,
    uni.year
FROM uni
LEFT JOIN {{ ref('reporting.vw_pin_value_long') }} AS vals
    ON uni.pin = vals.pin
    AND uni.year = vals.year
    AND uni.stage_name = vals.stage_name
LEFT JOIN {{ ref('ccao.class_dict') }}
    ON uni.class = class_dict.class_code
```
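The CROSS JOIN / LEFT JOIN pattern in this model (every parcel gets a row for every stage, then values are joined on where they exist) can be sketched outside SQL as well. A toy pandas equivalent, where all table contents and column values are hypothetical stand-ins for the real views:

```python
import pandas as pd

# Toy stand-ins for vw_pin_universe, the stages CTE, and vw_pin_value_long.
parcels = pd.DataFrame({"pin": ["001", "002"], "class": ["2", "4"]})
stages = pd.DataFrame(
    {"stage_name": ["MAILED", "ASSESSOR CERTIFIED", "BOR CERTIFIED"]}
)
vals = pd.DataFrame(
    {"pin": ["001"], "stage_name": ["MAILED"], "av_tot": [25_000]}
)

# CROSS JOIN: 2 parcels x 3 stages = 6 rows, one per parcel/stage combination.
uni = parcels.merge(stages, how="cross")

# LEFT JOIN: stages not yet reached keep their row, with av_tot left as NaN.
out = uni.merge(vals, on=["pin", "stage_name"], how="left")
print(out)
```

This is why downstream aggregates can report a (mostly empty) row for every stage even before iasworld.asmt_all contains values for that stage.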