From 271924bee95bdfd7c5c282a82428e5226c7842b7 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Thu, 6 Feb 2025 19:32:15 +0000 Subject: [PATCH 1/6] Mock up refactor --- glue/flagging_script_glue/flagging.py | 1616 ++++++++----------------- 1 file changed, 511 insertions(+), 1105 deletions(-) diff --git a/glue/flagging_script_glue/flagging.py b/glue/flagging_script_glue/flagging.py index dc61ee84..77d8e375 100644 --- a/glue/flagging_script_glue/flagging.py +++ b/glue/flagging_script_glue/flagging.py @@ -12,756 +12,333 @@ from sklearn.preprocessing import LabelEncoder from sklearn.decomposition import PCA -SHORT_TERM_OWNER_THRESHOLD = 365 # 365 = 365 days or 1 year +# Constants +SHORT_TERM_OWNER_THRESHOLD = 365 # days +# Compile entity keywords regex for performance +ENTITY_KEYWORDS = re.compile( + r"llc| ll$| l$|l l c|estate|training|construction|building|masonry|" + r"apartments|plumbing|service|professional|roofing|advanced|office|" + r"\blaw\b|\bloan\b|legal|production|woodwork|concepts|corp|company|" + r" united|\binc\b|county|entertainment|community|heating|cooling" + r"|partners|equity|indsutries|series|revitalization|collection|" + r"agency|renovation|consulting|flippers|estates|\bthe \b|dept|" + r"funding|opportunity|improvements|servicing|equities|\bsale\b|" + r"judicial| in$|bank|\btrust\b|holding|investment|housing" + r"|properties|limited|realty|development|capital|management" + r"|developers|construction|rentals|group|investments|invest|" + r"residences|enterprise|enterprises|ventures|remodeling|" + r"specialists|homes|business|venture|restoration|renovations" + r"|maintenance|ltd|real estate|builders|buyers|property|financial" + r"|associates|consultants|international|acquisitions|credit|design" + r"|homeownership|solutions|\bhome\b|diversified|assets|family|\bland\b" + r"|revocable|services|rehabbing|\bliving\b|county of cook|fannie mae" + r"|veteran|mortgage|savings|lp$|federal natl|hospital|southport|mtg" + r"|propert|rehab|neighborhood|advantage|chicago|cook c|\bbk\b|\bhud\b" + r"|department|united states|\busa\b|hsbc|midwest|residential|american" + r"|tcf|advantage|real e|advantage|fifth third|baptist church" + r"|apostolic church|lutheran church|catholic church|\bfed\b|nationstar" + r"|advantage|commercial|health|condominium|nationa|association|homeowner" + r"|christ church|christian church|baptist church|community church" + r"|church of c|\bdelaw\b|lawyer|delawar", + re.IGNORECASE, +) -def go( - df: pd.DataFrame, - groups: tuple, - iso_forest_cols: list, - dev_bounds: tuple, - condos: bool, - raw_price_threshold: int, -): - """ - This function runs all of our other functions in the correct sequence. - - Inputs: - df (pandas dataframe): data used to perform the outlier calculation - groups (tuple): which groups to groupby when selecting outliers. - Ex: ('township','class','year') - iso_forest (list): list with columns to run PCA/IsoForest on - dev_bounds (tuple): how many std deviations on either side to select as outliers. - Ex: (2,2) selects outliers as being farther away than 2 - std deviations on both sides. - condos (boolean): determines whether we are running the flagging model for res or condos - Outputs: - df (pandas dataframe): - """ - if condos: - print("Flagging for condos") - else: - print("Flagging for residential") +# ============================================================================= +# Utility Functions +# ============================================================================= +def create_group_string(groups: tuple, sep: str = "_") -> str: + """Joins group names with a separator to create a string for column naming.""" + return sep.join(groups) - print("Initialize") - df = create_stats(df, groups, condos=condos) # 'year', 'township_code', 'class' - print("create_stats() done") - df = string_processing(df) - print("string_processing() done") - df = iso_forest(df, groups, iso_forest_cols) - print("iso_forest() done") - df = outlier_taxonomy( - df, dev_bounds, groups, condos=condos, raw_price_threshold=raw_price_threshold - ) - print("outlier_taxonomy() done\nfinished") +def log_transform(df: pd.DataFrame, columns: list) -> pd.DataFrame: + """Applies base-10 log transformation to the specified columns.""" + for col in columns: + df[col] = np.log10(df[col]) return df -def create_group_string(groups: tuple, sep: str) -> str: - """ - Creates a string joined on a separator from the groups tuple. - For the purpose of making column names and descriptions. - Inputs: - groups (tuple): the columns being used in groupby() - sep (str): string to separate the groups with. - Outputs: - groups as a string joined by given separator - """ - return sep.join(groups) +def z_normalize_groupby(s: pd.Series) -> pd.Series: + """Returns the z-score normalization for a series (used with groupby.apply).""" + return zscore(s, nan_policy="omit") -def outlier_taxonomy( - df: pd.DataFrame, - permut: tuple, - groups: tuple, - condos: bool, - raw_price_threshold: int, -): - """ - Creates columns having to do with our chosen outlier taxonomy. - Ex: Family sale, Home flip sale, Non-person sale, High price (raw and or sqft), etc. - Inputs: - df (pd.DataFrame): dataframe to create taxonomy on. - permut (tuple): permutation of std deviations - groups (tuple): columns to do grouping on. - Probably 'township' and 'class'. - Ouputs: - df (pd.DataFrame): dataframe with outlier taxonomy - """ +def between_two_numbers(num: float, a: float, b: float) -> bool: + """Checks if num is strictly between a and b.""" + return a < num < b - df = check_days(df, SHORT_TERM_OWNER_THRESHOLD) - df = pricing_info(df, permut, groups, condos=condos) - df = outlier_type(df, condos=condos, raw_price_threshold=raw_price_threshold) +# ============================================================================= +# Statistical & Pricing Functions +# ============================================================================= +def grouping_mean(df: pd.DataFrame, groups: tuple, condos: bool) -> pd.DataFrame: + """Computes the mean sale price (and price per sqft for non-condos) using transform.""" + group_str = create_group_string(groups) + df[f"sv_mean_price_{group_str}"] = df.groupby(list(groups))[ + "meta_sale_price" + ].transform("mean") + if not condos: + df[f"sv_mean_price_per_sqft_{group_str}"] = df.groupby(list(groups))[ + "sv_price_per_sqft" + ].transform("mean") return df -def iso_forest(df, groups, columns, n_estimators=1000, max_samples=0.2): - """ - Runs an isolation forest model on our data for outlier detection. - First does PCA, then, attaches township/class info, and then runs the - IsoForest model with given parameters. - Inputs: - df (pd.DataFrame): dataframe with data for IsoForest - groups (tuple): grouping for the data to input into the IsoForest - columns (list): list with columns to run PCA/IsoForest on - n_estimators (int): number of estimators in IsoForest - max_samples(int or float): share of data to use as sample if float, - number to use if int - Outputs: - df (pd.DataFrame): with 'sv_anomaly' column from IsoForest. - """ - # Set index - df.set_index("meta_sale_document_num", inplace=True) - - # Perform PCA (assuming pca is a predefined function) - feed = pca(df, columns) - - feed.index = df.index - - # Label encode non-numeric groups - label_encoders = {} - for group in groups: - if df[group].dtype not in ["int64", "float64", "int32", "float32"]: - le = LabelEncoder() - df[group] = le.fit_transform(df[group]) - label_encoders[group] = le # Store the encoder if needed later - feed[group] = df[group] - - # Initialize and fit the Isolation Forest - isof = IsolationForest( - n_estimators=n_estimators, - max_samples=max_samples, - bootstrap=True, - random_state=42, - ) - df["sv_anomaly"] = isof.fit_predict(feed) - - # Assign labels for anomalies - df["sv_anomaly"] = np.select( - [(df["sv_anomaly"] == -1), (df["sv_anomaly"] == 1)], - ["Outlier", "Not Outlier"], - default="Not Outlier", +def deviation_dollars(df: pd.DataFrame, groups: tuple) -> pd.DataFrame: + """Calculates deviations (in dollars) from group means.""" + group_str = create_group_string(groups) + df[f"sv_deviation_{group_str}_mean_price"] = ( + df["meta_sale_price"] - df[f"sv_mean_price_{group_str}"] ) - - # Restore original values for encoded columns - for group, le in label_encoders.items(): - df[group] = le.inverse_transform(df[group]) - - # Reset index - df.reset_index(inplace=True) - + if f"sv_mean_price_per_sqft_{group_str}" in df.columns: + df[f"sv_deviation_{group_str}_mean_price_per_sqft"] = ( + df["sv_price_per_sqft"] - df[f"sv_mean_price_per_sqft_{group_str}"] + ) return df -def pca(df: pd.DataFrame, columns: list) -> pd.DataFrame: - """ - Runs PCA on data, selects compoents where explained variance > 1. - Inputs: - df (pd.DataFrame): dataframe to run PCA on. - columns (list): columns of dataframe to run PCA on. - Outputs: - df (pd.DataFrame): dataframe of principal components - """ - feed_data = df[columns] - feed_data = feed_data.fillna(0) - feed_data = feed_data.replace([np.inf, -np.inf], 0) - - pca = PCA(n_components=len(feed_data.columns)) - pc = pca.fit_transform(feed_data) - - cols = ["PC" + str(num) for num in range(len(feed_data.columns))] - - pc_df = pd.DataFrame(data=pc, columns=cols) - take = len(pca.explained_variance_[pca.explained_variance_ > 1]) - - df = pc_df[pc_df.columns[:take]] - +def price_sqft(df: pd.DataFrame) -> pd.DataFrame: + """Calculates the price per square foot.""" + df["sv_price_per_sqft"] = df["meta_sale_price"] / df["char_bldg_sf"] + df["sv_price_per_sqft"].replace([np.inf, -np.inf], np.nan, inplace=True) return df -def pricing_info( - df: pd.DataFrame, permut: tuple, groups: tuple, condos: bool -) -> pd.DataFrame: +def transaction_days(df: pd.DataFrame) -> pd.DataFrame: """ - Creates information about whether the price is an outlier, and its movement. - Also fetches the sandard deviation for the record. - pricing is whether it is a high/low outlier and whether it is a price swing. - which_price is whether it is the raw price, price/sqft or both that are outliers. - Inputs: - df (pd.DataFrame): dataframe of sales - permut (tuple): tuple of standard deviation boundaries. - Ex: (2,2) is 2 std away on both sides. - condos (bool): Specifies whether we are running function for condos or residential - Outputs: - df (pd.DataFrame): dataframe with 3 extra columns of price info. + Calculates the days elapsed since the last transaction. + Assumes that 'meta_sale_date' is datetime. """ - group_string = create_group_string(groups, "_") - - columns_to_log = ["meta_sale_price"] - if not condos: - columns_to_log.append("sv_price_per_sqft") - df = log_transform(df, columns_to_log) - - prices = [ - f"sv_price_deviation_{group_string}", - f"sv_cgdr_deviation_{group_string}", - ] - if not condos: - prices.insert(1, f"sv_price_per_sqft_deviation_{group_string}") - - # Persist standard deviation per group - group_std = ( - df.groupby(list(groups), group_keys=False)["meta_sale_price"] - .std(ddof=0) - .reset_index() - ) - group_std = group_std.rename(columns={"meta_sale_price": "group_std"}) - df = df.merge(group_std, on=groups) - - # Add group mean columns - group_mean = ( - df.groupby(list(groups), group_keys=False)["meta_sale_price"] - .mean() - .reset_index() + mask = df["original_observation"] == True + df.loc[mask, "sv_days_since_last_transaction"] = ( + df.sort_values("meta_sale_date") + .groupby("pin")["meta_sale_date"] + .transform(lambda x: x.diff().dt.days) ) - group_mean = group_mean.rename(columns={"meta_sale_price": "group_mean"}) - df = df.merge(group_mean, on=groups) - - if not condos: - # Persist group sqft standard deviation and group mean - group_sqft_std = ( - df.groupby(list(groups), group_keys=False)["sv_price_per_sqft"] - .std(ddof=0) - .reset_index() - ) - group_sqft_std = group_sqft_std.rename( - columns={"sv_price_per_sqft": "group_sqft_std"} - ) - df = df.merge(group_sqft_std, on=groups) - - group_sqft_mean = ( - df.groupby(list(groups), group_keys=False)["sv_price_per_sqft"] - .mean() - .reset_index() - ) - group_sqft_mean = group_sqft_mean.rename( - columns={"sv_price_per_sqft": "group_sqft_mean"} - ) - df = df.merge(group_sqft_mean, on=groups) - - # Calculate standard deviations - df[f"sv_price_deviation_{group_string}"] = df.groupby( - list(groups), group_keys=False - )["meta_sale_price"].apply(z_normalize_groupby) - - if not condos: - df[f"sv_price_per_sqft_deviation_{group_string}"] = df.groupby( - list(groups), group_keys=False - )["sv_price_per_sqft"].apply(z_normalize_groupby) - - df[f"sv_cgdr_deviation_{group_string}"] = df.groupby( - list(groups), group_keys=False - )["sv_cgdr"].apply(z_normalize_groupby) - - holds = get_thresh(df, prices, permut, groups) - df["sv_pricing"] = df.apply(price_column, args=(holds, groups, condos), axis=1) - - if not condos: - df["sv_which_price"] = df.apply(which_price, args=(holds, groups), axis=1) - return df -def which_price(row: pd.Series, thresholds: dict, groups: tuple) -> str: - """ - Determines whether sale_price, price_per_sqft, or both are outliers, - and returns a string resembling it. - Inputs: - thresholds (dict): dict of thresholds from get_thresh - Outputs: - value (str): string saying which of these are outliers. +def percent_change(df: pd.DataFrame) -> pd.DataFrame: """ - value = "Non-outlier" - group_string = create_group_string(groups, "_") - key = tuple(row[group] for group in groups) - - if thresholds.get(f"sv_price_deviation_{group_string}").get(key) and thresholds.get( - f"sv_price_per_sqft_deviation_{group_string}" - ).get(key): - s_std, *s_std_range = thresholds.get(f"sv_price_deviation_{group_string}").get( - key - ) - s_lower, s_upper = s_std_range - sq_std, *sq_std_range = thresholds.get( - f"sv_price_per_sqft_deviation_{group_string}" - ).get(key) - sq_lower, sq_upper = sq_std_range - if not between_two_numbers( - row[f"sv_price_deviation_{group_string}"], s_lower, s_upper - ) and between_two_numbers( - row[f"sv_price_per_sqft_deviation_{group_string}"], sq_lower, sq_upper - ): - value = "(raw)" - elif between_two_numbers( - row[f"sv_price_deviation_{group_string}"], s_lower, s_upper - ) and not between_two_numbers( - row[f"sv_price_per_sqft_deviation_{group_string}"], sq_lower, sq_upper - ): - value = "(sqft)" - elif not between_two_numbers( - row[f"sv_price_deviation_{group_string}"], s_lower, s_upper - ) and not between_two_numbers( - row[f"sv_price_per_sqft_deviation_{group_string}"], sq_lower, sq_upper - ): - value = "(raw & sqft)" - - return value - - -def between_two_numbers(num: int or float, a: int or float, b: int or float) -> bool: - return a < num < b + Calculates the compound growth rate (CGR) using the previous sale price and + the days between transactions. Only applied to original observations. + """ + mask = df["original_observation"] == True + sorted_df = df.sort_values("meta_sale_date") + df.loc[mask, "sv_previous_price"] = sorted_df.groupby("pin")[ + "meta_sale_price" + ].transform(lambda x: x.shift()) + with np.errstate(divide="ignore", invalid="ignore"): + df.loc[mask, "sv_cgdr"] = ( + df.loc[mask, "meta_sale_price"] / df.loc[mask, "sv_previous_price"] + ) ** (1 / df.loc[mask, "sv_days_since_last_transaction"]) - 1 + return df -def price_column(row: pd.Series, thresholds: dict, groups: tuple, condos: bool) -> str: +def dup_stats(df: pd.DataFrame, groups: tuple) -> pd.DataFrame: """ - Determines whether the record is a high price outlier or a low price outlier. - If the record is also a price change outlier, than add 'swing' to the string. - Inputs: - thresholds (dict): dict of standard deviation thresholds from get_thresh() - condos (bool): Specifies whether we are running function for condos or residential - Outputs: - value (str): string showing what kind of price outlier the record is. + For properties with multiple transactions, calculates duplicate sale counts and + the direction of price movement relative to the group mean. """ - value = "Not price outlier" - price = False - - group_string = create_group_string(groups, "_") - key = tuple(row[group] for group in groups) - - if condos == True: - if thresholds.get(f"sv_price_deviation_{group_string}").get(key): - s_std, *s_std_range = thresholds.get( - f"sv_price_deviation_{group_string}" - ).get(key) - s_lower, s_upper = s_std_range - - if row[f"sv_price_deviation_{group_string}"] > s_upper: - value = "High price" - price = True - elif row[f"sv_price_deviation_{group_string}"] < s_lower: - value = "Low price" - price = True - - if ( - price - and pd.notnull(row[f"sv_cgdr_deviation_{group_string}"]) - and thresholds.get(f"sv_cgdr_deviation_{group_string}").get(key) - ): - # not every combo will have pct change info so we need this check - p_std, *p_std_range = thresholds.get( - f"sv_cgdr_deviation_{group_string}" - ).get(key) - - p_lower, p_upper = p_std_range - if row[ - "sv_price_movement" - ] == "Away from mean" and not between_two_numbers( - row[f"sv_cgdr_deviation_{group_string}"], p_lower, p_upper - ): - value += " swing" - - else: - if thresholds.get(f"sv_price_deviation_{group_string}").get( - key - ) and thresholds.get(f"sv_price_per_sqft_deviation_{group_string}").get(key): - s_std, *s_std_range = thresholds.get( - f"sv_price_deviation_{group_string}" - ).get(key) - s_lower, s_upper = s_std_range - - sq_std, *sq_std_range = thresholds.get( - f"sv_price_per_sqft_deviation_{group_string}" - ).get(key) - sq_lower, sq_upper = sq_std_range - - if ( - row[f"sv_price_deviation_{group_string}"] > s_upper - or row[f"sv_price_per_sqft_deviation_{group_string}"] > sq_upper - ): - value = "High price" - price = True - elif ( - row[f"sv_price_deviation_{group_string}"] < s_lower - or row[f"sv_price_per_sqft_deviation_{group_string}"] < sq_lower - ): - value = "Low price" - price = True - - if ( - price - and pd.notnull(row[f"sv_cgdr_deviation_{group_string}"]) - and thresholds.get(f"sv_cgdr_deviation_{group_string}").get(key) - ): - # not every combo will have pct change info so we need this check - p_std, *p_std_range = thresholds.get( - f"sv_cgdr_deviation_{group_string}" - ).get(key) - - p_lower, p_upper = p_std_range - if row[ - "sv_price_movement" - ] == "Away from mean" and not between_two_numbers( - row[f"sv_cgdr_deviation_{group_string}"], p_lower, p_upper - ): - value += " swing" - - return value + group_str = create_group_string(groups) + dup_mask = df.duplicated("pin", keep=False) + df.loc[dup_mask, "sv_sale_dup_counts"] = df.groupby("pin")["pin"].transform("count") + dev_col = f"sv_deviation_{group_str}_mean_price_abs" + df.loc[dup_mask, dev_col] = abs( + df.loc[dup_mask, f"sv_mean_price_{group_str}"] + - df.loc[dup_mask, "meta_sale_price"] + ) + df.loc[dup_mask, "sv_price_movement"] = ( + df.loc[dup_mask] + .sort_values("meta_sale_date") + .groupby("pin")[dev_col] + .transform( + lambda s: s.lt(s.shift()) + .map({True: "Towards mean", False: "Away from mean"}) + .fillna("First sale") + ) + ) + return df def create_stats(df: pd.DataFrame, groups: tuple, condos: bool) -> pd.DataFrame: - """ - Create all statistical outlier measures. - Inputs: - df (pd.DataFrame): Dataframe to create statistics from - groups (tuple): grouping for groupby. Usually 'township' and 'class' - Outputs: - df(pd.DataFrame): dataframe with statistical measures calculated. - """ - + """Runs all the statistical calculations on the DataFrame.""" if not condos: df = price_sqft(df) - - df = grouping_mean(df, groups, condos=condos) - + df = grouping_mean(df, groups, condos) if not condos: df = deviation_dollars(df, groups) - df = dup_stats(df, groups) df = transaction_days(df) df = percent_change(df) - - return df - - -def percent_change(df: pd.DataFrame) -> pd.DataFrame: - """ - Generates CGR for all records. Requires that transaction_days() has already been run. - Creates 'previous_price' column as intermediary to help calculate CGR. - Calculate the compound growth rate where the previous transaction is the - beginning value, the current price is the end value, and the number of periods - is the number of days since the last transaction. - This enables us to better compare percent change accross different time periods - as opposed to pandas pct_change() function which does not account for time period. - Helper for create_stats(). - - Dataframe is subset to work with a rolling window grouping. - - Inputs: - df (pd.DataFrame): datarame to create CGR on. - Outputs: - df (pd.DataFrame): dataframe with CGR statistic and previous_price column - """ - - original_df = df[df["original_observation"] == True].copy() - original_df["sv_previous_price"] = ( - original_df.sort_values("meta_sale_date") - .groupby(["pin"])["meta_sale_price"] - .shift(axis=0) - ) - original_df["sv_cgdr"] = ( - (original_df["meta_sale_price"] / original_df["sv_previous_price"]) - ** (1 / original_df["sv_days_since_last_transaction"]) - ) - 1 - - df = pd.merge( - df, - original_df[["sv_previous_price", "sv_cgdr"]], - left_index=True, - right_index=True, - how="left", - ) return df -def dup_stats(df: pd.DataFrame, groups: tuple) -> pd.DataFrame: - """ - Stats that can only be calculated for PINs occuring more than once, such as sale volatiltiy, - and growth rates. - Helper for create_stats(). - Inputs: - df (pd.DataFrame): dataframe with sales data - groups (tuple): for get_movement groups - Outputs:mean - df(pd.DataFrame): dataframe with sale counts and town_class movement columns. - """ - dups = df[df.pin.duplicated(keep=False)] - dups = get_sale_counts(dups) - dups = get_movement(dups, groups) - - df = pd.merge(df, dups, how="outer") - - return df - - -def price_sqft(df: pd.DataFrame) -> pd.DataFrame: +def pricing_info( + df: pd.DataFrame, permut: tuple, groups: tuple, condos: bool +) -> pd.DataFrame: """ - Creates price/sqft columns in DataFrame. Must contain 'sale_price', - 'sale_price_log10' and 'sqft' in the columns, where the first two names are - self explanatory and 'sqft' is the properties square footage. - Helper for create_stats(). - Inputs: - df (pd.DataFrame): pandas dataframe with required columns. - Outputs: - df (pd.DataFrame): pandas dataframe with _per_sqft columns. + Adds pricing deviation information (z-scores) and computes per-row lower/upper thresholds + for each deviation measure using vectorized operations. + Then applies functions to determine the pricing outlier type. """ - df["sv_price_per_sqft"] = df["meta_sale_price"] / df["char_bldg_sf"] - df["sv_price_per_sqft"].replace([np.inf, -np.inf], np.nan, inplace=True) - - return df + group_str = create_group_string(groups) + cols_to_log = ["meta_sale_price"] + ([] if condos else ["sv_price_per_sqft"]) + df = log_transform(df, cols_to_log) + # (Optional) Persist group-level statistics + df["group_mean"] = df.groupby(list(groups))["meta_sale_price"].transform("mean") + df["group_std"] = df.groupby(list(groups))["meta_sale_price"].transform("std") + if not condos: + df["group_sqft_mean"] = df.groupby(list(groups))["sv_price_per_sqft"].transform( + "mean" + ) + df["group_sqft_std"] = df.groupby(list(groups))["sv_price_per_sqft"].transform( + "std" + ) -def deviation_dollars(df: pd.DataFrame, groups: tuple) -> pd.DataFrame: - """ - Creates the deviation in dollars of this record from the mean - sale_price and price_per_sqft for the groupby groups. - Inputs: - df (pd.DataFrame): dataframe to create deviations on - groups (tuple): tuple of groups being grouped by - Outputs: - df (pd.DataFrame): dataframe with deviation columns - """ - group_string = create_group_string(groups, "_") - - df[f"sv_deviation_{group_string}_mean_price"] = ( - df["meta_sale_price"] - df[f"sv_mean_price_{group_string}"] - ) - df[f"sv_deviation_{group_string}_mean_price_per_sqft"] = ( - df["sv_price_per_sqft"] - df[f"sv_mean_price_per_sqft_{group_string}"] + # Compute deviation columns using z-normalization within groups + df[f"sv_price_deviation_{group_str}"] = df.groupby(list(groups))[ + "meta_sale_price" + ].apply(z_normalize_groupby) + if not condos: + df[f"sv_price_per_sqft_deviation_{group_str}"] = df.groupby(list(groups))[ + "sv_price_per_sqft" + ].apply(z_normalize_groupby) + df[f"sv_cgdr_deviation_{group_str}"] = df.groupby(list(groups))["sv_cgdr"].apply( + z_normalize_groupby ) + # Compute lower and upper thresholds (per row) for each deviation column + for col in [f"sv_price_deviation_{group_str}", f"sv_cgdr_deviation_{group_str}"]: + df[f"{col}_lower"] = df.groupby(list(groups))[col].transform("mean") - permut[ + 0 + ] * df.groupby(list(groups))[col].transform("std") + df[f"{col}_upper"] = df.groupby(list(groups))[col].transform("mean") + permut[ + 1 + ] * df.groupby(list(groups))[col].transform("std") + if not condos: + col = f"sv_price_per_sqft_deviation_{group_str}" + df[f"{col}_lower"] = df.groupby(list(groups))[col].transform("mean") - permut[ + 0 + ] * df.groupby(list(groups))[col].transform("std") + df[f"{col}_upper"] = df.groupby(list(groups))[col].transform("mean") + permut[ + 1 + ] * df.groupby(list(groups))[col].transform("std") + + # Apply outlier type functions that use the computed threshold columns + df["sv_pricing"] = df.apply(lambda row: price_column(row, groups, condos), axis=1) + if not condos: + df["sv_which_price"] = df.apply(lambda row: which_price(row, groups), axis=1) return df -def grouping_mean(df: pd.DataFrame, groups: tuple, condos: bool) -> pd.DataFrame: +def which_price(row: pd.Series, groups: tuple) -> str: """ - Gets sale_price mean by two groupings. Usually town + class. - Helper for create_stats(). - Inputs: - df (pd.DataFrame): dataframe with the grouping columns - groups (tuple): tuple (len == 2) where each element is a column name to be grouped by. - Outputs: - df (pd.DataFrame): dataframe with grouped by mean column + Determines which price measure (raw, per sqft, or both) is flagged as an outlier + by comparing the deviation values with their per-row thresholds. """ - group_string = create_group_string(groups, "_") + group_str = create_group_string(groups) + raw_val = row[f"sv_price_deviation_{group_str}"] + raw_lower = row[f"sv_price_deviation_{group_str}_lower"] + raw_upper = row[f"sv_price_deviation_{group_str}_upper"] + raw_out = not between_two_numbers(raw_val, raw_lower, raw_upper) - group_mean = df.groupby(list(groups))["meta_sale_price"].mean() - - if condos == True: - df.set_index(list(groups), inplace=True) - df[f"sv_mean_price_{group_string}"] = group_mean + sqft_val = row.get(f"sv_price_per_sqft_deviation_{group_str}") + if sqft_val is not None: + sqft_lower = row[f"sv_price_per_sqft_deviation_{group_str}_lower"] + sqft_upper = row[f"sv_price_per_sqft_deviation_{group_str}_upper"] + sqft_out = not between_two_numbers(sqft_val, sqft_lower, sqft_upper) else: - group_mean_sqft = df.groupby(list(groups))["sv_price_per_sqft"].mean() - df.set_index(list(groups), inplace=True) - df[f"sv_mean_price_{group_string}"] = group_mean - df[f"sv_mean_price_per_sqft_{group_string}"] = group_mean_sqft - - df.reset_index(inplace=True) - - return df - - -def get_sale_counts(dups: pd.DataFrame) -> pd.DataFrame: - """ - Calculates how many times transactions occured for a gieven property. - Helper for dup_stats() - Inputs: - df (pd.DataFrame): pandas dataframe - """ - v_counts = ( - dups.pin.value_counts() - .reset_index(name="sv_sale_dup_counts") - .rename(columns={"index": "pin"}) - ) - - dups = pd.merge(dups, v_counts) - - return dups - - -def get_movement(dups: pd.DataFrame, groups: tuple) -> pd.DataFrame: - """ - Creates a coloumn that determines whether the price movement of the records is - towards or away from the mean. - Helper for dup_stats(). - Inputs: - df (pd.DataFrame): duplicate records - groups (tuple): groupby groups - Outputs: - df (pd.DataFrame): duplicate records with new column - """ - group_string = create_group_string(groups, "_") - - dups[f"sv_deviation_{group_string}_mean_price_abs"] = abs( - dups[f"sv_mean_price_{group_string}"] - dups["meta_sale_price"] - ) - - temp = ( - dups.sort_values("meta_sale_date") - .groupby(["pin"])[f"sv_deviation_{group_string}_mean_price_abs"] - .shift() - ) - dups["sv_price_movement"] = ( - dups[f"sv_deviation_{group_string}_mean_price_abs"].lt(temp).astype(float) - ) - dups["sv_price_movement"] = np.select( - [(dups["sv_price_movement"] == 0), (dups["sv_price_movement"] == 1)], - ["Away from mean", "Towards mean"], - default="First sale", - ) - - return dups + sqft_out = False + + if raw_out and not sqft_out: + return "(raw)" + elif not raw_out and sqft_out: + return "(sqft)" + elif raw_out and sqft_out: + return "(raw & sqft)" + else: + return "Non-outlier" -def transaction_days(df: pd.DataFrame) -> pd.DataFrame: +def price_column(row: pd.Series, groups: tuple, condos: bool) -> str: """ - For each record, gets number of days since the last transaction. - Data frame is subset to work with a rolling window grouping. - - Inputs: - df (pd.DataFrame): DataFrame with a sale_date column in datetime - Outputs: - df (pd.DataFrame): DataFrame with new column + Determines whether the record is a high or low price outlier and, if applicable, + whether it exhibits a price swing. Comparisons are made by checking the record's + deviation against its per-row lower/upper threshold. """ + group_str = create_group_string(groups) + value = "Not price outlier" + price_flag = False + raw_val = row[f"sv_price_deviation_{group_str}"] + raw_lower = row[f"sv_price_deviation_{group_str}_lower"] + raw_upper = row[f"sv_price_deviation_{group_str}_upper"] - original_df = df[df["original_observation"] == True].copy() - original_df["sv_days_since_last_transaction"] = ( - original_df.sort_values("meta_sale_date") - .groupby("pin")["meta_sale_date"] - .diff() - .apply(lambda x: x.days) - ) - - df = pd.merge( - df, - original_df[["sv_days_since_last_transaction"]], - left_index=True, - right_index=True, - how="left", - ) - - return df + if condos: + if raw_val > raw_upper: + value = "High price" + price_flag = True + elif raw_val < raw_lower: + value = "Low price" + price_flag = True + if price_flag and pd.notnull(row.get(f"sv_cgdr_deviation_{group_str}")): + cgdr_val = row[f"sv_cgdr_deviation_{group_str}"] + cgdr_lower = row[f"sv_cgdr_deviation_{group_str}_lower"] + cgdr_upper = row[f"sv_cgdr_deviation_{group_str}_upper"] + if row["sv_price_movement"] == "Away from mean" and not between_two_numbers( + cgdr_val, cgdr_lower, cgdr_upper + ): + value += " swing" + else: + raw_out = raw_val > raw_upper or raw_val < raw_lower + sqft_val = row[f"sv_price_per_sqft_deviation_{group_str}"] + sqft_lower = row[f"sv_price_per_sqft_deviation_{group_str}_lower"] + sqft_upper = row[f"sv_price_per_sqft_deviation_{group_str}_upper"] + sqft_out = sqft_val > sqft_upper or sqft_val < sqft_lower + + if raw_out or sqft_out: + if raw_out: + value = "High price" if raw_val > raw_upper else "Low price" + elif sqft_out: + value = "High price" if sqft_val > sqft_upper else "Low price" + price_flag = True + + if price_flag and pd.notnull(row.get(f"sv_cgdr_deviation_{group_str}")): + cgdr_val = row[f"sv_cgdr_deviation_{group_str}"] + cgdr_lower = row[f"sv_cgdr_deviation_{group_str}_lower"] + cgdr_upper = row[f"sv_cgdr_deviation_{group_str}_upper"] + if row[ + "sv_price_movement" + ] == "Away from mean" and not between_two_numbers( + cgdr_val, cgdr_lower, cgdr_upper + ): + value += " swing" + return value def check_days(df: pd.DataFrame, threshold: int) -> pd.DataFrame: """ - Creates a label of whether or not the transaction - was only owned for a short term. - If owned for less than the threshold, is a short term owner. - Inputs: - df (pd.DataFrame): dataframe to have short term owners checked - threshold (int): the threshold fo being a short term owner - Oututs: - df (pd.DataFrame): datafrme with 'short_owner' column + Flags a transaction as a short-term ownership if the days since last transaction + are below the given threshold. """ - df["sv_short_owner"] = np.select( - [(df["sv_days_since_last_transaction"] < threshold)], - ["Short-term owner"], - default=f"Over {threshold} days", + df["sv_short_owner"] = np.where( + df["sv_days_since_last_transaction"] < threshold, + "Short-term owner", + f"Over {threshold} days", ) - return df -def get_thresh(df: pd.DataFrame, cols: list, permut: tuple, groups: tuple) -> dict: - """ - Creates a nested dictionary where the top level key is a column - and the 2nd-level key is a (township, class) combo. - Ex: stds['sale_price'][76, 203] - Needed in order to keep track of specific thresholds for each township/class combo. - Theoretically each std should be 1(because of z_normalization), but in practical terms - it is in a very very small range around 1, so using a uniform cutoff of 2 and -2 - loses us some precision. - - We also want to allow for some flexibility in how the thresholds are calculated; - and this function allows for more flexbility in the event of future changes. - Inputs: - df (pd.DataFrame): Dataframe to create dictionary from. - cols (list): list of columns to get standard deviations for. - permut (tuple): standard deviation range for lower_limit and upper_limit - First term is how many stndard deviations away on the left - Second term is how many standard deviations away on the right. - Outputs: - stds (dict): nested dictionary of std deviations for all columns - from DataFrame. - """ - stds = {} - - for col in cols: - df[col] = df[col].astype(float) - grouped = df.dropna(subset=list(groups) + [col]).groupby(list(groups))[col] - lower_limit = grouped.mean() - (grouped.std(ddof=0) * permut[0]) - upper_limit = grouped.mean() + (grouped.std(ddof=0) * permut[1]) - std = grouped.std(ddof=0) - lower_limit = lower_limit.to_dict() - upper_limit = upper_limit.to_dict() - std = std.to_dict() - - limits = { - x: (std.get(x, 0), lower_limit.get(x, 0), upper_limit.get(x, 0)) - for x in set(std).union(upper_limit, lower_limit) - } - stds[col] = limits - - return stds - - -def log_transform(df: pd.DataFrame, columns: list) -> pd.DataFrame: - """ - Apply log transformation on given column set. - Inputs: - df (pd.DataFrame): - columns (list): columns to be transformed - Outputs: - df (pd.DataFrame): dataframe with given columns replaced - by their logged values - """ - for col in columns: - df[col] = np.log10(df[col]) - - return df - - -def z_normalize_groupby(s: pd.Series): - """ - Function used to z_normalize groups of records. - Pandas stitches it back together into a complete column. - Meant for groupby.apply(). - Inputs: - s(pd.Series): grouped series from groupby.apply - Outputs: - z_normalized series grouped by class and township - that is then stitched into complete column by pandas - """ - - return zscore(s, nan_policy="omit") - - def outlier_type( df: pd.DataFrame, condos: bool, raw_price_threshold: int ) -> pd.DataFrame: """ - This function create indicator columns for each distinct outlier type between price - and characteristic outliers. These columns are prefixed with 'sv_ind_'. - - Inputs: - df (pd.DataFrame): Dataframe - Outputs: - df (pd.DataFrame): Dataframe with indicator columns for each flag type + Creates indicator columns for various outlier types based on both characteristic- + and pricing-based conditions. """ - char_conditions = [ df["sv_short_owner"] == "Short-term owner", df["sv_name_match"] != "No match", @@ -770,8 +347,6 @@ def outlier_type( df["sv_pricing"].str.contains("High price swing") | df["sv_pricing"].str.contains("Low price swing"), ] - - # Define labels for characteristic-based reasons char_labels = [ "sv_ind_char_short_term_owner", "sv_ind_char_family_sale", @@ -781,537 +356,336 @@ def outlier_type( ] if condos: - # Define conditions for price-based reasons price_conditions = [ df["sv_pricing"].str.contains("High"), df["sv_pricing"].str.contains("Low"), ] - - # Define labels for price-based reasons price_labels = [ "sv_ind_price_high_price", "sv_ind_price_low_price", ] - else: - # Define conditions for price-based reasons price_conditions = [ - ( - df["sv_pricing"].str.contains("High") - & (df["sv_which_price"].str.contains("raw")) - ), - ( - df["sv_pricing"].str.contains("Low") - & (df["sv_which_price"].str.contains("raw")) - ), - (df["sv_pricing"].str.contains("High")) - & (df["sv_which_price"].str.contains("sqft")), - (df["sv_pricing"].str.contains("Low")) - & (df["sv_which_price"].str.contains("sqft")), + df["sv_pricing"].str.contains("High") + & df["sv_which_price"].str.contains("raw"), + df["sv_pricing"].str.contains("Low") + & df["sv_which_price"].str.contains("raw"), + df["sv_pricing"].str.contains("High") + & df["sv_which_price"].str.contains("sqft"), + df["sv_pricing"].str.contains("Low") + & df["sv_which_price"].str.contains("sqft"), ] - - # Define labels for price-based reasons price_labels = [ "sv_ind_price_high_price", "sv_ind_price_low_price", "sv_ind_price_high_price_sqft", "sv_ind_price_low_price_sqft", ] - - # Implement raw threshold, unlog price + # Raw price threshold (comparing the unlogged value) price_conditions.append((10 ** df["meta_sale_price"]) > raw_price_threshold) price_labels.append("sv_ind_raw_price_threshold") - combined_conditions = price_conditions + char_conditions - combined_labels = price_labels + char_labels - - # Create indicator columns for each flag type - for label, condition in zip(combined_labels, combined_conditions): + for label, condition in zip( + price_labels + char_labels, price_conditions + char_conditions + ): df[label] = condition.astype(int) - return df -# STRING CLEANUP - -""" - An outline of our overall approach: - - Tries to create an identifier from the buyer/seller name. - Our appraoch is to try to identify if it is a legal identify of some sort, - such as a bank, construction company, trust, LLC, or other and - return the string as-is with some formatting applied if so. We also combine some - spellings/mispellings of big entities. - - If we can't identify the string as a legal entity we assume the string contains a person's name. - We then process these strings to determine if the person is a trustee, successor, - or a successor trustee from the fragements of the strings. - Once we do this, we determine the best place tosplit the string in split_logic(), - looking out for certain tokens. After we've determnined where to split - the string we send the tokens to name_selector, where we attempt to select - the last name of the string. - - We then create a column that tells us whether it's person, or a legal entity, - as per our identification method that we used in get_id(). - - Then we use the trustee, successor, or as successor trustee parts of - the string we constructed earlier to determine the role of the buyer - or seller in the transaction(trustee, successor, successor trustee). - - We then remove the trustee, successor, as successor trustee parts of the string - from buyer/seller id. - - Finally we create a transaction_type column that is just what kind of entity it is - with a dash between them. - - TODO: Process more string types: - - If a name contains 'and', we split the string on it and take - the token directly to the left. We could take a more sophisticated - approach to determine if the last name in this case. - - 'co-trustee' handling. - - Handle different name formats. Assume people use - but sometimes its or other such formats. - - Find trends in string cutoffs(some are cut off at 25, characters, others 25, etc) - that could help use better process strings that are cutoff. - - Cleanup/debug regex. This is a lot of dirty regex, and it is picking up - some names that we don't want, or not correctly identifying every case that we do want. - So it could use some work in some cases. +# ============================================================================= +# Isolation Forest & PCA Functions +# ============================================================================= +def pca_transform(df: pd.DataFrame, columns: list) -> pd.DataFrame: """ + Runs PCA on the specified columns (after filling NAs and infinities) + and returns the principal components with explained variance > 1. + """ + feed = df[columns].fillna(0).replace([np.inf, -np.inf], 0) + pca_model = PCA(n_components=len(feed.columns)) + pcs = pca_model.fit_transform(feed) + pc_df = pd.DataFrame( + pcs, columns=[f"PC{i}" for i in range(len(feed.columns))], index=df.index + ) + n_components = sum(pca_model.explained_variance_ > 1) + return pc_df.iloc[:, :n_components] -entity_keywords = ( - r"llc| ll$| l$|l l c|estate|training|construction|building|masonry|" - r"apartments|plumbing|service|professional|roofing|advanced|office|" - r"\blaw\b|\bloan\b|legal|production|woodwork|concepts|corp|company|" - r" united|\binc\b|county|entertainment|community|heating|cooling" - r"|partners|equity|indsutries|series|revitalization|collection|" - r"agency|renovation|consulting|flippers|estates|\bthe \b|dept|" - r"funding|opportunity|improvements|servicing|equities|\bsale\b|" - r"judicial| in$|bank|\btrust\b|holding|investment|housing" - r"|properties|limited|realty|development|capital|management" - r"|developers|construction|rentals|group|investments|invest|" - r"residences|enterprise|enterprises|ventures|remodeling|" - r"specialists|homes|business|venture|restoration|renovations" - r"|maintenance|ltd|real estate|builders|buyers|property|financial" - r"|associates|consultants|international|acquisitions|credit|design" - r"|homeownership|solutions|\bhome\b|diversified|assets|family|\bland\b" - r"|revocable|services|rehabbing|\bliving\b|county of cook|fannie mae" - r"|veteran|mortgage|savings|lp$|federal natl|hospital|southport|mtg" - r"|propert|rehab|neighborhood|advantage|chicago|cook c|\bbk\b|\bhud\b" - r"|department|united states|\busa\b|hsbc|midwest|residential|american" - r"|tcf|advantage|real e|advantage|fifth third|baptist church" - r"|apostolic church|lutheran church|catholic church|\bfed\b|nationstar" - r"|advantage|commercial|health|condominium|nationa|association|homeowner" - r"|christ church|christian church|baptist church|community church" - r"|church of c|\bdelaw\b|lawyer|delawar" -) - - -def get_id(row: pd.Series, col: str) -> str: +def iso_forest( + df: pd.DataFrame, + groups: tuple, + columns: list, + n_estimators: int = 1000, + max_samples=0.2, +) -> pd.DataFrame: + """ + Runs Isolation Forest on PCA-transformed features (with additional group labels) + to flag statistical anomalies. """ - Creates an ID from the buyer/seller name. + df.set_index("meta_sale_document_num", inplace=True) + pca_features = pca_transform(df, columns) - Returns string as-is if identified as legal entity. - Combined with other entities if its a common mispelling/cutoff. + label_encoders = {} + for group in groups: + if not pd.api.types.is_numeric_dtype(df[group]): + le = LabelEncoder() + df[group] = le.fit_transform(df[group]) + label_encoders[group] = le + pca_features[group] = df[group] - Attempts to identify last name if not a legal entity. + iso = IsolationForest( + n_estimators=n_estimators, + max_samples=max_samples, + bootstrap=True, + random_state=42, + ) + df["sv_anomaly"] = iso.fit_predict(pca_features) + df["sv_anomaly"] = np.where(df["sv_anomaly"] == -1, "Outlier", "Not Outlier") - Inputs: - row: from apply() - col (str): 'buyer' or 'seller' - Outputs: - id (str): string as-is if legal entity - identified last name if otherwise. - """ + for group, le in label_encoders.items(): + df[group] = le.inverse_transform(df[group]) + df.reset_index(inplace=True) + return df - column = col + "_name" - words = str(row[column]).lower() - # Check for missing values first - if pd.isnull(row[column]) or words in [ +# ============================================================================= +# String Processing Functions +# ============================================================================= +def get_id(row: pd.Series, col_prefix: str) -> str: + """ + Generates an identifier from the buyer/seller name. If the name appears to be + a legal entity (based on keywords, presence of digits, or certain suffixes), + returns the cleaned string; otherwise attempts to extract a last name. + """ + col = col_prefix + "_name" + name_str = str(row[col]).lower().strip() + if pd.isnull(name_str) or name_str in { "none", "nan", "unknown", "missing seller name", "missing buyer name", - ]: - id = "Empty Name" - return id - - words = re.sub(r" amp ", "", words) - words = re.sub(" +", " ", words) - - if words.isspace() or re.search(r"^[.]*$", words): - id = "Empty Name" - return id - - if any(x in words for x in ["vt investment corpor", "v t investment corp"]): - return "vt investment corporation" - - if any(x in words for x in ["national residential nomi"]): - return "national residential nominee services" - - if any( - x in words for x in ["first integrity group inc", "first integrity group in"] - ): - return "first integrity group inc" - - if words in ["deutsche bank national tr"]: - return "deutsche bank national trust company" - - if any( - x in words for x in ["cirrus investment group l", "cirrus investment group"] - ): - return "cirrus investment group" - - if any( - x in words - for x in [ - "fannie mae aka federal na", - "fannie mae a k a federal", - "federal national mortgage", - ] - ): - return "fannie mae" - - if any( - x in words - for x in [ - "the judicial sales corpor", - "judicial sales corp", - "judicial sales corporatio", - "judicial sale corp", - "the judicial sales corp", - ] - ): - return "the judicial sales corporation" - - if any(x in words for x in ["jpmorgan chase bank n a", "jpmorgan chase bank nati"]): - return "jp morgan chase bank" - - if any( - x in words - for x in [ - "wells fargo bank na", - "wells fargo bank n a", - "wells fargo bank nationa", - "wells fargo bank n a a", - "wells fargo bk", - ] - ): - return "wells fargo bank national" - - if any( - x in words for x in ["bayview loan servicing l", "bayview loan servicing ll"] - ): - return "bayview loan servicing llc" - - if any(x in words for x in ["thr property illinois l", "thr property illinois lp"]): - return "thr property illinois lp" - - if any(x in words for x in ["ih3 property illinois lp", "ih3 property illinois l"]): - return "ih3 property illinois lp" - - if any(x in words for x in ["ih2 property illinois lp", "ih2 property illinois l"]): - return "ih2 property illinois lp" - - if any( - x in words - for x in [ - "secretary of housing and", - "the secretary of housing", - "secretary of housing ", - ] - ): - return "secretary of housing and urban development" + }: + return "Empty Name" - if any( - x in words for x in ["secretary of veterans aff", "the secretary of veterans"] - ): - return "secretary of veterans affairs" - - if any( - x in words - for x in [ - "bank of america n a", - "bank of america na", - "bank of america national", - ] - ): - return "bank of america national" - - if any( - x in words - for x in [ - "us bank national association", - "u s bank national assoc", - "u s bank national associ", - "u s bank trust n a as", - "u s bank n a", - "us bank national associat", - "u s bank trust national", - "us bk", - "u s bk", - ] - ): - return "us bank national association" + name_str = re.sub(r" amp ", " ", name_str) + name_str = re.sub(r"\s+", " ", name_str).strip() + if not name_str or re.fullmatch(r"[.]*", name_str): + return "Empty Name" - words = re.sub( - "suc t$|as succ t$|successor tr$|successor tru$|" - "successor trus$|successor trust$|successor truste$|" - "successor trustee$|successor t$|as successor t$", + # Handle specific known cases + special_cases = { + "vt investment corpor": "vt investment corporation", + "v t investment corp": "vt investment corporation", + "national residential nomi": "national residential nominee services", + "first integrity group inc": "first integrity group inc", + "first integrity group in": "first integrity group inc", + "deutsche bank national tr": "deutsche bank national trust company", + "cirrus investment group l": "cirrus investment group", + "cirrus investment group": "cirrus investment group", + "fannie mae aka federal na": "fannie mae", + "fannie mae a k a federal": "fannie mae", + "federal national mortgage": "fannie mae", + "judicial sales corpor": "the judicial sales corporation", + "judicial sales corp": "the judicial sales corporation", + "judicial sales corporatio": "the judicial sales corporation", + "judicial sale corp": "the judicial sales corporation", + "the judicial sales corp": "the judicial sales corporation", + "jpmorgan chase bank n a": "jp morgan chase bank", + "jpmorgan chase bank nati": "jp morgan chase bank", + "wells fargo bank na": "wells fargo bank national", + "wells fargo bank n a": "wells fargo bank national", + "wells fargo bank nationa": "wells fargo bank national", + "wells fargo bank n a a": "wells fargo bank national", + "wells fargo bk": "wells fargo bank national", + "bayview loan servicing l": "bayview loan servicing llc", + "bayview loan servicing ll": "bayview loan servicing llc", + "thr property illinois l": "thr property illinois lp", + "thr property illinois lp": "thr property illinois lp", + "ih3 property illinois lp": "ih3 property illinois lp", + "ih3 property illinois l": "ih3 property illinois lp", + "ih2 property illinois lp": "ih2 property illinois lp", + "ih2 property illinois l": "ih2 property illinois lp", + "secretary of housing and": "secretary of housing and urban development", + "the secretary of housing": "secretary of housing and urban development", + "secretary of housing ": "secretary of housing and urban development", + "secretary of veterans aff": "secretary of veterans affairs", + "the secretary of veterans": "secretary of veterans affairs", + "bank of america n a": "bank of america national", + "bank of america na": "bank of america national", + "bank of america national": "bank of america national", + "us bank national association": "us bank national association", + "u s bank national assoc": "us bank national association", + "u s bank national associ": "us bank national association", + "u s bank trust n a as": "us bank national association", + "u s bank n a": "us bank national association", + "us bank national associat": "us bank national association", + "u s bank trust national": "us bank national association", + "us bk": "us bank national association", + "u s bk": "us bank national association", + } + for key, val in special_cases.items(): + if key in name_str: + return val + + # Normalize trustee/successor tokens + name_str = re.sub( + r"(suc t$|as succ t$|successor tr$|successor tru$|successor trus$|" + r"successor trust$|successor truste$|successor trustee$|successor t$|as successor t$)", "as successor trustee", - words, + name_str, ) - words = re.sub( - "as t$|as s t$|as sole t$|as tr$|as tru$|as trus$|as trust$|" - "as truste$|as trustee$|as trustee o$|as trustee of$|trustee of$|" - "trustee of$|tr$|tru$|trus$|truste$|trustee$|, t|, tr|, tru|, trus|" - ", trust|, truste", + name_str = re.sub( + r"(as t$|as s t$|as sole t$|as tr$|as tru$|as trus$|as trust$|as truste$|" + r"as trustee$|as trustee o$|as trustee of$|, t|, tr|, tru|, trus|, trust|, truste)", "as trustee", - words, + name_str, ) - words = re.sub( - "su$|suc$|succ$|succe$|succes$|success$|successo$|successor$|as s$|as su$|" - "as suc$|as succ$|as succe$|as sucess$|as successo$|, s$|, su$|, suc$|, succ$|" - ", succe$|, succes$|, success$|, successo$", + name_str = re.sub( + r"(su$|suc$|succ$|succe$|succes$|success$|successo$|successor$|as s$|as su$|" + r"as suc$|as succ$|as succe$|as sucess$|as successo$|, s$|, su$|, suc$|, succ$|, succe$|, succes$|, success$|, successo$)", "as successor", - words, + name_str, ) if ( - re.search(entity_keywords, words) - or re.search(r"\d{4}|\d{3}", words) - or re.search("as trustee$|as successor$|as successor trustee$", words) + ENTITY_KEYWORDS.search(name_str) + or re.search(r"\d{3,4}", name_str) + or re.search(r"as trustee$|as successor$|as successor trustee$", name_str) ): - id = words - return id + return name_str - words = re.sub( - " in$|indi$|indiv$|indivi$|indivi$|individ$|individu$|individua$|individual$" - "|not i$|not ind$| ind$| inde$|indep$|indepe$|indepen$|independ$|independe$" - "|independen$|independent$", + name_str = re.sub( + r"( in$|indi$|indiv$|indivi$|individ$|individu$|individua$|individual$|" + r"not i$|not ind$| ind$| inde$|indep$|indepe$|indepen$|independ$|independe$|independen$|independent$)", "", - words, + name_str, ) + tokens = split_logic(name_str) + return name_selector(tokens) - tokens = split_logic(words) - - id = name_selector(tokens) - return id - - -def split_logic(words: str): +def split_logic(name_str: str): """ - Given a cleaned string, determines where to split the string. - Splits on 'and', variations of FKA/NKA/KNA if present, on spaces if not. - Helper to get_id(). - Inputs: - words (str): cleaned str from get_id - Outputs: - 'Empty Name' if string is empty - tokens (list): list of tokens in string from split + Splits a cleaned string into tokens using keywords such as 'and' or common abbreviations. + Returns a list of tokens (or "Empty Name" if input is not valid). """ - words = re.sub(" +", " ", words) - - if words.isspace() or re.search(r"^[.]*$", words) or words == "Empty Name": + name_str = re.sub(r"\s+", " ", name_str).strip() + if not name_str or re.fullmatch(r"[.]*", name_str) or name_str == "Empty Name": return "Empty Name" - - words = re.sub(" as$| as $|as $", "", words) - - _and = re.search( - r"\b and\b|\b an$\b|\b a$\b|f k a|\bfka\b| n k a|\bnka\b|" - r"\b aka\b|a k a(?=\\s|$)|\b kna\b|k n a| f k$|n k$|a k$|\b not\b| married", - words, + name_str = re.sub(r"\s+as$|\s+as\s+$|as\s+$", "", name_str) + m = re.search( + r"\b and\b|\b an\b|\b a\b|f k a|\bfka\b| n k a|\bnka\b|\b aka\b|a k a(?=\s|$)|\b kna\b|k n a| f k$|n k$|a k$|\b not\b| married", + name_str, ) - - if _and: - tokens = words.split(_and.group()) - tokens = tokens[0].strip().split() - else: - tokens = words.split() - - return tokens + if m: + tokens = name_str.split(m.group()) + return tokens[0].strip().split() + return name_str.split() def name_selector(tokens) -> str: """ - Attempts to select the last name of a person's name based on the number of tokens. - Inputs: - tokens: list of strings where each string is a name token - Outputs: - 'Empty Name' if name is empty. - id (str): identified last name + Given a list of name tokens, returns the last token as an identifier, + ignoring common suffixes. """ - - suffixes = ["jr", "sr", "ii", "iii", "iv", "v"] - - if tokens == "Empty Name" or tokens == []: + suffixes = {"jr", "sr", "ii", "iii", "iv", "v"} + if tokens == "Empty Name" or not tokens: return "Empty Name" - - while tokens[-1] in suffixes: + while tokens and tokens[-1] in suffixes: tokens = tokens[:-1] - if not tokens: # Avoids IndexError if all tokens are removed. - return "Empty Name" - - id = tokens[-1] - - return id + return tokens[-1] if tokens else "Empty Name" -def get_category(row: pd.Series, col: str) -> str: +def get_category(row: pd.Series, col_prefix: str) -> str: """ - Gets category buyer/seller id. legal_entity if in entity keywords, - person if otherwise. - Inputs: - row: from pandas dataframe - col (str): column to process. 'buyer' or 'seller' - Outputs: - category (str): category of buyer/seller id + Determines whether the identifier belongs to a legal entity or a person. """ - - column = col + "_id" - words = row[column] - - if re.search(entity_keywords, words): - category = "legal_entity" - elif words == "Empty Name": - category = "none" + col = col_prefix + "_id" + name_str = row[col] + if ENTITY_KEYWORDS.search(name_str): + return "legal_entity" + elif name_str == "Empty Name": + return "none" else: - category = "person" - - return category + return "person" -def get_role(row: pd.Series, col: str) -> str: +def clean_id(row: pd.Series, col_prefix: str) -> str: """ - Picks the role th person is playing in the transaction off of the - buyer/seller_id. Meant for apply() - Ex: 'as trustee', or 'as successor' - Inputs: - row: from pandas dataframe - col (str): column to process. 'buyer' or 'seller' - Outputs: - roles(str): the role of the person n the transaction - + Cleans the identifier by removing role-related tokens and, if appropriate, + reselecting the name token. """ - role = None - column = col + "_id" - words = row[column] - - suc_trust = re.search(" as successor trustee", words) - suc = re.search(" as successor", words) - trust = re.search(" as trustee", words) - - if suc_trust: - role = suc_trust.group() - - if suc: - role = suc.group() - - if trust: - role = trust.group() - - return role + col = col_prefix + "_id" + name_str = row[col] + name_str = re.sub( + r" as successor trustee|\bas successor\b| as trustee", "", name_str + ) + name_str = re.sub(r"\s+as$|\s+as\s+$|as\s+$", "", name_str) + if not ( + ENTITY_KEYWORDS.search(name_str) + or re.search(r"\d{3,4}", name_str) + or len(name_str.split()) == 1 + ): + name_str = name_selector(split_logic(name_str)) + return name_str -def clean_id(row: pd.Series, col: str) -> str: +def get_role(row: pd.Series, col_prefix: str) -> str: """ - Cleans id field after get_role() by removing role. - Inputs: - row: from pandas dataframe - col (str): column to process. 'seller' or 'buyer' - Outputs: - words (str): seller/buyer id without role. + Extracts the role (e.g., 'as trustee', 'as successor') from the identifier. """ - - column = col + "_id" - words = row[column] - - words = re.sub(r" as successor trustee|\b as successor\b| as trustee", "", words) - words = re.sub(" as$| as $|as $", "", words) - - if not ( - re.search(entity_keywords, words) - or re.search(r"\d{4}|\d{3}", words) - or len(words.split()) == 1 - ): - words = name_selector(split_logic(words)) - - return words + col = col_prefix + "_id" + name_str = row[col] + for role_token in [" as successor trustee", " as successor", " as trustee"]: + m = re.search(role_token, name_str) + if m: + return m.group() + return None def create_judicial_flag(df: pd.DataFrame) -> pd.DataFrame: """ - Creates a column that contains 1 if sold from a judicial corp - and 0 otherwise. Mean for use with apply(). - Inputs: - df (pd.DataFrame): dataframe to create flag on - Outputs: - df (pd.DataFrame): dataframe with 'sv_is_judicial_sale' column + Creates a binary flag (as string '1' or '0') indicating whether the seller's + identifier corresponds to a judicial sales entity. """ - - df["sv_is_judicial_sale"] = np.select( - [ - (df["sv_seller_id"] == "the judicial sale corporation") - | (df["sv_seller_id"] == "intercounty judicial sale") - ], - ["1"], - default="0", + df["sv_is_judicial_sale"] = np.where( + df["sv_seller_id"].isin( + ["the judicial sale corporation", "intercounty judicial sale"] + ), + "1", + "0", ) - return df def create_name_match(row: pd.Series) -> str: """ - Creates a column that contains the actual string that was matched. - Meant for apply(). - Inputs: - row: from pandas dataframe - Outputs: - value (str or None): string match if applicable, None otherwise + If the buyer and seller identifiers match (and are nontrivial and not both legal entities), + returns the match; otherwise returns "No match". """ if ( row["sv_buyer_id"] == row["sv_seller_id"] and row["sv_buyer_id"] != "Empty Name" - # Prevents the same legal entity as counting as a family name match and row["sv_transaction_type"] != "legal_entity-legal_entity" - # Boots out matches on a single last initial and len(row["sv_buyer_id"]) > 1 ): - value = row["sv_seller_id"] - else: - value = "No match" - - return value + return row["sv_seller_id"] + return "No match" def string_processing(df: pd.DataFrame) -> pd.DataFrame: """ - Brings together all of the apply functions for string processing. - Results in 7 additional columns. - ID, category, and role for buyer and seller. As well as transaction category type - for each record. - Inputs: - df (pd.dataFrame): dataframe with buyer/seller id columns. - Ouputs: - df(pd.DataFrame): dataframe with 7 new columns from apply functions + Processes buyer and seller name strings to generate identifiers, categories, + roles, and transaction type. """ - df.meta_sale_buyer_name = df.meta_sale_buyer_name.str.encode( - "ascii", "ignore" - ).str.decode("ascii") - df.meta_sale_seller_name = df.meta_sale_seller_name.str.encode( - "ascii", "ignore" - ).str.decode("ascii") - df.meta_sale_buyer_name = df.meta_sale_buyer_name.str.replace( - r"[^a-zA-Z0-9\-]", " ", regex=True - ).str.strip() - df.meta_sale_seller_name = df.meta_sale_seller_name.str.replace( - r"[^a-zA-Z0-9\-]", " ", regex=True - ).str.strip() - + for col in ["meta_sale_buyer_name", "meta_sale_seller_name"]: + df[col] = ( + df[col] + .str.encode("ascii", "ignore") + .str.decode("ascii") + .str.replace(r"[^a-zA-Z0-9\-]", " ", regex=True) + .str.strip() + ) df["sv_buyer_id"] = df.apply(get_id, args=("meta_sale_buyer",), axis=1) df["sv_seller_id"] = df.apply(get_id, args=("meta_sale_seller",), axis=1) df["sv_buyer_category"] = df.apply(get_category, args=("sv_buyer",), axis=1) @@ -1319,8 +693,40 @@ def string_processing(df: pd.DataFrame) -> pd.DataFrame: df["sv_buyer_id"] = df.apply(clean_id, args=("sv_buyer",), axis=1) df["sv_seller_id"] = df.apply(clean_id, args=("sv_seller",), axis=1) df["sv_transaction_type"] = df["sv_buyer_category"] + "-" + df["sv_seller_category"] - df = create_judicial_flag(df) df["sv_name_match"] = df.apply(create_name_match, axis=1) + return df + +# ============================================================================= +# Main Pipeline +# ============================================================================= +def go( + df: pd.DataFrame, + groups: tuple, + iso_forest_cols: list, + dev_bounds: tuple, + condos: bool, + raw_price_threshold: int, +) -> pd.DataFrame: + """ + Runs the entire processing pipeline: + 1. Statistical measures & outlier preparations. + 2. String processing. + 3. Isolation Forest anomaly detection. + 4. Outlier taxonomy assignment. + """ + model_type = "condos" if condos else "residential" + print(f"Flagging for {model_type}") + print("Initializing statistics...") + df = create_stats(df, groups, condos=condos) + print("Statistics complete. Processing strings...") + df = string_processing(df) + print("String processing complete. Running isolation forest...") + df = iso_forest(df, groups, iso_forest_cols) + print("Isolation forest complete. Assigning outlier taxonomy...") + df = check_days(df, SHORT_TERM_OWNER_THRESHOLD) + df = pricing_info(df, dev_bounds, groups, condos=condos) + df = outlier_type(df, condos=condos, raw_price_threshold=raw_price_threshold) + print("Processing finished.") return df From ff7b60e8757d1c5452b968501056644a778b20fd Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Thu, 6 Feb 2025 21:35:18 +0000 Subject: [PATCH 2/6] Refactor standard deviation price functionality --- glue/flagging_script_glue/flagging.py | 1333 +++++++++++++++++-------- 1 file changed, 918 insertions(+), 415 deletions(-) diff --git a/glue/flagging_script_glue/flagging.py b/glue/flagging_script_glue/flagging.py index 77d8e375..9bdffc77 100644 --- a/glue/flagging_script_glue/flagging.py +++ b/glue/flagging_script_glue/flagging.py @@ -12,167 +12,173 @@ from sklearn.preprocessing import LabelEncoder from sklearn.decomposition import PCA -# Constants -SHORT_TERM_OWNER_THRESHOLD = 365 # days +SHORT_TERM_OWNER_THRESHOLD = 365 # 365 = 365 days or 1 year -# Compile entity keywords regex for performance -ENTITY_KEYWORDS = re.compile( - r"llc| ll$| l$|l l c|estate|training|construction|building|masonry|" - r"apartments|plumbing|service|professional|roofing|advanced|office|" - r"\blaw\b|\bloan\b|legal|production|woodwork|concepts|corp|company|" - r" united|\binc\b|county|entertainment|community|heating|cooling" - r"|partners|equity|indsutries|series|revitalization|collection|" - r"agency|renovation|consulting|flippers|estates|\bthe \b|dept|" - r"funding|opportunity|improvements|servicing|equities|\bsale\b|" - r"judicial| in$|bank|\btrust\b|holding|investment|housing" - r"|properties|limited|realty|development|capital|management" - r"|developers|construction|rentals|group|investments|invest|" - r"residences|enterprise|enterprises|ventures|remodeling|" - r"specialists|homes|business|venture|restoration|renovations" - r"|maintenance|ltd|real estate|builders|buyers|property|financial" - r"|associates|consultants|international|acquisitions|credit|design" - r"|homeownership|solutions|\bhome\b|diversified|assets|family|\bland\b" - r"|revocable|services|rehabbing|\bliving\b|county of cook|fannie mae" - r"|veteran|mortgage|savings|lp$|federal natl|hospital|southport|mtg" - r"|propert|rehab|neighborhood|advantage|chicago|cook c|\bbk\b|\bhud\b" - r"|department|united states|\busa\b|hsbc|midwest|residential|american" - r"|tcf|advantage|real e|advantage|fifth third|baptist church" - r"|apostolic church|lutheran church|catholic church|\bfed\b|nationstar" - r"|advantage|commercial|health|condominium|nationa|association|homeowner" - r"|christ church|christian church|baptist church|community church" - r"|church of c|\bdelaw\b|lawyer|delawar", - re.IGNORECASE, -) +def go( + df: pd.DataFrame, + groups: tuple, + iso_forest_cols: list, + dev_bounds: tuple, + condos: bool, + raw_price_threshold: int, +): + """ + This function runs all of our other functions in the correct sequence. + + Inputs: + df (pandas dataframe): data used to perform the outlier calculation + groups (tuple): which groups to groupby when selecting outliers. + Ex: ('township','class','year') + iso_forest (list): list with columns to run PCA/IsoForest on + dev_bounds (tuple): how many std deviations on either side to select as outliers. + Ex: (2,2) selects outliers as being farther away than 2 + std deviations on both sides. + condos (boolean): determines whether we are running the flagging model for res or condos + Outputs: + df (pandas dataframe): + """ -# ============================================================================= -# Utility Functions -# ============================================================================= -def create_group_string(groups: tuple, sep: str = "_") -> str: - """Joins group names with a separator to create a string for column naming.""" - return sep.join(groups) + if condos: + print("Flagging for condos") + else: + print("Flagging for residential") + print("Initialize") + df = create_stats(df, groups, condos=condos) # 'year', 'township_code', 'class' + print("create_stats() done") + df = string_processing(df) + print("string_processing() done") + df = iso_forest(df, groups, iso_forest_cols) + print("iso_forest() done") + df = outlier_taxonomy( + df, dev_bounds, groups, condos=condos, raw_price_threshold=raw_price_threshold + ) + print("outlier_taxonomy() done\nfinished") -def log_transform(df: pd.DataFrame, columns: list) -> pd.DataFrame: - """Applies base-10 log transformation to the specified columns.""" - for col in columns: - df[col] = np.log10(df[col]) return df -def z_normalize_groupby(s: pd.Series) -> pd.Series: - """Returns the z-score normalization for a series (used with groupby.apply).""" - return zscore(s, nan_policy="omit") +def create_group_string(groups: tuple, sep: str) -> str: + """ + Creates a string joined on a separator from the groups tuple. + For the purpose of making column names and descriptions. + Inputs: + groups (tuple): the columns being used in groupby() + sep (str): string to separate the groups with. + Outputs: + groups as a string joined by given separator + """ + return sep.join(groups) -def between_two_numbers(num: float, a: float, b: float) -> bool: - """Checks if num is strictly between a and b.""" - return a < num < b +def outlier_taxonomy( + df: pd.DataFrame, + permut: tuple, + groups: tuple, + condos: bool, + raw_price_threshold: int, +): + """ + Creates columns having to do with our chosen outlier taxonomy. + Ex: Family sale, Home flip sale, Non-person sale, High price (raw and or sqft), etc. + Inputs: + df (pd.DataFrame): dataframe to create taxonomy on. + permut (tuple): permutation of std deviations + groups (tuple): columns to do grouping on. + Probably 'township' and 'class'. + Ouputs: + df (pd.DataFrame): dataframe with outlier taxonomy + """ + df = check_days(df, SHORT_TERM_OWNER_THRESHOLD) + df = pricing_info(df, permut, groups, condos=condos) + df = outlier_type(df, condos=condos, raw_price_threshold=raw_price_threshold) -# ============================================================================= -# Statistical & Pricing Functions -# ============================================================================= -def grouping_mean(df: pd.DataFrame, groups: tuple, condos: bool) -> pd.DataFrame: - """Computes the mean sale price (and price per sqft for non-condos) using transform.""" - group_str = create_group_string(groups) - df[f"sv_mean_price_{group_str}"] = df.groupby(list(groups))[ - "meta_sale_price" - ].transform("mean") - if not condos: - df[f"sv_mean_price_per_sqft_{group_str}"] = df.groupby(list(groups))[ - "sv_price_per_sqft" - ].transform("mean") return df -def deviation_dollars(df: pd.DataFrame, groups: tuple) -> pd.DataFrame: - """Calculates deviations (in dollars) from group means.""" - group_str = create_group_string(groups) - df[f"sv_deviation_{group_str}_mean_price"] = ( - df["meta_sale_price"] - df[f"sv_mean_price_{group_str}"] - ) - if f"sv_mean_price_per_sqft_{group_str}" in df.columns: - df[f"sv_deviation_{group_str}_mean_price_per_sqft"] = ( - df["sv_price_per_sqft"] - df[f"sv_mean_price_per_sqft_{group_str}"] - ) - return df +def iso_forest(df, groups, columns, n_estimators=1000, max_samples=0.2): + """ + Runs an isolation forest model on our data for outlier detection. + First does PCA, then, attaches township/class info, and then runs the + IsoForest model with given parameters. + Inputs: + df (pd.DataFrame): dataframe with data for IsoForest + groups (tuple): grouping for the data to input into the IsoForest + columns (list): list with columns to run PCA/IsoForest on + n_estimators (int): number of estimators in IsoForest + max_samples(int or float): share of data to use as sample if float, + number to use if int + Outputs: + df (pd.DataFrame): with 'sv_anomaly' column from IsoForest. + """ + # Set index + df.set_index("meta_sale_document_num", inplace=True) + # Perform PCA (assuming pca is a predefined function) + feed = pca(df, columns) -def price_sqft(df: pd.DataFrame) -> pd.DataFrame: - """Calculates the price per square foot.""" - df["sv_price_per_sqft"] = df["meta_sale_price"] / df["char_bldg_sf"] - df["sv_price_per_sqft"].replace([np.inf, -np.inf], np.nan, inplace=True) - return df + feed.index = df.index + # Label encode non-numeric groups + label_encoders = {} + for group in groups: + if df[group].dtype not in ["int64", "float64", "int32", "float32"]: + le = LabelEncoder() + df[group] = le.fit_transform(df[group]) + label_encoders[group] = le # Store the encoder if needed later + feed[group] = df[group] -def transaction_days(df: pd.DataFrame) -> pd.DataFrame: - """ - Calculates the days elapsed since the last transaction. - Assumes that 'meta_sale_date' is datetime. - """ - mask = df["original_observation"] == True - df.loc[mask, "sv_days_since_last_transaction"] = ( - df.sort_values("meta_sale_date") - .groupby("pin")["meta_sale_date"] - .transform(lambda x: x.diff().dt.days) + # Initialize and fit the Isolation Forest + isof = IsolationForest( + n_estimators=n_estimators, + max_samples=max_samples, + bootstrap=True, + random_state=42, ) - return df + df["sv_anomaly"] = isof.fit_predict(feed) + # Assign labels for anomalies + df["sv_anomaly"] = np.select( + [(df["sv_anomaly"] == -1), (df["sv_anomaly"] == 1)], + ["Outlier", "Not Outlier"], + default="Not Outlier", + ) + + # Restore original values for encoded columns + for group, le in label_encoders.items(): + df[group] = le.inverse_transform(df[group]) + + # Reset index + df.reset_index(inplace=True) -def percent_change(df: pd.DataFrame) -> pd.DataFrame: - """ - Calculates the compound growth rate (CGR) using the previous sale price and - the days between transactions. Only applied to original observations. - """ - mask = df["original_observation"] == True - sorted_df = df.sort_values("meta_sale_date") - df.loc[mask, "sv_previous_price"] = sorted_df.groupby("pin")[ - "meta_sale_price" - ].transform(lambda x: x.shift()) - with np.errstate(divide="ignore", invalid="ignore"): - df.loc[mask, "sv_cgdr"] = ( - df.loc[mask, "meta_sale_price"] / df.loc[mask, "sv_previous_price"] - ) ** (1 / df.loc[mask, "sv_days_since_last_transaction"]) - 1 return df -def dup_stats(df: pd.DataFrame, groups: tuple) -> pd.DataFrame: +def pca(df: pd.DataFrame, columns: list) -> pd.DataFrame: """ - For properties with multiple transactions, calculates duplicate sale counts and - the direction of price movement relative to the group mean. + Runs PCA on data, selects compoents where explained variance > 1. + Inputs: + df (pd.DataFrame): dataframe to run PCA on. + columns (list): columns of dataframe to run PCA on. + Outputs: + df (pd.DataFrame): dataframe of principal components """ - group_str = create_group_string(groups) - dup_mask = df.duplicated("pin", keep=False) - df.loc[dup_mask, "sv_sale_dup_counts"] = df.groupby("pin")["pin"].transform("count") - dev_col = f"sv_deviation_{group_str}_mean_price_abs" - df.loc[dup_mask, dev_col] = abs( - df.loc[dup_mask, f"sv_mean_price_{group_str}"] - - df.loc[dup_mask, "meta_sale_price"] - ) - df.loc[dup_mask, "sv_price_movement"] = ( - df.loc[dup_mask] - .sort_values("meta_sale_date") - .groupby("pin")[dev_col] - .transform( - lambda s: s.lt(s.shift()) - .map({True: "Towards mean", False: "Away from mean"}) - .fillna("First sale") - ) - ) - return df + feed_data = df[columns] + feed_data = feed_data.fillna(0) + feed_data = feed_data.replace([np.inf, -np.inf], 0) + pca = PCA(n_components=len(feed_data.columns)) + pc = pca.fit_transform(feed_data) + + cols = ["PC" + str(num) for num in range(len(feed_data.columns))] + + pc_df = pd.DataFrame(data=pc, columns=cols) + take = len(pca.explained_variance_[pca.explained_variance_ > 1]) + + df = pc_df[pc_df.columns[:take]] -def create_stats(df: pd.DataFrame, groups: tuple, condos: bool) -> pd.DataFrame: - """Runs all the statistical calculations on the DataFrame.""" - if not condos: - df = price_sqft(df) - df = grouping_mean(df, groups, condos) - if not condos: - df = deviation_dollars(df, groups) - df = dup_stats(df, groups) - df = transaction_days(df) - df = percent_change(df) return df @@ -180,15 +186,16 @@ def pricing_info( df: pd.DataFrame, permut: tuple, groups: tuple, condos: bool ) -> pd.DataFrame: """ - Adds pricing deviation information (z-scores) and computes per-row lower/upper thresholds - for each deviation measure using vectorized operations. - Then applies functions to determine the pricing outlier type. + Computes pricing deviations and, using a vectorized approach, computes per-row + lower/upper thresholds based on group means and standard deviations. Then, + determines pricing outlier type. """ - group_str = create_group_string(groups) + group_str = create_group_string(groups, "_") + # Log-transform the columns (price and, if applicable, price per sqft) cols_to_log = ["meta_sale_price"] + ([] if condos else ["sv_price_per_sqft"]) df = log_transform(df, cols_to_log) - # (Optional) Persist group-level statistics + # Persist group-level statistics (for raw price and price per sqft) df["group_mean"] = df.groupby(list(groups))["meta_sale_price"].transform("mean") df["group_std"] = df.groupby(list(groups))["meta_sale_price"].transform("std") if not condos: @@ -199,7 +206,7 @@ def pricing_info( "std" ) - # Compute deviation columns using z-normalization within groups + # Compute z-score deviation columns within groups df[f"sv_price_deviation_{group_str}"] = df.groupby(list(groups))[ "meta_sale_price" ].apply(z_normalize_groupby) @@ -211,7 +218,7 @@ def pricing_info( z_normalize_groupby ) - # Compute lower and upper thresholds (per row) for each deviation column + # Vectorized per-row lower and upper thresholds (mean ± std * multiplier) for col in [f"sv_price_deviation_{group_str}", f"sv_cgdr_deviation_{group_str}"]: df[f"{col}_lower"] = df.groupby(list(groups))[col].transform("mean") - permut[ 0 @@ -228,7 +235,7 @@ def pricing_info( 1 ] * df.groupby(list(groups))[col].transform("std") - # Apply outlier type functions that use the computed threshold columns + # Determine pricing outlier type and (if applicable) which price measure is flagged df["sv_pricing"] = df.apply(lambda row: price_column(row, groups, condos), axis=1) if not condos: df["sv_which_price"] = df.apply(lambda row: which_price(row, groups), axis=1) @@ -238,16 +245,16 @@ def pricing_info( def which_price(row: pd.Series, groups: tuple) -> str: """ Determines which price measure (raw, per sqft, or both) is flagged as an outlier - by comparing the deviation values with their per-row thresholds. + by comparing deviation values with per-row thresholds. """ - group_str = create_group_string(groups) + group_str = create_group_string(groups, "_") raw_val = row[f"sv_price_deviation_{group_str}"] raw_lower = row[f"sv_price_deviation_{group_str}_lower"] raw_upper = row[f"sv_price_deviation_{group_str}_upper"] raw_out = not between_two_numbers(raw_val, raw_lower, raw_upper) - sqft_val = row.get(f"sv_price_per_sqft_deviation_{group_str}") - if sqft_val is not None: + if f"sv_price_per_sqft_deviation_{group_str}" in row: + sqft_val = row[f"sv_price_per_sqft_deviation_{group_str}"] sqft_lower = row[f"sv_price_per_sqft_deviation_{group_str}_lower"] sqft_upper = row[f"sv_price_per_sqft_deviation_{group_str}_upper"] sqft_out = not between_two_numbers(sqft_val, sqft_lower, sqft_upper) @@ -264,13 +271,17 @@ def which_price(row: pd.Series, groups: tuple) -> str: return "Non-outlier" +def between_two_numbers(num: int or float, a: int or float, b: int or float) -> bool: + return a < num < b + + def price_column(row: pd.Series, groups: tuple, condos: bool) -> str: """ Determines whether the record is a high or low price outlier and, if applicable, whether it exhibits a price swing. Comparisons are made by checking the record's deviation against its per-row lower/upper threshold. """ - group_str = create_group_string(groups) + group_str = create_group_string(groups, "_") value = "Not price outlier" price_flag = False raw_val = row[f"sv_price_deviation_{group_str}"] @@ -319,26 +330,347 @@ def price_column(row: pd.Series, groups: tuple, condos: bool) -> str: return value +def create_stats(df: pd.DataFrame, groups: tuple, condos: bool) -> pd.DataFrame: + """ + Create all statistical outlier measures. + Inputs: + df (pd.DataFrame): Dataframe to create statistics from + groups (tuple): grouping for groupby. Usually 'township' and 'class' + Outputs: + df(pd.DataFrame): dataframe with statistical measures calculated. + """ + + if not condos: + df = price_sqft(df) + + df = grouping_mean(df, groups, condos=condos) + + if not condos: + df = deviation_dollars(df, groups) + + df = dup_stats(df, groups) + df = transaction_days(df) + df = percent_change(df) + + return df + + +def percent_change(df: pd.DataFrame) -> pd.DataFrame: + """ + Generates CGR for all records. Requires that transaction_days() has already been run. + Creates 'previous_price' column as intermediary to help calculate CGR. + Calculate the compound growth rate where the previous transaction is the + beginning value, the current price is the end value, and the number of periods + is the number of days since the last transaction. + This enables us to better compare percent change accross different time periods + as opposed to pandas pct_change() function which does not account for time period. + Helper for create_stats(). + + Dataframe is subset to work with a rolling window grouping. + + Inputs: + df (pd.DataFrame): datarame to create CGR on. + Outputs: + df (pd.DataFrame): dataframe with CGR statistic and previous_price column + """ + + original_df = df[df["original_observation"] == True].copy() + original_df["sv_previous_price"] = ( + original_df.sort_values("meta_sale_date") + .groupby(["pin"])["meta_sale_price"] + .shift(axis=0) + ) + original_df["sv_cgdr"] = ( + (original_df["meta_sale_price"] / original_df["sv_previous_price"]) + ** (1 / original_df["sv_days_since_last_transaction"]) + ) - 1 + + df = pd.merge( + df, + original_df[["sv_previous_price", "sv_cgdr"]], + left_index=True, + right_index=True, + how="left", + ) + return df + + +def dup_stats(df: pd.DataFrame, groups: tuple) -> pd.DataFrame: + """ + Stats that can only be calculated for PINs occuring more than once, such as sale volatiltiy, + and growth rates. + Helper for create_stats(). + Inputs: + df (pd.DataFrame): dataframe with sales data + groups (tuple): for get_movement groups + Outputs:mean + df(pd.DataFrame): dataframe with sale counts and town_class movement columns. + """ + dups = df[df.pin.duplicated(keep=False)] + dups = get_sale_counts(dups) + dups = get_movement(dups, groups) + + df = pd.merge(df, dups, how="outer") + + return df + + +def price_sqft(df: pd.DataFrame) -> pd.DataFrame: + """ + Creates price/sqft columns in DataFrame. Must contain 'sale_price', + 'sale_price_log10' and 'sqft' in the columns, where the first two names are + self explanatory and 'sqft' is the properties square footage. + Helper for create_stats(). + Inputs: + df (pd.DataFrame): pandas dataframe with required columns. + Outputs: + df (pd.DataFrame): pandas dataframe with _per_sqft columns. + """ + df["sv_price_per_sqft"] = df["meta_sale_price"] / df["char_bldg_sf"] + df["sv_price_per_sqft"].replace([np.inf, -np.inf], np.nan, inplace=True) + + return df + + +def deviation_dollars(df: pd.DataFrame, groups: tuple) -> pd.DataFrame: + """ + Creates the deviation in dollars of this record from the mean + sale_price and price_per_sqft for the groupby groups. + Inputs: + df (pd.DataFrame): dataframe to create deviations on + groups (tuple): tuple of groups being grouped by + Outputs: + df (pd.DataFrame): dataframe with deviation columns + """ + group_string = create_group_string(groups, "_") + + df[f"sv_deviation_{group_string}_mean_price"] = ( + df["meta_sale_price"] - df[f"sv_mean_price_{group_string}"] + ) + df[f"sv_deviation_{group_string}_mean_price_per_sqft"] = ( + df["sv_price_per_sqft"] - df[f"sv_mean_price_per_sqft_{group_string}"] + ) + + return df + + +def grouping_mean(df: pd.DataFrame, groups: tuple, condos: bool) -> pd.DataFrame: + """ + Gets sale_price mean by two groupings. Usually town + class. + Helper for create_stats(). + Inputs: + df (pd.DataFrame): dataframe with the grouping columns + groups (tuple): tuple (len == 2) where each element is a column name to be grouped by. + Outputs: + df (pd.DataFrame): dataframe with grouped by mean column + """ + group_string = create_group_string(groups, "_") + + group_mean = df.groupby(list(groups))["meta_sale_price"].mean() + + if condos == True: + df.set_index(list(groups), inplace=True) + df[f"sv_mean_price_{group_string}"] = group_mean + else: + group_mean_sqft = df.groupby(list(groups))["sv_price_per_sqft"].mean() + df.set_index(list(groups), inplace=True) + df[f"sv_mean_price_{group_string}"] = group_mean + df[f"sv_mean_price_per_sqft_{group_string}"] = group_mean_sqft + + df.reset_index(inplace=True) + + return df + + +def get_sale_counts(dups: pd.DataFrame) -> pd.DataFrame: + """ + Calculates how many times transactions occured for a gieven property. + Helper for dup_stats() + Inputs: + df (pd.DataFrame): pandas dataframe + """ + v_counts = ( + dups.pin.value_counts() + .reset_index(name="sv_sale_dup_counts") + .rename(columns={"index": "pin"}) + ) + + dups = pd.merge(dups, v_counts) + + return dups + + +def get_movement(dups: pd.DataFrame, groups: tuple) -> pd.DataFrame: + """ + Creates a coloumn that determines whether the price movement of the records is + towards or away from the mean. + Helper for dup_stats(). + Inputs: + df (pd.DataFrame): duplicate records + groups (tuple): groupby groups + Outputs: + df (pd.DataFrame): duplicate records with new column + """ + group_string = create_group_string(groups, "_") + + dups[f"sv_deviation_{group_string}_mean_price_abs"] = abs( + dups[f"sv_mean_price_{group_string}"] - dups["meta_sale_price"] + ) + + temp = ( + dups.sort_values("meta_sale_date") + .groupby(["pin"])[f"sv_deviation_{group_string}_mean_price_abs"] + .shift() + ) + dups["sv_price_movement"] = ( + dups[f"sv_deviation_{group_string}_mean_price_abs"].lt(temp).astype(float) + ) + dups["sv_price_movement"] = np.select( + [(dups["sv_price_movement"] == 0), (dups["sv_price_movement"] == 1)], + ["Away from mean", "Towards mean"], + default="First sale", + ) + + return dups + + +def transaction_days(df: pd.DataFrame) -> pd.DataFrame: + """ + For each record, gets number of days since the last transaction. + Data frame is subset to work with a rolling window grouping. + + Inputs: + df (pd.DataFrame): DataFrame with a sale_date column in datetime + Outputs: + df (pd.DataFrame): DataFrame with new column + """ + + original_df = df[df["original_observation"] == True].copy() + original_df["sv_days_since_last_transaction"] = ( + original_df.sort_values("meta_sale_date") + .groupby("pin")["meta_sale_date"] + .diff() + .apply(lambda x: x.days) + ) + + df = pd.merge( + df, + original_df[["sv_days_since_last_transaction"]], + left_index=True, + right_index=True, + how="left", + ) + + return df + + def check_days(df: pd.DataFrame, threshold: int) -> pd.DataFrame: """ - Flags a transaction as a short-term ownership if the days since last transaction - are below the given threshold. + Creates a label of whether or not the transaction + was only owned for a short term. + If owned for less than the threshold, is a short term owner. + Inputs: + df (pd.DataFrame): dataframe to have short term owners checked + threshold (int): the threshold fo being a short term owner + Oututs: + df (pd.DataFrame): datafrme with 'short_owner' column """ - df["sv_short_owner"] = np.where( - df["sv_days_since_last_transaction"] < threshold, - "Short-term owner", - f"Over {threshold} days", + df["sv_short_owner"] = np.select( + [(df["sv_days_since_last_transaction"] < threshold)], + ["Short-term owner"], + default=f"Over {threshold} days", ) + + return df + + +def get_thresh(df: pd.DataFrame, cols: list, permut: tuple, groups: tuple) -> dict: + """ + Creates a nested dictionary where the top level key is a column + and the 2nd-level key is a (township, class) combo. + Ex: stds['sale_price'][76, 203] + Needed in order to keep track of specific thresholds for each township/class combo. + Theoretically each std should be 1(because of z_normalization), but in practical terms + it is in a very very small range around 1, so using a uniform cutoff of 2 and -2 + loses us some precision. + + We also want to allow for some flexibility in how the thresholds are calculated; + and this function allows for more flexbility in the event of future changes. + Inputs: + df (pd.DataFrame): Dataframe to create dictionary from. + cols (list): list of columns to get standard deviations for. + permut (tuple): standard deviation range for lower_limit and upper_limit + First term is how many stndard deviations away on the left + Second term is how many standard deviations away on the right. + Outputs: + stds (dict): nested dictionary of std deviations for all columns + from DataFrame. + """ + stds = {} + + for col in cols: + df[col] = df[col].astype(float) + grouped = df.dropna(subset=list(groups) + [col]).groupby(list(groups))[col] + lower_limit = grouped.mean() - (grouped.std(ddof=0) * permut[0]) + upper_limit = grouped.mean() + (grouped.std(ddof=0) * permut[1]) + std = grouped.std(ddof=0) + lower_limit = lower_limit.to_dict() + upper_limit = upper_limit.to_dict() + std = std.to_dict() + + limits = { + x: (std.get(x, 0), lower_limit.get(x, 0), upper_limit.get(x, 0)) + for x in set(std).union(upper_limit, lower_limit) + } + stds[col] = limits + + return stds + + +def log_transform(df: pd.DataFrame, columns: list) -> pd.DataFrame: + """ + Apply log transformation on given column set. + Inputs: + df (pd.DataFrame): + columns (list): columns to be transformed + Outputs: + df (pd.DataFrame): dataframe with given columns replaced + by their logged values + """ + for col in columns: + df[col] = np.log10(df[col]) + return df +def z_normalize_groupby(s: pd.Series): + """ + Function used to z_normalize groups of records. + Pandas stitches it back together into a complete column. + Meant for groupby.apply(). + Inputs: + s(pd.Series): grouped series from groupby.apply + Outputs: + z_normalized series grouped by class and township + that is then stitched into complete column by pandas + """ + + return zscore(s, nan_policy="omit") + + def outlier_type( df: pd.DataFrame, condos: bool, raw_price_threshold: int ) -> pd.DataFrame: """ - Creates indicator columns for various outlier types based on both characteristic- - and pricing-based conditions. + This function create indicator columns for each distinct outlier type between price + and characteristic outliers. These columns are prefixed with 'sv_ind_'. + + Inputs: + df (pd.DataFrame): Dataframe + Outputs: + df (pd.DataFrame): Dataframe with indicator columns for each flag type """ + char_conditions = [ df["sv_short_owner"] == "Short-term owner", df["sv_name_match"] != "No match", @@ -347,6 +679,8 @@ def outlier_type( df["sv_pricing"].str.contains("High price swing") | df["sv_pricing"].str.contains("Low price swing"), ] + + # Define labels for characteristic-based reasons char_labels = [ "sv_ind_char_short_term_owner", "sv_ind_char_family_sale", @@ -356,336 +690,537 @@ def outlier_type( ] if condos: + # Define conditions for price-based reasons price_conditions = [ df["sv_pricing"].str.contains("High"), df["sv_pricing"].str.contains("Low"), ] + + # Define labels for price-based reasons price_labels = [ "sv_ind_price_high_price", "sv_ind_price_low_price", ] + else: + # Define conditions for price-based reasons price_conditions = [ - df["sv_pricing"].str.contains("High") - & df["sv_which_price"].str.contains("raw"), - df["sv_pricing"].str.contains("Low") - & df["sv_which_price"].str.contains("raw"), - df["sv_pricing"].str.contains("High") - & df["sv_which_price"].str.contains("sqft"), - df["sv_pricing"].str.contains("Low") - & df["sv_which_price"].str.contains("sqft"), + ( + df["sv_pricing"].str.contains("High") + & (df["sv_which_price"].str.contains("raw")) + ), + ( + df["sv_pricing"].str.contains("Low") + & (df["sv_which_price"].str.contains("raw")) + ), + (df["sv_pricing"].str.contains("High")) + & (df["sv_which_price"].str.contains("sqft")), + (df["sv_pricing"].str.contains("Low")) + & (df["sv_which_price"].str.contains("sqft")), ] + + # Define labels for price-based reasons price_labels = [ "sv_ind_price_high_price", "sv_ind_price_low_price", "sv_ind_price_high_price_sqft", "sv_ind_price_low_price_sqft", ] - # Raw price threshold (comparing the unlogged value) + + # Implement raw threshold, unlog price price_conditions.append((10 ** df["meta_sale_price"]) > raw_price_threshold) price_labels.append("sv_ind_raw_price_threshold") - for label, condition in zip( - price_labels + char_labels, price_conditions + char_conditions - ): + combined_conditions = price_conditions + char_conditions + combined_labels = price_labels + char_labels + + # Create indicator columns for each flag type + for label, condition in zip(combined_labels, combined_conditions): df[label] = condition.astype(int) + return df -# ============================================================================= -# Isolation Forest & PCA Functions -# ============================================================================= -def pca_transform(df: pd.DataFrame, columns: list) -> pd.DataFrame: - """ - Runs PCA on the specified columns (after filling NAs and infinities) - and returns the principal components with explained variance > 1. +# STRING CLEANUP + +""" + An outline of our overall approach: + + Tries to create an identifier from the buyer/seller name. + Our appraoch is to try to identify if it is a legal identify of some sort, + such as a bank, construction company, trust, LLC, or other and + return the string as-is with some formatting applied if so. We also combine some + spellings/mispellings of big entities. + + If we can't identify the string as a legal entity we assume the string contains a person's name. + We then process these strings to determine if the person is a trustee, successor, + or a successor trustee from the fragements of the strings. + Once we do this, we determine the best place tosplit the string in split_logic(), + looking out for certain tokens. After we've determnined where to split + the string we send the tokens to name_selector, where we attempt to select + the last name of the string. + + We then create a column that tells us whether it's person, or a legal entity, + as per our identification method that we used in get_id(). + + Then we use the trustee, successor, or as successor trustee parts of + the string we constructed earlier to determine the role of the buyer + or seller in the transaction(trustee, successor, successor trustee). + + We then remove the trustee, successor, as successor trustee parts of the string + from buyer/seller id. + + Finally we create a transaction_type column that is just what kind of entity it is + with a dash between them. + + TODO: Process more string types: + - If a name contains 'and', we split the string on it and take + the token directly to the left. We could take a more sophisticated + approach to determine if the last name in this case. + - 'co-trustee' handling. + - Handle different name formats. Assume people use + but sometimes its or other such formats. + - Find trends in string cutoffs(some are cut off at 25, characters, others 25, etc) + that could help use better process strings that are cutoff. + - Cleanup/debug regex. This is a lot of dirty regex, and it is picking up + some names that we don't want, or not correctly identifying every case that we do want. + So it could use some work in some cases. """ - feed = df[columns].fillna(0).replace([np.inf, -np.inf], 0) - pca_model = PCA(n_components=len(feed.columns)) - pcs = pca_model.fit_transform(feed) - pc_df = pd.DataFrame( - pcs, columns=[f"PC{i}" for i in range(len(feed.columns))], index=df.index - ) - n_components = sum(pca_model.explained_variance_ > 1) - return pc_df.iloc[:, :n_components] -def iso_forest( - df: pd.DataFrame, - groups: tuple, - columns: list, - n_estimators: int = 1000, - max_samples=0.2, -) -> pd.DataFrame: - """ - Runs Isolation Forest on PCA-transformed features (with additional group labels) - to flag statistical anomalies. - """ - df.set_index("meta_sale_document_num", inplace=True) - pca_features = pca_transform(df, columns) +entity_keywords = ( + r"llc| ll$| l$|l l c|estate|training|construction|building|masonry|" + r"apartments|plumbing|service|professional|roofing|advanced|office|" + r"\blaw\b|\bloan\b|legal|production|woodwork|concepts|corp|company|" + r" united|\binc\b|county|entertainment|community|heating|cooling" + r"|partners|equity|indsutries|series|revitalization|collection|" + r"agency|renovation|consulting|flippers|estates|\bthe \b|dept|" + r"funding|opportunity|improvements|servicing|equities|\bsale\b|" + r"judicial| in$|bank|\btrust\b|holding|investment|housing" + r"|properties|limited|realty|development|capital|management" + r"|developers|construction|rentals|group|investments|invest|" + r"residences|enterprise|enterprises|ventures|remodeling|" + r"specialists|homes|business|venture|restoration|renovations" + r"|maintenance|ltd|real estate|builders|buyers|property|financial" + r"|associates|consultants|international|acquisitions|credit|design" + r"|homeownership|solutions|\bhome\b|diversified|assets|family|\bland\b" + r"|revocable|services|rehabbing|\bliving\b|county of cook|fannie mae" + r"|veteran|mortgage|savings|lp$|federal natl|hospital|southport|mtg" + r"|propert|rehab|neighborhood|advantage|chicago|cook c|\bbk\b|\bhud\b" + r"|department|united states|\busa\b|hsbc|midwest|residential|american" + r"|tcf|advantage|real e|advantage|fifth third|baptist church" + r"|apostolic church|lutheran church|catholic church|\bfed\b|nationstar" + r"|advantage|commercial|health|condominium|nationa|association|homeowner" + r"|christ church|christian church|baptist church|community church" + r"|church of c|\bdelaw\b|lawyer|delawar" +) - label_encoders = {} - for group in groups: - if not pd.api.types.is_numeric_dtype(df[group]): - le = LabelEncoder() - df[group] = le.fit_transform(df[group]) - label_encoders[group] = le - pca_features[group] = df[group] - iso = IsolationForest( - n_estimators=n_estimators, - max_samples=max_samples, - bootstrap=True, - random_state=42, - ) - df["sv_anomaly"] = iso.fit_predict(pca_features) - df["sv_anomaly"] = np.where(df["sv_anomaly"] == -1, "Outlier", "Not Outlier") +def get_id(row: pd.Series, col: str) -> str: + """ + Creates an ID from the buyer/seller name. - for group, le in label_encoders.items(): - df[group] = le.inverse_transform(df[group]) - df.reset_index(inplace=True) - return df + Returns string as-is if identified as legal entity. + Combined with other entities if its a common mispelling/cutoff. + Attempts to identify last name if not a legal entity. -# ============================================================================= -# String Processing Functions -# ============================================================================= -def get_id(row: pd.Series, col_prefix: str) -> str: + Inputs: + row: from apply() + col (str): 'buyer' or 'seller' + Outputs: + id (str): string as-is if legal entity + identified last name if otherwise. """ - Generates an identifier from the buyer/seller name. If the name appears to be - a legal entity (based on keywords, presence of digits, or certain suffixes), - returns the cleaned string; otherwise attempts to extract a last name. - """ - col = col_prefix + "_name" - name_str = str(row[col]).lower().strip() - if pd.isnull(name_str) or name_str in { + + column = col + "_name" + words = str(row[column]).lower() + + # Check for missing values first + if pd.isnull(row[column]) or words in [ "none", "nan", "unknown", "missing seller name", "missing buyer name", - }: - return "Empty Name" + ]: + id = "Empty Name" + return id - name_str = re.sub(r" amp ", " ", name_str) - name_str = re.sub(r"\s+", " ", name_str).strip() - if not name_str or re.fullmatch(r"[.]*", name_str): - return "Empty Name" + words = re.sub(r" amp ", "", words) + words = re.sub(" +", " ", words) + + if words.isspace() or re.search(r"^[.]*$", words): + id = "Empty Name" + return id - # Handle specific known cases - special_cases = { - "vt investment corpor": "vt investment corporation", - "v t investment corp": "vt investment corporation", - "national residential nomi": "national residential nominee services", - "first integrity group inc": "first integrity group inc", - "first integrity group in": "first integrity group inc", - "deutsche bank national tr": "deutsche bank national trust company", - "cirrus investment group l": "cirrus investment group", - "cirrus investment group": "cirrus investment group", - "fannie mae aka federal na": "fannie mae", - "fannie mae a k a federal": "fannie mae", - "federal national mortgage": "fannie mae", - "judicial sales corpor": "the judicial sales corporation", - "judicial sales corp": "the judicial sales corporation", - "judicial sales corporatio": "the judicial sales corporation", - "judicial sale corp": "the judicial sales corporation", - "the judicial sales corp": "the judicial sales corporation", - "jpmorgan chase bank n a": "jp morgan chase bank", - "jpmorgan chase bank nati": "jp morgan chase bank", - "wells fargo bank na": "wells fargo bank national", - "wells fargo bank n a": "wells fargo bank national", - "wells fargo bank nationa": "wells fargo bank national", - "wells fargo bank n a a": "wells fargo bank national", - "wells fargo bk": "wells fargo bank national", - "bayview loan servicing l": "bayview loan servicing llc", - "bayview loan servicing ll": "bayview loan servicing llc", - "thr property illinois l": "thr property illinois lp", - "thr property illinois lp": "thr property illinois lp", - "ih3 property illinois lp": "ih3 property illinois lp", - "ih3 property illinois l": "ih3 property illinois lp", - "ih2 property illinois lp": "ih2 property illinois lp", - "ih2 property illinois l": "ih2 property illinois lp", - "secretary of housing and": "secretary of housing and urban development", - "the secretary of housing": "secretary of housing and urban development", - "secretary of housing ": "secretary of housing and urban development", - "secretary of veterans aff": "secretary of veterans affairs", - "the secretary of veterans": "secretary of veterans affairs", - "bank of america n a": "bank of america national", - "bank of america na": "bank of america national", - "bank of america national": "bank of america national", - "us bank national association": "us bank national association", - "u s bank national assoc": "us bank national association", - "u s bank national associ": "us bank national association", - "u s bank trust n a as": "us bank national association", - "u s bank n a": "us bank national association", - "us bank national associat": "us bank national association", - "u s bank trust national": "us bank national association", - "us bk": "us bank national association", - "u s bk": "us bank national association", - } - for key, val in special_cases.items(): - if key in name_str: - return val - - # Normalize trustee/successor tokens - name_str = re.sub( - r"(suc t$|as succ t$|successor tr$|successor tru$|successor trus$|" - r"successor trust$|successor truste$|successor trustee$|successor t$|as successor t$)", + if any(x in words for x in ["vt investment corpor", "v t investment corp"]): + return "vt investment corporation" + + if any(x in words for x in ["national residential nomi"]): + return "national residential nominee services" + + if any( + x in words for x in ["first integrity group inc", "first integrity group in"] + ): + return "first integrity group inc" + + if words in ["deutsche bank national tr"]: + return "deutsche bank national trust company" + + if any( + x in words for x in ["cirrus investment group l", "cirrus investment group"] + ): + return "cirrus investment group" + + if any( + x in words + for x in [ + "fannie mae aka federal na", + "fannie mae a k a federal", + "federal national mortgage", + ] + ): + return "fannie mae" + + if any( + x in words + for x in [ + "the judicial sales corpor", + "judicial sales corp", + "judicial sales corporatio", + "judicial sale corp", + "the judicial sales corp", + ] + ): + return "the judicial sales corporation" + + if any(x in words for x in ["jpmorgan chase bank n a", "jpmorgan chase bank nati"]): + return "jp morgan chase bank" + + if any( + x in words + for x in [ + "wells fargo bank na", + "wells fargo bank n a", + "wells fargo bank nationa", + "wells fargo bank n a a", + "wells fargo bk", + ] + ): + return "wells fargo bank national" + + if any( + x in words for x in ["bayview loan servicing l", "bayview loan servicing ll"] + ): + return "bayview loan servicing llc" + + if any(x in words for x in ["thr property illinois l", "thr property illinois lp"]): + return "thr property illinois lp" + + if any(x in words for x in ["ih3 property illinois lp", "ih3 property illinois l"]): + return "ih3 property illinois lp" + + if any(x in words for x in ["ih2 property illinois lp", "ih2 property illinois l"]): + return "ih2 property illinois lp" + + if any( + x in words + for x in [ + "secretary of housing and", + "the secretary of housing", + "secretary of housing ", + ] + ): + return "secretary of housing and urban development" + + if any( + x in words for x in ["secretary of veterans aff", "the secretary of veterans"] + ): + return "secretary of veterans affairs" + + if any( + x in words + for x in [ + "bank of america n a", + "bank of america na", + "bank of america national", + ] + ): + return "bank of america national" + + if any( + x in words + for x in [ + "us bank national association", + "u s bank national assoc", + "u s bank national associ", + "u s bank trust n a as", + "u s bank n a", + "us bank national associat", + "u s bank trust national", + "us bk", + "u s bk", + ] + ): + return "us bank national association" + + words = re.sub( + "suc t$|as succ t$|successor tr$|successor tru$|" + "successor trus$|successor trust$|successor truste$|" + "successor trustee$|successor t$|as successor t$", "as successor trustee", - name_str, + words, ) - name_str = re.sub( - r"(as t$|as s t$|as sole t$|as tr$|as tru$|as trus$|as trust$|as truste$|" - r"as trustee$|as trustee o$|as trustee of$|, t|, tr|, tru|, trus|, trust|, truste)", + words = re.sub( + "as t$|as s t$|as sole t$|as tr$|as tru$|as trus$|as trust$|" + "as truste$|as trustee$|as trustee o$|as trustee of$|trustee of$|" + "trustee of$|tr$|tru$|trus$|truste$|trustee$|, t|, tr|, tru|, trus|" + ", trust|, truste", "as trustee", - name_str, + words, ) - name_str = re.sub( - r"(su$|suc$|succ$|succe$|succes$|success$|successo$|successor$|as s$|as su$|" - r"as suc$|as succ$|as succe$|as sucess$|as successo$|, s$|, su$|, suc$|, succ$|, succe$|, succes$|, success$|, successo$)", + words = re.sub( + "su$|suc$|succ$|succe$|succes$|success$|successo$|successor$|as s$|as su$|" + "as suc$|as succ$|as succe$|as sucess$|as successo$|, s$|, su$|, suc$|, succ$|" + ", succe$|, succes$|, success$|, successo$", "as successor", - name_str, + words, ) if ( - ENTITY_KEYWORDS.search(name_str) - or re.search(r"\d{3,4}", name_str) - or re.search(r"as trustee$|as successor$|as successor trustee$", name_str) + re.search(entity_keywords, words) + or re.search(r"\d{4}|\d{3}", words) + or re.search("as trustee$|as successor$|as successor trustee$", words) ): - return name_str + id = words + return id - name_str = re.sub( - r"( in$|indi$|indiv$|indivi$|individ$|individu$|individua$|individual$|" - r"not i$|not ind$| ind$| inde$|indep$|indepe$|indepen$|independ$|independe$|independen$|independent$)", + words = re.sub( + " in$|indi$|indiv$|indivi$|indivi$|individ$|individu$|individua$|individual$" + "|not i$|not ind$| ind$| inde$|indep$|indepe$|indepen$|independ$|independe$" + "|independen$|independent$", "", - name_str, + words, ) - tokens = split_logic(name_str) - return name_selector(tokens) + + tokens = split_logic(words) + + id = name_selector(tokens) + + return id -def split_logic(name_str: str): +def split_logic(words: str): """ - Splits a cleaned string into tokens using keywords such as 'and' or common abbreviations. - Returns a list of tokens (or "Empty Name" if input is not valid). + Given a cleaned string, determines where to split the string. + Splits on 'and', variations of FKA/NKA/KNA if present, on spaces if not. + Helper to get_id(). + Inputs: + words (str): cleaned str from get_id + Outputs: + 'Empty Name' if string is empty + tokens (list): list of tokens in string from split """ - name_str = re.sub(r"\s+", " ", name_str).strip() - if not name_str or re.fullmatch(r"[.]*", name_str) or name_str == "Empty Name": + words = re.sub(" +", " ", words) + + if words.isspace() or re.search(r"^[.]*$", words) or words == "Empty Name": return "Empty Name" - name_str = re.sub(r"\s+as$|\s+as\s+$|as\s+$", "", name_str) - m = re.search( - r"\b and\b|\b an\b|\b a\b|f k a|\bfka\b| n k a|\bnka\b|\b aka\b|a k a(?=\s|$)|\b kna\b|k n a| f k$|n k$|a k$|\b not\b| married", - name_str, + + words = re.sub(" as$| as $|as $", "", words) + + _and = re.search( + r"\b and\b|\b an$\b|\b a$\b|f k a|\bfka\b| n k a|\bnka\b|" + r"\b aka\b|a k a(?=\\s|$)|\b kna\b|k n a| f k$|n k$|a k$|\b not\b| married", + words, ) - if m: - tokens = name_str.split(m.group()) - return tokens[0].strip().split() - return name_str.split() + + if _and: + tokens = words.split(_and.group()) + tokens = tokens[0].strip().split() + else: + tokens = words.split() + + return tokens def name_selector(tokens) -> str: """ - Given a list of name tokens, returns the last token as an identifier, - ignoring common suffixes. + Attempts to select the last name of a person's name based on the number of tokens. + Inputs: + tokens: list of strings where each string is a name token + Outputs: + 'Empty Name' if name is empty. + id (str): identified last name """ - suffixes = {"jr", "sr", "ii", "iii", "iv", "v"} - if tokens == "Empty Name" or not tokens: + + suffixes = ["jr", "sr", "ii", "iii", "iv", "v"] + + if tokens == "Empty Name" or tokens == []: return "Empty Name" - while tokens and tokens[-1] in suffixes: + + while tokens[-1] in suffixes: tokens = tokens[:-1] - return tokens[-1] if tokens else "Empty Name" + if not tokens: # Avoids IndexError if all tokens are removed. + return "Empty Name" + + id = tokens[-1] + + return id -def get_category(row: pd.Series, col_prefix: str) -> str: +def get_category(row: pd.Series, col: str) -> str: """ - Determines whether the identifier belongs to a legal entity or a person. + Gets category buyer/seller id. legal_entity if in entity keywords, + person if otherwise. + Inputs: + row: from pandas dataframe + col (str): column to process. 'buyer' or 'seller' + Outputs: + category (str): category of buyer/seller id """ - col = col_prefix + "_id" - name_str = row[col] - if ENTITY_KEYWORDS.search(name_str): - return "legal_entity" - elif name_str == "Empty Name": - return "none" + + column = col + "_id" + words = row[column] + + if re.search(entity_keywords, words): + category = "legal_entity" + elif words == "Empty Name": + category = "none" else: - return "person" + category = "person" + + return category -def clean_id(row: pd.Series, col_prefix: str) -> str: +def get_role(row: pd.Series, col: str) -> str: """ - Cleans the identifier by removing role-related tokens and, if appropriate, - reselecting the name token. + Picks the role th person is playing in the transaction off of the + buyer/seller_id. Meant for apply() + Ex: 'as trustee', or 'as successor' + Inputs: + row: from pandas dataframe + col (str): column to process. 'buyer' or 'seller' + Outputs: + roles(str): the role of the person n the transaction + """ - col = col_prefix + "_id" - name_str = row[col] - name_str = re.sub( - r" as successor trustee|\bas successor\b| as trustee", "", name_str - ) - name_str = re.sub(r"\s+as$|\s+as\s+$|as\s+$", "", name_str) - if not ( - ENTITY_KEYWORDS.search(name_str) - or re.search(r"\d{3,4}", name_str) - or len(name_str.split()) == 1 - ): - name_str = name_selector(split_logic(name_str)) - return name_str + role = None + column = col + "_id" + words = row[column] + + suc_trust = re.search(" as successor trustee", words) + suc = re.search(" as successor", words) + trust = re.search(" as trustee", words) + + if suc_trust: + role = suc_trust.group() + + if suc: + role = suc.group() + + if trust: + role = trust.group() + + return role -def get_role(row: pd.Series, col_prefix: str) -> str: +def clean_id(row: pd.Series, col: str) -> str: """ - Extracts the role (e.g., 'as trustee', 'as successor') from the identifier. + Cleans id field after get_role() by removing role. + Inputs: + row: from pandas dataframe + col (str): column to process. 'seller' or 'buyer' + Outputs: + words (str): seller/buyer id without role. """ - col = col_prefix + "_id" - name_str = row[col] - for role_token in [" as successor trustee", " as successor", " as trustee"]: - m = re.search(role_token, name_str) - if m: - return m.group() - return None + + column = col + "_id" + words = row[column] + + words = re.sub(r" as successor trustee|\b as successor\b| as trustee", "", words) + words = re.sub(" as$| as $|as $", "", words) + + if not ( + re.search(entity_keywords, words) + or re.search(r"\d{4}|\d{3}", words) + or len(words.split()) == 1 + ): + words = name_selector(split_logic(words)) + + return words def create_judicial_flag(df: pd.DataFrame) -> pd.DataFrame: """ - Creates a binary flag (as string '1' or '0') indicating whether the seller's - identifier corresponds to a judicial sales entity. + Creates a column that contains 1 if sold from a judicial corp + and 0 otherwise. Mean for use with apply(). + Inputs: + df (pd.DataFrame): dataframe to create flag on + Outputs: + df (pd.DataFrame): dataframe with 'sv_is_judicial_sale' column """ - df["sv_is_judicial_sale"] = np.where( - df["sv_seller_id"].isin( - ["the judicial sale corporation", "intercounty judicial sale"] - ), - "1", - "0", + + df["sv_is_judicial_sale"] = np.select( + [ + (df["sv_seller_id"] == "the judicial sale corporation") + | (df["sv_seller_id"] == "intercounty judicial sale") + ], + ["1"], + default="0", ) + return df def create_name_match(row: pd.Series) -> str: """ - If the buyer and seller identifiers match (and are nontrivial and not both legal entities), - returns the match; otherwise returns "No match". + Creates a column that contains the actual string that was matched. + Meant for apply(). + Inputs: + row: from pandas dataframe + Outputs: + value (str or None): string match if applicable, None otherwise """ if ( row["sv_buyer_id"] == row["sv_seller_id"] and row["sv_buyer_id"] != "Empty Name" + # Prevents the same legal entity as counting as a family name match and row["sv_transaction_type"] != "legal_entity-legal_entity" + # Boots out matches on a single last initial and len(row["sv_buyer_id"]) > 1 ): - return row["sv_seller_id"] - return "No match" + value = row["sv_seller_id"] + else: + value = "No match" + + return value def string_processing(df: pd.DataFrame) -> pd.DataFrame: """ - Processes buyer and seller name strings to generate identifiers, categories, - roles, and transaction type. + Brings together all of the apply functions for string processing. + Results in 7 additional columns. + ID, category, and role for buyer and seller. As well as transaction category type + for each record. + Inputs: + df (pd.dataFrame): dataframe with buyer/seller id columns. + Ouputs: + df(pd.DataFrame): dataframe with 7 new columns from apply functions """ - for col in ["meta_sale_buyer_name", "meta_sale_seller_name"]: - df[col] = ( - df[col] - .str.encode("ascii", "ignore") - .str.decode("ascii") - .str.replace(r"[^a-zA-Z0-9\-]", " ", regex=True) - .str.strip() - ) + df.meta_sale_buyer_name = df.meta_sale_buyer_name.str.encode( + "ascii", "ignore" + ).str.decode("ascii") + df.meta_sale_seller_name = df.meta_sale_seller_name.str.encode( + "ascii", "ignore" + ).str.decode("ascii") + df.meta_sale_buyer_name = df.meta_sale_buyer_name.str.replace( + r"[^a-zA-Z0-9\-]", " ", regex=True + ).str.strip() + df.meta_sale_seller_name = df.meta_sale_seller_name.str.replace( + r"[^a-zA-Z0-9\-]", " ", regex=True + ).str.strip() + df["sv_buyer_id"] = df.apply(get_id, args=("meta_sale_buyer",), axis=1) df["sv_seller_id"] = df.apply(get_id, args=("meta_sale_seller",), axis=1) df["sv_buyer_category"] = df.apply(get_category, args=("sv_buyer",), axis=1) @@ -693,40 +1228,8 @@ def string_processing(df: pd.DataFrame) -> pd.DataFrame: df["sv_buyer_id"] = df.apply(clean_id, args=("sv_buyer",), axis=1) df["sv_seller_id"] = df.apply(clean_id, args=("sv_seller",), axis=1) df["sv_transaction_type"] = df["sv_buyer_category"] + "-" + df["sv_seller_category"] + df = create_judicial_flag(df) df["sv_name_match"] = df.apply(create_name_match, axis=1) - return df - -# ============================================================================= -# Main Pipeline -# ============================================================================= -def go( - df: pd.DataFrame, - groups: tuple, - iso_forest_cols: list, - dev_bounds: tuple, - condos: bool, - raw_price_threshold: int, -) -> pd.DataFrame: - """ - Runs the entire processing pipeline: - 1. Statistical measures & outlier preparations. - 2. String processing. - 3. Isolation Forest anomaly detection. - 4. Outlier taxonomy assignment. - """ - model_type = "condos" if condos else "residential" - print(f"Flagging for {model_type}") - print("Initializing statistics...") - df = create_stats(df, groups, condos=condos) - print("Statistics complete. Processing strings...") - df = string_processing(df) - print("String processing complete. Running isolation forest...") - df = iso_forest(df, groups, iso_forest_cols) - print("Isolation forest complete. Assigning outlier taxonomy...") - df = check_days(df, SHORT_TERM_OWNER_THRESHOLD) - df = pricing_info(df, dev_bounds, groups, condos=condos) - df = outlier_type(df, condos=condos, raw_price_threshold=raw_price_threshold) - print("Processing finished.") return df From c08872d67428fd896347abccdc7ed6b65fd52723 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Thu, 6 Feb 2025 21:43:13 +0000 Subject: [PATCH 3/6] Re add full doc strings --- glue/flagging_script_glue/flagging.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/glue/flagging_script_glue/flagging.py b/glue/flagging_script_glue/flagging.py index 9bdffc77..9f4be2fc 100644 --- a/glue/flagging_script_glue/flagging.py +++ b/glue/flagging_script_glue/flagging.py @@ -186,9 +186,17 @@ def pricing_info( df: pd.DataFrame, permut: tuple, groups: tuple, condos: bool ) -> pd.DataFrame: """ - Computes pricing deviations and, using a vectorized approach, computes per-row - lower/upper thresholds based on group means and standard deviations. Then, - determines pricing outlier type. + Computes pricing deviations and computeslower/upper standard + deviation thresholds based on group means and standard deviations. + + Inputs: + df (pd.DataFrame): dataframe of sales + permut (tuple): tuple of standard deviation boundaries. + Ex: (2,2) is 2 std away on both sides. + groups: (tuple): Our statistical grouping columns + condos (bool): Specifies whether we are running function for condos or residential + Outputs: + df (pd.DataFrame): dataframe with 3 extra columns of price info. """ group_str = create_group_string(groups, "_") # Log-transform the columns (price and, if applicable, price per sqft) @@ -280,6 +288,14 @@ def price_column(row: pd.Series, groups: tuple, condos: bool) -> str: Determines whether the record is a high or low price outlier and, if applicable, whether it exhibits a price swing. Comparisons are made by checking the record's deviation against its per-row lower/upper threshold. + + Determines whether the record is a high price outlier or a low price outlier. + If the record is also a price change outlier, than 'swing' is added to the string. + Inputs: + groups: (tuple) Columns for statistical grouping + condos (bool): Specifies whether we are running function for condos or residential + Outputs: + value (str): string showing what kind of price outlier the record is. """ group_str = create_group_string(groups, "_") value = "Not price outlier" From a6ce616a5be113697a7181bf8cb58b0a2c6b6f6b Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Thu, 6 Feb 2025 21:54:34 +0000 Subject: [PATCH 4/6] Improve docstring --- glue/flagging_script_glue/flagging.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/glue/flagging_script_glue/flagging.py b/glue/flagging_script_glue/flagging.py index 9f4be2fc..1530130d 100644 --- a/glue/flagging_script_glue/flagging.py +++ b/glue/flagging_script_glue/flagging.py @@ -254,6 +254,10 @@ def which_price(row: pd.Series, groups: tuple) -> str: """ Determines which price measure (raw, per sqft, or both) is flagged as an outlier by comparing deviation values with per-row thresholds. + Inputs: + groups (tuple): tuple of columns used for statistical grouping + Outputs: + value (str): string saying which of these are outliers. """ group_str = create_group_string(groups, "_") raw_val = row[f"sv_price_deviation_{group_str}"] From 8f7eea505530c5f2e7cfd695e1bde5a3a87a8c51 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Thu, 6 Feb 2025 22:01:27 +0000 Subject: [PATCH 5/6] Remove get_thresh --- glue/flagging_script_glue/flagging.py | 43 --------------------------- 1 file changed, 43 deletions(-) diff --git a/glue/flagging_script_glue/flagging.py b/glue/flagging_script_glue/flagging.py index 1530130d..7758b7dc 100644 --- a/glue/flagging_script_glue/flagging.py +++ b/glue/flagging_script_glue/flagging.py @@ -604,49 +604,6 @@ def check_days(df: pd.DataFrame, threshold: int) -> pd.DataFrame: return df -def get_thresh(df: pd.DataFrame, cols: list, permut: tuple, groups: tuple) -> dict: - """ - Creates a nested dictionary where the top level key is a column - and the 2nd-level key is a (township, class) combo. - Ex: stds['sale_price'][76, 203] - Needed in order to keep track of specific thresholds for each township/class combo. - Theoretically each std should be 1(because of z_normalization), but in practical terms - it is in a very very small range around 1, so using a uniform cutoff of 2 and -2 - loses us some precision. - - We also want to allow for some flexibility in how the thresholds are calculated; - and this function allows for more flexbility in the event of future changes. - Inputs: - df (pd.DataFrame): Dataframe to create dictionary from. - cols (list): list of columns to get standard deviations for. - permut (tuple): standard deviation range for lower_limit and upper_limit - First term is how many stndard deviations away on the left - Second term is how many standard deviations away on the right. - Outputs: - stds (dict): nested dictionary of std deviations for all columns - from DataFrame. - """ - stds = {} - - for col in cols: - df[col] = df[col].astype(float) - grouped = df.dropna(subset=list(groups) + [col]).groupby(list(groups))[col] - lower_limit = grouped.mean() - (grouped.std(ddof=0) * permut[0]) - upper_limit = grouped.mean() + (grouped.std(ddof=0) * permut[1]) - std = grouped.std(ddof=0) - lower_limit = lower_limit.to_dict() - upper_limit = upper_limit.to_dict() - std = std.to_dict() - - limits = { - x: (std.get(x, 0), lower_limit.get(x, 0), upper_limit.get(x, 0)) - for x in set(std).union(upper_limit, lower_limit) - } - stds[col] = limits - - return stds - - def log_transform(df: pd.DataFrame, columns: list) -> pd.DataFrame: """ Apply log transformation on given column set. From 89f515b05e45095a77b7966474c8ddc7ce4c87d8 Mon Sep 17 00:00:00 2001 From: wagnerlmichael Date: Fri, 21 Feb 2025 13:32:21 -0600 Subject: [PATCH 6/6] Fix spacing --- glue/flagging_script_glue/flagging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/glue/flagging_script_glue/flagging.py b/glue/flagging_script_glue/flagging.py index 7758b7dc..47a1209c 100644 --- a/glue/flagging_script_glue/flagging.py +++ b/glue/flagging_script_glue/flagging.py @@ -186,7 +186,7 @@ def pricing_info( df: pd.DataFrame, permut: tuple, groups: tuple, condos: bool ) -> pd.DataFrame: """ - Computes pricing deviations and computeslower/upper standard + Computes pricing deviations and computes lower/upper standard deviation thresholds based on group means and standard deviations. Inputs: