Skip to content
5 changes: 5 additions & 0 deletions AD450_math.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
def add(num1, num2):
return num1 + num2

def multiply(num1, num2):
return num1 * num2
178 changes: 178 additions & 0 deletions data_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
import pandas as pd
import calendar, datetime
import unicodedata

MONTH_DICT = dict((v,k) for k,v in enumerate(calendar.month_name))
FEATURE_COLUMNS = ['lead_avg_rank', 'lead_avg_points', 'lead_count',
'boulder_avg_rank', 'boulder_count',
'speed_avg_rank', 'speed_count']
GENDER_DF = pd.read_csv('ifsc_climbing_data/genders.csv')

def rename_columns(df):
rename_dict = {name : name.lower().replace(' ', '_') for name in df.columns}
return df.rename(columns=rename_dict)

def remove_youth(df):
return df[~df.competition_title.str.lower().str.contains('youth')]


def get_end_day(string):
value_list = re.split(' ', string)

if len(value_list) == 6:
end_day = int(value_list[3])
else:
end_day = None

return end_day

def date_create(df):

df['year'] = df.competition_date.str.slice(start = -4).astype('int')
df['month_string'] = df.competition_date.str.extract('([A-za-z]+)')
df['month'] = df.month_string.map(MONTH_DICT)
df['day'] = df.competition_date.str.extract('(\d+)').astype('int')
df['start_date'] = pd.to_datetime(df[['year', 'month', 'day']])

return df

def date_filter(df, date):
return df[df.start_date < date]

def data_cleaning(df, filter_dates = True, date = None):
df_renamed = rename_columns(df)
df_dedup = df_renamed.drop_duplicates()
df_adult = remove_youth(df_dedup)
df_date = date_create(df_adult)

if filter_dates:
date_filter_df = date_filter(df_date, date)
return date_filter_df
else:
return df_date

def agg_join_data(lr_df, br_df, sr_df, predicting_comp):
join_col = ['first', 'last']

ld_agg_mean = lr_df[['first', 'last', 'rank', 'points']].groupby(join_col).mean().reset_index()
ld_agg_count = lr_df[['first', 'last', 'rank']].groupby(join_col).count().reset_index()
ld_agg_df = ld_agg_mean.merge(ld_agg_count, on = join_col).rename(columns = {'rank_x': 'lead_avg_rank',
'points': 'lead_avg_points',
'rank_y': 'lead_count'
})
br_agg_mean = br_df[['first', 'last', 'rank']].groupby(join_col).mean().reset_index()
br_agg_count = br_df[['first', 'last', 'rank']].groupby(join_col).count().reset_index()
br_agg_df = br_agg_mean.merge(br_agg_count, on = join_col).rename(columns = {'rank_x': 'boulder_avg_rank',
'rank_y': 'boulder_count'
})
sr_agg_mean = sr_df[['first', 'last', 'rank']].groupby(join_col).mean().reset_index()
sr_agg_count = sr_df[['first', 'last', 'rank']].groupby(join_col).count().reset_index()
sr_agg_df = sr_agg_mean.merge(sr_agg_count, on = join_col).rename(columns = {'rank_x': 'speed_avg_rank',
'rank_y': 'speed_count'
})
predicting_comp['full_name'] = predicting_comp['last'] + ' ' + predicting_comp['first']

pred_aggs_raw = predicting_comp.merge(ld_agg_df, how = 'left', on = join_col
).merge(br_agg_df, how = 'left', on = join_col
).merge(sr_agg_df, how = 'left', on = join_col
).merge(GENDER_DF, on = ['full_name']
)[['first', 'last', 'nation', 'rank', 'gender'] + FEATURE_COLUMNS]

return pred_aggs_raw

def create_fill_value(column, value):
if 'count' in column:
return 0
else:
return value

def fill_features(df):
max_values = df[FEATURE_COLUMNS].max()

fill_dict = {column : create_fill_value(column, value) for
column, value in zip(max_values.keys(), max_values)}

return df.fillna(value = fill_dict)

def process_data(br_raw, lr_raw, sr_raw, cr_raw, date, comp_name):
br_df = data_cleaning(br_raw, date = date)
lr_df = data_cleaning(lr_raw, date = date)
sr_df = data_cleaning(sr_raw, date = date)


cr_df = data_cleaning(cr_raw, False)
predicting_comp = cr_df[cr_df.competition_title == comp_name]

pred_aggs_raw = agg_join_data(lr_df, br_df, sr_df, predicting_comp)

pred_aggs = fill_features(pred_aggs_raw)

pred_aggs['avg_rank_multi'] = pred_aggs.lead_avg_rank * pred_aggs.boulder_avg_rank * pred_aggs.speed_avg_rank

return pred_aggs

def rename_columns(df_raw):
df_columns_renamed = df_raw.copy()

df_columns_renamed.columns = df_columns_renamed.columns.str.lower()
df_columns_renamed.columns = df_columns_renamed.columns.str.strip()
df_columns_renamed.columns = df_columns_renamed.columns.str.replace(' ', '_')

df_columns_renamed.columns = [unicodedata.normalize('NFKD', c).encode('ASCII', 'ignore').decode('utf-8')
for c in df_columns_renamed.columns]


return df_columns_renamed

def remove_fully_null_columns_rows(df_columns_renamed):
df_fully_null = df_columns_renamed.copy()

df_fully_null = df_fully_null.dropna(axis=1, how='all')
df_fully_null = df_fully_null.dropna(axis=0, how='all')

return df_fully_null

def clean_and_fill_content_rating(df_fully_null_removed):
df_clean_content_rating = df_fully_null_removed.copy()

df_clean_content_rating["content_rating"] = df_clean_content_rating["content_rating"].fillna("Unrated")
df_clean_content_rating["content_rating"] = df_clean_content_rating["content_rating"].replace("Not Rated", "Unrated")

return df_clean_content_rating

def clean_release_year(df_clean):
df_clean_release_year_temp = df_clean.copy()

df_clean_release_year_temp["release_year_coerce"] = pd.to_datetime(df_clean_release_year_temp["release_year"], errors = "coerce")
df_clean_release_year_temp["release_year_mixed"] = pd.to_datetime(df_clean_release_year_temp["release_year"], errors = "coerce", format = "mixed")

return df_clean_release_year_temp

def clean_income(clean_release_year):
df_clean_income = clean_release_year.copy()

df_clean_income["income"] = df_clean_income["income"].astype(str).str.replace(r'[^0-9,.]+', '', regex = True)
df_clean_income["income"] = df_clean_income["income"].str.replace(',', '', regex = False)
df_clean_income["income"] = df_clean_income["income"].replace('', None)
df_clean_income["income"] = df_clean_income["income"].astype('Int64')

return df_clean_income



















Loading