Helpful Functions

Motivation

Helpful plotting and calculation functions I refer to often.

Plotting

Five Thirty Eight Theme

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

x = np.linspace(0, 10)
np.random.seed(19680801)

fig, ax = plt.subplots()

ax.plot(x, np.sin(x) + x + np.random.randn(50))
ax.plot(x, np.sin(x) + 0.5 * x + np.random.randn(50))
ax.plot(x, np.sin(x) + 2 * x + np.random.randn(50))
ax.plot(x, np.sin(x) - 0.5 * x + np.random.randn(50))
ax.plot(x, np.sin(x) - 2 * x + np.random.randn(50))
ax.plot(x, np.sin(x) + np.random.randn(50))
ax.set_title("'fivethirtyeight' style sheet")

plt.show()

FiveThirtyEight Color Palette

fivethirtyeight_colors = ['#30a2da', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b']

KDE Plot

fig, ax = plt.subplots(figsize=(18, 8))
sns.kdeplot(df['col1'], shade=True, cut=True, label='Column 1')
sns.kdeplot(df['col2'], shade=True, cut=True, label='Column 2')
sns.kdeplot(df['col3'], shade=True, cut=True, label='Column 3')

plt.title('Comparison of Three Distributions')
ax.set(xlabel='Metric', yticklabels=[])
ax.legend().set_title('Column No.')
plt.tight_layout()
plt.show()

Bar Plot w/ Percentages

fig, axs = plt.subplots(figsize=(18, 8))
sns.barplot(x="col1", y="col2", data=df, estimator=lambda x: len(x) / len(df) * 100, color='#30a2da')
plt.title('Barplot of Percentages')
plt.tight_layout()    
plt.show()

Line Plot

fig, ax = plt.subplots(figsize=(18, 8))
sns.lineplot(x='col1', y='col2', data=df, hue='col3')
plt.title('Lineplot', fontsize=22)

ax.set(xlabel='Column 1', ylabel='Column 2')
plt.legend(fontsize='small', title_fontsize='8')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Swarm Plot

fig, ax = plt.subplots(figsize=(10,8))
sns.swarmplot(x='Sex', y='Age', hue='Survived', data=df)
plt.title('Survival by Sex and Age')
plt.show()

Correlation Matrix

corr = df.corr()
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(10,8))
cmap = sns.color_palette('coolwarm')
sns.heatmap(corr, mask=mask, cmap=cmap, center=0, square=True, linewidths=.5,
            cbar_kws={'shrink':.5}, annot=True, fmt='.2f')
plt.title('Correlation Matrix')
plt.xticks(rotation=45, fontsize=10)
plt.yticks(rotation=0, fontsize=10)
plt.tight_layout()
plt.show()

Three Axes

fig, axs = plt.subplots(figsize=(20,5), nrows=1, ncols=3)
sns.countplot(df['col1'], ax=axs[0])
axs[0].set(xlabel='Score', ylabel='Count', title='Metric #1')
sns.countplot(df['col2'], ax=axs[1])
axs[1].set(xlabel='Score', ylabel='Count', title='Metric #2')
sns.countplot(df['col3'], ax=axs[2])
axs[2].set(xlabel='Score', ylabel='Count', title='Metric #1')
plt.suptitle('Hypothetical Columns', fontsize=22)
plt.tight_layout()
fig.subplots_adjust(top=0.8)
display(fig)

Three Axes w/ Centered Plot

fig, axs = plt.subplots(figsize=(20,10), nrows=1, ncols=3)
sns.barplot(x="col1", y="col2", data=df, ax=axs[1])
axs[1].set(xlabel='Column1', ylabel='Column2')
axs[0].remove()
axs[2].remove()
plt.title('Column 1 Metric by Column 2 Type')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Four Axes

fig, axs = plt.subplots(figsize=(18, 8), nrows=2, ncols=2, sharex=True, sharey=True)
sns.kdeplot(df['col1'], shade=True, cut=False, ax=axs[0,0])
axs[0,0].set(title='Plot #1')

sns.kdeplot(df['col2'], shade=True, cut=False, ax=axs[0,1])
axs[0,1].set(title='Plot #2')

sns.kdeplot(df['col3'], shade=True, cut=False, ax=axs[1,0])
axs[1,0].set(title='Plot #3')

sns.kdeplot(df['col4'], shade=True, cut=False, ax=axs[1,1])
axs[1,1].set(title='Plot #4')

plt.suptitle('Four Axes Plot', fontsize=22)
plt.tight_layout()
fig.subplots_adjust(top=0.8)
display(fig)

Plot Duel Axes

fig, ax = plt.subplots()
ax2 = ax.twinx()
sns.barplot(x='col1', y='col2', data=df, color='royalblue', ax=ax)
sns.lineplot(x='col3', y='col4', data=df, color='orange', ax=ax2)
ax.set_ylabel('Y-Label #1', size=12)
ax2.set_ylabel('Y-Label #2', size=12)
ax.set_xlabel('X-Label', size=12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=55, fontsize=8)
blue_patch = mpatches.Patch(color='royalblue', label='Booked Productivity')
yellow_patch = mpatches.Patch(color='orange', label='In-Season Units')
plt.legend(handles=[blue_patch, yellow_patch], loc='upper right')
plt.title("Title", size=20)
plt.tight_layout()
display(fig)

Format Tick Marks

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as tick

def reformat_large_tick_values(tick_val, pos):
    """
    Turns large tick values (in the billions, millions and thousands) such as 4500 into 4.5K and also appropriately turns 4000 into 4K (no zero after the decimal).
    """
    if tick_val >= 1000000000:
        val = round(tick_val/1000000000, 1)
        new_tick_format = '{:}B'.format(val)
    elif tick_val >= 1000000:
        val = round(tick_val/1000000, 1)
        new_tick_format = '{:}M'.format(val)
    elif tick_val >= 1000:
        val = round(tick_val/1000, 1)
        new_tick_format = '{:}K'.format(val)
    elif tick_val < 1000:
        new_tick_format = round(tick_val, 1)
    else:
        new_tick_format = tick_val

    # make new_tick_format into a string value
    new_tick_format = str(new_tick_format)

    # code below will keep 4.5M as is but change values such as 4.0M to 4M since that zero after the decimal isn't needed
    index_of_decimal = new_tick_format.find(".")

    if index_of_decimal != -1:
        value_after_decimal = new_tick_format[index_of_decimal+1]
        if value_after_decimal == "0":
            # remove the 0 after the decimal point since it's not needed
            new_tick_format = new_tick_format[0:index_of_decimal] + new_tick_format[index_of_decimal+2:]

    return new_tick_format

fig, ax = plt.subplots(figsize=(18, 8))
sns.kdeplot(salary_df.query("Tm == 'DEN'")['Salary'], shade=True, cut=True, label='Denver')
sns.kdeplot(salary_df.query("Tm != 'DEN'")['Salary'], shade=True, cut=True, label='Rest of League')
ax.xaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set(xlabel='Salary', yticklabels=[], title='2019 Salary Distribution Denver vs. Rest of League\nData Source: Basketball Reference')
plt.tight_layout()
plt.show()

Change Font Used in Entire Plot

plt.rcParams["font.family"] = "monospace"

Only Plot Integer Tick Marks

from matplotlib.ticker import MaxNLocator
ax.yaxis.set_major_locator(MaxNLocator(integer=True))

Calculations

Gini Index

def gini_calc(arr):
  try:
  # try statement catches all instances in which >0 units were sold within a style in a given time period and returns the gini coefficient
    # Order NET_SALES_UNITS in ascending order
    sorted_arr = sorted(arr)
    height, area = 0, 0
    # Loop through NET_SALES_UNITS
    for value in sorted_arr:
      # Add to cumulative sum
      height += value
      # (Cumulative sum - units sold of specific product color)/2
      area += height - value / 2.
    # fair_area is the area under the line of equality
    fair_area = height * len(arr) / 2
    # output is the area under the lorenz curve divided by the area under the line of equality
    output = (fair_area - area) / fair_area
    return output
  except:
  # except statement catches all instances in which zero units were sold within a style in a given time period and returns None
  # Ex: Free Run sold 0 units in SP2014 and then would throw an error in the gini calculation so we instead impute None
    return None

T-Test

from scipy import stats
print('T-Test')
t2, p2 = stats.ttest_ind(df['col1'], df['col2'])
print("t = " + str(round(t2, 4)))
print("p = " + str(round(p2, 4)))

Pandas Styling

Tables

import imgkit

df_styled = (df.style
               .set_table_styles(
                   [{'selector': 'tr:nth-of-type(odd)',
                     'props': [('background', '#eee')]},
                    {'selector': 'tbody td', 'props': [('font-family', 'futura')]},
                    {'selector': 'thead', 'props': [('font-family', 'futura')]},
                    {'selector': 'tr:nth-of-type(even)',
                     'props': [('background', 'white')]},
                    {'selector':'th, td', 'props':[('text-align', 'center')]},
                    {'selector': 'caption', 'props':[("font-size", "130%"),
                    ('font-family', 'futura')]}])
              .set_properties(subset=['col1'], **{'text-align': 'left'})
              .set_caption('Table Title')
              .hide_index())

html = df_styled.render()
imgkit.from_string(html, 'table.png',
                        options={'width': 2925,
                                'disable-smart-width': '',
                                'zoom':3.3})

Name		Name	Last commit message	Last commit date
Latest commit History 6 Commits
readme.md		readme.md

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Repository files navigation

Helpful Functions

Motivation

Plotting

Calculations

Pandas Styling

About

Uh oh!

Releases

Packages

chrisfeller/Helpful_Functions

Folders and files

Latest commit

History

Repository files navigation

Helpful Functions

Motivation

Plotting

Calculations

Pandas Styling

About

Resources

Uh oh!

Stars

Watchers

Forks

Releases

Packages 0

Packages