diff --git a/week1/__MACOSX/2014-citibike-tripdata/._.DS_Store b/week1/__MACOSX/2014-citibike-tripdata/._.DS_Store new file mode 100644 index 000000000..a5b28df1c Binary files /dev/null and b/week1/__MACOSX/2014-citibike-tripdata/._.DS_Store differ diff --git a/week1/citibike.R b/week1/citibike.R index ad01de1d3..b0925f27b 100644 --- a/week1/citibike.R +++ b/week1/citibike.R @@ -6,7 +6,8 @@ library(lubridate) ######################################## # read one month of data -trips <- read_csv('201402-citibike-tripdata.csv') +trips <- read_csv('201402-citibike-tripdata.csv', na = c("", "\\N")) + # replace spaces in column names with underscores names(trips) <- gsub(' ', '_', names(trips)) @@ -24,25 +25,43 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn # count the number of trips (= rows in the data frame) +summarize(trips, count=n()) + # find the earliest and latest birth years (see help for max and min to deal with NAs) +summarize(trips, min_year = min(birth_year, na.rm = TRUE), max_year = max(birth_year, na.rm = TRUE)) # use filter and grepl to find all trips that either start or end on broadway +select(trips, start_station_name, end_station_name) |> filter(grepl(".*Broadway.*", start_station_name) | grepl(".*Broadway.*", end_station_name)) + # do the same, but find all trips that both start and end on broadway +select(trips, start_station_name, end_station_name) |> filter(grepl(".*Broadway.*", start_station_name) & grepl(".*Broadway.*", end_station_name)) + # find all unique station names +trips |> distinct(start_station_name) + # count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender # do this all at once, by using summarize() with multiple arguments +trips |> group_by(gender) |> summarize(count = n(), avg = mean(tripduration) / 60, std = sd(tripduration) / 60) # find the 10 most frequent station-to-station trips +trips |> select(start_station_name, 
end_station_name) |> count(start_station_name, end_station_name, sort = TRUE) |> head(10) # find the top 3 end stations for trips starting from each start station +trips |> group_by(start_station_name) |> count(end_station_name, sort = TRUE) |> slice_max(n, n = 3) # find the top 3 most common station-to-station trips by gender +trips |> group_by(gender) |> count(start_station_name, end_station_name, sort = TRUE) |> slice_max(n, n = 3) # find the day with the most trips # tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package) +trips |> mutate(date = as.Date(starttime)) |> select(date) |> count(date, sort = TRUE) |> head(1) # compute the average number of trips taken during each of the 24 hours of the day across the entire month # what time(s) of day tend to be peak hour(s)? + +date_hour_counts <- trips |> mutate(hour = hour(starttime)) |> mutate(date = as.Date(starttime)) |> select(date, hour) |> group_by(hour) |> count(date, hour) +date_hour_counts |> group_by(hour) |> summarize(avg = mean(n)) |> arrange(avg) |> tail(1) + diff --git a/week1/citibike.sh b/week1/citibike.sh index 25604f545..f56507913 100755 --- a/week1/citibike.sh +++ b/week1/citibike.sh @@ -5,19 +5,36 @@ # count the number of unique stations +cut -d, -f5 201402-citibike-tripdata.csv | sort | uniq -c | wc -l + # count the number of unique bikes +cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | wc -l + # count the number of trips per day +cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c + # find the day with the most rides +cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort -r | head -n1 + # find the day with the fewest rides +cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | head -n2 | tail -n1 + # find the id of the bike with the most rides +cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | sort -r | head -n1 + # count the number of rides by gender 
and birth year +cut -d, -f14,15 201402-citibike-tripdata.csv | sort | uniq -c + # count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...) +cut -d, -f5 201402-citibike-tripdata.csv | grep '.*[0-9].*&.*[0-9].*' | wc -l # compute the average trip duration + +cut -d, -f1 201402-citibike-tripdata.csv | tail -n +2 | awk '{sum += $1; avg = sum/NR} END {print avg}' \ No newline at end of file diff --git a/week1/plot_trips.R b/week1/plot_trips.R index 4f25437ba..d2c8185b4 100644 --- a/week1/plot_trips.R +++ b/week1/plot_trips.R @@ -10,53 +10,176 @@ library(scales) theme_set(theme_bw()) # load RData file output by load_trips.R -load('trips.RData') +load("trips.RData") ######################################## # plot trip data ######################################## +library(ggplot2) + # plot the distribution of trip times across all rides (compare a histogram vs. a density plot) +trips |> + ggplot(aes(x = tripduration)) + + geom_histogram(bins = 30) + + scale_x_log10() + +trips |> + ggplot(aes(x = tripduration)) + + geom_density(fill = "grey") + + scale_x_log10() + # plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. 
a density plot) + +trips |> + ggplot(aes(x = tripduration, color = usertype, fill = usertype)) + + geom_histogram(bins = 30) + + scale_x_log10() + +trips |> + ggplot(aes(x = tripduration, color = usertype, fill = usertype)) + + geom_density() + + scale_x_log10() + # plot the total number of trips on each day in the dataset +trips |> + mutate(date = as.Date(starttime)) |> + ggplot(aes(x = date)) + + geom_histogram(bins = 365) + + # plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color) +trips |> + mutate(age = year(Sys.Date()) - birth_year) |> + group_by(gender, age) |> + summarize(count_trip_by_age = n()) |> + ggplot(aes(x = age, y = count_trip_by_age, color = gender, fill = gender)) + + geom_area(alpha = 0.8) + + scale_y_log10() + + # plot the ratio of male to female trips (on the y axis) by age (on the x axis) # hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio # (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet) +head(trips) +trips |> + mutate(age = year(Sys.Date()) - birth_year) |> + select(gender, age) |> + group_by(age, gender) |> + summarize(count_age_gender = n()) |> + pivot_wider(names_from = gender, values_from = count_age_gender) |> + mutate(male_female_ratio = Male/Female) |> + ggplot(aes(x = age, y = male_female_ratio)) + + geom_point() + + geom_smooth(se = FALSE) + + + + ######################################## # plot weather data ######################################## # plot the minimum temperature (on the y axis) over each day (on the x axis) +weather |> + ggplot(aes(x = date, y = tmin)) + + geom_point() + # plot the minimum temperature and maximum temperature (on the y axis, with different colors) over each day (on the x axis) # hint: try using the pivot_longer() function for this to reshape things before plotting # (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet) +weather |> 
+ select(date, tmin, tmax) |> + pivot_longer(names_to = "min_max", values_to = "temperature", -date) |> + ggplot(aes(x = date, y = temperature, color = min_max, fill = min_max)) + + geom_point() + + ######################################## # plot trip and weather data ######################################## # join trips and weather -trips_with_weather <- inner_join(trips, weather, by="ymd") +trips_with_weather <- inner_join(trips, weather, by = "ymd") # plot the number of trips as a function of the minimum temperature, where each point represents a day # you'll need to summarize the trips and join to the weather data to do this +trips_with_weather |> + group_by(date, tmin) |> + summarize(count_trip = n()) |> + ggplot(aes(x = tmin, y = count_trip)) + + geom_point() + + # repeat this, splitting results by whether there was substantial precipitation or not # you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this +mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp)) +mean_prcp <- mean_prcp$mean_prcp + +trips_with_weather |> + mutate(is_subs_prcp = prcp >= mean_prcp) |> + group_by(date, tmin, is_subs_prcp) |> + summarize(count_trip = n()) |> + ggplot(aes(x = tmin, y = count_trip, color = is_subs_prcp)) + + geom_point() + + + # add a smoothed fit on top of the previous plot, using geom_smooth +mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp)) +mean_prcp <- mean_prcp$mean_prcp + +trips_with_weather |> + mutate(is_subs_prcp = prcp >= mean_prcp) |> + group_by(date, tmin, is_subs_prcp) |> + summarize(count_trip = n()) |> + ggplot(aes(x = tmin, y = count_trip, color = is_subs_prcp)) + + geom_point() + + geom_smooth(method = "lm") + + # compute the average number of trips and standard deviation in number of trips by hour of the day # hint: use the hour() function from the lubridate package +trips_with_weather |> + mutate(hour = hour(starttime), date = as.Date(starttime)) |> + 
group_by(date, hour) |> + summarize(count = n()) |> + group_by(hour) |> + summarize(avg = mean(count), std = sd(count)) + # plot the above +trips_with_weather |> + mutate(hour = hour(starttime), date = as.Date(starttime)) |> + group_by(date, hour) |> + summarize(count = n()) |> + group_by(hour) |> + summarize(avg = mean(count), std = sd(count)) |> + pivot_longer(cols = c(avg, std), names_to = "stat", values_to = "value") |> + ggplot(aes(x = hour, y = value, color = stat, fill = stat)) + + geom_col(position = "dodge") + # repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. weekend days # hint: use the wday() function from the lubridate package + +trips_with_weather |> + mutate(hour = hour(starttime), date = as.Date(starttime), day = wday(starttime, label = TRUE)) |> + group_by(date, day, hour) |> + summarize(count = n()) |> + group_by(day, hour) |> + summarize(avg = mean(count), std = sd(count)) |> + pivot_longer(cols = c(avg, std), names_to = "stat", values_to = "value") |> + ggplot(aes(x = hour, y = value, color = stat, fill = stat)) + + geom_col(position = "dodge") + + facet_wrap(~ day) diff --git a/week2/diamond-sizes.Rmd b/week2/diamond-sizes.Rmd index 3003fdbd2..eacd5a1d1 100644 --- a/week2/diamond-sizes.Rmd +++ b/week2/diamond-sizes.Rmd @@ -4,9 +4,16 @@ date: 2016-08-25 output: html_document --- +```{r, echo = FALSE} +knitr::opts_chunk$set( + echo = FALSE +) +``` + ```{r setup, include = FALSE} library(ggplot2) library(dplyr) +library(viridis) smaller <- diamonds %>% filter(carat <= 2.5) @@ -17,8 +24,43 @@ We have data about `r nrow(diamonds)` diamonds. Only 2.5 carats. 
The distribution of the remainder is shown below: -```{r, echo = FALSE} +```{r} smaller %>% ggplot(aes(carat)) + geom_freqpoly(binwidth = 0.01) +``` + +The striking features include: +- Most diamonds (~25000) are approximately 0.25 carats +- Majority of the diamonds are less than 1.25 carats + +# Exercise 27.4.7 +## Exercise 1 + +Add a section that explores how diamond sizes vary by cut, colour, and clarity. Assume you’re +writing a report for someone who doesn’t know R, and instead of setting echo = FALSE on each +chunk, set a global option. + +```{r} + +diamonds |> + ggplot(aes(x = carat, color = color, fill = color)) + + geom_histogram(bins = 30) + +``` + +```{r} + +diamonds |> + ggplot(aes(x = carat, color = cut, fill = cut)) + + geom_histogram(bins = 30) + + scale_fill_viridis_d(option = "mako", direction = -1) +``` + +```{r} + +diamonds |> + ggplot(aes(x = carat, color = clarity, fill = clarity)) + + geom_histogram(bins = 30) + + scale_fill_viridis_d(option = "magma") ``` \ No newline at end of file diff --git a/week2/diamond-sizes.html b/week2/diamond-sizes.html new file mode 100644 index 000000000..d6a3dc732 --- /dev/null +++ b/week2/diamond-sizes.html @@ -0,0 +1,424 @@ + + + + + + + + + + + + + + +Diamond sizes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

We have data about 53940 diamonds. Only 126 are larger than 2.5 +carats. The distribution of the remainder is shown below:

+

+

The striking features include: - Most diamonds (~25000) are +approximately 0.25 carats - Majority of the diamonds are less than 1.25 +carats

+
+

Exercise 27.4.7

+
+

Exercise 1

+

Add a section that explores how diamond sizes vary by cut, colour, +and clarity. Assume you’re writing a report for someone who doesn’t know +R, and instead of setting echo = FALSE on each chunk, set a global +option.

+

+

+

+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/week2/statistical_inference.Rmd b/week2/statistical_inference.Rmd index 68c6dde5e..0f9dbd340 100644 --- a/week2/statistical_inference.Rmd +++ b/week2/statistical_inference.Rmd @@ -55,6 +55,7 @@ ggplot(se_data, aes(x = n, y = se)) + stat_function(fun=function(n) {sqrt(p * (1 - p) / n)}, linetype=2) ``` + ## Confidence intervals ```{r} set.seed(42) diff --git a/week2/statistical_inference.html b/week2/statistical_inference.html index db78886b6..a0de9fdf5 100644 --- a/week2/statistical_inference.html +++ b/week2/statistical_inference.html @@ -1,28 +1,38 @@ - + - + - + Statistical Inference - + - + - + + - + + + + + +
-

Hypothesis testing

@@ -489,21 +477,24 @@

Hypothesis testing

ggplot(data.frame(p0_hat), aes(x = p0_hat)) + geom_histogram(binwidth = 0.01) + geom_vline(xintercept=p_hat, linetype=2, color="red") -

+

# compare this to our experiment
 # how likely is it that we would see an estimate this extreme if the coin really were fair?
 num_as_extreme <- sum(p0_hat <= p_hat)
 p_value <- num_as_extreme / length(p0_hat)
 p_value
## [1] 0
-

Only 0 out of 100000 estimates from a fair coin with p=0.5 would result in an estimate of p_hat=0.29 or smaller, corresponding to a p-value of 0.

+

Only 0 out of 100000 estimates from a fair coin with p=0.5 would +result in an estimate of p_hat=0.29 or smaller, corresponding to a +p-value of 0.

Comparing two proportions

Point estimates and sampling distributions

-

Repeatedly flip two coins, each 500 times and estimate their bias.

+

Repeatedly flip two coins, each 500 times and estimate their +bias.

estimate_coin_bias <- function(n, p) {
   mean(rbinom(n,1,p))
 }
@@ -516,24 +507,25 @@ 

Point estimates and sampling distributions

# wrangle the results into one data frame plot_data <- bind_rows(data.frame(split='A', trial=1:length(pa_hat), p_hat=pa_hat), - data.frame(split='B', trial=1:length(pb_hat), p_hat=pb_hat))
-
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
-
## Warning in bind_rows_(x, .id): binding character and factor vector,
-## coercing into character vector
-
-## Warning in bind_rows_(x, .id): binding character and factor vector,
-## coercing into character vector
-
# plot the sampling distributions for each split
+                       data.frame(split='B', trial=1:length(pb_hat), p_hat=pb_hat))
+
+# plot the sampling distributions for each split
 ggplot(data=plot_data, aes(x=p_hat, fill=split)) +
   geom_histogram(position="identity", binwidth=0.002, alpha=0.5) +
   scale_alpha(guide=F)
-

+
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
+## ggplot2 3.3.4.
+## ℹ Please use "none" instead.
+## This warning is displayed once every 8 hours.
+## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
+## generated.
+

# plot the sampling distribution of the difference
 ggplot(data.frame(pa_hat, pb_hat), aes(x = pa_hat - pb_hat)) +
   geom_histogram(binwidth = 0.002) +
   geom_vline(xintercept=pa-pb) +
   geom_vline(xintercept=mean(pa_hat-pb_hat), linetype=2, color="red")  
-

+

# note that variances add for independent random variables
 variance_of_difference <- var(pa_hat - pb_hat)
 sum_of_variances <- var(pa_hat) + var(pb_hat)
@@ -549,7 +541,7 @@

Confidence intervals

geom_pointrange(aes(ymin=LCL, ymax=UCL)) + xlab('') + theme(legend.title=element_blank()) -

+

Hypothesis testing

@@ -569,14 +561,16 @@

Hypothesis testing

ggplot(data.frame(dp0_hat), aes(x = dp0_hat)) + geom_histogram(binwidth = 0.01) + geom_vline(xintercept=dp_hat, linetype=2, color="red") -

+

# compare this to our experiment
 # how likely is it that we would see an estimate this extreme both coins were identical?
 num_as_extreme <- sum(dp0_hat >= dp_hat)
 p_value <- num_as_extreme / length(dp0_hat)
 p_value
## [1] 0.00048
-

Only 48 out of 100000 estimates from two identical coins with p=0.08 would result in an estimate of dp_hat=0.058 or smaller, corresponding to a p-value of 4.810^{-4}.

+

Only 48 out of 100000 estimates from two identical coins with p=0.08 +would result in an estimate of dp_hat=0.058 or smaller, corresponding to +a p-value of 4.8^{-4}.

@@ -621,26 +615,21 @@

Computing power by direct simulation

}
## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
 ## Chi-squared approximation may be incorrect
-
 ## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
 ## Chi-squared approximation may be incorrect
-
 ## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
 ## Chi-squared approximation may be incorrect
-
 ## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
 ## Chi-squared approximation may be incorrect
-
 ## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
 ## Chi-squared approximation may be incorrect
-
 ## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
 ## Chi-squared approximation may be incorrect
ggplot(data.frame(N, pow), aes(x = N, y = pow)) +
   geom_smooth(color = "gray") +
   geom_point()
-
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
-

+
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
+

@@ -653,7 +642,7 @@

Computing power by direct simulation

// add bootstrap table styles to pandoc tables function bootstrapStylePandocTables() { - $('tr.header').parent('thead').parent('table').addClass('table table-condensed'); + $('tr.odd').parent('tbody').parent('table').addClass('table table-condensed'); } $(document).ready(function () { bootstrapStylePandocTables(); @@ -662,6 +651,23 @@

Computing power by direct simulation

+ + + + + + +