diff --git a/week1/__MACOSX/2014-citibike-tripdata/._.DS_Store b/week1/__MACOSX/2014-citibike-tripdata/._.DS_Store new file mode 100644 index 000000000..a5b28df1c Binary files /dev/null and b/week1/__MACOSX/2014-citibike-tripdata/._.DS_Store differ diff --git a/week1/citibike.R b/week1/citibike.R index ad01de1d3..b0925f27b 100644 --- a/week1/citibike.R +++ b/week1/citibike.R @@ -6,7 +6,8 @@ library(lubridate) ######################################## # read one month of data -trips <- read_csv('201402-citibike-tripdata.csv') +trips <- read_csv('201402-citibike-tripdata.csv', na = c("", "\\N")) + # replace spaces in column names with underscores names(trips) <- gsub(' ', '_', names(trips)) @@ -24,25 +25,43 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn # count the number of trips (= rows in the data frame) +summarize(trips, count=n()) + # find the earliest and latest birth years (see help for max and min to deal with NAs) +summarize(trips, min_year = min(birth_year, na.rm = TRUE), max_year = max(birth_year, na.rm = TRUE)) # use filter and grepl to find all trips that either start or end on broadway +select(trips, start_station_name, end_station_name) |> filter(grepl(".*Broadway.*", start_station_name) | grepl(".*Broadway.*", end_station_name)) + # do the same, but find all trips that both start and end on broadway +select(trips, start_station_name, end_station_name) |> filter(grepl(".*Broadway.*", start_station_name) & grepl(".*Broadway.*", end_station_name)) + # find all unique station names +trips |> distinct(start_station_name) + # count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender # do this all at once, by using summarize() with multiple arguments +trips |> group_by(gender) |> summarize(count = n(), avg = mean(tripduration) / 60, std = sd(tripduration) / 60) # find the 10 most frequent station-to-station trips +trips |> select(start_station_name, 
end_station_name) |> count(start_station_name, end_station_name, sort = TRUE) |> head(10) # find the top 3 end stations for trips starting from each start station +trips |> group_by(start_station_name) |> count(end_station_name, sort = TRUE) |> slice_max(n, n = 3) # find the top 3 most common station-to-station trips by gender +trips |> group_by(gender) |> count(start_station_name, end_station_name, sort = TRUE) |> slice_max(n, n = 3) # find the day with the most trips # tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package) +trips |> mutate(date = as.Date(starttime)) |> select(date) |> count(date, sort = TRUE) |> head(1) # compute the average number of trips taken during each of the 24 hours of the day across the entire month # what time(s) of day tend to be peak hour(s)? + +date_hour_counts <- trips |> mutate(hour = hour(starttime)) |> mutate(date = as.Date(starttime)) |> select(date, hour) |> group_by(hour) |> count(date, hour) +date_hour_counts |> group_by(hour) |> summarize(avg = mean(n)) |> arrange(avg) |> tail(1) + diff --git a/week1/citibike.sh b/week1/citibike.sh index 25604f545..f56507913 100755 --- a/week1/citibike.sh +++ b/week1/citibike.sh @@ -5,19 +5,36 @@ # count the number of unique stations +cut -d, -f5 201402-citibike-tripdata.csv | sort | uniq -c | wc -l + # count the number of unique bikes +cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | wc -l + # count the number of trips per day +cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c + # find the day with the most rides +cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort -rn | head -n1 + # find the day with the fewest rides +cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | head -n2 | tail -n1 + # find the id of the bike with the most rides +cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | sort -rn | head -n1 + # count the number of rides by gender 
and birth year +cut -d, -f14,15 201402-citibike-tripdata.csv | sort | uniq -c + # count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...) +cut -d, -f5 201402-citibike-tripdata.csv | grep '.*[0-9].*&.*[0-9].*' | wc -l # compute the average trip duration + +cut -d, -f1 201402-citibike-tripdata.csv | tail -n +2 | awk '{sum += $1; avg = sum/NR} END {print avg}' \ No newline at end of file diff --git a/week1/plot_trips.R b/week1/plot_trips.R index 4f25437ba..d2c8185b4 100644 --- a/week1/plot_trips.R +++ b/week1/plot_trips.R @@ -10,53 +10,176 @@ library(scales) theme_set(theme_bw()) # load RData file output by load_trips.R -load('trips.RData') +load("trips.RData") ######################################## # plot trip data ######################################## +library(ggplot2) + # plot the distribution of trip times across all rides (compare a histogram vs. a density plot) +trips |> + ggplot(aes(x = tripduration)) + + geom_histogram(bins = 30) + + scale_x_log10() + +trips |> + ggplot(aes(x = tripduration)) + + geom_density(fill = "grey") + + scale_x_log10() + # plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. 
a density plot) + +trips |> + ggplot(aes(x = tripduration, color = usertype, fill = usertype)) + + geom_histogram(bins = 30) + + scale_x_log10() + +trips |> + ggplot(aes(x = tripduration, color = usertype, fill = usertype)) + + geom_density() + + scale_x_log10() + # plot the total number of trips on each day in the dataset +trips |> + mutate(date = as.Date(starttime)) |> + ggplot(aes(x = date)) + + geom_histogram(bins = 365) + + # plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color) +trips |> + mutate(age = year(Sys.Date()) - birth_year) |> + group_by(gender, age) |> + summarize(count_trip_by_age = n()) |> + ggplot(aes(x = age, y = count_trip_by_age, color = gender, fill = gender)) + + geom_area(alpha = 0.8) + + scale_y_log10() + + # plot the ratio of male to female trips (on the y axis) by age (on the x axis) # hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio # (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet) +head(trips) +trips |> + mutate(age = year(Sys.Date()) - birth_year) |> + select(gender, age) |> + group_by(age, gender) |> + summarize(count_age_gender = n()) |> + pivot_wider(names_from = gender, values_from = count_age_gender) |> + mutate(male_female_ratio = Male/Female) |> + ggplot(aes(x = age, y = male_female_ratio)) + + geom_point() + + geom_smooth(se = FALSE) + + + + ######################################## # plot weather data ######################################## # plot the minimum temperature (on the y axis) over each day (on the x axis) +weather |> + ggplot(aes(x = date, y = tmin)) + + geom_point() + # plot the minimum temperature and maximum temperature (on the y axis, with different colors) over each day (on the x axis) # hint: try using the pivot_longer() function for this to reshape things before plotting # (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet) +weather |> 
+ select(date, tmin, tmax) |> + pivot_longer(names_to = "min_max", values_to = "temperature", -date) |> + ggplot(aes(x = date, y = temperature, color = min_max, fill = min_max)) + + geom_point() + + ######################################## # plot trip and weather data ######################################## # join trips and weather -trips_with_weather <- inner_join(trips, weather, by="ymd") +trips_with_weather <- inner_join(trips, weather, by = "ymd") # plot the number of trips as a function of the minimum temperature, where each point represents a day # you'll need to summarize the trips and join to the weather data to do this +trips_with_weather |> + group_by(date, tmin) |> + summarize(count_trip = n()) |> + ggplot(aes(x = tmin, y = count_trip)) + + geom_point() + + # repeat this, splitting results by whether there was substantial precipitation or not # you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this +mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp)) +mean_prcp <- mean_prcp$mean_prcp + +trips_with_weather |> + mutate(is_subs_prcp = prcp >= mean_prcp) |> + group_by(date, tmin, is_subs_prcp) |> + summarize(count_trip = n()) |> + ggplot(aes(x = tmin, y = count_trip, color = is_subs_prcp)) + + geom_point() + + + # add a smoothed fit on top of the previous plot, using geom_smooth +mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp)) +mean_prcp <- mean_prcp$mean_prcp + +trips_with_weather |> + mutate(is_subs_prcp = prcp >= mean_prcp) |> + group_by(date, tmin, is_subs_prcp) |> + summarize(count_trip = n()) |> + ggplot(aes(x = tmin, y = count_trip, color = is_subs_prcp)) + + geom_point() + + geom_smooth(method = "lm") + + # compute the average number of trips and standard deviation in number of trips by hour of the day # hint: use the hour() function from the lubridate package +trips_with_weather |> + mutate(hour = hour(starttime), date = as.Date(starttime)) |> + 
group_by(date, hour) |> + summarize(count = n()) |> + group_by(hour) |> + summarize(avg = mean(count), std = sd(count)) + # plot the above +trips_with_weather |> + mutate(hour = hour(starttime), date = as.Date(starttime)) |> + group_by(date, hour) |> + summarize(count = n()) |> + group_by(hour) |> + summarize(avg = mean(count), std = sd(count)) |> + pivot_longer(cols = c(avg, std), names_to = "stat", values_to = "value") |> + ggplot(aes(x = hour, y = value, color = stat, fill = stat)) + + geom_col(position = "dodge") + # repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. weekend days # hint: use the wday() function from the lubridate package + +trips_with_weather |> + mutate(hour = hour(starttime), date = as.Date(starttime), day = wday(starttime, label = TRUE)) |> + group_by(date, day, hour) |> + summarize(count = n()) |> + group_by(day, hour) |> + summarize(avg = mean(count), std = sd(count)) |> + pivot_longer(cols = c(avg, std), names_to = "stat", values_to = "value") |> + ggplot(aes(x = hour, y = value, color = stat, fill = stat)) + + geom_col(position = "dodge") + + facet_wrap(~ day) diff --git a/week2/diamond-sizes.Rmd b/week2/diamond-sizes.Rmd index 3003fdbd2..eacd5a1d1 100644 --- a/week2/diamond-sizes.Rmd +++ b/week2/diamond-sizes.Rmd @@ -4,9 +4,16 @@ date: 2016-08-25 output: html_document --- +```{r, echo = FALSE} +knitr::opts_chunk$set( + echo = FALSE +) +``` + ```{r setup, include = FALSE} library(ggplot2) library(dplyr) +library(viridis) smaller <- diamonds %>% filter(carat <= 2.5) @@ -17,8 +24,43 @@ We have data about `r nrow(diamonds)` diamonds. Only 2.5 carats. 
The distribution of the remainder is shown below: -```{r, echo = FALSE} +```{r} smaller %>% ggplot(aes(carat)) + geom_freqpoly(binwidth = 0.01) +``` + +The striking features include: +- Most diamonds (~25000) are approximately 0.25 carats +- Majority of the diamonds are less than 1.25 carats + +# Exercise 27.4.7 +## Exercise 1 + +Add a section that explores how diamond sizes vary by cut, colour, and clarity. Assume you’re +writing a report for someone who doesn’t know R, and instead of setting echo = FALSE on each +chunk, set a global option. + +```{r} + +diamonds |> + ggplot(aes(x = carat, color = color, fill = color)) + + geom_histogram(bins = 30) + +``` + +```{r} + +diamonds |> + ggplot(aes(x = carat, color = cut, fill = cut)) + + geom_histogram(bins = 30) + + scale_fill_viridis_d(option = "mako", direction = -1) +``` + +```{r} + +diamonds |> + ggplot(aes(x = carat, color = clarity, fill = clarity)) + + geom_histogram(bins = 30) + + scale_fill_viridis_d(option = "magma") ``` \ No newline at end of file diff --git a/week2/diamond-sizes.html b/week2/diamond-sizes.html new file mode 100644 index 000000000..d6a3dc732 --- /dev/null +++ b/week2/diamond-sizes.html @@ -0,0 +1,424 @@ + + + + +
+ + + + + + + + + +We have data about 53940 diamonds. Only 126 are larger than 2.5 +carats. The distribution of the remainder is shown below:
+The striking features include: - Most diamonds (~25000) are +approximately 0.25 carats - Majority of the diamonds are less than 1.25 +carats
+Add a section that explores how diamond sizes vary by cut, colour, +and clarity. Assume you’re writing a report for someone who doesn’t know +R, and instead of setting echo = FALSE on each chunk, set a global +option.
+library(tidyverse)
-## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
-## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
-## ✔ tibble 2.0.1 ✔ dplyr 0.7.8
-## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
-## ✔ readr 1.1.1 ✔ forcats 0.3.0
-## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
+## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
+## ✔ dplyr 1.1.4 ✔ readr 2.1.5
+## ✔ forcats 1.0.0 ✔ stringr 1.5.1
+## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
+## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
+## ✔ purrr 1.0.4
+## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
-## ✖ dplyr::lag() masks stats::lag()
-#library(ggplot2)
-#library(reshape)
-#library(dplyr)
-
-theme_set(theme_bw())
+## ✖ dplyr::lag() masks stats::lag()
+## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
+theme_set(theme_bw())
set.seed(42)
Estimating a proportion
Point estimate and sampling distribution
-Repeatedly flip a biased coin 100 times and estimate its bias. Adapted from Yakir 11.2.3.
+Repeatedly flip a biased coin 100 times and estimate its bias.
+Adapted from Yakir 11.2.3.
estimate_coin_bias <- function(n, p) {
mean(rbinom(n,1,p))
}
@@ -429,7 +417,7 @@ Point estimate and sampling distribution
geom_histogram(binwidth = 0.01) +
geom_vline(xintercept=p) +
geom_vline(xintercept=mean(p_hat), linetype=2, color="red")
-
+
# repeat this for different sample sizes
plot_data <- data.frame()
for (n in c(100, 200, 400, 800)) {
@@ -440,7 +428,7 @@ Point estimate and sampling distribution
ggplot(plot_data, aes(x = p_hat)) +
geom_histogram(binwidth = 0.01) +
facet_wrap(~ n, nrow = 1)
-
+
se_data <- plot_data %>%
group_by(n) %>%
summarize(se=sd(p_hat))
@@ -448,7 +436,7 @@ Point estimate and sampling distribution
ggplot(se_data, aes(x = n, y = se)) +
geom_point() +
stat_function(fun=function(n) {sqrt(p * (1 - p) / n)}, linetype=2)
-
+
Confidence intervals
@@ -473,7 +461,7 @@ Confidence intervals
xlab('') +
scale_color_manual(values=c("red","darkgreen")) +
theme(legend.position="none")
-
+
Hypothesis testing
@@ -489,21 +477,24 @@ Hypothesis testing
ggplot(data.frame(p0_hat), aes(x = p0_hat)) +
geom_histogram(binwidth = 0.01) +
geom_vline(xintercept=p_hat, linetype=2, color="red")
-
+
# compare this to our experiment
# how likely is it that we would see an estimate this extreme if the coin really were fair?
num_as_extreme <- sum(p0_hat <= p_hat)
p_value <- num_as_extreme / length(p0_hat)
p_value
## [1] 0
-Only 0 out of 100000 estimates from a fair coin with p=0.5 would result in an estimate of p_hat=0.29 or smaller, corresponding to a p-value of 0.
+Only 0 out of 100000 estimates from a fair coin with p=0.5 would
+result in an estimate of p_hat=0.29 or smaller, corresponding to a
+p-value of 0.
Comparing two proportions
Point estimates and sampling distributions
-Repeatedly flip two coins, each 500 times and estimate their bias.
+Repeatedly flip two coins, each 500 times and estimate their
+bias.
estimate_coin_bias <- function(n, p) {
mean(rbinom(n,1,p))
}
@@ -516,24 +507,25 @@ Point estimates and sampling distributions
# wrangle the results into one data frame
plot_data <- bind_rows(data.frame(split='A', trial=1:length(pa_hat), p_hat=pa_hat),
- data.frame(split='B', trial=1:length(pb_hat), p_hat=pb_hat))
-## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
-## Warning in bind_rows_(x, .id): binding character and factor vector,
-## coercing into character vector
-
-## Warning in bind_rows_(x, .id): binding character and factor vector,
-## coercing into character vector
-# plot the sampling distributions for each split
+ data.frame(split='B', trial=1:length(pb_hat), p_hat=pb_hat))
+
+# plot the sampling distributions for each split
ggplot(data=plot_data, aes(x=p_hat, fill=split)) +
geom_histogram(position="identity", binwidth=0.002, alpha=0.5) +
scale_alpha(guide=F)
-
+## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
+## ggplot2 3.3.4.
+## ℹ Please use "none" instead.
+## This warning is displayed once every 8 hours.
+## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
+## generated.
+
# plot the sampling distribution of the difference
ggplot(data.frame(pa_hat, pb_hat), aes(x = pa_hat - pb_hat)) +
geom_histogram(binwidth = 0.002) +
geom_vline(xintercept=pa-pb) +
geom_vline(xintercept=mean(pa_hat-pb_hat), linetype=2, color="red")
-
+
# note that variances add for independent random variables
variance_of_difference <- var(pa_hat - pb_hat)
sum_of_variances <- var(pa_hat) + var(pb_hat)
@@ -549,7 +541,7 @@ Confidence intervals
geom_pointrange(aes(ymin=LCL, ymax=UCL)) +
xlab('') +
theme(legend.title=element_blank())
-
+
Hypothesis testing
@@ -569,14 +561,16 @@ Hypothesis testing
ggplot(data.frame(dp0_hat), aes(x = dp0_hat)) +
geom_histogram(binwidth = 0.01) +
geom_vline(xintercept=dp_hat, linetype=2, color="red")
-
+
# compare this to our experiment
# how likely is it that we would see an estimate this extreme both coins were identical?
num_as_extreme <- sum(dp0_hat >= dp_hat)
p_value <- num_as_extreme / length(dp0_hat)
p_value
## [1] 0.00048
-Only 48 out of 100000 estimates from two identical coins with p=0.08 would result in an estimate of dp_hat=0.058 or smaller, corresponding to a p-value of 4.810^{-4}.
+Only 48 out of 100000 estimates from two identical coins with p=0.08
+would result in an estimate of dp_hat=0.058 or smaller, corresponding to
+a p-value of 4.8^{-4}.
@@ -621,26 +615,21 @@ Computing power by direct simulation
}
## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
## Chi-squared approximation may be incorrect
-
## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
## Chi-squared approximation may be incorrect
-
## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
## Chi-squared approximation may be incorrect
-
## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
## Chi-squared approximation may be incorrect
-
## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
## Chi-squared approximation may be incorrect
-
## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
## Chi-squared approximation may be incorrect
ggplot(data.frame(N, pow), aes(x = N, y = pow)) +
geom_smooth(color = "gray") +
geom_point()
-## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
-
+## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
+