47 commits
c8cd96b
day 2 assignment
May 28, 2025
71110c0
Merge branch 'master' of https://github.com/msr-ds3/coursework
May 28, 2025
a20f955
Merge branch 'master' of https://github.com/msr-ds3/coursework
May 29, 2025
e993c3b
day 3 R assignment
May 29, 2025
4f4536c
day 3 R assignment - Aisha Malik
May 29, 2025
afd0eb1
Merge branch 'master' of https://github.com/msr-ds3/coursework
May 30, 2025
6dfcb9e
Merge branch 'master' of https://github.com/msr-ds3/coursework
May 30, 2025
faa7596
Merge branch 'master' of https://github.com/msr-ds3/coursework
May 30, 2025
0cef723
Merge branch 'master' of https://github.com/msr-ds3/coursework
May 30, 2025
812460b
completed day 4 plotting assignment - Aisha Malik
May 30, 2025
f734b54
Merge branch 'master' of https://github.com/msr-ds3/coursework
May 30, 2025
ce3fcd0
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 2, 2025
53811d3
completed plotting assignment - Aisha Malik
Jun 2, 2025
7eefc54
completed day 5 assignments
Jun 2, 2025
d8850cf
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 2, 2025
2ffbd99
updated day 5 assignment
Jun 2, 2025
82e6b81
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 3, 2025
254d199
week 2 day 2 - Aisha
Jun 3, 2025
ab3ae12
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 4, 2025
59accc1
updated files
Jun 4, 2025
77db180
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 4, 2025
79d14cd
Week 2 Day 3 Assignment - Aisha
Jun 4, 2025
5dfca8f
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 5, 2025
b7d5549
week 2 day 4 - Aisha
Jun 5, 2025
cc48915
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 6, 2025
ecd4539
week 2 day 4 - Aisha
Jun 8, 2025
cec9f38
week 2 day 5 assignment - Aisha
Jun 8, 2025
8c62f7e
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 9, 2025
2318079
week 3 day 1 assignment - Aisha
Jun 10, 2025
0b0cef9
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 10, 2025
3564dda
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 11, 2025
11d23aa
week 3 day 3 assignment - Aisha
Jun 11, 2025
55eac77
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 12, 2025
18f4941
week 3 day 4 assignment - Aisha
Jun 12, 2025
8b1b7c3
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 13, 2025
28ff1fc
completed movielens - Aisha
Jun 13, 2025
5f9d3bb
week 3 day 5 assignment - Aisha
Jun 14, 2025
96cb1a2
week 3 day 5 assignment - Aisha
Jun 14, 2025
30cffec
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 16, 2025
22d6ee8
predict citibike assignment - Aisha
Jun 16, 2025
edd9214
Added best model RData file
Jun 16, 2025
de4ad26
updated prediction on test data
Jun 16, 2025
746ee62
updated prediction on test data
Jun 16, 2025
126c03d
Merge branch 'master' of https://github.com/msr-ds3/coursework
Jun 17, 2025
d41677e
citibike testing 2015 - Aisha
Jun 17, 2025
f8737b6
citibike testing 2015 - Aisha
Jun 17, 2025
992fe31
data preprocessing - aisha
Jun 17, 2025
Binary file added week1/__MACOSX/2014-citibike-tripdata/._.DS_Store
Binary file not shown.
21 changes: 20 additions & 1 deletion week1/citibike.R
Original file line number Diff line number Diff line change
@@ -6,7 +6,8 @@ library(lubridate)
########################################

# read one month of data
trips <- read_csv('201402-citibike-tripdata.csv')
trips <- read_csv('201402-citibike-tripdata.csv', na = c("", "\\N"))


# replace spaces in column names with underscores
names(trips) <- gsub(' ', '_', names(trips))
@@ -24,25 +25,43 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn

# count the number of trips (= rows in the data frame)

summarize(trips, count=n())

# find the earliest and latest birth years (see help for max and min to deal with NAs)
summarize(trips, min_year = min(birth_year, na.rm = TRUE), max_year = max(birth_year, na.rm = TRUE))

# use filter and grepl to find all trips that either start or end on broadway

select(trips, start_station_name, end_station_name) |> filter(grepl("Broadway", start_station_name) | grepl("Broadway", end_station_name))

# do the same, but find all trips that both start and end on broadway

select(trips, start_station_name, end_station_name) |> filter(grepl("Broadway", start_station_name) & grepl("Broadway", end_station_name))

# find all unique station names

bind_rows(
  distinct(trips, station = start_station_name),
  distinct(trips, station = end_station_name)
) |> distinct(station)

# count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender
# do this all at once, by using summarize() with multiple arguments
trips |> group_by(gender) |> summarize(count = n(), avg = mean(tripduration) / 60, std = sd(tripduration) / 60)

# find the 10 most frequent station-to-station trips
trips |> count(start_station_name, end_station_name, sort = TRUE) |> head(10)

# find the top 3 end stations for trips starting from each start station
trips |> group_by(start_station_name) |> count(end_station_name, sort = TRUE) |> slice_max(n, n = 3)

# find the top 3 most common station-to-station trips by gender
trips |> group_by(gender) |> count(start_station_name, end_station_name, sort = TRUE) |> slice_max(n, n = 3)

# find the day with the most trips
# tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package)
trips |> mutate(date = as.Date(starttime)) |> count(date, sort = TRUE) |> head(1)

# compute the average number of trips taken during each of the 24 hours of the day across the entire month
# what time(s) of day tend to be peak hour(s)?

trips |>
  mutate(date = as.Date(starttime), hour = hour(starttime)) |>
  count(date, hour) |>
  group_by(hour) |>
  summarize(avg = mean(n)) |>
  slice_max(avg, n = 1)

17 changes: 17 additions & 0 deletions week1/citibike.sh
@@ -5,19 +5,36 @@

# count the number of unique stations

cut -d, -f5 201402-citibike-tripdata.csv | tail -n +2 | sort -u | wc -l

# count the number of unique bikes

cut -d, -f12 201402-citibike-tripdata.csv | tail -n +2 | sort -u | wc -l

# count the number of trips per day

cut -d, -f2 201402-citibike-tripdata.csv | tail -n +2 | cut -d' ' -f1 | sort | uniq -c

# find the day with the most rides

cut -d, -f2 201402-citibike-tripdata.csv | tail -n +2 | cut -d' ' -f1 | sort | uniq -c | sort -rn | head -n1

# find the day with the fewest rides

cut -d, -f2 201402-citibike-tripdata.csv | tail -n +2 | cut -d' ' -f1 | sort | uniq -c | sort -n | head -n1
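The count-and-rank idiom used above (`sort | uniq -c | sort -n`) is the workhorse of these exercises. Here is a self-contained sketch of it on inline sample dates rather than the real CSV (the real pipeline swaps `printf` for `cut` on the data file):

```shell
# count occurrences of each date, then rank numerically and keep the most frequent
printf '2014-02-01\n2014-02-01\n2014-02-02\n2014-02-01\n2014-02-02\n2014-02-03\n' \
  | sort | uniq -c | sort -rn | head -n1
# prints "3 2014-02-01" (with leading padding from uniq -c)
```

Sorting with `-n` (numeric) matters because `uniq -c` left-pads its counts; a plain lexicographic `sort` can misorder them once counts have different widths.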

# find the id of the bike with the most rides

cut -d, -f12 201402-citibike-tripdata.csv | tail -n +2 | sort | uniq -c | sort -rn | head -n1

# count the number of rides by gender and birth year

cut -d, -f14,15 201402-citibike-tripdata.csv | tail -n +2 | sort | uniq -c

# count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...)

cut -d, -f5 201402-citibike-tripdata.csv | grep '[0-9].*&.*[0-9]' | wc -l
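The regex requires a digit somewhere before the `&` and a digit somewhere after it. A quick demo on hypothetical station names (not the real CSV) shows which cross streets qualify:

```shell
# count sample station names with a number on both sides of the '&'
printf '1 Ave & E 15 St\nE 39 St & 2 Ave\nBroadway & W 24 St\nLafayette St & E 8 St\n' \
  | grep -c '[0-9].*&.*[0-9]'
# prints 2: only the first two names have digits on both sides of the ampersand
```

`grep -c` counts matching lines directly, so it is equivalent to `grep … | wc -l` here.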

# compute the average trip duration

cut -d, -f1 201402-citibike-tripdata.csv | tail -n +2 | awk '{sum += $1} END {print sum/NR}'
127 changes: 125 additions & 2 deletions week1/plot_trips.R
@@ -10,53 +10,176 @@ library(scales)
theme_set(theme_bw())

# load RData file output by load_trips.R
load('trips.RData')
load("trips.RData")


########################################
# plot trip data
########################################

library(ggplot2)

# plot the distribution of trip times across all rides (compare a histogram vs. a density plot)
trips |>
ggplot(aes(x = tripduration)) +
geom_histogram(bins = 30) +
scale_x_log10()

trips |>
ggplot(aes(x = tripduration)) +
geom_density(fill = "grey") +
scale_x_log10()


# plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. a density plot)


trips |>
ggplot(aes(x = tripduration, color = usertype, fill = usertype)) +
geom_histogram(bins = 30) +
scale_x_log10()

trips |>
ggplot(aes(x = tripduration, color = usertype, fill = usertype)) +
geom_density() +
scale_x_log10()

# plot the total number of trips on each day in the dataset

trips |>
mutate(date = as.Date(starttime)) |>
ggplot(aes(x = date)) +
geom_histogram(bins = 365)


# plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color)

trips |>
mutate(age = 2014 - birth_year) |> # trips are from Feb 2014, so compute age as of then
group_by(gender, age) |>
summarize(count_trip_by_age = n()) |>
ggplot(aes(x = age, y = count_trip_by_age, color = gender, fill = gender)) +
geom_area(alpha = 0.8) +
scale_y_log10()


# plot the ratio of male to female trips (on the y axis) by age (on the x axis)
# hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio
# (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet)

head(trips)
trips |>
mutate(age = 2014 - birth_year) |> # trips are from Feb 2014, so compute age as of then
select(gender, age) |>
group_by(age, gender) |>
summarize(count_age_gender = n()) |>
pivot_wider(names_from = gender, values_from = count_age_gender) |>
mutate(male_female_ratio = Male / Female) |>
ggplot(aes(x = age, y = male_female_ratio)) +
geom_point() +
geom_smooth(se = FALSE)




########################################
# plot weather data
########################################
# plot the minimum temperature (on the y axis) over each day (on the x axis)

weather |>
ggplot(aes(x = date, y = tmin)) +
geom_point()

# plot the minimum temperature and maximum temperature (on the y axis, with different colors) over each day (on the x axis)
# hint: try using the pivot_longer() function for this to reshape things before plotting
# (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet)

weather |>
select(date, tmin, tmax) |>
pivot_longer(names_to = "min_max", values_to = "temperature", -date) |>
ggplot(aes(x = date, y = temperature, color = min_max, fill = min_max)) +
geom_point()


########################################
# plot trip and weather data
########################################

# join trips and weather
trips_with_weather <- inner_join(trips, weather, by="ymd")
trips_with_weather <- inner_join(trips, weather, by = "ymd")

# plot the number of trips as a function of the minimum temperature, where each point represents a day
# you'll need to summarize the trips and join to the weather data to do this

trips_with_weather |>
group_by(date, tmin) |>
summarize(count_trip = n()) |>
ggplot(aes(x = tmin, y = count_trip)) +
geom_point()


# repeat this, splitting results by whether there was substantial precipitation or not
# you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this

mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp)) |> pull(mean_prcp)

trips_with_weather |>
mutate(is_subs_prcp = prcp >= mean_prcp) |>
group_by(date, tmin, is_subs_prcp) |>
summarize(count_trip = n()) |>
ggplot(aes(x = tmin, y = count_trip, color = is_subs_prcp)) +
geom_point()



# add a smoothed fit on top of the previous plot, using geom_smooth

mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp)) |> pull(mean_prcp)

trips_with_weather |>
mutate(is_subs_prcp = prcp >= mean_prcp) |>
group_by(date, tmin, is_subs_prcp) |>
summarize(count_trip = n()) |>
ggplot(aes(x = tmin, y = count_trip, color = is_subs_prcp)) +
geom_point() +
geom_smooth(method = "lm")


# compute the average number of trips and standard deviation in number of trips by hour of the day
# hint: use the hour() function from the lubridate package

trips_with_weather |>
mutate(hour = hour(starttime), date = as.Date(starttime)) |>
group_by(date, hour) |>
summarize(count = n()) |>
group_by(hour) |>
summarize(avg = mean(count), std = sd(count))

# plot the above

trips_with_weather |>
mutate(hour = hour(starttime), date = as.Date(starttime)) |>
group_by(date, hour) |>
summarize(count = n()) |>
group_by(hour) |>
summarize(avg = mean(count), std = sd(count)) |>
pivot_longer(cols = c(avg, std), names_to = "stat", values_to = "value") |>
ggplot(aes(x = hour, y = value, color = stat, fill = stat)) +
geom_col(position = "dodge")

# repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. weekend days
# hint: use the wday() function from the lubridate package

trips_with_weather |>
mutate(hour = hour(starttime), date = as.Date(starttime), day = wday(starttime, label = TRUE)) |>
group_by(date, day, hour) |>
summarize(count = n()) |>
group_by(day, hour) |>
summarize(avg = mean(count), std = sd(count)) |>
pivot_longer(cols = c(avg, std), names_to = "stat", values_to = "value") |>
ggplot(aes(x = hour, y = value, color = stat, fill = stat)) +
geom_col(position = "dodge") +
facet_wrap(~ day)
44 changes: 43 additions & 1 deletion week2/diamond-sizes.Rmd
@@ -4,9 +4,16 @@ date: 2016-08-25
output: html_document
---

```{r, echo = FALSE}
knitr::opts_chunk$set(
echo = FALSE
)
```

```{r setup, include = FALSE}
library(ggplot2)
library(dplyr)
library(viridis)

smaller <- diamonds %>%
filter(carat <= 2.5)
@@ -17,8 +24,43 @@ We have data about `r nrow(diamonds)` diamonds. Only
2.5 carats. The distribution of the remainder is shown
below:

```{r, echo = FALSE}
```{r}
smaller %>%
ggplot(aes(carat)) +
geom_freqpoly(binwidth = 0.01)
```

The striking features include:

- Most diamonds (~25000) are approximately 0.25 carats
- The majority of diamonds are less than 1.25 carats

# Exercise 27.4.7
## Exercise 1

Add a section that explores how diamond sizes vary by cut, colour, and clarity. Assume you’re
writing a report for someone who doesn’t know R, and instead of setting echo = FALSE on each
chunk, set a global option.

```{r}

diamonds |>
ggplot(aes(x = carat, color = color, fill = color)) +
geom_histogram(bins = 30)

```

```{r}

diamonds |>
ggplot(aes(x = carat, color = cut, fill = cut)) +
geom_histogram(bins = 30) +
scale_fill_viridis_d(option = "mako", direction = -1)
```

```{r}

diamonds |>
ggplot(aes(x = carat, color = clarity, fill = clarity)) +
geom_histogram(bins = 30) +
scale_fill_viridis_d(option = "magma")
```
424 changes: 424 additions & 0 deletions week2/diamond-sizes.html

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions week2/statistical_inference.Rmd
@@ -55,6 +55,7 @@ ggplot(se_data, aes(x = n, y = se)) +
stat_function(fun=function(n) {sqrt(p * (1 - p) / n)}, linetype=2)
```


## Confidence intervals
```{r}
set.seed(42)
324 changes: 165 additions & 159 deletions week2/statistical_inference.html

Large diffs are not rendered by default.
