msr-ds3 · NaomiBeck · May 28, 2025 · May 28, 2025 · May 28, 2025 · May 28, 2025
diff --git a/week1/__MACOSX/2014-citibike-tripdata/._.DS_Store b/week1/__MACOSX/2014-citibike-tripdata/._.DS_Store
diff --git a/week1/citibike.R b/week1/citibike.R
@@ -23,26 +23,55 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn
 ########################################
 
 # count the number of trips (= rows in the data frame)
+summarize(trips, count=n())
 
 # find the earliest and latest birth years (see help for max and min to deal with NAs)
+trips <- read_csv('201402-citibike-tripdata.csv', na = c("", "\\N")) # clean up \N to NAs
+# can also typecast it to numeric where it will coerce the NAs
+
+summarize(trips, min_year = min(birth_year, na.rm = TRUE), max_year = max(birth_year, na.rm = TRUE))
+
 
 # use filter and grepl to find all trips that either start or end on broadway
+select(trips, start_station_name, end_station_name) |> 
+    filter(grepl('.*Broadway.*', start_station_name) | 
+    grepl('.*Broadway.*', end_station_name))
 
 # do the same, but find all trips that both start and end on broadway
+select(trips, start_station_name, end_station_name) |> 
+    filter(grepl('.*Broadway.*', start_station_name) &
+    grepl('.*Broadway.*', end_station_name))
 
 # find all unique station names
+trips |> distinct(start_station_name)
+# old school way
+unique(trips$start_station_name)   
 
 # count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender
 # do this all at once, by using summarize() with multiple arguments
+trips |> group_by(gender) |>  
+    summarize( count = n(), average_duration = mean(tripduration) /60, 
+    sd_duration = sd(tripduration)/60)
 
 # find the 10 most frequent station-to-station trips
+trips %>%  count(start_station_name, end_station_name, sort = TRUE) |> head(10)
+    |> select(start_station_name, end_station_name)
 
 # find the top 3 end stations for trips starting from each start station
+trips |> group_by(start_station_name) |> count(end_station_name, sort = TRUE) |> slice_max(n, n = 3)
+
 
 # find the top 3 most common station-to-station trips by gender
+trips |> group_by(gender) |> count(end_station_name, sort = TRUE) |> slice_max(n, n = 3)
 
 # find the day with the most trips
 # tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package)
+trips |> mutate(date = as.Date(starttime)) |> count(date, sort =TRUE) |> head(1) |> select(date)
 
 # compute the average number of trips taken during each of the 24 hours of the day across the entire month
 # what time(s) of day tend to be peak hour(s)?
+date_hour_counts <- trips |> mutate(hour = hour(starttime), date = as.Date(starttime)) |>  group_by(date, hour)  |> count(date, hour)
+avg_rides_per_hour <- date_hour_counts |> group_by(hour) |> summarize(avg = mean(n)) #--> avgerage rides per hour
+
+# peak hour --> 17th hour
+avg_rides_per_hour |> arrange(avg) |> tail(1)
diff --git a/week1/citibike.sh b/week1/citibike.sh
@@ -4,20 +4,28 @@
 #
 
 # count the number of unique stations
+cat 201402-citibike-tripdata.csv | cut -d,  -f5 | sort | uniq | wc -l
 
 # count the number of unique bikes
+cat 201402-citibike-tripdata.csv | cut -d,  -f5 | sort | uniq | wc -l
 
 # count the number of trips per day
+ cat 201402-citibike-tripdata.csv |cut -d,  -f2 | grep -o '.* ' | sort | uniq -c
 
 # find the day with the most rides
+cat 201402-citibike-tripdata.csv |cut -d,  -f2 | grep -o '^.* ' | sort | uniq -c | sort -nr | head -n1 | cut -d' ' -f4
 
 # find the day with the fewest rides
+cat 201402-citibike-tripdata.csv |cut -d,  -f2 | grep -o '^.* ' | sort | uniq -c | sort -bn | head -n1 | awk '{print $2} 
 
 # find the id of the bike with the most rides
+cat 201402-citibike-tripdata.csv |cut -d,  -f12 | sort | uniq -c | sort -r | head -n1 | awk '{print $2}'
 
 # count the number of rides by gender and birth year
+cat 201402-citibike-tripdata.csv | cut -d, -f 14,15 | sort | uniq | wc -l
 
 # count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...)
-
+cat 201402-citibike-tripdata.csv | cut -d, -f5 | grep '.*[0-9]+*.*&.*[0-9]+*' | wc -l
 
 # compute the average trip duration
+cat 201402-citibike-tripdata.csv | cut -d, -f1 | awk -F, '{sum += $1; count +=1} END {print sum/ count}'
diff --git a/week1/plot_trips.R b/week1/plot_trips.R
@@ -5,6 +5,7 @@
 # load some packages that we'll need
 library(tidyverse)
 library(scales)
+library(ggplot2)
 
 # be picky about white backgrounds on our plots
 theme_set(theme_bw())
@@ -18,25 +19,86 @@ load('trips.RData')
 ########################################
 
 # plot the distribution of trip times across all rides (compare a histogram vs. a density plot)
+ggplot(trips, aes(x = tripduration)) +
+    geom_histogram() +
+    scale_x_log10(label = comma) +
+    xlab('Trip Duration') +
+    ylab('Frequency')
+
+
+ggplot(trips, aes(x = tripduration)) +
+    geom_density(fill="grey") +
+    scale_x_log10(label = comma) +
+    xlab('Trip Duration') +
+    ylab('Frequency')
 
 # plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. a density plot)
+ggplot(trips) + 
+    geom_histogram(aes(x = tripduration, color = usertype, fill=usertype)) +
+    scale_x_log10(label = comma) +
+    xlab('Trip Duration') +
+    ylab('Frequency')
+
+ggplot(trips) + 
+    geom_density(aes(x = tripduration, color = usertype, fill=usertype)) +
+    scale_x_log10(label = comma) +
+    xlab('Trip Duration') +
+    ylab('Frequency')
+
 
 # plot the total number of trips on each day in the dataset
+trips |> mutate(Date = as.Date(starttime)) |>
+    ggplot() + 
+    geom_histogram(aes(x = Date)) 
+
 
 # plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color)
+trips |> mutate(age = 2025-birth_year) |>
+    ggplot() + 
+    geom_histogram(aes(x = age, color = gender, fill=gender)) +
+    scale_x_log10(label = comma)
 
 # plot the ratio of male to female trips (on the y axis) by age (on the x axis)
 # hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio
 # (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet)
+# Example: Summarize, reshape, and plot ratio by age
+
+# Thank you octopilot
+trips |> 
+    mutate(age = 2025 - birth_year) |> 
+    group_by(age, gender) |> 
+    summarize(n = n(), .groups = "drop") |> 
+    tidyr::pivot_wider(names_from = gender, values_from = n, values_fill = 0) |> 
+    mutate(ratio = Male / Female) |> 
+    ggplot(aes(x = age, y = ratio)) +
+    geom_line() +
+    labs(x = "Age", y = "Male to Female Trip Ratio")
+
 
 ########################################
 # plot weather data
 ########################################
 # plot the minimum temperature (on the y axis) over each day (on the x axis)
+ggplot(weather, aes(x = date, y = tmin)) +
+    geom_point() +
+    xlab('Day') +
+    ylab('Minimum Temperature')
 
 # plot the minimum temperature and maximum temperature (on the y axis, with different colors) over each day (on the x axis)
 # hint: try using the pivot_longer() function for this to reshape things before plotting
 # (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet)
+# Reshape with pivot_longer
+weather_long <- weather %>%
+  pivot_longer(
+    cols = c(tmin, tmax),
+    names_to = "temp_type",
+    values_to = "temperature"
+  )
+
+  View(weather_long )
+
+  ggplot(weather_long, aes(x = date, y = temperature, color = temp_type)) +
+  geom_point()
 
 ########################################
 # plot trip and weather data
@@ -47,16 +109,66 @@ trips_with_weather <- inner_join(trips, weather, by="ymd")
 
 # plot the number of trips as a function of the minimum temperature, where each point represents a day
 # you'll need to summarize the trips and join to the weather data to do this
+num_trips <- trips |> mutate(ymd = as.Date(starttime)) |> group_by(ymd) |> summarize(count=n())
+num_trips_weather<- inner_join(num_trips, weather, by="ymd")
+
+ggplot(num_trips_weather, aes(x=tmin, y=count)) +
+  geom_point() +
+  xlab('Minimum Temperature') +
+  ylab('Number of Trips')
+
 
 # repeat this, splitting results by whether there was substantial precipitation or not
 # you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this
+mutate(num_trips_weather, significant_prcp=ifelse(prcp >= .05, T,F)) |> 
+ggplot(aes(x=tmin, y=count)) +
+  geom_point() +
+  facet_wrap(~ significant_prcp) +
+  xlab('Minimum Temperature') +
+  ylab('Number of Trips')
 
 # add a smoothed fit on top of the previous plot, using geom_smooth
+mutate(num_trips_weather, significant_prcp=ifelse(prcp >= .05, T,F)) |> 
+ggplot(aes(x=tmin, y=count)) +
+  geom_point() +
+  facet_wrap(~ significant_prcp) +
+  geom_smooth() +
+  xlab('Minimum Temperature') +
+  ylab('Number of Trips')
+
 
 # compute the average number of trips and standard deviation in number of trips by hour of the day
 # hint: use the hour() function from the lubridate package
+date_hour_counts    <- trips |> mutate(hour = hour(starttime), date = as.Date(starttime)) |>  group_by(date, hour)  |> count(date, hour)
+count_mean_sd       <- date_hour_counts |> group_by(hour) |> summarize(avg = mean(n), std=sd(n)) 
+count_mean_sd
 
 # plot the above
+long_count_mean_sd  <- count_mean_sd  %>%
+  pivot_longer(
+    cols = c(avg, std),
+    names_to = "stat_type",
+    values_to = "stat"
+  )
+
+  ggplot(long_count_mean_sd, aes(x =hour, y = stat, color = stat_type)) +
+  geom_point()
 
 # repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. weekend days
 # hint: use the wday() function from the lubridate package
+date_hour_counts        <- trips |> mutate(hour = hour(starttime), date = as.Date(starttime)) |>  group_by(date, hour)  |> count(date, hour)
+DAY_date_hour_counts    <- date_hour_counts |> mutate(day=wday(date)) 
+View(DAY_date_hour_counts)
+DAY_count_mean_sd       <- DAY_date_hour_counts |> group_by(day) |> summarize(avg = mean(n), std=sd(n)) 
+View(DAY_count_mean_sd)
+
+# plot the above
+DAY_long_count_mean_sd  <- DAY_count_mean_sd  %>%
+  pivot_longer(
+    cols = c(avg, std),
+    names_to = "stat_type",
+    values_to = "stat"
+  )
+
+  ggplot(DAY_long_count_mean_sd, aes(x =day, y = stat, color = stat_type)) +
+  geom_point()
diff --git a/week4/predict_citibike.RData b/week4/predict_citibike.RData
diff --git a/week4/test_citibike_predictions.Rmd b/week4/test_citibike_predictions.Rmd
@@ -0,0 +1,68 @@
+# loads in the new trips_per_day_2015.csv and weather_2015.csv files along with your saved model (from yesterday's .Rdata file), and predicts the number of trips for each day. If you used any other data for your model, make sure to include code that downloads and incorporates that data as well.
+
+
+```{r setup, include=FALSE}
+library(dplyr)
+library(readr)
+```
+
+```{r load_data, include=FALSE}
+setwd('./week4')
+trips_per_day <- read_tsv('trips_per_day_2015.tsv')
+head(trips_per_day)
+
+weather_2015 <- read_csv('./weather_2015.csv') 
+#weather$DATE <- ymd(weather$DATE)
+View(head(weather_2015))
+
+trips_per_day <-
+    inner_join(trips_per_day, weather_2015, by=c("ymd" = "DATE"))
+
+names(trips_per_day) <- tolower(names(trips_per_day))
+head(trips_per_day)
+trips_per_day <-
+    trips_per_day %>%
+
+    # devide tmax and tmin by 10
+    mutate(tmax=tmax/10, tmin=tmin/10) %>%
+
+     # mutate the data to add weekday and season
+    mutate(
+    weekday = wday(ymd, label = TRUE),
+    # Season based on month
+    season = case_when(
+      month(ymd) %in% 3:5 ~ "Spring",
+      month(ymd) %in% 6:8 ~ "Summer",
+      month(ymd) %in% 9:11 ~ "Fall",
+      month(ymd) %in% c(12, 1, 2) ~ "Winter"
+    ))
+
+
+View(head(trips_per_day))
+
+```
+
+
+```{r load_model}
+my_model <- load("./predict_citibike.RData")
+my_model <- get(my_model)
+validate_test_prediction <- predict(my_model, trips_per_day)
+
+# Calculate RMSE
+rmse <- sqrt(mean((validate_test_prediction - trips_per_day$num_trips)^2))
+summary(my_model)
+
+rmse
+```
+
+10 percent testing data from 2014:
+rmse
+[1] 3968.397
+r_squared
+[1] 0.8235913
+
+
+2015 test data:
+rmse: 8039.796
+r_squared: 0.8873
+