Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
a84e19f
Update citibike.sh
NaomiBeck May 28, 2025
d9cc8c7
done with assignment2
NaomiBeck May 28, 2025
a602a44
assignemnt2
NaomiBeck May 28, 2025
99c2707
clean up
NaomiBeck May 28, 2025
44c9a52
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck May 28, 2025
3e43f32
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck May 29, 2025
9136b89
assignment3
NaomiBeck May 29, 2025
92ef391
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck May 30, 2025
498149d
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck May 30, 2025
756c976
assignment4
NaomiBeck May 30, 2025
e2ffb9e
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck Jun 4, 2025
f6558a0
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck Jun 5, 2025
b460548
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck Jun 6, 2025
b3fe344
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck Jun 9, 2025
f6d3e54
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck Jun 10, 2025
766742e
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck Jun 10, 2025
98c8c40
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck Jun 11, 2025
bb0514a
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck Jun 13, 2025
a78d888
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck Jun 16, 2025
64d9054
citi_bike_best_model
NaomiBeck Jun 16, 2025
271d279
Merge branch 'master' of https://github.com/msr-ds3/coursework
NaomiBeck Jun 17, 2025
009df71
2015 citibike testing
NaomiBeck Jun 17, 2025
f1ce2ab
2015_citibike_final
NaomiBeck Jun 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added week1/__MACOSX/2014-citibike-tripdata/._.DS_Store
Binary file not shown.
29 changes: 29 additions & 0 deletions week1/citibike.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,55 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn
########################################

# count the number of trips (= rows in the data frame)
summarize(trips, count=n())

# find the earliest and latest birth years (see help for max and min to deal with NAs)
trips <- read_csv('201402-citibike-tripdata.csv', na = c("", "\\N")) # clean up \N to NAs
# can also typecast it to numeric where it will coerce the NAs

summarize(trips, min_year = min(birth_year, na.rm = TRUE), max_year = max(birth_year, na.rm = TRUE))


# use filter and grepl to find all trips that either start or end on broadway
select(trips, start_station_name, end_station_name) |>
filter(grepl('.*Broadway.*', start_station_name) |
grepl('.*Broadway.*', end_station_name))

# do the same, but find all trips that both start and end on broadway
select(trips, start_station_name, end_station_name) |>
filter(grepl('.*Broadway.*', start_station_name) &
grepl('.*Broadway.*', end_station_name))

# find all unique station names
trips |> distinct(start_station_name)
# old school way
unique(trips$start_station_name)

# count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender
# do this all at once, by using summarize() with multiple arguments
trips |> group_by(gender) |>
summarize( count = n(), average_duration = mean(tripduration) /60,
sd_duration = sd(tripduration)/60)

# find the 10 most frequent station-to-station trips
trips %>% count(start_station_name, end_station_name, sort = TRUE) |> head(10)
|> select(start_station_name, end_station_name)

# find the top 3 end stations for trips starting from each start station
trips |> group_by(start_station_name) |> count(end_station_name, sort = TRUE) |> slice_max(n, n = 3)


# find the top 3 most common station-to-station trips by gender
trips |> group_by(gender) |> count(end_station_name, sort = TRUE) |> slice_max(n, n = 3)

# find the day with the most trips
# tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package)
trips |> mutate(date = as.Date(starttime)) |> count(date, sort =TRUE) |> head(1) |> select(date)

# compute the average number of trips taken during each of the 24 hours of the day across the entire month
# what time(s) of day tend to be peak hour(s)?
date_hour_counts <- trips |> mutate(hour = hour(starttime), date = as.Date(starttime)) |> group_by(date, hour) |> count(date, hour)
avg_rides_per_hour <- date_hour_counts |> group_by(hour) |> summarize(avg = mean(n)) #--> avgerage rides per hour

# peak hour --> 17th hour
avg_rides_per_hour |> arrange(avg) |> tail(1)
10 changes: 9 additions & 1 deletion week1/citibike.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,28 @@
#

# count the number of unique stations
cat 201402-citibike-tripdata.csv | cut -d, -f5 | sort | uniq | wc -l

# count the number of unique bikes
cat 201402-citibike-tripdata.csv | cut -d, -f5 | sort | uniq | wc -l

# count the number of trips per day
cat 201402-citibike-tripdata.csv |cut -d, -f2 | grep -o '.* ' | sort | uniq -c

# find the day with the most rides
cat 201402-citibike-tripdata.csv |cut -d, -f2 | grep -o '^.* ' | sort | uniq -c | sort -nr | head -n1 | cut -d' ' -f4

# find the day with the fewest rides
cat 201402-citibike-tripdata.csv |cut -d, -f2 | grep -o '^.* ' | sort | uniq -c | sort -bn | head -n1 | awk '{print $2}

# find the id of the bike with the most rides
cat 201402-citibike-tripdata.csv |cut -d, -f12 | sort | uniq -c | sort -r | head -n1 | awk '{print $2}'

# count the number of rides by gender and birth year
cat 201402-citibike-tripdata.csv | cut -d, -f 14,15 | sort | uniq | wc -l

# count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...)

cat 201402-citibike-tripdata.csv | cut -d, -f5 | grep '.*[0-9]+*.*&.*[0-9]+*' | wc -l

# compute the average trip duration
cat 201402-citibike-tripdata.csv | cut -d, -f1 | awk -F, '{sum += $1; count +=1} END {print sum/ count}'
112 changes: 112 additions & 0 deletions week1/plot_trips.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# load some packages that we'll need
library(tidyverse)
library(scales)
library(ggplot2)

# be picky about white backgrounds on our plots
theme_set(theme_bw())
Expand All @@ -18,25 +19,86 @@ load('trips.RData')
########################################

# plot the distribution of trip times across all rides (compare a histogram vs. a density plot)
ggplot(trips, aes(x = tripduration)) +
geom_histogram() +
scale_x_log10(label = comma) +
xlab('Trip Duration') +
ylab('Frequency')


ggplot(trips, aes(x = tripduration)) +
geom_density(fill="grey") +
scale_x_log10(label = comma) +
xlab('Trip Duration') +
ylab('Frequency')

# plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. a density plot)
ggplot(trips) +
geom_histogram(aes(x = tripduration, color = usertype, fill=usertype)) +
scale_x_log10(label = comma) +
xlab('Trip Duration') +
ylab('Frequency')

ggplot(trips) +
geom_density(aes(x = tripduration, color = usertype, fill=usertype)) +
scale_x_log10(label = comma) +
xlab('Trip Duration') +
ylab('Frequency')


# plot the total number of trips on each day in the dataset
trips |> mutate(Date = as.Date(starttime)) |>
ggplot() +
geom_histogram(aes(x = Date))


# plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color)
trips |> mutate(age = 2025-birth_year) |>
ggplot() +
geom_histogram(aes(x = age, color = gender, fill=gender)) +
scale_x_log10(label = comma)

# plot the ratio of male to female trips (on the y axis) by age (on the x axis)
# hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio
# (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet)
# Example: Summarize, reshape, and plot ratio by age

# Thank you octopilot
trips |>
mutate(age = 2025 - birth_year) |>
group_by(age, gender) |>
summarize(n = n(), .groups = "drop") |>
tidyr::pivot_wider(names_from = gender, values_from = n, values_fill = 0) |>
mutate(ratio = Male / Female) |>
ggplot(aes(x = age, y = ratio)) +
geom_line() +
labs(x = "Age", y = "Male to Female Trip Ratio")


########################################
# plot weather data
########################################
# plot the minimum temperature (on the y axis) over each day (on the x axis)
ggplot(weather, aes(x = date, y = tmin)) +
geom_point() +
xlab('Day') +
ylab('Minimum Temperature')

# plot the minimum temperature and maximum temperature (on the y axis, with different colors) over each day (on the x axis)
# hint: try using the pivot_longer() function for this to reshape things before plotting
# (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet)
# Reshape with pivot_longer
weather_long <- weather %>%
pivot_longer(
cols = c(tmin, tmax),
names_to = "temp_type",
values_to = "temperature"
)

View(weather_long )

ggplot(weather_long, aes(x = date, y = temperature, color = temp_type)) +
geom_point()

########################################
# plot trip and weather data
Expand All @@ -47,16 +109,66 @@ trips_with_weather <- inner_join(trips, weather, by="ymd")

# plot the number of trips as a function of the minimum temperature, where each point represents a day
# you'll need to summarize the trips and join to the weather data to do this
num_trips <- trips |> mutate(ymd = as.Date(starttime)) |> group_by(ymd) |> summarize(count=n())
num_trips_weather<- inner_join(num_trips, weather, by="ymd")

ggplot(num_trips_weather, aes(x=tmin, y=count)) +
geom_point() +
xlab('Minimum Temperature') +
ylab('Number of Trips')


# repeat this, splitting results by whether there was substantial precipitation or not
# you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this
mutate(num_trips_weather, significant_prcp=ifelse(prcp >= .05, T,F)) |>
ggplot(aes(x=tmin, y=count)) +
geom_point() +
facet_wrap(~ significant_prcp) +
xlab('Minimum Temperature') +
ylab('Number of Trips')

# add a smoothed fit on top of the previous plot, using geom_smooth
mutate(num_trips_weather, significant_prcp=ifelse(prcp >= .05, T,F)) |>
ggplot(aes(x=tmin, y=count)) +
geom_point() +
facet_wrap(~ significant_prcp) +
geom_smooth() +
xlab('Minimum Temperature') +
ylab('Number of Trips')


# compute the average number of trips and standard deviation in number of trips by hour of the day
# hint: use the hour() function from the lubridate package
date_hour_counts <- trips |> mutate(hour = hour(starttime), date = as.Date(starttime)) |> group_by(date, hour) |> count(date, hour)
count_mean_sd <- date_hour_counts |> group_by(hour) |> summarize(avg = mean(n), std=sd(n))
count_mean_sd

# plot the above
long_count_mean_sd <- count_mean_sd %>%
pivot_longer(
cols = c(avg, std),
names_to = "stat_type",
values_to = "stat"
)

ggplot(long_count_mean_sd, aes(x =hour, y = stat, color = stat_type)) +
geom_point()

# repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. weekend days
# hint: use the wday() function from the lubridate package
date_hour_counts <- trips |> mutate(hour = hour(starttime), date = as.Date(starttime)) |> group_by(date, hour) |> count(date, hour)
DAY_date_hour_counts <- date_hour_counts |> mutate(day=wday(date))
View(DAY_date_hour_counts)
DAY_count_mean_sd <- DAY_date_hour_counts |> group_by(day) |> summarize(avg = mean(n), std=sd(n))
View(DAY_count_mean_sd)

# plot the above
DAY_long_count_mean_sd <- DAY_count_mean_sd %>%
pivot_longer(
cols = c(avg, std),
names_to = "stat_type",
values_to = "stat"
)

ggplot(DAY_long_count_mean_sd, aes(x =day, y = stat, color = stat_type)) +
geom_point()
Binary file added week4/predict_citibike.RData
Binary file not shown.
68 changes: 68 additions & 0 deletions week4/test_citibike_predictions.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# loads in the new trips_per_day_2015.csv and weather_2015.csv files along with your saved model (from yesterday's .Rdata file), and predicts the number of trips for each day. If you used any other data for your model, make sure to include code that downloads and incorporates that data as well.


```{r setup, include=FALSE}
library(dplyr)
library(readr)
```

```{r load_data, include=FALSE}
setwd('./week4')
trips_per_day <- read_tsv('trips_per_day_2015.tsv')
head(trips_per_day)

weather_2015 <- read_csv('./weather_2015.csv')
#weather$DATE <- ymd(weather$DATE)
View(head(weather_2015))

trips_per_day <-
inner_join(trips_per_day, weather_2015, by=c("ymd" = "DATE"))

names(trips_per_day) <- tolower(names(trips_per_day))
head(trips_per_day)
trips_per_day <-
trips_per_day %>%

# devide tmax and tmin by 10
mutate(tmax=tmax/10, tmin=tmin/10) %>%

# mutate the data to add weekday and season
mutate(
weekday = wday(ymd, label = TRUE),
# Season based on month
season = case_when(
month(ymd) %in% 3:5 ~ "Spring",
month(ymd) %in% 6:8 ~ "Summer",
month(ymd) %in% 9:11 ~ "Fall",
month(ymd) %in% c(12, 1, 2) ~ "Winter"
))


View(head(trips_per_day))

```


```{r load_model}
my_model <- load("./predict_citibike.RData")
my_model <- get(my_model)
validate_test_prediction <- predict(my_model, trips_per_day)

# Calculate RMSE
rmse <- sqrt(mean((validate_test_prediction - trips_per_day$num_trips)^2))
summary(my_model)

rmse
```

10 percent testing data from 2014:
rmse
[1] 3968.397
r_squared
[1] 0.8235913


2015 test data:
rmse: 8039.796
r_squared: 0.8873