From c8cd96be1af9e1a0ad9bf5ac8a4ef9bf51efaa2c Mon Sep 17 00:00:00 2001
From: aisha1021
Date: Wed, 28 May 2025 14:30:42 -0400
Subject: [PATCH 01/26] day 2 assignment
---
.../2014-citibike-tripdata/._.DS_Store | Bin 0 -> 120 bytes
week1/citibike.sh | 17 +++++++++++++++++
2 files changed, 17 insertions(+)
create mode 100644 week1/__MACOSX/2014-citibike-tripdata/._.DS_Store
diff --git a/week1/__MACOSX/2014-citibike-tripdata/._.DS_Store b/week1/__MACOSX/2014-citibike-tripdata/._.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..a5b28df1cbc6e15bd0d35cdadd0c2e65d5131c7d
GIT binary patch
literal 120
zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDI}u^SMB_!U6R08`;00ODZ-jv*mIP;rnB
Iur73U08|YJ=l}o!
literal 0
HcmV?d00001
diff --git a/week1/citibike.sh b/week1/citibike.sh
index 25604f545..f56507913 100755
--- a/week1/citibike.sh
+++ b/week1/citibike.sh
@@ -5,19 +5,36 @@
# count the number of unique stations
+cut -d, -f5 201402-citibike-tripdata.csv | sort | uniq -c | wc -l
+
# count the number of unique bikes
+cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | wc -l
+
# count the number of trips per day
+cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c
+
# find the day with the most rides
+cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort -r | head -n1
+
# find the day with the fewest rides
+cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | head -n2 | tail -n1
+
# find the id of the bike with the most rides
+cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | sort -r | head -n1
+
# count the number of rides by gender and birth year
+cut -d, -f14,15 201402-citibike-tripdata.csv | sort | uniq -c
+
# count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...)
+cut -d, -f5 201402-citibike-tripdata.csv | grep '.*[0-9].*&.*[0-9].*' | wc -l
# compute the average trip duration
+
+cut -d, -f1 201402-citibike-tripdata.csv | tail -n +2 | awk '{sum += $1; avg = sum/NR} END {print avg}'
\ No newline at end of file
From e993c3b575c5c270d5b991725f35173f6131137d Mon Sep 17 00:00:00 2001
From: aisha1021
Date: Thu, 29 May 2025 16:06:03 -0400
Subject: [PATCH 02/26] day 3 R assignment
---
week1/citibike.R | 20 +++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)
diff --git a/week1/citibike.R b/week1/citibike.R
index ad01de1d3..cfc601324 100644
--- a/week1/citibike.R
+++ b/week1/citibike.R
@@ -6,7 +6,8 @@ library(lubridate)
########################################
# read one month of data
-trips <- read_csv('201402-citibike-tripdata.csv')
+trips <- read_csv('201402-citibike-tripdata.csv', na = c("", "\\N"))
+
# replace spaces in column names with underscores
names(trips) <- gsub(' ', '_', names(trips))
@@ -24,25 +25,42 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn
# count the number of trips (= rows in the data frame)
+summarize(trips, count=n())
+
# find the earliest and latest birth years (see help for max and min to deal with NAs)
+summarize(trips, min_year = min(birth_year, na.rm = TRUE), max_year = max(birth_year, na.rm = TRUE))
# use filter and grepl to find all trips that either start or end on broadway
+select(trips, start_station_name, end_station_name) |> filter(grepl(".*Broadway.*", start_station_name) | grepl(".*Broadway.*", end_station_name))
+
# do the same, but find all trips that both start and end on broadway
+select(trips, start_station_name, end_station_name) |> filter(grepl(".*Broadway.*", start_station_name) & grepl(".*Broadway.*", end_station_name))
+
# find all unique station names
+trips |> distinct(start_station_name)
+
# count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender
# do this all at once, by using summarize() with multiple arguments
+trips |> group_by(gender) |> summarize(count = n(), avg = mean(tripduration) / 60, std = sd(tripduration) / 60)
# find the 10 most frequent station-to-station trips
+trips |> select(start_station_name, end_station_name) |> count(start_station_name, end_station_name, sort = TRUE) |> head(10)
# find the top 3 end stations for trips starting from each start station
+trips |> group_by(start_station_name) |> count(end_station_name, sort = TRUE) |> slice_max(n, n = 3)
# find the top 3 most common station-to-station trips by gender
+trips |> group_by(gender) |> count(end_station_name, sort = TRUE) |> slice_max(n, n = 3)
# find the day with the most trips
# tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package)
+trips |> mutate(date = as.Date(starttime)) |> select(date) |> count(date, sort = TRUE) |> head(1)
# compute the average number of trips taken during each of the 24 hours of the day across the entire month
# what time(s) of day tend to be peak hour(s)?
+
+date_hour_counts <- trips |> mutate(hour = hour(starttime)) |> mutate(date = as.Date(starttime)) |> select(date, hour) |> group_by(hour) |> count(date, hour)
+date_hour_counts |> group_by(hour) |> summarize(mean(n))
From 4f4536c4e1d1300977e6418a5c0be0967451dd92 Mon Sep 17 00:00:00 2001
From: aisha1021
Date: Thu, 29 May 2025 16:53:23 -0400
Subject: [PATCH 03/26] day 3 R assignment - Aisha Malik
---
week1/citibike.R | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/week1/citibike.R b/week1/citibike.R
index cfc601324..b0925f27b 100644
--- a/week1/citibike.R
+++ b/week1/citibike.R
@@ -63,4 +63,5 @@ trips |> mutate(date = as.Date(starttime)) |> select(date) |> count(date, sort =
# what time(s) of day tend to be peak hour(s)?
date_hour_counts <- trips |> mutate(hour = hour(starttime)) |> mutate(date = as.Date(starttime)) |> select(date, hour) |> group_by(hour) |> count(date, hour)
-date_hour_counts |> group_by(hour) |> summarize(mean(n))
+date_hour_counts |> group_by(hour) |> summarize(avg = mean(n)) |> arrange(avg) |> tail(1)
+
From 812460bd020704599875c7c349e5ce727ffc29bf Mon Sep 17 00:00:00 2001
From: aisha1021
Date: Fri, 30 May 2025 16:46:20 -0400
Subject: [PATCH 04/26] completed day 4 plotting assignment - Aisha Malik
---
week1/plot_trips.R | 106 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 106 insertions(+)
diff --git a/week1/plot_trips.R b/week1/plot_trips.R
index 4f25437ba..67fc513ee 100644
--- a/week1/plot_trips.R
+++ b/week1/plot_trips.R
@@ -17,23 +17,68 @@ load('trips.RData')
# plot trip data
########################################
+library(ggplot2)
+
# plot the distribution of trip times across all rides (compare a histogram vs. a density plot)
+trips |>
+ ggplot(aes(x = tripduration)) +
+ geom_histogram(bins=30) +
+ scale_x_log10()
+
+trips |>
+ ggplot(aes(x = tripduration)) +
+ geom_density(fill = "grey") +
+ scale_x_log10()
+
# plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. a density plot)
+
+trips |>
+ ggplot(aes(x = tripduration, color = usertype, fill = usertype)) +
+ geom_histogram(bins=30) +
+ scale_x_log10()
+
+trips |>
+ ggplot(aes(x = tripduration, color = usertype, fill = usertype)) +
+ geom_density() +
+ scale_x_log10()
+
# plot the total number of trips on each day in the dataset
+trips |>
+ mutate(date = as.year(starttime)) |>
+ ggplot(aes(x = date)) +
+ geom_histogram(bins=365)
+
+
# plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color)
+trips |>
+ mutate(age = year(Sys.Date()) - birth_year) |>
+ group_by(gender, age) |>
+ summarize(count_trip_by_age = n()) |>
+ ggplot(aes(x = age, y = count_trip_by_age, color=gender, fill=gender)) +
+ geom_area(alpha=0.8) +
+ scale_y_log10()
+
+
# plot the ratio of male to female trips (on the y axis) by age (on the x axis)
# hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio
# (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet)
+head(trips)
+trips |> select(gender)
+
########################################
# plot weather data
########################################
# plot the minimum temperature (on the y axis) over each day (on the x axis)
+weather |>
+ ggplot(aes(x = date, y = tmin)) +
+ geom_point()
+
# plot the minimum temperature and maximum temperature (on the y axis, with different colors) over each day (on the x axis)
# hint: try using the pivot_longer() function for this to reshape things before plotting
# (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet)
@@ -48,15 +93,76 @@ trips_with_weather <- inner_join(trips, weather, by="ymd")
# plot the number of trips as a function of the minimum temperature, where each point represents a day
# you'll need to summarize the trips and join to the weather data to do this
+trips_with_weather |>
+ group_by(date, tmin) |>
+ summarize(count_trip = n()) |>
+ ggplot(aes(x = tmin, y = count_trip)) +
+ geom_point()
+
+
# repeat this, splitting results by whether there was substantial precipitation or not
# you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this
+mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp))
+mean_prcp <- mean_prcp$mean_prcp
+
+trips_with_weather |>
+ mutate(is_subs_prcp = prcp >= mean_prcp) |>
+ group_by(date, tmin, is_subs_prcp) |>
+ summarize(count_trip = n()) |>
+ ggplot(aes(x = tmin, y = count_trip, color = is_subs_prcp)) +
+ geom_point()
+
+
+
+
+
# add a smoothed fit on top of the previous plot, using geom_smooth
+mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp))
+mean_prcp <- mean_prcp$mean_prcp
+
+trips_with_weather |>
+ mutate(is_subs_prcp = prcp >= mean_prcp) |>
+ group_by(date, tmin, is_subs_prcp) |>
+ summarize(count_trip = n()) |>
+ ggplot(aes(x = tmin, y = count_trip, color = is_subs_prcp)) +
+ geom_point() +
+ geom_smooth(method = 'lm')
+
+
# compute the average number of trips and standard deviation in number of trips by hour of the day
# hint: use the hour() function from the lubridate package
+trips_with_weather |>
+ mutate(hour = hour(starttime), date = as.Date(starttime)) |>
+ group_by(date, hour) |>
+ summarize(count = n()) |>
+ group_by(hour) |>
+ summarize(avg = mean(count), std = sd(count))
+
# plot the above
+trips_with_weather |>
+ mutate(hour = hour(starttime), date = as.Date(starttime)) |>
+ group_by(date, hour) |>
+ summarize(count = n()) |>
+ group_by(hour) |>
+ summarize(avg = mean(count), std = sd(count)) |>
+ pivot_longer(cols = c(avg, std), names_to = "stat", values_to = "value") |>
+ ggplot(aes(x = hour, y = value, color = stat, fill = stat)) +
+ geom_col(position = "dodge")
+
# repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. weekend days
# hint: use the wday() function from the lubridate package
+
+trips_with_weather |>
+ mutate(hour = hour(starttime), date = as.Date(starttime), day = wday(starttime, label = TRUE)) |>
+ group_by(date, day, hour) |>
+ summarize(count = n()) |>
+ group_by(day, hour) |>
+ summarize(avg = mean(count), std = sd(count)) |>
+ pivot_longer(cols = c(avg, std), names_to = "stat", values_to = "value") |>
+ ggplot(aes(x = hour, y = value, color = stat, fill = stat)) +
+ geom_col(position = "dodge") +
+ facet_wrap(~ day)
From 53811d3b52dd5d3615abc2ff1f7f26da13df577b Mon Sep 17 00:00:00 2001
From: aisha1021
Date: Mon, 2 Jun 2025 13:30:30 -0400
Subject: [PATCH 05/26] completed plotting assignment - Aisha Malik
---
week1/plot_trips.R | 45 +++++++++++++++++++++++++++++++--------------
1 file changed, 31 insertions(+), 14 deletions(-)
diff --git a/week1/plot_trips.R b/week1/plot_trips.R
index 67fc513ee..d2c8185b4 100644
--- a/week1/plot_trips.R
+++ b/week1/plot_trips.R
@@ -10,7 +10,7 @@ library(scales)
theme_set(theme_bw())
# load RData file output by load_trips.R
-load('trips.RData')
+load("trips.RData")
########################################
@@ -22,7 +22,7 @@ library(ggplot2)
# plot the distribution of trip times across all rides (compare a histogram vs. a density plot)
trips |>
ggplot(aes(x = tripduration)) +
- geom_histogram(bins=30) +
+ geom_histogram(bins = 30) +
scale_x_log10()
trips |>
@@ -36,7 +36,7 @@ trips |>
trips |>
ggplot(aes(x = tripduration, color = usertype, fill = usertype)) +
- geom_histogram(bins=30) +
+ geom_histogram(bins = 30) +
scale_x_log10()
trips |>
@@ -49,7 +49,7 @@ trips |>
trips |>
mutate(date = as.year(starttime)) |>
ggplot(aes(x = date)) +
- geom_histogram(bins=365)
+ geom_histogram(bins = 365)
# plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color)
@@ -58,8 +58,8 @@ trips |>
mutate(age = year(Sys.Date()) - birth_year) |>
group_by(gender, age) |>
summarize(count_trip_by_age = n()) |>
- ggplot(aes(x = age, y = count_trip_by_age, color=gender, fill=gender)) +
- geom_area(alpha=0.8) +
+ ggplot(aes(x = age, y = count_trip_by_age, color = gender, fill = gender)) +
+ geom_area(alpha = 0.8) +
scale_y_log10()
@@ -68,7 +68,19 @@ trips |>
# (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet)
head(trips)
-trips |> select(gender)
+trips |>
+ mutate(age = year(Sys.Date()) - birth_year) |>
+ select(gender, age) |>
+ group_by(age, gender) |>
+ summarize(coumt_age_gender = n()) |>
+ pivot_wider(names_from = gender, values_from = coumt_age_gender) |>
+ mutate(male_female_ratio = Male/Female) |>
+ ggplot(aes(x = age, y = male_female_ratio)) +
+ geom_point() +
+ geom_smooth(se = FALSE)
+
+
+
########################################
# plot weather data
@@ -83,12 +95,19 @@ weather |>
# hint: try using the pivot_longer() function for this to reshape things before plotting
# (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet)
+weather |>
+ select(date, tmin, tmax) |>
+ pivot_longer(names_to = "min_max", values_to = "temperature", -date) |>
+ ggplot(aes(x = date, y = temperature, color = min_max, fill = min_max)) +
+ geom_point()
+
+
########################################
# plot trip and weather data
########################################
# join trips and weather
-trips_with_weather <- inner_join(trips, weather, by="ymd")
+trips_with_weather <- inner_join(trips, weather, by = "ymd")
# plot the number of trips as a function of the minimum temperature, where each point represents a day
# you'll need to summarize the trips and join to the weather data to do this
@@ -106,7 +125,7 @@ trips_with_weather |>
mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp))
mean_prcp <- mean_prcp$mean_prcp
-trips_with_weather |>
+trips_with_weather |>
mutate(is_subs_prcp = prcp >= mean_prcp) |>
group_by(date, tmin, is_subs_prcp) |>
summarize(count_trip = n()) |>
@@ -115,26 +134,24 @@ trips_with_weather |>
-
-
# add a smoothed fit on top of the previous plot, using geom_smooth
mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp))
mean_prcp <- mean_prcp$mean_prcp
-trips_with_weather |>
+trips_with_weather |>
mutate(is_subs_prcp = prcp >= mean_prcp) |>
group_by(date, tmin, is_subs_prcp) |>
summarize(count_trip = n()) |>
ggplot(aes(x = tmin, y = count_trip, color = is_subs_prcp)) +
geom_point() +
- geom_smooth(method = 'lm')
+ geom_smooth(method = "lm")
# compute the average number of trips and standard deviation in number of trips by hour of the day
# hint: use the hour() function from the lubridate package
-trips_with_weather |>
+trips_with_weather |>
mutate(hour = hour(starttime), date = as.Date(starttime)) |>
group_by(date, hour) |>
summarize(count = n()) |>
From 7eefc5489979ad9b7c0f4acbff2ed597ff79e4cf Mon Sep 17 00:00:00 2001
From: aisha1021
Date: Mon, 2 Jun 2025 16:10:11 -0400
Subject: [PATCH 06/26] completed day 5 assignments
---
week2/Day_5_exercises.Rmd | 91 ++++++++
week2/diamond-sizes.Rmd | 44 +++-
week2/diamond-sizes.html | 427 ++++++++++++++++++++++++++++++++++++++
3 files changed, 561 insertions(+), 1 deletion(-)
create mode 100644 week2/Day_5_exercises.Rmd
create mode 100644 week2/diamond-sizes.html
diff --git a/week2/Day_5_exercises.Rmd b/week2/Day_5_exercises.Rmd
new file mode 100644
index 000000000..d7cd699c3
--- /dev/null
+++ b/week2/Day_5_exercises.Rmd
@@ -0,0 +1,91 @@
+# Exercise 1
+
+Compute the rate for table2, and table4a + table4b. You will need to perform four operations:
+
+1. Extract the number of TB cases per country per year.
+2. Extract the matching population per country per year.
+3. Divide cases by population, and multiply by 10000.
+4. Store back in the appropriate place.
+
+
+```{r table2-rates}
+table2 |>
+ pivot_wider(names_from = type, values_from = count) |>
+ mutate(cases_by_pop = (cases/population)*10000) |>
+ pivot_longer(names_to = "type", values_to = "count", -c(country, year, cases_by_pop))
+```
+
+```{r table4a-table4b-rates}
+table4a
+table4b
+
+table4a_longer <- table4a |>
+ pivot_longer(names_to = "year", values_to = "cases", -country)
+
+table4a_longer
+
+table4b_longer <- table4b |>
+ pivot_longer(names_to = "year", values_to = "population", -country)
+table4b_longer
+
+inner_join(table4a_longer, table4b_longer) |>
+ mutate(cases_by_pop = (cases/population)*10000)
+
+
+```
+
+Which representation is easiest to work with? Which is hardest? Why?
+- `table2` is easier than `table4a + table4b` to work with because we are working with a single dataframe and it requires fewer lines of code. However, I do think that `table4a + table4b` makes more logical sense.
+
+# Exercise 12.3.3
+
+## Exercise 1
+
+
+Why are pivot_longer() and pivot_wider() not perfectly symmetrical?
+Carefully consider the following example:
+
+```{r}
+
+stocks <- tibble(
+ year = c(2015, 2015, 2016, 2016),
+ half = c( 1, 2, 1, 2),
+ return = c(1.88, 0.59, 0.92, 0.17)
+)
+stocks %>%
+ pivot_wider(names_from = year, values_from = return) %>%
+ pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")
+
+```
+
+(Hint: look at the variable types and think about column names.)
+
+- pivot_longer() and pivot_wider() are not symmetrical because when pivot_longer() is executed, the column names end up in a
+different format than the original values. But for pivot_wider(), the new columns remain in the same format.
+
+pivot_longer() has a names_ptypes argument, e.g. names_ptypes = list(year = double()). What does it do?
+
+- The argument checks if the conversion maintains the original data type of a column such as year (which was originally a double)
+
+
+## Exercise 3
+
+What would happen if you widen this table? Why? How could you add a new column to uniquely identify each value?
+```{r}
+
+people <- tribble(
+ ~name, ~names, ~values,
+ #-----------------|--------|------
+ "Phillip Woods", "age", 45,
+ "Phillip Woods", "height", 186,
+ "Phillip Woods", "age", 50,
+ "Jessica Cordero", "age", 37,
+ "Jessica Cordero", "height", 156
+)
+
+```
+
+ - The table depicts that there are two people with the same name, which can cause issues with pivot_wider(). A new column with a unique ID must be
+ established to differentiate each person.
+
+
diff --git a/week2/diamond-sizes.Rmd b/week2/diamond-sizes.Rmd
index 3003fdbd2..1ae0201e0 100644
--- a/week2/diamond-sizes.Rmd
+++ b/week2/diamond-sizes.Rmd
@@ -4,9 +4,16 @@ date: 2016-08-25
output: html_document
---
+```{r, echo = FALSE}
+knitr::opts_chunk$set(
+ echo = FALSE
+)
+```
+
```{r setup, include = FALSE}
library(ggplot2)
library(dplyr)
+library(viridis)
smaller <- diamonds %>%
filter(carat <= 2.5)
@@ -17,8 +24,43 @@ We have data about `r nrow(diamonds)` diamonds. Only
2.5 carats. The distribution of the remainder is shown
below:
-```{r, echo = FALSE}
+```{r}
smaller %>%
ggplot(aes(carat)) +
geom_freqpoly(binwidth = 0.01)
+```
+
+The striking features include:
+- Most diamonds (~25000) are approximately 0.25 carats
+- Majority of the diamonds are less than 1.25 carats
+
+# Exercise 27.4.7
+## Exercise 1
+
+Add a section that explores how diamond sizes vary by cut, colour, and clarity. Assume you’re
+writing a report for someone who doesn’t know R, and instead of setting echo = FALSE on each
+chunk, set a global option.
+
+```{r}
+
+diamonds |>
+ ggplot(aes(x = carat, color = color, fill = color)) +
+ geom_histogram()
+
+```
+
+```{r}
+
+diamonds |>
+ ggplot(aes(x = carat, color = cut, fill = cut)) +
+ geom_histogram() +
+ scale_fill_viridis_d(option = "mako", direction = -1)
+```
+
+```{r}
+
+diamonds |>
+ ggplot(aes(x = carat, color = clarity, fill = clarity)) +
+ geom_histogram() +
+ scale_fill_viridis_d(option = "magma")
```
\ No newline at end of file
diff --git a/week2/diamond-sizes.html b/week2/diamond-sizes.html
new file mode 100644
index 000000000..ab12e6d35
--- /dev/null
+++ b/week2/diamond-sizes.html
@@ -0,0 +1,427 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Diamond sizes
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
We have data about 53940 diamonds. Only 126 are larger than 2.5
+carats. The distribution of the remainder is shown below:
+

+
The striking features include: - Most diamonds (~25000) are
+approximately 0.25 carats - Majority of the diamonds are less than 1.25
+carats
+
+
Exercise 27.4.7
+
+
Exercise 1
+
Add a section that explores how diamond sizes vary by cut, colour,
+and clarity. Assume you’re writing a report for someone who doesn’t know
+R, and instead of setting echo = FALSE on each chunk, set a global
+option.
+
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+

+
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+

+
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
From 2ffbd99512e62967557bf874dc3736ba3cbfdea6 Mon Sep 17 00:00:00 2001
From: aisha1021
Date: Mon, 2 Jun 2025 16:16:39 -0400
Subject: [PATCH 07/26] updated day 5 assignment
---
week2/diamond-sizes.Rmd | 6 +++---
week2/diamond-sizes.html | 3 ---
2 files changed, 3 insertions(+), 6 deletions(-)
diff --git a/week2/diamond-sizes.Rmd b/week2/diamond-sizes.Rmd
index 1ae0201e0..eacd5a1d1 100644
--- a/week2/diamond-sizes.Rmd
+++ b/week2/diamond-sizes.Rmd
@@ -45,7 +45,7 @@ chunk, set a global option.
diamonds |>
ggplot(aes(x = carat, color = color, fill = color)) +
- geom_histogram()
+ geom_histogram(bins = 30)
```
@@ -53,7 +53,7 @@ diamonds |>
diamonds |>
ggplot(aes(x = carat, color = cut, fill = cut)) +
- geom_histogram() +
+ geom_histogram(bins = 30) +
scale_fill_viridis_d(option = "mako", direction = -1)
```
@@ -61,6 +61,6 @@ diamonds |>
diamonds |>
ggplot(aes(x = carat, color = clarity, fill = clarity)) +
- geom_histogram() +
+ geom_histogram(bins = 30) +
scale_fill_viridis_d(option = "magma")
```
\ No newline at end of file
diff --git a/week2/diamond-sizes.html b/week2/diamond-sizes.html
index ab12e6d35..d6a3dc732 100644
--- a/week2/diamond-sizes.html
+++ b/week2/diamond-sizes.html
@@ -369,11 +369,8 @@ Exercise 1
and clarity. Assume you’re writing a report for someone who doesn’t know
R, and instead of setting echo = FALSE on each chunk, set a global
option.
-## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

-## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

-## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

From 254d1990c9efe6d8283789fc3c77bc1e508c50c5 Mon Sep 17 00:00:00 2001
From: aisha1021
Date: Tue, 3 Jun 2025 16:15:43 -0400
Subject: [PATCH 08/26] week 2 day 2 - Aisha
---
week2/statistical_inference.Rmd | 138 ++++++++++++++++++++++++++++++++
1 file changed, 138 insertions(+)
diff --git a/week2/statistical_inference.Rmd b/week2/statistical_inference.Rmd
index 68c6dde5e..0cd079e02 100644
--- a/week2/statistical_inference.Rmd
+++ b/week2/statistical_inference.Rmd
@@ -55,6 +55,144 @@ ggplot(se_data, aes(x = n, y = se)) +
stat_function(fun=function(n) {sqrt(p * (1 - p) / n)}, linetype=2)
```
+
+Value Probability
+0 p
+1 2p
+2 3p
+3 4p
+4 5p
+5 6p
+Table 4.4: The Distribution of Y
+
+Question 4.1. Table 4.6 presents the probabilities of the random variable Y .
+These probabilities are a function of the number p, the probability of the value
+“0”. Answer the following questions:
+1. What is the value of p = 1/21
+2. P(Y < 3) = 6/21
+3. P(Y = odd) = 12/21
+4. P(1 ≤ Y < 4) = 9/21
+5. P(|Y − 3| < 1.5) = 12/21
+6. E(Y ) = 3.333
+7. Var(Y ) = 2.222
+8. What is the standard deviation of Y = sqrt(2.22)
+
+
+Question 4.2. One invests $2 to participate in a game of chance. In this game
+a coin is tossed three times. If all tosses end up “Head” then the player wins
+$10. Otherwise, the player losses the investment.
+1. What is the probability of winning the game?
+
+(0.5 * 0.5 * 0.5) = 1/8
+
+2. What is the probability of loosing the game?
+
+1 - 1/8 = 7/8
+
+3. What is the expected gain for the player that plays this game? (Notice that the expectation can obtain a negative value.)
+
+E[gain] = 8 * 1/8 + (-2) * (7/8)
+
+
+Question 6.1. Consider the problem of establishing regulations concerning the
+maximum number of people who can occupy a lift. In particular, we would like
+to assess the probability of exceeding maximal weight when 8 people are allowed
+to use the lift simultaneously and compare that to the probability of allowing 9
+people into the lift.
+Assume that the total weight of 8 people chosen at random follows a normal
+distribution with a mean of 560kg and a standard deviation of 57kg. Assume
+that the total weight of 9 people chosen at random follows a normal distribution
+with a mean of 630kg and a standard deviation of 61kg.
+1. What is the probability that the total weight of 8 people exceeds 650kg?
+
+```{r}
+
+val <- pnorm(650,560,57)
+1 - val
+
+# 0.05717406
+
+```
+
+2. What is the probability that the total weight of 9 people exceeds 650kg?
+
+```{r}
+
+val <- pnorm(650,630,61)
+1 - val
+
+# 0.3715054
+
+```
+
+3. What is the central region that contains 80% of distribution of the total
+weight of 8 people?
+
+```{r}
+
+val1 <- qnorm(0.1, 560, 57)
+val1
+val2 <- qnorm(0.9, 560, 57)
+val2
+
+# [486.9516, 633.0484]
+
+```
+
+4. What is the central region that contains 80% of distribution of the total
+weight of 9 people?
+
+```{r}
+
+val3 <- qnorm(0.1, 630, 61)
+val3
+val4 <- qnorm(0.9, 630, 61)
+val4
+
+# [551.8254, 708.1746]
+
+```
+
+Question 7.1. The file “pop2.csv” contains information associated to the
+blood pressure of an imaginary population of size 100,000. The file can be found
+on the internet (http://pluto.huji.ac.il/~msby/StatThink/Datasets/pop2.
+csv). The variables in this file are:
+id: A numerical variable. A 7 digits number that serves as a unique identifier
+of the subject.
+sex: A factor variable. The sex of each subject. The values are either “MALE”
+or “FEMALE”.
+age: A numerical variable. The age of each subject.
+bmi: A numerical variable. The body mass index of each subject.
+systolic: A numerical variable. The systolic blood pressure of each subject.
+diastolic: A numerical variable. The diastolic blood pressure of each subject.
+group: A factor variable. The blood pressure category of each subject. The
+values are “NORMAL” both the systolic blood pressure is within its normal
+range (between 90 and 139) and the diastolic blood pressure is within its
+normal range (between 60 and 89). The value is “HIGH” if either measurements of blood pressure are above their normal upper limits and it is
+“LOW” if either measurements are below their normal lower limits.
+Our goal in this question is to investigate the sampling distribution of the sample
+average of the variable “bmi”. We assume a sample of size n = 150.
+1. Compute the population average of the variable “bmi”.
+
+```{r}
+
+pop2 <- read.csv("C:\\Users\\ds3\\Downloads\\pop2.csv")
+head(pop2)
+
+```
+
+2. Compute the population standard deviation of the variable “bmi”
+3. Compute the expectation of the sampling distribution for the sample average of the variable.
+4. Compute the standard deviation of the sampling distribution for the sample average of the variable.
+5. Identify, using simulations, the central region that contains 80% of the
+sampling distribution of the sample average.
+6. Identify, using the Central Limit Theorem, an approximation of the central region that contains 80% of the sampling distribution of the sample
+average.
+
+
+
+
+
## Confidence intervals
```{r}
set.seed(42)
From 59accc1eb6f8e1cb61db77e6d7590b0e2615acda Mon Sep 17 00:00:00 2001
From: aisha1021
Date: Wed, 4 Jun 2025 10:40:04 -0400
Subject: [PATCH 09/26] updated files
---
week2/statistical_inference.Rmd | 137 --------
week2/statistical_inference.html | 324 +++++++++---------
...ay_5_exercises.Rmd => w2_d1_exercises.Rmd} | 0
week2/w2_d2_exercises.Rmd | 134 ++++++++
4 files changed, 299 insertions(+), 296 deletions(-)
rename week2/{Day_5_exercises.Rmd => w2_d1_exercises.Rmd} (100%)
create mode 100644 week2/w2_d2_exercises.Rmd
diff --git a/week2/statistical_inference.Rmd b/week2/statistical_inference.Rmd
index 0cd079e02..0f9dbd340 100644
--- a/week2/statistical_inference.Rmd
+++ b/week2/statistical_inference.Rmd
@@ -56,143 +56,6 @@ ggplot(se_data, aes(x = n, y = se)) +
```
-Value Probability
-0 p
-1 2p
-2 3p
-3 4p
-4 5p
-5 6p
-Table 4.4: The Distribution of Y
-
-Question 4.1. Table 4.6 presents the probabilities of the random variable Y .
-These probabilities are a function of the number p, the probability of the value
-“0”. Answer the following questions:
-1. What is the value of p = 1/21
-2. P(Y < 3) = 6/21
-3. P(Y = odd) = 12/21
-4. P(1 ≤ Y < 4) = 9/21
-5. P(|Y − 3| < 1.5) = 12/21
-6. E(Y ) = 3.333
-7. Var(Y ) = 2.222
-8. What is the standard deviation of Y = sqrt(2.22)
-
-
-Question 4.2. One invests $2 to participate in a game of chance. In this game
-a coin is tossed three times. If all tosses end up “Head” then the player wins
-$10. Otherwise, the player losses the investment.
-1. What is the probability of winning the game?
-
-(0.5 * 0.5 * 0.5) = 1/8
-
-2. What is the probability of loosing the game?
-
-1 - 1/8 = 7/8
-
-3. What is the expected gain for the player that plays this game? (Notice that the expectation can obtain a negative value.)
-
-E[gain] = 8 * 1/8 + (-2) * (7/8)
-
-
-Question 6.1. Consider the problem of establishing regulations concerning the
-maximum number of people who can occupy a lift. In particular, we would like
-to assess the probability of exceeding maximal weight when 8 people are allowed
-to use the lift simultaneously and compare that to the probability of allowing 9
-people into the lift.
-Assume that the total weight of 8 people chosen at random follows a normal
-distribution with a mean of 560kg and a standard deviation of 57kg. Assume
-that the total weight of 9 people chosen at random follows a normal distribution
-with a mean of 630kg and a standard deviation of 61kg.
-1. What is the probability that the total weight of 8 people exceeds 650kg?
-
-```{r}
-
-val <- pnorm(650,560,57)
-1 - val
-
-# 0.05717406
-
-```
-
-2. What is the probability that the total weight of 9 people exceeds 650kg?
-
-```{r}
-
-val <- pnorm(650,630,61)
-1 - val
-
-# 0.3715054
-
-```
-
-3. What is the central region that contains 80% of distribution of the total
-weight of 8 people?
-
-```{r}
-
-val1 <- qnorm(0.1, 560, 57)
-val1
-val2 <- qnorm(0.9, 560, 57)
-val2
-
-# [486.9516, 633.0484]
-
-```
-
-4. What is the central region that contains 80% of distribution of the total
-weight of 9 people?
-
-```{r}
-
-val3 <- qnorm(0.1, 630, 61)
-val3
-val4 <- qnorm(0.9, 630, 61)
-val4
-
-# [551.8254, 708.1746]
-
-```
-
-Question 7.1. The file “pop2.csv” contains information associated to the
-blood pressure of an imaginary population of size 100,000. The file can be found
-on the internet (http://pluto.huji.ac.il/~msby/StatThink/Datasets/pop2.
-csv). The variables in this file are:
-id: A numerical variable. A 7 digits number that serves as a unique identifier
-of the subject.
-sex: A factor variable. The sex of each subject. The values are either “MALE”
-or “FEMALE”.
-age: A numerical variable. The age of each subject.
-bmi: A numerical variable. The body mass index of each subject.
-systolic: A numerical variable. The systolic blood pressure of each subject.
-diastolic: A numerical variable. The diastolic blood pressure of each subject.
-group: A factor variable. The blood pressure category of each subject. The
-values are “NORMAL” both the systolic blood pressure is within its normal
-range (between 90 and 139) and the diastolic blood pressure is within its
-normal range (between 60 and 89). The value is “HIGH” if either measurements of blood pressure are above their normal upper limits and it is
-“LOW” if either measurements are below their normal lower limits.
-Our goal in this question is to investigate the sampling distribution of the sample
-average of the variable “bmi”. We assume a sample of size n = 150.
-1. Compute the population average of the variable “bmi”.
-
-```{r}
-
-pop2 <- read.csv("C:\\Users\\ds3\\Downloads\\pop2.csv")
-head(pop2)
-
-```
-
-2. Compute the population standard deviation of the variable “bmi”
-3. Compute the expectation of the sampling distribution for the sample average of the variable.
-4. Compute the standard deviation of the sampling distribution for the sample average of the variable.
-5. Identify, using simulations, the central region that contains 80% of the
-sampling distribution of the sample average.
-6. Identify, using the Central Limit Theorem, an approximation of the central region that contains 80% of the sampling distribution of the sample
-average.
-
-
-
-
-
## Confidence intervals
```{r}
set.seed(42)
diff --git a/week2/statistical_inference.html b/week2/statistical_inference.html
index db78886b6..a0de9fdf5 100644
--- a/week2/statistical_inference.html
+++ b/week2/statistical_inference.html
@@ -1,28 +1,38 @@
-
+
-
+
-
+
Statistical Inference
-
+
-
+
-
+
+
-
+
+
+
+
+
+
-
@@ -653,7 +642,7 @@
Computing power by direct simulation
// add bootstrap table styles to pandoc tables
function bootstrapStylePandocTables() {
- $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
+ $('tr.odd').parent('tbody').parent('table').addClass('table table-condensed');
}
$(document).ready(function () {
bootstrapStylePandocTables();
@@ -662,6 +651,23 @@ Computing power by direct simulation
+
+
+
+
+
+
+