From c8cd96be1af9e1a0ad9bf5ac8a4ef9bf51efaa2c Mon Sep 17 00:00:00 2001 From: aisha1021 Date: Wed, 28 May 2025 14:30:42 -0400 Subject: [PATCH 01/26] day 2 assignment --- .../2014-citibike-tripdata/._.DS_Store | Bin 0 -> 120 bytes week1/citibike.sh | 17 +++++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 week1/__MACOSX/2014-citibike-tripdata/._.DS_Store diff --git a/week1/__MACOSX/2014-citibike-tripdata/._.DS_Store b/week1/__MACOSX/2014-citibike-tripdata/._.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a5b28df1cbc6e15bd0d35cdadd0c2e65d5131c7d GIT binary patch literal 120 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDI}u^SMB_!U6R08`;00ODZ-jv*mIP;rnB Iur73U08|YJ=l}o! literal 0 HcmV?d00001 diff --git a/week1/citibike.sh b/week1/citibike.sh index 25604f545..f56507913 100755 --- a/week1/citibike.sh +++ b/week1/citibike.sh @@ -5,19 +5,36 @@ # count the number of unique stations +cut -d, -f5 201402-citibike-tripdata.csv | sort | uniq -c | wc -l + # count the number of unique bikes +cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | wc -l + # count the number of trips per day +cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c + # find the day with the most rides +cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort -r | head -n1 + # find the day with the fewest rides +cut -d, -f2 201402-citibike-tripdata.csv | cut -d' ' -f1 | sort | uniq -c | sort | head -n2 | tail -n1 + # find the id of the bike with the most rides +cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | sort -r | head -n1 + # count the number of rides by gender and birth year +cut -d, -f14,15 201402-citibike-tripdata.csv | sort | uniq -c + # count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...) 
+cut -d, -f5 201402-citibike-tripdata.csv | grep '.*[0-9].*&.*[0-9].*' | wc -l # compute the average trip duration + +cut -d, -f1 201402-citibike-tripdata.csv | tail -n +2 | awk '{sum += $1; avg = sum/NR} END {print avg}' \ No newline at end of file From e993c3b575c5c270d5b991725f35173f6131137d Mon Sep 17 00:00:00 2001 From: aisha1021 Date: Thu, 29 May 2025 16:06:03 -0400 Subject: [PATCH 02/26] day 3 R assignment --- week1/citibike.R | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/week1/citibike.R b/week1/citibike.R index ad01de1d3..cfc601324 100644 --- a/week1/citibike.R +++ b/week1/citibike.R @@ -6,7 +6,8 @@ library(lubridate) ######################################## # read one month of data -trips <- read_csv('201402-citibike-tripdata.csv') +trips <- read_csv('201402-citibike-tripdata.csv', na = c("", "\\N")) + # replace spaces in column names with underscores names(trips) <- gsub(' ', '_', names(trips)) @@ -24,25 +25,42 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn # count the number of trips (= rows in the data frame) +summarize(trips, count=n()) + # find the earliest and latest birth years (see help for max and min to deal with NAs) +summarize(trips, min_year = min(birth_year, na.rm = TRUE), max_year = max(birth_year, na.rm = TRUE)) # use filter and grepl to find all trips that either start or end on broadway +select(trips, start_station_name, end_station_name) |> filter(grepl(".*Broadway.*", start_station_name) | grepl(".*Broadway.*", end_station_name)) + # do the same, but find all trips that both start and end on broadway +select(trips, start_station_name, end_station_name) |> filter(grepl(".*Broadway.*", start_station_name) & grepl(".*Broadway.*", end_station_name)) + # find all unique station names +trips |> distinct(start_station_name) + # count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender # do this all at 
once, by using summarize() with multiple arguments +trips |> group_by(gender) |> summarize(count = n(), avg = mean(tripduration) / 60, std = sd(tripduration) / 60) # find the 10 most frequent station-to-station trips +trips |> select(start_station_name, end_station_name) |> count(start_station_name, end_station_name, sort = TRUE) |> head(10) # find the top 3 end stations for trips starting from each start station +trips |> group_by(start_station_name) |> count(end_station_name, sort = TRUE) |> slice_max(n, n = 3) # find the top 3 most common station-to-station trips by gender +trips |> group_by(gender) |> count(end_station_name, sort = TRUE) |> slice_max(n, n = 3) # find the day with the most trips # tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package) +trips |> mutate(date = as.Date(starttime)) |> select(date) |> count(date, sort = TRUE) |> head(1) # compute the average number of trips taken during each of the 24 hours of the day across the entire month # what time(s) of day tend to be peak hour(s)? + +date_hour_counts <- trips |> mutate(hour = hour(starttime)) |> mutate(date = as.Date(starttime)) |> select(date, hour) |> group_by(hour) |> count(date, hour) +date_hour_counts |> group_by(hour) |> summarize(mean(n)) From 4f4536c4e1d1300977e6418a5c0be0967451dd92 Mon Sep 17 00:00:00 2001 From: aisha1021 Date: Thu, 29 May 2025 16:53:23 -0400 Subject: [PATCH 03/26] day 3 R assignment - Aisha Malik --- week1/citibike.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/week1/citibike.R b/week1/citibike.R index cfc601324..b0925f27b 100644 --- a/week1/citibike.R +++ b/week1/citibike.R @@ -63,4 +63,5 @@ trips |> mutate(date = as.Date(starttime)) |> select(date) |> count(date, sort = # what time(s) of day tend to be peak hour(s)? 
date_hour_counts <- trips |> mutate(hour = hour(starttime)) |> mutate(date = as.Date(starttime)) |> select(date, hour) |> group_by(hour) |> count(date, hour) -date_hour_counts |> group_by(hour) |> summarize(mean(n)) +date_hour_counts |> group_by(hour) |> summarize(avg = mean(n)) |> arrange(avg) |> tail(1) + From 812460bd020704599875c7c349e5ce727ffc29bf Mon Sep 17 00:00:00 2001 From: aisha1021 Date: Fri, 30 May 2025 16:46:20 -0400 Subject: [PATCH 04/26] complected day 4 plotting assignment - Aisha Malik --- week1/plot_trips.R | 106 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/week1/plot_trips.R b/week1/plot_trips.R index 4f25437ba..67fc513ee 100644 --- a/week1/plot_trips.R +++ b/week1/plot_trips.R @@ -17,23 +17,68 @@ load('trips.RData') # plot trip data ######################################## +library(ggplot2) + # plot the distribution of trip times across all rides (compare a histogram vs. a density plot) +trips |> + ggplot(aes(x = tripduration)) + + geom_histogram(bins=30) + + scale_x_log10() + +trips |> + ggplot(aes(x = tripduration)) + + geom_density(fill = "grey") + + scale_x_log10() + # plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. 
a density plot) + +trips |> + ggplot(aes(x = tripduration, color = usertype, fill = usertype)) + + geom_histogram(bins=30) + + scale_x_log10() + +trips |> + ggplot(aes(x = tripduration, color = usertype, fill = usertype)) + + geom_density() + + scale_x_log10() + # plot the total number of trips on each day in the dataset +trips |> + mutate(date = as.year(starttime)) |> + ggplot(aes(x = date)) + + geom_histogram(bins=365) + + # plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color) +trips |> + mutate(age = year(Sys.Date()) - birth_year) |> + group_by(gender, age) |> + summarize(count_trip_by_age = n()) |> + ggplot(aes(x = age, y = count_trip_by_age, color=gender, fill=gender)) + + geom_area(alpha=0.8) + + scale_y_log10() + + # plot the ratio of male to female trips (on the y axis) by age (on the x axis) # hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio # (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet) +head(trips) +trips |> select(gender) + ######################################## # plot weather data ######################################## # plot the minimum temperature (on the y axis) over each day (on the x axis) +weather |> + ggplot(aes(x = date, y = tmin)) + + geom_point() + # plot the minimum temperature and maximum temperature (on the y axis, with different colors) over each day (on the x axis) # hint: try using the pivot_longer() function for this to reshape things before plotting # (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet) @@ -48,15 +93,76 @@ trips_with_weather <- inner_join(trips, weather, by="ymd") # plot the number of trips as a function of the minimum temperature, where each point represents a day # you'll need to summarize the trips and join to the weather data to do this +trips_with_weather |> + group_by(date, tmin) |> + summarize(count_trip = n()) |> + ggplot(aes(x = 
tmin, y = count_trip)) + + geom_point() + + # repeat this, splitting results by whether there was substantial precipitation or not # you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this +mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp)) +mean_prcp <- mean_prcp$mean_prcp + +trips_with_weather |> + mutate(is_subs_prcp = prcp >= mean_prcp) |> + group_by(date, tmin, is_subs_prcp) |> + summarize(count_trip = n()) |> + ggplot(aes(x = tmin, y = count_trip, color = is_subs_prcp)) + + geom_point() + + + + + # add a smoothed fit on top of the previous plot, using geom_smooth +mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp)) +mean_prcp <- mean_prcp$mean_prcp + +trips_with_weather |> + mutate(is_subs_prcp = prcp >= mean_prcp) |> + group_by(date, tmin, is_subs_prcp) |> + summarize(count_trip = n()) |> + ggplot(aes(x = tmin, y = count_trip, color = is_subs_prcp)) + + geom_point() + + geom_smooth(method = 'lm') + + # compute the average number of trips and standard deviation in number of trips by hour of the day # hint: use the hour() function from the lubridate package +trips_with_weather |> + mutate(hour = hour(starttime), date = as.Date(starttime)) |> + group_by(date, hour) |> + summarize(count = n()) |> + group_by(hour) |> + summarize(avg = mean(count), std = sd(count)) + # plot the above +trips_with_weather |> + mutate(hour = hour(starttime), date = as.Date(starttime)) |> + group_by(date, hour) |> + summarize(count = n()) |> + group_by(hour) |> + summarize(avg = mean(count), std = sd(count)) |> + pivot_longer(cols = c(avg, std), names_to = "stat", values_to = "value") |> + ggplot(aes(x = hour, y = value, color = stat, fill = stat)) + + geom_col(position = "dodge") + # repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. 
weekend days # hint: use the wday() function from the lubridate package + +trips_with_weather |> + mutate(hour = hour(starttime), date = as.Date(starttime), day = wday(starttime, label = TRUE)) |> + group_by(date, day, hour) |> + summarize(count = n()) |> + group_by(day, hour) |> + summarize(avg = mean(count), std = sd(count)) |> + pivot_longer(cols = c(avg, std), names_to = "stat", values_to = "value") |> + ggplot(aes(x = hour, y = value, color = stat, fill = stat)) + + geom_col(position = "dodge") + + facet_wrap(~ day) From 53811d3b52dd5d3615abc2ff1f7f26da13df577b Mon Sep 17 00:00:00 2001 From: aisha1021 Date: Mon, 2 Jun 2025 13:30:30 -0400 Subject: [PATCH 05/26] completed plotting assignment - Aisha Malik --- week1/plot_trips.R | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/week1/plot_trips.R b/week1/plot_trips.R index 67fc513ee..d2c8185b4 100644 --- a/week1/plot_trips.R +++ b/week1/plot_trips.R @@ -10,7 +10,7 @@ library(scales) theme_set(theme_bw()) # load RData file output by load_trips.R -load('trips.RData') +load("trips.RData") ######################################## @@ -22,7 +22,7 @@ library(ggplot2) # plot the distribution of trip times across all rides (compare a histogram vs. 
a density plot) trips |> ggplot(aes(x = tripduration)) + - geom_histogram(bins=30) + + geom_histogram(bins = 30) + scale_x_log10() trips |> @@ -36,7 +36,7 @@ trips |> trips |> ggplot(aes(x = tripduration, color = usertype, fill = usertype)) + - geom_histogram(bins=30) + + geom_histogram(bins = 30) + scale_x_log10() trips |> @@ -49,7 +49,7 @@ trips |> trips |> mutate(date = as.year(starttime)) |> ggplot(aes(x = date)) + - geom_histogram(bins=365) + geom_histogram(bins = 365) # plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color) @@ -58,8 +58,8 @@ trips |> mutate(age = year(Sys.Date()) - birth_year) |> group_by(gender, age) |> summarize(count_trip_by_age = n()) |> - ggplot(aes(x = age, y = count_trip_by_age, color=gender, fill=gender)) + - geom_area(alpha=0.8) + + ggplot(aes(x = age, y = count_trip_by_age, color = gender, fill = gender)) + + geom_area(alpha = 0.8) + scale_y_log10() @@ -68,7 +68,19 @@ trips |> # (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet) head(trips) -trips |> select(gender) +trips |> + mutate(age = year(Sys.Date()) - birth_year) |> + select(gender, age) |> + group_by(age, gender) |> + summarize(coumt_age_gender = n()) |> + pivot_wider(names_from = gender, values_from = coumt_age_gender) |> + mutate(male_female_ratio = Male/Female) |> + ggplot(aes(x = age, y = male_female_ratio)) + + geom_point() + + geom_smooth(se = FALSE) + + + ######################################## # plot weather data @@ -83,12 +95,19 @@ weather |> # hint: try using the pivot_longer() function for this to reshape things before plotting # (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet) +weather |> + select(date, tmin, tmax) |> + pivot_longer(names_to = "min_max", values_to = "temperature", -date) |> + ggplot(aes(x = date, y = temperature, color = min_max, fill = min_max)) + + geom_point() + + ######################################## # plot trip 
and weather data ######################################## # join trips and weather -trips_with_weather <- inner_join(trips, weather, by="ymd") +trips_with_weather <- inner_join(trips, weather, by = "ymd") # plot the number of trips as a function of the minimum temperature, where each point represents a day # you'll need to summarize the trips and join to the weather data to do this @@ -106,7 +125,7 @@ trips_with_weather |> mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp)) mean_prcp <- mean_prcp$mean_prcp -trips_with_weather |> +trips_with_weather |> mutate(is_subs_prcp = prcp >= mean_prcp) |> group_by(date, tmin, is_subs_prcp) |> summarize(count_trip = n()) |> @@ -115,26 +134,24 @@ trips_with_weather |> - - # add a smoothed fit on top of the previous plot, using geom_smooth mean_prcp <- trips_with_weather |> summarize(mean_prcp = mean(prcp)) mean_prcp <- mean_prcp$mean_prcp -trips_with_weather |> +trips_with_weather |> mutate(is_subs_prcp = prcp >= mean_prcp) |> group_by(date, tmin, is_subs_prcp) |> summarize(count_trip = n()) |> ggplot(aes(x = tmin, y = count_trip, color = is_subs_prcp)) + geom_point() + - geom_smooth(method = 'lm') + geom_smooth(method = "lm") # compute the average number of trips and standard deviation in number of trips by hour of the day # hint: use the hour() function from the lubridate package -trips_with_weather |> +trips_with_weather |> mutate(hour = hour(starttime), date = as.Date(starttime)) |> group_by(date, hour) |> summarize(count = n()) |> From 7eefc5489979ad9b7c0f4acbff2ed597ff79e4cf Mon Sep 17 00:00:00 2001 From: aisha1021 Date: Mon, 2 Jun 2025 16:10:11 -0400 Subject: [PATCH 06/26] completed day 5 assignments --- week2/Day_5_exercises.Rmd | 91 ++++++++ week2/diamond-sizes.Rmd | 44 +++- week2/diamond-sizes.html | 427 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 561 insertions(+), 1 deletion(-) create mode 100644 week2/Day_5_exercises.Rmd create mode 100644 week2/diamond-sizes.html diff --git 
a/week2/Day_5_exercises.Rmd b/week2/Day_5_exercises.Rmd new file mode 100644 index 000000000..d7cd699c3 --- /dev/null +++ b/week2/Day_5_exercises.Rmd @@ -0,0 +1,91 @@ +# Exercie 1 + +Compute the rate for table2, and table4a + table4b. You will need to perform four operations: + +1. Extract the number of TB cases per country per year. +2. Extract the matching population per country per year. +3. Divide cases by population, and multiply by 10000. +4. Store back in the appropriate place. + + +```{r table2-rates} +table2 |> + pivot_wider(names_from = type, values_from = count) |> + mutate(cases_by_pop = (cases/population)*10000) |> + pivot_longer(names_to = "type", values_to = "count", -c(country, year, cases_by_pop)) +``` + +```{r table4a-table4b-rates} +table4a +table4b + +table4a_longer <- table4a |> + pivot_longer(names_to = "year", values_to = "cases", -country) + +table4a_longer + +table4b_longer <- table4b |> + pivot_longer(names_to = "year", values_to = "population", -country) +table4b_longer + +inner_join(table4a_longer, table4b_longer) |> + mutate(cases_by_pop = (cases/population)*10000) + + +``` + +Which representation is easiest to work with? Which is hardest? Why +- `table2` is easier than `table4a + table4b` to work with because we are working with a single dataframe and it requires less lines of code. However, I do think that `table4a + table4b` makes more logical sense. + +# Exercie 12.3.3 + +## Exercise 1 + + +Why are pivot_longer() and pivot_wider() not perfectly symmetrical? +Carefully consider the following example: + +```{r} + +stocks <- tibble( + year = c(2015, 2015, 2016, 2016), + half = c( 1, 2, 1, 2), + return = c(1.88, 0.59, 0.92, 0.17) +) +stocks %>% + pivot_wider(names_from = year, values_from = return) %>% + pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return") + +``` + +(Hint: look at the variable types and think about column names.) 
+ +- pivot_longer() and pivot_wider() are not symmetrical because when pivot_longer() is executed, the column names are in +different format than the original value. But for pivot_wider(), the new columns remain in the same format. + +pivot_longer() has a names_ptypes argument, e.g. names_ptypes = list(year = double()). What does it do? + +- The argument checks if the conversion maintains the original data type of a column such as year (which was originally a double) + + +## Exercie 3 + +What would happen if you widen this table? Why? How could you add a new column to uniquely identify each value? +```{r} + +people <- tribble( + ~name, ~names, ~values, + #-----------------|--------|------ + "Phillip Woods", "age", 45, + "Phillip Woods", "height", 186, + "Phillip Woods", "age", 50, + "Jessica Cordero", "age", 37, + "Jessica Cordero", "height", 156 +) + +``` + + - The table depicts that there are two people with the same name which can cause issues with pivot_wider(). An new column with the specific ID must be + established to differenciate each person. + + diff --git a/week2/diamond-sizes.Rmd b/week2/diamond-sizes.Rmd index 3003fdbd2..1ae0201e0 100644 --- a/week2/diamond-sizes.Rmd +++ b/week2/diamond-sizes.Rmd @@ -4,9 +4,16 @@ date: 2016-08-25 output: html_document --- +```{r, echo = FALSE} +knitr::opts_chunk$set( + echo = FALSE +) +``` + ```{r setup, include = FALSE} library(ggplot2) library(dplyr) +library(viridis) smaller <- diamonds %>% filter(carat <= 2.5) @@ -17,8 +24,43 @@ We have data about `r nrow(diamonds)` diamonds. Only 2.5 carats. The distribution of the remainder is shown below: -```{r, echo = FALSE} +```{r} smaller %>% ggplot(aes(carat)) + geom_freqpoly(binwidth = 0.01) +``` + +The striking features include: +- Most diamonds (~25000) are approximately 0.25 carats +- Majority of the diamonds are less than 1.25 carats + +# Exercise 27.4.7 +## Exercise 1 + +Add a section that explores how diamond sizes vary by cut, colour, and clarity. 
Assume you’re +writing a report for someone who doesn’t know R, and instead of setting echo = FALSE on each +chunk, set a global option. + +```{r} + +diamonds |> + ggplot(aes(x = carat, color = color, fill = color)) + + geom_histogram() + +``` + +```{r} + +diamonds |> + ggplot(aes(x = carat, color = cut, fill = cut)) + + geom_histogram() + + scale_fill_viridis_d(option = "mako", direction = -1) +``` + +```{r} + +diamonds |> + ggplot(aes(x = carat, color = clarity, fill = clarity)) + + geom_histogram() + + scale_fill_viridis_d(option = "magma") ``` \ No newline at end of file diff --git a/week2/diamond-sizes.html b/week2/diamond-sizes.html new file mode 100644 index 000000000..ab12e6d35 --- /dev/null +++ b/week2/diamond-sizes.html @@ -0,0 +1,427 @@ + + + + + + + + + + + + + + +Diamond sizes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

We have data about 53940 diamonds. Only 126 are larger than 2.5 +carats. The distribution of the remainder is shown below:

+

+

The striking features include: - Most diamonds (~25000) are +approximately 0.25 carats - Majority of the diamonds are less than 1.25 +carats

+
+

Exercise 27.4.7

+
+

Exercise 1

+

Add a section that explores how diamond sizes vary by cut, colour, +and clarity. Assume you’re writing a report for someone who doesn’t know +R, and instead of setting echo = FALSE on each chunk, set a global +option.

+
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+

+
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+

+
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+

+
+
+ + + + +
+ + + + + + + + + + + + + + + From 2ffbd99512e62967557bf874dc3736ba3cbfdea6 Mon Sep 17 00:00:00 2001 From: aisha1021 Date: Mon, 2 Jun 2025 16:16:39 -0400 Subject: [PATCH 07/26] updated day 5 assignment --- week2/diamond-sizes.Rmd | 6 +++--- week2/diamond-sizes.html | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/week2/diamond-sizes.Rmd b/week2/diamond-sizes.Rmd index 1ae0201e0..eacd5a1d1 100644 --- a/week2/diamond-sizes.Rmd +++ b/week2/diamond-sizes.Rmd @@ -45,7 +45,7 @@ chunk, set a global option. diamonds |> ggplot(aes(x = carat, color = color, fill = color)) + - geom_histogram() + geom_histogram(bins = 30) ``` @@ -53,7 +53,7 @@ diamonds |> diamonds |> ggplot(aes(x = carat, color = cut, fill = cut)) + - geom_histogram() + + geom_histogram(bins = 30) + scale_fill_viridis_d(option = "mako", direction = -1) ``` @@ -61,6 +61,6 @@ diamonds |> diamonds |> ggplot(aes(x = carat, color = clarity, fill = clarity)) + - geom_histogram() + + geom_histogram(bins = 30) + scale_fill_viridis_d(option = "magma") ``` \ No newline at end of file diff --git a/week2/diamond-sizes.html b/week2/diamond-sizes.html index ab12e6d35..d6a3dc732 100644 --- a/week2/diamond-sizes.html +++ b/week2/diamond-sizes.html @@ -369,11 +369,8 @@

Exercise 1

and clarity. Assume you’re writing a report for someone who doesn’t know R, and instead of setting echo = FALSE on each chunk, set a global option.

-
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

-
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

-
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

From 254d1990c9efe6d8283789fc3c77bc1e508c50c5 Mon Sep 17 00:00:00 2001 From: aisha1021 Date: Tue, 3 Jun 2025 16:15:43 -0400 Subject: [PATCH 08/26] week 2 day 2 - Aisha --- week2/statistical_inference.Rmd | 138 ++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/week2/statistical_inference.Rmd b/week2/statistical_inference.Rmd index 68c6dde5e..0cd079e02 100644 --- a/week2/statistical_inference.Rmd +++ b/week2/statistical_inference.Rmd @@ -55,6 +55,144 @@ ggplot(se_data, aes(x = n, y = se)) + stat_function(fun=function(n) {sqrt(p * (1 - p) / n)}, linetype=2) ``` + +Value Probability +0 p +1 2p +2 3p +3 4p +4 5p +5 6p +Table 4.4: The Distribution of Y + +Question 4.1. Table 4.6 presents the probabilities of the random variable Y . +These probabilities are a function of the number p, the probability of the value +“0”. Answer the following questions: +1. What is the value of p = 1/21 +2. P(Y < 3) = 6/21 +3. P(Y = odd) = 12/21 +4. P(1 ≤ Y < 4) = 9/21 +5. P(|Y − 3| < 1.5) = 12/21 +6. E(Y ) = 3.333 +7. Var(Y ) = 2.222 +8. What is the standard deviation of Y = sqrt(2.22) + + +Question 4.2. One invests $2 to participate in a game of chance. In this game +a coin is tossed three times. If all tosses end up “Head” then the player wins +$10. Otherwise, the player losses the investment. +1. What is the probability of winning the game? + +(0.5 * 0.5 * 0.5) = 1/8 + +2. What is the probability of loosing the game? + +1 - 1/8 = 7/8 + +3. What is the expected gain for the player that plays this game? (Notice that the expectation can obtain a negative value.) + +E[gain] = 8 * 1/8 + (-2) * (7/8) + + +Question 6.1. Consider the problem of establishing regulations concerning the +maximum number of people who can occupy a lift. In particular, we would like +to assess the probability of exceeding maximal weight when 8 people are allowed +to use the lift simultaneously and compare that to the probability of allowing 9 +people into the lift. 
+Assume that the total weight of 8 people chosen at random follows a normal +distribution with a mean of 560kg and a standard deviation of 57kg. Assume +that the total weight of 9 people chosen at random follows a normal distribution +with a mean of 630kg and a standard deviation of 61kg. +1. What is the probability that the total weight of 8 people exceeds 650kg? + +```{r} + +val <- pnorm(650,560,57) +1 - val + +# 0.05717406 + +``` + +2. What is the probability that the total weight of 9 people exceeds 650kg? + +```{r} + +val <- pnorm(650,630,61) +1 - val + +# 0.3715054 + +``` + +3. What is the central region that contains 80% of distribution of the total +weight of 8 people? + +```{r} + +val1 <- qnorm(0.1, 560, 57) +val1 +val2 <- qnorm(0.9, 560, 57) +val2 + +# [486.9516, 633.0484] + +``` + +4. What is the central region that contains 80% of distribution of the total +weight of 9 people? + +```{r} + +val3 <- qnorm(0.1, 630, 61) +val3 +val4 <- qnorm(0.9, 630, 61) +val4 + +# [551.8254, 708.1746] + +``` + +Question 7.1. The file “pop2.csv” contains information associated to the +blood pressure of an imaginary population of size 100,000. The file can be found +on the internet (http://pluto.huji.ac.il/~msby/StatThink/Datasets/pop2. +csv). The variables in this file are: +id: A numerical variable. A 7 digits number that serves as a unique identifier +of the subject. +sex: A factor variable. The sex of each subject. The values are either “MALE” +or “FEMALE”. +age: A numerical variable. The age of each subject. +bmi: A numerical variable. The body mass index of each subject. +systolic: A numerical variable. The systolic blood pressure of each subject. +diastolic: A numerical variable. The diastolic blood pressure of each subject. +group: A factor variable. The blood pressure category of each subject. 
The +values are “NORMAL” both the systolic blood pressure is within its normal +range (between 90 and 139) and the diastolic blood pressure is within its +normal range (between 60 and 89). The value is “HIGH” if either measurements of blood pressure are above their normal upper limits and it is +“LOW” if either measurements are below their normal lower limits. +Our goal in this question is to investigate the sampling distribution of the sample +average of the variable “bmi”. We assume a sample of size n = 150. +1. Compute the population average of the variable “bmi”. + +```{r} + +pop2 <- read.csv("C:\\Users\\ds3\\Downloads\\pop2.csv") +head(pop2) + +``` + +2. Compute the population standard deviation of the variable “bmi” +3. Compute the expectation of the sampling distribution for the sample average of the variable. +4. Compute the standard deviation of the sampling distribution for the sample average of the variable. +5. Identify, using simulations, the central region that contains 80% of the +sampling distribution of the sample average. +6. Identify, using the Central Limit Theorem, an approximation of the central region that contains 80% of the sampling distribution of the sample +average. 
+ + + + + ## Confidence intervals ```{r} set.seed(42) From 59accc1eb6f8e1cb61db77e6d7590b0e2615acda Mon Sep 17 00:00:00 2001 From: aisha1021 Date: Wed, 4 Jun 2025 10:40:04 -0400 Subject: [PATCH 09/26] updated files --- week2/statistical_inference.Rmd | 137 -------- week2/statistical_inference.html | 324 +++++++++--------- ...ay_5_exercises.Rmd => w2_d1_exercises.Rmd} | 0 week2/w2_d2_exercises.Rmd | 134 ++++++++ 4 files changed, 299 insertions(+), 296 deletions(-) rename week2/{Day_5_exercises.Rmd => w2_d1_exercises.Rmd} (100%) create mode 100644 week2/w2_d2_exercises.Rmd diff --git a/week2/statistical_inference.Rmd b/week2/statistical_inference.Rmd index 0cd079e02..0f9dbd340 100644 --- a/week2/statistical_inference.Rmd +++ b/week2/statistical_inference.Rmd @@ -56,143 +56,6 @@ ggplot(se_data, aes(x = n, y = se)) + ``` -Value Probability -0 p -1 2p -2 3p -3 4p -4 5p -5 6p -Table 4.4: The Distribution of Y - -Question 4.1. Table 4.6 presents the probabilities of the random variable Y . -These probabilities are a function of the number p, the probability of the value -“0”. Answer the following questions: -1. What is the value of p = 1/21 -2. P(Y < 3) = 6/21 -3. P(Y = odd) = 12/21 -4. P(1 ≤ Y < 4) = 9/21 -5. P(|Y − 3| < 1.5) = 12/21 -6. E(Y ) = 3.333 -7. Var(Y ) = 2.222 -8. What is the standard deviation of Y = sqrt(2.22) - - -Question 4.2. One invests $2 to participate in a game of chance. In this game -a coin is tossed three times. If all tosses end up “Head” then the player wins -$10. Otherwise, the player losses the investment. -1. What is the probability of winning the game? - -(0.5 * 0.5 * 0.5) = 1/8 - -2. What is the probability of loosing the game? - -1 - 1/8 = 7/8 - -3. What is the expected gain for the player that plays this game? (Notice that the expectation can obtain a negative value.) - -E[gain] = 8 * 1/8 + (-2) * (7/8) - - -Question 6.1. Consider the problem of establishing regulations concerning the -maximum number of people who can occupy a lift. 
In particular, we would like -to assess the probability of exceeding maximal weight when 8 people are allowed -to use the lift simultaneously and compare that to the probability of allowing 9 -people into the lift. -Assume that the total weight of 8 people chosen at random follows a normal -distribution with a mean of 560kg and a standard deviation of 57kg. Assume -that the total weight of 9 people chosen at random follows a normal distribution -with a mean of 630kg and a standard deviation of 61kg. -1. What is the probability that the total weight of 8 people exceeds 650kg? - -```{r} - -val <- pnorm(650,560,57) -1 - val - -# 0.05717406 - -``` - -2. What is the probability that the total weight of 9 people exceeds 650kg? - -```{r} - -val <- pnorm(650,630,61) -1 - val - -# 0.3715054 - -``` - -3. What is the central region that contains 80% of distribution of the total -weight of 8 people? - -```{r} - -val1 <- qnorm(0.1, 560, 57) -val1 -val2 <- qnorm(0.9, 560, 57) -val2 - -# [486.9516, 633.0484] - -``` - -4. What is the central region that contains 80% of distribution of the total -weight of 9 people? - -```{r} - -val3 <- qnorm(0.1, 630, 61) -val3 -val4 <- qnorm(0.9, 630, 61) -val4 - -# [551.8254, 708.1746] - -``` - -Question 7.1. The file “pop2.csv” contains information associated to the -blood pressure of an imaginary population of size 100,000. The file can be found -on the internet (http://pluto.huji.ac.il/~msby/StatThink/Datasets/pop2. -csv). The variables in this file are: -id: A numerical variable. A 7 digits number that serves as a unique identifier -of the subject. -sex: A factor variable. The sex of each subject. The values are either “MALE” -or “FEMALE”. -age: A numerical variable. The age of each subject. -bmi: A numerical variable. The body mass index of each subject. -systolic: A numerical variable. The systolic blood pressure of each subject. -diastolic: A numerical variable. The diastolic blood pressure of each subject. -group: A factor variable. 
The blood pressure category of each subject. The -values are “NORMAL” both the systolic blood pressure is within its normal -range (between 90 and 139) and the diastolic blood pressure is within its -normal range (between 60 and 89). The value is “HIGH” if either measurements of blood pressure are above their normal upper limits and it is -“LOW” if either measurements are below their normal lower limits. -Our goal in this question is to investigate the sampling distribution of the sample -average of the variable “bmi”. We assume a sample of size n = 150. -1. Compute the population average of the variable “bmi”. - -```{r} - -pop2 <- read.csv("C:\\Users\\ds3\\Downloads\\pop2.csv") -head(pop2) - -``` - -2. Compute the population standard deviation of the variable “bmi” -3. Compute the expectation of the sampling distribution for the sample average of the variable. -4. Compute the standard deviation of the sampling distribution for the sample average of the variable. -5. Identify, using simulations, the central region that contains 80% of the -sampling distribution of the sample average. -6. Identify, using the Central Limit Theorem, an approximation of the central region that contains 80% of the sampling distribution of the sample -average. - - - - - ## Confidence intervals ```{r} set.seed(42) diff --git a/week2/statistical_inference.html b/week2/statistical_inference.html index db78886b6..a0de9fdf5 100644 --- a/week2/statistical_inference.html +++ b/week2/statistical_inference.html @@ -1,28 +1,38 @@ - + - + - + Statistical Inference - + - + - + + - + + + + + +
-

Hypothesis testing

@@ -489,21 +477,24 @@

Hypothesis testing

ggplot(data.frame(p0_hat), aes(x = p0_hat)) + geom_histogram(binwidth = 0.01) + geom_vline(xintercept=p_hat, linetype=2, color="red") -

+

# compare this to our experiment
 # how likely is it that we would see an estimate this extreme if the coin really were fair?
 # one-sided empirical p-value: the fraction of null-simulated estimates (p0_hat)
 # that are at or below the observed estimate p_hat
 num_as_extreme <- sum(p0_hat <= p_hat)
 p_value <- num_as_extreme / length(p0_hat)
 p_value
## [1] 0
-

Only 0 out of 100000 estimates from a fair coin with p=0.5 would result in an estimate of p_hat=0.29 or smaller, corresponding to a p-value of 0.

+

Only 0 out of 100000 estimates from a fair coin with p=0.5 would +result in an estimate of p_hat=0.29 or smaller, corresponding to a +p-value of 0.

Comparing two proportions

Point estimates and sampling distributions

-

Repeatedly flip two coins, each 500 times and estimate their bias.

+

Repeatedly flip two coins, each 500 times and estimate their +bias.

# Flip a coin of bias p a total of n times and return the observed
# proportion of heads, i.e. a point estimate of p.
estimate_coin_bias <- function(n, p) {
  flips <- rbinom(n, 1, p)
  sum(flips) / n
}
@@ -516,24 +507,25 @@ 

Point estimates and sampling distributions

# wrangle the results into one data frame plot_data <- bind_rows(data.frame(split='A', trial=1:length(pa_hat), p_hat=pa_hat), - data.frame(split='B', trial=1:length(pb_hat), p_hat=pb_hat))
-
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
-
## Warning in bind_rows_(x, .id): binding character and factor vector,
-## coercing into character vector
-
-## Warning in bind_rows_(x, .id): binding character and factor vector,
-## coercing into character vector
-
# plot the sampling distributions for each split
+                       data.frame(split='B', trial=1:length(pb_hat), p_hat=pb_hat))
+
+# plot the sampling distributions for each split
 # overlaid histograms of the two sampling distributions; identity position so
 # the splits overlap rather than stack, alpha for transparency
 ggplot(data=plot_data, aes(x=p_hat, fill=split)) +
   geom_histogram(position="identity", binwidth=0.002, alpha=0.5) +
   scale_alpha(guide="none")  # guide=F (logical) is deprecated since ggplot2 3.3.4; use "none"
-

+
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
+## ggplot2 3.3.4.
+## ℹ Please use "none" instead.
+## This warning is displayed once every 8 hours.
+## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
+## generated.
+

# plot the sampling distribution of the difference
 # solid line: true difference in biases (pa - pb);
 # red dashed line: mean of the simulated estimates pa_hat - pb_hat
 ggplot(data.frame(pa_hat, pb_hat), aes(x = pa_hat - pb_hat)) +
   geom_histogram(binwidth = 0.002) +
   geom_vline(xintercept=pa-pb) +
   geom_vline(xintercept=mean(pa_hat-pb_hat), linetype=2, color="red")  
-

+

# note that variances add for independent random variables
 # Var(X - Y) = Var(X) + Var(Y) when X and Y are independent (the covariance term is zero),
 # so these two quantities should be approximately equal
 variance_of_difference <- var(pa_hat - pb_hat)
 sum_of_variances <- var(pa_hat) + var(pb_hat)
@@ -549,7 +541,7 @@

Confidence intervals

geom_pointrange(aes(ymin=LCL, ymax=UCL)) + xlab('') + theme(legend.title=element_blank()) -

+

Hypothesis testing

@@ -569,14 +561,16 @@

Hypothesis testing

ggplot(data.frame(dp0_hat), aes(x = dp0_hat)) + geom_histogram(binwidth = 0.01) + geom_vline(xintercept=dp_hat, linetype=2, color="red") -

+

# compare this to our experiment
 # how likely is it that we would see an estimate this extreme if both coins were identical?
 # one-sided empirical p-value: the fraction of null-simulated differences (dp0_hat)
 # that are at or above the observed difference dp_hat
 num_as_extreme <- sum(dp0_hat >= dp_hat)
 p_value <- num_as_extreme / length(dp0_hat)
 p_value
## [1] 0.00048
-

Only 48 out of 100000 estimates from two identical coins with p=0.08 would result in an estimate of dp_hat=0.058 or larger, corresponding to a p-value of 4.8 × 10^{-4}.

+

Only 48 out of 100000 estimates from two identical coins with p=0.08 would result in an estimate of dp_hat=0.058 or larger, corresponding to a p-value of 4.8 × 10^{-4}.

@@ -621,26 +615,21 @@

Computing power by direct simulation

}
## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
 ## Chi-squared approximation may be incorrect
-
 ## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
 ## Chi-squared approximation may be incorrect
-
 ## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
 ## Chi-squared approximation may be incorrect
-
 ## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
 ## Chi-squared approximation may be incorrect
-
 ## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
 ## Chi-squared approximation may be incorrect
-
 ## Warning in prop.test(x = c(na, nb), n = c(n, n), alternative = "greater", :
 ## Chi-squared approximation may be incorrect
# power curve: estimated power (pow) as a function of sample size N,
# with individual simulation points and a loess smooth through them
ggplot(data.frame(N, pow), aes(x = N, y = pow)) +
   geom_smooth(color = "gray") +
   geom_point()
-
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
-

+
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
+

@@ -653,7 +642,7 @@

Computing power by direct simulation

// add bootstrap table styles to pandoc tables function bootstrapStylePandocTables() { - $('tr.header').parent('thead').parent('table').addClass('table table-condensed'); + $('tr.odd').parent('tbody').parent('table').addClass('table table-condensed'); } $(document).ready(function () { bootstrapStylePandocTables(); @@ -662,6 +651,23 @@

Computing power by direct simulation

+ + + + + + +