From fea8d643e43a0ea92cec265c9771010b2412710f Mon Sep 17 00:00:00 2001 From: Vanessa Date: Wed, 28 May 2025 14:56:59 -0400 Subject: [PATCH 1/7] Citibike Assignment --- week1/citibike.sh | 195 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 194 insertions(+), 1 deletion(-) diff --git a/week1/citibike.sh b/week1/citibike.sh index 25604f545..cb0c5d426 100755 --- a/week1/citibike.sh +++ b/week1/citibike.sh @@ -5,19 +5,212 @@ # count the number of unique stations +# cut -d, -f5 201402-citibike-tripdata.csv | sort | uniq | wc -l +# 330 + # count the number of unique bikes +# cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq | wc -l +# 5700 + # count the number of trips per day +# cut -d, -f2 201402-citibike-tripdata.csv | cut -d ' ' -f1 | sort | uniq -c +# 12771 2014-02-01 +# 13816 2014-02-02 +# 2600 2014-02-03 +# 8709 2014-02-04 +# 2746 2014-02-05 +# 7196 2014-02-06 +# 8495 2014-02-07 +# 5986 2014-02-08 +# 4996 2014-02-09 +# 6846 2014-02-10 +# 8343 2014-02-11 +# 8580 2014-02-12 +# 876 2014-02-13 +# 3609 2014-02-14 +# 2261 2014-02-15 +# 3003 2014-02-16 +# 4854 2014-02-17 +# 5140 2014-02-18 +# 8506 2014-02-19 +# 11792 2014-02-20 +# 8680 2014-02-21 +# 13044 2014-02-22 +# 13324 2014-02-23 +# 12922 2014-02-24 +# 12830 2014-02-25 +# 11188 2014-02-26 +# 12036 2014-02-27 +# 9587 2014-02-28 +# 1 starttime # find the day with the most rides +# cut -d, -f2 201402-citibike-tripdata.csv | cut -d ' ' -f1 | sort | uniq -c | sort -nr | head -n1 +# 13816 2014-02-02 + # find the day with the fewest rides +# cut -d, -f2 201402-citibike-tripdata.csv | cut -d ' ' -f1 | sort | uniq -c | sort | head -n2 +# 1 starttime +# 876 2014-02-13 + # find the id of the bike with the most rides +# cut -d, -f12 201402-citibike-tripdata.csv | sort | uniq -c | sort -nr | head -n1 +# 130 20837 # count the number of rides by gender and birth year +# cut -d, -f14,15 201402-citibike-tripdata.csv | sort | uniq -c +# 6717 \N,0 +# 9 1899,1 +# 68 1900,1 +# 11 1901,1 +# 5 1907,1 +# 4 1910,1 +# 1 
1913,1 +# 3 1917,1 +# 1 1921,1 +# 32 1922,1 +# 5 1926,2 +# 2 1927,1 +# 1 1932,1 +# 7 1932,2 +# 10 1933,1 +# 21 1934,1 +# 14 1935,1 +# 31 1936,1 +# 24 1937,1 +# 70 1938,1 +# 5 1938,2 +# 24 1939,1 +# 19 1939,2 +# 83 1940,1 +# 1 1940,2 +# 148 1941,1 +# 16 1941,2 +# 173 1942,1 +# 9 1942,2 +# 108 1943,1 +# 22 1943,2 +# 277 1944,1 +# 34 1944,2 +# 171 1945,1 +# 43 1945,2 +# 424 1946,1 +# 30 1946,2 +# 391 1947,1 +# 60 1947,2 +# 664 1948,1 +# 143 1948,2 +# 624 1949,1 +# 101 1949,2 +# 738 1950,1 +# 152 1950,2 +# 6 1951,0 +# 1006 1951,1 +# 146 1951,2 +# 1040 1952,1 +# 143 1952,2 +# 1474 1953,1 +# 301 1953,2 +# 1636 1954,1 +# 306 1954,2 +# 1568 1955,1 +# 349 1955,2 +# 1777 1956,1 +# 542 1956,2 +# 1676 1957,1 +# 562 1957,2 +# 2333 1958,1 +# 643 1958,2 +# 2281 1959,1 +# 539 1959,2 +# 2679 1960,1 +# 776 1960,2 +# 2315 1961,1 +# 432 1961,2 +# 2808 1962,1 +# 833 1962,2 +# 3514 1963,1 +# 715 1963,2 +# 3679 1964,1 +# 570 1964,2 +# 2957 1965,1 +# 687 1965,2 +# 3440 1966,1 +# 565 1966,2 +# 4016 1967,1 +# 634 1967,2 +# 3931 1968,1 +# 545 1968,2 +# 4557 1969,1 +# 898 1969,2 +# 4657 1970,1 +# 1079 1970,2 +# 4132 1971,1 +# 791 1971,2 +# 4066 1972,1 +# 962 1972,2 +# 4097 1973,1 +# 877 1973,2 +# 4957 1974,1 +# 891 1974,2 +# 4185 1975,1 +# 699 1975,2 +# 4557 1976,1 +# 1022 1976,2 +# 4817 1977,1 +# 1140 1977,2 +# 5645 1978,1 +# 1231 1978,2 +# 6433 1979,1 +# 1338 1979,2 +# 6173 1980,1 +# 1488 1980,2 +# 6620 1981,1 +# 1588 1981,2 +# 6244 1982,1 +# 1724 1982,2 +# 6890 1983,1 +# 1889 1983,2 +# 7348 1984,1 +# 1791 1984,2 +# 7043 1985,1 +# 2262 1985,2 +# 6147 1986,1 +# 1962 1986,2 +# 5776 1987,1 +# 1696 1987,2 +# 6449 1988,1 +# 1599 1988,2 +# 5408 1989,1 +# 1435 1989,2 +# 4541 1990,1 +# 1156 1990,2 +# 8 1991,0 +# 2377 1991,1 +# 689 1991,2 +# 1758 1992,1 +# 410 1992,2 +# 1398 1993,1 +# 289 1993,2 +# 927 1994,1 +# 288 1994,2 +# 664 1995,1 +# 163 1995,2 +# 234 1996,1 +# 100 1996,2 +# 164 1997,1 +# 87 1997,2 +# 1 birth year,gender -# count the number of trips that start on cross streets that both 
contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...) +# count the number of trips that start on cross streets that both contain numbers (e.g., "1 Ave & E 15 St", "E 39 St & 2 Ave", ...) +# cut -d, -f5 201402-citibike-tripdata.csv | grep '[0-9].*&.*[0-9]' | wc -l +# 90549 # compute the average trip duration + +# awk '{sumvalues += $1} END {print sumvalues/NR}' 201402-citibike-tripdata.csv +# 874.516 From 7a70869dba1f29d69543d820c8e1c727fa2ea1ba Mon Sep 17 00:00:00 2001 From: Vanessa Date: Fri, 30 May 2025 10:10:24 -0400 Subject: [PATCH 2/7] Assignment 3 --- week1/citibike.R | 156 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 155 insertions(+), 1 deletion(-) diff --git a/week1/citibike.R b/week1/citibike.R index ad01de1d3..e414c9fb1 100644 --- a/week1/citibike.R +++ b/week1/citibike.R @@ -6,7 +6,7 @@ library(lubridate) ######################################## # read one month of data -trips <- read_csv('201402-citibike-tripdata.csv') +trips <- read_csv('./week1/201402-citibike-tripdata.csv') # replace spaces in column names with underscores names(trips) <- gsub(' ', '_', names(trips)) @@ -23,26 +23,180 @@ trips <- mutate(trips, gender = factor(gender, levels=c(0,1,2), labels = c("Unkn ######################################## # count the number of trips (= rows in the data frame) +summarize(trips, count=n()) +# A tibble: 1 × 1 + count + +224736 # find the earliest and latest birth years (see help for max and min to deal with NAs) +> min(trips$birth_year, na.rm=TRUE) +[1] 1899 +> max(trips$birth_year, na.rm=TRUE) +[1] 1997 # use filter and grepl to find all trips that either start or end on broadway +filter(trips, grepl('Broadway', start_station_name) | grepl('Broadway', end_station_name)) +# A tibble: 41,469 × 17 + tripduration starttime stoptime start_station_id + + 1 372 2014-02-01 00:00:03 2014-02-01 00:06:15 285 + 2 583 2014-02-01 00:00:32 2014-02-01 00:10:15 357 + 3 439 2014-02-01 00:02:14 2014-02-01 00:09:33 285 + 4 707 2014-02-01 
00:02:50 2014-02-01 00:14:37 257 + 5 695 2014-02-01 00:06:53 2014-02-01 00:18:28 490 + 6 892 2014-02-01 00:07:22 2014-02-01 00:22:14 499 + 7 636 2014-02-01 00:08:25 2014-02-01 00:19:01 285 + 8 878 2014-02-01 00:09:03 2014-02-01 00:23:41 497 + 9 1064 2014-02-01 00:12:27 2014-02-01 00:30:11 444 +10 469 2014-02-01 00:12:40 2014-02-01 00:20:29 497 +# ℹ 41,459 more rows +# ℹ 13 more variables: start_station_name , start_station_latitude , +# start_station_longitude , end_station_id , +# end_station_name , end_station_latitude , +# end_station_longitude , bikeid , usertype , +# birth_year , gender , trip_start_date , +# trip_end_date +# ℹ Use `print(n = ...)` to see more rows # do the same, but find all trips that both start and end on broadway +filter(trips, grepl('Broadway', start_station_name) & grepl('Broadway', end_station_name)) +# A tibble: 2,776 × 17 + tripduration starttime stoptime start_station_id + + 1 884 2014-02-01 00:41:29 2014-02-01 00:56:13 500 + 2 282 2014-02-01 01:15:57 2014-02-01 01:20:39 499 + 3 601 2014-02-01 02:08:31 2014-02-01 02:18:32 486 + 4 1467 2014-02-01 03:17:49 2014-02-01 03:42:16 304 + 5 175 2014-02-01 03:18:36 2014-02-01 03:21:31 497 + 6 108 2014-02-01 04:36:45 2014-02-01 04:38:33 500 + 7 171 2014-02-01 06:39:54 2014-02-01 06:42:45 468 + 8 849 2014-02-01 06:44:46 2014-02-01 06:58:55 335 + 9 159 2014-02-01 09:01:23 2014-02-01 09:04:02 285 +10 292 2014-02-01 09:04:52 2014-02-01 09:09:44 499 +# ℹ 2,766 more rows +# ℹ 13 more variables: start_station_name , start_station_latitude , +# start_station_longitude , end_station_id , +# end_station_name , end_station_latitude , +# end_station_longitude , bikeid , usertype , +# birth_year , gender , trip_start_date , +# trip_end_date +# ℹ Use `print(n = ...)` to see more rows + # find all unique station names +trips_name <- group_by(trips, start_station_name) +summarize(trips_name) +# A tibble: 329 × 1 + start_station_name + + 1 1 Ave & E 15 St + 2 1 Ave & E 18 St + 3 1 Ave & E 30 St + 4 1 Ave & E 44 
St + 5 10 Ave & W 28 St + 6 11 Ave & W 27 St + 7 11 Ave & W 41 St + 8 12 Ave & W 40 St + 9 2 Ave & E 31 St +10 2 Ave & E 58 St +ℹ 319 more rows +ℹ Use `print(n = ...)` to see more rows # count the number of trips by gender, the average trip time by gender, and the standard deviation in trip time by gender # do this all at once, by using summarize() with multiple arguments +trips %>% group_by(start_station_name, end_station_name, gender) %>% summarize (count = n()) %>% group_by(gender) %>% arrange(desc(count)) %>% slice(1:3) +# A tibble: 3 × 4 + gender count mean_trips_gender sd_gender_trips + +1 Unknown 6731 1741. 5566. +2 Male 176526 814. 5021. +3 Female 41479 991. 7115. + + # find the 10 most frequent station-to-station trips +trips_most_frequent <- group_by (trips, start_station_name, end_station_name) +trips_frequent <- summarize(trips_most_frequent, count=n()) +arrange(trips_frequent, desc(count)) +A tibble: 43,000 × 3 +Groups: start_station_name [329] + start_station_name end_station_name count + + 1 E 43 St & Vanderbilt Ave W 41 St & 8 Ave 156 + 2 Pershing Square N W 33 St & 7 Ave 124 + 3 Norfolk St & Broome St Henry St & Grand St 122 + 4 E 7 St & Avenue A Lafayette St & E 8 St 121 + 5 Henry St & Grand St Norfolk St & Broome St 118 + 6 W 17 St & 8 Ave 8 Ave & W 31 St 118 + 7 Central Park S & 6 Ave Central Park S & 6 Ave 115 + 8 Lafayette St & E 8 St E 6 St & Avenue B 115 + 9 E 10 St & Avenue A Lafayette St & E 8 St 108 +10 Canal St & Rutgers St Henry St & Grand St 103 +# ℹ 42,990 more rows +# ℹ Use `print(n = ...)` to see more rows # find the top 3 end stations for trips starting from each start station +> x <- trips_rank %>% arrange(start_station_name, desc(count)) %>% mutate(rank = row_number()) +filter(x, rank <= 3) +# Groups: start_station_name [329] + start_station_name end_station_name count rank + + 1 1 Ave & E 15 St E 20 St & FDR Drive 57 1 + 2 1 Ave & E 15 St E 17 St & Broadway 52 2 + 3 1 Ave & E 15 St 1 Ave & E 30 St 49 3 + 4 1 Ave & E 18 St E 15 
St & 3 Ave 48 1 + 5 1 Ave & E 18 St E 17 St & Broadway 44 2 + 6 1 Ave & E 18 St W 21 St & 6 Ave 43 3 + 7 1 Ave & E 30 St W 33 St & 7 Ave 71 1 + 8 1 Ave & E 30 St Pershing Square N 55 2 + 9 1 Ave & E 30 St W 31 St & 7 Ave 46 3 +10 1 Ave & E 44 St W 33 St & 7 Ave 39 1 +# ℹ 977 more rows +# ℹ Use `print(n = ...)` to see more rows # find the top 3 most common station-to-station trips by gender +trips %>% group_by(start_station_name, end_station_name, gender) %>% summar$ +`summarise()` has grouped output by 'start_station_name', 'end_station_name'. You can override using the `.groups` +argument. +# A tibble: 9 × 4 +# Groups: gender [3] + start_station_name end_station_name gender count + +1 Central Park S & 6 Ave Central Park S & 6 Ave Unkno… 61 +2 Grand Army Plaza & Central Park S Grand Army Plaza & Central Par… Unkno… 53 +3 Broadway & W 58 St Broadway & W 58 St Unkno… 31 +4 E 43 St & Vanderbilt Ave W 41 St & 8 Ave Male 153 +5 Pershing Square N W 33 St & 7 Ave Male 121 +6 W 17 St & 8 Ave 8 Ave & W 31 St Male 108 +7 E 7 St & Avenue A Lafayette St & E 8 St Female 40 +8 Lafayette St & E 8 St E 7 St & Avenue A Female 36 +9 Norfolk St & Broome St Henry St & Grand St Female 36 # find the day with the most trips # tip: first add a column for year/month/day without time of day (use as.Date or floor_date from the lubridate package) +trips %>% mutate (date = as.Date (starttime)) %>% count (date, name = "count") %>% arrange(desc(count)) %>% slice(1) +# A tibble: 1 × 2 + date count + +1 2014-02-02 13816 # compute the average number of trips taken during each of the 24 hours of the day across the entire month # what time(s) of day tend to be peak hour(s)? + +trips %>% mutate ( date = as_date(starttime), hours = hour(starttime)) %>% count (date, hours) %>% group_by(hours) %>% summarise (avg_trips = mean(n)) %>% arrange(desc(avg_trips)) +# A tibble: 24 × 2 + hours avg_trips + + 1 17 800. + 2 18 716. + 3 16 611. + 4 8 591. + 5 15 531. + 6 14 514. + 7 9 510. + 8 19 502. + 9 13 488. 
+10 12 444. +# ℹ 14 more rows +# ℹ Use `print(n = ...)` to see more rows \ No newline at end of file From 5968205dec9ffd6bfa47375a9c6a557dea603076 Mon Sep 17 00:00:00 2001 From: Vanessa Date: Mon, 2 Jun 2025 09:58:34 -0400 Subject: [PATCH 3/7] Assingment 4 --- week1/plot_trips.R | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/week1/plot_trips.R b/week1/plot_trips.R index 4f25437ba..b6a537c1c 100644 --- a/week1/plot_trips.R +++ b/week1/plot_trips.R @@ -19,12 +19,22 @@ load('trips.RData') # plot the distribution of trip times across all rides (compare a histogram vs. a density plot) + ggplot(trips, aes(x = tripduration)) + geom_histogram(bins = 50) + scale_x_log10(labels = comma) + ggplot(trips, aes(x = tripduration)) + geom_density(fill = 'blue') + scale_x_log10(labels = comma) + + # plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. a density plot) + ggplot(trips, aes(x = tripduration, fill = gender )) + geom_histogram(bins = 10) + scale_x_log10(labels = comma) + labs(fill = "User Type") + ggplot(trips, aes(x = tripduration, fill = gender )) + geom_density() + scale_x_log10(labels = comma) + labs(fill = "User Type") # plot the total number of trips on each day in the dataset +trips %>% mutate(date = as.Date(starttime)) %>% ggplot(aes(x = date)) + geom_histogram() + # plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color) +trips %>% group_by(birth_year, gender) %>% ggplot(aes(x = birth_year, fill = gender)) + geom_histogram() + # plot the ratio of male to female trips (on the y axis) by age (on the x axis) # hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio # (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet) @@ -48,15 +58,26 @@ trips_with_weather <- inner_join(trips, weather, by="ymd") # plot the number of trips as a function of the minimum 
temperature, where each point represents a day # you'll need to summarize the trips and join to the weather data to do this +trips_with_weather %>% group_by(date, tmin) %>% summarise (count = n()) %>% ggplot(aes(x = count, y = tmin)) + geom_point() # repeat this, splitting results by whether there was substantial precipitation or not # you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this + trips_with_weather %>% group_by(date, tmin) %>% summarise (count = n(), avg_prcp = mean(prcp)) %>% ggplot(aes(x = count, y = tmin)) + geom_point() + facet_wrap(~ avg_prcp) + + # add a smoothed fit on top of the previous plot, using geom_smooth + trips_with_weather %>% group_by(date, tmin) %>% summarise (count = n(), avg_prcp = mean(prcp)) %>% ggplot(aes(x = count, y = tmin)) + geom_point() + facet_wrap(~ avg_prcp) + geom_smooth(method = 'lm') + + # compute the average number of trips and standard deviation in number of trips by hour of the day # hint: use the hour() function from the lubridate package - # plot the above +trips %>% mutate ( date = as_date(starttime), hours = hour(starttime)) %>% count (date, hours) %>% group_by(hours) %>% summarise (avg_trips = mean(n), Stand_Dev_Trips = sd(n)) %>% ggplot (aes(x= hours , y= avg_trips )) + geom_line(aes(color = avg_trips)) + geom_ribbon(aes(ymin = avg_trips - Stand_Dev_Trips, ymax = avg_trips + Stand_Dev_Trips),alpha = 0.25 ) + + # repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. 
weekend days # hint: use the wday() function from the lubridate package + +trips %>% mutate ( date = wday(starttime, label = TRUE), hours = hour(starttime)) %>% count (date, hours) %>% group_by(date) %>% summarise (avg_trips = mean(n), Stand_Dev_Trips = sd(n)) %>% ungroup() %>% ggplot (aes(x= date , y= avg_trips )) + geom_line(aes(group = 1)) + geom_ribbon(aes(ymin = avg_trips - Stand_Dev_Trips, ymax = avg_trips + Stand_Dev_Trips, group = 1),alpha = 0.25 ) From 42b97924ca316c0c954c7f9380afe513a75d13ef Mon Sep 17 00:00:00 2001 From: Vanessa Date: Mon, 16 Jun 2025 16:40:24 -0400 Subject: [PATCH 4/7] Prediction Model --- week4/model.Rdata | Bin 0 -> 10081 bytes week4/predict_citibike.Rmd | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 week4/model.Rdata create mode 100644 week4/predict_citibike.Rmd diff --git a/week4/model.Rdata b/week4/model.Rdata new file mode 100644 index 0000000000000000000000000000000000000000..8a21ad4d84fccab6ab13e6e5a2d572dbd63ceeaf GIT binary patch literal 10081 zcmV-nC!W|JiwFP!000002JL(YR8`5c_92KMs02kN379ZT20^=3%mKwbCXlR1R)Pr$ zB7zA)a#C_W1VKReMnIIH zdNW!YJLE_07{y>PI2fEG`v2Si!(fcqx@E;&6$XQoy}!f8;AHUh%L#|f^z=>kS?U;@ z?z1vAv)N~(uYXYARIh)LQ2#I=gTX##U@aNZ+u8oWLklu<6(SydVk2`OY1>3~=8)F} z3icSj6eZKD);*oT-960w2veOy`d0l7{(%mySlU8wBAJVZ!ZF5Uq zbJQO2iEfWFwqkk5_)dEk1T**7-hUwZioCdDN@Iyb3h8>-NSzFP3=>|*yOgbcOgg-} zAj`Es8pdjL7pd-sr0$XTPuOP|kUJWWoac5FBV%Vj7t}3UMLLHB?i#tuA10;PcirDq z1CMza+Ohk3K@0b@+xw>7fw4^SF(S2knq963eW{i-^Ii zQI#Ptb*GW;Epub~PA8KQw^9~^;8SF}_wHIoxiHLpGa~$*=uF6*DX)|9@e=7Ac+eHr zq>%BCrcLq5;)9Vv=H`pHKr&{m=G8aeW#rX4)mqa0Zt%Fzj_JWi4#Vg~bBQLV4~!l^ zvTt8cCYfb_ael4NXP9;A)~o!xQ83h{b0MiF0o|khJ5;^cNv5QF#XEj(n9(~Ccuv^0n@x2UnIGWtq_S@WX+BLtijjJQbXj!u-N@`5=r_e}=P}0#@LcZk4H@T} zp+{zQs6$FMX?J1PzHFZdq*s@E((9l*Fu-O~3yhpbUVb)BbMjgpGU=^b+=jT9WGGXe zM|4pGjF4)Z9SUB;1h0LkwWmly(4I#;&)-Ol&x13(D zF7}*EN?Y?_>^zoXrV1hLC+4|{^5O)c9m4l-={i6>gy*0nR z9_0-q!)Kik@ezbqCzXgfXr#i(sBRUHl1tDol2_oJng@)rm|}i^wmA&xn=y;G^C-L` 
zY#-d}@dp_$eIt?W>MSziONabQ4|5o*AqxCX+K_3T<|_;CIFKGIb`>h$P9vFuE&HF{ z=Y^UN>>_=g?!xTS)g6Wl)nKU7MXopN`N-Sg(_ytI6)^qUqH*0e%A{Al#grMkM@g58 zg;k705-??tz_vMTTx7~rAI(kn_h8KZNqK#@wvd@ZEB|m9eGNvo=X$+cIhnk&Yk+@k zGJI>bdAXH4yfJcAoc%X9@`^$Z$4T8?@UZyU5A2o~U_zonp4qc0WZM0Vt{g5BCJIfI z*Gue%4xw{)XEb=isP(&aBzE#b4{zqBx%ak_mp(7t3s$(p$fxQN^WSBVmv*i^1LwRU zGnZOCl`L5c(}7PpiP3=7{O96Z?+X32(h z)s3G;`l&6eogionLk$wHL`FEE{Ko|AaL-jLr$mH9VjoZtP zz%YTNpy1^-(3x4WjbXQ$jFpXgq0{O~dO0!kp057@JvWHu?bbd;`aVo?(^dRVoviGA z^kEuApNedXD} z5o0J~3?+=AgfWybh7!h5!Wc>zLkVLjVGJdVp@cD%F@`e6P{tU_7(*FjC}Rv|jG>G% zlre@f#!$f+Di}irW2j&Z6^x;RF;p;y3dT^u7%CV;6=SGk3{{MwiZN6%hAPHT#Tcp> zLltAFA`A}M1q)bBSyoey)s$y76m&UOLVtrd;-G{cI}9nSVLOstNOfpoJnr-U3()}6_N=vj>fH%-bQ-d9OpQ0&lVWJ ztoECLQW0rh1Yd|*mBByUD_`0c^7sGxG@W*;G#`e&C7C6&9Z1)RQ**a@@{!p=vq0=B zlFa7yjgE-RicdxJ$^4GZ$ps*de?gy>VS}sx(YCO-D zXn35+?@1q-3izMTHD>rv^i;-akHeUWo6{N+S7dfTNH^PWL`<_Y} zI>U&&mLo@NhLhPTib9V>TH%d_<=&DV4$#MYRp;`ub@1>*b)N1I+9dOOMVM%FB}~6D zh)cT83M~HoVGlHkgX711V~_>#Do^w-9rBpRtL_m5A9<$PB1-101HA0*RlKRy2wn{C zH4_ki2W`jQN#hWzBaiJZuE?{qB}0;q&1+4xf)}OC3Njz@ks&3{-r+tfBr|HZHrKXU zWTfDdjFZ_<`#(NL?@m0x1!L}B5;ESKMXoeay?$|XD4A-s@>7#aI7~T2`E@;Rlk?}p z`J|8k{`5u7z9e({8$~%CaTr;AnREW+Xp$KrD=|7Hi%fpB`bk)~FPv&uGz4gVo!wK|-?F zvU%fO=wmJ;=O1s4RLp0RI<&Lz#!+I^goh8vQ>Qv^N^cmM-(CC6Xi>*b_ zi(H-T%)6hA=8@PW$a#W{snkCvBB%%b)H`$U1}z}3i;}G;O6^IXS$jsyoUta;)Ww%I zUwcKGtEOMuxYGey^WUVHLfA^TF=F}zdr_3JcX9KzYiq?w!RL0uzMBh zzQV;Z`I!Wa?_O24ZTBcLOYTCVlF&vN^A31+WlSeCbIX#qi9Ld0Ns06N;+x1Y$)#&5 z)&`Q%x(^qdnUuhcD7lrk<5!TO-fO=n@oyxLWbONG>*xUERkbfpa-BzJa+*BY&s@;| z&xXX?TIyHeDG8O%hPChDwJXh~`|U#EA%(B7ObwF3qa4yEjb9HV1dKK>INL#bSuOm& z#4L}@(sy6c>!1ypPakYCs)&Y(zG-Y5-zJl{bp&dA-mHR#2T#l`oTv>wZ3_H6H?Hfi zC*{aRsk2DuuQDQ;`BO<|^kAOITU)%>scA}+aWhXVHpY9x)Fsu^v&J8Tp>vWJ*6sfe zGiMCui44u~6*Ao00=;a;b*e|%!z0qwZ>ey&Uw{an)pn zYkI9PP%-el(PJ5C%v~woTx7LcU;ajI%0LgHTqjndC;hpC- zP9;3|VXMwo<*VP8!sabvc4y?r!>Um2JDF9s@NUwsLND%MnCWz=K}DJyR*yQ)zRqqn ztcdSvuUohr)^9Zr<5DbuZM=oDqrX0Y#XF}YK&}c{t{S*^0UJB4Vr#8VRNV{jzTWam 
z<=9(TTH<|x|D+~t(~nlnJvbS*oONn&|H1*wp4x=)tJ%Ub>zWM`Y6Y;#yuLPEJ`USy z*dr(E8(j`Vcg)cqAJ_=%HZu=t_9(%2?YJqAx}U?wb3Tfs!V1`w_x(ymcsf%f%jh&ZDFOG@uxi%Sup49ffxa94fyawdx-w&>9Dqs z*)!F+6jq2mWQ=1N!!q^9yr1ii!?OH~%g2<~z~-nWDp6koVN?52nJeW{U!=4EpC5!(dP(DCr7{M9uW+SQX1wnj^@Ccg(D9bDL;g%F=g9kj?sV( zy>?pPz8b?kH9M`|rdzN&=ws3Mdrg?vw(F|1Y`^Y;RSB|+#?{B+!@{UjQw*~)fAvbA z#>#&Sg!y1huq1CVERS21B+W?2ezgRBkH6qL2DV96Oj3Af-5(dv&X>HQus+&bd)l{+ z@Gg(w>Mh65z_P+`qZhMngq4r>PLVVa@kad%IpS zNCY{)*deY0Z+|653p$Cw2KPF4p0C$osBZPCCq{4J!$WtDnROk2_mfUBn@XBs;j{-) z<|n4Y3bkFAo-Xu+C1V0aZk>{ag(s8ar@T2#=W{u%YupoQ^!_}o?d|d9D-(d_QcVKy zEka>k?S~fCTf(s2?`X;H?p#=vv18es)PCNS4>?E5TEIHXxqZ?~d*LHqaq$h^*07?+ zJS+FA9IV>0T~O|b3a-<%!Z$IM-@n2W>l($r?QV@S$O;PL~+wzdkA} zQDmM4ylZYByVOw~=NU<=?v!E6!H(XyY{{@`VPN;=15%KD)4Nc?aXV~?-Ez4};UR1h z*0MTsDIK=G<*9i8RRKP@XI_)WQGx4(+J{20No&vY@@^+=zoL1?T1lR1xGwIhMQOee z@4$T4bIk2I+T@7yRQBc_bNOUFoX?h?X|s=aU58~H3cYjG1#zC5c79r9VIc!sI%ln) zy(b*kHQJ9k{I)uGCtrnaLJt(b$oRkl=}nN6mkTyc^%O9%J_Z{sS984>Zwlj#P8PS$ z{SF)00#ZtY^Kc#;M3k5lzH#7lra>ij=}W~qu;T0Vya@el%uB_&h~TgCi(ym3#GXq^ zDX@J;!_fm@$I|E4a(dq@V*le_)wO+`-VG}yg5`@;zo31|`2Bq=-TM7~>+kPde}CWl z`}zt%D!22j9?OP#?7iUZCJ$415r$Uc#V1dUFVnGskYXdd&YAq?vB_ zFU!~eEq}}3^0)lICP}S_Z{N5yTSMjNOqI=DYXQLpau^3}iL|}V0`Y5Yh|H)p(q$vo z5fL^IwkcPnfSh(7dDC+jK*8HQoAlzZAn0Mpk+;k!V4kJ{_5XN4Byz{to?Nk;$nc)m z;woGRZp396T)sUYFh!a*)@?XLoZ4Zs;#z_SNanuH;Lsk2_1Vn_pMJ>E`vD}efhx6AQuH#s37`Q|!Ijkh6~GVU7X4wEpy$PS%P zoG+AblutS?Tqh_lnlFM7>d0{k$0DFC^!owPXzUT1x^OS;x?li zi4sG(>ab(SK`bcf?6S82$r80LJ%wH%s6(@CO^6%_UOY=~!QOmu$xtb05=j&Y>KXpj)UcE;I{-$AJJam{0Me85GI@n-F|9z=SLm1}U@ z9TXoCz97IR!1M(;{Qg2jWNHHlPdapFWYr5|o2+KAoY^(PgH8E?euy}@EN-`DlhAb> zZ%l>a#wWZ0<08J!oN0c#Wr+f~W{~Y&=DGz$i+;!q*kA*~mo&9~*3SSw`jg9f4n~7` zT|dRmkq3Y;Tp~Vi%_tCH@@m;qCnutSJ0jM0|1^wq@}pFJp>!zW^f>QnR6-7+%N7+G zE$xN*Lp&pUG~dWC;_9;(E_za13Ac_#WiNF^hzqxTbXw}vfPZ-eTi_E#%yYz-$@W0{ z1i0Eg>+P+HQ$U=^vt?pCDsen$JZg_?^0monAmo!tk-expNOU!^N~+jT)Mo8HB)KgZ z_=)Z}-YjDd0`BMcoWFezT+n&G@~FZFqJe)ibj#7ec0wktJ+XDs3=lQPzp|IH0EAgp 
zD~Q;);(T5#IQ2!Xwv_NW#^apWtO@)cj&FPw=|Y5i$6I_2-Ulp~jGkbpt_Qpv+s>x* zzXD#5vo+W>yKtTi%0`%kdqxrE0WAe*eclq$vw|xfx~}6ojXCwlo4&OP!1CZmlcU$a zfTS`B{b!TA@c6m7h$LB?Gl+k1Cbr323Qi|73PgmCZrFu+L6WXJBImVKirmyoz~a$X z=F02^!|f4%a_hvt_zQz$rvqc;V8)MDWb?}CfCpB!>{`YJUKoKL#`L0GbF@9Cm*I8Uecc28M(-i=6C z`a^GbLp`o*v>%i9pChW03y2$$rh?ppULdZM`@#}sWpGhVZFEoOS0evBZ%j?b4}Gdb9+sCoZ4V&spZap73goklJ7^1}^H1I$bxZ9ow;;aPP!4|FuNIb>(;K z{HGIca+^+1`hFPr^W0f8)g}PAFPdQIeDO5ctSH#YcD|5^oYisA=~EApXlZ<}!1y6? z)v4NfDAy{PM})i#T^*4XN?i3l z^ZB_<6A_mBw(gW-H84v&S)8=R9LF22Ug9)POA+HDd*}MDEc4$#jR?5YF5<#iKt$ND z&;0J-MFhKPKlU%LBn+JlzmMRXMZ{~LO;O~FBrH}iFVCB)MTCEp-K6F6i174#VIa9# z3gH0nr`)0(H>VRmdOqE{s-D2*d@9?O9tF%F;u+ba`9^+$6V{G7uIwX$r{{bhqh5u6 zK5mG;jY=dOURKYjb92W$UmZDLU3Z}y;je!+-F|`&5xXJIKRn44$3u^Us_~*4pE*Ah z7jvF|n0q6hNUDsPSEj`zVu>$q&b#IjM~ahX>?{cc^=w<~_dZ$;96tw$E)$I)!cGb) zJhB{zan5IGiYyOHCL-4e_nLe#CBoA9fbT6kluyDXq=Wy2+!kQuetgrr6*fee_KT(W zml_hzT5laXC)r*Io+-Mi_E;SFsf`9uDc1=~O2ahJ?VPj+-H$NW3^c;|IG z*Z1e$W_@YqD#Q!shpszt^0uyX@y>6=f@tIR52A*DvS*`tqg>Ot0Qjt2=`bdulelJN z8fH4>4^$_3A5QNQsWyC{!K$apna|~KzsB_!yxKwh2qMAd_5LA?_)Uqp zq?dh3zKeFixCfK-y{D)+X<6LbM~Ia=K;+N zId)a&w6XEGK%0Ps`cBU>qWp+ymg~-?#ET6zEzcDvlfrG$M>S<;6HJ3KugApPBO*6~ z5lOuS5vHOj`AQ*&NVpU=?!b5{kkfnxKE7`RvZTMqzL<2GNE2L8xnOT35oMUzxbk5< z5xcQ|T+`z!kh7^}+r(x|Ko|$euatX9#PTe6e%xU|q-A$DKLu7qN>IRc{U!l$)2H>p znz2l9^Ny9}BW+HQ+0-MO@oWpSLog@nb{*zz1Gh%3>k-o3K_m-Y$>*@SNTkmUjlK&n zgBuH_Y+t&5A()Rg$uRuy5>fe8()JEo#5KclB_dyZiJ0ff#fm%Sz>QBPBQ@_W1KCSo zzF4qS5|AZ}%+!wR5~=rWcyhH>L6+-?%J5QQB93>;wWZU=iP%#gwAAXIh&Wl*mO1mw zi4=ImOkV8}$V@aoFX6*RL?-C1YY*1NxQMURbYAz~TMQyz`m${UHy;r#)oq+0*$D`~ z!>`AAE(h7hY*R(nR)Jg?&%H2Tj$qE@@(b8`5|HyoJ6-1yAeht2-8a1N!MvsgJ-^wb z8Av2fGZayp{E0}_3}^p%NCNRkB%9{&Dos=*!cRDdr8Jx-ViUga9_wO*{YqU#NPIS1 zMx^O&p73VeIwH1wbItB8CPZ@3hQtR(u7L2JThr>KM#OwfEsfP!K*XQSB>AF~hzRdb zHfwT@fE&E$t2Ze}pmC5>21QnwPa%?cZ>E@ymL;yn7cEN9SWhIF@Qr60>?4xDT6MDX z%mLXFpDH#51`sJ<$zI=M83a>r;bQ)WQGmEZ+S^s02RHd-9-olNCZcW&=`=6*1VrG5 zu996laXyoc7rK4BdL3jbhif`(%_Cxi?CP|q_7KdGy`Jj6u0;9{`E@!MpMdNxy9Z7- 
zXNiQTXD1bmI0LdArhE|1QYF&UcU<<8bt9N-T=ie>-3YSxs;oNJrGo1;E_K6e!P~Qm zw2I6+i;=!WnD-w>;?>%CJay*gFxgTb%ztuzpZUXx9wNqYz3KLg<8YplM0J)2I0J_9#=R*>Q{K}5<^(H*`0|Gr5pIjZ9w@`i{}H3rMsALBa7 zJX!P7WkmwPOuOZ5xaKgnpFEmvtC3|Qstb_a)bRMGix1+HV4f=T8|8kDl}GC(X3sCa z$N5YtnYXiVjvl!AnM?Rs@>ZOu%$D7rT+S&(YCA`?d4ngaYqTGUun#AbHXa#4q;Hy_ z!FU!)gg3@Ke{ysV!DRO46^*?PGH%{7>^k*=h-8>O5zXF1FekMiS$JhXE04u(T1J@- zMCxhV=X!?{!Ht4?-Rrh*h=lhO@1M;hF)wi$v8LWj;t6KkyFzBK5s~(7%!qmKmVj*D z#-?X+3(&qt{)3F9uMfiy%?3G196@VOj7R&DG4zi-IrNMV(bpIQ>LBXMV#pNk{we5B zL51jfZ)l0;jMdXO(6_YI*V|`hZE0+3IBb}6zmAnYtwS#*@zXabKQlK**X)qFj;{4S zV^eD@eG6)ot8bWv(X@quSDQGXZs_1840y*FA{j$pg+hIP#y|TvtwR{&(y_8KHZ=XO zuZOVfBf$F=PwW!-R)BZ0kNjf}9tZF0Y8OsyF9h$D?B{*tkpk~sdrhOa9Q?uFa9bm2 zTWpK%Yd`qsB&}i$dHZX~+Yl&hb_+|oHNwJD=P(L)gt_q%Giy{H)@CLsbW0u6gTn$H zp=;6~Ds7kT2fIT__w82-Q6voU1SNy($JHi>hJlPUFta>#*hFX8M79&diiXGZ@S%PE z)n{%+4-AM%{;dDgpFpFZA;EI$=~(Oh&=(pE zXrH;hov|93-webS{l zfiAbdRgSy^TV&b5!81_l>_3*o$o`Vhw=~xMkGelUE&qXPqxU=Cu;2#>8S}H#=NIiE zWPSgEuz1x{5Fl&L0SBSnqPH+($~0AvM`jIPXZ&+4YR5FMLm>0FOHHec9g7i zrqss?E{atsO zkMC6f)=jE^FNEq>snPaMP_p4H74N;1R6qFxHNT;p(%-&8=?jn2{vM?8Qk|)Jt#y>X zVh%OG@FS(KHm3AtBdPgCOR0W657n;`qhtdYZT~JMYd=$VT5TjLe+#L3O|vL{16@B& zos_-~P_l9q9S5p6Ps(1}IJ)mk^ASSz+xE0ll7r&0R-DpT*i-UhIUQdhQMWgyjW-cHSL9ZU7=0UcjAHSZqH ze^@Lft7yI&w+u=KI`71&4jgAN_&^@NfybXZ?fUhSB?C_>6r;y^3)1$GUW)Fc^}=*t7}*;(UxfB+0>Y=qrD#3!ANf6omYnps0KHCv z)*~KJd+os8-|PR(USgn2*=WWWJi>iBWa1^D9_OZUBxor<%nr&Y z$`|rOa#&m_P81j7OB&S&sv}w=9O+>^AYM_OveSB$9|mm~jZdcah<}k`JRx3?L~#t? 
zN5kzQKF~hkr~AkcydLpE>k%KQE>JwkAF*Nnjic?Le4_eC`xxa3)&KCiMtri-dTAP; z8`V86={$gM!|EC35%GrlD1KbeXn&wM(Ec74*TiA|BVH#C^`AkXcMSRgA8SV~GQU>K z_{9S{Br-R>a+&-my_bF4%EWO;f3@ykJ)rE`0B|_CHDF#M^WR>_9GJc7HR$<#`@(LS z&cBQsKGbhek9nc`cL&|k0x`q%`P4daZ>avi>%nn8rSkWuapX_pU*r$PmqxEI8W8qv zf7{RSpVtBSA~TSnnYe#r7@yrg=Mgy5egzMW3*mq~YCoX3!Tn+L=(-%9KRo_^plm+p zq5Aui{rrEgu91DvO7Z#2dClenKo-Bs6N=|w;r(gd(_wLc7@9{=JJ4I1i2HT_wV!w% z;=ktSb;q(dbpK)AA5rHC?&l1xgJ0SkhKKoe{x|bM4$bEe&(QPke`x>ye*SMhKin7m z*YiI9y?y;Z^?(=j{`&I2?aBX*2ekeVsa1?f*ONGJ9Er{Y3Xy?l;8; z=F)eW68}MIOz!)AQOWq17nMV=u`SJP=AqY)2CLqI+6_U&(um)$7`PUmXJFaCgk=u1 z!-p=abfk#v6l?zvH7`T2FJJ%w DHkJmt literal 0 HcmV?d00001 diff --git a/week4/predict_citibike.Rmd b/week4/predict_citibike.Rmd new file mode 100644 index 000000000..e69de29bb From 447f0dd5f95e9ce7ea6edd346500a8c43496932e Mon Sep 17 00:00:00 2001 From: Vanessa Date: Tue, 17 Jun 2025 13:52:25 -0400 Subject: [PATCH 5/7] 2015 CitiBike Prediction --- week4/test_citibike_predictions.Rmd | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 week4/test_citibike_predictions.Rmd diff --git a/week4/test_citibike_predictions.Rmd b/week4/test_citibike_predictions.Rmd new file mode 100644 index 000000000..e69de29bb From 7e0186dc3dd46896eed52b6a22754eeae9afec95 Mon Sep 17 00:00:00 2001 From: Vanessa Date: Tue, 17 Jun 2025 16:56:39 -0400 Subject: [PATCH 6/7] Data Preprocessing --- week4/project/community-notes-2025-group-5 | 1 + .../project/vanessa/01_download_notes_data.sh | 9 ++++++ week4/project/vanessa/02_filter_notes_data.sh | 25 +++++++++++++++ week4/project/vanessa/03_download_ratings.sh | 32 +++++++++++++++++++ week4/project/vanessa/04_reformat_ratings.sh | 25 +++++++++++++++ 5 files changed, 92 insertions(+) create mode 160000 week4/project/community-notes-2025-group-5 create mode 100644 week4/project/vanessa/01_download_notes_data.sh create mode 100644 week4/project/vanessa/02_filter_notes_data.sh create mode 100644 week4/project/vanessa/03_download_ratings.sh create mode 
100644 week4/project/vanessa/04_reformat_ratings.sh diff --git a/week4/project/community-notes-2025-group-5 b/week4/project/community-notes-2025-group-5 new file mode 160000 index 000000000..0d2d66d29 --- /dev/null +++ b/week4/project/community-notes-2025-group-5 @@ -0,0 +1 @@ +Subproject commit 0d2d66d29dea5117294eed311f2e6adb073f6561 diff --git a/week4/project/vanessa/01_download_notes_data.sh b/week4/project/vanessa/01_download_notes_data.sh new file mode 100644 index 000000000..c65387122 --- /dev/null +++ b/week4/project/vanessa/01_download_notes_data.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# use curl or wget to download the version 2 1gram file with all terms starting with "1", googlebooks-eng-all-1gram-20120701-1.gz +curl -o birdwatch-public-data-2025-06-16-notes.gz https://ton.twimg.com/birdwatch-public-data/2025/06/16/notes/notes-00000.zip +# update the timestamp on the resulting file using touch +# do not remove, this will keep make happy and avoid re-downloading of the data once you have it +touch birdwatch-public-data-2025-06-16-notes.gz + + \ No newline at end of file diff --git a/week4/project/vanessa/02_filter_notes_data.sh b/week4/project/vanessa/02_filter_notes_data.sh new file mode 100644 index 000000000..ad358a89e --- /dev/null +++ b/week4/project/vanessa/02_filter_notes_data.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# filter original 1gram file googlebooks-eng-all-1gram-20120701-1.gz to only lines where the ngram exactly matches a year (18xx, 19xx, or 20xx, where x is a digit) +# decompress the first using gunzip, zless, zcat or similar +# then filter out rows that match using grep -E, egrep, awk, or similar +# write results to year_counts.tsv + +#zcat birdwatch-public-data-2025-06-16-notes.gz > birdwatch-public-data-2025-06-16-notes.tsv + + +#!/bin/bash + +START_MILLI=1611360000000 #jan 23 2021 12:00:00 AM +END_MILLI=1627775999000 # july 31 2021 23:59:59 PM +OUTPUT_FILE="notes.tsv" 
+INPUT_FILE="C:\Users\ds3\Desktop\coursework\week4\project\vanessa\data\birdwatch-public-data-2025-06-16-notes.gz" +#1627696800000 +#end time gmt 1627732799000 + +#1611360000000 gmt start time +# Copy header +zcat "$INPUT_FILE" | head -n 1 > "$OUTPUT_FILE" + +# Filter by createdAtMillis (column 3) +zcat "$INPUT_FILE" | tail -n +2 | awk -F '\t' -v min="$START_MILLI" -v max="$END_MILLI" '{if ($3 >= min && $3 <= max) print $0;}' >> "$OUTPUT_FILE" \ No newline at end of file diff --git a/week4/project/vanessa/03_download_ratings.sh b/week4/project/vanessa/03_download_ratings.sh new file mode 100644 index 000000000..036d91f53 --- /dev/null +++ b/week4/project/vanessa/03_download_ratings.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# # use curl or wget to download the version 2 of the total counts file, googlebooks-eng-all-totalcounts-20120701.txt +# curl -o birdwatch-public-data/2025/06/16/noteRatings/ratings.gz https://ton.twimg.com/birdwatch-public-data/2025/06/16/noteRatings/ratings-000xx.zip +# # update the timestamp on the resulting file using touch +# # do not remove, this will keep make happy and avoid re-downloading of the data once you have it +# touch birdwatch-public-data/2025/06/16/noteRatings/ratings.gz + + +# Base URL for the ratings data +baseUrl="https://ton.twimg.com/birdwatch-public-data/2025/06/16/noteRatings/ratings-" + +# Output directory for downloaded files +outputDir="$HOME/C:\Users\ds3\Desktop\coursework\week4\project\vanessa\data\birdwatch-ratings" + +# Ensure the output directory exists +mkdir -p "$outputDir" + +# Loop through numbers 00000 to 00019 +for i in {0..19}; do + # Format the number with leading zeros + num=$(printf "%05d" $i) + + # Construct the full URL + url="${baseUrl}${num}.zip" + + # Download the file + echo "Downloading $url..." + curl -s -O "$url" -o "${outputDir}/ratings-${num}.zip" +done + +echo "All downloads complete." 
\ No newline at end of file diff --git a/week4/project/vanessa/04_reformat_ratings.sh b/week4/project/vanessa/04_reformat_ratings.sh new file mode 100644 index 000000000..32d6f24ed --- /dev/null +++ b/week4/project/vanessa/04_reformat_ratings.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Set start and end timestamps in milliseconds +start_milli=1611360000000 # Jan 23, 2021 00:00:00 UTC +end_milli=1627775999000 # July 31, 2021 23:59:59 UTC + +# Directory paths +zip_dir="C:\Users\ds3\Desktop\coursework\week4\project\vanessa\data" # Update with the correct path to your zip files +output_dir="filtered_data" # Directory to store filtered files +mkdir -p "$output_dir" # Create the output directory if it doesn't exist + +# Loop through ratings files, extract and filter by createdAtMillis (column 3) +for i in $(seq -w 0 19); do + unzip -p "$zip_dir/ratings-000${i}.zip" | awk -F'\t' -v min="$start_milli" -v max="$end_milli" \ + 'NR == 1 || ($3 >= min && $3 <= max)' > "$output_dir/filtered_ratings_${i}.tsv" +done + +# Merge filtered files, keeping the header from the first file only +head -n 1 "$output_dir/filtered_ratings_00.tsv" > complete_filtered_ratings.tsv +for i in $(seq -w 0 9); do + tail -n +2 "$output_dir/filtered_ratings_0${i}.tsv" >> complete_filtered_ratings.tsv +done + +echo "Merging complete. 
Final file: complete_filtered_ratings.tsv" + \ No newline at end of file From 366355124c8ae9eb5b6d4c35565c281182553773 Mon Sep 17 00:00:00 2001 From: Vanessa Date: Thu, 26 Jun 2025 12:37:19 -0400 Subject: [PATCH 7/7] Final Edits --- README.html | 334 +++++++++++++ allbut.pl | 35 ++ split_ratings.sh | 36 ++ week1/R_Markdown_1.cv | 0 week1/R_Markdown_1.rmd | 0 week1/R_Markdown_Excercises.html | 408 +++++++++++++++ week1/R_Markdown_Excercises.rmd | 26 + week1/load_trips.R | 4 +- week1/plot_trips.R | 23 +- week1/textbook_excercises.R | 67 +++ week1/vanessa_reino.cv | 0 week1/vanessar_cv.html | 464 ++++++++++++++++++ week1/vanessar_cv.rmd | 53 ++ week2/day_three_txt.R | 211 ++++++++ week2/diamond-sizes.Rmd | 20 +- week2/diamond-sizes.html | 408 +++++++++++++++ week2/txt_questions.R | 51 ++ week3/movielens.Rmd | 42 ++ week3/ngrams/01_download_1grams.sh | 3 +- week3/ngrams/02_filter_1grams.sh | 4 + week3/ngrams/03_download_totals.sh | 2 +- week3/ngrams/04_reformat_totals.sh | 2 + week3/ngrams/05_final_report.Rmd | 30 +- week3/trips_per_day.tsv | 366 -------------- week3/week3_txtbook_questions.R | 66 +++ week4/model.RDS | Bin 0 -> 10056 bytes week4/project/community-notes-2025-group-5 | 2 +- .../project/vanessa/01_download_notes_data.sh | 9 - week4/project/vanessa/02_filter_notes_data.sh | 25 - week4/project/vanessa/03_download_ratings.sh | 32 -- week4/project/vanessa/04_reformat_ratings.sh | 25 - week4/test_citibike_predictions.Rmd | 46 ++ 32 files changed, 2322 insertions(+), 472 deletions(-) create mode 100644 README.html create mode 100644 allbut.pl create mode 100644 split_ratings.sh create mode 100644 week1/R_Markdown_1.cv create mode 100644 week1/R_Markdown_1.rmd create mode 100644 week1/R_Markdown_Excercises.html create mode 100644 week1/R_Markdown_Excercises.rmd create mode 100644 week1/textbook_excercises.R create mode 100644 week1/vanessa_reino.cv create mode 100644 week1/vanessar_cv.html create mode 100644 week1/vanessar_cv.rmd create mode 100644 
week2/day_three_txt.R create mode 100644 week2/diamond-sizes.html create mode 100644 week2/txt_questions.R delete mode 100644 week3/trips_per_day.tsv create mode 100644 week3/week3_txtbook_questions.R create mode 100644 week4/model.RDS delete mode 100644 week4/project/vanessa/01_download_notes_data.sh delete mode 100644 week4/project/vanessa/02_filter_notes_data.sh delete mode 100644 week4/project/vanessa/03_download_ratings.sh delete mode 100644 week4/project/vanessa/04_reformat_ratings.sh diff --git a/README.html b/README.html new file mode 100644 index 000000000..7fa99884d --- /dev/null +++ b/README.html @@ -0,0 +1,334 @@ + + + + + + + MovieLens 10M/100k Data Set README + + +

+ Summary +

+

+ This data set contains 10000054 ratings and 95580 tags + applied to 10681 movies by 71567 users of the + online movie recommender service MovieLens. +

+

+ Users were selected at random for inclusion. All users selected had rated + at least 20 movies. Unlike previous MovieLens data sets, no demographic + information is included. Each user is represented by an id, and no other + information is provided. +

+ +

+ The data are contained in three files, movies.dat, + ratings.dat and tags.dat. + Also included are scripts for generating subsets of the data to support five-fold + cross-validation of rating predictions. More details about the contents and use + of all these files follows. +

+ +

+ This and other GroupLens data sets are publicly available for download at + GroupLens Data Sets. +

+

+ Usage License +

+

+ Neither the University of Minnesota nor any of the researchers + involved can guarantee the correctness of the data, its suitability + for any particular purpose, or the validity of results based on the + use of the data set. The data set may be used for any research + purposes under the following conditions: +

+
    +
  • The user may not state or imply any endorsement from the + University of Minnesota or the GroupLens Research Group.
  • + +
  • The user must acknowledge the use of the data set in + publications resulting from the use of the data set (see below + for citation information).
  • + +
  • The user may not redistribute the data without separate + permission.
  • + +
  • The user may not use this information for any commercial or + revenue-bearing purposes without first obtaining permission + from a faculty member of the GroupLens Research Project at the + University of Minnesota.
  • +
+

+ The executable software scripts are provided "as is" without warranty + of any kind, either expressed or implied, including, but not limited to, + the implied warranties of merchantability and fitness for a particular purpose. + The entire risk as to the quality and performance of them is with you. + Should the program prove defective, you assume the cost of all + necessary servicing, repair or correction. +

+

+ In no event shall the University of Minnesota, its affiliates or employees + be liable to you for any damages arising out of the use or inability to use + these programs (including but not limited to loss of data or data being + rendered inaccurate). +

+ +

+ If you have any further questions or comments, please email grouplens-info +

+ +

+ Citation +

+

+ To acknowledge use of the dataset in publications, please cite the + following paper: +

+

+ F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: + History and Context. ACM Transactions on Interactive Intelligent + Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages. + DOI=http://dx.doi.org/10.1145/2827872 +

+ +

+ Acknowledgements +

+

+ Thanks to Rich Davies for generating the data set. +

+ +

+ Further Information About GroupLens +

+

+ GroupLens is a research group in the + Department of Computer Science and Engineering + at the University of Minnesota. Since its + inception in 1992, GroupLens' research projects have explored a variety of fields + including: +

+
    +
  • Information Filtering
  • +
  • Recommender Systems
  • +
  • Online Communities
  • +
  • Mobile and Ubiquitous Technologies
  • +
  • Digital Libraries
  • +
  • Local Geographic Information Systems.
  • +
+

+ GroupLens Research operates a movie recommender based on + collaborative filtering, MovieLens, + which is the source of these data. +

+ +

+ Content and Use of Files +

+ +

+ Character Encoding +

+

+ The three data files are encoded as + UTF-8. This is a departure + from previous MovieLens data sets, which used different character encodings. + If accented characters in movie titles or tag values (e.g. Misérables, Les (1995)) + display incorrectly, make sure that any program reading the data, such as a + text editor, terminal, or script, is configured for UTF-8. +

+ +

+ User Ids +

+

+ Movielens users were selected at random for inclusion. Their ids have been + anonymized. +

+

+ Users were selected separately for inclusion + in the ratings and tags data sets, which implies that user ids may appear in + one set but not the other. +

+

+ The anonymized values are consistent between the ratings and tags data files. + That is, user id n, if it appears in both files, refers to the same + real MovieLens user. +

+ +

+ Ratings Data File Structure +

+

+ All ratings are contained in the file ratings.dat. Each line of this + file represents one rating of one movie by one user, and has the following format: +

+

+ UserID::MovieID::Rating::Timestamp +

+

+ The lines within this file are ordered first by UserID, then, within user, + by MovieID. +

+

+ Ratings are made on a 5-star scale, with half-star increments. +

+

+ Timestamps represent + seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. +

+ +

+ Tags Data File Structure +

+

+ All tags are contained in the file tags.dat. Each line of this + file represents one tag applied to one movie by one user, and has + the following format: +

+

+ UserID::MovieID::Tag::Timestamp +

+

+ The lines within this file are ordered first by UserID, then, within user, + by MovieID. +

+

+ Tags are user + generated metadata about movies. Each tag is typically a single word, or + short phrase. The meaning, value and purpose of a particular tag is + determined by each user. +

+

+ Timestamps represent + seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. +

+ +

+ Movies Data File Structure +

+

+ Movie information is contained in the file movies.dat. + Each line of this file represents one movie, and has the following format: +

+

+ MovieID::Title::Genres +

+

+ MovieID is the real MovieLens id. +

+

+ Movie titles, by policy, should be entered identically to those + found in IMDB, including year of release. + However, they are entered manually, so errors and inconsistencies may exist. +

+

+ Genres are a pipe-separated list, and are selected from the following: +

+
    +
  • Action
  • +
  • Adventure
  • +
  • Animation
  • +
  • Children's
  • +
  • Comedy
  • +
  • Crime
  • +
  • Documentary
  • +
  • Drama
  • +
  • Fantasy
  • +
  • Film-Noir
  • +
  • Horror
  • +
  • Musical
  • +
  • Mystery
  • +
  • Romance
  • +
  • Sci-Fi
  • +
  • Thriller
  • +
  • War
  • +
  • Western
  • +
+ +

+ Cross-Validation Subset Generation Scripts +

+

+ A Unix shell script, split_ratings.sh, is provided that, if desired, + can be used to split the ratings data for five-fold cross-validation + of rating predictions. It depends on a second script, allbut.pl, which + is also included and is written in Perl. They should run without modification + under Linux, Mac OS X, Cygwin or other Unix like systems. +

+

+ Running split_ratings.sh will use ratings.dat + as input, and produce the fourteen output files described below. Multiple + runs of the script will produce identical results. +

+ + + + + + + + + + + + + +
File NamesDescription
+ r1.train, r2.train, r3.train, r4.train, r5.train
+ r1.test, r2.test, r3.test, r4.test, r5.test
+
+ The data sets r1.train and r1.test through r5.train and r5.test + are 80%/20% splits of the ratings data into training and test data. + Each of r1, ..., r5 has disjoint test sets; this is for + 5 fold cross validation (where you repeat your experiment + with each training and test set and average the results). +
+ ra.train, rb.train
+ ra.test, rb.test
+
+ The data sets ra.train, ra.test, rb.train, and rb.test + split the ratings data into a training set and a test set with + exactly 10 ratings per user in the test set. The sets + ra.test and rb.test are disjoint. +
+

+ + Valid XHTML 1.0 Strict + + + + Valid CSS! + +

+ + + diff --git a/allbut.pl b/allbut.pl new file mode 100644 index 000000000..a22afad08 --- /dev/null +++ b/allbut.pl @@ -0,0 +1,35 @@ +#!/usr/bin/env perl + +# get args +if (@ARGV < 3) { + print STDERR "Usage: $0 base_name start stop max_test [ratings ...]\n"; + exit 1; +} +$basename = shift; +$start = shift; +$stop = shift; +$maxtest = shift; + +# open files +open( TESTFILE, ">$basename.test" ) or die "Cannot open $basename.test for writing\n"; +open( BASEFILE, ">$basename.train" ) or die "Cannot open $basename.train for writing\n"; + +# init variables +$testcnt = 0; + +while (<>) { + ($user) = split /::/, $_, 2; + if (! defined $ratingcnt{$user}) { + $ratingcnt{$user} = 1; + } else { + ++$ratingcnt{$user}; + } + if (($testcnt < $maxtest || $maxtest <= 0) + && $ratingcnt{$user} >= $start && $ratingcnt{$user} <= $stop) { + ++$testcnt; + print TESTFILE; + } + else { + print BASEFILE; + } +} diff --git a/split_ratings.sh b/split_ratings.sh new file mode 100644 index 000000000..34af4a9b4 --- /dev/null +++ b/split_ratings.sh @@ -0,0 +1,36 @@ +#!/bin/sh + +RATINGS_COUNT=`wc -l ratings.dat | xargs | cut -d ' ' -f 1` +echo "ratings count: $RATINGS_COUNT" +SET_SIZE=`expr $RATINGS_COUNT / 5` +echo "set size: $SET_SIZE" +REMAINDER=`expr $RATINGS_COUNT % 5` +echo "remainder: $REMAINDER" + +for i in 1 2 3 4 5 + do + head -`expr $i \* $SET_SIZE` ratings.dat | tail -$SET_SIZE > r$i.test + + # XXX: OSX users will see the message "head: illegal line count -- 0" here, + # but this is just a warning; the script still works as intended. + head -`expr \( $i - 1 \) \* $SET_SIZE` ratings.dat > r$i.train + tail -`expr \( 5 - $i \) \* $SET_SIZE` ratings.dat >> r$i.train + + if [ $i -eq 5 ]; then + tail -$REMAINDER ratings.dat >> r5.test + else + tail -$REMAINDER ratings.dat >> r$i.train + fi + + echo "r$i.test created. `wc -l r$i.test | xargs | cut -d " " -f 1` lines." + echo "r$i.train created. `wc -l r$i.train | xargs | cut -d " " -f 1` lines." 
+done + +./allbut.pl ra 1 10 0 ratings.dat +echo "ra.test created. `wc -l ra.test | xargs | cut -d " " -f 1` lines." +echo "ra.train created. `wc -l ra.train | xargs | cut -d " " -f 1` lines." + +./allbut.pl rb 11 20 0 ratings.dat +echo "rb.test created. `wc -l rb.test | xargs | cut -d " " -f 1` lines." +echo "rb.train created. `wc -l rb.train | xargs | cut -d " " -f 1` lines." + diff --git a/week1/R_Markdown_1.cv b/week1/R_Markdown_1.cv new file mode 100644 index 000000000..e69de29bb diff --git a/week1/R_Markdown_1.rmd b/week1/R_Markdown_1.rmd new file mode 100644 index 000000000..e69de29bb diff --git a/week1/R_Markdown_Excercises.html b/week1/R_Markdown_Excercises.html new file mode 100644 index 000000000..da3dab5fa --- /dev/null +++ b/week1/R_Markdown_Excercises.html @@ -0,0 +1,408 @@ + + + + + + + + + + + + + + +Diamond sizes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

We have data about 53940 diamonds. Only 126 are larger than 2.5 +carats. The distribution of the remainder is shown below:

+

+ + + + +
+ + + + + + + + + + + + + + + diff --git a/week1/R_Markdown_Excercises.rmd b/week1/R_Markdown_Excercises.rmd new file mode 100644 index 000000000..445058c6c --- /dev/null +++ b/week1/R_Markdown_Excercises.rmd @@ -0,0 +1,26 @@ +--- +title: "Diamond sizes" +date: 2016-08-25 +output: html_document +--- + +```{r setup, include = FALSE} +library(ggplot2) +library(dplyr) + +smaller <- diamonds %>% + filter(carat <= 2.5) +``` + +We have data about `r nrow(diamonds)` diamonds. Only +`r nrow(diamonds) - nrow(smaller)` are larger than +2.5 carats. The distribution of the remainder is shown +below: + +```{r, echo = FALSE} +smaller %>% + ggplot(aes(carat)) + + geom_freqpoly(binwidth = 0.01) +``` + +There exists the most diamons from 1.0 carat and lower \ No newline at end of file diff --git a/week1/load_trips.R b/week1/load_trips.R index 6333d786e..cbe76def7 100644 --- a/week1/load_trips.R +++ b/week1/load_trips.R @@ -11,7 +11,7 @@ parse_datetime <- function(s, format="%Y-%m-%d %H:%M:%S") { ######################################## # load each month of the trip data into one big data frame -csvs <- Sys.glob('*-tripdata.csv') +csvs <- Sys.glob('./week1/*-tripdata.csv') trips <- data.frame() for (csv in csvs) { print(csv) @@ -44,7 +44,7 @@ trips <- mutate(trips, gender=factor(gender, levels=c(0,1,2), labels=c("Unknown" # https://www.ncei.noaa.gov/orders/cdo/2992179.csv # ordered from # http://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USW00094728/detail -weather <- read.table('weather.csv', header=T, sep=',') +weather <- read.table('./week1/weather.csv', header=T, sep=',') # extract just a few columns, lowercase column names, and parse dates weather <- select(weather, DATE, PRCP, SNWD, SNOW, TMAX, TMIN) diff --git a/week1/plot_trips.R b/week1/plot_trips.R index b6a537c1c..c4d53ec38 100644 --- a/week1/plot_trips.R +++ b/week1/plot_trips.R @@ -19,17 +19,17 @@ load('trips.RData') # plot the distribution of trip times across all rides (compare a histogram vs. 
a density plot) - ggplot(trips, aes(x = tripduration)) + geom_histogram(bins = 50) + scale_x_log10(labels = comma) + ggplot(trips, aes(x = tripduration)) + geom_histogram(bins = 30) + scale_x_log10(labels = comma) ggplot(trips, aes(x = tripduration)) + geom_density(fill = 'blue') + scale_x_log10(labels = comma) # plot the distribution of trip times by rider type indicated using color and fill (compare a histogram vs. a density plot) - ggplot(trips, aes(x = tripduration, fill = gender )) + geom_histogram(bins = 10) + scale_x_log10(labels = comma) + labs(fill = "User Type") - ggplot(trips, aes(x = tripduration, fill = gender )) + geom_density() + scale_x_log10(labels = comma) + labs(fill = "User Type") + ggplot(trips, aes(x = tripduration, fill = usertype )) + geom_histogram() + scale_x_log10(labels = comma) + labs(fill = "User Type") + ggplot(trips, aes(x = tripduration, fill = )) + geom_density() + scale_x_log10(labels = comma) + labs(fill = "User Type") # plot the total number of trips on each day in the dataset -trips %>% mutate(date = as.Date(starttime)) %>% ggplot(aes(x = date)) + geom_histogram() +trips %>% mutate(date = as.Date(starttime)) %>% ggplot(aes(x = date)) + geom_histogram(bins=365) # plot the total number of trips (on the y axis) by age (on the x axis) and gender (indicated with color) @@ -39,6 +39,11 @@ trips %>% group_by(birth_year, gender) %>% ggplot(aes(x = birth_year, fill = gen # hint: use the pivot_wider() function to reshape things to make it easier to compute this ratio # (you can skip this and come back to it tomorrow if we haven't covered pivot_wider() yet) +trips %>% mutate(year = year(starttime)) %>% mutate(age = year - birth_year) +group_by(gender, age ) %>% summarise(count= n()) %>% +pivot_wider(names_from = gender, values_from = count) %>% +mutate(gender_ratio = Male/Female) %>% ggplot(aes(x= age, y= gender_ratio)) + geom_point() + ######################################## # plot weather data ######################################## 
@@ -47,7 +52,8 @@ trips %>% group_by(birth_year, gender) %>% ggplot(aes(x = birth_year, fill = gen # plot the minimum temperature and maximum temperature (on the y axis, with different colors) over each day (on the x axis) # hint: try using the pivot_longer() function for this to reshape things before plotting # (you can skip this and come back to it tomorrow if we haven't covered reshaping data yet) - +weather %>% pivot_longer(cols = c(tmin, tmax), names_to = "temp_type", values_to = "temp_value") %>% ggplot(aes(x = ymd, y = temp_value, color = temp_type)) + geom_line() + ######################################## # plot trip and weather data ######################################## @@ -62,7 +68,7 @@ trips_with_weather %>% group_by(date, tmin) %>% summarise (count = n()) %>% ggpl # repeat this, splitting results by whether there was substantial precipitation or not # you'll need to decide what constitutes "substantial precipitation" and create a new T/F column to indicate this - trips_with_weather %>% group_by(date, tmin) %>% summarise (count = n(), avg_prcp = mean(prcp)) %>% ggplot(aes(x = count, y = tmin)) + geom_point() + facet_wrap(~ avg_prcp) + trips_with_weather %>% group_by(date, tmin) %>% summarise (count = n(), avg_prcp = mean(prcp)) %>% mutate(substantial_prcp = prcp >= avg_prcp) %>% ggplot(aes(x = count, y = tmin)) + geom_point(color = substantial_prcp) # add a smoothed fit on top of the previous plot, using geom_smooth @@ -80,4 +86,7 @@ trips %>% mutate ( date = as_date(starttime), hours = hour(starttime)) %>% count # repeat this, but now split the results by day of the week (Monday, Tuesday, ...) or weekday vs. 
weekend days # hint: use the wday() function from the lubridate package -trips %>% mutate ( date = wday(starttime, label = TRUE), hours = hour(starttime)) %>% count (date, hours) %>% group_by(date) %>% summarise (avg_trips = mean(n), Stand_Dev_Trips = sd(n)) %>% ungroup() %>% ggplot (aes(x= date , y= avg_trips )) + geom_line(aes(group = 1)) + geom_ribbon(aes(ymin = avg_trips - Stand_Dev_Trips, ymax = avg_trips + Stand_Dev_Trips, group = 1),alpha = 0.25 ) +trips %>% mutate ( date = wday(starttime, label = TRUE), hours = hour(starttime)) %>% count (date, hours) +%>% group_by(date) %>% summarise (avg_trips = mean(n), Stand_Dev_Trips = sd(n)) %>% ungroup() +%>% ggplot (aes(x= date , y= avg_trips )) + geom_line(aes(group = 1)) ++ geom_ribbon(aes(ymin = avg_trips - Stand_Dev_Trips, ymax = avg_trips + Stand_Dev_Trips, group = 1),alpha = 0.25 ) diff --git a/week1/textbook_excercises.R b/week1/textbook_excercises.R new file mode 100644 index 000000000..0cd63849c --- /dev/null +++ b/week1/textbook_excercises.R @@ -0,0 +1,67 @@ +# Compute the rate for table2, and table4a + table4b. You will need to perform four operations: +# Extract the number of TB cases per country per year. +# Extract the matching population per country per year. +# Divide cases by population, and multiply by 10000. +# Store back in the appropriate place. +# Which representation is easiest to work with? Which is hardest? Why? 
+#rate = cases / population + +table2_wider <- pivot_wider(table2, names_from = type, values_from = count) %>% +mutate( rate_table2 = cases/population) + +table4a_longer <- pivot_longer(table4a, cols = c('1999','2000'), names_to = 'Year', values_to = 'Cases' ) +table4b_longer <- pivot_longer(table4b, cols = c('1999','2000'), names_to = 'Year', values_to = 'Population' ) %>% + inner_join(table4a, table4b) + +rate_table4a_table4b <- inner_join(table4a_longer, table4b_longer) %>% mutate(Rate = (Cases / Population) *1000) + +# Why are pivot_longer() and pivot_wider() not perfectly symmetrical? +# Carefully consider the following example: + +stocks <- tibble( + year = c(2015, 2015, 2016, 2016), + half = c( 1, 2, 1, 2), + return = c(1.88, 0.59, 0.92, 0.17) +) +stocks %>% + pivot_wider(names_from = year, values_from = return) %>% + pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return") +# (Hint: look at the variable types and think about column names.) + +# pivot_longer() and pivot_wider() are not perfectly symmetrical becuase in pivot_wider() each unique value will be turned into a column name +# while in pivot_longer() calls keys and valuse columns to create new columns in order to made the data "longer" + +# Why does this code fail? + +table4a %>% + pivot_longer(c(1999, 2000), names_to = "year", values_to = "cases") + +#This code fails becuase the columns names 1999 and 2000 must be in quotes due to their data type being integers and without them R will +# interpret the column names as number values. + +table4a %>% + pivot_longer(c('1999', '2000'), names_to = "year", values_to = "cases") + +# What would happen if you widen this table? Why? How could you add a new column to uniquely identify each value? 
+ +people <- tribble( + + ~name, ~names, ~values, + #-----------------|--------|------ + "Phillip Woods", "age", 45, + "Phillip Woods", "height", 186, + "Phillip Woods", "age", 50, + "Jessica Cordero", "age", 37, + "Jessica Cordero", "height", 156 +) + +# This is a bad data set there must be another column that has a unique identody to be ble to dignies bewyeen people who have the same name, age or hieght. + +#5.2 TEXTBOOK QUESTIONS + + + + + + + diff --git a/week1/vanessa_reino.cv b/week1/vanessa_reino.cv new file mode 100644 index 000000000..e69de29bb diff --git a/week1/vanessar_cv.html b/week1/vanessar_cv.html new file mode 100644 index 000000000..f9a5cab36 --- /dev/null +++ b/week1/vanessar_cv.html @@ -0,0 +1,464 @@ + + + + + + + + + + + + + +vanessar_cv.knit + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Text formatting

+

italic or italic bold +bold code superscript2 and +subscript2

+
+
+

Education

+

Rider University

+
+
+

2nd Level Header

+
+

3rd Level Header

+
+
+
+

Lists

+
    +
  • Bulleted list item 1

  • +
  • Item 2

    +
      +
    • Item 2a

    • +
    • Item 2b

    • +
  • +
+
    +
  1. Numbered list item 1

  2. +
  3. Item 2. The numbers are incremented automatically in the +output.

  4. +
+
+ +
+

Tables

+ + + + + + + + + + + + + + + + + +
First HeaderSecond Header
Content CellContent Cell
Content CellContent Cell
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/week1/vanessar_cv.rmd b/week1/vanessar_cv.rmd new file mode 100644 index 000000000..6e00b835b --- /dev/null +++ b/week1/vanessar_cv.rmd @@ -0,0 +1,53 @@ +Text formatting +------------------------------------------------------------ + +*italic* or _italic_ +**bold** __bold__ +`code` +superscript^2^ and subscript~2~ + +Education +------------------------------------------------------------ + +Rider University +**2026** + + +## Job + +Target +**current** + +### 3rd Level Header + +Experience +------------------------------------------------------------ + +* B.S Computer Science + +* IT L'Oreal + + * Trouble shoot software + + * Setup workstations + +1. Numbered list item 1 + +1. Item 2. The numbers are incremented automatically in the output. + +Links and images +------------------------------------------------------------ + + + +[linked phrase](http://example.com) + +![optional caption text](path/to/img.png) + +Tables +------------------------------------------------------------ + +First Header | Second Header +------------- | ------------- +Content Cell | Content Cell +Content Cell | Content Cell \ No newline at end of file diff --git a/week2/day_three_txt.R b/week2/day_three_txt.R new file mode 100644 index 000000000..aeeaa06c0 --- /dev/null +++ b/week2/day_three_txt.R @@ -0,0 +1,211 @@ +library(tidyverse) + +magnets <- read_csv("C:/Users/ds3/Desktop/coursework/week2/magnets.csv") + +#9.1 TEXTBOOK QUESTIONS + +# What is the sample average of the change in score between the patients +# rating before the application of the device and the rating after the application? + +summary(magnets) #The mean will be 3.5 + +# Is the variable active a factor or a numeric variable? + +#The variable 'active' is a factor and not a numeric variable as 1 represents the active device 2 represents the placebo. 
+ +# Compute the average value of the variable change for the patients that +# received and active magnet and average value for those that received an +# inactive placebo. (Hint: Notice that the rst 29 patients received an active +# magnet and the last 21 patients received an inactive placebo. The sub +# sequence of the rst 29 values of the given variables can be obtained via +# the expression change[1:29] and the last 21 vales are obtained via the +# expression change[30:50].) + +magnets %>% + slice(1:29) %>% + summarize(mean_change_active = mean(change, na.rm = TRUE)) + +magnets %>% + slice(30:50) %>% + summarize(mean_change_placebo = mean(change, na.rm = TRUE)) + +# Compute the sample standard deviation of the variable change for the +# patients that received and active magnet and the sample standard deviation +# for those that received an inactive placebo. + +magnets %>% slice(1:29) %>% summarize(sd_change_active = sd(change, na.rm = TRUE)) +magnets %>% slice(30:50) %>% summarize(sd_change_placebo = sd(change, na.rm = TRUE)) + +# Produce a boxplot of the variable change for the patients that received +# and active magnet and for patients that received an inactive placebo. +# What is the number of outliers in each subsequence? + +magnets %>% + slice(1:29) %>% + ggplot(aes(y = change)) + + geom_boxplot() + +magnets %>% + slice(30:50) %>% + ggplot(aes(y = change)) + + geom_boxplot() + + +#10.1 & 10.2 TEXTBOOK QUESTIONS + +# Simulate the sampling distribution of average and the median of a sample +# of size n = 100 from the Normal(3,2) distribution. Compute the expectation +# and the variance of the sample average and of the sample median. +# Which of the two estimators has a smaller mean square error? 
+ +mu <- 3 +sig <- sqrt(2) +X.bar <- rep(0, 10^5) +X.med <- rep(0,10^5) +for(i in 1:10^5){ + X <- rnorm(100, mu, sig) + X.bar[i] <- mean(X) + X.med[i] <- median(X) +} + +mean(X.bar) +mean(X.med) + +var(X.bar) +var(X.med) + +# Simulate the sampling distribution of average and the median of a sample +# of size n = 100 from the Uniform(0.5,5.5) distribution. Compute the +# expectation and the variance of the sample average and of the sample median. +# Which of the two estimators has a smaller mean square error? + +a <- 0.5 +b <- 5.5 +X.bar <- rep(0, 10^5) +mid.range <- rep(0,10^5) + +for( i in 1:10^5){ + X <- runif(100, a,b) + X.bar[i] <- mean(X) + X.med[i] <- median(X) + +} + + +mean(X.bar) +mean(X.med) + +var(X.bar) +var(X.med) + + + +# Compute the proportion in the sample of those with a high level of blood +# pressure. + +ex2 <- read_csv("C:/Users/ds3/Desktop/coursework/week2/ex2 (1).csv") + +ex2 %>% filter(group == "HIGH") %>% summarise(count_high=n()) %>% + mutate(proportion_high = count_high/150) + +# Compute the proportion in the population of those with a high level of +# blood pressure. + +pop2 <- read_csv("C:/Users/ds3/Desktop/coursework/week2/pop2 (1).csv") + +p <- pop2 %>% filter(group == "HIGH") %>% summarise(count_high=n()) %>% + mutate(proportion_high = count_high/100000) + +# Simulate the sampling distribution of the sample proportion and compute +# its expectation. + +P.hat <- rep(0,10^5) + +for (i in 1:10^5 ) +{ + X <- sample(pop2$group, 150) + P.hat[i] <- mean (X == "HIGH") +} + +mean(P.hat) + +# Compute the variance of the sample proportion + +var(P.hat) + +# It is proposed in Section 10.5 that the variance of the sample proportion +# is Var( P) = p(1-p)/n, where p is the probability of the event (having a +# high blood pressure in our case) and n is the sample size (n = 150 in our +# case). Examine this proposal in the current setting. 
+ +p <- mean(P.hat) +n <- 150 + +p * (1-p) / n + +#CHAPTER 2 TEXTBOOK QUESTIONS + +# (a)What proportion of patients in the treatment group and what proportion of patients in the +# control group died? + +proportion_treatment_dead <- 45/69 +proportion_control_dead <- 30/34 + +# (b)One approach for investigating whether or not the treatment is effective is to use a +# randomization technique. + +# i.What are the claims being tested? Use the same null and alternative hypothesis notation +# used in the section. + +#H0 - The results of the control group and the treatment are equal meaning thr treaments is unsuccessful +#HA - The treatment is sucessful with the prortion of alive of the tramnt group is greater than the control +# group. + +# i. The paragraph below describes the setup for such approach, if we were to do it with +# out using statistical software. Fill in the blanks with a number or phrase,whichever is +# appropriate. + +# We write alive on (blue) cards representing patients who were alive at +# the end of the study, and dead on (red) cards representing patients +# who were not. Then,we shuffle these cards and split them into two groups: +# one group of size (69) representing treatment, and another group of +# size (34) representing control. We calculate the difference between +# the proportion of dead cards in the treatment and control groups(treatment +# control)andrecordthisvalue.We repeat this many times to build a distribution +# centered at (45 - 30 = 15) .Lastly,we calculate the fraction of simulations where +# the simulated differences in proportions are (0.65, 0.88) . If this fraction is low, +# we conclude that it is unlikely to have observed such an outcome by chance and that the null +# hypothesis shoul be rejected in favor of the alternative. + +# iii.What do the simulation results shown below suggest about the effectiveness of the +# transplant program? + +#The simulayion results belwo suggect that the transplant program is not effective. 
+ + +#EXCERCISE 2.6 + +# (a) What are the hypotheses? + +# HA - A person yawning causes another person nearby to yawn +# HO - A person yawing does not to cause another person nearby to yawn + +# (b) Calculate the observed difference between the yawning rates under the two scenarios. + +proportion_t = 34/50 +proportion_c = 16/50 + +observed_difference = proportion_c - proportion_t + +# (c) Estimate the p-value using the figure above and determine the conclusion of the hypothesis +# test. + + +# 9.2 TEXTBOOK QUESTION + +expectation <- 3.5 +sd_active <- 3 +sd_inactive <- 1.5 +n <- 29 + 21 + + diff --git a/week2/diamond-sizes.Rmd b/week2/diamond-sizes.Rmd index 3003fdbd2..644b86213 100644 --- a/week2/diamond-sizes.Rmd +++ b/week2/diamond-sizes.Rmd @@ -21,4 +21,22 @@ below: smaller %>% ggplot(aes(carat)) + geom_freqpoly(binwidth = 0.01) -``` \ No newline at end of file +``` + +```{r setup, include = FALSE} +library(ggplot2) +library(dplyr) + +color <- diamonds %>% + filter(color) +``` + +```{r, echo = FALSE} +smaller %>% + ggplot(aes(x= color, y = count())) + + geom_bar() +``` + + + +We have data about 'r nrow(diamonds)' diamonds. Only diff --git a/week2/diamond-sizes.html b/week2/diamond-sizes.html new file mode 100644 index 000000000..da3dab5fa --- /dev/null +++ b/week2/diamond-sizes.html @@ -0,0 +1,408 @@ + + + + + + + + + + + + + + +Diamond sizes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

We have data about 53940 diamonds. Only 126 are larger than 2.5 +carats. The distribution of the remainder is shown below:

+

+ + + + +
+ + + + + + + + + + + + + + + diff --git a/week2/txt_questions.R b/week2/txt_questions.R new file mode 100644 index 000000000..b23c4862f --- /dev/null +++ b/week2/txt_questions.R @@ -0,0 +1,51 @@ +library(MASS) +library(ISLR2) +library(tidyverse) + +head(Boston) + +?Boston + + lm.fit <-lm(medv~lstat) + +lm.fit <-lm(medv~lstat, data = Boston) +attach(Boston) +lm.fit <-lm(medv~lstat) + +summary(lm.fit) + +lm.fit + +names (lm.fit) + +summary(df.residual()) + +coef(lm.fit) + +confint(lm.fit) + +predict(lm.fit, data.frame(lstat = (c(5, 10, 15))), interval = "confidence") + +predict(lm.fit, data.frame(lstat = (c(5, 10, 15))), interval = "prediction") + +plot(lstat, medv) +abline(lm.fit) + +abline(lm.fit, lwd = 3) +abline(lm.fit, lwd = 3, col = "red") + +plot(lstat, medv, col = "red") +plot(lstat, medv, pch = 20) +plot(lstat, medv, pch = "+") +plot(1:20, 1:20, pch = 1:20) + +par(mfrow = c(2, 2)) +plot(lm.fit) + +plot(predict(lm.fit), residuals(lm.fit)) +plot(predict(lm.fit), rstudent(lm.fit)) + +plot(hatvalues(lm.fit)) +which.max(hatvalues(lm.fit)) + + diff --git a/week3/movielens.Rmd b/week3/movielens.Rmd index 78a442d9c..4675e8c1b 100644 --- a/week3/movielens.Rmd +++ b/week3/movielens.Rmd @@ -35,6 +35,7 @@ head(ratings) %>% kable() ```{r dist-ratings} # plot the distribution of rating values https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=26 +ggplot(ratings, aes(x = rating)) + geom_bar() ``` ## Per-movie stats @@ -42,16 +43,30 @@ head(ratings) %>% kable() ```{r aggregate-by-movie} # aggregate ratings by movie, computing mean rating and number of ratings # hint: use the n() function for easy counting within a group +ratings %>% group_by(movie_id) %>% +summarise( avg_rating = mean(rating), num_rating = n()) + ``` ```{r dist-movie-popularity} # plot distribution of movie popularity (= number of ratings the movie received) # hint: try scale_x_log10() for a logarithmic x axis + +ratings %>% group_by(movie_id) %>% 
+summarise(popularity = n()) %>% +ggplot(aes(x= popularity)) + geom_histogram() + scale_x_log10() ``` ```{r dist-mean-ratings-by-movie} # plot distribution of mean ratings by movie https://speakerdeck.com/jhofman/modeling-social-data-lecture-2-introduction-to-counting?slide=28 # hint: try geom_histogram and geom_density +ratings %>% group_by(movie_id) %>% +summarise(avg_rating = mean(rating)) %>% +ggplot(aes(x= avg_rating)) + geom_histogram() + +ratings %>% group_by(movie_id) %>% +summarise(avg_rating = mean(rating)) %>% +ggplot(aes(x= avg_rating)) + geom_density(fill = "pink") ``` ```{r cdf-movie-pop} @@ -59,7 +74,18 @@ head(ratings) %>% kable() # hint: use dplyr's rank and arrange functions, and the base R sum and cumsum functions # store the result in a new data frame so you can use it in creating figure 2 from the paper below +rank_ratings <- ratings %>% +group_by(movie_id) %>% +summarise(num_ratings = n()) %>% +mutate(tot_ratings = nrow(ratings)) %>% +mutate(pop_fraction = (cumsum(num_ratings)/ tot_ratings)) %>% +arrange(pop_fraction)%>% +mutate(rank = row_number(pop_fraction)) + + # plot the CDF of movie popularity + +ggplot(rank_ratings, aes(x = rank, y = pop_fraction)) + geom_line() ``` @@ -67,11 +93,19 @@ head(ratings) %>% kable() ```{r aggregate-by-user} # aggregate ratings by user, computing mean and number of ratings +ratings %>% group_by(user_id) %>% +summarise(mean_user = mean(rating), num_user_rating = n()) ``` ```{r dist-user-activity} # plot distribution of user activity (= number of ratings the user made) # hint: try a log scale here + +ratings %>% group_by(user_id) %>% +summarise(num_user_rating = n()) %>% +ggplot(aes(x= num_user_rating)) + geom_histogram() + scale_x_log10() + + ``` # Anatomy of the long tail @@ -91,4 +125,12 @@ head(ratings) %>% kable() # paper, produce one curve for the 100% user satisfaction level and # another for 90%---do not, however, bother implementing the null # model (shown in the dashed lines). 
+ + +ratings %>% group_by(user_id) %>% +summarize(num_movies_user = n()) + + + + ``` diff --git a/week3/ngrams/01_download_1grams.sh b/week3/ngrams/01_download_1grams.sh index 1d6d5bf10..3ef97c287 100644 --- a/week3/ngrams/01_download_1grams.sh +++ b/week3/ngrams/01_download_1grams.sh @@ -1,7 +1,8 @@ #!/bin/bash # use curl or wget to download the version 2 1gram file with all terms starting with "1", googlebooks-eng-all-1gram-20120701-1.gz - +curl -o googlebooks-eng-all-1gram-20120701-1.gz http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-1.gz # update the timestamp on the resulting file using touch # do not remove, this will keep make happy and avoid re-downloading of the data once you have it touch googlebooks-eng-all-1gram-20120701-1.gz + diff --git a/week3/ngrams/02_filter_1grams.sh b/week3/ngrams/02_filter_1grams.sh index 3b8e9ec29..96a93fce9 100644 --- a/week3/ngrams/02_filter_1grams.sh +++ b/week3/ngrams/02_filter_1grams.sh @@ -4,3 +4,7 @@ # decompress the first using gunzip, zless, zcat or similar # then filter out rows that match using grep -E, egrep, awk, or similar # write results to year_counts.tsv +zcat googlebooks-eng-all-1gram-20120701-1.gz | grep -E '^(18|19|20)[0-9]{2}\b' > year_count.tsv + + + diff --git a/week3/ngrams/03_download_totals.sh b/week3/ngrams/03_download_totals.sh index f53381e8e..aeb8cf0f8 100644 --- a/week3/ngrams/03_download_totals.sh +++ b/week3/ngrams/03_download_totals.sh @@ -1,7 +1,7 @@ #!/bin/bash # use curl or wget to download the version 2 of the total counts file, googlebooks-eng-all-totalcounts-20120701.txt - +curl -o googlebooks-eng-all-totalcounts-20120701.txt http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-totalcounts-20120701.txt # update the timestamp on the resulting file using touch # do not remove, this will keep make happy and avoid re-downloading of the data once you have it touch googlebooks-eng-all-totalcounts-20120701.txt diff --git 
a/week3/ngrams/04_reformat_totals.sh b/week3/ngrams/04_reformat_totals.sh index 0445c1ff6..f14d33a28 100644 --- a/week3/ngrams/04_reformat_totals.sh +++ b/week3/ngrams/04_reformat_totals.sh @@ -3,3 +3,5 @@ # reformat total counts in googlebooks-eng-all-totalcounts-20120701.txt to a valid csv # use tr, awk, or sed to convert tabs to newlines # write results to total_counts.csv + +tr "\t" "\n" < googlebooks-eng-all-totalcounts-20120701.txt > total_counts.csv \ No newline at end of file diff --git a/week3/ngrams/05_final_report.Rmd b/week3/ngrams/05_final_report.Rmd index a7c90c1fb..540dcea61 100644 --- a/week3/ngrams/05_final_report.Rmd +++ b/week3/ngrams/05_final_report.Rmd @@ -15,6 +15,7 @@ output: library(here) library(scales) library(tidyverse) +library(readr) theme_set(theme_bw()) @@ -52,6 +53,13 @@ Then edit the `03_download_totals.sh` file to down the `googlebooks-eng-all-tota Load in the `year_counts.tsv` and `total_counts.csv` files. Use the `here()` function around the filename to keep things portable.Give the columns of `year_counts.tsv` the names `term`, `year`, `volume`, and `book_count`. Give the columns of `total_counts.csv` the names `year`, `total_volume`, `page_count`, and `book_count`. Note that column order in these files may not match the examples in the documentation. ```{r load-counts} +year_counts <- read_tsv("./week3/ngrams/year_count.tsv", col_names = FALSE) %>% +rename(term = X1, year = X2, volume = X3, book_count = X4 ) +summary(year_counts) + +total_counts <- read_csv("./week3/ngrams/total_counts.csv", col_names = FALSE) %>% +rename(year = X1, total_volume = X2, page_count = X3, book_count = X4 ) +summary(total_counts) ``` @@ -60,15 +68,21 @@ Load in the `year_counts.tsv` and `total_counts.csv` files. Use the `here()` fun Add a line below using Rmarkdown's inline syntax to print the total number of lines in each dataframe you've created. 
+The total number of lines in the year_counts data frame is 'nrow(year_counts)' +The total number of lines in the total_counts data frame is 'nrow(total_counts)' # Part B > Recreate the main part of figure 3a of Michel et al. (2011). To recreate this figure, you will need two files: the one you downloaded in part (a) and the “total counts” file, which you can use to convert the raw counts into proportions. Note that the total counts file has a structure that may make it a bit hard to read in. Does version 2 of the NGram data produce similar results to those presented in Michel et al. (2011), which are based on version 1 data? + ## Join ngram year counts and totals Join the raw year term counts with the total counts and divide to get a proportion of mentions for each term normalized by the total counts for each year. ```{r join-years-and-totals} +join_years_totals <- left_join(year_counts, total_counts, by = "year") %>% +mutate(proportion = volume / total_volume) +join_years_totals ``` @@ -78,7 +92,13 @@ Join the raw year term counts with the total counts and divide to get a proporti Plot the proportion of mentions for the terms "1883", "1910", and "1950" over time from 1850 to 2012, as in the main figure 3a of the original paper. Use the `percent` function from the `scales` package for a readable y axis. Each term should have a different color, it's nice if these match the original paper but not strictly necessary. 
```{r plot-proportion-over-time} - +join_years_totals %>% + filter(term %in% c("1883", "1910", "1950")) %>% + group_by(term, year) %>% + filter(year >= 1850 & year <= 2012) %>% + ggplot(aes(x = year, y = proportion, color = term)) + + geom_line() + + scale_y_continuous(labels = scales::percent) ``` ## Your written answer @@ -96,7 +116,7 @@ Go to the ngram viewer, enter the terms "1883", "1910", and "1950" and take a sc ## Your written answer Add your screenshot for Part C below this line using the `![](figure_filename.png)` syntax and comment on similarities / differences. - +![](Graph.png) # Part D @@ -107,6 +127,12 @@ Add your screenshot for Part C below this line using the `![](figure_filename.pn Plot the raw counts for the terms "1883", "1910", and "1950" over time from 1850 to 2012. Use the `comma` function from the `scales` package for a readable y axis. The colors for each term should match your last plot, and it's nice if these match the original paper but not strictly necessary. 
```{r plot-raw-mentions-over-time} +join_years_totals %>% + filter(term %in% c("1883", "1910", "1950")) %>% + group_by(term, year) %>% + filter(year >= 1850 & year <= 2012) %>% + ggplot(aes(x = year, y = total_volume, color = term)) + + geom_line() ``` diff --git a/week3/trips_per_day.tsv b/week3/trips_per_day.tsv deleted file mode 100644 index 41a5887df..000000000 --- a/week3/trips_per_day.tsv +++ /dev/null @@ -1,366 +0,0 @@ -ymd num_trips date prcp snwd snow tmax tmin -2014-01-01 6059 20140101 0 0 0 3.3 2.4 -2014-01-02 8600 20140102 0.33 0 3.1 3.3 1.8 -2014-01-03 1144 20140103 0.29 5.9 3.3 1.8 0.9 -2014-01-04 2292 20140104 0 5.9 0 2.9 0.8 -2014-01-05 2678 20140105 0.14 3.9 0 4 2.7 -2014-01-06 9510 20140106 0.36 1.2 0 5.5 1.9 -2014-01-07 6267 20140107 0 0 0 1.9 0.4 -2014-01-08 9246 20140108 0 0 0 2.2 0.9 -2014-01-09 13354 20140109 0 0 0 3.2 2.2 -2014-01-10 9847 20140110 0.11 0 0 3.7 3 -2014-01-11 7695 20140111 0.5 0 0 5.8 3.7 -2014-01-12 12515 20140112 0.05 0 0 5.4 3.8 -2014-01-13 20633 20140113 0 0 0 5.1 3.7 -2014-01-14 9999 20140114 0.38 0 0 5.2 4.4 -2014-01-15 21630 20140115 0 0 0 4.7 3.9 -2014-01-16 19953 20140116 0 0 0 4.2 3.6 -2014-01-17 20301 20140117 0 0 0 4.4 3.3 -2014-01-18 9438 20140118 0.07 0 0 4.1 2.9 -2014-01-19 9028 20140119 0 0 0 3.8 2.4 -2014-01-20 13407 20140120 0 0 0 4.6 3.1 -2014-01-21 3804 20140121 0.46 0 11 3.1 1.1 -2014-01-22 2451 20140122 0.02 11 0.5 1.7 0.5 -2014-01-23 5071 20140123 0 7.1 0 2 0.7 -2014-01-24 6176 20140124 0 5.9 0 2 1 -2014-01-25 4338 20140125 0.04 3.9 1 2.8 1.9 -2014-01-26 4980 20140126 0 3.9 0 3.4 1.7 -2014-01-27 13119 20140127 0 3.1 0 4.4 2.1 -2014-01-28 10033 20140128 0 1.2 0 2.1 1.2 -2014-01-29 10021 20140129 0.04 2 0.8 2.3 1.4 -2014-01-30 12158 20140130 0 1 0 3 1.6 -2014-01-31 14653 20140131 0 1.2 0 3.9 2.5 -2014-02-01 12771 20140201 0 1.2 0 4.5 3.6 -2014-02-02 13816 20140202 0 1.2 0 5.6 3.9 -2014-02-03 2600 20140203 1.17 1.2 8 4.3 2.7 -2014-02-04 8709 20140204 0 7.9 0 3.5 2.2 -2014-02-05 2746 20140205 1.43 9.1 4 3.4 
2.9 -2014-02-06 7196 20140206 0 9.1 0 3.2 2.1 -2014-02-07 8495 20140207 0 9.1 0 3.2 2.4 -2014-02-08 5986 20140208 0 9.1 0 2.9 2.1 -2014-02-09 4996 20140209 0.1 7.9 1.2 3.1 2.1 -2014-02-10 6846 20140210 0 9.8 0 2.9 2.1 -2014-02-11 8343 20140211 0 9.8 0 2.6 1.6 -2014-02-12 8580 20140212 0 9.8 0 2.5 1.3 -2014-02-13 876 20140213 1.78 11.8 9.5 3.6 2.4 -2014-02-14 3609 20140214 0.3 18.1 3 4 3.1 -2014-02-15 2261 20140215 0.13 18.1 1.6 3.7 2.7 -2014-02-16 3003 20140216 0 18.1 0 3 2.1 -2014-02-17 4854 20140217 0 16.9 0 3.2 1.8 -2014-02-18 5140 20140218 0.16 18.1 1.5 3.9 2.6 -2014-02-19 8506 20140219 0.26 16.9 0 4.5 3.4 -2014-02-20 11792 20140220 0.03 15 0 5.1 3.7 -2014-02-21 8680 20140221 0.09 13 0 4.9 3.6 -2014-02-22 13044 20140222 0 9.8 0 5.4 4 -2014-02-23 13324 20140223 0 9.1 0 5.4 4.3 -2014-02-24 12922 20140224 0 5.9 0 4.4 2.7 -2014-02-25 12830 20140225 0 5.9 0 3.3 2.4 -2014-02-26 11188 20140226 0.03 5.9 0.2 3.1 2 -2014-02-27 12036 20140227 0 5.9 0 3.4 1.4 -2014-02-28 9587 20140228 0 5.9 0 2.4 0.9 -2014-03-01 9202 20140301 0 5.9 0 3.7 2 -2014-03-02 8195 20140302 0 5.1 0 4 3.2 -2014-03-03 7708 20140303 0.04 5.1 0.1 3.2 1.7 -2014-03-04 10398 20140304 0 5.1 0 2.9 1.3 -2014-03-05 13801 20140305 0 3.9 0 3.9 2.6 -2014-03-06 12773 20140306 0 3.9 0 3.1 1.6 -2014-03-07 13304 20140307 0 3.9 0 3.7 2.6 -2014-03-08 16350 20140308 0 3.1 0 5.7 3.5 -2014-03-09 12262 20140309 0 2 0 4.4 3.6 -2014-03-10 17231 20140310 0 1.2 0 5.1 3.6 -2014-03-11 24356 20140311 0 0 0 6.6 4.5 -2014-03-12 15101 20140312 0.35 0 0 5.6 3.2 -2014-03-13 10979 20140313 0 0 0 3.2 1.8 -2014-03-14 16299 20140314 0 0 0 4.6 2.2 -2014-03-15 18053 20140315 0 0 0 5.8 4.2 -2014-03-16 9851 20140316 0 0 0 4.2 3 -2014-03-17 12503 20140317 0 0 0 3.5 2.3 -2014-03-18 16679 20140318 0 0 0 4.3 2.8 -2014-03-19 13341 20140319 0.92 0 0 4.6 3.2 -2014-03-20 19695 20140320 0 0 0 5.4 4 -2014-03-21 20718 20140321 0 0 0 5.1 3.9 -2014-03-22 19880 20140322 0 0 0 6.3 4.1 -2014-03-23 11740 20140323 0 0 0 4.2 2.7 -2014-03-24 13598 20140324 0 0 
0 3.5 2.1 -2014-03-25 16037 20140325 0 0 0 3.9 2.6 -2014-03-26 14500 20140326 0 0 0 3.6 2.4 -2014-03-27 16666 20140327 0 0 0 4.4 2.2 -2014-03-28 18425 20140328 0.04 0 0 6.2 3.7 -2014-03-29 4480 20140329 1.81 0 0 5.9 4.3 -2014-03-30 7907 20140330 0.35 0 0 4.9 3.9 -2014-03-31 17085 20140331 0.16 0 0 5.6 3.6 -2014-04-01 23908 20140401 0 0 0 6 3.9 -2014-04-02 22515 20140402 0 0 0 5.4 4.2 -2014-04-03 26321 20140403 0.07 0 0 6.7 4.6 -2014-04-04 12566 20140404 0.21 0 0 4.7 4 -2014-04-05 18655 20140405 0 0 0 5.4 4 -2014-04-06 20750 20140406 0 0 0 6.1 3.6 -2014-04-07 14677 20140407 0.52 0 0 5.3 4.3 -2014-04-08 21884 20140408 0.34 0 0 6.4 4.6 -2014-04-09 26931 20140409 0 0 0 6.1 4.5 -2014-04-10 27463 20140410 0 0 0 5.8 4.2 -2014-04-11 26798 20140411 0 0 0 7.5 5.6 -2014-04-12 27108 20140412 0 0 0 7.3 5 -2014-04-13 26871 20140413 0 0 0 7.7 5.4 -2014-04-14 28832 20140414 0 0 0 7.5 5.9 -2014-04-15 10523 20140415 0.71 0 0 6.3 3.3 -2014-04-16 20082 20140416 0.05 0 0 4.9 3.1 -2014-04-17 22282 20140417 0 0 0 4.8 3.6 -2014-04-18 20770 20140418 0 0 0 4.9 3.5 -2014-04-19 24369 20140419 0 0 0 6.8 4.1 -2014-04-20 18464 20140420 0 0 0 6 4.5 -2014-04-21 27124 20140421 0 0 0 6.7 4.1 -2014-04-22 27259 20140422 0.01 0 0 7.1 5.1 -2014-04-23 26949 20140423 0 0 0 6.1 4.4 -2014-04-24 28107 20140424 0 0 0 6.2 4.2 -2014-04-25 27919 20140425 0.02 0 0 6.3 4.4 -2014-04-26 19899 20140426 0.92 0 0 6.7 4.6 -2014-04-27 21926 20140427 0 0 0 5.9 4.7 -2014-04-28 28557 20140428 0 0 0 6.7 4.5 -2014-04-29 18404 20140429 0.03 0 0 5.2 4.3 -2014-04-30 2867 20140430 4.97 0 0 5.2 4.1 -2014-05-01 26762 20140501 0.12 0 0 7.8 5.1 -2014-05-02 32377 20140502 0 0 0 7 5.6 -2014-05-03 25501 20140503 0.08 0 0 7.1 5.4 -2014-05-04 21277 20140504 0.02 0 0 6.5 5.4 -2014-05-05 30288 20140505 0 0 0 7 5 -2014-05-06 32233 20140506 0 0 0 7.1 5.2 -2014-05-07 32882 20140507 0 0 0 6.9 5 -2014-05-08 17619 20140508 0.41 0 0 5.9 5.3 -2014-05-09 22641 20140509 0.04 0 0 6.3 5.5 -2014-05-10 21832 20140510 0.37 0 0 8.3 5.7 -2014-05-11 28326 
20140511 0.03 0 0 8.2 6 -2014-05-12 31644 20140512 0 0 0 8.5 6.4 -2014-05-13 32847 20140513 0 0 0 7.3 5.3 -2014-05-14 30044 20140514 0.01 0 0 7.1 5.2 -2014-05-15 26057 20140515 0.15 0 0 7.1 5.8 -2014-05-16 17784 20140516 1.54 0 0 6.8 5.8 -2014-05-17 29912 20140517 0 0 0 7 5.3 -2014-05-18 27436 20140518 0 0 0 6.7 5.1 -2014-05-19 31692 20140519 0 0 0 7.2 4.9 -2014-05-20 35567 20140520 0 0 0 7.8 5.3 -2014-05-21 34420 20140521 0 0 0 7.4 6.3 -2014-05-22 24535 20140522 0.24 0 0 6.7 5.9 -2014-05-23 24484 20140523 0.91 0 0 7.1 5.6 -2014-05-24 16798 20140524 0.4 0 0 7 5.7 -2014-05-25 25637 20140525 0 0 0 8 5.5 -2014-05-26 26930 20140526 0 0 0 8.6 6.6 -2014-05-27 32936 20140527 0 0 0 8.6 6.4 -2014-05-28 29690 20140528 0 0 0 6.4 5.4 -2014-05-29 33889 20140529 0 0 0 6.6 5.1 -2014-05-30 34343 20140530 0 0 0 7.5 5.5 -2014-05-31 27734 20140531 0.05 0 0 7.3 5.7 -2014-06-01 31333 20140601 0 0 0 7.7 5.5 -2014-06-02 34519 20140602 0 0 0 8 5.9 -2014-06-03 29501 20140603 0.12 0 0 8.7 6.5 -2014-06-04 36068 20140604 0 0 0 8.2 6.3 -2014-06-05 24619 20140605 0.87 0 0 7.6 6.1 -2014-06-06 37719 20140606 0 0 0 7.6 6.1 -2014-06-07 31204 20140607 0 0 0 8.2 6 -2014-06-08 30162 20140608 0 0 0 8.6 6.6 -2014-06-09 18611 20140609 1.6 0 0 7.3 6.3 -2014-06-10 33435 20140610 0 0 0 7.7 6.5 -2014-06-11 30548 20140611 0.02 0 0 7 6.1 -2014-06-12 29084 20140612 0.07 0 0 7.3 6 -2014-06-13 18308 20140613 1.28 0 0 7.9 6.5 -2014-06-14 28417 20140614 0 0 0 7.4 6 -2014-06-15 28382 20140615 0 0 0 8 5.9 -2014-06-16 34932 20140616 0 0 0 8.1 6.3 -2014-06-17 35097 20140617 0 0 0 8.9 7.1 -2014-06-18 35531 20140618 0 0 0 8.9 7.6 -2014-06-19 31724 20140619 0.15 0 0 7.7 6.8 -2014-06-20 34405 20140620 0 0 0 7.9 6.4 -2014-06-21 30288 20140621 0 0 0 7.8 6.2 -2014-06-22 29417 20140622 0 0 0 7.9 6.4 -2014-06-23 32580 20140623 0 0 0 8.1 6.5 -2014-06-24 35897 20140624 0 0 0 8.1 6.8 -2014-06-25 34340 20140625 0.08 0 0 8.5 7 -2014-06-26 34846 20140626 0.07 0 0 8.5 7 -2014-06-27 35823 20140627 0 0 0 8.3 6.7 -2014-06-28 28507 
20140628 0 0 0 8.7 6.6 -2014-06-29 27986 20140629 0 0 0 8.3 6.8 -2014-06-30 33597 20140630 0 0 0 8.4 6.9 -2014-07-01 34854 20140701 0 0 0 8.9 7.2 -2014-07-02 26582 20140702 0.96 0 0 9.1 7.2 -2014-07-03 27587 20140703 1.78 0 0 8.7 6.9 -2014-07-04 13612 20140704 0.14 0 0 7.4 6.5 -2014-07-05 22913 20140705 0 0 0 8.1 6.3 -2014-07-06 23822 20140706 0 0 0 8.4 6.6 -2014-07-07 31863 20140707 0.04 0 0 9 7.2 -2014-07-08 32713 20140708 0.39 0 0 9.1 7.1 -2014-07-09 34426 20140709 0.09 0 0 8.8 7.1 -2014-07-10 36288 20140710 0 0 0 8.3 7.2 -2014-07-11 35000 20140711 0 0 0 8.6 7.1 -2014-07-12 28718 20140712 0 0 0 8.5 7.1 -2014-07-13 24869 20140713 0.03 0 0 8.3 7.2 -2014-07-14 25825 20140714 0.46 0 0 8.4 7.2 -2014-07-15 22963 20140715 1.3 0 0 8.6 7.2 -2014-07-16 35391 20140716 0 0 0 8.1 6.8 -2014-07-17 38619 20140717 0 0 0 8.1 6.7 -2014-07-18 36105 20140718 0 0 0 8.1 6.4 -2014-07-19 29291 20140719 0 0 0 7.6 6.8 -2014-07-20 30296 20140720 0 0 0 8 6.6 -2014-07-21 35670 20140721 0 0 0 8.5 6.7 -2014-07-22 36886 20140722 0 0 0 8.6 7.1 -2014-07-23 34173 20140723 0.19 0 0 8.8 7.2 -2014-07-24 36693 20140724 0 0 0 8 7 -2014-07-25 36384 20140725 0 0 0 8.2 6.6 -2014-07-26 27179 20140726 0 0 0 8.1 6.9 -2014-07-27 27004 20140727 0.02 0 0 8.5 7.1 -2014-07-28 33189 20140728 0.19 0 0 8.2 6.8 -2014-07-29 37092 20140729 0 0 0 7.6 6.4 -2014-07-30 37377 20140730 0 0 0 8 6.3 -2014-07-31 35458 20140731 0 0 0 8.2 6.8 -2014-08-01 32654 20140801 0 0 0 8.4 7.1 -2014-08-02 26784 20140802 0.41 0 0 7.4 6.3 -2014-08-03 25276 20140803 0.07 0 0 7.6 6.6 -2014-08-04 33822 20140804 0 0 0 8.4 7 -2014-08-05 34392 20140805 0 0 0 9 7.1 -2014-08-06 36336 20140806 0 0 0 8.3 7 -2014-08-07 36362 20140807 0 0 0 8.3 6.6 -2014-08-08 34073 20140808 0 0 0 8.3 6.5 -2014-08-09 31636 20140809 0 0 0 8.7 6.6 -2014-08-10 26749 20140810 0 0 0 8.8 6.8 -2014-08-11 33664 20140811 0 0 0 8.7 7.1 -2014-08-12 26261 20140812 0.19 0 0 7.9 7 -2014-08-13 27977 20140813 0.53 0 0 8.2 6.8 -2014-08-14 35154 20140814 0 0 0 7.7 6.3 -2014-08-15 32480 
20140815 0 0 0 7.3 6.1 -2014-08-16 32017 20140816 0 0 0 7.8 6.3 -2014-08-17 27417 20140817 0.01 0 0 8.2 6.6 -2014-08-18 34339 20140818 0 0 0 8.1 6.3 -2014-08-19 35367 20140819 0 0 0 8.3 6.3 -2014-08-20 35532 20140820 0 0 0 8.4 7 -2014-08-21 33804 20140821 0.35 0 0 8.3 6.5 -2014-08-22 30406 20140822 0.06 0 0 7.9 6.5 -2014-08-23 26429 20140823 0.01 0 0 7.7 6.7 -2014-08-24 27493 20140824 0 0 0 8 6.4 -2014-08-25 32892 20140825 0 0 0 8.8 6.4 -2014-08-26 34519 20140826 0 0 0 8.9 7 -2014-08-27 33721 20140827 0 0 0 9 7 -2014-08-28 33927 20140828 0 0 0 8.2 6.6 -2014-08-29 31264 20140829 0 0 0 8 6.1 -2014-08-30 22689 20140830 0 0 0 8 6.5 -2014-08-31 18053 20140831 0.62 0 0 9 7.3 -2014-09-01 20725 20140901 0 0 0 8.8 7.5 -2014-09-02 29657 20140902 0 0 0 9.2 7.7 -2014-09-03 34843 20140903 0 0 0 8.6 7.2 -2014-09-04 36392 20140904 0 0 0 8.7 6.9 -2014-09-05 35579 20140905 0 0 0 8.7 7.2 -2014-09-06 26808 20140906 0.11 0 0 9.1 6.7 -2014-09-07 28901 20140907 0 0 0 8.1 6.5 -2014-09-08 33979 20140908 0 0 0 7.5 6.5 -2014-09-09 34166 20140909 0 0 0 7.3 6.3 -2014-09-10 37418 20140910 0 0 0 8 6.3 -2014-09-11 36668 20140911 0 0 0 8.3 6.9 -2014-09-12 38481 20140912 0 0 0 7.8 6.2 -2014-09-13 19499 20140913 0.26 0 0 6.9 5.8 -2014-09-14 27187 20140914 0 0 0 7.1 5.3 -2014-09-15 34258 20140915 0 0 0 7.1 5.5 -2014-09-16 25579 20140916 0.37 0 0 7 5.8 -2014-09-17 36791 20140917 0 0 0 7.3 5.5 -2014-09-18 37300 20140918 0 0 0 7.6 5.7 -2014-09-19 35674 20140919 0 0 0 6.6 5.4 -2014-09-20 29999 20140920 0 0 0 7.5 5.7 -2014-09-21 26650 20140921 0.15 0 0 7.5 6.7 -2014-09-22 32937 20140922 0 0 0 7.1 5.5 -2014-09-23 35599 20140923 0 0 0 7.1 5.2 -2014-09-24 35838 20140924 0 0 0 7.1 5.8 -2014-09-25 17165 20140925 0.32 0 0 6.4 5.7 -2014-09-26 34500 20140926 0 0 0 7.7 5.8 -2014-09-27 30463 20140927 0 0 0 8.3 6 -2014-09-28 29491 20140928 0 0 0 8.4 6.4 -2014-09-29 32385 20140929 0 0 0 7.9 6.7 -2014-09-30 34901 20140930 0 0 0 7.1 6.2 -2014-10-01 28053 20141001 0.02 0 0 6.5 6.1 -2014-10-02 34154 20141002 0 0 0 7 6.1 
-2014-10-03 35966 20141003 0 0 0 7.1 5.6 -2014-10-04 14173 20141004 1.18 0 0 6.9 5.2 -2014-10-05 23578 20141005 0 0 0 6.1 4.6 -2014-10-06 30628 20141006 0 0 0 6.9 5 -2014-10-07 32756 20141007 0.06 0 0 7.1 6.3 -2014-10-08 33437 20141008 0.04 0 0 7.3 6.2 -2014-10-09 32322 20141009 0 0 0 6.8 5.5 -2014-10-10 31616 20141010 0 0 0 6.4 5.2 -2014-10-11 13807 20141011 0.33 0 0 6 5 -2014-10-12 23079 20141012 0 0 0 6.3 4.8 -2014-10-13 23668 20141013 0.05 0 0 6.5 5.2 -2014-10-14 30252 20141014 0 0 0 7.6 6.3 -2014-10-15 30477 20141015 0.69 0 0 7.7 6.9 -2014-10-16 24829 20141016 1.11 0 0 7.1 6.1 -2014-10-17 34755 20141017 0 0 0 7.1 5.9 -2014-10-18 25854 20141018 0 0 0 7 5.6 -2014-10-19 19646 20141019 0 0 0 5.6 4.4 -2014-10-20 26440 20141020 0 0 0 6 4.2 -2014-10-21 31087 20141021 0.11 0 0 6.7 5.5 -2014-10-22 12023 20141022 1.51 0 0 5.8 5 -2014-10-23 11194 20141023 0.61 0 0 5.3 5 -2014-10-24 29606 20141024 0 0 0 6.3 5.1 -2014-10-25 25232 20141025 0.01 0 0 6.7 5 -2014-10-26 21488 20141026 0 0 0 6.3 5.3 -2014-10-27 27532 20141027 0 0 0 6.3 4.8 -2014-10-28 32313 20141028 0 0 0 7.2 5.3 -2014-10-29 30651 20141029 0 0 0 7.2 5.1 -2014-10-30 31079 20141030 0 0 0 5.9 4.7 -2014-10-31 28843 20141031 0.05 0 0 5.5 4.5 -2014-11-01 7484 20141101 0.35 0 0 4.7 4.2 -2014-11-02 12990 20141102 0 0 0 4.8 4.1 -2014-11-03 24019 20141103 0 0 0 6.1 3.9 -2014-11-04 30181 20141104 0 0 0 6.8 5.3 -2014-11-05 30766 20141105 0 0 0 6.4 5.6 -2014-11-06 13949 20141106 0.37 0 0 5.7 4.8 -2014-11-07 25648 20141107 0 0 0 5.3 4 -2014-11-08 18211 20141108 0 0 0 4.8 3.6 -2014-11-09 18128 20141109 0 0 0 5.7 4.6 -2014-11-10 25573 20141110 0 0 0 6.1 4.4 -2014-11-11 28787 20141111 0 0 0 6.4 4.9 -2014-11-12 28164 20141112 0 0 0 6.5 4.7 -2014-11-13 23972 20141113 0.2 0 0 4.8 3.6 -2014-11-14 19709 20141114 0.06 0 0 4.2 3.5 -2014-11-15 14856 20141115 0 0 0 4.2 3.3 -2014-11-16 13445 20141116 0.03 0 0 4.5 3.5 -2014-11-17 7346 20141117 1.54 0 0 5.2 4 -2014-11-18 17010 20141118 0 0 0 4.5 2.4 -2014-11-19 16270 20141119 0 0 0 3.6 2.2 
-2014-11-20 19987 20141120 0 0 0 4.5 3.1 -2014-11-21 18837 20141121 0 0 0 3.7 2.8 -2014-11-22 13154 20141122 0 0 0 4.4 2.8 -2014-11-23 15218 20141123 0 0 0 5.7 4.3 -2014-11-24 20794 20141124 0.7 0 0 6.9 5.3 -2014-11-25 26064 20141125 0 0 0 6.8 5.1 -2014-11-26 7479 20141126 1.24 0 0.2 5.1 3.4 -2014-11-27 3757 20141127 0.02 0 0 3.8 3.4 -2014-11-28 7839 20141128 0 0 0 3.7 2.9 -2014-11-29 7869 20141129 0 0 0 4.5 2.7 -2014-11-30 11772 20141130 0 0 0 5.5 4.5 -2014-12-01 18569 20141201 0.09 0 0 6.5 4.2 -2014-12-02 15175 20141202 0.08 0 0 4.3 3.5 -2014-12-03 12177 20141203 0.06 0 0 4.6 4.1 -2014-12-04 21055 20141204 0 0 0 4.5 3.7 -2014-12-05 18920 20141205 0.51 0 0 4.4 3.4 -2014-12-06 4441 20141206 1.22 0 0 5 3.9 -2014-12-07 9319 20141207 0.04 0 0 4.2 3 -2014-12-08 14283 20141208 0 0 0 3.7 2.4 -2014-12-09 6912 20141209 2.54 0 0 4.2 3.6 -2014-12-10 11098 20141210 0.08 0 1 4 3.2 -2014-12-11 16413 20141211 0.01 1.2 0 3.8 3.1 -2014-12-12 18850 20141212 0 0 0 3.8 3.2 -2014-12-13 13173 20141213 0 0 0 4.4 3.4 -2014-12-14 12096 20141214 0 0 0 4.6 3.8 -2014-12-15 17761 20141215 0 0 0 4.8 3.7 -2014-12-16 18941 20141216 0.2 0 0 4.9 3.8 -2014-12-17 18196 20141217 0.02 0 0 5.4 4.2 -2014-12-18 19206 20141218 0 0 0 4.2 3.7 -2014-12-19 18256 20141219 0 0 0 3.8 3.1 -2014-12-20 10421 20141220 0 0 0 3.3 3 -2014-12-21 8854 20141221 0 0 0 3.6 3.1 -2014-12-22 13120 20141222 0.04 0 0 4.4 3.5 -2014-12-23 9849 20141223 0.16 0 0 4.6 4.3 -2014-12-24 5049 20141224 0.8 0 0 5.8 4.4 -2014-12-25 4620 20141225 0.09 0 0 6.2 4.4 -2014-12-26 9360 20141226 0 0 0 5 4 -2014-12-27 10070 20141227 0 0 0 5.5 4.4 -2014-12-28 8055 20141228 0.1 0 0 5.4 4.3 -2014-12-29 13055 20141229 0 0 0 4.4 3.4 -2014-12-30 12483 20141230 0 0 0 3.4 2.8 -2014-12-31 10493 20141231 0 0 0 3.2 2.7 diff --git a/week3/week3_txtbook_questions.R b/week3/week3_txtbook_questions.R new file mode 100644 index 000000000..5a9fa1390 --- /dev/null +++ b/week3/week3_txtbook_questions.R @@ -0,0 +1,66 @@ +body <- read_table("body.dat.txt") + +weight <- 
body$height[116:120] +height <- body$height[122:126] + +body %>% + ggplot(aes(x= x23, y = x24)) + geom_point() + +library(tidyverse) + +model <- lm(x23 ~ x24, data = body) +model + +#3.6.3 + +lm.fit <- lm(medv ~ lstat + age, data = Boston) +summary(lm.fit) + +lm.fit <-lm(medv~., data = Boston) +summary(lm.fit) + +library(car) +vif(lm.fit) + +lm.fit1 <-lm(medv~.- age, data = Boston) +summary(lm.fit1) + +lm.fit1 <-update(lm.fit,~.- age) + + +#3.6.4 +summary(lm(medv~lstat * age, data = Boston)) + +#3.6.5 +lm.fit2 <-lm(medv∼lstat + I(lstat^2)) +summary(lm.fit2) +lm.fit <-lm(medv∼lstat) +anova(lm.fit, lm.fit2) + +par(mfrow = c(2, 2)) +plot(lm.fit2) + +lm.fit5 <- lm(medv ∼ poly(lstat, 5)) +summary(lm.fit5) +summary(lm(medv ∼ log(rm), data = Boston)) + +#3.6.6 +head(Carseats) +lm.fit <- lm(Sales ∼ . + Income:Advertising + Price:Age, data = Carseats) +summary(lm.fit) + +attach(Carseats) +contrasts(ShelveLoc) + +#6.1 + + +#6.2 + + +#6.3 + + + + + diff --git a/week4/model.RDS b/week4/model.RDS new file mode 100644 index 0000000000000000000000000000000000000000..4351138a2f8574e6bc2a2d42d75dd7e31e3a41a5 GIT binary patch literal 10056 zcmV-OC%4!iiwFP!000002JL$XR29pz_9loRs02kN379ZT20^D)%mH&aCXlR1R)Pr$ zB7zA)a#C{M1VKQ15D+COIfH;XU=~CPKQPn1g`@A@bMATXt^X}&Ev&BU?)s{}svfp` zhMl7r31ccCz8Nrc6v7{VW+QVR zYgtEjWRuqg^7k6N5+zeB*FBrS-Nj&VviGlNV{kHf2KJd~AJ((%AMg(-w2@b4wTYFU zg^r$?)lAyNX?0lJ293=ubj;A$(!|u79*;7z+;6R?cSz4fw_od}BMSr>2WlQXlzdHI zS}~=e*glzbIbx_z20Q`rZ{nOw*FGWbUtg5vIuHe7G`b2^_W)Ao=!d85GxW)w4M#6< zJBX1nvtJ166s{tjf&+GsT;&H6lkGYmY^nyw-3@HneLR7=+qoTm)9!*8rudjso$VlO zao6SxiepKq{0%8C4?dEa`)5^3_6CvZ@0TR-?)*+NkB>b#zlE2KU6%dzbi*w&^TaL{ zn|I?t$P(G5%?Dz@wOxaLNN0&(S2u<$ne|Ai;1AqWSZBW z8b+Bg$ap&@y_G}5rJlAi?;$YdaUNPw_c^> zwK-LrrTJaK3GbcLgN_~nQ3+-ejZAM4HGX8@{_YGi)9%vz8tpG2^YZQ2dG{hgh;zq6 zQcVK5MftU>da{#DN%ivg{M;bDZxEMssLHkluC?I2uuB)4>MAnN-}z}p-w4udnuHW1 zww z^`vGHF^#VFEZ&Y|;Ht1)P>cH?WSI2L1h#9l$ndZ2@+;lVK!}DY 
z;d{!OOzkjRnSa-wbYHQ%K>1E8$rNlp@caQU(0phc;p2D@WR!iqZgCOBC+O#zKA%FSK1lD(<|09Y&_sFNgf3toGG|YEy$6U~zgt^k7cX%4VqTtm ze>-{k%ffxc3O5k(Og()5`*ia1u9auOoVR4gQuAk$#cM&TO*R3Uvca9T<7bh+YRhUS2-<)U{rIclf^FcGyIg5+ zp97gvL!U>{vFfH|78x4zX}^Q33GjS+r0-E=CGh8JeyNhNl+3v%IJ$gRBYDZG&zs!c z4VZ?P74AEICeNIEmce!Q8n|lg`PN>u8bo;x+-F_@ri0w)9kVsbq}Tpqysox`WP|vm zi*ua8rS=H^J;f5hS-s_O-OOw<=}tw%j?$wbR3I@hXn8eoVwP`b*lrpudI4Pv=_v`&*gkCI(=6nm*VD_b9ZGYzEgifpoIAcqF>XrO=wifEvO2Fhrl zf(EMefL(S09?9a793IKzkpdnm;*k;_DdUj}9$^eQj3I|HRD#*o7pau`DnW5{6) zIgBBPG2}6ZJjRg681fiH9%IO340((pk1^yihCIelz!(Y`LjhwbU4z6fuS(#!$o z3?+=AgfWychBC%b#u&;NLm6WzV+>`Cp^PzzD#lR77^)aU6=SF(3=Y`^3s^&0)=-W$lxGbUSVKkD zP>D5EW(`$XLl&AW3r&`VCd)#TWueKk&}3O?vMe-N7Md&zO_qfw$3l~1p~pg(lBJlV_pHv(V&OXbLPe1s0kD z3r&HAroci|V4*3n&=go`3M@1Q7MdapO_7DB$U;+Op((P^6j^ABEVTYR#19{izmj z2D!;+?adrjK*rBF7Q0S*JL!IFoWr=iTS45in(qQig``;_cqwLC3jT1bcx6+--~a2= zWZLPHJP`VhWERi1Ctbo%&)x39M`i`iB4SpNWEQUn=UP(+nerjFEUK`FjC;NAae(Lq zGHdyagA3kr0iSI}k{`cSfe`gUKjf83@vGOa_y_`)+>^2>S0ufrKMl}Wv4M>3QnxwH zT}-B*y(r_#4#175jq8e1q(PEN(ki}OZ_@c={gVWK4>F(iS!Okj&|C73H+WK}69N&iRw0 zNM^XK#OUNqGU@T^r=eXwV5)85)Ga0q(o1~%Oo8q$GFU@}^l_O;GN(}US4$i;YNkyE zvCCZcyZdng?@{LpFMs9$zN2amRqePApk$Fn)5d$i+e}8zFZ3wz?b&(UvZfFOYp9FM z9c>0jbenxTilac}>5R8&Gj4#u(fPIJ+X{gvxjM;-_W&8iBe6-4^CTHvp?6$FP#5^B zcjVj)TtMCsC0kCG*pc3|_Kuc0YelB2i!W=s{+cvXO}oBvmp#Z_%oX?A=?U63`+{FYXp^YH= zJ>k)rKAp_SDNWig_85dFCd})LYa~M@m#!&a8$d?sJX&mOTny49Ts!G$xv+PMX)c+m=eDB9UJEdvJ=|(o9t9G7QrR}X zOCs-R3)FPKT?Gsdot#@RQ44rj=lgnWT-V=E$`OlFW|2W(?Me3`zG9GT7G)Jgvuds7Km?qtaDx zmr7TXsln8F44JmX@s`a*GAeA(1?BOD;ELTfq3S+$;32T$&g43KGM9hfbju^6q_6lo zPOq{BAmM~uQT1y>@`|APwW-|Q!2Q*hiV6o~a9uW;-)&wtNYra+49JKklW$diZgwaK zN$w`knChj#C4{T8oZlYoqo!|`yviZnN<8g+two$YE+9@pJgyKp(E+h!KZrI-&|c?)Dme|rdu zc1?)~T;-rlHDKQYHg-_S)>4(Ax)0oYv-P#g@pqu4*y|wwDNWF-7p0hUXfkL%=UDId zl>?MMvkvB0vjL@6)f*(#@*q6$WrOO4BM%xXKLRPyrQ2tY(E#;5wwu3yGXlA4wwrqzZ-c79Pldhr8!@k~ zH&kcYeA^8w<7E|%s!o7M1(Bzx7-V7o>Xbf@mH!?9@`y1(lDt8nEOt?%G$Rei)g0Ix 
zchO}GXq76Tr10LVKQEqLuXsa1U6hyBwC@|iJs!c;TTh$?r3K$dFJ{{aDjw@DYv@y? z`=cB*^r@Ulmj8<9{gc$#Iqv0w>fRDN+a56h107!O6juRvzQNIgjv}Dmt(KkV+YJz+ zQ+4{O;al+N@ZIC4od>~##M8{i;zm#~?O~+Z$*G`RZTID83q3&b7=Mx5r)5FGsie3m zZ;#ORTn=g*_C^?fxBzN;x_$Uc1wffpqreCA5Kvq5v03%DFlh5VR=lSx2UMo-Ts9}A zpEu>hP7$)^pw?n;pS03G@R(Ozd_$KND6cll%(*59DtGP>lsl?|`!u!SZFEKNH;^l~ zHZtm#F34Q{NrE9+jprXdGAPmR6vO=2MP??5%rgh~%3-t&$v z2{bMY=(=)H3XpGm7AiRG0QE6juQVz=0?opkEstJK1Fi3P%0GNl01xk*Ri|>4<36Fz zp%7@?ymxt7mm~IH-n3$^B+oS57x&a6HD8LiV?OIR=5`-zbij2gefyrde6lXCXLI+o z*(W-0fKm>Ho;m7*xK53`J})vimjTTkv)0ev8;1KDoyTl`8|{0OuYp#fhl*ciyg|P7 zCcw$d1sbP%2pC%(2lW=Kxn7Pp0kMXsidyFOf_gUpHF4l`rIqx_+ww!wtkx4 z1u7(hslA9mBd+Xd}1(aVf zRkn1lB_J1(%{XWSQ};Cq#I3c48IfzGOGm7O;nokgE0-q|*=;=XCg(2_`R{VA(~7sc@L zZkQx&#yETV8oa*6rzHGKA<7To!Dv$5V7I#q5#PSv+Ul_yjyv|g@Y}~+ThKhtH|t^h zyJ^89!})?~W?#2`QGHEB+DA9Rbtj2*7yD+mrz^0ZAmJUhoYt?19EnwVhyC8*{L*yR zY%}BD3q$80SUzrI8^VE^MfS1UZ3>7dn6fUXL}xDt<{$A)`$KV~xbXU$H}woAej-tR z(DmgC!`D(v5YLDwB7BeKwz;NPaettE(0oqhc6|@Q8HDSxYTX>qY$CVF{oRyz0!W8m z&L_6t;sgZAH`jh@oHb<1xM`HxPs02nKXg5Dy->YTJ?Xq~pP;;Gz91B8&vp*rftMHY zFZW$zhWiKAi3|{wcFVTGJkY%2KF!A3BdeoulFY5Fhk8AW}EY7nm{W8Hxk%r~4G+AU;rEpnOm~sIL*f*HzSu{XL>l zJyHLobByYQ`+rzpBR+||rYL#iCTB! 
z(yqqez?jjVSA5+(8SZp9y6?*&x5kus6blRB{NfI^A-36XR zV7q4NnqWC1Xz?t$1^e=d%LYo>Be%aJoW*vgkp-bdROdKOa!vyge(pl$oRsy1f85>m zIk~<>xLEHz5e+fuUnt>tURW6VdQPj>sFT3^L#lQ#wcpGnLYKbSVYPcVai-jV!zd>c z?8ho$y2RKtPuTdx#v&+#TMxsNw(vvG%M)s`+nt#?s=Fl>RpwZF+r;>d@K;SnkIL|EeC zvm+~C!tJt}L2{e;)=NK)=fe;aK6#yiW{Hu5*Qcpb@pu2v#m=Mi0k@U zUZpNui73&J8U7oriLfP&tzY!g32(j0Wju$Xh&UZz#VrvB2_LXTeBPQ-gun6YWlJ3$ zVLo?wjLm^*80XX{sk#E`5a{?M_gQ3oHq>E@jEIu<#QY(ikw2Pm6c@bq{H3$5)Hdka zzNqw-wg|j<+grQ2R*mp03ug;>s)%_G|2o-@kUmLV>zeiM_QWYftjP0aVmm8vK4?C2 zuS?SP$*Dx}XX8RUQ8yyN#n>{j`~a-U+;>=Vdl2C(dcbIlj2YqoAg}wvo%6&+?H4PL zDQtlC{9Ax)wg&bSJZbI8ZHs0Qk#qbidKe3cP|GR>5xW*#&rA8IzpB-gK=0!`P6Er~i1{w>F-zIJD9D*p075VyT4Q^U0lf{(NkBqOA2<#6NL1 zrqN3ZOo!?DB0|SD?8dwxN%tMhej}AEH}x`M{&*X6W!8e>{-CeiIQ!o=}20nzf& z6N!ila95YnhWms*hlE!`lL|~|MgC!4)|pD~S2s~#z;fo@@mitnh)GZy?DJw6y!Zf8nbobQP;l4)aG4a58SecX$Z$_91 zatnGAu^rqOmnbU}m(*(9zm}bLO zfpydm3C)%o1>Y?(&we8v`I^tHB|Ij*s!`qMPS}b}unC!Ohj~Ef9>wq8BlKKEDGS>X zFRgnEj~0#I4qxgvP3Sx+Mr2MgH~RK8gB)=-WV|?+A#r7j>&EuiF3|I{y1Rz(Vi?tK ztmu0;2r}0Sbz2I`z^k{*gKnQoH_k5=BJ5h##>{(95|{6+e*Ks|14cBQm*6qd zg`wLox;EXFhK})DKD^y63q!)+)$H7q3B&B~?RIg@Bs_O(mn%J9g7(8Jr}VOyxvhtu zE#Xodti*^*+M~|YPHMw`Y$n`4In8e^jK87$ex2WR=qk7A%%t8Ugdfk{HB+tq3AaTP zY@IHhA+{(AcCcM2fDyCW4>^ABh6xr%_w$V&!E26HP9qm=ChXBx<2EZ|T;%V3-_>P)2c|*)yKN%Qj0G^lwEw$F1qmTi3fwN7aLHzL3Ip zwOax6hj>Q*XueTg#7QfMY!~*Cgonp`Z^Isiem-uBy^BnM_OGgD)VjK1p0ACZudcJu z75eF2OS7Ax4P!RM`h_K$;C$$LqH?^b#uv^n@KW}(k8^Lv!NiK_d8M0~Fa~~gb=o}- z9xY0ov8yY?sS%L*MeOov}$1;HeA8zfOI49`z|vQkXKQL%5Y_*|+Jc z!A&ig^f7PBf*qgnymQ8+r#su1WB%>Ey>dI8>iX+$y}l%472*Z+L-!qV>W+?6(XQ`s zL6lM3M^S@6`Lou%S*B@}Pk66fX+I{s170^Y2{oDW2kH}i4ySjDR2h6oXZ6$Mj2H5E z-r)WVT5T_WYVT*vr^Egyce$tC!+NOn-i;+ns;Hj0|F152a%R__c3h`R(&;)}5X$j9P`{9 zF2X13%+pC1iVzRz+@tu3)|1AM=j=O&?*}w5t^zE;SF@s}gV9UL!3WH(&} zPaYT&nbN&6FDG4rse%hC7VL|Fkp>A3D<8$dn2mMg8lO}W*_)cTPi(Rvppn1)O1W1s zhG)6clXiWWn$^+tjIe~sf&Mr28U=`3-YpN;jAatH?pj(r*5V{G8oOoFpKnEeAak-# z=Mmmk;`WGj-9kD$VUobrJPzwiFl}Z?)ID&8xVccu=9SA=$b7s>hT(S)M&?yY+u3i1 
z*A2!Mi+uHg(JzvU6nDuHH$NMX)V#lp$Xfd9<$|S>1X;YuRPC4!Ou28(lcS|dWV(!~ z2rChWvAk2RFP$z9V@`kEtXAg;V`WvF=gcpI$>6A|yxL(RBf;o`gf|XfGK!b%ARyn7H{(2(6In)VQ$^NR5;-7_dtsg&WX|OB z_1|@hAm@#Cyul*?nbXSLHhk#Dyru@exYey0028Jeh$v0|3{y12*gqYXK>WcZlWbn4 ziHb1nq*G{e{TUb&-@9k5vo($@Wf7G4V!8~b>Ta3vcHBA`Q?{jg&sJlY6u2Sbp`i-_ zedboTII6+uPbnoa8Vg|DsSJ`YDiMZzeYReceU!M#d!cHRayXhNvP*%;3bQFNk@r@z z>1bJaEv|4;TKakzZ_GEIslOj4eY5Of=b1xfNqjEf6yOh&zmYvY$I~HGcj03GN09`) zOWN61Tp(`o$vinJkp&~~2x&Jh_aI=vhR)*MyKp^|j260nzjlMjR1VX0+B^@&1lrbW zP3?xvkv$&jJ}xkAr~EqYOHYZcPTPl$*5_dSvvZU3N1P=x?WcSc%~XYHX*;iY%DO`4 z8W+7+`!*6;`&3pP?^MBk8k@4=jo_WxFtt3R)_kN74E6fMP`pYD&!^1X5-MB5gZWR& z>oa>4-VLJ-)|>3OG!EApNz`|#Y!))n2KOMdVazj`SLX=wjJ9o+!*k-M_X<*6CJ-h+ z6W!U<|NooR;$zxg!Ea%-su8iA{R!@qj8oOGoL9s{X6kJxgEdF6|D@4u+YBudP+t&P zjrC7%IeQ~MA@g*p?!R3?; zQ`$J9%<4T*U!(H~LqDEM+<0^ZOxrX;gYi59hBZXLczSFOWHNhl3&-9d(r?{1=sf)r zMlejDie~MF%t>uW7hXNUs$)^>X2Xnnm~zJEh3?@5;%0uG&JCNlF#f~D2j_B0%u8%~ zjEUEhILK^$U%>1!gsI=hjF|U+36aIy(D*!d0Xp|6ej+{b+oQ0BkrY=0NI^#gHZ3{Y%iFf(p^=-qI4S8LO+OuV-PQr@PjDPE^SLoBLkEF`tcABeFX7g#Z%jOz7@p#n8$w6hfWahYikxx zY%3t%E7{Ha#3My~aOp9L+Ir{*f5Ux^pnb6|wyXK*mz}tZG34#9A#X#VwAsxq=+Ovs z3+*E)-4SL+M@_9zdsvwoqtq?5O%4r9bcBv^f2y=!wjcZsCEcgrC`6GkBob5%t{-}7lOu=s_jjL}B|R}@wam|qvsuP5eLoqq4QYpAw!yYw z^)TCz7WuRNPk#c9eugB=sjF?J{evzvn9zPRJ&XOi+P3}aX&aeP+c>y}ZRI(tZDFK+ zz^Kb7S&J%C{P2})AW91Giy@$Y{UIA`W8a#F2mNF&t z&Fu$e8;~57jm6}kta(Yz-_xe%GbAa!ET59)+>~ruORaC!Xc?46PpS2l7b$zSDhtl*=$S63MXoRT`IMnX;0Z}tY}@5m|3qZqo%3d{@lDX=XtTCe2J<6c-hLoQic51#=o65hy zftqg}L)q`FqV(b*I*$Ex{KZtf=?5si;W4G(F{Sj%4Yc3qw46ZMOB1PeO%xHJ#aoNY z>m{Y)sYh za!@?hh*S1*J4!w(qw~w7=F2K6y-tO~d(=beIbD=&@}y+*ciR7cYF~K{gsd|!o7^S+e5gzm3L z8kAlUKE%~!+LLCHY(J26TD~%} zbX*eJQXMv_v@4!+1cvqCRD( z?WjHs+Ao@)OxqFvBExt>yda747=Dh1`$K%7bHGpQC=R?I@j=@WAE+--J}4fsVeyTl z{h)fH{zvB+)d}_g@V-WTve9;F8lM~WJuT@v5Z{OOGpZxv4e2O<+|THIpghp|9+ua{ zVeungCk~CDLErBf^hJEC8MVmldJW?j7tt<}vFWw*$yioeR!EiPc(ZlR{)IQ?=Q2T$^ zh4Xwy)$dR9D4vAB$REltmEK=CAnaQIcAQ~9?*rni%s_!=V*TbYKD&O-BjH5H6*M$2 zghS*~=Kc2nv&-?fK8u=$$C_aC=uE}fw$oyAzLizkF 
zyg%)GHZ1RtL+eP?3}`D8uwMIL$BEY={;Pl9cPxKH&mZRfF?Bz|diKyh_@%#Lc$i+UBpZJe0}}jj^zKwMYQ}6^Yq{J7-r~`HR{_uz+q*??^g^w7M`bX(Z7Y|4zt6D9#rd?>gnqn=@{vmSXp8yJxe3qBihEe z3Hq1l>EL6v*QElTRdT0T9-)H!HHN3j|^N?@k+QwM#s2)b*)X_FJ e{#hT6IBcq`XZ(v)2j7ul^!`7P9}xooUjP7y{S$rw literal 0 HcmV?d00001 diff --git a/week4/project/community-notes-2025-group-5 b/week4/project/community-notes-2025-group-5 index 0d2d66d29..2608e9a83 160000 --- a/week4/project/community-notes-2025-group-5 +++ b/week4/project/community-notes-2025-group-5 @@ -1 +1 @@ -Subproject commit 0d2d66d29dea5117294eed311f2e6adb073f6561 +Subproject commit 2608e9a83baf208d10af1d88b7ca7d6ebc89d342 diff --git a/week4/project/vanessa/01_download_notes_data.sh b/week4/project/vanessa/01_download_notes_data.sh deleted file mode 100644 index c65387122..000000000 --- a/week4/project/vanessa/01_download_notes_data.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# use curl or wget to download the version 2 1gram file with all terms starting with "1", googlebooks-eng-all-1gram-20120701-1.gz -curl -o birdwatch-public-data-2025-06-16-notes.gz https://ton.twimg.com/birdwatch-public-data/2025/06/16/notes/notes-00000.zip -# update the timestamp on the resulting file using touch -# do not remove, this will keep make happy and avoid re-downloading of the data once you have it -touch birdwatch-public-data-2025-06-16-notes.gz - - \ No newline at end of file diff --git a/week4/project/vanessa/02_filter_notes_data.sh b/week4/project/vanessa/02_filter_notes_data.sh deleted file mode 100644 index ad358a89e..000000000 --- a/week4/project/vanessa/02_filter_notes_data.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# filter original 1gram file googlebooks-eng-all-1gram-20120701-1.gz to only lines where the ngram exactly matches a year (18xx, 19xx, or 20xx, where x is a digit) -# decompress the first using gunzip, zless, zcat or similar -# then filter out rows that match using grep -E, egrep, awk, or similar -# write results to year_counts.tsv - -#zcat 
birdwatch-public-data-2025-06-16-notes.gz > birdwatch-public-data-2025-06-16-notes.tsv - - -#!/bin/bash - -START_MILLI=1611360000000 #jan 23 2021 12:00:00 AM -END_MILLI=1627775999000 # july 31 2021 23:59:59 PM -OUTPUT_FILE="notes.tsv" -INPUT_FILE="C:\Users\ds3\Desktop\coursework\week4\project\vanessa\data\birdwatch-public-data-2025-06-16-notes.gz" -#1627696800000 -#end time gmt 1627732799000 - -#1611360000000 gmt start time -# Copy header -zcat "$INPUT_FILE" | head -n 1 > "$OUTPUT_FILE" - -# Filter by createdAtMillis (column 3) -zcat "$INPUT_FILE" | tail -n +2 | awk -F '\t' -v min="$START_MILLI" -v max="$END_MILLI" '{if ($3 >= min && $3 <= max) print $0;}' >> "$OUTPUT_FILE" \ No newline at end of file diff --git a/week4/project/vanessa/03_download_ratings.sh b/week4/project/vanessa/03_download_ratings.sh deleted file mode 100644 index 036d91f53..000000000 --- a/week4/project/vanessa/03_download_ratings.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# # use curl or wget to download the version 2 of the total counts file, googlebooks-eng-all-totalcounts-20120701.txt -# curl -o birdwatch-public-data/2025/06/16/noteRatings/ratings.gz https://ton.twimg.com/birdwatch-public-data/2025/06/16/noteRatings/ratings-000xx.zip -# # update the timestamp on the resulting file using touch -# # do not remove, this will keep make happy and avoid re-downloading of the data once you have it -# touch birdwatch-public-data/2025/06/16/noteRatings/ratings.gz - - -# Base URL for the ratings data -baseUrl="https://ton.twimg.com/birdwatch-public-data/2025/06/16/noteRatings/ratings-" - -# Output directory for downloaded files -outputDir="$HOME/C:\Users\ds3\Desktop\coursework\week4\project\vanessa\data\birdwatch-ratings" - -# Ensure the output directory exists -mkdir -p "$outputDir" - -# Loop through numbers 00000 to 00019 -for i in {0..19}; do - # Format the number with leading zeros - num=$(printf "%05d" $i) - - # Construct the full URL - url="${baseUrl}${num}.zip" - - # Download the file 
- echo "Downloading $url..." - curl -s -O "$url" -o "${outputDir}/ratings-${num}.zip" -done - -echo "All downloads complete." \ No newline at end of file diff --git a/week4/project/vanessa/04_reformat_ratings.sh b/week4/project/vanessa/04_reformat_ratings.sh deleted file mode 100644 index 32d6f24ed..000000000 --- a/week4/project/vanessa/04_reformat_ratings.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# Set start and end timestamps in milliseconds -start_milli=1611360000000 # Jan 23, 2021 00:00:00 UTC -end_milli=1627775999000 # July 31, 2021 23:59:59 UTC - -# Directory paths -zip_dir="C:\Users\ds3\Desktop\coursework\week4\project\vanessa\data" # Update with the correct path to your zip files -output_dir="filtered_data" # Directory to store filtered files -mkdir -p "$output_dir" # Create the output directory if it doesn't exist - -# Loop through ratings files, extract and filter by createdAtMillis (column 3) -for i in $(seq -w 0 19); do - unzip -p "$zip_dir/ratings-000${i}.zip" | awk -F'\t' -v min="$start_milli" -v max="$end_milli" \ - 'NR == 1 || ($3 >= min && $3 <= max)' > "$output_dir/filtered_ratings_${i}.tsv" -done - -# Merge filtered files, keeping the header from the first file only -head -n 1 "$output_dir/filtered_ratings_00.tsv" > complete_filtered_ratings.tsv -for i in $(seq -w 0 9); do - tail -n +2 "$output_dir/filtered_ratings_0${i}.tsv" >> complete_filtered_ratings.tsv -done - -echo "Merging complete. 
Final file: complete_filtered_ratings.tsv" - \ No newline at end of file diff --git a/week4/test_citibike_predictions.Rmd b/week4/test_citibike_predictions.Rmd index e69de29bb..58f812a64 100644 --- a/week4/test_citibike_predictions.Rmd +++ b/week4/test_citibike_predictions.Rmd @@ -0,0 +1,46 @@ +```{r load-data} + +library(tidyverse) +library(scales) +library(modelr) +library(lubridate) +library(ggplot2) + +trips_per_day_2015 <- read_tsv('./week4/trips_per_day_2015.tsv') +weather_2015 <- read_csv('./week4/weather_2015.csv') + +weather_2015 <- weather_2015 |> select("DATE", "PRCP", "SNOW", "SNWD", "TMIN", "TMAX") + +``` + +```{r join-data} + +join_weather_trip_2015 <- left_join(trips_per_day_2015, weather_2015, by = c("ymd" = "DATE")) + +join_weather_trip_2015 <- join_weather_trip_2015 %>% +mutate(is_weekend = ifelse(wday(ymd, label = TRUE) %in% c("Sat", "Sun"), 1, 0)) %>% +rename(prcp = PRCP, snow = SNOW, snwd = SNWD, tmin = TMIN, tmax = TMAX) %>% +mutate (tmin = tmin /10) %>% +mutate (tmax = tmax / 10) + + +join_weather_trip_2015 +``` + +```{r load-lm} + +model_rain_snow_weekend <- readRDS("model.RDS") + +test_2015_trip_weather <- predict(model_rain_snow_weekend, join_weather_trip_2015) + + +complete_cases <- complete.cases(test_2015_trip_weather, join_weather_trip_2015$num_trips) +test_2015_trip_weather <- test_2015_trip_weather[complete_cases] +join_weather_trip_2015 <- join_weather_trip_2015[complete_cases, ] + +test_err <- sqrt(mean((predict(model_rain_snow_weekend, join_weather_trip_2015) - join_weather_trip_2015$num_trips)^2)) +test_err + +test_2015_r_squared <- cor(test_2015_trip_weather, join_weather_trip_2015$num_trips)^2 +test_2015_r_squared +``` \ No newline at end of file