From b82e372ab8895157f62c5ba630ca57a661e4edb5 Mon Sep 17 00:00:00 2001 From: Will Townes Date: Mon, 4 Feb 2013 15:54:53 -0500 Subject: [PATCH 1/4] made the number of super bowls a variable, fixed some minor version related bugs in the ggplot commands, added library declarations that were missing --- R/Super Bowl boxes/super_bowl.R | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/R/Super Bowl boxes/super_bowl.R b/R/Super Bowl boxes/super_bowl.R index 75e36cd..5321750 100644 --- a/R/Super Bowl boxes/super_bowl.R +++ b/R/Super Bowl boxes/super_bowl.R @@ -17,6 +17,8 @@ library(ggplot2) library(RCurl) library(XML) +library(plyr) +library(reshape) # A function that converts a given integer into its Roman Numeral equivalent to.RomanNumeral<-function(x) { @@ -72,8 +74,9 @@ get.probs<-function(score.df) { return(prob.df) } -# There have been 45 Super Bowls -bowls<-lapply(1:45, to.RomanNumeral) +# There have been 47 Super Bowls +bowl.number<-47 +bowls<-lapply(1:bowl.number, to.RomanNumeral) # Create data frame of all Super Bowl scores scores.list<-lapply(bowls, get.scores) @@ -91,7 +94,7 @@ quarters.list<-lapply(1:nrow(scores.df), function(i) c(scores.df[i,2],sum(scores quarters.df<-as.data.frame(do.call(rbind, quarters.list)) # Final data set -super.df<-cbind(scores.df, quarters.df,rep(as.factor(c("Home","Away")),45)) +super.df<-cbind(scores.df, quarters.df,rep(as.factor(c("Home","Away")),bowl.number)) names(super.df)<-c("Team","Q1","Q2","Q3","Q4","Total","SB","Q1T","Q2T","Q3T","Type") # Get digit count totals in workable data frame for visualization @@ -132,31 +135,28 @@ boxes$Q4<-melt(q4.probs)$value # Create a heatmap of probability of winning given different digit combinations by quarter q1.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q1))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ - scale_color_manual(values=c("white"="white"), legend=FALSE)+theme_bw()+ + scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ - xlab("Home Team")+ylab("Away Team")+opts(title="Heat Map of Win Probabilties -- First Quater") + xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- First Quarter") ggsave(plot=q1.heatmap, filename="images/q1_heatmap.png", height=12, width=12) q2.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q2))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ - scale_color_manual(values=c("white"="white"), legend=FALSE)+theme_bw()+ + scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ - xlab("Home Team")+ylab("Away Team")+opts(title="Heat Map of Win Probabilties -- Half Time") + xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- Half Time") ggsave(plot=q2.heatmap, filename="images/q2_heatmap.png", height=12, width=12) q3.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q3))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ - scale_color_manual(values=c("white"="white"), legend=FALSE)+theme_bw()+ + scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ - xlab("Home Team")+ylab("Away Team")+opts(title="Heat Map of Win Probabilties -- Third Quarter") + xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- Third Quarter") ggsave(plot=q3.heatmap, filename="images/q3_heatmap.png", height=12, width=12) q4.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q4))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ - scale_color_manual(values=c("white"="white"), legend=FALSE)+theme_bw()+ + scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ - xlab("Home Team")+ylab("Away Team")+opts(title="Heat Map of Win Probabilties -- Final") -ggsave(plot=q4.heatmap, filename="images/q4_heatmap.png", height=12, width=12) - - - + xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- Final") +ggsave(plot=q4.heatmap, filename="images/q4_heatmap.png", height=12, width=12) \ No newline at end of file From 534a734766644fc2d9e37f2772b5ae37b1bc73a9 Mon Sep 17 00:00:00 2001 From: Will Townes Date: Mon, 4 Feb 2013 21:21:58 -0500 Subject: [PATCH 2/4] added new functions and code to run simulations to see if choosing a certain number of squares is advantageous --- R/Super Bowl boxes/super_bowl.R | 62 ++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/R/Super Bowl boxes/super_bowl.R b/R/Super Bowl boxes/super_bowl.R index 5321750..bac3122 100644 --- a/R/Super Bowl boxes/super_bowl.R +++ b/R/Super Bowl boxes/super_bowl.R @@ -138,25 +138,79 @@ q1.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(co scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- First Quarter") -ggsave(plot=q1.heatmap, filename="images/q1_heatmap.png", height=12, width=12) +#ggsave(plot=q1.heatmap, filename="images/q1_heatmap.png", height=12, width=12) q2.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q2))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- Half Time") -ggsave(plot=q2.heatmap, filename="images/q2_heatmap.png", height=12, width=12) +#ggsave(plot=q2.heatmap, filename="images/q2_heatmap.png", height=12, width=12) q3.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q3))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- Third Quarter") -ggsave(plot=q3.heatmap, filename="images/q3_heatmap.png", height=12, width=12) +#ggsave(plot=q3.heatmap, filename="images/q3_heatmap.png", height=12, width=12) q4.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q4))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- Final") -ggsave(plot=q4.heatmap, filename="images/q4_heatmap.png", height=12, width=12) \ No newline at end of file +#ggsave(plot=q4.heatmap, filename="images/q4_heatmap.png", height=12, width=12) + +########################################################### +#Simulating the probability of success based on the number of squares chosen. +#By Will Townes (will.townes@gmail.com) + +#we don't care at this point about the identities of the squares (only care about the chance of winning as a function of the number of squares randomly chosen), so we can flatten quarterly probabilities into vectors and stack in data frame +probs<-as.data.frame(lapply(list(q1.probs,q2.probs,q3.probs,q4.probs),function(q){as.numeric(c(q,recursive=T))})) +colnames(probs)<-c('q1','q2','q3','q4') +cdfs<-cumsum(probs) +rq<-function(n,cdf){ + #pseudo random variate based on the quarterly probability of "winning" as defined by the empirical "cdf" vector. Returns a vector of length n. Each item in the vector is an index of a single square in the super bowl card + cdf<-sort(cdf) #cdf must be sorted in ascending order + u<-runif(n) #get uniform variates + res<-sapply(u,function(val){min(which(cdf>val))}) + #res is a vector of length n. Each element is the value of the cdf whose probability is closest to the uniform variate but still greater than it (this is basically an inverse transformation from the uniform distribution into the empirical discrete probability distribution) + return(res) +} +rchoice<-function(){ + #generates a list of samples of size 1,2,3,...100 chosen from the integers 1:100. Simulates choosing a certain number of squares from the board on a single super bowl. + res<-list() + rng<-1:100 + for(j in rng){ + res[[j]]<-sample(rng,j) + } + return(res) +} +getwinnings<-function(qwinners,choices,rewards){ + #qwinners is a 4-vector of quarterly scores, choices is a 100-list of square choices, and rewards is a 4-vector assigning the revenue value of winning a particular quarter. Returns a 100-vector with the amount of revenue gained by each group of "k" square choices + winnings<-rep(0,100) + for(i in 1:4){ + winners<-which(sapply(choices,function(squares){qwinners[i] %in% squares})) + winnings[winners]<-winnings[winners]+rewards[i] + } + return(winnings) +} +run.sims<-function(nsims){ + #runs the specified number of super bowl simulations and returns the expected profits for each choice of square count + winners<-apply(cdfs,2,FUN=function(cdf){rq(nsims,cdf)}) #simulate a bunch of games and see the predicted scores at the end of each quarter of each game. A data frame of same dimension as cdfs + squarecost<-10 + rewards<-squarecost*c(20,20,20,40) + profit<-rep(0,100) + costs<-seq(1,100)*squarecost + for(sim in 1:nsims){ + choices<-rchoice() + profit<-profit-costs+getwinnings(winners[sim,],choices,rewards) + } + return(profit) +} +for(i in 1:10){ + profit<-run.sims(1000) + barplot(profit) + which(profit==max(profit)) + which(profit==min(profit)) +} \ No newline at end of file From 3eae044d78cdc8988d700c46d437f72fa5fbbcfe Mon Sep 17 00:00:00 2001 From: Will Townes Date: Tue, 5 Feb 2013 20:48:04 -0500 Subject: [PATCH 3/4] split off my new code into a separate file so the original file is left relatively intact. Including the CSVs of the quarterly probabilities --- R/Super Bowl boxes/q1.probs.csv | 11 +++ R/Super Bowl boxes/q2.probs.csv | 11 +++ R/Super Bowl boxes/q3.probs.csv | 11 +++ R/Super Bowl boxes/q4.probs.csv | 11 +++ R/Super Bowl boxes/super_bowl.R | 72 ++++--------------- R/Super Bowl boxes/super_bowl_simulation.R | 81 ++++++++++++++++++++++ 6 files changed, 137 insertions(+), 60 deletions(-) create mode 100644 R/Super Bowl boxes/q1.probs.csv create mode 100644 R/Super Bowl boxes/q2.probs.csv create mode 100644 R/Super Bowl boxes/q3.probs.csv create mode 100644 R/Super Bowl boxes/q4.probs.csv create mode 100644 R/Super Bowl boxes/super_bowl_simulation.R diff --git a/R/Super Bowl boxes/q1.probs.csv b/R/Super Bowl boxes/q1.probs.csv new file mode 100644 index 0000000..ed4e923 --- /dev/null +++ b/R/Super Bowl boxes/q1.probs.csv @@ -0,0 +1,11 @@ +"0","1","2","3","4","5","6","7","8","9" +0.0457446808510638,0.025531914893617,0.025531914893617,0.0372340425531915,0.0276595744680851,0.025531914893617,0.0276595744680851,0.0382978723404255,0.025531914893617,0.0265957446808511 +0.0202127659574468,0,0,0.0117021276595745,0.00212765957446809,0,0.00212765957446809,0.0127659574468085,0,0.00106382978723404 +0.0202127659574468,0,0,0.0117021276595745,0.00212765957446809,0,0.00212765957446809,0.0127659574468085,0,0.00106382978723404 +0.0319148936170213,0.0117021276595745,0.0117021276595745,0.0234042553191489,0.0138297872340426,0.0117021276595745,0.0138297872340426,0.024468085106383,0.0117021276595745,0.0127659574468085 +0.025531914893617,0.00531914893617021,0.00531914893617021,0.0170212765957447,0.0074468085106383,0.00531914893617021,0.0074468085106383,0.0180851063829787,0.00531914893617021,0.00638297872340425 +0.0202127659574468,0,0,0.0117021276595745,0.00212765957446809,0,0.00212765957446809,0.0127659574468085,0,0.00106382978723404 +0.0202127659574468,0,0,0.0117021276595745,0.00212765957446809,0,0.00212765957446809,0.0127659574468085,0,0.00106382978723404 +0.0276595744680851,0.0074468085106383,0.0074468085106383,0.0191489361702128,0.00957446808510638,0.0074468085106383,0.00957446808510638,0.0202127659574468,0.0074468085106383,0.00851063829787234 +0.0202127659574468,0,0,0.0117021276595745,0.00212765957446809,0,0.00212765957446809,0.0127659574468085,0,0.00106382978723404 +0.0202127659574468,0,0,0.0117021276595745,0.00212765957446809,0,0.00212765957446809,0.0127659574468085,0,0.00106382978723404 diff --git a/R/Super Bowl boxes/q2.probs.csv b/R/Super Bowl boxes/q2.probs.csv new file mode 100644 index 0000000..6048fff --- /dev/null +++ b/R/Super Bowl boxes/q2.probs.csv @@ -0,0 +1,11 @@ +"0","1","2","3","4","5","6","7","8","9" +0.0297872340425532,0.0191489361702128,0.0191489361702128,0.0276595744680851,0.0212765957446809,0.0180851063829787,0.024468085106383,0.024468085106383,0.0170212765957447,0.0191489361702128 +0.0148936170212766,0.00425531914893617,0.00425531914893617,0.0127659574468085,0.00638297872340425,0.00319148936170213,0.00957446808510638,0.00957446808510638,0.00212765957446809,0.00425531914893617 +0.0127659574468085,0.00212765957446809,0.00212765957446809,0.0106382978723404,0.00425531914893617,0.00106382978723404,0.0074468085106383,0.0074468085106383,0,0.00212765957446809 +0.0202127659574468,0.00957446808510638,0.00957446808510638,0.0180851063829787,0.0117021276595745,0.00851063829787234,0.0148936170212766,0.0148936170212766,0.0074468085106383,0.00957446808510638 +0.0180851063829787,0.0074468085106383,0.0074468085106383,0.0159574468085106,0.00957446808510638,0.00638297872340425,0.0127659574468085,0.0127659574468085,0.00531914893617021,0.0074468085106383 +0.0127659574468085,0.00212765957446809,0.00212765957446809,0.0106382978723404,0.00425531914893617,0.00106382978723404,0.0074468085106383,0.0074468085106383,0,0.00212765957446809 +0.0159574468085106,0.00531914893617021,0.00531914893617021,0.0138297872340426,0.0074468085106383,0.00425531914893617,0.0106382978723404,0.0106382978723404,0.00319148936170213,0.00531914893617021 +0.0234042553191489,0.0127659574468085,0.0127659574468085,0.0212765957446809,0.0148936170212766,0.0117021276595745,0.0180851063829787,0.0180851063829787,0.0106382978723404,0.0127659574468085 +0.0159574468085106,0.00531914893617021,0.00531914893617021,0.0138297872340426,0.0074468085106383,0.00425531914893617,0.0106382978723404,0.0106382978723404,0.00319148936170213,0.00531914893617021 +0.0138297872340426,0.00319148936170213,0.00319148936170213,0.0117021276595745,0.00531914893617021,0.00212765957446809,0.00851063829787234,0.00851063829787234,0.00106382978723404,0.00319148936170213 diff --git a/R/Super Bowl boxes/q3.probs.csv b/R/Super Bowl boxes/q3.probs.csv new file mode 100644 index 0000000..b5506eb --- /dev/null +++ b/R/Super Bowl boxes/q3.probs.csv @@ -0,0 +1,11 @@ +"0","1","2","3","4","5","6","7","8","9" +0.0191489361702128,0.0117021276595745,0.00957446808510638,0.0117021276595745,0.0127659574468085,0.00957446808510638,0.0127659574468085,0.0148936170212766,0.00957446808510638,0.0127659574468085 +0.0138297872340426,0.00638297872340425,0.00425531914893617,0.00638297872340425,0.0074468085106383,0.00425531914893617,0.0074468085106383,0.00957446808510638,0.00425531914893617,0.0074468085106383 +0.0127659574468085,0.00531914893617021,0.00319148936170213,0.00531914893617021,0.00638297872340425,0.00319148936170213,0.00638297872340425,0.00851063829787234,0.00319148936170213,0.00638297872340425 +0.0202127659574468,0.0127659574468085,0.0106382978723404,0.0127659574468085,0.0138297872340426,0.0106382978723404,0.0138297872340426,0.0159574468085106,0.0106382978723404,0.0138297872340426 +0.0180851063829787,0.0106382978723404,0.00851063829787234,0.0106382978723404,0.0117021276595745,0.00851063829787234,0.0117021276595745,0.0138297872340426,0.00851063829787234,0.0117021276595745 +0.0138297872340426,0.00638297872340425,0.00425531914893617,0.00638297872340425,0.0074468085106383,0.00425531914893617,0.0074468085106383,0.00957446808510638,0.00425531914893617,0.0074468085106383 +0.0159574468085106,0.00851063829787234,0.00638297872340425,0.00851063829787234,0.00957446808510638,0.00638297872340425,0.00957446808510638,0.0117021276595745,0.00638297872340425,0.00957446808510638 +0.0276595744680851,0.0202127659574468,0.0180851063829787,0.0202127659574468,0.0212765957446809,0.0180851063829787,0.0212765957446809,0.0234042553191489,0.0180851063829787,0.0212765957446809 +0.0138297872340426,0.00638297872340425,0.00425531914893617,0.00638297872340425,0.0074468085106383,0.00425531914893617,0.0074468085106383,0.00957446808510638,0.00425531914893617,0.0074468085106383 +0.0117021276595745,0.00425531914893617,0.00212765957446809,0.00425531914893617,0.00531914893617021,0.00212765957446809,0.00531914893617021,0.0074468085106383,0.00212765957446809,0.00531914893617021 diff --git a/R/Super Bowl boxes/q4.probs.csv b/R/Super Bowl boxes/q4.probs.csv new file mode 100644 index 0000000..c610111 --- /dev/null +++ b/R/Super Bowl boxes/q4.probs.csv @@ -0,0 +1,11 @@ +"0","1","2","3","4","5","6","7","8","9" +0.0127659574468085,0.0127659574468085,0.0106382978723404,0.0106382978723404,0.0159574468085106,0.0117021276595745,0.0170212765957447,0.0212765957446809,0.00851063829787234,0.0138297872340426 +0.0138297872340426,0.0138297872340426,0.0117021276595745,0.0117021276595745,0.0170212765957447,0.0127659574468085,0.0180851063829787,0.0223404255319149,0.00957446808510638,0.0148936170212766 +0.00638297872340425,0.00638297872340425,0.00425531914893617,0.00425531914893617,0.00957446808510638,0.00531914893617021,0.0106382978723404,0.0148936170212766,0.00212765957446809,0.0074468085106383 +0.00957446808510638,0.00957446808510638,0.0074468085106383,0.0074468085106383,0.0127659574468085,0.00851063829787234,0.0138297872340426,0.0180851063829787,0.00531914893617021,0.0106382978723404 +0.00957446808510638,0.00957446808510638,0.0074468085106383,0.0074468085106383,0.0127659574468085,0.00851063829787234,0.0138297872340426,0.0180851063829787,0.00531914893617021,0.0106382978723404 +0.00638297872340425,0.00638297872340425,0.00425531914893617,0.00425531914893617,0.00957446808510638,0.00531914893617021,0.0106382978723404,0.0148936170212766,0.00212765957446809,0.0074468085106383 +0.00638297872340425,0.00638297872340425,0.00425531914893617,0.00425531914893617,0.00957446808510638,0.00531914893617021,0.0106382978723404,0.0148936170212766,0.00212765957446809,0.0074468085106383 +0.0127659574468085,0.0127659574468085,0.0106382978723404,0.0106382978723404,0.0159574468085106,0.0117021276595745,0.0170212765957447,0.0212765957446809,0.00851063829787234,0.0138297872340426 +0.0074468085106383,0.0074468085106383,0.00531914893617021,0.00531914893617021,0.0106382978723404,0.00638297872340425,0.0117021276595745,0.0159574468085106,0.00319148936170213,0.00851063829787234 +0.0074468085106383,0.0074468085106383,0.00531914893617021,0.00531914893617021,0.0106382978723404,0.00638297872340425,0.0117021276595745,0.0159574468085106,0.00319148936170213,0.00851063829787234 diff --git a/R/Super Bowl boxes/super_bowl.R b/R/Super Bowl boxes/super_bowl.R index bac3122..7a352d6 100644 --- a/R/Super Bowl boxes/super_bowl.R +++ b/R/Super Bowl boxes/super_bowl.R @@ -5,7 +5,7 @@ # Purpose: Gather and manipulate historic Super Bowl score data # Data Used: Wikipedia Super Bowl data # Packages Used: ggplot2, RCurl, XML -# Output File: +# Output File: q1probs.csv, q2probs.csv, q3probs.csv, q4probs.csv # Data Output: # Machine: Drew Conway's MacBook Pro @@ -41,7 +41,7 @@ to.RomanNumeral<-function(x) { } } -# Function returns quater scores from Wikipedia Super Bown pages +# Function returns quater scores from Wikipedia Super Bowl pages get.scores<-function(numeral) { # Base URL for Wikipedia wp.url<-getURL(paste("http://en.wikipedia.org/wiki/Super_Bowl_",numeral,sep="")) @@ -138,79 +138,31 @@ q1.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(co scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- First Quarter") -#ggsave(plot=q1.heatmap, filename="images/q1_heatmap.png", height=12, width=12) +ggsave(plot=q1.heatmap, filename="images/q1_heatmap.png", height=12, width=12) q2.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q2))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- Half Time") -#ggsave(plot=q2.heatmap, filename="images/q2_heatmap.png", height=12, width=12) +ggsave(plot=q2.heatmap, filename="images/q2_heatmap.png", height=12, width=12) q3.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q3))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- Third Quarter") -#ggsave(plot=q3.heatmap, filename="images/q3_heatmap.png", height=12, width=12) +ggsave(plot=q3.heatmap, filename="images/q3_heatmap.png", height=12, width=12) q4.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q4))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- Final") -#ggsave(plot=q4.heatmap, filename="images/q4_heatmap.png", height=12, width=12) - -########################################################### -#Simulating the probability of success based on the number of squares chosen. -#By Will Townes (will.townes@gmail.com) - -#we don't care at this point about the identities of the squares (only care about the chance of winning as a function of the number of squares randomly chosen), so we can flatten quarterly probabilities into vectors and stack in data frame -probs<-as.data.frame(lapply(list(q1.probs,q2.probs,q3.probs,q4.probs),function(q){as.numeric(c(q,recursive=T))})) -colnames(probs)<-c('q1','q2','q3','q4') -cdfs<-cumsum(probs) -rq<-function(n,cdf){ - #pseudo random variate based on the quarterly probability of "winning" as defined by the empirical "cdf" vector. Returns a vector of length n. Each item in the vector is an index of a single square in the super bowl card - cdf<-sort(cdf) #cdf must be sorted in ascending order - u<-runif(n) #get uniform variates - res<-sapply(u,function(val){min(which(cdf>val))}) - #res is a vector of length n. Each element is the value of the cdf whose probability is closest to the uniform variate but still greater than it (this is basically an inverse transformation from the uniform distribution into the empirical discrete probability distribution) - return(res) -} -rchoice<-function(){ - #generates a list of samples of size 1,2,3,...100 chosen from the integers 1:100. Simulates choosing a certain number of squares from the board on a single super bowl. - res<-list() - rng<-1:100 - for(j in rng){ - res[[j]]<-sample(rng,j) - } - return(res) -} -getwinnings<-function(qwinners,choices,rewards){ - #qwinners is a 4-vector of quarterly scores, choices is a 100-list of square choices, and rewards is a 4-vector assigning the revenue value of winning a particular quarter. Returns a 100-vector with the amount of revenue gained by each group of "k" square choices - winnings<-rep(0,100) - for(i in 1:4){ - winners<-which(sapply(choices,function(squares){qwinners[i] %in% squares})) - winnings[winners]<-winnings[winners]+rewards[i] - } - return(winnings) -} -run.sims<-function(nsims){ - #runs the specified number of super bowl simulations and returns the expected profits for each choice of square count - winners<-apply(cdfs,2,FUN=function(cdf){rq(nsims,cdf)}) #simulate a bunch of games and see the predicted scores at the end of each quarter of each game. A data frame of same dimension as cdfs - squarecost<-10 - rewards<-squarecost*c(20,20,20,40) - profit<-rep(0,100) - costs<-seq(1,100)*squarecost - for(sim in 1:nsims){ - choices<-rchoice() - profit<-profit-costs+getwinnings(winners[sim,],choices,rewards) - } - return(profit) -} -for(i in 1:10){ - profit<-run.sims(1000) - barplot(profit) - which(profit==max(profit)) - which(profit==min(profit)) -} \ No newline at end of file +ggsave(plot=q4.heatmap, filename="images/q4_heatmap.png", height=12, width=12) + +### Store quarterly probabilities as local CSV so they can be used again without having to re-download from the web. +write.csv(q1.probs,"q1.probs.csv",row.names=F) +write.csv(q2.probs,"q2.probs.csv",row.names=F) +write.csv(q3.probs,"q3.probs.csv",row.names=F) +write.csv(q4.probs,"q4.probs.csv",row.names=F) \ No newline at end of file diff --git a/R/Super Bowl boxes/super_bowl_simulation.R b/R/Super Bowl boxes/super_bowl_simulation.R new file mode 100644 index 0000000..8931097 --- /dev/null +++ b/R/Super Bowl boxes/super_bowl_simulation.R @@ -0,0 +1,81 @@ +# File-Name: super_bowl_simulation.R +# Date: 2013-01-05 +# Author: Will Townes +# Email: will.townes@gmail.com +# Purpose: Simulating the expected profits as a function of number of squares chosen in a super bowl digits matrix based on historic Super Bowl score data. +# Data Used: Wikipedia Super Bowl data +# Packages Used: super_bowl.R +# Output File: + +# Copyright (c) 2013, under the Simplified BSD License. +# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php +# All rights reserved. + +###Read in quarterly probabilities from local CSV files, if they exist. Otherwise call super_bowl.R to download the data from wikipedia and write the CSV files. +fnames<-c("q1.probs.csv","q2.probs.csv","q3.probs.csv","q4.probs.csv") +if(all(file.exists(fnames))){ + for(fname in fnames){ + vname<-strtrim(fname,8) + assign(vname,read.csv(fname)) + } +}else{ + print('files not found') + source('super_bowl.R') +} +#we don't care at this point about the identities of the squares (only care about the chance of winning as a function of the number of squares randomly chosen), so we can flatten quarterly probabilities into vectors and stack in data frame +probs<-as.data.frame(lapply(list(q1.probs,q2.probs,q3.probs,q4.probs),function(q){as.numeric(c(q,recursive=T))})) +colnames(probs)<-c('q1','q2','q3','q4') +cdfs<-cumsum(probs) +rq<-function(n,cdf){ + #pseudo random variate based on the quarterly probability of "winning" as defined by the empirical "cdf" vector. Returns a vector of length n. Each item in the vector is an index of a single square in the super bowl card + cdf<-sort(cdf) #cdf must be sorted in ascending order + u<-runif(n) #get uniform variates + res<-sapply(u,function(val){min(which(cdf>val))}) + #res is a vector of length n. Each element is the value of the cdf whose probability is closest to the uniform variate but still greater than it (this is basically an inverse transformation from the uniform distribution into the empirical discrete probability distribution) + return(res) +} +rchoice<-function(){ + #generates a list of samples of size 1,2,3,...100 chosen from the integers 1:100. Simulates choosing a certain number of squares from the board on a single super bowl. + res<-list() + rng<-1:100 + for(j in rng){ + res[[j]]<-sample(rng,j) + } + return(res) +} +getwinnings<-function(qwinners,choices,rewards){ + #qwinners is a 4-vector of quarterly scores, choices is a 100-list of square choices, and rewards is a 4-vector assigning the revenue value of winning a particular quarter. Returns a 100-vector with the amount of revenue gained by each group of "k" square choices + winnings<-rep(0,100) + for(i in 1:4){ + winners<-which(sapply(choices,function(squares){qwinners[i] %in% squares})) + winnings[winners]<-winnings[winners]+rewards[i] + } + return(winnings) +} +run.sim<-function(nsims,squarecost=10,rewards=c(20,20,20,40)){ + #runs the specified number of super bowl simulations and returns the expected (average for a single game) profits for each choice of square count + winners<-apply(cdfs,2,FUN=function(cdf){rq(nsims,cdf)}) #simulate a bunch of games and see the predicted scores at the end of each quarter of each game. A data frame of same dimension as cdfs + rewards<-squarecost*rewards + profit<-rep(0,100) + costs<-seq(1,100)*squarecost + for(sim in 1:nsims){ + choices<-rchoice() + profit<-profit-costs+getwinnings(winners[sim,],choices,rewards) + } + return(profit/nsims) +} +nsims<-c(10,100,1000,10000) +results<-matrix(nrow=100,ncol=length(nsims),dimnames=list(as.character(1:100),as.character(nsims))) +colnames(results)<-as.character(nsims) +for(i in 1:length(nsims)){ + results[,i]<-run.sim(nsims[i]) +} +#plot of expected profits for the 10,000 simulations experiment shows high variability: +barplot(results[,length(nsims)],xlab='Number of Squares Chosen',ylab='Average Profit',main="Unpredictable variation in Profits") +scatter.smooth(1:100,results[,length(nsims)],xlab="Number of Squares Purchased",ylab="Average Profit",main="Unpredictable variation in Profits") + +#but note that as the number of experiments run increases, the range of variation between maximum and minimum profits tends toward zero: + +plot(log10(nsims),apply(results,2,max),type='b',ylim=c(min(results),max(results)),ylab="Maximum and Minimum Average Profits",xlab="Number of experiments run, log scale",main="Expected Profits squeezed to zero") +lines(log10(nsims),apply(results,2,min),type='b') +abline(0,0) From 16cdd52f6677c95ace3156a1baae73ddd3271d95 Mon Sep 17 00:00:00 2001 From: Will Townes Date: Tue, 5 Feb 2013 21:46:49 -0500 Subject: [PATCH 4/4] minor update to plots --- R/Super Bowl boxes/super_bowl_simulation.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/Super Bowl boxes/super_bowl_simulation.R b/R/Super Bowl boxes/super_bowl_simulation.R index 8931097..9d5aa1a 100644 --- a/R/Super Bowl boxes/super_bowl_simulation.R +++ b/R/Super Bowl boxes/super_bowl_simulation.R @@ -70,12 +70,14 @@ colnames(results)<-as.character(nsims) for(i in 1:length(nsims)){ results[,i]<-run.sim(nsims[i]) } +profits<-results[,length(nsims)] +#roi<-profits/(10*1:100) #plot of expected profits for the 10,000 simulations experiment shows high variability: -barplot(results[,length(nsims)],xlab='Number of Squares Chosen',ylab='Average Profit',main="Unpredictable variation in Profits") -scatter.smooth(1:100,results[,length(nsims)],xlab="Number of Squares Purchased",ylab="Average Profit",main="Unpredictable variation in Profits") +barplot(profits,xlab='Number of Squares Chosen',ylab='Average Profit ($)',main="Unpredictable variation in Profits") +scatter.smooth(1:100,profits,xlab="Number of Squares Purchased",ylab="Average Profit ($)",main="Unpredictable variation in Profits") #but note that as the number of experiments run increases, the range of variation between maximum and minimum profits tends toward zero: -plot(log10(nsims),apply(results,2,max),type='b',ylim=c(min(results),max(results)),ylab="Maximum and Minimum Average Profits",xlab="Number of experiments run, log scale",main="Expected Profits squeezed to zero") +plot(log10(nsims),apply(results,2,max),type='b',ylim=c(min(results),max(results)),ylab="Maximum and Minimum Average Profits",xlab="Number of simulations run, log scale",main="Expected Profits squeezed to zero") lines(log10(nsims),apply(results,2,min),type='b') -abline(0,0) +abline(0,0) \ No newline at end of file