diff --git a/R/Super Bowl boxes/q1.probs.csv b/R/Super Bowl boxes/q1.probs.csv new file mode 100644 index 0000000..ed4e923 --- /dev/null +++ b/R/Super Bowl boxes/q1.probs.csv @@ -0,0 +1,11 @@ +"0","1","2","3","4","5","6","7","8","9" +0.0457446808510638,0.025531914893617,0.025531914893617,0.0372340425531915,0.0276595744680851,0.025531914893617,0.0276595744680851,0.0382978723404255,0.025531914893617,0.0265957446808511 +0.0202127659574468,0,0,0.0117021276595745,0.00212765957446809,0,0.00212765957446809,0.0127659574468085,0,0.00106382978723404 +0.0202127659574468,0,0,0.0117021276595745,0.00212765957446809,0,0.00212765957446809,0.0127659574468085,0,0.00106382978723404 +0.0319148936170213,0.0117021276595745,0.0117021276595745,0.0234042553191489,0.0138297872340426,0.0117021276595745,0.0138297872340426,0.024468085106383,0.0117021276595745,0.0127659574468085 +0.025531914893617,0.00531914893617021,0.00531914893617021,0.0170212765957447,0.0074468085106383,0.00531914893617021,0.0074468085106383,0.0180851063829787,0.00531914893617021,0.00638297872340425 +0.0202127659574468,0,0,0.0117021276595745,0.00212765957446809,0,0.00212765957446809,0.0127659574468085,0,0.00106382978723404 +0.0202127659574468,0,0,0.0117021276595745,0.00212765957446809,0,0.00212765957446809,0.0127659574468085,0,0.00106382978723404 +0.0276595744680851,0.0074468085106383,0.0074468085106383,0.0191489361702128,0.00957446808510638,0.0074468085106383,0.00957446808510638,0.0202127659574468,0.0074468085106383,0.00851063829787234 +0.0202127659574468,0,0,0.0117021276595745,0.00212765957446809,0,0.00212765957446809,0.0127659574468085,0,0.00106382978723404 +0.0202127659574468,0,0,0.0117021276595745,0.00212765957446809,0,0.00212765957446809,0.0127659574468085,0,0.00106382978723404 diff --git a/R/Super Bowl boxes/q2.probs.csv b/R/Super Bowl boxes/q2.probs.csv new file mode 100644 index 0000000..6048fff --- /dev/null +++ b/R/Super Bowl boxes/q2.probs.csv @@ -0,0 +1,11 @@ +"0","1","2","3","4","5","6","7","8","9" +0.0297872340425532,0.0191489361702128,0.0191489361702128,0.0276595744680851,0.0212765957446809,0.0180851063829787,0.024468085106383,0.024468085106383,0.0170212765957447,0.0191489361702128 +0.0148936170212766,0.00425531914893617,0.00425531914893617,0.0127659574468085,0.00638297872340425,0.00319148936170213,0.00957446808510638,0.00957446808510638,0.00212765957446809,0.00425531914893617 +0.0127659574468085,0.00212765957446809,0.00212765957446809,0.0106382978723404,0.00425531914893617,0.00106382978723404,0.0074468085106383,0.0074468085106383,0,0.00212765957446809 +0.0202127659574468,0.00957446808510638,0.00957446808510638,0.0180851063829787,0.0117021276595745,0.00851063829787234,0.0148936170212766,0.0148936170212766,0.0074468085106383,0.00957446808510638 +0.0180851063829787,0.0074468085106383,0.0074468085106383,0.0159574468085106,0.00957446808510638,0.00638297872340425,0.0127659574468085,0.0127659574468085,0.00531914893617021,0.0074468085106383 +0.0127659574468085,0.00212765957446809,0.00212765957446809,0.0106382978723404,0.00425531914893617,0.00106382978723404,0.0074468085106383,0.0074468085106383,0,0.00212765957446809 +0.0159574468085106,0.00531914893617021,0.00531914893617021,0.0138297872340426,0.0074468085106383,0.00425531914893617,0.0106382978723404,0.0106382978723404,0.00319148936170213,0.00531914893617021 +0.0234042553191489,0.0127659574468085,0.0127659574468085,0.0212765957446809,0.0148936170212766,0.0117021276595745,0.0180851063829787,0.0180851063829787,0.0106382978723404,0.0127659574468085 +0.0159574468085106,0.00531914893617021,0.00531914893617021,0.0138297872340426,0.0074468085106383,0.00425531914893617,0.0106382978723404,0.0106382978723404,0.00319148936170213,0.00531914893617021 +0.0138297872340426,0.00319148936170213,0.00319148936170213,0.0117021276595745,0.00531914893617021,0.00212765957446809,0.00851063829787234,0.00851063829787234,0.00106382978723404,0.00319148936170213 diff --git a/R/Super Bowl boxes/q3.probs.csv b/R/Super Bowl boxes/q3.probs.csv new file mode 100644 index 0000000..b5506eb --- /dev/null +++ b/R/Super Bowl boxes/q3.probs.csv @@ -0,0 +1,11 @@ +"0","1","2","3","4","5","6","7","8","9" +0.0191489361702128,0.0117021276595745,0.00957446808510638,0.0117021276595745,0.0127659574468085,0.00957446808510638,0.0127659574468085,0.0148936170212766,0.00957446808510638,0.0127659574468085 +0.0138297872340426,0.00638297872340425,0.00425531914893617,0.00638297872340425,0.0074468085106383,0.00425531914893617,0.0074468085106383,0.00957446808510638,0.00425531914893617,0.0074468085106383 +0.0127659574468085,0.00531914893617021,0.00319148936170213,0.00531914893617021,0.00638297872340425,0.00319148936170213,0.00638297872340425,0.00851063829787234,0.00319148936170213,0.00638297872340425 +0.0202127659574468,0.0127659574468085,0.0106382978723404,0.0127659574468085,0.0138297872340426,0.0106382978723404,0.0138297872340426,0.0159574468085106,0.0106382978723404,0.0138297872340426 +0.0180851063829787,0.0106382978723404,0.00851063829787234,0.0106382978723404,0.0117021276595745,0.00851063829787234,0.0117021276595745,0.0138297872340426,0.00851063829787234,0.0117021276595745 +0.0138297872340426,0.00638297872340425,0.00425531914893617,0.00638297872340425,0.0074468085106383,0.00425531914893617,0.0074468085106383,0.00957446808510638,0.00425531914893617,0.0074468085106383 +0.0159574468085106,0.00851063829787234,0.00638297872340425,0.00851063829787234,0.00957446808510638,0.00638297872340425,0.00957446808510638,0.0117021276595745,0.00638297872340425,0.00957446808510638 +0.0276595744680851,0.0202127659574468,0.0180851063829787,0.0202127659574468,0.0212765957446809,0.0180851063829787,0.0212765957446809,0.0234042553191489,0.0180851063829787,0.0212765957446809 +0.0138297872340426,0.00638297872340425,0.00425531914893617,0.00638297872340425,0.0074468085106383,0.00425531914893617,0.0074468085106383,0.00957446808510638,0.00425531914893617,0.0074468085106383 +0.0117021276595745,0.00425531914893617,0.00212765957446809,0.00425531914893617,0.00531914893617021,0.00212765957446809,0.00531914893617021,0.0074468085106383,0.00212765957446809,0.00531914893617021 diff --git a/R/Super Bowl boxes/q4.probs.csv b/R/Super Bowl boxes/q4.probs.csv new file mode 100644 index 0000000..c610111 --- /dev/null +++ b/R/Super Bowl boxes/q4.probs.csv @@ -0,0 +1,11 @@ +"0","1","2","3","4","5","6","7","8","9" +0.0127659574468085,0.0127659574468085,0.0106382978723404,0.0106382978723404,0.0159574468085106,0.0117021276595745,0.0170212765957447,0.0212765957446809,0.00851063829787234,0.0138297872340426 +0.0138297872340426,0.0138297872340426,0.0117021276595745,0.0117021276595745,0.0170212765957447,0.0127659574468085,0.0180851063829787,0.0223404255319149,0.00957446808510638,0.0148936170212766 +0.00638297872340425,0.00638297872340425,0.00425531914893617,0.00425531914893617,0.00957446808510638,0.00531914893617021,0.0106382978723404,0.0148936170212766,0.00212765957446809,0.0074468085106383 +0.00957446808510638,0.00957446808510638,0.0074468085106383,0.0074468085106383,0.0127659574468085,0.00851063829787234,0.0138297872340426,0.0180851063829787,0.00531914893617021,0.0106382978723404 +0.00957446808510638,0.00957446808510638,0.0074468085106383,0.0074468085106383,0.0127659574468085,0.00851063829787234,0.0138297872340426,0.0180851063829787,0.00531914893617021,0.0106382978723404 +0.00638297872340425,0.00638297872340425,0.00425531914893617,0.00425531914893617,0.00957446808510638,0.00531914893617021,0.0106382978723404,0.0148936170212766,0.00212765957446809,0.0074468085106383 +0.00638297872340425,0.00638297872340425,0.00425531914893617,0.00425531914893617,0.00957446808510638,0.00531914893617021,0.0106382978723404,0.0148936170212766,0.00212765957446809,0.0074468085106383 +0.0127659574468085,0.0127659574468085,0.0106382978723404,0.0106382978723404,0.0159574468085106,0.0117021276595745,0.0170212765957447,0.0212765957446809,0.00851063829787234,0.0138297872340426 +0.0074468085106383,0.0074468085106383,0.00531914893617021,0.00531914893617021,0.0106382978723404,0.00638297872340425,0.0117021276595745,0.0159574468085106,0.00319148936170213,0.00851063829787234 +0.0074468085106383,0.0074468085106383,0.00531914893617021,0.00531914893617021,0.0106382978723404,0.00638297872340425,0.0117021276595745,0.0159574468085106,0.00319148936170213,0.00851063829787234 diff --git a/R/Super Bowl boxes/super_bowl.R b/R/Super Bowl boxes/super_bowl.R index 75e36cd..7a352d6 100644 --- a/R/Super Bowl boxes/super_bowl.R +++ b/R/Super Bowl boxes/super_bowl.R @@ -5,7 +5,7 @@ # Purpose: Gather and manipulate historic Super Bowl score data # Data Used: Wikipedia Super Bowl data # Packages Used: ggplot2, RCurl, XML -# Output File: +# Output File: q1probs.csv, q2probs.csv, q3probs.csv, q4probs.csv # Data Output: # Machine: Drew Conway's MacBook Pro @@ -17,6 +17,8 @@ library(ggplot2) library(RCurl) library(XML) +library(plyr) +library(reshape) # A function that converts a given integer into its Roman Numeral equivalent to.RomanNumeral<-function(x) { @@ -39,7 +41,7 @@ to.RomanNumeral<-function(x) { } } -# Function returns quater scores from Wikipedia Super Bown pages +# Function returns quater scores from Wikipedia Super Bowl pages get.scores<-function(numeral) { # Base URL for Wikipedia wp.url<-getURL(paste("http://en.wikipedia.org/wiki/Super_Bowl_",numeral,sep="")) @@ -72,8 +74,9 @@ get.probs<-function(score.df) { return(prob.df) } -# There have been 45 Super Bowls -bowls<-lapply(1:45, to.RomanNumeral) +# There have been 47 Super Bowls +bowl.number<-47 +bowls<-lapply(1:bowl.number, to.RomanNumeral) # Create data frame of all Super Bowl scores scores.list<-lapply(bowls, get.scores) @@ -91,7 +94,7 @@ quarters.list<-lapply(1:nrow(scores.df), function(i) c(scores.df[i,2],sum(scores quarters.df<-as.data.frame(do.call(rbind, quarters.list)) # Final data set -super.df<-cbind(scores.df, quarters.df,rep(as.factor(c("Home","Away")),45)) +super.df<-cbind(scores.df, quarters.df,rep(as.factor(c("Home","Away")),bowl.number)) names(super.df)<-c("Team","Q1","Q2","Q3","Q4","Total","SB","Q1T","Q2T","Q3T","Type") # Get digit count totals in workable data frame for visualization @@ -132,31 +135,34 @@ boxes$Q4<-melt(q4.probs)$value # Create a heatmap of probability of winning given different digit combinations by quarter q1.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q1))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ - scale_color_manual(values=c("white"="white"), legend=FALSE)+theme_bw()+ + scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ - xlab("Home Team")+ylab("Away Team")+opts(title="Heat Map of Win Probabilties -- First Quater") + xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- First Quarter") ggsave(plot=q1.heatmap, filename="images/q1_heatmap.png", height=12, width=12) q2.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q2))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ - scale_color_manual(values=c("white"="white"), legend=FALSE)+theme_bw()+ + scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ - xlab("Home Team")+ylab("Away Team")+opts(title="Heat Map of Win Probabilties -- Half Time") + xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- Half Time") ggsave(plot=q2.heatmap, filename="images/q2_heatmap.png", height=12, width=12) q3.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q3))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ - scale_color_manual(values=c("white"="white"), legend=FALSE)+theme_bw()+ + scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ - xlab("Home Team")+ylab("Away Team")+opts(title="Heat Map of Win Probabilties -- Third Quarter") + xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- Third Quarter") ggsave(plot=q3.heatmap, filename="images/q3_heatmap.png", height=12, width=12) q4.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q4))+ scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ - scale_color_manual(values=c("white"="white"), legend=FALSE)+theme_bw()+ + scale_color_manual(values=c("white"="white"), guide="none")+theme_bw()+ scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ - xlab("Home Team")+ylab("Away Team")+opts(title="Heat Map of Win Probabilties -- Final") + xlab("Home Team")+ylab("Away Team")+labs(title="Heat Map of Win Probabilities -- Final") ggsave(plot=q4.heatmap, filename="images/q4_heatmap.png", height=12, width=12) - - +### Store quarterly probabilities as local CSV so they can be used again without having to re-download from the web. +write.csv(q1.probs,"q1.probs.csv",row.names=F) +write.csv(q2.probs,"q2.probs.csv",row.names=F) +write.csv(q3.probs,"q3.probs.csv",row.names=F) +write.csv(q4.probs,"q4.probs.csv",row.names=F) \ No newline at end of file diff --git a/R/Super Bowl boxes/super_bowl_simulation.R b/R/Super Bowl boxes/super_bowl_simulation.R new file mode 100644 index 0000000..9d5aa1a --- /dev/null +++ b/R/Super Bowl boxes/super_bowl_simulation.R @@ -0,0 +1,83 @@ +# File-Name: super_bowl_simulation.R +# Date: 2013-01-05 +# Author: Will Townes +# Email: will.townes@gmail.com +# Purpose: Simulating the expected profits as a function of number of squares chosen in a super bowl digits matrix based on historic Super Bowl score data. +# Data Used: Wikipedia Super Bowl data +# Packages Used: super_bowl.R +# Output File: + +# Copyright (c) 2013, under the Simplified BSD License. +# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php +# All rights reserved. + +###Read in quarterly probabilities from local CSV files, if they exist. Otherwise call super_bowl.R to download the data from wikipedia and write the CSV files. +fnames<-c("q1.probs.csv","q2.probs.csv","q3.probs.csv","q4.probs.csv") +if(all(file.exists(fnames))){ + for(fname in fnames){ + vname<-strtrim(fname,8) + assign(vname,read.csv(fname)) + } +}else{ + print('files not found') + source('super_bowl.R') +} +#we don't care at this point about the identities of the squares (only care about the chance of winning as a function of the number of squares randomly chosen), so we can flatten quarterly probabilities into vectors and stack in data frame +probs<-as.data.frame(lapply(list(q1.probs,q2.probs,q3.probs,q4.probs),function(q){as.numeric(c(q,recursive=T))})) +colnames(probs)<-c('q1','q2','q3','q4') +cdfs<-cumsum(probs) +rq<-function(n,cdf){ + #pseudo random variate based on the quarterly probability of "winning" as defined by the empirical "cdf" vector. Returns a vector of length n. Each item in the vector is an index of a single square in the super bowl card + cdf<-sort(cdf) #cdf must be sorted in ascending order + u<-runif(n) #get uniform variates + res<-sapply(u,function(val){min(which(cdf>val))}) + #res is a vector of length n. Each element is the value of the cdf whose probability is closest to the uniform variate but still greater than it (this is basically an inverse transformation from the uniform distribution into the empirical discrete probability distribution) + return(res) +} +rchoice<-function(){ + #generates a list of samples of size 1,2,3,...100 chosen from the integers 1:100. Simulates choosing a certain number of squares from the board on a single super bowl. + res<-list() + rng<-1:100 + for(j in rng){ + res[[j]]<-sample(rng,j) + } + return(res) +} +getwinnings<-function(qwinners,choices,rewards){ + #qwinners is a 4-vector of quarterly scores, choices is a 100-list of square choices, and rewards is a 4-vector assigning the revenue value of winning a particular quarter. Returns a 100-vector with the amount of revenue gained by each group of "k" square choices + winnings<-rep(0,100) + for(i in 1:4){ + winners<-which(sapply(choices,function(squares){qwinners[i] %in% squares})) + winnings[winners]<-winnings[winners]+rewards[i] + } + return(winnings) +} +run.sim<-function(nsims,squarecost=10,rewards=c(20,20,20,40)){ + #runs the specified number of super bowl simulations and returns the expected (average for a single game) profits for each choice of square count + winners<-apply(cdfs,2,FUN=function(cdf){rq(nsims,cdf)}) #simulate a bunch of games and see the predicted scores at the end of each quarter of each game. A data frame of same dimension as cdfs + rewards<-squarecost*rewards + profit<-rep(0,100) + costs<-seq(1,100)*squarecost + for(sim in 1:nsims){ + choices<-rchoice() + profit<-profit-costs+getwinnings(winners[sim,],choices,rewards) + } + return(profit/nsims) +} +nsims<-c(10,100,1000,10000) +results<-matrix(nrow=100,ncol=length(nsims),dimnames=list(as.character(1:100),as.character(nsims))) +colnames(results)<-as.character(nsims) +for(i in 1:length(nsims)){ + results[,i]<-run.sim(nsims[i]) +} +profits<-results[,length(nsims)] +#roi<-profits/(10*1:100) +#plot of expected profits for the 10,000 simulations experiment shows high variability: +barplot(profits,xlab='Number of Squares Chosen',ylab='Average Profit ($)',main="Unpredictable variation in Profits") +scatter.smooth(1:100,profits,xlab="Number of Squares Purchased",ylab="Average Profit ($)",main="Unpredictable variation in Profits") + +#but note that as the number of experiments run increases, the range of variation between maximum and minimum profits tends toward zero: + +plot(log10(nsims),apply(results,2,max),type='b',ylim=c(min(results),max(results)),ylab="Maximum and Minimum Average Profits",xlab="Number of simulations run, log scale",main="Expected Profits squeezed to zero") +lines(log10(nsims),apply(results,2,min),type='b') +abline(0,0) \ No newline at end of file