Walt Wells - 07.31-08.07.2016
Prepare Environment and Load Data
# Set Environment
# Install any package that is missing, then attach it. Calling `require()`
# followed by `install.packages()` alone leaves a freshly installed package
# unattached, so a first run would fail as soon as `baseball` or `%>%` is used.
# plyr is attached before dplyr so that dplyr's verbs mask plyr's same-named
# functions (e.g. `summarise`) rather than the reverse.
for (pkg in c("plyr", "dplyr", "DT", "ggplot2")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

# Let's look at baseball data
# (`baseball` is the example data set that ships with plyr.)
df <- baseball
head(df)
## id year stint team lg g ab r h X2b X3b hr rbi sb cs bb so
## 4 ansonca01 1871 1 RC1 25 120 29 39 11 3 0 16 6 2 2 1
## 44 forceda01 1871 1 WS3 32 162 45 45 9 4 0 29 8 0 4 0
## 68 mathebo01 1871 1 FW1 19 89 15 24 3 1 0 10 2 1 2 0
## 99 startjo01 1871 1 NY2 33 161 35 58 5 1 1 34 4 2 3 0
## 102 suttoez01 1871 1 CL1 29 128 35 45 3 7 3 23 3 1 1 0
## 106 whitede01 1871 1 CL1 29 146 40 47 6 5 1 21 2 2 4 1
## ibb hbp sh sf gidp
## 4 NA NA NA NA NA
## 44 NA NA NA NA NA
## 68 NA NA NA NA NA
## 99 NA NA NA NA NA
## 102 NA NA NA NA NA
## 106 NA NA NA NA NA
Explore and Subset
# Compare the home runs of Barry Bonds (bondsba01), Mark McGwire (mcgwima01)
# and Sammy Sosa (sosasa01).
# Player id source: http://www.baseball-reference.com/data/war_daily_bat.txt
sub <- filter(df, id %in% c("bondsba01", "mcgwima01", "sosasa01"))

# Rows per player (table rows) and season (table columns); a count of 2 means
# the player has two records for that year.
table(sub[["id"]], sub[["year"]])
##
## 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997
## bondsba01 1 1 1 1 1 1 1 1 1 1 1 1
## mcgwima01 1 1 1 1 1 1 1 1 1 1 1 2
## sosasa01 0 0 0 2 1 1 1 1 1 1 1 1
##
## 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007
## bondsba01 1 1 1 1 1 1 1 1 1 1
## mcgwima01 1 1 1 1 0 0 0 0 0 0
## sosasa01 1 1 1 1 1 1 1 1 0 1
# All three players have records in every season from 1989 through 2001,
# so restrict the comparison to that window.
sub <- filter(sub, year >= 1989, year <= 2001)

# Re-check the per-player / per-season record counts after filtering.
table(sub[["id"]], sub[["year"]])
##
## 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000
## bondsba01 1 1 1 1 1 1 1 1 1 1 1 1
## mcgwima01 1 1 1 1 1 1 1 1 2 1 1 1
## sosasa01 2 1 1 1 1 1 1 1 1 1 1 1
##
## 2001
## bondsba01 1
## mcgwima01 1
## sosasa01 1
Clean, Rename
# Some player/seasons have two entries because the player appeared for two
# teams in that year. To get one clean row per player per season we:
#   a) keep only the columns we care about,
#   b) collapse to one row per player/id per year by summing home runs,
#   c) give the columns and the players readable names.

# a) + b) subset, then total the home runs within each (year, id) pair.
# An explicit `summarise()` replaces the deprecated `summarise_each(funs(sum))`,
# and `ungroup()` ensures the result does not stay grouped by `year` when it is
# renamed, tabulated and plotted below.
homers <- sub %>%
  select(id, year, hr) %>%
  group_by(year, id) %>%
  summarise(hr = sum(hr)) %>%
  ungroup()

# c) rename -- column order after the summary is year, id, hr.
names(homers) <- c("Year", "Player", "HomeRuns")
homers$Player <- factor(revalue(homers$Player, c(
  "bondsba01" = "Barry Bonds",
  "mcgwima01" = "Mark McGwire",
  "sosasa01" = "Sammy Sosa"
)))

# let's confirm this is what we want to plot:
datatable(homers, options = list(pageLength = 5))
Exploratory Plots
# Histogram: distribution of season home-run totals, one panel per player.
p <- ggplot(data = homers, mapping = aes(x = HomeRuns)) +
  geom_histogram(
    mapping = aes(fill = Player),
    binwidth = 5,
    color = "white"
  )
# The facet strips already name each player, so the fill legend is hidden.
p +
  facet_grid(. ~ Player) +
  labs(title = "Annual HR, 1989-2001") +
  theme(legend.position = "none")
data:image/s3,"s3://crabby-images/fdfa0/fdfa0a865e240b5714130702fe893c6aa9f56fc0" alt=""
# Boxplot: spread of season home-run totals for each player.
p <- ggplot(data = homers, mapping = aes(x = Player, y = HomeRuns))
# The x axis already labels each player, so drop the legend and the axis title.
p +
  geom_boxplot(mapping = aes(fill = Player)) +
  labs(title = "Annual HR, 1989-2001", x = "") +
  theme(legend.position = "none")
data:image/s3,"s3://crabby-images/d35d1/d35d1caccb99b8e512125739ff1a85c6e8593fc3" alt=""
# Scatterplot: season home-run totals by year, colored by player, with a red
# reference line at 61 labelled as Roger Maris' HR record.
p <- ggplot(data = homers, mapping = aes(x = Year, y = HomeRuns))
p +
  geom_point(mapping = aes(color = Player)) +
  geom_hline(yintercept = 61, color = "red") +
  # Place the label slightly above the reference line, at the left of the plot.
  annotate(
    "text",
    x = 1990,
    y = 61 + 4,
    label = "Roger Maris \n HR Record",
    size = 3,
    color = "red"
  ) +
  labs(title = "Annual HR, 1989-2001")
data:image/s3,"s3://crabby-images/0f221/0f22149cd3c65a137cadd93786f340b39eb607b6" alt=""