Walt Wells - 07.31-08.07.2016

Prepare Environment and Load Data

# Set Environment
# Install each package only when it is missing, then attach it with library().
# require() alone is not enough here: when a package is missing it returns
# FALSE, and install.packages() does not attach what it installs, so the rest
# of the script would run without the package on the search path.
# plyr is attached before dplyr so that dplyr's verbs take precedence over
# plyr's functions of the same name.
for (pkg in c("plyr", "dplyr", "DT", "ggplot2")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
        install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
}

# Let's look at baseball data
# `baseball` is the dataset made available by attaching plyr.
df <- baseball
head(df)
##            id year stint team lg  g  ab  r  h X2b X3b hr rbi sb cs bb so
## 4   ansonca01 1871     1  RC1    25 120 29 39  11   3  0  16  6  2  2  1
## 44  forceda01 1871     1  WS3    32 162 45 45   9   4  0  29  8  0  4  0
## 68  mathebo01 1871     1  FW1    19  89 15 24   3   1  0  10  2  1  2  0
## 99  startjo01 1871     1  NY2    33 161 35 58   5   1  1  34  4  2  3  0
## 102 suttoez01 1871     1  CL1    29 128 35 45   3   7  3  23  3  1  1  0
## 106 whitede01 1871     1  CL1    29 146 40 47   6   5  1  21  2  2  4  1
##     ibb hbp sh sf gidp
## 4    NA  NA NA NA   NA
## 44   NA  NA NA NA   NA
## 68   NA  NA NA NA   NA
## 99   NA  NA NA NA   NA
## 102  NA  NA NA NA   NA
## 106  NA  NA NA NA   NA

Explore and Subset

## Compare the HRs for Barry Bonds (bondsba01), Mark McGwire (mcgwima01),
## and Sammy Sosa (sosasa01)
# id source - http://www.baseball-reference.com/data/war_daily_bat.txt
sub <- filter(df, id %in% c("bondsba01", "mcgwima01", "sosasa01"))
# Cross-tabulate rows per player (rows) and year (columns)
table(sub$id, sub$year)
##            
##             1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997
##   bondsba01    1    1    1    1    1    1    1    1    1    1    1    1
##   mcgwima01    1    1    1    1    1    1    1    1    1    1    1    2
##   sosasa01     0    0    0    2    1    1    1    1    1    1    1    1
##            
##             1998 1999 2000 2001 2002 2003 2004 2005 2006 2007
##   bondsba01    1    1    1    1    1    1    1    1    1    1
##   mcgwima01    1    1    1    1    0    0    0    0    0    0
##   sosasa01     1    1    1    1    1    1    1    1    0    1
# The three players overlap from 1989 through 2001, so keep only those years.
sub <- filter(sub, year >= 1989, year <= 2001)
# Re-check the row counts per player and year on the narrowed data
table(sub$id, sub$year)
##            
##             1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000
##   bondsba01    1    1    1    1    1    1    1    1    1    1    1    1
##   mcgwima01    1    1    1    1    1    1    1    1    2    1    1    1
##   sosasa01     2    1    1    1    1    1    1    1    1    1    1    1
##            
##             2001
##   bondsba01    1
##   mcgwima01    1
##   sosasa01     1

Clean, Rename

# Some player/year combinations have two entries because the player played for
# two teams that year. Build one row per player per year:
#   a) keep only the columns we care about,
#   b) sum home runs within each year/player pair,
#   c) give the columns and the players readable names.
# `dplyr::` is spelled out for summarise() and rename() because plyr exports
# functions with the same names.
homers <- sub %>%
    # a) subset further
    select(id, year, hr) %>%
    # b) one row per year per id
    group_by(year, id) %>%
    dplyr::summarise(hr = sum(hr)) %>%
    # drop the grouping summarise() leaves behind so later steps see a plain,
    # ungrouped table
    ungroup() %>%
    # c) rename columns by name rather than by position
    dplyr::rename(Year = year, Player = id, HomeRuns = hr)

# c) replace player ids with display names
homers$Player <- factor(revalue(homers$Player, c("bondsba01" = "Barry Bonds",
                                                 "mcgwima01" = "Mark McGwire",
                                                 "sosasa01" = "Sammy Sosa")))

# Confirm this is the table we want to plot (shown five rows per page).
datatable(data = homers, options = list(pageLength = 5))

Exploratory Plots

# Histogram of annual home runs in bins of 5, one panel per player
p <- ggplot(data = homers, mapping = aes(x = HomeRuns)) +
    geom_histogram(mapping = aes(fill = Player), binwidth = 5, color = "white")
p +
    facet_grid(. ~ Player) +
    ggtitle("Annual HR, 1989-2001") +
    # fill colour repeats the panel label, so the legend is hidden
    theme(legend.position = "none")

# Boxplot of annual home runs, one box per player
p <- ggplot(data = homers, mapping = aes(x = Player, y = HomeRuns))
p +
    geom_boxplot(mapping = aes(fill = Player)) +
    ggtitle("Annual HR, 1989-2001") +
    # players are already named on the x axis: hide the legend and axis title
    theme(legend.position = "none") +
    xlab("")

# Scatterplot of home runs by year, coloured by player
p <- ggplot(data = homers, mapping = aes(x = Year, y = HomeRuns))
p +
    geom_point(mapping = aes(color = Player)) +
    # red reference line at 61 home runs, labelled just above it
    geom_hline(yintercept = 61, color = "red") +
    annotate(geom = "text", x = 1990, y = 61 + 4,
             label = "Roger Maris \n HR Record",
             size = 3, color = "red") +
    ggtitle("Annual HR, 1989-2001")