Walt Wells - 07.23-07.30.2016

Prepare Environment and Load Data

# Set Environment
# Make sure every dependency is installed AND attached. The previous
# `if (!require(pkg)) install.packages(pkg)` form installed a missing package
# but never attached it, so a first run on a fresh machine failed later when
# getURL(), fread(), revalue() or datatable() was called.
for (pkg in c("RCurl", "data.table", "DT", "plyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # library() stops with an error if the package still cannot be loaded,
  # instead of returning FALSE and letting the script continue.
  library(pkg, character.only = TRUE)
}

# Import Data
# Fetch and parse the agaricus-lepiota file only when no object named `dat`
# is already visible, so re-running the script in the same session reuses
# the existing object instead of downloading again.
if (!exists("dat")) {
  URL <- "https://raw.githubusercontent.com/wwells/CUNYBridge_R/master/agaricus-lepiota.data"
  # The downloaded text is handed straight to fread(); header = FALSE
  # because the file has no header row (columns come out as V1, V2, ...).
  x <- getURL(url = URL)
  dat <- fread(input = x, header = FALSE)
}

# Convert the fread() result (DT) into a plain data.frame (DF)
dat <- data.frame(dat)

# Preview the first six rows
head(dat, n = 6L)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
## 4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p
## 5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e
## 6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m
## 4   k   s   u
## 5   n   a   g
## 6   k   n   g

Subset

# Keep six columns: class, cap-shape, cap-surface, cap-color (V1 to V4),
# population (V22), and habitat (V23).
##  These decisions are arbitrary for the purposes of this project; we'd
##  otherwise want to do some additional data exploration before finalizing
##  our subset.
mdat <- dat[, c(1:4, 22:23)]

# Preview Data
head(mdat, n = 6L)
##   V1 V2 V3 V4 V22 V23
## 1  p  x  s  n   s   u
## 2  e  x  s  y   n   g
## 3  e  b  s  w   n   m
## 4  p  x  y  w   s   u
## 5  e  x  s  g   a   g
## 6  e  x  y  y   n   g

Rename Columns and Data

# Add one descriptive column per raw column, translating each single-letter
# code into its label with plyr::revalue(). Only `class` is additionally
# passed through factor().
mdat[["class"]] <- factor(
  revalue(mdat[["V1"]], c("e" = "edible", "p" = "poisonous"))
)
mdat[["cshape"]] <- revalue(
  mdat[["V2"]],
  c("b" = "bell", "c" = "conical", "x" = "convex",
    "f" = "flat", "k" = "knobbed", "s" = "sunken")
)
mdat[["csurface"]] <- revalue(
  mdat[["V3"]],
  c("f" = "fibrous", "g" = "grooves", "y" = "scaly", "s" = "smooth")
)
mdat[["ccolor"]] <- revalue(
  mdat[["V4"]],
  c("n" = "brown", "b" = "buff", "c" = "cinnamon", "g" = "gray",
    "r" = "green", "p" = "pink", "u" = "purple", "e" = "red",
    "w" = "white", "y" = "yellow")
)
mdat[["population"]] <- revalue(
  mdat[["V22"]],
  c("a" = "abundant", "c" = "clustered", "n" = "numerous",
    "s" = "scattered", "v" = "several", "y" = "solitary")
)
mdat[["habitat"]] <- revalue(
  mdat[["V23"]],
  c("g" = "grasses", "l" = "leaves", "m" = "meadows", "p" = "paths",
    "u" = "urban", "w" = "waste", "d" = "woods")
)

# Drop the raw V* columns, keeping only the six recoded ones (positions 7
# to 12 after the additions above), once accuracy of renaming is assured
mdat <- mdat[, c("class", "cshape", "csurface", "ccolor", "population",
                 "habitat")]

# Review the curated data as an interactive table, five rows per page
datatable(data = mdat, options = list(pageLength = 5))