Walt Wells - 07.23-07.30.2016
Prepare Environment and Load Data
# Set Environment
# Install each required package if it is missing, then attach it.
# `library()` performs the attach step so that (a) a freshly installed package
# is actually loaded on the first run, and (b) a failed install stops the
# script here instead of surfacing later as a "could not find function" error.
for (pkg in c("RCurl", "data.table", "DT", "plyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
# Import Data
# Download only when `dat` is not already defined, so re-running the script in
# the same session does not fetch the file again.
if (!exists("dat")) {
  URL <- "https://raw.githubusercontent.com/wwells/CUNYBridge_R/master/agaricus-lepiota.data"
  x <- getURL(URL)
  # Pass the downloaded payload through `text =`. As a bare first argument,
  # fread() inspects the string and, when it contains no newline (e.g. a
  # one-line error body), treats it as a file name or shell command rather
  # than as data.
  dat <- fread(text = x, header = FALSE)
}
# Change to DF instead of DT
dat <- data.frame(dat)
# Preview
head(dat)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
Subset
# Keep class, cap-shape, cap-surface, cap-color, population, and habitat.
## These choices are arbitrary for the purposes of this project; otherwise
## we would want to do some additional data exploration before finalizing
## the subset.
mdat <- dat[, paste0("V", c(1:4, 22, 23))]
# Preview the first rows of the subset
head(mdat)
## V1 V2 V3 V4 V22 V23
## 1 p x s n s u
## 2 e x s y n g
## 3 e b s w n m
## 4 p x y w s u
## 5 e x s g a g
## 6 e x y y n g
Rename Columns and Data
# Update vars and colnames by creating new cols.
# Each raw single-letter code column is recoded to readable labels with
# revalue() and stored under a descriptive name. Every recoded column is
# wrapped in factor() so that all six end up as factors regardless of whether
# data.frame() converted the raw strings to factors (this default differs
# between R versions).
mdat$class <- factor(revalue(mdat$V1, c("e" = "edible", "p" = "poisonous")))
mdat$cshape <- factor(revalue(
  mdat$V2,
  c("b" = "bell", "c" = "conical", "x" = "convex",
    "f" = "flat", "k" = "knobbed", "s" = "sunken")
))
mdat$csurface <- factor(revalue(
  mdat$V3,
  c("f" = "fibrous", "g" = "grooves", "y" = "scaly", "s" = "smooth")
))
mdat$ccolor <- factor(revalue(
  mdat$V4,
  c("n" = "brown", "b" = "buff", "c" = "cinnamon",
    "g" = "gray", "r" = "green", "p" = "pink",
    "u" = "purple", "e" = "red", "w" = "white",
    "y" = "yellow")
))
mdat$population <- factor(revalue(
  mdat$V22,
  c("a" = "abundant", "c" = "clustered",
    "n" = "numerous", "s" = "scattered",
    "v" = "several", "y" = "solitary")
))
mdat$habitat <- factor(revalue(
  mdat$V23,
  c("g" = "grasses", "l" = "leaves", "m" = "meadows",
    "p" = "paths", "u" = "urban", "w" = "waste",
    "d" = "woods")
))
# Drop the raw V* columns now that the relabelled copies exist, keeping only
# the descriptive columns (in the order they were created).
mdat <- mdat[, c("class", "cshape", "csurface",
                 "ccolor", "population", "habitat")]
# Browse the curated data in an interactive table, five rows per page
datatable(mdat, options = list(pageLength = 5))