Walt Wells, 07.31-08.11.2016
Prepare Environment and Load Data
# Set Environment
if (!require("plyr")) install.packages("plyr")
if (!require("dplyr")) install.packages("dplyr")
if (!require("hflights")) install.packages("hflights")
# Let's take an initial look at the data
# derived from: http://www.transtats.bts.gov/DatabaseInfo.asp?DB_ID=120&Link=0
data(hflights)
str(hflights)
'data.frame': 227496 obs. of 21 variables:
$ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
$ Month : int 1 1 1 1 1 1 1 1 1 1 ...
$ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
$ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
$ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
$ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
$ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
$ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
$ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
$ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
$ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
$ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
$ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
$ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
$ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
$ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
$ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
$ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
$ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
$ CancellationCode : chr "" "" "" "" ...
$ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
Explore
# Look at origin patterns
table(hflights$UniqueCarrier)
AA AS B6 CO DL EV F9 FL MQ OO UA US
3244 365 695 70032 2641 2204 838 2139 4648 16061 2072 4082
WN XE YV
45343 73053 79
# Look at arrival delay patterns
table(cut(hflights$Distance,6), cut(hflights$ArrDelay, 6))
(-71,105] (105,279] (279,454] (454,629] (629,803]
(75.2,716] 98627 1745 97 6 4
(716,1.35e+03] 95495 1675 58 6 3
(1.35e+03,1.99e+03] 24651 567 18 6 1
(1.99e+03,2.63e+03] 383 4 1 0 0
(2.63e+03,3.27e+03] 115 8 1 0 0
(3.27e+03,3.91e+03] 390 6 2 0 0
(803,979]
(75.2,716] 2
(716,1.35e+03] 1
(1.35e+03,1.99e+03] 1
(1.99e+03,2.63e+03] 0
(2.63e+03,3.27e+03] 0
(3.27e+03,3.91e+03] 1
table(hflights$UniqueCarrier, cut(hflights$ArrDelay, 6))
(-71,105] (105,279] (279,454] (454,629] (629,803] (803,979]
AA 3132 42 0 1 2 1
AS 362 2 0 0 0 0
B6 644 27 2 0 0 0
CO 68264 1080 24 2 2 1
DL 2518 64 6 2 1 0
EV 2032 84 4 1 0 0
F9 822 10 0 0 0 0
FL 2074 31 4 2 0 0
MQ 4364 118 16 2 2 2
OO 15444 332 5 0 0 0
UA 1968 56 3 5 0 1
US 3976 53 1 0 0 0
WN 43669 818 47 2 0 0
XE 70314 1288 65 1 1 0
YV 78 0 0 0 0 0
hflights %>%
group_by(UniqueCarrier) %>%
summarise(MeanArrDelay = mean(ArrDelay, na.rm=TRUE), SDArrDelay = sd(ArrDelay, na.rm=TRUE))
Source: local data frame [15 x 3]
UniqueCarrier MeanArrDelay SDArrDelay
1 AA 0.8917558 37.39939
2 AS 3.1923077 25.45696
3 B6 9.8588410 47.64176
4 CO 6.0986983 28.38512
5 DL 6.0841374 41.44595
6 EV 7.2569543 43.26771
7 F9 7.6682692 24.49275
8 FL 1.8536239 33.74713
9 MQ 7.1529751 47.01261
10 OO 8.6934922 30.40658
11 UA 10.4628628 47.72488
12 US -0.6307692 25.20307
13 WN 7.5871430 30.54575
14 XE 8.1865242 29.81871
15 YV 4.0128205 18.82972
# Let's double check an assumption. The table for unique carrier shows 70032 flights for CO carrier. But it looks like the table showinng arrival delay bins by carrier < 70032. Where are the missing values?
co <- filter(hflights, UniqueCarrier == "CO")
sum(table(cut(co$ArrDelay, 6)))
[1] 69373
sum(table(cut(co$ArrDelay, 6))) + sum(co$Diverted) + sum(co$Cancelled)
[1] 70032
Subset, Clean
# OK - let's look at the airlines with more than 10,000 records. We'll explore the relationship between those carriers and the Arrival Delays.
sub <- hflights %>%
select(UniqueCarrier, ArrDelay, Diverted, Cancelled)
fdat <- sub %>%
filter(UniqueCarrier %in% c('CO', 'OO', 'WN', 'XE'))
# now we'll let's bin according to arrival time. We'll define 5 categories:
# early, ontime (+ or - 5 min), 5~30, 30~120, >120
ftab <- table(fdat$UniqueCarrier, cut(fdat$ArrDelay, breaks=c(-90, -5, 5, 30, 120, 1000), labels = c("Early", "OnTime", "5-30 Late", "30-120 Late", "> 120 Late")))
row.names(ftab) <- c("Continental", "SkyWest", "SouthWest", "ExpressJet")
Plots
# Let's try some Cleveland plots.
dotchart(ftab, main= "Frequency of Arrival Delays for Airlines with > 10,000 flights", xlab="Frequency")

# Great! This gives us a good rough idea of the arrival delays for each carrier. Now, let's normalize by calculating the percentage of occurences for each carrier's total flights.
CO <- sum(sub$UniqueCarrier=="CO")
OO <- sum(sub$UniqueCarrier=="OO")
WN <- sum(sub$UniqueCarrier=="WN")
XE <- sum(sub$UniqueCarrier=="XE")
df <- data.frame(ftab)
df <- df %>%
mutate(Per = ifelse(Var1 == "Continental", round(Freq/CO, 3) * 100,
ifelse(Var1 == "SkyWest", round(Freq/OO, 3) * 100,
ifelse(Var1 == "SouthWest", round(Freq/WN, 3) * 100,
round(Freq/XE, 3) * 100)
)
)
)
## we double check our work, confirming that the reason that arrival delays don't add up to 100% is that they don't account for flights diverted or cancelled
sum(df$Per[df$Var1=="SouthWest"])
[1] 98.3
sum(df$Freq[df$Var1=="SouthWest"]) + sum(sub$Diverted[sub$UniqueCarrier=="WN"]) +
sum(sub$Cancelled[sub$UniqueCarrier=="WN"])
[1] 45343
#now let's plot and see what we get
dotchart(df$Per, labels=df$Var1, groups=df$Var2, main="% of Arrival Delays for Airlines with > 10,000 flights", xlab="% of Carrier Flights")
