Walt Wells, 07.31-08.11.2016

Prepare Environment and Load Data

# Set Environment
if (!require("plyr")) install.packages("plyr")
if (!require("dplyr")) install.packages("dplyr")
if (!require("hflights")) install.packages("hflights")

# Let's take an initial look at the data
# derived from:  http://www.transtats.bts.gov/DatabaseInfo.asp?DB_ID=120&Link=0
data(hflights)
str(hflights)
'data.frame':   227496 obs. of  21 variables:
 $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
 $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
 $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
 $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
 $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
 $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
 $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
 $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
 $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
 $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
 $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
 $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
 $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
 $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
 $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
 $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
 $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
 $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
 $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ CancellationCode : chr  "" "" "" "" ...
 $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...

Explore

# Look at origin patterns
table(hflights$UniqueCarrier)

   AA    AS    B6    CO    DL    EV    F9    FL    MQ    OO    UA    US 
 3244   365   695 70032  2641  2204   838  2139  4648 16061  2072  4082 
   WN    XE    YV 
45343 73053    79 
# Look at arrival delay patterns
table(cut(hflights$Distance,6), cut(hflights$ArrDelay, 6))
                     
                      (-71,105] (105,279] (279,454] (454,629] (629,803]
  (75.2,716]              98627      1745        97         6         4
  (716,1.35e+03]          95495      1675        58         6         3
  (1.35e+03,1.99e+03]     24651       567        18         6         1
  (1.99e+03,2.63e+03]       383         4         1         0         0
  (2.63e+03,3.27e+03]       115         8         1         0         0
  (3.27e+03,3.91e+03]       390         6         2         0         0
                     
                      (803,979]
  (75.2,716]                  2
  (716,1.35e+03]              1
  (1.35e+03,1.99e+03]         1
  (1.99e+03,2.63e+03]         0
  (2.63e+03,3.27e+03]         0
  (3.27e+03,3.91e+03]         1
table(hflights$UniqueCarrier, cut(hflights$ArrDelay, 6))
    
     (-71,105] (105,279] (279,454] (454,629] (629,803] (803,979]
  AA      3132        42         0         1         2         1
  AS       362         2         0         0         0         0
  B6       644        27         2         0         0         0
  CO     68264      1080        24         2         2         1
  DL      2518        64         6         2         1         0
  EV      2032        84         4         1         0         0
  F9       822        10         0         0         0         0
  FL      2074        31         4         2         0         0
  MQ      4364       118        16         2         2         2
  OO     15444       332         5         0         0         0
  UA      1968        56         3         5         0         1
  US      3976        53         1         0         0         0
  WN     43669       818        47         2         0         0
  XE     70314      1288        65         1         1         0
  YV        78         0         0         0         0         0
hflights %>% 
    group_by(UniqueCarrier) %>%
    summarise(MeanArrDelay = mean(ArrDelay, na.rm=TRUE), SDArrDelay = sd(ArrDelay, na.rm=TRUE))
Source: local data frame [15 x 3]

   UniqueCarrier MeanArrDelay SDArrDelay
1             AA    0.8917558   37.39939
2             AS    3.1923077   25.45696
3             B6    9.8588410   47.64176
4             CO    6.0986983   28.38512
5             DL    6.0841374   41.44595
6             EV    7.2569543   43.26771
7             F9    7.6682692   24.49275
8             FL    1.8536239   33.74713
9             MQ    7.1529751   47.01261
10            OO    8.6934922   30.40658
11            UA   10.4628628   47.72488
12            US   -0.6307692   25.20307
13            WN    7.5871430   30.54575
14            XE    8.1865242   29.81871
15            YV    4.0128205   18.82972
# Let's double check an assumption.   The table for unique carrier shows 70032 flights for CO carrier.   But it looks like the table showinng arrival delay bins by carrier < 70032.   Where are the missing values?  
co <- filter(hflights, UniqueCarrier == "CO")
sum(table(cut(co$ArrDelay, 6)))
[1] 69373
sum(table(cut(co$ArrDelay, 6))) + sum(co$Diverted) + sum(co$Cancelled)
[1] 70032

Subset, Clean

# OK - let's look at the airlines with more than 10,000 records.   We'll explore the relationship between those carriers and the Arrival Delays.  

sub <- hflights %>%
    select(UniqueCarrier, ArrDelay, Diverted, Cancelled)

fdat <- sub %>%
    filter(UniqueCarrier %in% c('CO', 'OO', 'WN', 'XE'))

# now we'll let's bin according to arrival time.   We'll define 5 categories: 
# early, ontime (+ or - 5 min), 5~30, 30~120, >120

ftab <- table(fdat$UniqueCarrier, cut(fdat$ArrDelay, breaks=c(-90, -5, 5, 30, 120, 1000), labels = c("Early", "OnTime", "5-30 Late", "30-120 Late", "> 120 Late")))
row.names(ftab) <- c("Continental", "SkyWest", "SouthWest", "ExpressJet")

Plots

# Let's try some Cleveland plots. 

dotchart(ftab, main= "Frequency of Arrival Delays for Airlines with > 10,000 flights", xlab="Frequency")

# Great! This gives us a good rough idea of the arrival delays for each carrier.   Now, let's normalize by calculating the percentage of occurences for each carrier's total flights. 

CO <- sum(sub$UniqueCarrier=="CO")
OO <- sum(sub$UniqueCarrier=="OO")
WN <- sum(sub$UniqueCarrier=="WN")
XE <- sum(sub$UniqueCarrier=="XE")

df <- data.frame(ftab)
df <- df %>%
    mutate(Per = ifelse(Var1 == "Continental", round(Freq/CO, 3) * 100, 
                        ifelse(Var1 == "SkyWest", round(Freq/OO, 3) * 100,
                               ifelse(Var1 == "SouthWest", round(Freq/WN, 3) * 100,
                                      round(Freq/XE, 3) * 100)
                               )
                        )
           ) 

## we double check our work, confirming that the reason that arrival delays don't add up to 100% is that they don't account for flights diverted or cancelled
sum(df$Per[df$Var1=="SouthWest"])
[1] 98.3
sum(df$Freq[df$Var1=="SouthWest"]) + sum(sub$Diverted[sub$UniqueCarrier=="WN"]) +
    sum(sub$Cancelled[sub$UniqueCarrier=="WN"])
[1] 45343
#now let's plot and see what we get
dotchart(df$Per, labels=df$Var1, groups=df$Var2, main="% of Arrival Delays for Airlines with > 10,000 flights", xlab="% of Carrier Flights")