Class Notes

With Prof. Lyford

Published

September 25, 2025

These are the live code notes from Professor Lyford. Note that there’s another set of notes on the schedule that gives more detailed descriptions of these functions using the books dataset from Wednesday!

# install the package first if necessary: install.packages("nycflights23")
library(nycflights23)
library(tidyverse)
# What is the average departure delay for each carrier's flights?
# (By carrier, what is the avg flight delay?)

# The long way: filter down to a single carrier, then summarize.
# na.rm = TRUE makes mean() skip missing dep_delay values.
flights |>
  filter(carrier == "DL") |>
  summarize(mean_delay = mean(dep_delay, na.rm = TRUE))
# A tibble: 1 × 1
  mean_delay
       <dbl>
1       15.1
# Repeat the same filter-then-summarize steps for carrier "AA".
flights |>
  filter(carrier == "AA") |>
  summarize(mean_delay = mean(dep_delay, na.rm = TRUE))
# A tibble: 1 × 1
  mean_delay
       <dbl>
1       14.2

group_by() for grouped operations

# Introducing group_by(): summarize() now returns one mean per carrier,
# so there is no need to filter the carriers one at a time.
flights |>
  group_by(carrier) |>
  summarize(mean_delay = mean(dep_delay, na.rm = TRUE))
# A tibble: 14 × 2
   carrier mean_delay
   <chr>        <dbl>
 1 9E            7.44
 2 AA           14.2 
 3 AS           12.0 
 4 B6           23.8 
 5 DL           15.1 
 6 F9           35.7 
 7 G4            3.98
 8 HA           22.9 
 9 MQ           10.5 
10 NK           18.2 
11 OO           19.8 
12 UA           17.6 
13 WN           16.1 
14 YX            4.21
# Alternative NA handling: drop rows with a missing dep_delay before grouping.
# After this filter, na.rm = TRUE is redundant (but harmless); the result is
# the same as without the filter.
flights |>
  filter(!is.na(dep_delay)) |>
  group_by(carrier) |>
  summarize(mean_delay = mean(dep_delay, na.rm = TRUE))
# A tibble: 14 × 2
   carrier mean_delay
   <chr>        <dbl>
 1 9E            7.44
 2 AA           14.2 
 3 AS           12.0 
 4 B6           23.8 
 5 DL           15.1 
 6 F9           35.7 
 7 G4            3.98
 8 HA           22.9 
 9 MQ           10.5 
10 NK           18.2 
11 OO           19.8 
12 UA           17.6 
13 WN           16.1 
14 YX            4.21

arrange() to sort rows

# Sort carriers from largest to smallest mean departure delay.
# desc() is dplyr's helper for descending order inside arrange(); unlike
# negating the column, it also works for non-numeric columns.
flights |>
  group_by(carrier) |>
  summarize(mean_delay = mean(dep_delay, na.rm = TRUE)) |>
  arrange(desc(mean_delay))
# A tibble: 14 × 2
   carrier mean_delay
   <chr>        <dbl>
 1 F9           35.7 
 2 B6           23.8 
 3 HA           22.9 
 4 OO           19.8 
 5 NK           18.2 
 6 UA           17.6 
 7 WN           16.1 
 8 DL           15.1 
 9 AA           14.2 
10 AS           12.0 
11 MQ           10.5 
12 9E            7.44
13 YX            4.21
14 G4            3.98
# Keep only flights that actually left late (dep_delay > 0), so this is the
# mean delay among delayed flights, not across all flights. The filter also
# drops rows where dep_delay is NA.
flights |>
  filter(dep_delay > 0) |>
  group_by(carrier) |>
  summarize(mean_delay = mean(dep_delay, na.rm = TRUE)) |>
  arrange(desc(mean_delay))
# A tibble: 14 × 2
   carrier mean_delay
   <chr>        <dbl>
 1 OO            66.6
 2 F9            63.1
 3 AA            55.7
 4 B6            55.7
 5 NK            51.8
 6 UA            45.7
 7 DL            44.8
 8 9E            44.8
 9 AS            43.4
10 YX            43.4
11 G4            41.0
12 HA            40.9
13 WN            29.3
14 MQ            27.2
# Let's find the average departure delay by origin, counting only the
# flights that departed late (dep_delay > 0)
flights |>
  filter(dep_delay > 0) |>
  group_by(origin) |>
  summarize(mean_delay = mean(dep_delay, na.rm = TRUE)) |>
  arrange(desc(mean_delay))
# A tibble: 3 × 2
  origin mean_delay
  <chr>       <dbl>
1 JFK          50.4
2 LGA          47.7
3 EWR          45.7

select() to choose certain columns

# Keep only the carrier and dep_delay columns; every row is retained.
flights |>
  select(carrier, dep_delay)
# A tibble: 435,352 × 2
   carrier dep_delay
   <chr>       <dbl>
 1 UA            203
 2 DL             78
 3 B6             47
 4 B6            173
 5 UA            228
 6 AA              3
 7 B6             10
 8 AA             -6
 9 UA             17
10 NK              2
# ℹ 435,342 more rows
# Mean of every selected column, computed separately for each carrier.
# across(everything(), ...) is the current replacement for the superseded
# summarize_all() -- across() can be a helpful function! Grouping columns are
# skipped automatically, and na.rm is spelled TRUE because T is an ordinary
# variable that can be reassigned.
flights |>
  select(carrier, dep_delay, arr_delay, air_time) |>
  group_by(carrier) |>
  summarize(across(everything(), \(x) mean(x, na.rm = TRUE)))
# A tibble: 14 × 4
   carrier dep_delay arr_delay air_time
   <chr>       <dbl>     <dbl>    <dbl>
 1 9E           7.44   -2.23       79.1
 2 AA          14.2     5.27      166. 
 3 AS          12.0     0.0844    331. 
 4 B6          23.8    15.6       161. 
 5 DL          15.1     1.64      179. 
 6 F9          35.7    26.2       137. 
 7 G4           3.98   -5.88      107. 
 8 HA          22.9    21.4       623. 
 9 MQ          10.5     0.119     118. 
10 NK          18.2     9.89      154. 
11 OO          19.8    13.7       105. 
12 UA          17.6     9.04      173. 
13 WN          16.1     5.76      149. 
14 YX           4.21   -4.64       80.9

piping directly into ggplot()

# Boxplot of all departure delays: the pipe hands flights to ggplot() as its
# data, and the aesthetic mapping is declared once for the whole plot.
flights |>
  ggplot(aes(x = dep_delay)) +
  geom_boxplot()
Warning: Removed 10738 rows containing non-finite outside the scale range
(`stat_boxplot()`).