ggplot2 ecosystem
& designing visualizations

Lecture 10

Dr. Colin Rundel

The wider ggplot2 ecosystem

ggthemes

ggplot2 themes

# Base plot: penguin body mass by species, one filled box per species.
g = ggplot(
  data = palmerpenguins::penguins,
  mapping = aes(x = species, y = body_mass_g, fill = species)
) +
  geom_boxplot()
g

# Complete themes that ship with ggplot2, each applied to the plot `g`.
g + ggplot2::theme_dark()

g + ggplot2::theme_minimal()

g + ggplot2::theme_void()

ggthemes

# Each ggthemes theme is paired with its matching fill scale.

# The Economist
g + ggthemes::theme_economist() + ggthemes::scale_fill_economist()

# FiveThirtyEight
g + ggthemes::theme_fivethirtyeight() + ggthemes::scale_fill_fivethirtyeight()

# Google Docs
g + ggthemes::theme_gdocs() + ggthemes::scale_fill_gdocs()

# Wall Street Journal
g + ggthemes::theme_wsj() + ggthemes::scale_fill_wsj()

And for those who miss Excel

# Classic Excel look
g + ggthemes::theme_excel() + ggthemes::scale_fill_excel()

# Newer Excel look
g + ggthemes::theme_excel_new() + ggthemes::scale_fill_excel_new()

# Cars from mtcars with weight strictly between 2.75 and 3.45, keeping the
# row names as an explicit `car` column for labelling.
d = tibble(
  car    = rownames(mtcars),
  weight = mtcars[["wt"]],
  mpg    = mtcars[["mpg"]]
) %>%
  filter(weight > 2.75 & weight < 3.45)

# Plain text labels are drawn at the point coordinates.
ggplot(data = d, mapping = aes(x = weight, y = mpg)) +
  geom_point(color = "red") +
  geom_text(mapping = aes(label = car))

# Same plot, with labels placed by ggrepel instead of geom_text().
ggplot(data = d, mapping = aes(x = weight, y = mpg)) +
  geom_point(color = "red") +
  ggrepel::geom_text_repel(mapping = aes(label = car))

# ggrepel labels with explicit nudge, padding, and arrow/segment settings.
ggplot(data = d, mapping = aes(x = weight, y = mpg)) +
  geom_point(color = "red") +
  ggrepel::geom_text_repel(
    mapping = aes(label = car),
    nudge_x = 0.1,
    box.padding = 1,
    point.padding = 0.6,
    arrow = arrow(length = unit(0.02, "npc")),
    segment.alpha = 0.25
  )

ggplot objects

library(patchwork)

# Four standalone penguin plots, combined by the patchwork examples.

# Body mass by island
p1 = ggplot(data = palmerpenguins::penguins) +
  geom_boxplot(mapping = aes(x = island, y = body_mass_g))

# Body mass by species
p2 = ggplot(data = palmerpenguins::penguins) +
  geom_boxplot(mapping = aes(x = species, y = body_mass_g))

# Body mass against flipper length, colored by sex
p3 = ggplot(data = palmerpenguins::penguins) +
  geom_point(mapping = aes(x = flipper_length_mm, y = body_mass_g, color = sex))

# Body mass against bill length, colored by sex
p4 = ggplot(data = palmerpenguins::penguins) +
  geom_point(mapping = aes(x = bill_length_mm, y = body_mass_g, color = sex))

# Inspect the class of a plot object
class(p1)
[1] "gg"     "ggplot"

# `+` combines plots using patchwork's default layout.
p1 + p2 + p3 + p4

# Force all four plots into a single row.
p1 + p2 + p3 + p4 +
  plot_layout(nrow = 1)

# `/` stacks: p1 on top, the other three below.
p1 / (p2 + p3 + p4)

# Add an overall title and tag the panels starting from "A".
p1 + p2 + p3 + p4 +
  plot_annotation(title = "Palmer Penguins", tag_levels = "A")

# Nested layouts; the innermost group starts a new tag level.
p1 + (
  p2 + (
    p3 + p4 + plot_layout(ncol = 1) + plot_layout(tag_level = 'new')
  )
) +
  plot_layout(ncol = 1) +
  plot_annotation(tag_levels = c("1", "a"), tag_prefix = "Fig ")

GGally

# Pairs-plot matrix over every column of the penguins data.
GGally::ggpairs(data = palmerpenguins::penguins)

# Copy airquality and replace the numeric month with its English name.
airq = airquality
airq[["Month"]] = month.name[airq[["Month"]]]

# Animated temperature lines, one per month, revealed along Day.
ggplot(data = airq, mapping = aes(x = Day, y = Temp, group = Month)) +
  geom_line() +
  # Dashed guide from the current point out to day 31
  geom_segment(
    mapping = aes(xend = 31, yend = Temp),
    linetype = 2, colour = 'grey'
  ) +
  geom_point(size = 2) +
  # Month label placed just past day 31, left-aligned
  geom_text(mapping = aes(x = 31.1, label = Month), hjust = 0) +
  gganimate::transition_reveal(Day) +
  # Clipping is turned off for the panel
  coord_cartesian(clip = 'off') +
  labs(title = 'Temperature in New York', y = 'Temperature (°F)') +
  theme_minimal() +
  # Wider right margin than the other three sides
  theme(plot.margin = margin(t = 5.5, r = 40, b = 5.5, l = 5.5))

More extensions

Why do we visualize?

Anscombe’s Quartet

# Show the anscombe data set as a tibble.
as_tibble(datasets::anscombe)
# A tibble: 11 × 8
      x1    x2    x3    x4    y1    y2    y3    y4
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1    10    10    10     8  8.04  9.14  7.46  6.58
 2     8     8     8     8  6.95  8.14  6.77  5.76
 3    13    13    13     8  7.58  8.74 12.7   7.71
 4     9     9     9     8  8.81  8.77  7.11  8.84
 5    11    11    11     8  8.33  9.26  7.81  8.47
 6    14    14    14     8  9.96  8.1   8.84  7.04
 7     6     6     6     8  7.24  6.13  6.08  5.25
 8     4     4     4    19  4.26  3.1   5.39 12.5 
 9    12    12    12     8 10.8   9.13  8.15  5.56
10     7     7     7     8  4.82  7.26  6.42  7.91
11     5     5     5     8  5.68  4.74  5.73  6.89

Tidy anscombe

# Reshape anscombe (columns x1..x4, y1..y4) into one row per observation with
# columns `group`, `x`, `y`.
#
# Each column name is split after its first character: the leading letter
# becomes the output column (via the ".value" sentinel) and the trailing digit
# becomes `group`. This replaces the longer -> wider (list-columns) -> unnest
# round trip with a single pivot. arrange() then puts all rows of group 1
# first, then group 2, and so on.
# The outer parentheses print the result as well as assigning it.
(tidy_anscombe = datasets::anscombe %>%
  pivot_longer(
    everything(),
    names_to = c(".value", "group"),
    names_sep = 1
  ) %>%
  arrange(group))
# A tibble: 44 × 3
   group     x     y
   <chr> <dbl> <dbl>
 1 1        10  8.04
 2 1         8  6.95
 3 1        13  7.58
 4 1         9  8.81
 5 1        11  8.33
 6 1        14  9.96
 7 1         6  7.24
 8 1         4  4.26
 9 1        12 10.8 
10 1         7  4.82
# ℹ 34 more rows

# Per-group means, standard deviations, and x-y correlation.
tidy_anscombe %>%
  group_by(group) %>%
  summarise(
    mean_x = mean(x),
    mean_y = mean(y),
    sd_x   = sd(x),
    sd_y   = sd(y),
    cor    = cor(x, y),
    .groups = "drop"
  )
# A tibble: 4 × 6
  group mean_x mean_y  sd_x  sd_y   cor
  <chr>  <dbl>  <dbl> <dbl> <dbl> <dbl>
1 1          9   7.50  3.32  2.03 0.816
2 2          9   7.50  3.32  2.03 0.816
3 3          9   7.5   3.32  2.03 0.816
4 4          9   7.50  3.32  2.03 0.817

# One panel per group, each with its points and a linear fit drawn across the
# full x range; the color legend is hidden.
ggplot(
  data = tidy_anscombe,
  mapping = aes(x = x, y = y, color = as.factor(group))
) +
  geom_point(size = 2) +
  facet_wrap(~ group) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, fullrange = TRUE) +
  guides(color = "none")

DatasauRus

# Scatterplot of every data set in the datasaurus dozen, five panels per row.
ggplot(
  data = datasauRus::datasaurus_dozen,
  mapping = aes(x = x, y = y)
) +
  geom_point() +
  facet_wrap(~ dataset, ncol = 5)

# Print the underlying data.
datasauRus::datasaurus_dozen
# A tibble: 1,846 × 3
   dataset     x     y
   <chr>   <dbl> <dbl>
 1 dino     55.4  97.2
 2 dino     51.5  96.0
 3 dino     46.2  94.5
 4 dino     42.8  91.4
 5 dino     40.8  88.3
 6 dino     38.7  84.9
 7 dino     35.6  79.9
 8 dino     33.1  77.6
 9 dino     29.0  74.5
10 dino     26.2  71.4
# ℹ 1,836 more rows
# Per-dataset means, standard deviations, and x-y correlation.
datasauRus::datasaurus_dozen %>%
  group_by(dataset) %>%
  summarise(
    mean_x = mean(x),
    mean_y = mean(y),
    sd_x   = sd(x),
    sd_y   = sd(y),
    cor    = cor(x, y),
    .groups = "drop"
  )
# A tibble: 13 × 6
   dataset    mean_x mean_y  sd_x  sd_y     cor
   <chr>       <dbl>  <dbl> <dbl> <dbl>   <dbl>
 1 away         54.3   47.8  16.8  26.9 -0.0641
 2 bullseye     54.3   47.8  16.8  26.9 -0.0686
 3 circle       54.3   47.8  16.8  26.9 -0.0683
 4 dino         54.3   47.8  16.8  26.9 -0.0645
 5 dots         54.3   47.8  16.8  26.9 -0.0603
 6 h_lines      54.3   47.8  16.8  26.9 -0.0617
 7 high_lines   54.3   47.8  16.8  26.9 -0.0685
 8 slant_down   54.3   47.8  16.8  26.9 -0.0690
 9 slant_up     54.3   47.8  16.8  26.9 -0.0686
10 star         54.3   47.8  16.8  26.9 -0.0630
11 v_lines      54.3   47.8  16.8  26.9 -0.0694
12 wide_lines   54.3   47.8  16.8  26.9 -0.0666
13 x_shape      54.3   47.8  16.8  26.9 -0.0656

Simpson’s Paradox

Simpson’s Paradox

Designing effective visualizations

Gapminder

Keep it simple




Judging relative area

Use color to draw attention



Tell a story


Leave out non-story details



Ordering matters

Clearly indicate missing data


Reduce cognitive load


Use descriptive titles

Annotate figures


All of the data doesn’t tell a story

All of the data doesn’t tell a story

All of the data doesn’t tell a story

Chart Remakes / Makeovers

The Why Axis - Gender Gap

The Why Axis - BLS

Other Resources

  • Duke Library - Center for Data and Visualization Sciences - https://library.duke.edu/data/

  • TidyTuesday - https://github.com/rfordatascience/tidytuesday

  • FlowingData - https://flowingdata.com/

  • Twitter - #dataviz, #tidytuesday

  • Books:

    • Wickham, Navarro, Pedersen. ggplot2: Elegant Graphics for Data Analysis. 3rd edition. Springer, 2021.
    • Wilke. Fundamentals of Data Visualization. O’Reilly Media, 2019.
    • Healy. Data Visualization: A Practical Introduction. Princeton University Press, 2018.
    • Tufte. The Visual Display of Quantitative Information. 2nd edition. Connecticut Graphics Press, 2015.

Acknowledgments

Above materials are derived in part from the following sources:

  • Visualization training materials developed by Angela Zoss and Eric Monson, Duke DVS