# Setup chunk: attach packages and set global plotting / knitr defaults.
# Each statement sits on its own line so that the leading comments no longer
# swallow the code that follows them.

# load packages
library(countdown)
library(tidyverse)
library(glue)
library(lubridate)
library(scales)

# set theme for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# set width of code output
options(width = 65)

# set figure parameters for knitr
knitr::opts_chunk$set(
  fig.width = 7, # 7" width
  fig.asp = 0.618, # the golden ratio
  fig.retina = 3, # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300 # higher dpi, sharper image
)
Transforming and reshaping a single data frame (cont.)
Data: Hotel bookings
Data from two hotels: one resort and one city hotel
We want to slice it, and dice it, and juice it, and process it, so we can plot it
Monthly bookings
Come up with a plan for making the following visualization and write the pseudocode.
Livecoding
Reveal below for code developed during live coding session.
Code
# Step 1: add ordering and season information to every booking.
hotels <- hotels |>
  mutate(
    # Put the month levels in the order given by base R's `month.name`
    # so the x-axis is not sorted alphabetically.
    arrival_date_month = fct_relevel(arrival_date_month, month.name),
    # Map each month to a season; anything not matched by the first three
    # conditions falls through to "Fall".
    season = case_when(
      arrival_date_month %in% c("December", "January", "February") ~ "Winter",
      arrival_date_month %in% c("March", "April", "May") ~ "Spring",
      arrival_date_month %in% c("June", "July", "August") ~ "Summer",
      TRUE ~ "Fall"
    ),
    season = fct_relevel(season, "Winter", "Spring", "Summer", "Fall"),
    # One emoji per season, used as the point label in the plot below.
    season_emoji = case_when(
      season == "Winter" ~ "❄️",
      season == "Spring" ~ "⛅️️",
      season == "Summer" ~ "☀️",
      season == "Fall" ~ "☂️"
    )
  )

# Step 2: count bookings per hotel and arrival month, then draw one line per
# hotel with the season emoji printed at each monthly count.
hotels |>
  count(season_emoji, hotel, arrival_date_month) |>
  ggplot(
    aes(x = arrival_date_month, y = n, group = hotel, linetype = hotel)
  ) +
  geom_line(linewidth = 0.8, color = "cornsilk4") +
  geom_text(aes(label = season_emoji), size = 6, show.legend = FALSE) +
  # Abbreviated month labels (Jan, Feb, ...) keep the axis readable.
  scale_x_discrete(labels = month.abb) +
  labs(
    x = "Arrival month",
    y = "Number of bookings",
    linetype = NULL,
    title = "Number of monthly bookings",
    subtitle = "July 2015 to August 2017",
    caption = "Source: Antonio, Almeida and Nunes (2019) | TidyTuesday"
  ) +
  # Do not clip drawing to the panel, so labels near the edges stay visible.
  coord_cartesian(clip = "off") +
  theme(
    # NOTE(review): a numeric `legend.position` is deprecated from
    # ggplot2 3.5.0 in favour of `legend.position.inside`; left unchanged
    # because the ggplot2 version used by this file is not visible here.
    legend.position = c(0.12, 0.9),
    legend.box.background = element_rect(fill = "white", color = "white"),
    plot.subtitle = element_text(color = "cornsilk4"),
    plot.caption = element_text(color = "cornsilk4")
  )
A few takeaways
forcats::fct_relevel() in a mutate() is useful for custom ordering of levels of a factor variable
summarize() after group_by() with multiple variables results in a message about the grouping structure of the resulting data frame – the message can be suppressed by defining .groups (e.g., .groups = "drop" or .groups = "keep")
summarize() also lets you get away with being sloppy and not naming your new column, but that’s not recommended!
Rowwise operations
We want to calculate the total number of guests for each booking. Why does the following not work?
# A tibble: 119,390 × 4
adults children babies guests
<dbl> <dbl> <dbl> <dbl>
1 2 0 0 NA
2 2 0 0 NA
3 1 0 0 NA
4 1 0 0 NA
5 2 0 0 NA
6 2 0 0 NA
7 2 0 0 NA
8 2 0 0 NA
9 2 0 0 NA
10 2 0 0 NA
# … with 119,380 more rows
Rowwise operations
# Total number of guests per booking, computed one row at a time.
hotels |>
  select(adults, children, babies) |>
  # rowwise() makes the sum() below operate on each row's three values
  # rather than on the whole columns.
  rowwise() |>
  mutate(guests = sum(c(adults, children, babies))) |>
  filter(adults > 0, children > 0, babies > 0) # to show sum works
# A tibble: 4 × 4
hotel is_canceled mean_stays_in_weekend_nights mean_st…¹
<chr> <dbl> <dbl> <dbl>
1 City Hotel 0 0.801 2.12
2 City Hotel 1 0.788 2.27
3 Resort Hotel 0 1.13 3.01
4 Resort Hotel 1 1.34 3.44
# … with abbreviated variable name ¹mean_stays_in_week_nights
Which variables are plotted in the following visualization? Which aesthetics are they mapped to? Recreate the visualization.
Livecoding
Reveal below for code developed during live coding session.
Code
# Reshape the per-hotel / per-status summary to long form and plot the mean
# length of stay, faceted by weekday vs. weekend nights.
hotels_summary |>
  # Replace the 0/1 cancellation flag with a readable label.
  mutate(
    is_canceled = if_else(is_canceled == 0, "Not canceled", "Canceled")
  ) |>
  # Stack every column starting with "mean" into name/value pairs, dropping
  # the shared "mean_stays_in_" prefix from the names.
  pivot_longer(
    cols = starts_with("mean"),
    names_to = "day_type",
    values_to = "mean_stays",
    names_prefix = "mean_stays_in_"
  ) |>
  # Collapse the remaining column-name stubs into two display labels.
  mutate(
    day_type = if_else(str_detect(day_type, "weekend"), "Weekend", "Weekday")
  ) |>
  ggplot(
    aes(
      x = str_wrap(is_canceled, 10),
      y = mean_stays,
      group = hotel,
      color = hotel
    )
  ) +
  geom_point(show.legend = FALSE) +
  geom_line(aes(linetype = hotel), linewidth = 1) +
  facet_wrap(~day_type) +
  labs(
    x = "Booking status",
    y = "Mean number of\nnights of stay",
    color = NULL,
    linetype = NULL,
    title = "Mean number of stays",
    subtitle = "By hotel type and booking status",
    caption = "Source: Antonio, Almeida and Nunes (2019) | TidyTuesday"
  ) +
  scale_color_manual(values = c("cornsilk4", "deepskyblue3")) +
  scale_y_continuous(limits = c(0, 4), breaks = 0:4) +
  theme(legend.position = "bottom")
pivot_wider() and pivot_longer()
From tidyr
Incredibly useful for reshaping for plotting
Lots of extra arguments to help with reshaping pain!