library(tidyverse)
Presentation 1: Base R and Tidyverse
Load Packages
Load Dataset
Here we load a dataframe that contains sales data for employees (in thousands DKK) from 2020 to 2023. We load the data as df_baseR
which we will use to demonstrate base R commands. A copy of the same dataframe, df_tidyverse
, is used to demonstrate tidyverse commands.
<- readxl::read_excel('../data/df_sales_1.xlsx') %>% as.data.frame()
df_baseR <- readxl::read_excel('../data/df_sales_1.xlsx') %>% as_tibble() df_tidyverse
Table format in baseR is called data.frame. Have a look at the object in the terminal. It is very simple.
class(df_baseR)
[1] "data.frame"
Table format in tidyverse is called tibble. Have a look at the object in the terminal. The dimensions of the tibble is provided together with the classes of each column.
class(df_tidyverse)
[1] "tbl_df" "tbl" "data.frame"
Base R and Tidyverse
Access the Age
column:
# Base R
'Age']
df_baseR[
# Tidyverse
%>% select(Age) df_tidyverse
Access the Age
column as a vector:
# Base R
'Age']]
df_baseR[[$Age
df_tidyverse
# Tidyverse
%>% pull(Age) df_tidyverse
Add a column containing the difference in sales in 2022 and 2022.
# Base R
$sales_diff <- df_baseR$sales_2022 - df_baseR$sales_2020
df_baseR
# Tidyverse
<- df_tidyverse %>%
df_tidyverse mutate(sales_diff = sales_2022 - sales_2020)
Remove the sales_diff
column.
# Base R
$sales_diff <- NULL
df_baseR
# Tidyverse
<- df_tidyverse %>%
df_tidyverse select(!sales_diff)
Select columns with sales numbers.
# Base R
startsWith(colnames(df_baseR), 'sales_')]
df_baseR[,
# Tidyverse
%>%
df_tidyverse select(starts_with('sales_'))
Filter rows for people older than 25.
# Base R
$Age > 25 ,]
df_baseR[df_baseR
# Tidyverse
%>%
df_tidyverse filter(Age > 25)
Filter row for people that are 30 years old or younger and have sales in 2022 over 200.
# Base R
!is.na(df_baseR$Age) & !is.na(df_baseR$sales_2022) &
df_baseR[$Age <= 30 & df_baseR$sales_2022 > 200 ,]
df_baseR
# Tidyverse
%>%
df_tidyverse filter(Age <= 30, sales_2022 > 200)
Add column.
# Base R
$mood <- "happy"
df_baseR
# Tidyverse
<- df_tidyverse %>%
df_tidyverse mutate(mood = "happy")
Conditions using ifelse
.
# Base R
$raise <- ifelse(df_baseR$sales_2023 > df_baseR$sales_2022, "yes", "no")
df_baseR
# Tidyverse
<- df_tidyverse %>%
df_tidyverse mutate(raise = ifelse(sales_2023 > sales_2022, "yes", "no"))
Conditions using case_when
.
# Base R
$group <- ifelse(df_baseR$Age < 30 & df_baseR$Sex == "Female", "young_female",
df_baseRifelse(df_baseR$Age < 30 & df_baseR$Sex == "Male", "young_male",
ifelse(df_baseR$Age >= 30 & df_baseR$Sex == "Female", "mature_female",
ifelse(df_baseR$Age >= 30 & df_baseR$Sex == "Male", "mature_male", NA))))
# Tidyverse
<- df_tidyverse %>%
df_tidyverse mutate(group = case_when(Age < 30 & Sex == "Female" ~ "young_female",
< 30 & Sex == "Male" ~ "young_male",
Age >= 30 & Sex == "Female" ~ "mature_female",
Age >= 30 & Sex == "Male" ~ "mature_male",
Age .default = NA))
Check which columns have NAs.
# Base R
colSums(is.na(df_baseR))
# Tidyverse
%>% summarise(across(everything(), ~ sum(is.na(.)))) df_tidyverse
Remove rows with any NAs.
# Base R
<- df_baseR[complete.cases(df_baseR), ]
df_baseR_no_na
# Tidyverse
<- df_tidyverse %>% drop_na() df_tidyverse_no_na
Sort data based on sales in 2022 in descending order.
# Base R
order(-df_baseR$sales_2022) ,]
df_baseR[
# Tidyverse
%>% arrange(desc(sales_2022)) df_tidyverse
Do the filtering in 4. and select the Name and Sex column. This far, do you prefer the base R or tidyverse way? Do you see pros and cons with both?
# Base R, NAs not removed
!is.na(df_baseR$Age) & !is.na(df_baseR$sales_2022) &
df_baseR[$Age <= 30 & df_baseR$sales_2022 > 200 , c('Age', 'Sex')]
df_baseR
# Base R, NAs removed
$Age <= 30 & df_baseR_no_na$sales_2022 > 200 , c('Age', 'Sex')]
df_baseR_no_na[df_baseR_no_na
# Tidyverse
%>%
df_tidyverse filter(Age <= 30, sales_2022 > 200) %>%
select(Age, Sex)
The df_location
data frame contains information about the cities of the sales people.
set.seed(101)
<- data.frame(
df_location ID = sample(10),
City = c("New York", "LA", "New York", "Chicago", "Miami", "Miami", "LA", "Chicago", "LA", "New York")
)
head(df_location)
Join df
and df_location
. For base R we use merge
and for tidyverse we use left_join
.
# Base R
<- merge(df_baseR, df_location, by = 'ID')
df_baseR_merged head(df_baseR_merged)
# Tidyverse
<- df_tidyverse %>% left_join(df_location, by = 'ID')
df_tidyverse_join head(df_tidyverse_join)
Count the number of people in each of the unique cities.
# Base R
table(df_baseR_merged$City)
# Tidyverse
%>% count(City) df_tidyverse_join
String manipulation
We will do string manipulation the tidyverse way.
The paste
function concatenates two strings to one.
paste('Alice', 'Hansen')
[1] "Alice Hansen"
The sep
argument is a space by default, but can be changed to any character.
paste('Alice', 'Hansen', sep = "_")
[1] "Alice_Hansen"
The paste0
function concatenates two strings to one without adding any separator between them.
paste0('Alice', 'Hansen')
[1] "AliceHansen"
Let’s give all the employees the last name Hansen.
<- df_tidyverse_join %>%
df_tidyverse_join mutate(Name = paste(Name, 'Hansen'))
head(df_tidyverse_join)
# A tibble: 6 × 12
ID Name Age Sex sales_2020 sales_2021 sales_2022 sales_2023 mood
<dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 Alice Han… 25 Fema… 100 110 120 100 happy
2 2 Bob Hansen 30 Male 200 210 220 230 happy
3 3 Charlie H… 22 Male 150 160 170 200 happy
4 4 Sophie Ha… 35 Fema… 300 320 340 250 happy
5 5 Eve Hansen 28 Fema… 250 240 250 270 happy
6 6 Frank Han… NA Male NA 260 270 280 happy
# ℹ 3 more variables: raise <chr>, group <chr>, City <chr>
We use the str_split
function to split a string into multiple parts in a list.
str_split('Alice Hansen', pattern = ' ')
[[1]]
[1] "Alice" "Hansen"
str_split('Alice_Hansen_Jensen', pattern = '_')
[[1]]
[1] "Alice" "Hansen" "Jensen"
Indexing one of the strings in the list using the str_split_i
function.
str_split_i('Alice_Hansen_Jensen', pattern = '_', i = 2)
[1] "Hansen"
Let’s remove the last name we just gave the employees.
<- df_tidyverse_join %>%
df_tidyverse_join mutate(Name = str_split_i(Name, pattern = ' ', i = 1))
head(df_tidyverse_join)
# A tibble: 6 × 12
ID Name Age Sex sales_2020 sales_2021 sales_2022 sales_2023 mood
<dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 Alice 25 Female 100 110 120 100 happy
2 2 Bob 30 Male 200 210 220 230 happy
3 3 Charlie 22 Male 150 160 170 200 happy
4 4 Sophie 35 Female 300 320 340 250 happy
5 5 Eve 28 Female 250 240 250 270 happy
6 6 Frank NA Male NA 260 270 280 happy
# ℹ 3 more variables: raise <chr>, group <chr>, City <chr>
Detect substring in main string using str_detect
.
str_detect('Alice', 'A')
[1] TRUE
str_detect('Alice', 'B')
[1] FALSE
Add column that check if “A” or “a” in Name
and place it after the Name
column.
%>%
df_tidyverse_join mutate(A_in_name = str_detect(Name, 'A|a'),
.after = Name) %>%
head()
# A tibble: 6 × 13
ID Name A_in_name Age Sex sales_2020 sales_2021 sales_2022 sales_2023
<dbl> <chr> <lgl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 Alice TRUE 25 Fema… 100 110 120 100
2 2 Bob FALSE 30 Male 200 210 220 230
3 3 Charl… TRUE 22 Male 150 160 170 200
4 4 Sophie FALSE 35 Fema… 300 320 340 250
5 5 Eve FALSE 28 Fema… 250 240 250 270
6 6 Frank TRUE NA Male NA 260 270 280
# ℹ 4 more variables: mood <chr>, raise <chr>, group <chr>, City <chr>
Whitespace
Whitespace includes spaces, newlines, and other blank characters in text. It can cause errors or inconsistencies in data, so removing unnecessary whitespace is an important step in cleaning data.
Let’s have a look at a version of the sales data frame with whitespaces. In the tibble format it cannot be spotted.
<- read_delim('../data/df_sales_messy.csv')
df_sales_messy head(df_sales_messy)
# A tibble: 6 × 8
ID Name Age Sex sales_2020 sales_2021 sales_2022 sales_2023
<dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 Alice 25 Female 100 110 120 100
2 2 Bob 30 Male 200 210 220 230
3 3 Charlie 22 Male 150 160 170 200
4 4 Sophie 35 Female 300 320 340 250
5 5 Eve 28 Female 250 240 250 270
6 6 Frank NA Male NA 260 270 280
Accessing the unique sexes of the Sex
column before cleaning.
$Sex %>% unique() df_sales_messy
[1] "Female" "Male" "Female " "Male "
Use the str_trim
function to remove whitespace.
$Sex %>% str_trim() %>% unique() df_sales_messy
[1] "Female" "Male"
Like other function, the str_trim
function can also be used inside the mutate
function to alter the data frame.
<- df_sales_messy %>%
df_sales_clean mutate(Sex = str_trim(Sex))
Accessing the unique sexes of the Sex
column after cleaning.
$Sex %>% unique() df_sales_clean
[1] "Female" "Male"
Export Dataset
Export tidyverse dataset
::write_xlsx(df_tidyverse_join, '../out/sales_data_2.xlsx') writexl