library(tidyverse)
Presentation 1: Base R and Tidyverse
Load Packages
Load Dataset
Here we load a dataframe that contains sales data for employees (in thousands DKK) from 2020 to 2023. We load the data as df_baseR
which we will use to demonstrate base R commands. A copy of the same dataframe, df_tidyverse
, is used to demonstrate tidyverse commands.
<- readxl::read_excel('../data/df_sales_1.xlsx')
df_baseR <- df_baseR df_tidyverse
Base R and Tidyverse
From here, each task is solved in both the base R and tidyverse way.
Access the Age column:
# Base R
'Age']]
df_baseR[[$Age
df_tidyverse
# Tidyverse
%>% pull(Age) df_tidyverse
Add a column containing the difference in sales in 2022 and 2022.
# Base R
$sales_diff <- df_baseR$sales_2022 - df_baseR$sales_2020
df_baseR
# Tidyverse
<- df_tidyverse %>%
df_tidyverse mutate(sales_diff = sales_2022 - sales_2020)
Remove the sales_diff
column.
# Base R
$sales_diff <- NULL
df_baseR
# Tidyverse
<- df_tidyverse %>%
df_tidyverse select(!sales_diff)
Select columns with sales numbers.
# Base R
startsWith(colnames(df_baseR), 'sales_')]
df_baseR[,
# Tidyverse
%>%
df_tidyverse select(starts_with('sales_'))
Filter rows for people older than 25.
# Base R
$Age > 25 ,]
df_baseR[df_baseR
# Tidyverse
%>%
df_tidyverse filter(Age > 25)
Filter row for people that are 30 years old or younger and have sales in 2022 over 200.
# Base R
!is.na(df_baseR$Age) & !is.na(df_baseR$sales_2022) &
df_baseR[$Age <= 30 & df_baseR$sales_2022 > 200 ,]
df_baseR
# Tidyverse
%>%
df_tidyverse filter(Age <= 30, sales_2022 > 200)
Check which columns have NAs.
# Base R
colSums(is.na(df_baseR))
# Tidyverse
%>% summarise(across(everything(), ~ sum(is.na(.)))) df_tidyverse
Remove rows with any NAs.
# Base R
<- df_baseR[complete.cases(df_baseR), ]
df_baseR_no_na
# Tidyverse
<- df_tidyverse %>% drop_na() df_tidyverse_no_na
Sort data based on sales in 2022 in descending order.
# Base R
order(-df_baseR$sales_2022) ,]
df_baseR[
# Tidyverse
%>% arrange(desc(sales_2022)) df_tidyverse
Do the filtering in 4. and select the Name and Sex column. This far, do you prefer the base R or tidyverse way? Do you see pros and cons with both?
# Base R, NAs not removed
!is.na(df_baseR$Age) & !is.na(df_baseR$sales_2022) &
df_baseR[$Age <= 30 & df_baseR$sales_2022 > 200 , c('Age', 'Sex')]
df_baseR
# Base R, NAs removed
$Age <= 30 & df_baseR_no_na$sales_2022 > 200 , c('Age', 'Sex')]
df_baseR_no_na[df_baseR_no_na
# Tidyverse
%>%
df_tidyverse filter(Age <= 30, sales_2022 > 200) %>%
select(Age, Sex)
The df_location
data frame contains information about the cities of the sales people. Copy paste this code into a new chunk and run the code.
set.seed(101)
<- data.frame(
df_location ID = sample(10),
City = c("New York", "LA", "New York", "Chicago", "Miami", "Miami", "LA", "Chicago", "LA", "New York")
)
head(df_location)
Join df
and df_location
. For base R we use merge
and for tidyverse we use left_join
.
# Base R
<- merge(df_baseR, df_location, by = 'ID')
df_baseR_merged head(df_baseR_merged)
# Tidyverse
<- df_tidyverse %>% left_join(df_location, by = 'ID')
df_tidyverse_join head(df_tidyverse_join)
Count the number of people in each of the unique cities.
# Base R
table(df_baseR_merged$City)
# Tidyverse
%>% count(City) df_tidyverse_join
Plot the city distribution.
# Base R
barplot(table(df_baseR_merged$City))
# Tidyverse
%>%
df_baseR_merged ggplot(aes(x = City)) +
geom_bar()
Export Dataset
Export tidyverse dataset
::write_xlsx(df_tidyverse_join, '../out/sales_data_2.xlsx') writexl