Presentation 1: Base R and Tidyverse

Load Packages

library(tidyverse)

Load Dataset

Here we load a dataframe that contains sales data for employees (in thousands DKK) from 2020 to 2023. We load the data as df_baseR which we will use to demonstrate base R commands. A copy of the same dataframe, df_tidyverse, is used to demonstrate tidyverse commands.

df_baseR <- readxl::read_excel('../data/df_sales_1.xlsx')
df_tidyverse <- df_baseR

Base R and Tidyverse

From here, each task is solved in both the base R and tidyverse way.

Access the Age column:

# Base R 
df_baseR[['Age']]
df_tidyverse$Age

# Tidyverse 
df_tidyverse %>% pull(Age)

Add a column containing the difference in sales in 2022 and 2022.

# Base R 
df_baseR$sales_diff <- df_baseR$sales_2022 - df_baseR$sales_2020

# Tidyverse 
df_tidyverse <- df_tidyverse %>% 
  mutate(sales_diff = sales_2022 - sales_2020)

Remove the sales_diff column.

# Base R 
df_baseR$sales_diff <- NULL

# Tidyverse 
df_tidyverse <- df_tidyverse %>% 
  select(!sales_diff)

Select columns with sales numbers.

# Base R 
df_baseR[, startsWith(colnames(df_baseR), 'sales_')]

# Tidyverse 
df_tidyverse %>% 
  select(starts_with('sales_'))

Filter rows for people older than 25.

# Base R 
df_baseR[df_baseR$Age > 25 ,]

# Tidyverse 
df_tidyverse %>% 
  filter(Age > 25)

Filter row for people that are 30 years old or younger and have sales in 2022 over 200.

# Base R 
df_baseR[!is.na(df_baseR$Age) & !is.na(df_baseR$sales_2022) &
         df_baseR$Age <= 30 & df_baseR$sales_2022 > 200 ,]

# Tidyverse 
df_tidyverse %>% 
  filter(Age <= 30, sales_2022 > 200)

Check which columns have NAs.

# Base R
colSums(is.na(df_baseR))

# Tidyverse
df_tidyverse %>% summarise(across(everything(), ~ sum(is.na(.))))

Remove rows with any NAs.

# Base R
df_baseR_no_na <- df_baseR[complete.cases(df_baseR), ]

# Tidyverse
df_tidyverse_no_na <- df_tidyverse %>% drop_na()

Sort data based on sales in 2022 in descending order.

# Base R 
df_baseR[order(-df_baseR$sales_2022) ,]

# Tidyverse 
df_tidyverse %>% arrange(desc(sales_2022))

Do the filtering in 4. and select the Name and Sex column. This far, do you prefer the base R or tidyverse way? Do you see pros and cons with both?

# Base R, NAs not removed
df_baseR[!is.na(df_baseR$Age) & !is.na(df_baseR$sales_2022) &
         df_baseR$Age <= 30 & df_baseR$sales_2022 > 200 , c('Age', 'Sex')]

# Base R, NAs removed
df_baseR_no_na[df_baseR_no_na$Age <= 30 & df_baseR_no_na$sales_2022 > 200 , c('Age', 'Sex')]

# Tidyverse 
df_tidyverse %>% 
  filter(Age <= 30, sales_2022 > 200) %>% 
  select(Age, Sex)

The df_location data frame contains information about the cities of the sales people. Copy paste this code into a new chunk and run the code.

set.seed(101)

df_location <- data.frame(
  ID = sample(10),
  City = c("New York", "LA", "New York", "Chicago", "Miami", "Miami", "LA", "Chicago", "LA", "New York")
)

head(df_location)

Join df and df_location. For base R we use merge and for tidyverse we use left_join.

# Base R 
df_baseR_merged <- merge(df_baseR, df_location, by = 'ID')
head(df_baseR_merged)

# Tidyverse 
df_tidyverse_join <- df_tidyverse %>% left_join(df_location, by = 'ID')
head(df_tidyverse_join)

Count the number of people in each of the unique cities.

# Base R
table(df_baseR_merged$City)

# Tidyverse
df_tidyverse_join %>% count(City)

Plot the city distribution.

# Base R
barplot(table(df_baseR_merged$City))

# Tidyverse
df_baseR_merged %>% 
  ggplot(aes(x = City)) + 
  geom_bar()

Export Dataset

Export tidyverse dataset

writexl::write_xlsx(df_tidyverse_join, '../out/sales_data_2.xlsx')