Exercise 4A - Solutions: Scripting in R - Conditions and For-loops

In this exercise you will practice your scripting in R.

Getting started

Load libraries and the joined diabetes data set.

library(tidyverse)
library(glue)

# Read the joined diabetes/glucose workbook. readxl is called through its
# namespace (`readxl::`), so it does not need its own library() call.
excel_path <- '../data/exercise2_diabetes_glucose.xlsx'
diabetes_glucose <- readxl::read_excel(excel_path)

# Show the tibble to confirm the columns and row count that were loaded.
diabetes_glucose

# A tibble: 1,422 × 13
   ID    Sex      Age BloodPressure   BMI PhysicalActivity Smoker  Diabetes
   <chr> <chr>  <dbl>         <dbl> <dbl>            <dbl> <chr>      <dbl>
 1 34120 Female    28            75  25.4               92 Never          0
 2 34120 Female    28            75  25.4               92 Never          0
 3 34120 Female    28            75  25.4               92 Never          0
 4 27458 Female    55            72  24.6               86 Never          0
 5 27458 Female    55            72  24.6               86 Never          0
 6 27458 Female    55            72  24.6               86 Never          0
 7 70630 Male      22            80  24.9              139 Unknown        0
 8 70630 Male      22            80  24.9              139 Unknown        0
 9 70630 Male      22            80  24.9              139 Unknown        0
10 13861 Female    56            72  37.1               64 Unknown        1
# ℹ 1,412 more rows
# ℹ 5 more variables: Serum_ca2 <dbl>, Married <chr>, Work <chr>,
#   Measurement <chr>, `Glucose (mmol/L)` <dbl>

If-else statements

In these exercises we don’t use the dataframe yet, that comes later when we have loops. For this part, just declare variables to test your statements, e.g. bp <- 120.

Write an if-else statement that prints whether a person has high (more than 100), low (lower than 50) or normal blood pressure (between 50 and 100).

# Example reading to classify; change this value to exercise the other branches.
bp <- 80

# Pick the label first, then print it once: below 50 is low, above 100 is
# high, and anything from 50 to 100 (inclusive) is normal.
bp_label <- if (bp < 50) {
  'Low blood pressure'
} else if (bp > 100) {
  'High blood pressure'
} else {
  'Normal blood pressure'
}
print(bp_label)

[1] "Normal blood pressure"

Write an if-else statement that assigns people high, moderate or low health risk based on their smoking habits (variable Smoker) and BMI:

Smoker and BMI greater than 35 -> high risk
Smoker or BMI greater than 35 -> moderate risk
otherwise low risk

And Smoker should be one of “Smoker”, “Former”, “Never”, “Unknown”.

Verify that your statement works for different combinations of smoking habits and BMI.

# Example person; vary both values to check every branch.
Smoker <- 'Smoker'
BMI <- 40

# `if` needs a single TRUE/FALSE, so the scalar operators && and || are used
# here; the elementwise & and | are meant for building vector masks.
if (Smoker == 'Smoker' && BMI > 35) {
  # Both risk factors present.
  print('High risk')
} else if (Smoker == 'Smoker' || BMI > 35) {
  # Exactly one risk factor present (both-present was handled above).
  print('Moderate risk')
} else {
  print('Low risk')
}

[1] "High risk"

Loops

Create a vector with at least five elements of your choice. Use a for loop to print each element individually.

# c() coerces mixed input to one common type, so every element of this
# vector is stored as a character string.
my_v <- c(1, 78, 5, 'hello', 7)

# Walk the vector by position and print one element per iteration.
for (pos in seq_along(my_v)) {
  print(my_v[[pos]])
}

[1] "1"
[1] "78"
[1] "5"
[1] "hello"
[1] "7"

Print each column name in the diabetes_glucose data frame using a for loop.

# Pull the column names out once, then print them one at a time.
column_names <- colnames(diabetes_glucose)
for (idx in seq_along(column_names)) {
  print(column_names[idx])
}

[1] "ID"
[1] "Sex"
[1] "Age"
[1] "BloodPressure"
[1] "BMI"
[1] "PhysicalActivity"
[1] "Smoker"
[1] "Diabetes"
[1] "Serum_ca2"
[1] "Married"
[1] "Work"
[1] "Measurement"
[1] "Glucose (mmol/L)"

Loop over all rows of diabetes_glucose and determine whether the person’s blood pressure is high, low or normal, using the same conditions as in the first if-else exercise (high: more than 100, low: lower than 50, otherwise normal). Print the blood pressure value as well as the statement so you can verify whether you have classified the blood pressure correctly as high, normal or low.

# Only the first 10 rows are processed here to keep the output short.
# To classify every row, loop over seq_len(nrow(diabetes_glucose)) instead.

for (row_idx in seq_len(10)) {
  bp <- diabetes_glucose$BloodPressure[row_idx]

  # Same thresholds as the single-value example: > 100 is high, < 50 is low,
  # everything in between is normal.
  bp_class <- if (bp < 50) {
    'is low blood pressure'
  } else if (bp > 100) {
    'is high blood pressure'
  } else {
    'is normal blood pressure'
  }

  # Print the reading next to its label so the classification can be checked.
  print(paste(bp, bp_class))
}

[1] "75 is normal blood pressure"
[1] "75 is normal blood pressure"
[1] "75 is normal blood pressure"
[1] "72 is normal blood pressure"
[1] "72 is normal blood pressure"
[1] "72 is normal blood pressure"
[1] "80 is normal blood pressure"
[1] "80 is normal blood pressure"
[1] "80 is normal blood pressure"
[1] "72 is normal blood pressure"

Loop over all rows of diabetes_glucose and extract the smoking habits and BMI for each row and determine the health risk with the same conditions as in Exercise 4.2. Print the smoking habits and BMI as well as the health risk level to make it easier to see whether your code works correctly.

Hint

Extract value for i’th row in specific column: df$col1[i]

An easy way to print several variables is to pass a vector to print: print(c(this, and_that, and_this_too))

# Only the first 10 rows are processed here to keep the output short.
# To process every row, loop over seq_len(nrow(diabetes_glucose)) instead.

for (i in 1:10) {
  # Pull out the two values for this row; each is a single element.
  Smoker <- diabetes_glucose$Smoker[i]
  BMI <- diabetes_glucose$BMI[i]

  # `if` needs a single TRUE/FALSE, so use the scalar operators && and ||
  # rather than the elementwise & and |.
  # c() turns BMI into a string so all three values print as one vector.
  if (Smoker == 'Smoker' && BMI > 35) {
    print(c(Smoker, BMI, 'High risk'))
  } else if (Smoker == 'Smoker' || BMI > 35) {
    print(c(Smoker, BMI, 'Moderate risk'))
  } else {
    print(c(Smoker, BMI, 'Low risk'))
  }
}

[1] "Never"    "25.4"     "Low risk"
[1] "Never"    "25.4"     "Low risk"
[1] "Never"    "25.4"     "Low risk"
[1] "Never"    "24.6"     "Low risk"
[1] "Never"    "24.6"     "Low risk"
[1] "Never"    "24.6"     "Low risk"
[1] "Unknown"  "24.9"     "Low risk"
[1] "Unknown"  "24.9"     "Low risk"
[1] "Unknown"  "24.9"     "Low risk"
[1] "Unknown"       "37.1"          "Moderate risk"

Do the same as above but instead of printing the risk status, append it to a list. Start by initiating an empty list.

# Initiate list
risk_status <- list()

# seq_len() yields an empty sequence for a data frame with zero rows, whereas
# 1:nrow() would count down from 1 to 0 and index rows that do not exist.
for (i in seq_len(nrow(diabetes_glucose))) {
  Smoker <- diabetes_glucose$Smoker[i]
  BMI <- diabetes_glucose$BMI[i]

  # Both values are single elements, so the scalar operators && and || are
  # the right fit for the `if` conditions.
  if (Smoker == 'Smoker' && BMI > 35) {
    risk <- 'High risk'
  } else if (Smoker == 'Smoker' || BMI > 35) {
    risk <- 'Moderate risk'
  } else {
    risk <- 'Low risk'
  }

  # Append exactly one label per row so the list lines up with the rows.
  risk_status <- append(risk_status, risk)
}

# Look at the first few labels.
risk_status %>% head()

[[1]]
[1] "Low risk"

[[2]]
[1] "Low risk"

[[3]]
[1] "Low risk"

[[4]]
[1] "Low risk"

[[5]]
[1] "Low risk"

[[6]]
[1] "Low risk"

Check the length of the list. Is it as expected?

Since we looped through all the rows in the diabetes_glucose dataframe, the list should have as many elements as there are rows in the dataframe.

# Number of risk labels collected in the list.
length(risk_status)

[1] 1422

# Number of rows in the data frame, for comparison with the list length.
nrow(diabetes_glucose)

[1] 1422

Add the list as a new column in the diabetes_glucose data frame. Note: Before assigning it, use the unlist() function to convert the list to a flat vector. This ensures that each value aligns correctly with the rows of the data frame.

# Flatten the list of labels into a plain vector and store it as a new column.
diabetes_glucose[['risk_status']] <- unlist(risk_status)

# Show the new column next to the two variables it was derived from.
select(diabetes_glucose, BMI, Smoker, risk_status)

# A tibble: 1,422 × 3
     BMI Smoker  risk_status  
   <dbl> <chr>   <chr>        
 1  25.4 Never   Low risk     
 2  25.4 Never   Low risk     
 3  25.4 Never   Low risk     
 4  24.6 Never   Low risk     
 5  24.6 Never   Low risk     
 6  24.6 Never   Low risk     
 7  24.9 Unknown Low risk     
 8  24.9 Unknown Low risk     
 9  24.9 Unknown Low risk     
10  37.1 Unknown Moderate risk
# ℹ 1,412 more rows

Make a list of all the column names in diabetes_glucose that contain categorical variables. Write a for loop that goes through the list and prints a barplot for each of the categorical variables.

# Column names to draw one bar chart each for.
categorical <- list('Sex', 'Smoker', 'Diabetes', 'Married', 'Work')

for (col_name in categorical) {
  # The column name is a string, so turn it into a symbol and unquote it
  # inside aes() to map that column to the x axis.
  bar_mapping <- aes(x = !!sym(col_name))
  bar_title <- paste('Barplot of', col_name)

  bar_plot <- ggplot(diabetes_glucose, bar_mapping) +
    geom_bar() +
    labs(title = bar_title)

  # Plots are not auto-printed inside a for loop, so print explicitly.
  print(bar_plot)
}

Make a list of all the column names in diabetes_glucose that contain numeric variables. Make a for loop that goes through the list and prints a boxplot for each of the numeric variables.

# Peek at the first rows to see each column's name and type.
head(diabetes_glucose)

# A tibble: 6 × 14
  ID    Sex      Age BloodPressure   BMI PhysicalActivity Smoker Diabetes
  <chr> <chr>  <dbl>         <dbl> <dbl>            <dbl> <chr>     <dbl>
1 34120 Female    28            75  25.4               92 Never         0
2 34120 Female    28            75  25.4               92 Never         0
3 34120 Female    28            75  25.4               92 Never         0
4 27458 Female    55            72  24.6               86 Never         0
5 27458 Female    55            72  24.6               86 Never         0
6 27458 Female    55            72  24.6               86 Never         0
# ℹ 6 more variables: Serum_ca2 <dbl>, Married <chr>, Work <chr>,
#   Measurement <chr>, `Glucose (mmol/L)` <dbl>, risk_status <chr>

# Column names to draw one boxplot each for.
numeric <- list('Age', 'BloodPressure', 'BMI', 'PhysicalActivity', 'Serum_ca2')

for (col_name in numeric) {
  # The column name is a string, so turn it into a symbol and unquote it
  # inside aes() to map that column to the y axis.
  box_mapping <- aes(y = !!sym(col_name))
  box_title <- paste('Boxplot of', col_name)

  box_plot <- ggplot(diabetes_glucose, box_mapping) +
    geom_boxplot() +
    labs(title = box_title)

  # Plots are not auto-printed inside a for loop, so print explicitly.
  print(box_plot)
}

Warning: Removed 6 rows containing non-finite outside the scale range
(`stat_boxplot()`).