Exercise 4B - Solutions: Scripting in R - Functions

In this exercise you will practice creating and applying user-defined functions.

Getting started

Load libraries and the joined diabetes data set.

library(tidyverse)
library(glue)

diabetes_glucose <- readxl::read_excel('../data/exercise2_diabetes_glucose.xlsx')
diabetes_glucose

# A tibble: 1,422 × 13
   ID    Sex      Age BloodPressure   BMI PhysicalActivity Smoker  Diabetes
   <chr> <chr>  <dbl>         <dbl> <dbl>            <dbl> <chr>      <dbl>
 1 34120 Female    28            75  25.4               92 Never          0
 2 34120 Female    28            75  25.4               92 Never          0
 3 34120 Female    28            75  25.4               92 Never          0
 4 27458 Female    55            72  24.6               86 Never          0
 5 27458 Female    55            72  24.6               86 Never          0
 6 27458 Female    55            72  24.6               86 Never          0
 7 70630 Male      22            80  24.9              139 Unknown        0
 8 70630 Male      22            80  24.9              139 Unknown        0
 9 70630 Male      22            80  24.9              139 Unknown        0
10 13861 Female    56            72  37.1               64 Unknown        1
# ℹ 1,412 more rows
# ℹ 5 more variables: Serum_ca2 <dbl>, Married <chr>, Work <chr>,
#   Measurement <chr>, `Glucose (mmol/L)` <dbl>

User defined Functions

Create a function named calculate_risk_score().

It should accept the following parameters:

BloodPressure
BMI
Smoking
PhysicalActivity

Initiate the risk score with a value of 0 and calculate the final risk score with the following rules:

if BloodPressure is > 90, add 0.5
if Smoking is ‘Smoker’, add 1
if BMI is > 30, add 1. If BMI > 40, add 2
if PhysicalActivity is > 110, substract 1

The function should return the final risk score. Test it with some different inputs to verify that it works according to the rules detailed above.

calculate_risk_score <- function(BloodPressure, BMI, Smoking, PhysicalActivity){
  
  risk_score <- 0
  
  if (BloodPressure > 90){
    risk_score <- risk_score + 0.5
    }

  if (Smoking == 'Smoker'){
    risk_score <- risk_score + 1
    }
  
  #BMI > 30 includes BMIs that are larger than 40, so we need to be careful of
  #the order in which we check, or include that the BMI should be <= 40 in order 
  #to add 1.   
  
 if (BMI > 40){
    risk_score <- risk_score + 2
    } 
  else if( BMI > 30) {
    risk_score <- risk_score + 1
    } 
  
  if (PhysicalActivity > 110){
    risk_score <- risk_score - 1
    }
    
  return(risk_score)
  
}

#demo
calculate_risk_score(120, 45, 'no', 115)

[1] 1.5

calculate_risk_score(120, 45, 'Smoker', 115)

[1] 2.5

calculate_risk_score(90, 35, 'Smoker', 60)

[1] 2

Add a check to your function whether the supplied parameters are numeric, except for Smoking which should be a factor or a character (you can check that too if you want to). Test that your check works correctly.

calculate_risk_score <- function(BloodPressure, BMI, Smoking, PhysicalActivity){
  
  if (!is.numeric(BloodPressure) | !is.numeric(BMI) | !is.numeric(PhysicalActivity)) {
    stop("BloodPressure, BMI, and PhysicalActivity must be numeric values.")
  }
  
  if(! (is.character(Smoking) | is.factor(Smoking)) ){
    stop("Smoking should be a factor or character.")
  }
  
  risk_score <- 0
  
  if (BloodPressure > 90){
    risk_score <- risk_score + 0.5
    }

  if (Smoking == 'Smoker'){
    risk_score <- risk_score + 1
    }
  
  #BMI > 30 includes BMIs that are larger than 40, so we need to be careful of
  #the order in which we check, or include that the BMI should be <= 40 in order 
  #to add 1.   
  
 if (BMI > 40){
    risk_score <- risk_score + 2
    } 
  else if( BMI > 30) {
    risk_score <- risk_score + 1
    } 
  
  if (PhysicalActivity > 110){
    risk_score <- risk_score - 1
    }
    
  return(risk_score)
  
}

calculate_risk_score('Hi', 35, 'Smoker', 60)

# Error in calculate_risk_score("Hi", 35, "Smoker", 60) : 
#   BloodPressure, BMI, and PhysicalActivity must be numeric values.

calculate_risk_score(90, 35, 1, 60)

# Error in calculate_risk_score(90, 35, 1, 60) : 
#   Smoking should be a factor or character.

calculate_risk_score(90, 35, 'Smoker', 60)

[1] 2

In a for-loop, apply your calculate_risk_score to each row in diabetes_glucose. Remember to use the appropriate column, i.e. diabetes_glucose$BloodPressure should be the argument for BloodPressure in the function. Add the calculated risk score to diabetes_glucose as a column Risk_score.

for (i in 1:nrow(diabetes_glucose)) {
  
  risk_score <- calculate_risk_score(BloodPressure = diabetes_glucose$BloodPressure[i], 
                                     BMI = diabetes_glucose$BMI[i], 
                                     Smoking = diabetes_glucose$Smoker[i], 
                                     PhysicalActivity = diabetes_glucose$PhysicalActivity[i])
  
  diabetes_glucose[i, 'Risk_score'] <- risk_score
  
}

diabetes_glucose

# A tibble: 1,422 × 14
   ID    Sex      Age BloodPressure   BMI PhysicalActivity Smoker  Diabetes
   <chr> <chr>  <dbl>         <dbl> <dbl>            <dbl> <chr>      <dbl>
 1 34120 Female    28            75  25.4               92 Never          0
 2 34120 Female    28            75  25.4               92 Never          0
 3 34120 Female    28            75  25.4               92 Never          0
 4 27458 Female    55            72  24.6               86 Never          0
 5 27458 Female    55            72  24.6               86 Never          0
 6 27458 Female    55            72  24.6               86 Never          0
 7 70630 Male      22            80  24.9              139 Unknown        0
 8 70630 Male      22            80  24.9              139 Unknown        0
 9 70630 Male      22            80  24.9              139 Unknown        0
10 13861 Female    56            72  37.1               64 Unknown        1
# ℹ 1,412 more rows
# ℹ 6 more variables: Serum_ca2 <dbl>, Married <chr>, Work <chr>,
#   Measurement <chr>, `Glucose (mmol/L)` <dbl>, Risk_score <dbl>

Now, run calculate_risk_score on each row in diabetes_glucose by using mapply instead. Confirm that your result is the same.

risk_scores <- mapply(FUN = calculate_risk_score, 
                      BloodPressure = diabetes_glucose$BloodPressure, 
                      BMI = diabetes_glucose$BMI,
                      Smoking = diabetes_glucose$Smoker,
                      PhysicalActivity = diabetes_glucose$PhysicalActivity)

A quick check confirms that the risk_scores vector contains the same as the column we previously added.

all(risk_scores == diabetes_glucose$Risk_score)

[1] TRUE

Create an R script file to contain your functions. Copy your functions there and remove them from your global environment with rm(list="name_of_your_function"). Now source the function R script in your quarto document and test that the functions work.

#remove function from global environment so we can test if it loads properly from the script
rm(list = "calculate_risk_score")

Source the script we copied our calculate_risk_score function into:

source('tdhh4B.R')

The function works still:

calculate_risk_score(90, 35, 'Smoker', 60)

[1] 2

Extra exercises

These exercises will ask you to first perform some tidyverse operations. Once you succeeded, you should abstract your code into a function.

e1. Calculate the mean of Glucose (mmol/L) for each measuring time point across all patients. You should obtain 3 values, one for 0, one for 60 and one for 120.

diabetes_glucose %>%
  group_by(Measurement) %>%
  summarise(mean_gluc = mean(`Glucose (mmol/L)`))

# A tibble: 3 × 2
  Measurement mean_gluc
  <chr>           <dbl>
1 0                8.07
2 120             11.0 
3 60               9.71

e2. Now we would like to stratify this mean further by a second variable, Sex. We would like to obtain 6 means: 0_female, 0_male, 60_female, 60_male, 120_female, 120_male.

diabetes_glucose %>%
  group_by(Measurement, Sex) %>%
  summarize(glucose_mean = mean(`Glucose (mmol/L)`))

`summarise()` has grouped output by 'Measurement'. You can override using the
`.groups` argument.

# A tibble: 6 × 3
# Groups:   Measurement [3]
  Measurement Sex    glucose_mean
  <chr>       <chr>         <dbl>
1 0           Female         8.10
2 0           Male           8.03
3 120         Female        10.9 
4 120         Male          11.1 
5 60          Female         9.71
6 60          Male           9.72

e3. Now we would like to abstract the name of the second column. Imagine we would like glucose measurement means per marriage status or per workplace instead of per sex. Create a variable column <- 'Sex' and redo what you did above but using column instead of the literal name of the column (Sex). This works in the same way as when plotting with a variable instead of the literal column name. Have a look at presentation 4A if you don’t remember how to do it.

column <- 'Sex'

glucose_group_mean <- diabetes_glucose %>%
  group_by(Measurement, !!sym(column)) %>%
  summarize(glucose_mean = mean(`Glucose (mmol/L)`))

`summarise()` has grouped output by 'Measurement'. You can override using the
`.groups` argument.

glucose_group_mean

# A tibble: 6 × 3
# Groups:   Measurement [3]
  Measurement Sex    glucose_mean
  <chr>       <chr>         <dbl>
1 0           Female         8.10
2 0           Male           8.03
3 120         Female        10.9 
4 120         Male          11.1 
5 60          Female         9.71
6 60          Male           9.72

e4. Now, make your code into a function calc_mean_meta() that can calculate glucose measurement means based on different meta data columns. It should be called like so calc_mean_meta('Sex') and give you the same result as before. Try it on other metadata columns in diabetes_glucose_unnest too!

calc_mean_meta <- function(column){
  glucose_group_mean <- diabetes_glucose %>%
    group_by(Measurement, !!sym(column)) %>%
    summarize(glucose_mean = mean(`Glucose (mmol/L)`))
  
  return(glucose_group_mean)
}

calc_mean_meta('Sex')

`summarise()` has grouped output by 'Measurement'. You can override using the
`.groups` argument.

# A tibble: 6 × 3
# Groups:   Measurement [3]
  Measurement Sex    glucose_mean
  <chr>       <chr>         <dbl>
1 0           Female         8.10
2 0           Male           8.03
3 120         Female        10.9 
4 120         Male          11.1 
5 60          Female         9.71
6 60          Male           9.72

calc_mean_meta('Work')

`summarise()` has grouped output by 'Measurement'. You can override using the
`.groups` argument.

# A tibble: 12 × 3
# Groups:   Measurement [3]
   Measurement Work         glucose_mean
   <chr>       <chr>               <dbl>
 1 0           Private              8.01
 2 0           Public               7.92
 3 0           Retired              8.41
 4 0           SelfEmployed         8.50
 5 120         Private             11.1 
 6 120         Public              10.8 
 7 120         Retired             11.8 
 8 120         SelfEmployed        11.3 
 9 60          Private              9.74
10 60          Public               9.50
11 60          Retired             10.2 
12 60          SelfEmployed         9.99