Worked Example Using the IST Dataset • RESIDE

Introduction

This R Markdown document illustrates example usage of the RESIDE package using the IST dataset.

Setup

Load the RESIDE package and set a seed for reproducibility and store the folder directory for export / import.

# Load the Library
library(RESIDE)
# Set the seed
set.seed(1234)
# Store the folder path used for import / export
folder_path <- tempdir()

Summarise original data

Select the variables of interest and summarise. Selection of variables is optional, if the variables are known. They may not be known until the marginal distributions have been received.

# Select variables of interest from the IST dataset.
IST_original <- IST |> dplyr::select(
  AGE, # AGE at Randomisation
  SEX, # SEX M/F
  RATRIAL, # Atrial Fibrillation Y/N at Randomisation 
  # (not coded for 984 patients in the pilot phase)
  RSBP, # Systolic Blood Pressure at Randomisation
  STRK14 # Indicator of Any Stroke at 14 days
)

# Convert the character variables to factors (to allow for summary)
IST_original <- IST_original |> dplyr::mutate_if(is.character, factor)

# Produce a summary of the variables
summary(IST_original)
#>       AGE        SEX       RATRIAL        RSBP           STRK14       
#>  Min.   :16.00   F: 9028    :  984   Min.   : 70.0   Min.   :0.00000  
#>  1st Qu.:65.00   M:10407   N:15282   1st Qu.:140.0   1st Qu.:0.00000  
#>  Median :73.00             Y: 3169   Median :160.0   Median :0.00000  
#>  Mean   :71.72                       Mean   :160.2   Mean   :0.04152  
#>  3rd Qu.:80.00                       3rd Qu.:180.0   3rd Qu.:0.00000  
#>  Max.   :99.00                       Max.   :295.0   Max.   :1.00000

Fit a cox model on the original data

# Load survival and dplyr libraries
library(survival) # For Cox PH model
library(dplyr) # For data manipulation
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

# Stroke event is measured at 14 days, so set this for patients
IST_original$DAY <- 14

# Illustrate the 984 missing values
sum(IST_original$RATRIAL == "")
#> [1] 984

# Remove the missing values
IST_original <- IST_original[!IST_original$RATRIAL == "",]

# Drop the factor name for the missing values
IST_original$RATRIAL <- droplevels(IST_original$RATRIAL)

# Summarise the variable to show there are no longer missing values
summary(IST_original$RATRIAL)
#>     N     Y 
#> 15282  3169

# Fit a Cox PH model
cox.ph <- coxph(Surv(DAY, STRK14) ~ AGE + SEX + RATRIAL + RSBP, data = IST_original) 

# Output the summary of the Cox PH Model
cox.ph
#> Call:
#> coxph(formula = Surv(DAY, STRK14) ~ AGE + SEX + RATRIAL + RSBP, 
#>     data = IST_original)
#> 
#>              coef exp(coef) se(coef)     z      p
#> AGE      0.005237  1.005251 0.003387 1.546 0.1220
#> SEXM     0.077489  1.080570 0.074388 1.042 0.2976
#> RATRIALY 0.231692  1.260732 0.091818 2.523 0.0116
#> RSBP     0.000994  1.000995 0.001310 0.759 0.4479
#> 
#> Likelihood ratio test=11.8  on 4 df, p=0.01893
#> n= 18451, number of events= 764

Get the marginal distributions

Use the get_marginal_distributions() function to get the marginal distributions, additionally selecting which variables using the variables parameter.

# Get the Marginal Distributions for the selected variables
marginals <- get_marginal_distributions(
  IST,
  variables = c(
    "AGE",
    "SEX",
    "RATRIAL",
    "RSBP",
    "STRK14"
  )
)

Export the marginal distributions

Export the marginal distributions using the export_marginal_distributions() function, using the force parameter to override any existing files.

# Export the Marginal Distributions
export_marginal_distributions(marginals,
                              folder_path = folder_path,
                              force = TRUE)
#> Exporting  categorical to:  /tmp/RtmpDC59au/categorical_variables.csv
#> Exporting  binary to:  /tmp/RtmpDC59au/binary_variables.csv
#> Exporting  continuous to:  /tmp/RtmpDC59au/continuous_variables.csv
#> Exporting  quantiles to:  /tmp/RtmpDC59au/continuous_quantiles.csv
#> Exporting  summary to:  /tmp/RtmpDC59au/summary.csv

Reimport marginal distributions

Import the exported marginal distributions using the import_marginal_distributions() function

# Import the Marginal Distributions
imported_marginals <- import_marginal_distributions(folder_path = folder_path)

Synthesise data from marginal distributions (without correlations)

Synthesise data from the imported marginals using the synthesise_data function.

# Synthesise a dataset from the imported Marginal Distributions (without correlations)
sim_df <- synthesise_data(imported_marginals)

Summarise the synthesised data

Summarise the simulated data

# Convert any Character variables to Factors
sim_df <- sim_df |> dplyr::mutate_if(is.character, factor)
# Summarise the synthesised data
summary(sim_df)
#>        id        SEX       RATRIAL       STRK14             AGE       
#>  Min.   :    1   F: 9086    :  982   Min.   :0.00000   Min.   :17.00  
#>  1st Qu.: 4860   M:10349   N:15266   1st Qu.:0.00000   1st Qu.:65.00  
#>  Median : 9718             Y: 3187   Median :0.00000   Median :73.00  
#>  Mean   : 9718                       Mean   :0.04229   Mean   :71.57  
#>  3rd Qu.:14576                       3rd Qu.:0.00000   3rd Qu.:80.00  
#>  Max.   :19435                       Max.   :1.00000   Max.   :98.00  
#>       RSBP      
#>  Min.   : 79.0  
#>  1st Qu.:141.0  
#>  Median :160.0  
#>  Mean   :159.8  
#>  3rd Qu.:176.0  
#>  Max.   :290.0

Fit a cox model on the simulated data

Fit the same cox model as earlier except this time on the simulated data.


# As before the events are measured at day 14
sim_df$DAY <- 14

# Show that the missing observations are in the data
sum(sim_df$RATRIAL == "")
#> [1] 982

# Remove the missing observations
sim_df <- sim_df[!sim_df$RATRIAL == "",]

# Remove the missing factor name
sim_df$RATRIAL <- droplevels(sim_df$RATRIAL)

# Show that there are no missing observations
summary(sim_df$RATRIAL)
#>     N     Y 
#> 15266  3187

# Fit the model on the synthesised data
cox.ph.sim <- coxph(Surv(DAY, STRK14) ~ AGE + SEX + RATRIAL + RSBP, data = sim_df) 

# Show a summary of the model
cox.ph.sim
#> Call:
#> coxph(formula = Surv(DAY, STRK14) ~ AGE + SEX + RATRIAL + RSBP, 
#>     data = sim_df)
#> 
#>               coef exp(coef)  se(coef)      z      p
#> AGE       0.006539  1.006561  0.003154  2.073 0.0381
#> SEXM     -0.128197  0.879680  0.072031 -1.780 0.0751
#> RATRIALY -0.001382  0.998619  0.095327 -0.014 0.9884
#> RSBP     -0.002892  0.997113  0.001323 -2.186 0.0288
#> 
#> Likelihood ratio test=12.41  on 4 df, p=0.01452
#> n= 18453, number of events= 771

Synthesise data with correlations

Synthesise data from the imported marginals with correlations, by exporting an empty correlation matrix, then reimporting it and using the correlation_matrix parameter to specify the correlation matrix. Note the matrix was edited in R for this example but users would be expected to edit the correlation matrix in a program such as excel.

# Export an empty correlation matrix
export_empty_cor_matrix(
  imported_marginals,
  folder_path = folder_path
)
# Reimport the matrix
cor_matrix <- import_cor_matrix(file.path(folder_path, "correlation_matrix.csv"))
# Add assumed correlations (note the symmetry)
cor_matrix["RATRIAL_N", "STRK14"] <- -0.02
cor_matrix["STRK14", "RATRIAL_N"] <- -0.02

# Synthesise data specifying the correlation matrix
sim_df_cor <- synthesise_data(imported_marginals, correlation_matrix = cor_matrix)

Summarise the synthesised data

Summarise the synthesised data (with correlations)

# Convert to Factors from Character variables
sim_df_cor <- sim_df_cor |> dplyr::mutate_if(is.character, factor)
# Summarise the synthesised dataset
summary(sim_df_cor)
#>        id        SEX       RATRIAL       STRK14             AGE      
#>  Min.   :    1   F: 4130    :  151   Min.   :0.00000   Min.   :16.0  
#>  1st Qu.: 4860   M:15305   N:18620   1st Qu.:0.00000   1st Qu.:65.0  
#>  Median : 9718             Y:  664   Median :0.00000   Median :73.0  
#>  Mean   : 9718                       Mean   :0.03828   Mean   :71.6  
#>  3rd Qu.:14576                       3rd Qu.:0.00000   3rd Qu.:80.0  
#>  Max.   :19435                       Max.   :1.00000   Max.   :99.0  
#>       RSBP    
#>  Min.   : 77  
#>  1st Qu.:141  
#>  Median :160  
#>  Mean   :160  
#>  3rd Qu.:176  
#>  Max.   :291

Fit a cox model on the simulated data with correlations

Using the synthesised data (with correlations) fit a Cox PH model, with the same parameters as earlier.


# Again events are measured at 14 days
sim_df_cor$DAY <- 14

# Again check that the missing values where added
sum(sim_df_cor$RATRIAL == "")
#> [1] 151

# Again remove the missing values
sim_df_cor <- sim_df_cor[!sim_df_cor$RATRIAL == "",]

# Again drop the missing factor
sim_df_cor$RATRIAL <- droplevels(sim_df_cor$RATRIAL)

# Show there are no missing values
summary(sim_df_cor$RATRIAL)
#>     N     Y 
#> 18620   664

# Fit the model on the synthesised data (with correlations)
cox.ph.sim.cor <- coxph(Surv(DAY, STRK14) ~ AGE + SEX + RATRIAL + RSBP, data = sim_df_cor)

# Show a summary of the model
cox.ph.sim.cor
#> Call:
#> coxph(formula = Surv(DAY, STRK14) ~ AGE + SEX + RATRIAL + RSBP, 
#>     data = sim_df_cor)
#> 
#>              coef exp(coef) se(coef)     z     p
#> AGE      0.003287  1.003292 0.003197 1.028 0.304
#> SEXM     0.026197  1.026543 0.090866 0.288 0.773
#> RATRIALY 0.182798  1.200571 0.186449 0.980 0.327
#> RSBP     0.000207  1.000207 0.001336 0.155 0.877
#> 
#> Likelihood ratio test=2.06  on 4 df, p=0.7241
#> n= 19284, number of events= 735