diff --git a/Bact_IDA_plan.qmd b/Bact_IDA_plan.qmd new file mode 100644 index 0000000..058f37b --- /dev/null +++ b/Bact_IDA_plan.qmd @@ -0,0 +1,124 @@ +# IDA plan {#IDA_plan} + +```{r} +library(here) +library(tidyverse) + + +load(here::here("data", "a_bact.rda")) +``` + + +This document exemplifies the prespecified plan for initial data analysis (IDA plan) for the bacteremia study. + +## Prerequisites for the IDA plan + +### Analysis strategy + +We assume that the aims of the study are to fit a diagnostic prediction model and to describe the functional form of each predictor. These aims are addressed by fitting a logistic regression model with bacteremia status as the dependent variable. + +Based on domain expertise, the predictors are grouped by their assumed importance to predict bacteremia. Variables with known strong associations with bacteremia are age (AGE), leukocytes (WBC), blood urea neutrogen (BUN), creatinine (CREA), thrombocytes (PLT), and neutrophiles (NEU) and these predictors will be included in the model as key predictors. Predictors of medium importance are potassium (POTASS), and some acute-phase related parameters such as fibrinogen (FIB), C-reactive protein (CRP), aspartate transaminase (ASAT), alanine transaminase (ALAT), and gamma-glutamyl transpeptidase (GGT). All other predictors are of minor importance. + +Continuous predictors should be modelled by allowing for flexible functional forms, where for all key predictors four degrees of freedom will be spent, and for predictors of medium and minor importance, three or two degrees of freedom should be foreseen at maximum, respectively. The decision on whether to use only key predictors, or to consider predictors also from the predictor sets of medium or minor importance depends on results of data screening, but will be made before uncovering the association of predictors with the outcome variable. + +An adequate strategy to cope with missing values will also be chosen after screening the data. Candidate strategies are omission of predictors with abundant missing values, complete case analysis, single value imputation or multiple imputation with chained equations. + +### Data dictionary + +The data dictionary of the bacteremia data set consists of columns for variable names, variable labels, scale of measurement (continuous or categorical), units, plausibility limits, and remarks: + +```{r} +bact.dd<-read.csv(here::here("misc", "bacteremia-DataDictionary.csv")) + +bact.dd +``` + + +### Domain expertise + +The demographic variables age and sex are are chosen as the structural variables in this analysis for illustration purposes, since they are commonly considered important for describing a cohort in health studies. Key predictors and predictors of medium importance are as defined above. Laboratory analyses always bear the risk of machine failures, and hence missing values are a frequent challenge. This may differ between laboratory variables, but no a priori estimate about the expected proportion of missing values can be assumed. As most predictors measure concentrations of chemical compounds or cell counts, skewed distributions are expected. Some predictors describe related types of cells or chemical compounds, and hence some correlation between them is to be expected. For example, leukocytes consist of five different types of blood cells (BASO, EOS, NEU, LYM and MONO), and the sum of the concentration of these types approximately (but not exactly) gives the leukocyte count, which is recorded in the variable WBC. Moreover, these variables are given as absolute counts and as percentages of the sum of the five variables, which creates some correlation. Some laboratory variables differ by sex and age, but the special selection of patients for this study (suspicion of bacteremia) may distort or alter the expected correlations with sex and age. + +For the purpose of stratifying IDA results by age, age will be categorized into the following three groups: (16, 50], (50, 65], (65, 101]. + +The predictor grouping is defined here: + +```{r, echo=TRUE} +#demog_vars <-c("AGE", "SEX") +structural_vars <- c("AGE", "SEX") +key_predictors <- c("WBC", "AGE", "BUN","CREA","NEU","PLT") +medimp_predictors <-c("POTASS", "FIB", "CRP", "ASAT", "ALAT", "GGT") +outcome_vars <-c("BloodCulture", "BC") +remaining_predictors <- names(a_bact)[is.na(match(names(a_bact),c("ID", structural_vars, key_predictors,medimp_predictors, outcome_vars)))] + +b_bact <- + a_bact %>% mutate(GENDER = factor(SEX, levels=c(1,2),labels=c("male", "female")), AGEGROUP = factor(cut(AGE, c(min(AGE)-1,50, 65, max(AGE))), labels=c("1:(15,50]","2:(50,65]","3:(65,101]"))) + + +bact_variables <- list(structural_vars=structural_vars, key_predictors=key_predictors, medimp_predictors=medimp_predictors, + remaining_predictors=remaining_predictors, outcome_vars=outcome_vars) + +``` + +## IDA plan + + +### M1: Prevalence of missing values + +Numbers and proportions of missing values will be reported for each predictor separately (M1). Type of missingness has not been recorded. + +### M2: Complete cases + +The number of available complete cases (outcome and predictors) will be reported when considering: + +1. the outcome variable (BC) +2. outcome and structural variables (BC, AGE, SEX) +3. outcome and key predictors only (BC, AGE, WBC, BUN, CREA, PLT, NEU) +4. outcome, key predictors and predictors of medium importance (BC, AGE, WBC, BUN, CREA, PLT, NEU, POTASS, FIB, CRP, ASAT, ALAT, GGT) +5. outcome and all predictors. + +### M3: Patterns of missing values + +Patterns of missing values will be investigated by: + +1. computing a table of complete cases (for the three predictor sets described above) for strata defined by the structural variables age and sex, +2. constructing a dendrogram of missing values to explore which predictors tend to be missing together. + + + +### U1: Univariate descriptions: categorical variables + +For sex and bacteremia status, the frequency and proportion of each category will be described numerically. + +### U2: Univariate descriptions: continuous variables + +For all continuous predictors, combo plots consisting of high-resolution histograms, boxplots and dotplots will be created. Because of the expected skew distribution, combo plots will also be created for log-transformed predictors. + +As numerical summaries, minimum and maximum values, main quantiles (5th, 10th, 25th, 50th, 75th, 90th, 95th), and the first four moments (mean, standard deviation, skewness, curtosis) will be reported. The number of distinct values and the five most frequent values will be given, as well as the concentration ratio (ratio of frequency of most frequent value and mean frequency of each unique value). + +Graphical and parametric multivariate analyses of the predictor space such as cluster analyses or the computation of variance inflation factors are heavily influenced by the distribution of the predictors. In order to make this set of analyses more robust to highly influential points or areas of the predictor support, some predictors may need transformation (e.g. logarithmic). We will compute the correlation of the untransformed and log-transformed predictors with normal deviates. Since some predictors may have values at or close to 0, we will consider the pseudolog transformation $f(x;\sigma) = sinh^{-1}(x/2\sigma)/\log10$ (Johnson, 1949) which provides a smooth transition from linear (close to 0) to logarithmic (further away from 0). The transformation has a parameter $\sigma$ which we will optimize separately for each predictor in order to achieve an optimal approximation to a normal distribution monitored via the correlation of normal deviates with the transformed predictor. For those predictors for which the pseudolog-transformation increases correlation with normal deviates by at least 0.2 units of the correlation coefficient, the pseudolog-transformed predictor will be used in multivariate IDA instead of the respective original predictor. For those predictors, histograms and boxplots will be provided on both the original and the transformed scale. + +### V1: Multivariate descriptions: associations of predictors with structural variables + +A scatterplot of each predictor with age, with different panels for males and females will be constructed. Associated Spearman correlation coefficients will be computed. + +### V2: Multivariate descriptions: correlation analyses + +A matrix of Spearman correlation coefficients between all pairs of predictors will be computed and described numerically as well as by means of a heatmap. + +### VE1: Multivariate descriptions: comparing nonparametric and parametric predictor correlation + +A matrix of Pearson correlation coefficients will be computed. Predictor pairs for which Spearman and Pearson correlation coefficients differ by more than 0.1 correlation units will be depicted in scatterplots. + +### VE2: Variable clustering + +A variable clustering analysis will be performed to evaluate which predictors are closely associated. A dendrogram groups predictors by their correlation. Scatterplots of pairs of predictors with Spearman correlation coefficients greater than 0.8 will be created. + +### VE3: Redundancy + +Variance inflation factors will be computed between the candidate predictors. This will be done for the three possible candidate models, and using all complete cases in the respective candidate predictor sets. Redundancy will further be explored by computing parametric additive models for each predictor in the three candidate models. + + +```{r} + +save(list=c("b_bact", "bact_variables", "structural_vars","key_predictors","medimp_predictors","remaining_predictors"), file = here::here("data", "bact_env_b.rda")) +``` \ No newline at end of file diff --git a/Bact_SAP.qmd b/Bact_SAP_orphaned.Rmd similarity index 99% rename from Bact_SAP.qmd rename to Bact_SAP_orphaned.Rmd index e227a7d..2f811c0 100644 --- a/Bact_SAP.qmd +++ b/Bact_SAP_orphaned.Rmd @@ -1,4 +1,13 @@ -# IDA plan +--- +title: "Bact_SAP_orphaned" +author: "GH" +date: "2022-08-30" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` ```{r, echo=FALSE, warning=FALSE, message=FALSE, echo=FALSE} ## Load libraries @@ -274,3 +283,4 @@ Tudela P, Lacoma A, Prat C, Modol JM, Gimenez M, et al. (2010) Prediction of bac save(list=c("b_bact", "bact_variables", "demog_vars", "structural_vars","key_predictors","leuko_related_vars","leuko_ratio_vars","kidney_related_vars", "acute_related_vars","remaining_vars"), file = here::here("data", "bact_env_b.rda")) ``` + diff --git a/Bact_intro.qmd b/Bact_intro.qmd index 07dc766..5487208 100644 --- a/Bact_intro.qmd +++ b/Bact_intro.qmd @@ -1,5 +1,4 @@ -# Bacteremia {#Bacteremia} - +# Bacteremia study {#Bacteremia} ```{r, echo=FALSE, warning=FALSE, message=FALSE, echo=FALSE} ## Load libraries for this chapter @@ -8,27 +7,16 @@ library(tidyverse) library(Hmisc) ``` -# Introduction to Bacteremia - -To demonstrate the workflow and content of IDA, we created a hypothetical research aim and corresponding statistical analysis plan, which is described in more detail in the section [Bact_SAP.Rmd](Bact_SAP.Rmd). - -**Hypothetical research aim for IDA** is to develop a multivariable diagnostic model for bacteremia using 49 continuous laboratory blood parameters, age and gender with the primary aim of prediction and a secondary aim of describing the association of each variable with the outcome ('explaining' the multivariable model). - -A diagnostic prediction model was developed based on this data set and validated in "A Risk Prediction Model for Screening Bacteremic Patients: A Cross Sectional Study" [Ratzinger et al, PLoS One 2014](https://doi.org/10.1371/journal.pone.0106765). The assumed research aim is in line with this diagnostic prediction model. - -## Dataset Description -Ratzinger et al (2014) performed a diagnostic study in which age, sex and 49 laboratory variables can be used to diagnose bacteremia status of a blood sample using a multivariable model. Between January 2006 and December 2010, patients with the clinical suspicion to suffer from bacteraemia were included if blood culture analysis was requested by the responsible physician and blood was sampled for assessment of haematology and biochemistry. The data consists of 14,691 observations from different patients. +## Overview of the bacteremia study CHANGED SUBHEADER -Our version of this data was slightly modified compared to original version, and this modified version was cleared by the Medical University of Vienna for public use (DC 2019-0054). Variable names have been kept as they were (partly German abbreviations). A data dictionary is available in the **misc** folder of the project directory ('bacteremia-DataDictionary.csv'). +We will exemplify our proposed systematic approach to data screening by means of a diagnostic study with the primary aim of using age, sex and 49 laboratory variables to fit a diagnostic prediction model for the bacteremia status (= presence of bacteria in the blood stream) of a blood sample. A secondary aim of the study is to describe the functional form of each predictor in the model. Between January 2006 and December 2010, patients with the clinical suspicion to suffer from bacteremia were included if blood culture analysis was requested by the responsible physician and blood was sampled for assessment of hematology and biochemistry. An analysis of this study can be found in Ratzinger et al: "A Risk Prediction Model for Screening Bacteremic Patients: A Cross Sectional Study" [Ratzinger et al, PLoS One 2014](https://doi.org/10.1371/journal.pone.0106765). -In the original paper describing the study [(Ratzinger et al, PLoS One 2014)](https://doi.org/10.1371/journal.pone.0106765), a machine learning approach was taken to diagnose a positive status of blood culture. The true status was determined for all blood samples by blood culture analysis, which is the gold standard. Here we will make use of a multivariable logistic regression model. +The data consists of 14,691 observations from different patients and 51 potential predictors. To protect data privacy our version of this data was slightly modified compared to the original version, and this modified version was cleared by the Medical University of Vienna for public use (DC 2019-0054). Compared to the official results given in Ratzinger et al (2014), our results may differ to a negligible degree. -## Bacteremia dataset contents - -### Source dataset +### Source dataset CHANGED Hierarchy to third level We refer to the source data set as the dataset available in this repository. @@ -59,25 +47,11 @@ As a cross check we display the contents again to ensure the additional data is ```{r contents_abact, warning=FALSE, message=FALSE, echo=FALSE, results='asis'} +### COMMENT: I removed the definition of a bact_subset as we don't need it anymore! (this comment can be deleted) + ## Complete metadata by adding missing labels. ## Generate a derived dataset stored in data as we are adding to the original source dataset obtained. -bact_subset <- bact - -## select candidate predictor variables. -- See SAP - -#bact_subset <- -# bact %>% -# dplyr::select( -# ID, -# AGE, -# WBC, -# KREA, -# BUN, -# PLT, -# NEU, -# BloodCulture -# ) labels_list <- bact.dd$Label units_list <- bact.dd$Units @@ -86,7 +60,7 @@ names(labels_list) <- names(units_list) <- bact.dd$Variable ## Complete metadata by adding missing labels. a_bact <- Hmisc::upData( - bact_subset, + bact, labels = labels_list[names(bact_subset)], units=units_list[names(bact_subset)]) ## Derive outcome variable diff --git a/Bact_missing.qmd b/Bact_missing.qmd index 5861489..a76e500 100644 --- a/Bact_missing.qmd +++ b/Bact_missing.qmd @@ -1,4 +1,6 @@ -# Missing data +# Results of IDA: Missing values {#Missing} + + ```{r, echo=FALSE, warning=FALSE, message=FALSE, echo=FALSE} @@ -15,9 +17,9 @@ load(here::here("data", "bact_env_b.rda")) ``` -## Per variable missingness +### M1: Prevalence of missing values -Number and percentage of missing. +Number and percentage of missingness for each predictor, sorted by descending missingness proportion. ```{r, message =FALSE, warning =FALSE , echo=FALSE} b_bact %>% @@ -37,25 +39,26 @@ b_bact %>% ``` -Investigate for groups of variables: +### M2: Complete cases + +Number of available complete cases (outcome and predictors): ```{r, message =FALSE, warning =FALSE , echo=FALSE} +## COMMENT: is there a way to remove the underliners (e.g. in Key_predictors_only) + b_bact %>% select() %>% - mutate(Any_Variable_missing = ifelse(apply(b_bact,1,any_miss),NA,1)) %>% - mutate(Any_Demographics_missing = ifelse(apply(b_bact[,demog_vars],1,any_miss),NA,1)) %>% - mutate(Any_key_structural_missing = ifelse(apply(b_bact[,c(demog_vars,structural_vars, key_predictors)],1,any_miss),NA,1)) %>% - mutate(Any_Leuko_missing = ifelse(apply(b_bact[,c(leuko_related_vars)],1,any_miss),NA,1)) %>% - mutate(Any_Kidney_missing = ifelse(apply(b_bact[,c(kidney_related_vars)],1,any_miss),NA,1)) %>% - mutate(Any_Acute_missing = ifelse(apply(b_bact[,c(acute_related_vars)],1,any_miss),NA,1)) %>% - mutate(Any_remaining_missing = ifelse(apply(b_bact[,c(remaining_vars)],1,any_miss),NA,1)) %>% - mutate(Any_key_structural_leuko_kidney_acute_missing = ifelse(apply(b_bact[,c(demog_vars,structural_vars, key_predictors, leuko_related_vars, leuko_ratio_vars, kidney_related_vars, acute_related_vars)],1,any_miss),NA,1)) %>% - miss_var_summary() %>% + mutate(Outcome = ifelse(1-any_miss(b_bact[,c("BC")]),NA,1)) %>% + mutate(Outcome_and_structural_variables = ifelse(apply(b_bact[,c("BC",structural_vars)],1,function(X) 1-any_miss(X)),NA,1)) %>% + mutate(Outcome_and_key_predictors_only = ifelse(apply(b_bact[,c("BC",key_predictors)],1,function(X) 1-any_miss(X)),NA,1)) %>% + mutate(Outcome_key_predictors_and_predictors_of_medium_importance = ifelse(apply(b_bact[,c("BC",key_predictors,medimp_predictors)],1,function(X) 1-any_miss(X)),NA,1)) %>% + mutate(Outcome_and_all_predictors = ifelse(apply(b_bact,1,function(X) 1-any_miss(X)),NA,1)) %>% + miss_var_summary(order=FALSE) %>% gt::gt() %>% gt::cols_label( - variable = md("**Variable**"), - n_miss = md("**Missing (count)**"), - pct_miss = md("**Missing (%)**") + variable = md("**Set**"), + n_miss = md("**Complete (count)**"), + pct_miss = md("**Complete (%)**") ) %>% gt::fmt_number( columns = vars(pct_miss), @@ -64,35 +67,55 @@ b_bact %>% ``` -From this table we learn that as long as we model with only VIPs or with leukocyte-related variables, we can expect less than 10% missing values and this may justify a complete-case analysis. Including also kidney- and acute phase related variables will raise the proportion of missing values to about 36% which leads to a significant drop in power. A multiple imputation may then recover a lot of the information and may in particular be beneficial to keep the power of the (otherwise very complete) VIPs. +### ME1: Patterns of missing values -## Missingness patterns over variables +#### Complete cases by strata defined by structural variables +```{r, message =FALSE, warning =FALSE , echo=FALSE} -First we create a dendogram that shows which variables tend to be missing together: +## COMMENT @Mark: can you please make the table nicer? I can't get a proper structure with GENDER and AGEGROUP. + +b_bact %>% + select(GENDER, AGEGROUP) %>% + mutate(All_predictors = ifelse(apply(b_bact,1,function(X) 1-any_miss(X)),NA,1)) %>% + mutate(Structural_variables = ifelse(apply(b_bact[,structural_vars],1,function(X) 1-any_miss(X)),NA,1)) %>% + mutate(Key_predictors = ifelse(apply(b_bact[,c(key_predictors)],1,function(X) 1-any_miss(X)),NA,1)) %>% + mutate(Key_and_medium_importance_predictors = ifelse(apply(b_bact[,c(key_predictors,medimp_predictors)],1,function(X) 1-any_miss(X)),NA,1)) %>% + group_by(GENDER, AGEGROUP) %>% + miss_var_summary(order=FALSE) %>% + gt::gt() %>% + gt::cols_label( +# GENDER = md("**Sex**"), +# AGEGROUP = md("**Age group**"), + variable = md("**Set**"), + n_miss = md("**Complete (count)**"), + pct_miss = md("**Complete (%)**") + ) %>% + gt::fmt_number( + columns = vars(pct_miss), + decimals = 2 + ) -```{r, message =FALSE, warning =FALSE , echo=FALSE} -s_bact <- b_bact[sample(1:nrow(b_bact), size=10000, repl=FALSE),] -is_bact <- is.na(b_bact)*1 -exl_vars <- which(colnames(is_bact) %in% c("BC","BloodCulture","ID")) -perc_miss <- round(apply(is_bact, 2, mean)*100,0) -colnames(is_bact)<-paste(colnames(is_bact),"(",perc_miss,")",sep="") -hobj <- hclust(dist(t(is_bact[,-exl_vars]))) -plot(hobj, cex=0.5) ``` -Furthermore, with variables missing in more than 10% of the cases, we create a heatmap that simultaneously shows the clusters of patients with missing values and the variables: +#### Dendrogram of missingness indicators +The dendrogram depicts the results of a cluster analysis using the complete linkage method based on the percentage of discordant missing indicators. (This percentage was computed via the squared Euclidian distance of missingness indicators between predictors.) The vertical axis shows the distance between two clusters, which is given by the maximum distance between any element of the first and the second clusters. For example, if two clusters are merged at a height of 25 it means that in 25% of the observations the missingness indicators of the most discordant predictors contained in the two clusters are discordant. -```{r, message =FALSE, warning =FALSE , echo=FALSE} -heatmap(is_bact[,perc_miss>10]) -# naniar::vis_miss(sort_miss = TRUE, cluster=TRUE,show_perc_col = FALSE) -``` +The numbers in brackets are the percentages of missing observations for each predictor. -In this heatmap, we see that CHOL and TRIG are always missing together (lowest hierarchy in dendogram), but there are no further such pairs among any other variables. There is also some evidence that when CHOL and TRIG are missing, also PAMY is missing, although this is not the case for a small proportion of patients. The big white space in the middle of the heatmap represents the approx. 30% of patients with no missing values in those variables. +```{r, message =FALSE, warning =FALSE , echo=FALSE} +## COMMENT @Mark: can we remove the text under the x-axis? I tried various (xlab, sub) but all failed. +is_bact <- is.na(b_bact)*1 +exl_vars <- which(colnames(is_bact) %in% c("BC","BloodCulture","ID","AGEGROUP","SEX")) +perc_miss <- round(apply(is_bact, 2, mean)*100,0) +colnames(is_bact)<-paste(colnames(is_bact),"(",perc_miss,")",sep="") +hobj <- hclust(dist(t(is_bact[,-exl_vars]))^2 / nrow(b_bact) * 100) +plot(hobj, cex=0.5, ylab="Percent discordantly missing", hang=0.01) +``` diff --git a/Bact_missing_orphaned.Rmd b/Bact_missing_orphaned.Rmd new file mode 100644 index 0000000..8738234 --- /dev/null +++ b/Bact_missing_orphaned.Rmd @@ -0,0 +1,12 @@ +--- +title: "Bact_missing_orphaned" +author: "GH" +date: "2022-08-30" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +From this table we learn that as long as we model with only VIPs or with leukocyte-related variables, we can expect less than 10% missing values and this may justify a complete-case analysis. Including also kidney- and acute phase related variables will raise the proportion of missing values to about 36% which leads to a significant drop in power. A multiple imputation may then recover a lot of the information and may in particular be beneficial to keep the power of the (otherwise very complete) VIPs. diff --git a/Bact_multivar.qmd b/Bact_multivar.qmd index 3fa4123..a6165b3 100644 --- a/Bact_multivar.qmd +++ b/Bact_multivar.qmd @@ -1,5 +1,5 @@ -# Multivariate distributions +# Multivariate analyses ```{r, echo = FALSE, message = FALSE, warning = FALSE } library(here) @@ -13,263 +13,258 @@ library(mice) load(here::here("data", "bact_env_c.rda")) alpha_value <- 0.1 -``` +variables <- unique(c(bact_transformed$structural_vars, + bact_transformed$key_predictors, + bact_transformed$medimp_predictors, + bact_transformed$remaining_predictors)) +``` -## Overview -### Variable correlation -Compute correlations between the independent variables or their suggested transformations. +## V1: Association with structural variables +A scatterplot of each predictor with age, with different panels for males and females have been constructed. Associated Spearman correlation coefficients have been computed. +### Key predictors ```{r} -variables <- unique(c(bact_transformed$demog_vars, - bact_transformed$structural_vars, - bact_transformed$key_predictors, - bact_transformed$leuko_related_vars, - bact_transformed$leuko_ratio_vars, - bact_transformed$kidney_related_vars, - bact_transformed$acute_related_vars, - bact_transformed$remaining_vars)) +for(j in 1:length(bact_transformed$key_predictors)){ + predictor <- bact_transformed$key_predictors[j] + if(predictor!="AGE"){ + p1<-ggplot(data=c_bact, mapping=aes(x=.data[["AGE"]], y=.data[[predictor]]))+ geom_point(alpha=alpha_value) + facet_grid(cols=vars(.data[["GENDER"]])) + print(p1) + + spear_sex<-by(c_bact[,c("AGE",predictor)], c_bact$GENDER, FUN=function(X) data = cor(X,use="pairwise.complete.obs",method="spearman")[1,2]) + cat("\n\nSpearman correlation coefficients of AGE with ", predictor, ":\n", paste(c("male", "female"), round(spear_sex,3), sep=":" )) + # COMMENT @Mark: is there some way to print the spearman correlation coefficients (spear_sex) into the graphs or into the column labels (in a standardized way)?? + # COMMENT @Mark: can we display inverse pseudolog (back-transformed) y-axis labels for better interpretation? see function inv_pseudo_log() in ida_trans.R -corrp <- c_bact %>% - dplyr::select(all_of(variables)) %>% - cor(use="pairwise.complete.obs", method="pearson") - -corrs <- c_bact %>% - dplyr::select(all_of(variables)) %>% - cor(use="pairwise.complete.obs", method="spearman") + } +} -# differences of pearson and spearman correlations to check for outliers -corrd <- corrp-corrs ``` -Next, we depict the correlation coefficient in a quadratic heat map: +### Predictors of medium importance ```{r} -ggcorrplot(corrp, tl.cex=5, tl.srt=90) -``` +for(j in 1:length(bact_transformed$medimp_predictors)){ + predictor <- bact_transformed$medimp_predictors[j] + p1<-ggplot(data=c_bact, mapping=aes(x=.data[["AGE"]], y=.data[[predictor]]))+ geom_point(alpha=alpha_value) + facet_grid(cols=vars(.data[["GENDER"]])) + print(p1) + + spear_sex<-by(c_bact[,c("AGE",predictor)], c_bact$GENDER, FUN=function(X) data = cor(X,use="pairwise.complete.obs",method="spearman")[1,2]) + cat("\n\nSpearman correlation coefficients of AGE with ", predictor, ":\n", paste(c("male", "female"), round(spear_sex,3), sep=":" )) + # @Mark: see comments above -Explore if there are clusters of variables. Such clusters may give rise to define groups of variables for which a summary or only a representative may be considered in modeling: +} -```{r} -vc_bact<-Hmisc::varclus(as.matrix(c_bact[,variables])) -plot(vc_bact, cex=0.7) ``` -Some of the clusters that pop up here are related to width/volume of blood cells (MPV, PDW), red blood cells (RBC, HGB, HCT; MCV, MCH), and some further 'known' associations such as that between CREA and eGFR (which follows from the construction of eGFR), and between ASAT and ALAT, between AMY and PAMY or between TP and ALB). - -In the following scatterplots we have a look at those associations: +### Remaining predictors ```{r} -ggplot(c_bact, aes(MPV, PDW))+geom_point(alpha = alpha_value, shape = 20) + geom_smooth() +for(j in 1:length(bact_transformed$remaining_predictors)){ + predictor <- bact_transformed$remaining_predictors[j] + p1<-ggplot(data=c_bact, mapping=aes(x=.data[["AGE"]], y=.data[[predictor]]))+ geom_point(alpha=alpha_value) + facet_grid(cols=vars(.data[["GENDER"]])) + print(p1) + + spear_sex<-by(c_bact[,c("AGE",predictor)], c_bact$GENDER, FUN=function(X) data = cor(X,use="pairwise.complete.obs",method="spearman")[1,2]) + cat("\n\nSpearman correlation coefficients of AGE with ", predictor, ":\n", paste(c("male", "female"), round(spear_sex,3), sep=":" )) + # @Mark: see comments above -ggplot(c_bact, aes(RBC, HGB))+geom_point(alpha = alpha_value, shape = 20) + geom_smooth() +} -ggplot(c_bact, aes(RBC, HCT))+geom_point(alpha = alpha_value, shape = 20) + geom_smooth() +``` -ggplot(c_bact, aes(HGB, HCT))+geom_point(alpha = alpha_value, shape = 20) + geom_smooth() -ggplot(c_bact, aes(MCV, MCH))+geom_point(alpha = alpha_value, shape = 20) + geom_smooth() +## V2: Correlation coefficients between all predictors -#ggplot(c_bact, aes(t_KREA, eGFR))+geom_point(alpha = alpha_value, shape = 20) + geom_smooth() +```{r} +corrs <- c_bact %>% + dplyr::select(all_of(variables)) %>% + cor(use="pairwise.complete.obs", method="spearman") -ggplot(c_bact, aes(t_ASAT, t_ALAT))+geom_point(alpha = alpha_value, shape = 20) + geom_smooth() +``` -ggplot(c_bact, aes(t_AMY, t_PAMY))+geom_point(alpha = alpha_value, shape = 20) + geom_smooth() +The Spearman correlation coefficients are depicted in a quadratic heat map: +```{r} +ggcorrplot(corrs, tl.cex=5, tl.srt=90) +``` -ggplot(c_bact, aes(t_WBC, t_NEU))+geom_point(alpha = alpha_value, shape = 20) + geom_smooth() +### VE1: Comparing nonparametric and parametric predictor correlation +```{r} +# differences of pearson and spearman correlations to check for outliers +corrp <- c_bact %>% + dplyr::select(all_of(variables)) %>% + cor(use="pairwise.complete.obs", method="pearson") +corrd <- corrp-corrs -``` +# sparsified differences of correlation coefficients +corrd_sp <- corrd +corrd_sp[abs(corrd)<0.1] <-0 +ggcorrplot(corrd_sp, tl.cex=5, tl.srt=90) -Create scatterplots for pairs of variables with a large distance between Spearman and Pearson correlations (could be an indication of nonlinear association): +``` +Predictor pairs for which Spearman and Pearson correlation coefficients differ by more than 0.1 correlation units will be depicted in scatterplots: ```{r} for(j in 1:(length(variables)-1)){ for(jj in (j+1):(length(variables))){ - if(abs(corrd[j, jj])>0.1) print(ggplot(data=c_bact, mapping=aes(x=.data[[variables[j]]],y=.data[[variables[jj]]]))+ geom_point(alpha = alpha_value)+geom_smooth() + + if(abs(corrd[j, jj])>0.1) print(ggplot(data=c_bact, mapping=aes(x=.data[[variables[j]]],y=.data[[variables[jj]]]))+ geom_point(alpha = alpha_value) + theme_minimal()) } } - +## COMMENT @Mark can we print pearson and spearman correlation coefficients into/over the graphs? ``` +### VE2: Variable clustering +A variable clustering analysis has been performed to evaluate which predictors are closely associated. The dendrogram groups predictors by their correlation. +```{r} +vc_bact<-Hmisc::varclus(as.matrix(c_bact[,variables])) +plot(vc_bact, cex=0.7, hang=0.01) +``` +In the following scatterplots we show predictor pairs with Spearman correlation coefficients greater than 0.8: -### Distribution of age by sex - -```{r, message=FALSE, warning =FALSE , echo=FALSE, fig.cap= "Distribution of age by sex"} -# only plot observatons with a non-missing value for Sex -c_bact %>% - filter(!(is.na(SEX))) %>% - with(., histboxp( - x = AGE, - group = SEX, - sd = TRUE, - bins = 200 - )) - +```{r} +for(j in 1:(length(variables)-1)){ + for(jj in (j+1):length(variables)){ + if(abs(corrs[j, jj])>0.8){ + print(ggplot(c_bact, aes(.data[[variables[j]]], .data[[variables[jj]]]))+geom_point(alpha = alpha_value, shape = 20)) + } + } +} ``` -### Distribution of leukocytes by age, coloured by sex +### VE3: Redundancy -```{r} -c_bact$gender=factor(c_bact$SEX, levels=c(1,2), labels=c("male","female")) +Variance inflation factors (VIF) will be computed between the candidate predictors. This will be done for the three possible candidate models, and using all complete cases in the respective candidate predictor sets. Since $VIF = (1-R^2)^{-1}$, we also report the multiple R-squared values. Redundancy was further explored by computing parametric additive models for each predictor in the key predictor model and the extended predictor model. VIFs and multiple $R^2$ are reported from those models, again for the three predictor sets. -#c_bact %>% ggplot(data=c_bact, mapping=aes(x=Alter, y=t_WBC, color=gender)) + geom_point(shape = 20) + geom_smooth() -``` - -### Plot all variables vs. WBC in age/sex groups +#### VIF for key predictor model ```{r} -c_bact$Agegroup <- factor(cut(c_bact$AGE, c(min(c_bact$AGE), 50, 65, max(c_bact$AGE)))) -table(c_bact$gender,c_bact$Agegroup) -``` +formula <- as.formula(paste(c("~",paste(bact_variables$key_predictors, collapse="+")), collapse="")) +red<-Hmisc::redun(formula, data=c_bact, nk=0, pr=FALSE) +vif<-1/(1-red$rsq1) +cat("\nAvailable sample size:\n", red$n, " (", round(100*red$n/nrow(c_bact),2), "%)\n") -```{r} -for(j in 4:length(variables)){ - p1 <- - c_bact %>% - filter(!is.na(Agegroup)) %>% - ggplot(c_bact, mapping=aes(x=t_WBC,y=.data[[variables[j]]])) + - geom_point(alpha = alpha_value, shape = 20) + - geom_smooth() + - geom_rug(alpha = alpha_value) + - facet_grid(gender ~ Agegroup) - print(p1) -} -#+ -# theme_minimal() +cat("\nVariance inflation factors:\n") +print(round(vif,2)) +cat("\nMultiple R-squared:\n") +print(round(red$rsq1,4)) ``` -### Plot all variables vs. WBC in age/sex groups: loess curves only +#### VIF for model with key predictors and predictors of medium importance ```{r} -for(j in 4:length(variables)){ - p1 <- - c_bact %>% - filter(!is.na(Agegroup)) %>% - ggplot(c_bact, mapping=aes(x=t_WBC,y=.data[[variables[j]]])) + - # geom_point(alpha = alpha_value) + - geom_smooth() + - # geom_rug() + - facet_grid(gender ~ Agegroup) - print(p1) -} +formula <- as.formula(paste(c("~",paste(c(bact_variables$key_predictors,bact_variables$medimp_predictors), collapse="+")), collapse="")) +red<-Hmisc::redun(formula, data=c_bact, nk=0, pr=FALSE) +vif<-1/(1-red$rsq1) -``` +cat("\nAvailable sample size:\n", red$n, " (", round(100*red$n/nrow(c_bact),2), "%)\n") -### Plot all variables vs. WBC in age/sex groups: loess curves only +cat("\nVariance inflation factors:\n") +print(round(vif,2)) -```{r} -for(j in 4:length(variables)){ - p1 <- - c_bact %>% - filter(!is.na(Agegroup)) %>% - ggplot(c_bact, mapping=aes(x=t_WBC,y=.data[[variables[j]]])) + - # geom_point(alpha = alpha_value) + - geom_smooth() + - # geom_rug() + - facet_grid(gender ~ Agegroup, labeller=label_both) - print(p1) + - theme_minimal() -} +cat("\nMultiple R-squared:\n") +print(round(red$rsq1,4)) ``` -## Variable redundancy - -### Redundancy among key predictors - -First we start with a redundancy analysis with the predictors deemed important by preceding studies. +#### VIF for all predictor model ```{r} -formula <- as.formula( - paste("~AGE+", - paste(unique(c(bact_transformed$structural_vars, - bact_transformed$key_predictors)), - collapse="+"))) +formula <- as.formula(paste(c("~",paste(c(bact_variables$key_predictors,bact_variables$medimp_predictors, bact_variables$remaining_predictors), collapse="+")), collapse="")) + +red<-Hmisc::redun(formula, data=c_bact, nk=0, pr=FALSE) +vif<-1/(1-red$rsq1) +cat("\nAvailable sample size:\n", red$n, " (", round(100*red$n/nrow(c_bact),2), "%)\n") -Hmisc::redun(formula, data=c_bact) +cat("\nVariance inflation factors:\n") +print(round(vif,2)) +cat("\nMultiple R-squared:\n") +print(round(red$rsq1,4)) ``` -This analysis suggests redundancy of WBC after NEU is in the predictor set. We investigate this further by looking only at leukocyte-related variables. -### Redundancy among leukocyte-related variables -In redundancy analysis of all predictor variables, the large number of non-complete cases is an issue. First we start with the leukocyte ratios - there should be large redundancy issues. This is even the case if no nonlinear modeling is applied. + +#### Redundancy by parametric additive model: key predictor model ```{r} -Hmisc::redun(~I(EOSR)+I(BASOR)+I(NEUR)+I(LYMR)+I(MONOR), data=c_bact) -``` +formula <- as.formula(paste(c("~",paste(bact_variables$key_predictors, collapse="+")), collapse=" ")) -Now we use the absolute concentrations. BASO is modeled as linear because it has very few distinct values: +red<-Hmisc::redun(formula, data=c_bact, pr=FALSE) +vif<-1/(1-red$rsq1) -```{r} -Hmisc::redun(~t_EOS+I(t_BASO)+t_NEU+t_LYM+MONO, data=c_bact) -``` +cat("\nAvailable sample size:\n", red$n, " (", round(100*red$n/nrow(c_bact),2), "%)\n") -Now we add WBC: +cat("\nVariance inflation factors:\n") +print(round(vif,2)) -```{r} -Hmisc::redun(~t_EOS+I(t_BASO)+t_NEU+t_LYM+MONO+t_WBC, data=c_bact) +cat("\nMultiple R-squared:\n") +print(round(red$rsq1,4)) ``` -This indicates very large $R^2$ values for NEU and WBC, meaning that probably because of their large correlation, the variables are nearly redundant. (This also becomes clear from the histogram of NEUR, which shows the ratios of NEU to WBC.) *When including both covariates in a model, it may happen that standard errors are inflated such that an association cannot be statistically proven; nevertheless, according to the preceding studies both variables could be important!* One way to circumvent the problem is to generate a complementary variable that is defined as the difference of WBC and NEU and use this variable instead of WBC. +#### Redundancy by parametric additive model: key predictors and predictors of medium importance ```{r} -WBC_NEU <- c_bact$WBC - c_bact$NEU -sum_noNEU <- apply(c_bact[,c("EOS","BASO","LYM","MONO")],1,sum) +formula <- as.formula(paste(c("~",paste(c(bact_variables$key_predictors,bact_variables$medimp_predictors), collapse="+")), collapse="")) -ggplot(c_bact, aes(NEU, WBC_NEU)) + geom_point(alpha = alpha_value) + geom_smooth() + scale_x_continuous(trans="pseudo_log") + scale_y_continuous(trans="pseudo_log") -``` +red<-Hmisc::redun(formula, data=c_bact, pr=FALSE) +vif<-1/(1-red$rsq1) -This variable is expected to be highly correlated with the sum of MONO, LYM, BASO and EOS: +cat("\nAvailable sample size:\n", red$n, " (", round(100*red$n/nrow(c_bact),2), "%)\n") -```{r} -ggplot(c_bact, aes(sum_noNEU, WBC_NEU)) + geom_point(alpha = alpha_value) + geom_smooth() + scale_x_continuous(trans="pseudo_log") + scale_y_continuous(trans="pseudo_log") +cat("\nVariance inflation factors:\n") +print(round(vif,2)) +cat("\nMultiple R-squared:\n") +print(round(red$rsq1,4)) ``` -### Redundancy among all potential predictors -Now we perform a full redundancy analysis, but omitting WBC, the leukocyte ratio variables, and BUN and KREA (for their use in constructing BUN_CREA and eGFR). +#### Redundancy by parametric additive model: all predictors ```{r} -formula <- as.formula( - paste("~I(t_EOS)+I(t_BASO)+t_LYM+MONO+t_NEU+", - paste(unique(c(bact_transformed$demog_vars, - c("t_BUN_CREA","eGFR","POTASS"), - bact_transformed$acute_related_vars, - bact_transformed$remaining_vars)),collapse="+"))) +# For redun(), for heavily clumped predictors, linearity must be enforced +remaining_predictors_redun <- bact_variables$remaining_predictors +remaining_predictors_redun[remaining_predictors_redun=="BASO"] <- "I(BASO)" +remaining_predictors_redun[remaining_predictors_redun=="EOS"] <- "I(EOS)" +formula <- as.formula(paste(c("~",paste(c(bact_variables$key_predictors,bact_variables$medimp_predictors, remaining_predictors_redun), collapse="+")), collapse="")) -Hmisc::redun(formula, data=c_bact) -``` +red<-Hmisc::redun(formula, data=c_bact, pr=FALSE) +vif<-1/(1-red$rsq1) -This analysis suggests that HCT, MCH, HGB and MPV may be redundant on top of the other variables. Note that BUN, CREA and WBC were already omitted from this redundancy analysis. +cat("\nAvailable sample size:\n", red$n, " (", round(100*red$n/nrow(c_bact),2), "%)\n") +cat("\nVariance inflation factors:\n") +print(round(vif,2)) + +cat("\nMultiple R-squared:\n") +print(round(red$rsq1,4)) +``` diff --git a/Bact_suppl.qmd b/Bact_suppl.qmd index 494d756..6fed623 100644 --- a/Bact_suppl.qmd +++ b/Bact_suppl.qmd @@ -1,4 +1,6 @@ -# Supplementary Example +# Supplementary Examples: R notebook + +This R notebook includes the full code of the examples presented in the supplement to the paper. ```{r setup, echo = FALSE, message = FALSE, warning = FALSE} library(here) @@ -58,13 +60,13 @@ scaled_brier <- function(model){ ```{r} # define key predictors without pseudo-log trafo ('orig') and transformed ('trans'), replace WBC with WBC_noNEU -key_predictors_orig <- bact_variables$vip_vars %>% +key_predictors_orig <- bact_variables$key_predictors %>% str_replace('WBC', 'WBC_noNEU') -key_predictors_trans <- bact_transformed$vip_vars %>% +key_predictors_trans <- bact_transformed$key_predictors %>% str_replace('WBC', 'WBC_noNEU') # include only complete cases -model_df_complete <- c_bact[c('BC', unique(c(bact_variables$vip_vars, bact_transformed$vip_vars)))] %>% +model_df_complete <- c_bact[c('BC', unique(c(bact_variables$key_predictors, bact_transformed$key_predictors)))] %>% mutate( WBC_noNEU = WBC - NEU ) %>% @@ -80,13 +82,13 @@ pct_excl <- round((1-(dim(model_df_complete)[1] / dim(c_bact)[1])) * 100,1) #pct_complete <- dim(model_df_complete)[1] / dim(c_bact)[1] * 100 ``` -In the following examples we use the Bacteremia data with complete observations regarding the key predictors `r paste(key_predictors_orig, collapse = ', ')`, which represent `r round(pct_complete, 1)`% of the whole dataset. We will fit a global logistic regression model with the outcome 'BC' and the key predictors as covariates. We will use pseudo-log transformations as suggested in the IDA. Within the model, all key predictors will be transformed by fractional polynomials of order 1 (df = 2). +In the following examples we use the Bacteremia data with complete observations regarding the key predictors `r paste(key_predictors_orig, collapse = ', ')`, which represent `r round(100-pct_excl, 1)`% of the whole dataset. We will fit a global logistic regression model with the outcome 'BC' and the key predictors as covariates. We will use pseudo-log transformations as suggested in the IDA. Within the model, all key predictors will be transformed by fractional polynomials of order 1 (df = 2). The aim of the examples is to showcase how decisions derived from IDA influence the results of the fitted model. ## Global Model -The global model will be fitted by the *mfp* function. If not indicated otherwise, we will use the fp-transformations of the key predictors determined in global model in all consecutive models. For all models we report McFaddens's R² and the AUC, i.e. the area under the ROC curve, and boxplots comparing BC predictions with outcomes. +The global model will be fitted by the *mfp* function. If not indicated otherwise, we will use the fp-transformations of the key predictors determined in global model in all consecutive models. For all models we report the scaled Brier score and the AUC, i.e. the area under the ROC curve, and boxplots comparing BC predictions with outcomes. ### Model Summary @@ -96,7 +98,7 @@ global_formula <- paste0('BC ~ ', paste(paste0(paste0('fp(', key_predictors_tran fit_mfp_complete <- mfp(as.formula(global_formula), data = model_df_complete, - family = binomial) + family = binomial, alpha=0.05) # save global formula with fp-trafos global_formula_fp <- paste('BC ~ ', paste0(tidy(fit_mfp_complete)$term[-1], collapse = ' + ')) @@ -190,7 +192,7 @@ p_model_notrans <- logreg_summary_plot(fit_mfp_complete_notrans, 'global model \ p_model_complete + coord_cartesian(ylim = c(0,.8)) + p_model_notrans + theme(axis.title.y = element_blank()) + coord_cartesian(ylim = c(0,.8)) ``` -With regards to McFadden's R² and the AUC, the differences between the two approaches is marginal. +With regards to the scaled Brier score and the AUC, the differences between the two approaches is marginal. Next we will compare the differences of the functional forms in the two models for those covariates where a pseudo-log transformation was suggested in IDA. We will look at the log odds for bacteremia by each covariate on the original and the transformed scale, and compare the global model using the original and the pseudo-log transformed covariates. Each variable is adjusted for the median of all other variables used. @@ -307,8 +309,8 @@ Next we compare the global model to a model were for an important variable, in o m_pct <- .5 -sel_central <- (model_df_complete$Alter > quantile(model_df_complete$Alter, 0.5-m_pct/2)) & - (model_df_complete$Alter < quantile(model_df_complete$Alter, 0.5+m_pct/2)) #needed later +sel_central <- (model_df_complete$AGE > quantile(model_df_complete$AGE, 0.5-m_pct/2)) & + (model_df_complete$AGE < quantile(model_df_complete$AGE, 0.5+m_pct/2)) #needed later set.seed(2) sel_sample <- as.logical(round(runif(dim(model_df_complete)[1]))) # 50% random selection, needed later @@ -380,16 +382,16 @@ tidy(fit_mfp_complete) %>% gt_model_table('global model') ``` -Creatinine ('KREA') is significant at a level that might not survive substantial missingness. We thus create a dataset were we artificially introduce 50% missing creatinine values, missing completely at random. +We pick creatinine ('CREA') as the predictor for which we artificially introduce 50% missingness completely at random. ```{r create missing crea, message = FALSE, warning = FALSE, fig.width = 10, fig.height=6, fig.width=12} -# create 50% missings for t_KREA -set.seed(3) # with seed=3, z-statistic for t_KREA is 1.48 +# create 50% missings for t_CREA +set.seed(3) # with seed=3, z-statistic for t_CREA is 1.48 model_df_missings <- model_df_complete %>% mutate( - t_KREA = ifelse( + t_CREA = ifelse( runif(dim(model_df_complete)[1]) < .5, #~50%/50% TURE/FALSE - t_KREA, + t_CREA, NA ) ) @@ -452,75 +454,23 @@ bind_rows( ``` -The z-statistic for creatinine drops from 2.98 to 1.49 when half the data is missing. Also in other variables the z-statistic is less extreme in the 'missing, complete case analysis' compared to the global model. The interesting observations is that MI recreates estimates and standard errors very close to the global model in most variables, but not in the one that was being imputed, namely creatinine. In variable selection, chreatinine, which is highly significant in the 'true' model, is likely to be dropped, based on the imputed data. ```{r, message = FALSE, warning = FALSE} #pearson & spearman correalation of CREA and BUN -r_pearson <- cor(model_df_complete$t_KREA, model_df_complete$BUN, method = 'pearson') %>% round(3) -r_spearman <- cor(model_df_complete$t_KREA, model_df_complete$BUN, method = 'spearman') %>% round(3) +r_pearson <- cor(model_df_complete$t_CREA, model_df_complete$BUN, method = 'pearson') %>% round(3) +r_spearman <- cor(model_df_complete$t_CREA, model_df_complete$BUN, method = 'spearman') %>% round(3) ``` +The standard error of the regression coefficient of CREA increases from 0.198 to 0.285 when half the data is missing. Also in other variables the standard error increases approximately by a factor of $\sqrt 2$, which is to be expected given that 50% of the data was not used for estimation. The interesting observations is that MI recreates estimates and standard errors very close to the global model in most variables, but not in the imputed variable CREA. Only for variable BUN the standard error of the original, full data model cannot be fully recovered, which can be explained by its correlation with CREA (Pearson: `r_pearson`, Spearman: `r_spearman`. In the complete case analysis CREA is relatively more important than in the analysis with imputed data. -## Example 4: Interpretation of regression coefficient 'size' - -The variables WBC_noNEU and t_WBC_noNEU are on two very different scales: - -```{r ex 4 plots, message = FALSE, warning = FALSE, fig.width=6, fig.height=4} - -p_ex4 <- model_df_complete %>% - select(key_predictors_trans[str_detect(key_predictors_trans, 't_')]) %>% - mutate_all(as.numeric) %>% - pivot_longer(cols = everything()) %>% - ggplot(aes(x = value, group = name)) + - facet_wrap(~name, scales = 'free', strip.position = "bottom") + - geom_histogram(fill = 'firebrick2', color = NA, alpha = 0.5) + - theme_minimal() + - theme(strip.placement = 'outside') - -p_ex4 -# standardized regression coefficients -tidy(fit_mfp_complete) %>% - select(term, estimate) %>% - filter(term != '(Intercept)') - -model_df_complete %>% - summarise( - WBC_noNEU = sd(((t_WBC_noNEU + 0.1)^0.5) * fit_mfp_complete$coefficients[2]), - NEU = sd((t_NEU + 0.1) * fit_mfp_complete$coefficients[3]), - Age = sd((Alter / 100) * fit_mfp_complete$coefficients[4]), - CREA = sd((t_KREA) * fit_mfp_complete$coefficients[5]), - PLT = sd(((PLT + 1) / 100) * fit_mfp_complete$coefficients[6]), - BUN = sd((t_BUN) * fit_mfp_complete$coefficients[7]) - ) %>% - pivot_longer(cols = everything(), names_to = 'variable', values_to = 'standardized beta') %>% - gt %>% - fmt_number( - 2, decimals = 4 - ) - -``` - -Let us recall the two estimates to the covariates WBC_noNEU and t_WBC_noNEU. - -```{r ex 4 estimates, message = FALSE, warning = FALSE} - -bind_rows( - tidy(fit_mfp_complete) %>% select(term, estimate), - tidy(fit_mfp_complete_notrans) %>% select(term, estimate) - ) %>% - filter(str_detect(term, 'WBC')) %>% - gt %>% - fmt_number(2, decimals = 2) - -``` -(Suggestion: show this with models without fp trafo?) +## Example 4: Interpretation of regression coefficient 'size' -Because the fp-transformations further complicate the interpretation of the regression coefficients, let us consider two logisitc regression models with WBC_noNEU and t_WBC_noNEU as single covariate, respectively. +Let's consider two univariable logistic regression models with WBC_noNEU or t_WBC_noNEU as the only covariates. ```{r} @@ -546,12 +496,11 @@ fit_wbc_orig$coefficients[2] %>% round(2) ``` -The estimates `r fit_wbc_orig$coefficients[2] %>% round(2)` and `r fit_wbc_trans$coefficients[2] %>% round(2)` denote the change in log odds for the outcome when the 'term' variable changes by 1 unit, but cannot be compared directly. A $1$ unit change is only a small step on the original scale, where WBC_noNEU covers values from `r range(model_df_complete$WBC_noNEU)[1]` up to `r range(model_df_complete$WBC_noNEU)[2]`. In comparison, t_WBC_noNEU lies between `r round(range(model_df_complete$t_WBC_noNEU)[1],2)` up to `r round(range(model_df_complete$t_WBC_noNEU)[2],2)`, so change of $1$ unit cover almost half the range of the variable. +The estimates `r fit_wbc_orig$coefficients[2] %>% round(2)` and `r fit_wbc_trans$coefficients[2] %>% round(2)` denote the difference in log odds for the outcome associated with a 1 unit difference in WBC_noNEU or t_WBC_noNEU. A 1 unit difference is only a small fraction of the range of WBC_noNEU (`r range(model_df_complete$WBC_noNEU)[1]` to `r range(model_df_complete$WBC_noNEU)[2]`). In comparison, t_WBC_noNEU ranges between `r round(range(model_df_complete$t_WBC_noNEU)[1],2)` and `r round(range(model_df_complete$t_WBC_noNEU)[2],2)`, so difference of 1 unit covers almost half the range of the variable. ## Example 5: Plot of functional form should be resticted to areas with high density -The functional forms have wide confidence intervals when the data is sparse. In presentations of the effects, plots of the functional forms can be limited to areas with high density. In this analysis, PLT was very sparse above ~800 [UNITS], which is reflected in a large confidence interval for high PLT values. In the effect plot PLT values could be limited to values <800 [UNITS]. - +Estimated effects have wide confidence intervals when the data is sparse. When plotting the functional form of an effect, the plotted function can be limited to areas with sufficient predictor support. For example, for t_WBC_noNEU the mfp effect plot differs largely from the effect plot resulting from assuming a linear functional form for that predictor. However, when restricting the plotted area to the range between the 5th and 95th percentile for WBC_noNEU, the deviation of the MFP fit to the linear fit appears small. ```{r} fit_linear_complete <- glm(as.formula(paste0('BC ~ ', paste(key_predictors_trans, collapse = '+'))), @@ -595,7 +544,7 @@ plot_df <- p_ex5 <- plot_df %>% ggplot(aes(x = t_WBC_noNEU, y = yhat, ymin = yhat.lwr, ymax = yhat.upr, color = model, group = model)) + geom_ribbon(alpha = .2, color = NA) + - geom_line(size = 1) + + geom_line(linewidth = 1) + geom_rug( data = fit_mfp_complete$X %>% as.data.frame, aes(x = t_WBC_noNEU), @@ -615,6 +564,7 @@ p_ex5 + (p_ex5 + coord_cartesian(xlim = quantile(model_df_complete$t_WBC_noNEU, ``` +PLT was very sparse above values of ~800 G/L, which is reflected in a large confidence interval for high PLT values. In the effect plot PLT values could be limited to values <800 G/L, where it appears less extreme. diff --git a/Bact_univar.qmd b/Bact_univar.qmd index 2561716..667a0a6 100644 --- a/Bact_univar.qmd +++ b/Bact_univar.qmd @@ -6,76 +6,20 @@ This section reports a series of univariate summary checks of the bacteremia dat library(here) library(tidyverse) library(Hmisc) +library(e1071) # ADD for skewness and kurtosis source(here("R", "ida_plot_univar.R")) ## function to plot univariate summaries. source(here("R", "ida_plot_univar_orig_vs_trans.R")) ## function for side-by-side comparison of original vs. transformed scale, calls ida_plot_univar.R -source(here("R", "ida_trans.R")) ## function to determine transformation (log(x+c) or identity). +source(here("R", "ida_trans.R")) ## function to determine transformation (pseudolog(x, sigma) or identity). ## Load the dataset. load(here::here("data", "bact_env_b.rda")) ``` -## Data set overview -Using the [Hmisc](https://cran.r-project.org/web/packages/Hmisc/) describe function, we provide an overview of the data set. The descriptive report also provides histograms of continuous variables. For ease of scanning the information, we group the report by measurement type. +## U1: Categorical variables -### Demographic variables - -```{r desc_b_bact_1, message = FALSE, results='asis', warning=FALSE, , echo=FALSE} -b_bact %>% - dplyr::select(all_of(demog_vars)) %>% - Hmisc::describe(descript = "Demographic variables") %>% - Hmisc::html(size = 80) -``` - -### Structural covariates and key predictors - -```{r desc_b_bact_2, message = FALSE, results='asis', warning=FALSE, , echo=FALSE} -b_bact %>% - dplyr::select(all_of(c(structural_vars, key_predictors))) %>% - Hmisc::describe(descript = "Structural covariates and key predictors") %>% - Hmisc::html(size = 80) -``` - -### Further variables related to leukocyte types and leukocyte ratios - -```{r desc_b_bact_3, message = FALSE, results='asis', warning=FALSE, echo=FALSE} -b_bact %>% - dplyr::select(all_of(c(leuko_related_vars,leuko_ratio_vars))) %>% - Hmisc::describe(descript = "Leukocyte related variables and leukocyte ratios") %>% - Hmisc::html(size = 80) -``` - -### Kidney function related variables - -```{r desc_b_bact_4, message = FALSE, results='asis', warning=FALSE, echo=FALSE} -b_bact %>% - dplyr::select(all_of(c(kidney_related_vars))) %>% - Hmisc::describe(descript = "Kidney function related variables") %>% - Hmisc::html(size = 80) -``` - -### Acute phase reaction related variables - -```{r desc_b_bact_5, message = FALSE, results='asis', warning=FALSE, echo=FALSE} -b_bact %>% - dplyr::select(all_of(c(acute_related_vars))) %>% - Hmisc::describe(descript = "Acute phase related variables") %>% - Hmisc::html(size = 80) -``` - -### Remaining variables - -```{r desc_b_bact_6, message = FALSE, results='asis', warning=FALSE, echo=FALSE} -b_bact %>% - dplyr::select(all_of(c(remaining_vars))) %>% - Hmisc::describe(descript = "Remaining variables") %>% - Hmisc::html(size = 80) -``` - -## Categorical variables - -We now provide a closer visual examination of the categorical predictors. +SEX and BC (bactermia status) are described by frequencies and proportions in each category. ```{r catplot, message=FALSE, warning =FALSE , echo=FALSE} b_bact %>% @@ -118,19 +62,96 @@ b_bact %>% ``` + ## Continuous variables + +### U2: Univariate distributions of continuous variables + +#### U2: Structural variables + +The only structural continuous variables is AGE. This variable is also a key predictor (see below). + + +#### U2: Key predictors + + +```{r} +unique.variables <- key_predictors +for(j in 1:length(unique.variables)){ + print(ida_plot_univar(b_bact, unique.variables[j], sigma=NA, n_bars=100, transform = FALSE)) +} +``` + +#### U2: Predictors of medium importance + +```{r} +unique.variables <- unique(medimp_predictors) +for(j in 1:length(unique.variables)){ + print(ida_plot_univar(b_bact, unique.variables[j], sigma=NA, n_bars=100, transform = FALSE)) +} +``` + +#### U2: Remaining predictors + +```{r} +unique.variables <- unique(remaining_predictors) +for(j in 1:length(unique.variables)){ + print(ida_plot_univar(b_bact, unique.variables[j], sigma=NA, n_bars=100, transform = FALSE)) +} +``` + +### Numerical summaries + +COMMENT @Mark is there a way to make nicer looking tables out of the numerical summaries?? + +#### Key predictors + +```{r} +unique.variables <- unique(key_predictors) +for(j in 1:length(unique.variables)){ + cat("Numerical summary of ", unique.variables[j],"(",label(b_bact[,unique.variables[j]]), "[", units(b_bact[,unique.variables[j]]), "]):\n\n") + print(main_descriptives(b_bact[,unique.variables[j]])) +} +``` + + + +#### Predictors of medium importance + +```{r} +unique.variables <- unique(medimp_predictors) +for(j in 1:length(unique.variables)){ + cat("Numerical summary of ", unique.variables[j],"(",label(b_bact[,unique.variables[j]]), "[", units(b_bact[,unique.variables[j]]), "]):\n\n") + print(main_descriptives(b_bact[,unique.variables[j]])) +} +``` + +#### Remaining predictors + +```{r} +unique.variables <- unique(remaining_predictors) +for(j in 1:length(unique.variables)){ + cat("Numerical summary of ", unique.variables[j],"(",label(b_bact[,unique.variables[j]]), "[", units(b_bact[,unique.variables[j]]), "]):\n\n") + print(main_descriptives(b_bact[,unique.variables[j]])) +} +``` + + + ### Suggested transformations -Next we investigate whether a transformation of continuous variables may improve any further analyses to reduce disproportional impact of highly influential points, also in multivariate summaries. We employ a function `ida_trans` for this purpose, which optimises the parameter `sigma` of the pseudo-logarithm for that purpose. The optimization targets the best possible linear correlation of the transformed values with normal deviates. If no better transformation can be found, or if the improvement in correlation is less than 0.1 correlation units, no transformation is suggested. +Next we investigate whether a pseudolog transformation of continuous variables may substantially symmetrize the univariate distributions of the continuous variables, and may hence be useful for multivariate summaries. We employ a function `ida_trans` for this purpose, which optimises the parameter `sigma` of the pseudo-logarithm for that purpose. The optimization targets the best possible linear correlation of the transformed values with normal deviates. If no better transformation can be found, or if the improvement in correlation is less than 0.2 correlation units, no transformation is suggested. ```{r} -variables<- c("AGE", structural_vars, key_predictors, leuko_related_vars, leuko_ratio_vars, kidney_related_vars, acute_related_vars, remaining_vars) +variables<- c(structural_vars, key_predictors, medimp_predictors, remaining_predictors) unique.variables <- unique(variables) res<-sapply(unique.variables, function(X) ida_trans(b_bact[,X], equ.marg=0.2)$const) #takes long, calculate once, and save? res -mean(!is.na(res)) +cat("Predictors where a transformation may symmetrize the distribution:\n") +cat("Number: ", sum(!is.na(res)), "\n") +cat("Proportion: ", mean(!is.na(res)), "\n") ``` Register transformed variables in the data set: @@ -166,16 +187,7 @@ for(j in 1:length(bact_variables)){ ``` -### Univariate distribution with variables using the original variable and the suggested transformations -```{r} -for(j in 1:length(unique.variables)){ - print(ida_plot_univar(b_bact, unique.variables[j], sigma=res[j], n_bars=100)) -# if(!is.na(res[j])){ -# print(ida_plot_univar(b_bact, paste("t_",variables[j],sep=""))) -# } -} -``` ### Comparison of univariate distributions with and without pseudo-log transformation @@ -184,10 +196,12 @@ The comparison is only shown for variables where a transformation is suggested. ```{r, fig.width=12} for(j in 1:length(unique.variables)){ # print(ida_plot_univar_orig_vs_trans(b_bact, unique.variables[j], sigma=res[j], n_bars=100)) - if(!is.na(res[j])){ - print(ida_plot_univar_orig_vs_trans(b_bact, unique.variables[j], sigma=res[j], n_bars=100)) + if(!is.na(sigma_values[j])){ + print(ida_plot_univar_orig_vs_trans(c_bact, unique.variables[j], sigma=sigma_values[j], n_bars=100)) } } + +## COMMENT @Mark we must pick two from these plots for the paper - the old version has BUN which is now no longer transformed (as we raised the threshold for indicating relevance of a pseudolog transformation) I would suggest ASAT and GGT. Can you please produce a TIF plot for these two (similar to what we already have in the paper?) ``` ```{r} @@ -195,29 +209,7 @@ save(list=c("c_bact", "bact_variables", "sigma_values", "bact_transformed"), file=here::here("data", "bact_env_c.rda")) ``` -### Univariate distribution with variables using only the original variable without the suggested transformations -```{r} -for(j in 1:length(unique.variables)){ - print(ida_plot_univar(b_bact, unique.variables[j], sigma=res[j], n_bars=100, transform = FALSE)) -# if(!is.na(res[j])){ -# print(ida_plot_univar(b_bact, paste("t_",variables[j],sep=""))) -# } -} -``` - -### Comparison of univariate distributions with and without pseudo-log transformation - -The comparison is only shown for variables where a transformation is suggested. - -```{r, fig.width=12} -for(j in 1:length(unique.variables)){ -# print(ida_plot_univar_orig_vs_trans(b_bact, unique.variables[j], sigma=res[j], n_bars=100)) - if(!is.na(res[j])){ - print(ida_plot_univar_orig_vs_trans(b_bact, unique.variables[j], sigma=res[j], n_bars=100)) - } -} -``` ## Section session info diff --git a/Pseudo_log_explainer.qmd b/Pseudo_log_explainer.qmd index f1d2354..6b98447 100644 --- a/Pseudo_log_explainer.qmd +++ b/Pseudo_log_explainer.qmd @@ -1,6 +1,6 @@ --- title: "Pseudo-log transformations" -author: "GH" +author: "Georg Heinze, Mark Baillie and Marianne Huebner" date: "2022-08-08" output: html_document --- @@ -11,7 +11,7 @@ knitr::opts_chunk$set(echo = TRUE) ## Introduction -This side document illustrates how pseudo-log transformations can be used to transform skewed distributions towards normality. +This supplemental section illustrates how pseudo-log transformations can be used to transform skewed distributions towards normality. The transformation $f(x) = sinh^-1(x/2\sigma)/log(10)$ is a pseudo-logarithmic transformation mentioned by Johnson (1949). It has the following advantages over ordinary logarithmic transformations: diff --git a/R/main_descriptives.R b/R/main_descriptives.R new file mode 100644 index 0000000..0c01532 --- /dev/null +++ b/R/main_descriptives.R @@ -0,0 +1,27 @@ + + +main_descriptives<-function(x, quantiles=c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95), moments=4, distinct=TRUE, most_frequent=5){ + ran <- c(min(x, na.rm=T), max(x, na.rm=T)) + ran <- c(ran, diff(ran)) + names(ran) <-c("minimum", "maximum", "range") + if(!is.null(quantiles)) quan <- quantile(x, quantiles, na.rm=TRUE) + else quan <- NULL + if(!is.null(moments)) { + meanx <- mean(x) + mom <- c(mean=mean(x, na.rm=T), sd=sd(x, na.rm=T), skewness=skewness(x, na.rm=T), kurtosis=kurtosis(x, na.rm=T)) + } else mom <- NULL + if(distinct) dis <- length(unique(x)) + else dis <- NULL + if(!is.null(most_frequent)){ + tabx <- table(x) + mostfreq <- tail(sort(tabx),most_frequent)[most_frequent:1] + concrat <- mostfreq[1]/mean(table(x)) + } else { + mostfreq <- NULL + concrat <- NULL + } + + + + list(quantiles=quan, range=ran, moments=mom, distinct_values=dis, most_frequent=mostfreq, concentration_ratio = concrat) +} \ No newline at end of file diff --git a/_quarto.yml b/_quarto.yml index 258e3c9..ff9a0d9 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -8,19 +8,21 @@ execute: book: title: "Regression without regrets" - author: "S. Hödlmoser, M. Baillie, G. Heinze & M. Huebner" - date: "7/13/2022" + author: "M. Baillie, G. Heinze & M. Huebner" ## changed + date: "11/14/2022" ## changed chapters: - - index.qmd - - IDA_framework.qmd - - scope.qmd - - data_screen.qmd - - Bact_intro.qmd - - Bact_SAP.qmd - - Bact_missing.qmd - - Pseudo_log_explainer.qmd + - index.qmd ## REMOVE + - IDA_framework.qmd ## REMOVE + - scope.qmd ## REMOVE + - data_screen.qmd ## REMOVE + - Bact_intro.qmd ## KEEP + - Bact_IDA_plan.qmd ## KEEP + - Bact_missing.qmd ## KEEP + - Pseudo_log_explainer.qmd ## KEEP - Bact_univar.qmd ## long running jobs - use ARd or freeze - - Bact_multivar.qmd + - Bact_multivar.qmd ## KEEP + - Bact_suppl.qmd ## ADD + - Pseudo_log_explainer.qmd ## ADD - references.qmd bibliography: references.bib diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..cfcd733 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +bact_env_c.rda diff --git a/data/a_bact.rda b/data/a_bact.rda index 55cff6e..b31ceea 100644 Binary files a/data/a_bact.rda and b/data/a_bact.rda differ diff --git a/data/bact_env_b.rda b/data/bact_env_b.rda index 4787628..0371019 100644 Binary files a/data/bact_env_b.rda and b/data/bact_env_b.rda differ diff --git a/data/bact_env_c.rda b/data/bact_env_c.rda index a6a13f1..05c4cb7 100644 Binary files a/data/bact_env_c.rda and b/data/bact_env_c.rda differ