Title: | Functions for the MSK Biostatistics Department |
---|---|
Description: | A miscellaneous collection of functions to used by members of the Biostatistics Department at MSKCC. |
Authors: | Karissa Whiting [aut, cre] , Daniel D. Sjoberg [aut, ccp] , Jessica A. Lavery [aut], Margie Hannum [ctb] , Meier Hsu [ctb], Caroline Kostrzewa [ctb], Jasme Lee [ctb], Amy Tin [ctb] , Emily Vertosick [ctb], Gustavo Zapata Wainberg [ctb] , Christine Zhou [ctb], Stephanie Lobaugh [ctb] |
Maintainer: | Karissa Whiting <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.5.1 |
Built: | 2024-10-31 22:11:25 UTC |
Source: | https://github.com/ddsjoberg/bstfun |
cuminc()
plotPlot cumulative incidence estimates with a risk table and estimates below the figure.
add_cuminc_risktable( cuminc, survfit, timepts, lg, numgrps, line = 3, at = -1, col.list = 1 )
add_cuminc_risktable( cuminc, survfit, timepts, lg, numgrps, line = 3, at = -1, col.list = 1 )
cuminc |
|
survfit |
|
timepts |
a numeric vector of time points of the estimates to display. E.g. c(0,1,2,3,4,5) or seq(0,12,by=2) |
lg |
legend label for each cumulative incidence curve to be displayed. E.g. c("Male", "Female") |
numgrps |
the number of groups of the stratification variable: 1 is no stratification, can stratify up to 3 groups |
line |
to adjust position of risk table. A lower value will shift table up, a larger value will shift table down; default is 3 |
at |
to adjust position of left margin. Default is -1. |
col.list |
list of colors for legend text Should match the colors of plot legend. Default is 1 (black). |
Meier Hsu
library(cmprsk) library(survival) data(pbc) # recode time pbc$time.y <- pbc$time / 365.25 # recode status- switch competing events pbc$status2 <- ifelse(pbc$status > 0, ifelse(pbc$status == 1, 2, 1), 0) # recode stage for 3 groups pbc$stage.3g <- ifelse(pbc$stage %in% c(1,2), "1-2", as.character(pbc$stage)) # Example 1 ------------------------------------- # CIR and KM for no strata cif1 <- cuminc(ftime = pbc$time.y, fstatus = pbc$status2) km1 <- survfit(Surv(pbc$time.y, pbc$status2 == 1) ~ 1) # Plot and add risk table for no strata (numgrps=1) windows(5, 5) par(mfrow = c(1, 1), mar = c(12.5, 5.7, 2, 2), mgp = c(2, 0.65, 0)) plot(cif1, curvlab = c("recurred", "died"), xlim = c(0, 12), xaxt = "n") axis(1, at = seq(0, 12, 3)) add_cuminc_risktable(cif1, km1, timepts = seq(0, 12, 3), lg = "", numgrps = 1) # Example 2 ------------------------------------- cif2 <- cuminc(ftime = pbc$time.y, fstatus = pbc$status2, group = pbc$sex) km2 <- survfit(Surv(pbc$time.y, pbc$status2 == 1) ~ pbc$sex) # Plot and add risk table for 2 groups (numgrps=2) windows(5, 5) par(mfrow = c(1, 1), mar = c(12.5, 5.7, 2, 2), mgp = c(2, 0.65, 0)) plot(cif2, curvlab = c("male", "female", "", ""), lty = c(1, 2, 0, 0), xlim = c(0, 12), xaxt = "n", col = c(1, 2, 0, 0)) axis(1, at = seq(0, 12, 3)) add_cuminc_risktable(cif2, km2, timepts = seq(0, 12, 3), lg = c("male", "female"), numgrps = 2, col.list = c(1,2)) # Example 3 ------------------------------------- cif3 <- cuminc(ftime = pbc$time.y, fstatus = pbc$status2, group = pbc$stage.3g) km3 <- survfit(Surv(pbc$time.y, pbc$status2 == 1) ~ pbc$stage.3g) windows(6,6) par(mfrow = c(1, 1), mar = c(14, 5.7, 2, 2), # change bottom margin mgp = c(2, 0.65, 0)) plot(cif3, curvlab = c("1-2", "3", "4", rep("",3)), lty = c(1, 2, 3, rep(0, 3)), xlim = c(0, 12), xaxt = "n", col = c(1, 2, 4, rep(0, 3))) axis(1, at = seq(0, 12, 3)) add_cuminc_risktable(cif3, survfit = km3, timepts = seq(0, 12, 3), lg = c("1-2", "3", "4"), numgrps = 3, col.list = c(1,2,4))
library(cmprsk) library(survival) data(pbc) # recode time pbc$time.y <- pbc$time / 365.25 # recode status- switch competing events pbc$status2 <- ifelse(pbc$status > 0, ifelse(pbc$status == 1, 2, 1), 0) # recode stage for 3 groups pbc$stage.3g <- ifelse(pbc$stage %in% c(1,2), "1-2", as.character(pbc$stage)) # Example 1 ------------------------------------- # CIR and KM for no strata cif1 <- cuminc(ftime = pbc$time.y, fstatus = pbc$status2) km1 <- survfit(Surv(pbc$time.y, pbc$status2 == 1) ~ 1) # Plot and add risk table for no strata (numgrps=1) windows(5, 5) par(mfrow = c(1, 1), mar = c(12.5, 5.7, 2, 2), mgp = c(2, 0.65, 0)) plot(cif1, curvlab = c("recurred", "died"), xlim = c(0, 12), xaxt = "n") axis(1, at = seq(0, 12, 3)) add_cuminc_risktable(cif1, km1, timepts = seq(0, 12, 3), lg = "", numgrps = 1) # Example 2 ------------------------------------- cif2 <- cuminc(ftime = pbc$time.y, fstatus = pbc$status2, group = pbc$sex) km2 <- survfit(Surv(pbc$time.y, pbc$status2 == 1) ~ pbc$sex) # Plot and add risk table for 2 groups (numgrps=2) windows(5, 5) par(mfrow = c(1, 1), mar = c(12.5, 5.7, 2, 2), mgp = c(2, 0.65, 0)) plot(cif2, curvlab = c("male", "female", "", ""), lty = c(1, 2, 0, 0), xlim = c(0, 12), xaxt = "n", col = c(1, 2, 0, 0)) axis(1, at = seq(0, 12, 3)) add_cuminc_risktable(cif2, km2, timepts = seq(0, 12, 3), lg = c("male", "female"), numgrps = 2, col.list = c(1,2)) # Example 3 ------------------------------------- cif3 <- cuminc(ftime = pbc$time.y, fstatus = pbc$status2, group = pbc$stage.3g) km3 <- survfit(Surv(pbc$time.y, pbc$status2 == 1) ~ pbc$stage.3g) windows(6,6) par(mfrow = c(1, 1), mar = c(14, 5.7, 2, 2), # change bottom margin mgp = c(2, 0.65, 0)) plot(cif3, curvlab = c("1-2", "3", "4", rep("",3)), lty = c(1, 2, 3, rep(0, 3)), xlim = c(0, 12), xaxt = "n", col = c(1, 2, 4, rep(0, 3))) axis(1, at = seq(0, 12, 3)) add_cuminc_risktable(cif3, survfit = km3, timepts = seq(0, 12, 3), lg = c("1-2", "3", "4"), numgrps = 3, col.list = c(1,2,4))
This function works with HTML output from the gt package only. Adds an in-line forest plot to a summary table.
add_inline_forest_plot( x, header = "**Forest Plot**", spec_pointrange.args = NULL )
add_inline_forest_plot( x, header = "**Forest Plot**", spec_pointrange.args = NULL )
x |
a gtsummary object |
header |
string indicating column header of new forest plot column.
Default is |
spec_pointrange.args |
named list of arguments that will be passed to
|
Estimates from tbl_regression()
and tbl_uvregression()
that have
been exponentiated are shown on the log scale.
gtsummary object
Other gtsummary-related functions:
add_sparkline()
,
as_ggplot()
,
bold_italicize_group_labels()
,
logistic_reg_adj_diff()
,
style_tbl_compact()
,
tbl_likert()
,
theme_gtsummary_msk()
library(gtsummary) # Example 1 ---------------------------------- add_inline_forest_plot_ex1 <- lm(mpg ~ cyl + am + drat, mtcars) %>% tbl_regression() %>% add_inline_forest_plot()
library(gtsummary) # Example 1 ---------------------------------- add_inline_forest_plot_ex1 <- lm(mpg ~ cyl + am + drat, mtcars) %>% tbl_regression() %>% add_inline_forest_plot()
This function wraps gtExtras::gt_plt_dist()
and adds a new column
illustrating the distribution of a continuous variable. This function converts
the gtsummary table into a gt table.
add_sparkline( x, type = c("boxplot", "histogram", "rug_strip", "density", "sparkline"), column_header = NULL, same_limit = FALSE, ... )
add_sparkline( x, type = c("boxplot", "histogram", "rug_strip", "density", "sparkline"), column_header = NULL, same_limit = FALSE, ... )
x |
'tbl_summary' object |
type |
sparkline type. Must be one of |
column_header |
string indicating column header |
same_limit |
A logical indicating that the plots will use the same axis range ( |
... |
Arguments passed on to
|
a gt table
Example 1
Other gtsummary-related functions:
add_inline_forest_plot()
,
as_ggplot()
,
bold_italicize_group_labels()
,
logistic_reg_adj_diff()
,
style_tbl_compact()
,
tbl_likert()
,
theme_gtsummary_msk()
library(gtsummary) add_sparkline_ex1 <- trial %>% select(age, marker) %>% tbl_summary(missing = "no") %>% add_sparkline()
library(gtsummary) add_sparkline_ex1 <- trial %>% select(age, marker) %>% tbl_summary(missing = "no") %>% add_sparkline()
Adds spline terms calculated via Hmisc::rcspline.eval()
to a data frame.
add_splines(data, variable, knots = NULL, nk = 5, norm = 2, new_names = NULL)
add_splines(data, variable, knots = NULL, nk = 5, norm = 2, new_names = NULL)
data |
a data frame |
variable |
name of column in data |
knots |
knot locations. If not given, knots will be estimated using default
quantiles of |
nk |
number of knots. Default is 5. The minimum value is 3. |
norm |
‘0’ to use the terms as originally given by Devlin and
Weeks (1986), ‘1’ to normalize non-linear terms by the cube
of the spacing between the last two knots, ‘2’ to normalize by
the square of the spacing between the first and last knots (the
default). |
new_names |
Optionally specify names of new spline columns |
data frame
Knot locations are returned in attr(data[[variable]], "knots")
trial %>% add_splines(age)
trial %>% add_splines(age)
Some data are inherently grouped, and should be reported together. For example, one person likely belongs to multiple racial groups and the results of these tabulations belong in a grouped portion of a summary table.
Grouped variables are all indented together. The label row is a single indent, and the other rows are double indented.
add_variable_grouping(x, ...)
add_variable_grouping(x, ...)
x |
a gtsummary table |
... |
named arguments. The name is the group label that will be inserted into the table. The values are character names of variables that will be grouped |
a gtsummary table
While the returned table is the same class as the input, it does not follow the structure expected in other gtsummary functions that accept these objects: errors may occur.
Example 1
set.seed(11234) add_variable_grouping_ex1 <- data.frame( race_asian = sample(c(TRUE, FALSE), 20, replace = TRUE), race_black = sample(c(TRUE, FALSE), 20, replace = TRUE), race_white = sample(c(TRUE, FALSE), 20, replace = TRUE), age = rnorm(20, mean = 50, sd = 10) ) %>% gtsummary::tbl_summary( label = list(race_asian = "Asian", race_black = "Black", race_white = "White", age = "Age") ) %>% add_variable_grouping( "Race (check all that apply)" = c("race_asian", "race_black", "race_white") )
set.seed(11234) add_variable_grouping_ex1 <- data.frame( race_asian = sample(c(TRUE, FALSE), 20, replace = TRUE), race_black = sample(c(TRUE, FALSE), 20, replace = TRUE), race_white = sample(c(TRUE, FALSE), 20, replace = TRUE), age = rnorm(20, mean = 50, sd = 10) ) %>% gtsummary::tbl_summary( label = list(race_asian = "Asian", race_black = "Black", race_white = "White", age = "Age") ) %>% add_variable_grouping( "Race (check all that apply)" = c("race_asian", "race_black", "race_white") )
This function takes a gtsummary table and converts it to a
forest plot using forestplot::forestplot()
.
as_forest_plot( x, col_names = c("estimate", "ci", "p.value"), graph.pos = 2, boxsize = 0.3, title_line_color = "darkblue", xlog = x$inputs$exponentiate, ... )
as_forest_plot( x, col_names = c("estimate", "ci", "p.value"), graph.pos = 2, boxsize = 0.3, title_line_color = "darkblue", xlog = x$inputs$exponentiate, ... )
x |
a gtsummary object of class |
col_names |
names of columns in |
graph.pos |
The position of the graph element within the table of text. The
position can be |
boxsize |
Override the default box size based on precision |
title_line_color |
color of line that appears above forest plot.
Default is |
xlog |
If TRUE, x-axis tick marks are to follow a logarithmic scale, e.g. for
logistic regression (OR), survival estimates (HR), Poisson regression etc.
Note: This is an intentional break with the original |
... |
arguments passed to |
Christine Zhou
library(gtsummary) library(survival) # Example 1 ---------------------------------- tbl_uvregression( trial[c("response", "age", "grade")], method = glm, y = response, method.args = list(family = binomial), exponentiate = TRUE ) %>% as_forest_plot() # Example 2 ------------------------------------ tbl <- coxph(Surv(ttdeath, death) ~ age + marker, trial) %>% tbl_regression(exponentiate = TRUE) %>% add_n() as_forest_plot(tbl, col_names = c("stat_n", "estimate", "ci", "p.value")) # Example 3 ---------------------------------- tbl %>% modify_cols_merge( pattern = "{estimate} ({ci})", rows = !is.na(estimate) ) %>% modify_header(estimate = "HR (95% CI)") %>% as_forest_plot( col_names = c("estimate", "p.value"), boxsize = 0.2, col = forestplot::fpColors(box = "darkred") )
library(gtsummary) library(survival) # Example 1 ---------------------------------- tbl_uvregression( trial[c("response", "age", "grade")], method = glm, y = response, method.args = list(family = binomial), exponentiate = TRUE ) %>% as_forest_plot() # Example 2 ------------------------------------ tbl <- coxph(Surv(ttdeath, death) ~ age + marker, trial) %>% tbl_regression(exponentiate = TRUE) %>% add_n() as_forest_plot(tbl, col_names = c("stat_n", "estimate", "ci", "p.value")) # Example 3 ---------------------------------- tbl %>% modify_cols_merge( pattern = "{estimate} ({ci})", rows = !is.na(estimate) ) %>% modify_header(estimate = "HR (95% CI)") %>% as_forest_plot( col_names = c("estimate", "p.value"), boxsize = 0.2, col = forestplot::fpColors(box = "darkred") )
useful when you want to place a ggplot and gt table side-by-side. To use this function you must install the magick R package AND system program (see https://docs.ropensci.org/magick/articles/intro.html#installing-magick-1)
as_ggplot(x, ...)
as_ggplot(x, ...)
x |
gt or gtsummary table |
... |
arguments passed to |
a ggplot object
Other gtsummary-related functions:
add_inline_forest_plot()
,
add_sparkline()
,
bold_italicize_group_labels()
,
logistic_reg_adj_diff()
,
style_tbl_compact()
,
tbl_likert()
,
theme_gtsummary_msk()
library(gtsummary) library(ggplot2) library(patchwork) # # convert gtsummary table to ggplot # tbl <- # trial %>% # select(age, marker, trt) %>% # tbl_summary(by = trt, missing = "no") %>% # as_ggplot() # # # create basic ggplot # gg <- # trial %>% # ggplot(aes(x = age, y = marker, color = trt)) + # geom_point() # # # stack tables using patchwork # gg / tbl
library(gtsummary) library(ggplot2) library(patchwork) # # convert gtsummary table to ggplot # tbl <- # trial %>% # select(age, marker, trt) %>% # tbl_summary(by = trt, missing = "no") %>% # as_ggplot() # # # create basic ggplot # gg <- # trial %>% # ggplot(aes(x = age, y = marker, color = trt)) + # geom_point() # # # stack tables using patchwork # gg / tbl
Given a data set that has a measure collected over time and you want to extract, for example the 3 month measurement, this function will find the measure closest to 3 months within a defined window.
assign_timepoint( data, id, ref_date, measure_date, timepoints, windows, time_units = c("days", "weeks", "months", "years"), new_var = "timepoint", keep_all_obs = FALSE, keep_all_vars = TRUE )
assign_timepoint( data, id, ref_date, measure_date, timepoints, windows, time_units = c("days", "weeks", "months", "years"), new_var = "timepoint", keep_all_obs = FALSE, keep_all_vars = TRUE )
data |
data frame |
id |
id variable name, such as |
ref_date |
baseline or reference date column name |
measure_date |
date the measure was collected |
timepoints |
vector of time point to identify |
windows |
list of windows around a time point that are acceptable |
time_units |
one of |
new_var |
name of new variable, default is |
keep_all_obs |
logical indicating whether to return a data frame with only the assigned time points (default), or to return a data frame with all rows. |
keep_all_vars |
logical indicating whether to return a data frame
with all the variables in |
data frame passed in data
with additional column new_var
ggplot2::economics_long %>% dplyr::group_by(variable) %>% dplyr::mutate(min_date = min(date)) %>% dplyr::ungroup() %>% assign_timepoint( id = variable, ref_date = min_date, measure_date = date, timepoints = c(6, 12, 24), windows = list(c(-2, 2), c(-2, 2), c(-2, 2)), time_units = "months" )
ggplot2::economics_long %>% dplyr::group_by(variable) %>% dplyr::mutate(min_date = min(date)) %>% dplyr::ungroup() %>% assign_timepoint( id = variable, ref_date = min_date, measure_date = date, timepoints = c(6, 12, 24), windows = list(c(-2, 2), c(-2, 2), c(-2, 2)), time_units = "months" )
Provided a distribution of risk in a population, this function calculates the exact AUC of a model that produces the risk estimates. For example, a logistic regression model built with a normal linear predictor yields logit-normal distributed predicted risks. The AUC from the logistic regression model is the same as the AUC estimated from the distribution of the predicted risks, independent of the outcome. This method for AUC calculation is useful for simulation studies where the predicted risks are a mixture of two distributions. The exact prevalence of the outcome can easily be calculated, along with the exact AUC of the model.
auc_density(density, cut.points = seq(from = 0, to = 1, by = 0.001), ...) auc_histogram(x)
auc_density(density, cut.points = seq(from = 0, to = 1, by = 0.001), ...) auc_histogram(x)
density |
a function name that describes the continuous probability density function of the risk from 0 to 1. |
cut.points |
sequence of points in [0, 1] where the sensitivity and specificity are calculated. More points lead to a more precise estimate of the AUC. Default is seq(from = 0, to = 1,by = 0.001). |
... |
arguments for the function specified in density. For example, dbeta(x, shape1=1, shape2=1) has need for two additional arguments to specify the density function (shape1 and shape2). |
x |
histogram object from graphics::hist |
Returns a list sensitivity and specificity at each cut point, the expected value or mean risk, and the AUC associated with the distribution.
Daniel D Sjoberg
auc_density(density = dbeta, shape1 = 1, shape2 = 1) runif(10000) %>% hist(breaks = 250) %>% auc_histogram()
auc_density(density = dbeta, shape1 = 1, shape2 = 1) runif(10000) %>% hist(breaks = 250) %>% auc_histogram()
Set bold and/or italic style for groups labels in stacked tables
bold_italicize_group_labels( x, bold = FALSE, italics = FALSE, print_engine = c("gt", "flextable", "huxtable") )
bold_italicize_group_labels( x, bold = FALSE, italics = FALSE, print_engine = c("gt", "flextable", "huxtable") )
x |
a gtsummary stacked table |
bold |
logical indicating whether to bold the group header rows |
italics |
logical indicating whether to italicize the group header rows |
print_engine |
Choose a print engine to render the table, must be one of
|
A table of class selected in print_engine. Of note, the output will no longer be a gtsummary table.
Example 1
Other gtsummary-related functions:
add_inline_forest_plot()
,
add_sparkline()
,
as_ggplot()
,
logistic_reg_adj_diff()
,
style_tbl_compact()
,
tbl_likert()
,
theme_gtsummary_msk()
library(gtsummary) bold_italicize_group_labels_ex1 <- trial %>% select(age, trt, grade) %>% tbl_strata( strata = grade, ~ .x %>% tbl_summary(by = trt, missing = "no"), .combine_with = "tbl_stack" ) %>% bold_italicize_group_labels(bold = TRUE)
library(gtsummary) bold_italicize_group_labels_ex1 <- trial %>% select(age, trt, grade) %>% tbl_strata( strata = grade, ~ .x %>% tbl_summary(by = trt, missing = "no"), .combine_with = "tbl_stack" ) %>% bold_italicize_group_labels(bold = TRUE)
Add citations to R and R packages in an R Markdown report.
cite_r(pkgs = c("tidyverse", "gtsummary"), add_citations = TRUE)
cite_r(pkgs = c("tidyverse", "gtsummary"), add_citations = TRUE)
pkgs |
character vector of package names to cite. Default is
|
add_citations |
logical indicating whether to include the bibtex citations
of R and the packages. Default is |
Below is an example how the cite_r()
function would be used in an R
Markdown report.
--- output: html_document bibliography: references.bib --- Analyses were conducted with `r bstfun::cite_r(pkgs = "tidyverse")`.
This assumes there is a bib file of references called references.bib
that
contain the following entries.
@Manual{r, title = {R: A Language and Environment for Statistical Computing}, author = {{R Core Team}}, organization = {R Foundation for Statistical Computing}, address = {Vienna, Austria}, year = {2021}, url = {https://www.R-project.org/}, } @Article{tidyverse, title = {Welcome to the {tidyverse}}, author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and #' Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton #' Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis #' Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani}, year = {2019}, journal = {Journal of Open Source Software}, volume = {4}, number = {43}, pages = {1686}, doi = {10.21105/joss.01686}, }
# cite R and the tidyverse cite_r(pkgs = "tidyverse") # cite R and the tidyverse, but text only cite_r(pkgs = "tidyverse", add_citations = FALSE) # only cite R cite_r(pkgs = NULL)
# cite R and the tidyverse cite_r(pkgs = "tidyverse") # cite R and the tidyverse, but text only cite_r(pkgs = "tidyverse", add_citations = FALSE) # only cite R cite_r(pkgs = NULL)
An MRN follows specific rules
Must be character
Must contain only numeric components
Must be eight characters long and include leading zeros.
This function converts numeric MRNs to character and ensures it follows MRN conventions. Character MRNs can also be passed, and leading zeros will be appended and checked for consistency.
clean_mrn(x, allow_na = FALSE, check_unique = FALSE)
clean_mrn(x, allow_na = FALSE, check_unique = FALSE)
x |
vector to be converted and checked to MRN |
allow_na |
logical indicating whether |
check_unique |
Check if MRNs are unique |
character MRN vector
1000:1001 %>% clean_mrn()
1000:1001 %>% clean_mrn()
Function assists in checking the values of new, derived variables against the raw, source variables.
count_map(data, ...)
count_map(data, ...)
data |
data frame |
... |
sets of variables to check. variables that are checked together are included in the same vector. See example below. |
count_map( mtcars, c(cyl, am), c(gear, carb) )
count_map( mtcars, c(cyl, am), c(gear, carb) )
Pass a data frame and the missing pattern of all columns in the data frame. The data frame is returned unmodified.
count_na(data, include = NULL, exclude = NULL)
count_na(data, include = NULL, exclude = NULL)
data |
data frame |
include |
character vector of names to include |
exclude |
character vector of names to exclude |
original data frame invisibly returned
trial %>% count_na()
trial %>% count_na()
Creates a directory with the essential files for a new project. The function can be used on existing project directories as well.
The folder name should be structured as "<PI Last Name> <Short Description>"
,
e.g. "Sjoberg MRI detects Path Stage after Surgery"
. PI last name
is used in file naming.
create_bst_project(path, path_data = NULL, git = NA, ...)
create_bst_project(path, path_data = NULL, git = NA, ...)
path |
A path. If it exists, it is used. If it does not exist, it is created. |
path_data |
A path. The directory where the secure data exist. Default is
|
git |
Logical indicating whether to create Git repository. Default is |
... |
Arguments passed on to
|
# specifying project folder location (folder does not yet exist) project_path <- fs::path(tempdir(), "My Project Folder") # creating folder where secure data would be stored (typically will be a network drive) secure_data_path <- fs::path(tempdir(), "secure_data") dir.create(secure_data_path) # creating new project folder create_bst_project(project_path, path_data = secure_data_path)
# specifying project folder location (folder does not yet exist) project_path <- fs::path(tempdir(), "My Project Folder") # creating folder where secure data would be stored (typically will be a network drive) secure_data_path <- fs::path(tempdir(), "secure_data") dir.create(secure_data_path) # creating new project folder create_bst_project(project_path, path_data = secure_data_path)
Calculate eGFR
egfr_ckdepi(creatinine, age, female, aa, label = "eGFR, mL/min/1.73m²") egfr_mdrd(creatinine, age, female, aa, label = "eGFR, mL/min/1.73m²")
egfr_ckdepi(creatinine, age, female, aa, label = "eGFR, mL/min/1.73m²") egfr_mdrd(creatinine, age, female, aa, label = "eGFR, mL/min/1.73m²")
creatinine |
serum creatinine level in mg/dL |
age |
patient age |
female |
logical indicating whether patient is female |
aa |
logical indicating whether patient is African-American |
label |
label that will be applied to result,
e.g. |
numeric vector
egfr_mdrd(creatinine = 1.2, age = 60, female = TRUE, aa = TRUE) egfr_ckdepi(creatinine = 1.2, age = 60, female = TRUE, aa = TRUE)
egfr_mdrd(creatinine = 1.2, age = 60, female = TRUE, aa = TRUE) egfr_ckdepi(creatinine = 1.2, age = 60, female = TRUE, aa = TRUE)
Use this function to make updates to your data while avoiding adding PHI,
such as MRNs, to your code and subsequently PHI in GitHub. You provide a file
of database fixes that is three columns: 1. An expression that selects a line
in the database to update (e.g. MRN == "12345678"
), 2. The column name that
will be updated, and 3. The updated value. See the examples for the
structure of the database fix input.
fix_database_error(data, engine = readr::read_csv, ...)
fix_database_error(data, engine = readr::read_csv, ...)
data |
data frame with errors |
engine |
function to import file of database fixes |
... |
arguments passed to the engine function to import the database fixes |
updated data frame
df_fixes <- tibble::tribble( ~id, ~variable, ~value, "id == 1", "age", "56", "id == 2", "trt", "Drug C" ) trial %>% dplyr::mutate(id = dplyr::row_number()) %>% fix_database_error( engine = I, x = df_fixes )
df_fixes <- tibble::tribble( ~id, ~variable, ~value, "id == 1", "age", "56", "id == 2", "trt", "Drug C" ) trial %>% dplyr::mutate(id = dplyr::row_number()) %>% fix_database_error( engine = I, x = df_fixes )
Function accepts survival::Surv()
object, extracts the follow-up
times among the censored observations, and returns the requested
summary statistics. Use this function to report follow-up in-line in
R Markdown reports.
followup_time( Surv, data = NULL, pattern = "{median} (IQR {p25}, {p75})", style_fun = gtsummary::style_sigfig )
followup_time( Surv, data = NULL, pattern = "{median} (IQR {p25}, {p75})", style_fun = gtsummary::style_sigfig )
Surv |
An object of class |
data |
A data frame |
pattern |
Statistics pattern to return. Default is
|
style_fun |
Function used to style/format the summary statistics.
Default is |
string of summary statistics
library(survival) followup_time(Surv(time, status), data = lung) followup_time( Surv(time, status), data = lung, pattern = "{median} days", style_fun = ~gtsummary::style_sigfig(., digits = 4) ) followup_time( survival::Surv(time, status), data = survival::lung, pattern = "{n} events with a follow-up time of {median} days (IQR {p25}, {p75})" )
library(survival) followup_time(Surv(time, status), data = lung) followup_time( Surv(time, status), data = lung, pattern = "{median} days", style_fun = ~gtsummary::style_sigfig(., digits = 4) ) followup_time( survival::Surv(time, status), data = survival::lung, pattern = "{n} events with a follow-up time of {median} days (IQR {p25}, {p75})" )
This function calculates the most common value(s) of a given set
get_mode(x, moden = 1, quiet = FALSE)
get_mode(x, moden = 1, quiet = FALSE)
x |
A variable or vector (numeric, character or factor) |
moden |
If there are multiple modes, which mode to use. The default is the first mode. |
quiet |
By default, messages are printed if multiple modes are selected. To hide these messages, set |
A vector of length 1 containing the mode
get_mode(trial$stage) get_mode(trial$trt) get_mode(trial$response) get_mode(trial$grade)
get_mode(trial$stage) get_mode(trial$trt) get_mode(trial$response) get_mode(trial$grade)
Assess a model's calibration via a calibration plot.
ggcalibration( data, y, x, n.groups = 10, conf.level = 0.95, ci.method = c("exact", "ac", "asymptotic", "wilson", "prop.test", "bayes", "logit", "cloglog", "probit"), geom_smooth.args = list(method = "loess", se = FALSE, formula = y ~ x, color = "black"), geom_errorbar.args = list(width = 0), geom_point.args = list(), geom_function.args = list(colour = "gray", linetype = "dashed") )
ggcalibration( data, y, x, n.groups = 10, conf.level = 0.95, ci.method = c("exact", "ac", "asymptotic", "wilson", "prop.test", "bayes", "logit", "cloglog", "probit"), geom_smooth.args = list(method = "loess", se = FALSE, formula = y ~ x, color = "black"), geom_errorbar.args = list(width = 0), geom_point.args = list(), geom_function.args = list(colour = "gray", linetype = "dashed") )
data |
a data frame |
y |
variable name of the outcome coded as 0/1 |
x |
variable name of the risk predictions |
n.groups |
number of groups |
conf.level |
level of confidence to be used in the confidence interval |
ci.method |
method to use to construct the interval.
See |
geom_smooth.args |
named list of arguments that will be passed
to |
geom_errorbar.args |
named list of arguments that will be passed
to |
geom_point.args |
named list of arguments that will be passed
to |
geom_function.args |
named list of arguments that will be passed
to |
ggplot
glm(response ~ age + marker + grade, trial, family = binomial) %>% broom::augment(type.predict = "response") %>% ggcalibration(y = response, x = .fitted, n.groups = 6) + ggplot2::xlim(0, 1) + ggplot2::labs(x = "Model Risk")
glm(response ~ age + marker + grade, trial, family = binomial) %>% broom::augment(type.predict = "response") %>% ggcalibration(y = response, x = .fitted, n.groups = 6) + ggplot2::xlim(0, 1) + ggplot2::labs(x = "Model Risk")
Uses data_date.txt
to create a path with the data date populated.
here_data()
: Returns here::here("secure_data", data_date, ...)
path_data()
: Returns fs::path(path, "secure_data", data_date, ...)
here_data( ..., data_folder_name = "secure_data", path_to_data_date = here::here() ) path_data( ..., path = getOption("path_data"), data_folder_name = "secure_data", path_to_data_date = here::here() ) get_data_date(path_to_data_date = here::here())
here_data( ..., data_folder_name = "secure_data", path_to_data_date = here::here() ) path_data( ..., path = getOption("path_data"), data_folder_name = "secure_data", path_to_data_date = here::here() ) get_data_date(path_to_data_date = here::here())
... |
Path components to be appended to the end of the returned path string. |
data_folder_name |
name of data folder. Default is |
path_to_data_date |
path to data date folder or file. If folder is passed,
expecting the data date file to be named one of
|
path |
path to folder where data is saved, e.g. |
The function expects the user to version their data using a text file indicating
the date the data was last received, and the data to be stored in a corresponding
folder name, e.g. ~/Project Folder/secure_data/2020-01-01
.
path to data folder
if (FALSE) { here_data() #> "C:/Users/SjobergD/GitHub/My Project/secure_data/2020-01-01" here_data("Raw Data.xlsx") #> "C:/Users/SjobergD/GitHub/My Project/secure_data/2020-01-01/Raw Data.xlsx" path_data(path = "O:/My Project", "Raw Data.xlsx") #> "O:/My Project/secure_data/2020-01-01/Raw Data.xlsx" }
if (FALSE) { here_data() #> "C:/Users/SjobergD/GitHub/My Project/secure_data/2020-01-01" here_data("Raw Data.xlsx") #> "C:/Users/SjobergD/GitHub/My Project/secure_data/2020-01-01/Raw Data.xlsx" path_data(path = "O:/My Project", "Raw Data.xlsx") #> "O:/My Project/secure_data/2020-01-01/Raw Data.xlsx" }
Functions for working with the high performance computing cluster
hpcc_get_arg()
retrieve character argument passed from terminal,
e.g. qsubR bootstrap_analysis.R "mets"
hpcc_get_seq_number()
retrieve sequence integer when a sequence of jobs
were submitted from the terminal, e.g qsubR -a 1-50%10 bootstrap_analysis.R
hpcc_get_arg() hpcc_get_seq_number()
hpcc_get_arg() hpcc_get_seq_number()
Get variable labels and store in named list
list_labels(data)
list_labels(data)
data |
Data frame |
Daniel D. Sjoberg
list_labels(trial)
list_labels(trial)
This function works with gtsummary::add_difference()
to calculate
adjusted differences and confidence intervals based on results from a
logistic regression model. Adjustment covariates are set to the mean to
estimate the adjusted difference. The function uses bootstrap methods to
estimate the adjusted difference between two groups.
The CI is estimate by either using the SD from the bootstrap difference
estimates and calculating the CI assuming normality or using the centiles
of the bootstrapped differences as the confidence limits
The function can also be used in add_p()
, and if you do, be sure to
set boot_n = 1
to avoid long, unused computation.
logistic_reg_adj_diff( data, variable, by, adj.vars, conf.level, type, ci_type = c("sd", "centile"), boot_n = 250, ... )
logistic_reg_adj_diff( data, variable, by, adj.vars, conf.level, type, ci_type = c("sd", "centile"), boot_n = 250, ... )
data |
a data frame |
variable |
string of binary variable in |
by |
string of the |
adj.vars |
character vector of variable names to adjust model for |
conf.level |
Must be strictly greater than 0 and less than 1. Defaults to 0.95, which corresponds to a 95 percent confidence interval. |
type |
string indicating the summary type |
ci_type |
string dictation bootstrap method for CI estimation.
Must be one of |
boot_n |
number of bootstrap iterations to use. In most cases, it is
reasonable to used 250 for the |
... |
not used |
tibble with difference estimate
Example 1
Example 2
Other gtsummary-related functions:
add_inline_forest_plot()
,
add_sparkline()
,
as_ggplot()
,
bold_italicize_group_labels()
,
style_tbl_compact()
,
tbl_likert()
,
theme_gtsummary_msk()
library(gtsummary) tbl <- tbl_summary(trial, by = trt, include = response, missing = "no") # Example 1 ----------------------------------------------------------------- logistic_reg_adj_diff_ex1 <- tbl %>% add_difference( test = everything() ~ logistic_reg_adj_diff, adj.vars = "stage" ) # Example 2 ----------------------------------------------------------------- # Use the centile method, and # change the number of bootstrap resamples to perform logistic_reg_adj_diff_ex2 <- tbl %>% add_difference( test = everything() ~ purrr::partial(logistic_reg_adj_diff, ci_type = "centile", boot_n = 100), adj.vars = "stage" )
library(gtsummary) tbl <- tbl_summary(trial, by = trt, include = response, missing = "no") # Example 1 ----------------------------------------------------------------- logistic_reg_adj_diff_ex1 <- tbl %>% add_difference( test = everything() ~ logistic_reg_adj_diff, adj.vars = "stage" ) # Example 2 ----------------------------------------------------------------- # Use the centile method, and # change the number of bootstrap resamples to perform logistic_reg_adj_diff_ex2 <- tbl %>% add_difference( test = everything() ~ purrr::partial(logistic_reg_adj_diff, ci_type = "centile", boot_n = 100), adj.vars = "stage" )
The project_templates
object defines the contents of the Biostatistics project
templates used in create_bst_project()
and use_bst_file()
.
project_templates
project_templates
A named list containing the Biostatistics project template.
if (FALSE) { create_hot_project( path = file.path(tempdir(), "Sjoberg New Project"), template = bstfun::project_templates[["default"]] ) }
if (FALSE) { create_hot_project( path = file.path(tempdir(), "Sjoberg New Project"), template = bstfun::project_templates[["default"]] ) }
When an R script is submitted to the server, Linux generates a log file named
myRscript.R.o######
. This program searches a folder for files named like this
and removes/deletes them.
rm_logs(path = here::here(), recursive = FALSE)
rm_logs(path = here::here(), recursive = FALSE)
path |
folder location to search for log files |
recursive |
logical. should log files in subdirectories also be deleted? |
Takes labels from the Derived Variables excel file and applies them to the
passed data frame.
The excel sheet must have columns "varname"
and "label"
.
set_derived_variables(data, path, sheet = NULL, drop = TRUE)
set_derived_variables(data, path, sheet = NULL, drop = TRUE)
data |
Data frame |
path |
Path to Derived Variables xls/xlsx file |
sheet |
Sheet to read. Either a string (the name of a sheet), or an
integer (the position of the sheet). Ignored if the sheet is specified via
|
drop |
Logical indicating whether to drop unlabelled variables |
Daniel D. Sjoberg
trial %>% set_derived_variables("derived_variables_sjoberg.xlsx")
trial %>% set_derived_variables("derived_variables_sjoberg.xlsx")
Apply the same compact styling available from
gtsummary::theme_gtsummary_compact()
to any
gt, flextable, huxtable, or knitr::kable()
table.
knitr::kable()
stlying uses the kableExtra package
style_tbl_compact(data)
style_tbl_compact(data)
data |
a gt, flextable, huxtable, or |
Example 1
Other gtsummary-related functions:
add_inline_forest_plot()
,
add_sparkline()
,
as_ggplot()
,
bold_italicize_group_labels()
,
logistic_reg_adj_diff()
,
tbl_likert()
,
theme_gtsummary_msk()
style_tbl_compact_ex1 <- head(trial) %>% gt::gt() %>% style_tbl_compact()
style_tbl_compact_ex1 <- head(trial) %>% gt::gt() %>% style_tbl_compact()
tbl_likert()
creates a summary of Likert scales following the gtsummary structure.
add_n()
adds a column to the table with the total number of observations.
add_continuous_stat()
converts Likert scales into a numeric score and computes
continuous statistics based on this score.
tbl_likert( data, label = NULL, statistic = NULL, digits = NULL, include = everything(), sort = c("default", "ascending", "descending") ) ## S3 method for class 'tbl_likert' add_n( x, statistic = "{n}", col_label = "**N**", footnote = FALSE, last = FALSE, ... ) add_continuous_stat(x, ...) ## S3 method for class 'tbl_likert' add_continuous_stat( x, statistic = "{mean}", digits = NULL, col_label = NULL, footnote = FALSE, last = TRUE, score_values = NULL, stat_col_name = NULL, ... )
tbl_likert( data, label = NULL, statistic = NULL, digits = NULL, include = everything(), sort = c("default", "ascending", "descending") ) ## S3 method for class 'tbl_likert' add_n( x, statistic = "{n}", col_label = "**N**", footnote = FALSE, last = FALSE, ... ) add_continuous_stat(x, ...) ## S3 method for class 'tbl_likert' add_continuous_stat( x, statistic = "{mean}", digits = NULL, col_label = NULL, footnote = FALSE, last = TRUE, score_values = NULL, stat_col_name = NULL, ... )
data |
A data frame |
label |
List of formulas specifying variables labels,
e.g. |
statistic |
String or formula indicating the statistic to be reported.
Default is the mean score. Other possible continuous statistics are described
in |
digits |
Formula or list of formulas indicating how to display the
computed statistics, see |
include |
variables to include in the summary table. Default is |
sort |
Sort table based on mean scores? Must be one of
|
x |
Object with class |
col_label |
String indicating the column label. Default is generated
from |
footnote |
Logical argument indicating whether to print a footnote
clarifying the statistics presented. Default is |
last |
Logical indicator to include the new column last in table.
Default is |
... |
not used |
score_values |
Vector indicating the numeric value of each factor level.
Default is |
stat_col_name |
Optional string indicating the name of the new column
added to |
Example 1
Other gtsummary-related functions:
add_inline_forest_plot()
,
add_sparkline()
,
as_ggplot()
,
bold_italicize_group_labels()
,
logistic_reg_adj_diff()
,
style_tbl_compact()
,
theme_gtsummary_msk()
library(dplyr) set.seed(1123) likert_lvls <- c("Never", "Sometimes", "Often", "Always") df <- tibble::tibble( Q1 = sample(likert_lvls, size = 100, replace = TRUE), Q2 = sample(likert_lvls, size = 100, replace = TRUE) ) %>% mutate(across(c(Q1, Q2), ~factor(., levels = likert_lvls))) tbl_likert_ex1 <- tbl_likert(df) %>% add_n() %>% add_continuous_stat(statistic = "{mean}") %>% add_continuous_stat(statistic = "{sd}")
library(dplyr) set.seed(1123) likert_lvls <- c("Never", "Sometimes", "Often", "Always") df <- tibble::tibble( Q1 = sample(likert_lvls, size = 100, replace = TRUE), Q2 = sample(likert_lvls, size = 100, replace = TRUE) ) %>% mutate(across(c(Q1, Q2), ~factor(., levels = likert_lvls))) tbl_likert_ex1 <- tbl_likert(df) %>% add_n() %>% add_continuous_stat(statistic = "{mean}") %>% add_continuous_stat(statistic = "{sd}")
This is a place for any member of the MSK community to add a personal gtsummary theme. Reach out if you're interested in adding yours!
theme_gtsummary_msk( name = c("hot", "karissa", "ally", "mauguen", "esther", "curry", "lavery", "meier", "leej", "whitingk", "kwhiting", "mauguena", "drille", "currym1", "laveryj", "hsum1", "leej22"), font_size = NULL )
theme_gtsummary_msk( name = c("hot", "karissa", "ally", "mauguen", "esther", "curry", "lavery", "meier", "leej", "whitingk", "kwhiting", "mauguena", "drille", "currym1", "laveryj", "hsum1", "leej22"), font_size = NULL )
name |
string indicating the custom theme to set. |
font_size |
Numeric font size for compact theme. Default is 13 for gt tables, and 8 for all other output types |
Visit the gtsummary themes vignette for a full list of preferences that can be set.
Other gtsummary-related functions:
add_inline_forest_plot()
,
add_sparkline()
,
as_ggplot()
,
bold_italicize_group_labels()
,
logistic_reg_adj_diff()
,
style_tbl_compact()
,
tbl_likert()
theme_gtsummary_msk("hot")
theme_gtsummary_msk("hot")
A dataset containing the baseline characteristics of 200 patients who received Drug A or Drug B. Dataset also contains the outcome of tumor response to the treatment.
trial
trial
A data frame with 200 rows–one row per patient
Chemotherapy Treatment
Age, yrs
Marker Level, ng/mL
T Stage
Grade
Tumor Response
Patient Died
Months to Death/Censor
The function wraps rstudio.prefs::use_rstudio_prefs()
and sets the following
preferences in RStudio.
Preference | Value |
always_save_history | FALSE |
graphics_backend | ragg |
load_workspace | FALSE |
show_hidden_files | TRUE |
show_line_numbers | TRUE |
margin_column | 80 |
save_workspace | never |
rainbow_parentheses | TRUE |
restore_last_project | FALSE |
rmd_chunk_output_inline | FALSE |
show_last_dot_value | TRUE |
show_margin | TRUE |
show_invisibles | TRUE |
use_bst_rstudio_prefs(profile = tolower(Sys.info()[["user"]]))
use_bst_rstudio_prefs(profile = tolower(Sys.info()[["user"]]))
profile |
Used to set additional preferences saved under specified profile. Default is your current system username. If no profile exists, it is ignored. |
use_bst_rstudio_prefs()
use_bst_rstudio_prefs()
Rather than using create_bst_project()
to start a new project folder, you
may use use_bst_file()
to write a single file from any project template.
The functions use_bst_gitignore()
and use_bst_readme()
are shortcuts for
use_bst_file("gitignore")
and use_bst_file("readme")
.
use_bst_file(name = NULL, filename = NULL, open = interactive()) use_bst_gitignore(filename = NULL) use_bst_readme(filename = NULL) use_bst_setup(filename = NULL) use_bst_analysis(filename = NULL) use_bst_report(filename = NULL)
use_bst_file(name = NULL, filename = NULL, open = interactive()) use_bst_gitignore(filename = NULL) use_bst_readme(filename = NULL) use_bst_setup(filename = NULL) use_bst_analysis(filename = NULL) use_bst_report(filename = NULL)
name |
Name of file to write. Not sure of the files available to you? Run the function without specifying a name, and all files available within the template will print. |
filename |
Optional argument to specify the name of the file to be written.
Paths/filename is relative to project base (e.g. |
open |
If |
if (FALSE) { # create gitignore file use_bst_file("gitignore") use_bst_gitignore() # create README.md file use_bst_file("readme") use_bst_readme() }
if (FALSE) { # create gitignore file use_bst_file("gitignore") use_bst_gitignore() # create README.md file use_bst_file("readme") use_bst_readme() }
Use Variable Names as Labels
use_varnames_as_labels(data, caps = NULL, exclude = NULL)
use_varnames_as_labels(data, caps = NULL, exclude = NULL)
data |
a data frame |
caps |
variables to be entirely capitalized |
exclude |
variables to exclude from labeling |
a labeled data frame
library(gtsummary) mtcars %>% select(mpg, cyl, vs, am, carb) %>% use_varnames_as_labels(caps = c(mpg, vs, am), exclude = cyl) %>% tbl_summary() %>% as_kable(format = "simple")
library(gtsummary) mtcars %>% select(mpg, cyl, vs, am, carb) %>% use_varnames_as_labels(caps = c(mpg, vs, am), exclude = cyl) %>% tbl_summary() %>% as_kable(format = "simple")