11  Computing Summary Statistics

flowchart LR
oav[Statistical Summaries of All Variables]
osv[Summaries of Variable Subsets]
ofun[Summaries Using Functions<br>Returning Multi-dimensional Results]
ms["Marginal Summaries<br>(e.g. Subtotals)"]

Many applications can use the automatically created data.table object .SD which stands for the data table for the current group being processed. If .SDcols were not specified, all variables would be attempted to be analyzed. Specify a vector of variable names as .SDcols to restrict the analysis. If there were no by variable(s), .SD stands for the whole data table.

require(Hmisc)
require(data.table)
getHdata(stressEcho)
d <- stressEcho
setDT(d)
# Compute the number of distinct values for all variables
d[, sapply(.SD, uniqueN)]   # uniqueN is in data.table
      bhr    basebp    basedp      pkhr       sbp        dp      dose     maxhr 
       69        94       441       105       142       508         7       103 
  pctMphr       mbp   dpmaxdo   dobdose       age    gender    baseEF     dobEF 
       78       132       484         8        62         2        54        60 
chestpain   restwma     posSE     newMI   newPTCA   newCABG     death    hxofHT 
        2         2         2         2         2         2         2         2 
   hxofDM   hxofCig    hxofMI  hxofPTCA  hxofCABG any.event       ecg 
        2         3         2         2         2         2         3 
# Same but only for variables whose names contain hx and either D or M
d[, sapply(.SD, uniqueN), .SDcols=patterns('hx', 'D|M')]
hxofDM hxofMI 
     2      2 
# Compute means on all numeric variables
mn <- function(x) mean(x, na.rm=TRUE)
d[, lapply(.SD, mn), .SDcols=is.numeric]
        bhr   basebp   basedp     pkhr      sbp       dp     dose    maxhr
      <num>    <num>    <num>    <num>    <num>    <num>    <num>    <num>
1: 75.29032 135.3244 10181.31 120.5502 146.9158 17633.84 33.75448 119.3692
    pctMphr   mbp  dpmaxdo  dobdose      age   baseEF    dobEF chestpain
      <num> <num>    <num>    <num>    <num>    <num>    <num>     <num>
1: 78.56989   156 18549.88 30.24194 67.34409 55.60394 65.24194 0.3082437
     restwma     posSE      newMI   newPTCA    newCABG      death    hxofHT
       <num>     <num>      <num>     <num>      <num>      <num>     <num>
1: 0.4605735 0.2437276 0.05017921 0.0483871 0.05913978 0.04301075 0.7043011
      hxofDM    hxofMI  hxofPTCA  hxofCABG any.event
       <num>     <num>     <num>     <num>     <num>
1: 0.3691756 0.2759857 0.0734767 0.1577061 0.1594982
# Compute means on all numeric non-binary variables
nnb <- function(x) is.numeric(x) && uniqueN(x) > 2
d[, lapply(.SD, mn), .SDcols=nnb]
        bhr   basebp   basedp     pkhr      sbp       dp     dose    maxhr
      <num>    <num>    <num>    <num>    <num>    <num>    <num>    <num>
1: 75.29032 135.3244 10181.31 120.5502 146.9158 17633.84 33.75448 119.3692
    pctMphr   mbp  dpmaxdo  dobdose      age   baseEF    dobEF
      <num> <num>    <num>    <num>    <num>    <num>    <num>
1: 78.56989   156 18549.88 30.24194 67.34409 55.60394 65.24194
# Print frequency tables of all categorical variables with > 2 levels
cmult <- function(x) ! is.numeric(x) && uniqueN(x) > 2
tab <- function(x) {
  z <- table(x, useNA='ifany')
  paste(paste0(names(z), ': ', z), collapse=', ')
}
d[, lapply(.SD, tab), .SDcols=cmult]
                                      hxofCig
                                       <char>
1: heavy: 122, moderate: 138, non-smoker: 298
                                   ecg
                                <char>
1: normal: 311, equivocal: 176, MI: 71

Tabulate all variables having between 3 and 10 distinct values and create a side effect when data.table is running that makes the summarization function tab store all values and frequencies in a growing list Z so that kable can render a markdown table after we pad columns to the maximum length of all columns (maximum number of distinct values).

# Using <<- makes data.table have a side effect of augmenting Z and
# Align in the global environment
tab <- function(x) {
  z <- table(x, useNA='ifany')
  i <- length(Z)
  Z[[i+1]] <<- names(z)
  Z[[i+2]] <<- as.vector(z)
  Align <<- c(Align, if(is.numeric(x)) 'r' else 'l', 'r')
  length(z)
}
discr <- function(x) { i <- uniqueN(x); i > 2 & i < 11 }
#                                       or i %between% c(2,11)
Z    <- list(); Align <- character(0)
w    <- d[, lapply(.SD, tab), .SDcols=discr]
maxl <- max(w)
# Pad shorter vectors with blanks
Z <- lapply(Z, function(x) c(x, rep('', maxl - length(x))))
Z <- do.call('cbind', Z)  # combine all into columns of a matrix
colnames(Z) <- rep(names(w), each=2)
colnames(Z)[seq(2, ncol(Z), by=2)] <- 'Freq'
knitr::kable(Z, align=Align)
dose Freq dobdose Freq hxofCig Freq ecg Freq
10 2 5 7 heavy 122 normal 311
15 28 10 7 moderate 138 equivocal 176
20 47 15 55 non-smoker 298 MI 71
25 56 20 73
30 64 25 71
35 61 30 78
40 300 35 62
40 205

A better approach is to let the kables function put together a series of separate markdown tables of different sizes. By using the “updating Z in the global environment” side effect we are able to let data.table output any type of objects of non-conformable dimensions over variables (such as frequency tabulations).

tab <- function(x) {
  z <- table(x, useNA='ifany')
  i <- length(Z)
  w <- matrix(cbind(names(z), z), ncol=2,
              dimnames=list(NULL, c(vnames[i+1], 'Freq')))
  Z[[i+1]] <<- knitr::kable(w, align=c(if(is.numeric(x)) 'r' else 'l', 'r'))
  length(z)
}
discr <- function(x) { i <- uniqueN(x); i > 2 & i < 11 }
Z      <- list()
vnames <- names(d[, .SD, .SDcols=discr])
w      <- d[, lapply(.SD, tab), .SDcols=discr]
knitr::kables(Z)
dose Freq
10 2
15 28
20 47
25 56
30 64
35 61
40 300
dobdose Freq
5 7
10 7
15 55
20 73
25 71
30 78
35 62
40 205
hxofCig Freq
heavy 122
moderate 138
non-smoker 298
ecg Freq
normal 311
equivocal 176
MI 71

Use a similar side-effect approach to get separate html describe output by gender.

g <- function(x, by) {
  Z[[length(Z) + 1]] <<- describe(x, descript=paste('age for', by))
  by
}
Z <- list()
by <- d[, g(age, gender), by=gender]
# Make Z look like describe() output for multiple variables
class(Z) <- 'describe'
attr(Z, 'dimensions') <- c(nrow(d), nrow(by))
attr(Z, 'descript') <- 'Age by Gender'
html(Z)
Age by Gender Descriptives
Age by Gender

2 Variables   558 Observations

age for male: Age years
image
nmissingdistinctInfoMeanpMedianGmd.05.10.25.50.75.90.95
2200510.99967.8668.512.9145.9551.0062.0069.0075.0081.0086.00
lowest : 30 34 38 40 43 , highest: 88 89 91 92 93
age for female: Age years
image
nmissingdistinctInfoMeanpMedianGmd.05.10.25.50.75.90.95
3380580.99967.0167.513.7447.0050.7059.2568.0076.0083.0085.00
lowest : 26 28 29 33 34 , highest: 87 88 89 90 91
# Compute a 1-valued statistic on multiple variables, by cross-classification
# of two variables.  Do this on a subset.  .SDcols=a:b uses variables in order
# Use keyby instead of by to order output the usual way
d[age < 70, lapply(.SD, mean), keyby=.(gender, newMI), .SDcols=pkhr:dp]
Key: <gender, newMI>
   gender      newMI       pkhr        sbp         dp
   <fctr> <labelled> <labelled> <labelled> <labelled>
1:   male          0   122.0561   147.7664   17836.24
2:   male          1   115.6667   140.6667   16437.67
3: female          0   122.8492   150.5084   18509.03
4: female          1   123.5714   171.5714   21506.71
# Compute multiple statistics on one variable
# Note: .N is a special variable: count of obs for current group
d[, .(Max=max(bhr), Min=min(bhr), Mean=mean(bhr), N=.N), by=.(gender, newMI)]
   gender      newMI        Max        Min       Mean     N
   <fctr> <labelled> <labelled> <labelled> <labelled> <int>
1:   male          0        127         42   74.15385   208
2:   male          1         89         59   71.25000    12
3: female          0        210         45   75.96894   322
4: female          1        108         65   79.43750    16
# Same result another way
g <- function(x) list(Max=max(x), Min=min(x), Mean=mean(x), N=length(x))
d[, g(bhr), by=.(gender, newMI)]  # if g returned a vector instead, use as.list(g(bhr))
   gender      newMI   Max   Min     Mean     N
   <fctr> <labelled> <int> <int>    <num> <int>
1:   male          0   127    42 74.15385   208
2:   male          1    89    59 71.25000    12
3: female          0   210    45 75.96894   322
4: female          1   108    65 79.43750    16
d[, as.list(quantile(bhr)), by=gender]
   gender    0%   25%   50%   75%  100%
   <fctr> <num> <num> <num> <num> <num>
1:   male    42    63    72    83   127
2: female    45    65    75    85   210
# Compute mean bhr by quintiles of age using Hmisc::cut2
# Bad statistical practice; use scatterplot + smoother instead
d[, .(Mean=mean(bhr)), keyby=.(fifth=cut2(age, g=5))]
Key: <fifth>
        fifth       Mean
   <labelled> <labelled>
1:    [26,60)   78.54545
2:    [60,67)   76.96907
3:    [67,72)   74.26316
4:    [72,78)   75.92793
5:    [78,93]   70.03846
# Compute multiple statistics on multiple variables
d[, lapply(.SD, quantile), by=gender, .SDcols=.q(bhr, pkhr, sbp)]
    gender        bhr       pkhr        sbp
    <fctr> <labelled> <labelled> <labelled>
 1:   male         42      52.00      80.00
 2:   male         63     106.75     120.00
 3:   male         72     123.00     140.00
 4:   male         83     136.00     166.00
 5:   male        127     176.00     250.00
 6: female         45      61.00      40.00
 7: female         65     106.25     122.25
 8: female         75     122.00     141.50
 9: female         85     134.00     170.00
10: female        210     210.00     309.00
# Similar but put percentile number in front of statistic value
# Do only quartiles
g <- function(x) {
  z <- quantile(x, (1:3)/4, na.rm=TRUE)
  paste(format(names(z)), format(round(z)))
}
d[, lapply(.SD, g), by=gender, .SDcols=.q(bhr, pkhr, sbp)]
   gender    bhr    pkhr     sbp
   <fctr> <char>  <char>  <char>
1:   male 25% 63 25% 107 25% 120
2:   male 50% 72 50% 123 50% 140
3:   male 75% 83 75% 136 75% 166
4: female 25% 65 25% 106 25% 122
5: female 50% 75 50% 122 50% 142
6: female 75% 85 75% 134 75% 170
# To have more control over labeling and to have one row per sex:
g <- function(x) {
  s <- sapply(x, quantile, na.rm=TRUE)  # compute quantiles for all variables -> matrix
  h <- as.list(s)  # vectorizes first
  # Cross row names (percentile names) with column (variable) names
  # paste(b, a) puts variable name in front of percentile
  names(h) <- outer(rownames(s), colnames(s), function(a, b) paste(b, a))
  h
}
d[, g(.SD), by=gender, .SDcols=.q(bhr, pkhr, sbp)]
   gender bhr 0% bhr 25% bhr 50% bhr 75% bhr 100% pkhr 0% pkhr 25% pkhr 50%
   <fctr>  <num>   <num>   <num>   <num>    <num>   <num>    <num>    <num>
1:   male     42      63      72      83      127      52   106.75      123
2: female     45      65      75      85      210      61   106.25      122
   pkhr 75% pkhr 100% sbp 0% sbp 25% sbp 50% sbp 75% sbp 100%
      <num>     <num>  <num>   <num>   <num>   <num>    <num>
1:      136       176     80  120.00   140.0     166      250
2:      134       210     40  122.25   141.5     170      309
# Restrict to variables bhr - basedp in order columns created in data table
d[, g(.SD), by=gender, .SDcols=bhr : basedp]
   gender bhr 0% bhr 25% bhr 50% bhr 75% bhr 100% basebp 0% basebp 25%
   <fctr>  <num>   <num>   <num>   <num>    <num>     <num>      <num>
1:   male     42      63      72      83      127        85     120.00
2: female     45      65      75      85      210        90     122.25
   basebp 50% basebp 75% basebp 100% basedp 0% basedp 25% basedp 50% basedp 75%
        <num>      <num>       <num>     <num>      <num>      <num>      <num>
1:        130        145         203      5220    7976.25       9483   11183.50
2:        136        150         201      5000    8562.00      10063   11891.25
   basedp 100%
         <num>
1:       16704
2:       27300
# Can put ! in front of a sequence of variables to do the opposite

# To add duplicated means to raw data use e.g.
# d[, Mean := mean(x), by=sex]

11.1 Summary Statistics Using Functions Returning Two-Dimensional Results

data.table can create objects where results are matrices instead of data tables, or where table elements are not just scalar quantities but are lists. We previously saw examples where the summarization function produced a vector or where complex side effects happened without storing the results in a data table. Now consider examples where the summarization function produces a matrix or a list of lists and the final result is a matrix or a more complex data table. Start with calculating three quartiles on one variable two different ways. The examples that follow do not give satisfactory labeling when a by variable is added.

qu <- function(x, type=7) {
  y <- quantile(x, (1:3)/4, na.rm=TRUE, type=type)
  names(y) <- .q(Q1, Q2, Q3)
  y
}
g  <- function(x) rbind(type7=qu(x), type8=qu(x, type=8))
d[, g(pkhr)]   # 2 x 3 matrix
          Q1  Q2       Q3
type7 106.25 122 135.0000
type8 106.00 122 135.0833
# d[, g(pkhr), by=gender] does not retain names

Compute 3 quartiles for each of 3 variables.

g <- function(x) lapply(x, qu)
d[, g(.SD), .SDcols=.q(bhr, pkhr, sbp)]  # a data.table w/out Q1 Q2 Q3 labels
          bhr       pkhr        sbp
   <labelled> <labelled> <labelled>
1:         64     106.25        120
2:         74     122.00        141
3:         84     135.00        170
g <- function(x) apply(x, 2, qu)
d[, g(.SD), .SDcols=.q(bhr, pkhr, sbp)]  # a matrix with Q1 Q2 Q3 labels
   bhr   pkhr sbp
Q1  64 106.25 120
Q2  74 122.00 141
Q3  84 135.00 170

Now consider a summarization function computing a 2 \(\times\) 2 result where some values are numeric and some are character strings. Separately for each value of a by variable we desire the name of the variable having the lowest number of missing values, and the frequency of NAs, and the name of the variable having the highest number of missing values, and that frequency. We use the pbc dataset that comes with the survival package, and do the variable search separately by outcome status. The summarization function g operates on a data table for the current outcome status. Consider only the variables in pbc that are ever NA.

data(pbc, package='survival')
p <- copy(pbc)
setDT(p, key='status')  # sort by status
numna <- sapply(p, function(x) sum(is.na(x)))  # loops over vars and counts NAs -> vector
v     <- names(numna[numna > 0])   # names of variables with any NA 

g <- function(x) {
  m <- sapply(x, function(y) sum(is.na(y)))
  mn <- m[which.min(m)]   # keeps variable name in names attribute
  mx <- m[which.max(m)]
  list(Lowest  = list(VariableLow  = names(mn), Nlow  = mn),
       Highest = list(VariableHigh = names(mx), Nhigh = mx))
}
w=p[, g(.SD), by=status, .SDcols = v]   # data.table
w
Key: <status>
   status   Lowest Highest
    <int>   <list>  <list>
1:      0  protime    trig
2:      0        1      81
3:      1 platelet    chol
4:      1        0       7
5:      2  protime    trig
6:      2        1      48
sapply(w, class)
   status    Lowest   Highest 
"integer"    "list"    "list" 
w[1]
Key: <status>
   status  Lowest Highest
    <int>  <list>  <list>
1:      0 protime    trig
w[1, Lowest]
[[1]]
[1] "protime"
w[2, Lowest]
[[1]]
protime 
      1 

The Lowest and Highest elements of a given row of w are R lists, allowing for mixed object types (character/integer). This doesn’t make for the most concise way to retrieve individual elements but w prints well. We can make a more conventional data table with the following code.

g <- function(x) {
  m <- sapply(x, function(y) sum(is.na(y)))
  mn <- m[which.min(m)]
  mx <- m[which.max(m)]
  list(VarLow  = names(mn), Nlow  = mn,
       VarHigh = names(mx), Nhigh = mx)
}
w=p[, g(.SD), by=status, .SDcols = v]   # data.table
sapply(w, class)
     status      VarLow        Nlow     VarHigh       Nhigh 
  "integer" "character"   "integer" "character"   "integer" 
w
Key: <status>
   status   VarLow  Nlow VarHigh Nhigh
    <int>   <char> <int>  <char> <int>
1:      0  protime     1    trig    81
2:      1 platelet     0    chol     7
3:      2  protime     1    trig    48

11.2 Summary Statistics With Marginal Summaries

The cube function in the data.table package will compute all possible marginal statistics. When there is only one by variable, the overall statistic is computed in addition to compute stratified values. When a dimension is being marginalized over, the value of the by variable for that dimension will be NA.

mn  <- function(x) as.double(mean(x, na.rm=TRUE))
# as.double ensures consistency of numeric type across groups
Nna <- function(x) sum(! is.na(x))
cube(d, .(Meanbhr = mn(bhr), N = Nna(bhr)), by='gender', id=TRUE)
   grouping gender  Meanbhr     N
      <int> <fctr>    <num> <int>
1:        0   male 73.99545   220
2:        0 female 76.13314   338
3:        1   <NA> 75.29032   558
cube(d, .(Meanbhr = mn(bhr), N = Nna(bhr)), by=.q(gender, hxofMI), id=TRUE)
   grouping gender     hxofMI  Meanbhr     N
      <int> <fctr> <labelled>    <num> <int>
1:        0   male          1 73.22472    89
2:        0 female          0 76.30403   273
3:        0   male          0 74.51908   131
4:        0 female          1 75.41538    65
5:        1   male         NA 73.99545   220
6:        1 female         NA 76.13314   338
7:        2   <NA>          1 74.14935   154
8:        2   <NA>          0 75.72525   404
9:        3   <NA>         NA 75.29032   558
# id=TRUE creates output variable grouping to detail level of marginalization
# It is a binary representation, e.g. if by has 3 variables and a row
# is marginalizing over the first and third variables,
# grouping=binary 101 = 5
# Use groupingsets() to control the marginalizations
# Example: marginalize only one variable at a time
groupingsets(d, .(Meanbhr = mn(bhr), N=Nna(bhr)), by=.q(gender, hxofMI),
             sets=list('gender', 'hxofMI'), id=TRUE)
   grouping gender     hxofMI  Meanbhr     N
      <int> <fctr> <labelled>    <num> <int>
1:        1   male         NA 73.99545   220
2:        1 female         NA 76.13314   338
3:        2   <NA>          1 74.14935   154
4:        2   <NA>          0 75.72525   404

11.3 Customizing Tables of Summary Statistics

Suppose we wanted to compute N, the mean, the quartiles, and Gini’s mean difference for 4 continuous variables, stratified by treatments A and B and not randomized. Start by writing the per-variable per-stratum summarization function g.

g <- function(x) {
  x <- x[! is.na(x)]
    n <- length(x)
    if(n < 2) return(list(N=n, Mean=if(n == 1) x else NA_real_,
                          Q1=NA_real_, Q2=NA_real_, Q3=NA_real_, Gini=NA_real_))
    q <- quantile(x, (1:3) / 4)
    # data table wants constant numeric types across by-groups
    list(N=n,
       Mean = as.double(mean(x)),
       Q1   = as.double(q[1]),
       Q2   = as.double(q[2]),
       Q3   = as.double(q[3]),
       Gini = as.double(GiniMd(x)))  # GiniMd is in Hmisc
    }
set.seed(1)
g(runif(1000))  # test it
$N
[1] 1000

$Mean
[1] 0.4996917

$Q1
[1] 0.2581286

$Q2
[1] 0.4832595

$Q3
[1] 0.7469325

$Gini
[1] 0.3330362

Next, melt the data table to make it tall and thin so that all the analysis variables may be processed in the same way with repetitive programming. The Hmisc meltData function makes this easy, and defaults to using variable labels. Let’s put units of measurement as a separate variable Units.

p <- upData(p,
            trt = ifelse(is.na(trt), 3, trt),
            trt = factor(trt, 1:3, c('A', 'B', 'Not Randomized')),
            labels=c(age ='Age', bili='Serum Bilirubin',
                     chol='Serum Cholesterol', albumin='Serum Albumin'),
            units=c(age='years', chol='mg/dL'))
Input object size:   50560 bytes;    20 variables    418 observations
Modified variable   trt
Modified variable   trt
New object size:    53720 bytes;    20 variables    418 observations
m <- meltData(age + bili + chol + albumin ~ trt, data=p, tall='left',
              sepunits=TRUE)   # creates Units variable

Now for each variable compute the statistical summary. Then use data.table::dcast to put treatments side-by-side for each variable (row). In the code below we are careful to follow the good programming practice of computing column numbers instead of hard-coding column numbers for gt.

s <- m[, g(value), by=.(variable, Units, trt)]
s
             variable  Units            trt     N       Mean        Q1
               <fctr> <char>         <fctr> <int>      <num>     <num>
 1:               Age  years              A   158  51.419108  42.98015
 2:               Age  years              B   154  48.582540  41.43326
 3:               Age  years Not Randomized   106  52.868286  46.00137
 4:   Serum Bilirubin                     A   158   2.873418   0.80000
 5:   Serum Bilirubin                     B   154   3.648701   0.72500
 6:   Serum Bilirubin        Not Randomized   106   3.116981   0.72500
 7: Serum Cholesterol  mg/dL              A   140 365.014286 247.75000
 8: Serum Cholesterol  mg/dL              B   144 373.881944 254.25000
 9: Serum Cholesterol  mg/dL Not Randomized     0         NA        NA
10:     Serum Albumin                     A   158   3.516266   3.21250
11:     Serum Albumin                     B   154   3.523831   3.34250
12:     Serum Albumin        Not Randomized   106   3.431038   3.12500
           Q2        Q3        Gini
        <num>     <num>       <num>
 1:  51.93155  58.90486  12.6002465
 2:  48.11088  55.80424  11.3909817
 3:  52.99932  60.99932  11.2543580
 4:   1.40000   3.20000   3.1456422
 5:   1.30000   3.60000   4.4610050
 6:   1.40000   3.07500   3.5932435
 7: 315.50000 417.00000 178.4799589
 8: 303.50000 377.00000 210.0807110
 9:        NA        NA          NA
10:   3.56500   3.83000   0.4973885
11:   3.54500   3.77750   0.4289780
12:   3.47000   3.72000   0.4934106
w <- dcast(s, variable + Units ~ trt, value.var=.q(N, Mean, Q1, Q2, Q3, Gini))
# Sort columns so that treatments are together
v <- names(w)
# Function to get list of variable names ending with a
u <- function(a) v[grep(paste0(a, '$'), v)]
# Check it
u('_A')
[1] "N_A"    "Mean_A" "Q1_A"   "Q2_A"   "Q3_A"   "Gini_A"
# Put columns in desired order using data.table::setcolorder
setcolorder(w, c('variable', 'Units',
                 cola <- u('_A'), colb <- u('_B'),
                 colnr <- u('_Not Randomized')))
# Make short column names
sn <- sub('_A$|_B$|_Not Randomized$', '', names(w))
sn[1] <- ''
# Reformat quantile headings to use html subscripts
# qt will not vertically align these properly if using markdown instead.
sn <- sub('^Q(.)', 'Q<sub>\\1</sub>', sn)
names(sn) <- names(w)
sn
           variable               Units                 N_A              Mean_A 
                 ""             "Units"                 "N"              "Mean" 
               Q1_A                Q2_A                Q3_A              Gini_A 
    "Q<sub>1</sub>"     "Q<sub>2</sub>"     "Q<sub>3</sub>"              "Gini" 
                N_B              Mean_B                Q1_B                Q2_B 
                "N"              "Mean"     "Q<sub>1</sub>"     "Q<sub>2</sub>" 
               Q3_B              Gini_B    N_Not Randomized Mean_Not Randomized 
    "Q<sub>3</sub>"              "Gini"                 "N"              "Mean" 
  Q1_Not Randomized   Q2_Not Randomized   Q3_Not Randomized Gini_Not Randomized 
    "Q<sub>1</sub>"     "Q<sub>2</sub>"     "Q<sub>3</sub>"              "Gini" 
# Save column numbers for Ns
colN <- grep('^N_', names(w))

Compose the table using the gt package. See also Section 4.6.1 and Section 4.10.

require(gt)
gt(w) |>
  tab_style(style=cell_text(size='small'),
            locations=cells_column_labels(columns='Units')) |>
  tab_style(style=cell_text(size='x-small', font='arial'),
            locations=cells_body(columns=Units))  |>
  cols_label(.list=sn, .fn=html)                  |>
  sub_missing(missing_text='')                    |>
  tab_spanner('A', columns=cola)                  |>
  tab_spanner('B', columns=colb)                  |>
  tab_spanner('Not Randomized', columns=colnr)    |>
  fmt_number(decimals=2)                          |>
  fmt_number(columns=colN, decimals=0)
1
smaller font for Units column heading
2
extra-small arial font for Units column contents
3
rename columns to short names and recognize html
4
turn NA into blank
5
group columns
6
2 decimals to the right for all numeric columns except …
7
integer format for N
Units
A
B
Not Randomized
N Mean Q1 Q2 Q3 Gini N Mean Q1 Q2 Q3 Gini N Mean Q1 Q2 Q3 Gini
Age years 158 51.42 42.98 51.93 58.90 12.60 154 48.58 41.43 48.11 55.80 11.39 106 52.87 46.00 53.00 61.00 11.25
Serum Bilirubin 158 2.87 0.80 1.40 3.20 3.15 154 3.65 0.72 1.30 3.60 4.46 106 3.12 0.72 1.40 3.08 3.59
Serum Cholesterol mg/dL 140 365.01 247.75 315.50 417.00 178.48 144 373.88 254.25 303.50 377.00 210.08 0




Serum Albumin 158 3.52 3.21 3.56 3.83 0.50 154 3.52 3.34 3.54 3.78 0.43 106 3.43 3.12 3.47 3.72 0.49

But don’t forget the Hmisc summaryM which, aside from substituting the less useful standard deviation for Gini’s mean difference, provides all the needed summaries for both continuous and categorical variables, as shown below. summaryM will also print test statistics, although these are inappropriate for a randomized trial when testing baseline characteristics.

options(prType='html')
s <- summaryM(age + bili + chol + albumin + sex + edema + stage ~ trt,
              data=p)
# Define a function to specify a 20% smaller font size in html
smaller <- markupSpecs$html$smaller   # is in Hmisc; see also smaller2
print(s, npct='both', prmsd=TRUE, msdsize=smaller, prN=TRUE,
      middle.bold=TRUE, round='auto')
Descriptive Statistics (N=418).
N
A
N=158
B
N=154
Not Randomized
N=106
Age
years
418 43.0 51.9 58.9  (51.4 ±11.0) N=158 41.4 48.1 55.8  (48.6 ±10.0) N=154 46.0 53.0 61.0  (52.9 ± 9.8) N=106
Serum Bilirubin 418 0.80 1.40 3.20  (2.87 ±3.63) N=158 0.72 1.30 3.60  (3.65 ±5.28) N=154 0.72 1.40 3.08  (3.12 ±4.04) N=106
Serum Cholesterol
mg/dL
284 248 316 417  (365 ±210) N=140 254 304 377  (374 ±252) N=144
Serum Albumin 418 3.21 3.56 3.83  (3.52 ±0.44) N=158 3.34 3.54 3.78  (3.52 ±0.40) N=154 3.12 3.47 3.72  (3.43 ±0.43) N=106
sex : f 418 0.87 137158 0.90 139154 0.92 98106
edema : 0 418 0.84 132158 0.85 131154 0.86 91106
  0.5 0.10 16158 0.08 13154 0.14 15106
  1 0.06 10158 0.06 10154 0.00 0106
stage : 1 412 0.08 12158 0.03 4154 0.05 5100
  2 0.22 35158 0.21 32154 0.25 25100
  3 0.35 56158 0.42 64154 0.35 35100
  4 0.35 55158 0.35 54154 0.35 35100
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD.   N is the number of non-missing values.