11  Computing Summary Statistics

flowchart LR
oav[Statistical Summaries of All Variables]
osv[Summaries of Variable Subsets]
ofun[Summaries Using Functions<br>Returning Multi-dimensional Results]
ms["Marginal Summaries<br>(e.g. Subtotals)"]

Many applications can use the automatically created data.table object .SD which stands for the data table for the current group being processed. If .SDcols were not specified, all variables would be attempted to be analyzed. Specify a vector of variable names as .SDcols to restrict the analysis. If there were no by variable(s), .SD stands for the whole data table.

require(Hmisc)
require(data.table)
getHdata(stressEcho)
d <- stressEcho
setDT(d)
# Compute the number of distinct values for all variables
d[, sapply(.SD, uniqueN)]   # uniqueN is in data.table
      bhr    basebp    basedp      pkhr       sbp        dp      dose     maxhr 
       69        94       441       105       142       508         7       103 
  pctMphr       mbp   dpmaxdo   dobdose       age    gender    baseEF     dobEF 
       78       132       484         8        62         2        54        60 
chestpain   restwma     posSE     newMI   newPTCA   newCABG     death    hxofHT 
        2         2         2         2         2         2         2         2 
   hxofDM   hxofCig    hxofMI  hxofPTCA  hxofCABG any.event       ecg 
        2         3         2         2         2         2         3 
# Same but only for variables whose names contain hx and either D or M
d[, sapply(.SD, uniqueN), .SDcols=patterns('hx', 'D|M')]
hxofDM hxofMI 
     2      2 
# Compute means on all numeric variables
mn <- function(x) mean(x, na.rm=TRUE)
d[, lapply(.SD, mn), .SDcols=is.numeric]
        bhr   basebp   basedp     pkhr      sbp       dp     dose    maxhr
1: 75.29032 135.3244 10181.31 120.5502 146.9158 17633.84 33.75448 119.3692
    pctMphr mbp  dpmaxdo  dobdose      age   baseEF    dobEF chestpain
1: 78.56989 156 18549.88 30.24194 67.34409 55.60394 65.24194 0.3082437
     restwma     posSE      newMI   newPTCA    newCABG      death    hxofHT
1: 0.4605735 0.2437276 0.05017921 0.0483871 0.05913978 0.04301075 0.7043011
      hxofDM    hxofMI  hxofPTCA  hxofCABG any.event
1: 0.3691756 0.2759857 0.0734767 0.1577061 0.1594982
# Compute means on all numeric non-binary variables
nnb <- function(x) is.numeric(x) && uniqueN(x) > 2
d[, lapply(.SD, mn), .SDcols=nnb]
        bhr   basebp   basedp     pkhr      sbp       dp     dose    maxhr
1: 75.29032 135.3244 10181.31 120.5502 146.9158 17633.84 33.75448 119.3692
    pctMphr mbp  dpmaxdo  dobdose      age   baseEF    dobEF
1: 78.56989 156 18549.88 30.24194 67.34409 55.60394 65.24194
# Print frequency tables of all categorical variables with > 2 levels
cmult <- function(x) ! is.numeric(x) && uniqueN(x) > 2
tab <- function(x) {
  z <- table(x, useNA='ifany')
  paste(paste0(names(z), ': ', z), collapse=', ')
}
d[, lapply(.SD, tab), .SDcols=cmult]
                                      hxofCig
1: heavy: 122, moderate: 138, non-smoker: 298
                                   ecg
1: normal: 311, equivocal: 176, MI: 71

Tabulate all variables having between 3 and 10 distinct values and create a side effect when data.table is running that makes the summarization function tab store all values and frequencies in a growing list Z so that kable can render a markdown table after we pad columns to the maximum length of all columns (maximum number of distinct values).

# Using <<- makes data.table have a side effect of augmenting Z and
# Align in the global environment
tab <- function(x) {
  z <- table(x, useNA='ifany')
  i <- length(Z)
  Z[[i+1]] <<- names(z)
  Z[[i+2]] <<- as.vector(z)
  Align <<- c(Align, if(is.numeric(x)) 'r' else 'l', 'r')
  length(z)
}
discr <- function(x) { i <- uniqueN(x); i > 2 & i < 11 }
#                                       or i %between% c(2,11)
Z    <- list(); Align <- character(0)
w    <- d[, lapply(.SD, tab), .SDcols=discr]
maxl <- max(w)
# Pad shorter vectors with blanks
Z <- lapply(Z, function(x) c(x, rep('', maxl - length(x))))
Z <- do.call('cbind', Z)  # combine all into columns of a matrix
colnames(Z) <- rep(names(w), each=2)
colnames(Z)[seq(2, ncol(Z), by=2)] <- 'Freq'
knitr::kable(Z, align=Align)
dose Freq dobdose Freq hxofCig Freq ecg Freq
10 2 5 7 heavy 122 normal 311
15 28 10 7 moderate 138 equivocal 176
20 47 15 55 non-smoker 298 MI 71
25 56 20 73
30 64 25 71
35 61 30 78
40 300 35 62
40 205

A better approach is to let the kables function put together a series of separate markdown tables of different sizes. By using the “updating Z in the global environment” side effect we are able to let data.table output any type of objects of non-conformable dimensions over variables (such as frequency tabulations).

tab <- function(x) {
  z <- table(x, useNA='ifany')
  i <- length(Z)
  w <- matrix(cbind(names(z), z), ncol=2,
              dimnames=list(NULL, c(vnames[i+1], 'Freq')))
  Z[[i+1]] <<- knitr::kable(w, align=c(if(is.numeric(x)) 'r' else 'l', 'r'))
  length(z)
}
discr <- function(x) { i <- uniqueN(x); i > 2 & i < 11 }
Z      <- list()
vnames <- names(d[, .SD, .SDcols=discr])
w      <- d[, lapply(.SD, tab), .SDcols=discr]
knitr::kables(Z)
dose Freq
10 2
15 28
20 47
25 56
30 64
35 61
40 300
dobdose Freq
5 7
10 7
15 55
20 73
25 71
30 78
35 62
40 205
hxofCig Freq
heavy 122
moderate 138
non-smoker 298
ecg Freq
normal 311
equivocal 176
MI 71

Use a similar side-effect approach to get separate html describe output by gender.

g <- function(x, by) {
  Z[[length(Z) + 1]] <<- describe(x, descript=paste('age for', by))
  by
}
Z <- list()
by <- d[, g(age, gender), by=gender]
# Make Z look like describe() output for multiple variables
class(Z) <- 'describe'
attr(Z, 'dimensions') <- c(nrow(d), nrow(by))
attr(Z, 'descript') <- 'Age by Gender'
html(Z)
Age by Gender Descriptives
Age by Gender

2 Variables   558 Observations

age for male: Age years
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
2200510.99967.8612.9145.9551.0062.0069.0075.0081.0086.00
lowest : 30 34 38 40 43 , highest: 88 89 91 92 93
age for female: Age years
image
nmissingdistinctInfoMeanGmd.05.10.25.50.75.90.95
3380580.99967.0113.7447.0050.7059.2568.0076.0083.0085.00
lowest : 26 28 29 33 34 , highest: 87 88 89 90 91
# Compute a 1-valued statistic on multiple variables, by cross-classification
# of two variables.  Do this on a subset.  .SDcols=a:b uses variables in order
# Use keyby instead of by to order output the usual way
d[age < 70, lapply(.SD, mean), keyby=.(gender, newMI), .SDcols=pkhr:dp]
   gender newMI     pkhr      sbp       dp
1:   male     0 122.0561 147.7664 17836.24
2:   male     1 115.6667 140.6667 16437.67
3: female     0 122.8492 150.5084 18509.03
4: female     1 123.5714 171.5714 21506.71
# Compute multiple statistics on one variable
# Note: .N is a special variable: count of obs for current group
d[, .(Max=max(bhr), Min=min(bhr), Mean=mean(bhr), N=.N), by=.(gender, newMI)]
   gender newMI Max Min     Mean   N
1:   male     0 127  42 74.15385 208
2:   male     1  89  59 71.25000  12
3: female     0 210  45 75.96894 322
4: female     1 108  65 79.43750  16
# Same result another way
g <- function(x) list(Max=max(x), Min=min(x), Mean=mean(x), N=length(x))
d[, g(bhr), by=.(gender, newMI)]  # if g returned a vector instead, use as.list(g(bhr))
   gender newMI Max Min     Mean   N
1:   male     0 127  42 74.15385 208
2:   male     1  89  59 71.25000  12
3: female     0 210  45 75.96894 322
4: female     1 108  65 79.43750  16
d[, as.list(quantile(bhr)), by=gender]
   gender 0% 25% 50% 75% 100%
1:   male 42  63  72  83  127
2: female 45  65  75  85  210
# Compute mean bhr by quintiles of age using Hmisc::cut2
# Bad statistical practice; use scatterplot + smoother instead
d[, .(Mean=mean(bhr)), keyby=.(fifth=cut2(age, g=5))]
     fifth     Mean
1: [26,60) 78.54545
2: [60,67) 76.96907
3: [67,72) 74.26316
4: [72,78) 75.92793
5: [78,93] 70.03846
# Compute multiple statistics on multiple variables
d[, lapply(.SD, quantile), by=gender, .SDcols=.q(bhr, pkhr, sbp)]
    gender bhr   pkhr    sbp
 1:   male  42  52.00  80.00
 2:   male  63 106.75 120.00
 3:   male  72 123.00 140.00
 4:   male  83 136.00 166.00
 5:   male 127 176.00 250.00
 6: female  45  61.00  40.00
 7: female  65 106.25 122.25
 8: female  75 122.00 141.50
 9: female  85 134.00 170.00
10: female 210 210.00 309.00
# Similar but put percentile number in front of statistic value
# Do only quartiles
g <- function(x) {
  z <- quantile(x, (1:3)/4, na.rm=TRUE)
  paste(format(names(z)), format(round(z)))
}
d[, lapply(.SD, g), by=gender, .SDcols=.q(bhr, pkhr, sbp)]
   gender    bhr    pkhr     sbp
1:   male 25% 63 25% 107 25% 120
2:   male 50% 72 50% 123 50% 140
3:   male 75% 83 75% 136 75% 166
4: female 25% 65 25% 106 25% 122
5: female 50% 75 50% 122 50% 142
6: female 75% 85 75% 134 75% 170
# To have more control over labeling and to have one row per sex:
g <- function(x) {
  s <- sapply(x, quantile, na.rm=TRUE)  # compute quantiles for all variables -> matrix
  h <- as.list(s)  # vectorizes first
  # Cross row names (percentile names) with column (variable) names
  # paste(b, a) puts variable name in front of percentile
  names(h) <- outer(rownames(s), colnames(s), function(a, b) paste(b, a))
  h
}
d[, g(.SD), by=gender, .SDcols=.q(bhr, pkhr, sbp)]
   gender bhr 0% bhr 25% bhr 50% bhr 75% bhr 100% pkhr 0% pkhr 25% pkhr 50%
1:   male     42      63      72      83      127      52   106.75      123
2: female     45      65      75      85      210      61   106.25      122
   pkhr 75% pkhr 100% sbp 0% sbp 25% sbp 50% sbp 75% sbp 100%
1:      136       176     80  120.00   140.0     166      250
2:      134       210     40  122.25   141.5     170      309
# Restrict to variables bhr - basedp in order columns created in data table
d[, g(.SD), by=gender, .SDcols=bhr : basedp]
   gender bhr 0% bhr 25% bhr 50% bhr 75% bhr 100% basebp 0% basebp 25%
1:   male     42      63      72      83      127        85     120.00
2: female     45      65      75      85      210        90     122.25
   basebp 50% basebp 75% basebp 100% basedp 0% basedp 25% basedp 50% basedp 75%
1:        130        145         203      5220    7976.25       9483   11183.50
2:        136        150         201      5000    8562.00      10063   11891.25
   basedp 100%
1:       16704
2:       27300
# Can put ! in front of a sequence of variables to do the opposite

# To add duplicated means to raw data use e.g.
# d[, Mean := mean(x), by=sex]

11.1 Summary Statistics Using Functions Returning Two-Dimensional Results

data.table can create objects where results are matrices instead of data tables, or where table elements are not just scalar quantities but are lists. We previously saw examples where the summarization function produced a vector or where complex side effects happened without storing the results in a data table. Now consider examples where the summarization function produces a matrix or a list of lists and the final result is a matrix or a more complex data table. Start with calculating three quartiles on one variable two different ways. The examples that follow do not give satisfactory labeling when a by variable is added.

qu <- function(x, type=7) {
  y <- quantile(x, (1:3)/4, na.rm=TRUE, type=type)
  names(y) <- .q(Q1, Q2, Q3)
  y
}
g  <- function(x) rbind(type7=qu(x), type8=qu(x, type=8))
d[, g(pkhr)]   # 2 x 3 matrix
          Q1  Q2       Q3
type7 106.25 122 135.0000
type8 106.00 122 135.0833
# d[, g(pkhr), by=gender] does not retain names

Compute 3 quartiles for each of 3 variables.

g <- function(x) lapply(x, qu)
d[, g(.SD), .SDcols=.q(bhr, pkhr, sbp)]  # a data.table w/out Q1 Q2 Q3 labels
   bhr   pkhr sbp
1:  64 106.25 120
2:  74 122.00 141
3:  84 135.00 170
g <- function(x) apply(x, 2, qu)
d[, g(.SD), .SDcols=.q(bhr, pkhr, sbp)]  # a matrix with Q1 Q2 Q3 labels
   bhr   pkhr sbp
Q1  64 106.25 120
Q2  74 122.00 141
Q3  84 135.00 170

Now consider a summarization function computing a 2 \(\times\) 2 result where some values are numeric and some are character strings. Separately for each value of a by variable we desire the name of the variable having the lowest number of missing values, and the frequency of NAs, and the name of the variable having the highest number of missing values, and that frequency. We use the pbc dataset that comes with the survival package, and do the variable search separately by outcome status. The summarization function g operates on a data table for the current outcome status. Consider only the variables in pbc that are ever NA.

data(pbc, package='survival')
p <- copy(pbc)
setDT(p, key='status')  # sort by status
numna <- sapply(p, function(x) sum(is.na(x)))  # loops over vars and counts NAs -> vector
v     <- names(numna[numna > 0])   # names of variables with any NA 

g <- function(x) {
  m <- sapply(x, function(y) sum(is.na(y)))
  mn <- m[which.min(m)]   # keeps variable name in names attribute
  mx <- m[which.max(m)]
  list(Lowest  = list(VariableLow  = names(mn), Nlow  = mn),
       Highest = list(VariableHigh = names(mx), Nhigh = mx))
}
w=p[, g(.SD), by=status, .SDcols = v]   # data.table
w
   status   Lowest Highest
1:      0  protime    trig
2:      0        1      81
3:      1 platelet    chol
4:      1        0       7
5:      2  protime    trig
6:      2        1      48
sapply(w, class)
   status    Lowest   Highest 
"integer"    "list"    "list" 
w[1]
   status  Lowest Highest
1:      0 protime    trig
w[1, Lowest]
[[1]]
[1] "protime"
w[2, Lowest]
[[1]]
protime 
      1 

The Lowest and Highest elements of a given row of w are R lists, allowing for mixed object types (character/integer). This doesn’t make for the most concise way to retrieve individual elements but w prints well. We can make a more conventional data table with the following code.

g <- function(x) {
  m <- sapply(x, function(y) sum(is.na(y)))
  mn <- m[which.min(m)]
  mx <- m[which.max(m)]
  list(VarLow  = names(mn), Nlow  = mn,
       VarHigh = names(mx), Nhigh = mx)
}
w=p[, g(.SD), by=status, .SDcols = v]   # data.table
sapply(w, class)
     status      VarLow        Nlow     VarHigh       Nhigh 
  "integer" "character"   "integer" "character"   "integer" 
w
   status   VarLow Nlow VarHigh Nhigh
1:      0  protime    1    trig    81
2:      1 platelet    0    chol     7
3:      2  protime    1    trig    48

11.2 Summary Statistics With Marginal Summaries

The cube function in the data.table package will compute all possible marginal statistics. When there is only one by variable, the overall statistic is computed in addition to compute stratified values. When a dimension is being marginalized over, the value of the by variable for that dimension will be NA.

mn  <- function(x) as.double(mean(x, na.rm=TRUE))
# as.double ensures consistency of numeric type across groups
Nna <- function(x) sum(! is.na(x))
cube(d, .(Meanbhr = mn(bhr), N = Nna(bhr)), by='gender', id=TRUE)
   grouping gender  Meanbhr   N
1:        0   male 73.99545 220
2:        0 female 76.13314 338
3:        1   <NA> 75.29032 558
cube(d, .(Meanbhr = mn(bhr), N = Nna(bhr)), by=.q(gender, hxofMI), id=TRUE)
   grouping gender hxofMI  Meanbhr   N
1:        0   male      1 73.22472  89
2:        0 female      0 76.30403 273
3:        0   male      0 74.51908 131
4:        0 female      1 75.41538  65
5:        1   male     NA 73.99545 220
6:        1 female     NA 76.13314 338
7:        2   <NA>      1 74.14935 154
8:        2   <NA>      0 75.72525 404
9:        3   <NA>     NA 75.29032 558
# id=TRUE creates output variable grouping to detail level of marginalization
# It is a binary representation, e.g. if by has 3 variables and a row
# is marginalizing over the first and third variables,
# grouping=binary 101 = 5
# Use groupingsets() to control the marginalizations
# Example: marginalize only one variable at a time
groupingsets(d, .(Meanbhr = mn(bhr), N=Nna(bhr)), by=.q(gender, hxofMI),
             sets=list('gender', 'hxofMI'), id=TRUE)
   grouping gender hxofMI  Meanbhr   N
1:        1   male     NA 73.99545 220
2:        1 female     NA 76.13314 338
3:        2   <NA>      1 74.14935 154
4:        2   <NA>      0 75.72525 404

11.3 Customizing Tables of Summary Statistics

Suppose we wanted to compute N, the mean, the quartiles, and Gini’s mean difference for 4 continuous variables, stratified by treatments A and B and not randomized. Start by writing the per-variable per-stratum summarization function g.

g <- function(x) {
  x <- x[! is.na(x)]
    n <- length(x)
    if(n < 2) return(list(N=n, Mean=if(n == 1) x else NA_real_,
                          Q1=NA_real_, Q2=NA_real_, Q3=NA_real_, Gini=NA_real_))
    q <- quantile(x, (1:3) / 4)
    # data table wants constant numeric types across by-groups
    list(N=n,
       Mean = as.double(mean(x)),
       Q1   = as.double(q[1]),
       Q2   = as.double(q[2]),
       Q3   = as.double(q[3]),
       Gini = as.double(GiniMd(x)))  # GiniMd is in Hmisc
    }
set.seed(1)
g(runif(1000))  # test it
$N
[1] 1000

$Mean
[1] 0.4996917

$Q1
[1] 0.2581286

$Q2
[1] 0.4832595

$Q3
[1] 0.7469325

$Gini
[1] 0.3330362

Next, melt the data table to make it tall and thin so that all the analysis variables may be processed in the same way with repetitive programming. The Hmisc meltData function makes this easy, and defaults to using variable labels. Let’s put units of measurement as a separate variable Units.

p <- upData(p,
            trt = ifelse(is.na(trt), 3, trt),
            trt = factor(trt, 1:3, c('A', 'B', 'Not Randomized')),
            labels=c(age ='Age', bili='Serum Bilirubin',
                     chol='Serum Cholesterol', albumin='Serum Albumin'),
            units=c(age='years', chol='mg/dL'))
Input object size:   50560 bytes;    20 variables    418 observations
Modified variable   trt
Modified variable   trt
New object size:    53720 bytes;    20 variables    418 observations
m <- meltData(age + bili + chol + albumin ~ trt, data=p, tall='left',
              sepunits=TRUE)   # creates Units variable

Now for each variable compute the statistical summary. Then use data.table::dcast to put treatments side-by-side for each variable (row). In the code below we are careful to follow the good programming practice of computing column numbers instead of hard-coding column numbers for gt.

s <- m[, g(value), by=.(variable, Units, trt)]
s
             variable Units            trt   N       Mean        Q1        Q2
 1:               Age years              A 158  51.419108  42.98015  51.93155
 2:               Age years              B 154  48.582540  41.43326  48.11088
 3:               Age years Not Randomized 106  52.868286  46.00137  52.99932
 4:   Serum Bilirubin                    A 158   2.873418   0.80000   1.40000
 5:   Serum Bilirubin                    B 154   3.648701   0.72500   1.30000
 6:   Serum Bilirubin       Not Randomized 106   3.116981   0.72500   1.40000
 7: Serum Cholesterol mg/dL              A 140 365.014286 247.75000 315.50000
 8: Serum Cholesterol mg/dL              B 144 373.881944 254.25000 303.50000
 9: Serum Cholesterol mg/dL Not Randomized   0         NA        NA        NA
10:     Serum Albumin                    A 158   3.516266   3.21250   3.56500
11:     Serum Albumin                    B 154   3.523831   3.34250   3.54500
12:     Serum Albumin       Not Randomized 106   3.431038   3.12500   3.47000
           Q3        Gini
 1:  58.90486  12.6002465
 2:  55.80424  11.3909817
 3:  60.99932  11.2543580
 4:   3.20000   3.1456422
 5:   3.60000   4.4610050
 6:   3.07500   3.5932435
 7: 417.00000 178.4799589
 8: 377.00000 210.0807110
 9:        NA          NA
10:   3.83000   0.4973885
11:   3.77750   0.4289780
12:   3.72000   0.4934106
w <- dcast(s, variable + Units ~ trt, value.var=.q(N, Mean, Q1, Q2, Q3, Gini))
# Sort columns so that treatments are together
v <- names(w)
# Function to get list of variable names ending with a
u <- function(a) v[grep(paste0(a, '$'), v)]
# Check it
u('_A')
[1] "N_A"    "Mean_A" "Q1_A"   "Q2_A"   "Q3_A"   "Gini_A"
# Put columns in desired order using data.table::setcolorder
setcolorder(w, c('variable', 'Units',
                 cola <- u('_A'), colb <- u('_B'),
                 colnr <- u('_Not Randomized')))
# Make short column names
sn <- sub('_A$|_B$|_Not Randomized$', '', names(w))
sn[1] <- ''
# Reformat quantile headings to use html subscripts
# qt will not vertically align these properly if using markdown instead.
sn <- sub('^Q(.)', 'Q<sub>\\1</sub>', sn)
names(sn) <- names(w)
sn
           variable               Units                 N_A              Mean_A 
                 ""             "Units"                 "N"              "Mean" 
               Q1_A                Q2_A                Q3_A              Gini_A 
    "Q<sub>1</sub>"     "Q<sub>2</sub>"     "Q<sub>3</sub>"              "Gini" 
                N_B              Mean_B                Q1_B                Q2_B 
                "N"              "Mean"     "Q<sub>1</sub>"     "Q<sub>2</sub>" 
               Q3_B              Gini_B    N_Not Randomized Mean_Not Randomized 
    "Q<sub>3</sub>"              "Gini"                 "N"              "Mean" 
  Q1_Not Randomized   Q2_Not Randomized   Q3_Not Randomized Gini_Not Randomized 
    "Q<sub>1</sub>"     "Q<sub>2</sub>"     "Q<sub>3</sub>"              "Gini" 
# Save column numbers for Ns
colN <- grep('^N_', names(w))

Compose the table using the gt package. See also Section 4.5.1 and Section 4.8.

require(gt)
gt(w) |>
  tab_style(style=cell_text(size='small'),
            locations=cells_column_labels(columns='Units')) |>
  tab_style(style=cell_text(size='x-small', font='arial'),
            locations=cells_body(columns=Units))  |>
  cols_label(.list=sn, .fn=html)                  |>
  sub_missing(missing_text='')                    |>
  tab_spanner('A', columns=cola)                  |>
  tab_spanner('B', columns=colb)                  |>
  tab_spanner('Not Randomized', columns=colnr)    |>
  fmt_number(decimals=2)                          |>
  fmt_number(columns=colN, decimals=0)
1
smaller font for Units column heading
2
extra-small arial font for Units column contents
3
rename columns to short names and recognize html
4
turn NA into blank
5
group columns
6
2 decimals to the right for all numeric columns except …
7
integer format for N
Units A B Not Randomized
N Mean Q1 Q2 Q3 Gini N Mean Q1 Q2 Q3 Gini N Mean Q1 Q2 Q3 Gini
Age years 158 51.42 42.98 51.93 58.90 12.60 154 48.58 41.43 48.11 55.80 11.39 106 52.87 46.00 53.00 61.00 11.25
Serum Bilirubin 158 2.87 0.80 1.40 3.20 3.15 154 3.65 0.72 1.30 3.60 4.46 106 3.12 0.72 1.40 3.08 3.59
Serum Cholesterol mg/dL 140 365.01 247.75 315.50 417.00 178.48 144 373.88 254.25 303.50 377.00 210.08 0




Serum Albumin 158 3.52 3.21 3.56 3.83 0.50 154 3.52 3.34 3.54 3.78 0.43 106 3.43 3.12 3.47 3.72 0.49

But don’t forget the Hmisc summaryM which, aside from substituting the less useful standard deviation for Gini’s mean difference, provides all the needed summaries for both continuous and categorical variables, as shown below. summaryM will also print test statistics, although these are inappropriate for a randomized trial when testing baseline characteristics.

options(prType='html')
s <- summaryM(age + bili + chol + albumin + sex + edema + stage ~ trt,
              data=p)
# Define a function to specify a 20% smaller font size in html
smaller <- markupSpecs$html$smaller   # is in Hmisc; see also smaller2
print(s, npct='both', prmsd=TRUE, msdsize=smaller, prN=TRUE,
      middle.bold=TRUE, round='auto')
Descriptive Statistics (N=418).
N
A
N=158
B
N=154
Not Randomized
N=106
Age
years
418 43.0 51.9 58.9  (51.4 ±11.0) N=158 41.4 48.1 55.8  (48.6 ±10.0) N=154 46.0 53.0 61.0  (52.9 ± 9.8) N=106
Serum Bilirubin 418 0.80 1.40 3.20  (2.87 ±3.63) N=158 0.72 1.30 3.60  (3.65 ±5.28) N=154 0.72 1.40 3.08  (3.12 ±4.04) N=106
Serum Cholesterol
mg/dL
284 248 316 417  (365 ±210) N=140 254 304 377  (374 ±252) N=144
Serum Albumin 418 3.21 3.56 3.83  (3.52 ±0.44) N=158 3.34 3.54 3.78  (3.52 ±0.40) N=154 3.12 3.47 3.72  (3.43 ±0.43) N=106
sex : f 418 0.87 137158 0.90 139154 0.92 98106
edema : 0 418 0.84 132158 0.85 131154 0.86 91106
  0.5 0.10 16158 0.08 13154 0.14 15106
  1 0.06 10158 0.06 10154 0.00 0106
stage : 1 412 0.08 12158 0.03 4154 0.05 5100
  2 0.22 35158 0.21 32154 0.25 25100
  3 0.35 56158 0.42 64154 0.35 35100
  4 0.35 55158 0.35 54154 0.35 35100
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD.   N is the number of non-missing values.