11 Computing Summary Statistics
Many applications can use the automatically created data.table
object .SD
which stands for the data table for the current group being processed. If .SDcols
were not specified, all variables would be attempted to be analyzed. Specify a vector of variable names as .SDcols
to restrict the analysis. If there were no by
variable(s), .SD
stands for the whole data table.
require(Hmisc)
require(data.table)
getHdata(stressEcho)
<- stressEcho
d setDT(d)
# Compute the number of distinct values for all variables
sapply(.SD, uniqueN)] # uniqueN is in data.table d[,
bhr basebp basedp pkhr sbp dp dose maxhr
69 94 441 105 142 508 7 103
pctMphr mbp dpmaxdo dobdose age gender baseEF dobEF
78 132 484 8 62 2 54 60
chestpain restwma posSE newMI newPTCA newCABG death hxofHT
2 2 2 2 2 2 2 2
hxofDM hxofCig hxofMI hxofPTCA hxofCABG any.event ecg
2 3 2 2 2 2 3
# Same but only for variables whose names contain hx and either D or M
sapply(.SD, uniqueN), .SDcols=patterns('hx', 'D|M')] d[,
hxofDM hxofMI
2 2
# Compute means on all numeric variables
<- function(x) mean(x, na.rm=TRUE)
mn lapply(.SD, mn), .SDcols=is.numeric] d[,
bhr basebp basedp pkhr sbp dp dose maxhr
<num> <num> <num> <num> <num> <num> <num> <num>
1: 75.29032 135.3244 10181.31 120.5502 146.9158 17633.84 33.75448 119.3692
pctMphr mbp dpmaxdo dobdose age baseEF dobEF chestpain
<num> <num> <num> <num> <num> <num> <num> <num>
1: 78.56989 156 18549.88 30.24194 67.34409 55.60394 65.24194 0.3082437
restwma posSE newMI newPTCA newCABG death hxofHT
<num> <num> <num> <num> <num> <num> <num>
1: 0.4605735 0.2437276 0.05017921 0.0483871 0.05913978 0.04301075 0.7043011
hxofDM hxofMI hxofPTCA hxofCABG any.event
<num> <num> <num> <num> <num>
1: 0.3691756 0.2759857 0.0734767 0.1577061 0.1594982
# Compute means on all numeric non-binary variables
<- function(x) is.numeric(x) && uniqueN(x) > 2
nnb lapply(.SD, mn), .SDcols=nnb] d[,
bhr basebp basedp pkhr sbp dp dose maxhr
<num> <num> <num> <num> <num> <num> <num> <num>
1: 75.29032 135.3244 10181.31 120.5502 146.9158 17633.84 33.75448 119.3692
pctMphr mbp dpmaxdo dobdose age baseEF dobEF
<num> <num> <num> <num> <num> <num> <num>
1: 78.56989 156 18549.88 30.24194 67.34409 55.60394 65.24194
# Print frequency tables of all categorical variables with > 2 levels
<- function(x) ! is.numeric(x) && uniqueN(x) > 2
cmult <- function(x) {
tab <- table(x, useNA='ifany')
z paste(paste0(names(z), ': ', z), collapse=', ')
}lapply(.SD, tab), .SDcols=cmult] d[,
hxofCig
<char>
1: heavy: 122, moderate: 138, non-smoker: 298
ecg
<char>
1: normal: 311, equivocal: 176, MI: 71
Tabulate all variables having between 3 and 10 distinct values and create a side effect when data.table
is running that makes the summarization function tab
store all values and frequencies in a growing list Z
so that kable
can render a markdown
table after we pad columns to the maximum length of all columns (maximum number of distinct values).
# Using <<- makes data.table have a side effect of augmenting Z and
# Align in the global environment
<- function(x) {
tab <- table(x, useNA='ifany')
z <- length(Z)
i +1]] <<- names(z)
Z[[i+2]] <<- as.vector(z)
Z[[i<<- c(Align, if(is.numeric(x)) 'r' else 'l', 'r')
Align length(z)
}<- function(x) { i <- uniqueN(x); i > 2 & i < 11 }
discr # or i %between% c(2,11)
<- list(); Align <- character(0)
Z <- d[, lapply(.SD, tab), .SDcols=discr]
w <- max(w)
maxl # Pad shorter vectors with blanks
<- lapply(Z, function(x) c(x, rep('', maxl - length(x))))
Z <- do.call('cbind', Z) # combine all into columns of a matrix
Z colnames(Z) <- rep(names(w), each=2)
colnames(Z)[seq(2, ncol(Z), by=2)] <- 'Freq'
::kable(Z, align=Align) knitr
dose | Freq | dobdose | Freq | hxofCig | Freq | ecg | Freq |
---|---|---|---|---|---|---|---|
10 | 2 | 5 | 7 | heavy | 122 | normal | 311 |
15 | 28 | 10 | 7 | moderate | 138 | equivocal | 176 |
20 | 47 | 15 | 55 | non-smoker | 298 | MI | 71 |
25 | 56 | 20 | 73 | ||||
30 | 64 | 25 | 71 | ||||
35 | 61 | 30 | 78 | ||||
40 | 300 | 35 | 62 | ||||
40 | 205 |
A better approach is to let the kables
function put together a series of separate markdown
tables of different sizes. By using the “updating Z
in the global environment” side effect we are able to let data.table
output any type of objects of non-conformable dimensions over variables (such as frequency tabulations).
<- function(x) {
tab <- table(x, useNA='ifany')
z <- length(Z)
i <- matrix(cbind(names(z), z), ncol=2,
w dimnames=list(NULL, c(vnames[i+1], 'Freq')))
+1]] <<- knitr::kable(w, align=c(if(is.numeric(x)) 'r' else 'l', 'r'))
Z[[ilength(z)
}<- function(x) { i <- uniqueN(x); i > 2 & i < 11 }
discr <- list()
Z <- names(d[, .SD, .SDcols=discr])
vnames <- d[, lapply(.SD, tab), .SDcols=discr]
w ::kables(Z) knitr
|
|
|
|
Use a similar side-effect approach to get separate html
describe
output by gender
.
<- function(x, by) {
g length(Z) + 1]] <<- describe(x, descript=paste('age for', by))
Z[[
by
}<- list()
Z <- d[, g(age, gender), by=gender]
by # Make Z look like describe() output for multiple variables
class(Z) <- 'describe'
attr(Z, 'dimensions') <- c(nrow(d), nrow(by))
attr(Z, 'descript') <- 'Age by Gender'
html(Z)
2 Variables 558 Observations
age for male: Age years
n | missing | distinct | Info | Mean | pMedian | Gmd | .05 | .10 | .25 | .50 | .75 | .90 | .95 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
220 | 0 | 51 | 0.999 | 67.86 | 68.5 | 12.91 | 45.95 | 51.00 | 62.00 | 69.00 | 75.00 | 81.00 | 86.00 |
age for female: Age years
n | missing | distinct | Info | Mean | pMedian | Gmd | .05 | .10 | .25 | .50 | .75 | .90 | .95 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
338 | 0 | 58 | 0.999 | 67.01 | 67.5 | 13.74 | 47.00 | 50.70 | 59.25 | 68.00 | 76.00 | 83.00 | 85.00 |
# Compute a 1-valued statistic on multiple variables, by cross-classification
# of two variables. Do this on a subset. .SDcols=a:b uses variables in order
# Use keyby instead of by to order output the usual way
< 70, lapply(.SD, mean), keyby=.(gender, newMI), .SDcols=pkhr:dp] d[age
Key: <gender, newMI>
gender newMI pkhr sbp dp
<fctr> <labelled> <labelled> <labelled> <labelled>
1: male 0 122.0561 147.7664 17836.24
2: male 1 115.6667 140.6667 16437.67
3: female 0 122.8492 150.5084 18509.03
4: female 1 123.5714 171.5714 21506.71
# Compute multiple statistics on one variable
# Note: .N is a special variable: count of obs for current group
Max=max(bhr), Min=min(bhr), Mean=mean(bhr), N=.N), by=.(gender, newMI)] d[, .(
gender newMI Max Min Mean N
<fctr> <labelled> <labelled> <labelled> <labelled> <int>
1: male 0 127 42 74.15385 208
2: male 1 89 59 71.25000 12
3: female 0 210 45 75.96894 322
4: female 1 108 65 79.43750 16
# Same result another way
<- function(x) list(Max=max(x), Min=min(x), Mean=mean(x), N=length(x))
g g(bhr), by=.(gender, newMI)] # if g returned a vector instead, use as.list(g(bhr)) d[,
gender newMI Max Min Mean N
<fctr> <labelled> <int> <int> <num> <int>
1: male 0 127 42 74.15385 208
2: male 1 89 59 71.25000 12
3: female 0 210 45 75.96894 322
4: female 1 108 65 79.43750 16
as.list(quantile(bhr)), by=gender] d[,
gender 0% 25% 50% 75% 100%
<fctr> <num> <num> <num> <num> <num>
1: male 42 63 72 83 127
2: female 45 65 75 85 210
# Compute mean bhr by quintiles of age using Hmisc::cut2
# Bad statistical practice; use scatterplot + smoother instead
Mean=mean(bhr)), keyby=.(fifth=cut2(age, g=5))] d[, .(
Key: <fifth>
fifth Mean
<labelled> <labelled>
1: [26,60) 78.54545
2: [60,67) 76.96907
3: [67,72) 74.26316
4: [72,78) 75.92793
5: [78,93] 70.03846
# Compute multiple statistics on multiple variables
lapply(.SD, quantile), by=gender, .SDcols=.q(bhr, pkhr, sbp)] d[,
gender bhr pkhr sbp
<fctr> <labelled> <labelled> <labelled>
1: male 42 52.00 80.00
2: male 63 106.75 120.00
3: male 72 123.00 140.00
4: male 83 136.00 166.00
5: male 127 176.00 250.00
6: female 45 61.00 40.00
7: female 65 106.25 122.25
8: female 75 122.00 141.50
9: female 85 134.00 170.00
10: female 210 210.00 309.00
# Similar but put percentile number in front of statistic value
# Do only quartiles
<- function(x) {
g <- quantile(x, (1:3)/4, na.rm=TRUE)
z paste(format(names(z)), format(round(z)))
}lapply(.SD, g), by=gender, .SDcols=.q(bhr, pkhr, sbp)] d[,
gender bhr pkhr sbp
<fctr> <char> <char> <char>
1: male 25% 63 25% 107 25% 120
2: male 50% 72 50% 123 50% 140
3: male 75% 83 75% 136 75% 166
4: female 25% 65 25% 106 25% 122
5: female 50% 75 50% 122 50% 142
6: female 75% 85 75% 134 75% 170
# To have more control over labeling and to have one row per sex:
<- function(x) {
g <- sapply(x, quantile, na.rm=TRUE) # compute quantiles for all variables -> matrix
s <- as.list(s) # vectorizes first
h # Cross row names (percentile names) with column (variable) names
# paste(b, a) puts variable name in front of percentile
names(h) <- outer(rownames(s), colnames(s), function(a, b) paste(b, a))
h
}g(.SD), by=gender, .SDcols=.q(bhr, pkhr, sbp)] d[,
gender bhr 0% bhr 25% bhr 50% bhr 75% bhr 100% pkhr 0% pkhr 25% pkhr 50%
<fctr> <num> <num> <num> <num> <num> <num> <num> <num>
1: male 42 63 72 83 127 52 106.75 123
2: female 45 65 75 85 210 61 106.25 122
pkhr 75% pkhr 100% sbp 0% sbp 25% sbp 50% sbp 75% sbp 100%
<num> <num> <num> <num> <num> <num> <num>
1: 136 176 80 120.00 140.0 166 250
2: 134 210 40 122.25 141.5 170 309
# Restrict to variables bhr - basedp in order columns created in data table
g(.SD), by=gender, .SDcols=bhr : basedp] d[,
gender bhr 0% bhr 25% bhr 50% bhr 75% bhr 100% basebp 0% basebp 25%
<fctr> <num> <num> <num> <num> <num> <num> <num>
1: male 42 63 72 83 127 85 120.00
2: female 45 65 75 85 210 90 122.25
basebp 50% basebp 75% basebp 100% basedp 0% basedp 25% basedp 50% basedp 75%
<num> <num> <num> <num> <num> <num> <num>
1: 130 145 203 5220 7976.25 9483 11183.50
2: 136 150 201 5000 8562.00 10063 11891.25
basedp 100%
<num>
1: 16704
2: 27300
# Can put ! in front of a sequence of variables to do the opposite
# To add duplicated means to raw data use e.g.
# d[, Mean := mean(x), by=sex]
11.1 Summary Statistics Using Functions Returning Two-Dimensional Results
data.table
can create objects where results are matrices instead of data tables, or where table elements are not just scalar quantities but are list
s. We previously saw examples where the summarization function produced a vector or where complex side effects happened without storing the results in a data table. Now consider examples where the summarization function produces a matrix or a list of lists and the final result is a matrix or a more complex data table. Start with calculating three quartiles on one variable two different ways. The examples that follow do not give satisfactory labeling when a by
variable is added.
<- function(x, type=7) {
qu <- quantile(x, (1:3)/4, na.rm=TRUE, type=type)
y names(y) <- .q(Q1, Q2, Q3)
y
}<- function(x) rbind(type7=qu(x), type8=qu(x, type=8))
g g(pkhr)] # 2 x 3 matrix d[,
Q1 Q2 Q3
type7 106.25 122 135.0000
type8 106.00 122 135.0833
# d[, g(pkhr), by=gender] does not retain names
Compute 3 quartiles for each of 3 variables.
<- function(x) lapply(x, qu)
g g(.SD), .SDcols=.q(bhr, pkhr, sbp)] # a data.table w/out Q1 Q2 Q3 labels d[,
bhr pkhr sbp
<labelled> <labelled> <labelled>
1: 64 106.25 120
2: 74 122.00 141
3: 84 135.00 170
<- function(x) apply(x, 2, qu)
g g(.SD), .SDcols=.q(bhr, pkhr, sbp)] # a matrix with Q1 Q2 Q3 labels d[,
bhr pkhr sbp
Q1 64 106.25 120
Q2 74 122.00 141
Q3 84 135.00 170
Now consider a summarization function computing a 2 \(\times\) 2 result where some values are numeric and some are character strings. Separately for each value of a by
variable we desire the name of the variable having the lowest number of missing values, and the frequency of NA
s, and the name of the variable having the highest number of missing values, and that frequency. We use the pbc
dataset that comes with the survival
package, and do the variable search separately by outcome status. The summarization function g
operates on a data table for the current outcome status. Consider only the variables in pbc
that are ever NA
.
data(pbc, package='survival')
<- copy(pbc)
p setDT(p, key='status') # sort by status
<- sapply(p, function(x) sum(is.na(x))) # loops over vars and counts NAs -> vector
numna <- names(numna[numna > 0]) # names of variables with any NA
v
<- function(x) {
g <- sapply(x, function(y) sum(is.na(y)))
m <- m[which.min(m)] # keeps variable name in names attribute
mn <- m[which.max(m)]
mx list(Lowest = list(VariableLow = names(mn), Nlow = mn),
Highest = list(VariableHigh = names(mx), Nhigh = mx))
}=p[, g(.SD), by=status, .SDcols = v] # data.table
w w
Key: <status>
status Lowest Highest
<int> <list> <list>
1: 0 protime trig
2: 0 1 81
3: 1 platelet chol
4: 1 0 7
5: 2 protime trig
6: 2 1 48
sapply(w, class)
status Lowest Highest
"integer" "list" "list"
1] w[
Key: <status>
status Lowest Highest
<int> <list> <list>
1: 0 protime trig
1, Lowest] w[
[[1]]
[1] "protime"
2, Lowest] w[
[[1]]
protime
1
The Lowest
and Highest
elements of a given row of w
are R list
s, allowing for mixed object types (character/integer). This doesn’t make for the most concise way to retrieve individual elements but w
prints well. We can make a more conventional data table with the following code.
<- function(x) {
g <- sapply(x, function(y) sum(is.na(y)))
m <- m[which.min(m)]
mn <- m[which.max(m)]
mx list(VarLow = names(mn), Nlow = mn,
VarHigh = names(mx), Nhigh = mx)
}=p[, g(.SD), by=status, .SDcols = v] # data.table
wsapply(w, class)
status VarLow Nlow VarHigh Nhigh
"integer" "character" "integer" "character" "integer"
w
Key: <status>
status VarLow Nlow VarHigh Nhigh
<int> <char> <int> <char> <int>
1: 0 protime 1 trig 81
2: 1 platelet 0 chol 7
3: 2 protime 1 trig 48
11.2 Summary Statistics With Marginal Summaries
The cube
function in the data.table
package will compute all possible marginal statistics. When there is only one by
variable, the overall statistic is computed in addition to compute stratified values. When a dimension is being marginalized over, the value of the by
variable for that dimension will be NA
.
<- function(x) as.double(mean(x, na.rm=TRUE))
mn # as.double ensures consistency of numeric type across groups
<- function(x) sum(! is.na(x))
Nna cube(d, .(Meanbhr = mn(bhr), N = Nna(bhr)), by='gender', id=TRUE)
grouping gender Meanbhr N
<int> <fctr> <num> <int>
1: 0 male 73.99545 220
2: 0 female 76.13314 338
3: 1 <NA> 75.29032 558
cube(d, .(Meanbhr = mn(bhr), N = Nna(bhr)), by=.q(gender, hxofMI), id=TRUE)
grouping gender hxofMI Meanbhr N
<int> <fctr> <labelled> <num> <int>
1: 0 male 1 73.22472 89
2: 0 female 0 76.30403 273
3: 0 male 0 74.51908 131
4: 0 female 1 75.41538 65
5: 1 male NA 73.99545 220
6: 1 female NA 76.13314 338
7: 2 <NA> 1 74.14935 154
8: 2 <NA> 0 75.72525 404
9: 3 <NA> NA 75.29032 558
# id=TRUE creates output variable grouping to detail level of marginalization
# It is a binary representation, e.g. if by has 3 variables and a row
# is marginalizing over the first and third variables,
# grouping=binary 101 = 5
# Use groupingsets() to control the marginalizations
# Example: marginalize only one variable at a time
groupingsets(d, .(Meanbhr = mn(bhr), N=Nna(bhr)), by=.q(gender, hxofMI),
sets=list('gender', 'hxofMI'), id=TRUE)
grouping gender hxofMI Meanbhr N
<int> <fctr> <labelled> <num> <int>
1: 1 male NA 73.99545 220
2: 1 female NA 76.13314 338
3: 2 <NA> 1 74.14935 154
4: 2 <NA> 0 75.72525 404
11.3 Customizing Tables of Summary Statistics
Suppose we wanted to compute N, the mean, the quartiles, and Gini’s mean difference for 4 continuous variables, stratified by treatments A and B and not randomized. Start by writing the per-variable per-stratum summarization function g
.
<- function(x) {
g <- x[! is.na(x)]
x <- length(x)
n if(n < 2) return(list(N=n, Mean=if(n == 1) x else NA_real_,
Q1=NA_real_, Q2=NA_real_, Q3=NA_real_, Gini=NA_real_))
<- quantile(x, (1:3) / 4)
q # data table wants constant numeric types across by-groups
list(N=n,
Mean = as.double(mean(x)),
Q1 = as.double(q[1]),
Q2 = as.double(q[2]),
Q3 = as.double(q[3]),
Gini = as.double(GiniMd(x))) # GiniMd is in Hmisc
}set.seed(1)
g(runif(1000)) # test it
$N
[1] 1000
$Mean
[1] 0.4996917
$Q1
[1] 0.2581286
$Q2
[1] 0.4832595
$Q3
[1] 0.7469325
$Gini
[1] 0.3330362
Next, melt
the data table to make it tall and thin so that all the analysis variables may be processed in the same way with repetitive programming. The Hmisc
meltData
function makes this easy, and defaults to using variable labels. Let’s put units of measurement as a separate variable Units
.
<- upData(p,
p trt = ifelse(is.na(trt), 3, trt),
trt = factor(trt, 1:3, c('A', 'B', 'Not Randomized')),
labels=c(age ='Age', bili='Serum Bilirubin',
chol='Serum Cholesterol', albumin='Serum Albumin'),
units=c(age='years', chol='mg/dL'))
Input object size: 50560 bytes; 20 variables 418 observations
Modified variable trt
Modified variable trt
New object size: 53720 bytes; 20 variables 418 observations
<- meltData(age + bili + chol + albumin ~ trt, data=p, tall='left',
m sepunits=TRUE) # creates Units variable
Now for each variable compute the statistical summary. Then use data.table::dcast
to put treatments side-by-side for each variable (row). In the code below we are careful to follow the good programming practice of computing column numbers instead of hard-coding column numbers for gt
.
<- m[, g(value), by=.(variable, Units, trt)]
s s
variable Units trt N Mean Q1
<fctr> <char> <fctr> <int> <num> <num>
1: Age years A 158 51.419108 42.98015
2: Age years B 154 48.582540 41.43326
3: Age years Not Randomized 106 52.868286 46.00137
4: Serum Bilirubin A 158 2.873418 0.80000
5: Serum Bilirubin B 154 3.648701 0.72500
6: Serum Bilirubin Not Randomized 106 3.116981 0.72500
7: Serum Cholesterol mg/dL A 140 365.014286 247.75000
8: Serum Cholesterol mg/dL B 144 373.881944 254.25000
9: Serum Cholesterol mg/dL Not Randomized 0 NA NA
10: Serum Albumin A 158 3.516266 3.21250
11: Serum Albumin B 154 3.523831 3.34250
12: Serum Albumin Not Randomized 106 3.431038 3.12500
Q2 Q3 Gini
<num> <num> <num>
1: 51.93155 58.90486 12.6002465
2: 48.11088 55.80424 11.3909817
3: 52.99932 60.99932 11.2543580
4: 1.40000 3.20000 3.1456422
5: 1.30000 3.60000 4.4610050
6: 1.40000 3.07500 3.5932435
7: 315.50000 417.00000 178.4799589
8: 303.50000 377.00000 210.0807110
9: NA NA NA
10: 3.56500 3.83000 0.4973885
11: 3.54500 3.77750 0.4289780
12: 3.47000 3.72000 0.4934106
<- dcast(s, variable + Units ~ trt, value.var=.q(N, Mean, Q1, Q2, Q3, Gini))
w # Sort columns so that treatments are together
<- names(w)
v # Function to get list of variable names ending with a
<- function(a) v[grep(paste0(a, '$'), v)]
u # Check it
u('_A')
[1] "N_A" "Mean_A" "Q1_A" "Q2_A" "Q3_A" "Gini_A"
# Put columns in desired order using data.table::setcolorder
setcolorder(w, c('variable', 'Units',
<- u('_A'), colb <- u('_B'),
cola <- u('_Not Randomized')))
colnr # Make short column names
<- sub('_A$|_B$|_Not Randomized$', '', names(w))
sn 1] <- ''
sn[# Reformat quantile headings to use html subscripts
# qt will not vertically align these properly if using markdown instead.
<- sub('^Q(.)', 'Q<sub>\\1</sub>', sn)
sn names(sn) <- names(w)
sn
variable Units N_A Mean_A
"" "Units" "N" "Mean"
Q1_A Q2_A Q3_A Gini_A
"Q<sub>1</sub>" "Q<sub>2</sub>" "Q<sub>3</sub>" "Gini"
N_B Mean_B Q1_B Q2_B
"N" "Mean" "Q<sub>1</sub>" "Q<sub>2</sub>"
Q3_B Gini_B N_Not Randomized Mean_Not Randomized
"Q<sub>3</sub>" "Gini" "N" "Mean"
Q1_Not Randomized Q2_Not Randomized Q3_Not Randomized Gini_Not Randomized
"Q<sub>1</sub>" "Q<sub>2</sub>" "Q<sub>3</sub>" "Gini"
# Save column numbers for Ns
<- grep('^N_', names(w)) colN
Compose the table using the gt
package. See also Section 4.6.1 and Section 4.10.
require(gt)
gt(w) |>
tab_style(style=cell_text(size='small'),
locations=cells_column_labels(columns='Units')) |>
tab_style(style=cell_text(size='x-small', font='arial'),
locations=cells_body(columns=Units)) |>
cols_label(.list=sn, .fn=html) |>
sub_missing(missing_text='') |>
tab_spanner('A', columns=cola) |>
tab_spanner('B', columns=colb) |>
tab_spanner('Not Randomized', columns=colnr) |>
fmt_number(decimals=2) |>
fmt_number(columns=colN, decimals=0)
- 1
- smaller font for Units column heading
- 2
- extra-small arial font for Units column contents
- 3
- rename columns to short names and recognize html
- 4
- turn NA into blank
- 5
- group columns
- 6
- 2 decimals to the right for all numeric columns except …
- 7
- integer format for N
Units |
A
|
B
|
Not Randomized
|
||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
N | Mean | Q1 | Q2 | Q3 | Gini | N | Mean | Q1 | Q2 | Q3 | Gini | N | Mean | Q1 | Q2 | Q3 | Gini | ||
Age | years | 158 | 51.42 | 42.98 | 51.93 | 58.90 | 12.60 | 154 | 48.58 | 41.43 | 48.11 | 55.80 | 11.39 | 106 | 52.87 | 46.00 | 53.00 | 61.00 | 11.25 |
Serum Bilirubin | 158 | 2.87 | 0.80 | 1.40 | 3.20 | 3.15 | 154 | 3.65 | 0.72 | 1.30 | 3.60 | 4.46 | 106 | 3.12 | 0.72 | 1.40 | 3.08 | 3.59 | |
Serum Cholesterol | mg/dL | 140 | 365.01 | 247.75 | 315.50 | 417.00 | 178.48 | 144 | 373.88 | 254.25 | 303.50 | 377.00 | 210.08 | 0 | |||||
Serum Albumin | 158 | 3.52 | 3.21 | 3.56 | 3.83 | 0.50 | 154 | 3.52 | 3.34 | 3.54 | 3.78 | 0.43 | 106 | 3.43 | 3.12 | 3.47 | 3.72 | 0.49 |
But don’t forget the Hmisc
summaryM
which, aside from substituting the less useful standard deviation for Gini’s mean difference, provides all the needed summaries for both continuous and categorical variables, as shown below. summaryM
will also print test statistics, although these are inappropriate for a randomized trial when testing baseline characteristics.
options(prType='html')
<- summaryM(age + bili + chol + albumin + sex + edema + stage ~ trt,
s data=p)
# Define a function to specify a 20% smaller font size in html
<- markupSpecs$html$smaller # is in Hmisc; see also smaller2
smaller print(s, npct='both', prmsd=TRUE, msdsize=smaller, prN=TRUE,
middle.bold=TRUE, round='auto')
Descriptive Statistics (N=418). | ||||
---|---|---|---|---|
N |
A N=158 |
B N=154 |
Not Randomized N=106 |
|
Age
years
|
418 | 43.0 51.9 58.9 (51.4 ±11.0) N=158 | 41.4 48.1 55.8 (48.6 ±10.0) N=154 | 46.0 53.0 61.0 (52.9 ± 9.8) N=106 |
Serum Bilirubin | 418 | 0.80 1.40 3.20 (2.87 ±3.63) N=158 | 0.72 1.30 3.60 (3.65 ±5.28) N=154 | 0.72 1.40 3.08 (3.12 ±4.04) N=106 |
Serum Cholesterol
mg/dL
|
284 | 248 316 417 (365 ±210) N=140 | 254 304 377 (374 ±252) N=144 | |
Serum Albumin | 418 | 3.21 3.56 3.83 (3.52 ±0.44) N=158 | 3.34 3.54 3.78 (3.52 ±0.40) N=154 | 3.12 3.47 3.72 (3.43 ±0.43) N=106 |
sex : f | 418 | 0.87 137⁄158 | 0.90 139⁄154 | 0.92 98⁄106 |
edema : 0 | 418 | 0.84 132⁄158 | 0.85 131⁄154 | 0.86 91⁄106 |
0.5 | 0.10 16⁄158 | 0.08 13⁄154 | 0.14 15⁄106 | |
1 | 0.06 10⁄158 | 0.06 10⁄154 | 0.00 0⁄106 | |
stage : 1 | 412 | 0.08 12⁄158 | 0.03 4⁄154 | 0.05 5⁄100 |
2 | 0.22 35⁄158 | 0.21 32⁄154 | 0.25 25⁄100 | |
3 | 0.35 56⁄158 | 0.42 64⁄154 | 0.35 35⁄100 | |
4 | 0.35 55⁄158 | 0.35 54⁄154 | 0.35 35⁄100 | |
a b c represent the lower quartile a, the median b, and the upper quartile c for continuous variables. x ± s represents X ± 1 SD. N is the number of non-missing values. |