2] # print 2nd row
d[2:4] # print rows 2,3,4
d[> 2 & z > 3] # rows satisfying conditions
d[y # retrieve one variable
d[, age] # make a new table with two variables
d[, .(age, gender)] <- 2; d[, ..i] # get column 2
i <- 'age'; d[, ..v] # get variable named by contents of v
v # another way; these return data tables
d[, (v)] # last row
d[.N] =age] # number of rows for each age, sorted
d[, .N, keyby1:10, bhr:pkhr] # first 10 rows, variables bhr - pkhr
d[1:10, !(bhr:pkhr)] # all but those variables
d[2:4] # get columns 2-4
d[, # run any number of lines of code in ...
d[, {...}] # that reference variables inside d
9 Data Manipulation and Aggregation
The data.table
package provides a concise, consistent syntax for managing simple and complex data manipulation tasks, and it is extremely efficient for large datasets. One of the best organized tutorials is this, and a cheatsheet for data transformation is here. A master cheatsheet for data.table
is here from which the general syntax below is taken, where DT
represents a data table.
DT[ i, j, by ] # + extra arguments
| | |
| | -------> grouped by what?
| -------> what to do?
---> on which rows?
Data tables are also data frames so work on any method handling data frames. But data tables do not contain rownames
.
Several data.table
examples follow. I like to hold the current dataset in d
to save typing. Some basic operations on data tables are:
data.table
does many of its operations by reference to avoid the overhead of having multiple copies of data tables. This idea carries over to apparent copies of data tables. Here is an example.
require(Hmisc)
require(data.table)
<- knitr::normal_print # make knitr standard print output (not kable)
knit_print <- data.table(x1=1:3, x2=letters[1:3])
a a
x1 x2
1: 1 a
2: 2 b
3: 3 c
<- a # no copy; b is just a pointer to a
b := toupper(x2)] # changes a
b[, x2 a
x1 x2
1: 1 A
2: 2 B
3: 3 C
<- data.table(x1=1:3, x2=letters[1:3])
a <- copy(a) # fresh copy with its own memory space
a2 := toupper(x2)] # doesn't change a
a2[, x2 a
x1 x2
1: 1 a
2: 2 b
3: 3 c
9.1 Analyzing Selected Variables and Subsets
getHdata(stressEcho)
<- stressEcho
d setDT(d)
html(describe(age))] d[,
n | missing | distinct | Info | Mean | Gmd | .05 | .10 | .25 | .50 | .75 | .90 | .95 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
558 | 0 | 62 | 0.999 | 67.34 | 13.41 | 46.85 | 51.00 | 60.00 | 69.00 | 75.00 | 82.00 | 85.00 |
html(describe(~ age + gender))] d[,
2 Variables 558 Observations
age: Age years
n | missing | distinct | Info | Mean | Gmd | .05 | .10 | .25 | .50 | .75 | .90 | .95 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
558 | 0 | 62 | 0.999 | 67.34 | 13.41 | 46.85 | 51.00 | 60.00 | 69.00 | 75.00 | 82.00 | 85.00 |
gender
n | missing | distinct |
---|---|---|
558 | 0 | 2 |
Value male female Frequency 220 338 Proportion 0.394 0.606
== 'female', html(describe(age))] # analyze age for females d[gender
n | missing | distinct | Info | Mean | Gmd | .05 | .10 | .25 | .50 | .75 | .90 | .95 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
338 | 0 | 58 | 0.999 | 67.01 | 13.74 | 47.00 | 50.70 | 59.25 | 68.00 | 76.00 | 83.00 | 85.00 |
html(describe(d[, .(age, gender)], 'Age and Gender Stats'))
2 Variables 558 Observations
age: Age years
n | missing | distinct | Info | Mean | Gmd | .05 | .10 | .25 | .50 | .75 | .90 | .95 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
558 | 0 | 62 | 0.999 | 67.34 | 13.41 | 46.85 | 51.00 | 60.00 | 69.00 | 75.00 | 82.00 | 85.00 |
gender
n | missing | distinct |
---|---|---|
558 | 0 | 2 |
Value male female Frequency 220 338 Proportion 0.394 0.606
# Separate analysis by female, male. Use keyby instead of by to sort the usual way.
print(describe(age, descript=gender)), by=gender] d[,
male : Age [years]
n missing distinct Info Mean Gmd .05 .10
220 0 51 0.999 67.86 12.91 45.95 51.00
.25 .50 .75 .90 .95
62.00 69.00 75.00 81.00 86.00
lowest : 30 34 38 40 43, highest: 88 89 91 92 93
female : Age [years]
n missing distinct Info Mean Gmd .05 .10
338 0 58 0.999 67.01 13.74 47.00 50.70
.25 .50 .75 .90 .95
59.25 68.00 76.00 83.00 85.00
lowest : 26 28 29 33 34, highest: 87 88 89 90 91
Empty data.table (0 rows and 1 cols): gender
# Compute mean and median age by gender
Mean=mean(age), Median=median(age)), by=gender] d[, .(
gender Mean Median
1: male 67.85909 69
2: female 67.00888 68
# To create a new subset
<- d[gender == 'female' & age < 70, ] w
9.2 Adding or Changing Variables
With data.table
you can create a new data table with added variables, or you can add or redefine variables in a data table in place. The latter has major speed and memory efficiency implications when processing massive data tables. Here d
refers to a different data table from the one used above.
# Rename a variable
setnames(d, c('gender', 'height'),
c( 'sex', 'ht'))
# Easier:
setnames(d, .q(gender, height),
.q( sex, ht))
# Or
<- .q(gender=sex, height=ht)
ren setnames(d, names(ren), ren)
# Or
<- function(x, n) setnames(x, names(n), n)
rename rename(d, .q(gender=sex, height=ht))
# For all variables having baseline_ at the start of their names, remove it
<- names(d)
n setnames(d, n, sub('^baseline_', '', n)) # ^ = at start; use $ for at end
# For all variables having baseline_ at the start of their names, remove it
# and add Baseline to start of variable label
<- names(d)
n <- n[grepl('^baseline_', n)]
n <- sub('^baseline_', '', n); names(ren) <- n
ren # Fetch vector of labels for variables whose names start with baseline_
<- sapply(d, label)[n] # label() result is "" if no label
labs <- paste('Baseline', labs)
labs <- updata(d, rename=ren, labels=labs)
d
# Change all names to lower case
<- names(d)
n setnames(d, n, tolower(n))
# Abbreviate names of all variables longer than 10 characters
# abbreviate() will ensure that all names are unique
<- names(d)
n setnames(d, n, abbreviate(n, minlength=10))
# For any variables having [...] or (...) in their labels, assume these
# are units of measurement and move them from the label to the units
# attribute
<- upData(d, moveUnits=TRUE)
d
# Add two new variables, storing
in a new data table
result <- d[, .(bmi=wt / ht ^ 2, bsa=0.016667 * sqrt(wt * ht))]
z
# Add one new variable in place
:= wt / ht ^ 2]
d[, bmi
# Add two new variables in place
`:=`(bmi = wt / ht ^ 2, bas=0.016667 * sqrt(wt * ht))]
d[, .q(bmi, bsa) := .(wt / ht ^ 2, 0.016667 * sqrt(wt * ht))]
d[,
# Compute something requiring a different formula for different types
# of observations
:= ifelse(sex == 'male', ht, ht * 1.07)] # better" use fifelse in data.table
d[, htAdj := ht * ifelse(sex == 'male', 1, 1.07)]
d[, htAdj := (sex == 'male') * ht + (sex == 'female') * ht * 1.07]
d[, htAdj == 'male', htAdj := ht]
d[sex == 'female', htAdj := ht * 1.07]
d[sex := fcase(sex == 'male', ht, # fcase is in dta.table
d[, htAdj == 'female', ht * 1.07)]
sex := fcase(sex = 'female', ht * 1.07, default = ht)]
d[, htAdj
# Add label & optional units (better to use upData which works on data tables)
<- function(x, lab, un='') {
adlab label(x) <- lab
if(un != '') units(x) <- un
x
}:= adlab(maxhr, 'Maximum Heart Rate', '/m')]
d[, maxhr
# Delete a variable (or use upData)
:= NULL]
d[, bsa
# Delete two variables
`:=`(bsa=NULL, bmi=NULL)]
d[, .q(bsa, bmi) := NULL] d[,
9.3 Recoding Variables
# Group levels a1, a2 as a & b1,b2,b3 as b
:= factor(x, .q(a1,a2,b1,b2,b3),
d[, x .q( a, a, b, b, b))]
# Regroup an existing factor variable
levels(d$x) <- list(a=.q(a1,a2), b=.q(b1,b2,b3))
# or
<- upData(d, levels=list(x=list(a=.q(a1,a2), b=.q(b1,b2,b3))))
d # Or manipulate character strings
:= substring(x, 1, 1)] # replace x with first character of levels
d[, x # or
levels(d$x) <- substring(levels(d$x), 1, 1)
# Recode a numeric variable with values 0, 1, 2, 3, 4 to 0, 1, 1, 1, 2
:= 1 * (x %in% 1:3) + 2 * (x == 4)]
d[, x := fcase(x %in% 1:3, 1, # fcase is in data.table
d[, x == 4, 2)]
x := fcase(x %between% c(1,3), 1,
d[, x == 4, 2)] # %between% is in data.table
x
# Recode a series of conditions to a factor variable whose value is taken
# from the last condition that is TRUE using Hmisc::score.binary
# Result is a factor variable unless you add retfactor=FALSE
:= score.binary(x1 == 'symptomatic',
d[, x %in% .q(stroke, MI),
x2
death)]# Same but code with numeric points
:= score.binary(x1 == 'symptomatic',
d[, x %in% .q(stroke, MI),
x2 # TRUE/FALSE or 1/0 variable
death, points=c(1,2,10))]
# Or just reverse the conditions and use fcase which stops at the first
# condition met
:= fcase(death, 'death', # takes precedence
d[, x %in% .q(stroke, MI), 'stroke/MI', # takes next precedence
x2 == 'symptomatic', 'symptomatic',
x1 default = 'none')]
# Recode from one set of character strings to another using named vector
# A named vector can be subscripted with character strings as well as integers
<- c(AL='Alabama', AK='Alaska', AZ='Arizona', ...)
states # Could also do:
# states <- .q(AL=Alabama, AK=Alaska, AZ=Arizona, ..., NM='New Mexico', ...)
# or do a merge for table lookup (see later)
:= states[state]]
d[, State # states are unique, state can have duplicates and all are recoded
:= fcase(state == 'AL', 'Alabama', state='AK', 'Alaska', ...)]
d[, State
# Recode from integers 1, 2, ..., to character strings
<- .q(elephant, giraffe, dog, cat)
labs := labs[x]]
d[, x
# Recode from character strings to integers
:= match(x, labs)]
d[, x := fcase(x == 'elephant', 1,
d[, x == 'giraffe', 2,
x == 'dog', 3,
x == 'cat', 4)] x
As an example of more complex hierarchical recoding let’s define codes in a nested list
.
<- list(plant =
a list(vegetable = .q(spinach, lettuce, potato),
fruit = .q(apple, orange, banana, pear)),
animal =
list(domestic = .q(dog, cat, horse),
wild = .q(giraffe, elephant, lion, tiger)) )
a
$plant
$plant$vegetable
[1] "spinach" "lettuce" "potato"
$plant$fruit
[1] "apple" "orange" "banana" "pear"
$animal
$animal$domestic
[1] "dog" "cat" "horse"
$animal$wild
[1] "giraffe" "elephant" "lion" "tiger"
<- unlist(a)
a a
plant.vegetable1 plant.vegetable2 plant.vegetable3 plant.fruit1
"spinach" "lettuce" "potato" "apple"
plant.fruit2 plant.fruit3 plant.fruit4 animal.domestic1
"orange" "banana" "pear" "dog"
animal.domestic2 animal.domestic3 animal.wild1 animal.wild2
"cat" "horse" "giraffe" "elephant"
animal.wild3 animal.wild4
"lion" "tiger"
# Pick names of unlist'ed elements apart to define kingdom and type
<- sub('[0-9]*$', '', names(a)) # remove sequence numbers from ends of names
n # Names are of the form kingdom.type; split at .
<- strsplit(n, '.', fixed=TRUE)
s <- sapply(s, function(x) x[1])
kingdom <- sapply(s, function(x) x[2])
type # or: (note \\. is escaped . meaning not to use as wild card)
# .* = wild card: any number of characters
# kingdom <- sub('\\..*', '', n) # in xxx.yyy remove .yyy
# type <- sub('.*\\.', '', n) # in xxx.yyy remove xxx.
names(kingdom) <- names(type) <- a
<- data.table(kingdom, type, item=a, key=c('kingdom', 'item'))
w w
kingdom type item
1: animal domestic cat
2: animal domestic dog
3: animal wild elephant
4: animal wild giraffe
5: animal domestic horse
6: animal wild lion
7: animal wild tiger
8: plant fruit apple
9: plant fruit banana
10: plant vegetable lettuce
11: plant fruit orange
12: plant fruit pear
13: plant vegetable potato
14: plant vegetable spinach
# Example table lookups
cat(kingdom['dog'], ':', type['dog'], '\n')
animal : domestic
.q(dog, cat, spinach)] kingdom[
dog cat spinach
"animal" "animal" "plant"
.q(dog, cat, giraffe, spinach)] type [
dog cat giraffe spinach
"domestic" "domestic" "wild" "vegetable"
# But what if there is a plant named the same as an animal?
# Then look up on two keys
'animal', 'lion'), type] w[.(
[1] "wild"
9.4 Reshaping Data
To reshape data from long to wide format, take the L
data table above as an example.
<- data.table(id = c(2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 5),
L day = c(1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2, 3),
y = 1 : 12, key='id')
L
id day y
1: 2 1 1
2: 2 2 2
3: 2 3 3
4: 3 1 4
5: 3 2 5
6: 3 3 6
7: 3 4 7
8: 4 1 8
9: 4 2 9
10: 5 1 10
11: 5 2 11
12: 5 3 12
<- dcast(L, id ~ day, value.var='y')
w w
id 1 2 3 4
1: 2 1 2 3 NA
2: 3 4 5 6 7
3: 4 8 9 NA NA
4: 5 10 11 12 NA
# Now reverse the procedure (see also meltData in reptools)
<- melt(w, id.vars='id', variable.name='day', value.name='y')
m m
id day y
1: 2 1 1
2: 3 1 4
3: 4 1 8
4: 5 1 10
5: 2 2 2
6: 3 2 5
7: 4 2 9
8: 5 2 11
9: 2 3 3
10: 3 3 6
11: 4 3 NA
12: 5 3 12
13: 2 4 NA
14: 3 4 7
15: 4 4 NA
16: 5 4 NA
setkey(m, id, day) # sorts
! is.na(y)] m[
id day y
1: 2 1 1
2: 2 2 2
3: 2 3 3
4: 3 1 4
5: 3 2 5
6: 3 3 6
7: 3 4 7
8: 4 1 8
9: 4 2 9
10: 5 1 10
11: 5 2 11
12: 5 3 12