Do's and Don'ts
NB: Timings updated with v1.8.2 (which improves the 3rd case dramatically).
library(data.table)
n=10000
DT = data.table(grp=1:n,name=as.character(as.hexmode(1:n)),x=rnorm(10*n),y=rnorm(10*n))
setkey(DT,grp)
system.time(ans1<-DT[,lapply(.SD[,list(x,y)],sum),by=grp])
# user system elapsed
# 31.130 0.088 31.288 # bad
system.time(ans2<-DT[,lapply(list(x,y),sum),by=grp])
# user system elapsed
# 0.284 0.004 0.291 # good
setnames(ans2,names(ans1))
identical(ans1,ans2)
# [1] TRUE
system.time(ans3<-DT[,lapply(.SD,sum),by=grp,.SDcols=c("x","y")])
# user system elapsed
# 0.080 0.004 0.085 # even better (prior to v1.8.2 this was slower and not recommended; that no longer applies)
identical(ans1,ans3)
# [1] TRUE
tables()
# NAME NROW MB COLS KEY
# [1,] ans1 10,000 1 grp,x,y grp
# [2,] ans2 10,000 1 grp,x,y grp
# [3,] ans3 10,000 1 grp,x,y grp
# [4,] DT 100,000 3 grp,name,x,y grp
# Total: 6MB

NB: The motivation for running lapply over a subset of .SD is when there are non-group column(s) in DT that shouldn't be aggregated (such as name above). Otherwise you could just do DT[,lapply(.SD,sum),by=grp].
Subsetting .SD (as in ans1) is so much slower because a new copy of .SD for those columns is created for each and every group.
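For example, with the character column name still present, summing every column of .SD fails; a minimal sketch (the exact error text comes from base R's sum() and may differ by version):
try(DT[, lapply(.SD, sum), by=grp])   # name is character, so sum() errors
# Error ... : invalid 'type' (character) of argument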
As of v1.8.2, := by group has been implemented.
library(data.table)
n=10000
DT = data.table(grp=1:n,x=rnorm(10*n),y=rnorm(10*n))
setkey(DT,grp)
system.time(ans1 <- DT[,transform(.SD,x2=x/sum(x),y2=y/sum(y)),by=grp])
# user system elapsed
# 5.46 0.00 5.48 # slow
head(ans1,3)
# grp x y x2 y2
# 1: 1 -0.5848814 -0.41560829 0.6241268 0.5695575
# 2: 1 -0.6314059 -0.49076645 0.6737731 0.6725557
# 3: 1 -1.7694071 0.08860505 1.8881340 -0.1214260
system.time(tt <- DT[,list(x2=x/sum(x),y2=y/sum(y)),by=grp])
# user system elapsed
# 0.02 0.00 0.02 (274 times faster!!!)
head(tt,3)
# grp x2 y2
# 1: 1 0.6241268 0.5695575
# 2: 1 0.6737731 0.6725557
# 3: 1 1.8881340 -0.1214260
system.time(ans2 <- cbind(DT,tt[,list(x2,y2)]))
# user system elapsed
# 0.05 0.00 0.05 # very fast to add afterwards in bulk
head(ans2,3)
# grp x y x2 y2
# 1: 1 -0.5848814 -0.41560829 0.6241268 0.5695575
# 2: 1 -0.6314059 -0.49076645 0.6737731 0.6725557
# 3: 1 -1.7694071 0.08860505 1.8881340 -0.1214260
setkey(ans2,grp)
identical(ans1,ans2)
# [1] TRUE
system.time(DT[, c('x2', 'y2') := list(x / sum(x), y / sum(y)), by = grp])
# user system elapsed
# 0.07 0.00 0.07 # equivalent to cbind afterwards approach, but more memory efficient
# now DT has been updated
identical(ans1, DT)
# [1] TRUE
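The memory efficiency comes from := adding the columns by reference rather than rebuilding the table. A minimal sketch (the z column is just for illustration):
DT2 = DT                # another name for the same table, not a copy
DT[, z := x + y]        # add a column by reference
"z" %in% names(DT2)     # DT2 sees it too, because no copy was made
# [1] TRUE
DT[, z := NULL]         # remove it again by reference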
# remove new columns to show different approach
DT[, c('x2', 'y2') := NULL]
system.time(DT[, `:=`(x2=x / sum(x),y2= y / sum(y)), by = grp])
# user system elapsed
# 0.04 0.00 0.05 # this is slightly faster
identical(ans1, DT)
# [1] TRUE

When called repeatedly, mean() is surprisingly inefficient compared to sum(). mean() is an S3 generic, which means it takes time to dispatch to one of methods(mean), usually mean.default(). Also, mean.default() contains a few lines of R before finally calling .Internal(mean(x)). sum(), on the other hand, is a primitive function.
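The cost of that dispatch and R-level preamble can be seen directly; a minimal sketch (fastmean is a hypothetical helper, and .Internal(mean(x)) skips the argument handling that mean.default() performs, so it is for illustration only):
fastmean = function(x) .Internal(mean(x))     # what mean.default() eventually calls
x = rnorm(100)
system.time(for (i in 1:100000) mean(x))      # S3 dispatch + R preamble on every call
system.time(for (i in 1:100000) fastmean(x))  # straight to the internal C code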
For this reason data.table will optimize calls to mean() automatically.
TO DO. Revision needed here for v1.9.2
library(data.table)
n<-100000
DT<-data.table(grp1=sample(1:750, n, replace=TRUE),
grp2=sample(1:750, n, replace=TRUE),
x=rnorm(n),
y=rnorm(n))
system.time(ans1<-DT[,list(mean(x),mean(y)),by=list(grp1,grp2), verbose = TRUE])
# Detected that j uses these columns: x,y
# Optimized j from 'list(mean(x), mean(y))' to 'list(.External(Cfastmean, x, FALSE), .External(Cfastmean, y, FALSE))'
# Starting dogroups ... done dogroups in 0.2 secs
# user system elapsed
# 0.25 0.00 0.25
# without optimization
options(datatable.optimize = 0)
system.time(ans2<-DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose = TRUE])
# Finding groups (bysameorder=FALSE) ... done in 0.04secs. bysameorder=FALSE and o__ is length 100000
# Detected that j uses these columns: x,y
# Starting dogroups ... done dogroups in 1.61 secs
# user system elapsed
# 1.64 0.00 1.65 # 6.6 times slower than the optimized version
identical(ans1,ans2)
# [1] TRUE

median() is another S3 generic with dispatch overhead; it can be sped up a lot using a similar technique (see the sketch below). However, sum(), length(), var() and many other functions don't suffer the same performance issue, as the timing after the sketch shows:
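As a sketch of that technique (a hypothetical helper, not data.table's own implementation), the function below mirrors the core of median.default for a numeric vector with no NAs, skipping the S3 dispatch and surrounding checks:
fastmedian = function(x) {     # assumes numeric x with no NAs
  n = length(x)
  half = (n + 1L) %/% 2L
  if (n %% 2L == 1L) sort(x, partial = half)[half]
  else mean(sort(x, partial = half + 0L:1L)[half + 0L:1L])
}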
system.time(DT[,list(sum(x),sum(y)),by=list(grp1,grp2)])
# user system elapsed
# 0.19 0.00 0.19 # not much slower than optimized mean

When returning several aggregates per group, wrap them in list(), not in as.list(cbind(...)), data.table() or data.frame():
n=10000
DT = data.table(grp=1:n,x=rnorm(10*n),y=rnorm(10*n))
setkey(DT,grp)
system.time(ans1<-DT[,as.list(cbind(sum(x),sum(y))),by=grp])
# user system elapsed
# 1.472 0.016 1.494 # bad
system.time(ans2<-DT[,data.table(sum(x),sum(y)),by=grp])
# user system elapsed
# 18.494 0.308 18.914 # worse
system.time(ans3<-DT[,data.frame(sum(x),sum(y)),by=grp])
# user system elapsed
# 40.502 1.136 41.807 # terrible
colnames(ans3)=colnames(ans1)
system.time(ans4<-DT[,list(sum(x),sum(y)),by=grp])
# user system elapsed
# 0.176 0.004 0.181 # best
identical(ans1,ans2)
# [1] TRUE
identical(ans1,ans3)
# [1] TRUE
identical(ans1,ans4)
# [1] TRUE

The same applies when aggregating all of .SD: lapply(.SD,sum) beats as.list(colSums(.SD)), which coerces .SD to a matrix in every group:
n=10000
DT = data.table(grp=1:n,x=rnorm(10*n),y=rnorm(10*n))
setkey(DT,grp)
system.time(ans1<-DT[,as.list(colSums(.SD)),by=grp])
# user system elapsed
# 1.296 0.000 1.298 # bad
system.time(ans2<-DT[,lapply(.SD,sum),by=grp])
# user system elapsed
# 0.020 0.000 0.021 # best
identical(ans1,ans2)
# [1] TRUE
system.time(ans3<-DT[,list(x=sum(x),y=sum(y)),by=grp])
# user system elapsed
# 0.020 0.000 0.022 # similar, but longer to write
identical(ans1,ans3)
# [1] TRUE