Optimizing R Code with Rcpp
Romain François
Consulting Datactive, ThinkR
rollmean1 <- function(x, window = 3){
n <- length(x)
# create empty vector full of NA
res <- rep(NA, n)
# fill the values
for( i in seq(window, n) ){
idx <- seq(i-window+1,i)
res[i] <- mean(x[idx])
}
res
}
x
mean
function on that extractrollmean2 <- function(x, window = 3){
n <- length(x)
res <- rep(NA, n)
# first value
total <- sum(head(x,window))
res[window] <- total / window
# remaining values
for( i in seq(window+1, n) ){
total <- total + x[i] - x[i-window]
res[i] <- total / window
}
res
}
x <- c(1.3, 3.2, 4.2, 4.5, 6.8)
start <- sum(x[1:3])
head( x, -3 )
tail( x, -3 )
1.3 3.2
4.5 6.8
c( start, start + cumsum( tail(x, -3) - head( x, -3 ) ) )
c( start, start + cumsum( tail(x, -3) - head( x, -3 ) ) ) / 3
8.7 11.9 15.5
2.900000 3.966667 5.166667
library(microbenchmark)
x <- rnorm(1e5)
microbenchmark(
rollmean1(x, 3),
rollmean2(x, 3),
rollmean3(x, 3)
)
Unit: milliseconds
expr min lq mean median ...
rollmean1(x, 3) 833.667884 857.507753 971.250098 893.206776 ...
rollmean2(x, 3) 10.539993 11.034244 12.293105 11.396629 ...
rollmean3(x, 3) 1.429817 1.625453 3.070925 3.067068 ...
na_locf1 <- function(x){
current <- NA
res <- x
for( i in seq_along(x)){
if( is.na(x[i]) ){
# replace with current
res[i] <- current
} else {
# set current
current <- x[i]
}
}
res
}
na_meancf1 <- function(x){
# ( cumulative sum of non NA values ) / ( cumulative count of non NA )
means <- cumsum( replace(x, is.na(x), 0) ) / cumsum(!is.na(x))
# replace the missing values by the means
x[is.na(x)] <- means[is.na(x)]
x }
# iterative version
na_meancf2 <- function(x){
total <- 0
n <- 0
for( i in seq_along(x) ){
if( is.na(x[i]) ){
x[i] <- total / n}
else {
total <- x[i] + total
n <- n + 1} }
}
x <- rnorm(1e5)
x[ sample(1e5, 100) ] <- NA
microbenchmark( na_meancf1(x), na_meancf2(x) )
Unit: milliseconds
expr min lq mean median ...
na_meancf1(x) 1.176276 2.785667 3.237009 3.474028 ...
na_meancf2(x) 16.945271 17.430133 19.678276 18.625274 ...
Optimizing R Code with Rcpp