Using foreach

Parallel Programming in R

Nabeel Imam

Data Scientist

A new loop

Native for loop in R

# Input: the integers 1 through one million.
numbers <- 1:1e6

# Preallocate the result vector so the loop fills it in place instead of
# growing it on every iteration.
sqroots <- numeric(length(numbers))

# seq_along() yields an empty sequence for empty input, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(numbers)) {
  sqroots[i] <- sqrt(numbers[i])
}

The foreach loop

# Load foreach first so that foreach() and %do% are available.
library(foreach)

numbers <- 1:1e6

# %do% evaluates the expression once per value of i, sequentially, and
# foreach() returns the results as a list.
sqroots <- foreach(i = numbers) %do% {
  sqrt(i)
}
Parallel Programming in R

Parallel loops

numbers <- 1:1e6

# Sequential version: %do% runs every iteration in the current R session.
sqroots <- foreach(i = numbers) %do% {
  sqrt(i)
}

# Parallel version: load doParallel, start a cluster of 4 workers, and
# register it as the backend that %dopar% dispatches to.
library(doParallel)
cl <- makeCluster(4)
registerDoParallel(cl)

sqroots <- foreach(i = numbers) %dopar% {  # The parallel operator
  sqrt(i)
}

# Shut the workers down once the loop has finished.
stopCluster(cl)
Parallel Programming in R

Top engineering universities

# Display the CSV file paths held in uni_list (created outside this excerpt).
print(uni_list)
 [1] "./uni_data/Argentina.csv"
 [2] "./uni_data/Australia.csv"
 [3] "./uni_data/Austria.csv"
 [4] "./uni_data/Azerbaijan.csv"
 [5] "./uni_data/Bahrain.csv"
 [6] "./uni_data/Bangladesh.csv"
 [7] "./uni_data/Belarus.csv"
 [8] "./uni_data/Belgium.csv"
 [9] "./uni_data/Bolivia.csv"
[10] "./uni_data/Bosnia and Herzegovina.csv"
...
# Start 4 workers and register them as the %dopar% backend.
cl <- makeCluster(4)
registerDoParallel(cl)

# Read every CSV in uni_list on the workers; one result per file.
ls_df <- foreach(csv = uni_list) %dopar% {
  read.csv(csv)
}

# Release the workers.
stopCluster(cl)
Parallel Programming in R

Collecting results with foreach

# Spin up a 4-worker cluster and make it the active %dopar% backend.
cl <- makeCluster(4)
registerDoParallel(cl)

# With no .combine argument the per-file data frames come back as a list.
ls_df <- foreach(csv = uni_list) %dopar% read.csv(csv)

stopCluster(cl)
[[1]]
 location                  institution score
Argentina  Universidad de Buenos Aires  68.9
...
[[2]]
 location                     institution score
Australia  Australian National University  82.1
...
cl <- makeCluster(4)
registerDoParallel(cl)

# .combine = "rbind" stacks the per-file data frames row-wise, so the loop
# returns a single data frame rather than a list.
df_uni <- foreach(csv = uni_list, .combine = "rbind") %dopar% read.csv(csv)

stopCluster(cl)
   location                     institution score
1 Argentina     Universidad de Buenos Aires  68.9
2 Argentina  Universidad Católica Argentina  33.3
3 Argentina     Universidad de Palermo (UP)  29.1
...
Parallel Programming in R

Read, filter, and combine

library(dplyr)

n_unis <- 3

# Preallocated list: one slot per CSV file
ls_df <- vector("list", length(uni_list))

for (i in seq_along(uni_list)) {
  # Read, filter, collect in the list: keep the n_unis highest-scoring rows
  ls_df[[i]] <- read.csv(uni_list[[i]]) %>%
    top_n(n_unis, score)
}

# Combine the list into one
combined_df <- Reduce("rbind", ls_df)
Parallel Programming in R

foreach for the win

n_unis <- 3

# Start 4 workers and register them as the %dopar% backend.
cl <- makeCluster(4)
registerDoParallel(cl)

df_top3 <- foreach(
  csv = uni_list,
  .packages = "dplyr",  # load dplyr on each worker so %>% and top_n() resolve
  .export = "n_unis",   # send n_unis to the workers
  .combine = "rbind"    # stack the per-file results into one data frame
) %dopar% {
  read.csv(csv) %>%
    top_n(n_unis, score)
}

# Release the workers.
stopCluster(cl)
 location                     institution score
Argentina     Universidad de Buenos Aires  68.9
Argentina  Universidad Católica Argentina  33.3
Argentina     Universidad de Palermo (UP)  29.1
Australia  Australian National University  82.1
Australia     The University of Melbourne  81.6
Australia        The University of Sydney  79.6
  Austria            University of Vienna  50.6
  Austria     Technische Universität Wien  45.7
...
Parallel Programming in R

Let's practice!

Parallel Programming in R

Preparing Video For Download...