foreach gebruiken

Parallel programmeren in R

Nabeel Imam

Data Scientist

Een nieuwe lus

Native for-lus in R

# Native for loop: fill a preallocated vector with square roots, one
# element per iteration.
numbers <- 1:1e6

# Preallocate the result so the loop never has to grow the vector.
sqroots <- numeric(length(numbers))

# seq_along() stays correct for zero-length input, where 1:length(x)
# would iterate over c(1, 0).
for (i in seq_along(numbers)) {
  sqroots[i] <- sqrt(numbers[i])
}

De foreach-lus

# foreach version of the same computation.
library(foreach)

numbers <- 1:1e6

# %do% evaluates the body sequentially for each value of i; with no
# .combine argument the per-iteration results are collected in a list.
sqroots <- foreach(i = numbers) %do% {
  sqrt(i)
}
Parallel programmeren in R

Parallelle lussen

# doParallel attaches foreach and parallel, so it must be loaded before
# makeCluster() is called.
library(doParallel)

numbers <- 1:1e6

# Sequential version, for comparison.
sqroots <- foreach(i = numbers) %do% {
  sqrt(i)
}

# Start four worker processes and register them as the %dopar% backend.
cl <- makeCluster(4)
registerDoParallel(cl)

# Same loop, now with the parallel operator.
sqroots <- foreach(i = numbers) %dopar% {
  sqrt(i)
}

# Shut the workers down once the loop has finished.
stopCluster(cl)
Parallel programmeren in R

Top technische universiteiten

# Show the CSV file paths stored in uni_list (defined outside this excerpt).
print(uni_list)
 [1] "./uni_data/Argentina.csv"
 [2] "./uni_data/Australia.csv"
 [3] "./uni_data/Austria.csv"
 [4] "./uni_data/Azerbaijan.csv"
 [5] "./uni_data/Bahrain.csv"
 [6] "./uni_data/Bangladesh.csv"
 [7] "./uni_data/Belarus.csv"
 [8] "./uni_data/Belgium.csv"
 [9] "./uni_data/Bolivia.csv"
[10] "./uni_data/Bosnia and Herzegovina.csv"
...
# Read every CSV listed in uni_list in parallel on a 4-worker cluster.
cl <- makeCluster(4)
registerDoParallel(cl)

# Each iteration reads one file; the data frames are returned as a list.
ls_df <- foreach(csv = uni_list) %dopar% {
  read.csv(csv)
}

# Release the worker processes once the loop has finished.
stopCluster(cl)
Parallel programmeren in R

Resultaten verzamelen met foreach

# Start four workers and register them as the %dopar% backend.
cl <- makeCluster(4)
registerDoParallel(cl)

# Without .combine, the per-file data frames are returned as a list.
ls_df <- foreach(csv = uni_list) %dopar% read.csv(file = csv)

stopCluster(cl)
[[1]]
 location                  institution score
Argentina  Universidad de Buenos Aires  68.9
...
[[2]]
 location                     institution score
Australia  Australian National University  82.1
...
cl <- makeCluster(4)
registerDoParallel(cl)

# .combine = "rbind" stacks the per-file data frames into a single data
# frame instead of returning them as a list.
df_uni <- foreach(csv = uni_list, .combine = "rbind") %dopar% {
  read.csv(file = csv)
}

stopCluster(cl)
   location                     institution score
1 Argentina     Universidad de Buenos Aires  68.9
2 Argentina  Universidad Católica Argentina  33.3
3 Argentina     Universidad de Palermo (UP)  29.1
...
Parallel programmeren in R

Inlezen, filteren en samenvoegen

library(dplyr)

n_unis <- 3

# Preallocated list, one slot per CSV file.
ls_df <- vector("list", length(uni_list))

for (i in seq_along(uni_list)) {
  # Read the file, keep the top n_unis rows by score, store the result.
  # The column is `score`, matching the data shown on the other slides.
  ls_df[[i]] <- read.csv(uni_list[[i]]) %>%
    top_n(n_unis, score)
}

# Combine the list into one data frame.
combined_df <- Reduce("rbind", ls_df)
Parallel programmeren in R

foreach voor de winst

n_unis <- 3

# Start four workers and register them as the %dopar% backend.
cl <- makeCluster(4)
registerDoParallel(cl)

# Read, filter and combine in a single parallel loop.
df_top3 <- foreach(
  csv = uni_list,
  .packages = "dplyr",  # load dplyr on each worker for %>% and top_n()
  .export = "n_unis",   # make n_unis available on the workers
  .combine = "rbind"    # stack the per-file results into one data frame
) %dopar% {
  read.csv(csv) %>%
    top_n(n_unis, score)
}

# Shut the workers down once the loop has finished.
stopCluster(cl)
 location                     institution score
Argentina     Universidad de Buenos Aires  68.9
Argentina  Universidad Católica Argentina  33.3
Argentina     Universidad de Palermo (UP)  29.1
Australia  Australian National University  82.1
Australia     The University of Melbourne  81.6
Australia        The University of Sydney  79.6
  Austria            University of Vienna  50.6
  Austria     Technische Universität Wien  45.7
...
Parallel programmeren in R

Laten we oefenen!

Parallel programmeren in R

Preparing Video For Download...