Uso di foreach

Programmazione parallela in R

Nabeel Imam

Data Scientist

Un nuovo ciclo

For nativo in R

# Native R for loop: compute the square root of every element of `numbers`.
# (Shown as a loop for comparison with foreach; in everyday code the
# vectorized form `sqrt(numbers)` gives the same result.)
numbers <- 1:1e6

# Preallocate the result vector so it is not grown inside the loop.
sqroots <- numeric(length(numbers))

# seq_along() is safe for zero-length input, unlike 1:length(numbers),
# which would iterate over c(1, 0).
for (i in seq_along(numbers)) {
  sqroots[i] <- sqrt(numbers[i])
}

Il ciclo foreach

library(foreach)

numbers <- 1:1e6

# foreach() iterates over `numbers`; %do% evaluates the body sequentially
# and the value of each iteration is collected into the result.
sqroots <- foreach(i = numbers) %do% {
  sqrt(i)
}
Programmazione parallela in R

Cicli paralleli

numbers <- 1:1e6

# Sequential version: %do% evaluates the iterations one after another.
sqroots <- foreach(i = numbers) %do% {
  sqrt(i)
}

# Parallel version: load doParallel first (it brings in makeCluster()),
# then create a cluster of 4 workers and register it as the backend
# used by %dopar%.
library(doParallel)
cl <- makeCluster(4)
registerDoParallel(cl)

sqroots <- foreach(i = numbers) %dopar% { # The parallel operator
  sqrt(i)
}

# Shut the workers down once the loop has finished.
stopCluster(cl)
Programmazione parallela in R

Migliori università di ingegneria

# Show the contents of uni_list (the paths of the per-country CSV files).
print(uni_list)
 [1] "./uni_data/Argentina.csv"
 [2] "./uni_data/Australia.csv"
 [3] "./uni_data/Austria.csv"
 [4] "./uni_data/Azerbaijan.csv"
 [5] "./uni_data/Bahrain.csv"
 [6] "./uni_data/Bangladesh.csv"
 [7] "./uni_data/Belarus.csv"
 [8] "./uni_data/Belgium.csv"
 [9] "./uni_data/Bolivia.csv"
[10] "./uni_data/Bosnia and Herzegovina.csv"
...
# Create a cluster of 4 workers and register it as the %dopar% backend.
cl <- makeCluster(4)
registerDoParallel(cl)

# Read every CSV file listed in uni_list in parallel.
ls_df <- foreach(csv = uni_list) %dopar% {
  read.csv(csv)
}

# Release the workers.
stopCluster(cl)
Programmazione parallela in R

Raccogliere risultati con foreach

# Set up four workers for %dopar%.
cl <- makeCluster(4)
registerDoParallel(cl)

# One read.csv() call per path; with no .combine argument the results
# come back as a list with one element per file.
ls_df <- foreach(csv = uni_list) %dopar% read.csv(csv)

stopCluster(cl)
[[1]]
 location                  institution score
Argentina  Universidad de Buenos Aires  68.9
...
[[2]]
 location                     institution score
Australia  Australian National University  82.1
...
# Set up four workers for %dopar%.
cl <- makeCluster(4)
registerDoParallel(cl)

# .combine = "rbind" stacks the per-file data frames row-wise, so the
# loop yields a single data frame instead of a list.
df_uni <- foreach(csv = uni_list, .combine = "rbind") %dopar% read.csv(csv)

stopCluster(cl)
   location                     institution score
1 Argentina     Universidad de Buenos Aires  68.9
2 Argentina  Universidad Católica Argentina  33.3
3 Argentina     Universidad de Palermo (UP)  29.1
...
Programmazione parallela in R

Leggi, filtra e unisci

library(dplyr)

# Number of top-scoring universities to keep per file.
n_unis <- 3

# Preallocated list, one slot per CSV file
ls_df <- vector("list", length(uni_list))

# seq_along() keeps the loop from running when uni_list is empty.
for (i in seq_along(uni_list)) {
  # Read, filter, store in the list.
  # NOTE(review): the printed data shows a `score` column, so `score` is used
  # here instead of `total_score` -- confirm against the CSV files.
  ls_df[[i]] <- read.csv(uni_list[[i]]) %>%
    top_n(n_unis, score)
}

# Merge the list into a single data frame
combined_df <- Reduce("rbind", ls_df)
Programmazione parallela in R

foreach vince su tutto

# Number of top-scoring universities to keep per file.
n_unis <- 3

# Create a cluster of 4 workers and register it as the %dopar% backend.
cl <- makeCluster(4)
registerDoParallel(cl)

# Read, filter and combine in a single parallel loop.
df_top3 <- foreach(
  csv = uni_list,
  .packages = "dplyr",  # load dplyr on the workers (%>% and top_n())
  .export = "n_unis",   # make n_unis available on the workers
  .combine = "rbind"    # row-bind the per-file results into one data frame
) %dopar% {
  read.csv(csv) %>%
    top_n(n_unis, score)
}

# Release the workers.
stopCluster(cl)
 location                     institution score
Argentina     Universidad de Buenos Aires  68.9
Argentina  Universidad Católica Argentina  33.3
Argentina     Universidad de Palermo (UP)  29.1
Australia  Australian National University  82.1
Australia     The University of Melbourne  81.6
Australia        The University of Sydney  79.6
  Austria            University of Vienna  50.6
  Austria     Technische Universität Wien  45.7
...
Programmazione parallela in R

Passiamo alla pratica!

Programmazione parallela in R

Preparing Video For Download...