Web Scraping in R
Timo Grossenbacher
Instructor
library(httr)
# Request httpbin.org over and over with no pause between attempts,
# printing the current time and the HTTP status code of each response.
repeat {
  print(Sys.time())
  response <- GET("https://httpbin.org")
  print(status_code(response))
}
[1] "2020-06-20 10:31:17 CEST"
[1] 200
[1] "2020-06-20 10:31:17 CEST"
[1] 200
[1] "2020-06-20 10:31:17 CEST"
[1] 200
[1] "2020-06-20 10:31:17 CEST"
[1] 200
[1] "2020-06-20 10:31:17 CEST"
[1] 200
[1] "2020-06-20 10:31:18 CEST"
[1] 200
...
# Request httpbin.org forever, pausing before every attempt so that
# requests go out roughly once per second instead of back to back.
while (TRUE) {
  # Wait one second
  Sys.sleep(1)
  print(Sys.time())
  response <- GET("https://httpbin.org")
  print(status_code(response))
}
[1] "2020-06-20 10:36:06 CEST"
[1] 200
[1] "2020-06-20 10:36:07 CEST"
[1] 200
[1] "2020-06-20 10:36:08 CEST"
[1] 200
[1] "2020-06-20 10:36:09 CEST"
[1] 200
[1] "2020-06-20 10:36:10 CEST"
[1] 200
[1] "2020-06-20 10:36:11 CEST"
[1] 200
...
Throttling a function = introducing a time delay between calls
library(httr)
library(purrr)

# Wrap one fixed GET request in slowly() with a rate_delay(3) rate, so
# repeated calls to throttled_GET() are spaced out by the delay.
throttled_GET <- slowly(
  ~ GET("https://httpbin.org"),
  rate = rate_delay(3)
)

# Call the throttled request in an endless loop, printing the time and
# the HTTP status code of each response.
while (TRUE) {
  print(Sys.time())
  response <- throttled_GET()
  print(status_code(response))
}
[1] "2020-06-20 10:53:44 CEST"
[1] 200
[1] "2020-06-20 10:53:47 CEST"
[1] 200
[1] "2020-06-20 10:53:50 CEST"
[1] 200
[1] "2020-06-20 10:53:53 CEST"
[1] 200
[1] "2020-06-20 10:53:56 CEST"
[1] 200
...
library(httr)
library(purrr)

# Throttle GET itself rather than one fixed request: the formula passes
# its first argument (`.`) on to GET, so the URL is supplied per call.
throttled_GET <-
  # instead of GET("https://...")
  slowly(~ GET(.), rate = rate_delay(3))

# Call the throttled function in an endless loop with an explicit URL,
# printing the time and the HTTP status code of each response.
while (TRUE) {
  print(Sys.time())
  response <- throttled_GET("https://wikipedia.org")
  print(status_code(response))
}
[1] "2020-06-20 10:53:44 CEST"
[1] 200
[1] "2020-06-20 10:53:47 CEST"
[1] 200
[1] "2020-06-20 10:53:50 CEST"
[1] 200
[1] "2020-06-20 10:53:53 CEST"
[1] 200
[1] "2020-06-20 10:53:56 CEST"
[1] 200
...
library(httr)
# URLs to request one after another.
url_list <- c(
  "https://httpbin.org/anything/1",
  "https://httpbin.org/anything/2",
  "https://httpbin.org/anything/3"
)

# Fetch each URL through throttled_GET (the URL-taking throttled wrapper
# defined earlier in the deck) and print the HTTP status code.
for (url in url_list) {
  response <- throttled_GET(url)
  print(status_code(response))
}
[1] 200
[1] 200
[1] 200
library(httr)
# URLs to request one after another. Each URL is kept on a single line:
# continuing a string with a backslash and a line break would embed a
# newline character in the URL.
url_list <- c(
  "https://wikipedia.org/wiki/K2",
  "https://wikipedia.org/wiki/Mount_Everest"
)

# Fetch each URL through throttled_GET (the URL-taking throttled wrapper
# defined earlier in the deck) and print the HTTP status code.
for (url in url_list) {
  response <- throttled_GET(url)
  print(status_code(response))
}
[1] 200
[1] 200
Web Scraping in R