Web Scraping in Python
Thomas Laetsch
Data Scientist, NYU
# response loaded with HTML from https://www.datacamp.com/courses/all
course_divs = response.css('div.course-block')
print( len(course_divs) )
>>> 185

first_div = course_divs[0]
children = first_div.xpath('./*')
print( len(children) )
>>> 3

first_div = course_divs[0]
children = first_div.xpath('./*')
first_child = children[0]
print( first_child.extract() )
>>> <a class=... />

first_div = course_divs[0]
children = first_div.xpath('./*')
second_child = children[1]
print( second_child.extract() )
>>> <div class=... />

first_div = course_divs[0]
children = first_div.xpath('./*')
third_child = children[2]
print( third_child.extract() )
>>> <span class=... />
links = response.css('div.course-block > a::attr(href)').extract()
# step 1: course blocks
course_divs = response.css('div.course-block')
# step 2: hyperlink elements
hrefs = course_divs.xpath('./a/@href')
# step 3: extract the links
links = hrefs.extract()
for l in links:
    print( l )
>>> /courses/free-introduction-to-r
>>> /courses/data-table-data-manipulation-r-tutorial
>>> /courses/dplyr-data-manipulation-r-tutorial
>>> /courses/ggvis-data-visualization-r-tutorial
>>> /courses/reporting-with-r-markdown
>>> /courses/intermediate-r
...
Web Scraping in Python