Eindopdracht

Webscraping in Python

Thomas Laetsch

Data Scientist, NYU

Elementen inspecteren

import scrapy
from scrapy.crawler import CrawlerProcess

class DC_Chapter_Spider(scrapy.Spider):
    """Spider skeleton that starts crawling at the DataCamp course directory.

    The two parse callbacks are placeholders; each has a docstring as its
    body so that the skeleton is syntactically valid Python.
    """

    name = "dc_chapter_spider"

    def start_requests(self):
        """Yield the initial request; its response is handled by parse_front."""
        url = 'https://www.datacamp.com/courses/all'
        yield scrapy.Request(url=url, callback=self.parse_front)

    def parse_front(self, response):
        """Placeholder callback for the front courses page."""
        ## Code to parse the front courses page

    def parse_pages(self, response):
        """Placeholder callback for an individual course page."""
        ## Code to parse course pages
        ## Fill in dc_dict here

# Module-level container; the parse_pages placeholder says to fill it in.
dc_dict = {}

# Create a crawler process, register the spider class, and start crawling.
process = CrawlerProcess()
process.crawl(DC_Chapter_Spider)
process.start()

Webscraping in Python

De voorpagina parsen

    def parse_front(self, response):
        """Follow every course link found on the front courses page.

        Selects the `div.course-block` elements of `response`, takes the
        `href` of each block's direct `a` child, and yields one follow-up
        request per link with `self.parse_pages` as the callback.
        """
        # Narrow in on the course blocks
        course_blocks = response.css('div.course-block')
        # Direct to the course links
        course_links = course_blocks.xpath('./a/@href')
        # Extract the links (as a list of strings)
        links_to_follow = course_links.extract()
        # Follow the links to the next parser
        for url in links_to_follow:
            yield response.follow(url=url, callback=self.parse_pages)
Webscraping in Python

Cursuspagina's parsen

def parse_pages(self, response):
    """Record a course's chapter titles under its course title.

    Reads the course title from the text of an `h1` whose class contains
    "title" and the chapter titles from `h4.chapter__title` elements,
    strips surrounding whitespace from each, and stores the list of
    chapter titles in `dc_dict` keyed by the course title.
    """
    # Direct to the course title text
    crs_title = response.xpath('//h1[contains(@class,"title")]/text()')
    # Extract and clean the course title text
    crs_title_ext = crs_title.extract_first().strip()
    # Direct to the chapter titles text
    ch_titles = response.css('h4.chapter__title::text')
    # Extract and clean the chapter titles text
    ch_titles_ext = [t.strip() for t in ch_titles.extract()]
    # Store this in our dictionary
    dc_dict[crs_title_ext] = ch_titles_ext
Webscraping in Python

Tijd om te Weave'n

Webscraping in Python

Preparing Video For Download...