Web Scraping in Python
Thomas Laetsch
Data Scientist, NYU
import scrapy
from scrapy.crawler import CrawlerProcess

class SpiderClassName(scrapy.Spider):
    name = "spider_name"
    # the code for your spider
    ...

process = CrawlerProcess()
process.crawl(SpiderClassName)
process.start()
class DCspider(scrapy.Spider):
    name = "dc_spider"

    def start_requests(self):
        urls = ['https://www.datacamp.com/courses/all']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # simple example: write out the html
        html_file = 'DC_courses.html'
        with open(html_file, 'wb') as fout:
            fout.write(response.body)
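To actually run DCspider, the same CrawlerProcess pattern from the template above applies. A minimal sketch, assuming the class is defined in the same script:

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()   # create a crawler process
    process.crawl(DCspider)      # register the spider class defined above
    process.start()              # start crawling; parse() writes DC_courses.html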
def start_requests(self):
    urls = ['https://www.datacamp.com/courses/all']
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)
def start_requests(self):
    # with only one URL to scrape, the loop can be dropped and a single Request yielded
    url = 'https://www.datacamp.com/courses/all'
    yield scrapy.Request(url=url, callback=self.parse)
scrapy.Request here will fill in a response variable for us. The url argument tells us which site to scrape, and the callback argument tells us where to send the response variable for processing.
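A minimal sketch of that flow inside the spider (the print call is only for illustration): the Request carries the url to fetch and self.parse as its callback, and Scrapy hands the filled-in response for that url to parse.

    def start_requests(self):
        url = 'https://www.datacamp.com/courses/all'
        # url: which site to scrape; callback: the method that receives the response
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # response is the filled-in response variable for the requested url
        print(response.url)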
name = "dc_spider"
def start_requests( self ):
urls = [ 'https://www.datacamp.com/courses/all' ]
for url in urls:
yield scrapy.Request( url = url, callback = self.parse )
def parse( self, response ):
# simple example: write out the html
html_file = 'DC_courses.html'
with open( html_file, 'wb' ) as fout:
fout.write( response.body )