本文通过实例介绍 Scrapy 框架的使用，分享两个例程：抓取豆瓣文本的 douban 和抓取图片的 douban_imgs，具体如下。
例程1: douban
目录树
douban
--douban
----spiders
------__init__.py
------bookspider.py
------douban_comment_spider.py
------doumailspider.py
----__init__.py
----items.py
----pipelines.py
----settings.py
--scrapy.cfg
spiders/__init__.py
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
bookspider.py
# -*- coding:utf-8 -*-
'''by sudo rm -rf http://imchenkun.com'''
import scrapy
from douban.items import DoubanBookItem
class BookSpider(scrapy.Spider):
    """Spider that scrapes the Douban "Top 250" book list.

    ``parse`` receives the first list page and schedules every list page
    (the first one included) for ``parse_next``, which turns each table row
    into a ``DoubanBookItem``.
    """

    name = 'douban-book'
    allowed_domains = ['douban.com']
    start_urls = [
        'https://book.douban.com/top250',
    ]

    def parse(self, response):
        """Schedule all pages of the list for item extraction.

        Args:
            response: The response for the start URL.

        Yields:
            scrapy.Request: One request per list page, handled by
            ``parse_next``.
        """
        # Request the first page again so it goes through parse_next too.
        yield scrapy.Request(response.url, callback=self.parse_next)
        # Request the other pages linked from the paginator. Selecting
        # ``@href`` directly skips anchors without an href instead of
        # raising IndexError, and urljoin keeps relative links usable.
        hrefs = response.xpath('//div[@class="paginator"]/a/@href').extract()
        for href in hrefs:
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_next)

    def parse_next(self, response):
        """Extract one item per book row on a list page.

        Args:
            response: The response for one page of the list.

        Yields:
            DoubanBookItem: Item with ``name``, ``content`` and ``ratings``
            set. A field whose node is missing from the row is ``None``;
            ``extract_first()`` is used so that one incomplete row does not
            abort extraction of the remaining rows on the page.
        """
        for row in response.xpath('//tr[@class="item"]'):
            book = DoubanBookItem()
            book['name'] = row.xpath('td[2]/div[1]/a/@title').extract_first()
            book['content'] = row.xpath('td[2]/p/text()').extract_first()
            book['ratings'] = row.xpath(
                'td[2]/div[2]/span[2]/text()').extract_first()
            yield book
douban_comment_spider.py
# -*- coding:utf-8 -*-
import scrapy
from faker import Factory
from douban.items import DoubanMovieCommentItem
import urlparse
# Faker generator; used below to produce the User-Agent header value.
f = Factory.create()
# Spider registered under the name 'douban-comment'. Its start_requests
# method issues the login-page request itself and does not read start_urls.
class MailSpider(scrapy.Spider):
name = 'douban-comment'
allowed_domains = ['accounts.douban.com', 'douban.com']
start_urls = [
'https://www.douban.com/'
]
# Headers passed as ``headers=self.headers`` to the requests built in the
# methods shown here; after_login overwrites the 'Host' entry in place.
# NOTE(review): advertising 'br' assumes the Scrapy install can decode
# Brotli responses -- confirm.
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
'Connection': 'keep-alive',
'Host': 'accounts.douban.com',
'User-Agent': f.user_agent()
}
# Fields submitted with the login form by parse_login. The email and
# password values are placeholders to be replaced with real credentials.
# parse_login adds 'captcha-solution' and 'captcha-id' when the login page
# contains a captcha image.
formdata = {
'form_email': '你的邮箱',
'form_password': '你的密码',
# 'captcha-solution': '',
# 'captcha-id': '',
'login': '登录',
'redir': 'https://www.douban.com/',
'source': 'None'
}
def start_requests(self):
    """Begin the crawl at the Douban login page.

    Returns:
        list: A single ``scrapy.Request`` for the login page. It carries
        the spider's headers, selects cookie jar ``1`` through the request
        meta, and is handled by ``parse_login``.
    """
    login_page = scrapy.Request(
        url='https://www.douban.com/accounts/login',
        callback=self.parse_login,
        meta={'cookiejar': 1},
        headers=self.headers,
    )
    return [login_page]
def parse_login(self, response):
    """Submit the login form, asking the operator to solve any captcha.

    When the page body contains ``captcha_image``, the captcha image URL is
    printed, the solution is read from standard input, and the captcha id
    is taken from the ``id`` query parameter of that URL.

    Args:
        response: The response for the login page.

    Returns:
        list: A single ``scrapy.FormRequest`` built from the login form,
        reusing the cookie jar of ``response`` and handled by
        ``after_login``.
    """
    # Work on a per-call copy: writing the captcha fields into the
    # class-level ``formdata`` dict would leave a stale solution/id behind
    # for every later call and every other instance.
    formdata = dict(self.formdata)
    # A captcha, if present, has to be solved by hand.
    if 'captcha_image' in response.body:
        print('Copy the link:')
        link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
        print(link)
        captcha_solution = raw_input('captcha-solution:')
        # parse_qs returns a list of values for each query parameter.
        captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
        formdata['captcha-solution'] = captcha_solution
        formdata['captcha-id'] = captcha_id
    return [scrapy.FormRequest.from_response(
        response,
        formdata=formdata,
        headers=self.headers,
        meta={'cookiejar': response.meta['cookiejar']},
        callback=self.after_login,
    )]
def after_login(self, response):
    """Start the review crawl once the login form has been submitted.

    Prints the HTTP status of the login response, sets the shared ``Host``
    header to ``www.douban.com``, then requests the same reviews page
    twice: once for ``parse_comment_url`` and once for ``parse_next_page``.

    Args:
        response: The response to the submitted login form.

    Yields:
        scrapy.Request: The two requests described above, both reusing the
        cookie jar of ``response``.
    """
    print(response.status)
    self.headers['Host'] = "www.douban.com"

    reviews_url = 'https://movie.douban.com/subject/22266320/reviews'
    jar_id = response.meta['cookiejar']

    yield scrapy.Request(
        url=reviews_url,
        meta={'cookiejar': jar_id},
        headers=self.headers,
        callback=self.parse_comment_url,
    )
    # Same URL a second time: dont_filter=True so the duplicate filter
    # does not drop this repeat request.
    yield scrapy.Request(
        url=reviews_url,
        meta={'cookiejar': jar_id},
        headers=self.headers,
        callback=self.parse_next_page,
        dont_filter=True,
    )
def parse_next_page(self, response):
print response.status
try:
next_url = response.urljoin(response.xpath('//span[@class="next"]/a/@href').extract()[0])
print "下一页"
print next_url
yield scrapy.Request(url=next_url,
meta={'cookiejar': response.meta['cookiejar']},
headers=self.headers,
callback=self.parse_comment_url,
dont_filter = True)
yield scrapy.Request(url=n