python scrapy 爬虫实例_scrapy爬虫完整实例

本文主要通过实例介绍了scrapy框架的使用,分享了两个例子,爬豆瓣文本例程 douban 和图片例程 douban_imgs ,具体如下。

例程1: douban

目录树

douban
├── douban
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── bookspider.py
│   │   ├── douban_comment_spider.py
│   │   └── doumailspider.py
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   └── settings.py
└── scrapy.cfg

spiders/__init__.py

# This package will contain the spiders of your Scrapy project

#

# Please refer to the documentation for information on how to create and manage

# your spiders.

bookspider.py

# -*- coding:utf-8 -*-

'''by sudo rm -rf http://imchenkun.com'''

import scrapy

from douban.items import DoubanBookItem

class BookSpider(scrapy.Spider):
    """Crawl the Douban book Top 250 list and yield one item per book.

    ``parse`` handles the start page and schedules every page linked from
    the paginator; ``parse_next`` extracts the book rows of a single page.
    """

    name = 'douban-book'
    allowed_domains = ['douban.com']
    start_urls = [
        'https://book.douban.com/top250'
    ]

    def parse(self, response):
        """Yield the books of the first page, then a request per other page.

        :param response: response for the URL listed in ``start_urls``.
        :returns: generator of ``DoubanBookItem`` objects followed by
            ``scrapy.Request`` objects whose callback is ``parse_next``.
        """
        # First page: it is already downloaded, so parse it in place
        # instead of requesting the same URL a second time.
        for book in self.parse_next(response):
            yield book

        # Other pages: follow every link of the paginator. urljoin keeps
        # this working whether the hrefs are absolute or relative.
        for href in response.xpath('//div[@class="paginator"]/a/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_next)

    def parse_next(self, response):
        """Extract every book row found in one list page.

        A field whose node is missing from a row is stored as ``None``
        rather than raising, so one odd row does not abort the whole page.

        :param response: response of a Top 250 list page.
        :returns: generator of ``DoubanBookItem`` objects with the keys
            ``name``, ``content`` and ``ratings``.
        """
        for row in response.xpath('//tr[@class="item"]'):
            book = DoubanBookItem()
            book['name'] = row.xpath('td[2]/div[1]/a/@title').extract_first()
            book['content'] = row.xpath('td[2]/p/text()').extract_first()
            book['ratings'] = row.xpath('td[2]/div[2]/span[2]/text()').extract_first()
            yield book

douban_comment_spider.py

# -*- coding:utf-8 -*-

import scrapy

from faker import Factory

from douban.items import DoubanMovieCommentItem

import urlparse

# faker factory; used below to generate the User-Agent header value.
f = Factory.create()

# Spider registered under the name 'douban-comment'.
# NOTE(review): the class is called MailSpider although it is the comment
# spider -- presumably copied from doumailspider.py; confirm before renaming.
class MailSpider(scrapy.Spider):

name = 'douban-comment'

allowed_domains = ['accounts.douban.com', 'douban.com']

start_urls = [

'https://www.douban.com/'

]

# Request headers passed to every request built by this spider. The methods
# read them through self.headers, so this one dict is shared by all of them.
headers = {

'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

'Accept-Encoding': 'gzip, deflate, br',

'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',

'Connection': 'keep-alive',

'Host': 'accounts.douban.com',

'User-Agent': f.user_agent()

}

# Fields submitted with the login form. Replace the e-mail and password
# placeholders with real credentials before running the spider.
formdata = {

'form_email': '你的邮箱',

'form_password': '你的密码',

# The two captcha fields are not set here; parse_login adds them when the
# login page shows a captcha.
# 'captcha-solution': '',

# 'captcha-id': '',

'login': '登录',

'redir': 'https://www.douban.com/',

'source': 'None'

}

def start_requests(self):
    """Return the initial request: the Douban login page.

    The request carries ``self.headers`` and is tagged with
    ``meta['cookiejar'] = 1``; the later callbacks copy that value from
    ``response.meta`` into their own requests. The response is handed to
    ``parse_login``.

    :returns: a list holding the single login-page ``scrapy.Request``.
    """
    login_page = scrapy.Request(
        url='https://www.douban.com/accounts/login',
        callback=self.parse_login,
        meta={'cookiejar': 1},
        headers=self.headers,
    )
    return [login_page]

def parse_login(self, response):
    """Submit the login form, asking the user to solve a captcha if shown.

    :param response: response of the login page requested by
        ``start_requests``.
    :returns: a list holding the single ``scrapy.FormRequest`` that posts
        the login form; its response is handled by ``after_login``.
    """
    # Work on a copy: ``self.formdata`` is a class-level dict, so writing
    # the captcha fields into it would leak them into later login attempts.
    formdata = dict(self.formdata)

    # A captcha has to be solved by a human. Use the XPath result itself as
    # the test, so a page without a matching <img> cannot raise IndexError.
    captcha_links = response.xpath('//img[@class="captcha_image"]/@src').extract()
    if captcha_links:
        link = captcha_links[0]
        print('Copy the link:')
        print(link)
        # raw_input / urlparse: this module targets Python 2.
        captcha_solution = raw_input('captcha-solution:')
        # parse_qs maps each key to a list of values; the form needs the
        # single id string.
        captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id'][0]
        formdata['captcha-solution'] = captcha_solution
        formdata['captcha-id'] = captcha_id

    return [scrapy.FormRequest.from_response(
        response,
        formdata=formdata,
        headers=self.headers,
        meta={'cookiejar': response.meta['cookiejar']},
        callback=self.after_login
    )]

def after_login(self, response):
    """Request the movie's review list twice after the login response.

    The same URL is requested once for ``parse_comment_url`` and once for
    ``parse_next_page``; the second request sets ``dont_filter=True`` so
    that it is not dropped as a duplicate of the first.

    :param response: response to the login form submission.
    :returns: generator of the two ``scrapy.Request`` objects.
    """
    print(response.status)

    # This updates the headers dict that the other methods also read
    # through self.headers.
    # NOTE(review): Host is set to www.douban.com while the requests below
    # go to movie.douban.com -- confirm this is intended.
    self.headers['Host'] = "www.douban.com"

    reviews_url = 'https://movie.douban.com/subject/22266320/reviews'
    session_jar = response.meta['cookiejar']

    yield scrapy.Request(
        url=reviews_url,
        callback=self.parse_comment_url,
        headers=self.headers,
        meta={'cookiejar': session_jar},
    )
    yield scrapy.Request(
        url=reviews_url,
        callback=self.parse_next_page,
        headers=self.headers,
        meta={'cookiejar': session_jar},
        dont_filter=True,  # skip the duplicate filter
    )

def parse_next_page(self, response):

print response.status

try:

next_url = response.urljoin(response.xpath('//span[@class="next"]/a/@href').extract()[0])

print "下一页"

print next_url

yield scrapy.Request(url=next_url,

meta={'cookiejar': response.meta['cookiejar']},

headers=self.headers,

callback=self.parse_comment_url,

dont_filter = True)

yield scrapy.Request(url=n

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值