Preparation
Configuration file settings.py
BOT_NAME = 'scrapyTest'
SPIDER_MODULES = ['scrapyTest.spiders']
NEWSPIDER_MODULE = 'scrapyTest.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapyTest (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False  # do not obey robots.txt
Prepare a pool of browser User-Agent strings
agents = [
    "Mozilla/2.02E (Win95; U)",
    "Mozilla/3.01Gold (Win95; I)",
    # ...
]
Analyze the cookies (iReader / 掌阅 does not seem to do any cookie-based anti-crawling)
""" Rotate the Cookie """
cookie = {
    'Hm_lpvt_2583df02aa8541db9378beae2ed00ba0': '1502265076',
    'Hm_lvt_2583df02aa8541db9378beae2ed00ba0': '1502263527',
    'ZyId': 'ada56e4598ab89a9944f'
}
Logging (to record problems)
import logging

logging.getLogger("requests").setLevel(logging.WARNING)  # raise the requests library's log level to WARNING
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filename='cataline.log',
    filemode='w')
For example: logging.info('crawling URL ' + view_url)
Writing the main spider class
Based on the analysis of iReader, the first step is to collect the URLs that need to be crawled and push them into the pending queue. A recursive strategy is used here: each parsed page yields further Requests.
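The post only shows the individual methods, not the class header or imports, so the skeleton below is an assumption about how they fit together (the class name and file layout are guesses; only the method bodies that follow come from the post):

# assumed skeleton -- names are illustrative, the methods shown below slot into this class
import logging
import urlparse  # Python 2; on Python 3 this would be urllib.parse

import scrapy
from scrapy import Request, Selector

from scrapyTest.items import ScrapytestItem


class BookSpider(scrapy.Spider):
    name = 'scrapyTest'  # matches the "scrapy crawl scrapyTest" command used at the end

    # start_urls, start_requests, parse_type_key, parse_ph_key
    # and parse_content (all shown below) go here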
# start URLs
start_urls = [
    "http://www.ireader.com/index.php?ca=booksort.index&pca=booksort.index&pid=92",
    "http://www.ireader.com/index.php?ca=booksort.index&pca=booksort.index&pid=10",
    "http://www.ireader.com/index.php?ca=booksort.index&pca=booksort.index&pid=68"
]

def start_requests(self):
    for ph_type in self.start_urls:
        yield Request(url=ph_type, callback=self.parse_type_key)
# iterate over every category
def parse_type_key(self, response):
    selector = Selector(response)
    types = selector.xpath('//div[@class="difgenre"]')[1].xpath('.//div[@class="right"]/ul/li')
    for type in types:
        type_url = type.xpath('.//a/@href')[0].extract()
        logging.info('category ' + type_url)
        yield Request(url=type_url, callback=self.parse_ph_key)
# for each category, walk every book link and follow the next-page link automatically
def parse_ph_key(self, response):
    selector = Selector(response)
    lis = selector.xpath('//ul[@class="newShow"]/li')
    for li in lis:
        view_url = li.xpath('.//a/@href')[0].extract()
        logging.info('crawling URL ' + view_url)
        yield Request(url=view_url, callback=self.parse_content)
    # extract_first() returns None on the last page instead of raising IndexError
    url_next = selector.xpath('//a[@class="down"]/@href').extract_first()
    if url_next:
        logging.info('next page URL ' + url_next)
        yield Request(url=url_next, callback=self.parse_ph_key)
Once the addresses have been collected, each one needs to be downloaded and parsed (the code here is not complete).
# content-parsing function
def parse_content(self, response):
    logging.debug('parsing URL ' + response.url)
    item = ScrapytestItem()
    # the book id comes from the "bid" query parameter of the detail-page URL
    item['_id'] = urlparse.parse_qs(urlparse.urlparse(response.url).query)['bid'][0]
    # current URL
    item['url'] = response.url
    # title; extract() already returns unicode text, so no extra decode('utf-8') is needed
    item['title'] = response.selector.xpath('//div[@class="bookname"]/h2/a/text()')[0].extract()
    item['tag'] = response.selector.xpath('//div[@class="bookL"]/s/text()')[0].extract()
    try:
        # rating
        item['rate'] = response.selector.xpath('//div[@class="bookname"]/span/text()')[0].extract()
        # number of ratings, e.g. "1234人" -> "1234"
        item['num_rate'] = response.selector.xpath('//div[@class="bookinf01"]/p/span[@class="manyMan"]/text()')[0].extract().split(u'人')[0]
    except Exception:
        item['rate'] = ''
        item['num_rate'] = ''
    yield item
Item field definitions
Define the fields you need from the parsed HTML (based on parse_content in the spider class).
import scrapy


class ScrapytestItem(scrapy.Item):
    url = scrapy.Field()
    _id = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    num_word = scrapy.Field()
    press = scrapy.Field()
    num_rate = scrapy.Field()
    rate = scrapy.Field()
    tag = scrapy.Field()
    img = scrapy.Field()
    des = scrapy.Field()
    price = scrapy.Field()
    similar = scrapy.Field()
Pipeline class (storing the data)
MongoDB is used for storage.
import pymongo

from scrapyTest.items import ScrapytestItem


class ScrapytestPipeline(object):
    def __init__(self):
        client = pymongo.MongoClient("localhost", 27017)
        db = client["book"]
        self.book = db["book"]

    def process_item(self, item, spider):
        """ Check the item type and store it in MongoDB """
        if isinstance(item, ScrapytestItem):
            try:
                self.book.insert_one(dict(item))
            except Exception:
                pass
        return item
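For this pipeline to actually run, it also has to be enabled in settings.py. The module path below assumes the project layout used throughout this post; the number is just a priority between 0 and 1000:

ITEM_PIPELINES = {
    'scrapyTest.pipelines.ScrapytestPipeline': 300,
}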
Middlewares
Only two middlewares are used here, and both are fairly simple.
import random

# "agents" is the User-Agent list prepared in the section above


class UserAgentMiddleware(object):
    """ Rotate the User-Agent """

    def process_request(self, request, spider):
        agent = random.choice(agents)
        request.headers["User-Agent"] = agent


class CookiesMiddleware(object):
    """ Rotate the Cookie """

    cookie = {
        'Hm_lpvt_2583df02aa8541db9378beae2ed00ba0': '1502265076',
        'Hm_lvt_2583df02aa8541db9378beae2ed00ba0': '1502263527',
        'ZyId': 'ada56e4598ab89a9944f'
    }

    def process_request(self, request, spider):
        request.cookies = self.cookie
To use the middlewares, they need to be enabled in settings.py.
# the number sets the middleware order: lower values sit closer to the engine
# and have their process_request called first
DOWNLOADER_MIDDLEWARES = {
    'scrapyTest.middlewares.UserAgentMiddleware': 401,
    'scrapyTest.middlewares.CookiesMiddleware': 402,
}
Results
Add a launch entry point
from scrapy import cmdline
cmdline.execute("scrapy crawl scrapyTest".split())
Complete directory structure
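The original shows this as a screenshot; a typical layout for a Scrapy project like this one would look roughly as follows (the spider file name is not given in the post):

scrapyTest/
├── scrapy.cfg
└── scrapyTest/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── ...  (the spider module)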
In the end, 91,914 records were crawled.
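A quick way to double-check that count, assuming the local MongoDB instance and the book/book database/collection names used in the pipeline above:

import pymongo

client = pymongo.MongoClient("localhost", 27017)
# count_documents({}) counts every document; available in pymongo >= 3.7
print(client["book"]["book"].count_documents({}))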
Problem analysis
- No detailed exception handling was done
- The crawl covers too little data; only the fields visible on the page itself were considered
- Logging was used rather casually; it would be better to split logs by hour and by severity (see the sketch below)
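Following up on the last point, here is a minimal sketch of hourly log rotation using the standard library's TimedRotatingFileHandler; the logger name, file name, and format string are just placeholders:

import logging
from logging.handlers import TimedRotatingFileHandler

logger = logging.getLogger("scrapyTest")
logger.setLevel(logging.DEBUG)

# rotate the log file every hour, keeping the last 24 files
handler = TimedRotatingFileHandler("cataline.log", when="H", interval=1, backupCount=24)
handler.setFormatter(logging.Formatter(
    '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s'))
logger.addHandler(handler)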