This walkthrough crawls an e-commerce site, using the local-specialty (地方特产) category of Dangdang (当当网) as the example.
First, create the crawler project:
scrapy startproject autop
Next, write the items file:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class AutopItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()    # product name
    price = scrapy.Field()   # product price
    link = scrapy.Field()    # product link
    comnum = scrapy.Field()  # product review count
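Because the spider below stores the result of extract() (a list of strings) into each field, every AutopItem ends up holding parallel lists, and the pipeline relies on indexing into them. A quick illustration with made-up values:

from autop.items import AutopItem

# each field holds a parallel list: index j describes the j-th product on the page
item = AutopItem()
item['name'] = ['Product A', 'Product B']
item['price'] = ['¥13.90', '¥25.00']
print(item['name'][1])  # items support dict-style access -> Product B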
Writing the pipeline:
# -*- coding: utf-8 -*-
import codecs
import json
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class AutopPipeline(object):
    def __init__(self):  # open the output file once, when the pipeline is created
        self.f = codecs.open('D:/AuI18N/2.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # the number of products on the page sets the loop count
        for j in range(0, len(item['name'])):
            name = item['name'][j]  # j-th element under the 'name' key; the other fields likewise
            price = item['price'][j]
            comnum = item['comnum'][j]
            link = item['link'][j]
            # rebuild a per-product dict from the extracted elements
            g = {'name': name, 'price': price, 'comnum': comnum, 'link': link}
            i = json.dumps(g, ensure_ascii=False)  # keep non-ASCII text readable in the JSON
            line = i + '\n'  # one JSON object per line
            self.f.write(line)  # write to the file
        return item

    def close_spider(self, spider):  # Scrapy passes the spider argument; close the file here
        self.f.close()
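Note that the pipeline writes one JSON object per line (JSON Lines) rather than a single JSON array, so the file must be read back line by line. A minimal sketch, assuming the same D:/AuI18N/2.json path used above:

import codecs
import json

# read back the line-delimited JSON the pipeline produced
with codecs.open('D:/AuI18N/2.json', 'r', encoding='utf-8') as f:
    for line in f:
        product = json.loads(line)
        print(product['name'])  # each line is one product dict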
Then modify the settings file. Pipelines are disabled by default, so enable ours first (the 300 is the pipeline's priority; lower values run earlier):
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'autop.pipelines.AutopPipeline': 300,
}
Also disable cookies, so the site cannot use cookie information to block us:
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
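Disabling cookies is often not enough on its own; slowing requests down and sending a browser-like User-Agent are common extra precautions. These two settings are my own optional additions, not part of the original steps, and the values are illustrative:

# Wait between requests to reduce the chance of being blocked (seconds)
DOWNLOAD_DELAY = 2
# Send a browser-like User-Agent instead of Scrapy's default
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'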
Next, analyze the URL and the page source.
As the URL shows, the page number is controlled by the "pg2" segment, e.g. http://category.dangdang.com/pg2-cid4002203.html is page 2.
Then inspect the page source to find the fields for name, price, link, and review count; you can examine the page source yourself:
ddclick="act=normalResult_picture&pos=60637121_0_2_m" class="pic"
相应的xpath表达式#"//a[@class='pic']/@title"
网页价格源码段# <span class="price_n">¥13.90</span>
相应的xpath表达式#"//span[@class='price_n']"
网页商品链接源码段# href="http://product.dangdang.com/60637121.html" target="_blank" >
相应的xpath表达式#"//a[@class='pic']/#href"
网页评论源码段# ddclick="act=click_review_count&pos=60637121_0_2_m">1569条评论</a></p>
相应的xpath表达式#"//a[@name='itemlist-review']/text()"
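Before writing the spider, you can check each expression interactively with scrapy shell; a quick session might look like this (the output will vary with the live page):

scrapy shell "http://category.dangdang.com/pg1-cid4002203.html"
>>> response.xpath("//a[@class='pic']/@title").extract()[:2]          # first two product names
>>> response.xpath("//span[@class='price_n']/text()").extract()[:2]   # first two prices
>>> exit()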
Now write the crawler:
scrapy genspider -t basic autos dangdang.com  # create the spider file
Spider source code:
# -*- coding: utf-8 -*-
import scrapy
from autop.items import AutopItem
from scrapy.http import Request

class AutosSpider(scrapy.Spider):
    name = 'autos'
    allowed_domains = ['dangdang.com']  # domains the spider is allowed to crawl
    start_urls = ['http://category.dangdang.com/pg1-cid4002203.html']  # starting URL

    def parse(self, response):
        item = AutopItem()  # instantiate the item class written earlier
        # match name, price, link, and review count with the XPath expressions
        item['name'] = response.xpath("//a[@class='pic']/@title").extract()
        item['price'] = response.xpath("//span[@class='price_n']/text()").extract()
        item['link'] = response.xpath("//a[@class='pic']/@href").extract()
        item['comnum'] = response.xpath("//a[@name='itemlist-review']/text()").extract()
        yield item  # hand the extracted item to the pipeline
        for i in range(1, 45):  # loop over pg1-pg44; the repeated pg1 request is filtered by Scrapy
            url = 'http://category.dangdang.com/pg' + str(i) + '-cid4002203.html'  # build the page URL
            yield Request(url, callback=self.parse)  # yield a Request with the URL and callback
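One thing to be aware of: the review count is extracted as text such as "1569条评论". If you need a plain integer, you can strip the suffix. A small sketch (the parse_comnum helper is my own addition, not part of the tutorial):

# -*- coding: utf-8 -*-
import re

def parse_comnum(text):
    # pull the leading digits out of a string like '1569条评论'
    m = re.match(r'\d+', text)
    return int(m.group()) if m else 0

print(parse_comnum('1569条评论'))  # -> 1569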
Running it now produces an error, because ROBOTSTXT_OBEY in the settings file also needs to be set to False:
ROBOTSTXT_OBEY = False
Run it again:
scrapy crawl autos --nolog
The data is crawled successfully and saved as a JSON file.