一、项目创建
#创建步骤:
scrapy startproject mySpider #mySpider是项目名字
scrapy genspider spider文件名 www.XXX.com
# In the settings.py configuration file:
# Identify the client as a regular desktop browser.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
# Do not honor robots.txt restrictions.
ROBOTSTXT_OBEY = False
# Only print error-level log output to reduce console noise.
LOG_LEVEL = 'ERROR'
#parse文件中解析爬取数据
#运行scrapy
scrapy crawl 爬虫文件名字  #crawl(抓取的意思)
scrapy crawl 爬虫名字 --nolog  #不输出日志
# -*- coding: utf-8 -*-
import scrapy


class FirstSpider(scrapy.Spider):
    """Minimal spider skeleton produced by `scrapy genspider`."""
    # Unique identifier of this spider within the project.
    name = 'first'
    # Restrict crawling to these domains (disabled in this example).
    # allowed_domains = ['www.xxx.com']
    # Every URL in this list is requested automatically when the spider starts.
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # Default callback for each start URL; parsing not implemented yet.
        pass
二、get和post方法
1、分页数据爬取(发起get请求)
def title_parse(self, response):
    """Extract news titles from the channel page and follow each detail URL.

    Only the first two entries are processed (tutorial sample). The
    partially-filled item is handed to the detail callback through meta.
    """
    detailnews_div_list = response.xpath('//div[@class="ndi_main"]/div')
    for detailnews_div in detailnews_div_list[0:2]:
        title = detailnews_div.xpath('./div/div[1]/h3/a//text()').extract_first()
        detailnews_url = detailnews_div.xpath('./div/div[1]/h3/a//@href').extract_first()
        item = WangyinewsallItem()
        item['title'] = title
        # meta passes the item along so detail_parse can finish filling it.
        yield scrapy.Request(url=detailnews_url, callback=self.detail_parse, meta={'item': item})
def detail_parse(self, response):
    """Receive the item passed via meta and fill in the article body text."""
    # Retrieve the item forwarded by the listing-page callback.
    item = response.meta['item']
    detail_list = response.xpath('//*[@id="endText"]/p//text()').extract()
    desc = '\n'.join(detail_list)
    print(desc)
    item['desc'] = desc
    yield item
2、scrapy发送post请求
class PostdemoSpider(scrapy.Spider):
    """Demo spider that sends a POST (form) request instead of the default GET."""
    name = 'postDemo'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://fanyi.baidu.com/sug']

    def start_requests(self):
        # Override the default GET-based start_requests to send form data.
        data = {'kw': 'dog'}
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)
三、下载中间件和selenium
settings文件中开启下载中间件
# Enable the downloader middleware; 543 is its priority
# (lower numbers run closer to the engine).
DOWNLOADER_MIDDLEWARES = {
    'wangyinewsall.middlewares.WangyinewsallDownloaderMiddleware': 543,
}
UA池和代理池的使用
from scrapy import signals
import random


class MiddleproDownloaderMiddleware(object):
    """Downloader middleware that rotates User-Agent headers and proxies."""

    # Pool of User-Agent strings; one is chosen at random per request.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1"
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11"
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    ]
    # Proxy pools, split by URL scheme.
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Intercepts every non-exception request: randomize UA, set a proxy.
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        request.meta['proxy'] = 'https://218.60.8.83:3129'
        return None

    def process_response(self, request, response, spider):
        # Intercepts every response; passed through unchanged here.
        return response

    def process_exception(self, request, exception, spider):
        # Intercepts failed requests: retry through a random proxy that
        # matches the URL scheme.
        if request.url.split(':')[0] == 'https':
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        else:
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
scrapy中selenium的使用和process_response 方法
from scrapy import signals
from scrapy.http import HtmlResponse


def process_response(self, request, response, spider):
    """Swap in a Selenium-rendered response for dynamically generated pages.

    Pages listed in spider.models_url need JavaScript execution, so they are
    re-fetched through the spider's shared browser instance.
    """
    bro = spider.bro
    models_url = spider.models_url
    if request.url in models_url:
        # Render the page in the browser so JS-generated content is present.
        bro.get(request.url)
        page_text = bro.page_source
        return HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
    # All other responses pass through untouched.
    return response
import scrapy
from selenium import webdriver
from selenium.webdriver import ChromeOptions


class WangyinewSpider(scrapy.Spider):
    """Spider that shares one Selenium browser for JS-rendered channel pages."""
    # Hide the "controlled by automated software" infobar.
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # One browser instance shared by the whole spider.
    bro = webdriver.Chrome(
        executable_path='D:\Jpyter_notebook_work\爬虫day06\wangyinewsall\wangyinewsall\spiders\chromedriver.exe',
        options=option,
    )
    name = "wangyinew"
    # allowed_domains = ["www.asd.com"]
    start_urls = ['https://news.163.com/']
    # URLs of channel pages that the middleware must render with Selenium.
    models_url = []

    def parse(self, response):
        model_li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        # Indices of the channels of interest; only the first is used here.
        num_list = [3, 4, 6, 7]
        for num in num_list[0:1]:
            model_url = model_li_list[num].xpath('./a/@href').extract_first()
            self.models_url.append(model_url)
            yield scrapy.Request(url=model_url, callback=self.title_parse)
四、正则、xpath、beautifulsoup
爬虫中正则的使用
def detail_parse(self, response):
    """Example: extract fields from a crawled page with regular expressions."""
    news_text = response.text
    # Pull every "digest" field out of the embedded JSON-like payload.
    content_list = re.findall('"digest":"(.*?)"', news_text)


# Further standalone regex usage examples:
detail_page_text = requests.get(url=detail_url, headers=headers).text
# re.S lets '.' match newlines so the pattern can span multiple lines.
video_url = re.findall(ex, detail_page_text, re.S)[0]
for k, div in enumerate(div_list):
    title = re.findall('(.*?)', page_text)[k]
爬虫中xpath的使用
1.下载:pip install lxml
2.导包:from lxml import etree
3.将html文档或者xml文档转换成一个etree对象,然后调用对象中的方法查找指定的节点
  3.1 本地文件:tree = etree.parse(文件名)
      tree.xpath("xpath表达式")
  3.2 网络数据:tree = etree.HTML(网页内容字符串)
      tree.xpath("xpath表达式")
4.注意xpath匹配的对象均以列表的形式返回
def title_parse(self, response):
    """XPath example: extract titles and follow each detail URL via meta."""
    detailnews_div_list = response.xpath('//div[@class="ndi_main"]/div')
    for detailnews_div in detailnews_div_list[0:2]:
        title = detailnews_div.xpath('./div/div[1]/h3/a//text()').extract_first()
        detailnews_url = detailnews_div.xpath('./div/div[1]/h3/a//@href').extract_first()
        item = WangyinewsallItem()
        item['title'] = title
        # Hand the partially-filled item to the detail-page callback.
        yield scrapy.Request(url=detailnews_url, callback=self.detail_parse, meta={'item': item})
# Assorted XPath usage examples:
new_detail_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
VIEWSTATE_value = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]

from lxml import etree
page_text = requests.get(url=url, headers=headers, proxies={'http': '116.228.233.90:8082'}).text
tree = etree.HTML(page_text)
code_img_url = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
爬虫中beautifulsoup的使用
#安装环境
pip install bs4
pip install lxml
#导入beautiful包
from bs4 import BeautifulSoup
#实例化一个BeautifulSoup对象
#本地加载:
soup = BeautifulSoup(fp, 'lxml')
#网络加载:
soup = BeautifulSoup(page_text, 'lxml')  # 其中page_text为响应对象
基础巩固:
(1)根据标签名查找-soup.a 只能找到第一个符合要求的标签
(2)获取属性-soup.a.attrs 获取a所有的属性和属性值,返回一个字典- soup.a.attrs['href'] 获取href属性- soup.a['href'] 也可简写为这种形式
(3)获取内容-soup.a.string-soup.a.text-soup.a.get_text()
【注意】如果标签还有标签,那么string获取到的结果为None,而其它两个,可以获取文本内容
(4)find:找到第一个符合要求的标签- soup.find('a') 找到第一个符合要求的- soup.find('a', title="xxx")- soup.find('a', class_="xxx")- soup.find('a', id="xxx")
(5)find_all:找到所有符合要求的标签- soup.find_all('a')- soup.find_all(['a','b']) 找到所有的a和b标签- soup.find_all('a', limit=2) 限制前两个
(6)根据选择器选择指定的内容
select:soup.select('#feng')- 常见的选择器:标签选择器(a)、类选择器(.)、id选择器(#)、层级选择器
-层级选择器:
div .dudu#lala .meme .xixi 下面好多级
div > p > a >.lala 只能是下面一级
【注意】select选择器返回永远是列表,需要通过下标提取指定的对象
# Crawl novel chapters from the doupocangqiong site with requests + BeautifulSoup.
import requests
import re
import json
import os
import time
from bs4 import BeautifulSoup

url = "https://doupocangqiong1.com/486/37zw/list_1.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}
page_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')
# Each <span class="n"> wraps one chapter link.
span_list = soup.find_all('span', class_='n')
with open('神道丹尊.txt', 'wt', encoding='utf-8') as f:
    for span in span_list[0:1]:
        # Chapter title without the leading index token.
        title = ' '.join(span.a.string.split(' ')[1:])
        data_list = span.a['href'].split('/')
        cid = re.search(r'\d+', data_list[2]).group()
        data = {
            "siteid": '69',
            "bid": data_list[1],
            "cid": cid,
        }
        noval_url = "https://doupocangqiong1.com/novelsearch/chapter/transcode.html"
        json_content = requests.post(url=noval_url, headers=headers, data=data).content
        init_content = json.loads(json_content).get('info', '')
        # Strip embedded newlines from the transcoded chapter body.
        content = init_content.replace('\n', '')
        f.write(title + '\n\n')
        f.write(content)
# BeautifulSoup quick-reference examples:
soup.p.text
soup.p.string
soup.p.get_text()
soup.find_all('div')
soup.find('div', class_="song").get_text()
soup.select('.song')
soup.select('.tang > ul > li')
标签定位:
soup.标签名称:定位标签;如有多个,返回第一个符合的标签。
soup.find(tagname, attrName="value"):基于属性定位实现的标签定位,例如 soup.find('div', class_='song');class是Python关键字,因此该参数名需要加下划线写成class_,返回单个标签
soup.find_all(tagname,attrName="value")返回列表
取数据:
取文本
soup.tagname.text 取得标签下所有的标签内容
soup.tagname.get_text() 取得标签下所有的标签内容
soup.tagname.string 取得标签中直系的文本内容
取属性
soup.tagname['attrName']
select:使用选择器定位标签
标签,类,id选择器:soup.select('.song')返回列表
层级选择器:
单层级:
soup.select('.song>ul>li')
多层级:
soup.select('.song li')
五、持久化存储
settings文件中(300表示的是优先级,数值越小优先级越高):
# Enabled item pipelines; the number is the priority (lower runs first).
ITEM_PIPELINES = {
    'xioahuaPro.pipelines.XioahuaproPipeline': 300,
    # 'xioahuaPro.pipelines.MysqlPipeline': 301,
    # 'xioahuaPro.pipelines.RedisPipeline': 302,
}
#items文件中:
class WangyinewsallItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    desc = scrapy.Field()


# A project may declare multiple Item classes:
class DetailItem(scrapy.Item):
    # Job description scraped from the detail page.
    job_desc = scrapy.Field()


class FirstItem(scrapy.Item):
    # Job title scraped from the listing page.
    job_title = scrapy.Field()
#爬虫文件中:
from kuixunexam.items import KuixunexamItem


def detail_parse(self, response):
    """Split each digest into title/content and yield one item per entry."""
    news_text = response.text
    content_list = re.findall('"digest":"(.*?)"', news_text)
    for content in content_list[0:2]:
        item = KuixunexamItem()
        # Titles are wrapped in Chinese brackets: 【title】content
        title, cont = re.findall('【(.*?)】(.*)', content)[0]
        print(title, cont)
        item['new_title'] = title
        item['new_content'] = cont
        yield item
# Example 2: the item's fields are spread over two different pages.
from moviePro.items import MovieproItem


class MovieSpider(scrapy.Spider):
    name = 'movie'
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/9.html']

    def detail_parse(self, response):
        # Receive the item passed from parse() through meta.
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc
        yield item

    def parse(self, response):
        li_list = response.xpath('//div[@class="stui-pannel_bd"]/ul/li')
        for li in li_list:
            name = li.xpath('.//h4[@class="title text-overflow"]/a/text()').extract_first()
            detail_url = 'https://www.4567tv.tv' + li.xpath('.//h4[@class="title text-overflow"]/a/@href').extract_first()
            item = MovieproItem()
            item['name'] = name
            # meta is a dict; every key/value pair is handed to the callback.
            yield scrapy.Request(url=detail_url, callback=self.detail_parse, meta={'item': item})
#pipelines.py文件中,文件、mysql、redis三种存储方式
# pipelines.py — three storage backends: text file, MySQL, Redis.
import pymysql
from redis import Redis


class XioahuaproPipeline(object):
    """Write each item to a local text file."""
    fp = None

    def open_spider(self, spider):
        self.fp = open('./xiaohua.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        name = item['name']
        img_url = item['img_url']
        self.fp.write(name + ':' + img_url + '\n')
        # Returning the item hands it to the next pipeline class in line.
        return item

    def close_spider(self, spider):
        self.fp.close()


class MysqlPipeline(object):
    """Persist items into a MySQL table."""
    conn = None
    cursor = None

    def open_spider(self, spider):
        # If Chinese text cannot be stored, run:
        #   alter table tableName convert to charset utf8;
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='test')
        print(self.conn)

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # Parameterized query: the driver quotes/escapes values, which
            # avoids the SQL-injection risk of string interpolation.
            self.cursor.execute('insert into xiahua values (%s,%s)', (item['name'], item['img_url']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()


class RedisPipeline(object):
    """Push items onto a Redis list."""
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)
        print(self.conn)

    def process_item(self, item, spider):
        dic = {
            'name': item['name'],
            'img_url': item['img_url'],
        }
        print(dic)
        self.conn.lpush('xiaohua', dic)
        return item

    def close_spider(self, spider):
        pass
六、crawlspider
crawlspider就是spider的一个子类(派生),功能比spider多
之所以使用crawlspider是为了进行全栈数据爬取
crawlspider具有的机制:
连接提取器:提取连接
规则解析器:解析页面数据,解析页面源码数据
crawlspider创建爬虫文件
scrapy genspider -t crawl 文件名 域名
对于不显示的页码,可通过将follow=True取到所有页码(包括隐藏页码)
follow的作用:将连接提取器继续作用到连接提取器提取到的连接所对应的的页面源码中
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from moviePro.items import MovieproItem


class MovieSpider(CrawlSpider):
    """CrawlSpider that walks pagination links, deduplicating detail URLs in Redis."""
    conn = Redis(host='127.0.0.1', port=6379)
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']
    rules = (
        # follow=True keeps applying the extractor to every page it discovers.
        Rule(LinkExtractor(allow=r'/frim/index1-\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Extract the detail-page URL of every movie on the current page.
        li_list = response.xpath('//div[@class="stui-pannel_bd"]/ul/li')
        for li in li_list:
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()
            # sadd returns 1 when the URL is new, 0 when it was seen before.
            ex = self.conn.sadd('movie_detail_urls', detail_url)
            if ex == 1:
                print('有新数据可爬取......')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail)
            else:
                print('暂无新数据可爬取!')

    def parse_detail(self, response):
        name = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        m_type = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[1]/a[1]/text()').extract_first()
        item = MovieproItem()
        item['name'] = name
        item['m_type'] = m_type
        yield item
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bossPro.items import DetailItem, FirstItem


# Scrapes job titles from listing pages and job descriptions from detail pages.
class BossSpider(CrawlSpider):
    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.zhipin.com/c101010100/?query=python%E5%BC%80%E5%8F%91&page=1&ka=page-prev']
    # Extractor for every pagination link.
    link = LinkExtractor(allow=r'page=\d+')
    # Extractor for every job-detail link.
    link_detail = LinkExtractor(allow=r'/job_detail/.*?html')
    rules = (
        Rule(link, callback='parse_item', follow=True),
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        # Parse the job titles out of each pagination page.
        li_list = response.xpath('//div[@class="job-list"]/ul/li')
        for li in li_list:
            item = FirstItem()
            job_title = li.xpath('.//div[@class="job-title"]/text()').extract_first()
            item['job_title'] = job_title
            # print(job_title)
            yield item

    def parse_detail(self, response):
        job_desc = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div//text()').extract()
        item = DetailItem()
        job_desc = ''.join(job_desc)
        item['job_desc'] = job_desc
        yield item
七、分布式爬虫
pip install scrapy-redis
- 概念:使用多台机器组成一个分布式的机群,在机群中运行同一组程序,进行联合数据的爬取。
- 原生的scrapy是不可以实现分布式的:
  - 原生的scrapy中的调度器不可以被共享
  - 原生的scrapy的管道不可以被共享
实现分布式的核心:将所有的url放到一个调度器中,所有的机群共享一个调度器
解决办法:使用scrapy-redis(模块),实现调度器和管道共享
- 如果实现分布式就必须使用scrapy-redis(模块),它可以给原生的scrapy提供可以被共享的管道和调度器
- pip install scrapy_redis
- 搭建流程:
  - 创建工程、爬虫文件
  - 修改爬虫文件:
    - 导包:from scrapy_redis.spiders import RedisCrawlSpider
    - 将当前爬虫类的父类修改为RedisCrawlSpider
    - 将allowed_domains、start_urls删除,添加一个新属性redis_key(调度器队列的名称)
    - 数据解析,将解析的数据封装到item中然后向管道提交
  - 配置文件的编写:
    - 指定管道:
      ITEM_PIPELINES = {'scrapy_redis.pipelines.RedisPipeline': 400}
    - 指定调度器:
      #增加了一个去重容器类的配置, 使用Redis的set集合来存储请求的指纹数据, 从而实现请求去重的持久化
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
#使用scrapy-redis组件自己的调度器
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
#配置调度器是否要持久化, 也就是当爬虫结束了, 要不要清空Redis中请求队列和去重指纹的set。如果是True, 就表示要持久化存储, 就不清空数据, 否则清空数据
SCHEDULER_PERSIST = True
- 指定具体的redis:
  REDIS_HOST = 'redis服务的ip地址'
  REDIS_PORT = 6379
  REDIS_ENCODING = 'utf-8'
  REDIS_PARAMS = {'password': '123456'}
- 开启redis服务(携带redis的配置文件:redis-server ./redis.windows.conf)和客户端,并对redis的配置文件进行适当的配置:
  - #bind 127.0.0.1
  - protected-mode no
- 启动程序:scrapy runspider xxx.py
- 向调度器队列中扔入一个起始的url(redis的客户端):lpush xxx www.xxx.com,其中xxx表示的就是redis_key的属性值
#爬虫文件中
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from fbsPro.items import FbsproItem


class TestSpider(RedisCrawlSpider):
    """Distributed spider: start URLs are taken from a shared Redis queue."""
    name = 'test'
    # allowed_domains = ['www.xxx.com']
    # start_urls = ['http://www.xxx.com/']
    # Name of the scheduler queue in Redis.
    redis_key = 'dongguan'
    rules = (
        Rule(LinkExtractor(allow=r'type=4&page=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        a_list = response.xpath('//a[@class="news14"]')
        for a in a_list:
            item = FbsproItem()
            item['title'] = a.xpath('./text()').extract_first()
            yield item
settings中配置
#settings中增加配置
ITEM_PIPELINES = {
    # 'fbsPro.pipelines.FbsproPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Dedup container: request fingerprints live in a Redis set, so
# deduplication persists across runs and is shared by all nodes.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the shared scheduler shipped with scrapy-redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# True: keep the request queue and fingerprint set in Redis when the
# crawl ends; False: clear them.
SCHEDULER_PERSIST = True
REDIS_HOST = '192.168.11.154'
REDIS_PORT = 6379
启动redis
#redis的配置文件进行适当配置#bind 127.0.0.1
protected-mode yes
redis-server +指定配置的文件路径
redis-cli
多台机器启动爬虫文件
scrapy runspider test.py
redis写入起始url开始分布式爬虫
lpush dongguan 起始url(redis当中)
八、增量式爬虫
概念:通过爬虫程序监测某网站数据更新的情况,以便可以爬取到该网站更新出的新数据。
如何进行增量式的爬取工作:
在发送请求之前判断这个URL是不是之前爬取过
在解析内容后判断这部分内容是不是之前爬取过
写入存储介质时判断内容是不是已经在介质中存在
分析:
不难发现,其实增量爬取的核心是去重, 至于去重的操作在哪个步骤起作用,只能说各有利弊。在我看来,前两种思路需要根据实际情况取一个(也可能都用)。第一种思路适合不断有新页面出现的网站,比如说小说的新章节,每天的最新新闻等等;第二种思路则适合页面内容会更新的网站。第三个思路是相当于是最后的一道防线。这样做可以最大程度上达到去重的目的。
去重方法
将爬取过程中产生的url进行存储,存储在redis的set中。当下次进行数据爬取时,首先对即将要发起的请求对应的url在存储的url的set中做判断,如果存在则不进行请求,否则才进行请求。
对爬取到的网页内容进行唯一标识的制定,然后将该唯一表示存储至redis的set中。当下次爬取到网页数据的时候,在进行持久化存储之前,首先可以先判断该数据的唯一标识在redis的set中是否存在,在决定是否进行持久化存储。
1.基于url的增量式爬虫
# Spider file: URL-based incremental crawling.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from moviePro.items import MovieproItem


class MovieSpider(CrawlSpider):
    """Incremental spider: a Redis set remembers every detail URL already requested."""
    conn = Redis(host='127.0.0.1', port=6379)
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']
    rules = (
        Rule(LinkExtractor(allow=r'/frim/index1-\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Extract the detail-page URL of every movie on the current page.
        li_list = response.xpath('//div[@class="stui-pannel_bd"]/ul/li')
        for li in li_list:
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()
            # ex == 1: the URL was never requested; ex == 0: already requested.
            ex = self.conn.sadd('movie_detail_urls', detail_url)
            if ex == 1:
                print('有新数据可爬取......')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail)
            else:
                print('暂无新数据可爬取!')

    def parse_detail(self, response):
        name = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        m_type = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[1]/a[1]/text()').extract_first()
        item = MovieproItem()
        item['name'] = name
        item['m_type'] = m_type
        yield item
# Pipeline file: store items via the Redis connection created on the spider.
class MovieproPipeline(object):
    """Push each scraped movie onto the 'movie_data' Redis list."""

    def process_item(self, item, spider):
        # Reuse the connection the spider already opened.
        conn = spider.conn
        dic = {
            'name': item['name'],
            'm_type': item['m_type'],
        }
        conn.lpush('movie_data', dic)
        return item
2、基于内容的增量式爬取
- 需求:爬取糗事百科中的段子和作者数据。
# Spider file: content-based incremental crawling.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from incrementByDataPro.items import IncrementbydataproItem
from redis import Redis
import hashlib


class QiubaiSpider(CrawlSpider):
    """Dedup by content: a SHA-256 of author+content identifies each record."""
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']
    rules = (
        Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'/text/$'), callback='parse_item', follow=True),
    )
    # Redis connection used to remember fingerprints of scraped records.
    conn = Redis(host='127.0.0.1', port=6379)

    def parse_item(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            item = IncrementbydataproItem()
            item['author'] = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
            item['content'] = div.xpath('.//div[@class="content"]/span/text()').extract_first()
            # Build a unique fingerprint of the record and store it in Redis.
            source = item['author'] + item['content']
            source_id = hashlib.sha256(source.encode()).hexdigest()
            # sadd returns 1 for a new fingerprint, 0 for a known one.
            ex = self.conn.sadd('data_id', source_id)
            if ex == 1:
                print('该条数据没有爬取过,可以爬取......')
                yield item
            else:
                print('该条数据已经爬取过了,不需要再次爬取了!!!')
# Pipeline file
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class IncrementbydataproPipeline(object):
    """Push each scraped record onto the 'qiubaiData' Redis list."""
    conn = None

    def open_spider(self, spider):
        # Imported lazily so the module does not require redis at import time.
        from redis import Redis
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {
            'author': item['author'],
            'content': item['content'],
        }
        # print(dic)
        self.conn.lpush('qiubaiData', dic)
        return item
分类: 爬虫