1. Basic usage of the Scrapy framework
-
Project creation
Using the terminal:
1) cd …/… (change into the target directory)
2) scrapy startproject project_name (create the Scrapy project)
3) cd project_name
4) scrapy genspider spider_name www.xxx.com (generate a spider file)
5) Run the spider: scrapy crawl spider_name
6) Persistent storage: scrapy crawl spider_name -o output_path

first.py (the generated spider file):

```
import scrapy


class FirstSpider(scrapy.Spider):
    # name of the spider
    name = 'first'
    # allowed domains: restrict which URLs from start_urls may actually be requested
    # allowed_domains = ['www.xxx.com']
    # initial URL list: every URL in it is requested by Scrapy automatically
    start_urls = ['https://www.baidu.com/', 'https://www.sogou.com/']

    # used for data parsing: response is the response object of the corresponding request
    # parse is called once for every element of start_urls
    def parse(self, response):
        print(response)
```
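Besides the scrapy crawl command, a spider can also be launched from a plain Python script. A minimal sketch, assuming the project is named FirstBlood and FirstSpider lives in the generated spiders package (the import path below is an assumption):

```
from scrapy.crawler import CrawlerProcess

from FirstBlood.spiders.first import FirstSpider  # assumed import path


if __name__ == '__main__':
    # CrawlerProcess starts the Twisted reactor and runs the spider in this process
    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(FirstSpider)
    process.start()  # blocks until the crawl finishes
```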
-
Scrapy data parsing
Example: parse the jokes on 糗事百科 (qiushibaike.com)
```
import scrapy


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        all_data = []  # holds all parsed records
        div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
        for div in div_list:
            # extract() pulls out the string stored in the data attribute of a Selector object
            author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            # calling extract() on the list extracts the data string of every Selector in it
            content = div.xpath('./a[1]/div/span//text()').extract()
            content = "".join(content)
            dic = {
                "author": author,
                "content": content
            }
            all_data.append(dic)
        return all_data
```
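Because parse() returns a list of dicts, this spider works directly with terminal-based storage, e.g. scrapy crawl baidu -o qiubai.csv. To illustrate the extract() / extract_first() behaviour mentioned in the comments above, here is a small standalone sketch (the HTML snippet is made up for the example):

```
from scrapy import Selector

html = '<div><span>hello</span><span>world</span></div>'  # made-up HTML
sel = Selector(text=html)

spans = sel.xpath('//span/text()')      # a SelectorList; each element wraps one matched node
print(spans.extract())                  # ['hello', 'world'] - every matched string
print(spans.extract_first())            # 'hello' - only the first match (None if no match)
print(spans[0].extract())               # 'hello' - extract from a single Selector
```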
2. Pipeline-based persistent storage
1) Parse the data
2) Define the corresponding fields in the Item class
3) Package the parsed data into an Item object
4) Submit the Item object to the pipeline for persistent storage
5) In the pipeline class's process_item, persist the data carried by the received item object
6) Enable the pipeline in the settings file
main.py:

```
import scrapy
from FirstBlood.items import FirstbloodItem


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
        for div in div_list:
            # extract() pulls out the string stored in the data attribute of a Selector object
            author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            # calling extract() on the list extracts the data string of every Selector in it
            content = div.xpath('./a[1]/div/span//text()').extract()
            content = "".join(content)
            item = FirstbloodItem()
            item['author'] = author
            item['content'] = content
            # submit the item to the pipeline
            yield item
```
items.py:

```
import scrapy


class FirstbloodItem(scrapy.Item):
    # define the fields for your item here like:
    author = scrapy.Field()
    content = scrapy.Field()
```
pipelines.py:

```
import pymysql


class FirstbloodPipeline:
    fp = None

    # override of the parent-class method: called only once, when the spider starts
    def open_spider(self, spider):
        print("start crawling")
        self.fp = open('./qiubai.txt', 'w', encoding='utf-8')

    # dedicated to handling item objects
    # receives the item objects submitted by the spider file
    # called once for every item it receives
    def process_item(self, item, spider):
        author = item['author']
        content = item['content']
        self.fp.write(author + ':' + content + '\n')
        return item

    def close_spider(self, spider):
        print("finished crawling")
        self.fp.close()


# store into MySQL
class mysqlPipeLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.connect(host="localhost", user="root", password="1234567",
                                    database="python_db", port=3306, charset='utf8')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # parameterized query; pymysql handles the quoting
            self.cursor.execute('insert into qiubai values (%s, %s)',
                                (item["author"], item["content"]))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
```
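The mysqlPipeLine above assumes that the python_db database already contains a two-column qiubai table matching its INSERT statement. A minimal setup sketch; the column names and types are assumptions, not taken from the original notes:

```
import pymysql

conn = pymysql.connect(host="localhost", user="root", password="1234567",
                       database="python_db", port=3306, charset='utf8')
with conn.cursor() as cursor:
    # hypothetical schema: two text columns, matching "insert into qiubai values (%s, %s)"
    cursor.execute('CREATE TABLE IF NOT EXISTS qiubai (author VARCHAR(100), content TEXT)')
conn.commit()
conn.close()
```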
settings.py:

```
# Lower number = higher priority: items pass through the pipelines in ascending order,
# and each process_item must return the item so that the next pipeline receives it.
ITEM_PIPELINES = {
    'FirstBlood.pipelines.FirstbloodPipeline': 300,
    'FirstBlood.pipelines.mysqlPipeLine': 301,
}
```
3. Full-site data crawling with a Spider
That is, crawling the data of every page under a given section of a website.
Implementation options:
1) Add the URL of every page to the start_urls list (not recommended)
2) Send the requests for the remaining pages manually (used below)
Example: crawl the picture names from 彼岸图网 (pic.netbian.com)
```
import scrapy
from fengjing.items import FengjingItem


class TupianSpider(scrapy.Spider):
    name = 'tupian'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://pic.netbian.com/4kfengjing/']
    # generic URL template for the remaining pages
    url = 'https://pic.netbian.com/4kfengjing/index_%d.html'
    page_num = 2

    def parse(self, response):
        li_list = response.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_list:
            name = li.xpath('./a/b/text()').extract()
            name = "".join(name)
            print(name)
            item = FengjingItem()
            item['name'] = name
            yield item
        if self.page_num <= 5:
            new_url = self.url % self.page_num
            self.page_num += 1
            # send the request manually; callback is the function used for data parsing
            yield scrapy.Request(url=new_url, callback=self.parse)
```
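The same pagination can also be written with response.follow, which resolves a relative URL against the current response for you. A sketch of that variant (not from the original notes; the spider name is hypothetical):

```
import scrapy


class TupianFollowSpider(scrapy.Spider):
    # hypothetical variant of the spider above, using response.follow for pagination
    name = 'tupian_follow'
    start_urls = ['https://pic.netbian.com/4kfengjing/']
    page_num = 2

    def parse(self, response):
        for li in response.xpath('//*[@id="main"]/div[3]/ul/li'):
            yield {'name': "".join(li.xpath('./a/b/text()').extract())}
        if self.page_num <= 5:
            next_page = 'index_%d.html' % self.page_num
            self.page_num += 1
            # response.follow builds the absolute URL from the relative path
            yield response.follow(next_page, callback=self.parse)
```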
4. Image crawling with ImagesPipeline
Difference between crawling string data and image data with Scrapy:
1) Strings: just parse them with XPath and submit them to a pipeline for persistent storage.
2) Images: parse out the src attribute of the img tag with XPath, then send a separate request to that image URL to obtain the binary image data.
ImagesPipeline: you only need to parse out the img src attribute and submit it to the pipeline; the pipeline then sends the request to that src and fetches the binary image data for you.
-
Example: crawl the pictures from 彼岸图网 (pic.netbian.com)
main.py:

```
import scrapy
from bian.items import BianItem


class TupianSpider(scrapy.Spider):
    name = 'tupian'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://pic.netbian.com/4kfengjing/']

    def parse(self, response):
        li_list = response.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_list:
            src = 'https://pic.netbian.com' + li.xpath('./a/img/@src').extract_first()
            item = BianItem()
            item['src'] = src
            # submit the item to the pipeline
            yield item
            print(src)
```
items.py:

```
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class BianItem(scrapy.Item):
    # define the fields for your item here like:
    src = scrapy.Field()
```
pipelines.py:

```
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import scrapy
from scrapy.pipelines.images import ImagesPipeline


# class BianPipeline:
#     def process_item(self, item, spider):
#         return item


class imagesPipeline(ImagesPipeline):
    # issue the request for the image URL stored in the item
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['src'])

    # customize the downloaded image's file name
    def file_path(self, request, response=None, info=None, *, item=None):
        file_name = request.url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        # the return value is handed on to the next pipeline class to be executed
        return item
```
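For this pipeline to actually download anything, it has to be registered in settings.py and IMAGES_STORE has to point at a directory for the downloaded files (ImagesPipeline also requires the Pillow library). A minimal sketch; the ./imgs path is an assumption:

```
# settings.py (relevant parts only)
ITEM_PIPELINES = {
    'bian.pipelines.imagesPipeline': 300,
}
# directory where ImagesPipeline stores the downloaded images (assumed path)
IMAGES_STORE = './imgs'
```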