Scrapy basic usage
Install Scrapy
pip install pyopenssl
pip install Twisted
pip install scrapy
Create a project
Open a shell in the folder where the project should live
scrapy startproject <project_name>
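The command generates roughly the following skeleton (details vary slightly between Scrapy versions):
<project_name>/
    scrapy.cfg            # deploy configuration
    <project_name>/
        __init__.py
        items.py          # item definitions
        middlewares.py    # downloader / spider middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/
            __init__.py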
Create a spider
Switch to the project directory
cd <project_name>
scrapy genspider <spider_name> "<target_domain>"
scrapy genspider -t crawl <spider_name> "<target_domain>"
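For reference, the plain genspider command produces roughly this skeleton (the names below are placeholders):
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"                      # <spider_name>
    allowed_domains = ["example.com"]     # <target_domain>
    start_urls = ["http://example.com/"]

    def parse(self, response):
        pass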
Start with a manager script
Add a manager.py module in the directory named after the project
from scrapy.cmdline import execute
execute('scrapy crawl <spider_name>'.split())
Exiting the spider from inside the spider
spider.crawler.engine.close_spider(spider, "主动退出爬虫")
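A minimal sketch of calling this from inside a callback (the stop condition is made up for illustration):
import scrapy

class AqiSpider(scrapy.Spider):
    name = 'aqi'

    def parse(self, response):
        if not response.xpath('//li'):    # hypothetical stop condition
            # ask the engine to shut this spider down
            self.crawler.engine.close_spider(self, "主动退出爬虫")
            return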
settings module configuration
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
ROBOTSTXT_OBEY = False  # do not obey robots.txt
SCHEDULER_PERSIST = True  # keep the Redis queue on close so the crawl can be paused and resumed (scrapy_redis)
DOWNLOAD_DELAY = 3  # delay between downloads
# distributed dedup filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# distributed scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
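The pipelines and middlewares written below also have to be registered in settings; a sketch (the module name myproject, the priority numbers and the MONGO_* values are illustrative):
ITEM_PIPELINES = {
    "myproject.pipelines.DataSourcePipeline": 100,   # lower number = runs earlier
    "myproject.pipelines.JsonPipeline": 200,
    "myproject.pipelines.MongoPipeline2": 300,
}

DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.ChromeMiddleware": 543,
}

# values read by MongoPipeline2.from_crawler
MONGO_URI = "mongodb://127.0.0.1:27017"
MONGO_DATABASE = "MongoAQI"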
pipelines
import json
import datetime

import redis
import pymongo
from scrapy.exporters import CsvItemExporter


class DataSourcePipeline(object):
    """Pipeline that adds data-source info; should run first."""

    def process_item(self, item, spider):
        item['data_source'] = spider.name
        item['data_time'] = str(datetime.datetime.utcnow())
        return item


class JsonPipeline(object):
    """Pipeline that writes items to a JSON-lines file."""

    def open_spider(self, spider):
        self.file = open("aqi.json", 'w')

    def process_item(self, item, spider):
        str_item = json.dumps(dict(item)) + '\n'
        self.file.write(str_item)
        return item

    def close_spider(self, spider):
        self.file.close()


class CsvPipeline(object):
    """Pipeline that writes items to a CSV file."""

    def open_spider(self, spider):
        # CsvItemExporter expects a binary file object
        self.file = open("aqi.csv", 'wb')
        self.csv_writer = CsvItemExporter(self.file)
        self.csv_writer.start_exporting()

    def process_item(self, item, spider):
        self.csv_writer.export_item(item)
        return item

    def close_spider(self, spider):
        # finish exporting before closing the underlying file
        self.csv_writer.finish_exporting()
        self.file.close()


class RedisPipeline(object):
    """Pipeline that stores items in Redis."""

    def open_spider(self, spider):
        self.client = redis.Redis("127.0.0.1", 6379)

    def process_item(self, item, spider):
        # Redis stores strings/bytes, so serialize the dict first
        self.client.lpush("AQI_List", json.dumps(dict(item)))
        return item


class MongoPipeline1(object):
    """Pipeline that stores items in MongoDB."""

    def open_spider(self, spider):
        self.client = pymongo.MongoClient("127.0.0.1", 27017)
        self.db = self.client['MongoAQI']
        self.collections = self.db['aqi']

    def process_item(self, item, spider):
        self.collections.insert_one(dict(item))
        return item
class MongoPipeline2(object):
    """Pipeline that stores items in MongoDB, configured from settings."""

    collection_name = 'users'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # replace the whole document if it exists, insert it otherwise (upsert)
        self.db[self.collection_name].replace_one(
            {'url_token': item['url_token']}, dict(item), upsert=True)
        return item
The from_crawler(cls, crawler) method
Initializes the pipeline from values in settings; they are passed straight into __init__:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("DB", "items_{}".format(datetime.date.today()))
        )
The process_item(self, item, spider) method
Intercepts each item before it reaches the storage pipelines, so it can be processed there first, e.g. cleaned or deduplicated.
It must return the item (or raise DropItem to discard it).
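A sketch of such a cleaning/dedup pipeline (the DedupPipeline name and the url_token key field are just examples); dropping an item keeps it out of all later pipelines:
from scrapy.exceptions import DropItem

class DedupPipeline(object):
    """Drop items whose key field has already been seen."""

    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        key = item.get('url_token')          # example key field
        if key in self.seen:
            raise DropItem("duplicate item: {}".format(key))
        self.seen.add(key)
        return item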
middlewares
import time

import scrapy
from selenium import webdriver
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class ChromeMiddleware(UserAgentMiddleware):

    def __init__(self, user_agent):
        super().__init__()
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        return cls(user_agent=crawler.settings.get('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        # only this page needs to be rendered by a real browser
        url = request.url
        if url == "https://www.aqistudy.cn/historydata/":
            driver = webdriver.Chrome()
            driver.get(url)
            time.sleep(2)
            data = driver.page_source
            driver.quit()
            # returning an HtmlResponse short-circuits the normal download
            return scrapy.http.HtmlResponse(
                url=url, body=data.encode("utf-8"), encoding="utf-8", request=request)
process_exception(self, request, exception, spider)
Catches exceptions raised while downloading so they can be handled,
e.g. by setting request.meta["proxy"] = "ip:port" and returning the request so it is retried.
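Put together, a sketch of such a retry-through-proxy middleware (the class name and the proxy address are placeholders):
class ProxyRetryMiddleware(object):
    """Downloader middleware: on a download error, retry the request through a proxy."""

    def process_exception(self, request, exception, spider):
        spider.logger.warning("download failed: %s", exception)
        request.meta["proxy"] = "http://ip:port"   # placeholder proxy address
        # returning the request re-schedules it for download
        return request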
Serial spider
import scrapy
from 项目名.items import Item


class AqiSpider(scrapy.Spider):
    name = 'aqi'
    allowed_domains = []
    start_urls = []

    # pipelines used only by this spider
    custom_settings = {
        'ITEM_PIPELINES': {
            'FxtDataAcquisition.pipelines.CityPipeline': 300,
        }
    }

    def parse(self, response):
        names_list = response.xpath('//li/text()').extract()
        links_list = response.xpath('//li/a/@href').extract()
        item = Item()
        for link, name in zip(links_list, names_list):
            item['city_name'] = name
            url = 'https://www.aqistudy.cn/historydata/' + link
            # pass the item along to the next request via meta
            yield scrapy.Request(url=url, meta={"itemkey": item}, callback=self.detail_parse)

    def detail_parse(self, response):
        # take the item back out of meta
        item = response.meta['itemkey']
        # grab every tr on the detail page
        tr_list = response.xpath('//tr')
        for tr in tr_list:
            item['date'] = tr.xpath('./td[1]/text()').extract_first()
            yield item
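Note that the code above shares one Item instance across all requests and all rows, so later responses can overwrite values before an item is exported; a common workaround (a sketch, not from the original) is to build a fresh Item per city in parse and copy it for each yielded row:
import copy

class AqiSpider(scrapy.Spider):
    # ... parse() creates a new Item() inside the loop for each city ...

    def detail_parse(self, response):
        item = response.meta['itemkey']
        for tr in response.xpath('//tr'):
            row_item = copy.deepcopy(item)   # independent copy for each row
            row_item['date'] = tr.xpath('./td[1]/text()').extract_first()
            yield row_item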
Parallel CrawlSpider
# coding: utf-8
import scrapy
from Xxx.items import Item
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class XxxSpider(CrawlSpider):
    name = ''
    allowed_domains = []
    start_urls = []

    # items cannot be passed between rules and matched up,
    # so usually only the last level of pages is parsed
    # by default, follow=False as soon as a callback is given
    rules = (
        # extract the second-level urls
        Rule(LinkExtractor(allow=r"month\.php\?city=")),
        # extract the third-level urls
        Rule(LinkExtractor(allow=r"day\.php\?city="), callback="day_parse", follow=False),
    )

    def day_parse(self, response):
        item = Item()
        tr_list = response.xpath('//tr')
        for tr in tr_list:
            item['date'] = tr.xpath('./td[1]/text()').extract_first()
            yield item
Overriding make_requests_from_url(self, url)
    def make_requests_from_url(self, url):
        # give the start requests a download timeout
        return scrapy.Request(url=url, meta={"download_timeout": 10})
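make_requests_from_url is deprecated in newer Scrapy releases; the same effect can be had by overriding start_requests (a sketch):
    def start_requests(self):
        for url in self.start_urls:
            # 10 second per-request download timeout
            yield scrapy.Request(url=url, meta={"download_timeout": 10}, callback=self.parse)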
Objects that need to be reachable everywhere (a browser instance, for example) can be created in the spider's __init__ method and accessed through the spider object.
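A sketch of holding a shared WebDriver on the spider and releasing it when the spider closes (the spider name RenderSpider is hypothetical; assumes selenium and a Chrome driver are installed):
import scrapy
from selenium import webdriver

class RenderSpider(scrapy.Spider):
    name = 'render'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # one shared browser, reachable from middlewares and callbacks as spider.browser
        self.browser = webdriver.Chrome()

    def closed(self, reason):
        # called once when the spider finishes; release the browser
        self.browser.quit()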
Content parsing
XPath and CSS selector reference:
https://blog.csdn.net/mouday/article/details/80455560
WebDriver actions and parsing:
https://blog.csdn.net/qq_38284543/article/details/75267168
https://blog.csdn.net/fkew2009/article/details/83501991
Scrapy deduplication
The URL is normalized first:
from w3lib.url import canonicalize_url
url = canonicalize_url(url)
Then check whether the URL's fingerprint is already in the set of completed URLs;
if it is not, build a Request object and push it onto the request queue.
from hashlib import md5

def get_fingerprint(url):
    standard_url = canonicalize_url(url)
    md = md5()
    md.update(standard_url.encode('utf8'))
    finger = md.hexdigest()
    return finger

After a successful visit, add the fingerprint to the completed set;
if the fingerprint is already in the set, the URL is a duplicate.
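Putting the pieces together, a sketch of that check (the in-memory set and helper name are illustrative; in practice the set would live in something like Redis):
import scrapy

seen_fingerprints = set()   # fingerprints of URLs already crawled

def maybe_make_request(url, callback):
    """Return a Request only if this URL's fingerprint has not been seen yet."""
    if get_fingerprint(url) in seen_fingerprints:
        return None   # duplicate, skip it
    return scrapy.Request(url=url, callback=callback)

# in the callback, after the page was fetched successfully:
#     seen_fingerprints.add(get_fingerprint(response.url))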