-
Introduction
Scrapy is an application framework written for crawling websites and extracting structured data. It can be used in a wide range of programs for data mining, information processing, or archiving historical data.
It was originally designed for page scraping (more precisely, web scraping), but it can also be used to fetch data returned by APIs (such as Amazon Associates Web Services) or as a general-purpose web crawler. Scrapy has broad applications: data mining, monitoring, and automated testing. Scrapy uses the Twisted asynchronous networking library to handle network communication. The overall workflow is roughly as follows:
1) The engine finds the spider to run, calls its start_requests method, and obtains an iterator.
2) Iterating over it produces Request objects; each Request encapsulates the URL to visit and a callback function.
3) All Request objects (tasks) are placed into the scheduler, to be downloaded by the downloader later.
4) The downloader fetches pending tasks (Request objects) from the scheduler and, once the download completes, invokes the callback.
5) Control returns to the spider's callback, which may yield Request() or yield Item() (see the sketch below).
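To make the cycle concrete, here is a minimal sketch of a spider driving steps 1-5; the spider name, URL, and XPath are illustrative assumptions, not part of the original notes:

import scrapy
from scrapy.http import Request


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider; Spider.start_requests (step 1) builds Requests
    # from start_urls with self.parse as the default callback.
    name = "example"
    start_urls = ["https://example.com/"]

    def parse(self, response):
        # Step 5: the callback can yield new Requests, which go back
        # to the scheduler (step 3) for the downloader to fetch...
        for href in response.xpath("//a/@href").extract():
            yield Request(url=response.urljoin(href), callback=self.parse)
        # ...or yield items, which are handed to the item pipelines.
        yield {"url": response.url}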
-
Installation
- Windows:
    # Install the Twisted dependency; download a matching wheel from http://www.lfd.uci.edu/~gohlke/pythonlibs/
    pip3 install wheel            # enables installing .whl packages
    pip install Twisted-18.4.0-cp36-cp36m-win_amd64.whl
    # Install the pywin32 dependency
    pip3 install pywin32
    # Install scrapy
    pip3 install scrapy
- Linux:
    pip3 install scrapy
-
Creating a project
# Create a project
scrapy startproject <project name>
cd <project name>
# Create spiders (using chouti as an example)
scrapy genspider chouti chouti.com
scrapy genspider cnblogs cnblogs.com
# Run a spider
scrapy crawl chouti
-
Project layout
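For reference, scrapy startproject generates a layout along these lines (the project name first_scrapy matches the later examples):

first_scrapy/
    scrapy.cfg              # deployment configuration
    first_scrapy/
        __init__.py
        items.py            # Item definitions
        middlewares.py      # downloader / spider middlewares
        pipelines.py        # item pipelines
        settings.py         # project settings
        spiders/
            __init__.py
            chouti.py       # created by scrapy genspider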
-
Basic usage
# -*- coding: utf-8 -*-
import scrapy

# import sys, io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="gb18030")  # Windows console encoding workaround


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        f = open("new.log", mode="a+")
        # //div searches among all descendants for div
        content_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for item in content_list:
            text = item.xpath(".//a/text()").extract_first()
            href = item.xpath(".//a/@href").extract_first()
            f.write(href + "\n")
        f.close()
Note: parse receives the response data.
-
Recursive crawling with pagination
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        f = open("new.log", mode="a+")
        # //div searches among all descendants for div
        content_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for item in content_list:
            text = item.xpath(".//a/text()").extract_first()
            href = item.xpath(".//a/@href").extract_first()
            f.write(href + "\n")
        f.close()

        # Pagination
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            yield Request(url=page, callback=self.parse)

Note: yield Request(url=page, callback=self.parse) issues another request; callback=self.parse names the callback function.
-
response encapsulates all data related to the response:
- response.text
- response.encoding
- response.body
- response.request  # the request that produced this response; a request encapsulates the URL to visit and the function to run once the download completes
-
XPath selectors
response.xpath('//div[@href="x1"]/a').extract_first()    # first a tag under divs whose href attribute is "x1"
response.xpath('//div[@href="x1"]/a').extract()          # all a tags under divs whose href attribute is "x1"
response.xpath('//div[@href="x1"]/a/text()').extract()   # text of all those a tags
response.xpath('//div[@href="x1"]/a/@href').extract()    # href attribute values of all those a tags

tag_list = response.xpath('//div[@href="x1"]/a')
for tag in tag_list:
    tag.xpath('.//p/text()').extract_first()  # text of the first p tag under each a tag
-
Persistence
a. First, write the pipeline class
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem


class FirstScrapyPipeline(object):
    def __init__(self, path):
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called at initialization, to create the pipeline object.
        :param crawler:
        :return:
        """
        path = crawler.settings.get("HREF_FILE_PATH")
        return cls(path)

    def open_spider(self, spider):
        """
        Called when the spider starts.
        :param spider:
        :return:
        """
        self.f = open(self.path, "a+")

    def process_item(self, item, spider):
        """
        Called for each item during crawling.
        :param item:
        :param spider:
        :return:
        """
        if spider.name == "chouti":
            self.f.write(item["href"] + "\n")
        return item  # hand the item to the next pipeline's process_item method
        # raise DropItem()  # later pipelines' process_item methods would not run

    def close_spider(self, spider):
        """
        Called when the spider finishes.
        :param spider:
        :return:
        """
        self.f.close()
b. Write the Item class
class XdbItem(scrapy.Item):
    href = scrapy.Field()
    title = scrapy.Field()
c. Configuration
ITEM_PIPELINES = {
    'xdb.pipelines.XdbPipeline': 300,  # priority
}
d. In the spider: each time yield runs on an item, process_item is called once.
yield FirstScrapyItem(text=text, href=href)
pipelines.py (identical to the pipeline class shown in step a above)
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class FirstScrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    href = scrapy.Field()
    text = scrapy.Field()
settings.py
ITEM_PIPELINES = {
    'first_scrapy.pipelines.FirstScrapyPipeline': 300,
}
chouti.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from ..items import FirstScrapyItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        # f = open("new.log", mode="a+")
        # //div searches among all descendants for div
        content_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for item in content_list:
            text = item.xpath(".//a/text()").extract_first()
            href = item.xpath(".//a/@href").extract_first()
            # f.write(href + "\n")
            yield FirstScrapyItem(text=text, href=href)
        # f.close()

        # Pagination
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            yield Request(url=page, callback=self.parse)
-
Deduplication rules
a. Write the class
from scrapy.dupefilter import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class XdbDupeFilter(BaseDupeFilter):

    def __init__(self):
        self.visited_fd = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        fd = request_fingerprint(request=request)  # hashes the request into a fingerprint
        if fd in self.visited_fd:  # if the fingerprint is already in the set, report the request as seen
            return True
        self.visited_fd.add(fd)

    def open(self):  # can return deferred
        print('start')

    def close(self, reason):  # can return a deferred
        print('end')

    # def log(self, request, spider):  # log that a request has been filtered
    #     print('log')
b. Configuration
# Override the default dedup rule
# DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
DUPEFILTER_CLASS = 'xdb.dupefilters.XdbDupeFilter'  # path to the class written above
c. Usage in the spider:
class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def parse(self, response):
        print(response.request.url)
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            from scrapy.http import Request
            page = "https://dig.chouti.com" + page
            yield Request(url=page, callback=self.parse, dont_filter=False)   # e.g. https://dig.chouti.com/all/hot/recent/2
            # yield Request(url=page, callback=self.parse, dont_filter=True)  # dont_filter=True bypasses the dedup rule

Notes:
- implement the logic in request_seen correctly
- dont_filter=False  # respect the dedup rule
-
Depth
request.meta.get("depth", 0)  # get the current depth

In settings:

# Limit the crawl depth
DEPTH_LIMIT = 3

Depth and priority:
- Depth
    - starts at 0
    - each time a Request is yielded, its depth is the originating request's depth + 1
    - setting: DEPTH_LIMIT caps the depth
- Priority
    - the priority with which a request is downloaded -= depth * DEPTH_PRIORITY
    - setting: DEPTH_PRIORITY
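As one concrete use of these knobs: the Scrapy documentation suggests that a positive DEPTH_PRIORITY combined with FIFO scheduler queues approximates a breadth-first crawl. A settings.py sketch, with values taken from that recipe:

# Breadth-first-ish crawl: deeper requests get lower priority,
# and FIFO queues process same-priority requests in arrival order.
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'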
-
Cookies (upvoting on chouti)
urlencode automatically joins a dict into k1=1&k2=2:
from urllib.parse import urlencode

d = {
    "k1": 1,
    "k2": 2
}
ret = urlencode(d)
settings.py (put the USER_AGENT request header into settings):

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
chouti.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy.http.cookies import CookieJar


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def parse(self, response):
        """
        Handles the content returned by the first visit to chouti.
        :param response:
        :return:
        """
        # Extract the cookies from the response headers into a CookieJar object
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)

        # Parse the cookies out of the jar into a plain dict
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value

        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            body="phone=8613121758648&password=woshiniba&oneMonth=1",  # could also be built with urlencode({...})
            cookies=self.cookie_dict,
            headers={
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
            },
            callback=self.check_login
        )

    def check_login(self, response):
        print(response.text)
        yield Request(
            url='https://dig.chouti.com/all/hot/recent/1',
            cookies=self.cookie_dict,
            callback=self.index
        )

    def index(self, response):
        news_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for new in news_list:
            link_id = new.xpath('.//div[@class="part2"]/@share-linkid').extract_first()
            yield Request(
                url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                method='POST',
                cookies=self.cookie_dict,
                callback=self.check_result
            )

        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            yield Request(url=page, callback=self.index)  # e.g. https://dig.chouti.com/all/hot/recent/2

    def check_result(self, response):
        print(response.text)
Method 2 (meta={'cookiejar': True}):

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = "chouti"
    allowed_domains = ["chouti.com"]
    start_urls = (
        'http://www.chouti.com/',
    )

    def start_requests(self):
        url = 'http://dig.chouti.com/'
        yield Request(url=url, callback=self.login, meta={'cookiejar': True})

    def login(self, response):
        print(response.headers.getlist('Set-Cookie'))
        req = Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8613121758648&password=woshiniba&oneMonth=1',
            callback=self.check_login,
            meta={'cookiejar': True}
        )
        yield req

    def check_login(self, response):
        print(response.text)
-
start_urls
chouti.py:

def start_requests(self):
    import os
    os.environ["HTTP_PROXY"] = "http://root:sss@1.1.1.1"  # set a proxy
    os.environ["HTTPS_PROXY"] = "1.1.1.1"
    for url in self.start_urls:
        yield Request(url=url, callback=self.parse)

Tip: start_requests can be customized, for example to fetch the start URLs from Redis (see the sketch below).
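A minimal sketch of that Redis tip, assuming a locally running Redis server and a list key named start_urls:chouti that something else pushes URLs into (both the key name and connection details are illustrative):

import redis
import scrapy
from scrapy import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'

    def start_requests(self):
        # Pull start URLs from a hypothetical Redis list instead of
        # hard-coding them in start_urls.
        conn = redis.Redis(host='127.0.0.1', port=6379)
        while True:
            url = conn.lpop('start_urls:chouti')  # illustrative key name
            if url is None:
                break
            yield Request(url=url.decode('utf-8'), callback=self.parse)

    def parse(self, response):
        print(response.url)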
-
Setting proxies in Scrapy
Built-in
Set the proxy in os.environ before the spider starts issuing requests; Scrapy's built-in HttpProxyMiddleware picks these variables up.

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def start_requests(self):
        import os
        os.environ['HTTPS_PROXY'] = "http://root:woshiniba@192.168.11.11:9999/"
        os.environ['HTTP_PROXY'] = '19.11.2.32'
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

Or per request via meta:

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse, meta={'proxy': 'http://root:woshiniba@192.168.11.11:9999/'})
Custom (via downloader middleware)
Method 1:

import base64
import random
from six.moves.urllib.parse import unquote, urlunparse
try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from scrapy.utils.python import to_bytes


class XdbProxyMiddleware(object):

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)), encoding='latin-1')
        return base64.b64encode(user_pass).strip()

    def process_request(self, request, spider):
        PROXIES = [
            "http://root:woshiniba@192.168.11.11:9999/",
            "http://root:woshiniba@192.168.11.12:9999/",
            "http://root:woshiniba@192.168.11.13:9999/",
            "http://root:woshiniba@192.168.11.14:9999/",
            "http://root:woshiniba@192.168.11.15:9999/",
            "http://root:woshiniba@192.168.11.16:9999/",
        ]
        url = random.choice(PROXIES)
        orig_type = ""
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        request.meta['proxy'] = proxy_url
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
Method 2:
import base64
import random
from scrapy.utils.python import to_bytes


class DdbProxyMiddleware(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '111.11.228.75:80', 'user_pass': ''},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
            {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
        ]
        proxy = random.choice(PROXIES)
        if proxy['user_pass']:  # only add auth when credentials are actually present
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = to_bytes('Basic ') + encoded_user_pass
        else:
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
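Whichever variant you use, the middleware only runs once it is registered in settings.py; a sketch, assuming the class lives in a module such as xdb.proxy (the module path and priority value are illustrative):

DOWNLOADER_MIDDLEWARES = {
    'xdb.proxy.XdbProxyMiddleware': 751,
}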