Common Scrapy Spider Code and a Personal Understanding of Some Errors Not Caused by Anti-Crawling Features
I. Common Code
1. Initial creation commands
# Create a project
scrapy startproject <project_name>  # e.g. scrapy startproject fang_spider
scrapy genspider <spider_name> '<domain>'  # e.g. scrapy genspider fang 'fang.com'

# Launcher file: create it in the project directory, paste in the code below,
# and running the file starts the spider.
# "fang" on the second line is the name of the spider you created.
from scrapy import cmdline
cmdline.execute("scrapy crawl fang".split())
2. Common request headers and middleware setup
① Common request headers
If you need more, you can go here and copy them: http://www.useragentstring.com
user_agent = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2919.83 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2762.73 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686 on x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2820.59 Safari/537.36"
]
② middlewares.py: set a random request header for every request
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class FangSpiderSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class FangSpiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    user_agent = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2919.83 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2762.73 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686 on x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2820.59 Safari/537.36"
    ]

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        # Pick a random User-Agent for every outgoing request
        useragent = random.choice(self.user_agent)
        request.headers["User-Agent"] = useragent
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
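For the random User-Agent middleware to take effect it has to be enabled in settings.py. A minimal sketch, assuming the project module is named fang_spider as in the creation example above:

# settings.py
# Smaller numbers run closer to the engine; 543 is the template's default slot
DOWNLOADER_MIDDLEWARES = {
    "fang_spider.middlewares.FangSpiderDownloaderMiddleware": 543,
}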
3. pipelines.py: asynchronous database insertion, using MySQL as an example
You only need to change the database configuration, the item fields being read, and the SQL statement.
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from twisted.enterprise import adbapi
from pymysql import cursors


class newFangSpiderPipeline:
    x = 0  # simple counter for how many records have come through

    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'password',
            'database': 'fangtianxia',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        # adbapi wraps pymysql in a Twisted thread pool, so inserts run asynchronously
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        self.x += 1
        print('*-*' * 10, 'record {} came in++++++'.format(self.x))
        if not self._sql:
            self._sql = """
                insert into newhouse(id, name, province, city, price, areas, state, style, address, ori_url)
                values (null, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql

    def process_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item  # pass the item on to any later pipelines

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (
            item['name'], item['province'], item['city'], item['price'], item['areas'], item['state'],
            item['style'], item['address'], item['ori_url']))
        # no explicit commit needed: runInteraction commits when the interaction succeeds

    def handle_error(self, error, item, spider):
        print('=^=' * 5, 'error_start:')
        print(error)
        print('=^=' * 5, 'error_end')
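Also remember to register the pipeline, as the comment at the top of the file says. A minimal sketch, again assuming the project module is fang_spider:

# settings.py
ITEM_PIPELINES = {
    "fang_spider.pipelines.newFangSpiderPipeline": 300,
}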
4. Using Selenium to get all the cookies returned after a browser redirect
4.1 The idea: have Selenium walk through the site's redirects in a headless browser, collect all the cookies the browser ends up with, hand the rendered page back to Scrapy as the response, and stash the joined cookie string in a module-level variable for the spider to pick up later.
4.2 The middleware code is as follows; the small function at the bottom is used to hand over the cookie.
import time

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

re_cookie = ''  # this variable stores the cookie returned by Selenium


# Below is the Selenium middleware used to fetch the cookie
class ErShouFangSpiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def process_request(self, request, spider):
        if "esf" in request.url and "house/i" not in request.url:
            global re_cookie
            chrome_options = Options()
            chrome_options.add_argument("--headless")  # run Chrome as a headless browser
            self.driver = webdriver.Chrome(options=chrome_options)
            print("City index page URL handed to Selenium:", request.url)
            self.driver.get("https://www.fang.com")
            self.driver.get(request.url)
            time.sleep(2)  # give the redirects time to finish
            cookie = self.driver.get_cookies()
            # print("Raw cookies from the driver:", cookie)
            cookies_list = []
            for cookie_dict in cookie:
                cookie = cookie_dict['name'] + '=' + cookie_dict['value']
                cookies_list.append(cookie)
            # print("Cookies after joining:", cookies_list)
            cookies = ';'.join(cookies_list)
            # print("Cookie of the city's second-hand-housing index page:", cookies)
            # request.headers["cookie"] = header_cookie
            source = self.driver.page_source
            response = HtmlResponse(url=self.driver.current_url, body=source,
                                    request=request, encoding='utf8')
            self.driver.quit()
            re_cookie = cookies
            return response
        else:
            return None


# Small helper: hand the stored cookie to the caller once, then reset it
def re_cookies():
    global re_cookie
    re_cookie2 = re_cookie
    re_cookie = None
    # print("I was called; here is the cookie:", re_cookie2)
    return re_cookie2
5. Setting the cookie when making requests
Configuration:
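(The configuration screenshot is lost here. Most likely it showed Scrapy's own cookie handling being disabled so that a Cookie header set by hand is sent unchanged; treat the line below as an assumption rather than the original setting.)

# settings.py -- assumed configuration
COOKIES_ENABLED = False  # stop Scrapy's CookiesMiddleware from overriding the manually set Cookie header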
Setting the cookie for the visit:
(A screenshot here showed only the cookie's format, which you can copy straight from the web page; in practice the cookie actually used is the one fetched automatically and on time by Selenium.)
The code for setting it is as follows.
Remember the import: from …middlewares import re_cookies
def esf_parse(self, response):
    print('Page %s started' % response.xpath(".//div[@class='page_al']//span[@class='on']/text()").get())
    state = re_cookies()  # function in the downloader middleware that returns the cookie grabbed by Selenium
    province, city = response.meta.get("info")
    # Set the cookie: a ternary expression replaces the long if/else below
    cooki = {"Cookie": state} if state else response.meta.get("cooki")
    # if state is None:
    #     print("Did not go through Selenium")
    #     cooki = response.meta.get("cooki")
    #     # print('return 0', province, city, type(response), "coo:", cooki)
    # else:
    #     print("Went through Selenium, got a fresh cookie")
    #     cooki = {"Cookie": state}
    #     # Cookie2 = response.headers.getlist('Set-Cookie')  # response
    #     # print("Cookie from the first response", Cookie2)
    #     # print('return 1', city, response.url, type(response), cooki)
    # cooki = {"cookie": """global_cookie=dwi8uf9ky7lfteoil0w77ugjl28kjfxu44j; city=www; global_wapandm_cookie=4qqfzddnve98p4xdoo2qvct4c16kjg3r1sd; g_sourcepage=esf_fy%5Elb_pc; unique_cookie=U_n0b9vm5xw7xixtvj67lbe96cl20kjhf0ndm*2; __utma=147393320.1550169561.1609605973.1609612191.1609695300.3; __utmc=147393320; __utmz=147393320.1609695300.3.3.utmcsr=fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmb=147393320.3.10.1609695300"""}
    # "尾页" is the site's "last page" link text
    if response.xpath("(//div[@class='page_al']//a)[last()]/text()").get() == "尾页":
        next_url = response.xpath("(//div[@class='page_al']//a)[last()-1]/@href").get()
        if next_url:
            print('Next page:', next_url)
            print('Page %s finished' % response.xpath(".//div[@class='page_al']//span[@class='on']/text()").get())
            yield scrapy.Request(url=response.urljoin(next_url), headers=cooki, callback=self.esf_parse, meta={
                'info': (province, city), 'cooki': cooki
            })
    else:
        print("{}, {}: finished crawling second-hand houses".format(province, city))
II. Small Tips
1. Telling git which files to ignore
Create a .gitignore file (note that it has no extension).
/venv/ means the folder and everything inside it will not be tracked or added, as shown in the sample below.
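A small sample, with illustrative folder names for a typical Python project:

# .gitignore
/venv/          # the virtualenv folder and everything in it
__pycache__/    # compiled bytecode caches
*.log           # log files
.idea/          # editor settings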
2. Undoing things after git add or git commit
git reset
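A few common variants of this command (standard git usage; be careful with --hard):

git reset                  # undo git add: unstage everything, keep the working tree
git reset --soft HEAD~1    # undo the last commit, keep its changes staged
git reset --hard HEAD~1    # undo the last commit and discard its changes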
III. Personal Understanding of Errors Not Caused by Anti-Crawling Features
1. "no more duplicates will be shown"
Cause:
Scrapy deduplicates URLs by default, so the same link is not visited twice. Some sites, however, redirect your request for A to B and then redirect B back to A before letting you through; because of the default deduplication, Scrapy then refuses the second visit to A.
Solution:
When yielding the request for the new link, add the dont_filter=True parameter so the request is not filtered automatically.
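A minimal sketch of what that looks like (next_url and self.parse are placeholders):

# dont_filter=True lets this request through Scrapy's duplicate filter
yield scrapy.Request(url=response.urljoin(next_url),
                     callback=self.parse,
                     dont_filter=True)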
(And a handy English word to review along the way: duplicates.)
2. Getting the cookies of a request or a response
Cookie = response.request.headers.getlist('Cookie')    # cookies sent with the request
Cookie2 = response.headers.getlist('Set-Cookie')       # cookies set by the response
print(Cookie, '111cookoooo222:', Cookie2)
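Note that Scrapy stores header values as bytes; a small sketch for decoding them into plain strings:

# getlist() returns a list of bytes values; decode them for easier printing or joining
request_cookies = [c.decode('utf8') for c in response.request.headers.getlist('Cookie')]
response_cookies = [c.decode('utf8') for c in response.headers.getlist('Set-Cookie')]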