selenium
# Scroll down one screen (to the bottom of the current document body)
# bro is the browser (WebDriver) object
jscode = 'window.scrollTo(0, document.body.scrollHeight)'
bro.execute_script(jscode)
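Put together, a minimal runnable sketch (assuming chromedriver is on the PATH; the URL and the loop count are placeholders for a page that lazy-loads content on scroll):
import time
from selenium import webdriver

bro = webdriver.Chrome()
bro.get('https://example.com')  # placeholder URL
for _ in range(3):  # scroll a few screens to trigger lazy loading
    bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(1)  # give the new content time to load
bro.quit()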
# Headless browser
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Evade detection (hide the "controlled by automated software" flag)
from selenium.webdriver import ChromeOptions
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# chrome_options= is a deprecated alias of options=, so only one options
# object takes effect; fold the headless flags into this one and pass it once
options.add_argument('--headless')
options.add_argument('--disable-gpu')
bro = webdriver.Chrome(options=options)
Downloader middleware
Request interception, proxy IPs, and response interception all share one final step: uncomment (enable) the downloader middleware in settings.py, as shown below.
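A sketch of the settings.py entry, assuming a project named myproject and the middleware class name that scrapy startproject generates:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MyprojectDownloaderMiddleware': 543,
}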
-
Intercepting requests
# needs: import random at the top of middlewares.py
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 .",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# set a random User-Agent in the process_request handler
def process_request(self, request, spider):
    request.headers['User-Agent'] = random.choice(self.user_agent_list)
-
Proxy IPs
PROXY_http = [
    '153.180.102.104:80',
    '195.208.131.189:56055',
]
PROXY_https = [
    '120.83.49.90:9000',
    '95.189.112.214:35508',
]

# configure this in the process_exception handler: when a request errors out,
# attach a random proxy that matches the URL scheme and retry
def process_exception(self, request, exception, spider):
    if request.url.split(":")[0] == "http":
        request.meta['proxy'] = "http://" + random.choice(self.PROXY_http)
    else:
        request.meta['proxy'] = "https://" + random.choice(self.PROXY_https)
    return request  # returning the request re-schedules it
-
Intercepting responses
# Typically used for dynamically loaded (Ajax) pages, driving selenium from scrapy.
# 1. Set it up in process_response; import HtmlResponse first
#    (from scrapy.http import HtmlResponse):
def process_response(self, request, response, spider):
    # intercept only the responses that need rendering
    bro = spider.bro
    if request.url in spider.model_urls:
        # tamper with the response data: let selenium render the page
        bro.get(request.url)
        page_source = bro.page_source
        response = HtmlResponse(url=request.url, body=page_source,
                                encoding='utf-8', request=request)
        return response
    else:
        return response
# 2. In the spider file:
#    - construct the selenium object (see the sketch below)
#    - add a close method:
def close(self, spider):
    self.bro.quit()
# 3. In settings.py, uncomment (enable) the downloader middleware
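A minimal sketch of the spider side (the class name and URL are hypothetical; the close method is the one from step 2 above):
import scrapy
from selenium import webdriver

class ModelSpider(scrapy.Spider):  # hypothetical spider name
    name = 'model'
    start_urls = ['https://example.com']  # placeholder URL

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_urls = []  # filled with the Ajax page URLs during parsing
        self.bro = webdriver.Chrome()  # the browser process_response reuses

    def close(self, spider):
        self.bro.quit()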
Working with multiple items
# Sometimes several item classes are needed to hold different kinds of data.
# In pipelines.py, dispatch on the class of the incoming item:
def process_item(self, item, spider):
    if item.__class__.__name__ == 'QuotesproItem':
        author = item['author']
        quote = item['quote']
    else:
        detail_desc = item['detail_desc']
        detail_author = item['detail_author']
    return item
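For reference, a sketch of the matching items.py; QuotesproItem and the field names come from the pipeline above, while DetailItem is a hypothetical name for the second class:
import scrapy

class QuotesproItem(scrapy.Item):
    author = scrapy.Field()
    quote = scrapy.Field()

class DetailItem(scrapy.Item):  # hypothetical name for the second item class
    detail_desc = scrapy.Field()
    detail_author = scrapy.Field()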
Simulated login in scrapy
-
Logging in by sending cookies with Request
# Steps:
# - create a scrapy project and adjust the settings
# - create the spider file
# - override start_requests (paste this in and swap in your own cookie string)
def start_requests(self):
    # cookies grabbed with Chrome's devtools from a request made after logging in
    cookiesstr = "the cookie string of the actual logged-in page"
    cookies = {i.split("=")[0]: i.split("=")[1] for i in cookiesstr.split("; ")}
    # send the Request carrying the cookies
    yield scrapy.Request(
        self.start_urls[0],
        callback=self.parse,
        cookies=cookies
    )
# - run the spider (it really is that simple, but I like this approach: it's practical)
# - to watch the cookies being passed along, add COOKIES_DEBUG = True to settings
-
Logging in with a POST (FormRequest)
# - create the project
# - create the spider file
# - send the form data (needs: from scrapy import FormRequest)
def parse(self, response):
    # request form data
    postData = {
        'ck': '',
        'name': '****',      # username
        'password': '****',  # password
        'remember': 'false',
        'ticket': ''
    }
    return [FormRequest.from_response(
        response,
        formdata=postData,
        callback=self.after_login,
        dont_filter=True)]
# - write the after_login method to parse the data (sketch below)
# - run the spider
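A minimal sketch of after_login, only to show the shape of the callback (the success check is a placeholder, not from the original):
def after_login(self, response):
    # confirm the login worked before parsing
    if '****' in response.text:  # placeholder: e.g. look for the username
        self.logger.info('login succeeded')
    # parse the logged-in page here, e.g. with response.xpath(...)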
How to send a POST request
# needs: from scrapy import FormRequest
def __init__(self):
    self.headers = {
        "User-Agent": "pick a UA of your own",
        "X-Requested-With": "XMLHttpRequest"
    }
# Since this is a POST request, start_requests() must be overridden
def start_requests(self):
    for page in range(0, 7):
        form_data = {
            "otype": "4",
            "city": "",
            "start": str(25 * page),  # offset: 25 records per page
            "amount": "25"
        }
        request = FormRequest(self.start_urls[0], headers=self.headers,
                              formdata=form_data, callback=self.parse)
        yield request
def parse(self, response):
    pass
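Since form_data pages through the endpoint 25 records at a time, the response is most likely JSON; a hedged sketch of parse (the 'list' key and field mapping are guesses, not from the original):
import json

def parse(self, response):
    data = json.loads(response.text)
    for record in data.get('list', []):  # guessed key
        yield {'raw': record}  # map the real fields into an Item here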
Two ways to run a spider file
Running the spider file
-
Option 1: from the command line: scrapy crawl <spider_name>
-
Option 2: create a begin.py file in the project root:
from scrapy import cmdline
cmdline.execute("scrapy crawl <spider_name>".split())
# then start the crawl with: python begin.py
Pipeline storage methods
-
csv
# needs: import csv
fp = None
fw = None
def open_spider(self, spider):
    self.fp = open('./yunwoke.csv', 'wt', encoding='utf-8', newline='')
    self.fw = csv.writer(self.fp)
    # header row
    self.fw.writerow(['needTitle', 'needDetail', 'many', 'timer', 'publicTime', 'status'])
def process_item(self, item, spider):
    # write one row per item, in header order
    self.fw.writerow([item['needTitle'], item['needDetail'], item['many'],
                      item['timer'], item['publicTime'], item['status']])
    return item
def close_spider(self, spider):
    self.fp.close()
-
mysql
conn = None
cursor = None
def open_spider(self, spider):
    self.conn = pymysql.connect(
        host="127.0.0.1",
        port=3306,
        user='root',
        password='034312',
        db='',  # fill in the database name
        charset='utf8'
    )
    self.cursor = self.conn.cursor()  # cursor used by process_item below
def process_item(self, item, spider):
    try:
        self.cursor.execute(
            'insert into yunwoke(needTitle,needDetail,many,timer,publicTime,status) values (%s,%s,%s,%s,%s,%s)',
            [item['needTitle'],
             item['needDetail'],
             item['many'],
             item['timer'],
             item['publicTime'],
             item['status']]
        )
        self.conn.commit()
    except Exception as e:
        self.conn.rollback()
        print(e)
    return item
def close_spider(self,spider):
self.cursor.close()
self.conn.close()
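Either pipeline only runs after it is enabled in settings.py; a sketch, assuming a project named myproject and hypothetical pipeline class names:
ITEM_PIPELINES = {
    'myproject.pipelines.CsvPipeline': 300,    # lower number runs first
    'myproject.pipelines.MysqlPipeline': 301,
}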