5. The selenium module
selenium is a browser-automation module: a script drives a real browser the way a person would.
It makes it easy to grab dynamically loaded data and to simulate logins.
Fetching ajax-loaded data directly with selenium:
from selenium import webdriver
from lxml import etree
driver = webdriver.Chrome(executable_path="./chromedriver")
url = "https://www.pearvideo.com/video_1782544"
driver.get(url)
# Grab the rendered page source
html = driver.page_source
etree_html = etree.HTML(html)
video = etree_html.xpath("//div[@id='drag_target1']//video/@src")[0]
print(video)
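The snippet above reads page_source immediately, which only works if the <video> tag has already been rendered. A hedged variant using an explicit wait (same page and xpath assumed), so the script blocks until the element exists instead of racing the ajax call:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(executable_path="./chromedriver")
driver.get("https://www.pearvideo.com/video_1782544")
# Wait up to 10 s for the <video> tag to appear, then read its src directly
video_el = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.XPATH, "//div[@id='drag_target1']//video")))
print(video_el.get_attribute("src"))
driver.quit()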
selenium automation actions
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
# Load the browser driver
driver = webdriver.Chrome(executable_path="./chromedriver")
url = "https://www.taobao.com/"
driver.get(url)
# Locate the search box
search_input = driver.find_element(by=By.ID, value="q")
# Type the search terms
search_input.send_keys("西屋浴霸")
# Locate the search button
button = driver.find_element(by=By.CLASS_NAME, value="btn-search")
# Scroll to the bottom of the page
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
# Pause for 2 seconds
time.sleep(2)
# Click the search button
button.click()
# Navigate to another page
driver.get("http://www.baidu.com")
# Go back
driver.back()
# Go forward
driver.forward()
Entering an iframe and action chains
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
# Import the action-chain class
from selenium.webdriver import ActionChains

# Load the browser driver
driver = webdriver.Chrome(executable_path="./chromedriver")
url = "https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable"
driver.get(url)
# Switch into the iframe; the draggable div lives inside it
driver.switch_to.frame("iframeResult")
# Instantiate the action chain
action = ActionChains(driver)
# Locate the draggable div
div = driver.find_element(by=By.ID, value="draggable")
# Click and hold the div
action.click_and_hold(div)
# Drag in steps
for i in range(5):
    # perform() executes the queued actions; (50, 0) moves 50 px along x, 0 along y
    action.move_by_offset(50, 0).perform()
    time.sleep(0.3)
# Release the mouse button
action.release().perform()
# Close the browser
driver.quit()
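Depending on the selenium version, calling perform() inside the loop can replay the actions queued so far. An alternative sketch queues the whole drag as one chain with pause() and executes it once (same div element assumed):

# Build the full drag as a single chain, then execute it in one go
chain = ActionChains(driver).click_and_hold(div)
for i in range(5):
    # Each step: 50 px right, then a 0.3 s pause
    chain = chain.move_by_offset(50, 0).pause(0.3)
chain.release().perform()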
selenium headless browsing and detection evasion
from selenium import webdriver
import time
# ChromeOptions carries both the headless flags and the evasion switch
from selenium.webdriver import ChromeOptions

options = ChromeOptions()
# Headless browser: no visible window
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# Evade detection: drop the "enable-automation" switch
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Load the browser driver with the combined options
driver = webdriver.Chrome(executable_path="./chromedriver", options=options)
url = "https://www.baidu.com/"
driver.get(url)
# Grab the page source
html = driver.page_source
print(html)
time.sleep(2)
# Close the browser
driver.quit()
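excludeSwitches only removes Chrome's "being controlled by automated software" flag; many sites also check navigator.webdriver from JavaScript. A common companion trick on the Chrome driver, sketched here (run it right after creating the driver, before driver.get; the injected JS is an assumption about what the target site inspects):

# Inject a script that runs before any page script, masking navigator.webdriver
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
})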
selenium screenshots and cropping
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
# Pillow does the cropping
from PIL import Image

# Load the browser driver
driver = webdriver.Chrome(executable_path="./chromedriver")
url = "https://www.baidu.com/"
# Open the page and maximize the window
driver.get(url)
driver.maximize_window()
time.sleep(2)
# Screenshot the whole viewport
driver.save_screenshot("./baidu.png")
# Locate the target element
lg = driver.find_element(by=By.CLASS_NAME, value="s_form_wrapper")
# Top-left corner of the element
location = lg.location
# Width and height of the element
size = lg.size
print(location, size)
# Display scaling must be set to 100%, otherwise the crop box is off
# Crop box: top-left to bottom-right coordinates
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']),
          int(location['y'] + size['height']))
# Open the screenshot with Pillow
i = Image.open("./baidu.png")
# Crop it
frame = i.crop(rangle)
# Save the crop
frame.save("./baidu_cut.png")
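The 100% display-scaling requirement exists because location and size are in CSS pixels while the screenshot is in device pixels. A hedged way to crop correctly at any scaling is to read the ratio from the browser and scale the crop box:

# Scale the crop box by the device pixel ratio instead of requiring 100% scaling
ratio = driver.execute_script("return window.devicePixelRatio")
rangle = (int(location['x'] * ratio), int(location['y'] * ratio),
          int((location['x'] + size['width']) * ratio),
          int((location['y'] + size['height']) * ratio))
frame = Image.open("./baidu.png").crop(rangle)
frame.save("./baidu_cut.png")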
Simulating zhihu login
# Simulate logging in to zhihu
# Chaojiying_Client is the captcha-recognition client from chaojiying.com
from chaojiying.chaojiying import Chaojiying_Client
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from PIL import Image

options = webdriver.ChromeOptions()
# Keep the browser open after the script finishes
options.add_experimental_option('detach', True)
# Load the browser driver
driver = webdriver.Chrome(executable_path="./chromedriver", options=options)
url = "https://www.zhihu.com/signin?next=%2Fhot"
# Open the sign-in page
driver.get(url)
time.sleep(1)
# Switch to the password-login tab
tabs = driver.find_elements(by=By.CLASS_NAME, value="SignFlow-tab")
tabs[1].click()
# Fill in the username and password fields
username_inputs = driver.find_elements(
    by=By.CLASS_NAME, value="username-input")
username_inputs[0].send_keys("your_phone_number")
username_inputs[1].send_keys("your_password")
# Locate and click the login button
btn = driver.find_element(
    by=By.CLASS_NAME, value="SignFlow-submitButton")
btn.click()
time.sleep(1)
# The slider captcha appears; locate its background image
img = driver.find_element(By.CLASS_NAME, "yidun_bg-img")
# Position and size of the image
location = img.location
size = img.size
# Crop box: top-left to bottom-right coordinates
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']),
          int(location['y'] + size['height']))
time.sleep(1)
# Locate the slider handle
slider_move = driver.find_element(By.CLASS_NAME, "yidun_slider")
print(slider_move.size)
# Width of the slider handle
slider_width = slider_move.size['width']
print(slider_width)
# Screenshot the page, then crop out the captcha image
driver.save_screenshot("./drag1.png")
i = Image.open("./drag1.png")
frame = i.crop(rangle)
frame.save("./drag1_cut.png")
time.sleep(1)
# User center >> Software ID: generate one and substitute it for 947550
cjy = Chaojiying_Client('username', 'password', '947550')
im = open('./drag1_cut.png', 'rb').read()  # local image path; Windows sometimes needs //
obj = cjy.PostPic(im, 9101)  # 9101 is the captcha type code (see chaojiying's docs)
print(obj)
# Recognized coordinates come back as "x,y"
location = obj['pic_str'].split(',')
x = location[0]
print(x)
# Build the action chain
action = ActionChains(driver)
# Click and hold the slider
action.click_and_hold(slider_move).perform()
# chaojiying measures from the slider's midpoint, so subtract half the
# slider width to get the distance to move
distance = int(x) - int(slider_width) / 2
# Drag the slider the computed distance along the x axis
action.move_by_offset(distance, 0).perform()
# Release the slider
action.release().perform()
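Sliding the whole distance in one move is easy for the risk engine to flag. A hedged refinement drags in small randomized steps with short pauses (the step sizes are purely illustrative):

import random

# Mimic a human drag: uneven steps, brief pauses, a fresh chain per step
ActionChains(driver).click_and_hold(slider_move).perform()
moved = 0
while moved < distance:
    step = min(random.randint(5, 15), distance - moved)
    ActionChains(driver).move_by_offset(step, 0).perform()
    moved += step
    time.sleep(random.uniform(0.05, 0.2))
ActionChains(driver).release().perform()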
6. The scrapy crawler framework
Features: high-performance persistent storage, asynchronous downloading, high-performance parsing, and distributed crawling.
# Install
pip install scrapy
# Create a project and a spider
scrapy startproject proName
cd proName
scrapy genspider yourspidername www.baidu.com
# In settings.py, stop obeying the ROBOTS protocol
ROBOTSTXT_OBEY = False
# Set a UA disguise and quiet the log
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
LOG_LEVEL = "ERROR"
# Run the spider
scrapy crawl yourspidername
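The crawl can also be kicked off from a plain Python script, which makes IDE debugging easier (a minimal sketch; place it at the project root next to scrapy.cfg):

# run.py: start the spider without typing the command in a terminal
from scrapy.cmdline import execute

execute(["scrapy", "crawl", "yourspidername"])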
scrapy data parsing
import scrapy

class DoubanSpider(scrapy.Spider):
    name = "douban"
    # allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/"]

    def parse(self, response):
        li = response.xpath(
            "//div[@id='screening']//ul[@class='ui-slide-content']/li")
        for i in li:
            # xpath returns a list of Selector objects;
            # extract_first() pulls the value out of the first one
            print(i.xpath("./@data-title").extract_first())
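Besides extract_first(), a SelectorList also has extract() for all matches; newer scrapy spells these get() and getall(). All of the following work on the same selector:

title = i.xpath("./@data-title").extract_first()  # first match, or None
title = i.xpath("./@data-title").get()            # modern alias of extract_first()
titles = i.xpath("./@data-title").getall()        # list of every match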
scrapy data persistence
Feed-export (terminal command) storage: it can only persist what the parse method returns, and only to a local file.
def parse(self, response):
    movies = []
    li = response.xpath(
        "//div[@id='screening']//ul[@class='ui-slide-content']/li")
    for i in li:
        # xpath returns a list of Selector objects;
        # extract_first() pulls the value out of the first one
        movie_name = i.xpath("./@data-title").extract_first()
        # Movie rating
        movie_score = i.xpath("./@data-rate").extract_first()
        if movie_name is not None:
            movies.append({
                "movie_name": movie_name,
                "movie_score": movie_score
            })
    return movies
Command: scrapy crawl douban -o ./douban.csv (the extension must be a format scrapy supports: json, csv, xml, ...)
Terminal-command storage is simple to run, but it is also quite limited.
Pipeline-based storage:
Workflow:
Enable the pipeline in settings.py
ITEM_PIPELINES = {
    # 300 is the priority; lower numbers run first
    "dbdyPro.pipelines.DbdyproPipeline": 300,
}
Parse the data
Define the relevant fields on the item class
import scrapy

class DbdyproItem(scrapy.Item):
    # define the fields for your item here like:
    movie_name = scrapy.Field()
    movie_score = scrapy.Field()
Wrap the parsed data in an item object
Submit the item object to the pipeline for persistence
def parse(self, response):
    li = response.xpath(
        "//div[@id='screening']//ul[@class='ui-slide-content']/li")
    for i in li:
        # Instantiate a fresh item on every iteration; reusing one object
        # would make every yield mutate the same item
        pipitem = DbdyproItem()
        # xpath returns a list of Selector objects;
        # extract_first() pulls the value out of the first one
        movie_name = i.xpath("./@data-title").extract_first()
        # Movie rating
        movie_score = i.xpath("./@data-rate").extract_first()
        if movie_name is not None:
            pipitem['movie_name'] = movie_name
            pipitem['movie_score'] = movie_score
            # Hand the item to the pipeline
            yield pipitem
In the pipeline class's process_item, persist the items to a local file or a database.
from itemadapter import ItemAdapter

class DbdyproPipeline:
    fp = None

    # Overridden hook: runs exactly once when the spider starts,
    # so the file is opened only once
    def open_spider(self, spider):
        print("crawl started")
        self.fp = open("./douban.txt", "w", encoding="utf-8")

    # Receives the items yielded by the spider;
    # called once per item
    def process_item(self, item, spider):
        movie_name = item['movie_name']
        movie_score = item['movie_score']
        # Write the record to the file
        self.fp.write(movie_name + ":" + movie_score + "\n")
        return item

    # Overridden hook: runs once when the spider closes
    def close_spider(self, spider):
        print("crawl finished")
        self.fp.close()
Storing data through multiple pipelines (multiple targets)
Configure the pipelines
ITEM_PIPELINES = {
    # 300 is the priority; lower numbers run first
    "dbdyPro.pipelines.DbdyproPipeline": 300,
    "dbdyPro.pipelines.MysqlPipeline": 301,
}
Write multiple pipeline classes
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from itemadapter import ItemAdapter
import pymysql

class DbdyproPipeline:
    fp = None

    # Overridden hook: runs exactly once when the spider starts,
    # so the file is opened only once
    def open_spider(self, spider):
        print("crawl started")
        self.fp = open("./douban.txt", "a", encoding="utf-8")

    # Receives the items yielded by the spider; called once per item
    def process_item(self, item, spider):
        movie_name = item['movie_name']
        movie_score = item['movie_score']
        # Write the record to the file
        self.fp.write(movie_name + ":" + movie_score + "\n")
        # Returning the item hands it on to the next pipeline class
        return item

    # Overridden hook: runs once when the spider closes
    def close_spider(self, spider):
        print("crawl finished")
        self.fp.close()

# One pipeline class per storage target
class MysqlPipeline(object):
    # MySQL connection
    conn = None
    # MySQL cursor
    cursor = None

    # Overridden hook: connect to the database once when the spider starts
    def open_spider(self, spider):
        print("database writes starting...")
        self.conn = pymysql.Connect(host="localhost", port=3306,
                                    user='root', password='root',
                                    db='py_douban_data', charset='utf8')

    # Receives each item and inserts it
    def process_item(self, item, spider):
        movie_name = item['movie_name']
        movie_score = item['movie_score']
        self.cursor = self.conn.cursor()
        # Parameterized query: let the driver do the quoting
        sql = "insert into m_data (m_name, m_score) values (%s, %s)"
        # Execute inside a transaction
        try:
            self.cursor.execute(sql, (movie_name, str(movie_score)))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    # Close the database connection
    def close_spider(self, spider):
        print("database writes finished...")
        self.cursor.close()
        self.conn.close()
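MysqlPipeline assumes the database py_douban_data and the table m_data already exist. A one-off setup sketch (the schema is an assumption inferred from the INSERT above):

import pymysql

# Create the table the pipeline writes to; column types are assumptions
conn = pymysql.Connect(host="localhost", port=3306, user='root',
                       password='root', db='py_douban_data', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS m_data (
        id INT PRIMARY KEY AUTO_INCREMENT,
        m_name VARCHAR(255),
        m_score VARCHAR(16)
    )
""")
conn.commit()
cursor.close()
conn.close()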
Multi-page crawling. Pages are downloaded asynchronously, so items are not necessarily persisted in page order.
import scrapy
# PronameItem is the item class defined in this project's items.py
from proName.items import PronameItem

class GushiciSpider(scrapy.Spider):
    name = "gushici"
    # allowed_domains = ["www.gushici.net"]
    start_urls = ["https://www.gushici.net/mingju/"]
    # Crawl every listing page: a generic url template plus a page counter
    url = "https://www.gushici.net/mingju/index_%d.html"
    start_page_num = 2

    def parse(self, response):
        boxs = response.xpath(
            "//div[@class='left']/div[@class='ju']/div[@class='ju-box']")
        for box in boxs:
            item = PronameItem()
            a_arr = box.xpath("./a")
            con = a_arr[0].xpath("./text()").extract_first()
            author = a_arr[1].xpath("./text()").extract_first()
            item['con'] = con
            item['author'] = author
            # Hand the item to the pipeline
            yield item
        # Queue the next page
        if self.start_page_num <= 3:
            new_url = self.url % self.start_page_num
            self.start_page_num += 1
            # Request the next page and call parse again on its response
            yield scrapy.Request(new_url, callback=self.parse)
scrapy's 5 core components
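The five components: the engine (coordinates everything and moves data between the other four), the scheduler (queues and de-duplicates requests), the downloader (fetches pages asynchronously), the spiders (turn responses into items or new requests), and the item pipeline (persists items). Downloader middleware and spider middleware hook into the connections between them.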
Passing parameters across pages from the parse method
import scrapy
from bossPro.items import BossproItem

class BossSpider(scrapy.Spider):
    name = "boss"
    # allowed_domains = ["www.zhipin.com"]
    # boss zhipin loads its listings via ajax, so use a different site
    start_urls = [
        "https://www.gushici.net/mingju/index.html"]
    new_page_model = "https://www.gushici.net/mingju/index_%d.html"
    page_num = 2

    def parse(self, response):
        li_list = response.xpath("//div[@class='ju']/div[@class='ju-box']")
        for li in li_list:
            title = li.xpath("./a[1]/text()").extract_first()
            link_url = li.xpath("./a[1]/@href").extract_first()
            # The item must be instantiated inside the loop; otherwise only
            # the last record survives, because each assignment mutates the
            # one shared object
            item = BossproItem()
            item['title'] = title
            # Make the url absolute
            link_url = response.urljoin(link_url)
            # Request passing: hand the item to the callback through meta
            yield scrapy.Request(link_url, callback=self.parse_detail,
                                 meta={'item': item})
        # Next page
        if self.page_num <= 4:
            new_url = format(self.new_page_model % self.page_num)
            self.page_num += 1
            # Recursive call
            yield scrapy.Request(new_url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']
        content = response.xpath(
            "//div[@class='gushici-box-text']//a/text()")
        cons = content.extract()
        # Join the fragments
        cons_p = "".join(cons)
        item['con_p'] = cons_p
        yield item
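Since scrapy 1.7, cb_kwargs is the preferred way to pass values to a callback; meta still works but is also used internally (e.g. for proxies). The same hand-off sketched with cb_kwargs:

# Pass the item as a real keyword argument of the callback
yield scrapy.Request(link_url, callback=self.parse_detail,
                     cb_kwargs={'item': item})

def parse_detail(self, response, item):
    cons = response.xpath(
        "//div[@class='gushici-box-text']//a/text()").extract()
    item['con_p'] = "".join(cons)
    yield item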
Crawling images
① Parse the image URLs
import scrapy
# PronameItem is the item class defined in this project's items.py
from proName.items import PronameItem

class ZztpSpider(scrapy.Spider):
    name = "zztp"
    # allowed_domains = ["sc.chinaz.com"]
    start_urls = ["https://acg.gamersky.com/pic/wallpaper/"]

    def parse(self, response):
        lis = response.xpath("//ul[@class='pictxt block contentpaging']/li")
        for li in lis:
            img_src = li.xpath("./a/img/@src").extract_first()
            item = PronameItem()
            item['img_src'] = img_src
            yield item
② Define the item field
class PronameItem(scrapy.Item):
    # define the fields for your item here like:
    img_src = scrapy.Field()
③ Write a custom pipeline class
# Import scrapy's image pipeline
from scrapy.pipelines.images import ImagesPipeline
import scrapy

# Custom image pipeline: subclass ImagesPipeline and override 3 methods
class MyImagesPipeline(ImagesPipeline):
    # 1. Issue a request for each image url
    def get_media_requests(self, item, info):
        yield scrapy.Request(url=item['img_src'])

    # 2. Decide the file name (saved under IMAGES_STORE)
    def file_path(self, request, response=None, info=None, *, item=None):
        # Use the last url segment as the file name
        img_name = request.url.split('/')[-1]
        return img_name

    # 3. Pass the item on to the next pipeline class
    def item_completed(self, results, item, info):
        return item
④ Set the image directory and enable the pipeline (ImagesPipeline needs Pillow: pip install Pillow)
ITEM_PIPELINES = {
    "proName.pipelines.MyImagesPipeline": 300,
}
# Directory the images are saved into
IMAGES_STORE = "./images"
Downloader middleware
import random

class MidDownloaderMiddleware:
    # http proxy pool
    PROXY_IP = [
        'http://ip:port',
        'http://ip:port',
    ]
    # https proxy pool
    PROXY_IPS = [
        'https://ip:port',
        'https://ip:port',
    ]
    # UA pool
    UA_LIST = [
        'ua1',
        'ua2',
    ]

    def process_request(self, request, spider):
        # Rotate the User-Agent on every request
        request.headers['User-Agent'] = random.choice(self.UA_LIST)
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        # If a banned IP raises an exception, swap in a proxy here,
        # matching the request's scheme (http vs https)
        if request.url.split(':')[0] == 'https':
            request.meta['proxy'] = random.choice(self.PROXY_IPS)
        else:
            request.meta['proxy'] = random.choice(self.PROXY_IP)
        # Returning the request re-schedules it for download
        return request
DOWNLOADER_MIDDLEWARES = {
"mid.middlewares.MidDownloaderMiddleware": 543,
}
Combining scrapy with selenium
# Spider: the netease nav pages are rendered dynamically, so the downloader
# middleware below swaps in selenium for them. In a real project the item,
# middleware and pipeline classes live in items.py, middlewares.py and
# pipelines.py; they are listed together here.
import scrapy
import time
from selenium import webdriver
from scrapy.http import HtmlResponse

class WySpider(scrapy.Spider):
    name = "wy"
    # allowed_domains = ["163.com"]
    start_urls = ["https://money.163.com/"]
    navs = ["T1495590561605.html",
            "T1524534586033.html", "T1603455486882.html", "T1624330083376.html"]
    selenium_urls = []

    def __init__(self):
        # One shared browser for the whole crawl
        self.driver = webdriver.Chrome(executable_path="../chromedriver.exe")

    def parse(self, response):
        # Build the nav-page urls and remember which ones need selenium
        for nav in self.navs:
            url = "https://www.163.com/dy/media/" + nav
            self.selenium_urls.append(url)
            yield scrapy.Request(url, callback=self.parse_nav)

    # Parse a nav page
    def parse_nav(self, response):
        li_list = response.xpath("//div[@class='tab_content']//li")
        for li in li_list:
            link_url = li.xpath("./a/@href").extract_first()
            title = li.xpath("./a/img/@alt").extract_first()
            item = WangyiItem()
            item['title'] = title
            item['link_url'] = link_url
            if link_url is not None:
                yield scrapy.Request(link_url, callback=self.parse_detail,
                                     meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        p_text = response.xpath(
            "//div[@class='post_body']/p/text()").extract_first()
        item['p_text'] = p_text
        # Hand the item to the pipeline
        yield item

    # Called when the spider closes: shut the browser down
    def closed(self, spider):
        self.driver.quit()

class WangyiItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    link_url = scrapy.Field()
    p_text = scrapy.Field()

class WangyiDownloaderMiddleware:
    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        driver = spider.driver
        # Only the nav pages need the rendered DOM
        if request.url in spider.selenium_urls:
            # Let selenium load the page instead of using scrapy's response
            driver.get(request.url)
            # Wait 3 seconds for the page to finish rendering
            time.sleep(3)
            page_text = driver.page_source
            # Wrap the rendered source in a new response object
            new_response = HtmlResponse(
                url=request.url, body=page_text, encoding="utf-8")
            return new_response
        else:
            return response

    def process_exception(self, request, exception, spider):
        pass

class WangyiPipeline:
    def process_item(self, item, spider):
        # Persist the item (just print it here)
        print(item)
        return item
Whole-site crawling (CrawlSpider)
# Create a project
scrapy startproject proName
cd proName
# The crawl template generates a CrawlSpider
scrapy genspider -t crawl movie2345 2345.com
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# GushiciItem is defined in this project's items.py
from proName.items import GushiciItem

class GscSpider(CrawlSpider):
    name = "gsc"
    # allowed_domains = ["gushici.com"]
    start_urls = ["https://www.gushici.net/"]
    # LinkExtractor(allow=r"...") extracts the links whose urls match the regex
    link = LinkExtractor(allow=r"\/[a-z_0-9]+\/")
    # follow=True keeps applying the rule to the pages it finds,
    # so the whole site is crawled recursively; False stops at one level
    rules = (Rule(link, callback="parse_item", follow=True),)

    def parse_item(self, response):
        # Every gushici-box on the page
        gushici_box = response.xpath('//div[@class="gushici-box"]')
        # Pull the title, source and body out of each box
        for box in gushici_box:
            title = box.xpath('./p[@class="tit"]//b/text()').extract_first()
            author = box.xpath(
                './p[@class="source"]//a/text()').extract()
            author_str = ""
            # Only join when both source fragments are present
            if len(author) >= 2:
                author_str = author[0] + ": " + author[1]
            content = box.xpath(
                './div[@class="gushici-box-text"]')
            content_str = content.xpath("string()").get()
            item = GushiciItem()
            item['title'] = title
            item['author_str'] = author_str
            item['content_str'] = content_str
            yield item
class GushiciItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    author_str = scrapy.Field()
    content_str = scrapy.Field()
from itemadapter import ItemAdapter

class GushiciPipeline:
    fp = None

    # Overridden hook: open the file once when the spider starts
    def open_spider(self, spider):
        print("crawl started")
        self.fp = open("./gushici.txt", "a", encoding="utf-8")

    def process_item(self, item, spider):
        # Write the item to the file, skipping incomplete records
        if (item['title'] is not None and item['author_str'] is not None
                and item['content_str'] is not None):
            self.fp.write(item['title'] + "\n" +
                          item['author_str'] + "\n" +
                          item['content_str'] + "\r\n\n")
        return item

    def close_spider(self, spider):
        print("crawl finished")
        self.fp.close()
Distributed crawling:
Native scrapy cannot crawl distributedly, because each machine's scheduler and pipelines are independent of the others; install scrapy-redis to make distribution work.
scrapy-redis gives scrapy a shared, Redis-backed scheduler and pipeline.
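A minimal sketch of the settings that move a project onto scrapy-redis (Redis on localhost assumed; the class paths are scrapy-redis defaults):

# settings.py additions for scrapy-redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # shared scheduler
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # shared de-duplication
SCHEDULER_PERSIST = True  # keep the Redis queue across restarts
ITEM_PIPELINES = {
    "scrapy_redis.pipelines.RedisPipeline": 400,  # items are stored in Redis
}
REDIS_HOST = "127.0.0.1"
REDIS_PORT = 6379

The spider itself then inherits from scrapy_redis.spiders.RedisCrawlSpider and declares a redis_key instead of start_urls; every worker pops its start urls from that shared Redis list.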