概要:
本篇内容主要讲猿人学第一届Web攻防大赛第八题的逆向过程,一道图文点选形式的验证码反爬虫题目。不用打码平台,而是自己训练一个模型去识别
网站链接:https://match.yuanrenxue.cn/match/8
题目要求
- 题目概要:正确通过每一页的验证码后给我们返回每一页的数据,然后对数据进行筛选,筛选出出现频率最高的那个数据并提交就算通关
- 分析:每次访问验证码上的文字都是随机的,我们也不知道他到底用了哪些文字来生成的图片,所以训练的时候分类数要尽可能的多,以确保这上边出现的文字是我们训练过的,我这里就用了7012个字去训练它。大概用了351万张图片去训练这个模型,每个字有500张左右,如果数据集太小,模型是没办法收敛的,这点也需要注意
- 做题思路:简化图片(降低训练难度),把每一个字都分割成单独的一个字,然后再训练模型,模型训练好后,开一个flask接口加载模型(提高识别效率,因为每次加载模型都需要花费大量时间),爬虫代码请求开的接口给我们返回图片识别信息
图片预处理
先来看看图片,可以看到它是一个300*300像素的一张图片,我这里是要给它把底色和黑色干扰点去掉,其它方面你们可以补充,比如去掉干扰线、二值化等,我就直接交给模型了
在网上看了很多的文章和博客,都没有找到很好的处理代码,只能自己写了。我的做法是:遍历每一个像素点,把颜色信息存到字典里,再记录出现次数,把出现次数最多的两个颜色给它替换成白色,这样就实现了去除底色和干扰点的目的了
处理完后就长这个样子,还是比较清晰的—>
然后再分割它,这样就可以开始训练啦
- 具体处理代码
import random
import time
from collections import Counter

from PIL import Image
import numpy as np
import os
from tqdm import tqdm  # progress bar

# Clean every raw 300x300 captcha in ./img (remove the two dominant
# colours: background + noise dots), then cut it into nine 100x100
# crops named "<label-char>_<unique-id>.png" for training.
png_list = list(os.walk('./img'))[0][-1]  # file names of the raw captchas
for png in tqdm(png_list):
    img = Image.open('./img/{}'.format(png))
    pixdata = img.load()  # direct pixel access
    w, h = img.size
    # Count colour frequencies in one pass.  NOTE: PIL pixel access is
    # [x, y] with x < width and y < height; the original iterated x over
    # the height, which only worked because the image is square.
    counts = Counter(pixdata[x, y] for x in range(w) for y in range(h))
    # The two most frequent colours are the background and the noise
    # dots — repaint both white.
    for color, _ in counts.most_common(2):
        for x in range(w):
            for y in range(h):
                if pixdata[x, y] == color:
                    pixdata[x, y] = (255, 255, 255)  # replace with white
    index = 0
    name = [x for x in png.split('_')[0]]  # label chars encoded in the filename
    for i in range(3):
        for j in range(3):
            # Crop cell (i, j) of the 3x3 grid.
            image_cut = img.crop(([0, 100, 200][j], [0, 100, 200][i], [100, 200, 300][j], [100, 200, 300][i]))
            # Timestamp + random offset avoids name collisions that would
            # silently overwrite files.
            save_time = int(''.join(str(time.time()).split('.'))) + int(random.randint(1, 50000))
            image_cut.save('./inferring/{}_{}.png'.format(name[index], save_time), 'PNG')
            index += 1
开始训练模型
- 模型选择:训练模型我们用ResNet18去训练,设置7012分类是因为我自己的mapping文件里存有7012个文字
from torch import nn
from torchvision import models


class MyResNet18(nn.Module):
    """ResNet-18 backbone with a 7012-way classification head.

    Each crop contains exactly one character, so the head produces
    1 * 7012 logits — one score per character in the mapping file.
    """

    NUM_CLASSES = 7012  # number of characters in mapping.txt

    def __init__(self):
        super(MyResNet18, self).__init__()
        self.resnet18 = models.resnet18(num_classes=1 * self.NUM_CLASSES)

    def forward(self, x):
        return self.resnet18(x)
- 自定义数据集
import os
import torch
from PIL import Image
from torch.utils.data import Dataset
from tqdm import tqdm
import numpy as np


class CharsDataset(Dataset):
    """Dataset of single-character captcha crops.

    Every file under *root* is named ``<label>_<timestamp>.png``; the
    label characters are mapped to class indices via ``mapping.txt``.
    """

    def __init__(self, root: str, transforms=None):
        super(CharsDataset, self).__init__()
        self.path = root              # folder holding the crops
        self.transforms = transforms  # normalisation pipeline (optional)
        with open('mapping.txt', 'r', encoding='utf-8')as f:
            data = f.read()  # the known characters, one long string
        self.mapping = [i for i in data]  # char -> class index lookup
        # Cache the file listing once up front.
        self.pic_path = self.get_image_path()

    def get_image_path(self):
        """Return the names of all image files under self.path."""
        img = list(os.walk(self.path))[0][2]
        return img

    def __len__(self):
        # Bug fix: the original re-walked the directory on every call,
        # which DataLoader invokes constantly; use the cached listing.
        return len(self.pic_path)

    def __getitem__(self, item):
        """Return (image tensor, label tensor) for the item-th file."""
        image_path = self.pic_path
        images = Image.open(self.path + '/' + image_path[item])
        if self.transforms:
            images = self.transforms(images)
        # The label chars are the filename prefix before the first '_'.
        labels = [self.mapping.index(i) for i in image_path[item].split('_')[0]]
        labels = torch.as_tensor(labels, dtype=torch.int64)
        return images, labels

    def slice(self, start, end):
        """Load images[start:end] into one tensor (used to compute the
        dataset's normalisation mean/std)."""
        load_image = self.pic_path
        images = []
        for i in tqdm(range(start, end)):
            image = Image.open(self.path + '/' + load_image[i])
            if self.transforms:
                image = self.transforms(image)
            images.append(image.numpy())
        images = np.array(images)
        images = torch.Tensor(images)
        return images
- 训练部分代码:
- 优化器选择:自适应优化器
- 损失函数选择:交叉熵损失
import torch
from torch import optim,nn
import os
import numpy as np
from torch import save, load
from torch.utils.data import DataLoader
from torchvision import transforms
from tqdm import tqdm
from MyModels import MyResNet18
from MyDataset import CharsDataset

# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model, moved onto the chosen device.
model = MyResNet18()
model = model.to(device)
# Adam adapts per-parameter learning rates, so no explicit lr is set.
optimizer = optim.Adam(model.parameters())
# Single-label multi-class problem -> cross-entropy loss.
loss_func = nn.CrossEntropyLoss()

epoch = 0
# Resume from the last checkpoint when one exists.
if os.path.exists('./models/checkpoint.pth'):
    checkpoint = load('./models/checkpoint.pth')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']

# Normalise with the dataset's measured mean/std.
my_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.8988, 0.8988, 0.8988), std=(0.2377, 0.2377, 0.2377))
])

# Data source / labels.
Mydataset = CharsDataset(root='./train_png', transforms=my_transforms)


def train_mnist(i):
    """Run one training epoch and save a resumable checkpoint."""
    losses = []
    batch_size = 16
    loader = DataLoader(Mydataset, batch_size=batch_size, shuffle=True, drop_last=True)
    loader = tqdm(loader, total=len(loader))
    model.train()
    for images, labels in loader:
        images = images.to(device)          # move batch to the device
        labels = labels.to(device)
        optimizer.zero_grad()               # reset accumulated gradients
        output = model(images)              # forward pass
        # One character per image: (batch, 7012) logits vs flat labels.
        output = output.view(batch_size * 1, 7012)
        labels = labels.view(-1)
        loss = loss_func(output, labels)
        losses.append(loss.item())
        # Show the running mean loss to judge convergence.
        loader.set_description('loss: {} '.format(np.mean(losses)))
        loss.backward()                     # backward pass
        optimizer.step()                    # parameter update
    # Checkpoint so an interrupted run can pick up where it stopped.
    checkpoint = {
        'epoch': i,  # probably optional, kept for completeness
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    save(checkpoint, './models/checkpoint.pth')  # persist the model
- 测试和推理就不一一展示了,下面开始搭建接口
接口搭建
这里需要注意的就两个点
- 模型是GPU训练出来的,加载的时候要映射到CPU上
- 每次识别完后需要删掉图片,避免下次识别的时候出现污染
from flask import Flask
import torch
from test_txt.Mymodels import MyResNet18
from torchvision import transforms
import os
from PIL import Image
import shutil

app = Flask(__name__)


class ModelsAnlan:
    """Loads the trained model ONCE and serves predictions over HTTP,
    so the spider never pays the model-loading cost per request."""

    def __init__(self):
        self.model = MyResNet18()
        # The checkpoint was produced on GPU; map it onto the CPU here.
        # The optimizer state is not needed for inference.
        if os.path.exists('./models/checkpoint.pth'):
            checkpoint = torch.load('./models/checkpoint.pth', map_location='cpu')
            self.model.load_state_dict(checkpoint['model_state_dict'])
        # Same normalisation as used during training.
        self.my_transforms = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.8988, 0.8988, 0.8988), std=(0.2377, 0.2377, 0.2377))
        ])
        self.model.eval()  # inference mode
        with open('mapping.txt', 'r', encoding='utf-8')as f:
            data = f.read()
        self.mapping = [i for i in data]  # class index -> character

    def test_models(self):
        """Classify every crop in ./inferring_png and return the characters.

        The spider names the crops ``<grid-index>_<timestamp>.png``, so
        sorting the file names keeps the results in grid order — the
        original relied on os.walk's directory order, which is unspecified.
        """
        try:
            img = sorted(list(os.walk('inferring_png'))[0][2])
            a = []
            for x in img:
                with torch.no_grad():
                    images = self.my_transforms(Image.open('./inferring_png/{}'.format(x)))
                    images = images.view(1, 3, 100, 100)  # add the batch dim
                    output = self.model(images)
                    output = output.view(1, 7012)         # flatten logits
                    output = output.max(dim=1).indices    # best class index
                    output = output.numpy()
                    output = ''.join([self.mapping[i] for i in output])  # decode
                    a.append(output)
            # Remove and recreate the folder so stale crops never pollute
            # the next request.
            shutil.rmtree('inferring_png')
            os.makedirs('inferring_png')
            return a  # the data returned to the spider
        except Exception as e:
            # Bug fix: the original fell through and returned None, which
            # made Flask raise "view did not return a valid response".
            # Return an empty list so the spider can simply retry.
            print(e)
            return []


@app.route('/')
def index():
    # Every hit on the root URL runs one inference pass.
    return link.test_models()


if __name__ == '__main__':
    link = ModelsAnlan()  # load the model once at startup
    app.run(host='127.0.0.1',port=5000)
爬虫业务代码
这里唯一要注意的就是识别错误的问题,模型它也不是百分百能识别出来的,要让程序把出错的地方重新走一遍,具体请看total()函数,直到不会出错为止。当然你的模型识别准确率也要高,因为是拆分开每个字训练的,如果错了一个,那其他八个识别正确了也没用。
import requests
from lxml import etree
import base64
import random
import time
from PIL import Image
import os
import json
import shutil
from collections import Counter


class Test:
    """End-to-end solver for problem 8.

    download_img():    fetch one captcha image + its prompt text
    preprocess_img():  clean the image and cut it into 9 crops
    click_order():     ask the local model server for the click sequence
    total():           solve one page, retrying until it succeeds
    submit():          pick the most frequent value and submit it
    run():             drive the whole flow over all 5 pages
    """

    def __init__(self):
        self.cookies = {
            'sessionid': 'ql0r9626qj4y0slo1iybbwnif369fl4k'
        }
        # Click coordinate code for each of the 9 grid cells, in grid order.
        self.mapping = [155, 165, 175, 455, 465, 475, 755, 765, 745]
        self.l = []  # distinct values seen so far
        self.d = {}  # value -> occurrence count
        self.submit_url = 'https://match.yuanrenxue.cn/api/answer'  # captcha-answer endpoint
        self.up_url = 'https://match.yuanrenxue.cn/api/match/8'     # data endpoint
        self.num_list = []  # values collected across the 5 pages

    def download_img(self):
        """Fetch one captcha, save its image to ./pic, and return the
        prompt characters the site asks us to click."""
        url = 'https://match.yuanrenxue.cn/api/match/8_verify'
        req = requests.get(url=url, cookies=self.cookies).text
        element_lxml = etree.HTML(json.loads(req)['html'])
        txt = ''
        for i in element_lxml:
            txt = i.xpath('./div/p/text()')
            img_url = i.xpath('./img/@src')[0]
            # The image is embedded as a base64 data URI.
            img_data = img_url.replace('data:image/jpeg;base64,', '')
            with open('./pic/pic.png', 'wb') as f:
                f.write(base64.b64decode(img_data))
        return txt  # the characters the site tells us to click

    def preprocess_img(self):
        """Remove the background and noise colours, then cut the captcha
        into nine crops named ``<grid-index>_<timestamp>.png``."""
        png_list = list(os.walk('./pic'))[0][-1]
        for png in png_list:
            img = Image.open('./pic/{}'.format(png))
            pixdata = img.load()
            w, h = img.size
            # The two most frequent colours are the background and the
            # noise dots — repaint both white.  NOTE: PIL indexes pixels
            # as [x, y] with x < width; the original swapped the axes,
            # which only worked because the image is square.
            counts = Counter(pixdata[x, y] for x in range(w) for y in range(h))
            for color, _ in counts.most_common(2):
                for x in range(w):
                    for y in range(h):
                        if pixdata[x, y] == color:
                            pixdata[x, y] = (255, 255, 255)
            index = 0
            for i in range(3):
                for j in range(3):
                    image_cut = img.crop(([0, 100, 200][j], [0, 100, 200][i], [100, 200, 300][j], [100, 200, 300][i]))
                    # Timestamp + random offset avoids overwriting files.
                    save_time = int(''.join(str(time.time()).split('.'))) + int(random.randint(1, 50000))
                    image_cut.save(r'D:\inferring_png\{}_{}.png'.format(index, save_time), 'PNG')
                    index += 1
        # Empty ./pic so the next captcha starts clean.
        shutil.rmtree('pic')
        os.makedirs('pic')

    def click_order(self, txt_list, i):
        """Query the local model server and build the ``coord|coord|...``
        answer string for page *i*."""
        res = requests.get('http://127.0.0.1:5000')
        data_list = json.loads(res.text)
        # Grid position of every prompted character.
        clicks = [data_list.index(ch) for ch in txt_list]
        click = ''
        # Bug fix: the original reused `i` as the loop variable, which
        # shadowed the page number in the print below.
        for pos in clicks:
            click += str(self.mapping[pos]) + '|'
        print(f'第{i}页点击顺序:', clicks, click)
        return click

    def total(self, i):
        """Solve page *i*; on any failure (the model is not 100% accurate)
        retry the page until it succeeds, then return the accumulated
        value list."""
        try:
            txt_list = self.download_img()
            self.preprocess_img()
            click = self.click_order(txt_list,i)
            params = {
                'page': i,
                'answer': click
            }
            result = requests.get(url=self.up_url, params=params, cookies=self.cookies).text
            if result:
                for j in json.loads(result)['data']:
                    self.num_list.append(int(j['value']))
        except Exception as e:
            print(f'第{i}页异常', e)
            self.total(i)  # one wrong character spoils the page — redo it
        return self.num_list

    def submit(self, data):
        """Find the most frequent value in *data* and submit it."""
        for num in data:
            if num not in self.l:
                self.l.append(num)
                self.d[num] = 1
            else:
                self.d[num] += 1
        # Bug fix: the original rescanned d.items() against max(d.values())
        # for every entry (quadratic, printed once per tie); selecting the
        # max key directly is equivalent and clearer.
        key = max(self.d, key=self.d.get)
        print('出现频率最高的值是:', key, '出现次数:', self.d[key])
        params = {
            'answer': key,
            'id': 8
        }
        req = requests.get(url=self.submit_url, cookies=self.cookies, params=params).json()
        print(req)  # state == 'success' means the challenge is solved

    def run(self):
        """Process pages 1-5, then submit the most frequent value."""
        data = ''
        for i in range(1, 6):  # pages 1 through 5
            data = self.total(i)
        print(data)
        self.submit(data)
if __name__ == '__main__':
    # Run the full five-page flow and submit the answer.
    spider = Test()
    spider.run()
至此结束,谢谢收看!