简介
最近刚好学习了pytorch,于是想写写项目,而这个项目的最开始是使用keras
和TensorFlow
,我在这篇blog中使用pytorch
。
项目原文链接 Emojify - Create your own emoji with Deep Learning
数据集与代码
下面是百度网盘分享的数据集(FER2013,在data文件夹中)和代码(moduel.py为神经网络模型,model.ipynb用于训练并保存模型, main.py使用创建窗口,利用模型进行表情识别),emojis文件夹是显示映射后的表情的图片
当然原项目链接也有数据集及代码链接(keras版本)
简单解释代码
model.ipynb
使用jupyter notebook
打开
1. 加载数据集
-
train_path与test_path需要按照自己的数据集路径设置(如果不知道工作路径,可以使用绝对路径)
-
transforms_train的第2,3行用于数据增强
-
transforms.RandomHorizontalFlip()效果如下:
# Build the datasets and data loaders.
train_path = "./data/train"
test_path = "./data/test"

# Training transforms: grayscale plus light augmentation.
transforms_train = transforms.Compose([
    transforms.Grayscale(),                # ImageFolder expands to 3 channels by default; collapse back to 1
    transforms.RandomHorizontalFlip(),     # random horizontal flip (augmentation)
    transforms.ColorJitter(brightness=0.5, contrast=0.5),  # random brightness / contrast jitter
    transforms.ToTensor(),
])

# Test transforms: no augmentation, just grayscale + tensor conversion.
transforms_test = transforms.Compose([
    transforms.Grayscale(),
    transforms.ToTensor(),
])

data_train = torchvision.datasets.ImageFolder(root=train_path, transform=transforms_train)
data_test = torchvision.datasets.ImageFolder(root=test_path, transform=transforms_test)
dataload_train = DataLoader(data_train, batch_size=BATCH_SZIE, shuffle=True)
dataload_test = DataLoader(data_test, batch_size=BATCH_SZIE, shuffle=True)
2. 网络模型
下面代码中将展示每个阶段的数据的形状
(1, 48, 48):1代表是单通道因为是灰度图片,(48,48)代表图片的大小
nn.Softmax(dim = 1)需要设置参数dim,否则会有warning。这里输出形状为(batch, 7),dim=1表示沿类别维度计算softmax,使每个样本的7个类别概率之和为1;dim=0则是沿batch维度计算(错误的)。你可以通过打印计算结果查看是否正确。
在jupyter notebook可以查看打印结果:
# Smoke-test the network with a dummy all-ones batch.
module_test_data = torch.ones(6, 1, 48, 48)
output = face_recognition_module(module_test_data)
output  # displayed as the notebook cell's result
# Build the model.
class t716(nn.Module):
    """CNN emotion classifier for 48x48 single-channel face crops.

    Output: raw logits of shape (batch, 7), one score per emotion class.

    BUG FIX: the original network ended with nn.Softmax(dim=1), but the
    training loop uses nn.CrossEntropyLoss, which already applies
    log-softmax internally. Feeding it probabilities computes
    log(softmax(softmax(x))) and flattens the gradients. The Softmax
    layer is removed (it holds no parameters, so saved state_dicts still
    load unchanged); apply torch.softmax(out, dim=1) externally if
    probabilities are needed — argmax over logits yields the same
    predicted class.
    """

    def __init__(self):
        super().__init__()
        self.module = Sequential(
            # input: (1, 48, 48)
            Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1),
            # -> (32, 46, 46)
            nn.ReLU(True),
            Conv2d(32, 64, 3),
            # -> (64, 44, 44)
            nn.ReLU(),
            MaxPool2d(2),
            # -> (64, 22, 22)
            Dropout(0.25),
            Conv2d(64, 128, 3),
            # -> (128, 20, 20)
            nn.ReLU(),
            MaxPool2d(2),
            # -> (128, 10, 10)
            Conv2d(128, 128, 3),
            # -> (128, 8, 8)
            nn.ReLU(),
            MaxPool2d(2),
            # -> (128, 4, 4)
            Dropout(0.25),
            Flatten(),
            # -> (128 * 4 * 4 = 2048,)
            Linear(2048, 1024),
            nn.ReLU(),
            Dropout(0.5),
            Linear(1024, 7),
        )

    def forward(self, data):
        """Forward pass: (batch, 1, 48, 48) -> (batch, 7) logits."""
        return self.module(data)
3. 模型训练
我设置的部分常数是这样的
# Training hyper-parameters.
BATCH_SZIE = 6   # batch size (name kept as-is; it is referenced elsewhere in the file)
LR = 0.0001      # learning rate for Adam
EPOCH = 50       # number of passes over the training set
由于使用的是cpu训练,在这个参数设置下,一共训练了大概20个小时。(如果有独显,可以使用gpu训练)
main.py中的一些设置
main.py主要是利用cv2
与tkinter
,实现一个GUI界面,利用模型进行识别,不讲解代码,但是有一些设置需要解释及本人遇到的问题需要解释。
结果展示:
红色框是logo.png加载的位置。
问题1
show_vid()函数中的cascade_path为自己下载的文件haarcascade_frontalface_default.xml的路径,你可以根据下面下载链接下载,随便存储在某个文件夹。
问题2
show_vid()的lmain.after(300, show_vid)与show_vid2的lmain2.after(300, show_vid2)的第一个参数(单位为毫秒)如果填得比较小,那么窗口会打不开;太大则打开的窗口延迟比较高。由于没有学过opencv,所以也不知道是为什么。
问题3
main函数中加载的logo.png可以自己随便设置一张。
源码
训练与保存模型部分
# 导入相关库
import numpy as np
import torch
import torchvision
import matplotlib.pyplot as plt
import torch.nn as nn
from torch.nn import Conv2d, Sequential, MaxPool2d, Flatten, Linear, Dropout, Softmax
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
# Training hyper-parameters.
BATCH_SZIE = 6   # batch size (name kept as-is; it is referenced elsewhere in the file)
LR = 0.0001      # learning rate for Adam
EPOCH = 50       # number of passes over the training set
# Build the datasets and data loaders.
train_path = "./data/train"
test_path = "./data/test"

# Training transforms: grayscale plus light augmentation.
transforms_train = transforms.Compose([
    transforms.Grayscale(),                # ImageFolder expands to 3 channels by default; collapse back to 1
    transforms.RandomHorizontalFlip(),     # random horizontal flip (augmentation)
    transforms.ColorJitter(brightness=0.5, contrast=0.5),  # random brightness / contrast jitter
    transforms.ToTensor(),
])

# Test transforms: no augmentation, just grayscale + tensor conversion.
transforms_test = transforms.Compose([
    transforms.Grayscale(),
    transforms.ToTensor(),
])

data_train = torchvision.datasets.ImageFolder(root=train_path, transform=transforms_train)
data_test = torchvision.datasets.ImageFolder(root=test_path, transform=transforms_test)
dataload_train = DataLoader(data_train, batch_size=BATCH_SZIE, shuffle=True)
dataload_test = DataLoader(data_test, batch_size=BATCH_SZIE, shuffle=True)
# Peek at a single batch of the training data.
image, target = next(iter(dataload_train))
print(f"图片形状: {image.shape}")
print(f"类别:{data_train.classes}\n取出的为: {target}")

# imshow() only accepts 2-D (H, W) arrays for grayscale images, so the
# channel dimension is dropped via reshape before plotting.
for idx in range(6):
    plt.subplot(4, 4, idx + 1)
    plt.imshow(image[idx].reshape((48, 48)), cmap='grey')
    plt.axis('off')
plt.show()
# Build the model.
class t716(nn.Module):
    """CNN emotion classifier for 48x48 single-channel face crops.

    Output: raw logits of shape (batch, 7), one score per emotion class.

    BUG FIX: the original network ended with nn.Softmax(dim=1), but the
    training loop uses nn.CrossEntropyLoss, which already applies
    log-softmax internally. Feeding it probabilities computes
    log(softmax(softmax(x))) and flattens the gradients. The Softmax
    layer is removed (it holds no parameters, so saved state_dicts still
    load unchanged); apply torch.softmax(out, dim=1) externally if
    probabilities are needed — argmax over logits yields the same
    predicted class.
    """

    def __init__(self):
        super().__init__()
        self.module = Sequential(
            # input: (1, 48, 48)
            Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1),
            # -> (32, 46, 46)
            nn.ReLU(True),
            Conv2d(32, 64, 3),
            # -> (64, 44, 44)
            nn.ReLU(),
            MaxPool2d(2),
            # -> (64, 22, 22)
            Dropout(0.25),
            Conv2d(64, 128, 3),
            # -> (128, 20, 20)
            nn.ReLU(),
            MaxPool2d(2),
            # -> (128, 10, 10)
            Conv2d(128, 128, 3),
            # -> (128, 8, 8)
            nn.ReLU(),
            MaxPool2d(2),
            # -> (128, 4, 4)
            Dropout(0.25),
            Flatten(),
            # -> (128 * 4 * 4 = 2048,)
            Linear(2048, 1024),
            nn.ReLU(),
            Dropout(0.5),
            Linear(1024, 7),
        )

    def forward(self, data):
        """Forward pass: (batch, 1, 48, 48) -> (batch, 7) logits."""
        return self.module(data)
# Instantiate the model, the loss criterion and the optimizer.
face_recognition_module = t716()
loss = nn.CrossEntropyLoss()
optim = torch.optim.Adam(face_recognition_module.parameters(), lr=LR)

# Smoke-test the network with a dummy all-ones batch.
module_test_data = torch.ones(6, 1, 48, 48)
output = face_recognition_module(module_test_data)
output  # displayed as the notebook cell's result
# Train the model.
face_recognition_module.train()
iterator = 0  # global step counter across all epochs
for epoch in range(EPOCH):
    print(f'-------第{epoch}轮训练-------')
    for images, target in dataload_train:
        iterator += 1
        output = face_recognition_module(images)
        loss_output = loss(output, target)
        # Standard step: clear gradients, backprop, update weights.
        optim.zero_grad()
        loss_output.backward()
        optim.step()
        # Log the running loss every 500 steps.
        if iterator % 500 == 0:
            print(f"第{iterator}次的损失为: {loss_output}")
# Evaluate accuracy on the held-out test set.
face_recognition_module.eval()
total_right = 0
with torch.no_grad():
    for images, targets in dataload_test:
        output = face_recognition_module(images)
        # Count predictions whose argmax matches the label.
        total_right += (output.argmax(1) == targets).sum()
accuracy = total_right / len(data_test)
print(accuracy)
# Persist the trained weights (state_dict only, not the whole module).
import os
# BUG FIX: torch.save raises FileNotFoundError when the target directory
# does not exist (e.g. on a fresh checkout); create it first.
os.makedirs('./modules', exist_ok=True)
torch.save(face_recognition_module.state_dict(), './modules/face_recognition_module_716.pth')
GUI的代码
# 导入相关的库
import cv2
import torch
import numpy as np
import tkinter as tk
import time
from tkinter import *
from PIL import Image, ImageTk
from moduel import t716
# Load the trained weights for inference.
# map_location='cpu' lets a GPU-saved checkpoint load on this CPU-only setup.
par = torch.load('./modules/face_recognition_module_716.pth', map_location='cpu')
face_recognition_module = t716()
face_recognition_module.load_state_dict(par)
# BUG FIX: switch to eval mode — without this, the Dropout layers stay
# active during GUI inference and predictions become nondeterministic.
face_recognition_module.eval()
print(face_recognition_module)

cv2.ocl.setUseOpenCL(False)  # force OpenCV to run on the CPU

# Class index -> emotion label / emoji image path.
emotion_dict = {0: "Angry", 1: "Disgusted", 2: "Fearful", 3: "Happy", 4: "Neutral", 5: "Sad", 6: "Surprised"}
emoji_dist={0:"./emojis/angry.png",1:"./emojis/disgusted.png",2:"./emojis/fearful.png",3:"./emojis/happy.png",
4:"./emojis/neutral.png",5:"./emojis/sad.png",6:"./emojis/surpriced.png"}

global last_frame1
last_frame1 = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder frame before the camera starts
global cap1
show_text=[0]  # single-element list so show_vid/show_vid2 share the latest class index
def show_vid():
    """Grab one webcam frame, detect faces, classify each face's emotion,
    draw the results and display the frame on `lmain`.

    Re-schedules itself via Tk's `after`, acting as the video loop.
    """
    global cap1, last_frame1, lmain
    cascade_path = "D:/APP2/anaconda/anaconda/envs/pytorch/Lib/site-packages/cv2/data/haarcascade_frontalface_default.xml"
    # BUG FIX: open the camera once and reuse the handle. The original
    # code re-opened VideoCapture(0) on every 300 ms tick, which is very
    # slow and leaks capture handles.
    if globals().get('cap1') is None:
        cap1 = cv2.VideoCapture(0)
    if not cap1.isOpened():
        print("cant open the camera1")
    flag1, frame1 = cap1.read()
    if flag1 is None:
        print ("Major error!")
    elif flag1:
        # BUG FIX: only process the frame after a successful read; the
        # original resized/detected before checking the flag and crashed
        # on a failed capture (frame1 is None).
        frame1 = cv2.resize(frame1, (600, 500))
        bounding_box = cv2.CascadeClassifier(cascade_path)
        gray_frame = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)  # cascade + CNN both expect grayscale
        num_faces = bounding_box.detectMultiScale(gray_frame, scaleFactor=1.3, minNeighbors=5)
        for (x, y, w, h) in num_faces:
            cv2.rectangle(frame1, (x, y-50), (x+w, y+h+10), (255, 0, 0), 2)
            roi_gray_frame = gray_frame[y:y + h, x:x + w]
            # (48, 48) -> (1, 1, 48, 48): add batch and channel dimensions.
            cropped_img = np.expand_dims(np.expand_dims(cv2.resize(roi_gray_frame, (48, 48)), 0), 0)
            # BUG FIX: scale to [0, 1] — training used transforms.ToTensor(),
            # which normalizes pixel values, so raw 0-255 floats mismatch.
            cropped_img = torch.tensor(cropped_img).to(torch.float32) / 255.0
            with torch.no_grad():
                prediction = face_recognition_module(cropped_img)
            maxindex = int(np.argmax(prediction))
            cv2.putText(frame1, emotion_dict[maxindex], (x+20, y-60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            show_text[0] = maxindex
        # Push the annotated frame (BGR -> RGB) into the Tk label and
        # keep a reference so the PhotoImage isn't garbage-collected.
        last_frame1 = frame1.copy()
        pic = cv2.cvtColor(last_frame1, cv2.COLOR_BGR2RGB)
        img = Image.fromarray(pic)
        imgtk = ImageTk.PhotoImage(image=img)
        lmain.imgtk = imgtk
        lmain.configure(image=imgtk)
        lmain.after(300, show_vid)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        exit()
    return
def show_vid2():
    """Display the emoji and label matching the last detected emotion
    (shared via show_text[0]) on `lmain2`/`lmain3`, then re-schedule.
    """
    global lmain2, lmain3
    frame2 = cv2.imread(emoji_dist[show_text[0]])
    # BUG FIX: build the PIL image from the BGR->RGB converted frame.
    # The original computed pic2 but then passed the raw BGR frame2 to
    # Image.fromarray, so the emoji colors were channel-swapped.
    pic2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2RGB)
    img2 = Image.fromarray(pic2)
    imgtk2 = ImageTk.PhotoImage(image=img2)
    lmain2.imgtk2 = imgtk2  # keep a reference so Tk doesn't GC the image
    lmain3.configure(text=emotion_dict[show_text[0]], font=('arial', 45, 'bold'))
    lmain2.configure(image=imgtk2)
    lmain2.after(300, show_vid2)
if __name__ == '__main__':
    # Build the Tk window: camera feed on the left, emoji on the right.
    root = tk.Tk()
    root.title("Photo To Emoji")
    root.geometry("1400x900+100+10")
    root['bg'] = 'black'

    # Header: logo image plus the app title.
    img = ImageTk.PhotoImage(Image.open("logo.png"))
    heading = Label(root, image=img, bg='black')
    heading.pack()
    heading2 = Label(root, text="Photo to Emoji", pady=20, font=('arial', 45, 'bold'), bg='black', fg='#CDCDCD')
    heading2.pack()

    # lmain: camera frame, lmain2: emoji image, lmain3: emotion text.
    lmain = tk.Label(master=root, padx=50, bd=10)
    lmain2 = tk.Label(master=root, bd=10)
    lmain3 = tk.Label(master=root, bd=10, fg="#CDCDCD", bg='black')
    lmain.place(x=50, y=250)
    lmain3.place(x=960, y=250)
    lmain2.place(x=900, y=350)

    exitbutton = Button(root, text='Quit', fg="red", command=root.destroy, font=('arial', 25, 'bold'))
    exitbutton.pack(side=BOTTOM)

    # Kick off both update loops, then hand control to Tk.
    show_vid()
    show_vid2()
    root.mainloop()