项目进度:
对考生答案先分句、再分词,将每个分句内的词向量取平均得到句向量,并把各句向量拼接成二维矩阵;下一步准备对该矩阵做卷积处理,以提取考生答案的特征向量。
数据处理过程:
import pandas as pd
import jieba
import torch
import numpy as np
# Load the cleaned answer sheet (GBK-encoded CSV), coerce every column to
# str, and discard the 'id' column.
df = (
    pd.read_csv('clean_input.csv', encoding='GBK')
    .astype(str)
    .drop(columns=['id'])
)
x = df['answer'].values  # answer text, one entry per row
y = df['1p'].values  # values of the '1p' column (presumably the score -- TODO confirm)
# Tokenise every answer with jieba and collect all tokens. Iterating over x
# directly (instead of a fixed row count) keeps this working for any number
# of rows in the input file.
ysk = []
for answer in x:
    ysk.extend(jieba.cut(answer))
yskl = set(ysk)
print(len(yskl))
print(yskl)

# yskl is the full vocabulary. Indices are assigned over the sorted words so
# the word -> index mapping is the same on every run (iteration order of a
# set of strings can differ between interpreter runs).
vocab = set(yskl)
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}
print(word_to_ix)

# One 30-dimensional vector per vocabulary word. The table is sized from the
# vocabulary so that every index stored in word_to_ix is a valid row.
embedding = torch.nn.Embedding(len(word_to_ix), 30)

# Smoke test: look up the embedding of a single word. Raises KeyError if
# '李某' does not occur in the vocabulary.
hello_idx = torch.LongTensor([word_to_ix['李某']])
hello_embed = embedding(hello_idx)
# Split each of the first 10 answers into clauses on ','. Slicing (rather
# than indexing 0..9) also works when fewer than 10 answers are available.
res = [answer.split(',') for answer in x[:10]]

# all[i][j] holds the mean word embedding of clause j of answer i, stored as
# a nested 1 x embedding_dim list.
# NOTE: the name `all` shadows the builtin; it is kept so that any later code
# referring to this variable keeps working.
all = []
embed_dim = embedding.embedding_dim
for i, clauses in enumerate(res):
    print(i)
    ssml = []
    for clause in clauses:
        # Collect the vocabulary index of every token in the clause. Tokens
        # missing from word_to_ix are skipped instead of raising KeyError.
        ids = [word_to_ix[w] for w in jieba.cut(clause) if w in word_to_ix]
        if ids:
            # Embed all tokens of the clause at once, then average over the
            # whole clause. Accumulate in float64 and keep the leading axis
            # so the result has shape (1, embed_dim).
            vectors = embedding(torch.LongTensor(ids)).detach().cpu().numpy()
            sml = vectors.sum(axis=0, keepdims=True, dtype=np.float64) / len(ids)
        else:
            # Empty clause (or no known token): use a zero vector rather
            # than dividing by zero.
            sml = np.zeros((1, embed_dim))
        ssml.append(sml.tolist())
    all.append(ssml)
print(all)
定义卷积层(代码中实际使用的是一维卷积 Conv1d)、池化层和全连接层:
# 1-D convolution over the sentence axis: 30 input channels (the embedding
# size), 10 output channels, window of 3 positions.
conv1 = torch.nn.Conv1d(in_channels=30, out_channels=10, kernel_size=3)

# `temp` is defined outside this section; assumed to be shaped
# batch_size x max_sent_len x embedding_size -- TODO confirm.
# Convert it to float32 so its dtype matches the convolution weights.
# NOTE: the name `input` shadows the builtin; kept for compatibility with any
# later code that refers to it.
input = torch.as_tensor(temp, dtype=torch.float32)
# batch_size x max_sent_len x embedding_size -> batch_size x embedding_size x max_sent_len
# (permute the tensor that was just built, not the raw `temp` object)
input = input.permute(0, 2, 1)
output = conv1(input)
print("output:", output.size())

# Max pooling
pool1d = torch.nn.MaxPool1d(kernel_size=6)
pool1d_value = pool1d(output)
print("最大池化输出:", pool1d_value.size())

# Fully connected layer
fc = torch.nn.Linear(in_features=10, out_features=3)
# NOTE(review): this flattening only keeps one row per sample when the pooled
# length is 1; with a longer pooled axis channels and positions get mixed.
fc_inp = pool1d_value.view(-1, pool1d_value.size(1))
print("全连接输入:", fc_inp.size())
fc_outp = fc(fc_inp)
print("全连接输出:", fc_outp.size())

# Softmax over the class axis; dim is given explicitly because relying on
# the implicit dimension is deprecated.
m = torch.nn.Softmax(dim=1)
out = m(fc_outp)
print("输出结果值:", out)
输出结果示例: