Text Sentiment Analysis
This example performs sentiment analysis on the IMDB dataset: for each review in the dataset it predicts the sentiment and outputs its polarity score.
1. Loading the dataset
import os
import re

import torch
from torch.utils.data import Dataset, DataLoader

def tokenize(content):
    # Strip HTML tags such as <br />
    content = re.sub("<.*?>", " ", content)
    # Replace control characters and basic punctuation with spaces
    filters = ['\t', '\n', '\x96', '\x97', r'\.', r'\,', r'\(', r'\)']
    content = re.sub("|".join(filters), " ", content)
    tokens = [i.strip().lower() for i in content.split()]
    return tokens
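# For example (an illustrative call, not from the original post):
#   tokenize("I loved it.<br />Great film!")
#   -> ['i', 'loved', 'it', 'great', 'film!']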
class ImdbDataset(Dataset):
    def __init__(self, train=True):
        self.traindataPath = "/tmp/flying/pycharm_demo1/文本情感分类/aclImdb_v1/aclImdb/train"
        self.testdataPath = "/tmp/flying/pycharm_demo1/文本情感分类/aclImdb_v1/aclImdb/test"
        self.dataPath = self.traindataPath if train else self.testdataPath
        # Collect the pos and neg sub-directories
        pndata_path = [os.path.join(self.dataPath, "pos"), os.path.join(self.dataPath, "neg")]
        self.all_file_path = []
        for path in pndata_path:
            filename_list = os.listdir(path)
            filepath_list = [os.path.join(path, i) for i in filename_list if i.endswith('.txt')]
            self.all_file_path.extend(filepath_list)

    def __getitem__(self, index):
        file_path = self.all_file_path[index]
        label = file_path.split("/")[-2]
        label = 0 if label == "neg" else 1  # label 0 means negative, 1 means positive
        # Read the review text
        with open(file_path, encoding="utf-8") as f:
            content = f.read()
        return tokenize(content), label

    def __len__(self):
        return len(self.all_file_path)

def get_dataloader(train=True):
    imdb = ImdbDataset(train)
    data_loader = DataLoader(imdb, batch_size=128, shuffle=True, collate_fn=collate_fn)
    return data_loader

def collate_fn(batch):
    # batch looks like [(tokens_1, label_1), ..., (tokens_B, label_B)]:
    # batch[i][0] is a token list and batch[i][1] is its label.
    # NOTE: `ws` (a fitted word2sequence, see section 2) and `max_len` must be
    # defined as globals before the DataLoader is iterated.
    content, label = list(zip(*batch))
    content = [ws.transform(i, maxlen=max_len) for i in content]  # tokens -> integer id sequence
    content = torch.LongTensor(content)  # turn the contents into a tensor
    label = torch.LongTensor(label)
    return content, label

if __name__ == '__main__':
    loader = get_dataloader()
    for idx, (input, target) in enumerate(loader):
        print(idx)
        print(input)
        print(target)
        break
Output: for the first batch the script prints the batch index, a [128, max_len] LongTensor of token ids, and a LongTensor of the 128 labels.
Note!!!
If you keep the DataLoader's default collate_fn, you may get RuntimeError: each element in list of batch should be of equal size; you must override collate_fn (or change the data type of the lists) to resolve it.
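The error comes from PyTorch's default collate, which tries to stack the samples element by element, so token lists of different lengths cannot be batched. A minimal sketch of the idea behind the workaround, assuming you just want the raw lists back (collate_raw is an illustrative name, not from the post):

import torch

def collate_raw(batch):
    # Keep the variable-length token lists as plain Python lists and only
    # tensorize the labels; padding/encoding can happen in a later step.
    tokens, labels = zip(*batch)
    return list(tokens), torch.LongTensor(labels)

The collate_fn above goes one step further and pads/truncates every review to max_len via ws.transform, which is why the whole batch can be stacked into a single LongTensor.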
2. The word2sequence class
The word2sequence class converts the words of a sentence into an integer sequence and records how many times each word occurs (a hand-written tokenizer).
class word2sequence:
    UNK_TAG = "UNK"  # token for out-of-vocabulary words
    PAD_TAG = "PAD"  # token used to pad short sentences
    UNK = 0
    PAD = 1

    def __init__(self):
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        self.count = {}

    def fit(self, sentence):
        '''
        Record the words of one sentence in the frequency table.
        :param sentence: list of tokens
        :return:
        '''
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_dict(self, min=1, max=None, max_account=None):
        '''
        Build the vocabulary.
        :param min: minimum word frequency to keep
        :param max: maximum word frequency to keep
        :param max_account: maximum number of words to keep
        :return:
        '''
        if min is not None:
            self.count = {word: value for (word, value) in self.count.items() if value >= min}
        if max is not None:
            self.count = {word: value for (word, value) in self.count.items() if value <= max}
        if max_account is not None:
            temp = sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_account]
            self.count = {i0: i1 for i0, i1 in temp}
        # Assign ids: {UNK: 0, PAD: 1, "i": 2, ...}
        for word in self.count:
            self.dict[word] = len(self.dict)
        # Build the reverse mapping id -> word
        self.inverse_dict = {index: word for word, index in self.dict.items()}

    def transform(self, sentence, maxlen=None):
        '''
        Convert a sentence into an id sequence, truncating or padding to maxlen.
        :param sentence: list of tokens
        :return:
        '''
        if maxlen is not None:
            if maxlen < len(sentence):
                sentence = sentence[:maxlen]
            else:
                sentence = sentence + [self.PAD_TAG] * (maxlen - len(sentence))
        return [self.dict.get(word, self.UNK) for word in sentence]

    def inverse_transform(self, sequence):
        '''
        Convert an id sequence back into words.
        :param sequence: list of integer ids
        :return:
        '''
        return [self.inverse_dict.get(seq) for seq in sequence]

    def __len__(self):
        # Needed because MyModel in section 3 calls len(ws) for the vocabulary size
        return len(self.dict)

if __name__ == "__main__":
    ws = word2sequence()
    ws.fit(["我", "爱", "莫"])
    ws.fit(["我", "你", "谁"])
    ws.build_dict()
    print(ws.dict)
    res = ws.transform(["我", "爱", "北京"], maxlen=5)
    print(res)
    print(ws.inverse_transform(res))
Output (Python 3.7+ dicts preserve insertion order, so the ids are deterministic):
{'UNK': 0, 'PAD': 1, '我': 2, '爱': 3, '莫': 4, '你': 5, '谁': 6}
[2, 3, 0, 1, 1]
['我', '爱', 'UNK', 'PAD', 'PAD']
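One step the post never shows is where the global ws and max_len used by collate_fn and MyModel come from. A plausible sketch, assuming the same training path as section 1 (the min/max_account thresholds and the ws.pkl filename are my own choices):

import os
import pickle

# Assumes tokenize() from section 1 and the word2sequence class above are in scope.
if __name__ == '__main__':
    ws = word2sequence()
    train_path = "/tmp/flying/pycharm_demo1/文本情感分类/aclImdb_v1/aclImdb/train"
    for sub in ("pos", "neg"):
        dir_path = os.path.join(train_path, sub)
        for fname in os.listdir(dir_path):
            if fname.endswith(".txt"):
                with open(os.path.join(dir_path, fname), encoding="utf-8") as f:
                    ws.fit(tokenize(f.read()))
    ws.build_dict(min=10, max_account=10000)  # illustrative thresholds, not from the post
    with open("./ws.pkl", "wb") as f:
        pickle.dump(ws, f)
    print(len(ws))

collate_fn and the model can then load it once at import time, e.g. ws = pickle.load(open("./ws.pkl", "rb")), together with a fixed max_len.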
3. Building the model
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

# `ws` (the fitted word2sequence from section 2) and `max_len` must be
# available as globals here.

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(len(ws), 100)
        self.fc = nn.Linear(max_len * 100, 2)

    def forward(self, input):
        x = self.embedding(input)  # embed the input: shape becomes [batch_size, max_len, 100]
        x = x.view([-1, max_len * 100])  # flatten x to [batch_size, max_len*100]
        out = self.fc(x)
        return F.log_softmax(out, dim=-1)

model = MyModel()
optimizer = Adam(model.parameters(), lr=0.001)

def train(epoch):
    for idx, (input, target) in enumerate(get_dataloader(True)):
        optimizer.zero_grad()
        output = model(input)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        print(loss.item())

if __name__ == '__main__':
    for i in range(1):
        train(i)
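The post stops after training, so there is no accuracy number. A minimal evaluation sketch over the test split (the evaluate helper is my own addition; it reuses model, get_dataloader, and F from above):

import torch

def evaluate():
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for input, target in get_dataloader(train=False):
            output = model(input)
            total_loss += F.nll_loss(output, target, reduction="sum").item()
            correct += (output.argmax(dim=-1) == target).sum().item()
            total += target.size(0)
    print("test loss:", total_loss / total, "accuracy:", correct / total)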
Since the model is so simple, the loss is pretty bad, so I won't attach the results (╥﹏╥)