Hands-On Text Sentiment Analysis

This example works with the IMDB review dataset: it runs sentiment analysis on every review and predicts its sentiment polarity (positive or negative).

1. Loading the dataset

import os
import re

import torch
from torch.utils.data import Dataset, DataLoader


def tokenize(content):
    # Strip HTML tags such as <br />
    content = re.sub("<.*?>", " ", content)
    # Remove stray control characters and punctuation (raw strings avoid invalid-escape warnings)
    filters = ['\t', '\n', '\x96', '\x97', r"\.", r"\,", r"\(", r"\)"]
    content = re.sub("|".join(filters), " ", content)
    tokens = [i.strip().lower() for i in content.split()]
    return tokens
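As a quick sanity check, tokenize strips HTML tags and punctuation and lower-cases everything:

>>> tokenize("I <br /> loved it, really.")
['i', 'loved', 'it', 'really']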


class ImdbDataset(Dataset):
    def __init__(self, train=True):
        self.traindataPath = "/tmp/flying/pycharm_demo1/文本情感分类/aclImdb_v1/aclImdb/train"
        self.testdataPath = "/tmp/flying/pycharm_demo1/文本情感分类/aclImdb_v1/aclImdb/test"
        self.dataPath = self.traindataPath if train else self.testdataPath

        # Collect the pos and neg sub-folders
        pndata_path = [os.path.join(self.dataPath, "pos"), os.path.join(self.dataPath, "neg")]
        self.all_file_path = []
        for path in pndata_path:
            filename_list = os.listdir(path)
            filepath_list = [os.path.join(path, i) for i in filename_list if i.endswith('.txt')]
            self.all_file_path.extend(filepath_list)

    def __getitem__(self, index):
        file_path = self.all_file_path[index]
        label = file_path.split("/")[-2]
        label = 0 if label == "neg" else 1  # 0 = negative, 1 = positive

        # Read the review text
        content = open(file_path).read()
        return tokenize(content), label

    def __len__(self):
        return len(self.all_file_path)

def get_dataloader(train=True):
    imdb = ImdbDataset(train)
    data_loader = DataLoader(imdb, batch_size=128, shuffle=True, collate_fn=collate_fn)
    return data_loader

def collate_fn(batch):
    # batch is a list of (tokens, label) pairs: batch[i][0] is a token list, batch[i][1] its label
    content, label = list(zip(*batch))
    content = [ws.transform(i, maxlen=max_len) for i in content]  # map each token list to a fixed-length index sequence

    content = torch.LongTensor(content)  # convert the contents to a tensor
    label = torch.LongTensor(label)
    return content, label

if __name__ == '__main__':
    for idx, (input, target) in enumerate(get_dataloader()):
        print(idx)
        print(input)
        print(target)
        break

Sample output:

(screenshot of the first batch of index tensors and labels omitted)

Note!!!

If you keep the default collate_fn, you may get RuntimeError: each element in list of batch should be of equal size, because the token lists in a batch have different lengths. You must override collate_fn (or change the data type of the list elements) to fix it.
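Also note that collate_fn references two module-level names, ws and max_len, which are produced in section 2. A minimal sketch of how they might be defined, assuming the fitted word2sequence instance is pickled to a hypothetical ./model/ws.pkl:

import pickle

# Module-level globals assumed by collate_fn; the path and the 200-token
# limit are illustrative choices, not from the original post.
max_len = 200  # pad or truncate every review to 200 tokens
ws = pickle.load(open("./model/ws.pkl", "rb"))  # word2sequence fitted in section 2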

2. The word2sequence class

The word2sequence class converts the words of a sentence into an integer sequence and records how many times each word is used (a hand-written tokenizer/vocabulary).
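For example, after fitting on a corpus, a tokenized review such as ["i", "love", "it"] might be transformed into [2, 3, 4] and then padded with the PAD index (1) up to a fixed maxlen.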

class word2sequence:
    UNK_TAG = "UNK"  # token for out-of-vocabulary words
    PAD_TAG = "PAD"  # token used to pad short sentences

    UNK = 0
    PAD = 1

    def __init__(self):
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        self.count = {}  # word -> occurrence count


    def fit(self, sentence):
        '''
        Count the words of one sentence into self.count
        :param sentence: list of tokens
        :return:
        '''
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_dict(self, min=1, max=None, max_account=None):
        '''
        Build the vocabulary
        :param min: minimum occurrence count for a word to be kept
        :param max: maximum occurrence count for a word to be kept
        :param max_account: maximum number of words to keep
        :return:
        '''
        if min is not None:
            self.count = {word: value for (word, value) in self.count.items() if value >= min}

        if max is not None:
            self.count = {word: value for (word, value) in self.count.items() if value <= max}

        if max_account is not None:
            temp = sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_account]
            self.count = {i0: i1 for i0, i1 in temp}

        # Assign indices: {UNK:0, PAD:1, "i":2, ...}
        for word in self.count:
            self.dict[word] = len(self.dict)

        # Build the inverted dict: index -> word
        self.inverse_dict = {index: word for word, index in self.dict.items()}

    def transform(self, sentence, maxlen=None):
        '''
        Convert a sentence into an index sequence, truncating or padding to maxlen
        :param sentence: list of tokens
        :return:
        '''
        if maxlen is not None:
            if maxlen < len(sentence):
                sentence = sentence[:maxlen]
            else:
                sentence = sentence + [self.PAD_TAG] * (maxlen - len(sentence))
        return [self.dict.get(word, self.UNK) for word in sentence]

    def inverse_transform(self, sequence):
        '''
        Convert an index sequence back into a sentence
        :param sequence:
        :return:
        '''
        return [self.inverse_dict.get(seq) for seq in sequence]

    def __len__(self):
        # Vocabulary size; lets section 3 call len(ws) when sizing nn.Embedding
        return len(self.dict)

if __name__ == "__main__":
    ws = word2sequence()
    ws.fit(["我","爱","莫"])
    ws.fit(["我","你","谁"])
    ws.build_dict()
    print(ws.dict)
    res = ws.transform(["我","爱","北京"], maxlen=5)
    print(res)
    print(ws.inverse_transform(res))

Output:

{'UNK': 0, 'PAD': 1, '我': 2, '爱': 3, '莫': 4, '你': 5, '谁': 6}
[2, 3, 0, 1, 1]
['我', '爱', 'UNK', 'PAD', 'PAD']
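In practice the vocabulary is fitted once over the whole training split and saved to disk, so that collate_fn from section 1 can load it. A minimal sketch, reusing the training path from ImdbDataset; the min/max_account thresholds and the ./model/ws.pkl location are illustrative assumptions:

import os
import pickle

if __name__ == "__main__":
    ws = word2sequence()
    train_path = "/tmp/flying/pycharm_demo1/文本情感分类/aclImdb_v1/aclImdb/train"
    for sub in ["pos", "neg"]:
        folder = os.path.join(train_path, sub)
        for name in os.listdir(folder):
            if name.endswith(".txt"):
                # tokenize() is the helper from section 1
                ws.fit(tokenize(open(os.path.join(folder, name)).read()))
    # Keep words seen at least 10 times and cap the vocabulary at 10000 entries
    ws.build_dict(min=10, max_account=10000)
    pickle.dump(ws, open("./model/ws.pkl", "wb"))
    print(len(ws.dict))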

3. Building the model

import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

# ws, max_len and get_dataloader are assumed to be importable from the
# earlier sections (e.g. the dataset and word2sequence modules)

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(len(ws), 100)  # vocabulary size x embedding dim
        self.fc = nn.Linear(max_len * 100, 2)

    def forward(self, input):
        x = self.embedding(input)        # embed the input: shape becomes [batch_size, max_len, 100]
        x = x.view([-1, max_len * 100])  # flatten to [batch_size, max_len*100]
        out = self.fc(x)
        return F.log_softmax(out, dim=-1)


model = MyModel()
optimizer = Adam(model.parameters(), lr=0.001)

def train(epoch):
    for idx,(input, target) in enumerate(get_dataloader(True)):
        optimizer.zero_grad()
        output = model(input)

        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        print(loss.item())

if __name__ == '__main__':
    for i in range(1):
        train(i)
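The post stops at training, but a short evaluation loop over the test split makes the result measurable. A hedged sketch (not part of the original code), reusing model and get_dataloader(train=False):

import torch

def evaluate():
    model.eval()  # switch the model to evaluation mode
    correct, total = 0, 0
    with torch.no_grad():  # no gradients needed during evaluation
        for input, target in get_dataloader(train=False):
            output = model(input)
            pred = output.argmax(dim=-1)  # class with the larger log-probability
            correct += (pred == target).sum().item()
            total += target.size(0)
    print(f"test accuracy: {correct / total:.4f}")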

Since the model is so simple, the loss is pretty bad, so I won't attach the results (╥﹏╥)

This article is for my own study and reference only; it will be removed immediately upon any infringement notice.
