Data Preprocessing Before Classification


In the second part of the earlier post 情感分析方法之nltk情感分析器和SVM分类器(二) (Sentiment Analysis Methods: the NLTK Sentiment Analyzer and an SVM Classifier, Part 2), I only recorded the final step, the classifier itself. This post records the four preprocessing steps that come before it.

1. Consolidating the raw corpus

# -*- coding: utf-8 -*-
# Get the positive and negative corpora and the stop-word dictionary
# Merge the raw data into a single txt file per class

import os

# Storage path of the corpus folders and of the result files
path = r"D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/"
folder = [r'neg', r'pos']

# Read the content of one review file and flatten it to a single line
def getContent(filename):
    with open(filename, 'rb') as f:    # the Chinese files are GBK-encoded, so read them as raw bytes: rb
        content = f.read()             # readline() would silently drop everything after the first line
    # collapse internal line breaks so that one review occupies exactly one line in the merged file
    content = content.replace(b'\r\n', b' ').replace(b'\r', b' ').replace(b'\n', b' ').strip()
    return content + b'\n'

# For each class folder: list the file names, read each file's content,
# and append everything to one merged txt file

for name in folder:
    result_path = path + name + r'.txt'
    source_dir = path + name
    # the merged file stores the same raw GBK bytes, so it is also opened in binary mode: wb
    with open(result_path, 'wb') as result_file:
        for fname in os.listdir(source_dir):
            fpath = os.path.join(source_dir, fname)
            result_file.write(getContent(fpath))
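
As a quick sanity check, the merged files can be inspected to confirm that each review occupies exactly one line. This is only a sketch; the expected count of 1000 lines per class assumes the standard ChnSentiCorp_htl_ba_2000 layout.

# -*- coding: utf-8 -*-
# Sanity check on the merged corpus files (sketch; same paths as above)

path = r"D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/"

for name in ['neg', 'pos']:
    with open(path + name + '.txt', 'rb') as f:
        lines = f.readlines()
    # with the standard ChnSentiCorp_htl_ba_2000 corpus this should print 1000 for each class
    print(name, len(lines))
    # peek at the first review, decoded from GBK
    print(lines[0].decode('gbk', 'ignore')[:50])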

2. Word segmentation, text cleaning, and text splitting

# -*- coding: utf-8 -*-
# Process the corpus with jieba
# Word segmentation, text cleaning, text splitting

import re
import codecs

import jieba

def prepareData(sourcefile, targetfile):
    source = codecs.open(sourcefile, 'rb')
    target = codecs.open(targetfile, 'w', encoding='utf-8')
    for line in source:
        line = line.decode('gbk', 'ignore')   # the merged corpus is GBK-encoded
        line = clearTxt(line)
        seg_line = sent2word(line)
        target.write(seg_line + '\n')

    print('well done!')
    source.close()
    target.close()

# Strip special characters from the text
def clearTxt(line):
    if line != '':
        line = line.strip()
        # remove English letters and digits
        line = re.sub("[a-zA-Z0-9]", "", line)
        # remove Chinese and English punctuation
        line = re.sub("[\s+\.\!\/_,$%^*(+\"\';:“”.]+|[+——!,。??、~@#¥%……&*()]+", "", line)
    return line

# Word segmentation: join the jieba tokens of one line with spaces
def sent2word(line):
    segList = jieba.cut(line, cut_all=False)
    segSentence = ''
    for word in segList:
        if word != '\t':
            segSentence += word + ' '
    return segSentence


if __name__ == '__main__':
    sourcefile = r'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/neg.txt'
    targetfile = r'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/neg_cut.txt'
    prepareData(sourcefile, targetfile)

    sourcefile = r'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/pos.txt'
    targetfile = r'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/pos_cut.txt'
    prepareData(sourcefile, targetfile)
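
To see what the cleaning and segmentation produce, the two helpers can be called on a single made-up sentence. The sample text below is hypothetical and only for illustration; it assumes the functions above are available in the same module.

# Quick demo of clearTxt and sent2word on one made-up sentence
# (run inside the module above, e.g. from a Python shell after importing it)
sample = '房间很干净123,服务也不错!Good'
cleaned = clearTxt(sample)     # letters, digits and punctuation are stripped
print(sent2word(cleaned))      # jieba tokens joined by single spaces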

3. Removing stop words

# -*- coding: utf-8 -*-
# Remove stop words
import codecs

# Load the stop-word list and filter every line of the segmented corpus
def stopword(sourcefile, targetfile, stopkeyfile):
    source = codecs.open(sourcefile, 'rb')
    target = codecs.open(targetfile, 'w', encoding='utf-8')
    # decode the stop words to str (assuming stopWord.txt is UTF-8),
    # otherwise they would never match the decoded tokens
    stopfile = codecs.open(stopkeyfile, 'r', encoding='utf-8').readlines()
    stopkey = [w.strip() for w in stopfile]
    for line in source:
        line = line.decode('utf-8', 'ignore')   # the segmented files were written as UTF-8
        sentence = delstopword(line, stopkey)
        target.write(sentence + '\n')
    print('well done!')
    source.close()
    target.close()

# Remove stop words from one space-separated line
def delstopword(line, stopkey):
    wordlist = line.split(' ')
    sentence = ''
    for word in wordlist:
        word = word.strip()
        if word not in stopkey:
            if word != '\t':
                sentence += word + ' '
    return sentence.strip()


if __name__ == '__main__':
    sourcefile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\ChnSentiCorp_htl_ba_2000\neg_cut.txt'
    targetfile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\ChnSentiCorp_htl_ba_2000\neg_cut_stopdel.txt'
    stopkeyfile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\stopWord.txt'
    stopword(sourcefile, targetfile, stopkeyfile)

    sourcefile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\ChnSentiCorp_htl_ba_2000\pos_cut.txt'
    targetfile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\ChnSentiCorp_htl_ba_2000\pos_cut_stopdel.txt'
    stopword(sourcefile, targetfile, stopkeyfile)
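
One small design note: stopkey is a plain list, so every not in test scans the whole stop-word list. For a corpus of this size that is fine, but turning the list into a set gives constant-time membership lookups. A drop-in replacement for the stopkey line in stopword() could look like this:

# Faster membership test for the stop-word filter: build a set instead of a list
stopkey = set(w.strip() for w in codecs.open(stopkeyfile, 'r', encoding='utf-8').readlines())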

4. Text vectorization: building feature word vectors

# -*- coding: utf-8 -*-
# Text vectorization: build feature word vectors

import codecs
import gensim
import numpy as np
import pandas as pd

# Look up the word2vec vector of every word in the list (words not in the model are skipped)
def getwordvecs(wordlist, model):
    vecs = []
    for word in wordlist:
        word = word.replace('\n', '')
        try:
            vecs.append(model[word])
        except KeyError:
            continue
    return np.array(vecs, dtype='float')

# Build the text vectors: each line is represented by the mean of its word vectors
def buildvecs(filename, model):
    filevec = []
    with codecs.open(filename, 'rb') as contents:
        for line in contents:
            line = line.decode('utf-8', 'ignore')
            wordlist = line.split(' ')
            vecs = getwordvecs(wordlist, model)
            if len(vecs) > 0:
                filevec.append(vecs.mean(axis=0))   # sentence vector = average of its word vectors
    return filevec

# The word2vec model has already been trained; it only needs to be loaded here
if __name__ == '__main__':
    fdir1 = 'D:/file_download/BaiduNetdiskDownload/PyCharm_File/wiki_zh_word2vec-master/'
    fdir2 = 'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/'
    modelinput = fdir1 + 'wiki.zh.text.vector'
    model = gensim.models.KeyedVectors.load_word2vec_format(modelinput, binary=False)

    posinput = buildvecs(fdir2 + 'pos_cut_stopdel.txt', model)
    neginput = buildvecs(fdir2 + 'neg_cut_stopdel.txt', model)

    # labels: 1 for positive reviews, 0 for negative reviews
    Y = np.concatenate((np.ones(len(posinput)), np.zeros(len(neginput))))

    # stack the positive and negative sentence vectors into one feature matrix
    X = np.array(posinput + neginput)

    df_x = pd.DataFrame(X)
    df_y = pd.DataFrame(Y)
    data = pd.concat([df_y, df_x], axis=1)
    data.to_csv(fdir2 + '2000_data.csv')

Each sentence is represented by the average of the word vectors of its words; the word2vec model itself is not trained here but loaded directly from an already trained model.
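
For reference, a model like wiki.zh.text.vector is typically produced by training gensim's Word2Vec on a segmented Chinese Wikipedia dump and exporting it in word2vec text format. The snippet below is only a sketch of that idea, not the exact script used here; 'wiki.zh.seg.txt' is a placeholder path, and on gensim 3.x the parameter is called size instead of vector_size.

# -*- coding: utf-8 -*-
# Sketch: how a text-format word2vec model such as wiki.zh.text.vector could be trained
# (illustrative only; 'wiki.zh.seg.txt' stands for a segmented, one-sentence-per-line corpus)

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('wiki.zh.seg.txt')          # space-separated tokens, one sentence per line
model = Word2Vec(sentences, vector_size=400, window=5, min_count=5, workers=4)  # size= on gensim 3.x
model.wv.save_word2vec_format('wiki.zh.text.vector', binary=False)  # text format, loadable as above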

The final step is the dimensionality reduction and sentiment classification described in the earlier article.
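
For completeness, here is a minimal sketch of that final step, assuming scikit-learn's PCA and SVC; the number of components and the SVM settings are illustrative, not the tuned values from the earlier article.

# -*- coding: utf-8 -*-
# Sketch of the final step: dimensionality reduction + SVM sentiment classifier
# (assumes scikit-learn; n_components and the SVC parameters are illustrative)

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

fdir = 'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/'
data = pd.read_csv(fdir + '2000_data.csv')

y = data.iloc[:, 1]      # first stored column: label (1 = positive, 0 = negative)
x = data.iloc[:, 2:]     # remaining columns: averaged word2vec features

x_reduced = PCA(n_components=100).fit_transform(x)   # reduce the feature dimensionality

x_train, x_test, y_train, y_test = train_test_split(x_reduced, y, test_size=0.2, random_state=1)
clf = SVC(C=2, probability=True)
clf.fit(x_train, y_train)
print('test accuracy:', clf.score(x_test, y_test))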
