Data Preprocessing Before Classification


In the second part of the earlier post 情感分析方法之nltk情感分析器和SVM分类器(二) (Sentiment Analysis Methods: the NLTK Sentiment Analyzer and an SVM Classifier, Part 2), I only recorded the final step, the classifier itself. This post records the four preprocessing steps that come before it.

1. Consolidating the raw corpus

# -*- coding: utf-8 -*-
# Get the positive and negative corpora and the stop-word dictionary
# Merge the raw data into a single txt file per class

import os

# Storage path of the corpus folders and of the result files
path = r"D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/"
folder = [r'neg', r'pos']

# Read the content of one review file and flatten it to a single line
def getContent(filename):
    with open(filename, 'rb') as f:    # the Chinese files are GBK-encoded, so read them as raw bytes: rb
        content = f.read()             # readline() would silently drop everything after the first line
    # collapse internal line breaks so that one review occupies exactly one line in the merged file
    content = content.replace(b'\r\n', b' ').replace(b'\r', b' ').replace(b'\n', b' ').strip()
    return content + b'\n'

# For each class folder: list the file names, read each file's content,
# and append everything to one merged txt file

for name in folder:
    result_path = path + name + r'.txt'
    source_dir = path + name
    # the merged file stores the same raw GBK bytes, so it is also opened in binary mode: wb
    with open(result_path, 'wb') as result_file:
        for fname in os.listdir(source_dir):
            fpath = os.path.join(source_dir, fname)
            result_file.write(getContent(fpath))
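
As a quick sanity check, the merged files can be inspected to confirm that each review occupies exactly one line. This is only a sketch; the expected count of 1000 lines per class assumes the standard ChnSentiCorp_htl_ba_2000 layout.

# -*- coding: utf-8 -*-
# Sanity check on the merged corpus files (sketch; same paths as above)

path = r"D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/"

for name in ['neg', 'pos']:
    with open(path + name + '.txt', 'rb') as f:
        lines = f.readlines()
    # with the standard ChnSentiCorp_htl_ba_2000 corpus this should print 1000 for each class
    print(name, len(lines))
    # peek at the first review, decoded from GBK
    print(lines[0].decode('gbk', 'ignore')[:50])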

2. Word segmentation, text cleaning, and text splitting

# -*- coding: utf-8 -*-
# Process the corpus with jieba
# Word segmentation, text cleaning, text splitting

import re
import codecs

import jieba

def prepareData(sourcefile, targetfile):
    source = codecs.open(sourcefile, 'rb')
    target = codecs.open(targetfile, 'w', encoding='utf-8')
    for line in source:
        line = line.decode('gbk', 'ignore')   # the merged corpus is GBK-encoded
        line = clearTxt(line)
        seg_line = sent2word(line)
        target.write(seg_line + '\n')

    print('well done!')
    source.close()
    target.close()

# Strip special characters from the text
def clearTxt(line):
    if line != '':
        line = line.strip()
        # remove English letters and digits
        line = re.sub("[a-zA-Z0-9]", "", line)
        # remove Chinese and English punctuation
        line = re.sub("[\s+\.\!\/_,$%^*(+\"\';:“”.]+|[+——!,。??、~@#¥%……&*()]+", "", line)
    return line

# Word segmentation: join the jieba tokens of one line with spaces
def sent2word(line):
    segList = jieba.cut(line, cut_all=False)
    segSentence = ''
    for word in segList:
        if word != '\t':
            segSentence += word + ' '
    return segSentence


if __name__ == '__main__':
    sourcefile = r'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/neg.txt'
    targetfile = r'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/neg_cut.txt'
    prepareData(sourcefile, targetfile)

    sourcefile = r'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/pos.txt'
    targetfile = r'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/pos_cut.txt'
    prepareData(sourcefile, targetfile)
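
To see what the cleaning and segmentation produce, the two helpers can be called on a single made-up sentence. The sample text below is hypothetical and only for illustration; it assumes the functions above are available in the same module.

# Quick demo of clearTxt and sent2word on one made-up sentence
# (run inside the module above, e.g. from a Python shell after importing it)
sample = '房间很干净123,服务也不错!Good'
cleaned = clearTxt(sample)     # letters, digits and punctuation are stripped
print(sent2word(cleaned))      # jieba tokens joined by single spaces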

3. Removing stop words

# -*- coding: utf-8 -*-
# Remove stop words
import codecs

# Load the stop-word list and filter every line of the segmented corpus
def stopword(sourcefile, targetfile, stopkeyfile):
    source = codecs.open(sourcefile, 'rb')
    target = codecs.open(targetfile, 'w', encoding='utf-8')
    # decode the stop words to str (assuming stopWord.txt is UTF-8),
    # otherwise they would never match the decoded tokens
    stopfile = codecs.open(stopkeyfile, 'r', encoding='utf-8').readlines()
    stopkey = [w.strip() for w in stopfile]
    for line in source:
        line = line.decode('utf-8', 'ignore')   # the segmented files were written as UTF-8
        sentence = delstopword(line, stopkey)
        target.write(sentence + '\n')
    print('well done!')
    source.close()
    target.close()

# Remove stop words from one space-separated line
def delstopword(line, stopkey):
    wordlist = line.split(' ')
    sentence = ''
    for word in wordlist:
        word = word.strip()
        if word not in stopkey:
            if word != '\t':
                sentence += word + ' '
    return sentence.strip()


if __name__ == '__main__':
    sourcefile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\ChnSentiCorp_htl_ba_2000\neg_cut.txt'
    targetfile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\ChnSentiCorp_htl_ba_2000\neg_cut_stopdel.txt'
    stopkeyfile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\stopWord.txt'
    stopword(sourcefile, targetfile, stopkeyfile)

    sourcefile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\ChnSentiCorp_htl_ba_2000\pos_cut.txt'
    targetfile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\senti_analysis\data\ChnSentiCorp_htl_ba_2000\pos_cut_stopdel.txt'
    stopword(sourcefile, targetfile, stopkeyfile)
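
One small design note: stopkey is a plain list, so every not in test scans the whole stop-word list. For a corpus of this size that is fine, but turning the list into a set gives constant-time membership lookups. A drop-in replacement for the stopkey line in stopword() could look like this:

# Faster membership test for the stop-word filter: build a set instead of a list
stopkey = set(w.strip() for w in codecs.open(stopkeyfile, 'r', encoding='utf-8').readlines())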

4. Text vectorization: building feature word vectors

# -*- coding: utf-8 -*-
# Text vectorization: build feature word vectors

import codecs
import gensim
import numpy as np
import pandas as pd

# Look up the word2vec vector of every word in the list (words not in the model are skipped)
def getwordvecs(wordlist, model):
    vecs = []
    for word in wordlist:
        word = word.replace('\n', '')
        try:
            vecs.append(model[word])
        except KeyError:
            continue
    return np.array(vecs, dtype='float')

# Build the text vectors: each line is represented by the mean of its word vectors
def buildvecs(filename, model):
    filevec = []
    with codecs.open(filename, 'rb') as contents:
        for line in contents:
            line = line.decode('utf-8', 'ignore')
            wordlist = line.split(' ')
            vecs = getwordvecs(wordlist, model)
            if len(vecs) > 0:
                filevec.append(vecs.mean(axis=0))   # sentence vector = average of its word vectors
    return filevec

# The word2vec model has already been trained; it only needs to be loaded here
if __name__ == '__main__':
    fdir1 = 'D:/file_download/BaiduNetdiskDownload/PyCharm_File/wiki_zh_word2vec-master/'
    fdir2 = 'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/'
    modelinput = fdir1 + 'wiki.zh.text.vector'
    model = gensim.models.KeyedVectors.load_word2vec_format(modelinput, binary=False)

    posinput = buildvecs(fdir2 + 'pos_cut_stopdel.txt', model)
    neginput = buildvecs(fdir2 + 'neg_cut_stopdel.txt', model)

    # labels: 1 for positive reviews, 0 for negative reviews
    Y = np.concatenate((np.ones(len(posinput)), np.zeros(len(neginput))))

    # stack the positive and negative sentence vectors into one feature matrix
    X = np.array(posinput + neginput)

    df_x = pd.DataFrame(X)
    df_y = pd.DataFrame(Y)
    data = pd.concat([df_y, df_x], axis=1)
    data.to_csv(fdir2 + '2000_data.csv')

Each sentence is represented by the average of the word vectors of its words; the word2vec model itself is not trained here but loaded directly from an already trained model.
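
For reference, a model like wiki.zh.text.vector is typically produced by training gensim's Word2Vec on a segmented Chinese Wikipedia dump and exporting it in word2vec text format. The snippet below is only a sketch of that idea, not the exact script used here; 'wiki.zh.seg.txt' is a placeholder path, and on gensim 3.x the parameter is called size instead of vector_size.

# -*- coding: utf-8 -*-
# Sketch: how a text-format word2vec model such as wiki.zh.text.vector could be trained
# (illustrative only; 'wiki.zh.seg.txt' stands for a segmented, one-sentence-per-line corpus)

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('wiki.zh.seg.txt')          # space-separated tokens, one sentence per line
model = Word2Vec(sentences, vector_size=400, window=5, min_count=5, workers=4)  # size= on gensim 3.x
model.wv.save_word2vec_format('wiki.zh.text.vector', binary=False)  # text format, loadable as above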

The final step is the dimensionality reduction and sentiment classification described in the earlier article.
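
For completeness, here is a minimal sketch of that final step, assuming scikit-learn's PCA and SVC; the number of components and the SVM settings are illustrative, not the tuned values from the earlier article.

# -*- coding: utf-8 -*-
# Sketch of the final step: dimensionality reduction + SVM sentiment classifier
# (assumes scikit-learn; n_components and the SVC parameters are illustrative)

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

fdir = 'D:/file_download/BaiduNetdiskDownload/PyCharm_File/senti_analysis/data/ChnSentiCorp_htl_ba_2000/'
data = pd.read_csv(fdir + '2000_data.csv')

y = data.iloc[:, 1]      # first stored column: label (1 = positive, 0 = negative)
x = data.iloc[:, 2:]     # remaining columns: averaged word2vec features

x_reduced = PCA(n_components=100).fit_transform(x)   # reduce the feature dimensionality

x_train, x_test, y_train, y_test = train_test_split(x_reduced, y, test_size=0.2, random_state=1)
clf = SVC(C=2, probability=True)
clf.fit(x_train, y_train)
print('test accuracy:', clf.score(x_test, y_test))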
