情感分析：数据预处理_情感分类的数据处理-CSDN博客

本文链接：https://blog.csdn.net/qq_38248561/article/details/105589028

本文介绍如何对京东商品评论数据进行预处理，包括数据清洗、结巴分词及去除停用词，以适配SVM模型。通过Python代码实现了文本的英文、数字和符号去除，使用结巴库进行分词，再依据停用词字典过滤无意义词汇。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

技术介绍

本文是对SVM模型输入数据的初步处理，数据是爬取的京东商品评论（爬虫技术在我前几篇博客有写到）。
对数据预处理大致分为以下3步：

数据清洗
去除文本中的英文、数字及中英文符号等无用信息
结巴分词
调用结巴库进行分词
去除停用词
调用停用词字典，遍历分词后的文本，文本中的词如果出现在停用词字典中则删除。

下面是实现代码，有较详细注释

# -*- coding:utf-8 -*-
#进行数据清洗和结巴分词，并去除停用词
import jieba
import pandas as pd
import re
import codecs

#加载数据，返回清洗后数据
def loaddata(datapath):
    """Read a raw comment file and return it with column 0 cleaned.

    The file is read with ``pd.read_table`` and ``header=None``, so the
    first line is treated as data and the columns are labelled 0, 1, ...

    Args:
        datapath: Path of the text file to read.

    Returns:
        The ``DataFrame`` that was read, with every value in column ``0``
        passed through ``clearTxt``.
    """
    # header=None: the first line of the file is data, not column names.
    data = pd.read_table(datapath, header=None, index_col=None)
    # Assign the whole column in one step.  Element-wise chained assignment
    # (``data[0][i] = ...``) may write to a temporary copy and is silently
    # lost when pandas Copy-on-Write is active.
    data[0] = data[0].map(clearTxt)
    return data

#数据清洗
def clearTxt(line):
    """Strip a comment down to the characters kept for segmentation.

    Removes surrounding whitespace, ASCII letters and digits, all
    whitespace, and the ASCII / full-width punctuation listed in the
    pattern below.

    Args:
        line: One raw comment.  Anything that is not a ``str`` (for
            example the float NaN pandas produces for missing cells) is
            treated as an empty comment.

    Returns:
        The cleaned text, or ``""`` when the input is empty or not a string.
    """
    if not isinstance(line, str):
        # Missing cells arrive as float NaN; calling .strip() on them would
        # raise AttributeError.
        return ""
    if line != '':
        line = line.strip()
        # Drop ASCII letters and digits.
        line = re.sub(r"[a-zA-Z0-9]", "", line)
        # Drop whitespace plus ASCII and Chinese punctuation.  Raw strings
        # keep the backslashes for the regex engine instead of relying on
        # invalid string escapes such as "\s".
        line = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？?、~@#￥%……&*（）]+", "", line)
    return line

#删除停用词
def delstopword(wordList, stopkey):
    """Join the tokens of one sentence, leaving out stop words.

    Args:
        wordList: Iterable of string tokens for one sentence.
        stopkey: Collection of stop words.  A list or tuple is converted
            to a frozenset once so each lookup is constant time.

    Returns:
        The remaining tokens, each stripped of surrounding whitespace and
        joined by single spaces.  Tokens that are empty after stripping
        are dropped; the result is ``""`` when nothing remains.
    """
    if isinstance(stopkey, (list, tuple)):
        stopkey = frozenset(stopkey)

    kept = []
    for word in wordList:
        word = word.strip()
        # After strip() a whitespace-only token is "", never "\t", so test
        # for emptiness; otherwise blank tokens would add extra spaces.
        if word and word not in stopkey:
            kept.append(word)
    return " ".join(kept)

#存储到csv文件
def savedata(savepath, columnname, data):
    """Write ``data`` to a CSV file as a single named column.

    Args:
        savepath: Destination path of the CSV file.
        columnname: Header used for the only column.
        data: Values to store, one per row.
    """
    # index=False keeps the row index out of the output file.
    pd.DataFrame({columnname: data}).to_csv(savepath, index=False)

if __name__ == '__main__':
    # Load both comment files; column 0 of each frame holds the cleaned text.
    pos = loaddata('F:/t/test/pos.csv')
    neg = loaddata('F:/t/test/neg.csv')

    # Segment every cleaned comment with jieba; 'c_w' holds one token list
    # per comment.
    pos['c_w'] = [jieba.lcut(sent) for sent in pos[0]]
    neg['c_w'] = [jieba.lcut(sent) for sent in neg[0]]

    # Read the stop-word list.  The context manager closes the file, a set
    # gives constant-time membership tests, and 'utf-8-sig' drops a leading
    # BOM if present so the first stop word is not corrupted.
    with open('F:/t/test/stopWord.txt', 'r', encoding='utf-8-sig') as stop_file:
        stopkey = {w.strip() for w in stop_file}

    # Remove stop words; 'd_w' holds one space-joined string per comment.
    pos['d_w'] = [delstopword(line, stopkey) for line in pos['c_w']]
    neg['d_w'] = [delstopword(line, stopkey) for line in neg['c_w']]

    # Save the processed comments to CSV.
    savedata('F:/t/test/pos_jieba.csv', 'pos_jieba', pos['d_w'])
    savedata('F:/t/test/neg_jieba.csv', 'neg_jieba', neg['d_w'])