#Main reference: https://blog.csdn.net/weixin_37834134/article/details/82710993
#Python data processing — stemming and lemmatization: https://blog.csdn.net/kan2281123066/article/details/81458641
# -*- coding: utf8 -*-
import nltk
import re
import os
import spacy
from nltk.stem import SnowballStemmer
filepath = r'D:\Desktop\Lancang-Mekong\update_2021_2022\country_year\China/2021/China_2021.txt'
# Read the whole file. Using `with` guarantees the handle is closed even if
# reading raises (the original opened the file and never closed it).
with open(filepath, 'r', encoding='utf-8') as f:
    raw = f.read()
print('raw\n', raw)
# Sentence splitting (currently disabled)
#sent_tokenizer=nltk.data.load(r'C:\Users\Lenovo\AppData\Roaming/nltk_data/tokenizers\punkt/english.pickle')
#sents=sent_tokenizer.tokenize(raw)
# Strip punctuation and other noise characters.
# NOTE(review): the patterns below are kept byte-identical to the original —
# several character classes look over-escaped or oddly spaced, but changing
# them would change which characters are removed.
p1 = re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')  # zh-variant wiki markup like -{zh-cn:...}-
p2 = re.compile(r'[(][: @ . , ?!\s][)]')                      # one punctuation/space char wrapped in parentheses
p3 = re.compile(r'[「『]')                                     # CJK opening corner brackets
p4 = re.compile(r'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()0-9 , : ; \-\ \[\ \]\ ]')  # assorted ASCII/full-width punctuation and digits
# Apply the substitutions in sequence, replacing each match with a space.
line = p1.sub(r' ', raw)
line = p2.sub(r' ', line)
line = p3.sub(r' ', line)
line = p4.sub(r' ', line)
line=re.findall(r'([a-z