#主要参考:https://blog.csdn.net/weixin_37834134/article/details/82710993
#Python数据操作—词干与词形化 https://blog.csdn.net/kan2281123066/article/details/81458641
# -*- coding: utf8 -*-
import nltk
import re
import os
import spacy
from nltk.stem import SnowballStemmer
# --- Preprocessing: clean the raw text, tokenize it and drop stopwords ---
filepath = r'D:\Desktop\Lancang-Mekong\update_2021_2022\country_year\China/2021/China_2021.txt'

# Read the whole input file; the context manager closes the handle.
with open(filepath, 'r', encoding='utf-8') as f:
    raw = f.read()
print('raw\n',raw)

# Sentence splitting is intentionally skipped; the text is cleaned and
# tokenized as a single string.

# Patterns for markup, punctuation and other symbols that are replaced by a
# single space before tokenization.
p1 = re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')
p2 = re.compile(r'[(][: @ . , ?!\s][)]')
p3 = re.compile(r'[「『]')
p4 = re.compile(r'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()0-9 , : ; \-\ \[\ \]\ ]')

line = raw
for pattern in (p1, p2, p3, p4):
    line = pattern.sub(r' ', line)

# Keep only ASCII letters, spaces and newlines. The previous character class
# `[a-z -Z \n]` contained the accidental range ' '-'Z', which also matched
# digits and punctuation such as '<', '=' and '>'.
line = re.sub(r'[^a-zA-Z \n]', '', line)

# Persist the cleaned English-only text. The file is truncated ('w') rather
# than appended to, so re-running the script does not accumulate (and then
# re-process) text left over from earlier runs.
writepath = r'D:/Desktop/Lancang-Mekong/update_2021_2022/country_year/China/2021/China_2021_english.txt'
with open(writepath, 'w', encoding='utf-8') as f:
    f.write(line)
# `line` already holds exactly what was written, so the file is not re-read.

# Tokenize into words.
wordsinstr = nltk.word_tokenize(line)
print('wordsinstr\n',wordsinstr)

# Load the stopword list, one entry per line. The dict is used purely for
# O(1) membership tests on its keys.
with open(r'D:\Desktop\Lancang-Mekong\update_2021_2022\preprocessing/en_stopwords.txt', encoding='utf-8') as stopword_file:
    sr = dict.fromkeys(entry.strip() for entry in stopword_file)

# Lower-case every token once and drop the stopwords. The result is kept as a
# list containing a single token list, which is the shape written to disk.
lowered_tokens = (token.lower() for token in wordsinstr)
cleanwords = [[token for token in lowered_tokens if token not in sr]]
print('cleanwords\n',cleanwords)

# Write the string representation of the token list for the next step.
with open(r'D:/Desktop/Lancang-Mekong/update_2021_2022/country_year/China/2021/China_2021_processresult.txt','w', encoding='utf-8') as f:
    print(str(cleanwords),file=f)
from textblob import TextBlob
import nltk
# --- Sentiment analysis of the processed text ---
# Read the file China_2021_processresult.txt; the context manager closes the
# handle as soon as the content has been loaded.
filepath = r'D:/Desktop/Lancang-Mekong/update_2021_2022/country_year/China/2021/China_2021_processresult.txt'
with open(filepath,'r', encoding='utf-8') as f:
    raw = f.read()

blob = TextBlob(raw)
print(blob.sentiment)
# polarity is the sentiment polarity, ranging from -1 to 1: negative values
# indicate negative sentiment, positive values indicate positive sentiment.
# subjectivity is the degree of subjectivity, ranging from 0 to 1: the closer
# to 1, the more the text expresses a personal attitude.