#Main reference: https://blog.csdn.net/weixin_37834134/article/details/82710993
#Python data processing — stemming and lemmatization: https://blog.csdn.net/kan2281123066/article/details/81458641
# -*- coding: utf8 -*-
import nltk
import re
import os
import spacy
from nltk.stem import SnowballStemmer
filepath = r'D:\Desktop\Lancang-Mekong\update_2021_2022\country_year\China/2021/China_2021.txt'
# Read the whole file. Using `with` guarantees the handle is closed even if
# reading raises (the original opened the file and never closed it).
with open(filepath, 'r', encoding='utf-8') as f:
    raw = f.read()
print('raw\n', raw)
# Sentence splitting (currently disabled)
#sent_tokenizer=nltk.data.load(r'C:\Users\Lenovo\AppData\Roaming/nltk_data/tokenizers\punkt/english.pickle')
#sents=sent_tokenizer.tokenize(raw)
# Strip punctuation and other noise characters.
# NOTE(review): the patterns below are kept byte-identical to the original —
# several character classes look over-escaped or oddly spaced, but changing
# them would change which characters are removed.
p1 = re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')  # zh-variant wiki markup like -{zh-cn:...}-
p2 = re.compile(r'[(][: @ . , ?!\s][)]')                      # one punctuation/space char wrapped in parentheses
p3 = re.compile(r'[「『]')                                     # CJK opening corner brackets
p4 = re.compile(r'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()0-9 , : ; \-\ \[\ \]\ ]')  # assorted ASCII/full-width punctuation and digits
# Apply the substitutions in sequence, replacing each match with a space.
line = p1.sub(r' ', raw)
line = p2.sub(r' ', line)
line = p3.sub(r' ', line)
line = p4.sub(r' ', line)
line=re.findall(r'([a-z