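# This script splits the Word document 27.docx into sentences, keeps the sentences that contain the keyword "china", strips punctuation from them, and writes them to the Excel workbook 27okst.xlsx as preparation for word co-occurrence analysis. Change the file names to suit your own needs.
# If the required packages are not installed, install them with pip first.
# This script draws on many articles by earlier authors; corrections and criticism are welcome.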
import nltk
import readDocx
import openpyxl
import re
# Input/output locations and the keyword to search for. Edit these to reuse
# the script on another document.
INPUT_DOCX = '27.docx'
OUTPUT_XLSX = '27okst.xlsx'
KEYWORD = 'china'

# Every character that is not an ASCII letter (digits, punctuation, quotes,
# non-ASCII text, whitespace) is treated as a word separator.
NON_LETTERS = re.compile(r"[^a-zA-Z]")


def select_sentences(sentences, keyword):
    """Return the sentences that contain *keyword*, in their original order.

    The check is a plain substring test, so ``'china'`` also matches words
    that merely contain it (e.g. ``'chinatown'``). The comparison is
    case-sensitive; callers are expected to lower-case the text beforehand.
    """
    return [sentence for sentence in sentences if keyword in sentence]


def clean_sentence(sentence: str) -> str:
    """Return *sentence* reduced to its ASCII-letter words, space-separated.

    Each non-letter character is replaced by a space and the result is split
    on whitespace, so the remaining tokens consist of letters only. Because
    of that, no separate punctuation filter is needed afterwards: a token can
    never be equal to a punctuation mark.
    """
    return ' '.join(NON_LETTERS.sub(' ', sentence).split())


def main():
    """Extract keyword sentences from INPUT_DOCX and save them to OUTPUT_XLSX.

    The document text is lower-cased, split into sentences, filtered by
    KEYWORD, cleaned with ``clean_sentence`` and written one sentence per row
    into column A of a new workbook.
    """
    # NOTE(review): readDocx is not a standard package; presumably getText()
    # returns the whole document as one string -- confirm against that module.
    text = readDocx.getText(INPUT_DOCX).lower()
    # NOTE(review): nltk.sent_tokenize presumably needs the NLTK "punkt"
    # tokenizer data to be downloaded -- confirm in the target environment.
    sentences = nltk.sent_tokenize(text)
    matching = select_sentences(sentences, KEYWORD)

    workbook = openpyxl.Workbook()
    sheet = workbook.active  # the single default sheet of a new workbook
    # Clean each sentence before it is written, so the raw text never has to
    # be stored in the sheet and read back again.
    for row, sentence in enumerate(matching, start=1):
        sheet.cell(row=row, column=1, value=clean_sentence(sentence))
    workbook.save(OUTPUT_XLSX)


if __name__ == "__main__":
    main()