# 1. jieba 分词 (Chinese word segmentation with jieba)
# -*- coding: UTF-8 -*-
from collections import Counter
import jieba
def stopwordslist():
    """Load the stopword list from 'stopwords_en.txt' (one word per line).

    Returns:
        list[str]: the stripped stopwords, in file order.
    """
    # Use a context manager so the file handle is always closed
    # (the original left it open; readlines() is also redundant —
    # iterating the file yields lines directly).
    with open('stopwords_en.txt', encoding='UTF-8') as f:
        return [line.strip() for line in f]
def seg_depart(sentence):
    """Segment *sentence* with jieba and drop stopwords.

    Args:
        sentence: one raw input line (leading/trailing whitespace ignored).

    Returns:
        str: the kept tokens, each followed by a single space
        (so the result carries a trailing space, matching the
        original output format).
    """
    words = jieba.cut(sentence.strip())
    # Convert to a set once: the original scanned the stopword list
    # (O(n)) for every token.
    stopwords = set(stopwordslist())
    kept = [w for w in words if w not in stopwords and w != '\t']
    # join instead of repeated += (quadratic in the worst case).
    return ''.join(w + ' ' for w in kept)
# Segment every line of the input file and write one result line each.
filename = "2.txt"
outfilename = "2stop.txt"
# Context managers guarantee both handles are closed even on error
# (the original relied on explicit close() calls that a crash would skip).
with open(filename, 'r', encoding='UTF-8') as inputs, \
        open(outfilename, 'w', encoding='UTF-8') as outputs:
    for line in inputs:
        outputs.write(seg_depart(line) + '\n')
# 2. nltk 分词 (English tokenization with NLTK)
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
def stopwordslist():
    """Load the stopword list from 'stopwords_en.txt' (one word per line).

    Returns:
        list[str]: the stripped stopwords, in file order.
    """
    # Use a context manager so the file handle is always closed
    # (the original left it open after readlines()).
    with open('stopwords_en.txt', encoding='UTF-8') as f:
        return [line.strip() for line in f]
def seg_depart(sentence):
    """Tokenize an English *sentence* with NLTK, dropping punctuation
    tokens and stopwords.

    Args:
        sentence: one raw input line (leading/trailing whitespace ignored).

    Returns:
        str: the kept tokens, each followed by a single space
        (trailing space preserved to match the original output).
    """
    tokens = nltk.word_tokenize(sentence.strip())
    # Punctuation tokens to discard; a set gives O(1) membership tests.
    interpunctuations = {',', '.', ':', ';', '?', '(', ')', '[', ']',
                         '&', '!', '*', '@', '#', '$', '%'}
    # Convert once to a set: the original scanned the stopword list
    # for every token.
    stopwords = set(stopwordslist())
    kept = [w for w in tokens
            if w not in interpunctuations
            and w not in stopwords
            and w != '\t']
    # join instead of repeated += (quadratic in the worst case).
    return ''.join(w + ' ' for w in kept)
# Tokenize the English test file line by line and write the results.
filename = "E:\\实验\\data\\test2.txt"
outfilename = "E:\\实验\\data\\test2stop.txt"
# Context managers guarantee both handles are closed even on error
# (the original relied on explicit close() calls that a crash would skip).
with open(filename, 'r', encoding='UTF-8') as inputs, \
        open(outfilename, 'w', encoding='UTF-8') as outputs:
    for line in inputs:
        outputs.write(seg_depart(line) + '\n')
'''
english='E:\\实验\\data\\test2.txt'
with open(english,'r',encoding='utf-8') as file:
u=file.read()
str = seg_depart(line)
str=re.sub('[^\w ]','',u)
cutwords1 = nltk.word_tokenize(str)
interpunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
cutwords2 = [word for word in cutwords1 if (word not in interpunctuations)]
stops = [line.strip() for line in open('stopwords_en.txt',encoding='UTF-8').readlines()]
cutwords3 = [word for word in cutwords2 if word not in stops]
print('\n【NLTK分词后去除停用词结果:】')
print(cutwords3)
cutwords4 = []
for cutword in cutwords3:
cutwords4.append(PorterStemmer().stem(cutword)) #词干提取
print(cutwords4)
print('\n【NLTK分词进行词形还原:】')
cutwords5 = []
for cutword2 in cutwords4:
cutwords5.append(WordNetLemmatizer().lemmatize(cutword2,pos='v')) #pos='v' lemmatizes as verbs (the original comment wrongly said nouns)
print(cutwords5)
#print(nltk.pos_tag(cutwords2)) #词性标注
'''
# nltk_data 资源 (nltk_data resources):
# 链接 (link): https://pan.baidu.com/s/1WHpPZvdJhLQJU-HXqkdlLw
# 提取码 (extraction code): bm1u