中文分词后文档转换成英文处理,代码:
# Segment the first five Chinese documents with jieba and write the
# space-delimited tokens to one output file per document under pathsCut.
paths = r"路径"  # directory holding the raw documents: 0.txt, 1.txt, ...
# NOTE(review): default text encoding is platform-dependent (gbk on Chinese
# Windows) — confirm and pass encoding= explicitly if the corpus is UTF-8.
posts = [open(os.path.join(paths, f)).read() for f in os.listdir(paths)]  # open-method 1

pathsCut = r"C:\Users\Administrator\workspace\eclipse\mlTest\src\kmeans\cut"
postAll = []   # list of token lists, one per document
for n in range(5):
    postTmp = []
    tmp = jieba.cut(posts[n], cut_all=False)  # lazy generator of tokens
    fileName = pathsCut + "/" + str(n) + ".txt"
    # "with" closes the output file; the original leaked the handle.
    with open(fileName, "a") as f:
        for s in tmp:
            f.write(s)
            f.write(" ")  # single space between tokens
            postTmp.append(s)
    postAll.append(postTmp)
文件打开方式2:
# Open-method 2: iterate the file line by line and split each line on
# whitespace. The "with" block closes the handle (the original leaked it).
with open("file.txt") as fh:
    for line in fh:
        for word in line.split():
            print(word)
jieba分词概述:
import jieba as jb
import jieba.posseg as pos

# Load a user dictionary. Raw string: the original "C:\et_corpus.txt"
# relied on "\e" being an unrecognized escape (same value, but a
# DeprecationWarning/SyntaxWarning on modern Python).
jb.load_userdict(r"C:\et_corpus.txt")
# Continues the script above; yields a lazy generator of tokens.
tmp = jb.cut(posts[n], cut_all=False)
# cut_all=True: full mode; cut_all=False: accurate mode (default);
# jb.cut_for_search: search-engine mode.
cutPos = pos.cut("text")  # part-of-speech tagging
for w in cutPos:
    print(w.word, w.flag)
jieba关键词提取(基于tf-idf):
def tfidf1(self, word, file, files):
    """Hand-computed tf-idf score of `word` in one tokenized document.

    file: list of tokens (the document); files: list of token lists (corpus).
    Returns tf * idf using the natural log.
    Raises ZeroDivisionError if `word` appears in no document of `files`
    (same behavior as the original).
    """
    import math  # math.log replaces sp.log, removed from modern scipy
    # sum(file.count(w) for w in set(file)) is just len(file); the
    # original computed it in O(n^2).
    tf = float(file.count(word)) / len(file)
    df = len([doc for doc in files if word in doc])  # document frequency
    idf = math.log(float(len(files)) / df)
    return tf * idf

def tfidf2(self, paths, n):
    """Top-n keywords of each document's first line via jieba's built-in
    TF-IDF extractor. Returns a list of stringified keyword lists."""
    posts = []
    for fname in os.listdir(paths):
        # "with" closes each file; the original list comprehension leaked
        # every handle.
        with open(os.path.join(paths, fname)) as fh:
            posts.append(fh.readlines())
    import jieba.analyse  # jieba ships its own tf-idf keyword extraction
    keyWords = []
    for f in posts:
        keyWord = jieba.analyse.extract_tags(f[0], n)  # f[0]: first line only
        keyWords.append(str(keyWord).replace("'", ""))
    return keyWords

def tfidf3(self, paths, n):
    """Compute and print a tf-idf weight matrix with scikit-learn.

    `n` is accepted for signature compatibility but unused (as in the
    original). Documents are loaded by the project's PreProcessing class.
    """
    from sklearn.feature_extraction.text import TfidfTransformer as tf
    from sklearn.feature_extraction.text import CountVectorizer as cv
    ins = PreProcessing()
    posts = ins.loadData(paths)
    vectorizer = cv()
    transformer = tf()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(posts))
    # NOTE(review): get_feature_names() was removed in sklearn >= 1.2;
    # switch to get_feature_names_out() when upgrading.
    word = vectorizer.get_feature_names()
    print(word)
    weight = tfidf.toarray()  # rows: documents, columns: vocabulary terms
    for i in range(len(weight)):
        for j in range(len(word)):
            # BUG FIX: original printed weight[i][i]; the weight of term j
            # in document i is weight[i][j].
            print(word[j], weight[i][j])