Converting Chinese text for English-style processing (file loading & jieba segmentation)

  After word segmentation, Chinese documents can be processed the same way as English text. Code:

import os
import jieba

paths = r"path"  # directory containing the source files 0.txt, 1.txt, ...
posts = [open(os.path.join(paths, f)).read() for f in os.listdir(paths)]  # file-opening approach 1

# Chinese segmentation / write the segmented text back to files
pathsCut = r"C:\Users\Administrator\workspace\eclipse\mlTest\src\kmeans\cut"
postAll = []
for n in range(5):
    postTmp = []
    tmp = jieba.cut(posts[n], cut_all=False)  # returns a generator
    fileName = pathsCut + "/" + str(n) + ".txt"
    f = open(fileName, "a")
    for s in tmp:
        f.write(s)
        f.write(" ")  # space between words
        postTmp.append(s)
    f.close()
    postAll.append(postTmp)
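
  A quicker way to get the same space-separated output is to join the generator directly; a minimal sketch (the sample sentence is illustrative):

import jieba

sentence = "我来到北京清华大学"  # sample sentence for illustration
print(" ".join(jieba.cut(sentence, cut_all=False)))  # -> 我 来到 北京 清华大学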

  File-opening approach 2 (stream a file line by line instead of reading it whole):

for line in open("file.txt"):
    for word in line.split():
        print(word)
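
  Approach 2 pairs naturally with the segmented files written above: since words are space-separated, split() recovers the token lists. A sketch, reusing pathsCut from the first block:

import os

tokensPerFile = []
for name in sorted(os.listdir(pathsCut)):
    with open(os.path.join(pathsCut, name)) as fh:  # each file holds space-separated words
        tokensPerFile.append([w for line in fh for w in line.split()])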

  Overview of jieba segmentation:

import jieba as jb
import jieba.posseg as pos

jb.load_userdict(r"C:\et_corpus.txt")  # load a user-defined dictionary
tmp = jb.cut(posts[n], cut_all=False)  # returns a generator; continues from the code above
# cut_all: True = full mode; False = accurate mode (the default); jb.cut_for_search gives search-engine mode
cutPos = pos.cut("text")  # part-of-speech tagging
for w in cutPos:
    print(w.word, w.flag)
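
  A minimal sketch contrasting the three cut modes on one sentence (the sample sentence is illustrative):

import jieba

s = "我来到北京清华大学"
print("/".join(jieba.cut(s, cut_all=True)))   # full mode: every word the dictionary can find
print("/".join(jieba.cut(s, cut_all=False)))  # accurate mode (the default)
print("/".join(jieba.cut_for_search(s)))      # search-engine mode: re-splits long words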

  Keyword extraction with jieba (the original calls this stemming, but extract_tags is TF-IDF-based keyword extraction):

import math
import os
import jieba.analyse

class PreProcessing:  # loadData (file loading, as in the first block) is assumed defined elsewhere in the post

    def tfidf1(self, word, file, files):  # per-word TF-IDF, computed by hand
        # file: one document as a token list; files: the whole corpus
        tf = float(file.count(word)) / sum(file.count(w) for w in set(file))
        idf = math.log(float(len(files)) / len([doc for doc in files if word in doc]))
        return tf * idf

    def tfidf2(self, paths, n):  # jieba's built-in keyword extraction, also TF-IDF based
        posts = [open(os.path.join(paths, f)).readlines() for f in os.listdir(paths)]
        keyWords = []
        for f in posts:
            keyWord = jieba.analyse.extract_tags(f[0], n)  # top-n keywords of the first line
            keyWords.append(str(keyWord).replace("'", ""))
        return keyWords

    def tfidf3(self, paths, n):  # TF-IDF via scikit-learn (n is unused here)
        from sklearn.feature_extraction.text import TfidfTransformer
        from sklearn.feature_extraction.text import CountVectorizer
        posts = self.loadData(paths)
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(posts))
        word = vectorizer.get_feature_names_out()  # get_feature_names() in older scikit-learn
        print(word)
        weight = tfidf.toarray()
        for i in range(len(weight)):
            for j in range(len(word)):
                print(word[j], weight[i][j])  # was weight[i][i]: a bug
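
  For reference, tfidf1 implements tf(w, d) = count(w, d) / |d| and idf(w) = log(N / df(w)), where N is the corpus size and df(w) is the number of documents containing w. A minimal usage sketch (the toy corpus is my own; pathsCut comes from the first block, and tfidf3 additionally needs the loadData method):

ins = PreProcessing()
docs = [["我", "爱", "北京"], ["北京", "欢迎", "你"]]  # toy tokenized corpus
print(ins.tfidf1("爱", docs[0], docs))  # tf = 1/3, idf = log(2/1), product ≈ 0.23
print(ins.tfidf2(pathsCut, 5))          # top-5 jieba keywords per file
ins.tfidf3(pathsCut, 5)                 # prints the scikit-learn TF-IDF weight matrix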

 

Reposted from: https://www.cnblogs.com/250apples/p/5569225.html
