Chinese and English Tokenization

1. Chinese segmentation with jieba

# -*- coding: UTF-8 -*-
from collections import Counter
import jieba


def stopwordslist():
    # Load the stopword list, one word per line
    stopwords = [line.strip() for line in open('stopwords_en.txt', encoding='UTF-8').readlines()]
    return stopwords


def seg_depart(sentence):
    # Segment one line of text with jieba and drop stopwords
    sentence_depart = jieba.cut(sentence.strip())
    stopwords = stopwordslist()
    outstr = ''
    for word in sentence_depart:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr


filename = "2.txt"
outfilename = "2stop.txt"
inputs = open(filename, 'r', encoding='UTF-8')
outputs = open(outfilename, 'w', encoding='UTF-8')
for line in inputs:
    line_seg = seg_depart(line)
    outputs.write(line_seg + '\n')

outputs.close()
inputs.close()
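Before pushing the whole file through, it helps to sanity-check the segmenter on a single sentence. The sketch below is illustrative only (the test sentence is made up); it shows jieba's full and search modes next to the default precise mode, and a quick word-frequency tally with Counter (which the script above imports but does not otherwise use).

import jieba
from collections import Counter

sample = "我来到北京清华大学"                      # hypothetical test sentence
print("/".join(jieba.cut(sample)))                 # precise mode (default)
print("/".join(jieba.cut(sample, cut_all=True)))   # full mode: every possible word
print("/".join(jieba.cut_for_search(sample)))      # search-engine mode: finer-grained splits

freq = Counter(jieba.lcut(sample))                 # word-frequency count of the segmented sentence
print(freq.most_common(5))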

2. English tokenization with NLTK

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


def stopwordslist():
    # Load the English stopword list, one word per line
    stopwords = [line.strip() for line in open('stopwords_en.txt', encoding='UTF-8').readlines()]
    return stopwords


def seg_depart(sentence):
    # Tokenize one line with NLTK, then drop punctuation and stopwords
    cutwords1 = nltk.word_tokenize(sentence.strip())
    stopwords = stopwordslist()
    outstr = ''
    interpunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']  # punctuation marks
    cutwords2 = [word for word in cutwords1 if word not in interpunctuations]  # remove punctuation
    for word in cutwords2:
        if word not in stopwords:  # remove stopwords
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr


filename = "E:\\实验\\data\\test2.txt"            # input file
outfilename = "E:\\实验\\data\\test2stop.txt"     # output file
inputs = open(filename, 'r', encoding='UTF-8')
outputs = open(outfilename, 'w', encoding='UTF-8')
for line in inputs:
    line_seg = seg_depart(line)
    outputs.write(line_seg + '\n')
outputs.close()
inputs.close()
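If a local stopword file is not at hand, NLTK also ships its own English stopword list in the stopwords corpus (part of nltk_data); a minimal sketch of loading it:

from nltk.corpus import stopwords

# requires the 'stopwords' corpus from nltk_data
english_stops = set(stopwords.words('english'))
print(len(english_stops), sorted(english_stops)[:10])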

'''
# Alternative: process the whole file at once, with stemming and lemmatization
import re

english = 'E:\\实验\\data\\test2.txt'
with open(english, 'r', encoding='utf-8') as file:
    u = file.read()
text = re.sub(r'[^\w ]', '', u)        # keep only word characters and spaces
cutwords1 = nltk.word_tokenize(text)
interpunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
cutwords2 = [word for word in cutwords1 if word not in interpunctuations]   # remove punctuation

stops = [line.strip() for line in open('stopwords_en.txt', encoding='UTF-8').readlines()]
cutwords3 = [word for word in cutwords2 if word not in stops]               # remove stopwords
print('\nTokens after NLTK tokenization and stopword removal:')
print(cutwords3)


cutwords4 = []
for cutword in cutwords3:
    cutwords4.append(PorterStemmer().stem(cutword))    # stemming
print(cutwords4)
print('\nLemmatization of the NLTK tokens:')
cutwords5 = []
for cutword2 in cutwords4:
    cutwords5.append(WordNetLemmatizer().lemmatize(cutword2, pos='v'))   # lemmatize as verbs (pos='v')
print(cutwords5)

# print(nltk.pos_tag(cutwords2))   # POS tagging
'''
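To get a feel for the difference between the two steps in the block above, the snippet below (illustrative only, with hand-picked words) runs the Porter stemmer and the WordNet lemmatizer side by side: the stemmer just chops suffixes and may return non-words, while the lemmatizer maps each token to a dictionary form for the given part of speech.

from nltk.stem import PorterStemmer, WordNetLemmatizer

ps = PorterStemmer()
wnl = WordNetLemmatizer()            # needs the 'wordnet' corpus from nltk_data
for w in ['studies', 'running', 'better']:
    # stem() may produce truncated forms; lemmatize() returns a valid word for the given POS
    print(w, '->', ps.stem(w), '|', wnl.lemmatize(w, pos='v'))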

nltk_data resources:

Link: https://pan.baidu.com/s/1WHpPZvdJhLQJU-HXqkdlLw
Extraction code: bm1u
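
If a network connection is available, the same resources can also be fetched with NLTK's own downloader instead of copying nltk_data by hand; a minimal sketch covering the corpora used above:

import nltk

nltk.download('punkt')                        # tokenizer models for nltk.word_tokenize
nltk.download('stopwords')                    # built-in English stopword list
nltk.download('wordnet')                      # lexicon behind WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')   # model behind nltk.pos_tag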

Reference: https://blog.csdn.net/sk_berry/article/details/105240317?ops_request_misc=%25257B%252522request%25255Fid%252522%25253A%252522160845009916780299088803%252522%25252C%252522scm%252522%25253A%25252220140713.130102334..%252522%25257D&request_id=160845009916780299088803&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~top_click~default-1-105240317.nonecase&utm_term=nltk%E5%88%86%E8%AF%8D
