Chinese and English Tokenization

1. Chinese segmentation with jieba

# -*- coding: UTF-8 -*-
from collections import Counter
import jieba


def stopwordslist():
    # Load the stopword list, one word per line
    stopwords = [line.strip() for line in open('stopwords_en.txt', encoding='UTF-8').readlines()]
    return stopwords


def seg_depart(sentence):
    # Segment one line of text with jieba and drop stopwords
    sentence_depart = jieba.cut(sentence.strip())
    stopwords = stopwordslist()
    outstr = ''
    for word in sentence_depart:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr


filename = "2.txt"
outfilename = "2stop.txt"
inputs = open(filename, 'r', encoding='UTF-8')
outputs = open(outfilename, 'w', encoding='UTF-8')
for line in inputs:
    line_seg = seg_depart(line)
    outputs.write(line_seg + '\n')

outputs.close()
inputs.close()
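Before pushing the whole file through, it helps to sanity-check the segmenter on a single sentence. The sketch below is illustrative only (the test sentence is made up); it shows jieba's full and search modes next to the default precise mode, and a quick word-frequency tally with Counter (which the script above imports but does not otherwise use).

import jieba
from collections import Counter

sample = "我来到北京清华大学"                      # hypothetical test sentence
print("/".join(jieba.cut(sample)))                 # precise mode (default)
print("/".join(jieba.cut(sample, cut_all=True)))   # full mode: every possible word
print("/".join(jieba.cut_for_search(sample)))      # search-engine mode: finer-grained splits

freq = Counter(jieba.lcut(sample))                 # word-frequency count of the segmented sentence
print(freq.most_common(5))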

2. English tokenization with NLTK

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


def stopwordslist():
    # Load the English stopword list, one word per line
    stopwords = [line.strip() for line in open('stopwords_en.txt', encoding='UTF-8').readlines()]
    return stopwords


def seg_depart(sentence):
    # Tokenize one line with NLTK, then drop punctuation and stopwords
    cutwords1 = nltk.word_tokenize(sentence.strip())
    stopwords = stopwordslist()
    outstr = ''
    interpunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']  # punctuation marks
    cutwords2 = [word for word in cutwords1 if word not in interpunctuations]  # remove punctuation
    for word in cutwords2:
        if word not in stopwords:  # remove stopwords
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr


filename = "E:\\实验\\data\\test2.txt"            # input file
outfilename = "E:\\实验\\data\\test2stop.txt"     # output file
inputs = open(filename, 'r', encoding='UTF-8')
outputs = open(outfilename, 'w', encoding='UTF-8')
for line in inputs:
    line_seg = seg_depart(line)
    outputs.write(line_seg + '\n')
outputs.close()
inputs.close()
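If a local stopword file is not at hand, NLTK also ships its own English stopword list in the stopwords corpus (part of nltk_data); a minimal sketch of loading it:

from nltk.corpus import stopwords

# requires the 'stopwords' corpus from nltk_data
english_stops = set(stopwords.words('english'))
print(len(english_stops), sorted(english_stops)[:10])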

'''
# Alternative: process the whole file at once, with stemming and lemmatization
import re

english = 'E:\\实验\\data\\test2.txt'
with open(english, 'r', encoding='utf-8') as file:
    u = file.read()
text = re.sub(r'[^\w ]', '', u)        # keep only word characters and spaces
cutwords1 = nltk.word_tokenize(text)
interpunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
cutwords2 = [word for word in cutwords1 if word not in interpunctuations]   # remove punctuation

stops = [line.strip() for line in open('stopwords_en.txt', encoding='UTF-8').readlines()]
cutwords3 = [word for word in cutwords2 if word not in stops]               # remove stopwords
print('\nTokens after NLTK tokenization and stopword removal:')
print(cutwords3)


cutwords4 = []
for cutword in cutwords3:
    cutwords4.append(PorterStemmer().stem(cutword))    # stemming
print(cutwords4)
print('\nLemmatization of the NLTK tokens:')
cutwords5 = []
for cutword2 in cutwords4:
    cutwords5.append(WordNetLemmatizer().lemmatize(cutword2, pos='v'))   # lemmatize as verbs (pos='v')
print(cutwords5)

# print(nltk.pos_tag(cutwords2))   # POS tagging
'''
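To get a feel for the difference between the two steps in the block above, the snippet below (illustrative only, with hand-picked words) runs the Porter stemmer and the WordNet lemmatizer side by side: the stemmer just chops suffixes and may return non-words, while the lemmatizer maps each token to a dictionary form for the given part of speech.

from nltk.stem import PorterStemmer, WordNetLemmatizer

ps = PorterStemmer()
wnl = WordNetLemmatizer()            # needs the 'wordnet' corpus from nltk_data
for w in ['studies', 'running', 'better']:
    # stem() may produce truncated forms; lemmatize() returns a valid word for the given POS
    print(w, '->', ps.stem(w), '|', wnl.lemmatize(w, pos='v'))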

nltk_data resources:

Link: https://pan.baidu.com/s/1WHpPZvdJhLQJU-HXqkdlLw
Extraction code: bm1u
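
If a network connection is available, the same resources can also be fetched with NLTK's own downloader instead of copying nltk_data by hand; a minimal sketch covering the corpora used above:

import nltk

nltk.download('punkt')                        # tokenizer models for nltk.word_tokenize
nltk.download('stopwords')                    # built-in English stopword list
nltk.download('wordnet')                      # lexicon behind WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')   # model behind nltk.pos_tag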

Reference: https://blog.csdn.net/sk_berry/article/details/105240317?ops_request_misc=%25257B%252522request%25255Fid%252522%25253A%252522160845009916780299088803%252522%25252C%252522scm%252522%25253A%25252220140713.130102334..%252522%25257D&request_id=160845009916780299088803&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~top_click~default-1-105240317.nonecase&utm_term=nltk%E5%88%86%E8%AF%8D
