'''nltk handles English tokenization and stopword removal;
jieba handles Chinese tokenization and stopword removal.'''
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK's stopword list and the punkt tokenizer model
nltk.download('punkt')
nltk.download('stopwords')


# Remove stopwords (and non-alphanumeric tokens) from an English text
def remove_stopwords(text):
    # Load the English stopword list; other languages are available if needed
    stop_words = set(stopwords.words('english'))
    # Tokenize
    words = word_tokenize(text)
    # Keep tokens that are alphanumeric and not stopwords
    filtered_words = [word for word in words if word.lower() not in stop_words and word.isalnum()]
    return ' '.join(filtered_words)


if __name__ == '__main__':
    # Sample texts
    texts = [
        "This is a sample sentence, showing off the stop words filtration.",
        "Another example to demonstrate the removal of common words."
    ]
    # Remove stopwords from each text
    filtered_texts = [remove_stopwords(text) for text in texts]
    # Collect and print the processed texts
    out_list = list(filtered_texts)
    print(out_list)

'''The section above uses nltk; the section below uses jieba.'''
import pandas as pd

# Raw strings avoid backslash-escape problems in Windows paths
raw = pd.read_csv(r"C:\job\AI数据集平台工具\数据\逻辑回归.csv")
text = raw['content'].values.tolist()
print(text)

import jieba.analyse as ana

# Load the stopword list
ana.set_stop_words(r'C:\job\AI数据集平台工具\数据\逻辑回归停用词.txt')

text_list = []
for w in text:
    # Stopword removal + TF-IDF-based keyword extraction
    word_list = ana.extract_tags(str(w), topK=10000, withWeight=False)
    text_list.append(word_list)
print(text_list)
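'''A minimal sketch of a possible next step (an assumption, not part of the
original script): TfidfVectorizer is imported above but never called, so this
shows how the cleaned texts from both pipelines could be vectorized.'''
from sklearn.feature_extraction.text import TfidfVectorizer

# English: out_list already holds space-separated, stopword-free strings,
# so the default tokenizer applies directly
en_vectorizer = TfidfVectorizer()
en_matrix = en_vectorizer.fit_transform(out_list)
print(en_vectorizer.get_feature_names_out())  # vocabulary; requires scikit-learn >= 1.0
print(en_matrix.toarray())  # TF-IDF weights, one row per text

# Chinese: each entry of text_list is already a token list from jieba, so
# bypass sklearn's regex tokenizer with an identity analyzer
zh_vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
zh_matrix = zh_vectorizer.fit_transform(text_list)
print(zh_matrix.shape)  # (number of documents, vocabulary size)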