Python text preprocessing + extracting values

This post shows how to preprocess text data, including stripping HTML tags, handling accented characters, removing special characters, and removing stopwords, with stemming and lemmatization handled through the NLTK and spaCy libraries. It then applies TF-IDF vectorization to extract text features and finally plots bar charts of the highest-scoring terms. The post covers basic operations in natural language processing and information retrieval.
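Before the full script, here is a minimal sketch of the core "extract values" step used below: fit a TfidfVectorizer, sum each term's TF-IDF weight over all documents, and keep the highest-scoring terms. The three-document corpus is invented purely for illustration.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['avengers assemble to fight', 'the avengers fight again', 'heroes assemble']  # toy corpus
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(docs)                  # documents x terms sparse matrix
scores = np.asarray(X.sum(axis=0)).ravel()          # total TF-IDF weight per term
top = sorted(zip(vectorizer.get_feature_names_out(), scores),
             key=lambda t: t[1], reverse=True)[:5]
print(top)                                          # (term, summed weight) pairs, highest first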

Prepare the dataset

**.csv
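The script below expects a CSV with at least an imdb_id column and a message_and_description column (the names used in the code). If the original FBPosts.csv is not available, a tiny stand-in file with the same layout can be created; the rows here are invented for illustration only.

import pandas as pd

sample = pd.DataFrame({
    'imdb_id': ['tt0848228', 'tt0848228', 'tt1345836'],
    'message_and_description': [
        '<p>The Avengers assemble &amp; fight!</p>',
        'Earth&#39;s mightiest heroes, together again.',
        'A caped hero returns to the city.',
    ],
})
sample.to_csv('FBPosts.csv', index=False)  # stand-in for the real dataset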

Code example:

import re
import unicodedata

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize.toktok import ToktokTokenizer
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
def strip_html_tags(text):
    # parse the markup and keep only the visible text
    bs = BeautifulSoup(text, 'html.parser')
    stripped_text = bs.get_text()
    # collapse runs of carriage returns / newlines into a single newline
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text
def remove_accented_chars(text):
    # map accented characters to their closest ASCII equivalents (e.g. é -> e)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
def remove_special_characters(text, remove_digits=False):
    # keep letters, whitespace and (optionally) digits; drop everything else
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text
def simple_stemmer(text):
    # reduce each word to its Porter stem
    ps = PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text
def lemmatize_text(text):
    # note: loading the model on every call is slow; cache it in real use
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    # '-PRON-' is the pronoun placeholder lemma used by older spaCy versions
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc])
    return text
def remove_stopwords(text, is_lower_case=False):
    tokenizer = ToktokTokenizer()
    stopword_list = stopwords.words('english')
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        # text is already lowercased, so compare tokens directly
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
def normalize_corpus(corpus,html_stripping=True,contraction_expansion=True,
                     accented_char_removal=True,text_lower_case=True,text_lemmatization=True,
                     special_char_removal=True,stopword_removal=True,remove_digits=True):
    normalized_corpus=[]
    for doc in corpus:
        print('=== raw text\n',doc)
        if html_stripping:
            doc=strip_html_tags(doc)
            print('=== no html tag\n',doc)
        if accented_char_removal:
            doc=remove_accented_chars(doc)
            print('=== no accented chars\n',doc)
        if text_lower_case:
            doc=doc.lower()
        # lemmatization is disabled in this version of the script:
        # doc = re.sub(r'[\r\n]+', ' ', doc)   # remove extra newlines
        # if text_lemmatization:
        #     doc = lemmatize_text(doc)
        #     print('=== lemmas\n', doc)
        if special_char_removal:
            special_char_pattern=re.compile(r'([{.(-)!}])')
            doc=special_char_pattern.sub(" \\1 ",doc)
            doc=remove_special_characters(doc,remove_digits=remove_digits)
            print('=== no special chars\n',doc)
        # remove extra whitespace
        doc=re.sub(' +',' ',doc)
        if stopword_removal:
            doc=remove_stopwords(doc,is_lower_case=text_lower_case)
            print('=== no stopwords\n',doc)
        normalized_corpus.append(doc)
    return normalized_corpus
if __name__ == '__main__':
    io = r'C:\Users\Thinker\Desktop\FBPosts.csv'
    data = pd.read_csv(io)
    data_key = data['imdb_id']
    data_value = data['message_and_description']
    dict_data = pd.DataFrame({'imdb_id': data_key, 'message_and_description': data_value})
    movie_ids = ['tt0848228', 'tt1345836', 'tt1392170', 'tt1673434']
    for movie_id in movie_ids:
        # collect the non-empty posts that belong to the current movie
        corpus = [v for i, v in zip(dict_data['imdb_id'], dict_data['message_and_description'])
                  if i == movie_id and v != ' ']
        k = normalize_corpus(corpus)
        # TF-IDF features over the normalized posts of this movie
        vectorizer = TfidfVectorizer(stop_words='english')
        X = vectorizer.fit_transform(k)
        top_n = 15
        # sum each term's TF-IDF weight over all posts and keep the top_n terms
        # (get_feature_names_out replaces the older get_feature_names in recent scikit-learn)
        scores = np.asarray(X.sum(axis=0)).ravel()
        top_terms = sorted(zip(vectorizer.get_feature_names_out(), scores),
                           key=lambda t: t[1], reverse=True)[:top_n]
        terms = [t[0] for t in top_terms]
        weights = [t[1] for t in top_terms]
        plt.bar(terms, weights)
        print(terms)
        print(weights)
        plt.xticks(rotation=75)  # tilt the x tick labels so longer terms stay readable
        plt.title('top TF-IDF terms for ' + movie_id)
        plt.show()
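As a quick sanity check, the preprocessing pipeline can be exercised on a single made-up document, assuming the functions above are available in the same session. normalize_corpus prints every intermediate stage; with the settings used here the final result should look roughly like the comment below.

sample_doc = ['<p>Héllo, the   Avengers assemble &amp; fight!</p>']  # invented example
print(normalize_corpus(sample_doc))
# roughly: ['hello avengers assemble fight']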
