Python text preprocessing + extracting values

This post shows how to preprocess text data, including stripping HTML tags, handling accented characters, removing special characters, and removing stopwords, with stemming and lemmatization handled through the NLTK and spaCy libraries. It then applies TF-IDF vectorization to extract text features and finally plots bar charts of the highest-scoring terms. The post covers basic operations in natural language processing and information retrieval.
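Before the full script, here is a minimal sketch of the core "extract values" step used below: fit a TfidfVectorizer, sum each term's TF-IDF weight over all documents, and keep the highest-scoring terms. The three-document corpus is invented purely for illustration.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['avengers assemble to fight', 'the avengers fight again', 'heroes assemble']  # toy corpus
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(docs)                  # documents x terms sparse matrix
scores = np.asarray(X.sum(axis=0)).ravel()          # total TF-IDF weight per term
top = sorted(zip(vectorizer.get_feature_names_out(), scores),
             key=lambda t: t[1], reverse=True)[:5]
print(top)                                          # (term, summed weight) pairs, highest first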

Prepare the dataset

**.csv
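The script below expects a CSV with at least an imdb_id column and a message_and_description column (the names used in the code). If the original FBPosts.csv is not available, a tiny stand-in file with the same layout can be created; the rows here are invented for illustration only.

import pandas as pd

sample = pd.DataFrame({
    'imdb_id': ['tt0848228', 'tt0848228', 'tt1345836'],
    'message_and_description': [
        '<p>The Avengers assemble &amp; fight!</p>',
        'Earth&#39;s mightiest heroes, together again.',
        'A caped hero returns to the city.',
    ],
})
sample.to_csv('FBPosts.csv', index=False)  # stand-in for the real dataset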

Code example:

import re
import unicodedata

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize.toktok import ToktokTokenizer
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
def strip_html_tags(text):
    # parse the markup and keep only the visible text
    bs = BeautifulSoup(text, 'html.parser')
    stripped_text = bs.get_text()
    # collapse runs of carriage returns / newlines into a single newline
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text
def remove_accented_chars(text):
    # map accented characters to their closest ASCII equivalents (e.g. é -> e)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
def remove_special_characters(text, remove_digits=False):
    # keep letters, whitespace and (optionally) digits; drop everything else
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text
def simple_stemmer(text):
    # reduce each word to its Porter stem
    ps = PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text
def lemmatize_text(text):
    # note: loading the model on every call is slow; cache it in real use
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    # '-PRON-' is the pronoun placeholder lemma used by older spaCy versions
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc])
    return text
def remove_stopwords(text, is_lower_case=False):
    tokenizer = ToktokTokenizer()
    stopword_list = stopwords.words('english')
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        # text is already lowercased, so compare tokens directly
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
def normalize_corpus(corpus,html_stripping=True,contraction_expansion=True,
                     accented_char_removal=True,text_lower_case=True,text_lemmatization=True,
                     special_char_removal=True,stopword_removal=True,remove_digits=True):
    normalized_corpus=[]
    for doc in corpus:
        print('=== raw text\n',doc)
        if html_stripping:
            doc=strip_html_tags(doc)
            print('=== no html tag\n',doc)
        if accented_char_removal:
            doc=remove_accented_chars(doc)
            print('=== no accented chars\n',doc)
        if text_lower_case:
            doc=doc.lower()
        # lemmatization is disabled in this version of the script:
        # doc = re.sub(r'[\r\n]+', ' ', doc)   # remove extra newlines
        # if text_lemmatization:
        #     doc = lemmatize_text(doc)
        #     print('=== lemmas\n', doc)
        if special_char_removal:
            special_char_pattern=re.compile(r'([{.(-)!}])')
            doc=special_char_pattern.sub(" \\1 ",doc)
            doc=remove_special_characters(doc,remove_digits=remove_digits)
            print('=== no special chars\n',doc)
        # remove extra whitespace
        doc=re.sub(' +',' ',doc)
        if stopword_removal:
            doc=remove_stopwords(doc,is_lower_case=text_lower_case)
            print('=== no stopwords\n',doc)
        normalized_corpus.append(doc)
    return normalized_corpus
if __name__ == '__main__':
    io = r'C:\Users\Thinker\Desktop\FBPosts.csv'
    data = pd.read_csv(io)
    data_key = data['imdb_id']
    data_value = data['message_and_description']
    dict_data = pd.DataFrame({'imdb_id': data_key, 'message_and_description': data_value})
    movie_ids = ['tt0848228', 'tt1345836', 'tt1392170', 'tt1673434']
    for movie_id in movie_ids:
        # collect the non-empty posts that belong to the current movie
        corpus = [v for i, v in zip(dict_data['imdb_id'], dict_data['message_and_description'])
                  if i == movie_id and v != ' ']
        k = normalize_corpus(corpus)
        # TF-IDF features over the normalized posts of this movie
        vectorizer = TfidfVectorizer(stop_words='english')
        X = vectorizer.fit_transform(k)
        top_n = 15
        # sum each term's TF-IDF weight over all posts and keep the top_n terms
        # (get_feature_names_out replaces the older get_feature_names in recent scikit-learn)
        scores = np.asarray(X.sum(axis=0)).ravel()
        top_terms = sorted(zip(vectorizer.get_feature_names_out(), scores),
                           key=lambda t: t[1], reverse=True)[:top_n]
        terms = [t[0] for t in top_terms]
        weights = [t[1] for t in top_terms]
        plt.bar(terms, weights)
        print(terms)
        print(weights)
        plt.xticks(rotation=75)  # tilt the x tick labels so longer terms stay readable
        plt.title('top TF-IDF terms for ' + movie_id)
        plt.show()
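As a quick sanity check, the preprocessing pipeline can be exercised on a single made-up document, assuming the functions above are available in the same session. normalize_corpus prints every intermediate stage; with the settings used here the final result should look roughly like the comment below.

sample_doc = ['<p>Héllo, the   Avengers assemble &amp; fight!</p>']  # invented example
print(normalize_corpus(sample_doc))
# roughly: ['hello avengers assemble fight']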
