E4523 Text Mining

最新推荐文章于 2024-07-09 23:43:07 发布

weixin_44602072

最新推荐文章于 2024-07-09 23:43:07 发布

阅读量645

点赞数

文章标签： leetcode 机器学习人工智能

本文链接：https://blog.csdn.net/weixin_44602072/article/details/121483038

版权

Types of analysis

1. Sentiment Analysis*

1.1 simple sentiment analysis

1.2 Using NRC Data

1.3 Weighted sentiment analysis using Vader

2. Text Corpora

2.1 read directories

2.2 Inaugural Speeches

5. Text Summarization

5.1 Frequency Based

5.2 Gensim

6. Topic

6.1 LDA: Latent Dirichlet Allocation Model

6.2 Matching topics to doc

7. Similarity

Types of analysis

Sentiment analysis: Deciding whether a document (or concept) is positive or negative
Entity analysis: Identifying entities (Named entities, Parts of speech) and properties of these entities
Topic analysis: Deciding what the major topics associated with a piece of text are
Text summarization: Summarizing a document (Cliff notes version!)

1. Sentiment Analysis*

1.1 simple sentiment analysis

get pos & neg words list

def get_pos_neg_words():
    """Download the positive and negative opinion-word lists.

    Returns:
        A ``(positive_words, negative_words)`` tuple; each element is a
        list of strings with blank lines and lines containing ';' removed.

    Raises:
        requests.RequestException: if a download times out, fails, or
            answers with an HTTP error status.
    """
    def get_words(url):
        """Fetch one word list and return its usable entries."""
        import requests
        response = requests.get(url, timeout=30)
        # Fail loudly instead of parsing an HTML error page as a word list.
        response.raise_for_status()
        words = response.content.decode('latin-1')
        # strip() removes a trailing '\r' when the file uses CRLF line
        # endings; without it no entry would ever equal a token.
        stripped = (word.strip() for word in words.split('\n'))
        # Lines containing ';' are dropped (presumably the file's header
        # comments - confirm against the downloaded file).
        return [word for word in stripped if word and ';' not in word]

    #Get lists of positive and negative words
    p_url = 'http://ptrckprry.com/course/ssd/data/positive-words.txt'
    n_url = 'http://ptrckprry.com/course/ssd/data/negative-words.txt'
    positive_words = get_words(p_url)
    negative_words = get_words(n_url)
    return positive_words,negative_words

positive_words,negative_words = get_pos_neg_words()

打开文件

# Read the whole file into one string; no encoding is given, so the
# platform default text encoding is used.
with open('community.txt','r') as f:
    community = f.read()

计算正/负词比例

def do_pos_neg_sentiment_analysis(text_list,debug=False):
    """Compute the share of positive and negative words in each text.

    Args:
        text_list: iterable of entries whose item 0 is a name and whose
            item 1 is the text to analyse.
        debug: when true, print every matched word.

    Returns:
        A list of ``(name, positive_ratio, negative_ratio)`` tuples, where
        each ratio is the number of matching tokens divided by the total
        number of tokens. A text with no tokens gets ``0.0`` for both.
    """
    from nltk import word_tokenize
    positive_words,negative_words = get_pos_neg_words()
    # Sets give O(1) membership tests; the lists hold thousands of words.
    positive_set = set(positive_words)
    negative_set = set(negative_words)
    results = list()
    for text in text_list:
        # Tokenize once and reuse the result for counting and normalising.
        tokens = word_tokenize(text[1])
        cpos = cneg = 0
        for word in tokens:
            if word in positive_set:
                if debug:
                    print("Positive",word)
                cpos+=1
            if word in negative_set:
                if debug:
                    print("Negative",word)
                cneg+=1
        if tokens:
            results.append((text[0],cpos/len(tokens),cneg/len(tokens)))
        else:
            # Avoid ZeroDivisionError on an empty text.
            results.append((text[0],0.0,0.0))
    return results

do_pos_neg_sentiment_analysis([('community',community),('le_monde',le_monde)])

1.2 Using NRC Data

读取NRC data

def get_nrc_data():
    """Load the NRC sense-level emotion lexicon into a dictionary.

    Reads ``NRC-Emotion-Lexicon-Senselevel-v0.92.txt`` from the current
    working directory, skips its first 46 lines, and keeps every
    tab-separated row whose third field is ``1``.

    Returns:
        dict mapping the first field of each kept row to the list of
        second-field values seen for it, in file order.
    """
    nrc = "NRC-Emotion-Lexicon-Senselevel-v0.92.txt"
    header_lines = 46  # leading lines skipped before the data rows
    emotion_dict=dict()
    with open(nrc,'r') as f:
        for line_number, line in enumerate(f):
            if line_number < header_lines:
                continue
            fields = line.strip().split('\t')
            if len(fields) < 3:
                # Blank or truncated row: nothing to record.
                continue
            if int(fields[2]) == 1:
                emotion_dict.setdefault(fields[0], []).append(fields[1])
    return emotion_dict

计算emotions的占比

def emotion_analyzer(text,emotion_dict=emotion_dict):
    """Return the per-emotion share of words in ``text``.

    Args:
        text: string to analyse; it is split on whitespace.
        emotion_dict: mapping from a word to the list of emotions it
            carries. The default is the module-level ``emotion_dict``
            captured when this function is defined.

    Returns:
        dict mapping every emotion found in ``emotion_dict`` to the number
        of matching words divided by the total number of words (``0`` for
        emotions that never match).
    """
    #Set up the result dictionary
    emotion_count = {emotion: 0
                     for emotions in emotion_dict.values()
                     for emotion in emotions}

    #Analyze the text and normalize by total number of words
    words = text.split()
    # Each match contributes the same share, so compute it once.
    weight = 1/len(words) if words else 0
    for word in words:
        for emotion in emotion_dict.get(word) or ():
            emotion_count[emotion] += weight
    return emotion_count

结果分析；输出dataframe

def comparative_emotion_analyzer(text_tuples,object_name="Restaurant",print_output=False):
    """Run ``emotion_analyzer`` on several texts and tabulate the scores.

    Args:
        text_tuples: iterable of entries whose item 0 is a label and whose
            item 1 is the text.
        object_name: name of the index column of the returned frame.
        print_output: when true, also print a header and one row per text.

    Returns:
        pandas DataFrame indexed by label, with one column per emotion.
    """
    # Order matches both the printed row and the DataFrame columns.
    emotion_keys = ('fear','trust','negative','positive','joy',
                    'disgust','anticipation','sadness','surprise')
    if print_output:
        header = (object_name,"fear","trust","negative","positive",
                  "joy","disgust","anticip", "sadness","surprise")
        print("%-20s %1s\t%1s %1s %1s %1s   %1s %1s %1s %1s"%header)
    import pandas as pd
    column_names = [object_name,'Fear','Trust','Negative',
                    'Positive','Joy','Disgust','Anticipation',
                    'Sadness','Surprise']
    df = pd.DataFrame(columns=column_names)
    df.set_index(object_name,inplace=True)

    for text_tuple in text_tuples:
        scores = emotion_analyzer(text_tuple[1])
        row = [scores[key] for key in emotion_keys]
        if print_output:
            print("%-20s %1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f"%(
                text_tuple[0][0:20], *row))
        df.loc[text_tuple[0]] = row
    return df

1.3 Weighted sentiment analysis using Vader

Vader contains a list of 7500 features weighted by how positive or negative they are. Human trained on twitter data and generally considered good for informal communication. 10 humans rated each feature in each tweet in context from -4 to +4. Computes a "compound" score based on heuristics (between -1 and +1). Includes sentiment of emoticons, punctuation, and other 'social media' lexicon elements.

def vader_comparison(texts):
    """Print the sentence-averaged VADER scores of each text.

    Args:
        texts: iterable of entries whose item 0 is a name and whose item 1
            is the text. Each text is split into sentences, every sentence
            is scored, and the four scores are averaged over the sentences.

    Returns:
        None; the table is written to standard output.
    """
    # Imported here so the function does not depend on an earlier cell
    # having brought sent_tokenize into scope.
    from nltk import sent_tokenize
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    print("Name\t",'  pos\t','neg\t','neu\t','compound')
    analyzer = SentimentIntensityAnalyzer()
    for entry in texts:
        name = entry[0]
        sentences = sent_tokenize(entry[1])
        sentence_count = len(sentences)
        pos=compound=neu=neg=0
        for sentence in sentences:
            vs = analyzer.polarity_scores(sentence)
            pos+=vs['pos']/sentence_count
            compound+=vs['compound']/sentence_count
            neu+=vs['neu']/sentence_count
            neg+=vs['neg']/sentence_count
        print('%-10s'%name,'%1.2f\t'%pos,'%1.2f\t'%neg,'%1.2f\t'%neu,'%1.2f\t'%compound)

2. Text Corpora

Corpus: An organized set of text documents

2.1 read directories

# Build an NLTK plaintext corpus from the files under community_root whose
# names match community_files.
import nltk
from nltk.corpus import PlaintextCorpusReader
# NOTE(review): absolute, machine-specific path - adjust to where the data lives.
community_root = "/Users/yu/Desktop/graduate/4523 Data Analytics/Lecture/Class 17-20- Text Mining/data/community"
# Presumably a regular expression over file names - confirm against the
# PlaintextCorpusReader documentation.
community_files = "community.*"
community_data = PlaintextCorpusReader(community_root,community_files)
# One (name, raw text) pair: the entry shape the analyzers in this file index as [0] and [1].
restaurant_data = [('community',community_data.raw())]

2.2 Inaugural Speeches

读取文件

# The inaugural corpus must be imported before use; nothing earlier in
# this file brings it into scope.
from nltk.corpus import inaugural

inaugural.fileids()
# Pair every file id with the raw text of that address.
all_addresses = [(fileid,inaugural.raw(fileid)) for fileid in inaugural.fileids()]
all_speeches = comparative_emotion_analyzer(all_addresses,print_output=False,object_name="President")

结果处理

# The sorted result is not assigned, so all_speeches itself keeps its
# current order; in a notebook this only displays when it is the last
# expression of a cell.
all_speeches.sort_values(by="Surprise",ascending=False)
# Aggregate the four positive-leaning and the four negative-leaning columns.
all_speeches["All_Pos"]=(all_speeches['Trust']+all_speeches['Positive']+ all_speeches['Joy']+ all_speeches['Anticipation'])
all_speeches["All_Neg"]=(all_speeches['Fear']+all_speeches['Negative']+ all_speeches['Disgust']+ all_speeches['Sadness'])
# Net = positive aggregate minus negative aggregate.
all_speeches['Net']=all_speeches["All_Pos"]-all_speeches["All_Neg"]
all_speeches.sort_values(by="Net",ascending=False)['Net']

# Word-list based ratios for the same addresses, ranked by
# (positive ratio - negative ratio), highest first.
sents = do_pos_neg_sentiment_analysis([(x[0],x[1]) for x in all_addresses])
sorted(sents,key=lambda x: x[1]-x[2],reverse=True)

3. Simple Analysis

3.1 Word Cloud

简单生成单个

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# IPython magic: valid only inside a notebook/IPython session.
%matplotlib inline

# `text` must already hold the string to visualise; it is not defined in this snippet.
wordcloud = WordCloud(stopwords=STOPWORDS,background_color='white',width=3000,height=3000).generate(text)


# Show the generated cloud without axis ticks or frame.
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

词云组图比较

# (label, raw text) pairs for four inaugural addresses; requires `inaugural`
# (nltk.corpus) to be in scope already.
texts = [('trump',inaugural.raw('2017-Trump.txt')),('Obama',inaugural.raw('2013-Obama.txt')),
         ('Bush',inaugural.raw('2001-Bush.txt')),('Clinton',inaugural.raw('1997-Clinton.txt'))]
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline
# Strings to blank out of the text before a cloud is drawn.
# Add entries here whenever the cloud shows a term that carries no meaning.
DELETE_WORDS = []
def remove_words(text_string,DELETE_WORDS=DELETE_WORDS):
    """Return ``text_string`` with each ``DELETE_WORDS`` entry replaced by a space.

    Matching is by substring, so an entry is also replaced where it occurs
    inside a longer word.
    """
    cleaned = text_string
    for unwanted in DELETE_WORDS:
        cleaned = cleaned.replace(unwanted, ' ')
    return cleaned

#Remove short words
MIN_LENGTH = 0
def remove_short_words(text_string,min_length = MIN_LENGTH):
    """Drop every whitespace-separated word shorter than ``min_length``.

    Args:
        text_string: text to filter.
        min_length: minimum number of characters a word needs to be kept.

    Returns:
        ``text_string`` unchanged when ``min_length`` is 1 or less (no word
        can be shorter); otherwise the surviving words joined by single
        spaces.
    """
    if min_length <= 1:
        return text_string
    # Filtering the split words also catches short words at the start or
    # end of the text and next to newlines or tabs, which a search for
    # ' word ' misses.
    return ' '.join(word for word in text_string.split()
                    if len(word) >= min_length)


#Set up side by side clouds
COL_NUM = 2
ROW_NUM = 2
# squeeze=False keeps `axes` two-dimensional even when ROW_NUM or COL_NUM is 1.
fig, axes = plt.subplots(ROW_NUM, COL_NUM, figsize=(12,12), squeeze=False)

for i, entry in enumerate(texts):
    text_string = remove_words(entry[1])
    text_string = remove_short_words(text_string)
    # Fill the grid row by row; the column count comes from COL_NUM so the
    # layout follows the constants above.
    ax = axes[i//COL_NUM, i%COL_NUM]
    ax.set_title(entry[0])
    wordcloud = WordCloud(stopwords=STOPWORDS,background_color='white',width=1200,height=1000,max_words=20).generate(text_string)
    ax.imshow(wordcloud)
    ax.axis('off')
plt.show()

3.2 Complexity

Concept

Complexity Factors:

+ average word length: longer words add to complexity

+ average sentence length: longer sentences are more complex (unless the text is rambling!)

+ vocabulary: the ratio of unique words used to the total number of words (more variety, more complexity)

token: A sequence (or group) of characters of interest. For e.g., in the below analysis, a token = a word

+ Generally: A token is the base unit of analysis

+ So, the first step is to convert text into tokens and nltk text object

construct tokens

# NOTE(review): le_monde_data is not defined in the visible part of this
# file; presumably a PlaintextCorpusReader like community_data - confirm.
text = le_monde_data.raw()
import nltk
from nltk import sent_tokenize,word_tokenize 
# Wrap the sentence list and the word list in nltk.Text objects.
sentences = nltk.Text(sent_tokenize(text))
words = nltk.Text(word_tokenize(text))

Complexity

def get_complexity(text):
    """Return simple complexity statistics for ``text``.

    Args:
        text: the string to analyse.

    Returns:
        A 4-tuple ``(vocabulary size, characters per token, tokens per
        sentence, vocabulary size / token count)``. The vocabulary is the
        set of lower-cased tokens, the character count is ``len(text)``,
        and the two averages are truncated to ``int``. A text without
        tokens yields ``(0, 0, 0, 0.0)``.
    """
    from nltk import sent_tokenize, word_tokenize
    # Tokenize once; the result feeds both the count and the vocabulary.
    tokens = word_tokenize(text)
    if not tokens:
        # Nothing to divide by.
        return 0, 0, 0, 0.0
    num_chars=len(text)
    num_words=len(tokens)
    num_sentences=len(sent_tokenize(text))
    vocab = {token.lower() for token in tokens}
    return len(vocab),int(num_chars/num_words),int(num_words/num_sentences),len(vocab)/num_words
get_complexity(le_monde_data.raw())

3.3 Graph Analysis

Over Time

from nltk.corpus import inaugural
# Item 2 of get_complexity's result is the tokens-per-sentence figure;
# collect it for every address in corpus order.
sentence_lengths = [
    get_complexity(' '.join(inaugural.words(fileid)))[2]
    for fileid in inaugural.fileids()
]
plt.plot(sentence_lengths)

Dispersion Plots

show how the frequency of some words has changed over the course of the republic

text4.dispersion_plot(["government", "citizen", "freedom", "duties", "America",'independence','God','patriotism'])

分析谁是主角：

text2.dispersion_plot(['Elinor','Marianne','Edward','Willoughby','Brandon','Fanny'])

Stemming

from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()
text = inaugural.raw()
# Collapse paragraph breaks first, then the remaining line breaks.
striptext = text.replace('\n\n', ' ').replace('\n', ' ')
sentences = sent_tokenize(striptext)
words = word_tokenize(striptext)
# Stem and lower-case every token so the plot can be queried by stem.
stemmed_tokens = [p_stemmer.stem(token).lower() for token in words]
text = nltk.Text(stemmed_tokens)
text.dispersion_plot(["govern", "citizen", "free", "america",'independ','god','patriot'])

4. Entity Detection

Named entity detection is based on Part-of-speech tagging of words and chunks (groups of words)

Start with sentences (using a sentence tokenizer)
tokenize words in each sentence
chunk them. ne_chunk identifies likely chunked candidates (ne = named entity)
Finally build chunks using nltk's guess on what members of chunk represent (people, place, organization)

4.1

分成句子

import nltk
# Load NLTK's pre-trained English sentence tokenizer.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
# Sample containing an abbreviation ("e.g.") and two named entities.
sample_text = """
I was walking along thinking of many things. For e.g., I walked with my friend Bilkees Bijou through the campus of Columbia University. I 
thought of birds, of bees, of sealing wax. I thought of cabbages and kings.
"""
# Split the sample into a list of sentence strings.
sent_detector.tokenize(sample_text)

句子分成词语

word_list = nltk.word_tokenize(sent_detector.tokenize(sample_text)[1])

pos_tag：the word with nltk's best guess as to the part of speech

tagged=nltk.pos_tag(word_list)

ne_chunk: a "Sentence Tree" of parts of speech using a tokenized list of words. words that are candidate entities have an attribute "label"

# Chunk the POS-tagged tokens; candidate entities come back as elements
# that have a label() method.
chunked = nltk.ne_chunk(tagged)
# NOTE(review): assumes the second-to-last element is such a labelled
# chunk; a plain (word, tag) element has no label() - confirm.
chunked[-2].label()

hasattr(): checks whether a name is an attribute of an object

class my_class(object):
    """Tiny example class used to demonstrate ``hasattr()``."""
    def __init__(self,x):
        # Store the value on the instance; a plain local would vanish when
        # __init__ returns and check() would raise AttributeError.
        self.name = x
    def check(self):
        """Return the name given at construction."""
        return self.name
y = my_class('Jack')
hasattr(y,'check') #Return True

集合

def get_labeled_text(text,label_type='ALL'):
    """Collect the labelled chunks that ``nltk.ne_chunk`` finds in ``text``.

    Args:
        text: the string to analyse.
        label_type: keep only chunks with this label; ``'ALL'`` keeps every
            labelled chunk.

    Returns:
        dict mapping the chunk's words (space-joined) to
        ``[label, space-joined POS tags]``. If any exception is raised
        while processing, the exception message is returned as a string
        instead of the dict.
    """
    entities = {}
    try:
        splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        for sentence in splitter.tokenize(text.strip()):
            tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
            for node in nltk.ne_chunk(tagged_tokens):
                # Only labelled elements are entity candidates.
                if not hasattr(node, 'label'):
                    continue
                if label_type != "ALL" and node.label() != label_type:
                    continue
                leaves = node.leaves()
                phrase = ' '.join(leaf[0] for leaf in leaves)
                tags = ' '.join(leaf[1] for leaf in leaves)
                entities[phrase] = [node.label(), tags]
    except Exception as e:
        return str(e)
    return entities
get_labeled_text(community_data.raw(),'ORGANIZATION')

affect calculator

def get_affect(text,word,lower=True):
    """Average the VADER compound score of the sentences mentioning ``word``.

    Args:
        text: the document to search.
        word: the string to look for; matching is by substring.
        lower: when true, the match ignores case.

    Returns:
        Mean ``compound`` score over the matching sentences, or ``0`` when
        no sentence matches.
    """
    import nltk
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sent_detector.tokenize(text.strip())
    # Lower-case the search term once instead of once per sentence.
    needle = word.lower() if lower else word
    sentence_count = 0
    running_total = 0
    for sentence in sentences:
        haystack = sentence.lower() if lower else sentence
        if needle in haystack:
            # Score the sentence as written: lower-casing it would erase
            # capitalisation (and case-sensitive emoticons such as ":D")
            # before VADER sees them.
            vs = analyzer.polarity_scores(sentence)
            running_total += vs['compound']
            sentence_count += 1
    if sentence_count == 0:
        return 0
    return running_total/sentence_count

5. Text Summarization

5.1 Frequency Based

A naive form of summarization is to identify the most frequent words in a piece of text and use the occurrence of these words in sentences to rate the importance of a sentence.

Import

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from collections import OrderedDict
import pprint

Prep

# Flatten the raw corpus text to one line: paragraph breaks first, then
# the remaining single newlines, each replaced by a space.
text = community_data.raw()
striptext = text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')

去除stopwords

words = word_tokenize(striptext)
# Build the stop-word set once: calling stopwords.words() inside the
# comprehension re-reads the lists for every token. With no argument it
# covers every language NLTK ships.
stop_words = set(stopwords.words())
# Compare the lower-cased token so capitalised stop words ("The") are
# filtered too.
lowercase_words = [word.lower() for word in words
                  if word.lower() not in stop_words and word.isalpha()]

选择最频繁的20个词

# Count the filtered words, then take the 20 most common (word, count) pairs.
word_frequencies = FreqDist(lowercase_words)
most_frequent_words = word_frequencies.most_common(20)
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(most_frequent_words)

Initialize

+ candidate_sentences is a dictionary with the original sentence as the key, and its lowercase version as the value

+ summary_sentences is a list containing the sentences that will be included in the summary

+ candidate_sentence_counts is a dictionary with the original sentence as the key, and the sum of the frequencies of each word in the sentence as the value

summary_sentences = []
candidate_sentences = {}
candidate_sentence_counts = {}
sentences = sent_tokenize(striptext)
# Original sentence -> lower-cased copy used for matching.
for sentence in sentences:
    candidate_sentences[sentence] = sentence.lower()

for upper, lower in candidate_sentences.items():
    # Match whole tokens: a substring test would also count a frequent
    # word wherever it appears inside a longer word.
    sentence_tokens = set(word_tokenize(lower))
    count = sum(frequency_score
                for freq_word, frequency_score in most_frequent_words
                if freq_word in sentence_tokens)
    # Only sentences containing at least one frequent word are candidates.
    if count:
        candidate_sentence_counts[upper] = count

排序

# Keep the four highest-scoring sentences, best first; OrderedDict
# preserves that ranking.
sorted_sentences = OrderedDict(sorted(
                    candidate_sentence_counts.items(),
                    key = lambda x: x[1],
                    reverse = True)[:4])
pp.pprint(sorted_sentences)

连成一句

'\n'.join(sorted_sentences)

汇总成function

def build_naive_summary(text):
    """Pick the four sentences richest in the text's most frequent words.

    The 20 most common alphabetic, non-stop-word tokens are found first;
    each sentence then scores the sum of the corpus counts of the frequent
    words it contains.

    Args:
        text: the document to summarise.

    Returns:
        OrderedDict mapping up to four sentences to their scores, highest
        score first. Sentences containing none of the frequent words are
        never included.
    """
    from nltk.tokenize import word_tokenize
    from nltk.tokenize import sent_tokenize
    from nltk.probability import FreqDist
    from nltk.corpus import stopwords
    from collections import OrderedDict
    striptext = text.replace('\n\n', ' ').replace('\n', ' ')
    # Build the stop-word set once; with no argument stopwords.words()
    # covers every language NLTK ships.
    stop_words = set(stopwords.words())
    # Test the lower-cased token so capitalised stop words are removed too.
    lowercase_words = [word.lower() for word in word_tokenize(striptext)
                      if word.isalpha() and word.lower() not in stop_words]
    most_frequent_words = FreqDist(lowercase_words).most_common(20)
    candidate_sentence_counts = {}
    for sentence in sent_tokenize(striptext):
        # Whole-token matching: a substring test would also count a
        # frequent word wherever it appears inside a longer word.
        sentence_tokens = set(word_tokenize(sentence.lower()))
        score = sum(frequency_score
                    for freq_word, frequency_score in most_frequent_words
                    if freq_word in sentence_tokens)
        if score:
            candidate_sentence_counts[sentence] = score
    ranked = sorted(candidate_sentence_counts.items(),
                    key = lambda x: x[1],
                    reverse = True)
    return OrderedDict(ranked[:4])

5.2 Gensim

Gensim uses a network with sentences as nodes and 'lexical similarity' as weights on the arcs between nodes

Import

from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk import sent_tokenize,word_tokenize 
from nltk.book import *
import gensim.summarization

Initialize

text = community_data.raw()
# These three containers are initialised but not used by the gensim calls below.
summary_sentences = []
candidate_sentences = {}
candidate_sentence_counts = {}
# Flatten paragraph and line breaks to single spaces.
striptext = text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')
# NOTE(review): gensim.summarization appears to have been removed in
# gensim 4.0 - confirm the installed version before running.
# The summary (limited to about 100 words) is stored but not printed here.
summary = gensim.summarization.summarize(striptext, word_count=100) 
# Print the ten extracted keywords.
print(gensim.summarization.keywords(striptext,words=10))

6. Topic

+ The goal of topic modeling is to identify the major concepts underlying a piece of text

+ Topic modeling uses "Unsupervised Learning". No a-priori knowledge is necessary

6.1 LDA: Latent Dirichlet Allocation Model

+ Identifies potential topics using pruning techniques like 'upward closure'

+ Computes conditional probabilities for topic word sets

+ Identifies the most likely topics

+ Does this over multiple passes probabilistically picking topics in each pass

Basic assumptions:

Every document will be associated with a set of topics
The topics will be distributed across a probability distribution
Each topic will be represented in the document by a set of words
The words associated with the topic will be distributed across a probability distribution

爬text，存下

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url="https://www.slate.com"
# A timeout keeps the notebook from hanging on an unresponsive server.
page = requests.get(url, timeout=30)
bs_page = BeautifulSoup(page.content,'lxml')
all_links = bs_page.find_all('a')
categories = ['news_and_politics','news-and-politics']
followable_links = list()
for link in all_links:
    href = link.get('href')
    if href and any(cat in href for cat in categories):
        # Resolve relative links against the front page and skip repeats so
        # the same story is not downloaded (and counted) twice.
        absolute_link = urljoin(url, href)
        if absolute_link not in followable_links:
            followable_links.append(absolute_link)

story_list = list()
count=0
for link in followable_links:
    try:
        page=BeautifulSoup(requests.get(link, timeout=30).content,'lxml')
        text=page.find('body').find('section',class_='article__body').get_text().strip()
    except (requests.RequestException, AttributeError):
        # RequestException: the download failed.
        # AttributeError: find() returned None, i.e. the page has no
        # <body> or no article__body section. Skip such pages.
        continue
    story_list.append(text)
    count+=1

Import

from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
import pprint

Prepare Text

+ Clean it (remove numbers, end of line characters, common words)

+ Sentence tokenize it

+ Convert each sentence into a list of words

# Rebuild every story from its sentences: each sentence is stripped,
# has its newlines removed, and the pieces are re-joined with '. '.
for i, story in enumerate(story_list):
    sents = [sent.strip().replace('\n','') for sent in sent_tokenize(story)]
    story_list[i] = '. '.join(sents)

Doc转成一系列words

# One token list per story: lower-cased, whitespace-split, alphanumeric
# only, without stop words and without the site name.
texts = [
    [word for word in story.lower().split()
     if word.isalnum() and word not in STOPWORDS and word != 'slate']
    for story in story_list
]

Dictionary (word, frequency)

+ dictionary: key = word, value = integer (a unique number attached to each word). corpora.Dictionary generates this.

+ corpus: A list of (word index, frequency) pairs for each text. doc2bow generates this

# Assign an integer id to every distinct token in texts.
dictionary = corpora.Dictionary(texts) #(word_id,word) pairs
# One bag-of-words per entry of texts, i.e. per story rather than per sentence.
corpus = [dictionary.doc2bow(text) for text in texts] #(word_id,freq) pairs per story

Parameters:

+ Number of topics: The number of topics you want generated.

+ Passes: The number of passes the LDA model makes through the documents. More passes, slower analysis

LDA first randomly assigns words and word weights to each topic
In each pass, it refines the weights
In short, you want the number of passes to be wherever the gain (improved weights) is minimal

#Set parameters
num_topics = 5 #The number of topics that should be generated
passes = 10 #The number of passes handed to LdaModel
# NOTE(review): no random seed is passed, so the topics presumably differ
# from run to run - confirm if reproducibility matters.
lda = LdaModel(corpus,
              id2word=dictionary,
              num_topics=num_topics,
              passes=passes)
pp = pprint.PrettyPrinter(indent=4)
# Show each topic with its eight top words.
pp.pprint(lda.print_topics(num_words=8))
lda.show_topic(0)

from operator import itemgetter
# Topics of the first story with probability of at least 0.05 ...
lda.get_document_topics(corpus[0],minimum_probability=0.05,per_word_topics=False)
# ... and all of its topics ordered by probability, highest first.
sorted(lda.get_document_topics(corpus[0],minimum_probability=0,per_word_topics=False),key=itemgetter(1),reverse=True)

6.2 Matching topics to doc

create corpus

# `newdoc` must already hold the text of the document to classify.
text = newdoc
striptext = text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')
# Lower-case the tokens: the training token lists were built from
# story.lower(), so capitalised tokens would not be found in `dictionary`.
new_text = [[token.lower() for token in nltk.word_tokenize(striptext)]]

# Dictionary of the new document alone; the lookup below does not use it.
textdictionary = corpora.Dictionary(new_text)
# Encode the new document with the TRAINING dictionary so its word ids
# match the ones the LDA model was fitted on.
corpus_new = [dictionary.doc2bow(text) for text in new_text] #(word_id,freq) pairs per document

get topic

from operator import itemgetter
# Topics of the new document with probability of at least 0.05 ...
lda.get_document_topics(corpus_new[0],minimum_probability=0.05,per_word_topics=False)
# ... and all of its topics ordered by probability, highest first.
sorted(lda.get_document_topics(corpus_new[0],minimum_probability=0,per_word_topics=False),key=itemgetter(1),reverse=True)
# Inspect topic 1 in two forms.
lda.print_topic(topicno=1)
lda.show_topic(1)

word cloud

def draw_wordcloud(lda,topicnum,min_size=0,STOPWORDS=()):
    """Draw a word cloud for the top 50 words of one LDA topic.

    Each word is repeated in proportion to its share of the summed
    probability of those 50 words, and the repeated words are fed to
    ``WordCloud``.

    Args:
        lda: fitted model exposing ``show_topic(topicnum, topn=...)``.
        topicnum: index of the topic to draw.
        min_size: words shorter than this many characters are skipped.
        STOPWORDS: extra words to leave out of the cloud.

    Returns:
        None; the cloud is drawn on the current matplotlib axes.
    """
    from random import shuffle

    import matplotlib.pyplot as plt
    # Alias the package's stop-word set so the import does not overwrite
    # the STOPWORDS parameter.
    from wordcloud import WordCloud, STOPWORDS as WORDCLOUD_STOPWORDS

    # Query the topic once and reuse it for the total and the loop.
    topic_words = lda.show_topic(topicnum,topn=50)
    prob_total = sum(prob for _, prob in topic_words)
    word_list=[]
    for word,prob in topic_words:
        if word in STOPWORDS or len(word) < min_size:
            continue
        # Repeat the word once per percentage point of probability mass;
        # words below one percent are dropped by the int() truncation.
        freq = int(prob/prob_total*100)
        word_list.extend([word]*freq)

    shuffle(word_list)
    text = ' '.join(word_list)
    wordcloud = WordCloud(stopwords=WORDCLOUD_STOPWORDS,background_color='white',max_words=20).generate(text)

    plt.axis('off')
    plt.imshow(wordcloud)

    return None
draw_wordcloud(lda,4)

LDA可视化

# `import pyLDAvis.gensim_models as gensimvis` binds only `gensimvis`, so
# the top-level package is imported separately for pyLDAvis.display().
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
lda_display = gensimvis.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

7. Similarity

# NOTE(review): le_monde_data, fiveguys_data and shakeshack_data are not
# defined in the visible part of this file; presumably corpus readers like
# community_data - confirm.
doc_list = [community_data,le_monde_data,fiveguys_data,shakeshack_data]
all_text = community_data.raw() + le_monde_data.raw() + fiveguys_data.raw() + shakeshack_data.raw()

# One token list per document: lower-cased, whitespace-split,
# alphanumeric only, without stop words.
documents = [doc.raw() for doc in doc_list]
texts = [[word for word in document.lower().split()
        if word not in STOPWORDS and word.isalnum()]
        for document in documents]
# These assignments rebind the names `texts`, `dictionary` and `corpus`
# that the LDA section of this file also assigns.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

from gensim.similarities.docsim import Similarity
from gensim import corpora, models, similarities
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
# The index depends only on the corpus, so build it once and reuse it for
# every query document.
index = similarities.MatrixSimilarity(lsi[corpus])

# First query: project the document into LSI space and rank the corpus
# documents by similarity, most similar first.
doc = """???"""
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])

# Second query, same steps; `sims` ends up holding this ranking.
doc="""xxx"""
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
sims

weixin_44602072

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
E4523 Text Mining

Sentiment Analysis#Example 1: Compute the proportion of positive and negative words in a text.1.获得pos/neg词setdef get_pos_neg_words(): def get_words(url): import requests words = requests.get(url).content.decode('latin-1')
复制链接

扫一扫