目录
1.3 Weighted sentiment analysis using Vader
6.1 LDA: Latent Dirichlet Allocation Model
Types of analysis
- Sentiment analysis: Deciding whether a document (or concept) is positive or negative
- Entity analysis: Identifying entities (Named entities, Parts of speech) and properties of these entities
- Topic analysis: Deciding what the major topics associated with a piece of text are
- Text summarization: Summarizing a document (Cliff notes version!)
1. Sentiment Analysis*
1.1 simple sentiment analysis
- get pos & neg words list
def get_pos_neg_words():
    """Download the positive and negative opinion-word lists.

    Returns:
        tuple: ``(positive_words, negative_words)``, each a list of strings
        in the order they appear in the downloaded file.

    Raises:
        requests.RequestException: if a download fails, times out, or the
            server answers with an HTTP error status.
    """
    import requests

    def get_words(url):
        """Fetch one word list, dropping blank lines and ';' comment lines."""
        response = requests.get(url, timeout=30)
        # Fail loudly rather than parsing an HTML error page as words.
        response.raise_for_status()
        raw = response.content.decode('latin-1')
        # strip() removes a trailing '\r' so words from CRLF files still match.
        candidates = (line.strip() for line in raw.split('\n'))
        # One pass instead of popping from the middle of a list (quadratic).
        return [word for word in candidates if word and ';' not in word]

    # Get lists of positive and negative words
    p_url = 'http://ptrckprry.com/course/ssd/data/positive-words.txt'
    n_url = 'http://ptrckprry.com/course/ssd/data/negative-words.txt'
    positive_words = get_words(p_url)
    negative_words = get_words(n_url)
    return positive_words, negative_words
positive_words,negative_words = get_pos_neg_words()
- 打开文件
# Load the whole review file into a single string.
with open('community.txt', mode='r') as review_file:
    community = review_file.read()
- 计算正/负词比例
def do_pos_neg_sentiment_analysis(text_list,debug=False):
    """Compute the share of positive and negative words in each text.

    Args:
        text_list: iterable of sequences whose item 0 is a name and item 1
            is the text to analyse.
        debug: when true, print every matched word as it is found.

    Returns:
        list of ``(name, positive_ratio, negative_ratio)`` tuples, where each
        ratio is matches divided by the number of tokens in the text.
        A text with no tokens yields ``(name, 0.0, 0.0)``.
    """
    from nltk import word_tokenize

    positive_words, negative_words = get_pos_neg_words()
    # Sets make each membership test O(1); the lexicons are long lists.
    positive_set = set(positive_words)
    negative_set = set(negative_words)

    results = []
    for entry in text_list:
        name, body = entry[0], entry[1]
        # Tokenize once and reuse for both counting and normalising.
        tokens = word_tokenize(body)
        cpos = cneg = 0
        # NOTE(review): matching is case-sensitive, as in the original;
        # confirm whether tokens should be lowercased first.
        for word in tokens:
            if word in positive_set:
                if debug:
                    print("Positive", word)
                cpos += 1
            if word in negative_set:
                if debug:
                    print("Negative", word)
                cneg += 1
        total = len(tokens)
        if total:
            results.append((name, cpos / total, cneg / total))
        else:
            # Empty text: report zero ratios instead of dividing by zero.
            results.append((name, 0.0, 0.0))
    return results
do_pos_neg_sentiment_analysis([('community',community),('le_monde',le_monde)])
1.2 Using NRC Data
- 读取NRC data
def get_nrc_data(nrc="NRC-Emotion-Lexicon-Senselevel-v0.92.txt", header_lines=46):
    """Load an emotion lexicon file into a word -> emotions mapping.

    Each data line is expected to hold three tab-separated fields:
    term, emotion, and a 0/1 flag. Only rows whose flag is 1 are kept.

    Args:
        nrc: path of the lexicon file.
        header_lines: number of leading lines to skip before the data.

    Returns:
        dict mapping each term to the list of emotions flagged 1 for it,
        in file order.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a flag field is not an integer.
    """
    from itertools import islice

    emotion_dict = dict()
    with open(nrc, 'r') as f:
        for line in islice(f, header_lines, None):
            fields = line.strip().split('\t')
            # Skip blank or truncated rows instead of raising IndexError.
            if len(fields) < 3:
                continue
            if int(fields[2]) == 1:
                emotion_dict.setdefault(fields[0], []).append(fields[1])
    return emotion_dict
- 计算emotions的占比
def emotion_analyzer(text,emotion_dict=None):
    """Return the per-emotion share of whitespace-separated words in *text*.

    Args:
        text: the string to analyse; it is split on whitespace.
        emotion_dict: mapping of word -> list of emotions. When omitted, the
            module-level ``emotion_dict`` is used if one exists; otherwise
            the lexicon is loaded with ``get_nrc_data()``.

    Returns:
        dict mapping every emotion found in ``emotion_dict`` to the number of
        matching words divided by the total word count (0 for empty text).
    """
    if emotion_dict is None:
        # Resolved at call time: binding the default at definition time fails
        # with NameError when no global ``emotion_dict`` exists yet.
        emotion_dict = globals().get('emotion_dict') or get_nrc_data()

    # Set up the result dictionary
    emotion_count = {
        emotion: 0
        for emotion_list in emotion_dict.values()
        for emotion in emotion_list
    }

    # Analyze the text and normalize by total number of words
    words = text.split()
    if not words:
        return emotion_count
    # Split once and hoist the weight; re-splitting per hit was quadratic.
    weight = 1 / len(words)
    for word in words:
        for emotion in emotion_dict.get(word, ()):
            emotion_count[emotion] += weight
    return emotion_count
- 结果分析;输出dataframe
def comparative_emotion_analyzer(text_tuples,object_name="Restaurant",print_output=False):
    """Run ``emotion_analyzer`` on several texts and tabulate the scores.

    Args:
        text_tuples: iterable of sequences; item 0 is the row label and
            item 1 is the text to analyse.
        object_name: name given to the DataFrame index (and to the first
            column of the printed header).
        print_output: when true, also print a header line and one formatted
            line per text.

    Returns:
        pandas.DataFrame indexed by label with one column per emotion.
    """
    emotion_keys = ('fear', 'trust', 'negative', 'positive', 'joy',
                    'disgust', 'anticipation', 'sadness', 'surprise')
    row_format = "%-20s %1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f"

    if print_output:
        header_cells = (object_name, "fear", "trust", "negative", "positive",
                        "joy", "disgust", "anticip", "sadness", "surprise")
        print("%-20s %1s\t%1s %1s %1s %1s %1s %1s %1s %1s" % header_cells)

    import pandas as pd
    table = pd.DataFrame(columns=[object_name,'Fear','Trust','Negative',
                                  'Positive','Joy','Disgust','Anticipation',
                                  'Sadness','Surprise'],)
    table.set_index(object_name, inplace=True)

    for text_tuple in text_tuples:
        label = text_tuple[0]
        scores = emotion_analyzer(text_tuple[1])
        if print_output:
            # Labels are truncated to 20 characters in the printed view only.
            print(row_format % (label[0:20], *[scores[key] for key in emotion_keys]))
        table.loc[label] = [scores[key] for key in emotion_keys]
    return table
1.3 Weighted sentiment analysis using Vader
Vader contains a list of 7500 features weighted by how positive or negative they are. Human trained on twitter data and generally considered good for informal communication. 10 humans rated each feature in each tweet in context from -4 to +4. Computes a "compound" score based on heuristics (between -1 and +1). Includes sentiment of emoticons, punctuation, and other 'social media' lexicon elements.
def vader_comparison(texts):
    """Print the mean per-sentence VADER scores for each text.

    Args:
        texts: sequence of sequences; item 0 is a name and item 1 is the text.

    Returns:
        None. One line per text is printed with the averaged ``pos``,
        ``neg``, ``neu`` and ``compound`` scores.
    """
    # Imported here: the file first imports sent_tokenize further down, so
    # relying on the global would raise NameError when run in order.
    from nltk import sent_tokenize
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

    print("Name\t",' pos\t','neg\t','neu\t','compound')
    analyzer = SentimentIntensityAnalyzer()
    for entry in texts:
        name = entry[0]
        sentences = sent_tokenize(entry[1])
        sentence_count = len(sentences)
        pos = compound = neu = neg = 0
        for sentence in sentences:
            vs = analyzer.polarity_scores(sentence)
            # Each sentence contributes an equal share, giving a running mean.
            pos += vs['pos'] / sentence_count
            compound += vs['compound'] / sentence_count
            neu += vs['neu'] / sentence_count
            neg += vs['neg'] / sentence_count
        print('%-10s'%name,'%1.2f\t'%pos,'%1.2f\t'%neg,'%1.2f\t'%neu,'%1.2f\t'%compound)
2. Text Corpora
Corpus: An organized set of text documents
2.1 read directories
import nltk
from nltk.corpus import PlaintextCorpusReader
# Directory containing the review files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
community_root = "/Users/yu/Desktop/graduate/4523 Data Analytics/Lecture/Class 17-20- Text Mining/data/community"
# File-name pattern handed to the corpus reader.
community_files = "community.*"
community_data = PlaintextCorpusReader(community_root,community_files)
# (name, full raw text) pairs — the shape the analysis functions above index as [0] and [1].
restaurant_data = [('community',community_data.raw())]
2.2 Inaugural Speeches
- 读取文件
# Imported here because the file's own import of `inaugural` only appears in a
# later cell; without it this cell raises NameError when run in order.
from nltk.corpus import inaugural

inaugural.fileids()
# One (file id, raw text) pair per inaugural address.
all_addresses = [(fileid, inaugural.raw(fileid)) for fileid in inaugural.fileids()]
all_speeches = comparative_emotion_analyzer(all_addresses,print_output=False,object_name="President")
- 结果处理
# Rank speeches by 'Surprise', highest first (the result is displayed, not stored).
all_speeches.sort_values(by="Surprise",ascending=False)
# Sum of the Trust, Positive, Joy and Anticipation columns.
all_speeches["All_Pos"]=(all_speeches['Trust']+all_speeches['Positive']+ all_speeches['Joy']+ all_speeches['Anticipation'])
# Sum of the Fear, Negative, Disgust and Sadness columns.
all_speeches["All_Neg"]=(all_speeches['Fear']+all_speeches['Negative']+ all_speeches['Disgust']+ all_speeches['Sadness'])
# Net score: All_Pos minus All_Neg.
all_speeches['Net']=all_speeches["All_Pos"]-all_speeches["All_Neg"]
all_speeches.sort_values(by="Net",ascending=False)['Net']
# Same speeches scored with the simple positive/negative word-list ratios.
sents = do_pos_neg_sentiment_analysis([(x[0],x[1]) for x in all_addresses])
# Order by (positive ratio - negative ratio), highest first.
sorted(sents,key=lambda x: x[1]-x[2],reverse=True)
3. Simple Analysis
3.1 Word Cloud
- 简单生成单个
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline
# Build a 3000x3000 cloud on a white background, ignoring the library's stop words.
# NOTE(review): `text` is not assigned anywhere above this cell — set it before running.
wordcloud = WordCloud(stopwords=STOPWORDS,background_color='white',width=3000,height=3000).generate(text)
plt.imshow(wordcloud)
# Hide the axis ticks and frame; only the image is of interest.
plt.axis('off')
plt.show()
-
词云组图比较
# (label, raw speech text) pairs for the four speeches compared below.
# NOTE(review): relies on `inaugural` already being imported from nltk.corpus.
texts = [('trump',inaugural.raw('2017-Trump.txt')),('Obama',inaugural.raw('2013-Obama.txt')),
('Bush',inaugural.raw('2001-Bush.txt')),('Clinton',inaugural.raw('1997-Clinton.txt'))]
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline
# Remove unwanted words
# As we look at the cloud, we can get rid of words that don't make sense by adding them to this variable
DELETE_WORDS = []


def remove_words(text_string,DELETE_WORDS=DELETE_WORDS):
    """Replace every whole-word occurrence of the given words with a space.

    Args:
        text_string: the text to clean.
        DELETE_WORDS: iterable of words (or phrases) to blank out. Matching
            is case-sensitive.

    Returns:
        The cleaned string. Each removed occurrence becomes a single space.
    """
    import re

    for word in DELETE_WORDS:
        if not word:
            # An empty entry would otherwise insert a space between every character.
            continue
        # Lookarounds restrict the match to whole words, so deleting "us"
        # no longer damages "must" or "trust".
        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        text_string = re.sub(pattern, ' ', text_string)
    return text_string
# Remove short words
MIN_LENGTH = 0


def remove_short_words(text_string,min_length = MIN_LENGTH):
    """Drop whitespace-separated words shorter than *min_length*.

    Args:
        text_string: the text to filter.
        min_length: minimum number of characters a word needs to be kept.

    Returns:
        The input unchanged when ``min_length`` is 1 or less (nothing can be
        shorter); otherwise the surviving words joined by single spaces.
    """
    if min_length <= 1:
        return text_string
    # Filtering the token list also catches short words at the very start or
    # end of the text and next to newlines, which ' word ' replacement missed.
    return ' '.join(word for word in text_string.split() if len(word) >= min_length)
# Set up side by side clouds
COL_NUM = 2
ROW_NUM = 2
# squeeze=False keeps `axes` two-dimensional even for a single row or column.
fig, axes = plt.subplots(ROW_NUM, COL_NUM, figsize=(12,12), squeeze=False)
for i, entry in enumerate(texts):
    text_string = remove_words(entry[1])
    text_string = remove_short_words(text_string)
    # Derive the grid cell from COL_NUM so changing the layout constants works.
    ax = axes[i // COL_NUM, i % COL_NUM]
    ax.set_title(entry[0])
    wordcloud = WordCloud(stopwords=STOPWORDS,background_color='white',width=1200,height=1000,max_words=20).generate(text_string)
    ax.imshow(wordcloud)
    ax.axis('off')
plt.show()
3.2 Complexity
- Concept
Complexity Factors:
+ average word length: longer words add to complexity
+ average sentence length: longer sentences are more complex (unless the text is rambling!)
+ vocabulary: the ratio of unique words used to the total number of words (more variety, more complexity)
token: A sequence (or group) of characters of interest. For e.g., in the below analysis, a token = a word
+ Generally: A token is the base unit of analysis
+ So, the first step is to convert text into tokens and nltk text object
- construct tokens
# NOTE(review): `le_monde_data` is not created in the visible part of this file;
# presumably a corpus reader built like `community_data` — confirm.
text = le_monde_data.raw()
import nltk
from nltk import sent_tokenize,word_tokenize
# Wrap the sentence list and the word list in nltk.Text objects.
sentences = nltk.Text(sent_tokenize(text))
words = nltk.Text(word_tokenize(text))
- Complexity
def get_complexity(text):
    """Return simple complexity measures for *text*.

    Args:
        text: the raw string to analyse.

    Returns:
        tuple ``(vocabulary_size, chars_per_word, words_per_sentence,
        lexical_diversity)`` where vocabulary is the set of lowercased
        tokens, ``chars_per_word`` is ``len(text) / token count`` truncated
        to int (all characters are counted, including spaces and
        punctuation), ``words_per_sentence`` is truncated to int, and
        ``lexical_diversity`` is vocabulary size divided by token count.
        Text with no tokens or no sentences yields ``(0, 0, 0, 0.0)``.
    """
    # Tokenize once; the token list feeds both the count and the vocabulary.
    tokens = word_tokenize(text)
    num_words = len(tokens)
    num_sentences = len(sent_tokenize(text))
    if not num_words or not num_sentences:
        # Empty input: avoid dividing by zero.
        return 0, 0, 0, 0.0
    num_chars = len(text)
    vocab = {token.lower() for token in tokens}
    return (len(vocab), int(num_chars / num_words),
            int(num_words / num_sentences), len(vocab) / num_words)
get_complexity(le_monde_data.raw())
3.3 Graph Analysis
- Over Time
from nltk.corpus import inaugural
# Index 2 of get_complexity's result is the words-per-sentence figure;
# collect it for every inaugural address in corpus order and plot the series.
sentence_lengths = [
    get_complexity(' '.join(inaugural.words(fileid)))[2]
    for fileid in inaugural.fileids()
]
plt.plot(sentence_lengths)
- Dispersion Plots
show how the frequency of some words has changed over the course of the republic
text4.dispersion_plot(["government", "citizen", "freedom", "duties", "America",'independence','God','patriotism'])
分析谁是主角:
text2.dispersion_plot(['Elinor','Marianne','Edward','Willoughby','Brandon','Fanny'])
- Stemming
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()
# Raw text of the whole inaugural corpus.
text = inaugural.raw()
# Flatten paragraph breaks first, then remaining line breaks, into spaces.
striptext = text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')
sentences = sent_tokenize(striptext)
words = word_tokenize(striptext)
# `text` is rebound here from the raw string to an nltk.Text of lowercased stems.
text = nltk.Text([p_stemmer.stem(i).lower() for i in words])
# The plotted terms are stems (e.g. 'govern', 'independ'), matching the stemmed tokens.
text.dispersion_plot(["govern", "citizen", "free", "america",'independ','god','patriot'])
4. Entity Detection
Named entity detection is based on Part-of-speech tagging of words and chunks (groups of words)
- Start with sentences (using a sentence tokenizer)
- tokenize words in each sentence
- chunk them. ne_chunk identifies likely chunked candidates (ne = named entity)
- Finally build chunks using nltk's guess on what members of chunk represent (people, place, organization)
4.1
- 分成句子
import nltk
# Load the English punkt sentence tokenizer shipped as an NLTK data resource.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
# Sample passage; note the "e.g.," abbreviation in the middle of a sentence.
sample_text = """
I was walking along thinking of many things. For e.g., I walked with my friend Bilkees Bijou through the campus of Columbia University. I
thought of birds, of bees, of sealing wax. I thought of cabbages and kings.
"""
# Split the sample into sentences (the result is displayed, not stored).
sent_detector.tokenize(sample_text)
- 句子分成词语
# Tokenize the second detected sentence (index 1) into words.
word_list = nltk.word_tokenize(sent_detector.tokenize(sample_text)[1])
- pos_tag:the word with nltk's best guess as to the part of speech
# Tag each token of `word_list` with a part of speech.
tagged=nltk.pos_tag(word_list)
- ne_chunk: a "Sentence Tree" of parts of speech using a tokenized list of words. words that are candidate entities have an attribute "label"
# Chunk the tagged tokens; candidate entities come back as elements with a label.
chunked = nltk.ne_chunk(tagged)
# Label of the second-to-last element.
# NOTE(review): elements that are not entity subtrees presumably lack .label()
# (the code further down guards with hasattr) — confirm index -2 is an entity here.
chunked[-2].label()
- hasattr(): checks whether a name is an attribute of an object
class my_class(object):
    """Tiny demo class used to illustrate ``hasattr``."""

    def __init__(self,x):
        """Remember *x* as the instance's name."""
        # Must be stored on the instance; a bare local `name = x` is discarded
        # when __init__ returns and makes check() raise AttributeError.
        self.name = x

    def check(self):
        """Return the name given at construction."""
        return self.name
y = my_class('Jack')
hasattr(y,'check') #Return True
- 集合
def get_labeled_text(text,label_type='ALL'):
    """Collect the named-entity chunks found in *text*.

    Args:
        text: raw text; it is split into sentences, tokenized, POS-tagged
            and chunked with nltk.
        label_type: keep only chunks with this label, or every labelled
            chunk when set to ``'ALL'``.

    Returns:
        dict mapping each entity string to ``[label, space-joined POS tags]``.
        If any exception is raised while processing, the exception message
        is returned as a string instead of the dict.
    """
    entities = {}
    try:
        splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        for sentence in splitter.tokenize(text.strip()):
            parsed = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)))
            for node in parsed:
                # Only elements carrying a label are entity candidates.
                if not hasattr(node, 'label'):
                    continue
                if label_type != "ALL" and node.label() != label_type:
                    continue
                leaves = node.leaves()
                phrase = ' '.join(leaf[0] for leaf in leaves)
                pos_tags = ' '.join(leaf[1] for leaf in leaves)
                # A later occurrence of the same phrase overwrites the earlier entry.
                entities[phrase] = [node.label(), pos_tags]
    except Exception as error:
        return str(error)
    return entities
get_labeled_text(community_data.raw(),'ORGANIZATION')
- affect calculator
def get_affect(text,word,lower=True):
    """Average the VADER compound score of the sentences that mention *word*.

    Args:
        text: raw text, split into sentences with the punkt tokenizer.
        word: string to look for; a sentence counts when it contains
            *word* as a substring.
        lower: when true, both the sentence and *word* are lowercased before
            matching, and the lowercased sentence is what gets scored.

    Returns:
        Mean compound score over the matching sentences, or 0 if none match.
    """
    import nltk
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

    scorer = SentimentIntensityAnalyzer()
    splitter = nltk.data.load('tokenizers/punkt/english.pickle')

    needle = word
    compound_scores = []
    for candidate in splitter.tokenize(text.strip()):
        if lower:
            candidate, needle = candidate.lower(), needle.lower()
        if needle in candidate:
            compound_scores.append(scorer.polarity_scores(candidate)['compound'])

    if not compound_scores:
        return 0
    return sum(compound_scores)/len(compound_scores)
5. Text Summarization
5.1 Frequency Based
A naive form of summarization is to identify the most frequent words in a piece of text and use the occurrence of these words in sentences to rate the importance of a sentence.
-
Import
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from collections import OrderedDict
import pprint
- Prep
text = community_data.raw()
# Flatten paragraph breaks first, then any remaining line breaks, into spaces.
striptext = text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')
- 去除stopwords
words = word_tokenize(striptext)
# Build the stop-word set once: calling stopwords.words() inside the
# comprehension rebuilt the full list for every token.
stop_words = set(stopwords.words())
# Compare the lowercased token so capitalized stop words ("The", "And")
# are filtered as well; keep purely alphabetic tokens only.
lowercase_words = [word.lower() for word in words
                   if word.isalpha() and word.lower() not in stop_words]
- 选择最频繁的20个词
# Count every remaining word once and reuse the distribution for the top 20.
word_frequencies = FreqDist(lowercase_words)
most_frequent_words = word_frequencies.most_common(20)
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(most_frequent_words)
- Initialize
+ candidate_sentences is a dictionary with the original sentence as the key, and its lowercase version as the value
+ summary_sentences is a list containing the sentences that will be included in the summary
+ candidate_sentence_counts is a dictionary with the original sentence as the key, and the sum of the frequencies of each word in the sentence as the value
summary_sentences = []
sentences = sent_tokenize(striptext)
# Original sentence -> lowercase copy used for matching.
candidate_sentences = {sentence: sentence.lower() for sentence in sentences}
# Score = sum of the frequencies of the top words that occur (as substrings)
# in the lowercase sentence.
candidate_sentence_counts = {
    original: sum(frequency_score
                  for freq_word, frequency_score in most_frequent_words
                  if freq_word in lowered)
    for original, lowered in candidate_sentences.items()
}
- 排序
# Keep the four highest-scoring sentences, best first; the OrderedDict maps
# each kept sentence to its score.
sorted_sentences = OrderedDict(sorted(
candidate_sentence_counts.items(),
key = lambda x: x[1],
reverse = True)[:4])
pp.pprint(sorted_sentences)
- 连成一句
'\n'.join(sorted_sentences)
- 汇总成function
def build_naive_summary(text, num_sentences=4, num_keywords=20):
    """Pick the sentences that contain the most frequent words of *text*.

    Args:
        text: raw text to summarise.
        num_sentences: how many top-scoring sentences to return.
        num_keywords: how many of the most frequent non-stop-words are used
            for scoring.

    Returns:
        collections.OrderedDict mapping each selected sentence to its score,
        highest score first. A sentence's score is the sum of the corpus
        frequencies of the keywords that occur in it as substrings.
    """
    from collections import OrderedDict
    from nltk.corpus import stopwords
    from nltk.probability import FreqDist
    from nltk.tokenize import sent_tokenize
    from nltk.tokenize import word_tokenize

    # Flatten paragraph and line breaks so sentences are not cut by newlines.
    striptext = text.replace('\n\n', ' ').replace('\n', ' ')

    # Build the stop-word set once (it was rebuilt per token before) and test
    # the lowercased token so capitalized stop words are removed too.
    stop_words = set(stopwords.words())
    lowercase_words = [word.lower() for word in word_tokenize(striptext)
                       if word.isalpha() and word.lower() not in stop_words]
    most_frequent_words = FreqDist(lowercase_words).most_common(num_keywords)

    candidate_sentence_counts = {}
    for sentence in sent_tokenize(striptext):
        lowered = sentence.lower()
        candidate_sentence_counts[sentence] = sum(
            frequency_score
            for freq_word, frequency_score in most_frequent_words
            if freq_word in lowered)

    ranked = sorted(candidate_sentence_counts.items(),
                    key=lambda item: item[1],
                    reverse=True)
    return OrderedDict(ranked[:num_sentences])
5.2 Gensim
Gensim uses a network with sentences as nodes and 'lexical similarity' as weights on the arcs between nodes
- Import
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk import sent_tokenize,word_tokenize
from nltk.book import *
import gensim.summarization
- Initialize
text = community_data.raw()
# NOTE(review): the next three containers are not used by the gensim calls below.
summary_sentences = []
candidate_sentences = {}
candidate_sentence_counts = {}
# Flatten paragraph breaks first, then any remaining line breaks, into spaces.
striptext = text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')
# NOTE(review): gensim.summarization is reportedly absent from gensim 4.x —
# confirm the installed gensim version still provides summarize()/keywords().
summary = gensim.summarization.summarize(striptext, word_count=100)
print(gensim.summarization.keywords(striptext,words=10))
6. Topic
+ The goal of topic modeling is to identify the major concepts underlying a piece of text
+ Topic modeling uses "Unsupervised Learning". No a-priori knowledge is necessary
6.1 LDA: Latent Dirichlet Allocation Model
+ Identifies potential topics using pruning techniques like 'upward closure'
+ Computes conditional probabilities for topic word sets
+ Identifies the most likely topics
+ Does this over multiple passes probabilistically picking topics in each pass
Basic assumptions:
- Every document will be associated with a set of topics
- The topics will be distributed across a probability distribution
- Each topic will be represented in the document by a set of words
- The words associated with the topic will be distributed across a probability distribution
- 爬text,存下
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url="https://www.slate.com"
page = requests.get(url, timeout=30)
bs_page = BeautifulSoup(page.content,'lxml')
all_links = bs_page.find_all('a')
categories = ['news_and_politics','news-and-politics']

# Collect each matching link once, in page order, as an absolute URL.
followable_links = list()
seen_links = set()
for link in all_links:
    href = link.get('href')
    if not href or not any(cat in href for cat in categories):
        continue
    # Site-relative hrefs cannot be fetched as-is; resolve them against the front page.
    absolute_link = urljoin(url, href)
    if absolute_link not in seen_links:
        seen_links.add(absolute_link)
        followable_links.append(absolute_link)

story_list = list()
for link in followable_links:
    try:
        response = requests.get(link, timeout=30)
    except requests.RequestException:
        # Skip unreachable pages, but let unrelated errors surface.
        continue
    body = BeautifulSoup(response.content,'lxml').find('body')
    section = body.find('section',class_='article__body') if body else None
    if section is None:
        # Not an article page (no article body section).
        continue
    text = section.get_text().strip()
    story_list.append(text)
# Number of stories collected.
count = len(story_list)
- Import
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
import pprint
- Prepare Text
+ Clean it (remove numbers, end of line characters, common words)
+ Sentence tokenize it
+ Convert each sentence into a list of words
for i, story in enumerate(story_list):
    # Turn in-sentence line breaks into spaces; deleting them outright glued
    # the last word of one line to the first word of the next.
    sents = [sent.strip().replace('\n', ' ') for sent in sent_tokenize(story)]
    # Sentences keep their own end punctuation, so join with a plain space
    # (joining with '. ' produced doubled periods).
    story_list[i] = ' '.join(sents)
- Doc转成一系列words
# One token list per story: lowercase, split on whitespace, then drop stop
# words, tokens that are not purely alphanumeric (so any token with attached
# punctuation is discarded) and the site name 'slate'.
texts = [[word for word in story.lower().split()
if word not in STOPWORDS and word.isalnum() and not word.lower() == 'slate']
for story in story_list]
- Dictionary (word, frequency)
+ dictionary: key = word, value = integer (a unique number attached to each word). corpora.Dictionary generates this.
+ corpus: A list of (word index, frequency) pairs for each text. doc2bow generates this
# Vocabulary built from the token lists: assigns an integer id to each distinct word.
dictionary = corpora.Dictionary(texts) #(word_id,word) pairs
# Bag-of-words form of every story in `texts`.
corpus = [dictionary.doc2bow(text) for text in texts] #(word_id,freq) pairs by sentence
- LDA
Parameters:
+ Number of topics: The number of topics you want generated.
+ Passes: The number of passes the LDA model makes through the documents. More passes mean a slower analysis
- LDA first randomly assigns words and word weights to each topic
- In each pass, it refines the weights
- In short, you want the number of passes to be wherever the gain (improved weights) is minimal
#Set parameters
num_topics = 5 #The number of topics that should be generated
passes = 10  # value passed as the model's `passes` argument
# Train the LDA model on the bag-of-words corpus.
# NOTE(review): no random seed is passed, so topics may differ between runs — confirm if reproducibility matters.
lda = LdaModel(corpus,
id2word=dictionary,
num_topics=num_topics,
passes=passes)
pp = pprint.PrettyPrinter(indent=4)
# Show every topic with 8 words each.
pp.pprint(lda.print_topics(num_words=8))
# Inspect topic 0.
lda.show_topic(0)
from operator import itemgetter
# Topics of the first document, requesting a 0.05 minimum probability.
lda.get_document_topics(corpus[0],minimum_probability=0.05,per_word_topics=False)
# Same query with minimum_probability=0, sorted by the second item of each pair, highest first.
sorted(lda.get_document_topics(corpus[0],minimum_probability=0,per_word_topics=False),key=itemgetter(1),reverse=True)
6.2 Matching topics to doc
-
create corpus
# NOTE(review): `newdoc` must hold the raw text of the document to classify;
# it is not assigned in the visible part of this file.
text = newdoc
# Flatten paragraph breaks first, then any remaining line breaks, into spaces.
striptext = text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')
# Lowercase before tokenizing: the training dictionary was built from
# lowercased text, so capitalized tokens would be dropped by doc2bow.
new_text = [nltk.word_tokenize(striptext.lower())]
# Vocabulary of the new document alone (word <-> id mapping); not used for
# the topic lookup below, which must go through the training dictionary.
textdictionary = corpora.Dictionary(new_text)
# Bag-of-words vector expressed in the training dictionary's word ids.
corpus_new = [dictionary.doc2bow(text) for text in new_text]
- get topic
from operator import itemgetter
# Topics of the new document, requesting a 0.05 minimum probability.
lda.get_document_topics(corpus_new[0],minimum_probability=0.05,per_word_topics=False)
# Same query with minimum_probability=0, sorted by the second item of each pair, highest first.
sorted(lda.get_document_topics(corpus_new[0],minimum_probability=0,per_word_topics=False),key=itemgetter(1),reverse=True)
# Inspect topic 1 in both of the model's display forms.
lda.print_topic(topicno=1)
lda.show_topic(1)
- word cloud
def draw_wordcloud(lda,topicnum,min_size=0,STOPWORDS=()):
    """Draw a word cloud of the top words of one LDA topic.

    Args:
        lda: trained model exposing ``show_topic(topicnum, topn=...)``
            returning ``(word, probability)`` pairs.
        topicnum: id of the topic to draw.
        min_size: words shorter than this many characters are skipped.
        STOPWORDS: extra words to exclude, in addition to the wordcloud
            package's built-in stop-word list.

    Returns:
        None. The cloud is drawn on the current matplotlib axes.
    """
    # Aliased so the import no longer overwrites the STOPWORDS parameter,
    # which previously made caller-supplied stop words ineffective for the cloud.
    from wordcloud import WordCloud, STOPWORDS as WORDCLOUD_STOPWORDS
    import matplotlib.pyplot as plt
    # The `%matplotlib inline` magic was dropped from the function body: it is
    # not valid Python and the notebook already selects the backend at top level.

    excluded = set(STOPWORDS) | set(WORDCLOUD_STOPWORDS)
    # Query the model once and weight words by probability directly, instead
    # of repeating each word in a shuffled string for WordCloud to re-count.
    frequencies = {
        word: float(prob)
        for word, prob in lda.show_topic(topicnum, topn=50)
        if prob > 0 and word not in excluded and len(word) >= min_size
    }

    wordcloud = WordCloud(background_color='white',max_words=20).generate_from_frequencies(frequencies)
    plt.axis('off')
    plt.imshow(wordcloud)
    return None
draw_wordcloud(lda,4)
- LDA可视化
# `import pyLDAvis.gensim_models as gensimvis` binds only the alias, so the
# top-level package must be imported explicitly for pyLDAvis.display below.
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

lda_display = gensimvis.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)
7. Similarity
# Imports first, so `corpora`/`models`/`similarities` exist before they are used.
from gensim.similarities.docsim import Similarity
from gensim import corpora, models, similarities

doc_list = [community_data,le_monde_data,fiveguys_data,shakeshack_data]
all_text = community_data.raw() + le_monde_data.raw() + fiveguys_data.raw() + shakeshack_data.raw()
documents = [doc.raw() for doc in doc_list]
# One token list per document: lowercase, whitespace split, stop words and
# non-alphanumeric tokens removed.
texts = [[word for word in document.lower().split()
          if word not in STOPWORDS and word.isalnum()]
         for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Two-topic LSI model over the corpus.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
# The similarity index depends only on the corpus, so build it once and
# reuse it for every query instead of rebuilding it per document.
index = similarities.MatrixSimilarity(lsi[corpus])

# First query (placeholder text): rank corpus documents, most similar first.
doc = """???"""
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])

# Second query (placeholder text); its ranking replaces the first one.
doc="""xxx"""
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
sims