《精通Python自然语言处理》
Deepti Chopra(印度)
王威 译
第七章 情感分析:我很快乐
情感分析(情感生成)被定义为确定一个字符序列背后所隐含的情感信息的过程。
7.1情感分析简介
对电影评价进行情感分析:
import nltk
import random
from nltk.corpus import movie_reviews
# Pair every review's word list with its category label ('pos' / 'neg').
docs = [
    (list(movie_reviews.words(fid)), cat)
    for cat in movie_reviews.categories()
    for fid in movie_reviews.fileids(cat)
]
random.shuffle(docs)
all_tokens = nltk.FreqDist(x.lower() for x in movie_reviews.words())
# FreqDist.keys() is not ordered by frequency in NLTK 3, so slicing it picks
# arbitrary words; most_common() yields the 2000 most frequent tokens.
token_features = [token for token, _count in all_tokens.most_common(2000)]
print(token_features[:100])
def doc_features(docs, vocabulary=None):
    """Build the bag-of-words feature dict for one document.

    Args:
        docs: Iterable of the document's word tokens.
        vocabulary: Words to test for; defaults to the module-level
            ``token_features`` list.

    Returns:
        Dict mapping ``'contains(<word>)'`` to True/False for every
        vocabulary word.
    """
    if vocabulary is None:
        vocabulary = token_features
    doc_words = set(docs)  # set gives O(1) membership tests
    return {'contains(%s)' % word: (word in doc_words) for word in vocabulary}
# Show the feature dict of one known positive review as a sanity check.
print(doc_features(movie_reviews.words('pos/cv957_8737.txt')))
# Turn every (words, category) pair into a (features, category) pair.
feature_sets = [(doc_features(d), c) for (d,c) in docs]
# The first 100 shuffled documents are held out for testing; the rest train.
train_sets, test_sets = feature_sets[100:], feature_sets[:100]
classifiers = nltk.NaiveBayesClassifier.train(train_sets)
# Report accuracy on the held-out documents.
print(nltk.classify.accuracy(classifiers, test_sets))
classifiers.show_most_informative_features(5)
Most Informative Features
contains(damon) = True    pos : neg = 11.2 : 1.0
contains(outstanding) = True    pos : neg = 10.6 : 1.0
contains(mulan) = True    pos : neg = 8.8 : 1.0
contains(seagal) = True    neg : pos = 8.4 : 1.0
contains(wonderfully) = True    pos : neg = 7.4 : 1.0
每个标识符还包括三个实体,即:单词、词条和标记。
执行文本预处理:
import nltk
class Splitter(object):
    """Split raw text into sentences, and each sentence into word tokens."""

    def __init__(self):
        # Punkt sentence splitter and Treebank word tokenizer from NLTK.
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """Return ``text`` as a list of sentences, each a list of tokens."""
        sentences = self.nltk_splitter.tokenize(text)
        return [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
class POSTagger(object):
    """Attach part-of-speech tags to tokenized sentences."""

    def __init__(self):
        pass

    def pos_tag(self, sentences):
        """Tag every token of every sentence.

        Args:
            sentences: List of sentences, each a list of word tokens.

        Returns:
            List of sentences, each a list of ``(word, word, [postag])``
            tuples; the second slot repeats the word form.
        """
        pos = [nltk.pos_tag(sentence) for sentence in sentences]
        return [[(word, word, [postag]) for (word, postag) in sentence]
                for sentence in pos]
为每一个标识符生成包含三个元素的元组(即单词、词条和词性标记):
# Demo: split a short text into sentences/tokens, then POS-tag it.
text = """Why are you looking disappointed. We will go to restaurant for dinner."""
splitter = Splitter()
postagger = POSTagger()
splitted_sentences = splitter.split(text)
print(splitted_sentences)
pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
print(pos_tagged_sentences)
使用字典来执行词性标注:
class DictionaryTagger(object):
    """Tag tokens and multi-word expressions using YAML dictionaries.

    Each dictionary maps an expression (one or more space-separated words)
    to a list of tags.
    """

    def __init__(self, dictionary_paths):
        """Load and merge the dictionaries found at ``dictionary_paths``."""
        # PyYAML was already required by the original code (`yaml.load`).
        import yaml

        dictionaries = []
        for path in dictionary_paths:
            # `with` closes every file; the original lazy map() never ran.
            with open(path, 'r') as dict_file:
                # safe_load: dictionaries are plain data, and yaml.load()
                # without a Loader is unsafe and rejected by recent PyYAML.
                dictionaries.append(yaml.safe_load(dict_file) or {})
        self.dictionary = {}
        self.max_key_size = 0
        for curr_dict in dictionaries:
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    # Copy so later extend() calls do not mutate the input.
                    self.dictionary[key] = list(curr_dict[key])
                    self.max_key_size = max(self.max_key_size, len(key))

    def tag(self, postagged_sentences):
        """Tag every sentence; see :meth:`tag_sentence`."""
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """Tag one sentence, preferring the longest matching expression.

        Args:
            sentence: List of ``(form, lemma, [tags])`` tuples.
            tag_with_lemmas: Match on lemmas instead of word forms.

        Returns:
            List of ``(form, lemma, [tags])`` tuples. A matched multi-word
            expression collapses into one tuple; a matched single token
            keeps its previous tags after the dictionary tags.
        """
        tagged_sentence = []
        N = len(sentence)
        # Upper bound on the match window; falls back to the whole sentence.
        max_span = self.max_key_size if self.max_key_size else N
        i = 0
        while i < N:
            j = min(i + max_span, N)  # avoid overflow
            tagged = False
            while j > i:
                expression_form = ' '.join(word[0] for word in sentence[i:j]).lower()
                expression_lemma = ' '.join(word[1] for word in sentence[i:j]).lower()
                literal = expression_lemma if tag_with_lemmas else expression_form
                if literal in self.dictionary:
                    is_single_token = j - i == 1
                    original_position = i
                    i = j  # resume after the match; also ends the inner loop
                    taggings = list(self.dictionary[literal])
                    tagged_expression = (expression_form, expression_lemma, taggings)
                    if is_single_token:
                        # A single token keeps its previous taggings.
                        tagged_expression[2].extend(sentence[original_position][2])
                    tagged_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1  # try a shorter window
            if not tagged:
                tagged_sentence.append(sentence[i])
                i += 1
        return tagged_sentence
计算积极表达和消极表达的数量:
def value_of(sentiment):
    """Map a sentiment tag to a score: 'positive' -> 1, 'negative' -> -1, else 0."""
    if sentiment == 'positive':
        return 1
    if sentiment == 'negative':
        return -1
    return 0
def sentiment_score(review):
    """Sum the sentiment values of every tag in a dictionary-tagged review.

    Args:
        review: List of sentences, each a list of ``(form, lemma, [tags])``
            tuples.

    Returns:
        Count of positive tags minus count of negative tags.
    """
    # The original iterated a global `dict_tagged_sentences` and ignored
    # its argument; score the review that was actually passed in.
    return sum(value_of(tag)
               for sentence in review
               for token in sentence
               for tag in token[2])
7.1.1使用 NER执行情感分析
NER是一个找出命名实体并将其分类为不同的命名实体类的过程。我们可以使用不同的技术来执行NER,例如基于规则的方法、列表查找方法和统计学方法(隐马尔科夫模型、最大熵马尔科夫模型、支持向量机、条件随机场和决策树)。
7.1.2使用机器学习执行情感分析
nltk.sentiment.sentiment_analyzer 模块用于执行情感分析:
from __future__ import print_function
from collections import defaultdict
from nltk.classify.util import apply_features, accuracy as eval_accuracy
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import (BigramAssocMeasures, precision as eval_precision,
                          recall as eval_recall, f_measure as eval_f_measure)
from nltk.probability import FreqDist
from nltk.sentiment.util import save_file, timer
class SentimentAnalyzer(object):
    """
    A tool for Sentiment Analysis which is based on machine learning techniques.
    """

    def __init__(self, classifier=None):
        # Maps each feature-extractor function to a list of keyword-argument
        # dicts, one per registration.
        self.feat_extractors = defaultdict(list)
        self.classifier = classifier
返回文本中所有重复的单词:
def all_words(self, documents, labeled=None):
    """Return every word of every document, duplicates included.

    Args:
        documents: List of word lists, or of ``(words, label)`` tuples.
        labeled: Whether documents are ``(words, label)`` tuples. When None
            it is inferred from the type of the first document.

    Returns:
        Flat list of words in document order.
    """
    all_words = []
    if labeled is None:
        labeled = bool(documents) and isinstance(documents[0], tuple)
    if labeled:
        for words, _sentiment in documents:
            all_words.extend(words)
    else:
        for words in documents:
            all_words.extend(words)
    return all_words
在文本上应用特征提取函数:
def apply_features(self, documents, labeled=None):
    """Apply ``self.extract_features`` to ``documents``.

    Delegates to the ``apply_features`` helper imported from
    ``nltk.classify.util``.
    NOTE(review): this must sit inside the SentimentAnalyzer class body;
    at module level this def would shadow the imported helper of the same
    name.
    """
    return apply_features(self.extract_features, documents, labeled)
返回单词特征:
def unigram_word_feats(self, words, top_n=None, min_freq=0):
    """Return the most frequent words to use as unigram features.

    Args:
        words: Iterable of word tokens.
        top_n: Keep only the ``top_n`` most common words; None keeps all.
        min_freq: Keep only words occurring more than ``min_freq`` times.

    Returns:
        List of words, most frequent first.
    """
    unigram_feats_freqs = FreqDist(word for word in words)
    return [w for w, f in unigram_feats_freqs.most_common(top_n)
            if unigram_feats_freqs[w] > min_freq]
返回二元语法特征:
def bigram_collocation_feats(self, documents, top_n=None, min_freq=3,
                             assoc_measure=BigramAssocMeasures.pmi):
    """Return the ``top_n`` bigram collocations found in ``documents``.

    Args:
        documents: List of token lists.
        top_n: Number of best-scoring bigrams to return.
        min_freq: Bigrams seen fewer than ``min_freq`` times are dropped.
        assoc_measure: Scoring function used to rank bigrams.

    Returns:
        The best-scoring bigrams as returned by ``finder.nbest``.
    """
    finder = BigramCollocationFinder.from_documents(documents)
    finder.apply_freq_filter(min_freq)
    return finder.nbest(assoc_measure, top_n)


# Backward-compatible alias for the misspelled name used in the source.
bagram_collocation_feats = bigram_collocation_feats
通过使用特征集来分类一个给定的实例:
def classify(self, instance):
    """Classify a single instance with ``self.classifier``.

    Args:
        instance: One unlabeled document (list of tokens).

    Returns:
        The label predicted for the instance's extracted features.
    """
    instance_feats = self.apply_features([instance], labeled=False)
    return self.classifier.classify(instance_feats[0])
抽取文本的特征:
def add_feat_extractor(self, function, **kwargs):
    """Register ``function`` as a feature extractor.

    Args:
        function: Callable invoked as ``function(document, **kwargs)``.
        **kwargs: Keyword arguments stored for that call. Registering the
            same function again adds another parameter set.
    """
    self.feat_extractors[function].append(kwargs)
def extract_features(self, document):
    """Run every registered feature extractor on ``document``.

    Args:
        document: The document passed to each extractor.

    Returns:
        One dict merging the features from every extractor and every
        registered parameter set.
    """
    all_features = {}
    for extractor in self.feat_extractors:
        for param_set in self.feat_extractors[extractor]:
            feats = extractor(document, **param_set)
            # Merge inside the loop so every parameter set contributes, and
            # an extractor with no parameter sets cannot reuse stale feats.
            all_features.update(feats)
    return all_features
使用save_classifier将输出结果保存到一个文件中:
def train(self, trainer, training_set, save_classifier=None, **kwargs):
    """Train a classifier and store it on ``self.classifier``.

    Args:
        trainer: Callable invoked as ``trainer(training_set, **kwargs)``.
        training_set: Training data handed to ``trainer``.
        save_classifier: Optional file name; when given, the trained
            classifier is written there with ``save_file``.
        **kwargs: Extra keyword arguments forwarded to ``trainer``.

    Returns:
        The trained classifier.
    """
    print("Training classifier")
    self.classifier = trainer(training_set, **kwargs)
    if save_classifier:
        save_file(self.classifier, save_classifier)
    return self.classifier
通过使用测试数据能够对我们的分类器执行性能评估:
def evaluate(self, test_set, classifier=None, accuracy=True, f_measure=True,
             precision=True, recall=True, verbose=False):
    """Evaluate a classifier on ``test_set``.

    Args:
        test_set: Iterable of ``(features, label)`` pairs.
        classifier: Classifier to evaluate; defaults to ``self.classifier``.
        accuracy, f_measure, precision, recall: Which metrics to compute.
        verbose: When true, print each metric in sorted key order.

    Returns:
        Dict mapping a metric name (``'Accuracy'``, ``'Precision [label]'``,
        ``'Recall [label]'``, ``'F-measure [label]'``) to its score.
    """
    if classifier is None:
        classifier = self.classifier
    print("Evaluating {0} results..".format(type(classifier).__name__))
    metrics_results = {}
    if accuracy:
        metrics_results['Accuracy'] = eval_accuracy(classifier, test_set)

    # Per label: indices of items carrying it (gold) and predicted as it.
    gold_results = defaultdict(set)
    test_results = defaultdict(set)
    labels = set()
    for i, (feats, label) in enumerate(test_set):
        labels.add(label)
        gold_results[label].add(i)
        observed = classifier.classify(feats)
        test_results[observed].add(i)

    for label in labels:
        if precision:
            precision_score = eval_precision(gold_results[label], test_results[label])
            metrics_results['Precision [{0}]'.format(label)] = precision_score
        if recall:
            recall_score = eval_recall(gold_results[label], test_results[label])
            metrics_results['Recall [{0}]'.format(label)] = recall_score
        if f_measure:
            f_measure_score = eval_f_measure(gold_results[label], test_results[label])
            metrics_results['F-measure [{0}]'.format(label)] = f_measure_score

    if verbose:
        for result in sorted(metrics_results):
            print('{0}:{1}'.format(result, metrics_results[result]))
    return metrics_results
执行特征提取:
stopWords = []


def replaceTwoOrMore(s):
    """Collapse any run of a repeated character to exactly two occurrences.

    For example ``"huuuungry"`` becomes ``"huungry"``; a doubled character
    is left as it is.
    """
    import re  # local import: this snippet has no import section of its own

    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)


# Backward-compatible alias for the spelling used on the original def line.
replaceTwoorMore = replaceTwoOrMore
def getStopWordList(stopWordListFileName):
    """Read stop words from a file and return them as a list.

    Args:
        stopWordListFileName: Path of a text file with one stop word per
            line.

    Returns:
        List starting with the placeholders ``'AT_USER'`` and ``'URL'``,
        followed by each line of the file stripped of whitespace.
    """
    stopWords = ['AT_USER', 'URL']
    # `with` closes the file even if reading fails.
    with open(stopWordListFileName, 'r') as fp:
        stopWords.extend(line.strip() for line in fp)
    return stopWords


# Backward-compatible alias for the spelling used on the original def line.
getstopWordList = getStopWordList
def getFeatureVector(tweet, stop_words=None):
    """Turn a preprocessed tweet into a list of lower-cased feature words.

    Args:
        tweet: The tweet text.
        stop_words: Words to ignore; defaults to the module-level
            ``stopWords`` list.

    Returns:
        Words that start with a letter, contain only letters and digits,
        and are not stop words, lower-cased and in original order.
    """
    import re  # local import: this snippet has no import section of its own

    if stop_words is None:
        stop_words = stopWords
    featureVector = []
    # Tweets are first split into words.
    for w in tweet.split():
        # Replace runs of a repeated character with two occurrences.
        w = replaceTwoOrMore(w)
        # Strip punctuation.
        w = w.strip('\'"?,.')
        # Keep only words that begin with a letter.
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        # Stop words are ignored.
        if w in stop_words or val is None:
            continue
        featureVector.append(w.lower())
    return featureVector
# Tweets are read one by one and then processed.
# NOTE(review): processTweet is not defined in this listing; it is assumed
# to be provided elsewhere.
# Bind the name getFeatureVector actually reads (`stopWords`); the original
# assigned `stopwords` and also opened the stop-word file a second time
# without ever using or closing that handle.
stopWords = getStopWordList('data/feature_list/stopwords.txt')
with open('data/sampleTweets.txt', 'r') as fp:
    for line in fp:
        processedTweet = processTweet(line)
        featureVector = getFeatureVector(processedTweet)
        print(featureVector)
# Labeled tweets are read from a CSV file: column 0 is the sentiment,
# column 1 the tweet text.
import csv

tweets = []
# Python 3's csv module needs a text-mode file opened with newline=''.
with open('data/sampleTweets.csv', 'r', newline='') as csv_file:
    inpTweets = csv.reader(csv_file, delimiter=',', quotechar='|')
    for row in inpTweets:
        sentiment = row[0]
        tweet = row[1]
        processedTweet = processTweet(tweet)
        featureVector = getFeatureVector(processedTweet)
        tweets.append((featureVector, sentiment))
# Feature extraction takes place using the following method.
def extract_features(tweet, feature_list=None):
    """Build the bag-of-words feature dict for one tweet.

    Args:
        tweet: Iterable of the tweet's feature words.
        feature_list: Words to test for; defaults to the module-level
            ``featureList``.

    Returns:
        Dict mapping ``'contains(<word>)'`` to True/False.
    """
    if feature_list is None:
        feature_list = featureList
    tweet_words = set(tweet)
    return {'contains(%s)' % word: (word in tweet_words) for word in feature_list}
使用朴素贝叶斯分类器执行情感分析:
# NOTE(review): training_set is not built in this listing; presumably it is
# the labeled feature sets derived from `tweets` - confirm against the book.
NaiveBClassifier = nltk.NaiveBayesClassifier.train(training_set)
# Testing the classifier
testTweet = 'I liked this book on SentimentAnalysis a lot. '
processedTestTweet = processTweet(testTweet)
print(NaiveBClassifier.classify(extract_features(getFeatureVector(processedTestTweet))))
testTweet = 'I am so badly hurt'
processedTestTweet = processTweet(testTweet)
print(NaiveBClassifier.classify(extract_features(getFeatureVector(processedTestTweet))))
使用最大熵执行情感分析:
# Train a maximum-entropy classifier with the GIS algorithm.
# `sparse=True` from the printed listing is dropped: NLTK 3's
# MaxentClassifier.train() rejects unknown keyword arguments.
MaxEntClassifier = nltk.classify.maxent.MaxentClassifier.train(
    training_set, 'GIS', trace=3, encoding=None, labels=None,
    gaussian_prior_sigma=0, max_iter=10)
testTweet = 'I liked the book on sentiment analysis a lot'
processedTestTweet = processTweet(testTweet)
print(MaxEntClassifier.classify(extract_features(getFeatureVector(processedTestTweet))))
# show_most_informative_features() prints its own output and returns None,
# so it is not wrapped in print().
MaxEntClassifier.show_most_informative_features(10)
7.1.3 NER系统的评估
性能指标或评估有助于展示一个NER系统的性能。NER标注器的结果可以认为是一个回答,人们的一种解释称作答案要点。因此,我们提供了如下定义:
Correct | 如果回答与答案要点完全相同 |
---|---|
Incorrect | 如果回答与答案要点不同 |
Missing | 如果答案要点被标注,但回答未被标注 |
Spurious | 如果回答被标注,但答案要点未被标注 |
通过使用以下参数可以评价一个基于NER的系统的性能:
精确率(P) | P=Correct/(Correct+Incorrect+Missing) |
---|---|
召回率(R) | R=Correct/(Correct+Incorrect+Spurious) |
F值 | F-Measure = (2PR)/(P+R) |
使用HMM执行NER:
#***** Function to find all tags in corpus *****
def find_tag_set(tra_lines):
    """Collect the distinct tags of a ``word/TAG`` corpus.

    Args:
        tra_lines: Iterable of lines made of whitespace-separated
            ``word/TAG`` tokens.

    Returns:
        The tags in first-seen order. The same list is also stored in the
        module-level ``tag_set``, which the sibling functions read.
    """
    global tag_set
    tag_set = []
    for line in tra_lines:
        for t in line.split():
            wd = t.split("/")
            # Tokens without a "/TAG" part carry no tag and are skipped.
            if len(wd) > 1 and wd[1] not in tag_set:
                tag_set.append(wd[1])
    return tag_set
#***** Function to find frequency of each tag in tagged corpus *****
def cnt_tag(tr_ln):
    """Compute tag statistics for a tagged corpus.

    Stores its results in module-level variables: ``lines`` (the corpus),
    ``tag_set`` (via find_tag_set), ``start_li`` (distinct tags of each
    line's first token), ``line_cnt`` (number of non-blank lines), and
    whatever find_freq_tag and find_freq_srttag set.

    Args:
        tr_ln: List of lines made of ``word/TAG`` tokens.
    """
    global start_li
    global li
    global tag_set
    global c
    global line_cnt
    global lines
    lines = tr_ln
    start_li = []  # list of starting tags
    find_tag_set(tr_ln)
    line_cnt = 0
    for line in lines:
        tok = line.split()
        if not tok:
            continue  # a blank line has no first token to inspect
        x = tok[0].split("/")
        if len(x) > 1 and x[1] not in start_li:
            start_li.append(x[1])
        line_cnt = line_cnt + 1
    find_freq_tag()
    find_freq_srttag()
    return
def find_freq_tag():
    """Fill the module-level ``tag_cnt`` dict with one count per tag.

    For every tag in ``tag_set`` it calls cal_freq_tag, which leaves its
    result in the module-level ``freq_tg``.
    """
    global tag_cnt
    global tag_set
    tag_cnt = {}
    for w in tag_set:
        cal_freq_tag(w)
        tag_cnt[w] = freq_tg
    return
def cal_freq_tag(tg):
    """Count how many tokens in the module-level ``lines`` carry tag ``tg``.

    The count is stored in the module-level ``freq_tg`` and also returned.
    Tags are compared exactly; the original ``line.count(tg)`` also counted
    the tag text wherever it occurred inside words or longer tags.
    """
    global freq_tg
    global lines
    freq_tg = 0
    for line in lines:
        for token in line.split():
            parts = token.split("/")
            if len(parts) > 1 and parts[1] == tg:
                freq_tg = freq_tg + 1
    return freq_tg
#**** Function to find frequency of each starting tag in tagged corpus ****
def find_freq_srttag():
    """Fill the module-level ``lst`` dict with start probabilities.

    For each tag in ``start_li`` the value is the number of lines starting
    with that tag (from freq_srt_tag) divided by ``line_cnt``.
    """
    global lst
    lst = {}  # start probability
    for w in start_li:
        cc = freq_srt_tag(w)
        lst[w] = cc / line_cnt
    return
def freq_srt_tag(stg):
    """Count the lines in the module-level ``lines`` whose first token has tag ``stg``.

    The tag is compared exactly; the original substring test
    (``stg in tok[0]``) also matched the tag text inside the word itself.
    Blank lines are ignored.
    """
    global lines
    freq_srt_tg = 0
    for line in lines:
        tok = line.split()
        if not tok:
            continue
        parts = tok[0].split("/")
        if len(parts) > 1 and parts[1] == stg:
            freq_srt_tg = freq_srt_tg + 1
    return freq_srt_tg
import tkinter as tk
import vit
import random
import cal_start_p
import calle_prob
import trans_mat
import time
import trans
import dict5
from tkinter import *
from tkinter import ttk
from tkinter.filedialog import askopenfilename
from tkinter.messagebox import showerror
# NOTE(review): the printed name "languagedetectl" is read as
# languagedetect1 (digit one) - confirm against the project files.
import languagedetect1
import languagedetect3

# Shared dicts handed to calle_prob.m_prg and trans_mat.main_prg; they later
# become the emission and transition probability tables.
e_dict = dict()
t_dict = dict()
# Button callbacks: each one imports a project module named listboxN.
# NOTE(review): what those modules do is not visible in this listing, and
# Python runs a module's body only on its first import.
def calculate1(*args):
    import listbox1


def calculate2(*args):
    import listbox2


def calculate3(*args):
    import listbox3
def dispdlg():
    """Ask the user to pick a file; store its path in the global ``file_name``.

    ``file_name`` is the empty string when the dialog is cancelled.
    """
    global file_name
    # `root` is local here: a throwaway, hidden Tk window for the dialog.
    root = tk.Tk()
    root.withdraw()
    file_name = askopenfilename()
    # Destroy the helper window so repeated calls do not pile up hidden roots.
    root.destroy()
    return
def tranhmm():
    """Add the BROWSE button that lets the user pick a training corpus."""
    ttk.Style().configure("TButton", padding=6, relief="flat",
                          background="Pink", foreground="Red")
    ttk.Button(mainframe, text=" BROWSE",
               command=find_train_corpus).grid(column=7, row=5, sticky=W)
# The following code is used to display or accept the testing corpus from
# the user.
def testhmm():
    """Add the buttons for creating or browsing for a testing corpus."""
    ttk.Button(mainframe, text="Develop a new testing Corpus",
               command=calculate3).grid(column=9, row=5, sticky=E)
    ttk.Button(mainframe, text="BROWSE",
               command=find_obs).grid(column=9, row=7, sticky=E)
# An HMM needs a start probability, a transition probability and an emission
# probability. The following code calculates the emission probability matrix.
def cal_emit_mat():
    """Fill ``e_dict`` via ``calle_prob.m_prg`` and publish it.

    Reads the globals ``corpus`` and ``tlines``; sets the global
    ``emission_probability`` to ``e_dict``.
    """
    global emission_probability
    global corpus
    global tlines
    calle_prob.m_prg(e_dict, corpus, tlines)
    emission_probability = e_dict
    return
# to calculate states
def cal_states():
    """Set the global ``states`` to the tag set of the training lines.

    Runs ``cal_start_p.cnt_tag`` on the global ``tlines`` and then reads
    ``cal_start_p.tag_set``.
    """
    global states
    global tlines
    cal_start_p.cnt_tag(tlines)
    states = cal_start_p.tag_set
    return
def _show_text_panel(parent, lines, title, height, pad_y):
    """Create, fill and pack a pink Text widget with its Scrollbar.

    The text is ``title``, a blank line, then ``lines`` joined together (the
    original passed the raw list to ``insert``). Returns ``(scroll, text)``.
    """
    scroll = Scrollbar(parent)
    text = Text(parent, width=10, height=height, fg='black', bg='pink',
                yscrollcommand=scroll.set)
    scroll.config(command=text.yview)
    text.insert("1.0", title + "\n\n" + "".join(lines))
    # Same pack() sequence as the original: pad first, then dock left.
    text.pack(padx=10, pady=pad_y)
    scroll.pack(padx=10, pady=pad_y)
    scroll.pack(side=LEFT, fill=BOTH)
    text.pack(side=LEFT, fill=BOTH, expand=True)
    return scroll, text


# to take observations
def find_obs():
    """Let the user pick a test corpus, run NER on it and show the results.

    Reads the chosen file into the global ``test_lines``, calls
    ``dict5.search_obs_train_corpus``, then shows three text panels
    (training sentences, testing sentences, output) and adds the
    SAVE OUTPUT / NER EVALUATION / REFRESH buttons.
    """
    global observations
    global test_lines
    global tra
    global w4
    global co
    global wo1
    global wo2
    global test1
    global wo3
    global te
    global definitionText
    global definitionScroll
    global dt2
    global ds2
    global dt11
    global ds11
    wo3 = []
    wo1 = []
    wo2 = []
    co = 0
    w4 = []
    # NOTE(review): indentation was lost in the source; it is assumed that
    # only the parameter panel depends on flag2 and that the training panel
    # (dt1/ds1) is always hidden - confirm against the book.
    if flag2 != 0:
        definitionText11.pack_forget()
        definitionScroll11.pack_forget()
    dt1.pack_forget()
    ds1.pack_forget()
    dispdlg()
    with open(file_name, "r+", encoding='utf-8') as f:
        test_lines = f.readlines()
    fname = "C:/Python32/file_name1"
    # Every state needs a start probability, even if it is zero.
    for x in states:
        if x not in start_probability:
            start_probability.update({x: 0.0})
    # As in the original, only the tokens of the last line are kept.
    for line in test_lines:
        observations = line.split()
    # Start from empty scratch files.
    for scratch in (r"C:\Python32\output3_file", r"C:\Python32\output4_file",
                    r"C:\Python32\output5_file", r"C:\Python32\output6_file",
                    r"C:\Python32\output7_file"):
        with open(scratch, "w+", encoding='utf-8') as handle:
            handle.write("")
    dict5.search_obs_train_corpus(file1, fname, tlines, test_lines,
                                  observations, states, start_probability,
                                  transition_probability, emission_probability)
    with open(r"C:\Python32\output5_file", "r+", encoding='utf-8') as f20:
        te = f20.readlines()
    ds11, dt11 = _show_text_panel(root, train_lines,
                                  "***TRAINING SENTENCES***", 20, 150)
    ds2, dt2 = _show_text_panel(root, test_lines,
                                "******TESTING SENTENCES*****", 10, 150)
    definitionScroll, definitionText = _show_text_panel(
        root, te, "*******OUTPUT*****", 10, 150)
    note = tk.Label(root, text="NOTE: *****The Entities which are not tagged in Output are not Named Entities*****",
                    fg='black', bg='pink')
    note.place(x=500, y=650, width=500, height=25)
    with open(r"C:\Python32\output2_file", "r+", encoding='utf-8') as f14:
        test1 = f14.readlines()
    for lines in test1:
        for t in lines.split():
            w4.append(t)
    with open(r"C:\Python32\output_file", "w+", encoding='utf-8') as f12:
        f12.write("")
    ttk.Button(mainframe, text="SAVE OUTPUT",
               command=save_output).grid(column=11, row=7, sticky=E)
    ttk.Button(mainframe, text="NER EVALUATION",
               command=evaluate).grid(column=13, row=7, sticky=E)
    ttk.Button(mainframe, text="REFRESH",
               command=ref).grid(column=15, row=7, sticky=E)
    return
def ref():
    """REFRESH callback: close the main window, then import ``new1``."""
    root.destroy()
    # NOTE(review): new1 is a project module not shown here; presumably
    # importing it restarts the application - confirm.
    import new1
    return
评估通过HMM来执行NER后所生成的输出结果:
def _ner_scores(correct, incorrect, spurious):
    """Return ``(precision, recall, f_measure)`` as this tool defines them.

    Any ratio whose denominator is zero is reported as 0.0 instead of
    raising ZeroDivisionError.
    """
    tagged = correct + incorrect
    precision = correct / tagged if tagged else 0.0
    recall = correct / (tagged + spurious) if (tagged + spurious) else 0.0
    total = precision + recall
    f_measure = (2 * precision * recall) / total if total else 0.0
    return precision, recall, f_measure


def _word_run(words):
    """Join words the way the original top-of-widget inserts displayed them.

    Every word was inserted at position 1.0, so the words end up in reverse
    order, each preceded by a space.
    """
    return "".join(" " + w for w in reversed(words))


def evaluate():
    """Evaluate the HMM NER output and show the report in a new window.

    Reads the globals ``tlines``, ``train_lines`` and ``te``; sets the
    globals ``wDict``, ``woe`` (correctly tagged words), ``woe1``
    (incorrectly tagged words) and ``woe2`` (spurious, i.e. untagged
    training words).
    """
    global wDict
    global woe
    global woe1
    global woe2
    woe1 = []
    woe = []
    woe2 = []
    ws = []
    wDict = {}
    j = 0   # correctly tagged words
    k = 0   # incorrectly tagged words
    sp = 0  # spurious words
    with open(r"C:\Python32\output1_file", "r+", encoding='utf-8') as f141:
        tesl = f141.readlines()
    for lines in tesl:
        for t in lines.split():
            ws.append(t)
            wDict[t] = wDict.get(t, 0) + 1
    # Add every training token that is a named entity (tag other than OTHER).
    for line in tlines:
        for t in line.split():
            wd = t.split("/")
            if len(wd) > 1 and wd[1] != 'OTHER':
                wDict[t] = wDict.get(t, 0) + 1
    print("words in train corpus ", wDict)
    print("total words in Dictionary are:", len(wDict))
    # Training words without any tag are counted as spurious.
    for line in train_lines:
        for t1 in line.split():
            if '/' not in t1:
                sp = sp + 1
                woe2.append(t1)
    print("Spurious words are")
    for w in woe2:
        print(w)
    print("Total spurious words are:", sp)
    for l in te:
        for t1 in l.split():
            if '/' in t1:
                if t1 in ws or t1 in wDict:
                    woe.append(t1)
                    j = j + 1
                if t1 not in wDict:
                    wdd = t1.split("/")
                    if wdd[0] not in woe2:
                        woe1.append(t1)
                        k = k + 1
    print("Word found in Dict are:")
    for w in woe:
        print(w)
    print("Word not found in Dict are:")
    for w in woe1:
        print(w)
    print("Total correctly tagged words are:", j)
    print("Total incorrectly tagged words are:", k)
    pr, rec, f_score = _ner_scores(j, k, sp)

    report = "".join([
        "**********PERFORMANCE EVALUATION OF NERHMM*********", "\n\n",
        "Total Correctly tagged words are:", _word_run(woe), "\n",
        "Total No. of Correctly tagged words are:", str(j), "\n\n",
        "Total Incorrectly tagged words are:", _word_run(woe1), "\n",
        "Total No. of Incorrectly tagged words are:", str(k), "\n\n",
        "Total Spurious Words are:", _word_run(woe2), "\n",
        "Total No. of Spurious Words are:", str(sp), "\n\n",
        "Total No. of Missing words are: 0", "\n\n",
        "PRECISION= CORRECT/ (CORRECT +INCORRECT +MISSING)", "\n",
        "PRECISION=", str(pr), "\n\n",
        "RECALL= CORRECT/ (CORRECT +INCORRECT +SPURIOUS) ", "\n",
        "RECALL=", str(rec), "\n\n",
        "F-MEASURE= (2* PRECISION*RECALL)/(PRECISION+RECALL)", "\n",
        "F-MEASURE=", "\n", str(f_score),
    ])

    # `window` is local: the report gets its own top-level Tk window.
    window = Tk()
    window.title("NER EVALUATION")
    window.geometry("1000x1000")
    ds21 = Scrollbar(window)
    dt21 = Text(window, width=10, height=10, fg='black', bg='pink',
                yscrollcommand=ds21.set)
    ds21.config(command=dt21.yview)
    dt21.insert("1.0", report)
    dt21.pack(padx=5, pady=5)
    ds21.pack(padx=5, pady=5)
    ds21.pack(side=LEFT, fill=BOTH)
    dt21.pack(side=LEFT, fill=BOTH, expand=True)
    window.mainloop()
    return
def save_output():
    """Copy the NER output file to the ``save`` file."""
    # Read first, so a missing output file does not leave `save` truncated.
    with open(r"C:\Python32\output5_file", "r+", encoding='utf-8') as f20:
        te = f20.readlines()
    with open(r"C:\Python32\save", "w+", encoding='utf-8') as f:
        f.writelines(te)
# to calculate start probability matrix
def cal_srt_prob():
    """Set the global ``start_probability`` to ``cal_start_p.lst``."""
    global start_probability
    start_probability = cal_start_p.lst
    return
# to print the Viterbi parameters if required
def pr_param():
    """Print the HMM parameters and add the "View Parameters" button.

    Prints ``states`` and the start, transition and emission probabilities
    to stdout, resets the globals ``flag1`` and ``flag2`` to 0, and adds a
    button that calls ``parame``.
    """
    global flag1
    global flag2
    l1 = tk.Label(root, text="HMM Training is going on.... .Don't Click any Button!!",
                  fg='black', bg='pink')
    l1.place(x=300, y=150, height=25)
    print("states")
    print(states)
    print(" ")
    print(" ")
    print("start probability")
    print(start_probability)
    print(" ")
    print(" ")
    print("transition probability")
    print(transition_probability)
    print(" ")
    print(" ")
    print("emission probability")
    print(emission_probability)
    # Remove the progress message; the original placed a second, empty
    # label at the same spot, which left the first one on screen.
    l1.destroy()
    flag1 = 0
    flag2 = 0
    ttk.Button(mainframe, text="View Parameters",
               command=parame).grid(column=7, row=5, sticky=W)
    return
def parame():
    """Show the start, transition and emission probabilities in a text panel.

    Sets the global ``flag2`` to ``flag1 + 1`` and stores the new widgets in
    the globals ``definitionText11`` and ``definitionScroll11``.
    """
    global flag2
    global definitionText11
    global definitionScroll11
    flag2 = flag1 + 1
    definitionScroll11 = Scrollbar(root)
    definitionText11 = Text(root, width=10, height=10, fg='black', bg='pink',
                            yscrollcommand=definitionScroll11.set)
    definitionScroll11.config(command=definitionText11.yview)
    definitionText11.delete("1.0", END)
    # Same text, top to bottom, that the original built with repeated
    # inserts at position 1.0.
    definitionText11.insert("1.0", "".join([
        "Start Probability", str(start_probability), "\n",
        "Transition Probability", str(transition_probability), "\n",
        "Emission Probability", "\n", str(emission_probability),
    ]))
    definitionText11.pack(padx=10, pady=175)
    definitionScroll11.pack(padx=10, pady=175)
    definitionScroll11.pack(side=LEFT, fill=BOTH)
    definitionText11.pack(side=LEFT, fill=BOTH, expand=True)
    return
# to calculate transition probability matrix
def cat_trans_prob():
    """Fill ``t_dict`` via ``trans_mat.main_prg`` and publish it.

    Reads the globals ``corpus`` and ``tlines``; sets the global
    ``transition_probability`` to ``t_dict``.
    """
    global transition_probability
    global corpus
    global tlines
    trans_mat.main_prg(t_dict, corpus, tlines)
    transition_probability = t_dict
    return
def find_train_corpus():
    """Let the user pick a training corpus and train the HMM on it.

    Steps: empty the scratch files, read the chosen file, show it in a text
    panel, write a normalized copy in which untagged words get the tag
    ``OTHER``, list the named-entity words in ``input_file``, then compute
    the states and the emission, start and transition probabilities.
    """
    global train_lines
    global tlines
    global c
    global corpus
    global words1
    global w1
    global train1
    global fname
    global file1
    global ds1
    global dt1
    global w21
    words1 = []
    c = 0
    w1 = []
    w21 = []
    # Start from empty scratch files.
    for scratch in (r"C:\Python32\output1_file", r"C:\Python32\output_file",
                    r"C:\Python32\ladetect1", r"C:\Python32\ladetect"):
        with open(scratch, "w+", encoding='utf-8') as handle:
            handle.write("")
    dispdlg()
    with open(file_name, "r+", encoding='utf-8') as f:
        train_lines = f.readlines()
    # Full text of the file, as the original's second read() produced.
    corpus = "".join(train_lines)

    ds1 = Scrollbar(root)
    dt1 = Text(root, width=10, height=10, fg='black', bg='pink',
               yscrollcommand=ds1.set)
    ds1.config(command=dt1.yview)
    # Join the lines; the original passed the raw list to insert().
    dt1.insert("1.0", "****TRAINING SENTENCES***" + "\n\n" + "".join(train_lines))
    dt1.pack(padx=10, pady=175)
    ds1.pack(padx=10, pady=175)
    ds1.pack(side=LEFT, fill=BOTH)
    dt1.pack(side=LEFT, fill=BOTH, expand=True)

    # One spelling of the path for writing and reading (the source mixed
    # "C:" and "c:").
    fname = "C:/Python32/file_name1"
    file1 = file_name
    with open(fname, "w+", encoding='utf-8') as p:
        for line in train_lines:
            for t in line.split():
                if '/' not in t:
                    p.write(t + "/OTHER ")  # Handling Spurious words
                else:
                    p.write(t + " ")
            p.write("\n")
    with open(fname, "r+", encoding='utf-8') as f00:
        tlines = f00.readlines()
    # NOTE(review): indentation was lost in the source; words1 and w1 are
    # assumed to be parallel lists (word, tag) - confirm against the book.
    for line in tlines:
        for t in line.split():
            wd = t.split("/")
            if wd[1] != 'OTHER':
                if wd[0] not in words1:
                    words1.append(wd[0])
                    w1.append(wd[1])
    # input_file has the list of Named Entities of the training file
    with open(r"C:\Python32\input_file", "w+", encoding='utf-8') as f1:
        for w in words1:
            f1.write(w)
            f1.write("\n")
    with open(r"C:\Python32\detect", "w+", encoding='utf-8') as fr:
        fr.write("")
    cal_states()
    cal_emit_mat()
    cal_srt_prob()
    cat_trans_prob()
    pr_param()
    return
# Build the main window and its top row of buttons.
root = Tk()
root.title("NAMED ENTITY RECOGNITION IN NATURAL LANGUAGES USING HIDDEN MARKOV MODEL")
root.geometry("1000x1000")
mainframe = ttk.Frame(root, padding="20 20 12 12")
mainframe.grid(column=0, row=0, sticky=(N, W, E, S))
b = StringVar()
a = StringVar()
ttk.Style().configure("TButton", padding=6, relief="flat",
                      background="Pink", foreground="Red")
ttk.Button(mainframe, text="ANOTATION", command=calculate1).grid(column=5, row=3, sticky=W)
ttk.Button(mainframe, text="TRAIN HMM", command=tranhmm).grid(column=7, row=3, sticky=E)
ttk.Button(mainframe, text="TEST HMM", command=testhmm).grid(column=9, row=3, sticky=E)
# NOTE(review): hmmhelp is not defined anywhere in this listing.
ttk.Button(mainframe, text="HELP", command=hmmhelp).grid(column=11, row=3, sticky=E)
# To call Viterbi for the observations found in find_obs
def call_vitar():
    """Train on a corpus, load test sentences and print the Viterbi result for each.

    Calls find_train_corpus, cal_states, find_obs, cal_emit_mat,
    cal_srt_prob and cat_trans_prob in that order, then runs
    ``vit.viterbi`` on every line of ``test_lines``.
    """
    global test_lines
    global train_lines
    global corpus
    global observations
    global states
    global start_probability
    global transition_probability
    global emission_probability
    find_train_corpus()
    cal_states()
    find_obs()
    cal_emit_mat()
    cal_srt_prob()
    cat_trans_prob()
    # ---------- add all states not in start probability ----------
    for x in states:
        if x not in start_probability:
            start_probability.update({x: 0.0})
    for line in test_lines:
        observations = line.split()
        print(" ")
        print(" ")
        print(line)
        print("********************")
        # print() has no bg/fg keywords; the original call raised TypeError.
        print(vit.viterbi(observations, states, start_probability,
                          transition_probability, emission_probability))
    return
# Start the Tk event loop for the main window.
root.mainloop ()
以上Python代码展示了如何通过HMM来执行NER,以及如何使用性能指标(精确率、召回率和F值)来评估一个NER系统的性能。
"""***笔者的话:整理了《精通Python自然语言处理》的第七章内容:情感分析。情感分析现在也是一个热门话题,主要用来分析人在文字上对事物的认识。后续会整理这本书的后面章节。本博客记录了书中的每段代码。希望对阅读这本书的人有所帮助。FIGHTING...(热烈欢迎大家批评指正,互相讨论)
(Nobody gives away anything valuable for free.
) ***"""
(第六章):语义分析(https://blog.csdn.net/cjx14060307101/article/details/88541214)
(第五章):语法分析(https://blog.csdn.net/cjx14060307101/article/details/88378177)
(第四章):词性标注(https://blog.csdn.net/cjx14060307101/article/details/88357016)
(第三章):形态学(https://blog.csdn.net/cjx14060307101/article/details/88316108)
(第二章):统计语言建模(https://blog.csdn.net/cjx14060307101/article/details/88087305)
(第一章):字符串操作(https://blog.csdn.net/cjx14060307101/article/details/87980631)