Mastering Natural Language Processing with Python (Deepti Chopra): Reading Notes (Chapter 7): Sentiment Analysis

Mastering Natural Language Processing with Python

Deepti Chopra (India)
Translated by Wang Wei


Chapter 7 Sentiment Analysis: I Am Happy

Sentiment analysis (sentiment generation) is defined as the process of determining the sentiment information hidden behind a sequence of characters.
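For a quick first taste (a minimal sketch, not from the book, using NLTK's bundled VADER analyzer; it assumes the vader_lexicon resource has been downloaded via nltk.download('vader_lexicon')):

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# polarity_scores returns a dict with 'neg', 'neu', 'pos', and a normalized
# 'compound' score ranging from -1 (most negative) to +1 (most positive).
analyzer = SentimentIntensityAnalyzer()
print(analyzer.polarity_scores("I am very happy"))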


7.1 Introduction to sentiment analysis

Performing sentiment analysis on movie reviews:
import nltk
import random
from nltk.corpus import movie_reviews
docs = [(list(movie_reviews.words(fid)), cat) for cat in movie_reviews.categories() for fid in movie_reviews.fileids(cat)]
random.shuffle(docs)
all_tokens = nltk.FreqDist(x.lower() for x in movie_reviews.words())
token_features = list(all_tokens.keys())[:2000]
print(token_features[:100])
 
def doc_features(doc):
    doc_words = set(doc)
    features = {}
    for word in token_features:
        features['contains(%s)' % word] = (word in doc_words)
    return features

print(doc_features(movie_reviews.words('pos/cv957_8737.txt')))
feature_sets = [(doc_features(d), c) for (d,c) in docs]
train_sets, test_sets = feature_sets[100:], feature_sets[:100]
classifiers = nltk.NaiveBayesClassifier.train(train_sets)
print(nltk.classify.accuracy(classifiers, test_sets))
classifiers.show_most_informative_features(5)
Most Informative Features
    contains(damon) = True           pos : neg  =  11.2 : 1.0
    contains(outstanding) = True     pos : neg  =  10.6 : 1.0
    contains(mulan) = True           pos : neg  =   8.8 : 1.0
    contains(seagal) = True          neg : pos  =   8.4 : 1.0
    contains(wonderfully) = True     pos : neg  =   7.4 : 1.0

Each token comprises three entities: the word, the lemma, and the tag. In the code below the lemma is simply the word itself, so a token such as "looking" is represented as ('looking', 'looking', ['VBG']).

Performing text preprocessing:
import nltk

class Splitter(object):
    def __init__(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        sentences = self.nltk_splitter.tokenize(text)
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
        return tokenized_sentences

class POSTagger(object):
    def __init__(self):
        pass

    def pos_tag(self, sentences):
        pos = [nltk.pos_tag(sentence) for sentence in sentences]
        pos = [[(word, word, [postag]) for (word, postag) in sentence] for sentence in pos]
        return pos
Generating a tuple of three elements (the word, the lemma, and the POS tag) for each token:
text = """Why are you looking disappointed. We will go to a restaurant for dinner."""
splitter = Splitter()
postagger = POSTagger()
splitted_sentences = splitter.split(text)
print(splitted_sentences)

pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
print(pos_tagged_sentences)
Performing tagging using dictionaries:
import yaml

class DictionaryTagger(object):
    def __init__(self, dictionary_paths):
        files = [open(path, 'r') for path in dictionary_paths]
        dictionaries = [yaml.safe_load(dict_file) for dict_file in files]
        for dict_file in files:
            dict_file.close()
        self.dictionary = {}
        self.max_key_size = 0
        for curr_dict in dictionaries:
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    self.dictionary[key] = curr_dict[key]
                    self.max_key_size = max(self.max_key_size, len(key))

    def tag(self, postagged_sentences):
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        tag_sentence = []
        N = len(sentence)
        if self.max_key_size == 0:
            self.max_key_size = N
        i = 0
        while (i < N):
            j = min(i + self.max_key_size, N)  # avoid overflow
            tagged = False
            while (j > i):
                expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
                expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
                if tag_with_lemmas:
                    literal = expression_lemma
                else:
                    literal = expression_form
                if literal in self.dictionary:
                    is_single_token = j - i == 1
                    original_position = i
                    i = j
                    taggings = [tag for tag in self.dictionary[literal]]
                    tagged_expression = (expression_form, expression_lemma, taggings)
                    if is_single_token:  # if the tagged literal is a single token, conserve its previous taggings
                        original_token_tagging = sentence[original_position][2]
                        tagged_expression[2].extend(original_token_tagging)
                    tag_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence
Counting the number of positive and negative expressions:
def value_of(sentiment):
    if sentiment == 'positive': return 1
    if sentiment == 'negative': return -1
    return 0

def sentiment_score(review):
    return sum([value_of(tag) for sentence in review for token in sentence for tag in token[2]])
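The pieces above can then be wired together as follows (a sketch, not from the book; the YAML files dicts/positive.yml and dicts/negative.yml are hypothetical and should map expressions to tag lists, e.g. wonderful: [positive]):

text = """What a wonderful day. The service was terrible though."""
splitter = Splitter()
postagger = POSTagger()
# Hypothetical sentiment dictionaries mapping expressions to tag lists.
dicttagger = DictionaryTagger(['dicts/positive.yml', 'dicts/negative.yml'])
splitted_sentences = splitter.split(text)
pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
# Positive and negative tags cancel out; the sign gives the overall polarity.
print(sentiment_score(dict_tagged_sentences))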
7.1.1 Sentiment analysis using NER

NER is the process of finding named entities and classifying them into different named entity classes. Different techniques can be used to perform NER, such as rule-based approaches, list lookup approaches, and statistical approaches (Hidden Markov Models, Maximum Entropy Markov Models, Support Vector Machines, Conditional Random Fields, and decision trees).
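As a quick illustration (a sketch using NLTK's built-in classifier-based chunker, not the book's code; it assumes the punkt, averaged_perceptron_tagger, maxent_ne_chunker, and words resources have been downloaded):

import nltk

sentence = "Mark works at Google in London."
tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokens)
# ne_chunk returns a tree whose subtrees are labeled with named entity
# classes such as PERSON, ORGANIZATION, and GPE.
tree = nltk.ne_chunk(tagged)
print(tree)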

7.1.2 Sentiment analysis using machine learning

The nltk.sentiment.sentiment_analyzer module is used to perform sentiment analysis:
from __future__ import print_function
from collections import defaultdict

from nltk.classify.util import apply_features, accuracy as eval_accuracy
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import (BigramAssocMeasures, precision as eval_precision,
    recall as eval_recall, f_measure as eval_f_measure)
from nltk.probability import FreqDist

from nltk.sentiment.util import save_file, timer

class SentimentAnalyzer(object):
    """
    A tool for Sentiment Analysis which is based on machine learning techniques.
    """
    def __init__(self, classifier=None):
        self.feat_extractors = defaultdict(list)
        self.classifier = classifier
Returning all words (with duplicates) from the documents:
    def all_words(self, documents, labeled=None):
        all_words = []
        if labeled is None:
            labeled = documents and isinstance(documents[0], tuple)
        if labeled == True:
            for words, sentiment in documents:
                all_words.extend(words)
        elif labeled == False:
            for words in documents:
                all_words.extend(words)
        return all_words
Applying the feature extraction function to the text:

    def apply_features(self, documents, labeled=None):
        return apply_features(self.extract_features, documents, labeled)
Returning the unigram word features:

    def unigram_word_feats(self, words, top_n=None, min_freq=0):
        unigram_feats_freqs = FreqDist(word for word in words)
        return [w for w, f in unigram_feats_freqs.most_common(top_n)
                if unigram_feats_freqs[w] > min_freq]
Returning the bigram collocation features:

    def bigram_collocation_feats(self, documents, top_n=None, min_freq=3,
                                 assoc_measure=BigramAssocMeasures.pmi):
        finder = BigramCollocationFinder.from_documents(documents)
        finder.apply_freq_filter(min_freq)
        return finder.nbest(assoc_measure, top_n)
Classifying a given instance by using the feature set:

    def classify(self, instance):
        instance_feats = self.apply_features([instance], labeled=False)
        return self.classifier.classify(instance_feats[0])
Registering feature extractors and extracting the features of a text:

    def add_feat_extractor(self, function, **kwargs):
        self.feat_extractors[function].append(kwargs)

    def extract_features(self, document):
        all_features = {}
        for extractor in self.feat_extractors:
            for param_set in self.feat_extractors[extractor]:
                feats = extractor(document, **param_set)
            all_features.update(feats)
        return all_features
Training the classifier; if save_classifier is given, the result is saved to a file using save_file:

    def train(self, trainer, training_set, save_classifier=None, **kwargs):
        print("Training classifier")
        self.classifier = trainer(training_set, **kwargs)
        if save_classifier:
            save_file(self.classifier, save_classifier)
        return self.classifier
Evaluating the performance of our classifier using the test data:
    def evaluate(self, test_set, classifier=None, accuracy=True, f_measure=True,
                 precision=True, recall=True, verbose=False):
        if classifier is None:
            classifier = self.classifier
        print("Evaluating {0} results...".format(type(classifier).__name__))
        metrics_results = {}
        if accuracy == True:
            accuracy_score = eval_accuracy(classifier, test_set)
            metrics_results['Accuracy'] = accuracy_score

        gold_results = defaultdict(set)
        test_results = defaultdict(set)
        labels = set()
        for i, (feats, label) in enumerate(test_set):
            labels.add(label)
            gold_results[label].add(i)
            observed = classifier.classify(feats)
            test_results[observed].add(i)

        for label in labels:
            if precision == True:
                precision_score = eval_precision(gold_results[label], test_results[label])
                metrics_results['Precision [{0}]'.format(label)] = precision_score
            if recall == True:
                recall_score = eval_recall(gold_results[label], test_results[label])
                metrics_results['Recall [{0}]'.format(label)] = recall_score
            if f_measure == True:
                f_measure_score = eval_f_measure(gold_results[label], test_results[label])
                metrics_results['F-measure [{0}]'.format(label)] = f_measure_score

        if verbose == True:
            for result in sorted(metrics_results):
                print('{0}: {1}'.format(result, metrics_results[result]))
        return metrics_results
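A typical way to drive this class (a sketch; train_docs and test_docs are assumed to be lists of (words, label) pairs, such as the movie-review documents built earlier):

from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats

analyzer = SentimentAnalyzer()
# Collect unigram features from the training words and register the extractor.
all_words = analyzer.all_words(train_docs)
unigram_feats = analyzer.unigram_word_feats(all_words, min_freq=4)
analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
# Build the feature sets, train, and evaluate.
training_set = analyzer.apply_features(train_docs)
test_set = analyzer.apply_features(test_docs)
classifier = analyzer.train(NaiveBayesClassifier.train, training_set)
print(analyzer.evaluate(test_set))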
Performing feature extraction:
import re

stopWords = []

# If the same character occurs two or more times in a row, replace it with
# exactly two occurrences of that character.
def replaceTwoOrMore(s):
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)

def getStopWordList(stopWordListFileName):
    # This function reads the stop words from a file and builds a list.
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('URL')
    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords

def getFeatureVector(tweet):
    featureVector = []
    # Tweets are first split into words.
    words = tweet.split()
    for w in words:
        # Replace two or more repetitions with two occurrences.
        w = replaceTwoOrMore(w)
        # Strip punctuation.
        w = w.strip('\'"?,.')
        # Check whether the word begins with a letter.
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        # Stop words and non-alphabetic tokens are ignored.
        if (w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
#end
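The code below calls processTweet(), which is not defined in this excerpt; a minimal sketch consistent with the AT_USER and URL stop-word entries above might be:

def processTweet(tweet):
    # Lowercase, normalize URLs and @usernames, collapse whitespace, strip hashtags.
    tweet = tweet.lower()
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    tweet = re.sub(r'[\s]+', ' ', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    return tweet.strip('\'"')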

#Tweets are read one by one and then processed.
import csv

fp = open('data/sampleTweets.txt', 'r')
line = fp.readline()

stopWords = getStopWordList('data/feature_list/stopwords.txt')

while line:
    processedTweet = processTweet(line)
    featureVector = getFeatureVector(processedTweet)
    print(featureVector)
    line = fp.readline()
#end loop
fp.close()

#Tweets are read one by one and then processed.
inpTweets = csv.reader(open('data/sampleTweets.csv', 'r'), delimiter=',', quotechar='|')
tweets = []

for row in inpTweets:
    sentiment = row[0]
    tweet = row[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet)
    tweets.append((featureVector, sentiment))

#Feature extraction takes place using the following method.
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
Performing sentiment analysis using the Naive Bayes classifier:
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

# Testing the classifier.
testTweet = 'I liked this book on Sentiment Analysis a lot.'
processedTestTweet = processTweet(testTweet)
print(NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet))))

testTweet = 'I am so badly hurt'
processedTestTweet = processTweet(testTweet)
print(NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet))))
Performing sentiment analysis using maximum entropy:

MaxEntClassifier = nltk.classify.maxent.MaxentClassifier.train(training_set, 'GIS',
    trace=3, encoding=None, labels=None, sparse=True,
    gaussian_prior_sigma=0, max_iter=10)

testTweet = 'I liked the book on sentiment analysis a lot'
processedTestTweet = processTweet(testTweet)
print(MaxEntClassifier.classify(extract_features(getFeatureVector(processedTestTweet))))
print(MaxEntClassifier.show_most_informative_features(10))
7.1.3 Evaluation of NER systems

Performance metrics or evaluation help to show how well an NER system performs. The output of an NER tagger can be regarded as the response, and the human interpretation as the answer key. This gives us the following definitions:

Correct: the response is exactly the same as the answer key.
Incorrect: the response differs from the answer key.
Missing: the answer key is annotated but the response is not.
Spurious: the response is annotated but the answer key is not.

The performance of an NER-based system can be judged using the following parameters:

Precision (P): P = Correct / (Correct + Incorrect + Missing)
Recall (R): R = Correct / (Correct + Incorrect + Spurious)
F-measure: F-measure = (2 * P * R) / (P + R)
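For example, if a system produces 8 correct and 1 incorrect response while missing 1 entity and adding 1 spurious one, then P = 8/(8+1+1) = 0.8, R = 8/(8+1+1) = 0.8, and F-measure = (2 × 0.8 × 0.8)/(0.8 + 0.8) = 0.8.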
Performing NER using HMM:
#***** Function to find all tags in corpus *****
def find_tag_set(tra_lines):
    global tag_set
    tag_set = []
    for line in tra_lines:
        tok = line.split()
        for t in tok:
            wd = t.split("/")
            if not wd[1] in tag_set:
                tag_set.append(wd[1])
    return

#***** Function to find frequency of each tag in tagged corpus *****
def cnt_tag(tr_ln):
    global start_li
    global li
    global tag_set
    global c
    global line_cnt
    global lines
    lines = tr_ln
    start_li = []  # list of starting tags
    find_tag_set(tr_ln)
    line_cnt = 0
    for line in lines:
        tok = line.split()
        x = tok[0].split("/")
        if not x[1] in start_li:
            start_li.append(x[1])
        line_cnt = line_cnt + 1
    find_freq_tag()
    find_freq_srttag()
    return

def find_freq_tag():
    global tag_cnt
    global tag_set
    tag_cnt = {}
    i = 0
    for w in tag_set:
        cal_freq_tag(tag_set[i])
        i = i + 1
        tag_cnt.update({w: freq_tg})
    return

def cal_freq_tag(tg):
    global freq_tg
    global lines
    freq_tg = 0
    for line in lines:
        freq_tg = freq_tg + line.count(tg)
    return

#**** Function to find frequency of each starting tag in tagged corpus ****
def find_freq_srttag():
    global lst
    lst = {}  # start probability
    i = 0
    for w in start_li:
        cc = freq_srt_tag(start_li[i])
        prob = cc / line_cnt
        lst.update({start_li[i]: prob})
        i = i + 1
    return

def freq_srt_tag(stg):
    global lines
    freq_srt_tg = 0
    for line in lines:
        tok = line.split()
        if stg in tok[0]:
            freq_srt_tg = freq_srt_tg + 1
    return (freq_srt_tg)
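A quick usage sketch for the counting functions above (the tiny tagged corpus is hypothetical; each token is in word/TAG format):

# Two training lines in word/TAG format.
tagged_lines = ["Ram/PER lives/OTHER in/OTHER Delhi/LOC",
                "Google/ORG opened/OTHER an/OTHER office/OTHER"]
cnt_tag(tagged_lines)
print(tag_set)   # all tags seen in the corpus
print(tag_cnt)   # frequency of each tag
print(lst)       # start probability of each sentence-initial tag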

import tkinter as tk
import vit
import random
import cal_start_p
import calle_prob
import trans_mat
import time
import trans
import dict5
from tkinter import *
from tkinter import ttk
from tkinter.filedialog import askopenfilename
from tkinter.messagebox import showerror
import languagedetect1
import languagedetect3

e_dict = dict()
t_dict = dict()

def calculate1(*args):
    import listbox1

def calculate2(*args):
    import listbox2

def calculate3(*args):
    import listbox3

def dispdlg():
    global file_name
    root = tk.Tk()
    root.withdraw()
    file_name = askopenfilename()
    return

def tranhmm():
    ttk.Style().configure("TButton", padding=6, relief="flat", background="Pink", foreground="Red")
    ttk.Button(mainframe, text="BROWSE", command=find_train_corpus).grid(column=7, row=5, sticky=W)

# The following code will be used to display or accept the testing corpus from the user.
def testhmm():
    ttk.Button(mainframe, text="Develop a new testing Corpus", command=calculate3).grid(column=9, row=5, sticky=E)
    ttk.Button(mainframe, text="BROWSE", command=find_obs).grid(column=9, row=7, sticky=E)

# In HMM, we require parameters such as the start probability, the transition
# probability, and the emission probability. The following code is used to
# calculate the emission probability matrix.
def cal_emit_mat():
    global emission_probability
    global corpus
    global tlines
    calle_prob.m_prg(e_dict, corpus, tlines)
    emission_probability = e_dict
    return

# to calculate states
def cal_states():
    global states
    global tlines
    cal_start_p.cnt_tag(tlines)
    states = cal_start_p.tag_set
    return
# to take observations
def find_obs():
    global observations
    global test_lines
    global tra
    global w4
    global co
    global wo1
    global wo2
    global test1
    global wo3
    global te
    global definitionText
    global definitionScroll
    global dt2
    global ds2
    global dt11
    global ds11
    wo3 = []
    woo = []
    wo1 = []
    wo2 = []
    co = 0
    w4 = []
    if (flag2 != 0):
        definitionText11.pack_forget()
        definitionScroll11.pack_forget()
        dt1.pack_forget()
        ds1.pack_forget()
    dispdlg()
    f = open(file_name, "r+", encoding='utf-8')
    test_lines = f.readlines()
    f.close()
    fname = "C:/Python32/file_name1"

    for x in states:
        if not x in start_probability:
            start_probability.update({x: 0.0})
    for line in test_lines:
        ob = line.split()
        observations = (ob)

    fe = open("C:/Python32/output3_file", "w+", encoding='utf-8')
    fe.write("")
    fe.close()
    ff = open("C:/Python32/output4_file", "w+", encoding='utf-8')
    ff.write("")
    ff.close()
    ff7 = open("C:/Python32/output5_file", "w+", encoding='utf-8')
    ff7.write("")
    ff7.close()
    ff8 = open("C:/Python32/output6_file", "w+", encoding='utf-8')
    ff8.write("")
    ff8.close()
    ff81 = open("C:/Python32/output7_file", "w+", encoding='utf-8')
    ff81.write("")
    ff81.close()
    dict5.search_obs_train_corpus(file1, fname, tlines, test_lines, observations, states,
                                  start_probability, transition_probability, emission_probability)
    f20 = open("C:/Python32/output5_file", "r+", encoding='utf-8')
    te = f20.readlines()
    f = open(fname, "r+", encoding='utf-8')
    train_lines = f.readlines()

    ds11 = Scrollbar(root)
    dt11 = Text(root, width=10, height=20, fg='black', bg='pink', yscrollcommand=ds11.set)
    ds11.config(command=dt11.yview)
    dt11.insert("1.0", train_lines)
    dt11.insert("1.0", "\n")
    dt11.insert("1.0", "\n")
    dt11.insert("1.0", "***TRAINING SENTENCES***")

    # an example of how to add new text to the text area
    dt11.pack(padx=10, pady=150)
    ds11.pack(padx=10, pady=150)
    ds11.pack(side=LEFT, fill=BOTH)
    dt11.pack(side=LEFT, fill=BOTH, expand=True)

    ds2 = Scrollbar(root)
    dt2 = Text(root, width=10, height=10, fg='black', bg='pink', yscrollcommand=ds2.set)
    ds2.config(command=dt2.yview)
    dt2.insert("1.0", test_lines)
    dt2.insert("1.0", "\n")
    dt2.insert("1.0", "\n")
    dt2.insert("1.0", "******TESTING SENTENCES*****")

    # an example of how to add new text to the text area
    dt2.pack(padx=10, pady=150)
    ds2.pack(padx=10, pady=150)
    ds2.pack(side=LEFT, fill=BOTH)
    dt2.pack(side=LEFT, fill=BOTH, expand=True)

    definitionScroll = Scrollbar(root)
    definitionText = Text(root, width=10, height=10, fg='black', bg='pink',
                          yscrollcommand=definitionScroll.set)
    definitionScroll.config(command=definitionText.yview)
    definitionText.insert("1.0", te)
    definitionText.insert("1.0", "\n")
    definitionText.insert("1.0", "\n")
    definitionText.insert("1.0", "*******OUTPUT*****")

    # an example of how to add new text to the text area
    definitionText.pack(padx=10, pady=150)
    definitionScroll.pack(padx=10, pady=150)
    definitionScroll.pack(side=LEFT, fill=BOTH)
    definitionText.pack(side=LEFT, fill=BOTH, expand=True)

    l = tk.Label(root, text="NOTE: *****The Entities which are not tagged in Output are not Named Entities*****",
                 fg='black', bg='pink')
    l.place(x=500, y=650, width=500, height=25)

    # ttk.Button(mainframe, text="View Parameters", command=parame).grid(column=11, row=10, sticky=E)
    # definitionText.place(x=19, y=200, height=25)

    f20.close()
    f14 = open("C:/Python32/output2_file", "r+", encoding='utf-8')
    test1 = f14.readlines()
    for lines in test1:
        toke = lines.split()
        for t in toke:
            w4.append(t)
    f14.close()
    f12 = open("C:/Python32/output_file", "w+", encoding='utf-8')
    f12.write("")
    f12.close()

    ttk.Button(mainframe, text="SAVE OUTPUT", command=save_output).grid(column=11, row=7, sticky=E)
    ttk.Button(mainframe, text="NER EVALUATION", command=evaluate).grid(column=13, row=7, sticky=E)
    ttk.Button(mainframe, text="REFRESH", command=ref).grid(column=15, row=7, sticky=E)
    return
def ref():
    root.destroy()
    import new1
    return
Evaluating the output generated after performing NER with HMM:
def evaluate():
    global wDict
    global woe
    global woe1
    global woe2
    woe1 = []
    woe = []
    woe2 = []
    ws = []
    wDict = {}
    i = 0
    j = 0
    k = 0
    sp = 0
    f141 = open("C:/Python32/output1_file", "r+", encoding='utf-8')
    tes1 = f141.readlines()
    for lines in tes1:
        toke = lines.split()
        for t in toke:
            ws.append(t)
            if t in wDict:
                wDict[t] += 1
            else:
                wDict[t] = 1
    for line in tlines:
        tok = line.split()
        for t in tok:
            wd = t.split("/")
            if (wd[1] != 'OTHER'):
                if t in wDict:
                    wDict[t] += 1
                else:
                    wDict[t] = 1
    print("words in train corpus ", wDict)
    for key in wDict:
        i = i + 1
    print("total words in Dictionary are:", i)
    for line in train_lines:
        toe = line.split()
        for t1 in toe:
            if '/' not in t1:
                sp = sp + 1
                woe2.append(t1)
    print("Spurious words are")
    for w in woe2:
        print(w)
    print("Total spurious words are:", sp)
    for l in te:
        to = l.split()
        for t1 in to:
            if '/' in t1:
                # print(t1)
                if t1 in ws or t1 in wDict:
                    woe.append(t1)
                    j = j + 1
                if t1 not in wDict:
                    wdd = t1.split("/")
                    if wdd[0] not in woe2:
                        woe1.append(t1)
                        k = k + 1
    print("Words found in Dict are:")
    for w in woe:
        print(w)
    print("Words not found in Dict are:")
    for w in woe1:
        print(w)
    print("Total correctly tagged words are:", j)
    print("Total incorrectly tagged words are:", k)
    pr = (j) / (j + k)
    re = (j) / (j + k + sp)
    f141.close()
    root = Tk()
    root.title("NER EVALUATION")
    root.geometry("1000x1000")

    ds21 = Scrollbar(root)
    dt21 = Text(root, width=10, height=10, fg='black', bg='pink', yscrollcommand=ds21.set)
    ds21.config(command=dt21.yview)
    dt21.insert("1.0", (2 * pr * re) / (pr + re))
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "F-MEASURE=")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "F-MEASURE= (2*PRECISION*RECALL)/(PRECISION+RECALL)")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", re)
    dt21.insert("1.0", "RECALL=")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "RECALL= CORRECT/(CORRECT+INCORRECT+SPURIOUS)")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", pr)
    dt21.insert("1.0", "PRECISION=")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "PRECISION= CORRECT/(CORRECT+INCORRECT+MISSING)")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "Total No. of Missing words are: 0")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", sp)
    dt21.insert("1.0", "Total No. of Spurious Words are:")
    dt21.insert("1.0", "\n")
    for w in woe2:
        dt21.insert("1.0", w)
        dt21.insert("1.0", " ")
    dt21.insert("1.0", "Total Spurious Words are:")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", k)
    dt21.insert("1.0", "Total No. of Incorrectly tagged words are:")
    dt21.insert("1.0", "\n")
    for w in woe1:
        dt21.insert("1.0", w)
        dt21.insert("1.0", " ")
    dt21.insert("1.0", "Total Incorrectly tagged words are:")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", j)
    dt21.insert("1.0", "Total No. of Correctly tagged words are:")
    dt21.insert("1.0", "\n")
    for w in woe:
        dt21.insert("1.0", w)
        dt21.insert("1.0", " ")
    dt21.insert("1.0", "Total Correctly tagged words are:")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "\n")
    dt21.insert("1.0", "**********PERFORMANCE EVALUATION OF NER HMM*********")

    # an example of how to add new text to the text area
    dt21.pack(padx=5, pady=5)
    ds21.pack(padx=5, pady=5)
    ds21.pack(side=LEFT, fill=BOTH)
    dt21.pack(side=LEFT, fill=BOTH, expand=True)
    root.mainloop()
    return
def save_output():
    # dispdlg()
    f = open("C:/Python32/save", "w+", encoding='utf-8')
    f20 = open("C:/Python32/output5_file", "r+", encoding='utf-8')
    te = f20.readlines()
    for t in te:
        f.write(t)
    f.close()
    f20.close()

# to calculate start probability matrix
def cal_srt_prob():
    global start_probability
    start_probability = cal_start_p.lst
    return

# to print the Viterbi parameters if required
def pr_param():
    l1 = tk.Label(root, text="HMM Training is going on.... Don't Click any Button!!",
                  fg='black', bg='pink')
    l1.place(x=300, y=150, height=25)

    print("states")
    print(states)
    print(" ")
    print(" ")
    print("start probability")
    print(start_probability)
    print(" ")
    print(" ")
    print("transition probability")
    print(transition_probability)
    print(" ")
    print(" ")
    print("emission probability")
    print(emission_probability)
    l1 = tk.Label(root, text="")
    l1.place(x=300, y=150, height=25)
    global flag1
    flag1 = 0
    global flag2
    flag2 = 0
    ttk.Button(mainframe, text="View Parameters", command=parame).grid(column=7, row=5, sticky=W)
    return

def parame():
    global flag2
    flag2 = flag1 + 1
    global definitionText11
    global definitionScroll11
    definitionScroll11 = Scrollbar(root)
    definitionText11 = Text(root, width=10, height=10, fg='black', bg='pink',
                            yscrollcommand=definitionScroll11.set)
    # definitionText.place(x=19, y=200, height=25)
    definitionScroll11.config(command=definitionText11.yview)
    definitionText11.delete("1.0", END)  # an example of how to delete all current text
    definitionText11.insert("1.0", emission_probability)
    definitionText11.insert("1.0", "\n")
    definitionText11.insert("1.0", "Emission Probability")
    definitionText11.insert("1.0", "\n")
    definitionText11.insert("1.0", transition_probability)
    definitionText11.insert("1.0", "Transition Probability")
    definitionText11.insert("1.0", "\n")
    definitionText11.insert("1.0", start_probability)
    definitionText11.insert("1.0", "Start Probability")

    # an example of how to add new text to the text area
    definitionText11.pack(padx=10, pady=175)
    definitionScroll11.pack(padx=10, pady=175)
    definitionScroll11.pack(side=LEFT, fill=BOTH)
    definitionText11.pack(side=LEFT, fill=BOTH, expand=True)
    return

# to calculate transition probability matrix
def cat_trans_prob():
    global transition_probability
    global corpus
    global tlines
    trans_mat.main_prg(t_dict, corpus, tlines)
    transition_probability = t_dict
    return

def find_train_corpus():
    global train_lines
    global tlines
    global c
    global corpus
    global words1
    global w1
    global train1
    global fname
    global file1
    global ds1
    global dt1
    global w21
    words1 = []
    c = 0
    w1 = []
    w21 = []
    f11 = open("C:/Python32/output1_file", "w+", encoding='utf-8')
    f11.write("")
    f11.close()
    fr = open("C:/Python32/output_file", "w+", encoding='utf-8')
    fr.write("")
    fr.close()
    fgl = open("C:/Python32/ladetect1", "w+", encoding='utf-8')
    fgl.write("")
    fgl.close()
    fgl = open("C:/Python32/ladetect", "w+", encoding='utf-8')
    fgl.write("")
    fgl.close()
    dispdlg()
    f = open(file_name, "r+", encoding='utf-8')
    train_lines = f.readlines()

    ds1 = Scrollbar(root)
    dt1 = Text(root, width=10, height=10, fg='black', bg='pink', yscrollcommand=ds1.set)
    ds1.config(command=dt1.yview)
    dt1.insert("1.0", train_lines)
    dt1.insert("1.0", "\n")
    dt1.insert("1.0", "\n")
    dt1.insert("1.0", "****TRAINING SENTENCES***")

    # an example of how to add new text to the text area
    dt1.pack(padx=10, pady=175)
    ds1.pack(padx=10, pady=175)
    ds1.pack(side=LEFT, fill=BOTH)
    dt1.pack(side=LEFT, fill=BOTH, expand=True)
    fname = "C:/Python32/file_name1"
    f = open(file_name, "r+", encoding='utf-8')
    file1 = file_name
    p = open(fname, "w+", encoding='utf-8')

    corpus = f.read()
    for line in train_lines:
        tok = line.split()
        for t in tok:
            n = t.split()
            le = len(t)
            i = 0
            j = 0
            for n1 in n:
                while (j < le):
                    if (n1[j] != '/'):
                        i = i + 1
                        j = j + 1
                    else:
                        j = j + 1
            if (i == le):
                p.write(t)
                p.write("/OTHER ")  # Handling spurious words
            else:
                p.write(t)
                p.write(" ")
        p.write("\n")
    p.close()
    fname = "C:/Python32/file_name1"
    f00 = open(fname, "r+", encoding='utf-8')
    tlines = f00.readlines()
    for line in tlines:
        tok = line.split()
        for t in tok:
            wd = t.split("/")
            if (wd[1] != 'OTHER'):
                if not wd[0] in words1:
                    words1.append(wd[0])
                    w1.append(wd[1])
    f00.close()

    f157 = open("C:/Python32/input_file", "w+", encoding='utf-8')
    f157.write("")
    f157.close()
    f1 = open("C:/Python32/input_file", "w+", encoding='utf-8')  # input_file has the list of named entities of the training file
    for w in words1:
        f1.write(w)
        f1.write("\n")
    f1.close()
    fr = open("C:/Python32/detect", "w+", encoding='utf-8')
    fr.write("")
    fr.close()

    f.close()

    cal_states()
    cal_emit_mat()
    cal_srt_prob()
    cat_trans_prob()
    pr_param()

    return

root = Tk()
root.title("NAMED ENTITY RECOGNITION IN NATURAL LANGUAGES USING HIDDEN MARKOV MODEL")
root.geometry("1000x1000")
mainframe = ttk.Frame(root, padding="20 20 12 12")
mainframe.grid(column=0, row=0, sticky=(N, W, E, S))

b = StringVar()
a = StringVar()

ttk.Style().configure("TButton", padding=6, relief="flat", background="Pink", foreground="Red")
ttk.Button(mainframe, text="ANNOTATION", command=calculate1).grid(column=5, row=3, sticky=W)
ttk.Button(mainframe, text="TRAIN HMM", command=tranhmm).grid(column=7, row=3, sticky=E)
ttk.Button(mainframe, text="TEST HMM", command=testhmm).grid(column=9, row=3, sticky=E)
ttk.Button(mainframe, text="HELP", command=hmmhelp).grid(column=11, row=3, sticky=E)

# To call viterbi for the particular observations found in find_obs
def call_vitar():
    global test_lines
    global train_lines
    global corpus
    global observations
    global states
    global start_probability
    global transition_probability
    global emission_probability

    find_train_corpus()
    cal_states()
    find_obs()
    cal_emit_mat()
    cal_srt_prob()
    cat_trans_prob()

    # print("Viterbi parameters for the selected corpus")
    # pr_param()
    # ---------- To add all states not in start probability ----------
    for x in states:
        if not x in start_probability:
            start_probability.update({x: 0.0})
    for line in test_lines:
        ob = line.split()
        observations = (ob)
        print(" ")
        print(" ")
        print(line)
        print("********************")
        print(vit.viterbi(observations, states, start_probability,
                          transition_probability, emission_probability))
    return

root.mainloop()
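The vit module imported at the top is not included in this excerpt; a minimal viterbi() consistent with how it is called here could look like the following sketch (assuming start_p, trans_p, and emit_p are nested dictionaries keyed by state and observed word):

def viterbi(observations, states, start_p, trans_p, emit_p):
    # V[t][s] = probability of the most likely path ending in state s at time t.
    V = [{}]
    path = {}
    # Initialization with the first observation.
    for s in states:
        V[0][s] = start_p.get(s, 0.0) * emit_p.get(s, {}).get(observations[0], 0.0)
        path[s] = [s]
    # Recursion over the remaining observations.
    for t in range(1, len(observations)):
        V.append({})
        new_path = {}
        for s in states:
            prob, prev = max((V[t - 1][s0] * trans_p.get(s0, {}).get(s, 0.0) *
                              emit_p.get(s, {}).get(observations[t], 0.0), s0)
                             for s0 in states)
            V[t][s] = prob
            new_path[s] = path[prev] + [s]
        path = new_path
    # Pick the best final state and return its probability and tag sequence.
    prob, state = max((V[len(observations) - 1][s], s) for s in states)
    return prob, path[state]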

The Python code above shows how NER can be performed using an HMM, and how the performance of an NER system can be evaluated with the performance metrics (precision, recall, and F-measure).

“”"***笔者的话:整理了《精通Python自然语言处理》的第七章内容:情感分析。情感分析现在也是一个热门话题,主要用来分析人在文字上对事物的认识。后续会整理这本书的后面章节。本博客记录了书中的每段代码。希望对阅读这本书的人有所帮助。FIGHTING...(热烈欢迎大家批评指正,互相讨论)
Nobody gives away anything valuable for free.
***"""


(Chapter 6): Semantic Analysis (https://blog.csdn.net/cjx14060307101/article/details/88541214)
(Chapter 5): Syntactic Parsing (https://blog.csdn.net/cjx14060307101/article/details/88378177)
(Chapter 4): Parts-of-Speech Tagging (https://blog.csdn.net/cjx14060307101/article/details/88357016)
(Chapter 3): Morphology (https://blog.csdn.net/cjx14060307101/article/details/88316108)
(Chapter 2): Statistical Language Modeling (https://blog.csdn.net/cjx14060307101/article/details/88087305)
(Chapter 1): String Operations (https://blog.csdn.net/cjx14060307101/article/details/87980631)
