Reposted from: http://blog.csdn.net/u014568921/article/details/51791495
What is part-of-speech (POS) tagging?
For example, here is a passage of POS-tagged text. After splitting on whitespace, the part of each token before the "/" is an English word and the part after it is that word's POS tag.
Confidence/NN in/IN the/DT pound/NN is/VBZ widely/RB expected/VBN to/TO take/VB another/DT sharp/JJ dive/NN if/IN trade/NN figures/NNS for/IN September/NNP ,/, due/JJ for/IN release/NN tomorrow/NN ,/, fail/VB to/TO show/VB a/DT substantial/JJ improvement/NN from/IN July/NNP and/CC August/NNP 's/POS near-record/JJ deficits/NNS ./.
Chancellor/NNP of/IN the/DT Exchequer/NNP Nigel/NNP Lawson/NNP 's/POS restated/VBN commitment/NN to/TO a/DT firm/NN monetary/JJ policy/NN has/VBZ helped/VBN to/TO prevent/VB a/DT freefall/NN in/IN sterling/NN over/IN the/DT past/JJ week/NN ./.
In the example above, NN is a noun, IN is a preposition or subordinating conjunction, DT is a determiner, and so on.
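As a small illustration of this format (a sketch added here, not part of the original post's code), the snippet below splits one tagged sentence into (word, tag) pairs by cutting each token at its last "/":

# Sketch: parse a "word/TAG" sentence into (word, tag) pairs.
# Splitting at the last "/" keeps escaped slashes such as "1\/2" inside the word.
def parse_tagged(sentence):
    pairs = []
    for token in sentence.split():
        word, tag = token.rsplit('/', 1)
        pairs.append((word, tag))
    return pairs

print(parse_tagged("Confidence/NN in/IN the/DT pound/NN is/VBZ"))
# -> [('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ')]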
The task is to assign a POS tag to every word in a piece of text that has not been tagged.
HMMs, maximum-entropy models, and CRFs can all be used for this task.
HMM
Using an HMM for POS tagging is similar to using an HMM for Chinese word segmentation: both can be treated as sequence labeling problems.
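For reference, a bigram HMM tagger scores a tag sequence t1...tn for the words w1...wn as

    P(t1...tn, w1...wn) = product over i of P(t_i | t_(i-1)) * P(w_i | t_i),   with t_0 = <START>

where P(t_i | t_(i-1)) are the transition probabilities and P(w_i | t_i) the emission probabilities. The program below estimates both from tagged sentences with add-one smoothing, maps words seen only once in training to <UNKNOWN>, and finds the best tag sequence with the Viterbi algorithm.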
#coding:utf-8
import re
from collections import defaultdict
def Dict(**args):
    """Return a dictionary with argument names as the keys,
    and argument values as the key values"""
    return args
def hmm(training_sentences, reducedtagset):
    """Given a list of pre-tagged sentences, return an HMM tuple containing
    the transition (1) and emission (2) probabilities"""
    transitions = defaultdict(lambda: defaultdict(int))  # transitions[prevtag][tag] = count
    emissions = defaultdict(lambda: defaultdict(int))    # emissions[tag][word] = count
    wordcounts = defaultdict(int)
    tagcounts = defaultdict(int)
    for line in training_sentences:
        prevtag = '<START>' # Before each sentence, begin in START state
        tagcounts['<START>'] += 1
        for taggedword in line.split():
            (word, tag) = re.split('(?<!\\\)\/', taggedword)
            if reducedtagset:
                if re.match('VB', tag) is not None: tag = 'VB'
                elif re.match('NN', tag) is not None: tag = 'NN'
                elif re.match('JJ', tag) is not None: tag = 'JJ'
                elif re.match('RB', tag) is not None: tag = 'RB'
            transitions[prevtag][tag] += 1
            emissions[tag][word] += 1
            wordcounts[word] += 1
            tagcounts[tag] += 1
            prevtag = tag
    print emissions.keys()
    return hmmtuple(transitions, emissions, wordcounts, tagcounts)
def hmmtuple(transitions, emissions, wordcounts, tagcounts):
    # At test time we will need estimates for "unknown words"---the words
    # that never occurred in the training data. One recommended
    # way to do this is to turn all training words occurring just once
    # into '<UNKNOWN>' and use this as the stand-in for all "unknown words"
    # at test time. Below we make all the necessary transformations
    # to '<UNKNOWN>'.
    for tag, worddict in emissions.items():
        for word, count in worddict.items():
            if wordcounts[word] == 1:
                del emissions[tag][word]
                emissions[tag]['<UNKNOWN>'] += 1
    # Calculate smoothed conditional probabilities
    tags = emissions.keys()
    words = wordcounts.keys()
    for prevtag in transitions.keys():
        for tag in tags: #transitions[prevtag]:
            transitions[prevtag][tag] = (transitions[prevtag][tag]+1.)/(tagcounts[prevtag]+len(tags))
            #transitions[prevtag][tag] *= 1./tagcounts[prevtag]
    for tag in emissions.keys():
        for word in words: #emissions[tag]:
            emissions[tag][word] = (emissions[tag][word]+1.)/(tagcounts[tag]+len(wordcounts))
            #emissions[tag][word] *= 1./tagcounts[tag]
    #print len(transitions), len(emissions), len(tagcounts)
    return (transitions, emissions, tags)
def strip_tags(tagged_sentences):
    """Given a list of tagged sentences, return a list of untagged sentences"""
    untagged_sentences = []
    for taggedsent in tagged_sentences:
        untaggedsent = ''
        for taggedword in taggedsent.split():
            word = re.split('(?<!\\\)\/', taggedword)[0]
            untaggedsent += word + ' '
        #print untaggedsent
        untagged_sentences.append(untaggedsent)
    return untagged_sentences
def maxsequence(probtable, tags):
    """Given a filled Viterbi probability table, return the most likely
    sequence of POS tags"""
    r = len(probtable)
    c = len(probtable[0])
    maxfinalprob = 0
    maxfinaltag = None
    for i in range(r):
        if (probtable[i][c-1][0] > maxfinalprob):
            maxfinalprob = probtable[i][c-1][0]
            maxfinaltag = i
    #print maxfinaltag
    maxsequence = []
    prevmaxtag = maxfinaltag
    for j in range(c-1, -1, -1):
        maxsequence.insert(0, tags[prevmaxtag])
        #print probtable[prevmaxtag][j][1]
        prevmaxtag = probtable[prevmaxtag][j][1]
    return maxsequence
def viterbi_tags(untagged_sentences, h):
    """Given a list of untagged sentences, return the most likely sequence of
    POS tags"""
    transitions = h[0]
    emissions = h[1]
    tags = h[2]
    maxtags = []
    #print tags
    for untaggedsent in untagged_sentences:
        #Create empty probtable
        words = untaggedsent.split()
        r = len(tags)
        c = len(words)
        probtable = [None]*r
        for i in range(r):
            probtable[i] = [None]*c
            for j in range(c):
                probtable[i][j] = [None]*2
        #Initialize zeroth column of probtable
        prevtag = '<START>'
        word = words[0]
        for i in range(r):
            tag = tags[i]
            transition = transitions[prevtag][tag]
            if word in emissions[tag]:
                emission = emissions[tag][word]
            else:
                emission = .0001*emissions[tag]['<UNKNOWN>']
            probtable[i][0][0] = transition*emission
        #Fill in probtable
        for j in range(1, c):
            word = words[j]
            for i in range(r):
                tag = tags[i]
                maxprob = 0
                maxtag = None
                if word in emissions[tag]:
                    emission = emissions[tag][word]
                else:
                    emission = .0001*emissions[tag]['<UNKNOWN>']
                for k in range(r):
                    prevtag = tags[k]
                    transition = transitions[prevtag][tag]
                    prob = probtable[k][j-1][0]*transition*emission
                    if (prob > maxprob):
                        maxprob = prob
                        maxtag = k
                probtable[i][j][0] = maxprob
                probtable[i][j][1] = maxtag
        #Find most likely sequence of POS tags of this sentence
        sentmaxtags = maxsequence(probtable, tags)
        maxtags.extend(sentmaxtags)
    #Return most likely sequence of POS tags of all sentences
    return maxtags
def true_tags(tagged_sentences):
    """Given a list of tagged sentences, return the tag sequence"""
    tags = []
    for sent in tagged_sentences:
        tags.extend([re.split('(?<!\\\)\/', word)[1] for word in sent.split()])
    return tags
def compare(mytags, truetags, reducedtagset):
    #print mytags, truetags
    score = 0
    length = len(mytags)
    for i in range(length):
        truetag = truetags[i]
        if reducedtagset:
            if re.match('VB', truetag) is not None: truetag = 'VB'
            elif re.match('NN', truetag) is not None: truetag = 'NN'
            elif re.match('JJ', truetag) is not None: truetag = 'JJ'
            elif re.match('RB', truetag) is not None: truetag = 'RB'
        if mytags[i] == truetag: score += 1
    return 1.*score/length
if __name__ == '__main__':
    f = open('wsj15-18.pos').readlines()
    #90% of data is used for training
    print '90% of data is used for training'
    print '--------------------------------'
    i = int(len(f)*.9)
    h = hmm(f[:i], False)
    test1 = f[i:]
    v1 = viterbi_tags(strip_tags(test1), h)
    t1 = true_tags(test1)
    c1 = compare(v1, t1, False)
    print c1
    test2 = open('wsj_0159.pos').readlines()
    v2 = viterbi_tags(strip_tags(test2), h)
    t2 = true_tags(test2)
    c2 = compare(v2, t2, False)
    print c2
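The __main__ block expects tagged WSJ .pos files (wsj15-18.pos, of which 90% is used for training and 10% for testing, plus wsj_0159.pos as a second test file) and prints the tagging accuracy on each test set. If those files are not at hand, the pipeline can still be smoke-tested on a tiny made-up corpus; the sentences below are illustrative only, and the accuracy on such a small sample is meaningless:

# Hypothetical smoke test (not from the original post): train, tag and score on toy data.
toy = [
    "the/DT dog/NN chased/VBD the/DT cat/NN ./.",
    "the/DT cat/NN sat/VBD on/IN the/DT mat/NN ./.",
    "a/DT dog/NN sat/VBD on/IN a/DT mat/NN ./.",
]
h = hmm(toy, False)                             # train on the three toy sentences
guesses = viterbi_tags(strip_tags(toy), h)      # re-tag the same sentences
print(compare(guesses, true_tags(toy), False))  # fraction of tags recovered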