《Python自然语言处理》第五章练习题答案

这章主要内容涉及分词、词性标注和标注器训练、字典使用。
因为中英文差别,所以在后面练习里尝试用中文数据来训练ngram标注器。

首先导包

import nltk
from nltk.corpus import brown
from nltk.book import *
import jieba
import matplotlib.pyplot as plt

1

# NLTK's off-the-shelf POS tagger cannot resolve the ambiguity in this headline.
headline = 'British Left Waffles on Falkland Islands'
text = nltk.word_tokenize(headline)
nltk.pos_tag(text)

2

# Print the tag attached to the first occurrence of 'contest' in the Brown corpus.
tag_words = brown.tagged_words()
first_tag = next((tag for (word, tag) in tag_words if word == 'contest'), None)
if first_tag is not None:
    print(first_tag)

3

# Tag a sentence in which 'wind' appears once as a verb and once as a noun.
wind_tokens = nltk.word_tokenize('They wind back the clock,while we chase after the wind.')
nltk.pos_tag(wind_tokens)

7

# dict.update() copies every item of d2 into d1 in place; d2 itself is unchanged.
d1 = dict(a=1, b=2, c=3)
d2 = dict(d=4, f=5, g=6)
d1.update(d2)
print(d1, d2)

9

# Show the contexts of the present and past forms of the verb in text1.
for verb_form in ('go', 'went'):
    text1.concordance(verb_form)

10

import re

# Exercise 10: train a unigram tagger on the whole Brown corpus, run it on
# unseen text and collect the words it could not tag (their tag is None).
brown_tagged_sents = brown.tagged_sents()
brown_sents = brown.sents()
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
# Raw string with a plain character class: the previous '[\,\.]' relied on
# invalid string escapes, which newer Python versions warn about.
new_text = re.sub(r'[,.]',' ',"What needs to be clarified is that the fundamental purpose of China's development is to ensure that the Chinese people can live a better life and to benefit all humankind. Win-win cooperation is an important principle of China's development and a golden rule in China's external relations. China has no intention to interfere in the political system of the United States, nor challenge or replace its status and influence.In the past few years, due to Washington's irrational suppression of China's legitimate rights and interests, China-US relations have encountered unprecedented difficulties. This situation should not continue any longer. The only right way is to follow the principles of non-conflict, non-confrontation, mutual respect and win-win cooperation.")
word_tags = unigram_tagger.tag(new_text.split())
# Identity comparison is the correct way to test for None.
none_tag = [word for (word, tag) in word_tags if tag is None]
none_tag

没被标记的词包括:拼写不规范的词、带连字符的词,以及新词(即训练语料中没有出现过的词)。

11

# Exercise 11: print the built-in documentation of nltk.AffixTagger.
help(nltk.AffixTagger)

用法AffixTagger(train=None, model=None, affix_length=-3, min_stem_length=2, backoff=None, cutoff=0, verbose=False)

# Train an affix tagger on the Brown 'news' category using 3-character
# suffixes, then tag one sentence from the same category.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
affixtagger = nltk.AffixTagger(
    train=brown_tagged_sents,
    affix_length=-3,
    min_stem_length=2,
)
sample_sent = brown_sents[2007]
affixtagger.tag(sample_sent)

12


# Exercise 12: train a bigram tagger with no backoff, score it on data it was
# trained on, then on data it has never seen.
sents = brown.sents()
tag_sents = brown.tagged_sents()
# Hold out the last 10% of the corpus. Previously the tagger was trained on
# every sentence, so the second evaluation was not on unseen data.
bigram_split = int(len(tag_sents) * 0.9)
bigram_train_sents = tag_sents[:bigram_split]
bigram_test_sents = tag_sents[bigram_split:]
baseline_tagger = nltk.BigramTagger(bigram_train_sents)
# Score on the training data.
baseline_tagger.evaluate(bigram_train_sents)
sent = "They expressed their willingness to enhance cooperation or coordination in some specific areas. For instance, the two sides are committed to strengthening dialogue and cooperation in the field of climate change and will establish a joint working group on that subject. In the spirit of reciprocity and mutual benefit, the two sides will hold talks on facilitating activities of each other's diplomatic and consular missions and personnel, as well as on issues related to media reporters.".split()
baseline_tagger.tag(sent)
# Score on the held-out (unseen) data.
baseline_tagger.evaluate(bigram_test_sents)

没有回退的bigram标注器在训练数据上得分很高,但用于未见过的新数据时得分会明显下降:只要遇到训练中没出现过的上下文,它就只能标为None,并且会连带影响后面的词(数据稀疏问题)。

13

# Exercise 13: %-style formatting, first with integer fields, then with strings.
date_numbers = (2021, 3, 21)
date_strings = ('2021', '3', '21')
print("date:%i-%i-%i" % date_numbers)
print("date:%s/%s/%s" % date_strings)

14

# Exercise 14: case-insensitive word frequency table for the Brown corpus,
# then the same table ordered from most to least frequent.
words = brown.words()
fd_dic = {}
for token in words:
    key = token.lower()
    fd_dic[key] = fd_dic.get(key, 0) + 1
fd_dic
sorted(fd_dic.items(), key=lambda pair: pair[1], reverse=True)

15

# Exercise 15a: nouns that are more common in their regular plural form
# (singular + 's') than in their singular form.
words = brown.tagged_words(tagset='universal')
set(cont[1] for cont in words)
# Frequency of every word form that is tagged NOUN.
noun_freq = nltk.FreqDist(word for (word, tag) in words if tag == 'NOUN')
n_words = set(noun_freq)
n_dic = {}
for plural, plural_count in noun_freq.items():
    # Only strip a real plural '-s'. The previous version dropped the last
    # character of every word, so e.g. 'the' was counted towards 'th'.
    if len(plural) > 2 and plural.endswith('s'):
        singular = plural[:-1]
        # Keep the noun only when the plural really outnumbers the singular.
        if singular in n_words and plural_count > noun_freq[singular]:
            n_dic[singular] = plural_count
sorted(n_dic.items(),key = lambda item:item[1],reverse=True)

# Exercise 15b: number of distinct tags seen for each lower-cased word,
# ordered from the most to the least ambiguous word.
cfd = nltk.ConditionalFreqDist(
    (token.lower(), pos) for (token, pos) in words)
count_dic = {cond: len(cfd[cond]) for cond in cfd.conditions()}
sorted(count_dic.items(), key=lambda pair: pair[1], reverse=True)
# Exercise 15c: how often each tag occurs, using the corpus's default
# (non-universal) tagset, most frequent first.
count_tag = {}
words = brown.tagged_words()
for (_, pos) in words:
    count_tag[pos] = count_tag.get(pos, 0) + 1
sorted(count_tag.items(), key=lambda pair: pair[1], reverse=True)
# Exercise 15d: which tags most commonly follow a noun (universal tagset).
words = brown.tagged_words(tagset='universal')
count_tags = {}
# Walk over adjacent pairs instead of indexing words[i + 1]: the last token has
# no successor (the index form could raise IndexError), and sequential
# iteration avoids repeated random access into the corpus view.
for (_, tag), (_, back_tag) in nltk.bigrams(words):
    if tag == 'NOUN':
        count_tags[back_tag] = count_tags.get(back_tag, 0) + 1
sorted(count_tags.items(),key = lambda item:item[1],reverse=True)

16

# Exercise 16: lookup tagger built from each word's most frequent tag,
# evaluated first without and then with a default-tag backoff.
fd = nltk.FreqDist(brown.words())
cfd = nltk.ConditionalFreqDist(brown.tagged_words())
# One lookup per word type (cfd.conditions()) rather than one per corpus token;
# the resulting mapping is the same. The frequency tables and the model are
# built once and shared by both taggers instead of being recomputed.
likely_tags = {word: cfd[word].max() for word in cfd.conditions()}
# Without backoff: words missing from the model are tagged None.
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown_tagged_sents)
# With backoff: words missing from the model fall back to 'NN'.
baseline_tagger = nltk.UnigramTagger(model=likely_tags,backoff=nltk.DefaultTagger('NN'))
baseline_tagger.evaluate(brown_tagged_sents)

18

# Exercise 18: share of each tag among the tags that follow a noun, then how
# much of the vocabulary (and of the running text) is tag-ambiguous.
# Hoisted: the total does not change while looping.
total_after_noun = sum(count_tags.values())
for (k,v) in count_tags.items():
    print(k,":",(v/total_after_noun)*100)
cfd = nltk.ConditionalFreqDist(
(w.lower(),tag)for (w,tag) in words)
# Word types that occur with more than one tag.
count_dic = {word: len(cfd[word]) for word in cfd.conditions() if len(cfd[word]) > 1}
# Percentage of word TYPES that are ambiguous. The previous version divided
# this type count by the number of tokens, mixing two different units.
print((len(count_dic)/len(cfd.conditions()))*100,"%")
# Percentage of word TOKENS that are occurrences of an ambiguous word.
ambiguous_tokens = sum(cfd[word].N() for word in count_dic)
print((ambiguous_tokens/len(words))*100,"%")

20

# Exercise 20a: alphabetically sorted list of the distinct words tagged MD.
words = brown.tagged_words()
w_li = []
for (w,t) in words:
    if t=='MD':
        w_li.append(w.lower())
w_li.sort()
# Sort after de-duplicating: a set has no order, so sorting only the list
# beforehand did not make the printed result alphabetical.
print(sorted(set(w_li)))
# Exercise 20c: three-word prepositional phrases (preposition + determiner +
# noun). 'P' and 'DET' are not tags of the default Brown tagset, so the old
# test could never match; the universal tagset has ADP / DET / NOUN for these.
# A separate variable keeps `words` unchanged for the code that follows.
universal_words = brown.tagged_words(tagset='universal')
# trigrams() stops at the last complete triple, so nothing is indexed past
# the end of the corpus.
for first, second, third in nltk.trigrams(universal_words):
    if (first[1], second[1], third[1]) == ('ADP', 'DET', 'NOUN'):
        # Print the whole phrase, not just its first (word, tag) pair.
        print(first[0], second[0], third[0])

21

# Exercise 21: collect the words that immediately precede
# 'adore' / 'love' / 'like' / 'prefer'.
ws = []
# Adjacent pairs instead of words[i - 1]: for i == 0 the index form wrapped
# around to the LAST token of the corpus, and sequential iteration avoids
# repeated random access into the corpus view.
for (prev_word, _), (verb, _) in nltk.bigrams(words):
    if verb.lower() in ('adore','love','like','prefer'):
        ws.append(prev_word.lower())
set(ws)

24

# Exercise 24: compare unigram, bigram and trigram taggers (no backoff) on a
# 70% / 30% train/test split of the Brown corpus.
brown_sents = brown.sents()
brown_tagged_sents = brown.tagged_sents()

train_full_size = int(len(brown_tagged_sents)*0.7)
train_sents, test_sents = (
    brown_tagged_sents[:train_full_size],
    brown_tagged_sents[train_full_size:],
)

# Unigram
tagger = nltk.UnigramTagger(train_sents)
tagger.evaluate(test_sents)

# Bigram
tagger = nltk.BigramTagger(train_sents)
tagger.evaluate(test_sents)

# Trigram
tagger = nltk.TrigramTagger(train_sents)
tagger.evaluate(test_sents)

在没有回退标注器的情况下,随着n增大(unigram → bigram → trigram),标注器在测试集上的准确率逐渐下降:上下文越长,数据越稀疏。

25
加载人民日报2014语料

# Exercise 25: train n-gram taggers on Chinese data.
with open(r'E:\laptop\研一\2014_corpus.txt',encoding='utf8') as f:
    corpus = f.readlines()
# Split the first 19,999 corpus lines into (word, tag) pairs. Each line is
# expected to hold space-separated 'word/tag' tokens.
tagged_sents = []
for sent in corpus[:19999]:
    tagged_sent = []
    # Drop the trailing newline first; otherwise the last token of a line
    # that has no trailing space keeps '\n' inside its tag.
    for w in sent.rstrip('\n').split(' '):
        parts = w.split('/')
        # Skip empty tokens and anything that is not exactly 'word/tag'.
        if len(parts) == 2:
            tagged_sent.append(tuple(parts))
    tagged_sents.append(tagged_sent)
# Train default -> unigram -> bigram backoff chain on a 70% / 30% split.
size = int(len(tagged_sents)*0.7)
train = tagged_sents[:size]
test = tagged_sents[size:]
t0 = nltk.DefaultTagger('n')
t1 = nltk.UnigramTagger(train,backoff=t0)
t2 = nltk.BigramTagger(train,backoff=t1)
t2.evaluate(test)
t2.tag(jieba.lcut('PFR语料库是对人民日报1998年上半年的纯文本语料进行了词语切分和词性标注制作而成的,严格按照人民日报的日期、版序、文章顺序编排的。文章中的每个词语都带有词性标记。'))
# NOTE: the trailing re-training of t1 without a backoff was removed; its
# result was never used and it left t1 without the default-tag fallback.

26

%matplotlib inline
def perform(data,test):
    """Train a unigram tagger on `data` and return its score on `test`.

    Words the unigram model does not cover fall back to the default tag 'n'.
    """
    fallback = nltk.DefaultTagger('n')
    tagger = nltk.UnigramTagger(train=data, backoff=fallback)
    score = tagger.evaluate(test)
    return score
def display():
    """Plot tagger score against training-set size (in thousands of sentences).

    The last 5000 sentences of `tagged_sents` are the fixed test set; the
    training prefixes are taken only from the sentences before it.
    """
    sizes = range(1,16)
    test = tagged_sents[-5000:]
    # Exclude the test sentences from the training pool. Slicing
    # tagged_sents[:size*1000] directly could reach into the test set and
    # inflate the score for the largest sizes.
    train_pool = tagged_sents[:-5000]
    perfs = [perform(train_pool[:size*1000],test) for size in sizes]
    plt.plot(sizes,perfs,'-bo')
    plt.xlabel('data size')
    plt.ylabel('perform')
    plt.show()
display()

27

# Exercise 27: confusion matrix of the bigram tagger on the test sentences.
# Strip the tags from the test set, leaving only the words.
orl_sent = [[word for (word,tag) in sent if (word is not None and tag is not None)] for sent in test]
# Tags predicted by the tagger for those words.
test_tags = [tag for sent in orl_sent for (word,tag) in t2.tag(sent) if (word is not None and tag is not None)]
# Reference tags. The outer loop over sentences must come first; with the
# clauses reversed, the comprehension iterated over whatever an earlier
# statement had left in the global `sent` instead of the test sentences.
gold_tags = [tag for sent in test for (word,tag) in sent if (word is not None and tag is not None)]
nltk.ConfusionMatrix(gold_tags,test_tags)

31

%matplotlib inline
def perform(data,test):
    """Return the score on `test` of a unigram tagger trained on `data`.

    The tagger backs off to the default tag 'n' for words it has not seen.
    """
    default_tagger = nltk.DefaultTagger('n')
    unigram = nltk.UnigramTagger(train=data, backoff=default_tagger)
    return unigram.evaluate(test)
def display():
    """Plot tagger score against training-set size on a logarithmic x axis.

    Sizes are in thousands of sentences. The last 5000 sentences of
    `tagged_sents` are the fixed test set; the training prefixes are taken
    only from the sentences before it.
    """
    sizes = range(1,16)
    test = tagged_sents[-5000:]
    # Exclude the test sentences from the training pool. Slicing
    # tagged_sents[:size*1000] directly could reach into the test set and
    # inflate the score for the largest sizes.
    train_pool = tagged_sents[:-5000]
    perfs = [perform(train_pool[:size*1000],test) for size in sizes]
    plt.semilogx(sizes,perfs,'-bo')
    plt.xlabel('data size')
    plt.ylabel('perform')
    plt.show()
display()

32

from nltk.tag import brill

# Exercise 32: compare the bigram backoff tagger with a Brill tagger that
# learns correction rules on top of it.
size = int(len(tagged_sents)*0.7)
train = tagged_sents[:size]
test = tagged_sents[size:]
t0 = nltk.DefaultTagger('n')
t1 = nltk.UnigramTagger(train,backoff=t0)
t2 = nltk.BigramTagger(train,backoff=t1)
t2.evaluate(test)
# BrillTaggerTrainer.train is an instance method: the trainer must first be
# constructed with an initial tagger and a set of rule templates. Calling
# train() on the class itself fails because no instance is supplied.
brill_trainer = nltk.BrillTaggerTrainer(initial_tagger=t2, templates=brill.brill24(), trace=0)
t3 = brill_trainer.train(train_sents=train,max_rules=200, min_score=2, min_acc=None)
t3.evaluate(test)
  • 2
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值