《Python自然语言处理》第五章练习题答案

最新推荐文章于 2024-04-20 21:21:01 发布

heize19

最新推荐文章于 2024-04-20 21:21:01 发布

阅读量1.3k

点赞数 2

分类专栏：自然语言处理　文章标签：python、自然语言处理

本文链接：https://blog.csdn.net/qq_44715621/article/details/115139313

版权

自然语言处理专栏收录该内容

4 篇文章 0 订阅

订阅专栏

这章主要内容涉及分词、词性标注和标注器训练、字典使用。
因为中英文差别，所以在后面练习里尝试用中文数据来训练ngram标注器。

首先导包

import nltk
from nltk.corpus import brown
from nltk.book import *
import jieba
import matplotlib.pyplot as plt

# NLTK's POS tagger cannot resolve the ambiguity in this headline.
headline = 'British Left Waffles on Falkland Islands'
text = nltk.word_tokenize(headline)
nltk.pos_tag(text)

# Print the tag of the first occurrence of 'contest' in the Brown corpus.
tag_words = brown.tagged_words()
for token, pos in tag_words:
    if token != 'contest':
        continue
    print(pos)
    break

# The word 'wind' appears twice in this sentence; tag it to compare.
wind_sentence = 'They wind back the clock,while we chase after the wind.'
nltk.pos_tag(nltk.word_tokenize(wind_sentence))

# dict.update copies every entry of d2 into d1 in place; d2 is left as is.
d1 = dict(a=1, b=2, c=3)
d2 = dict(d=4, f=5, g=6)
d1.update(d2)
print(d1, d2)

# Show the contexts of both verb forms in text1 (provided by nltk.book).
for verb_form in ('go', 'went'):
    text1.concordance(verb_form)

import re

# Train a unigram tagger on the whole Brown corpus, then list the words of
# a new passage for which it returns no tag.
brown_tagged_sents = brown.tagged_sents()
brown_sents = brown.sents()
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
# Raw string with a plain character class: ',' and '.' need no escaping
# inside [...], and '\,' / '\.' in a non-raw literal are invalid escapes.
new_text = re.sub(r'[,.]', ' ', "What needs to be clarified is that the fundamental purpose of China's development is to ensure that the Chinese people can live a better life and to benefit all humankind. Win-win cooperation is an important principle of China's development and a golden rule in China's external relations. China has no intention to interfere in the political system of the United States, nor challenge or replace its status and influence.In the past few years, due to Washington's irrational suppression of China's legitimate rights and interests, China-US relations have encountered unprecedented difficulties. This situation should not continue any longer. The only right way is to follow the principles of non-conflict, non-confrontation, mutual respect and win-win cooperation.")
word_tags = unigram_tagger.tag(new_text.split())
# Keep only the words whose tag is None.
none_tag = [word for (word, tag) in word_tags if tag is None]
none_tag

没被标记的词主要有三类：拼写不规范的词、带连字符的词，以及新词。

# Display the built-in documentation for nltk.AffixTagger.
help(nltk.AffixTagger)

用法：AffixTagger(train=None, model=None, affix_length=-3, min_stem_length=2, backoff=None, cutoff=0, verbose=False)

# Train an affix tagger on the Brown 'news' category and tag one sentence.
brown_sents = brown.sents(categories='news')
brown_tagged_sents = brown.tagged_sents(categories='news')
affixtagger = nltk.AffixTagger(
    train=brown_tagged_sents,
    affix_length=-3,
    min_stem_length=2,
)
example_sent = brown_sents[2007]
affixtagger.tag(example_sent)


# Train a bigram tagger on the full Brown corpus and score it on that data.
sents = brown.sents()
tag_sents = brown.tagged_sents()
baseline_tagger = nltk.BigramTagger(train=tag_sents)
baseline_tagger.evaluate(tag_sents)

# Tag a whitespace-tokenised passage with the same tagger.
passage = "They expressed their willingness to enhance cooperation or coordination in some specific areas. For instance, the two sides are committed to strengthening dialogue and cooperation in the field of climate change and will establish a joint working group on that subject. In the spirit of reciprocity and mutual benefit, the two sides will hold talks on facilitating activities of each other's diplomatic and consular missions and personnel, as well as on issues related to media reporters."
sent = passage.split()
baseline_tagger.tag(sent)

# Score the tagger on whatever brown_tagged_sents currently refers to.
baseline_tagger.evaluate(brown_tagged_sents)

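bigram用于新数据后得分会提高。（注意：这里第二次评测用的 brown_tagged_sents 取自 Brown 语料库，而标注器正是在整个 Brown 语料库上训练的，所以它并不是真正的“未见过的新数据”，得分偏高是预期之内的。）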

# Format the same date twice with the % operator: once from integers
# (%i), once from strings (%s).
year, month, day = 2021, 3, 21
print("date:%i-%i-%i" % (year, month, day))
print("date:%s/%s/%s" % (str(year), str(month), str(day)))

# Count lower-cased word frequencies over the whole Brown corpus.
words = brown.words()
fd_dic = {}
for token in words:
    key = token.lower()
    fd_dic[key] = fd_dic.get(key, 0) + 1
fd_dic

# Most frequent words first.
sorted(fd_dic.items(), key=lambda pair: pair[1], reverse=True)

# Collect the distinct tags used by the universal tagset view of Brown.
words = brown.tagged_words(tagset='universal')
{tag for (word, tag) in words}

# For every Brown token longer than two characters, drop its last character
# and count the remainder whenever it is itself a word tagged NOUN.
n_words = {word for (word, tag) in words if tag == 'NOUN'}
n_dic = {}
for token in brown.words():
    if len(token) <= 2:
        continue
    stem = token[:-1]
    if stem in n_words:
        n_dic[stem] = n_dic.get(stem, 0) + 1
sorted(n_dic.items(), key=lambda pair: pair[1], reverse=True)

# Map each lower-cased word to the number of distinct tags it occurs with,
# then list the words with the most tags first.
cfd = nltk.ConditionalFreqDist(
    (w.lower(), tag) for (w, tag) in words
)
count_dic = {word: len(cfd[word]) for word in cfd.conditions()}
sorted(count_dic.items(), key=lambda pair: pair[1], reverse=True)

# Count how often each tag occurs in Brown, most frequent tags first.
count_tag = {}
words = brown.tagged_words()
for (token, label) in words:
    count_tag[label] = count_tag.get(label, 0) + 1
sorted(count_tag.items(), key=lambda pair: pair[1], reverse=True)

# Count which tags immediately follow a NOUN (universal tagset).
words = brown.tagged_words(tagset='universal')
count_tags = {}
# Walk adjacent pairs instead of indexing words[i+1]: the final token has
# no successor, so the indexed form would raise IndexError if it were a NOUN.
for (word, tag), (next_word, next_tag) in nltk.bigrams(words):
    if tag == 'NOUN':
        count_tags[next_tag] = count_tags.get(next_tag, 0) + 1
sorted(count_tags.items(), key=lambda item: item[1], reverse=True)

# Build a lookup model mapping every Brown word to its most frequent tag.
fd = nltk.FreqDist(brown.words())
cfd = nltk.ConditionalFreqDist(brown.tagged_words())
# Iterate the distinct words (cfd's conditions) rather than every token of
# the corpus; the resulting mapping is the same and is built only once.
likely_tags = {word: cfd[word].max() for word in cfd.conditions()}

# Lookup tagger without a backoff tagger.
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown_tagged_sents)

# Same lookup model, backing off to a default tag of 'NN'.
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))
baseline_tagger.evaluate(brown_tagged_sents)

# Print each tag's share (in percent) of all counts in count_tags.
# The total is loop-invariant, so compute it once instead of per iteration.
total_following = sum(count_tags.values())
for (k, v) in count_tags.items():
    print(k, ":", (v / total_following) * 100)

# Percentage of word tokens that belong to ambiguous words, i.e. words
# (lower-cased) that occur with more than one tag.
cfd = nltk.ConditionalFreqDist(
    (w.lower(), tag) for (w, tag) in words
)
count_dic = {
    word: len(cfd[word])
    for word in cfd.conditions()
    if len(cfd[word]) > 1
}
# The original divided the number of ambiguous word *types* by the number
# of *tokens*. Count the tokens of the ambiguous words so that numerator
# and denominator use the same unit.
# NOTE(review): if a type-level ratio was intended instead, use
# len(count_dic) / len(cfd.conditions()).
ambiguous_tokens = sum(cfd[word].N() for word in count_dic)
print((ambiguous_tokens / len(words)) * 100, "%")

# List the distinct lower-cased words tagged MD, in alphabetical order.
words = brown.tagged_words()
w_li = sorted(w.lower() for (w, t) in words if t == 'MD')
# A set has no defined order, so sort after de-duplicating; printing
# set(w_li) directly discards the ordering established above.
print(sorted(set(w_li)))

# Print three-word phrases of the form preposition + determiner + noun.
# `words` carries Brown tags here, so use Brown tag names: the original
# 'P' and 'DET' are not Brown tags and never matched.
# NOTE(review): IN/AT/DT/NN are taken from the Brown tagset (preposition,
# article, determiner, singular noun) - confirm against the tagset docs.
# Iterating trigrams also avoids indexing past the end of the corpus.
for first, second, third in nltk.trigrams(words):
    if first[1] == 'IN' and second[1] in ('AT', 'DT') and third[1] == 'NN':
        # Print the whole phrase, not only its first (word, tag) pair.
        print(first[0], second[0], third[0])

# Collect the lower-cased words that appear immediately before
# 'adore', 'love', 'like' or 'prefer'.
ws = []
target_verbs = ('adore', 'love', 'like', 'prefer')
# Adjacent pairs guarantee a real predecessor; words[i-1] at i == 0 would
# wrap around to the last token of the corpus.
for previous, current in nltk.bigrams(words):
    if current[0].lower() in target_verbs:
        ws.append(previous[0].lower())
set(ws)

# Reload the full Brown corpus (tagged and untagged sentences).
brown_sents = brown.sents()
brown_tagged_sents = brown.tagged_sents()

# First 70% of the sentences for training, the rest for testing.
train_full_size = int(len(brown_tagged_sents)*0.7)
train_sents, test_sents = (
    brown_tagged_sents[:train_full_size],
    brown_tagged_sents[train_full_size:],
)

# Unigram tagger, no backoff.
tagger = nltk.UnigramTagger(train=train_sents)
tagger.evaluate(test_sents)

# Bigram tagger, no backoff.
tagger = nltk.BigramTagger(train=train_sents)
tagger.evaluate(test_sents)

# Trigram tagger, no backoff.
tagger = nltk.TrigramTagger(train=train_sents)
tagger.evaluate(test_sents)

多元标注器性能逐渐下降

25
加载人民日报2014语料

# Read the corpus file into a list of lines (line endings are kept).
with open(r'E:\laptop\研一\2014_corpus.txt', encoding='utf8') as f:
    corpus = list(f)

# Split the People's Daily corpus lines into lists of (word, tag) tuples.
# Only the first 19999 lines are used, matching the original `i < 20000`
# counter that started at 1.
tagged_sents = []
for sent in corpus[:19999]:
    tagged_sent = []
    # Strip the line ending first: otherwise a final token such as
    # "word/tag\n" keeps the newline glued to its tag.
    for w in sent.rstrip('\n').split(' '):
        parts = w.split('/')
        # Keep only well-formed "word/tag" tokens (exactly one slash).
        if len(parts) == 2:
            tagged_sent.append(tuple(parts))
    tagged_sents.append(tagged_sent)

# Train a bigram tagger that backs off to a unigram tagger and then to the
# default tag 'n', using a 70/30 train/test split.
size = int(len(tagged_sents)*0.7)
train, test = tagged_sents[:size], tagged_sents[size:]
t0 = nltk.DefaultTagger('n')
t1 = nltk.UnigramTagger(train=train, backoff=t0)
t2 = nltk.BigramTagger(train=train, backoff=t1)
t2.evaluate(test)

# Segment a sample text with jieba and tag the resulting words.
sample_text = 'PFR语料库是对人民日报1998年上半年的纯文本语料进行了词语切分和词性标注制作而成的，严格按照人民日报的日期、版序、文章顺序编排的。文章中的每个词语都带有词性标记。'
t2.tag(jieba.lcut(sample_text))

# Rebind t1 to a unigram tagger without backoff; t2 still holds the old one.
t1 = nltk.UnigramTagger(train=train)

%matplotlib inline
def perform(data,test):
    """Train a unigram tagger on *data* and return its score on *test*.

    The tagger backs off to a default tagger that assigns 'n'.
    """
    fallback = nltk.DefaultTagger('n')
    unigram = nltk.UnigramTagger(train=data, backoff=fallback)
    return unigram.evaluate(test)
def display():
    """Plot unigram-tagger score against training-set size.

    The last 5000 sentences of ``tagged_sents`` are held out for testing.
    Training prefixes of 1000, 2000, ..., 15000 sentences are taken from
    the remaining sentences only, so the largest prefix can no longer
    overlap the held-out test sentences.
    """
    held_out = tagged_sents[-5000:]
    train_pool = tagged_sents[:-5000]
    sizes = range(1, 16)
    perfs = [perform(train_pool[:size * 1000], held_out) for size in sizes]
    plt.plot(sizes, perfs, '-bo')
    plt.xlabel('data size')
    plt.ylabel('perform')
    plt.show()
display()

# Strip the tags from the test sentences, re-tag them with t2 and compare
# the predicted tags with the gold tags in a confusion matrix.
orl_sent = [[word for (word, tag) in sent] for sent in test]

# Pair gold and predicted tags token by token so both lists stay aligned.
# The original gold_tags comprehension listed its `for` clauses in the
# wrong order (the inner loop came first), and filtering the two lists
# independently could leave them with different lengths.
tag_pairs = [
    (gold, predicted)
    for gold_sent, plain_sent in zip(test, orl_sent)
    for (_, gold), (_, predicted) in zip(gold_sent, t2.tag(plain_sent))
    if predicted is not None
]
gold_tags = [gold for (gold, predicted) in tag_pairs]
test_tags = [predicted for (gold, predicted) in tag_pairs]
nltk.ConfusionMatrix(gold_tags, test_tags)

%matplotlib inline
def perform(data,test):
    """Train a unigram tagger on *data* and return its score on *test*.

    The tagger backs off to a default tagger that assigns 'n'.
    """
    fallback = nltk.DefaultTagger('n')
    unigram = nltk.UnigramTagger(train=data, backoff=fallback)
    return unigram.evaluate(test)
def display():
    """Plot unigram-tagger score against training-set size (log x-axis).

    The last 5000 sentences of ``tagged_sents`` are held out for testing.
    Training prefixes of 1000, 2000, ..., 15000 sentences are taken from
    the remaining sentences only, so the largest prefix can no longer
    overlap the held-out test sentences.
    """
    held_out = tagged_sents[-5000:]
    train_pool = tagged_sents[:-5000]
    sizes = range(1, 16)
    perfs = [perform(train_pool[:size * 1000], held_out) for size in sizes]
    plt.semilogx(sizes, perfs, '-bo')
    plt.xlabel('data size')
    plt.ylabel('perform')
    plt.show()
display()

# Rebuild the default -> unigram -> bigram backoff chain on a 70/30 split.
size = int(len(tagged_sents)*0.7)
train, test = tagged_sents[:size], tagged_sents[size:]
t0 = nltk.DefaultTagger('n')
t1 = nltk.UnigramTagger(train=train, backoff=t0)
t2 = nltk.BigramTagger(train=train, backoff=t1)
t2.evaluate(test)

# Train a Brill tagger that learns correction rules on top of t2.
# BrillTaggerTrainer.train is an instance method, so a trainer must be
# constructed first with an initial tagger and a set of rule templates;
# calling train() on the class itself fails for lack of an instance.
brill_trainer = nltk.BrillTaggerTrainer(
    initial_tagger=t2,
    templates=nltk.tag.brill.brill24(),
)
t3 = brill_trainer.train(train, max_rules=200, min_score=2, min_acc=None)
t3.evaluate(test)

heize19

关注

2
点赞
踩
13

收藏

觉得还不错? 一键收藏
0
评论
《Python自然语言处理》第五章练习题答案

这章主要内容涉及分词、词性标注和标注器训练、字典使用。因为中英文差别，所以在后面练习里尝试用中文数据来训练ngram标注器。首先导包import nltkfrom nltk.corpus import brownfrom nltk.book import *import jiebaimport matplotlib.pyplot as plt1#nltk词性标注无法消除歧义text = nltk.word_tokenize('British Left Waffles on Falkla
复制链接

扫一扫