# NLTK06《Python自然语言处理》code05 分类和标注词汇
# (Categorizing and Tagging Words — scraped blog title, commented out so the file parses as Python)
# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# 《Python自然语言处理》 05 分类和标注词汇
# pnlp05.py

# 5.1 使用词性标注器
# 词性标注器(part-of-speech tagger|POS tagger)
import nltk

# 5.1 Using a part-of-speech tagger (POS tagger):
# tokenize two example sentences and tag each token.
for sentence in ("And now for something completely different",
                 "They refuse to permit us to obtain the refuse permit"):
    text = nltk.word_tokenize(sentence)
    res = nltk.pos_tag(text)
    print(res)
# 1st: [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
#       ('completely', 'RB'), ('different', 'JJ')]
# 2nd: [('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'),
#       ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN')]

# Distributional similarity: words used in similar contexts in the Brown corpus.
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
for probe in ('woman', 'bought', 'over'):
    text.similar(probe)
# woman  -> man time day year car moment world house family child country boy
#           state job place way war girl work word
# bought -> made said done put had seen found given left heard was been brought
#           set got that took in told felt
# over   -> in on to of and for with from at by that into as up out down through
#           is all about

# 5.2 标注语料库
# 表示已标注的标识
# 5.2 Tagged corpora: representing a tagged token as a (word, tag) tuple.
tagged_token = nltk.tag.str2tuple('fly/NN')
print(tagged_token)     # ('fly', 'NN')
word, tag = tagged_token
print(word)             # fly  (= tagged_token[0])
print(tag)              # NN   (= tagged_token[1])

# Convert a whole string of word/TAG pairs at once.
sent = """
The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN 
other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC 
Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT 
interest/NN of/IN both/ABX governments/NNS ''/'' ./.
"""
res = [nltk.tag.str2tuple(pair) for pair in sent.split()]
print(res)  # [('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ...

# Reading tagged corpora: each bundled corpus reader exposes tagged_words().
corpora = (
    nltk.corpus.brown,            # [('The', 'AT'), ('Fulton', 'NP-TL'), ...]
    nltk.corpus.nps_chat,         # [('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ...]
    nltk.corpus.conll2000,        # [('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ...]
    nltk.corpus.treebank,         # [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...]
    nltk.corpus.sinica_treebank,  # [('一', 'Neu'), ('友情', 'Nad'), ('嘉珍', 'Nba'), ...]
    nltk.corpus.indian,           # [('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM'), ...]
    nltk.corpus.mac_morpho,       # [('Jersei', 'N'), ('atinge', 'V'), ('média', 'N'), ...]
    nltk.corpus.conll2002,        # [('Sao', 'NC'), ('Paulo', 'VMI'), ('(', 'Fpa'), ...]
    nltk.corpus.cess_cat,         # [('El', 'da0ms0'), ('Tribunal_Suprem', 'np0000o'), ...]
)
for corpus in corpora:
    res = corpus.tagged_words()
    print(res)

# 使用tagged_sents()方法将已标注的词划分成句子

# 简化的词性标记集
# ADJ 形容词
# ADV 副词
# CNJ 连词
# DET 限定词
# EX 存在量词
# FW 外来词
# MOD 情态动词
# N 名词
# NP 专有名词
# NUM 数词
# PRO 代词
# P 介词
# TO 词to
# UH 感叹词
# V 动词
# VD 过去式
# VG 现在分词
# VN 过去分词
# WH Wh限定词

from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news')
# Frequency of each tag in the news category.
tag_fd = nltk.FreqDist(t for (_, t) in brown_news_tagged)
print(list(tag_fd.keys()))  # ['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'VBD', 'NR', 'NN', ...

# Nouns: which tags most often precede a proper noun (NP)?
word_tag_pairs = list(nltk.bigrams(brown_news_tagged))
res = list(nltk.FreqDist(left[1] for (left, right) in word_tag_pairs if right[1] == 'NP'))
print(res)  # ['AT', 'NN-TL', 'NP', 'CS', ...

# Verbs: all word/tag pairs in the WSJ sample whose tag starts with 'V'.
wsj = nltk.corpus.treebank.tagged_words()
word_tag_fd = nltk.FreqDist(wsj)
res = ["{}/{}".format(w, t) for (w, t) in word_tag_fd if t.startswith("V")]
print(res)  # ['join/VB', 'is/VBZ', 'publishing/VBG', ...

# word -> distribution over its tags
cfd1 = nltk.ConditionalFreqDist(wsj)
for probe in ('yield', 'cut'):
    res = list(cfd1[probe].keys())
    print(res)
# yield: ['NN', 'VB']
# cut:   ['VBD', 'VB', 'VBN', 'NN']

# tag -> distribution over words carrying that tag
cfd2 = nltk.ConditionalFreqDist((t, w) for (w, t) in wsj)
print(list(cfd2))  # ['NNP', ',', 'CD', 'NNS', 'JJ', 'MD', 'VB', ...
res = list(cfd2['VBG'].keys())
print(res)  # ['publishing', 'causing', 'using', 'talking', 'having', ...

# Words attested both as simple past (VBD) and past participle (VBN).
res = [w for w in cfd1.conditions() if 'VBD' in cfd1[w] and 'VBN' in cfd1[w]]
print(res)  # ['named', 'used', 'caused', 'reported', 'said', ...

# Show the context of 'kicked' under each of its two readings.
for reading in ('VBD', 'VBN'):
    idx = wsj.index(('kicked', reading))
    print(wsj[idx - 4:idx + 1])
# VBD: [('While', 'IN'), ('program', 'NN'), ('trades', 'NNS'), ('swiftly', 'RB'), ('kicked', 'VBD')]
# VBN: [('head', 'NN'), ('of', 'IN'), ('state', 'NN'), ('has', 'VBZ'), ('kicked', 'VBN')]

# 形容词和副词

# 未简化的标记
def findtags(tag_prefix, tagged_text):
    """Return a ConditionalFreqDist of word frequencies, conditioned on
    each full tag in *tagged_text* that starts with *tag_prefix*."""
    matching = ((tag, word) for (word, tag) in tagged_text
                if tag.startswith(tag_prefix))
    return nltk.ConditionalFreqDist(matching)

# All NN* tags in the news category, each with its word distribution.
tagdict = findtags("NN", list(nltk.corpus.brown.tagged_words(categories='news')))
for nn_tag in sorted(tagdict):
    print(nn_tag, tagdict[nn_tag])

# Exploring tagged corpora: which words follow 'often' in the 'learned' category?
from nltk.corpus import brown
brown_learned_text = brown.words(categories='learned')
followers = set()
for (w1, w2) in nltk.bigrams(brown_learned_text):
    if w1 == 'often':
        followers.add(w2)
res = sorted(followers)
print(res)  # [',', '.', 'accomplished', 'analytically', 'appear', 'apt', 'associated', ...

# ... and with which tags?
brown_lrnd_tagged = brown.tagged_words(categories='learned')
tags = [right[1] for (left, right) in nltk.bigrams(brown_lrnd_tagged) if left[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()
# VBN  VB VBD  JJ  IN  QL   ,  CS  RB  AP VBG  RP VBZ QLP BEN WRB   .  TO  HV
# 15  10   8   5   4   3   3   3   3   1   1   1   1   1   1   1   1   1   1

# Use POS tags to find three-word phrases of the shape VERB + to + VERB.
from nltk.corpus import brown

def process(sentence):
    """Print every verb/TO/verb trigram found in a tagged *sentence*."""
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        both_verbs = t1.startswith('V') and t3.startswith('V')
        if both_verbs and t2 == 'TO':
            print(w1, w2, w3)

for tagged_sent in brown.tagged_sents():
    process(tagged_sent)

# combined to achieve
# continue to place
# ...

from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news')
# Words (lowercased) attested with more than three distinct tags.
data = nltk.ConditionalFreqDist((w.lower(), t) for (w, t) in brown_news_tagged)
for word in data.conditions():
    word_tags = data[word].keys()
    if len(word_tags) > 3:
        print(word, ' '.join(word_tags))
# no AT RB AT-HL AT-TL
# that CS WPS DT QL WPO
# ...

# 5.3 Mapping words to properties with Python dictionaries.
pos = {}
print(pos)  # {}

# Insert entries one key at a time.
pos['colorless'] = 'ADJ'
print(pos)  # {'colorless': 'ADJ'}
for entry_word, entry_tag in [('ideas', 'N'), ('sleep', 'V'), ('furiously', 'ADV')]:
    pos[entry_word] = entry_tag
print(pos)               # {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
print(pos['ideas'])      # N
print(pos['colorless'])  # ADJ
# print(pos['green'])    # KeyError: 'green'
print(list(pos))         # ['colorless', 'ideas', 'sleep', 'furiously']
print(sorted(pos))       # ['colorless', 'furiously', 'ideas', 'sleep']
res = [w for w in pos if w.endswith('s')]
print(res)               # ['colorless', 'ideas']

for word in sorted(pos):
    print(word + ":", pos[word])
# colorless: ADJ / furiously: ADV / ideas: N / sleep: V

# Keys, values, and items views.
print(list(pos.keys()))    # ['colorless', 'ideas', 'sleep', 'furiously']
print(list(pos.values()))  # ['ADJ', 'N', 'V', 'ADV']
print(list(pos.items()))   # [('colorless', 'ADJ'), ('ideas', 'N'), ('sleep', 'V'), ('furiously', 'ADV')]

for key, val in sorted(pos.items()):
    print(key + ":", val)
# colorless: ADJ / furiously: ADV / ideas: N / sleep: V

# Re-assigning an existing key overwrites its value.
pos['sleep'] = 'V'
print(pos['sleep'])  # V
pos['sleep'] = 'N'
print(pos['sleep'])  # N

# Two equivalent ways to define a dictionary in one expression.
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
print(pos)   # {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
pos1 = dict(colorless='ADJ', ideas='N', sleep='V', furiously='ADV')
print(pos1)  # {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}

# Default dictionaries: a missing key gets an auto-created default value.
frequency = nltk.defaultdict(int)
frequency['colorless'] = 4
print(frequency['ideas'])  # 0
pos = nltk.defaultdict(list)
pos['sleep'] = ['N', 'V']
print(pos['ideas'])  # []

# The default factory can be any zero-argument callable.
pos = nltk.defaultdict(lambda: 'N')
pos['colorless'] = 'ADJ'
print(pos['blog'])  # N  (the lookup itself creates the entry)
print(list(pos.items()))  # [('colorless', 'ADJ'), ('blog', 'N')]

# Map every word of Alice outside the 1000 most frequent to 'UNK'.
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
# BUGFIX: list(vocab)[:1000] does not give the most frequent words in NLTK 3
# (FreqDist is a Counter, so plain iteration is not frequency-ordered);
# most_common(1000) gives the intended top-1000 vocabulary.
v1000 = [w for (w, _) in vocab.most_common(1000)]
mapping = nltk.defaultdict(lambda: 'UNK')
for v in v1000:
    mapping[v] = v
alice2 = [mapping[v] for v in alice]
print(alice2[:100])  # ['UNK', 'Alice', "'", 's', 'UNK', 'in', 'UNK', ...
print(len(set(alice2)))  # 1001 (the 1000 kept words plus 'UNK')

# Incrementally updating a dictionary of tag counts.
counts = nltk.defaultdict(int)
from nltk.corpus import brown
for (_, tag) in brown.tagged_words(categories='news'):
    counts[tag] += 1
print(counts['NN'])  # 13162
print(list(counts))  # ['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'VBD', 'NR', ...

# Sort (tag, count) pairs by descending count; itemgetter(1) selects the count.
from operator import itemgetter
by_freq = sorted(counts.items(), key=itemgetter(1), reverse=True)
print(by_freq)  # [('NN', 13162), ('IN', 10616), ('AT', 8893), ...
res = [t for t, c in by_freq]
print(res)      # ['NN', 'IN', 'AT', 'NP', ...

# itemgetter(1) is a callable equivalent to indexing with [1].
pair = ('NP', 8336)
print(pair[1])              # 8336
print(itemgetter(1)(pair))  # 8336

# Group the word list by its final two letters.
last_letters = nltk.defaultdict(list)
words = nltk.corpus.words.words('en')
for word in words:
    last_letters[word[-2:]].append(word)

res = last_letters['ly']
print(res)                 # ['abactinally', 'abandonedly', 'abasedly', 'abashedly', ...
print(last_letters['zy'])  # ['blazy', 'bleezy', 'blowzy', 'boozy', 'breezy', ...

# Group by the sorted letters of each word: an anagram index.
anagrams = nltk.defaultdict(list)
for word in words:
    anagrams[''.join(sorted(word))].append(word)

print(anagrams['aeilnrt'])  # ['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']

# nltk.Index builds the same (key, value) grouping in one call.
anagrams = nltk.Index((''.join(sorted(w)), w) for w in words)
print(anagrams['aeilnrt'])  # ['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']

# Complex keys and values: count tags given (previous tag, current word).
pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
brown_news_tagged = brown.tagged_words(categories='news')
for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
    pos[(t1, w2)][t2] += 1
print(pos[('DT', 'library')])  # defaultdict(<class 'int'>, {'NN': 1})

# Inverting a dictionary: find all words whose frequency is exactly 32.
counts = nltk.defaultdict(int)
for word in nltk.corpus.gutenberg.words('milton-paradise.txt'):
    counts[word] += 1
res = [w for (w, n) in counts.items() if n == 32]
print(res)  # ['mortal', 'Against', 'Him', 'There', 'brought', 'King', 'virtue', 'every', 'been', 'thine']

pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
# Simple inversion only works while the values are unique.
pos2 = {value: key for (key, value) in pos.items()}
print(pos2['N'])  # ideas

# With duplicate values, accumulate the keys in a list per value.
pos.update({'cats': 'N', 'scratch': 'V', 'peacefully': 'ADV', 'old': 'ADJ'})
pos2 = nltk.defaultdict(list)
for key, value in pos.items():
    pos2[value].append(key)
print(pos2['ADV'])  # ['furiously', 'peacefully']

# nltk.Index performs the same one-to-many inversion in one call.
pos2 = nltk.Index((value, key) for (key, value) in pos.items())
print(pos2['ADV'])  # ['furiously', 'peacefully']

# 5.4 Automatic tagging.
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# Default tagger: find the single most common tag and assign it to everything.
tags = [t for (_, t) in brown.tagged_words(categories='news')]
print(nltk.FreqDist(tags).max())  # NN

raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
res = default_tagger.tag(tokens)
print(res)  # [('I', 'NN'), ('do', 'NN'), ('not', 'NN'), ('like', 'NN'), ...

res = default_tagger.evaluate(brown_tagged_sents)
print(res)  # 0.13089484257215028 — only ~13% of tokens really are NN

# Regular-expression tagger: the FIRST matching pattern wins, so the
# catch-all rules must come last.  Three bugs fixed relative to the
# original listing: '.*img$' was a typo for '.*ing$'; '(r'.*$', 'NNS')'
# matched every word and shadowed the CD and NN rules below it (it should
# be '.*s$' for plurals); and the '.' in the number pattern was unescaped,
# so it matched any character.
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                      # default: tag everything else as noun
]

# Build a tagger from the pattern list and try it on one sentence.
regexp_tagger = nltk.RegexpTagger(patterns)
res = regexp_tagger.tag(brown_sents[3])
print(res)  # tagging (and accuracy below) depend on the pattern list defined above
print(regexp_tagger.evaluate(brown_tagged_sents))

# Lookup tagger: learn the most likely tag for each of the 100 most
# frequent words; all other words get None (or the backoff tagger's tag).
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# BUGFIX: list(fd.keys())[:100] does not select the 100 most frequent words
# in NLTK 3 (FreqDist is a Counter, whose iteration order is not frequency
# order); most_common(100) gives the intended top-100.
most_freq_words = [w for (w, _) in fd.most_common(100)]
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
res = baseline_tagger.evaluate(brown_tagged_sents)
print(res)  # ~0.4558 with the true top-100 words

sent = brown.sents(categories='news')[3]
res = baseline_tagger.tag(sent)
print(res)  # [('``', '``'), ('Only', None), ('a', 'AT'), ... — unseen words get None

# Back off to a default tagger for words outside the lookup table.
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))

def performance(cfd, wordlist):
    """Accuracy on Brown news of a lookup tagger built from *wordlist*,
    backed off to a DefaultTagger('NN') for unknown words."""
    model = dict((w, cfd[w].max()) for w in wordlist)
    tagger = nltk.UnigramTagger(model=model, backoff=nltk.DefaultTagger('NN'))
    return tagger.evaluate(brown.tagged_sents(categories='news'))

def display():
    """Plot lookup-tagger accuracy as the model grows from 1 to 2**14 words."""
    import pylab
    word_freqs = nltk.FreqDist(brown.words(categories='news'))
    # BUGFIX: list(nltk.FreqDist(...)) is not frequency-ordered in NLTK 3
    # (FreqDist is a Counter), so the original picked arbitrary words and
    # distorted the curve; most_common() yields words by descending frequency.
    words_by_freq = [w for (w, _) in word_freqs.most_common()]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()

display()

# 评估

# 5.5 N-gram tagging: a unigram tagger trained on the full news corpus.
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
res = unigram_tagger.tag(brown_sents[2007])
print(res)  # [('Various', 'JJ'), ('of', 'IN'), ('the', 'AT'), ('apartments', 'NNS'), ...
res = unigram_tagger.evaluate(brown_tagged_sents)
print(res)  # 0.9349006503968017 — evaluated on its own training data

# Hold out the last 10% of the sentences as test data.
size = int(len(brown_tagged_sents) * 0.9)
train_sents, test_sents = brown_tagged_sents[:size], brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
res = unigram_tagger.evaluate(test_sents)
print(res)  # 0.8121200039868434

# General N-gram tagging: a bigram tagger conditions on the previous tag.
bigram_tagger = nltk.BigramTagger(train_sents)
res = bigram_tagger.tag(brown_sents[2007])
print(res)  # [('Various', 'JJ'), ('of', 'IN'), ('the', 'AT'), ...

# On an unseen sentence, unknown words get None, and each None then
# poisons the context for the words that follow.
unseen_sent = brown_sents[4203]
res = bigram_tagger.tag(unseen_sent)
print(res)  # [('The', 'AT'), ('population', 'NN'), ...

res = bigram_tagger.evaluate(test_sents)
print(res)  # 0.10206319146815508 — very low without a backoff

# Combining taggers: bigram -> unigram -> default backoff chain.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
res = t2.evaluate(test_sents)
print(res)  # 0.8452108043456593

# 标注生词

# Storing taggers: pickle the trained tagger to disk and load it back.
# Uses `with` so the files are closed even on error, and avoids the
# original's `input` variable, which shadowed the builtin.
from pickle import dump, load

with open('t2.pkl', 'wb') as output:
    dump(t2, output, -1)  # -1: highest available pickle protocol

with open('t2.pkl', 'rb') as infile:
    tagger = load(infile)

text = """The board's action shows what free enterprise is up against in our complex maze of regulatory laws."""
tokens = text.split()
res = tagger.tag(tokens)
print(res)  # [('The', 'AT'), ("board's", 'NN$'), ('action', 'NN'), ...

# Performance limits: how often is the tag ambiguous even given the
# previous two tags plus the current word?
cfd = nltk.ConditionalFreqDist(
    ((x[1], y[1], z[0]), z[1])
    for sent in brown_tagged_sents
    for x, y, z in nltk.trigrams(sent))
ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
res = sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()
print(res)  # 0.049297702068029296

# Confusion matrix: t2's output versus the gold-standard editorial tags.
test_tags = [tag for sent in brown.sents(categories='editorial')
             for (word, tag) in t2.tag(sent)]
gold_tags = [tag for (word, tag) in brown.tagged_words(categories='editorial')]
print(nltk.ConfusionMatrix(gold_tags, test_tags))

# Tagging across sentence boundaries: training on lists of sentences means
# the tagger never carries context over from the previous sentence.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
# Build the same default -> unigram -> bigram backoff chain as before.
taggers = [nltk.DefaultTagger('NN')]
taggers.append(nltk.UnigramTagger(train_sents, backoff=taggers[-1]))
taggers.append(nltk.BigramTagger(train_sents, backoff=taggers[-1]))
t0, t1, t2 = taggers
res = t2.evaluate(test_sents)
print(res)  # 0.8452108043456593

# 5.6 Transformation-based (Brill) tagging.
# nltkdemo18() returns the 18 transformation-rule templates used in the
# original Brill-tagger demo; print them for inspection.
res = nltk.tag.brill.nltkdemo18()
print(res)

# 5.7 如何确定一个词的分类
# 形态学线索
# 句法线索
# 语义线索
# 新词
# 词性标记集中的形态学
# (End of code. The remainder of the scraped web page — CSDN voting,
# bookmarking and payment widgets — was non-code residue and has been removed.)