Writing Structured Programs
foo = 'Monty'
bar = foo
foo = 'Python'
print(bar)
foo = ['Monty', 'Python']
bar = foo
foo[1] = 'Bodkin'
print(bar)
empty = []
nested = [empty, empty, empty]
print(nested)
nested[1].append('Python')
print(nested)
nested = [[]] * 3
nested[1].append('Python')
nested[1] = ['Monty']
print(nested)
size = 5
python = ['Python']
snake_nest = [python] * size
print(snake_nest)
print(snake_nest[0] == snake_nest[1] == snake_nest[2] == snake_nest[3] == snake_nest[4])
print(snake_nest[0] is snake_nest[1] is snake_nest[2] is snake_nest[3] is snake_nest[4])
print([id(snake) for snake in snake_nest])
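# A minimal sketch (my addition, standard library only) of building genuinely
# independent inner lists: each comprehension iteration creates a fresh list,
# and copy.deepcopy copies an entire nested structure wholesale.
import copy
independent_nest = [['Python'] for _ in range(size)]
independent_nest[0].append('Monty')
print(independent_nest[0] is independent_nest[1])   # False: no shared objects
deep = copy.deepcopy(independent_nest)
deep[0].append('!')
print(independent_nest[0])   # unchanged: ['Python', 'Monty']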
mixed = ['cat', '', ['dog'], []]
for element in mixed:
if element:
print(element)
sent = ['No', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '.']
print(all(len(w) > 4 for w in sent))
print(any(len(w) > 4 for w in sent))
t = 'walk', 'fem', 3
print(t)
print(t[0])
print(t[1:])
print(len(t))
raw = 'I turned off the spectroroute'
text = ['I', 'turned', 'off', 'the', 'spectroroute']
pair = (6, 'turned')
print(raw[2], text[3], pair[1])
print(raw[-3:], text[-3:], pair[-3:])
print(len(raw), len(text), len(pair))
import nltk
raw = 'Red lorry, yellow lorry, red lorry, yellow lorry.'
text = nltk.word_tokenize(raw)
fdist = nltk.FreqDist(text)
print(list(fdist))
for key in fdist:
print(fdist[key], end=' ')
print("")
words = ['I', 'turned', 'off', 'the', 'spectroroute']
words[2], words[3], words[4] = words[3], words[4], words[2]
print(words)
tmp = words[2]
words[2] = words[3]
words[3] = words[4]
words[4] = tmp
print(words)
words = ['I', 'turned', 'off', 'the', 'spectroroute']
tags = ['noun', 'verb', 'prep', 'det', 'noun']
res = zip(words, tags)
print(list(res))
print(list(enumerate(words)))
text = nltk.corpus.nps_chat.words()
cut = int(0.9 * len(text))
training_data, test_data = text[:cut], text[cut:]
print(text == training_data + test_data)
print(len(training_data)/len(test_data))
words = 'I turned off the spectroroute'.split()
wordlens = [(len(word), word) for word in words]
wordlens.sort()
print(' '.join(w for (_, w) in wordlens))
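# A simpler equivalent (my sketch) of the (len(word), word) decorate-sort idiom
# above: pass a key function instead. sorted() is stable, so equal-length words
# keep their original order, which matches the result for this sentence.
print(' '.join(sorted(words, key=len)))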
lexicon = [
('the', 'det', ['Di:', 'D@']),
('off', 'prep', ['Qf', 'O:f'])
]
lexicon.sort()
lexicon[1] = ('turned', 'VBD', ['t3:nd', 't3`nd'])
del lexicon[0]
print(lexicon)
text = """\"When I use a word, " Humpty Dumpty said in rather a scornful tone,
\"it means just what I choose it to mean - neither more nor less.\""""
res = [w.lower() for w in nltk.word_tokenize(text)]
print(res)
res = max([w.lower() for w in nltk.word_tokenize(text)])
print(res)
res = max(w.lower() for w in nltk.word_tokenize(text))
print(res)
import re
import nltk
from nltk.corpus import brown
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cv_word_pairs = [(cv, w) for w in rotokas_words
                 for cv in re.findall(r'[ptksvr][aeiou]', w)]
cfd = nltk.ConditionalFreqDist(
(genre, word)
for genre in brown.categories()
for word in brown.words(categories=genre)
)
ha_words = ['aaahhhh', 'ah', 'ahah', 'ahahah', 'ahh', 'ahhahahaha',
'ahhh', 'ahhhh', 'ahhhhhh', 'ahhhhhhhhhhhhhh', 'ha',
'haaa', 'hah', 'haha', 'hahaaaa', 'hahah', 'hahaha']
tokens = nltk.corpus.brown.words(categories='news')
count = 0
total = 0
for token in tokens:
count += 1
total += len(token)
if count == 0:
count = 1
print(total / count)
total = sum(len(t) for t in tokens)
print(total / len(tokens))
tokens = tokens[:100]
word_list = []
len_word_list = len(word_list)
i = 0
while i < len(tokens):
    j = 0
    while j < len_word_list and word_list[j] < tokens[i]:
        j += 1
    # Insert if we ran off the end or the token is not already present;
    # this also avoids inserting a duplicate when tokens[i] == word_list[0].
    if j == len_word_list or tokens[i] != word_list[j]:
        word_list.insert(j, tokens[i])
        len_word_list += 1
    i += 1
print(word_list)
word_list = sorted(set(tokens))
print(word_list)
fd = nltk.FreqDist(nltk.corpus.brown.words())
cumulative = 0.0
# Iterate in decreasing frequency order; plain iteration over a FreqDist
# follows insertion order in current NLTK, so use most_common() here.
for rank, (word, count) in enumerate(fd.most_common()):
    cumulative += count * 100 / fd.N()
    print("%3d %6.2f%% %s" % (rank + 1, cumulative, word))
    if cumulative > 25:
        break
text = nltk.corpus.gutenberg.words('milton-paradise.txt')
longest = ''
for word in text:
if len(word) > len(longest):
longest = word
print(longest)
maxlen = max(len(word) for word in text)
res = [word for word in text if len(word) == maxlen]
print(res)
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
n = 3
res = [sent[i:i+n] for i in range(len(sent) - n + 1)]
print(res)
import pprint
m, n = 3, 7
array = [[set() for i in range(n)] for j in range(m)]
array[2][5].add('Alice')
pprint.pprint(array)
array = [[set()] * n] * m
array[2][5].add(7)
pprint.pprint(array)
import re
def get_text(file):
    """Read text from a file, normalizing whitespace and stripping HTML markup."""
    text = open(file).read()
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text
help(get_text)
res = get_text("document.txt")
print(res)
def repeat(msg, num):
return ' '.join([msg]*num)
monty = 'Monty Python'
res = repeat(monty, 3)
print(res)
def monty():
return "Monty Python"
res = monty()
print(res)
res = repeat(monty(), 3)
print(res)
res = repeat("Monty Python", 3)
print(res)
def my_sort1(mylist):
mylist.sort()
def my_sort2(mylist):
return sorted(mylist)
def my_sort3(mylist):
mylist.sort()
return mylist
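# A quick illustrative check (my addition) of what each variant means for the
# caller: my_sort1 mutates its argument and returns None; my_sort2 returns a
# new list and leaves the argument alone; my_sort3 mutates and returns it.
data = ['b', 'c', 'a']
print(my_sort1(data), data)   # None ['a', 'b', 'c']
data = ['b', 'c', 'a']
print(my_sort2(data), data)   # ['a', 'b', 'c'] ['b', 'c', 'a']
data = ['b', 'c', 'a']
print(my_sort3(data), data)   # ['a', 'b', 'c'] ['a', 'b', 'c']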
def set_up(word, properties):
word = 'lolcat'
properties.append('noun')
properties = 5
w = ''
p = []
set_up(w, p)
print("w=", w)
print("p=", p)
w = 'a'
word = w
word = 'lolcat'
print(w)
p = []
properties = p
properties.append('noun')
properties = 5
print(p)
def tag(word):
if word in ['a', 'the', 'all']:
return 'det'
else:
return 'noun'
res = tag('the')
print(res)
res = tag('knight')
print(res)
res = tag(["'Tis", 'but', 'a', 'scratch'])
print(res)
def tag(word):
    assert isinstance(word, str), "argument to tag() must be a string"
if word in ['a', 'the', 'all']:
return 'det'
else:
return 'noun'
res = tag('a')
print(res)
import nltk
import bs4
import lxml
from urllib.request import urlopen
def freq_words(url, freqdist, n):
html = urlopen(url).read()
text = bs4.BeautifulSoup(html, "lxml")
text = text.get_text()
for word in nltk.word_tokenize(text):
freqdist[word.lower()] += 1
print(list(freqdist.keys())[:n])
constitution = "http://www.archives.gov/founding-docs"
fd = nltk.FreqDist()
freq_words(constitution, fd, 20)
def freq_words(url):
freqdist = nltk.FreqDist()
html = urlopen(url).read()
text = bs4.BeautifulSoup(html, "lxml")
text = text.get_text()
for word in nltk.word_tokenize(text):
freqdist[word.lower()] += 1
return freqdist
fd = freq_words(constitution)
print(list(fd.keys())[:20])
def accuracy(reference, test):
"""
Calculate the fraction of test items that equal the corresponding reference items.
Given a list of reference values and a corresponding list of test values,
return the fraction of corresponding values that are equal.
    In particular, return the fraction of indexes
    {i | 0 <= i < len(test)} such that C{test[i] == reference[i]}.
    >>> accuracy(['ADJ', 'N', 'V', 'N'], ['N', 'N', 'V', 'ADJ'])
    0.5
    @param reference: An ordered list of reference values.
    @type reference: C{list}
    @param test: A list of values to compare against the corresponding reference values.
    @type test: C{list}
    @rtype: C{float}
    @raise ValueError: If C{reference} and C{test} do not have the same length.
"""
if len(reference) != len(test):
raise ValueError("Lists must have the same length.")
num_correct = 0
for x, y in zip(reference, test):
if x == y:
num_correct += 1
return float(num_correct) / len(reference)
res = accuracy(['ADJ', 'N', 'V', 'N'], ['N', 'N', 'V', 'ADJ'])
print(res)
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the', 'sounds', 'will',
'take', 'care', 'of', 'themselves', '.']
def extract_property(prop):
return [prop(word) for word in sent]
res = extract_property(len)
print(res)
def last_letter(word):
return word[-1]
res = extract_property(last_letter)
print(res)
res = extract_property(lambda w: w[-1])
print(res)
import operator
res = sorted(sent)
print(res)
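# The operator module imported above offers named equivalents of simple
# lambdas; itemgetter(-1) mirrors last_letter (a sketch, not from the book).
print(sorted(sent, key=operator.itemgetter(-1)))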
def search1(substring, words):
result = []
for word in words:
if substring in word:
result.append(word)
return result
def search2(substring, words):
for word in words:
if substring in word:
yield word
print("search1:")
for item in search1('zz', nltk.corpus.brown.words()):
print(item)
print("search2:")
for item in search2('zz', nltk.corpus.brown.words()):
print(item)
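# Because search2 is a generator, it produces matches on demand; a sketch (my
# addition) using itertools.islice to stop after the first three hits without
# scanning the rest of the corpus.
from itertools import islice
for item in islice(search2('zz', nltk.corpus.brown.words()), 3):
    print(item)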
def permutations(seq):
if len(seq) <= 1:
yield seq
else:
for perm in permutations(seq[1:]):
for i in range(len(perm) + 1):
yield perm[:i] + seq[0:1] + seq[i:]
res = list(permutations(['police', 'fish', 'buffalo']))
print(res)
def is_content_word(word):
return word.lower() not in ['a', 'of', 'the', 'and', 'will', ',', '.']
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the', 'sounds', 'will', 'take', 'care', 'of',
'themselves', '.']
res = filter(is_content_word, sent)
print(list(res))
res = [w for w in sent if is_content_word(w)]
print(res)
lengths = map(len, nltk.corpus.brown.sents(categories='news'))
lengths = list(lengths)
print(sum(lengths)/len(lengths))
lengths = [len(sent) for sent in nltk.corpus.brown.sents(categories='news')]
print(sum(lengths)/len(lengths))
res = map(lambda w: len(list(filter(lambda c: c.lower() in "aeiou", w))), sent)
print(list(res))
res = [len([c for c in w if c.lower() in "aeiou"]) for w in sent]
print(res)
def repeat(msg='<empty>', num=1):
return msg*num
res = repeat(num=3)
print(res)
res = repeat(msg='Alice')
print(res)
res = repeat(num=5, msg='Alice')
print(res)
def generic(*args, **kwargs):
print(args)
print(kwargs)
generic(1, "African swallow", monty="python")
song = [['four', 'calling', 'birds'],
['three', 'French', 'hens'],
        ['two', 'turtle', 'doves']]
res = zip(song[0], song[1], song[2])
print(list(res))
res = zip(*song)
print(list(res))
def freq_words(file, min=1, num=10):
text = open(file).read()
tokens = nltk.word_tokenize(text)
freqdist = nltk.FreqDist(t for t in tokens if len(t) >= min)
return list(freqdist.keys())[:num]
fw = freq_words('document.txt', 4, 10)
print(fw)
fw = freq_words('document.txt', min=4, num=10)
print(fw)
fw = freq_words('document.txt', num=10, min=4)
print(fw)
def freq_words2(file, min=1, num=10, trace=False):
freqdist = nltk.FreqDist()
if trace: print("Opening", file)
text = open(file).read()
if trace: print("Read in %d characters" % len(file))
for word in nltk.word_tokenize(text):
if len(word) >= min:
freqdist[word] += 1
            if trace and freqdist.N() % 100 == 0: print(".", end="")
if trace: print("")
return list(freqdist.keys())[:num]
fw = freq_words2("document.txt", min=1, num=10, trace=True)
def factorial1(n):
result = 1
for i in range(n):
result *= (i + 1)
return result
def factorial2(n):
if n == 1:
return 1
else:
return n * factorial2(n-1)
def size1(s):
return 1 + sum(size1(child) for child in s.hyponyms())
def size2(s):
layer = [s]
total = 0
while layer:
total += len(layer)
layer = [h for c in layer for h in c.hyponyms()]
return total
from nltk.corpus import wordnet as wn
dog = wn.synset('dog.n.01')
res = size1(dog)
print(res)
res= size2(dog)
print(res)
def insert(trie, key, value):
if key:
first, rest = key[0], key[1:]
if first not in trie:
trie[first] = {}
insert(trie[first], rest, value)
else:
trie['value'] = value
import nltk
import pprint
trie = nltk.defaultdict(dict)
insert(trie, 'chat', 'cat')
insert(trie, 'chien', 'dog')
insert(trie, 'chair', 'flesh')
insert(trie, 'chic', 'stylish')
trie = dict(trie)
res = trie['c']['h']['a']['t']['value']
print(res)
pprint.pprint(trie)
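# A companion lookup (my own sketch; the 'value' key matches insert() above)
# that walks the nested dictionaries and returns None for unknown words.
def lookup(trie, key):
    for char in key:
        if char not in trie:
            return None
        trie = trie[char]
    return trie.get('value')
print(lookup(trie, 'chien'))   # 'dog'
print(lookup(trie, 'chou'))    # None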
import re
def raw(file):
contents = open(file).read()
contents = re.sub(r'<.*?>', ' ', contents)
    contents = re.sub(r'\s+', ' ', contents)
return contents
def snippet(doc, term):
text = ' '*30 + raw(doc) + ' ' * 30
pos = text.index(term)
return text[pos - 30:pos + 30]
print("Building Index...")
files = nltk.corpus.movie_reviews.abspaths()
idx = nltk.Index((w, f) for f in files for w in raw(f).split())
query = ''
while query != "quit":
query = input("query> ")
if query in idx:
for doc in idx[query]:
print(snippet(doc, query))
else:
print("Not found")
def preprocess(tagged_corpus):
words = set()
tags = set()
for sent in tagged_corpus:
for word, tag in sent:
words.add(word)
tags.add(tag)
wm = dict((w, i) for (i, w) in enumerate(words))
tm = dict((t, i) for (i, t) in enumerate(tags))
return [[(wm[w], tm[t]) for (w, t) in sent] for sent in tagged_corpus]
from timeit import Timer
vocab_size = 100000
setup_list = "import random; vocab = list(range(%d))" % vocab_size
setup_set = "import random; vocab = set(range(%d))" % vocab_size
# Sample beyond the vocabulary so roughly half the lookups miss; note the
# parentheses, otherwise %-formatting binds before the multiplication.
statement = "random.randint(0, %d) in vocab" % (vocab_size * 2)
print(Timer(statement, setup_list).timeit(1000))
print(Timer(statement, setup_set).timeit(1000))
def virahanka1(n):
if n == 0:
return [""]
elif n == 1:
return ["S"]
else:
s = ["S" + prosody for prosody in virahanka1(n-1)]
l = ["L" + prosody for prosody in virahanka1(n-2)]
return s + l
def virahanka2(n):
lookup = [[""], ["S"]]
for i in range(n - 1):
s = ["S" + prosody for prosody in lookup[i + 1]]
l = ["L" + prosody for prosody in lookup[i]]
lookup.append(s + l)
return lookup[n]
def virahanka3(n, lookup={0:[""], 1:["S"]}):
if n not in lookup:
s = ["S" + prosody for prosody in virahanka3(n-1)]
l = ["L" + prosody for prosody in virahanka3(n-2)]
lookup[n] = s + l
return lookup[n]
from nltk import memoize
@memoize
def virahanka4(n):
if n == 0:
return [""]
elif n == 1:
return ["S"]
else:
s = ["S" + prosody for prosody in virahanka4(n-1)]
l = ["L" + prosody for prosody in virahanka4(n-2)]
return s + l
res = virahanka1(4)
print(res)
res = virahanka2(4)
print(res)
res = virahanka3(4)
print(res)
res = virahanka4(4)
print(res)
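# Sanity check (my addition): the number of patterns for n beats satisfies
# V(n) = V(n-1) + V(n-2), the Virahanka-Fibonacci sequence 1, 1, 2, 3, 5, 8, ...
counts = [len(virahanka1(n)) for n in range(8)]
print(counts)
assert all(counts[n] == counts[n-1] + counts[n-2] for n in range(2, 8))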
colors = 'rgbcmyk'
def bar_chart(categories, words, counts):
"Plot a bar chart showing counts for each word by category"
import pylab
ind = pylab.arange(len(words))
width = 1 / (len(categories) + 1)
bar_groups = []
for c in range(len(categories)):
bars = pylab.bar(ind+c*width, counts[categories[c]], width, color=colors[c % len(colors)])
bar_groups.append(bars)
pylab.xticks(ind+width, words)
pylab.legend([b[0] for b in bar_groups], categories, loc='upper left')
pylab.ylabel('Frequency')
pylab.title('Frequency of Six Modal Verbs by Genre')
pylab.show()
genres = ['news', 'religion', 'hobbies', 'government', 'adventure']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
import nltk
cfdist = nltk.ConditionalFreqDist(
(genre, word)
for genre in genres
    for word in nltk.corpus.brown.words(categories=genre)
if word in modals)
counts = {}
for genre in genres:
counts[genre] = [cfdist[genre][word] for word in modals]
bar_chart(genres, modals, counts)
import pylab
import matplotlib.pyplot
matplotlib.pyplot.switch_backend('Agg')
pylab.savefig('modals.png')
print('Content-Type: text/html')
print("")
print('<html><body>')
print('<img src="modals.png"/>')
print('</body></html>')
"""
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib
from nltk.corpus import wordnet as wn
def traverse(graph, start, node):
graph.depth[node.name] = node.shortest_path_distance(start)
for child in node.hyponyms():
graph.add_edge(node.name, child.name)
traverse(graph, start, child)
def hyponym_graph(start):
G = nx.Graph()
G.depth = {}
traverse(G, start, start)
return G
# On 64-bit Windows 10, pygraphviz fails to install under Python 3.5 and 3.6
def graph_draw(graph):
    # AttributeError: module 'networkx.drawing' has no attribute 'graphviz_layout'
    # If you hit the error above, you need to install graphviz:
    # http://blog.csdn.net/sinat_29508201/article/details/51887446
    # http://www.graphviz.org/Download_windows.php
    # http://www.graphviz.org/pub/graphviz/stable/windows/graphviz-2.38.msi
    # After installing graphviz-2.38.msi, make sure its path is configured correctly
    # pip3 install pygraphviz
nx.draw(graph, pos=graphviz_layout(graph),
node_size = [16*graph.degree(n) for n in graph],
cmap=matplotlib.pyplot.cm.Blues,
node_color = [graph.depth[n] for n in graph],
prog = 'dot')
matplotlib.pyplot.show()
dog = wn.synset('dog.n.01')
graph = hyponym_graph(dog)
graph_draw(graph)
"""
import csv
input_file = open("lexicon.csv", "r")
for row in csv.reader(input_file):
print(row)
from numpy import array
cube = array([[[0, 0, 0], [1, 1, 1], [2, 2, 2]],
              [[3, 3, 3], [4, 4, 4], [5, 5, 5]],
              [[6, 6, 6], [7, 7, 7], [8, 8, 8]]])
print(cube[1, 1, 1])
print(cube[2].transpose())
print(cube[2, 1:])
from numpy import linalg
a = array([[4, 0], [3, -5]])
u, s, vt = linalg.svd(a)
print(u)
print(s)
print(vt)
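# A quick sanity check (my addition): multiplying the SVD factors back
# together should reproduce the original matrix, up to floating-point rounding.
import numpy as np
print(np.allclose(u @ np.diag(s) @ vt, a))   # True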