Writing Structured Programs
foo = 'Monty'
bar = foo
foo = 'Python'
print(bar)
foo = ['Monty', 'Python']
bar = foo
foo[1] = 'Bodkin'
print(bar)
empty = []
nested = [empty, empty, empty]
print(nested)
nested[1].append('Python')
print(nested)
nested = [[]] * 3
nested[1].append('Python')
nested[1] = ['Monty']
print(nested)
size = 5
python = ['Python']
snake_nest = [python] * size
print(snake_nest)
print(snake_nest[0] == snake_nest[1] == snake_nest[2] == snake_nest[3] == snake_nest[4])
print(snake_nest[0] is snake_nest[1] is snake_nest[2] is snake_nest[3] is snake_nest[4])
print([id(snake) for snake in snake_nest])
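# A minimal sketch (my addition, standard library only) of building genuinely
# independent inner lists: each comprehension iteration creates a fresh list,
# and copy.deepcopy copies an entire nested structure wholesale.
import copy
independent_nest = [['Python'] for _ in range(size)]
independent_nest[0].append('Monty')
print(independent_nest[0] is independent_nest[1])   # False: no shared objects
deep = copy.deepcopy(independent_nest)
deep[0].append('!')
print(independent_nest[0])   # unchanged: ['Python', 'Monty']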
mixed = ['cat', '', ['dog'], []]
for element in mixed:
if element:
print(element)
sent = ['No', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '.']
print(all(len(w) > 4 for w in sent))
print(any(len(w) > 4 for w in sent))
t = 'walk', 'fem', 3
print(t)
print(t[0])
print(t[1:])
print(len(t))
raw = 'I turned off the spectroroute'
text = ['I', 'turned', 'off', 'the', 'spectroroute']
pair = (6, 'turned')
print(raw[2], text[3], pair[1])
print(raw[-3:], text[-3:], pair[-3:])
print(len(raw), len(text), len(pair))
import nltk
raw = 'Red lorry, yellow lorry, red lorry, yellow lorry.'
text = nltk.word_tokenize(raw)
fdist = nltk.FreqDist(text)
print(list(fdist))
for key in fdist:
print(fdist[key], end=' ')
print("")
words = ['I', 'turned', 'off', 'the', 'spectroroute']
words[2], words[3], words[4] = words[3], words[4], words[2]
print(words)
tmp = words[2]
words[2] = words[3]
words[3] = words[4]
words[4] = tmp
print(words)
words = ['I', 'turned', 'off', 'the', 'spectroroute']
tags = ['noun', 'verb', 'prep', 'det', 'noun']
res = zip(words, tags)
print(list(res))
print(list(enumerate(words)))
text = nltk.corpus.nps_chat.words()
cut = int(0.9 * len(text))
training_data, test_data = text[:cut], text[cut:]
print(text == training_data + test_data)
print(len(training_data)/len(test_data))
words = 'I turned off the spectroroute'.split()
wordlens = [(len(word), word) for word in words]
wordlens.sort()
print(' '.join(w for (_, w) in wordlens))
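# A simpler equivalent (my sketch) of the (len(word), word) decorate-sort idiom
# above: pass a key function instead. sorted() is stable, so equal-length words
# keep their original order, which matches the result for this sentence.
print(' '.join(sorted(words, key=len)))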
lexicon = [
('the', 'det', ['Di:', 'D@']),
('off', 'prep', ['Qf', 'O:f'])
]
lexicon.sort()
lexicon[1] = ('turned', 'VBD', ['t3:nd', 't3`nd'])
del lexicon[0]
print(lexicon)
text = """\"When I use a word, " Humpty Dumpty said in rather a scornful tone,
\"it means just what I choose it to mean - neither more nor less.\""""
res = [w.lower() for w in nltk.word_tokenize(text)]
print(res)
res = max([w.lower() for w in nltk.word_tokenize(text)])
print(res)
res = max(w.lower() for w in nltk.word_tokenize(text))
print(res)
import re
import nltk
from nltk.corpus import brown
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cv_word_pairs = [(cv, w) for w in rotokas_words
                 for cv in re.findall(r'[ptksvr][aeiou]', w)]
cfd = nltk.ConditionalFreqDist(
(genre, word)
for genre in brown.categories()
for word in brown.words(categories=genre)
)
ha_words = ['aaahhhh', 'ah', 'ahah', 'ahahah', 'ahh', 'ahhahahaha',
'ahhh', 'ahhhh', 'ahhhhhh', 'ahhhhhhhhhhhhhh', 'ha',
'haaa', 'hah', 'haha', 'hahaaaa', 'hahah', 'hahaha']
tokens = nltk.corpus.brown.words(categories='news')
count = 0
total = 0
for token in tokens:
count += 1
total += len(token)
if count == 0:
count = 1
print(total / count)
total = sum(len(t) for t in tokens)
print(total / len(tokens))
tokens = tokens[:100]
word_list = []
len_word_list = len(word_list)
i = 0
while i < len(tokens):
    j = 0
    while j < len_word_list and word_list[j] < tokens[i]:
        j += 1
    # Insert if we ran off the end or the token is not already present;
    # this also avoids inserting a duplicate when tokens[i] == word_list[0].
    if j == len_word_list or tokens[i] != word_list[j]:
        word_list.insert(j, tokens[i])
        len_word_list += 1
    i += 1
print(word_list)
word_list = sorted(set(tokens))
print(word_list)
fd = nltk.FreqDist(nltk.corpus.brown.words())
cumulative = 0.0
# Iterate in decreasing frequency order; plain iteration over a FreqDist
# follows insertion order in current NLTK, so use most_common() here.
for rank, (word, count) in enumerate(fd.most_common()):
    cumulative += count * 100 / fd.N()
    print("%3d %6.2f%% %s" % (rank + 1, cumulative, word))
    if cumulative > 25:
        break
text = nltk.corpus.gutenberg.words('milton-paradise.txt')
longest = ''
for word in text:
if len(word) > len(longest):
longest = word
print(longest)
maxlen = max(len(word) for word in text)
res = [word for word in text if len(word) == maxlen]
print(res)
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
n = 3
res = [sent[i:i+n] for i in range(len(sent) - n + 1)]
print(res)
import pprint
m, n = 3, 7
array = [[set() for i in range(n)] for j in range(m)]
array[2][5].add('Alice')
pprint.pprint(array)
array = [[set()] * n] * m
array[2][5].add(7)
pprint.pprint(array)
import re
def get_text(file):
    """Read text from a file, normalizing whitespace and stripping HTML markup."""
    text = open(file).read()
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text
help(get_text)
res = get_text("document.txt")
print(res)
def repeat(msg, num):
return ' '.join([msg]*num)
monty = 'Monty Python'
res = repeat(monty, 3)
print(res)
def monty():
return "Monty Python"
res = monty()
print(res)
res = repeat(monty(), 3)
print(res)
res = repeat("Monty Python", 3)
print(res)
def my_sort1(mylist):
mylist.sort()
def my_sort2(mylist):
return sorted(mylist)
def my_sort3(mylist):
mylist.sort()
return mylist
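# A quick illustrative check (my addition) of what each variant means for the
# caller: my_sort1 mutates its argument and returns None; my_sort2 returns a
# new list and leaves the argument alone; my_sort3 mutates and returns it.
data = ['b', 'c', 'a']
print(my_sort1(data), data)   # None ['a', 'b', 'c']
data = ['b', 'c', 'a']
print(my_sort2(data), data)   # ['a', 'b', 'c'] ['b', 'c', 'a']
data = ['b', 'c', 'a']
print(my_sort3(data), data)   # ['a', 'b', 'c'] ['a', 'b', 'c']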
def set_up(word, properties):
word = 'lolcat'
properties.append('noun')
properties = 5
w = ''
p = []
set_up(w, p)
print("w=", w)
print("p=", p)
w = 'a'
word = w
word = 'lolcat'
print(w)
p = []
properties = p
properties.append('noun')
properties = 5
print(p)
def tag(word):
if word in ['a', 'the', 'all']:
return 'det'
else:
return 'noun'
res = tag('the')
print(res)
res = tag('knight')
print(res)
res = tag(["'Tis", 'but', 'a', 'scratch'])
print(res)
def tag(word):
    assert isinstance(word, str), "argument to tag() must be a string"
if word in ['a', 'the', 'all']:
return 'det'
else:
return 'noun'
res = tag('a')
print(res)
import nltk
import bs4
import lxml
from urllib.request import urlopen
def freq_words(url, freqdist, n):
html = urlopen(url).read()
text = bs4.BeautifulSoup(html, "lxml")
text = text.get_text()
for word in nltk.word_tokenize(text):
freqdist[word.lower()] += 1
print(list(freqdist.keys())[:n])
constitution = "http://www.archives.gov/founding-docs"
fd = nltk.FreqDist()
freq_words(constitution, fd, 20)
def freq_words(url):
freqdist = nltk.FreqDist()
html = urlopen(url).read()
text = bs4.BeautifulSoup(html, "lxml")
text = text.get_text()
for word in nltk.word_tokenize(text):
freqdist[word.lower()] += 1
return freqdist
fd = freq_words(constitution)
print(list(fd.keys())[:20])
def accuracy(reference, test):
"""
Calculate the fraction of test items that equal the corresponding reference items.
Given a list of reference values and a corresponding list of test values,
return the fraction of corresponding values that are equal.
    In particular, return the fraction of indexes
    {i | 0 <= i < len(test)} such that C{test[i] == reference[i]}.
    >>> accuracy(['ADJ', 'N', 'V', 'N'], ['N', 'N', 'V', 'ADJ'])
    0.5
    @param reference: An ordered list of reference values.
    @type reference: C{list}
    @param test: A list of values to compare against the corresponding reference values.
    @type test: C{list}
    @rtype: C{float}
    @raise ValueError: If C{reference} and C{test} do not have the same length.
"""
if len(reference) != len(test):
raise ValueError("Lists must have the same length.")
num_correct = 0
for x, y in zip(reference, test):
if x == y:
num_correct += 1
return float(num_correct) / len(reference)
res = accuracy(['ADJ', 'N', 'V', 'N'], ['N', 'N', 'V', 'ADJ'])
print(res)
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the', 'sounds', 'will',
'take', 'care', 'of', 'themselves', '.']
def extract_property(prop):
return [prop(word) for word in sent]
res = extract_property(len)
print(res)
def last_letter(word):
return word[-1]
res = extract_property(last_letter)
print(res)
res = extract_property(lambda w: w[-1])
print(res)
import operator
res = sorted(sent)
print(res)
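# The operator module imported above offers named equivalents of simple
# lambdas; itemgetter(-1) mirrors last_letter (a sketch, not from the book).
print(sorted(sent, key=operator.itemgetter(-1)))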
def search1(substring, words):
result = []
for word in words:
if substring in word:
result.append(word)
return result
def search2(substring, words):
for word in words:
if substring in word:
yield word
print("search1:")
for item in search1('zz', nltk.corpus.brown.words()):
print(item)
print("search2:")
for item in search2('zz', nltk.corpus.brown.words()):
print(item)
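# Because search2 is a generator, it produces matches on demand; a sketch (my
# addition) using itertools.islice to stop after the first three hits without
# scanning the rest of the corpus.
from itertools import islice
for item in islice(search2('zz', nltk.corpus.brown.words()), 3):
    print(item)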
def permutations(seq):
if len(seq) <= 1:
yield seq
else:
for perm in permutations(seq[1:]):
for i in range(len(perm) + 1):
yield perm[:i] + seq[0:1] + seq[i:]
res = list(permutations(['police', 'fish', 'buffalo']))
print(res)
def is_content_word(word):
return word.lower() not in ['a', 'of', 'the', 'and', 'will', ',', '.']
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the', 'sounds', 'will', 'take', 'care', 'of',
'themselves', '.']
res = filter(is_content_word, sent)
print(list(res))
res = [w for w in sent if is_content_word(w)]
print(res)
lengths = map(len, nltk.corpus.brown.sents(categories='news'))
lengths = list(lengths)
print(sum(lengths)/len(lengths))
lengths = [len(sent) for sent in nltk.corpus.brown.sents(categories='news')]
print(sum(lengths)/len(lengths))
res = map(lambda w: len(list(filter(lambda c: c.lower() in "aeiou", w))), sent)
print(list(res))
res = [len([c for c in w if c.lower() in "aeiou"]) for w in sent]
print(res)
def repeat(msg='<empty>', num=1):
return msg*num
res = repeat(num=3)
print(res)
res = repeat(msg='Alice')
print(res)
res = repeat(num=5, msg='Alice')
print(res)
def generic(*args, **kwargs):
print(args)
print(kwargs)
generic(1, "African swallow", monty="python")
song = [['four', 'calling', 'birds'],
['three', 'French', 'hens'],
        ['two', 'turtle', 'doves']]
res = zip(song[0], song[1], song[2])
print(list(res))
res = zip(*song)
print(list(res))
def freq_words(file, min=1, num=10):
text = open(file).read()
tokens = nltk.word_tokenize(text)
freqdist = nltk.FreqDist(t for t in tokens if len(t) >= min)
return list(freqdist.keys())[:num]
fw = freq_words('document.txt', 4, 10)
print(fw)
fw = freq_words('document.txt', min=4, num=10)
print(fw)
fw = freq_words('document.txt', num=10, min=4)
print(fw)
def freq_words2(file, min=1, num=10, trace=False):
freqdist = nltk.FreqDist()
if trace: print("Opening", file)
text = open(file).read()
if trace: print("Read in %d characters" % len(file))
for word in nltk.word_tokenize(text):
if len(word) >= min:
freqdist[word] += 1
            if trace and freqdist.N() % 100 == 0: print(".", end="")
if trace: print("")
return list(freqdist.keys())[:num]
fw = freq_words2("document.txt", min=1, num=10, trace=True)
def factorial1(n):
result = 1
for i in range(n):
result *= (i + 1)
return result
def factorial2(n):
if n == 1:
return 1
else:
return n * factorial2(n-1)
def size1(s):
return 1 + sum(size1(child) for child in s.hyponyms())
def size2(s):
layer = [s]
total = 0
while layer:
total += len(layer)
layer = [h for c in layer for h in c.hyponyms()]
return total
from nltk.corpus import wordnet as wn
dog = wn.synset('dog.n.01')
res = size1(dog)
print(res)
res= size2(dog)
print(res)
def insert(trie, key, value):
if key:
first, rest = key[0], key[1:]
if first not in trie:
trie[first] = {}
insert(trie[first], rest, value)
else:
trie['value'] = value
import nltk
import pprint
trie = nltk.defaultdict(dict)
insert(trie, 'chat', 'cat')
insert(trie, 'chien', 'dog')
insert(trie, 'chair', 'flesh')
insert(trie, 'chic', 'stylish')
trie = dict(trie)
res = trie['c']['h']['a']['t']['value']
print(res)
pprint.pprint(trie)
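# A companion lookup (my own sketch; the 'value' key matches insert() above)
# that walks the nested dictionaries and returns None for unknown words.
def lookup(trie, key):
    for char in key:
        if char not in trie:
            return None
        trie = trie[char]
    return trie.get('value')
print(lookup(trie, 'chien'))   # 'dog'
print(lookup(trie, 'chou'))    # None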
import re
def raw(file):
contents = open(file).read()
contents = re.sub(r'<.*?>', ' ', contents)
    contents = re.sub(r'\s+', ' ', contents)
return contents
def snippet(doc, term):
text = ' '*30 + raw(doc) + ' ' * 30
pos = text.index(term)
return text[pos - 30:pos + 30]
print("Building Index...")
files = nltk.corpus.movie_reviews.abspaths()
idx = nltk.Index((w, f) for f in files for w in raw(f).split())
query = ''
while query != "quit":
query = input("query> ")
if query in idx:
for doc in idx[query]:
print(snippet(doc, query))
else:
print("Not found")
def preprocess(tagged_corpus):
words = set()
tags = set()
for sent in tagged_corpus:
for word, tag in sent:
words.add(word)
tags.add(tag)
wm = dict((w, i) for (i, w) in enumerate(words))
tm = dict((t, i) for (i, t) in enumerate(tags))
return [[(wm[w], tm[t]) for (w, t) in sent] for sent in tagged_corpus]
from timeit import Timer
vocab_size = 100000
setup_list = "import random; vocab = list(range(%d))" % vocab_size
setup_set = "import random; vocab = set(range(%d))" % vocab_size
# Sample beyond the vocabulary so roughly half the lookups miss; note the
# parentheses, otherwise %-formatting binds before the multiplication.
statement = "random.randint(0, %d) in vocab" % (vocab_size * 2)
print(Timer(statement, setup_list).timeit(1000))
print(Timer(statement, setup_set).timeit(1000))
def virahanka1(n):
if n == 0:
return [""]
elif n == 1:
return ["S"]
else:
s = ["S" + prosody for prosody in virahanka1(n-1)]
l = ["L" + prosody for prosody in virahanka1(n-2)]
return s + l
def virahanka2(n):
lookup = [[""], ["S"]]
for i in range(n - 1):
s = ["S" + prosody for prosody in lookup[i + 1]]
l = ["L" + prosody for prosody in lookup[i]]
lookup.append(s + l)
return lookup[n]
def virahanka3(n, lookup={0:[""], 1:["S"]}):
if n not in lookup:
s = ["S" + prosody for prosody in virahanka3(n-1)]
l = ["L" + prosody for prosody in virahanka3(n-2)]
lookup[n] = s + l
return lookup[n]
from nltk import memoize
@memoize
def virahanka4(n):
if n == 0:
return [""]
elif n == 1:
return ["S"]
else:
s = ["S" + prosody for prosody in virahanka4(n-1)]
l = ["L" + prosody for prosody in virahanka4(n-2)]
return s + l
res = virahanka1(4)
print(res)
res = virahanka2(4)
print(res)
res = virahanka3(4)
print(res)
res = virahanka4(4)
print(res)
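# Sanity check (my addition): the number of patterns for n beats satisfies
# V(n) = V(n-1) + V(n-2), the Virahanka-Fibonacci sequence 1, 1, 2, 3, 5, 8, ...
counts = [len(virahanka1(n)) for n in range(8)]
print(counts)
assert all(counts[n] == counts[n-1] + counts[n-2] for n in range(2, 8))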
colors = 'rgbcmyk'
def bar_chart(categories, words, counts):
"Plot a bar chart showing counts for each word by category"
import pylab
ind = pylab.arange(len(words))
width = 1 / (len(categories) + 1)
bar_groups = []
for c in range(len(categories)):
bars = pylab.bar(ind+c*width, counts[categories[c]], width, color=colors[c % len(colors)])
bar_groups.append(bars)
pylab.xticks(ind+width, words)
pylab.legend([b[0] for b in bar_groups], categories, loc='upper left')
pylab.ylabel('Frequency')
pylab.title('Frequency of Six Modal Verbs by Genre')
pylab.show()
genres = ['news', 'religion', 'hobbies', 'government', 'adventure']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
import nltk
cfdist = nltk.ConditionalFreqDist(
(genre, word)
for genre in genres
    for word in nltk.corpus.brown.words(categories=genre)
if word in modals)
counts = {}
for genre in genres:
counts[genre] = [cfdist[genre][word] for word in modals]
bar_chart(genres, modals, counts)
import pylab
import matplotlib.pyplot
matplotlib.pyplot.switch_backend('Agg')
pylab.savefig('modals.png')
print('Content-Type: text/html')
print("")
print('<html><body>')
print('<img src="modals.png"/>')
print('</body></html>')
"""
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib
from nltk.corpus import wordnet as wn
def traverse(graph, start, node):
graph.depth[node.name] = node.shortest_path_distance(start)
for child in node.hyponyms():
graph.add_edge(node.name, child.name)
traverse(graph, start, child)
def hyponym_graph(start):
G = nx.Graph()
G.depth = {}
traverse(G, start, start)
return G
# On 64-bit Windows 10, pygraphviz fails to install under Python 3.5 and 3.6
def graph_draw(graph):
    # AttributeError: module 'networkx.drawing' has no attribute 'graphviz_layout'
    # If you hit the error above, you need to install graphviz:
    # http://blog.csdn.net/sinat_29508201/article/details/51887446
    # http://www.graphviz.org/Download_windows.php
    # http://www.graphviz.org/pub/graphviz/stable/windows/graphviz-2.38.msi
    # After installing graphviz-2.38.msi, make sure its path is configured correctly
    # pip3 install pygraphviz
nx.draw(graph, pos=graphviz_layout(graph),
node_size = [16*graph.degree(n) for n in graph],
cmap=matplotlib.pyplot.cm.Blues,
node_color = [graph.depth[n] for n in graph],
prog = 'dot')
matplotlib.pyplot.show()
dog = wn.synset('dog.n.01')
graph = hyponym_graph(dog)
graph_draw(graph)
"""
import csv
input_file = open("lexicon.csv", "r")
for row in csv.reader(input_file):
print(row)
from numpy import array
cube = array([[[0, 0, 0], [1, 1, 1], [2, 2, 2]],
              [[3, 3, 3], [4, 4, 4], [5, 5, 5]],
              [[6, 6, 6], [7, 7, 7], [8, 8, 8]]])
print(cube[1, 1, 1])
print(cube[2].transpose())
print(cube[2, 1:])
from numpy import linalg
a = array([[4, 0], [3, -5]])
u, s, vt = linalg.svd(a)
print(u)
print(s)
print(vt)
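# A quick sanity check (my addition): multiplying the SVD factors back
# together should reproduce the original matrix, up to floating-point rounding.
import numpy as np
print(np.allclose(u @ np.diag(s) @ vt, a))   # True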