1. 练习:数据结构
2. 练习:Ben Bitdiddle
3. 练习:网络运行
4. 练习:更好地分词
题目:
# 1 Gold Star
# The built-in <string>.split() procedure works
# okay, but fails to find all the words on a page
# because it only uses whitespace to split the
# string. To do better, we should also use punctuation
# marks to split the page into words.
# Define a procedure, split_string, that takes two
# inputs: the string to split and a string containing
# all of the characters considered separators. The
# procedure should return a list of strings that break
# the source string up by the characters in the
# splitlist.
def split_string(source,splitlist):
#out = split_string("This is a test-of the,string separation-code!"," ,!-")
#print out
#>>> ['This', 'is', 'a', 'test', 'of', 'the', 'string', 'separation', 'code']
#out = split_string("After the flood ... all the colors came out.", " .")
#print out
#>>> ['After', 'the', 'flood', 'all', 'the', 'colors', 'came', 'out']
#out = split_string("First Name,Last Name,Street Address,City,State,Zip Code",",")
#print out
#>>>['First Name', 'Last Name', 'Street Address', 'City', 'State', 'Zip Code']
(我没想出来怎么写代码实现……
一点我自己的思考:
我一开始想到的是,仍然复用 split()
函数,但是似乎只能基于 splitlist
里面的单个字符,对 source
进行分割:
然后就有了如下的我自己的代码:
def split_string(source, splitlist):
return source.split(splitlist[0])
或者:
def split_string(source, splitlist):
for char in splitlist:
source.split(char)
...
...
)
答案以及一点我自己的注释(先声明下,有点绕):
def split_string(source, splitlist):
    """Split `source` into words, treating every character that appears in
    `splitlist` as a separator.

    Runs of consecutive separators produce no empty pieces; the result is a
    list of the non-empty fragments, in order.
    """
    words = []
    current = ""  # characters of the word currently being assembled
    for ch in source:
        if ch in splitlist:
            # Hit a separator: close off the word in progress, if any.
            if current:
                words.append(current)
                current = ""
        else:
            current += ch
    # Flush the final word when the string does not end on a separator.
    if current:
        words.append(current)
    return words
)
5. 练习:改进索引
题目:
(对于这样的情况: keyword
相同,但是 url
有多个的情况,即多次调用 add_to_index
函数,会造成 index
中有重复的 url
,因此需要修改 add_to_index
。)
# The current index includes a url in the list of urls
# for a keyword multiple times if the keyword appears
# on that page more than once.
# It might be better to only include the same url
# once in the url list for a keyword, even if it appears
# many times.
# Modify add_to_index so that a given url is only
# included once in the url list for a keyword,
# no matter how many times that keyword appears.
def add_to_index(index, keyword, url):
    # Pre-exercise version, kept unchanged on purpose: it appends `url`
    # unconditionally, so the same url can appear many times under one
    # keyword — exactly the behavior this exercise asks you to change.
    # Index shape: [[keyword, [url, url, ...]], ...]
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    # not found, add new keyword to index
    index.append([keyword, [url]])
def get_page(url):
    """Simulated page fetch: return the cached HTML for one of four known
    test urls, or the empty string for anything else."""
    pages = {
        "http://www.udacity.com/cs101x/index.html":
            '''<html> <body> This is a test page for learning to crawl!
<p> It is a good idea to
<a href="http://www.udacity.com/cs101x/crawling.html">
learn to crawl</a> before you try to
<a href="http://www.udacity.com/cs101x/walking.html">walk</a> or
<a href="http://www.udacity.com/cs101x/flying.html">fly</a>.</p></body>
</html>''',
        "http://www.udacity.com/cs101x/crawling.html":
            '''<html> <body> I have not learned to crawl yet, but I am
quite good at <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.
</body> </html>''',
        "http://www.udacity.com/cs101x/walking.html":
            '''<html> <body> I cant get enough
<a href="http://www.udacity.com/cs101x/index.html">crawling</a></body></html>''',
        "http://www.udacity.com/cs101x/flying.html":
            '''<html>
<body>The magic words are Squeamish Ossifrage!</body></html>''',
    }
    return pages.get(url, "")
def union(a, b):
    """Append to list `a` (in place) every element of `b` that `a` does
    not already contain; order of `b` is preserved. Returns None."""
    for item in b:
        if item in a:
            continue  # already present, skip
        a.append(item)
def get_next_target(page):
    """Locate the first '<a href=' link in `page`.

    Returns (url, index_of_closing_quote); when no link exists, returns
    (None, 0) so callers can stop scanning.
    """
    anchor = page.find('<a href=')
    if anchor == -1:
        return None, 0
    open_quote = page.find('"', anchor)
    close_quote = page.find('"', open_quote + 1)
    return page[open_quote + 1:close_quote], close_quote
def get_all_links(page):
    """Collect every href url in `page`, in order of appearance."""
    found = []
    url, pos = get_next_target(page)
    while url:
        found.append(url)
        # Drop everything up to the closing quote and keep scanning.
        page = page[pos:]
        url, pos = get_next_target(page)
    return found
def crawl_web(seed):
    """Crawl every page reachable from `seed` and return the built index."""
    tocrawl = [seed]   # pages still to visit (used as a stack)
    crawled = []       # pages already visited
    index = []
    while tocrawl:
        url = tocrawl.pop()
        if url in crawled:
            continue  # already indexed this page
        content = get_page(url)
        add_page_to_index(index, url, content)
        union(tocrawl, get_all_links(content))
        crawled.append(url)
    return index
def add_page_to_index(index, url, content):
    """Index `url` under every whitespace-separated word of `content`."""
    for word in content.split():
        add_to_index(index, word, url)
def lookup(index, keyword):
    """Return the url list stored for `keyword`, or None when absent."""
    for key, urls in index:
        if key == keyword:
            return urls
    return None
#index = crawl_web("http://www.udacity.com/cs101x/index.html")
#print lookup(index,"is")
#>>> ['http://www.udacity.com/cs101x/index.html']
(我的答案:
def add_to_index(index, keyword, url):
    # My attempt, kept unchanged: it passes the single commented test case
    # but fails the full grader — see the bug note below.
    for entry in index:
        """
        if url in entry[1]:
            continue
        """
        # BUG: when the keyword matches but `url` is already in entry[1],
        # this condition is False, no `return` happens, the loop falls
        # through, and the append below creates a DUPLICATE
        # [keyword, [url]] entry. The keyword match and the duplicate-url
        # check must be separate tests (as in the video answer).
        if entry[0] == keyword and url not in entry[1]:
            entry[1].append(url)
            return
    # not found, add new keyword to index
    index.append([keyword, [url]])
对于注释中提供的 1 个测试用例,可以测试通过,但是无法通过网页上所有的测试用例,会报错。
)
视频中的答案:
def add_to_index(index, keyword, url):
    """Record `url` under `keyword`, skipping urls already present.

    Index shape: [[keyword, [url, ...]], ...]
    """
    for entry in index:
        if entry[0] != keyword:
            continue
        if url not in entry[1]:
            entry[1].append(url)
        return
    # keyword not seen before: start a new entry
    index.append([keyword, [url]])
6. 练习:统计点击数
题目:
# 2 Gold Stars
# One way search engines rank pages
# is to count the number of times a
# searcher clicks on a returned link.
# This indicates that the person doing
# the query thought this was a useful
# link for the query, so it should be
# higher in the rankings next time.
# (In Unit 6, we will look at a different
# way of ranking pages that does not depend
# on user clicks.)
# Modify the index such that for each url in a
# list for a keyword, there is also a number
# that counts the number of times a user
# clicks on that link for this keyword.
# The result of lookup(index,keyword) should
# now be a list of url entries, where each url
# entry is a list of a url and a number
# indicating the number of times that url
# was clicked for this query keyword.
# You should define a new procedure to simulate
# user clicks for a given link:
# record_user_click(index,word,url)
# that modifies the entry in the index for
# the input word by increasing the count associated
# with the url by 1.
# You also will have to modify add_to_index
# in order to correctly create the new data
# structure, and to prevent the repetition of
# entries as in homework 4-5.
def record_user_click(index,keyword,url):
def add_to_index(index, keyword, url):
    # Starting template for exercise 6, kept unchanged on purpose: it still
    # uses the flat [keyword, [url, ...]] structure and appends duplicates.
    # The exercise asks you to rewrite it for the [[url, count], ...] shape.
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    # not found, add new keyword to index
    index.append([keyword, [url]])
def get_page(url):
    """Simulated page fetch for exercise 6: return the cached HTML for one
    of four known test urls, or the empty string for anything else."""
    pages = {
        "http://www.udacity.com/cs101x/index.html":
            '''<html> <body> This is a test page for learning to crawl!
<p> It is a good idea to
<a href="http://www.udacity.com/cs101x/crawling.html">
learn to crawl</a> before you try to
<a href="http://www.udacity.com/cs101x/walking.html">walk</a> or
<a href="http://www.udacity.com/cs101x/flying.html">fly</a>.</p></body></html>''',
        "http://www.udacity.com/cs101x/crawling.html":
            '''<html> <body> I have not learned to crawl yet, but I am
quite good at <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.
</body> </html>''',
        "http://www.udacity.com/cs101x/walking.html":
            '''<html> <body> I cant get enough
<a href="http://www.udacity.com/cs101x/index.html">crawling</a>!</body></html>''',
        "http://www.udacity.com/cs101x/flying.html":
            '<html><body>The magic words are Squeamish Ossifrage!</body></html>',
    }
    return pages.get(url, "")
def union(a, b):
    """In-place union: extend list `a` with each element of `b` that is
    not already present, preserving the order of `b`."""
    for candidate in b:
        if candidate in a:
            continue
        a.append(candidate)
def get_next_target(page):
    """Return (url, end_index) for the first '<a href=' link in `page`,
    where end_index is the position of the closing quote; (None, 0) when
    there is no link left."""
    link_at = page.find('<a href=')
    if link_at < 0:  # str.find returns -1 when absent
        return None, 0
    q1 = page.find('"', link_at)
    q2 = page.find('"', q1 + 1)
    return page[q1 + 1:q2], q2
def get_all_links(page):
    """Return every linked url in `page`, in document order."""
    links = []
    while True:
        url, end = get_next_target(page)
        if not url:
            return links
        links.append(url)
        page = page[end:]  # continue scanning after the closing quote
def crawl_web(seed):
    """Visit every page reachable from `seed` exactly once and return the
    index built from their contents."""
    index = []
    visited = []      # urls already indexed
    frontier = [seed] # urls still to fetch (stack)
    while frontier:
        page = frontier.pop()
        if page in visited:
            continue
        content = get_page(page)
        add_page_to_index(index, page, content)
        union(frontier, get_all_links(content))
        visited.append(page)
    return index
def add_page_to_index(index, url, content):
    """Add `url` to the index entry of each word appearing in `content`."""
    for token in content.split():
        add_to_index(index, token, url)
def lookup(index, keyword):
    """Return the value stored for `keyword` in `index`, or None.

    With the exercise-6 structure the value is a list of [url, count]
    pairs; with the older structure it is a plain url list.
    """
    matches = [entry[1] for entry in index if entry[0] == keyword]
    return matches[0] if matches else None
#Here is an example showing a sequence of interactions:
# NOTE: these lines execute at import time and use Python 2 `print`
# statement syntax; they rely on the exercise-6 definitions above
# (crawl_web, lookup, record_user_click).
index = crawl_web('http://www.udacity.com/cs101x/index.html')
print lookup(index, 'good')
#>>> [['http://www.udacity.com/cs101x/index.html', 0],
#>>> ['http://www.udacity.com/cs101x/crawling.html', 0]]
record_user_click(index, 'good', 'http://www.udacity.com/cs101x/crawling.html')
print lookup(index, 'good')
#>>> [['http://www.udacity.com/cs101x/index.html', 0],
#>>> ['http://www.udacity.com/cs101x/crawling.html', 1]]
我的答案(并不准确,也不完整):
def record_user_click(index,keyword,url):
    # My attempt, kept unchanged (the notes above admit it is not correct).
    for entry in index:
        if entry[0] == keyword:
            # BUG: with the new structure entry[1] holds [url, count]
            # pairs, so `url in entry[1]` compares a string against lists
            # and never matches; and entry[1][-1] addresses the LAST
            # element regardless of which url was clicked.
            if url in entry[1]:
                entry[1][-1] += 1
                return entry[1][-1]
def add_to_index(index, keyword, url):
    # My attempt, kept unchanged (acknowledged above as inaccurate and
    # incomplete).
    for entry in index:
        if entry[0] == keyword:
            # BUG: the structure is inconsistent — the append at the bottom
            # creates a FLAT [keyword, [url, 0]] entry, while this branch
            # appends NESTED [url, 0] pairs into the same list; and when a
            # bare url string is found, `entry[1][-1] += 1` may try to add
            # 1 to a list, which raises TypeError.
            if url not in entry[1]:
                entry[1].append([url, 0])
            else:
                entry[1][-1] += 1
            return
    # not found, add new keyword to index
    index.append([keyword, [url, 0]])
视频中的答案:
(简单地谈下,解决这个问题,我认为,最需要了解的是,原来的 index
的结构:
[ [key1, [url1_1, url1_2, ...]],
[key2, [url2_1, url2_2, ...]],
...
]
以及加入了对 url
的统计的 index
的结构:
[ [key1, [[url1_1, count1_1], [url1_2, count1_2], ...]],
[key2, [[url2_1, count2_1], [url2_2, count2_2], ...]],
...
]
)
def record_user_click(index, keyword, url):
    """Increment the click count stored with `url` under `keyword`.

    Does nothing when the keyword (or url) is not in the index.
    """
    url_entries = lookup(index, keyword)
    if not url_entries:
        return
    for pair in url_entries:  # each pair is [url, count]
        if pair[0] == url:
            pair[1] += 1
def add_to_index(index, keyword, url):
    """Add `url` (with click count 0) under `keyword` unless it is already
    recorded there.

    Index shape: [[keyword, [[url, count], ...]], ...]
    """
    for entry in index:
        if entry[0] != keyword:
            continue
        for url_entry in entry[1]:
            if url_entry[0] == url:
                return  # url already recorded for this keyword
        entry[1].append([url, 0])
        return
    # keyword not seen before: start a new entry
    index.append([keyword, [[url, 0]]])