《计算机科学导论》学习笔记(16) - 课程 16

1. 练习:数据结构

2. 练习:Ben Bitdiddle

3. 练习:网络运行

4. 练习:更好地分词

题目:

# 1 Gold Star

# The built-in <string>.split() procedure works
# okay, but fails to find all the words on a page
# because it only uses whitespace to split the
# string. To do better, we should also use punctuation
# marks to split the page into words.

# Define a procedure, split_string, that takes two
# inputs: the string to split and a string containing
# all of the characters considered separators. The
# procedure should return a list of strings that break
# the source string up by the characters in the
# splitlist.


def split_string(source,splitlist):





#out = split_string("This is a test-of the,string separation-code!"," ,!-")
#print out
#>>> ['This', 'is', 'a', 'test', 'of', 'the', 'string', 'separation', 'code']

#out = split_string("After  the flood   ...  all the colors came out.", " .")
#print out
#>>> ['After', 'the', 'flood', 'all', 'the', 'colors', 'came', 'out']

#out = split_string("First Name,Last Name,Street Address,City,State,Zip Code",",")
#print out
#>>>['First Name', 'Last Name', 'Street Address', 'City', 'State', 'Zip Code']

(我没想出来怎么写代码实现。。

一点我自己的思考:
我一开始想到的是,仍然复用 split() 函数,但是似乎只能基于 splitlist 里面的单个字符,对 source 进行分割:

然后就有了如下的我自己的代码:

def split_string(source, splitlist):
    """Split ``source`` on the first separator character only.

    This is a first attempt: only ``splitlist[0]`` is used, so every other
    character in ``splitlist`` is ignored.

    Args:
        source: The string to split.
        splitlist: A string of separator characters; must be non-empty.

    Returns:
        The list produced by ``source.split`` for the first separator.
    """
    first_separator = splitlist[0]
    return source.split(first_separator)

或者:

def split_string(source, splitlist):
    # Unfinished sketch: the `...` lines are placeholders for code that was
    # never written.
    for char in splitlist:
        # str.split returns a new list and does not modify `source`; the
        # result is not stored here, so as written it is discarded.
        source.split(char)
        ...
    ...

)

答案以及一点我自己的注释(先声明下,有点绕):

def split_string(source, splitlist):
    """Split ``source`` into words at every character found in ``splitlist``.

    Args:
        source: The string to break up.
        splitlist: A string whose characters are all treated as separators.

    Returns:
        A list of the runs of non-separator characters, in order of
        appearance. Consecutive separators never produce empty strings.
    """
    result = []
    # True while the previous character was a separator, or nothing has been
    # read yet: the next ordinary character must then start a new word.
    atsplit = True
    for char in source:
        if char in splitlist:
            # Separator: remember it so the next ordinary character opens a
            # new word.
            atsplit = True
        elif atsplit:
            # First ordinary character after a separator (or at the very
            # start): begin a new word.
            result.append(char)
            atsplit = False
        else:
            # Ordinary character directly after another ordinary character:
            # extend the word currently being built.
            result[-1] = result[-1] + char
    return result

)

5. 练习:改进索引

题目:
(如果同一个 keyword 在同一个页面中出现多次,add_to_index 就会针对同一个 url 被多次调用,导致 index 中该 keyword 的 url 列表里出现重复的 url ,因此需要修改 add_to_index 。)

# The current index includes a url in the list of urls
# for a keyword multiple times if the keyword appears
# on that page more than once.

# It might be better to only include the same url
# once in the url list for a keyword, even if it appears
# many times.

# Modify add_to_index so that a given url is only
# included once in the url list for a keyword,
# no matter how many times that keyword appears.

def add_to_index(index, keyword, url):
    """Record ``url`` under ``keyword`` in ``index`` (exercise starter version).

    ``index`` is a list of ``[keyword, [url, ...]]`` entries and is modified
    in place. This starter version appends ``url`` even when it is already
    listed for the keyword.
    """
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            break
    else:
        # No entry matched: start a new one for this keyword.
        index.append([keyword, [url]])


def get_page(url):
    """Return the canned HTML for one of the cs101x test pages.

    This is an offline stand-in for fetching a page: four known urls map to
    fixed HTML strings and every other url yields an empty string.

    Args:
        url: The page address to look up.

    Returns:
        The page's HTML, or ``""`` when ``url`` is not one of the test pages.
    """
    # The comparisons below cannot fail for string input, so no try/except is
    # needed; a bare ``except:`` here would only hide interrupts.
    if url == "http://www.udacity.com/cs101x/index.html":
        return '''<html> <body> This is a test page for learning to crawl!
<p> It is a good idea to
<a href="http://www.udacity.com/cs101x/crawling.html">
learn to crawl</a> before you try to
<a href="http://www.udacity.com/cs101x/walking.html">walk</a> or
<a href="http://www.udacity.com/cs101x/flying.html">fly</a>.</p></body>
</html>'''

    if url == "http://www.udacity.com/cs101x/crawling.html":
        return '''<html> <body> I have not learned to crawl yet, but I am
quite good at  <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.
</body> </html>'''

    if url == "http://www.udacity.com/cs101x/walking.html":
        return '''<html> <body> I cant get enough
<a href="http://www.udacity.com/cs101x/index.html">crawling</a></body></html>'''

    if url == "http://www.udacity.com/cs101x/flying.html":
        return '''<html>
<body>The magic words are Squeamish Ossifrage!</body></html>'''

    return ""

def union(a, b):
    """Append to ``a``, in place, each element of ``b`` not already in ``a``.

    Elements are checked one at a time, so a value repeated in ``b`` is
    added only once. Nothing is returned.
    """
    for candidate in b:
        if candidate in a:
            continue
        a.append(candidate)

def get_next_target(page):
    """Find the first ``<a href=`` link in ``page``.

    Returns:
        A ``(url, end_quote)`` pair, where ``url`` is the text between the
        first two double quotes at or after the link tag and ``end_quote``
        is the position of the second quote. ``(None, 0)`` is returned when
        the page contains no ``<a href=``.
    """
    tag_pos = page.find('<a href=')
    if tag_pos != -1:
        opening = page.find('"', tag_pos)
        closing = page.find('"', opening + 1)
        return page[opening + 1:closing], closing
    return None, 0

def get_all_links(page):
    """Collect every link target in ``page``, in order of appearance.

    Repeatedly asks ``get_next_target`` for the next url and continues the
    search from the position it reports, stopping at the first falsy url.
    """
    found = []
    url, endpos = get_next_target(page)
    while url:
        found.append(url)
        # Drop everything up to the closing quote of the link just found.
        page = page[endpos:]
        url, endpos = get_next_target(page)
    return found

def crawl_web(seed):
    """Crawl outward from ``seed`` and return the keyword index built.

    Pages are taken from the end of the to-crawl list. Each page not yet
    crawled is fetched with ``get_page``, indexed with
    ``add_page_to_index``, and its links are merged into the to-crawl list.

    Returns:
        The index list filled in by ``add_page_to_index``.
    """
    index = []
    visited = []
    pending = [seed]
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        html = get_page(current)
        add_page_to_index(index, current, html)
        union(pending, get_all_links(html))
        visited.append(current)
    return index

def add_page_to_index(index, url, content):
    """Index every whitespace-separated word of ``content`` under ``url``."""
    for token in content.split():
        add_to_index(index, token, url)

def lookup(index, keyword):
    """Return the second element of the first entry for ``keyword``.

    Returns ``None`` when no entry in ``index`` has ``keyword`` as its first
    element.
    """
    return next((entry[1] for entry in index if entry[0] == keyword), None)

#index = crawl_web("http://www.udacity.com/cs101x/index.html")
#print lookup(index,"is")
#>>> ['http://www.udacity.com/cs101x/index.html']

(我的答案:

def add_to_index(index, keyword, url):
    # Author's attempt at de-duplicating urls; it does not fully solve the
    # exercise, for the reason explained at the `if` below.
    for entry in index:
        # The triple-quoted string below is an earlier idea that was disabled
        # by turning it into a string expression; it has no effect.
        """
        if url in entry[1]:
            continue
        """
        # Bug: when the keyword matches but the url is already listed, this
        # condition is False, so the loop runs to its end and the fall-through
        # below appends a second [keyword, [url]] entry for the same keyword.
        if entry[0] == keyword and url not in entry[1]:
            entry[1].append(url)
            return
    # not found, add new keyword to index
    index.append([keyword, [url]])

对于注释中提供的 1 个测试用例,可以测试通过,但是无法通过网页上所有的测试用例,会报错。
)

视频中的答案:

def add_to_index(index, keyword, url):
    """Record ``url`` under ``keyword`` in ``index``, at most once per keyword.

    ``index`` is a list of ``[keyword, [url, ...]]`` entries and is modified
    in place. A url already listed for the keyword is not added again.
    """
    for entry in index:
        if entry[0] != keyword:
            continue
        known_urls = entry[1]
        if url not in known_urls:
            known_urls.append(url)
        # The keyword exists, so stop whether or not anything was added.
        return
    # Keyword not seen before: create its entry.
    index.append([keyword, [url]])

6. 练习:统计点击数

题目:

# 2 Gold Stars

# One way search engines rank pages
# is to count the number of times a
# searcher clicks on a returned link.
# This indicates that the person doing
# the query thought this was a useful
# link for the query, so it should be
# higher in the rankings next time.

# (In Unit 6, we will look at a different
# way of ranking pages that does not depend
# on user clicks.)

# Modify the index such that for each url in a
# list for a keyword, there is also a number
# that counts the number of times a user
# clicks on that link for this keyword.

# The result of lookup(index,keyword) should
# now be a list of url entries, where each url
# entry is a list of a url and a number
# indicating the number of times that url
# was clicked for this query keyword.

# You should define a new procedure to simulate
# user clicks for a given link:

# record_user_click(index,word,url)

# that modifies the entry in the index for
# the input word by increasing the count associated
# with the url by 1.

# You also will have to modify add_to_index
# in order to correctly create the new data
# structure, and to prevent the repetition of
# entries as in homework 4-5.


def record_user_click(index,keyword,url):


def add_to_index(index, keyword, url):
    """Record ``url`` under ``keyword`` in ``index`` (exercise starter version).

    ``index`` is a list of ``[keyword, [url, ...]]`` entries and is modified
    in place. This starter version stores bare urls without click counts and
    appends ``url`` even when it is already listed for the keyword.
    """
    existing = next((entry for entry in index if entry[0] == keyword), None)
    if existing is None:
        # Keyword not seen before: create its entry.
        index.append([keyword, [url]])
        return
    existing[1].append(url)


def get_page(url):
    """Return the canned HTML for one of the cs101x test pages.

    This is an offline stand-in for fetching a page: four known urls map to
    fixed HTML strings and every other url yields an empty string.

    Args:
        url: The page address to look up.

    Returns:
        The page's HTML, or ``""`` when ``url`` is not one of the test pages.
    """
    # The comparisons below cannot fail for string input, so no try/except is
    # needed; a bare ``except:`` here would only hide interrupts.
    if url == "http://www.udacity.com/cs101x/index.html":
        return '''<html> <body> This is a test page for learning to crawl!
<p> It is a good idea to
<a href="http://www.udacity.com/cs101x/crawling.html">
learn to crawl</a> before you try to
<a href="http://www.udacity.com/cs101x/walking.html">walk</a> or
<a href="http://www.udacity.com/cs101x/flying.html">fly</a>.</p></body></html>'''

    if url == "http://www.udacity.com/cs101x/crawling.html":
        return '''<html> <body> I have not learned to crawl yet, but I am
quite good at  <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.
</body> </html>'''

    if url == "http://www.udacity.com/cs101x/walking.html":
        return '''<html> <body> I cant get enough
<a href="http://www.udacity.com/cs101x/index.html">crawling</a>!</body></html>'''

    if url == "http://www.udacity.com/cs101x/flying.html":
        return '<html><body>The magic words are Squeamish Ossifrage!</body></html>'

    return ""

def union(a, b):
    """Extend ``a`` in place with the elements of ``b`` it does not contain.

    Membership is re-checked for every element, so a value that occurs
    several times in ``b`` ends up in ``a`` only once. Returns nothing.
    """
    for item in b:
        already_present = item in a
        if not already_present:
            a.append(item)

def get_next_target(page):
    """Locate the first ``<a href=`` link in ``page``.

    Returns:
        ``(url, end_quote)``: the text between the first two double quotes
        found from the link tag onward, and the position of the second
        quote. ``(None, 0)`` when ``page`` has no ``<a href=``.
    """
    link_start = page.find('<a href=')
    no_link = link_start == -1
    if no_link:
        return None, 0
    first_quote = page.find('"', link_start)
    url_begin = first_quote + 1
    second_quote = page.find('"', url_begin)
    return page[url_begin:second_quote], second_quote

def get_all_links(page):
    """Return the list of link targets found in ``page``, in order.

    Each round asks ``get_next_target`` for the next url, then continues
    from the position it reports; the first falsy url ends the search.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        if not url:
            return links
        links.append(url)
        # Keep only the part of the page after the link just consumed.
        page = page[endpos:]

def crawl_web(seed):
    """Build and return a keyword index by crawling from ``seed``.

    The to-crawl list is consumed from its end. Every page not crawled
    before is fetched via ``get_page``, passed to ``add_page_to_index``,
    and its links are merged into the to-crawl list with ``union``.
    """
    frontier, done, index = [seed], [], []
    while frontier:
        address = frontier.pop()
        if address in done:
            continue
        body = get_page(address)
        add_page_to_index(index, address, body)
        union(frontier, get_all_links(body))
        done.append(address)
    return index

def add_page_to_index(index, url, content):
    """Add each whitespace-delimited word of ``content`` to ``index`` for ``url``."""
    tokens = content.split()
    for token in tokens:
        add_to_index(index, token, url)

def lookup(index, keyword):
    """Return the data stored for ``keyword``, or ``None`` if it is absent.

    The data is the second element of the first entry whose first element
    equals ``keyword``.
    """
    result = None
    for entry in index:
        if entry[0] == keyword:
            result = entry[1]
            break
    return result


# Here is an example showing a sequence of interactions.
# print(...) with a single argument behaves the same on Python 2 and 3.
index = crawl_web('http://www.udacity.com/cs101x/index.html')
print(lookup(index, 'good'))
#>>> [['http://www.udacity.com/cs101x/index.html', 0],
#>>> ['http://www.udacity.com/cs101x/crawling.html', 0]]
record_user_click(index, 'good', 'http://www.udacity.com/cs101x/crawling.html')
print(lookup(index, 'good'))
#>>> [['http://www.udacity.com/cs101x/index.html', 0],
#>>> ['http://www.udacity.com/cs101x/crawling.html', 1]]

我的答案(并不准确,也不完整):

def record_user_click(index,keyword,url):
    # Author's attempt; the heading before this snippet marks it as
    # inaccurate and incomplete. The notes below say where it goes wrong.
    for entry in index:
        if entry[0] == keyword:
            # Tests whether the bare url is a direct element of the keyword's
            # list. The exercise's expected output stores [url, count] pairs
            # there, in which case a bare url is never found.
            if url in entry[1]:
                # Increments the LAST element of the list, whichever url
                # matched.
                entry[1][-1] += 1
    # `entry` is the loop variable left over from the loop: this refers to the
    # last entry of the index, not the matched one, and raises NameError when
    # the index is empty.
    return entry[1][-1]



def add_to_index(index, keyword, url):
    # Author's attempt at the click-counting index; the heading before these
    # snippets marks it as inaccurate and incomplete.
    for entry in index:
        if entry[0] == keyword:
            # Looks for the bare url as a direct element of entry[1]. Urls
            # appended below as [url, 0] pairs are nested one level deeper,
            # so this test does not find them.
            if url not in entry[1]:
                entry[1].append([url, 0])
            else:
                # Increments whatever is last in the list, which is not
                # necessarily the count that belongs to `url`.
                entry[1][-1] += 1
            return
    # not found, add new keyword to index
    # NOTE: this stores a flat [url, 0]; the exercise's expected output shows
    # a list of pairs, i.e. [[url, 0]].
    index.append([keyword, [url, 0]])

视频中的答案:
(简单地谈下,解决这个问题,我认为,最需要了解的是,原来的 index 的结构:

[   [key1, [url11, url12, ...]],
    [key2, [url21, url22, ...]],
    ...
]

以及加入了对 url 的统计的 index 的结构:

[   [key1, [[url11, count11], [url12, count12], ...]],
    [key2, [[url21, count21], [url22, count22], ...]],
    ...
]

)

def record_user_click(index, keyword, url):
    """Add one click to ``url``'s counter under ``keyword``.

    The ``[url, count]`` pairs for ``keyword`` are obtained through
    ``lookup``. Every pair whose url equals ``url`` has its count increased
    by 1; nothing happens when the keyword or the url is not in the index.
    """
    url_entries = lookup(index, keyword)
    if not url_entries:
        return
    for url_entry in url_entries:
        if url_entry[0] != url:
            continue
        url_entry[1] = url_entry[1] + 1

def add_to_index(index, keyword, url):
    """Record ``url`` under ``keyword`` with a click count starting at 0.

    ``index`` is a list of ``[keyword, [[url, count], ...]]`` entries and is
    modified in place. A url already listed for the keyword is left
    untouched, count included.
    """
    for entry in index:
        if entry[0] == keyword:
            url_entries = entry[1]
            already_listed = any(pair[0] == url for pair in url_entries)
            if not already_listed:
                url_entries.append([url, 0])
            return
    # Keyword not seen before: create its entry with a single [url, 0] pair.
    index.append([keyword, [[url, 0]]])

7. 练习:在路由器花费的时间

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值