网络爬虫2
def get_html(urls):
    """Fetch the page at *urls* and return its decoded text, or "" on failure.

    'utf-8-sig' strips a leading BOM if the server sends one.
    """
    import urllib.request as ur
    import urllib.error
    try:
        # Context manager closes the connection instead of leaking it.
        with ur.urlopen(urls) as page:
            return page.read().decode('utf-8-sig')
    except (ValueError, urllib.error.URLError, OSError):
        # ValueError: malformed URL; URLError/OSError: network-level failure.
        # Callers treat "" as "page unavailable", so keep that contract —
        # but no longer swallow unrelated exceptions with a bare except.
        return ""
def get_url(html_page):
    """Extract the first a-href target from *html_page*.

    Returns (url, end_index) where end_index is the position of the closing
    quote, or (None, 0) when no complete link is present.
    """
    start_position = html_page.find('a href="')
    if start_position == -1:
        return None, 0
    start_position += 8  # len('a href="')
    end_position = html_page.find('"', start_position)
    if end_position == -1:
        # Unterminated attribute: the original returned a truncated url with
        # end_position == -1, which made get_all_url re-scan the same page
        # forever. Treat it as "no link found" instead.
        return None, 0
    return html_page[start_position:end_position], end_position
def get_all_url(seed):
    """Collect every distinct useful chapter link from the page at *seed*.

    Links are filtered through is_useful(), de-duplicated, and returned
    sorted. Scanning stops at the first missing/empty href.
    """
    remaining = get_html(seed)
    links = []
    a_url, end = get_url(remaining)
    while a_url:
        if is_useful(a_url) and a_url not in links:
            links.append(a_url)
        # Advance past the closing quote and keep scanning the tail.
        remaining = remaining[end + 1:]
        a_url, end = get_url(remaining)
    links.sort()
    return links
def is_useful(a_url):
    """Return True when *a_url* looks like a relative chapter link.

    NOTE(review): the two inner dots are unescaped, so they match any single
    character (e.g. the '_' in '/0_176/1234.html') — presumably intentional,
    but confirm against the site's real link format.
    """
    import re
    # Pattern unchanged; just return the boolean directly instead of the
    # if/return-True/else/return-False anti-idiom.
    return re.match(r"/\d+.\d+.+\.html", a_url) is not None
def get_content(html_page):
    """Cut the chapter text out of *html_page* and normalise it.

    The chapter body sits between the "readx()" and "read3()" script markers,
    after the closing </script> tag and before the next </div>.
    Returns the cleaned text, or an error string when the markers are absent.
    """
    start_flag = html_page.find("readx()")
    if start_flag < 0:
        return "获取章节失败"
    end_flag = html_page.find("read3()")
    content_page = html_page[start_flag:end_flag]
    start_flag = content_page.find("</script>")
    if start_flag < 0:
        return "获取章节失败"
    # Off-by-one fix: the original added 8, but len("</script>") is 9, so
    # the tag's trailing '>' was glued to the front of every chapter.
    start_flag += len("</script>")
    end_flag = content_page.find("</div>")
    return deal_content(content_page[start_flag:end_flag])
def deal_content(content_page):
    """Normalise raw chapter HTML into plain text.

    Collapses the page's hard-space padding and turns <br/> tags into
    newlines; a trailing newline is appended so chapters concatenate cleanly.
    """
    # NOTE(review): the first argument below looks paste-mangled — it was
    # probably "&nbsp;" or "\xa0" originally; confirm against the live page.
    first_deal = content_page.replace(" ", " ")
    # Bug fix: the original chained off content_page, silently discarding
    # first_deal; chain off first_deal so both substitutions apply.
    second_deal = first_deal.replace("<br/>", "\n")
    return second_deal + "\n"
def get_title(html_page):
    """Pull the chapter title out of the <h1> element that follows the
    "bookname" marker; return an error string when the marker is missing."""
    anchor = html_page.find("bookname")
    if anchor == -1:
        return "获取章节失败"
    title_start = html_page.find("<h1>", anchor) + 4  # skip past the tag
    title_end = html_page.find("</h1>", title_start)
    return html_page[title_start:title_end]
def write_to_txt(name, seed):
    """Crawl every useful chapter link under *seed* and append
    title + content for each chapter to ./res/<name>.
    """
    all_useful_links = get_all_url(seed)
    # with-statement guarantees the file is closed even if a fetch raises;
    # an explicit encoding keeps the output identical across platforms
    # (the original used the locale default, e.g. gbk on Chinese Windows).
    with open("./res/" + name, "a", encoding="utf-8") as new_file:
        # Plain iteration instead of pop(0), which is O(n) per call.
        for link in all_useful_links:
            # seed[:-7] strips the trailing "/0_176/" so the relative chapter
            # path can be appended — NOTE(review): this hard-codes the seed
            # path length; confirm it holds for other seeds.
            a_link = seed[:-7] + link
            print(a_link)
            html_page = get_html(a_link)
            content_title = get_title(html_page)
            content = get_content(html_page)
            new_file.write(content_title + "\n" + content)
    print("缓存结束")
seed = "http://www.biquge.com/0_176/"
# Entry-point guard: importing this module for its helpers must not
# kick off a full crawl as a side effect.
if __name__ == "__main__":
    write_to_txt("大主宰.txt", seed)
现在比之前的版本来说，代码的可读性和移植性都很好，但是不知道为什么执行的效率很慢
求解