#!/usr/bin/python
# coding: utf-8
from bs4 import BeautifulSoup
import urllib
import re
def get_html(url):
    """Fetch *url* and return the raw response body.

    The body is returned exactly as read from the response object; no
    decoding is applied. Any error raised while opening or reading the
    URL propagates to the caller.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as a byte string.
    """
    from contextlib import closing
    try:
        # Python 3 keeps urlopen in urllib.request ...
        from urllib.request import urlopen
    except ImportError:
        # ... while Python 2 exposes it directly on urllib.
        from urllib import urlopen

    # closing() calls response.close() even if read() raises, so the
    # response object is never left open.
    with closing(urlopen(url)) as response:
        return response.read()
def handle_html(html):
    """Extract the anchor links found inside ``atc_title`` spans of *html*.

    Every ``<span class="atc_title">...</span>`` element is located first,
    then the anchor inside it is parsed for its ``href`` and link text.
    Each successfully parsed entry is printed as ``title<TAB>link``.

    Args:
        html: Page markup to scan. On Python 3 a ``bytes`` value is also
            accepted and is decoded before scanning.

    Returns:
        A list with the ``href`` value of every parsed span, in the order
        the spans occur in *html*. Spans whose content does not match the
        expected anchor layout are skipped.
    """
    # Only true on Python 3 (on Python 2 ``bytes`` is ``str``): text
    # patterns cannot search a bytes object, so decode it first.
    if isinstance(html, bytes) and not isinstance(html, str):
        # NOTE(review): assumes the page is UTF-8 encoded -- confirm.
        html = html.decode('utf-8', 'replace')

    span_re = re.compile(r'<span class="atc_title">.*?</span>', re.S)
    anchor_re = re.compile(r'<span class="atc_title">\s*<a title=".*" target="_blank" href="(?P<link>.*)">(?P<title>.*)</a></span>')

    links = []
    for span in span_re.findall(html):
        match = anchor_re.search(span)
        if match is None:
            # span_re matches across newlines (re.S) and accepts any span
            # content, whereas anchor_re needs one specific anchor layout;
            # skip spans that do not fit instead of failing on None.
            continue
        link = match.group('link')
        # Single parenthesised argument: prints identically on Python 2
        # and Python 3.
        print(match.group('title') + '\t' + link)
        links.append(link)
    return links
def load_html(result, path_template=r'D:\Documents\%s.html'):
    """Download every link in *result* to a numbered local file.

    Args:
        result: Iterable of URLs to download.
        path_template: ``%``-style template for the destination path. Its
            single ``%s`` placeholder receives the 1-based position of the
            link within *result*. The default is a raw string so the
            backslashes are kept literally rather than read as escapes.

    Returns:
        None.
    """
    try:
        # Python 3 keeps urlretrieve in urllib.request ...
        from urllib.request import urlretrieve
    except ImportError:
        # ... while Python 2 exposes it directly on urllib.
        from urllib import urlretrieve

    for count, link in enumerate(result, 1):
        urlretrieve(link, path_template % count)
if __name__ == '__main__':
    # Script entry point: download the article-list page, pull the article
    # links out of it, then save every linked page to disk.
    start_url = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html'
    load_html(handle_html(get_html(start_url)))