import re
import urllib.request

from bs4 import BeautifulSoup

# Total number of pages in this author's post list
pageNo = 18
# URL of a single article to fetch
url_single = 'http://www.cnblogs.com/over140/p/4440137.html'
# List-page URL; the page number is appended to it later
url = 'http://www.cnblogs.com/over140/default.html?page='
# The blog author's name
author = 'over140'

def get_html(url):
    '''
    Return the decoded HTML source of the page at the given url.
    :param url:
    :return:
    '''
    req = urllib.request.Request(url)
    resp = urllib.request.urlopen(req)
    html_page = resp.read().decode('utf-8')
    return html_page
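
# urllib's default User-Agent is sometimes rejected by servers; a minimal
# hedged variant that sends a browser-like header. The header value is an
# assumption, not part of the original code.
def get_html_with_headers(url):
    req = urllib.request.Request(
        url, headers={'User-Agent': 'Mozilla/5.0'})  # assumed UA string
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8')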

def get_title(url):
    '''
    Get the title of the article at the given url.
    :param url:
    :return:
    '''
    html_page = get_html(url)
    # The original pattern was garbled in the source; matching the page's
    # <title> tag is assumed here.
    title_pattern = r'(<title>)(.*)(</title>)'
    title_match = re.search(title_pattern, html_page)
    title = title_match.group(2)
    return title
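
# Since BeautifulSoup is already imported, a sketch of an equivalent,
# regex-free way to read the title (assumes the page has a <title> tag):
def get_title_bs(url):
    soup = BeautifulSoup(get_html(url), 'html.parser')
    return soup.title.string if soup.title else ''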

def get_Body(url):
    '''
    Get the body text of the article at the given url.
    :param url:
    :return:
    '''
    html_page = get_html(url)
    soup = BeautifulSoup(html_page, 'html.parser')
    # On cnblogs, the post body sits in a div with this id
    div = soup.find(id="cnblogs_post_body")
    if div is None:
        return ''
    return div.get_text()
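
# Minimal usage sketch (assumes url_single above is still reachable):
# print(get_Body(url_single)[:200])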

def save_file(url):
    '''
    Save the article at the given url to a local file.
    :param url:
    :return:
    '''
    title = get_title(url)
    body = get_Body(url)
    # Replace characters that are not allowed in filenames
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    filename = author + '-' + safe_title + '.txt'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(body)

def get_Urls(url, pageNo):
    '''
    Return the list of all the author's article urls, given the list-page
    url and the page count pageNo.
    :param url:
    :param pageNo:
    :return:
    '''
    total_urls = []
    for i in range(1, pageNo + 1):
        url_1 = url + str(i)
        html = get_html(url_1)
        # The original pattern was lost in the source; post links on a
        # cnblogs list page are assumed to look like
        # <a class="postTitle2" href="...">
        title_pattern = r'<a\s+class="postTitle2"\s+href="(.+?)"'
        urls = re.findall(title_pattern, html)
        for url_ in urls:
            total_urls.append(url_)
        print(total_urls)
    return total_urls
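
# A gentler variant that pauses between page fetches; the delay value is an
# assumption, since the original code does not rate-limit.
import time

def get_Urls_polite(url, pageNo, delay=1.0):
    total_urls = []
    for i in range(1, pageNo + 1):
        html = get_html(url + str(i))
        total_urls.extend(
            re.findall(r'<a\s+class="postTitle2"\s+href="(.+?)"', html))
        time.sleep(delay)  # avoid hammering the server
    return total_urls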

def save_files(url, pageNo):
    '''
    Save all of the author's articles, given the list-page url and the
    page count pageNo.
    :param url:
    :param pageNo:
    :return:
    '''
    total_urls = get_Urls(url, pageNo)
    for url_ in total_urls:
        save_file(url_)

if __name__ == '__main__':
    save_files(url, pageNo)
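
# To save just one article instead of crawling every page:
# save_file(url_single)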