# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import urllib
import os
def file_filter(title):
    """Strip characters that are illegal in Windows file names."""
    # Windows forbids these characters in file names: < > : " / \ | ? *
    return ''.join(c for c in title if c not in '<>:"/\\|?*')
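# For example, file_filter(u'why<not>:try?') returns u'whynottry',
# which is then safe to use as a Windows file name.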
def DownBlog(title, url):
    """Download every post that belongs to one category."""
    path = "F:/Blog/" + file_filter(title)
    if not os.path.exists(path):
        os.makedirs(path)  # one directory per category
    response = urllib.urlopen(url)
    soup = BeautifulSoup(response.read())
    all_contents_div = soup.findAll('div', {'class': 'entry'})
    for content_div in all_contents_div:
        title = content_div.find('h3', {'class': 'entry-header'}).contents[0]
        content = content_div.find('div', {'class': 'entry-body'}).getText()
        # Mind the Windows file-name character restrictions when building
        # the file name.
        with open(path + "/" + file_filter(title) + ".txt", "w") as f:
            f.write(content.encode('utf-8'))
        # print title
        # print content
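# Hypothetical usage (the category URL below is made up for illustration):
# DownBlog(u'Lua', 'http://blog.codingnow.com/some-category/') would create
# F:/Blog/Lua and write one .txt file per post found on that page.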
response = urllib.urlopen("http://blog.codingnow.com/")
soup = BeautifulSoup(response.read())
# The script scrapes the blog's articles. Looking at the home page, one div
# lists the categories; once we have those, we can crawl each category's page.
categories_div = soup.find('div', {'class': 'module-categories module'})  # locate the category sidebar
href_labels = categories_div.findAll('a')  # every <a href="http://..."></a> tag in it
hrefs = []  # links to all the category pages
categories_titles = []  # the category each page belongs to
# Pair every category title with its page link.
for label in href_labels:
    hrefs.append(label['href'])
    categories_titles.append(label.contents[0])
# I haven't learned multithreading yet, so fetch everything the slow,
# snail-paced way.
for i in range(len(hrefs)):
    DownBlog(categories_titles[i], hrefs[i])
Run result:
In the script I hard-coded the output directory as F:/Blog; you can change that path to whatever you like.
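If you want to run the same crawl on Python 3, here is a minimal sketch using urllib.request and the bs4 package (beautifulsoup4). It assumes the page structure and class names are still the ones used above, and the F:/Blog output root is likewise just a convention you can change:

# -*- coding: utf-8 -*-
# Minimal Python 3 sketch of the same crawl. Assumes `beautifulsoup4`
# is installed (pip install beautifulsoup4) and that blog.codingnow.com
# still uses the class names from the script above.
import os
import urllib.request
from bs4 import BeautifulSoup

ILLEGAL = '<>:"/\\|?*'  # characters Windows forbids in file names

def file_filter(title):
    return ''.join(c for c in title if c not in ILLEGAL)

def down_blog(title, url, root="F:/Blog"):
    path = os.path.join(root, file_filter(title))
    os.makedirs(path, exist_ok=True)
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    for entry in soup.find_all('div', {'class': 'entry'}):
        post_title = entry.find('h3', {'class': 'entry-header'}).get_text()
        body = entry.find('div', {'class': 'entry-body'}).get_text()
        with open(os.path.join(path, file_filter(post_title) + ".txt"),
                  "w", encoding="utf-8") as f:
            f.write(body)

html = urllib.request.urlopen("http://blog.codingnow.com/").read()
soup = BeautifulSoup(html, 'html.parser')
categories = soup.find('div', {'class': 'module-categories module'})
for a in categories.find_all('a'):
    down_blog(a.get_text(), a['href'])

The sketch sticks to the standard-library html.parser so bs4 is the only extra dependency.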