Python 简单爬取博文

最新推荐文章于 2024-07-21 22:27:18 发布

hzz_light

最新推荐文章于 2024-07-21 22:27:18 发布

阅读量353

点赞数

分类专栏： Python 文章标签： python Python 爬虫

本文链接：https://blog.csdn.net/hzz_light/article/details/8541814

版权

Python 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

#-*-encoding:utf-8-*-
from BeautifulSoup import BeautifulSoup
import urllib
import os
def file_fiter(title):
    """Drop the characters that Windows does not allow in a file name.

    Args:
        title: The text to sanitise; it is iterated character by character.

    Returns:
        A list with the characters of ``title`` that are allowed, in their
        original order. Callers join the list back into a string.
    """
    # Windows reserves < > : " / \ | ? * and the control characters 0-31.
    # The earlier check missed ':' and '|', so titles such as "C++: tips"
    # still produced an invalid file name.
    illegal = '<>:"/\\|?*'
    return [ch for ch in title if ch not in illegal and ord(ch) >= 32]

        
def DownBlog(title,url):
    """Download every post found on one category page into its own folder.

    The folder is "F:/Blog/<title>" with the characters rejected by
    file_fiter removed. Each <div class="entry"> on the page becomes one
    UTF-8 encoded .txt file named after the first child of its
    <h3 class="entry-header">, holding the text of its
    <div class="entry-body">.

    Args:
        title: Category name, used as the folder name.
        url: Address of the category page to fetch.
    """
    path = "F:/Blog/" + ''.join(file_fiter(title))
    # makedirs also creates F:/Blog itself on a first run; the isdir guard
    # lets the script be re-run without failing on an existing folder.
    if not os.path.isdir(path):
        os.makedirs(path)

    response = urllib.urlopen(url)
    try:
        soup = BeautifulSoup(response.read())
    finally:
        # Release the connection as soon as the page has been read.
        response.close()

    for content_div in soup.findAll('div',{'class':'entry'}):
        # Use a separate name so the category title parameter is not overwritten.
        post_title = content_div.find('h3',{'class':'entry-header'}).contents[0]
        content = content_div.find('div',{'class':'entry-body'}).getText()
        # The post title goes through the same filter because it becomes a
        # file name and must respect the Windows character restrictions.
        file_path = path + "/" + ''.join(file_fiter(post_title)) + ".txt"
        # "with" closes the file even if the write fails.
        with open(file_path, "w") as out:
            out.write(content.encode('utf-8'))
        
    
# Fetch the blog home page and close the connection once the HTML is read.
response = urllib.urlopen("http://blog.codingnow.com/")
try:
    soup = BeautifulSoup(response.read())
finally:
    response.close()

# The script downloads blog posts. The home page has a <div> that lists the
# categories, so collecting the category links gives every page to crawl.

categories_div = soup.find('div',{'class':'module-categories module'})  # the category sidebar

href_labels = categories_div.findAll('a')  # every <a href="..."> inside it

# Link of each category page, and the category name taken from the first
# child of the same <a> tag; the two lists are index-aligned.
hrefs = [label['href'] for label in href_labels]
categories_titles = [label.contents[0] for label in href_labels]

# No threading yet, so the categories are downloaded one after another.
for category_title, href in zip(categories_titles, hrefs):
    DownBlog(category_title, href)

运行结果:

程序里我把保存目录设置为 F:/Blog，这个路径可以根据需要自行修改。

hzz_light

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python 简单爬取博文

#-*-encoding:utf-8-*-from BeautifulSoup import BeautifulSoupimport urllibimport osdef file_fiter(title): """去除windows文件名非法字符""" filename = [] for i in range(len(title)): if tit
复制链接

扫一扫