python免费下载csdn_12_python爬虫——下载个人CSDN博客内容

最新推荐文章于 2024-01-17 16:35:13 发布

松直

最新推荐文章于 2024-01-17 16:35:13 发布

阅读量209

点赞数

本文链接：https://blog.csdn.net/weixin_35171603/article/details/112941434

版权

下载个人博客内容

可以是主页的内容，也可以是每个分类下的内容

只需要把传入的URL地址修改一下就OK了

但是修改传入的URL时，记得检查一下：如果传入的URL不带 '?viewmode=contents'，那么只能得到五篇内容。

#coding:utf-8

import webbrowser as web

import os

import time

import random

import urllib2,sys

from bs4 import BeautifulSoup

# Python 2 idiom: reload sys and make UTF-8 the interpreter's default string
# encoding for the rest of the script.
reload(sys)

sys.setdefaultencoding('utf-8')

# Startup banner printed once when the script runs.
print '''

本文下载CSDN个人博客下的内容

'''

# Change this address as needed, but keep the trailing '?viewmode=contents';
# without it each page lists only a limited number of articles.

url = 'http://blog.csdn.net/qiqiyingse/article/category/6292432?viewmode=contents'

# Site root that relative article links are appended to.
baseurl='http://blog.csdn.net'

def getPage(url):
    """Download ``url`` while presenting a browser User-Agent.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body exactly as returned by ``urlopen(...).read()``,
        or ``None`` when the request fails.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        return urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        # The server answered with an error status.
        print(e.code)
        print(e.reason)
    except urllib2.URLError as e:
        # No HTTP response at all (DNS failure, refused connection, ...).
        print(e.reason)
    # Explicit failure value: previously this path fell through to
    # ``return html`` with ``html`` never assigned.
    return None

def geturl(html, url):
    """Collect the absolute article URLs listed on a blog index page.

    Args:
        html: Source of the index page.
        url: Address the page was fetched from. Not used by the active
            code path; kept so existing callers keep working.

    Returns:
        A list of article addresses, each built as ``baseurl`` plus the
        ``href`` of the first link inside an article entry.
    """
    page = BeautifulSoup(html, 'lxml')
    # Every article entry on the page is a div with these two classes.
    # NOTE: the original carried a disabled variant that selected
    # 'list_item article_item' instead when the URL contained 'categor'.
    items = page.find_all('div', class_='list_item list_view')
    print(len(items))

    urlList = []
    for item in items:
        link = item.find('a')
        href = link.get('href') if link is not None else None
        if not href:
            # Entry without a usable link: skip it instead of crashing.
            continue
        # Links on the page are site-relative; prefix the site root.
        urlList.append(baseurl + href)
    return urlList

def getContent(html):
    """Save the article contained in ``html`` to ``<title>.txt``.

    The title is read from the first link inside ``div.article_title`` and
    the body text from ``div.article_content``. The file is written to the
    current working directory. Failures are printed and the article is
    skipped; nothing is raised to the caller.

    Args:
        html: Source of a single article page.
    """
    page = BeautifulSoup(html, 'lxml')
    try:
        title = page.find('div', class_='article_title').find('a').text
        title = title.strip()
    except Exception as e:
        # Without a title there is no file name to write to, so stop here.
        print(e)
        return
    print(title)

    try:
        content = page.find('div', class_='article_content')
        with open(title + '.txt', 'w') as f:
            f.write(content.text)
    except Exception as e:
        # Best effort: report the problem and let the caller continue with
        # the next article.
        print(e)

# Fetch the index page, then download and save every article it links to.
html = getPage(url)

# A failed index download yields no article list rather than a parser error.
urls = geturl(html, url) if html else []

for article_url in urls:
    print(article_url)
    htmltest = getPage(article_url)
    if htmltest:
        getContent(htmltest)

重新追加一版

这个版本修正了一些错误

#coding:utf-8

import urllib2,re,time,random,os,datetime

import HTMLParser

from bs4 import BeautifulSoup

import sys

# Python 2 idiom: reload sys and make UTF-8 the interpreter's default string
# encoding, which the byte/unicode string mixing below depends on.
reload(sys)

sys.setdefaultencoding('utf-8')

def self_log(msg):
    """Print ``msg`` prefixed with the current local date and time."""
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
    line = u'%s: %s' % (timestamp, msg)
    print(line)

def get_html(url):
    """Download ``url`` with a browser User-Agent and unescape HTML entities.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page source after ``HTMLParser.unescape``, or ``None`` when the
        request fails (the error code or reason is printed).
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        print(e.code)
        return None
    except urllib2.URLError as e:
        # No HTTP response at all (DNS failure, refused connection, ...).
        print(e.reason)
        return None
    # NOTE(review): entities are unescaped across the whole document before
    # it is parsed, so escaped markup inside articles (e.g. '&lt;div&gt;')
    # becomes real tags for BeautifulSoup. Kept as-is; confirm it is wanted.
    return HTMLParser.HTMLParser().unescape(html)

def get_last_page(html, fd):
    """Return the number of listing pages advertised by the blog's pager.

    The count is taken from the trailing number in the ``href`` of the last
    link inside ``div.pagelist``.

    Args:
        html: Source of the first listing page.
        fd: Open, writable file object; the page count is recorded in it.

    Returns:
        The page count as an ``int``; ``1`` when the page has no pager or
        the last link carries no number; ``None`` when ``html`` is empty.
    """
    if not html:
        self_log(u'页面错误，停止运行')
        return None

    page = BeautifulSoup(html, 'lxml')
    pager = page.find('div', class_='pagelist')
    links = pager.find_all('a') if pager is not None else []
    if not links:
        # No pager on the page: everything fits on a single page.
        return 1

    href = links[-1].get('href') or ''
    # Take the whole trailing number, not just the final character, so that
    # ten or more pages are counted correctly.
    match = re.search(r'([0-9]+)[^0-9]*$', href)
    if match is None:
        return 1

    last_page = int(match.group(1))
    self_log('总共有%s 页博客' % last_page)
    fd.write('总共有%s 页博客\n' % last_page)
    return last_page

def get_items(url):
    """Fetch a listing page and return its article entry elements.

    Args:
        url: Address of the listing page.

    Returns:
        The ``div`` elements carrying the classes ``list_item list_view``,
        or an empty list when the page could not be downloaded.
    """
    content_html = get_html(url)
    if not content_html:
        # Download failed; report "no entries" instead of handing an empty
        # document to the parser.
        return []
    page = BeautifulSoup(content_html, 'lxml')
    return page.find_all('div', class_='list_item list_view')

def handle_items(items, content_list, read_num_for_sort):
    """Extract one record per article entry and append it to ``content_list``.

    Each record is a dict with the keys ``indexs`` (numeric read count),
    ``title``, ``read_times`` (read-count text as shown on the page),
    ``comments_time`` (comment-count text) and ``content_url``.

    Args:
        items: Article entry elements of one listing page.
        content_list: List the records are appended to (modified in place).
        read_num_for_sort: List the numeric read counts are appended to
            (modified in place).
    """
    for item in items:
        title = item.find('a')
        view_span = item.find('span', class_='link_view')
        comment_span = item.find('span', class_='link_comments')

        href = title.get('href') if title is not None else None
        if not href or view_span is None:
            # Entry lacks the link or the read counter: nothing to record.
            continue

        read_times = view_span.text.strip()
        # Keep only the ASCII digits of the read-count text so the entries
        # can be sorted numerically later; no digits counts as zero reads.
        digits = re.sub(r'[^0-9]', '', read_times)
        read_number = int(digits) if digits else 0
        read_num_for_sort.append(read_number)

        if comment_span is not None:
            comments_time = comment_span.text.strip()
        else:
            comments_time = ''

        content_list.append({
            'indexs': read_number,
            'title': title.text.strip(),
            'read_times': read_times,
            'comments_time': comments_time,
            'content_url': 'http://blog.csdn.net' + href,
        })

def mkdir_folder(path):
    """Create ``path`` (including missing parents) unless it already exists."""
    if os.path.exists(path):
        return
    os.makedirs(path)

def getContent(html, dir_path):
    """Save the article contained in ``html`` as ``<dir_path>/<title>.txt``.

    The title is read from the first link inside ``div.article_title`` and
    the body text from ``div.article_content``. Failures are printed and the
    article is skipped; nothing is raised to the caller.

    Args:
        html: Source of a single article page.
        dir_path: Existing directory the text file is written into.
    """
    try:
        page = BeautifulSoup(html, 'lxml')
        title = page.find('div', class_='article_title').find('a').text
        title = title.strip()
    except Exception as e:
        # No parsable page or no title means no file name: skip the article.
        print(e)
        return

    # The extension is appended exactly once (it used to be added twice,
    # producing '<title>.txt.txt').
    artitle_name_path = dir_path + '/' + title + '.txt'
    try:
        content = page.find('div', class_='article_content')
        with open(artitle_name_path, 'w') as f:
            f.write(content.text)
        self_log(u'存贮文章：%s 完毕' % title)
    except Exception as e:
        # Best effort: report the problem and let the caller continue with
        # the next article.
        print(e)

def run_to_get_article(content_total_list, dir_path):
    """Download every listed article and store it under ``dir_path``.

    Args:
        content_total_list: Summary lines of '|'-separated fields whose last
            field is the article URL.
        dir_path: Directory the article text files are written into.
    """
    self_log('start save every article ')
    for article_content in content_total_list:
        # Take the URL from the right-hand end: an article title may itself
        # contain '|', which would shift a fixed field index.
        article_url = article_content.rsplit('|', 1)[-1]
        self_log( '将要存贮的地址是： %s ...' % article_url)
        artitle_html = get_html(article_url)
        if not artitle_html:
            # Download failed (get_html already printed the error); move on.
            continue
        getContent(artitle_html, dir_path)

def get_blocker_name(url):
    """Return the blogger name embedded in a blog URL.

    The name is the first path segment after the '.net' host part; it is
    also used by the caller as the name of the storage directory.

    Args:
        url: Blog address such as
            'http://blog.csdn.net/<name>?viewmode=contents' or
            'http://blog.csdn.net/<name>/article/category/<id>'.

    Returns:
        The first path segment, with any query string removed.

    Raises:
        IndexError: If ``url`` contains no '.net' or no path after it.
    """
    path = url.split('.net')[1]
    # Always drop the query string, whatever its parameters are; it used to
    # be removed only when it mentioned 'viewmode'.
    path = path.split('?')[0]
    return path.split('/')[1]

def _build_page_url(url, page_no):
    """Return the address of listing page ``page_no`` for the blog at ``url``."""
    base, sep, query = url.partition('?')
    if 'category' not in url:
        return base + '/article/list/%d?viewmode=contents' % page_no
    # Category listings: the page number belongs to the path, so it goes in
    # front of the query string rather than after it.
    return '%s/%s%s%s' % (base, page_no, sep, query)


def run(url, dir_path):
    """Build a read-count-sorted index of all articles of a blog.

    Creates ``dir_path``, walks every listing page reachable from ``url``,
    and writes one summary line per article (ascending read count) to a
    timestamp-named text file inside ``dir_path``; each line is also printed.

    Args:
        url: Address of the blog home page or of one category listing.
        dir_path: Directory for the index file; created when missing.

    Returns:
        The summary lines, each formatted as
        '第N篇|title|read text|comment text|article url'. Empty when the
        start page could not be downloaded.
    """
    read_num_for_sort = []
    content_list = []
    content_total_list = []

    mkdir_folder(dir_path)
    count_file_name = dir_path+'/'+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')+'.txt'

    # The context manager closes the index file even if a page fails midway.
    with open(count_file_name, 'w') as fd:
        # 1. Enter through the start page to learn how many pages exist.
        main_html = get_html(url)
        last_page = get_last_page(main_html, fd)
        if not main_html:
            # get_last_page has already logged the page error.
            return content_total_list

        # The page count may arrive as text, a number or None; normalise it
        # before comparing.
        try:
            page_total = int(last_page)
        except (TypeError, ValueError):
            page_total = 1

        if page_total > 1:
            # 2. Load each listing page in turn and extract its entries.
            for i in range(1, page_total + 1):
                main_url = _build_page_url(url, i)
                self_log('即将获取第%d页的内容，地址是：%s' % (i,main_url))
                items = get_items(main_url)
                handle_items(items, content_list, read_num_for_sort)
        else:
            items = get_items(url)
            handle_items(items, content_list, read_num_for_sort)

        self_log('总共有%d 篇文章' % len(content_list))

        # 3. Order the records by their numeric read count, lowest first.
        content_list = sorted(content_list, key=lambda record: record['indexs'])

        for article_index, a in enumerate(content_list, 1):
            totalcontent= '第'+str(article_index)+'篇|'+a['title']+'|'+a['read_times']+'|'+a['comments_time']+'|'+a['content_url']
            print(totalcontent)
            # Persist the same line to the index file.
            fd.write(totalcontent)
            fd.write('\n')
            content_total_list.append(totalcontent)

    return content_total_list

if __name__ == '__main__':
    # Startup banner.
    print('''

*****************************************

** Welcome to Spider of Count CSDN **

** Created on 2017-05-07 **

** @author: Jimy_Fengqi **

*****************************************

''')

    # Start page to crawl; the commented lines are alternative category
    # listings that can be swapped in.
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    #url='http://blog.csdn.net/qiqiyingse/article/category/6292432?viewmode=contents'
    #url='http://blog.csdn.net/zuoxiaolong8810/article/category/1434962?viewmode=contents'

    # The blogger name doubles as the output directory name.
    dir_path = get_blocker_name(url)

    # First build the article index, then download every indexed article.
    content_total_list = run(url, dir_path)
    run_to_get_article(content_total_list, dir_path)

松直

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python免费下载csdn_12_python爬虫——下载个人CSDN博客内容

下载个人博客内容：可以是主页的内容，也可以是每个分类下的内容。只需要把传入的URL地址修改一下就OK了。但是修改传入的URL时，记得检查一下：如果传入的URL不带 '?viewmode=contents'，那么只能得到五篇内容。 #coding:utf-8 import webbrowser as web; import os; import time; import random; import urllib2,sys ...
复制链接

扫一扫