Copyright notice: original work by Jimy_Fengqi. Reposting is welcome; please credit http://blog.csdn.net/qiqiyingse
Download the contents of a personal blog.
This works for the blog's home page as well as for each category page; all you need to change is the URL that is passed in.
When you change that URL, remember to check one thing: if it does not carry '?viewmode=contents', you will only get five articles per page.
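As a guard against forgetting the parameter, the URL can be normalized before it is passed in. A minimal sketch, using a hypothetical helper name `ensure_contents_view` that is not part of the original script:

[python]
# Hypothetical helper (not in the original script): make sure the URL
# carries 'viewmode=contents', otherwise CSDN lists only a few articles.
def ensure_contents_view(url):
    if 'viewmode=contents' in url:
        return url
    sep = '&' if '?' in url else '?'
    return url + sep + 'viewmode=contents'

print ensure_contents_view('http://blog.csdn.net/qiqiyingse')
# -> http://blog.csdn.net/qiqiyingse?viewmode=contents

The full script follows.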
[python]
#coding:utf-8
import urllib2, sys
from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding('utf-8')

print '''
This script downloads the articles under a CSDN personal blog.
'''

# Adjust this URL as needed, but keep the trailing '?viewmode=contents';
# otherwise each page only lists a limited number of articles.
url = 'http://blog.csdn.net/qiqiyingse/article/category/6292432?viewmode=contents'
baseurl = 'http://blog.csdn.net'

def getPage(url):  # pose as a browser and fetch the page source
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
        return None
    # the source is handed to BeautifulSoup for parsing later
    return html

def geturl(html, url):
    urlList = []
    page = BeautifulSoup(html, 'lxml')
    items = page.find_all('div', class_='list_item list_view')  # one div per article
    '''
    if 'category' in url:
        items = page.find_all('div', class_='list_item article_item')
    else:
        items = page.find_all('div', class_='list_item list_view')
    '''
    print len(items)
    for item in items:
        content = item.find('a')
        article_url = content.get('href')    # relative link of the article
        article_url = baseurl + article_url  # turn it into an absolute address
        urlList.append(article_url)
    return urlList

def getContent(html):
    page = BeautifulSoup(html, 'lxml')
    try:
        title = page.find('div', class_='article_title').find('a').text
        title = title.strip()
        print title
    except Exception, e:
        print e
        return
    try:
        content = page.find('div', class_='article_content')
        with open(title + '.txt', 'w') as f:
            f.write(content.text)
    except Exception, e:
        print e

html = getPage(url)
urls = geturl(html, url)
count = 0
while count < len(urls):  # one iteration per article in the list
    print urls[count]
    htmltest = getPage(urls[count])
    getContent(htmltest)
    count = count + 1
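One caveat about this first version: it writes each article to `title + '.txt'` directly, so a title containing a character such as '/' will make `open()` fail. A minimal sketch of a sanitizing helper; `safe_filename` is a hypothetical addition, not part of the original code:

[python]
import re

# Hypothetical helper: replace characters that are illegal in file names.
def safe_filename(title):
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

# Usage inside getContent():
#   with open(safe_filename(title) + '.txt', 'w') as f:
#       f.write(content.text)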
Here is an updated version.
It fixes several errors in the script above.
[python]
#coding:utf-8
import urllib2, re, time, random, os, datetime
import HTMLParser
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# custom logging helper
def self_log(msg):
    print u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg)

# fetch the page source
def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
        html = HTMLParser.HTMLParser().unescape(html)
        return html
    except urllib2.HTTPError, e:
        print e.code

# get the total number of blog pages
def get_last_page(html, fd):
    if not html:
        self_log(u'page error, aborting')
        return
    page = BeautifulSoup(html, 'lxml')
    try:
        last_page = page.find('div', class_='pagelist').find_all('a')
        # the last character of the last pagination link is the page count
        last_page = last_page[len(last_page) - 1].get('href')[-1:]
        self_log('%s pages of blog posts in total' % last_page)
        fd.write('%s pages of blog posts in total\n' % last_page)
        return last_page
    except Exception, e:
        return 1

# fetch the article items of one listing page
def get_items(url):
    content_html = get_html(url)
    page = BeautifulSoup(content_html, 'lxml')
    items = page.find_all('div', class_='list_item list_view')
    return items

# extract the fields we need from each item in the list
def handle_items(items, content_list, read_num_for_sort):
    for item in items:
        temp = {}  # scratch dict for one article
        title = item.find('a')  # title
        content_url = 'http://blog.csdn.net' + title.get('href')  # address of the article
        read_times = item.find('span', class_='link_view').text.strip()  # read count
        comments_time = item.find('span', class_='link_comments')  # comment count
        # keep only the digits of the read count; used for sorting later
        read_number = int(filter(str.isdigit, str(read_times)))
        read_num_for_sort.append(read_number)
        # pack the data
        temp['indexs'] = read_number
        temp['title'] = title.text.strip()
        temp['read_times'] = read_times
        temp['comments_time'] = comments_time.text.strip()
        temp['content_url'] = content_url
        content_list.append(temp)

# create the output folder
def mkdir_folder(path):
    if not os.path.exists(path):
        os.makedirs(path)

# extract one article page and store it
def getContent(html, dir_path):
    page = BeautifulSoup(html, 'lxml')
    try:
        title = page.find('div', class_='article_title').find('a').text
        title = title.strip()
    except Exception, e:
        print e
        return
    try:
        content = page.find('div', class_='article_content')
        article_name_path = dir_path + '/' + title + '.txt'
        with open(article_name_path, 'w') as f:
            f.write(content.text)
        self_log(u'saved article: %s' % title)
    except Exception, e:
        print e

# save every article to disk
def run_to_get_article(content_total_list, dir_path):
    self_log('start saving every article')
    for article_content in content_total_list:
        article_url = article_content.split('|')[4]
        self_log('about to save from: %s ...' % article_url)
        article_html = get_html(article_url)
        getContent(article_html, dir_path)

# derive the blogger's name from the URL; it doubles as the storage folder name
def get_blogger_name(url):
    if 'viewmode' in url:
        print url.split('.net')[1]
        print url.split('.net')[1].split('?')[0].split('/')[1]
        return url.split('.net')[1].split('?')[0].split('/')[1]
    else:
        print url.split('.net')[1]
        print url.split('.net')[1].split('/')[1]
        return url.split('.net')[1].split('/')[1]

# main driver
def run(url, dir_path):
    read_num_for_sort = []
    content_list = []
    content_total_list = []
    # create the output folder
    mkdir_folder(dir_path)
    # name the summary file after the current time
    count_file_name = dir_path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
    fd = open(count_file_name, 'w')
    # 1. load the main page and read the total page count
    main_html = get_html(url)
    last_page = get_last_page(main_html, fd)
    if last_page and int(last_page) > 1:
        # 2. build the URL of every listing page, load it, and extract what we need
        for i in range(1, int(last_page) + 1):
            if 'category' not in url:
                main_url = url.split('?')[0] + '/article/list/%d?viewmode=contents' % i
            else:
                main_url = url + '/%s' % i
            self_log('fetching page %d, address: %s' % (i, main_url))
            items = get_items(main_url)  # article items of this page
            handle_items(items, content_list, read_num_for_sort)
    else:
        items = get_items(url)  # single listing page
        handle_items(items, content_list, read_num_for_sort)
    self_log('%d articles in total' % len(content_list))
    # sort by the 'indexs' value (read count), ascending
    # a neat way to sort a list of dicts by one of their values
    content_list = sorted(content_list, cmp=lambda x, y: cmp(x['indexs'], y['indexs']), reverse=0)
    article_index = 1
    for a in content_list:
        # assemble the summary line
        totalcontent = 'No.' + str(article_index) + '|' + a['title'] + '|' + a['read_times'] + '|' + a['comments_time'] + '|' + a['content_url']
        #self_log(totalcontent)
        print totalcontent
        # store it locally
        fd.write(totalcontent)
        fd.write('\n')
        article_index += 1
        content_total_list.append(totalcontent)
    fd.close()
    return content_total_list

if __name__ == '__main__':
    print '''
    *****************************************
    **   Welcome to Spider of Count CSDN   **
    **       Created on 2017-05-07         **
    **       @author: Jimy_Fengqi          **
    *****************************************
    '''
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    #url = 'http://blog.csdn.net/qiqiyingse/article/category/6292432?viewmode=contents'
    #url = 'http://blog.csdn.net/zuoxiaolong8810/article/category/1434962?viewmode=contents'
    dir_path = get_blogger_name(url)
    content_total_list = run(url, dir_path)
    run_to_get_article(content_total_list, dir_path)
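A side note on the sort inside run(): the `cmp=` argument works in Python 2 but was removed in Python 3. In case the script is ever ported, a `key=`-based equivalent, sketched under that assumption:

[python]
# Ascending by read count, same result as the cmp-based call above:
content_list = sorted(content_list, key=lambda x: x['indexs'])
# Descending instead, i.e. most-read articles first:
content_list = sorted(content_list, key=lambda x: x['indexs'], reverse=True)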