python爬虫爬取博客_Python爬虫抓取csdn博客

最新推荐文章于 2024-08-03 20:32:40 发布

weixin_39777875

最新推荐文章于 2024-08-03 20:32:40 发布

阅读量1k

点赞数

关键词由CSDN通过智能技术生成

Python爬虫抓取csdn博客

昨天晚上为了下载保存某位csdn大牛的全部博文，写了一个爬虫来自动抓取文章并保存到txt文本，当然也可以保存到html网页中。这样就可以不用Ctrl+C 和Ctrl+V了，非常方便，抓取别的网站也是大同小异。

为了解析抓取的网页，用到了第三方模块，BeautifulSoup，这个模块对于解析html文件非常有用，当然也可以自己使用正则表达式去解析，但是比较麻烦。

由于csdn网站的robots.txt文件禁止任何爬虫抓取，所以必须把爬虫伪装成浏览器；而且不能频繁抓取，每次抓取后要sleep一会儿再继续，否则抓取过于频繁会被封IP（也可以使用代理IP来规避）。

#-*- encoding: utf-8 -*-

'''

Created on 2014-09-18 21:10:39

@author: Mangoer

@email: 2395528746@qq.com

'''

import urllib2

import re

from bs4 import BeautifulSoup

import random

import time

class CSDN_Blog_Spider(object):
    """Fetch one CSDN blog article and append its text to ``out.txt``.

    Constructing an instance does all the work for a single article: it
    requests ``url`` with a randomly chosen browser User-Agent, extracts the
    title and the article text, and appends the text to ``out.txt`` in the
    current working directory.

    Attributes set by ``__init__``:
        page: the downloaded HTML, decoded as GBK and re-encoded as UTF-8.
        title: the value returned by ``getTitle()``.
        content: the value returned by ``getContent()``.
    """

    # Browser User-Agent strings; one is picked at random per request so the
    # crawler does not identify itself as urllib2.
    USER_AGENTS = (
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    )

    # NOTE(review): the three patterns below are reconstructions. In the
    # published listing every regex literal was cut off at its first HTML
    # tag, leaving unterminated strings. Confirm them against real pages.
    _TITLE_RE = re.compile('<title>(.*?)</title>', re.DOTALL)
    # Captures each run of text between a '>' and the next '<'.
    _TEXT_RE = re.compile(r'>([^<]*)<')
    _LINK_RE = re.compile(r'<a href="(.*?)"', re.DOTALL)

    def __init__(self, url):
        """Download ``url``, parse it, and append the article text to out.txt.

        Raises whatever ``urllib2.urlopen`` raises when the request fails.
        """
        print('\n')
        print('已启动网络爬虫。。。')
        print('网页地址： ' + url)

        # To go through a proxy instead, install an opener before the
        # request, e.g.:
        #   proxy = urllib2.ProxyHandler({'http': 'http://' + ip})
        #   urllib2.install_opener(urllib2.build_opener(proxy))

        req = urllib2.Request(url)
        req.add_header('User-Agent', random.choice(self.USER_AGENTS))
        req.add_header('Host', 'blog.csdn.net')
        req.add_header('Accept', '*/*')
        req.add_header('Referer', 'http://blog.csdn.net/mangoer_ys?viewmode=list')
        # The original also sent a header literally named "GET"; that is not
        # an HTTP header (urllib2 already issues a GET), so it is dropped.

        response = urllib2.urlopen(req)
        try:
            raw = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

        # NOTE(review): assumes the server sends GBK; bytes that are invalid
        # in GBK are silently dropped. Confirm against the response charset.
        self.page = raw.decode('gbk', 'ignore').encode('utf-8')
        self.title = self.getTitle()
        self.content = self.getContent()
        self.saveFile()

    def printInfo(self):
        """Print the article title and a note that the text was saved."""
        print('文章标题是： ' + self.title + '\n')
        print('内容已经存储到out.txt文件中！')

    def getTitle(self):
        """Return the text inside the page's <title> tag, or 'NO TITLE'."""
        match = self._TITLE_RE.search(self.page)
        if match:
            return match.group(1)
        return 'NO TITLE'

    def getContent(self):
        """Return the article text with all markup removed.

        Returns an empty string when the page has no
        ``<div id="article_content" class="article_content">`` element
        (the original raised IndexError in that case).
        """
        soup = BeautifulSoup(self.page)
        containers = soup.findAll(
            'div', {'id': 'article_content', 'class': 'article_content'})
        if not containers:
            return ''
        return self._extract_text(str(containers[0]))

    @classmethod
    def _extract_text(cls, html_fragment):
        """Join every non-blank text run found between tags in the fragment."""
        pieces = cls._TEXT_RE.findall(html_fragment)
        return ''.join(p for p in pieces if p and not p.isspace())

    def saveFile(self):
        """Append ``self.content`` to ``out.txt`` and close the file."""
        with open('out.txt', 'a') as outfile:
            outfile.write(self.content)

    def getNextArticle(self):
        """Return the absolute URL of the next article to crawl, or None.

        The link is taken from the first ``<li class="prev_article">``
        element. None is returned when that element or its link is missing.
        """
        soup = BeautifulSoup(self.page)
        items = soup.findAll('li', {'class': 'prev_article'})
        if not items:
            return None
        return self._extract_next_url(str(items[0]))

    @classmethod
    def _extract_next_url(cls, html_fragment):
        """Build the absolute URL from the first href in the fragment, or None."""
        link = cls._LINK_RE.search(html_fragment)
        if link:
            return 'http://blog.csdn.net' + link.group(1)
        return None

class Scheduler(object):
    """Crawl a chain of articles, starting at one URL and following links.

    Each article is fetched by constructing a ``CSDN_Blog_Spider``; the next
    URL comes from that spider's ``getNextArticle()``.
    """

    def __init__(self, url, delay=10):
        """Remember the start URL and the pause between downloads.

        Args:
            url: address of the first article to download.
            delay: seconds to sleep after each follow-up download, so the
                site is not hit too often. Defaults to the original 10.
        """
        self.start_url = url
        self.delay = delay

    def start(self):
        """Download articles until a page offers no further link."""
        spider = CSDN_Blog_Spider(self.start_url)
        spider.printInfo()
        while True:
            # Ask once per page: every call re-parses the whole document.
            next_url = spider.getNextArticle()
            if not next_url:
                print('All article haved been downloaded!')
                break
            spider = CSDN_Blog_Spider(next_url)
            spider.printInfo()
            time.sleep(self.delay)

# Entry point: crawl from a fixed article URL.
# To ask the user for the address instead, read it from standard input
# and assign it to ``url`` in place of the literal below.
url = "http://blog.csdn.net/mangoer_ys/article/details/38427979"
_scheduler = Scheduler(url)
_scheduler.start()

程序中有个问题一直没能解决：无法用文章标题来命名文件，所以所有文章都保存在同一个out.txt中。这应该是编码方面的问题，希望有大神可以帮忙解决。

weixin_39777875

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫