Downloading Baidu Space Blog Posts [python]

When Baidu Space was overhauled in 2012, I wrote a download tool:

http://blog.csdn.net/luosiyong/article/details/7713664

Recently, Baidu Space announced it is being migrated into Baidu Cloud.


From the first post, written at 2007-06-01 22:01, to the last, written at 2015-03-26 20:06, there are 650 posts in total.

The posts are mostly study notes and records of my own experiences. Early on, Baidu provided search within your own Space, which was very handy, and other users could also find many of the posts just by searching from Baidu's home page.

After the 2012 redesign I no longer liked Baidu Space much, and I wrote far fewer posts.


On 2013-06-07 the page views passed seven digits, which I made a note of at the time with a quiet grin.

Later on, posts found through Baidu search could no longer be opened for reading; page views fell off a cliff and stayed that way until now.


Baidu Space shuts down tomorrow, so it is better to download the posts and keep an offline copy. Compared with the code from roughly three years ago, I changed a few things:

Crawling all the post links (the post-URL shape being matched is sketched right after this list):

1. Switched to multi-threading to speed up link collection.

2. Added command-line options, making it easy to set the various parameters and to crawl other users' posts.
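For orientation before the full script: a post page on Baidu Space sits at hi.baidu.com/&lt;username&gt;/item/&lt;id&gt;, and the spider simply collects every link on a fetched page that matches that shape, then prefixes it back into a full URL. A minimal sketch of that matching step (the item id below is made up purely for illustration):

# -*- coding: utf-8 -*-
# Sketch of the post-link matching urlspider.py performs; the item id is invented.
import re

prefix = 'http://hi.baidu.com/'
username = 'luosiyong'
pattern = re.compile(r'%s/item/\w+' % username)

sample_html = '<a href="/luosiyong/item/0123456789abcdef">some post</a>'
for match in pattern.findall(sample_html):
	print(prefix + match)	# http://hi.baidu.com/luosiyong/item/0123456789abcdef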

Downloading and saving all posts and images:

1. Switched to multi-threading to improve speed.

2. Switched information extraction from regex matching to BeautifulSoup; it pulls out the post title, the timestamp, and the list of images to download, and each post is saved locally under a "time + title" name (a small sketch of the naming follows this list).

3. Added command-line arguments.

4. Added log output.
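The "time + title" naming from item 2 boils down to roughly the following; the sample timestamp and title are invented, and the stripped characters mirror ILLEGAL_PATH_CHARS_PATTERN in downloader.py:

# -*- coding: utf-8 -*-
# Sketch of how a post's local filename is built; the sample values are invented.
import re

ILLEGAL_PATH_CHARS_PATTERN = r'[\\/:*?\"<>|&;]'	# characters that are awkward in file names

content_time = '2013-06-07 20:15'
content_title = 'C/C++ study notes: pointers?'

filename = '%s %s' % (content_time, content_title)
filename = re.sub(ILLEGAL_PATH_CHARS_PATTERN, '', filename).strip()
print(filename)	# 2013-06-07 2015 CC++ study notes pointers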

urlspider.py

# -*- coding: utf-8 -*-

import re
import os
import sys
import time
import getopt
import urllib
import logging
import threading

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

prefix = 'http://hi.baidu.com/'
pattern = None

usage_format = '''
python %s [options] <username>
options:
	-h [--help]		produce help message
	-t [--thread]	work thread count
	-o [--output]	output file name
'''

url_queue = []
url_set = set()
running_count = 0
visited_count = 0
complete = False

lock = threading.Lock()

def usage():
	print usage_format % (os.path.basename(sys.argv[0]))

def running_log():
	logging.debug('running thread count: {0}, visited count: {1}'.format(running_count, visited_count))

def url_alloc():
	global running_count
	lock.acquire()
	url = None
	if url_queue:
		url = url_queue.pop(0)
		running_count += 1
		running_log()
	lock.release()
	return url

def url_done(url, url_list):
	global running_count
	global visited_count
	lock.acquire()
	if pattern.findall(url):
		visited_count += 1
		running_log()
	for u in url_list:
		if u not in url_set:
			url_set.add(u)
			url_queue.append(u)
	running_count -= 1
	running_log()
	if running_count <= 0 and not url_queue:
		global complete
		complete = True
	lock.release()

class WorkThread(threading.Thread):
	def run(self):
		while not complete:
			url = url_alloc()
			if not url:
				time.sleep(0.1)	# back off while the queue is empty
				continue
			try:
				f = urllib.urlopen(url)
				code, content = f.code, f.read()
				logging.debug('url: {0}, response: {1}'.format(url, code))
				links = [prefix + x for x in pattern.findall(content)]
			except IOError, e:
				logging.error('fetch failed: {0}, {1}'.format(url, e))
				links = []
			# always call url_done so running_count stays balanced and the crawl can finish
			url_done(url, links)

def main():
	try:
		opts, args = getopt.getopt(sys.argv[1:], 'ht:o:', ['help', 'thread=', 'output='])
	except getopt.GetoptError, err:
		print str(err)
		usage()
		sys.exit(1)
	thread_count = 30			# default work thread count
	output_file = 'list.txt'	# default output file name
	for opt, value in opts:
		if opt in ('-h', '--help'):
			usage()
			sys.exit()
		elif opt in ('-t', '--thread'):
			thread_count = int(value)
		elif opt in ('-o', '--output'):
			output_file = value
		else:
			assert False, 'unhandled option'
	if not args:
		usage()
		sys.exit()
	username = args[0]
	global pattern
	pattern = re.compile(r'%s/item/\w+' % username)
	url_queue.append(prefix + username)
	tlist = []
	for i in xrange(thread_count):
		t = WorkThread()
		t.start()
		tlist.append(t)
	for t in tlist:
		t.join()
	f = open(output_file, 'w')
	f.writelines([x + '\n' for x in url_set])
	f.close()

if __name__ == '__main__':
	main()
downloader.py

# -*- coding: utf-8 -*-

import os
import re
import sys
import time
import urllib
import getopt
import logging
import threading

from bs4 import BeautifulSoup

logger = logging.getLogger('downloader')
logger.setLevel(logging.DEBUG)

fh = logging.FileHandler('run.log')
fh.setLevel(logging.ERROR)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

formatter = logging.Formatter(fmt='%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(formatter)
ch.setFormatter(formatter)

logger.addHandler(fh)
logger.addHandler(ch)

ILLEGAL_PATH_CHARS_PATTERN = r'[\\/:*?\"<>|&;]'

usage_format = '''
python %s [options] <filename>
options:
	-h [--help]		produce help message
	-t [--thread]	work thread count
	-o [--output]	output path
'''

url_queue = []
running_count = 0
visited_count = 0
complete = False

lock = threading.Lock()

def usage():
	print usage_format % (os.path.basename(sys.argv[0]))

def running_log():
	logger.debug('running thread count: {0}, visited count: {1}'.format(running_count, visited_count))

def url_alloc():
	global running_count
	lock.acquire()
	url = None
	if url_queue:
		url = url_queue.pop(0)
		running_count += 1
		running_log()
	lock.release()
	return url

def url_done(url):
	global running_count
	global visited_count
	lock.acquire()
	visited_count += 1
	running_count -= 1
	running_log()
	if running_count <= 0 and not url_queue:
		global complete
		complete = True
	lock.release()

def download(url, filename):
	logger.debug('download: {0}'.format(url))
	try:
		urllib.urlretrieve(url, filename)
	except Exception, e:
		logger.error('download failed %s' % url)
		logger.error(str(e))

def save(url, path):
	logger.debug('save: {0}'.format(url))
	f = urllib.urlopen(url)
	code, content = f.code, f.read()
	content = content.decode('utf8')
	soup = BeautifulSoup(content, 'html.parser')	# pin the parser so results do not depend on which backends are installed
	logger.debug('title: %s' % soup.title.string)
	content_title = ''
	content_time = ''
	res = soup.find(attrs={'class': 'title content-title'})
	if res:
		content_title = res.get_text().strip()
	res = soup.find(attrs={'class': 'content-other-info'})
	if res:
		content_time = res.span.get_text()
	logger.debug('content time: %s, content title: %s' % (content_time, content_title))
	img_list = [img.get('src') for img in soup.find_all('img') if img.get('src')]	# skip <img> tags without a src attribute
	filename = '%s %s' % (content_time, content_title)
	filename = re.sub(ILLEGAL_PATH_CHARS_PATTERN, '', filename)
	filename = filename.strip()
	path = os.path.join(path, filename)
	if not os.path.exists(path):
		try:
			os.makedirs(path)
		except:
			logger.error('makedirs error: %s' % path)
			return
	for image in img_list:
		image_filename = os.path.basename(image)
		download(image if image.startswith('http') else '/'.join([os.path.dirname(url), image]), os.path.join(path, image_filename))
		content = content.replace(image, '/'.join([filename, image_filename]))
	f = open(path + '.html', 'wb')
	f.write(content.encode('utf8'))
	f.close()

class WorkThread(threading.Thread):
	def __init__(self, path):
		super(WorkThread, self).__init__()
		self.path = path

	def run(self):
		while not complete:
			url = url_alloc()
			if not url:
				time.sleep(0.1)	# back off while the queue is empty
				continue
			try:
				save(url, self.path)
			except Exception, e:
				logger.error('save failed %s' % url)
				logger.error(str(e))
			# always call url_done so running_count stays balanced
			url_done(url)

def main():
	try:
		opts, args = getopt.getopt(sys.argv[1:], 'ht:o:', ['help', 'thread=', 'output='])
	except getopt.GetoptError, err:
		print str(err)
		usage()
		sys.exit(1)
	thread_count = 30			# default work thread count
	output_path = 'webpages'	# default output directory
	for opt, value in opts:
		if opt in ('-h', '--help'):
			usage()
			sys.exit()
		elif opt in ('-t', '--thread'):
			thread_count = int(value)
		elif opt in ('-o', '--output'):
			output_path = value
		else:
			assert False, 'unhandled option'
	if not args:
		usage()
		sys.exit()
	filename = args[0]
	f = open(filename, 'r')
	for line in f.readlines():
		url_queue.append(line.strip())
	f.close()
	tlist = []
	for i in xrange(thread_count):
		t = WorkThread(output_path)
		t.start()
		tlist.append(t)
	for t in tlist:
		t.join()

if __name__ == '__main__':
	main()
1. Crawl all the post links:

python urlspider.py luosiyong

2. Download all the posts:

python downloader.py list.txt
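The first command writes every collected post URL into list.txt, one per line (change the file with -o); the second reads that file and saves each post under webpages/ (also changeable with -o) as "<time> <title>.html", downloads the post's images into a folder of the same name next to it, and rewrites the image links inside the saved page to point at that folder.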
