When Baidu Space was revamped in 2012, I wrote a download tool:
http://blog.csdn.net/luosiyong/article/details/7713664
Recently Baidu Space announced it is being migrated into Baidu Cloud.
From the first post, written at 2007-06-01 22:01, to the last, written at 2015-03-26 20:06, there are 650 posts in total.
The posts are mostly study notes and records of my own experiences. Early on, Baidu offered search within your own space, which was very handy, and other users could find many of the posts straight from Baidu's front-page search.
After the 2012 redesign I no longer cared much for Baidu Space, and I wrote far fewer posts.
On 2013-06-07 the visit count passed seven digits, which I noted at the time.
Later on, posts found through Baidu search could no longer be opened from the results page, and traffic has fallen off a cliff ever since.
Baidu Space shuts down tomorrow, so it is better to download the posts and keep an offline copy. Compared with the code from almost three years ago, I changed a few things:
Crawling all the post links:
1. Switched to multiple threads to speed up link collection (both scripts share the work-queue pattern sketched after this list).
2. Added command-line options to make it easy to set parameters and to download other users' posts.
Downloading and saving all posts and images:
1. Switched to multiple threads to speed things up.
2. Changed information extraction from regex matching to BeautifulSoup; it extracts the post title, the publish time, and the list of images to download, then saves each post locally as time + title.
3. Added command-line arguments.
4. Added log output.
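Both scripts use the same hand-rolled work-queue pattern, so it is worth sketching once with the Baidu-specific parts stripped out. The subtle bit is the termination condition: a thread may only declare the run complete when the queue is empty and no thread is still mid-job, because a running thread can still enqueue new work. A minimal, self-contained sketch, where process() is just a placeholder for the real fetch/save work:
# -*- coding: utf-8 -*-
# Minimal sketch of the work-queue pattern shared by both scripts.
# process() is a placeholder for the real fetch/save work.
import time
import threading

queue = ['job1', 'job2', 'job3']  # placeholder jobs
running = 0
complete = False
lock = threading.Lock()

def process(job):
    print 'processing', job
    return []  # the spider would return newly discovered jobs here

def alloc():
    # take one job off the queue; None means "nothing to do right now"
    global running
    with lock:
        if queue:
            running += 1
            return queue.pop(0)
        return None

def done(new_jobs):
    global running, complete
    with lock:
        queue.extend(new_jobs)
        running -= 1
        # finish only when nothing is queued AND no thread is mid-job,
        # because a running thread may still enqueue more work
        if running == 0 and not queue:
            complete = True

class Worker(threading.Thread):
    def run(self):
        while not complete:
            job = alloc()
            if job is None:
                time.sleep(0.01)  # avoid spinning while others work
                continue
            done(process(job))

threads = [Worker() for _ in xrange(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()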
urlspider.py
# -*- coding: utf-8 -*-
import re
import os
import sys
import time
import getopt
import urllib
import logging
import threading

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

prefix = 'http://hi.baidu.com/'
pattern = None
usage_format = '''
python %s [options] <username>
options:
-h [--help]    produce help message
-t [--thread]  work thread count
-o [--output]  output file name
'''

url_queue = []      # URLs waiting to be crawled
url_set = set()     # every URL ever seen, used for deduplication
running_count = 0   # number of threads currently fetching a page
visited_count = 0   # number of post pages visited so far
complete = False    # True once the queue is empty and no thread is working
lock = threading.Lock()

def usage():
    print usage_format % (os.path.basename(sys.argv[0]))

def running_log():
    logging.debug('running thread count: {0}, visited count: {1}'.format(running_count, visited_count))

def url_alloc():
    # take one URL off the queue; returns None if the queue is empty
    global running_count
    lock.acquire()
    url = None
    if url_queue:
        url = url_queue.pop(0)
        running_count += 1
        running_log()
    lock.release()
    return url

def url_done(url, url_list):
    # record a finished fetch and enqueue any newly discovered links
    global running_count
    global visited_count
    lock.acquire()
    if pattern.findall(url):
        visited_count += 1
        running_log()
    for u in url_list:
        if u not in url_set:
            url_set.add(u)
            url_queue.append(u)
    running_count -= 1
    running_log()
    if running_count <= 0 and not url_queue:
        global complete
        complete = True
    lock.release()

class WorkThread(threading.Thread):
    def run(self):
        while not complete:
            url = url_alloc()
            if not url:
                time.sleep(0.1)  # avoid busy-waiting while other threads work
                continue
            try:
                f = urllib.urlopen(url)
                code, content = f.code, f.read()
                logging.debug('url: {0}, response: {1}'.format(url, code))
                url_done(url, [prefix + x for x in pattern.findall(content)])
            except Exception, e:
                logging.error('fetch failed: {0}, {1}'.format(url, e))
                url_done(url, [])  # still release the slot so the crawl can finish

def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'ht:o:', ['help', 'thread=', 'output='])
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(1)
    thread_count = 30        # default work thread count
    output_file = 'list.txt' # default output file name
    for opt, value in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit()
        elif opt in ('-t', '--thread'):
            thread_count = int(value)
        elif opt in ('-o', '--output'):
            output_file = value
        else:
            assert False, 'unhandled option'
    if not args:
        usage()
        sys.exit()
    username = args[0]
    global pattern
    pattern = re.compile(r'%s/item/\w+' % username)
    url_queue.append(prefix + username)
    tlist = []
    for i in xrange(thread_count):
        t = WorkThread()
        t.start()
        tlist.append(t)
    for t in tlist:
        t.join()
    f = open(output_file, 'w')
    f.writelines([x + '\n' for x in url_set])
    f.close()

if __name__ == '__main__':
    main()
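After a run, list.txt holds one post URL per line. Given the %s/item/\w+ pattern compiled in main(), the lines look like this (the trailing id is a placeholder, not a real post id):
http://hi.baidu.com/luosiyong/item/<id>
http://hi.baidu.com/luosiyong/item/<id>
...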
downloader.py
# -*- coding: utf-8 -*-
import os
import re
import sys
import time
import urllib
import getopt
import logging
import threading
from bs4 import BeautifulSoup

# log DEBUG and above to the console, but only ERROR and above to run.log
logger = logging.getLogger('downloader')
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler('run.log')
fh.setLevel(logging.ERROR)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt='%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
logger.addHandler(fh)
logger.addHandler(ch)

# characters that are not allowed in file names on common platforms
ILLEGAL_PATH_CHARS_PATTERN = r'[\\/:*?\"<>|&;]'

usage_format = '''
python %s [options] <filename>
options:
-h [--help]    produce help message
-t [--thread]  work thread count
-o [--output]  output path
'''

url_queue = []      # URLs waiting to be downloaded
running_count = 0   # number of threads currently saving a post
visited_count = 0   # number of posts processed so far
complete = False    # True once the queue is empty and no thread is working
lock = threading.Lock()

def usage():
    print usage_format % (os.path.basename(sys.argv[0]))

def running_log():
    logger.debug('running thread count: {0}, visited count: {1}'.format(running_count, visited_count))

def url_alloc():
    # take one URL off the queue; returns None if the queue is empty
    global running_count
    lock.acquire()
    url = None
    if url_queue:
        url = url_queue.pop(0)
        running_count += 1
        running_log()
    lock.release()
    return url

def url_done(url):
    # record a finished download and check whether all work is done
    global running_count
    global visited_count
    lock.acquire()
    visited_count += 1
    running_count -= 1
    running_log()
    if running_count <= 0 and not url_queue:
        global complete
        complete = True
    lock.release()

def download(url, filename):
    logger.debug('download: {0}'.format(url))
    try:
        urllib.urlretrieve(url, filename)
    except Exception, e:
        logger.error('download failed %s' % url)
        logger.error(str(e))

def save(url, path):
    logger.debug('save: {0}'.format(url))
    f = urllib.urlopen(url)
    code, content = f.code, f.read()
    content = content.decode('utf8')
    soup = BeautifulSoup(content, 'html.parser')
    logger.debug('title: %s' % soup.title.string)
    # extract the post title and publish time from the page
    content_title = ''
    content_time = ''
    res = soup.find(attrs={'class': 'title content-title'})
    if res:
        content_title = res.get_text().strip()
    res = soup.find(attrs={'class': 'content-other-info'})
    if res:
        content_time = res.span.get_text()
    logger.debug('content time: %s, content title: %s' % (content_time, content_title))
    # collect the images referenced by the post; skip <img> tags without src
    img_list = [img.get('src') for img in soup.find_all('img') if img.get('src')]
    # the post is saved as "<time> <title>.html" with its images in a
    # directory of the same name
    filename = '%s %s' % (content_time, content_title)
    filename = re.sub(ILLEGAL_PATH_CHARS_PATTERN, '', filename)
    filename = filename.strip()
    path = os.path.join(path, filename)
    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except:
            logger.error('makedirs error: %s' % path)
            return
    for image in img_list:
        image_filename = os.path.basename(image)
        # resolve relative image URLs against the post URL, then rewrite the
        # page so it references the local copy
        download(image if image.startswith('http') else '/'.join([os.path.dirname(url), image]), os.path.join(path, image_filename))
        content = content.replace(image, '/'.join([filename, image_filename]))
    f = open(path + '.html', 'wb')
    f.write(content.encode('utf8'))
    f.close()

class WorkThread(threading.Thread):
    def __init__(self, path):
        super(WorkThread, self).__init__()
        self.path = path

    def run(self):
        while not complete:
            url = url_alloc()
            if not url:
                time.sleep(0.1)  # avoid busy-waiting while other threads work
                continue
            try:
                save(url, self.path)
            except Exception, e:
                logger.error('save failed: {0}, {1}'.format(url, e))
            url_done(url)  # always called, so the run can terminate

def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'ht:o:', ['help', 'thread=', 'output='])
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(1)
    thread_count = 30         # default work thread count
    output_path = 'webpages'  # default output path
    for opt, value in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit()
        elif opt in ('-t', '--thread'):
            thread_count = int(value)
        elif opt in ('-o', '--output'):
            output_path = value
        else:
            assert False, 'unhandled option'
    if not args:
        usage()
        sys.exit()
    filename = args[0]
    f = open(filename, 'r')
    for line in f.readlines():
        url_queue.append(line.strip())
    f.close()
    tlist = []
    for i in xrange(thread_count):
        t = WorkThread(output_path)
        t.start()
        tlist.append(t)
    for t in tlist:
        t.join()

if __name__ == '__main__':
    main()
1. Crawl all the post links:
python urlspider.py luosiyong
2. Download all the posts:
python downloader.py list.txt
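With the default options, the result is one HTML file per post plus a directory of the same name holding its images; save() builds both names from time + title, so the layout looks roughly like this (names are placeholders):
webpages/
    <time> <title>.html    (the post page, rewritten to reference the local images)
    <time> <title>/
        <image1>
        <image2>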