When Baidu Space was revamped in 2012, I wrote a download tool:
http://blog.csdn.net/luosiyong/article/details/7713664
Recently Baidu Space announced it is being migrated into Baidu Cloud.
From the first post, written at 2007-06-01 22:01, to the last, written at 2015-03-26 20:06, there are 650 posts in total.
The posts are mostly study notes and records of my own experiences. Early on, Baidu offered search within your own space, which was very handy, and other users could find many of the posts straight from Baidu's front-page search.
After the 2012 redesign I no longer cared much for Baidu Space, and I wrote far fewer posts.
On 2013-06-07 the visit count passed seven digits, which I noted at the time.
Later on, posts found through Baidu search could no longer be opened from the results page, and traffic has fallen off a cliff ever since.
Baidu Space shuts down tomorrow, so it is better to download the posts and keep an offline copy. Compared with the code from almost three years ago, I changed a few things:
Crawling all the post links:
1. Switched to multiple threads to speed up link collection (both scripts share the work-queue pattern sketched after this list).
2. Added command-line options to make it easy to set parameters and to download other users' posts.
Downloading and saving all posts and images:
1. Switched to multiple threads to speed things up.
2. Changed information extraction from regex matching to BeautifulSoup; it extracts the post title, the publish time, and the list of images to download, then saves each post locally as time + title.
3. Added command-line arguments.
4. Added log output.
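Both scripts use the same hand-rolled work-queue pattern, so it is worth sketching once with the Baidu-specific parts stripped out. The subtle bit is the termination condition: a thread may only declare the run complete when the queue is empty and no thread is still mid-job, because a running thread can still enqueue new work. A minimal, self-contained sketch, where process() is just a placeholder for the real fetch/save work:
# -*- coding: utf-8 -*-
# Minimal sketch of the work-queue pattern shared by both scripts.
# process() is a placeholder for the real fetch/save work.
import time
import threading

queue = ['job1', 'job2', 'job3']  # placeholder jobs
running = 0
complete = False
lock = threading.Lock()

def process(job):
    print 'processing', job
    return []  # the spider would return newly discovered jobs here

def alloc():
    # take one job off the queue; None means "nothing to do right now"
    global running
    with lock:
        if queue:
            running += 1
            return queue.pop(0)
        return None

def done(new_jobs):
    global running, complete
    with lock:
        queue.extend(new_jobs)
        running -= 1
        # finish only when nothing is queued AND no thread is mid-job,
        # because a running thread may still enqueue more work
        if running == 0 and not queue:
            complete = True

class Worker(threading.Thread):
    def run(self):
        while not complete:
            job = alloc()
            if job is None:
                time.sleep(0.01)  # avoid spinning while others work
                continue
            done(process(job))

threads = [Worker() for _ in xrange(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()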
urlspider.py
# -*- coding: utf-8 -*-
import re
import os
import sys
import time
import getopt
import urllib
import logging
import threading

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

prefix = 'http://hi.baidu.com/'
pattern = None
usage_format = '''
python %s [options] <username>
options:
-h [--help]    produce help message
-t [--thread]  work thread count
-o [--output]  output file name
'''

url_queue = []      # URLs waiting to be crawled
url_set = set()     # every URL ever seen, used for deduplication
running_count = 0   # number of threads currently fetching a page
visited_count = 0   # number of post pages visited so far
complete = False    # True once the queue is empty and no thread is working
lock = threading.Lock()

def usage():
    print usage_format % (os.path.basename(sys.argv[0]))

def running_log():
    logging.debug('running thread count: {0}, visited count: {1}'.format(running_count, visited_count))

def url_alloc():
    # take one URL off the queue; returns None if the queue is empty
    global running_count
    lock.acquire()
    url = None
    if url_queue:
        url = url_queue.pop(0)
        running_count += 1
        running_log()
    lock.release()
    return url

def url_done(url, url_list):
    # record a finished fetch and enqueue any newly discovered links
    global running_count
    global visited_count
    lock.acquire()
    if pattern.findall(url):
        visited_count += 1
        running_log()
    for u in url_list:
        if u not in url_set:
            url_set.add(u)
            url_queue.append(u)
    running_count -= 1
    running_log()
    if running_count <= 0 and not url_queue:
        global complete
        complete = True
    lock.release()

class WorkThread(threading.Thread):
    def run(self):
        while not complete:
            url = url_alloc()
            if not url:
                time.sleep(0.1)  # avoid busy-waiting while other threads work
                continue
            try:
                f = urllib.urlopen(url)
                code, content = f.code, f.read()
                logging.debug('url: {0}, response: {1}'.format(url, code))
                url_done(url, [prefix + x for x in pattern.findall(content)])
            except Exception, e:
                logging.error('fetch failed: {0}, {1}'.format(url, e))
                url_done(url, [])  # still release the slot so the crawl can finish

def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'ht:o:', ['help', 'thread=', 'output='])
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(1)
    thread_count = 30        # default work thread count
    output_file = 'list.txt' # default output file name
    for opt, value in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit()
        elif opt in ('-t', '--thread'):
            thread_count = int(value)
        elif opt in ('-o', '--output'):
            output_file = value
        else:
            assert False, 'unhandled option'
    if not args:
        usage()
        sys.exit()
    username = args[0]
    global pattern
    pattern = re.compile(r'%s/item/\w+' % username)
    url_queue.append(prefix + username)
    tlist = []
    for i in xrange(thread_count):
        t = WorkThread()
        t.start()
        tlist.append(t)
    for t in tlist:
        t.join()
    f = open(output_file, 'w')
    f.writelines([x + '\n' for x in url_set])
    f.close()

if __name__ == '__main__':
    main()
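After a run, list.txt holds one post URL per line. Given the %s/item/\w+ pattern compiled in main(), the lines look like this (the trailing id is a placeholder, not a real post id):
http://hi.baidu.com/luosiyong/item/<id>
http://hi.baidu.com/luosiyong/item/<id>
...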
downloader.py
# -*- coding: utf-8 -*-
import os
import re
import sys
import time
import urllib
import getopt
import logging
import threading
from bs4 import BeautifulSoup

# log DEBUG and above to the console, but only ERROR and above to run.log
logger = logging.getLogger('downloader')
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler('run.log')
fh.setLevel(logging.ERROR)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt='%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
logger.addHandler(fh)
logger.addHandler(ch)

# characters that are not allowed in file names on common platforms
ILLEGAL_PATH_CHARS_PATTERN = r'[\\/:*?\"<>|&;]'

usage_format = '''
python %s [options] <filename>
options:
-h [--help]    produce help message
-t [--thread]  work thread count
-o [--output]  output path
'''

url_queue = []      # URLs waiting to be downloaded
running_count = 0   # number of threads currently saving a post
visited_count = 0   # number of posts processed so far
complete = False    # True once the queue is empty and no thread is working
lock = threading.Lock()

def usage():
    print usage_format % (os.path.basename(sys.argv[0]))

def running_log():
    logger.debug('running thread count: {0}, visited count: {1}'.format(running_count, visited_count))

def url_alloc():
    # take one URL off the queue; returns None if the queue is empty
    global running_count
    lock.acquire()
    url = None
    if url_queue:
        url = url_queue.pop(0)
        running_count += 1
        running_log()
    lock.release()
    return url

def url_done(url):
    # record a finished download and check whether all work is done
    global running_count
    global visited_count
    lock.acquire()
    visited_count += 1
    running_count -= 1
    running_log()
    if running_count <= 0 and not url_queue:
        global complete
        complete = True
    lock.release()

def download(url, filename):
    logger.debug('download: {0}'.format(url))
    try:
        urllib.urlretrieve(url, filename)
    except Exception, e:
        logger.error('download failed %s' % url)
        logger.error(str(e))

def save(url, path):
    logger.debug('save: {0}'.format(url))
    f = urllib.urlopen(url)
    code, content = f.code, f.read()
    content = content.decode('utf8')
    soup = BeautifulSoup(content, 'html.parser')
    logger.debug('title: %s' % soup.title.string)
    # extract the post title and publish time from the page
    content_title = ''
    content_time = ''
    res = soup.find(attrs={'class': 'title content-title'})
    if res:
        content_title = res.get_text().strip()
    res = soup.find(attrs={'class': 'content-other-info'})
    if res:
        content_time = res.span.get_text()
    logger.debug('content time: %s, content title: %s' % (content_time, content_title))
    # collect the images referenced by the post; skip <img> tags without src
    img_list = [img.get('src') for img in soup.find_all('img') if img.get('src')]
    # the post is saved as "<time> <title>.html" with its images in a
    # directory of the same name
    filename = '%s %s' % (content_time, content_title)
    filename = re.sub(ILLEGAL_PATH_CHARS_PATTERN, '', filename)
    filename = filename.strip()
    path = os.path.join(path, filename)
    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except:
            logger.error('makedirs error: %s' % path)
            return
    for image in img_list:
        image_filename = os.path.basename(image)
        # resolve relative image URLs against the post URL, then rewrite the
        # page so it references the local copy
        download(image if image.startswith('http') else '/'.join([os.path.dirname(url), image]), os.path.join(path, image_filename))
        content = content.replace(image, '/'.join([filename, image_filename]))
    f = open(path + '.html', 'wb')
    f.write(content.encode('utf8'))
    f.close()

class WorkThread(threading.Thread):
    def __init__(self, path):
        super(WorkThread, self).__init__()
        self.path = path

    def run(self):
        while not complete:
            url = url_alloc()
            if not url:
                time.sleep(0.1)  # avoid busy-waiting while other threads work
                continue
            try:
                save(url, self.path)
            except Exception, e:
                logger.error('save failed: {0}, {1}'.format(url, e))
            url_done(url)  # always called, so the run can terminate

def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'ht:o:', ['help', 'thread=', 'output='])
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(1)
    thread_count = 30         # default work thread count
    output_path = 'webpages'  # default output path
    for opt, value in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit()
        elif opt in ('-t', '--thread'):
            thread_count = int(value)
        elif opt in ('-o', '--output'):
            output_path = value
        else:
            assert False, 'unhandled option'
    if not args:
        usage()
        sys.exit()
    filename = args[0]
    f = open(filename, 'r')
    for line in f.readlines():
        url_queue.append(line.strip())
    f.close()
    tlist = []
    for i in xrange(thread_count):
        t = WorkThread(output_path)
        t.start()
        tlist.append(t)
    for t in tlist:
        t.join()

if __name__ == '__main__':
    main()
1. Crawl all the post links:
python urlspider.py luosiyong
2. Download all the posts:
python downloader.py list.txt
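With the default options, the result is one HTML file per post plus a directory of the same name holding its images; save() builds both names from time + title, so the layout looks roughly like this (names are placeholders):
webpages/
    <time> <title>.html    (the post page, rewritten to reference the local images)
    <time> <title>/
        <image1>
        <image2>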