上一篇文章的做法需要调用 curl 命令,比较麻烦,而且带上 cookie 之后仍有一部分照片无法下载,所以这里改用 httplib2 代替 curl、urllib、urllib2 的相关功能,重新写了一份代码。
上一篇文章中的代码不再维护。
前一篇文章在这里:使用Python2.7和火狐浏览器下载QQ空间好友相册
Python2.7代码如下:
# -*- coding: UTF-8 -*-
import os
import re
import json
import datetime
import shutil
import httplib2
# Raw curl commands captured from the browser: the album-list request
# (fcg_list_album_v3) and the photo-list request (cgi_list_photo).
# Paste the two captured commands between the quotes before running.
origin_album = ''
origin_photo = ''


def _extract(pattern, text, what):
    """Return the named group ``url`` of *pattern* found in *text*.

    Raises ValueError with a readable message instead of failing with an
    AttributeError on ``None`` when the curl command is missing or has an
    unexpected shape.
    """
    match = re.search(pattern, text)
    if match is None:
        raise ValueError('cannot find %s; paste the captured curl command '
                         'into origin_album / origin_photo first' % what)
    return match.group('url')


# The request URL is the first double-quoted argument after "curl".
origin_album_url = _extract(r'curl\s\"(?P<url>.*?)\"', origin_album,
                            'the album list URL')
origin_photo_url = _extract(r'curl\s\"(?P<url>.*?)\"', origin_photo,
                            'the photo list URL')
# The login cookie is taken from the "Cookie: ..." header of the photo request.
cookie = _extract(r'\"Cookie:\s(?P<url>.*?)\"', origin_photo,
                  'the Cookie header')
print(origin_album_url)
print(origin_photo_url)
print(cookie)

# Read the target QQ numbers, one per line.  strip() removes the line ending
# without cutting the last digit off a final line that has no trailing
# newline; blank lines are dropped here.
with open('qqlist.txt', 'r') as fp:
    qqlist = [line.strip() for line in fp]
qqlist = [qq for qq in qqlist if qq]

# One shared HTTP client; every request carries the captured cookie.
h = httplib2.Http()
headers = {'Cookie': cookie}
# Download loop: for every QQ number in qqlist, fetch the album list, then
# the photo list of each accessible album, save every photo under
# photos/<qq>/<album>/ and write a small log.txt with counts and elapsed time.

# Characters stripped from album and photo names before they are used as
# file-system names.
NAME_FILTER = re.compile(r'\\|/|:|\*|\?|<|>|\||\.|\n|\t|\"')
# Maps the integer photo['phototype'] field to a file extension; unknown
# values fall back to '.jpg'.
PHOTO_EXTENSIONS = {'1': '.jpg', '2': '.gif', '3': '.png',
                    '5': '.jpg', '10': '.jpg'}


def _query_value(url, key):
    """Return the value of query parameter *key* in *url*.

    Raises ValueError when the parameter is missing or empty, because an
    empty value would make the later str.replace() corrupt the URL.
    """
    match = re.search(r'[?&]' + re.escape(key) + r'=([^&]*)', url)
    if match is None or not match.group(1):
        raise ValueError('parameter %r not found in %r' % (key, url))
    return match.group(1)


def _parse_jsonp(content):
    """Decode a response of the form ``callback(<json>)...`` into a dict.

    Takes everything between the first '(' and the last ')', so it does not
    depend on what follows the closing parenthesis.
    """
    start = content.find('(')
    end = content.rfind(')')
    if start == -1 or end <= start:
        raise ValueError('response is not a callback(...) wrapper')
    return json.loads(content[start + 1:end])


def _safe_print(text):
    """Print *text*, falling back to its repr if the console cannot encode it."""
    try:
        print(text)
    except UnicodeEncodeError:
        print(repr(text))


def _unique_name(name, used):
    """Return *name*, suffixed with ``_N`` when empty or already in *used*.

    *used* holds lower-cased names so that names differing only in case do
    not collide on a case-insensitive file system.  The chosen name is
    recorded in *used*.
    """
    candidate = name
    suffix = 0
    while not candidate or candidate.lower() in used:
        candidate = name + '_' + str(suffix)
        suffix += 1
    used.add(candidate.lower())
    return candidate


def _collect_albums(data):
    """Return a flat list of album dicts from the album-list ``data`` object.

    Handles both layouts seen in the response: a flat 'albumListModeSort'
    list, or 'albumListModeClass' where albums are grouped per class.
    A missing / null list yields an empty result.
    """
    if 'albumListModeSort' in data:  # albums are not grouped into classes
        print('type1')
        albums = data['albumListModeSort']
        if not albums or albums == 'null':
            print('noalbum')
            return []
    else:  # albums are grouped into classes
        print('type2')
        albums = []
        for album_class in data.get('albumListModeClass') or []:
            albums.extend(album_class.get('albumList') or [])
    if albums:
        _safe_print(u'第一个相册名称:' + albums[0]['name'])
    return albums


def _write_log(target_dir, log):
    """Write the per-QQ statistics dict to <target_dir>/log.txt."""
    with open(os.path.join(target_dir, 'log.txt'), 'w') as log_file:
        log_file.write(str(log))


# Values of the captured requests that get swapped for each target / album.
album_host_uin = _query_value(origin_album_url, 'hostUin')
photo_host_uin = _query_value(origin_photo_url, 'hostUin')
photo_topic_id = _query_value(origin_photo_url, 'topicId')

for target in qqlist:
    target = target.strip()
    if not target:  # an empty entry would otherwise wipe the whole photos/ tree
        continue
    log = {
        'qq': target,
        'access': 1,                        # whether the space is accessible
        'time': datetime.datetime.now(),    # replaced by elapsed time at the end
        'album_count': 0,
        'photo_count': 0,
    }
    print('当前QQ:' + target)

    # Start from an empty directory for this QQ number.
    target_dir = os.path.join('photos', target)
    if os.path.isdir(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir)

    # Album list request: swap in the visited QQ number and ask for more
    # albums per page.
    url = origin_album_url.replace(album_host_uin, target)
    url = url.replace('&pageNumModeSort=40', '&pageNumModeSort=100')
    resp, content = h.request(url, 'GET', headers=headers)
    output = _parse_jsonp(content)
    if output['code'] == -3000:  # not logged in / login expired: stop entirely
        _safe_print(output['message'])
        break
    if output['code'] == -4009:  # access to this space is denied
        log['access'] = 0
        _write_log(target_dir, log)
        continue

    used_album_names = set()
    for album in _collect_albums(output['data']):
        if not album:
            continue
        log['album_count'] += 1
        _safe_print(u'当前相册:' + str(album['classid']) + album['name'])
        if album['allowAccess'] == 0:  # password protected or forbidden
            continue

        # Photo list request: album['id'] is the topicId of the photo list.
        url = origin_photo_url.replace(photo_host_uin, target)
        url = url.replace(photo_topic_id, album['id'])
        url = url.replace('&pageNum=30', '&pageNum=600')
        resp, content = h.request(url, 'GET', headers=headers)
        photo_output = _parse_jsonp(content)
        if photo_output['code'] == -4404:
            continue

        # Albums may share a name, so the class id is prefixed and
        # duplicates get a numeric suffix.
        clean_name = NAME_FILTER.sub('', album['name']).replace(' ', '')
        albumname = _unique_name(str(album['classid']) + clean_name,
                                 used_album_names)
        album_dir = os.path.join(target_dir, albumname)
        os.makedirs(album_dir)

        # An empty album has photoList = None and totalInAlbum = 0.
        data = photo_output.get('data') or {}
        photos = data.get('photoList') or []
        if not photos or data.get('totalInAlbum') == 0:
            continue

        used_photo_names = set()
        for photo in photos:
            log['photo_count'] += 1
            _safe_print(u'当前图片:' + photo['name'])
            extension = PHOTO_EXTENSIONS.get(str(photo.get('phototype')),
                                             '.jpg')
            photoname = NAME_FILTER.sub('', photo['name'].replace(' ', ''))
            photoname = _unique_name(photoname, used_photo_names)
            path = os.path.join(album_dir, photoname + extension)
            try:
                resp, content = h.request(photo['url'], 'GET',
                                          headers=headers)
                with open(path, 'wb') as image_file:
                    image_file.write(content)
            except (httplib2.HttpLib2Error, IOError, OSError):
                # One failed photo must not abort the whole run.
                print('保存图片出错')

    # Record counts and the elapsed time for this QQ number.
    elapsed = datetime.datetime.now() - log['time']
    log['time'] = str(int(elapsed.total_seconds())) + 's'
    _write_log(target_dir, log)
    print('当前QQ:' + target + '下载完毕')