改进:加入了多线程下载,提高了照片匹配的准确度,好友相册能够完整下载
一、Cookie获得
Chrome 浏览器:Mac 下按 Command + Alt + I,Windows 下按 F12,打开开发者工具进行抓包
二、抓取图片
http://friend.renren.com/groupsdata 从此页面用正则表达式获取全部好友的ID
http://photo.renren.com/photo/' + 好友ID + '/album/relatives/profile 从此页面可获得好友相册的ID
http://photo.renren.com/photo/好友ID/album-相册ID?frommyphoto 从相册页面获取照片ID
三、多线程下载
Download类继承了threading.Thread,重写了run()方法,传入了一个存放照片URL的set(),遍历集合进行下载
在实际抓取照片中,每一个相册将会开启一个线程进行下载
代码需附上你自己的人人Cookie
# coding=utf8
import os
import re
import threading
import urllib2
COOKIE = '你自己人人的Cookie'
HEADERS = {'cookie': COOKIE}
# find title
def find_title(mypage):
myMatch = re.search(r'<title>(.+?)</title>', mypage, re.S)
title = u'undefined'
if myMatch:
title = myMatch.group(1)
else:
print u'find no title'
# 文件名不能包含以下字符: \ / : * ? " < > |
title = title.replace('\\', '').replace('/', '').replace(':', '').replace('*', '').replace('?', '').replace('"',
'').replace(
'>', '').replace('<', '').replace('|', '')
return title
def login_renren(url):
try:
req = urllib2.Request(url, headers=HEADERS)
page = urllib2.urlopen(req).read()
page = page.decode('utf-8')
title = find_title(page)
print title
return page
except:
page = ur''
return page
def find_friendlist():
url_friend = 'http://friend.renren.com/groupsdata' #friend list
req = urllib2.Request(url_friend, headers=HEADERS)
try:
page = urllib2.urlopen(req).read()
page = page.decode('utf-8')
except:
print 'cookie is error'
page = ''
pattern = re.compile(r'"fid":\d*?,')
if pattern.findall(page):
list = pattern.findall(page)
friend_file = open('id.txt', 'w')
for i in list:
id = i[6:-1]
friend_file.write(id)
friend_file.write(os.linesep)
friend_file.close()
else:
print 'find no friendID'
# http://photo.renren.com/photo/XXXXXXXXX/album/relatives/profile
# http://photo.renren.com/photo/XXXXXXXXX/album-535947620?frommyphoto
def find_ablumUrl():
list = ur''
file = open('id.txt')
ablum = open('albumlist.txt', 'w')
while 1:
line = file.readline()
if line:
line = line[:-1]
photo_url = 'http://photo.renren.com/photo/' + str(line) + '/album/relatives/profile'
print photo_url
data = login_renren(photo_url)
pattern = re.compile(r'http://photo.renren.com/photo/(.+?)frommyphoto')
if pattern.findall(data):
list = pattern.findall(data)
else:
print 'find no album id'
#remove duplicate album id
albumid_set = set()
for i in list:
albumid_set.add(i)
for i in albumid_set:
album_list = 'http://photo.renren.com/photo/' + str(i) + 'frommyphoto'
print album_list
ablum.write(album_list)
ablum.write(os.linesep)
else:
break
def download_album():
file = open('albumlist.txt')
while 1:
line = file.readline()
if not line:
break
else:
list = ''
data = login_renren(line)
pattern = re.compile(r'large:.*?\.jpg', re.I) #large xlarge
if pattern.findall(data):
list = pattern.findall(data)
else:
print 'found no image'
photo_url = set()
for i in list:
i = i[7:]
photo_url.add(i)
print i # test
try:
d = Download(photo_url)
print d.name
d.start()
except:
print u'download error ' + line
file.close()
#download by thread
class Download(threading.Thread):
def __init__(self, que):
threading.Thread.__init__(self)
self.que = que
def run(self):
for i in self.que:
data = urllib2.urlopen(i).read()
path = str(i[-15:-5]) + '.jpg'
f = open(path, 'wb') # 存储下载的图片
f.write(data)
f.close()
return
#start
def start_photo_grap():
login_renren(URL)
find_friendlist()
find_ablumUrl()
download_album()
URL = r'http://www.renren.com'
if __name__ == '__main__':
start_photo_grap()
print 'success '