Python album spider: a 淘女郎 (Taobao model) album crawler, written in Python

# -*- coding: utf-8 -*-

__author__ = 'YS'

import urllib2
import urllib
import re
import json
import os
import time

# Crawl photos of Taobao models (淘女郎); listing page:
# https://mm.taobao.com/search_tstar_model.htm?spm=5679.126488.640745.2.22495f9f1lYEAb

class MMSpider:

    def __init__(self, timeout=3, albumLimit=200, picLimit=500, sleepPicCount=100, savePath='pythonspider/'):

        self.__headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
        # request timeout in seconds
        self.timeout = timeout
        # cap on the number of album-list pages crawled per model
        self.albumLimit = albumLimit
        # endpoint returning the model (MM) list
        self.__mmListUrl = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
        # endpoint returning a model's album list
        self.__albumListUrl = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20=:userId&page=:page'
        # endpoint returning the photos of a single album
        self.__albumDetailUrl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=:userId&album_id=:albumId&page=:page'
        # a model's profile page
        self.__personUrl = 'https://mm.taobao.com/self/aiShow.htm?userId=:userId'
        # local directory for the downloaded files
        self.savePath = savePath
        # maximum number of photos to save per model
        self.picLimit = picLimit
        # sleep for one second after every sleepPicCount photos
        self.sleepPicCount = sleepPicCount
        self.__mkdir(self.savePath)

    # Fetch a URL and return its body; Python's exception hierarchy:
    # https://docs.python.org/3/library/exceptions.html#exception-hierarchy
    def __getContents(self, url, data=None, encoding=None, isjson=None):
        try:
            request = urllib2.Request(url, data, self.__headers)
            response = urllib2.urlopen(request, timeout=self.timeout)
            if encoding:
                contents = response.read().decode(encoding).encode('utf-8')
            else:
                contents = response.read()
            return json.loads(contents, encoding='utf-8') if isjson else contents
        except urllib2.URLError, e:
            print 'Request failed: ' + str(e.reason)
            return None
        except BaseException, e:
            print 'Unexpected error:'
            print e.args
            return None

    # fetch one page of the model (MM) list
    def __getMMList(self, pageIndex):
        url = self.__mmListUrl
        data = urllib.urlencode({'currentPage': pageIndex, 'pageSize': 50})
        mmList = self.__getContents(url, data, encoding='gbk', isjson=True)
        if mmList is None:
            return None
        elif mmList['status'] != 1:
            return None
        return mmList['data']['searchDOList']

    # collect the album ids of one model
    def __getAlbumList(self, mm):
        albumList = []
        baseUrl = self.__albumListUrl.replace(':userId', str(mm['userId']))
        indexUrl = baseUrl.replace(':page', '1')
        pageCount = int(self.__getAlbumListPage(indexUrl))
        pageCount = pageCount if pageCount <= self.albumLimit else self.albumLimit
        for i in range(1, pageCount + 1):
            listUrl = baseUrl.replace(':page', str(i))
            contents = self.__getContents(listUrl)
            if contents is None:
                continue
            # The regex was truncated in the source text; this pattern, which
            # pulls album_id values out of the album-list HTML, is an assumption.
            pattern = re.compile(r'album_id=(\d+)', re.S)
            albumList.extend(re.findall(pattern, contents))
        return albumList

    # get the total number of photo pages in an album
    def __getPicPage(self, indexUrl):
        albuminfo = self.__getContents(indexUrl, encoding='gbk', isjson=True)
        if albuminfo is None:
            print 'Failed to fetch album photos (request error), url: ' + indexUrl
            return None
        if albuminfo['isError'] != '0':
            print 'Failed to fetch album photos (API error), url: ' + indexUrl
            return None
        totalPage = int(albuminfo['totalPage'])
        return totalPage
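
    # __getPicList is called by __savePics below but is missing from the
    # original listing (lost in extraction). The reconstruction here is a
    # minimal sketch: it assumes the JSON shape implied by __albumDetailUrl
    # and __getPicPage (a picList array whose entries carry a protocol-relative
    # picUrl field), not the author's original code.
    def __getPicList(self, album, mm):
        baseUrl = self.__albumDetailUrl.replace(':userId', str(mm['userId'])).replace(':albumId', str(album))
        pageCount = self.__getPicPage(baseUrl.replace(':page', '1'))
        if pageCount is None:
            return None
        pics = []
        for i in range(1, pageCount + 1):
            albuminfo = self.__getContents(baseUrl.replace(':page', str(i)), encoding='gbk', isjson=True)
            if albuminfo is None or albuminfo['isError'] != '0':
                continue
            for pic in albuminfo['picList']:
                pics.append('http:' + pic['picUrl'])
        return pics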

    # download and save the photos of a single album; album is the album id
    def __savePics(self, album, mm):
        print 'Saving an album of ' + mm['realName'].encode('utf-8') + ', album id: ' + album.encode('utf-8')
        pics = self.__getPicList(album, mm)
        if pics is None:
            return
        index = 1
        for pic in pics:
            print 'Saving a photo of ' + mm['realName'].encode('utf-8') + ', photo url: ' + pic.encode('utf-8')
            if index % self.sleepPicCount == 0:
                print 'Sleeping for one second'
                time.sleep(1)
            if index >= self.picLimit:
                print mm['realName'].encode('utf-8') + ': already saved ' + str(self.picLimit) + ' photos'
                return
            saveDir = self.savePath + mm['realName'].encode('utf-8') + '/img'
            self.__mkdir(saveDir)
            fileName = saveDir + '/' + str(index) + '.jpg'
            self.__saveImg(pic, fileName)
            index += 1

    # get the total number of pages in a model's album list
    def __getAlbumListPage(self, url):
        contents = self.__getContents(url)
        if contents:
            pattern = re.compile('id="J_Totalpage" value="(.*?)"', re.S)
            return re.search(pattern, contents).group(1)
        else:
            return None

    # save a model's basic profile into the local text folder
    def __saveMM(self, mm):
        print 'Saving the profile of ' + mm['realName'].encode('utf-8')
        saveDir = self.savePath + mm['realName'] + '/text'
        self.__mkdir(saveDir)
        fileName = saveDir + '/info.txt'
        personUrl = self.__personUrl.replace(':userId', str(mm['userId']))
        contents = 'Name: %s\nCity: %s\nWeight: %s\nHeight: %s\nLikes: %s\nHomepage: %s\n' % (
            mm['realName'].encode('utf-8'), mm['city'].encode('utf-8'),
            str(mm['weight']).encode('utf-8'), str(mm['height']).encode('utf-8'),
            str(mm['totalFavorNum']).encode('utf-8'), personUrl.encode('utf-8'))
        self.__saveTxtFile(contents, fileName)

    # save a model's avatar into the local img folder
    def __saveMMAvatar(self, mm):
        print 'Saving the avatar of ' + mm['realName'].encode('utf-8')
        saveDir = self.savePath + mm['realName'] + '/img'
        self.__mkdir(saveDir)
        fileName = saveDir + '/avatar.jpg'
        imgUrl = 'http:' + mm['avatarUrl'] + '_240x240xz.jpg'  # fetch the small 240x240 thumbnail
        self.__saveImg(imgUrl, fileName)

    # write a text file
    def __saveTxtFile(self, contents, fileName):
        handler = open(fileName, 'w')
        handler.write(contents)
        handler.close()

    # write an image to disk
    def __saveImg(self, imgUrl, fileName):
        contents = self.__getContents(imgUrl)
        if contents:
            handler = open(fileName, 'wb')
            handler.write(contents)
            handler.close()
        else:
            print 'Failed to fetch image, url: ' + imgUrl.encode('utf-8')

    # create the directory that holds images or text files
    def __mkdir(self, saveDir):
        if os.path.exists(saveDir):
            return False
        else:
            os.makedirs(saveDir)
            return True

    # main entry point: crawl model-list pages startPage..endPage
    def start(self, startPage, endPage):
        pages = range(startPage, endPage + 1)
        for i in pages:
            mmlist = self.__getMMList(i)
            if not mmlist:
                print 'Page %s has no data\n' % (str(i))
                break
            for mm in mmlist:
                self.__saveMM(mm)
                self.__saveMMAvatar(mm)
                albumList = self.__getAlbumList(mm)
                for album in albumList:
                    self.__savePics(album, mm)

if __name__ == '__main__':
    mmspider = MMSpider()
    mmspider.start(2, 3)
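
Every knob in __init__ can be overridden at construction time. For example, a gentler run that keeps at most 100 photos per model, sleeps after every 50, and crawls only the first page of the model list:

if __name__ == '__main__':
    mmspider = MMSpider(timeout=5, picLimit=100, sleepPicCount=50, savePath='pythonspider/')
    mmspider.start(1, 1)  # first model-list page only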

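The listing is Python 2 throughout (urllib2, print statements, the `except X, e` syntax). For readers on Python 3, a minimal sketch of the fetch helper using only the standard library might look like the following; it mirrors __getContents but is an assumed port, not part of the original post:

# -*- coding: utf-8 -*-
import json
import urllib.error
import urllib.request

def get_contents(url, data=None, headers=None, encoding=None, isjson=False, timeout=3):
    # hypothetical Python 3 counterpart of MMSpider.__getContents
    if isinstance(data, str):
        data = data.encode('utf-8')  # urlopen expects POST data as bytes
    request = urllib.request.Request(url, data, headers or {})
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            contents = response.read()
    except urllib.error.URLError as e:
        print('Request failed:', e.reason)
        return None
    if encoding:
        contents = contents.decode(encoding)  # e.g. 'gbk' for the Taobao endpoints
    return json.loads(contents) if isjson else contents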