#*-* coding:utf-8 *-*
__author__ = 'YS'
import json
import os
import re
import time
import urllib
import urllib2

# Scrapes photos of Taobao "Tao Girl" models. Listing page:
# https://mm.taobao.com/search_tstar_model.htm?spm=5679.126488.640745.2.22495f9f1lYEAb
class MMSpider:
    """Scraper for photos of Taobao "Tao Girl" models."""

    def __init__(self, timeout=3, albumLimit=200, picLimit=500,
                 sleepPicCount=100, savePath='pythonspider/'):
        """Store the crawl settings and URL templates, then create savePath.

        Args:
            timeout: Fetch timeout, handed to ``urllib2.urlopen``.
            albumLimit: Limit on the number of albums to crawl.
            picLimit: Maximum number of photos to fetch per model.
            sleepPicCount: Number of fetched pictures after which the
                crawler rests for one second.
            savePath: Directory under which fetched files are stored.
        """
        # The header value must be the bare UA string; the original value
        # carried a stray literal "User-Agent:" prefix inside it.
        self.__headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/60.0.3112.113 Safari/537.36'
        }
        # Fetch timeout setting.
        self.timeout = timeout
        # Limit on the number of albums to crawl.
        self.albumLimit = albumLimit
        # Endpoint returning the list of models.
        self.__mmListUrl = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
        # Endpoint returning a model's album list (":userId" and ":page"
        # are placeholders substituted by the caller).
        # NOTE(review): the "%20" after "user_id" is kept exactly as found;
        # confirm the endpoint really expects it.
        self.__albumListUrl = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20=:userId&page=:page'
        # Endpoint returning the photos of one album.
        self.__albumDetailUrl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=:userId&album_id=:albumId&page=:page'
        # Model detail page.
        self.__personUrl = 'https://mm.taobao.com/self/aiShow.htm?userId=:userId'
        # Directory where fetched files are stored.
        self.savePath = savePath
        # Maximum number of photos to fetch per model.
        self.picLimit = picLimit
        # Rest for one second each time this many pictures have been fetched.
        self.sleepPicCount = sleepPicCount
        self.__mkdir(self.savePath)

# Fetch page contents. Python exception hierarchy:
# https://docs.python.org/3/library/exceptions.html#exception-hierarchy
def __getContents(self, url, data=None, encoding=None, isjson=None):
    """Fetch url and return its body, or None when the fetch fails.

    Args:
        url: Address to request.
        data: Optional request body handed to ``urllib2.Request``.
        encoding: If given, the response is decoded from this encoding
            and re-encoded as UTF-8; otherwise the raw bytes are used.
        isjson: If truthy, the body is parsed with ``json.loads`` and the
            parsed object is returned instead of the text.

    Returns:
        The body (or parsed JSON), or None after printing an error.
    """
    # NOTE(review): this is a method of MMSpider -- it relies on the class's
    # private-name mangling for self.__headers -- so it must sit indented
    # inside the class body once the file's indentation is restored.
    try:
        request = urllib2.Request(url, data, self.__headers)
        response = urllib2.urlopen(request, timeout=self.timeout)
        try:
            raw = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
        if encoding:
            contents = raw.decode(encoding).encode('utf-8')
        else:
            contents = raw
        # The body is UTF-8 (or raw) here; json.loads needs no encoding hint.
        return json.loads(contents) if isjson else contents
    except urllib2.URLError as e:
        # e.reason may be an exception object (e.g. a socket error), so it
        # has to be stringified before concatenation.
        print('出错了' + str(e.reason))
        return None
    except Exception as e:
        # Exception, not BaseException: Ctrl-C and SystemExit must still be
        # able to stop the crawl.
        print('其他错误')
        print(e.args)
        return None

# Fetch the list of models.
def __getMMList(self, pageIndex):
    """Return the model entries of listing page pageIndex, or None.

    Posts ``currentPage``/``pageSize`` (50 per page) to the model-list
    endpoint, reading the response as GBK-encoded JSON.

    Returns:
        ``payload['data']['searchDOList']`` when the fetch succeeded and
        ``payload['status']`` equals 1; otherwise None.
    """
    # NOTE(review): this is a method of MMSpider (uses the class-private
    # self.__mmListUrl / self.__getContents); keep it indented inside the
    # class body once the file's indentation is restored.
    url = self.__mmListUrl
    data = urllib.urlencode({'currentPage': pageIndex, 'pageSize': 50})
    # Named "payload" rather than "list" so the builtin is not shadowed.
    payload = self.__getContents(url, data, encoding='gbk', isjson=True)
    if payload is None or payload['status'] != 1:
        return None
    return payload['data']['searchDOList']

# Fetch the album list.
def __getAlbumList(self, mm):
albumList=[]
baseUrl= self.__albumListUrl.replace(':userId',str(mm['userId']))
indexUrl= baseUrl.replace(':page','1')
pageCount= int(self.__getAlbumListPage(indexUrl))
pageCount= pageCount if pageCount<=self.albumLimit elseself.albumLimitfor i in range(1, pageCount+1):
listUrl= baseUrl.replace(':page', str(i))
contents= self.__getContents(listUrl)if (contents isNone):continuepattern= re.compile('
def __getPicPage(self, indexUrl):
totalPage= int(albuminfo['totalPage'])returntotalPage#下载保存单个相册的照片,album表示相册id
pics= self.__getPicList(album, mm)if pics isNone:returnindex= 1
returnsaveDir= self.savePath + mm['realName'].encode('utf-8') + '/img'self.__mkdir(saveDir)
fileName= saveDir + '/'+str(index)+'.jpg'self.__saveImg(pic, fileName)
def __getAlbumListPage(self, url):
contents= self.__getContents(url)ifcontents:
fileName= saveDir + '/info.txt'personUrl= self.__personUrl.replace(':userId', str(mm['userId']))
self.__saveTxtFile(contents, fileName)##保存MM的头像到本地img文件夹
fileName= saveDir + '/avatar.jpg'imgUrl= 'http:'+mm['avatarUrl']+'_240x240xz.jpg' #获取小图
self.__saveImg(imgUrl, fileName)#写入文本文件
def __saveTxtFile(self, contents, fileName):
def __saveImg(self, imgUrl, fileName):
contents= self.__getContents(imgUrl)ifcontents:
handler.close()else:print '获取图片失败,图片地址:'+imgUrl.encode('utf-8')#创建存放图片或者文本文件的文件夹
def __mkdir(self, saveDir):
    """Create the directory that holds pictures or text files.

    Args:
        saveDir: Directory path to create (intermediate directories are
            created as well).

    Returns:
        True if the directory was created by this call, False if the
        path already existed.

    Raises:
        OSError: If creation fails for a reason other than the directory
            already being there.
    """
    # NOTE(review): this is a method of MMSpider (called as self.__mkdir);
    # keep it indented inside the class body once the file's indentation
    # is restored.
    if os.path.exists(saveDir):
        return False
    try:
        os.makedirs(saveDir)
    except OSError:
        # Something else may have created the directory between the check
        # and makedirs; treat that the same as "already existed".
        if os.path.isdir(saveDir):
            return False
        raise
    return True

# Main entry point.
defstart(self, startPage, endPage):
pages= range(startPage, endPage+1)for i inpages:
mmlist= self.__getMMList(i)if notmmlist:print "第%s页无数据\n"%(str(i))break
albumList= self.__getAlbumList(mm)for album inalbumList:
self.__savePics(album, mm)if __name__ == '__main__':