趁着国庆节有时间,帮人写了个爬虫,用来爬取淘女郎模特动态加载的图片,以及模特们的个人信息数据。这个爬虫花了3天时间:因为图片是异步加载的,所以爬取
的复杂度有点大,最终我通过研究URL的变化,构造新的URL来进行持续性爬取。不过爬取速度真心慢(查看了CPU的利用率,还有很多没有利用到),我准备把多线程加进去,
不过说实话不太好加,有点头大。
# -*- coding: utf-8 -*-
"""Scraper for Taobao "tstar" model listings and their album photos.

The listing endpoint returns JSON describing each model; album pages are
loaded asynchronously by the site, so photo URLs are obtained by building the
JSON album endpoints directly instead of parsing the rendered page.
"""
import requests,time,re
import threadpool
import json,os,redis
import xlwt,xlrd,random
import urllib.error
import urllib.request
from lxml import etree

url = 'https://mm.taobao.com/search_tstar_model.htm?spm=5679.126488.640745.2.38a457c5mEWpPl&style=&place=city%3A%E5%B9%BF%E5%B7%9E'
url1='https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'


class tn(object):
    """Fetch model records, download their album photos and export to Excel."""

    # NOTE(review): the cookie below is a captured login session. It is kept
    # verbatim so behaviour does not change, but credentials like this should
    # be loaded from configuration rather than committed with the source.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'cookie': 'thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; linezing_session=dTr7U8KOxddDd4CJ4VFihDPH_1506516599348AOqB_1; v=0; _tb_token_=5b9375847e363; _m_h5_tk=4efc0ba8d72376fa1968a3f0a92f0eef_1506518851245; _m_h5_tk_enc=e10a67c79b8a47f97dd3134779acfdfe; uc3=sg2=URsQfTD%2BFY9mkKOl%2FNBXqNFPPUNKq8HjGx%2Bair7O99U%3D&nk2=UoCKEw%2B1myb2u1mo&id2=UoCJiFOLhjN6OQ%3D%3D&vt3=F8dBzWk7FANQZ7%2B830Y%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D; existShop=MTUwNjk0NjQwMQ%3D%3D; uss=VFcwj3YmzKmO6xkbkJFH%2FN%2FOd2CPNJzRBWBygIM3IKKXIgbm1DSeGb87; lgc=1132771621aa; tracknick=1132771621aa; cookie2=11aae79de97ae344158e4aa965c7003c; sg=a2d; cookie1=Aihx9FxoyUYIE7uEPgeqstl%2B5uvfGslyiCQ%2FpePYriI%3D; unb=1100473042; skt=11ea4b0360e50e08; t=b63e6968872da200706b694d67c62883; _cc_=UtASsssmfA%3D%3D; tg=0; _l_g_=Ug%3D%3D; _nk_=1132771621aa; cookie17=UoCJiFOLhjN6OQ%3D%3D; cna=1/K/EZz4HDECAXhVTdivCBle; mt=ci=45_1; isg=Anx8i770Cd1_Zz2Ede24lLr7TRqCdCGCcQXP_1b95GdKIR2rfoTZL77ZdX-i; JSESSIONID=F34B74BB5A7A0A1BF96E8B3F2C02DE87; uc1=cookie14=UoTcCfmfxB%2Fd7g%3D%3D&lng=zh_CN&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&existShop=false&cookie21=U%2BGCWk%2F7owY3i1vB1W2BgQ%3D%3D&tag=8&cookie15=UtASsssmOIJ0bQ%3D%3D&pas=0',

    }

    # Directory under which one sub-directory per model is created.
    img_root = 'D:\\SpiderProject\\ZhiHu\\taonvlang\\img'

    def getUrlinfo(self,page,max_retries=5):
        """Return the model records listed on one page of the search endpoint.

        Args:
            page: Page number sent as ``currentPage``.
            max_retries: Maximum number of POST attempts while the server
                answers with a non-200 status.

        Returns:
            A list of ``(realName, height, weight, city, userId)`` tuples.
            The list is empty when the request fails, when no attempt returns
            status 200, or when the response is not the expected JSON.
        """
        datas=[]
        data = {
            'q':'',
            'viewFlag':'A',
            'sortType':'default',
            'searchStyle':'',
            'searchRegion':'city:',
            'searchFansNum':'',
            'currentPage':'%s'%(page),
            'pageSize':'100'
        }
        for _ in range(max_retries):
            time.sleep(1)  # throttle between attempts
            try:
                reqs = requests.post(url1,data=data,headers=self.headers,timeout=5)
            except requests.RequestException as e:
                # Without a response there is nothing to parse.
                print('error:',e)
                return datas
            if reqs.status_code == 200:
                break
            print('field')
        else:
            # Every attempt returned a non-200 status.
            return datas

        try:
            records = json.loads(reqs.text)['data']['searchDOList']
        except (ValueError, KeyError, TypeError) as e:
            print('error:',e)
            return datas

        for i in records:
            datas.append((i['realName'],i['height'],i['weight'],i['city'],i['userId']))
        return datas  # model information records

    def getImages(self,rs):
        """Download the album photos of every model in ``rs``.

        Args:
            rs: Iterable of records as returned by ``getUrlinfo``; index 0 is
                used as the directory name and index 4 as the user id.

        Images are saved as ``<img_root>/<name>/<n>.jpg``. The counter ``n``
        keeps increasing across models. A failure while fetching an album or
        an image is printed and that item is skipped.
        """
        # Match the complete numeric id. The former fixed-width alternation
        # (11|8|9 digits) matched 8 digits first and truncated longer ids.
        album_id = re.compile(r'album_id=\d+')
        a=0
        for record in rs:
            name, user_id = record[0], record[4]
            model_dir = os.path.join(self.img_root, str(name))
            # Also creates missing parents and tolerates an existing directory.
            os.makedirs(model_dir, exist_ok=True)
            imagurl = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20='+str(user_id)

            try:
                html1 = requests.get(imagurl,headers=self.headers,timeout=5)
                urls = etree.HTML(str(html1.text))
            except (requests.RequestException, etree.LxmlError, ValueError) as e:
                print('error:',e)
                continue
            if urls is None:
                # Empty document: no album links to follow.
                continue

            # Album page links of this model.
            imagesurl = urls.xpath('//a[@class="mm-first"]/@href')
            result = album_id.findall(str(imagesurl))

            for im in result:
                pturl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=%s&%s&top_pic_id=0&page=0'%(user_id,im)
                time.sleep(1)
                try:
                    # JSON document listing the photo URLs of the album.
                    html5 = requests.get(pturl,headers=self.headers,timeout=5)
                    print('获取json图片URL')
                    pic = json.loads(str(html5.text))['picList']
                except (requests.RequestException, ValueError, KeyError, TypeError) as e:
                    print('Error:',e)
                    continue

                for ius in pic:
                    a+=1
                    imurl = 'http:'+ str(ius['picUrl'])
                    filename = os.path.join(model_dir,str(a)+'.jpg')
                    print('开始下载图片')
                    try:
                        urllib.request.urlretrieve(str(imurl),filename)
                    except OSError as e:
                        # urllib.error.URLError and HTTPError derive from OSError.
                        print('Error:',e)

    def getInfophone(self):
        """Fetch one hard-coded profile page and return the XPath matches.

        Returns:
            The list produced by the XPath query: matching ``<strong>``
            elements and the text nodes of matching ``<span>`` elements.
        """
        userurl = 'https://mm.taobao.com/self/aiShow.htm?spm=719.7763510.1998643336.1.6WJFuT&userId=277949921'
        html = requests.get(userurl,headers=self.headers,timeout=5)
        html.encoding = 'GBK'
        print(html.encoding)
        selector = etree.HTML(str(html.text))
        phone = selector.xpath('//strong[@style="font-family: simhei;color: #000000;font-size: 24.0px;line-height: 1.5;"]|//span[@style="font-size: 24.0px;"]/text()')
        return phone

    def saveInfo(self,p):
        """Write model records to ``Excel_Workbook.xls``.

        Args:
            p: Iterable of records as returned by ``getUrlinfo``. The first
                four fields (name, height, weight, city) are written, one
                record per row, below a header row.
        """
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('My Worksheet')
        for col, title in enumerate(('姓名', '身高', '体重', '城市')):
            worksheet.write(0, col, label=title)

        # Row 0 is the header, so data starts at row 1. Each cell is written
        # exactly once because xlwt rejects overwriting a cell by default.
        for row, names in enumerate(p, start=1):
            for col, value in enumerate(names[:4]):
                worksheet.write(row, col, label=str(value))
        workbook.save('Excel_Workbook.xls')


if __name__ =="__main__":
    t = tn()
    for ii in range(1):
        rs = t.getUrlinfo(ii)
        #print(rs)
        t.getImages(rs)
        #t.saveInfo(p)
        #t.getInfophone()
下面是运行代码截图