import os
import re
import urllib.request
import requests
def getHtml(url, headers):
    """Fetch *url* through a hard-coded HTTP proxy and extract image URLs.

    Parameters:
        url: page URL to fetch.
        headers: a single (name, value) header tuple, e.g. ('User-Agent', '...').

    Returns:
        (data, imgUrls): the page HTML decoded as UTF-8 (undecodable bytes
        ignored), and the list of strings captured from every
        "pic_url":"..." occurrence in it.
    """
    # NOTE(review): proxy address is hard-coded and may be dead — confirm.
    proxy = urllib.request.ProxyHandler({'http': '118.190.95.26:9001'})
    # Plain urlopen() has no proxy support, so build a custom opener.
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    # Attach the request headers, then open through THIS opener.  The
    # original called install_opener(), mutating process-wide state as a
    # hidden side effect of a fetch helper; opener.open() avoids that
    # while sending the same request.
    opener.addheaders = [headers]
    data = opener.open(url).read().decode('utf-8', 'ignore')
    # Taobao embeds item thumbnails as "pic_url":"//..." in inline JSON.
    imgUrls = re.findall(r'"pic_url":"(.*?)"', data)
    return data, imgUrls
if __name__ == '__main__':
    keyword = '李冰冰'
    # Percent-encode the non-ASCII keyword for the query string.
    keyword = urllib.request.quote(keyword)
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0')
    # Search URL, built once (the original duplicated it); the trailing
    # s= parameter is the result offset — Taobao shows 44 items per page.
    baseUrl = ('https://s.taobao.com/search?q=' + keyword
               + '&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index'
               + '&spm=a21bo.2017.201856-taobao-item.1&ie=utf8'
               + '&initiative_id=tbindexz_20170306&bcoffset=-3&ntoffset=-3'
               + '&p4ppushleft=1%2C48&s=')
    data = getHtml(baseUrl + '0', headers)[0]
    print(data)
    # Total page count is embedded in the page's inline JSON config.
    pages = re.findall(r'"totalPage":(.*?),', data)
    if not pages:
        # A missing marker usually means Taobao served a login/captcha page.
        raise SystemExit('totalPage not found in response - login/captcha page?')
    # The original read the second occurrence; fall back to the first so a
    # page with a single occurrence no longer raises IndexError.
    totalPage = int(pages[1] if len(pages) > 1 else pages[0])
    print(totalPage)
    path = os.path.join(os.getcwd(), 'img')
    # exist_ok so a re-run does not crash on the already-created directory.
    os.makedirs(path, exist_ok=True)
    for page in range(totalPage):
        pathpage = os.path.join(path, '第' + str(page + 1) + '页')
        os.makedirs(pathpage, exist_ok=True)
        imgUrls = getHtml(baseUrl + str(page * 44), headers)[1]
        # The original iterated range(1, len(imgUrls)) and silently skipped
        # the first image; enumerate keeps it while preserving 1-based names.
        for i, imgUrl in enumerate(imgUrls, start=1):
            # pic_url values are protocol-relative ("//..."); prefix a scheme.
            url = 'http:' + imgUrl
            print(url)
            fileurl = os.path.join(pathpage, str(i) + '.jpg')
            try:
                r = requests.get(url)
                with open(fileurl, 'wb') as f:
                    f.write(r.content)
            except (requests.RequestException, OSError) as e:
                # Skip the failed image but report why, instead of the
                # original bare except that swallowed every error.
                print('download failed:', url, e)
# Taobao image scraper (爬取淘宝图片)
# Article footer from the original source page, published 2024-03-17 16:43:27;
# kept as comments so the file remains valid Python.