import re
import urllib.request
import os
import numpy as np
def craw(url, page, savedir):
    """Download every product image found on one search-result page.

    The document at ``url`` is fetched and the markup between the
    ``<div id="J_goodsList"`` container and the last
    ``<div class="p-commit">`` is isolated. Every ``source-data-lazy-img``
    URL ending in ``jpg`` inside that region is saved into ``savedir`` as
    ``第<page>页第<n>个.jpg``, where ``n`` is the 1-based position of the
    image on the page.

    Args:
        url: Anything ``urllib.request.urlopen`` accepts (a URL string or a
            ``urllib.request.Request``).
        page: Page label used in the file names and progress messages.
        savedir: Existing directory to write the images into; a trailing
            path separator is optional.

    Returns:
        The number of images that were downloaded successfully. ``0`` is
        returned when the goods container is not present in the page.

    Raises:
        urllib.error.URLError: If the result page itself cannot be fetched.
    """
    # Close the response deterministically instead of leaking the socket.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    # Decode the payload rather than calling str() on bytes, which yields the
    # "b'...'" repr with escaped newlines and escaped non-ASCII bytes.
    # NOTE(review): assumes the page is UTF-8; undecodable bytes are replaced,
    # which cannot affect the ASCII image URLs extracted below.
    html = raw.decode('utf-8', errors='replace')

    # DOTALL lets the greedy ".*" span the real newlines of the decoded text.
    goods = re.search(
        '<div id="J_goodsList".*<div class="p-commit">', html, re.DOTALL
    )
    if goods is None:
        # Blocked, empty or restructured page: nothing to download.
        print('第', page, '页未找到商品列表')
        return 0

    # No DOTALL here: an image URL never spans more than one line.
    image_urls = re.findall('source-data-lazy-img="(//.*?jpg)', goods.group(0))

    saved = 0
    # enumerate keeps the number equal to the image's position on the page
    # even when an earlier download failed.
    for x, image_url in enumerate(image_urls, start=1):
        filename = os.path.join(
            savedir, '第' + str(page) + '页' + '第' + str(x) + '个' + '.jpg'
        )
        try:
            # The page lists protocol-relative URLs ("//host/path.jpg").
            urllib.request.urlretrieve('https:' + image_url, filename=filename)
        except urllib.error.URLError as e:
            # Best effort: report the failure and move on to the next image.
            print('第', page, '页,第', x, '个下载失败:', getattr(e, 'reason', e))
            continue
        saved += 1
        print('已输出第', page, '页,第', x, '个')
    return saved
if __name__ == "__main__":
    # Crawl the JD search results for each keyword and store the product
    # images under ./img/<keyword>/.
    page_ = 45
    key = ['衬衫', '马甲衬衫', '马甲', '女生职业装', '女士西服']
    for name in key:
        os.makedirs('./img/' + name, exist_ok=True)
        savedir = './img/' + name + '/'
        key_temp = urllib.request.quote(name)
        # 2 * page_ requests are issued per keyword and request i is labelled
        # page (i + 1) / 2, i.e. 1.0, 1.5, 2.0, ...
        # NOTE(review): presumably each visible result page is served as two
        # half pages by the site -- confirm.
        for i in range(1, 2 * page_ + 1):
            key2 = (i + 1) / 2
            url2 = ('https://search.jd.com/Search?keyword=' + key_temp
                    + '&enc=utf-8&page=' + str(i))
            # Imitate a browser.
            req = urllib.request.Request(url2)
            req.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
            # No "Accept-Encoding: gzip, deflate, br" header: urllib does not
            # decompress response bodies, so asking for a compressed page
            # would hand craw() bytes it cannot parse.
            req.add_header("Accept-Language", "zh-CN,zh;q=0.9")
            req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
            # Pass the request that actually carries the headers; previously
            # a second, header-less Request was built and sent instead.
            craw(req, key2, savedir)
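# Note: this code was found online and the source URL was not recorded. If you are the original author, please get in touch.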