1、爬取目标
URL:https://list.jd.com/list.html?cat=9987,653,655
2、代码
import re
import urllib.request
def craw(url, page, save_dir="D:/python/精通Python网路爬虫实例/Cha6/image/"):
    """Download the 220x220 product thumbnails referenced by one listing page.

    The page at ``url`` is fetched, the section between the ``plist``
    container and the pager ``div`` is isolated, and every ``.jpg`` link found
    in a 220x220 ``<img>`` tag inside that section is saved as
    ``<page>_<n>.jpg`` in ``save_dir``, where ``n`` is the 1-based position of
    the image on the page.

    Args:
        url: Address of the listing page to scan.
        page: Page number, used only as the prefix of the saved file names.
        save_dir: Directory that receives the images. It is created when
            missing. Defaults to the location the script originally used.

    Returns:
        None. When the page has no product-list section or no matching
        image tags, nothing is downloaded.

    Raises:
        urllib.error.URLError: If the listing page itself cannot be fetched.
            Failures of individual image downloads are reported and skipped.
    """
    import os

    # Decode the body rather than calling str() on the bytes object: str()
    # yields the "b'...'" repr with escaped newlines and non-ASCII bytes.
    # The context manager also closes the connection once the body is read.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode("utf-8", errors="ignore")

    # re.S lets "." span line breaks now that the text holds real newlines.
    listing = re.search(
        r'<div id="plist".+? <div class="page clearfix">', html, re.S
    )
    if listing is None:
        # Layout changed or an error page was served: nothing to extract.
        return

    image_tags = re.findall(
        r'<img width="220" height="220" .+?//.+?\.jpg', listing.group(0), re.S
    )
    if not image_tags:
        return

    os.makedirs(save_dir, exist_ok=True)
    link_pattern = re.compile(r'//.+?\.jpg', re.S)

    # enumerate() advances the counter exactly once per image, so a failed
    # download no longer shifts the numbering of the images that follow it.
    for index, tag in enumerate(image_tags, start=1):
        # Every tag matched a pattern that ends in "//....jpg", so this
        # search always succeeds. The links are protocol-relative.
        image_url = "http:" + link_pattern.search(tag).group(0)
        target = os.path.join(save_dir, "{}_{}.jpg".format(page, index))
        try:
            urllib.request.urlretrieve(image_url, filename=target)
        except urllib.error.URLError as err:
            # Best effort: report the failure and carry on with the rest.
            print("skipped {}: {}".format(image_url, getattr(err, "reason", err)))
# Crawl listing pages 1 through 50; the page number is appended to the
# category URL and also passed on so saved files are prefixed with it.
for page_no in range(1, 51):
    craw(
        "https://list.jd.com/list.html?cat=9987,653,655&page=" + str(page_no),
        page_no,
    )
3、效果