Python 爬虫：爬取京东某类书籍图片并保存到本地
完整代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import urllib.error
import urllib.parse
import urllib.request
# Running total of images saved across all craw() calls.
# NOTE: this name shadows the builtin sum(); it is kept because the script's
# final report reads this module-level name.
sum = 0


def craw(url, page, save_dir='E:/pythonProjects/paChong/jdBook/'):
    """Download every product image found on one listing page.

    Args:
        url: Address of the listing page to fetch.
        page: Page number; used in saved file names and progress messages.
        save_dir: Directory the images are written to (created if missing).

    Returns:
        The number of images from this page that were actually saved. The
        module-level ``sum`` counter grows by the same amount.

    Raises:
        urllib.error.URLError: If the listing page itself cannot be fetched.
    """
    global sum

    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    )
    # Decode the body properly instead of taking str(bytes), which would
    # produce the "b'...'" repr with escaped newlines and non-ASCII bytes.
    with urllib.request.urlopen(req) as resp:
        charset = resp.headers.get_content_charset() or 'utf-8'
        html = resp.read().decode(charset, errors='replace')

    # re.S lets '.' span the line breaks inside the goods-list markup.
    goods = re.search(
        r'<div id="J_goodsList".+?<span class="clr"></span>', html, re.S
    )
    if goods is None:
        # No goods list on this page: nothing to download.
        return 0

    # A real alternation for the extension; the capture cannot run past the
    # closing quote of the src attribute.
    image_urls = re.findall(
        r'<img width="" height="" data-img="1" src="//([^"]+?\.(?:jpg|png))"',
        goods.group(0),
    )
    if not image_urls:
        return 0

    os.makedirs(save_dir, exist_ok=True)
    saved = 0
    for index, image_url in enumerate(image_urls, start=1):
        # The pattern guarantees a 4-character ".jpg"/".png" tail; keep it so
        # a PNG is not stored under a .jpg name.
        target = os.path.join(
            save_dir, '%d-%d%s' % (page, index, image_url[-4:])
        )
        try:
            urllib.request.urlretrieve('http://' + image_url, filename=target)
        except urllib.error.URLError as e:
            # Report the failure and move on; failed images are not counted.
            print('第%d页第%d张图片爬取失败: %s' % (page, index, e))
            continue
        print('成功爬取第%d页第%d张图片' % (page, index))
        saved += 1

    sum += saved
    return saved
# Visit listing pages 1 through 19 in order, then report the running total
# kept in the module-level counter.
for i in range(1, 20):
    url = ''.join(('https://list.jd.com/list.html?cat=1713,3287,3805&page=', str(i)))
    craw(url, i)
print('爬取结束,总共保存了%d张图' % (sum,))