首先打开京东商城-手机专栏https://list.jd.com/list.html?cat=9987,653,655&page=1&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main
然后打开下一页https://list.jd.com/list.html?cat=9987,653,655&page=2&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main
然后再打开第三页https://list.jd.com/list.html?cat=9987,653,655&page=3&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main
对比三个网址发现 手机专栏下的网址是通过page值的变更而变化 可以确定用于爬取的网址为https://list.jd.com/list.html?cat=9987,653,655&page=
打开页面后来分析要提取的图片地址,提取地址后再用urllib.request.urlretrieve()来保存图片到本地
右键---查看页面源代码--
ctrl +f 输入 第一个图片的名称--Apple iPhone 7 (A1660) 128G 黑色 移动联通电信4G手机
定位到第一个图位置 并找到一个具有唯一性特征的标识信息,用于信息提取 <div id="plist"
然后找到最后一个图片的位置 并找下面具有唯一标识的信息 <div class="page clearfix">
所以第一个pat1='<div id="plist".+? <div class="page clearfix">'(即代码中注释掉的父节点匹配pat1)
然后是最关键的图片源代码分析
<img width="220" height="220" data-img="1" data-lazy-img="//img13.360buyimg.com/n7/jfs/t3961/190/2233466155/341332/2e3803d1/58a55d2aN18488958.jpg">
<img width="220" height="220" data-img="1" data-lazy-img="//img10.360buyimg.com/n7/jfs/t19855/305/881807243/378198/d7130fdd/5b0d0fd8N88e7901d.jpg">
<img width="220" height="220" data-img="1" src="//img12.360buyimg.com/n7/jfs/t2611/360/858752078/90212/68466704/5728910cNd55ac232.jpg">
<img width="220" height="220" data-img="1" src="//img10.360buyimg.com/n7/jfs/t18406/198/1607027948/289456/3e86953e/5acdb065N8f39d863.jpg">
根据上面的对比分析得出下面pat
pat2='<img width="220" height="220" data-img="1" src="//(.+?\.jpg)">'
pat3='<img width="220" height="220" data-img="1" data-lazy-img="//(.+?\.jpg)">'
根据上面的分析下面编写完整代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import os
import sys
import urllib.request
def craw(url, page, save_dir="/home/urllib/test/image/"):
    """Download the 220x220 product thumbnails from one JD list page.

    Fetches *url*, extracts every product ``<img>`` tag in both of its
    observed forms — already loaded (``src=``) and lazy-loaded
    (``data-lazy-img=``) — and saves each image to *save_dir* under the
    name ``<page><index>.jpg``.

    Args:
        url: Full URL of the JD list page to scrape.
        page: Page number; used only as the filename prefix.
        save_dir: Destination directory (defaults to the original
            hard-coded path for backward compatibility).
    """
    req = urllib.request.Request(url)
    # JD rejects requests that lack a browser-like User-Agent.
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0")
    html1 = str(urllib.request.urlopen(req).read())
    # Two tag variants observed in the page source; raw strings so that
    # \. matches a literal dot.
    pat2 = r'<img width="220" height="220" data-img="1" src="//(.+?\.jpg)">'
    pat3 = r'<img width="220" height="220" data-img="1" data-lazy-img="//(.+?\.jpg)">'
    # Merge matches from both variants into one list of host/path strings.
    imagelist2 = re.findall(pat2, html1) + re.findall(pat3, html1)
    x = 1
    for imgurl in imagelist2:
        imagename = save_dir + str(page) + str(x) + ".jpg"
        imgurl = "http://" + imgurl
        print(imgurl)
        try:
            urllib.request.urlretrieve(imgurl, filename=imagename)
        except urllib.error.URLError:
            # Best-effort: skip images that fail to download.  The
            # original bumped x up to three extra times here (once per
            # hasattr() branch), leaving gaps in the numbering.
            pass
        # Exactly one increment per image, success or failure.
        x += 1
# Crawl the first two pages of the JD mobile-phone list category.
for page_no in range(1, 3):
    page_url = 'https://list.jd.com/list.html?cat=9987,653,655&page=' + str(page_no)
    craw(page_url, page_no)
执行python test.py 结果
在对应的路径下爬取保存了相应图片,爬取成功
另一种网址的爬取方法代码类似
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import urllib.request
def craw(url, page, save_dir="/home/urllib/test/image/"):
    """Download lazy-loaded product thumbnails from one JD search page.

    Fetches *url*, extracts every 220x220 ``<img>`` tag that carries the
    search-page ``source-data-lazy-img`` attribute, and saves each image
    to *save_dir* under the name ``<page><index>.jpg``.

    Args:
        url: Full URL of the JD search-result page to scrape.
        page: Page number; used only as the filename prefix.
        save_dir: Destination directory (defaults to the original
            hard-coded path for backward compatibility).
    """
    req = urllib.request.Request(url)
    # Browser-like User-Agent so JD serves the normal page.
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
    html1 = str(urllib.request.urlopen(req).read())
    # Search pages mark thumbnails with "source-data-lazy-img"; raw
    # string so \. matches a literal dot.
    pat1 = r'<img width="220" height="220" class="err-product" data-img="1" source-data-lazy-img="//(.+?\.jpg)" />'
    imagelist = re.findall(pat1, html1)
    for x, imageurl in enumerate(imagelist, start=1):
        imagename = save_dir + str(page) + str(x) + ".jpg"
        imageurl = "http://" + imageurl
        print(imageurl)
        try:
            urllib.request.urlretrieve(imageurl, filename=imagename)
        except urllib.error.URLError:
            # Best-effort: skip failed downloads.  enumerate() keeps the
            # numbering advancing exactly once per image; the original
            # bumped the counter up to three times on an error.
            pass
# Legacy list-category crawl, kept for reference:
# for i in range(1,3):
#     url="https://list.jd.com/list.html?cat=670,671,672&page="+str(i)+"&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main"
#     craw(url,i)
# Search-result crawl: only odd page numbers correspond to full result
# pages, so even ones are skipped.
for page_no in range(1, 4):
    if (page_no % 2) != 0:
        search_url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&page=" + str(page_no)
        craw(search_url, page_no)
转载于:https://blog.51cto.com/superleedo/2123315