# Beginner crawler project: scrapes image URLs and product names from JD.com.
# Dynamically loaded content is not handled yet; only static images and names are fetched.
#-*- coding: utf-8 -*-
from urllib import request
from urllib import error
import chardet
import re
def crawler(urladdr, page, img_id):
    """Fetch one JD list page, download its product images, and log id/name pairs.

    Args:
        urladdr: Base list URL ending in "...&page=" — the page number is appended.
        page: Page number appended to urladdr.
        img_id: Next image id, used to name downloaded files ("img/<id><ext>").

    Returns:
        The next unused img_id, so the caller can thread it through successive pages.
    """
    urladdr = urladdr + str(page)
    print(urladdr)
    html1 = request.urlopen(urladdr).read()
    # Keep a raw copy of the page for debugging; "with" guarantees the handle
    # is closed (the original leaked it).
    with open("jd.html", 'wb') as htmlfile:
        htmlfile.write(html1)
    html1 = str(html1, 'utf-8')
    # Narrow to the product-list container first, then split it into <li> items.
    pattern1 = '<div id="plist" class="goods-list-v2 J-goods-list gl-type-1 ">(.+?)<div class="clr"></div>\n</div>'
    res1 = re.compile(pattern1, re.S).findall(html1)
    if not res1:
        # Page layout did not match the expected markup — nothing to scrape here.
        return img_id
    res1 = res1[0]
    pattern2 = u'<li class="gl-item">.+?</li>'
    res2 = re.compile(pattern2, re.S).findall(res1)
    # Open in append mode so results from earlier pages are kept
    # (the original used "w", which clobbered the file on every page).
    with open("resList.txt", "a") as file:
        x = 1
        for m in res2:
            # Lazily loaded images carry the URL in data-lazy-img; fall back to src.
            pattern3 = u'<img width=".+?" height=".+?" data-+?img=".+?" data-lazy-img="//(.+?)"'
            imgurl = re.compile(pattern3, re.S).findall(m)
            if not imgurl:
                pattern3 = u'<img width=".+?" height=".+?" data-+?img=".+?" src="//(.+?)"'
                imgurl = re.compile(pattern3, re.S).findall(m)
            print(x)
            x = x + 1
            if imgurl:
                imgurl = "http://" + imgurl[0]
                pattern5 = u'<a target="_blank" title=.+?>\n.+?<em>\n.+?(\S.+?)</em>'
                imgname = re.compile(pattern5, re.S).findall(m)
                if not imgname:
                    # No readable product name for this item — skip it rather
                    # than crash with IndexError as the original did.
                    continue
                imgname = imgname[0]
                try:
                    form = imgurl[-4:]  # file extension, e.g. ".jpg"
                    request.urlretrieve(imgurl, "img/" + str(img_id) + form)
                except error.URLError as e:
                    # Skip one id on a failed download so file names and the
                    # log stay aligned. (The original incremented twice when
                    # the error had both a code and a reason — an HTTPError
                    # always carries both.)
                    img_id = img_id + 1
                write_str = str(img_id) + " " + imgname + '\n'
                img_id = img_id + 1
                print(write_str)
                file.write(write_str)
    return img_id
# Alternative entry point (search page instead of category listing):
#urladdr = "https://search.jd.com/search?keyword=%E5%8D%95%E5%8F%8D%E5%85%A5%E9%97%A8%E7%9B%B8%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&spm=2.1.1&vt=2&page="
urladdr = "http://list.jd.com/list.html?cat=652,654,832&page="
img_id = 1
# Crawl category list pages 1..204, threading the running image id through
# each call so downloaded files are numbered consecutively across pages.
# (Removed the unused variable `s` from the original.)
for i in range(1, 205):
    img_id = crawler(urladdr, i, img_id)