爬取豆瓣读书首页图片并下载
linux环境下代码:
# coding: utf-8
from urllib import request
import re
def View(url):
    """Fetch ``url`` and save the raw response body to ``2s.txt``.

    The body is written exactly as received (bytes, no decoding), into the
    current working directory.
    """
    with request.urlopen(url) as response:
        page_bytes = response.read()
    # Binary mode keeps the downloaded bytes untouched on every platform.
    with open('2s.txt', 'wb+') as out_file:
        out_file.write(page_bytes)
def Img():
    """Download every ``.jpg`` referenced in the saved page ``2s.txt``.

    The page is read in text mode with the platform's default encoding.
    Each URL captured by the ``src="...jpg" class`` pattern is saved in the
    current directory as ``0.jpg``, ``1.jpg``, ... in order of appearance.
    """
    with open('2s.txt', 'r') as page_file:
        page_text = page_file.read()
    jpg_pattern = re.compile(r'src="(.+?\.jpg)" class')
    for index, img_url in enumerate(jpg_pattern.findall(page_text)):
        request.urlretrieve(img_url, '%s.jpg' % index)  # download the image
# Save the Douban Books front page to 2s.txt, then download the .jpg
# images that the saved page references.
View('https://book.douban.com/')
Img()
结果会下载以下链接的图片:
https://img3.doubanio.com/view/subject/m/public/s32312764.jpg
https://img3.doubanio.com/view/subject/m/public/s33308506.jpg
…
但是移植代码在Windows环境下会报错:
RESTART: C:/Users/shiga/AppData/Local/Programs/Python/Python37/urll/urll.py
Traceback (most recent call last):
File "C:/Users/shiga/AppData/Local/Programs/Python/Python37/urll/urll.py", line 23, in <module>
Img()
File "C:/Users/shiga/AppData/Local/Programs/Python/Python37/urll/urll.py", line 16, in Img
imglist=reg_img.findall(a)
TypeError: cannot use a string pattern on a bytes-like object
主要原因是:
TypeError: cannot use a string pattern on a bytes-like object.
正则表达式 reg 是字符串(str)模式,而以二进制方式读出的内容 a 是 bytes,两者类型不一致,所以 findall 会报错。
Python 3 的 urlopen(...).read() 返回的不是 str 而是 bytes,需要用 decode('utf-8') 进行解码,由 bytes 变成 str。
解决方法是:把 a 的类型调整一下:a.decode('utf-8')
代码如下:
# coding: utf-8
from urllib import request
import re
def View(url):
    """Download ``url`` and store the raw response bytes in ``2s.txt``.

    Args:
        url: Address of the page to fetch; anything ``urllib.request.urlopen``
            accepts.

    The file is created (or truncated) in the current working directory and
    holds the body exactly as received, without decoding.
    """
    with request.urlopen(url) as response:
        html = response.read()
    # Write-only binary mode is sufficient: the file is never read back here,
    # so the read/write mode 'wb+' is not needed.
    with open('2s.txt', 'wb') as page_file:
        page_file.write(html)
def Img():
    """Download every ``.jpg`` image referenced in the saved page ``2s.txt``.

    The page is read as bytes and decoded as UTF-8 once, so the ``str``
    regular expression can be applied to it on any platform. Each URL
    captured by the ``src="...jpg" class`` pattern is saved in the current
    directory as ``0.jpg``, ``1.jpg``, ... in order of appearance.

    Raises:
        UnicodeDecodeError: If ``2s.txt`` is not valid UTF-8.
    """
    # Read-only binary mode: 'rb+' would also require write permission on a
    # file that is only being read.
    with open('2s.txt', 'rb') as page_file:
        raw = page_file.read()
    # Decode once at the I/O boundary; a str pattern cannot search bytes.
    html = raw.decode('utf-8')
    pattern = re.compile(r'src="(.+?\.jpg)" class')
    for index, img_url in enumerate(pattern.findall(html)):
        request.urlretrieve(img_url, '%s.jpg' % index)
# Save the Douban Books front page to 2s.txt, then download the .jpg
# images that the saved page references.
View('https://book.douban.com/')
Img()
结果如下:
大功告成!