# Python爬虫实战基础篇——图片爬取
# Python版本:3.5.4
# 涉及内容都是网络爬虫基础:requests和bs4库的应用、正则表达式等
import requests
import os
from bs4 import BeautifulSoup
import re
def GetHtml(url, timeout=10):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend, or
        the empty string ``""`` when the download or the parse fails. The
        failure value is falsy, so callers should test the result before
        calling tag-search methods on it.
    """
    try:
        r = requests.get(url, timeout=timeout)
        return BeautifulSoup(r.text, "html.parser")
    except Exception:
        # Best-effort contract: any fetch/parse failure yields "". Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit propagate.
        return ""
def GetPic(url):
    """Download the image at *url* and save it as the current page's JPEG.

    The page number used in the file name and in the status messages is read
    from the module-level variable ``page``, which the calling loop must set
    before each call.

    Args:
        url: Address of the image to download.

    Returns:
        None. A success or failure line is printed; network and file-system
        errors are reported, not raised.
    """
    path = "爬取图片/图片page"+str(page)+".jpg"
    try:
        pic = requests.get(url, timeout=10)
        # Do not store an HTTP error page under a .jpg name.
        pic.raise_for_status()
        # The target folder does not exist on a fresh run; create it so the
        # write below does not fail for every page.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # Binary write: the same pattern works for any non-text payload.
        with open(path, 'wb') as f:
            f.write(pic.content)
    except (requests.RequestException, OSError):
        print("page"+str(page)+"爬取失败")
    else:
        print("图片page"+str(page)+"爬取成功")
# Gallery being scraped: page 1 lives at base_url + ".html" and page N (N > 1)
# at base_url + "_N.html".
base_url = "http://www.zbjuran.com/mei/xinggan/201708/85005"
first_url = base_url + ".html"

# Determine the number of pages: take the first text fragment of the element
# matched by find(attrs='page') and keep only its digits.
# NOTE(review): presumably that fragment is the "N pages in total" label of
# the pager - confirm against the live markup.
soup = GetHtml(first_url)
pager = soup.find(attrs='page') if soup else None  # GetHtml returns "" on failure
pagestring = next(pager.strings, "") if pager is not None else ""
pageend = re.sub(r"\D", "", pagestring)
if not pageend:
    # Without a page count there is nothing to iterate over; stop with a
    # readable message instead of an obscure exception further down.
    raise SystemExit("获取页数失败")
total_pages = int(pageend)

# Walk every page and download its image. The loop variable must stay the
# module-level name `page`, because GetPic reads it to build the file name.
for page in range(1, total_pages + 1):
    if page == 1:
        url = first_url
    else:
        url = base_url + "_" + str(page) + ".html"
    soup = GetHtml(url)
    img = soup.find('img') if soup else None
    img_src = img.get('src') if img is not None else None
    if not img_src:
        # Page could not be fetched or has no usable <img>; report it and
        # carry on with the remaining pages.
        print("page"+str(page)+"爬取失败")
        continue
    GetPic(img_src)