案例分析
概述
Python 在网络爬虫方面功能强大,语法简洁高效。下面通过编程演示如何抓取网络图片。本案例分为两部分:单独图片抓取、全网图片抓取,并分别给出实现代码,供学习和技术交流使用。
Python基础环境准备
案例实现
单独图片抓取
import requests
# Download one image and save it as my.jpg in the current directory.
# NOTE(review): the ".jppg" extension looks like a typo for ".jpg" - confirm the URL.
url = 'http://724.169pp.net/169mm/201904/141.2.jppg'
# The timeout keeps the script from hanging forever on an unresponsive host.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx so an HTML error page is never written out as an image.
response.raise_for_status()
resp = response.content
# Write the raw image bytes to disk.
with open('my.jpg', 'wb') as f:
    f.write(resp)
全网图片抓取
import requests
import os
from pyquery import PyQuery as pq
# Listing page whose galleries are crawled.
url = 'https://www.169tp.com/guoneimeinv/'
# Browser-like User-Agent sent with every request; per the original note it is
# what gets the requests past the site's anti-scraping check.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36',
}
# Responses must not be decoded as UTF-8: doing so raised
# "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb9 in position 423:
# invalid start byte", which is why the pages are decoded as 'gbk' instead.
# Helper: download every large image found on one detail page.
def download_images(a_url, a_filename):
    """Fetch a detail page and save each of its large images locally.

    Images are written to ``图片/<a_filename>/<n>.jpg``, numbered from 1 in
    page order. The target directory is created when it does not exist yet.

    Args:
        a_url: URL of the detail page to fetch.
        a_filename: Name of the sub-directory (under ``图片/``) to save into.
    """
    resp_dtl = requests.get(a_url, headers=headers).content.decode('gbk')
    doc = pq(resp_dtl)

    # Create the directory once, up front. The previous "create it, else
    # write" logic skipped the first image whenever the directory was new.
    save_dir = "图片/" + a_filename
    os.makedirs(save_dir, exist_ok=True)

    count = 1
    for big_img in doc('.big_img p img').items():
        big_img_src = big_img.attr("src")
        if not big_img_src:
            # An <img> without a src has nothing to download.
            continue
        imgs_data = requests.get(big_img_src, headers=headers)
        with open(save_dir + '/{}.jpg'.format(count), 'wb') as f:
            f.write(imgs_data.content)
        count += 1
# Helper: find the galleries on a listing page and download each of them.
def getPics(a_url):
    """Fetch a listing page and download the images of every entry on it.

    Each element matching the ``.pic`` class selector supplies a detail-page
    URL (its ``href`` attribute) and a name (its text), which are passed to
    ``download_images``.

    Args:
        a_url: URL of the listing page to fetch.
    """
    listing_html = requests.get(a_url, headers=headers).content.decode('gbk')
    listing = pq(listing_html)
    # Select by class, then read the link target and the label of each match.
    for entry in listing(".pic").items():
        download_images(entry.attr("href"), entry.text())
# Script entry point: crawl the listing page at `url` and download its images.
getPics(url)
实现效果,抓取图片文件如下