python3爬取百度图片_py3 抓取百度图片-CSDN博客

本文链接：https://blog.csdn.net/weixin_43800510/article/details/86810444

其实爬虫异常地简单，特别是用 Python，几乎都是些已经封装好的库，功能特别强大。这个爬虫用到了 urllib（网络请求）、re（正则）、os（文件操作），它们都是 Python 3 自带的标准库。
首先打开命令窗口输入以下代码:

pip install urllib

然后回车,如图：
在这里插入图片描述
我的已经安装好了，所以和第一次安装的略有不同。（注：urllib 是 Python 3 的标准库，随 Python 一起安装，通常无需执行上面的 pip 命令。）
百度图片关键字美女
url地址如图：

然后开始爬取这个页面的内容：

#coding=utf-8
# Step 1: fetch the Baidu image-search result page and print its raw HTML.
import urllib
import urllib.request

# Search-result page; the `word` parameter is the URL-encoded keyword "美女".
url="https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E7%BE%8E%E5%A5%B3"
# Request headers: Baidu Images has anti-scraping checks, so the request
# carries a browser User-Agent and a referer pointing at Baidu Images itself.
headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",  # browser identity
    "referer":"https://image.baidu.com",  # pretend we navigated from Baidu Images
}
req=urllib.request.Request(url,headers=headers)
# The response is a context manager; `with` closes the connection even if
# read() raises, instead of leaving the socket open.
with urllib.request.urlopen(req) as resp:
    body=resp.read()
print(body)  # raw bytes of the page; in Python 3 print is a function

然后执行如图：
在这里插入图片描述
这便是这个网页的 HTML 内容了，接下来要对这个页面的代码进行分析：

经过不懈努力，终于发现我们所要的图片链接都跟在 thumbURL 后面。接下来开始用正则表达式把它们提取出来：

#coding=utf-8
# Step 2: fetch the search-result page and list every thumbnail URL in it.
import urllib
import urllib.request

import re  # regular expressions

# Search-result page; the `word` parameter is the URL-encoded keyword "美女".
url="https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E7%BE%8E%E5%A5%B3"
# Request headers: Baidu Images has anti-scraping checks, so the request
# carries a browser User-Agent and a referer pointing at Baidu Images itself.
headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",  # browser identity
    "referer":"https://image.baidu.com",  # pretend we navigated from Baidu Images
}
req=urllib.request.Request(url,headers=headers)
# `with` closes the HTTP response even if read()/decode() raises.
with urllib.request.urlopen(req) as resp:
    # Decode to str: a str regex pattern cannot be applied to bytes.
    body=resp.read().decode("utf-8")

# Each image entry in the page contains `thumbURL":"<link>"`; the non-greedy
# group captures the link up to the next double quote.
thumb_pattern=re.compile(r'thumbURL":"(.+?)"')
for img_url in thumb_pattern.findall(body):
    print(img_url)  # one thumbnail link per line

然后执行：
在这里插入图片描述
现在图片的链接已经获得了，那么接下来就是下载了。有urlretrieve方法，本人用的是另一种:

#coding=utf-8
# Step 3: download every thumbnail found on the search-result page into the
# current working directory as 0.jpg, 1.jpg, ...
import urllib
import urllib.request

import re  # regular expressions

# Search-result page; the `word` parameter is the URL-encoded keyword "美女".
url="https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E7%BE%8E%E5%A5%B3"
# Request headers: Baidu Images has anti-scraping checks, so every request
# (page and images alike) carries a browser User-Agent and a Baidu referer.
headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",  # browser identity
    "referer":"https://image.baidu.com",  # pretend we navigated from Baidu Images
}
req=urllib.request.Request(url,headers=headers)
# `with` closes the HTTP response even if read()/decode() raises.
with urllib.request.urlopen(req) as resp:
    body=resp.read().decode("utf-8")

# Each image entry in the page contains `thumbURL":"<link>"`.
thumb_pattern=re.compile(r'thumbURL":"(.+?)"')
num=0  # index of the next file to write; only advanced after a successful save
for img_url in thumb_pattern.findall(body):
    # Announce the download before it starts, not after the file is written.
    print("正在下载："+img_url)
    f_req=urllib.request.Request(img_url,headers=headers)
    try:
        with urllib.request.urlopen(f_req) as f_resp:
            f_body=f_resp.read()  # raw image bytes
    except OSError as err:
        # URLError/HTTPError and socket errors are OSError subclasses; skip
        # this image so one bad link does not abort the whole batch.
        print(img_url+"下载失败："+str(err))
        continue
    # Write-only binary mode is enough; `with` closes the file even on error.
    with open(str(num)+".jpg","wb") as fs:
        fs.write(f_body)
    print(img_url+"已下载成功")
    num+=1

执行：
在这里插入图片描述

看着这么几行代码就能让图片一张张往文件夹里下载，是不是非常有成就感呢？
至此，一个简单的百度图片爬虫就完成了。再改一下，做得人性化一些：

在这里插入图片描述
可以根据输入的关键字下载图片，实际上只要改变 url 中 word 参数的值（也就是代码里的 keyword）即可，这里就直接上代码了：

#coding=utf-8
"""Keyword-driven Baidu image downloader.

Asks for a search keyword, creates ``d://梦尘撸图神器//<keyword>`` if needed,
fetches the Baidu image-search result page for that keyword and saves every
thumbnail found on it as ``<keyword><n>.jpg`` inside that folder.
"""
import urllib
import urllib.request

from urllib.parse import quote
import re
import os


# Baidu Images has anti-scraping checks, so every request carries a browser
# User-Agent and a referer pointing at Baidu Images itself.
headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
    "referer":"https://image.baidu.com",
}

# Parent folder that holds one sub-folder per keyword.
ROOT_DIR="d://梦尘撸图神器"

# Search-result URL; the URL-encoded keyword is appended to it.
SEARCH_URL="http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word="

# Each image entry in the page contains `thumbURL":"<link>"`; the non-greedy
# group captures the link up to the next double quote.
THUMB_PATTERN=re.compile(r'thumbURL":"(.+?)"')


def print_banner():
    """Print the start-up banner."""
    print ("****************************************************************************************")
    print ("                               新年快乐                                               ")

    print (" 改撸图神器为第一代，只能撸三十张，免费使用，所撸的图保存在d盘的‘梦尘撸图神器’文件夹中    ")
    print ("                                                        --------来源:要成为编程王的男人")
    print ("****************************************************************************************")


def ensure_save_dir(keyword,root_dir=ROOT_DIR):
    """Make sure the folder for *keyword* exists and return its path.

    Args:
        keyword: search keyword; used verbatim as the sub-folder name.
        root_dir: parent folder, created first when it is missing.

    Returns:
        The path ``root_dir + "//" + keyword``.
    """
    save_dir=root_dir+"//"+keyword
    # Create the parent unconditionally; this replaces the two duplicated
    # branches that differed only in whether the parent had to be made.
    os.makedirs(root_dir,exist_ok=True)
    if os.path.exists(save_dir):
        print ("文件夹已经存在")
    else:
        os.mkdir(save_dir)
        print (save_dir+"已经创建成功")
    return save_dir


def fetch(url):
    """Return the raw response body of *url*, requested with ``headers``."""
    req=urllib.request.Request(url,headers=headers)
    # `with` closes the HTTP response even if read() raises.
    with urllib.request.urlopen(req) as resp:
        return resp.read()


def extract_thumb_urls(html):
    """Return every thumbnail link found in *html*, in page order."""
    return THUMB_PATTERN.findall(html)


def download_images(keyword,save_dir):
    """Download all thumbnails for *keyword* into *save_dir*.

    Files are named ``<keyword><n>.jpg`` with n counting up from 0. An image
    whose download fails is reported and skipped.

    Args:
        keyword: search keyword (not yet URL-encoded).
        save_dir: existing folder the images are written to.

    Returns:
        The number of images saved.
    """
    html=fetch(SEARCH_URL+quote(keyword,encoding="utf-8")).decode("utf-8")
    num=0  # index of the next file; only advanced after a successful save
    for img_url in extract_thumb_urls(html):
        print ("正在下载"+img_url)
        try:
            data=fetch(img_url)
        except OSError as err:
            # URLError/HTTPError and socket errors are OSError subclasses;
            # skip this image so one bad link does not abort the whole batch.
            print (img_url+"下载失败："+str(err))
            continue
        # Write-only binary mode is enough; `with` closes the file on error.
        with open(save_dir+"/"+keyword+str(num)+".jpg","wb") as fs:
            fs.write(data)
        num+=1
        print (img_url+"已下载成功")
    return num


def main():
    """Run the interactive downloader once."""
    print_banner()
    keyword=input("请输入要下载的图片：")
    save_dir=ensure_save_dir(keyword)
    download_images(keyword,save_dir)
    # input() only returns on Enter, so the prompt asks for Enter rather
    # than "any key"; it keeps a double-clicked console window open.
    input("按回车键结束程序：")


if __name__=="__main__":
    main()