1、首先python爬取百度图片
代码如下:
import csv
import os
import re
import parsel
import requests
# Browser-like User-Agent so Baidu serves the normal search page instead of
# a bot-detection/degraded response.
# Fix: the original string was malformed — 'rv:11.0)like Gecko)' had a stray
# trailing ')' and was missing the space before "like Gecko".
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}
def baidu_img(keword, num, save_dir=r"E:\图片"):
    """Download up to *num* Baidu image-search results for *keword*.

    Images are saved as ``<save_dir>\<keword>\<name>.jpg`` and every saved
    image's (name, url) pair is appended to ``<save_dir>\<keword>.csv``.

    :param keword: search keyword (original parameter name kept for
        backward compatibility).
    :param num: maximum number of images to save.
    :param save_dir: base download directory — generalized from the
        previously hard-coded ``E:\图片``; the default preserves the old
        behavior.
    """
    base_url = 'https://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word={}'.format(keword)
    path1 = os.path.join(save_dir, keword)
    # Create the target folder if needed (replaces the exists()/mkdir dance).
    os.makedirs(path1, exist_ok=True)
    # timeout added so a stalled connection cannot hang the script forever.
    response = requests.get(base_url, headers=headers, timeout=30)
    html_str = response.text
    # Extract full-size image URLs from the JSON embedded in the page.
    pic_url = re.findall('"objURL":"(.*?)",', html_str, re.S)
    n = 0
    # Open the CSV log once, instead of re-opening it for every image.
    with open(path1 + '.csv', 'a', newline='') as ff:
        csvwriter = csv.writer(ff, dialect='excel')
        for i in pic_url:
            try:
                img = requests.get(i, headers=headers, timeout=30).content
                # Derive a file name from the URL tail; replace characters
                # that are illegal in Windows file names.
                img_name = re.sub(r'[\\/:*?"<>|]', '_', i.split('=')[-1])
                with open(os.path.join(path1, img_name + '.jpg'), 'wb') as f:
                    f.write(img)
                n = n + 1
                csvwriter.writerow([img_name, i])
                if n >= num:
                    break
            except Exception as e:
                # Best-effort download: log the failure and keep going,
                # as the original did.
                print(e)
# Script entry point: download 20 images for the keyword "狗" (dog).
if __name__ == '__main__':
    baidu_img('狗', 20)
2、验证图片是否已经正常爬取
如下图所示,20张狗狗图片已经爬取到我们的磁盘中来了
3、对图片进行去重操作,一般图片去重可以通过MD5去重,也可以通过感知哈希,计算图片的汉明距离进行去重,这里我们介绍第一种
代码如下:
import hashlib
import numpy as np
import requests
from PIL import Image
def md5(dirName):
files = os.listdir(dirName) # 遍历文件夹下的所有文件
temp = s