前言
因为需要一些图片素材,又不想一个个手动下载,遂通过爬虫来解放双手。在百度图片中搜索“汉服美女”,然后以浏览器地址栏中的地址作为初始 URL。分析该 URL 可知,它由 3 部分组成:域名 + 固定参数 + 关键字参数。
爬取
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author: Nancy
# @contact: fweiren@163.com
# @software: PyCharm
# @file: getHanfu.py
# @time: 2019/2/23 14:34
import requests
import re
import time
class BaiduPictures(object):
def __init__(self, keyboard):
    """Store the search keyword together with the fixed request settings.

    :param keyboard: search keyword; converted with ``str()`` before storing
    """
    # Desktop Chrome User-Agent string, split only for readability; the
    # adjacent literals are joined into one string at compile time.
    user_agent = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/66.0.3355.4 Safari/537.36"
    )
    self.headers = {"user-agent": user_agent}
    self.base_url = "https://image.baidu.com"
    self.keyboard = str(keyboard)
def send_request(self, url, timeout=10):
    """Fetch ``url`` with a GET request and return the body as text.

    :param url: address to request
    :param timeout: seconds to wait for the server before giving up;
        passed straight to ``requests.get``
    :return: decoded response text, or ``None`` when the request fails
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # server and the whole crawl hangs.
        response = requests.get(url, headers=self.headers, timeout=timeout)
        return response.text
    except (requests.RequestException, ValueError) as e:
        # Best-effort fetch: report the problem and signal failure with
        # None instead of aborting. ValueError covers malformed URLs.
        print(e)
        return None
def make_request(self, url, timeout=10):
    """Fetch ``url`` with a GET request and return the raw body bytes.

    :param url: address to request
    :param timeout: seconds to wait for the server before giving up;
        passed straight to ``requests.get``
    :return: response body as bytes (binary data), or ``None`` when the
        request fails or the server answers with an HTTP error status
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # server and the whole crawl hangs.
        response = requests.get(url, headers=self.headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures so that an error page is never
        # handed back as if it were the requested binary content.
        response.raise_for_status()
        return response.content
    except (requests.RequestException, ValueError) as e:
        # Best-effort fetch: report the problem and signal failure with
        # None instead of aborting. ValueError covers malformed URLs.
        print(e)
        return None
def load_page(self, html):
    """Find every ``"objURL"`` image address in ``html`` and save the image.

    Each address is downloaded with ``self.make_request`` and the bytes are
    handed to ``self.write_pic`` under a millisecond-timestamp file name.
    Addresses ending in jpg/jpeg/png keep that extension; anything else is
    stored as ``.jpg``. Addresses that yield no data are reported and
    skipped.

    :param html: page source text to scan; ``None`` or an empty string is
        ignored
    :return: None
    """
    if not html:
        # A failed page fetch gives None; re.findall would raise TypeError.
        return

    pattern = r'"objURL":"(http.*?)",'
    known_suffixes = ('jpg', 'jpeg', 'png')

    for img_url in re.findall(pattern, html):
        # Drop any query string / fragment so "pic.png?w=100" is still
        # recognised as png; [-1] also avoids an IndexError when the
        # address contains no dot at all.
        path = img_url.split("?", 1)[0].split("#", 1)[0]
        img_puffix = path.rsplit(".", 1)[-1].lower()
        if img_puffix not in known_suffixes:
            img_puffix = "jpg"

        # Millisecond timestamp used as the file name. int() guarantees a
        # plain digit string even where round() returns a float (Python 2).
        now_time = str(int(round(time.time() * 1000)))

        data = self.make_request(img_url)
        if not data:
            print(img_url + u"地址无效")
            continue
        self.write_pic(data, now_time + "." + img_puffix)