描述
利用python3爬取 图虫网图片,请按照国家规范爬取资源,勿占用社会资源,学习可以,请不要商用
思路
- 分析网页内容
- 获取图片地址
- 下载保存图片
- 上传到图库
- 供大家欣赏使用
注意事项
爬取图片时 请注意不要占用资源
代码讲解
导包
pip3 install selenium requests fake_useragent （time、os、random、urllib 均为 Python 标准库，无需安装）
需要用到的包
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import os
import requests
import random
from urllib import parse
from fake_useragent import UserAgent
代码部分
#!/usr/bin/python3
# -*- coding:utf-8 -*-
def get_page(driver, url, path):
    """Load *url* in the driven browser, then scrape and download its images."""
    print('open chrome ...')
    driver.get(url)
    print('Waiting for the chrome browser to load resources')
    # Give the page a moment to finish rendering its dynamic content.
    time.sleep(2)
    page_html = driver.page_source
    print("resources load OK content len : ", len(page_html))
    downloadImages(getDownloadUrl(driver), path)
def getDownloadUrl(driver):
    """Collect "imageId,downloadUrl" entries for every search result on the page.

    The CDN serves the large rendition at
    https://cdn3-banquan.ituchong.com/weili/image/l/{imageId}.jpeg
    (the old comment said /ml/, which did not match the code).

    Returns a list of strings of the form "<imageId>,<url>", the format
    downloadImages() expects.
    """
    items = driver.find_elements(By.CLASS_NAME, value='search-result__item')
    urls = []
    for item in items:
        href = item.get_attribute("href")
        # Robustness fix: the original crashed with a KeyError/TypeError when a
        # result's link was missing or carried no imageId query parameter.
        # Skip such entries instead of aborting the whole scrape.
        if not href:
            continue
        try:
            image_id = get_params(href)[0]
        except (KeyError, IndexError):
            continue
        url = "https://cdn3-banquan.ituchong.com/weili/image/l/{}.jpeg".format(image_id)
        urls.append(image_id + "," + url)
    return urls
def downloadImages(urls, path):
    """Download every "id,url" entry produced by getDownloadUrl into *path*."""
    print(" start download images")
    for entry in urls:
        parts = entry.split(",")
        download_img(parts[1], parts[0], path)
        # Pause between downloads so repeated requests don't get the IP banned.
        time.sleep(2)
def download_img(url, name, path):
    """Fetch one image from *url* and save it under *path* as <name><timestamp>.jpg.

    Fixes over the original:
    - os.mkdir raised when parent directories were missing, and raced if the
      directory appeared between the exists() check and the call;
      os.makedirs(..., exist_ok=True) handles both.
    - path + name silently produced a wrong filename when *path* lacked a
      trailing slash; os.path.join is separator-safe.
    """
    os.makedirs(path, exist_ok=True)
    print('download images ID:', name, " url: ", url)
    file_path = os.path.join(path, name + str(time.time()) + ".jpg")
    # Random back-off so the scraper stays polite and avoids an IP ban.
    time.sleep(random.randint(5, 20))
    content = request_get(url, "image")
    with open(file_path, 'wb') as f:
        f.write(content)
def get_params(url):
    """Return the list of 'imageId' values from *url*'s query string.

    Raises KeyError when the query string has no 'imageId' parameter.
    """
    query_string = parse.urlparse(url).query
    return parse.parse_qs(query_string)['imageId']
# HTTP request helper
def request_get(url, ret_type="text", timeout=20, encoding="GBK"):
    """GET *url* with a random User-Agent header.

    ret_type "text" returns the decoded body, "image" returns the raw
    bytes; any other value returns None (implicitly), matching the
    original contract.
    """
    response = requests.get(
        url=url,
        headers={'User-Agent': UserAgent().random},
        timeout=timeout,
    )
    response.encoding = encoding
    if ret_type == "image":
        return response.content
    if ret_type == "text":
        return response.text
if __name__ == '__main__':
    driver = webdriver.Chrome()
    # Bug fix: the original URL hard-coded page=1 and contained no "{}"
    # placeholder, so url.format(i) returned the identical page-1 URL on
    # every iteration and the same page was scraped four times.
    url_template = ("https://stock.tuchong.com/search?isTrusted=true&page={}"
                    "&platform=image&term=%E6%97%B6%E5%B0%9A")
    path = "D:/images/"
    urls = []
    for page in range(1, 5):
        page_url = url_template.format(page)
        urls.append(page_url)
        print("download images list URL info :", page_url)
    try:
        for url in urls:
            get_page(driver, url, path)
            print("start download next page images ")  # typo fix: "dowmload"
    finally:
        # quit() shuts down the browser AND the chromedriver process;
        # close() only closes the current window and leaks the driver.
        driver.quit()
运行效果预览
赏析地址传送门: http://lccsetsun.free.nf/