#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""
@author: JHC000abc@gmail.com
@file: test.py
@time: 2023/11/6 20:45
@desc:
"""
from urllib.parse import quote_plus

from PIL import Image

from sdk.temp.temp_supports import IsSolution
from sdk.utils.util_network import NetWorkRequests


class Solution(IsSolution):
    """Fetch the first few original images of a Google image search.

    For every query read from a text file the search result page is
    requested, candidate image URLs are extracted from the raw HTML and the
    images are written to a target folder. Tags and URLs that failed are
    collected in ``self.error_lis`` and saved as a report at the end of
    :meth:`process`.

    NOTE(review): ``self.folder`` and ``self.save_result`` are not defined in
    this class; they are presumably provided by ``IsSolution`` - confirm.
    """

    def __init__(self, **kwargs):
        """
        Set up the HTTP helper, the default request headers and the error log.

        :param kwargs: arbitrary attributes that are copied onto the instance
            before the attributes below are assigned (so they cannot override
            ``net``, ``headers`` or ``error_lis``).
        """
        super(Solution, self).__init__()
        # Copy the keyword arguments onto the instance directly instead of
        # searching locals() for "the first dict", which was fragile.
        self.__dict__.update(kwargs)
        self.net = NetWorkRequests()
        self.headers = {
            "authority": "www.google.com",
            "accept-language": "zh-CN,zh;q=0.9",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        }
        # Each entry is a one-element list: [tag] for failed requests,
        # [url] for images that did not pass verification.
        self.error_lis = []

    def get_query(self, in_path):
        """
        Yield the search queries stored one per line in a UTF-8 text file.

        :param in_path: path of the text file to read.
        :return: generator of stripped, non-empty lines.
        """
        with open(in_path, "r", encoding="utf-8") as fp:
            for line in fp:
                query = line.strip()
                # A blank line would otherwise end up as a request without
                # any URL, so it is skipped here.
                if query:
                    yield query

    def get_data(self, tag, query=None, url=None, headers=None):
        """
        Send a GET request for a search query or for an explicit URL.

        :param tag: label recorded in ``self.error_lis`` when the request fails.
        :param query: search text; when given, a Google image search URL is
            built from it and ``url`` is ignored.
        :param url: URL to request when no ``query`` is given.
        :param headers: HTTP headers forwarded to the request helper.
        :return: ``res["msg"]`` of the helper's result when ``res["status"]``
            is 0 (presumably a response object - the callers read ``.text``
            and ``.content`` from it), otherwise ``None``.
        """
        if query:
            # Percent-encode the query so spaces, "&", "#" and non-ASCII
            # characters cannot corrupt the query string.
            url = "https://www.google.com/search?q={}&tbm=isch".format(
                quote_plus(query))
        if url:
            # Set a mapping such as {"http": "http://127.0.0.1:1080",
            # "https": "http://127.0.0.1:1080"} here to go through a proxy.
            proxies = None
            res = self.net._requests(
                method="GET", url=url, headers=headers, proxies=proxies)
            if res["status"] == 0:
                return res["msg"]
        # Reached when the request failed or when there was nothing to request.
        self.error_lis.append([tag])
        print("获取数据失败:{},请切换代理重试".format(tag))
        return None

    def get_image_urls(self, url_lis, max_count=3):
        """
        Extract image URLs from the search page saved as ``temp.html``.

        The page text is split on commas; each piece has its query string,
        double quotes and ``[`` characters removed and is kept when it looks
        like an http(s) URL ending in ``.jpg`` or ``.png``.

        :param url_lis: list the URLs are appended to (modified in place).
        :param max_count: stop as soon as ``url_lis`` holds this many entries.
        :return: the same ``url_lis`` object.
        """
        with open("temp.html", "r", encoding="utf-8") as fpr:
            data = fpr.read()
        for chunk in data.split(","):
            if len(url_lis) >= max_count:
                break
            candidate = chunk.split("?")[0].replace('"', "").replace("[", "")
            # Tuple arguments treat both schemes and both extensions alike;
            # the previous hand-written condition missed "http://...png".
            if candidate.startswith(("https://", "http://")) and \
                    candidate.endswith((".jpg", ".png")):
                print(candidate)
                url_lis.append(candidate)
        return url_lis

    def download_images(self, url_list, query, save_path):
        """
        Download every URL and store it as ``<query>_<n>.jpg`` in ``save_path``.

        Files that fail the image verification are kept on disk, but their
        URL is appended to ``self.error_lis``.

        :param url_list: image URLs to download.
        :param query: search text, used for the file name and as error tag.
        :param save_path: folder the images are written to.
        :return: None
        """
        for number, url in enumerate(url_list, start=1):
            print("开始下载:{}".format(url))
            response = self.get_data(url=url, tag=query, headers=self.headers)
            if not response:
                # get_data() has already recorded and reported the failure.
                continue
            name = "{}_{}.jpg".format(query, number)
            image_file = self.folder.merge_path([save_path, name])
            with open(image_file, "wb") as fp:
                fp.write(response.content)
            # is_image_corrupted() returns True for an intact image.
            if not self.is_image_corrupted(image_file):
                print("图像损坏:{}".format(query))
                self.error_lis.append([url])
            print("{} 下载完成:{}".format(name, url))

    def is_image_corrupted(self, file_path):
        """
        Check whether an image file can be opened and verified.

        NOTE: despite the name, the result is ``True`` when the image is
        intact and ``False`` when it is corrupted. The return value is kept
        as it is because callers rely on it.

        :param file_path: path of the image file to check.
        :return: ``True`` if the file opens and verifies, ``False`` if
            opening or verifying raises ``IOError`` or ``SyntaxError``.
        """
        try:
            # The context manager releases the file handle even when
            # opening succeeds but verification raises.
            with Image.open(file_path) as img:
                img.verify()
        except (IOError, SyntaxError):
            return False
        return True

    def process(self, **kwargs):
        """
        Run the whole pipeline: read queries, search, extract, download.

        :param kwargs: must contain ``in_path`` (text file with one query per
            line) and ``save_path`` (output folder, created if necessary).
        :return: None
        """
        in_path = kwargs["in_path"]
        save_path = kwargs["save_path"]
        self.folder.create_folder(save_path)
        for query in self.get_query(in_path):
            response = self.get_data(
                tag=query, query=query, headers=self.headers)
            if not response:
                # Skip the query; otherwise a stale temp.html left over from
                # the previous query could be parsed.
                continue
            # get_image_urls() reads the page back from this file.
            with open("temp.html", "w", encoding="utf-8") as fp:
                fp.write(response.text)
            url_lis = self.get_image_urls([])
            self.download_images(url_lis, query, save_path)
        if self.error_lis:
            self.save_result(
                self.folder.merge_path([save_path, "下载失败链接.txt"]),
                data=self.error_lis)
if __name__ == '__main__':
    # Script entry point: read the queries from a text file on the desktop
    # and store the downloaded images in a folder next to it.
    solution = Solution()
    solution.process(
        in_path=R"D:\Desktop\1.txt",
        save_path=R"D:\Desktop\2",
    )
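# Source article: "Python: fetching the original images of Google image search results"
# (web-page residue: latest recommended article published 2024-06-17 09:42:01)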