import datetime
import os
from tqdm import tqdm
from baiduspider import BaiduSpider
import urllib
import time
import json
import socket
import urllib.request
import urllib.parse
import urllib.error
import re
from PIL import Image
import hashlib
import argparse
# --- command-line arguments -------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument('--kw', type=str, help='keyword', required=True)
args = parser.parse_args()

# --- run configuration ------------------------------------------------------
pg_st, pg_ed = 0, 200          # search-result page range to crawl
kw = args.kw
base_save_path = "./data"
# NOTE(review): datetime.now() is naive, so '%z' renders as '' — confirm intent.
curr_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S%z')

# Per-run output directory: <base>/<keyword>-<start>-<end>-<timestamp>/imgs
run_name = f"{kw.replace(' ', '_')}-{pg_st}-{pg_ed}-{curr_time}"
save_path = os.path.join(base_save_path, run_name)
save_img_path = os.path.join(save_path, "imgs")
os.makedirs(save_img_path, exist_ok=True)

pg_needed = range(pg_st, pg_ed)  # pages still to fetch
img_cnt = 0                      # running index used to name image files
res_url_list = []                # metadata records accumulated for the final dump
def get_suffix(name):
    """Return the file extension of *name*, including the leading dot.

    Falls back to '.jpeg' when the name has no extension at all or the
    candidate extension is implausibly long (> 5 chars, e.g. a URL with
    a query string tail).
    """
    m = re.search(r'\.[^\.]*$', name)
    # m is None when there is no dot anywhere in the name; the original
    # code crashed with AttributeError on m.group(0) in that case.
    if m is not None and m.group(0) and len(m.group(0)) <= 5:
        return m.group(0)
    return '.jpeg'
def save_jsonl(name, save_map):
    """Append *save_map* as a single JSON line to the file *name*.

    The file is opened with an explicit UTF-8 encoding: the dump uses
    ensure_ascii=False, so relying on the platform-default encoding
    (e.g. cp1252 on Windows) could raise UnicodeEncodeError for
    non-ASCII titles.
    """
    with open(name, 'a', encoding='utf-8') as f:
        json.dump(save_map, f, ensure_ascii=False)
        f.write('\n')
json_path = os.path.join(save_path, f"results-{curr_time}.json")
hash_set = set()  # md5 digests of saved images, used to skip duplicates

# Install a browser-like User-Agent once, up front — the original rebuilt
# and reinstalled the opener for every single image.
opener = urllib.request.build_opener()
opener.addheaders = [
    ('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'),
]
urllib.request.install_opener(opener)

for pn in tqdm(pg_needed):
    try:
        res = BaiduSpider().search_pic(kw, pn=pn)
    except Exception as e:
        # A failed search page is not fatal: log and skip it. The original
        # dropped into pdb.set_trace() here (a debugging leftover that hangs
        # unattended runs) and then used `res`, which may be undefined.
        print(e)
        continue
    for r in tqdm(res.results):
        txt = r.title
        url = r.url
        try:
            time.sleep(0.05)  # throttle requests a little
            suffix = get_suffix(url)
            save_name = os.path.join(save_img_path, f"{img_cnt}{suffix}")
            urllib.request.urlretrieve(url, save_name)
            # Hash the decoded pixel data rather than the raw file so the
            # same picture in different containers still deduplicates.
            # Context manager closes the file handle (the original leaked it,
            # which breaks the os.remove below on Windows).
            with Image.open(save_name) as im:
                md5_res = hashlib.md5(im.tobytes()).hexdigest()
            if md5_res in hash_set:
                print(f"save name {save_name} already saved, delete it")
                os.remove(save_name)
            else:
                hash_set.add(md5_res)
                save_map = {"text": txt, "url": url, "imgid": img_cnt, "imgpath": save_name}
                save_jsonl(json_path, save_map)
                # Collect the record for the final summary dump — the
                # original never appended, so it always dumped [].
                res_url_list.append(save_map)
        except urllib.error.HTTPError as urllib_err:
            print(urllib_err)
            continue
        except Exception as err:
            time.sleep(1)
            print(err)
            print("产生未知错误,放弃保存")
            continue
        # Success path (including the duplicate-deleted case): advance the
        # image counter, matching the original's numbering behavior.
        print(f"saved {img_cnt} images to {save_name}")
        img_cnt += 1

# Final summary of everything saved this run, written with an explicit
# UTF-8 encoding and a properly closed file handle.
with open(os.path.join(save_path, f"results-{curr_time}-{img_cnt}.json"), "w", encoding="utf-8") as f:
    json.dump(res_url_list, f, ensure_ascii=False, indent=2)