A scraper for Sogou image sets (pic.sogou.com): the site offers a wide range of categories, but each one holds relatively little data. The flow is: fetch one page of search results, extract each gallery's id, title and grpdocs, resolve the grpdocs into direct image URLs via the detail endpoint, then record the gallery in MongoDB and download its images to disk.
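Both endpoint URLs embed the category as a percent-encoded query parameter via urllib.parse.quote. A quick sketch of what that produces (the category string here is just an example):

from urllib import parse

category = "风景"             # example category ("scenery")
print(parse.quote(category))  # %E9%A3%8E%E6%99%AF

The full script: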
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor
from urllib import parse

import requests
from pymongo import MongoClient
class Save:
    """Thin MongoDB wrapper used to deduplicate and upsert gallery records."""

    def __init__(self, host):
        self.client = MongoClient(host=host, port=27017)
        self.db = self.client.ImageSet

    def _save_data_mongodb(self, collect_name, data):
        collection = self.db[collect_name]
        # Skip records we have already stored, keyed on the Sogou item id.
        # (The original returned True on both branches, which made the
        # duplicate check in the caller a no-op.)
        if collection.find_one({"_id": data['id']}):
            return False
        collection.update_one({'_id': data['id']}, {'$set': data}, upsert=True)
        return True
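With the duplicate check now returning False for records that already exist, the caller can skip galleries it has seen before. A minimal sketch of that behaviour, assuming a MongoDB instance is listening on localhost:27017 (the record dict is a made-up example):

saver = Save("localhost")
record = {'id': 'demo-001', 'title': 'demo', 'url': []}
print(saver._save_data_mongodb("搜狗图片", record))  # True: first write
print(saver._save_data_mongodb("搜狗图片", record))  # False: already stored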
class SouHu:
    def __init__(self, category):
        self.category_name = category
        self.category = parse.quote(category)  # URL-encode the category for the query string
        # Detail endpoint: expands a comma-separated group-doc id list into the full image list.
        self.image_url_temp = "https://pic.sogou.com/pics/imageddetail2013.jsp?k="+self.category+"&tc=&t=&id=0&d={}"
        # List endpoint: one page of gallery items, paged via the `start` parameter.
        self.start_url = "https://pic.sogou.com/pics?query="+self.category+"&mode=8&dm=11&leftp=44230502&cwidth=1024&cheight=768&st=0&start={}&reqType=ajax&reqFrom=result&tn=0"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }
    def get_title_id_grpdocs(self, url):
        """Fetch one result page and extract each gallery's id, title and grpdocs."""
        print(url)
        response = requests.get(url=url, headers=self.headers)
        json_response = response.json()
        image_content = []
        for i in json_response.get('items') or []:
            try:
                # grpdocs is a whitespace-separated list of image ids; the
                # detail endpoint wants them comma-separated.
                grpdocs = re.sub(r'\s+', ',', i['grpdocs'])
                item = {
                    'id': i['mf_id'],
                    'title': i['title'],
                    'ImageUrl': self.image_url_temp.format(grpdocs),
                }
                image_content.append(item)
            except (KeyError, TypeError) as e:
                print(e)
                continue
        print(image_content)
        return image_content
    def get_save_content(self, image_content):
        """Resolve each gallery's detail URL into a list of direct image URLs."""
        save_content = []
        for image in image_content:
            response = requests.get(url=image['ImageUrl'], headers=self.headers)
            json_response = response.json()
            image_list = []
            for i in json_response:
                # Prefer pic_url, fall back to ori_pic_url, skip entries with neither.
                image_url = i.get('pic_url') or i.get('ori_pic_url')
                if image_url:
                    image_list.append(image_url)
            save_content.append({
                'id': image['id'],
                'title': image['title'],
                'url': image_list,
            })
        print(save_content)
        return save_content
    def save_(self, save_content):
        upload_time = time.strftime("%Y-%m-%d", time.localtime())
        print("Starting write")
        for i in save_content:
            # Skip near-empty galleries.
            if len(i['url']) < 3:
                continue
            collect_name = "搜狗图片"
            result = Save("localhost")._save_data_mongodb(collect_name, data=i)
            if not result:
                # Already recorded in MongoDB, nothing to do.
                continue
            try:
                # Strip whitespace and non-word characters so the title is a
                # safe directory name.
                title = re.sub(r'\W+', '', re.sub(r'\s+', '', i['title']))
                path = 'D:/搜狗/'+self.category_name+'/' + str(upload_time) + '/' + title
            except Exception as e:
                print(e)
                continue
            if os.path.exists(path):
                continue
            os.makedirs(path)
            try:
                with open(path + '/content.txt', 'w', encoding='utf8') as fb:
                    fb.write(str([i['title']]))
                # enumerate avoids the original index() lookup, which
                # misnumbers duplicate URLs.
                for a, s in enumerate(i['url']):
                    with open(path + '/{}.jpg'.format(a), 'wb') as f:
                        print(s)
                        response = requests.get(url=s, headers=self.headers)
                        f.write(response.content)
            except Exception as e:
                print(e)
                continue
            print(title + " written")
    def run(self, num):
        url = self.start_url.format(num)
        image_content = self.get_title_id_grpdocs(url)
        save_content = self.get_save_content(image_content)
        self.save_(save_content)
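The scraper leans on two undocumented JSON shapes: the list endpoint returns an object whose 'items' entries carry mf_id, title and grpdocs, and the detail endpoint returns an array whose entries carry pic_url or ori_pic_url. A self-contained sketch of both parsing steps, using made-up payloads rather than real Sogou responses:

import re

# Made-up payloads, purely to illustrate the shapes the parser assumes.
sample_item = {'mf_id': '123', 'title': 'demo', 'grpdocs': 'a1 b2  c3'}
print(re.sub(r'\s+', ',', sample_item['grpdocs']))  # a1,b2,c3

sample_detail = [{'pic_url': 'http://example.com/1.jpg'},
                 {'ori_pic_url': 'http://example.com/2.jpg'},
                 {}]
urls = [d.get('pic_url') or d.get('ori_pic_url') for d in sample_detail]
print([u for u in urls if u])  # the two example.com URLs; the empty entry is dropped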
if __name__ == '__main__':
    category = input("Enter a category name: ")
    with ThreadPoolExecutor(10) as executor:
        sh = SouHu(category)
        for num in range(2400):
            executor.submit(sh.run, num)
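Submitting all 2,400 page offsets up front is safe: the pool queues them and runs at most 10 at a time. One caveat is that submit() silently discards worker exceptions unless each Future's result() is checked. A sketch of an executor.map variant that re-raises them when the results are consumed:

if __name__ == '__main__':
    category = input("Enter a category name: ")
    sh = SouHu(category)
    with ThreadPoolExecutor(10) as executor:
        # Iterating the map() results re-raises any exception a worker hit,
        # instead of dropping it as unchecked submit() calls do.
        list(executor.map(sh.run, range(2400)))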