A scraper for Sogou image sets (pic.sogou.com): the site offers a wide range of categories, but each one holds relatively little data. The flow is: fetch one page of search results, extract each gallery's id, title and grpdocs, resolve the grpdocs into direct image URLs via the detail endpoint, then record the gallery in MongoDB and download its images to disk.
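Both endpoint URLs embed the category as a percent-encoded query parameter via urllib.parse.quote. A quick sketch of what that produces (the category string here is just an example):

from urllib import parse

category = "风景"             # example category ("scenery")
print(parse.quote(category))  # %E9%A3%8E%E6%99%AF

The full script: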
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor
from urllib import parse

import requests
from pymongo import MongoClient
class Save:
    """Thin MongoDB wrapper used to deduplicate and upsert gallery records."""

    def __init__(self, host):
        self.client = MongoClient(host=host, port=27017)
        self.db = self.client.ImageSet

    def _save_data_mongodb(self, collect_name, data):
        collection = self.db[collect_name]
        # Skip records we have already stored, keyed on the Sogou item id.
        # (The original returned True on both branches, which made the
        # duplicate check in the caller a no-op.)
        if collection.find_one({"_id": data['id']}):
            return False
        collection.update_one({'_id': data['id']}, {'$set': data}, upsert=True)
        return True
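With the duplicate check now returning False for records that already exist, the caller can skip galleries it has seen before. A minimal sketch of that behaviour, assuming a MongoDB instance is listening on localhost:27017 (the record dict is a made-up example):

saver = Save("localhost")
record = {'id': 'demo-001', 'title': 'demo', 'url': []}
print(saver._save_data_mongodb("搜狗图片", record))  # True: first write
print(saver._save_data_mongodb("搜狗图片", record))  # False: already stored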
class SouHu:
    def __init__(self, category):
        self.category_name = category
        self.category = parse.quote(category)  # URL-encode the category for the query string
        # Detail endpoint: expands a comma-separated group-doc id list into the full image list.
        self.image_url_temp = "https://pic.sogou.com/pics/imageddetail2013.jsp?k="+self.category+"&tc=&t=&id=0&d={}"
        # List endpoint: one page of gallery items, paged via the `start` parameter.
        self.start_url = "https://pic.sogou.com/pics?query="+self.category+"&mode=8&dm=11&leftp=44230502&cwidth=1024&cheight=768&st=0&start={}&reqType=ajax&reqFrom=result&tn=0"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }
    def get_title_id_grpdocs(self, url):
        """Fetch one result page and extract each gallery's id, title and grpdocs."""
        print(url)
        response = requests.get(url=url, headers=self.headers)
        json_response = response.json()
        image_content = []
        for i in json_response.get('items') or []:
            try:
                # grpdocs is a whitespace-separated list of image ids; the
                # detail endpoint wants them comma-separated.
                grpdocs = re.sub(r'\s+', ',', i['grpdocs'])
                item = {
                    'id': i['mf_id'],
                    'title': i['title'],
                    'ImageUrl': self.image_url_temp.format(grpdocs),
                }
                image_content.append(item)
            except (KeyError, TypeError) as e:
                print(e)
                continue
        print(image_content)
        return image_content
    def get_save_content(self, image_content):
        """Resolve each gallery's detail URL into a list of direct image URLs."""
        save_content = []
        for image in image_content:
            response = requests.get(url=image['ImageUrl'], headers=self.headers)
            json_response = response.json()
            image_list = []
            for i in json_response:
                # Prefer pic_url, fall back to ori_pic_url, skip entries with neither.
                image_url = i.get('pic_url') or i.get('ori_pic_url')
                if image_url:
                    image_list.append(image_url)
            save_content.append({
                'id': image['id'],
                'title': image['title'],
                'url': image_list,
            })
        print(save_content)
        return save_content
    def save_(self, save_content):
        upload_time = time.strftime("%Y-%m-%d", time.localtime())
        print("Starting write")
        for i in save_content:
            # Skip near-empty galleries.
            if len(i['url']) < 3:
                continue
            collect_name = "搜狗图片"
            result = Save("localhost")._save_data_mongodb(collect_name, data=i)
            if not result:
                # Already recorded in MongoDB, nothing to do.
                continue
            try:
                # Strip whitespace and non-word characters so the title is a
                # safe directory name.
                title = re.sub(r'\W+', '', re.sub(r'\s+', '', i['title']))
                path = 'D:/搜狗/'+self.category_name+'/' + str(upload_time) + '/' + title
            except Exception as e:
                print(e)
                continue
            if os.path.exists(path):
                continue
            os.makedirs(path)
            try:
                with open(path + '/content.txt', 'w', encoding='utf8') as fb:
                    fb.write(str([i['title']]))
                # enumerate avoids the original index() lookup, which
                # misnumbers duplicate URLs.
                for a, s in enumerate(i['url']):
                    with open(path + '/{}.jpg'.format(a), 'wb') as f:
                        print(s)
                        response = requests.get(url=s, headers=self.headers)
                        f.write(response.content)
            except Exception as e:
                print(e)
                continue
            print(title + " written")
    def run(self, num):
        url = self.start_url.format(num)
        image_content = self.get_title_id_grpdocs(url)
        save_content = self.get_save_content(image_content)
        self.save_(save_content)
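The scraper leans on two undocumented JSON shapes: the list endpoint returns an object whose 'items' entries carry mf_id, title and grpdocs, and the detail endpoint returns an array whose entries carry pic_url or ori_pic_url. A self-contained sketch of both parsing steps, using made-up payloads rather than real Sogou responses:

import re

# Made-up payloads, purely to illustrate the shapes the parser assumes.
sample_item = {'mf_id': '123', 'title': 'demo', 'grpdocs': 'a1 b2  c3'}
print(re.sub(r'\s+', ',', sample_item['grpdocs']))  # a1,b2,c3

sample_detail = [{'pic_url': 'http://example.com/1.jpg'},
                 {'ori_pic_url': 'http://example.com/2.jpg'},
                 {}]
urls = [d.get('pic_url') or d.get('ori_pic_url') for d in sample_detail]
print([u for u in urls if u])  # the two example.com URLs; the empty entry is dropped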
if __name__ == '__main__':
    category = input("Enter a category name: ")
    with ThreadPoolExecutor(10) as executor:
        sh = SouHu(category)
        for num in range(2400):
            executor.submit(sh.run, num)
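Submitting all 2,400 page offsets up front is safe: the pool queues them and runs at most 10 at a time. One caveat is that submit() silently discards worker exceptions unless each Future's result() is checked. A sketch of an executor.map variant that re-raises them when the results are consumed:

if __name__ == '__main__':
    category = input("Enter a category name: ")
    sh = SouHu(category)
    with ThreadPoolExecutor(10) as executor:
        # Iterating the map() results re-raises any exception a worker hit,
        # instead of dropping it as unchecked submit() calls do.
        list(executor.map(sh.run, range(2400)))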