Crawling images from Baidu, Bing, and Google image search

Baidu

import requests
import os
import re
import datetime
import hashlib

md5_list = []

def get_images_from_baidu(keyword, save_dir):
    # UA spoofing: make the request look like it comes from a browser
    # Wrap the User-Agent in a dict
    # (Right-click the page → Inspect, or F12) → [Network] → [Ctrl+R] → pick a request on the left, then look under [Request Headers] on the right
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    # Request URL
    url = 'https://image.baidu.com/search/acjson?'
    n = 0

    # Deduplication
    total_urls = []
    global md5_list
    # for pn in range(0, 30 * page_num, 30):
    pag = 0
    while True:
        pn = pag * 30
        pag += 1
        # Request parameters
        param = {'tn': 'resultjson_com',
                 # 'logid': '7603311155072595725',
                 'ipn': 'rj',
                 'ct': 201326592,
                 'is': '',
                 'fp': 'result',
                 'queryWord': keyword,
                 'cl': 2,
                 'lm': -1,
                 'ie': 'utf-8',
                 'oe': 'utf-8',
                 'adpicid': '',
                 'st': -1,
                 'z': '',
                 'ic': '',
                 'hd': '',
                 'latest': '',
                 'copyright': '',
                 'word': keyword,
                 's': '',
                 'se': '',
                 'tab': '',
                 'width': '',
                 'height': '',
                 'face': 0,
                 'istype': 2,
                 'qc': '',
                 'nc': '1',
                 'fr': '',
                 'expermode': '',
                 'force': '',
                 'cg': '',    # undocumented parameter, but required
                 'pn': pn,    # result offset: 30, 60, 90, ...
                 'rn': '30',  # 30 results per page
                 'gsm': '1e',
                 '1618827096642': ''
                 }
        request = requests.get(url=url, headers=header, params=param)
        if request.status_code != 200:
            print("请求失败")
            continue
        request.encoding = 'utf-8'
        # Extract the image links with a regex
        html = request.text
        image_url_list = re.findall('"thumbURL":"(.*?)",', html, re.S)
        image_url_list = list(set(image_url_list))
        if not image_url_list:
            print("No more results, exiting")
            return
        for image_url in image_url_list:
            if image_url not in total_urls:
                image_data = requests.get(url=image_url, headers=header).content
                md5 = hashlib.md5(image_data).hexdigest()
                if md5 not in md5_list:
                    md5_list.append(md5)
                else:
                    print("重复数据,跳过")
                    continue
                now_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
                img_name = now_time + '.jpg'
                with open(os.path.join(save_dir, img_name), 'wb') as fp:
                    fp.write(image_data)
                n += 1
                print(f"Downloaded: {n}/{total}")
                total_urls.append(image_url)
                if n >= total:
                    print("Target count reached, exiting")
                    return





if __name__ == '__main__':
    keyword = '猫'
    save_dir = r"D:\data\gecaoji\pachong\baidu"
    save_dir = os.path.join(save_dir, keyword)
    os.makedirs(save_dir, exist_ok=True)

    # Seed the MD5 list with images already on disk so re-runs skip duplicates
    existing_files = os.listdir(save_dir)
    for existing_file in existing_files:
        existing_file_path = os.path.join(save_dir, existing_file)
        with open(existing_file_path, 'rb') as f:
            img_data = f.read()
            md5 = hashlib.md5(img_data).hexdigest()
            md5_list.append(md5)
    total = 3000  # target image count; read as a global inside get_images_from_baidu
    # page_num = 100
    get_images_from_baidu(keyword, save_dir)
    print('Get images finished.')
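
The regex on "thumbURL" works, but the acjson endpoint nominally returns JSON, so the links can also be read from the parsed response. Below is a minimal sketch of that alternative, assuming the usual layout with a top-level "data" array of result objects (an assumption; Baidu occasionally returns malformed JSON, hence the regex fallback). The helper name extract_thumb_urls is made up for illustration.

import json
import re

def extract_thumb_urls(response_text):
    """Collect thumbURL values from the acjson response.

    Tries json.loads first and falls back to the regex used above
    when the response does not parse cleanly.
    """
    try:
        data = json.loads(response_text)
        return [item['thumbURL'] for item in data.get('data', []) if item.get('thumbURL')]
    except (ValueError, TypeError, AttributeError):
        return list(set(re.findall('"thumbURL":"(.*?)",', response_text, re.S)))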

Bing

"""功能:通过爬虫快速获取图片"""

import os
import urllib
import requests
import re
from bs4 import BeautifulSoup
import datetime
import hashlib

header = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'
}

# Target URL template: q = query, first = result offset, count = page size, SFX = page index
url = "https://cn.bing.com/images/async?q={0}&first={1}&count={2}&scenario=ImageBasicHover&datsrc=N_I&layout=ColumnBased&mmasync=1&dgState=c*9_y*2226s2180s2072s2043s2292s2295s2079s2203s2094_i*71_w*198&IG=0D6AD6CBAF43430EA716510A4754C951&SFX={3}&iid=images.5599"



proxy_host = "127.0.0.1:7890"



# 创建代理配置
proxy_config = {
    'http': f'http://{proxy_host}',
    'https': f'http://{proxy_host}',
}

urls = []
md5_list = []

def getImage(url):
    '''Download the full-size image from its URL and save it locally.'''
    global count
    global md5_list
    try:
        now_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
        img_name = now_time + '.jpg'
        # time.sleep(0.5)
        # urllib.request.urlretrieve(url, os.path.join(save_path, img_name))
        response = requests.get(url, proxies=proxy_config)
        if response.status_code == 200:
            img_data = response.content
            md5 = hashlib.md5(img_data).hexdigest()
            if md5 not in md5_list:
                md5_list.append(md5)
            else:
                print("Duplicate image, skipping")
                return
            with open(os.path.join(save_path, img_name), 'wb') as f:
                f.write(img_data)
            count += 1
            print(f"Downloaded: {count}/{countNum}")
    except Exception as e:
        # time.sleep(1)
        print("Failed to fetch this image, skipping...")




def findImgUrlFromHtml(html, rule):
    '''Extract the full-size image URLs from a thumbnail list page and download them.'''
    soup = BeautifulSoup(html, "lxml")
    link_list = soup.find_all("a", class_="iusc")
    # urls = []
    global urls
    for link in link_list:
        result = re.search(rule, str(link))
        # print(result)
        if result is None:
            continue
        url = result.group(0)
        # Strip the leading '"murl":"' so only the image URL remains
        url = url[8:]
        # Drop HTML-escaped "amp;" fragments so the URL resolves correctly
        url = url.replace('amp;', '')
        urls.append(url)
        # Download the full-size image
        getImage(url)

    # One page done; the caller loads the next page


def getStartHtml(url, key, first, loadNum, sfx):
    '''Fetch one thumbnail list page (note: this request does not go through proxy_config).'''
    page = urllib.request.Request(url.format(key, first, loadNum, sfx),
                                  headers=header)
    html = urllib.request.urlopen(page)
    return html


if __name__ == '__main__':
    name = "hedgehog"  # 图片关键词

    save_path = os.path.join(r'D:\data\gecaoji\pachong\bing', name)
    os.makedirs(save_path, exist_ok=True)

    # Seed the MD5 list with images already on disk so re-runs skip duplicates
    existing_files = os.listdir(save_path)
    for existing_file in existing_files:
        existing_file_path = os.path.join(save_path, existing_file)
        with open(existing_file_path, 'rb') as f:
            img_data = f.read()
            md5 = hashlib.md5(img_data).hexdigest()
            md5_list.append(md5)


    countNum = 3000  # target number of images
    key = urllib.parse.quote(name)
    first = 1
    loadNum = 35
    sfx = 1
    count = 0
    rule = re.compile(r"\"murl\"\:\"http\S[^\"]+")

    while count < countNum:
        html = getStartHtml(url, key, first, loadNum, sfx)
        findImgUrlFromHtml(html, rule)
        first += loadNum
        # sfx += 1
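
The script above greps "murl" out of the stringified anchor tags with a regex. The a.iusc anchors typically carry a JSON blob in their m attribute, so the same URLs can also be read by parsing that attribute directly. A minimal sketch of that alternative, assuming the m attribute holds JSON with a "murl" field (the helper name extract_murls is made up for illustration):

import json
from bs4 import BeautifulSoup

def extract_murls(html):
    """Read the full-size image URLs from the JSON in each a.iusc element's 'm' attribute."""
    soup = BeautifulSoup(html, "lxml")
    murls = []
    for link in soup.find_all("a", class_="iusc"):
        m_attr = link.get("m")  # assumed to hold a JSON blob with a "murl" field
        if not m_attr:
            continue
        try:
            murls.append(json.loads(m_attr)["murl"])
        except (ValueError, KeyError):
            continue  # skip entries whose metadata does not parse
    return murls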



Google

# date: 2020.5.25
# author: pmy
# aim: crawl Google Images
# caveat: there is no guarantee that what gets crawled matches exactly what is shown on screen
from selenium.webdriver.common.by import By
from selenium import webdriver
import os
import requests
import base64
import datetime
import hashlib


# Change keyword to change the search term; consider changing the save directory as well
keyword = '刺猬'  # "hedgehog"
save_path = r'D:\data\gecaoji\pachong\google'
picpath = os.path.join(save_path, keyword)
os.makedirs(picpath, exist_ok=True)
# url = 'https://www.google.com.hk/search?q=' + keyword + '&source=lnms&tbm=isch'
url = "https://www.google.com.hk/search?q=" + keyword + "&tbm=isch&ved=2ahUKEwiqks_KsfyCAxWaSGwGHUsPDT8Q2-cCegQIABAA&oq=hedgehog&gs_lcp=CgNpbWcQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQ6BAgjECdQzApYzApgtRBoAHAAeACAAfwBiAGQA5IBBTAuMS4xmAEAoAEBqgELZ3dzLXdpei1pbWfAAQE&sclient=img&ei=fjxxZarbKJqRseMPy560-AM&bih=931&biw=1920&hl=zh-CN"
num = 600  # target image count

proxy_host = "127.0.0.1:7890"

# 定义目标URL

# 创建代理配置
proxy_config = {
    'http': f'http://{proxy_host}',
    'https': f'http://{proxy_host}',
}

count = 0  # number of images saved so far
md5_list = []


# Seed the MD5 list with images already on disk so re-runs skip duplicates
existing_img_files = os.listdir(picpath)
for existing_img_file in existing_img_files:
    existing_file_path = os.path.join(picpath, existing_img_file)
    with open(existing_file_path, "rb") as f:
        img_data = f.read()
        md5 = hashlib.md5(img_data).hexdigest()
        md5_list.append(md5)

class Crawler_google_images:
    # Initialization
    def __init__(self):
        self.url = url

    # Create the Chrome driver and open the url
    def init_browser(self):
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-infobars")
        browser = webdriver.Chrome(options=chrome_options)
        # Open the url
        browser.get(self.url)
        # Maximize the window; everything visible in it will be crawled
        browser.maximize_window()
        return browser

    # Download images
    def download_images(self, browser, num=100):
        # Save path: create it if it does not exist yet
        if not os.path.exists(picpath):
            os.makedirs(picpath)

        pos = 0
        # print(num)
        urls = []
        while count < num:

                # Scroll down
                js = 'var q=document.documentElement.scrollTop=' + str(pos)
                pos += 500
                browser.execute_script(js)
                # time.sleep(0.5)
                # Locate the thumbnail links
                # html = browser.page_source  # alternatively, grab the page HTML and parse it with BeautifulSoup
                # Finding the elements directly through the driver is the simplest option here
                img_elements = browser.find_elements(By.XPATH,
                                                     value='//a[@class="FRuiCf islib nfEiy"]')
                for img_element in img_elements:
                    # Click to open the large-image panel
                    try:
                        img_element.click()
                        # time.sleep(0.3)
                    except Exception as e:
                        continue
                        # print(e)
                    # The panel contains several <img> candidates, so filter them below
                    balabalas = browser.find_elements(By.XPATH,
                                                      value='//img[@class="rg_i Q4LuWd"]')

                    if balabalas:
                        for balabala in balabalas:
                            try:
                                src = balabala.get_attribute('src')
                            except:
                                continue
                            if src is None:
                                # print(src)
                                continue
                            # Keep only data URIs and https links; everything else is noise
                            if src.startswith('data:image') or src.startswith('https://'):
                                if src not in urls:
                                    urls.append(src)
                                    try:
                                        self.save_img(src, picpath)
                                    except:
                                        print(f"get {src} failed")

                                    if count >= num:
                                        print("Target count reached, stopping the search")
                                        return
                            else:
                                print(src)

    def save_img(self, img_src, picpath):
        global md5_list
        global count
        now_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
        img_name = now_time + '.jpg'
        filename = os.path.join(picpath, img_name)
        if img_src.startswith("data:image"):
            # Thumbnails come as base64 data URIs; decode and save them directly
            img_base64 = img_src.split(',')[-1]
            img_data = base64.b64decode(img_base64)
            md5 = hashlib.md5(img_data).hexdigest()
            if md5 not in md5_list:
                md5_list.append(md5)
            else:
                print("Duplicate image, skipping")
                return
            with open(filename, 'wb') as f:
                f.write(img_data)
                count += 1
                print(f"Downloaded: {count}/{num}")
        else:
            # Full-size images are fetched over HTTP through the proxy
            r = requests.get(img_src, proxies=proxy_config)
            if r.status_code == 200:
                img_data = r.content
                md5 = hashlib.md5(img_data).hexdigest()
                if md5 not in md5_list:
                    md5_list.append(md5)
                else:
                    print("Duplicate image, skipping")
                    return
                with open(filename, 'wb') as f:
                    f.write(img_data)
                    count += 1
                    print(f"Downloaded: {count}/{num}")


    def run(self):
        browser = self.init_browser()
        self.download_images(browser, num)  # num controls how many images to crawl
        browser.close()
        print("############ Crawl finished")


if __name__ == '__main__':

    craw = Crawler_google_images()
    craw.run()
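
Note that proxy_config above only applies to the requests.get calls inside save_img; the Selenium-driven Chrome itself connects directly. If the browser also needs to go through the same local proxy, Chrome accepts a --proxy-server switch. A minimal sketch, assuming the same 127.0.0.1:7890 proxy (the function name init_browser_with_proxy is made up for illustration; wire it into the class in place of init_browser if needed):

from selenium import webdriver

def init_browser_with_proxy(proxy_host="127.0.0.1:7890"):
    """Variant of init_browser that also routes Chrome's own traffic through the local proxy."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--disable-infobars")
    # --proxy-server is a standard Chromium switch; the address is the assumed local proxy
    chrome_options.add_argument(f"--proxy-server=http://{proxy_host}")
    browser = webdriver.Chrome(options=chrome_options)
    browser.get(url)  # 'url' is the module-level search URL defined above
    browser.maximize_window()
    return browser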

