Scraping Baidu Images with the requests Library

Installation

pip install requests
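A quick sanity check that the install worked (a minimal sketch; the printed version will vary by environment):

import requests

# Importing the package and printing its version confirms it is available
print(requests.__version__)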

Crawler Workflow

The rough workflow of the crawler: request the search page, extract the image URLs from its source, then request each image URL and save the bytes to disk.

Scraping Images from the First Page

Static page (the image URLs are embedded directly in the HTML source)

import re
import os
import requests

# 1. Target data: images
# 2. Request flow: (1) request the page to get the image URLs,
#    (2) request each URL to get the image data, (3) save to disk

# Downloading a single image:
# url = "https://ss1.bdstatic.com/70cFuXSh_Q1YnxGkpoWK1HF6hhy/it/u=2325464303,1389978966&fm=26&gp=0.jpg"
#
# res = requests.get(url)
# # res contains the cat image data
# print(res.content)  # binary data
# with open("猫咪.jpg", "wb") as f:
#     f.write(res.content)

# Pattern of the image URLs in the page source:
# "thumbURL":"https://ss1.bdstatic.com/70cFuXSh_Q1YnxGkpoWK1HF6hhy/it/u=2535535235,1109729418&fm=26&gp=0.jpg"
# Extract them with a regular expression.

# Request the search page, then match the image URLs out of its source code.

url="https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=index&pos=history&word=%E7%8C%AB%E5%92%AA"
# headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"}
headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
"Cookie": "winWH=%5E6_1366x657; BDIMGISLOGIN=0; BDqhfp=%E7%8C%AB%E5%92%AA%26%260-10-1undefined%26%260%26%261; BAIDUID=31BE3CEB3DDB7ADCA3C987A69863BD4A:FG=1; PSTM=1585467854; BIDUPSID=6B73B5EB3CF18DDDF94A54DB137A0C70; H_WISE_SIDS=139912_143435_142019_144427_141875_141748_143789_144420_142780_144483_136862_144489_131246_141261_144741_138883_141942_127969_140066_143999_140593_143057_141808_140351_141008_143470_144727_143923_144376_131423_144289_142207_143704_143519_107318_138595_139910_144306_143478_142427_140368_138662_142505_141910_144238_142113_143859_136751_140843_110085; BDUSS=UR4a0I0UTR-QmpvflZJdlB4bnduUUR3UGx-ekhlblloSUpsSzZHT3Y1VUdOVTlmRVFBQUFBJCQAAAAAAAAAAAEAAADGU~YYtv63rMrC0rUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAaoJ18GqCdfO; BDUSS_BFESS=UR4a0I0UTR-QmpvflZJdlB4bnduUUR3UGx-ekhlblloSUpsSzZHT3Y1VUdOVTlmRVFBQUFBJCQAAAAAAAAAAAEAAADGU~YYtv63rMrC0rUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAaoJ18GqCdfO; __yjs_duid=1_aebcd34bae6eb31144a93ee9ce01016e1611491945901; indexPageSugList=%5B%22%E7%99%BE%E5%BA%A6%E5%9B%BE%E7%89%87%22%2C%22%E7%8C%AB%E5%92%AA%22%2C%22%E6%88%91%E8%A6%81%E5%AD%A6%E4%B9%A0%22%2C%22%E5%9B%BE%E7%89%87%22%2C%22%E8%AE%BE%E8%AE%A1%22%2C%22tornado%22%2C%22%E7%8B%97%E5%AD%90%22%2C%22%E4%BA%91%E6%B2%83%E5%AE%A2%E4%B8%8A%E7%9A%84%E6%8A%95%E6%A0%87%E6%98%AF%E6%80%8E%E4%B9%88%E5%9B%9E%E4%BA%8B%2C%E6%98%AF%E9%9C%80%E8%A6%81%E5%86%99%E5%A5%BD%E4%BB%A3%E7%A0%81%E5%86%8D%E6%8A%95%E6%A0%87%E4%B9%88%22%2C%22%E7%BE%8E%E5%9B%BE%E7%A7%80%E7%A7%80%E5%8E%BB%E9%99%A4%E5%9B%BE%E7%89%87%E6%B0%B4%E5%8D%B0%22%5D; BAIDUID_BFESS=59B0BC4A359EF2EC96697A13EDBA3229:FG=1; H_PS_PSSID=33256_33273_33595_33392_33460_26350_22157; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; PSINO=7; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BA_HECTOR=akahag2ga00g2h2hrv1g39gds0r; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; firstShowTip=1; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; userFrom=null; ab_sr=1.0.0_OTM2Nzg1MDY3YzUxYmJlZDNjZTI2ZjY0Yjc0MjQ4NTIwNzg5ODc1MjEwNjBhNTdjOGY1MmJjNWU5NzM3YTEzMmYwNGVlODA1MTkzYmRiZDAwNmM4YTgyMGNmYjQ0NjVl; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm",
"Host": "image.baidu.com",
"sec-ch-ua": '"e";v="88", ";Not A Brand";v="99"',
"sec-ch-ua-mobile": "?0",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",}
res=requests.get(url,headers=headers)
#添加请求头

# print(res.request.headers)
# print(res.text)  # string data

# Extract the data: the cat image URLs

urls=re.findall('"thumbURL":"(.*?)"', res.text)
print(urls)

if not os.path.exists("猫咪"):
    os.mkdir("猫咪")

# Request each image URL to get the image data
for index, img_url in enumerate(urls):
    print(index)
    if "\\" in img_url:
        img_url = img_url.replace("\\", "")  # strip escaped backslashes from the URL

    res = requests.get(img_url)
    # res contains the cat image data
    # print(res.content)  # binary data
    filename = "猫咪" + "/" + "cat" + str(index) + ".jpg"

    with open(filename, "wb") as f:
        f.write(res.content)
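The loop above assumes every download succeeds. A slightly more defensive variant (a sketch; the timeout value and the error handling are my additions, not part of the original):

# Variant of the download loop with a timeout and basic error handling
# (assumed additions; the timeout value is arbitrary).
for index, img_url in enumerate(urls):
    img_url = img_url.replace("\\", "")
    try:
        res = requests.get(img_url, timeout=10)
        res.raise_for_status()  # raise on 4xx/5xx responses
    except requests.RequestException as e:
        print("skipping", img_url, ":", e)
        continue
    with open("猫咪/cat" + str(index) + ".jpg", "wb") as f:
        f.write(res.content)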

Scraping Multiple Pages

Dynamic page (further results are loaded from an AJAX endpoint rather than the HTML)

The key point is the pattern in the page URLs: every page comes from the acjson endpoint, with pn advancing by 30 per page (90, 120, 150, ...) and rn=30 results per page.
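The captured URLs in the code below show this concretely. One way to make the pattern explicit is to let requests build the query string from a dict (a sketch; the trimmed parameter subset is an assumption, and Baidu may reject requests missing the others):

import requests

# Hypothetical helper: fetch one result page by its pn offset.
# Only the parameters that visibly vary are spelled out here.
def fetch_page(keyword, pn, headers):
    params = {
        "tn": "resultjson_com",
        "ipn": "rj",
        "queryWord": keyword,
        "word": keyword,
        "ie": "utf-8",
        "oe": "utf-8",
        "pn": pn,   # result offset: grows by 30 per page
        "rn": 30,   # results per page
    }
    return requests.get("https://image.baidu.com/search/acjson",
                        params=params, headers=headers)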

import requests
import re
import os

# Pattern of the page URLs: page 2 has pn=90, page 3 pn=120, page 4 pn=150 (pn grows by 30)
page2="https://image.baidu.com/search/acjson?tn=resultjson_com&logid=7495559878915684143&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%8C%AB%E5%92%AA&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=0&ic=0&hd=0&latest=0&copyright=0&word=%E7%8C%AB%E5%92%AA&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=90&rn=30&gsm=5a&1614171744194="
page3="https://image.baidu.com/search/acjson?tn=resultjson_com&logid=7495559878915684143&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%8C%AB%E5%92%AA&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=0&ic=0&hd=0&latest=0&copyright=0&word=%E7%8C%AB%E5%92%AA&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=120&rn=30&gsm=78&1614171746052="
page4="https://image.baidu.com/search/acjson?tn=resultjson_com&logid=7495559878915684143&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%8C%AB%E5%92%AA&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=0&ic=0&hd=0&latest=0&copyright=0&word=%E7%8C%AB%E5%92%AA&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=150&rn=30&gsm=96&1614171872254="

headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"}
page_url="https://image.baidu.com/search/acjson?tn=resultjson_com&logid=7495559878915684143&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%8C%AB%E5%92%AA&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=0&ic=0&hd=0&latest=0&copyright=0&word=%E7%8C%AB%E5%92%AA&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=90&rn=30&gsm=5a&1614171744194="

# res=requests.get(page_url,headers=headers)
# 
# urls=re.findall('"thumbURL":"(.*?)"', res.text)
# print(urls)
# print(len(urls))

def get_img(img_urls, dirname):
    """
    Purpose: download Baidu images and save them to a folder.
    Args:
        img_urls: list of image URLs
        dirname: folder to store the images in
    """
    # Request each image URL
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
        "Referer": "https://image.baidu.com"}
    for index, img_url in enumerate(img_urls):
        print(index)
        if "\\" in img_url:
            img_url = img_url.replace("\\", "")  # strip escaped backslashes

        res = requests.get(img_url, headers=headers)
        # res contains the cat image data
        # print(res.content)  # binary data
        filename = dirname + "/" + "cat" + str(index) + ".jpg"

        with open(filename, "wb") as f:
            f.write(res.content)
# Fetch multiple pages of images

# 1. Build each page URL
for i in range(1, 5):
    page_url = "https://image.baidu.com/search/acjson?tn=resultjson_com&logid=7495559878915684143&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%8C%AB%E5%92%AA&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=0&ic=0&hd=0&latest=0&copyright=0&word=%E7%8C%AB%E5%92%AA&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&" \
               "pn={}&rn=30&gsm=5a&1614171744194="
    num = str(i * 30)
    page_url = page_url.format(num)
    res = requests.get(page_url, headers=headers)
    img_urls = re.findall('"thumbURL":"(.*?)"', res.text)  # extract the image URLs
    dirname = "猫咪" + str(i)
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    # 2. Request each image URL and save the images
    get_img(img_urls, dirname)
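Since the acjson endpoint returns JSON, an alternative to the regex is parsing the response directly (a sketch; it assumes the body is valid JSON with a "data" list whose items carry "thumbURL", which Baidu does not always guarantee, so the regex remains as a fallback):

# Assumed response structure: {"data": [{"thumbURL": "..."}, ...]}
res = requests.get(page_url, headers=headers)
try:
    data = res.json()
    img_urls = [item["thumbURL"] for item in data.get("data", [])
                if "thumbURL" in item]
except ValueError:
    # Malformed JSON: fall back to the regex used above
    img_urls = re.findall('"thumbURL":"(.*?)"', res.text)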

Summary (a consolidated sketch follows this list):

  • Downloading one page

    • 1. Get the page_url
    • 2. Extract the list of image URLs from the page source
    • 3. Request each image URL to get the image data
    • 4. Save to disk
  • Downloading multiple pages

    • 1. Build each page URL
    • 2. Extract the list of image URLs from each page's source
    • 3. Request each image URL to get the image data
    • 4. Save to disk
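Putting both halves together, the whole crawler condenses into one function (a sketch; crawl_baidu_images and its parameters are hypothetical names, and the trimmed query-parameter set is an assumption):

import os
import re
import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
    "Referer": "https://image.baidu.com",
}

def crawl_baidu_images(keyword, pages, out_dir):
    """Hypothetical consolidation of the steps above: build each page URL,
    extract the thumbnail URLs, then download and save every image."""
    os.makedirs(out_dir, exist_ok=True)
    count = 0
    for i in range(1, pages + 1):
        params = {"tn": "resultjson_com", "ipn": "rj",
                  "queryWord": keyword, "word": keyword,
                  "ie": "utf-8", "oe": "utf-8",
                  "pn": i * 30, "rn": 30}
        res = requests.get("https://image.baidu.com/search/acjson",
                           params=params, headers=HEADERS)
        for img_url in re.findall('"thumbURL":"(.*?)"', res.text):
            img = requests.get(img_url.replace("\\", ""), headers=HEADERS)
            with open(f"{out_dir}/img{count}.jpg", "wb") as f:
                f.write(img.content)
            count += 1

crawl_baidu_images("猫咪", pages=2, out_dir="猫咪_all")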