网络爬虫爬取百度图片

最新推荐文章于 2024-05-27 13:31:48 发布

Dxg_01

最新推荐文章于 2024-05-27 13:31:48 发布

阅读量192

点赞数

分类专栏：学习例子 文章标签：大数据

本文链接：https://blog.csdn.net/weixin_42394925/article/details/118309371

版权

学习例子专栏收录该内容

37 篇文章 0 订阅

订阅专栏

import requests#
import re#正则表达
from tqdm import tqdm#进度条

# Read only the first line of the locally saved page and echo it to stdout.
with open('Rose.html', mode='r', encoding='utf-8') as fp:
    data = fp.readline()
print(data)

# Build the HTTP headers sent with every request made by this script.
# NOTE(review): 'access-control-allow-origin' and 'content-type' are usually
# headers a server sends back rather than ones a client sends — confirm they
# are actually needed here.
headers = {
    'access-control-allow-origin': '*',
    'content-type': 'image/webp',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    # Adjacent literals are concatenated into one user-agent string.
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59'
    ),
}
# Fetch the page
def Gethtml(url, timeout=10):
    """Download ``url`` and pass the decoded page text to ``Parsehtml``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        None. On HTTP 200 the body is decoded as UTF-8 and handed to
        ``Parsehtml``; for any other status the status code is printed.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...); it is not caught here.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print(response.status_code)
        return
    # errors='replace' keeps one malformed byte in the page from aborting the
    # whole run with UnicodeDecodeError.
    Parsehtml(response.content.decode('utf-8', errors='replace'))

# Parse the page
def Parsehtml(content, save_dir=r"E:\网络爬虫\爬虫图片下载\玫瑰花", timeout=10):
    """Extract every ``"thumbURL"`` value from ``content`` and download it.

    Each URL is printed, then fetched with the module-level ``headers`` and
    written to ``<save_dir>/玫瑰花<index>.jpg``, where ``index`` is the URL's
    position in the page.

    Args:
        content: Page text to search for ``"thumbURL":"..."`` entries.
        save_dir: Directory the images are written to. The default is the
            original hard-coded location, now spelled as a raw string so the
            backslashes are not treated as (invalid) escape sequences.
        timeout: Seconds to wait for each image download.

    Returns:
        None.
    """
    # Local import so this function does not depend on a file-level import.
    import os

    URLS = re.findall(r'"thumbURL":"(.*?)"', content)
    if not URLS:
        # Nothing to download: do not touch the file system at all.
        return

    for URL in URLS:
        print(URL)

    # Create the target directory up front instead of failing on first write.
    os.makedirs(save_dir, exist_ok=True)

    for i, Url in enumerate(URLS):
        try:
            response = requests.get(Url, headers=headers, timeout=timeout)
        except requests.RequestException as err:
            # One unreachable thumbnail should not abort the remaining ones.
            print("download failed: {} ({})".format(Url, err))
            continue
        if response.status_code != 200:
            # Do not save an error page under a .jpg name.
            print(response.status_code)
            continue
        # Save the image
        target = os.path.join(save_dir, "玫瑰花{}.jpg".format(i))
        with open(target, 'wb') as f:
            f.write(response.content)

# Script entry point
if __name__ == "__main__":
    # Target page to scrape: a Baidu image search whose ``word`` query
    # parameter is the URL-encoded search term "玫瑰花".
    url = (
        'https://image.baidu.com/search/index'
        '?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8'
        '&word=%E7%8E%AB%E7%91%B0%E8%8A%B1'
    )
    Gethtml(url)
    
# If the source in response.text differs from what the browser shows, add a Cookie entry to headers.

Dxg_01

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
网络爬虫爬取百度图片

import requests#import re#正则表达from tqdm import tqdm#进度条with open('Rose.html','r',encoding = 'utf-8') as fp: data = fp.readline()print(data)#构造请求头headers = {'access-control-allow-origin':'*', 'content-type':'image/webp', 'accept
复制链接

扫一扫