python之爬取百度图片

# 目标url:https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%A4%8F%E7%9B%AE%E5%8F%8B%E4%BA%BA%E5%B8%90%E5%A4%B4%E5%83%8F&f=3&oq=%E5%A4%8F%E7%9B%AE&rsp=1
# 转为翻页式:https://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%A4%8F%E7%9B%AE%E5%8F%8B%E4%BA%BA%E5%B8%90%E5%A4%B4%E5%83%8F&f=3&oq=%E5%A4%8F%E7%9B%AE&rsp=1  (第一页)
# https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E5%A4%8F%E7%9B%AE%E5%8F%8B%E4%BA%BA%E5%B8%90%E5%A4%B4%E5%83%8F&pn=20&gsm=3c&ct=&ic=0&lm=-1&width=0&height=0 (第二页)
#https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E5%A4%8F%E7%9B%AE%E5%8F%8B%E4%BA%BA%E5%B8%90%E5%A4%B4%E5%83%8F&pn=40&gsm=50&ct=&ic=0&lm=-1&width=0&height=0 (第三页)
# https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E5%A4%8F%E7%9B%AE%E5%8F%8B%E4%BA%BA%E5%B8%90%E5%A4%B4%E5%83%8F&pn=60&gsm=64&ct=&ic=0&lm=-1&width=0&height=0  (第四页)
# 规律:'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + keyword + '&pn=' + page   page = (i-1)*20

'''
用正则表达式爬取百度照片
'''
# 请求网页,获取网页源码
import requests
def get_source(url):
    """Download *url* and return its HTML body decoded as UTF-8 text."""
    response = requests.get(url, headers=headers)
    # Force UTF-8 so the Chinese keyword pages decode correctly.
    response.encoding = 'utf-8'
    return response.text

# 寻找图片url
import re
def get_img(source):
    """Return every original-image URL embedded in the page *source*.

    Baidu's flip-style result page carries the full-size image URL in a
    JSON-like ``"objURL":"..."`` field; collect all of them.
    """
    pattern = re.compile('"objURL":"(.*?)"')
    img = pattern.findall(source)
    print(img)
    return img

# 保存图片
# Save images
def save_img(img):
    """Download each URL in *img* and save it under the ``img/`` directory.

    The file name is derived from the last 10 characters of the URL;
    slashes are stripped so the name stays a single path component, and a
    ``.jpg`` suffix is appended when the URL has no recognizable image
    extension. Failed downloads are reported and skipped.
    """
    for each_img in img:
        # BUG FIX: the original used each_img[-10] (a single character),
        # so every image collapsed onto one file name; slice the last 10.
        name = each_img[-10:]
        name = re.sub('/', '', name)  # '/' in the name would break the open() path
        # Append .jpg when the URL does not already end in an image extension.
        end = re.search(r'(\.jpg|\.png|\.jpeg|\.gif)$', name)
        if end is None:
            name = name + '.jpg'

        # BUG FIX: download first, then open the file. The original opened
        # the file before the request and wrote `r.content` even when the
        # request raised, producing a NameError on the unbound `r` and
        # leaving an empty file behind.
        try:
            r = requests.get(each_img, headers=headers)
        except Exception as e:
            print(e)
            continue
        with open('img/' + name, 'wb') as f:
            f.write(r.content)

import urllib.parse
import os
if __name__ == '__main__':
    # BUG FIX: os.mkdir raised FileExistsError on every rerun;
    # makedirs(exist_ok=True) is idempotent.
    os.makedirs('img', exist_ok=True)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    keyword = input('请输入查询照片关键词:')
    # Percent-encode the keyword so it is safe inside the query string.
    keyword = urllib.parse.quote(keyword)
    page_start = int(input('请输入查询初始页码:'))
    page_end = int(input('请输入查询末端页码:'))
    for i in range(page_start, page_end + 1):
        # BUG FIX: Baidu's flip pages advance `pn` by 20 per page (see the
        # sample URLs documented at the top of the file: pn=20, 40, 60);
        # the original stepped by 50 and skipped results.
        page = str((i - 1) * 20)
        url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + keyword + '&pn=' + page
        print(url)
        source = get_source(url)
        img = get_img(source)
        save_img(img)

'''
用xpath爬取百度照片
'''
# 请求网页,获取网页源码
import requests
def get_source(url):
    """Fetch *url* with the shared headers and return its text as UTF-8."""
    req = requests.get(url, headers=headers)
    req.encoding = 'utf-8'  # decode Chinese content correctly
    source = req.text
    return source

# 获取图片信息
import lxml
from lxml import etree
def get_img(source):
    """Parse *source* with lxml and return the thumbnail URLs via XPath."""
    tree = etree.HTML(source)
    # Thumbnails on the flip page live at div > ul > li > a > img.
    img = tree.xpath('//div/ul/li/a/img/@src')
    print(img)
    return img

# 保存图片
import re
def save_img(img):
    """Download each URL in *img* and save it under the ``img1/`` directory.

    The file name is derived from the last 10 characters of the URL;
    slashes are stripped so the name stays a single path component, and a
    ``.jpg`` suffix is appended when the URL has no recognizable image
    extension. Failed downloads are reported and skipped.
    """
    for each_img in img:
        # BUG FIX: the original used each_img[-10] (a single character),
        # so every image collapsed onto one file name; slice the last 10.
        name = each_img[-10:]
        name = re.sub('/', '', name)  # '/' in the name would break the open() path
        # Append .jpg when the URL does not already end in an image extension.
        end = re.search(r'(\.jpg|\.png|\.jpeg|\.gif)$', name)
        if end is None:
            name = name + '.jpg'

        # BUG FIX: download first, then open the file. The original opened
        # the file before the request and wrote `r.content` even when the
        # request raised, producing a NameError on the unbound `r` and
        # leaving an empty file behind.
        try:
            r = requests.get(each_img, headers=headers)
        except Exception as e:
            print(e)
            continue
        with open('img1/' + name, 'wb') as f:
            f.write(r.content)

import urllib.parse
import os
if __name__ == '__main__':
    # BUG FIX: os.mkdir raised FileExistsError on every rerun;
    # makedirs(exist_ok=True) is idempotent.
    os.makedirs('img1', exist_ok=True)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    keyword = input('请输入查询照片关键词:')
    # Percent-encode the keyword so it is safe inside the query string.
    keyword = urllib.parse.quote(keyword)
    page_start = int(input('请输入查询初始页码:'))
    page_end = int(input('请输入查询末端页码:'))
    for i in range(page_start, page_end + 1):
        # BUG FIX: Baidu's flip pages advance `pn` by 20 per page (see the
        # sample URLs documented at the top of the file: pn=20, 40, 60);
        # the original stepped by 50 and skipped results.
        page = str((i - 1) * 20)
        url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + keyword + '&pn=' + page
        print(url)
        source = get_source(url)
        img = get_img(source)
        save_img(img)
Python中,爬取百度图片可以通过使用`requests`库和`BeautifulSoup`库来实现。以下是一个简单的示例代码,演示如何爬取百度图片:

1. 安装必要的库:

```bash
pip install requests
pip install beautifulsoup4
```

2. 编写爬虫代码:

```python
import requests
from bs4 import BeautifulSoup
import os
import re

def download_image(url, folder, name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(os.path.join(folder, name), 'wb') as f:
            f.write(response.content)

def crawl_baidu_images(query, download_count):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    url = 'https://image.baidu.com/search/index?tn=baiduimage&word=' + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    images = soup.find_all('img', class_='main_img img-hover')
    if not os.path.exists(query):
        os.makedirs(query)
    count = 0
    for img in images:
        img_url = img.get('src')
        if img_url:
            download_image(img_url, query, f'{count}.jpg')
            count += 1
            if count >= download_count:
                break

if __name__ == '__main__':
    query = '风景'
    download_count = 10
    crawl_baidu_images(query, download_count)
```

### 代码说明:

1. **download_image函数**:用于下载图片并保存到指定文件夹。
2. **crawl_baidu_images函数**:用于爬取百度图片页面,解析图片URL并调用download_image函数下载图片。
3. **主程序**:设置查询关键词和下载数量,并调用crawl_baidu_images函数开始爬取。

### 注意事项:

- **反爬虫机制**:百度有反爬虫机制,频繁请求可能导致IP被封。可以使用`time.sleep()`函数增加请求间隔。
- **User-Agent**:设置请求头中的User-Agent可以模拟浏览器请求,减少被封的风险。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值