python之爬取百度图片

# 目标url:https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%A4%8F%E7%9B%AE%E5%8F%8B%E4%BA%BA%E5%B8%90%E5%A4%B4%E5%83%8F&f=3&oq=%E5%A4%8F%E7%9B%AE&rsp=1
# 转为翻页式:https://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%A4%8F%E7%9B%AE%E5%8F%8B%E4%BA%BA%E5%B8%90%E5%A4%B4%E5%83%8F&f=3&oq=%E5%A4%8F%E7%9B%AE&rsp=1  (第一页)
# https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E5%A4%8F%E7%9B%AE%E5%8F%8B%E4%BA%BA%E5%B8%90%E5%A4%B4%E5%83%8F&pn=20&gsm=3c&ct=&ic=0&lm=-1&width=0&height=0 (第二页)
#https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E5%A4%8F%E7%9B%AE%E5%8F%8B%E4%BA%BA%E5%B8%90%E5%A4%B4%E5%83%8F&pn=40&gsm=50&ct=&ic=0&lm=-1&width=0&height=0 (第三页)
# https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E5%A4%8F%E7%9B%AE%E5%8F%8B%E4%BA%BA%E5%B8%90%E5%A4%B4%E5%83%8F&pn=60&gsm=64&ct=&ic=0&lm=-1&width=0&height=0  (第四页)
# 规律:'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + keyword + '&pn=' + page   page = (i-1)*20

'''
用正则表达式爬取百度照片
'''
# 请求网页,获取网页源码
import requests
def get_source(url):
    """Download *url* and return its HTML body decoded as UTF-8 text."""
    response = requests.get(url, headers=headers)
    # Force UTF-8 so the Chinese keyword pages decode correctly.
    response.encoding = 'utf-8'
    return response.text

# 寻找图片url
import re
def get_img(source):
    """Return every original-image URL embedded in the page *source*.

    Baidu's flip-style result page carries the full-size image URL in a
    JSON-like ``"objURL":"..."`` field; collect all of them.
    """
    pattern = re.compile('"objURL":"(.*?)"')
    img = pattern.findall(source)
    print(img)
    return img

# 保存图片
# Save images
def save_img(img):
    """Download each URL in *img* and save it under the ``img/`` directory.

    The file name is derived from the last 10 characters of the URL;
    slashes are stripped so the name stays a single path component, and a
    ``.jpg`` suffix is appended when the URL has no recognizable image
    extension. Failed downloads are reported and skipped.
    """
    for each_img in img:
        # BUG FIX: the original used each_img[-10] (a single character),
        # so every image collapsed onto one file name; slice the last 10.
        name = each_img[-10:]
        name = re.sub('/', '', name)  # '/' in the name would break the open() path
        # Append .jpg when the URL does not already end in an image extension.
        end = re.search(r'(\.jpg|\.png|\.jpeg|\.gif)$', name)
        if end is None:
            name = name + '.jpg'

        # BUG FIX: download first, then open the file. The original opened
        # the file before the request and wrote `r.content` even when the
        # request raised, producing a NameError on the unbound `r` and
        # leaving an empty file behind.
        try:
            r = requests.get(each_img, headers=headers)
        except Exception as e:
            print(e)
            continue
        with open('img/' + name, 'wb') as f:
            f.write(r.content)

import urllib.parse
import os
if __name__ == '__main__':
    # BUG FIX: os.mkdir raised FileExistsError on every rerun;
    # makedirs(exist_ok=True) is idempotent.
    os.makedirs('img', exist_ok=True)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    keyword = input('请输入查询照片关键词:')
    # Percent-encode the keyword so it is safe inside the query string.
    keyword = urllib.parse.quote(keyword)
    page_start = int(input('请输入查询初始页码:'))
    page_end = int(input('请输入查询末端页码:'))
    for i in range(page_start, page_end + 1):
        # BUG FIX: Baidu's flip pages advance `pn` by 20 per page (see the
        # sample URLs documented at the top of the file: pn=20, 40, 60);
        # the original stepped by 50 and skipped results.
        page = str((i - 1) * 20)
        url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + keyword + '&pn=' + page
        print(url)
        source = get_source(url)
        img = get_img(source)
        save_img(img)

'''
用xpath爬取百度照片
'''
# 请求网页,获取网页源码
import requests
def get_source(url):
    """Fetch *url* with the shared headers and return its text as UTF-8."""
    req = requests.get(url, headers=headers)
    req.encoding = 'utf-8'  # decode Chinese content correctly
    source = req.text
    return source

# 获取图片信息
import lxml
from lxml import etree
def get_img(source):
    """Parse *source* with lxml and return the thumbnail URLs via XPath."""
    tree = etree.HTML(source)
    # Thumbnails on the flip page live at div > ul > li > a > img.
    img = tree.xpath('//div/ul/li/a/img/@src')
    print(img)
    return img

# 保存图片
import re
def save_img(img):
    """Download each URL in *img* and save it under the ``img1/`` directory.

    The file name is derived from the last 10 characters of the URL;
    slashes are stripped so the name stays a single path component, and a
    ``.jpg`` suffix is appended when the URL has no recognizable image
    extension. Failed downloads are reported and skipped.
    """
    for each_img in img:
        # BUG FIX: the original used each_img[-10] (a single character),
        # so every image collapsed onto one file name; slice the last 10.
        name = each_img[-10:]
        name = re.sub('/', '', name)  # '/' in the name would break the open() path
        # Append .jpg when the URL does not already end in an image extension.
        end = re.search(r'(\.jpg|\.png|\.jpeg|\.gif)$', name)
        if end is None:
            name = name + '.jpg'

        # BUG FIX: download first, then open the file. The original opened
        # the file before the request and wrote `r.content` even when the
        # request raised, producing a NameError on the unbound `r` and
        # leaving an empty file behind.
        try:
            r = requests.get(each_img, headers=headers)
        except Exception as e:
            print(e)
            continue
        with open('img1/' + name, 'wb') as f:
            f.write(r.content)

import urllib.parse
import os
if __name__ == '__main__':
    # BUG FIX: os.mkdir raised FileExistsError on every rerun;
    # makedirs(exist_ok=True) is idempotent.
    os.makedirs('img1', exist_ok=True)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    keyword = input('请输入查询照片关键词:')
    # Percent-encode the keyword so it is safe inside the query string.
    keyword = urllib.parse.quote(keyword)
    page_start = int(input('请输入查询初始页码:'))
    page_end = int(input('请输入查询末端页码:'))
    for i in range(page_start, page_end + 1):
        # BUG FIX: Baidu's flip pages advance `pn` by 20 per page (see the
        # sample URLs documented at the top of the file: pn=20, 40, 60);
        # the original stepped by 50 and skipped results.
        page = str((i - 1) * 20)
        url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + keyword + '&pn=' + page
        print(url)
        source = get_source(url)
        img = get_img(source)
        save_img(img)
Python中,爬取百度图片可以通过使用`requests`库和`BeautifulSoup`库来实现。以下是一个简单的示例代码,演示如何爬取百度图片:

1. 安装必要的库:

```bash
pip install requests
pip install beautifulsoup4
```

2. 编写爬虫代码:

```python
import requests
from bs4 import BeautifulSoup
import os
import re

def download_image(url, folder, name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(os.path.join(folder, name), 'wb') as f:
            f.write(response.content)

def crawl_baidu_images(query, download_count):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    url = 'https://image.baidu.com/search/index?tn=baiduimage&word=' + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    images = soup.find_all('img', class_='main_img img-hover')
    if not os.path.exists(query):
        os.makedirs(query)
    count = 0
    for img in images:
        img_url = img.get('src')
        if img_url:
            download_image(img_url, query, f'{count}.jpg')
            count += 1
            if count >= download_count:
                break

if __name__ == '__main__':
    query = '风景'
    download_count = 10
    crawl_baidu_images(query, download_count)
```

### 代码说明:

1. **download_image函数**:用于下载图片并保存到指定文件夹。
2. **crawl_baidu_images函数**:用于爬取百度图片页面,解析图片URL并调用download_image函数下载图片。
3. **主程序**:设置查询关键词和下载数量,并调用crawl_baidu_images函数开始爬取。

### 注意事项:

- **反爬虫机制**:百度有反爬虫机制,频繁请求可能导致IP被封。可以使用`time.sleep()`函数增加请求间隔。
- **User-Agent**:设置请求头中的User-Agent可以模拟浏览器请求,减少被封的风险。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值