python实现按主题爬取百度图片

    python实现按主题爬取百度图片

python3:

说明:按主题爬取百度图片


code:
#!/usr/bin/python
#encoding: utf-8
import urllib.request
#from urllib.request import urlretrieve,urlcleanup
#from urllib2 import *
import json
from hashlib import md5
import os
import skimage
import skimage.io
import socket
import importlib
import sys
import re
#importlib.reload(sys)
#sys.setdefaultencoding('utf-8')
#from http://outofmemory.cn/code-snippet/35203/picture
class Baiduimage():
    """Crawl Baidu image search results for a given keyword.

    Fetches the "flip"-style search result pages, extracts the original
    image URLs ("objURL" values) with a regex, and downloads each image
    into a target directory, naming files by the MD5 of their URL.
    """

    def __init__(self, tag, num, path):
        """
        tag  -- search keyword (may be URL-percent-encoded for non-ASCII terms)
        num  -- number of result pages to fetch
        path -- directory where downloaded images are saved
        """
        self.tag = tag
        self.number = num
        self.path = path
        print ("work start")
        print ("pages(self.number) in " , self.path , "are :" , self.number)

    def make_url(self, number):
        """Build the search-result URL for result offset *number* (the pn parameter).

        Example: http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=<tag>&pn=<number>
        The keyword is expected to be pre-encoded, so plain string concatenation is used.
        """
        url = "http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=" + self.tag + "&pn=" + str(number)
        return url

    def request_body(self, number):
        """Fetch the result page at offset *number* and return it as UTF-8 text."""
        url = self.make_url(number)
        req = urllib.request.urlopen(url)
        content = req.read()
        return content.decode("utf8")

    def parse_body(self, number):
        """Return the list of image URLs ("objURL" values) found on one page.

        Returns an empty list on any error so the caller can treat a failed
        page as "no more results" and stop paging.
        """
        w1 = '"objURL":"'
        w2 = '",'
        try:
            content = self.request_body(number)
            # re.S so '.' also matches newlines inside the JSON-ish payload.
            parse = re.compile(w1 + '(.*?)' + w2, re.S)
            # findall returns every non-overlapping captured group as a list.
            return parse.findall(content)
        except Exception:
            # Narrowed from a bare except; still best-effort by design:
            # a failed page is reported as empty rather than aborting the crawl.
            return []

    def download_image(self):
        """Download all images for this tag into self.path.

        Iterates page offsets 0, 20, 40, ... (up to self.number pages) and
        stops at the first page that yields no URLs. Prints totals at the end.
        """
        if not os.path.exists(self.path):
            os.makedirs(self.path)
        base_path = self.path + "/"
        ind = 0      # total URLs attempted
        ind_w = 0    # URLs that failed for non-I/O reasons
        ind_r = 0    # files removed after a failed/corrupt download
        for i in range(self.number):
            result = self.parse_body(i * 20)
            if not result:
                # An empty page means we ran out of results.
                break
            for url in result:
                ind = ind + 1
                try:
                    # Keep the remote file's extension (text after the last dot).
                    ext = url.split(".")[-1]
                    image_path = base_path + md5(url.encode("utf8")).hexdigest() + "." + ext
                    socket.setdefaulttimeout(30)
                    # url is the remote image, image_path the local target.
                    urllib.request.urlretrieve(url, image_path)
                    # Load the file once to verify it is a readable image.
                    skimage.io.imread(image_path)
                    # Clear the cache left behind by urlretrieve().
                    urllib.request.urlcleanup()
                except IOError:
                    # Download or image decode failed: remove the partial file.
                    try:
                        ind_r = ind_r + 1
                        os.remove(image_path)
                    except OSError:
                        pass
                except Exception:
                    # Anything else (e.g. bad URL encoding) counts as a wrong URL.
                    ind_w = ind_w + 1
        print ("sum of pictures " , ind)
        print ("sum of wrong pics" , ind_w)
        print ("sum of remove pics" , ind_r)

if __name__ == "__main__":
    # URL-percent-encoded (UTF-8) search keywords, in order:
    # 尤物, 证件照, 真人, 风景, 自然, 山水, 景色, 美女
    list_tag = ["%e5%b0%a4%e7%89%a9","%e8%af%81%e4%bb%b6%e7%85%a7","%e7%9c%9f%e4%ba%ba","%e9%a3%8e%e6%99%af" , "%e8%87%aa%e7%84%b6" , "%e5%b1%b1%e6%b0%b4", "%e6%99%af%e8%89%b2","%e7%be%8e%e5%a5%b3"]
    # Fixed: paths now pair index-for-index with list_tag. The original list
    # started with "./meinv" (美女, the LAST tag), which shifted every
    # tag/path pairing by one — the original comment itself flagged this.
    list_path = ["./youwu","./zhengjianzhao","./zhenren","./fengjing" , "./ziran" , "./shanshui", "./jingse","./meinv"]
    print ("len(list_tag):" , len(list_tag))
    print ("len(list_path):" , len(list_path))
    if not len(list_tag) == len(list_path):
        print ("tag length is not equal to path length")
        os._exit(0)
    # Crawl up to 100 pages for each keyword into its own directory.
    for tag, path in zip(list_tag, list_path):
        Baiduimage(tag, 100, path).download_image()

		
		
		
		


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
要使用Python的requests库爬取百度图片,您可以按照以下步骤进行操作: 1. 安装requests库:如果您还没有安装requests库,可以使用以下命令进行安装: ``` pip install requests ``` 2. 编写爬取代码:下面是一个简单的例子,演示如何使用requests库爬取百度图片的搜索结果: ```python import requests def download_images(keyword, num_images): # 构建百度图片搜索的URL search_url = f'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&word={keyword}&pn=0&rn={num_images}' # 发送GET请求 response = requests.get(search_url) # 处理响应结果 if response.status_code == 200: # 获取图片URL image_urls = [data['middleURL'] for data in response.json().get('data', []) if 'middleURL' in data] # 下载图片 for i, url in enumerate(image_urls): try: response = requests.get(url) with open(f'image{i}.jpg', 'wb') as f: f.write(response.content) print(f'Successfully downloaded image{i}.jpg') except: print(f'Failed to download image{i}.jpg') else: print('请求失败') # 要搜索的关键词和要下载的图片数量 keyword = 'cat' num_images = 10 # 执行爬取 download_images(keyword, num_images) ``` 在上述代码中,我们构建了一个特定的URL,其中包含了搜索关键词和要下载的图片数量。然后,我们发送GET请求来获取搜索结果,并解析响应结果中的图片URL。最后,我们使用requests库下载图片并保存到本地。 请注意,这只是一个简单的示例,实际应用中可能需要更多的处理和错误处理。另外,爬取网站的图片可能需要遵守该网站的使用条款和法律法规。请确保遵守相关规定并尊重他人的权益。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值