ImageNet爬虫（python3）

最新推荐文章于 2023-04-15 12:55:22 发布

cg_Amaz1ng

最新推荐文章于 2023-04-15 12:55:22 发布

阅读量673

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/cg_Amaz1ng/article/details/85092143

版权

爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

ImageNet是一个图片检索网站，本爬虫基于python3

使用方法：

运行爬虫，在控制终端输入搜索内容，后台会自动从ImageNet上搜索并且把图片下载到imagefiles文件夹，子目录文件夹命名为搜索内容。
在这里插入图片描述

源代码

import urllib3
import re
import os
import datetime
import random

baseUrl = "http://www.image-net.org"
user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0  "
headers = {"User-Agent" : user_agent}
def search(objectName,http):
    global baseUrl,headers
    searchUrl = baseUrl + "/search?q=" + objectName
    print (searchUrl)    
    imgUrls=[]
    imgUrlList = []
    try:
        content = http.request('GET', searchUrl, headers=headers).data
        pattern = r'search_result" border-style="none"><tr><td>.*</table>'
        searchResult = re.findall(pattern,str(content))
        searchResult = searchResult[0]
        imgUrls = re.findall(r'<img src="([^"]*)"',searchResult)        
        for imgUrl in imgUrls:
            imgUrl = baseUrl+imgUrl
            imgUrlList.append(imgUrl)
    except:
        print("error")

    return imgUrlList

def mkdir(dirPath):
    dirPath = dirPath.strip()
    if(not os.path.exists(dirPath)):
        os.makedirs(dirPath)
    


def download(imgUrlList,dirPath,http):
    global headers
    print("Path:"+dirPath+",Downloading")
    for imgUrl in imgUrlList:
        randomNum=random.randint(0,100)
        nowTime=datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        uniqueNum=str(nowTime)+str(randomNum);        
        fileName = uniqueNum + ".jpg"
        content = http.request('GET', str(imgUrl),headers=headers).data
        with open(dirPath + "/" +fileName,'wb') as imageFile:
            imageFile.write(content);
    print("All done!")
    
if __name__ == "__main__": 
    http = urllib3.PoolManager(100)
    while True:
       objectName = input("Search For:")
       dirPath = "imagefiles/" + objectName
       urls = search(objectName,http)
       if (len(urls) > 0) :
           mkdir(dirPath)
       download(urls,dirPath,http)