【分享】50行代码!批量爬取大量图片!

102 篇文章 2 订阅
# -*- coding:utf-8 -*-
# coding=UTF-8
 
import os,urllib,urllib2,re
 
# Baidu image-search results page that gets scraped; the query keyword is
# carried in the "word=python" / "oq=python" parameters of the query string.
url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
# Path prefix for saved files; download() concatenates it directly with the
# generated file name, so it must end with a path separator.
outpath = "t:\\"
 
def getHtml(url):
    """Fetch *url* and return the raw response body.

    The body is also echoed to stdout as a debugging aid.

    url: address to request; passed unchanged to ``urllib.urlopen``.
    Returns whatever ``read()`` yields for the response.
    Errors raised while opening or reading the URL propagate to the caller.
    """
    webfile = urllib.urlopen(url)
    try:
        outhtml = webfile.read()
    finally:
        # Release the connection even when reading fails.
        webfile.close()
    print(outhtml)
    return outhtml
 
# Alternatives are generated scheme by scheme, extension by extension.  The
# order matters: at a given start position the first alternative that can
# match wins, so it is kept the same as the hand-written pattern it replaces.
_IMAGE_URL_SCHEMES = ('http', 'https')
_IMAGE_URL_EXTENSIONS = ('jpg', 'jpeg', 'png', 'gif', 'bmp')
_IMAGE_URL_RE = re.compile(
    '|'.join(
        scheme + r':\/\/[^\s,"]*\.' + extension
        for scheme in _IMAGE_URL_SCHEMES
        for extension in _IMAGE_URL_EXTENSIONS
    )
)


def getImageList(html):
    """Return all image URLs found in *html*, in order of appearance.

    A URL is recognised when it starts with ``http://`` or ``https://``,
    contains no whitespace, comma or double quote, and ends in one of
    ``.jpg``, ``.jpeg``, ``.png``, ``.gif`` or ``.bmp`` (case-sensitive).
    Both schemes accept the same extension set; https ``.jpg`` links were
    previously skipped because the ``.jpeg`` alternative was listed twice.

    html: text to scan.
    Returns a list of matched URL strings (empty when nothing matches).
    The list is also printed to stdout as a debugging aid.
    """
    imgList = _IMAGE_URL_RE.findall(html)
    print(imgList)
    return imgList
 
def download(imgList, page):
    """Download every URL in *imgList* to a file under ``outpath``.

    Each file is named ``pic_<page>_<index><ext>`` where ``page`` is
    zero-padded to 9 digits, ``index`` is the 1-based position of the URL in
    *imgList* zero-padded to 10 digits, and ``ext`` is the lower-cased
    extension of the URL's last path segment.

    imgList: iterable of image URL strings.
    page: integer page number embedded in the file names.
    Returns None.  Errors raised by ``urllib.urlretrieve`` propagate and stop
    the remaining downloads.
    """
    for x, imgurl in enumerate(imgList, 1):
        # Percent-decode the URL before taking its last path segment.
        lastSegment = urllib2.unquote(imgurl).decode('utf8').split('/')[-1]
        # Lower-case only the extension: lower-casing the whole path would
        # corrupt a mixed-case ``outpath`` on case-sensitive filesystems.
        extension = str(os.path.splitext(lastSegment)[1]).lower()
        filepathname = outpath + 'pic_%09d_%010d' % (page, x) + extension
        print('[Debug] Download file :' + imgurl + ' >> ' + filepathname)
        urllib.urlretrieve(imgurl, filepathname)
 
def downImageNum(pagenum):
    """Run the fetch / extract / download cycle once per page, 1..pagenum.

    pagenum: number of passes to perform; nothing happens when it is below 1.

    NOTE(review): every pass requests the same module-level ``url``; the page
    counter only changes the output file names. Confirm whether a paging
    parameter was meant to be added to the request.
    """
    current = 1
    while current <= pagenum:
        # Fetch the HTML behind url, extract all image addresses from it,
        # then download each of them.
        download(getImageList(getHtml(url)), current)
        current += 1
 
# Script entry point: when executed directly, perform a single download pass.
if __name__ == '__main__':
    downImageNum(1)
    ```
    ```
    char:925916955

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值