【实例】爬虫:下载图片

#coding:utf8 

import urllib2
import re 
import os
import urllib

# Image crawler for pic.yxdown.com.
#
# Steps:
#   1. Fetch the first listing page and keep a local copy (down_1.html).
#   2. For every <div class="cbmiddle"> entry, read its title and detail link.
#   3. Fetch the detail page and collect every "original":"..." image URL
#      found inside its <script> blocks.
#   4. Download those images into a directory named after a running counter
#      (1, 2, 3, ...). Every title and URL is also logged to down_url.txt.

# Patterns are compiled once here instead of being rebuilt on every iteration.
# Listing entry: <div class="cbmiddle"> ... </div>
res_div = r'<div class="cbmiddle">(.*?)</div>'
# Title, e.g. <b class="imgname">Some gallery title</b>
title_re = re.compile(r'<b class="imgname">(.*?)</b>')
# Detail link, e.g. <a target="_blank" href="/html/7018.html" class="proimg">
url_a_re = re.compile(r'<a target="_blank" href="(.*?)" class="proimg">')
# Inline scripts on the detail page and the image URLs embedded in them.
script_re = re.compile(r'<script>(.*?)</script>', re.S|re.M)
ori_re = re.compile(r'"original":"(.*?)"')

# Log of every title/URL seen. The "with" block closes the file even when a
# request or download raises part-way through the crawl.
with open("down_url.txt", "w") as fileurl:
    # Trailing newline added so the first title is not glued to the marker.
    fileurl.write("************start**************\n")

    # Starting listing page.
    temp = "http://pic.yxdown.com/list/0_0_1.html"
    content = urllib2.urlopen(temp).read()
    # Keep a copy of the listing page; close the handle explicitly.
    with open("down_1.html", "w") as page_copy:
        page_copy.write(content)

    count = 1
    m_div = re.findall(res_div, content, re.S|re.M)
    print(len(m_div))
    for line in m_div:
        # Target directory for this entry. count only advances after a fully
        # processed entry, so a skipped entry leaves the directory for the
        # next one to reuse.
        if not os.path.exists(str(count)):
            os.mkdir(str(count))

        # Title of the entry; skip entries that have none.
        titles = title_re.findall(line)
        if not titles:
            continue
        # The title stays a raw byte string and is written out unchanged.
        title = titles[0]
        fileurl.write(title+'\n')

        # Detail-page link. The original tested len(title) here, so an entry
        # without a matching link raised IndexError; test the link list itself.
        links = url_a_re.findall(line)
        if not links:
            continue
        url_a = links[0]
        # Only site-relative links are followed; startswith() is also safe
        # for an empty href, where url_a[0] would raise IndexError.
        if not url_a.startswith('/'):
            continue
        fileurl.write(url_a+'\n')

        # Fetch the detail page.
        print(url_a)
        html_url = 'http://pic.yxdown.com'+str(url_a)
        print(html_url)
        html_content = urllib2.urlopen(html_url).read()

        for script in script_re.findall(html_content):
            for ori in ori_re.findall(script):
                fileurl.write(ori+'\n')
                filename = os.path.basename(ori)
                print(ori)
                # A URL ending in "/" has no file name to save under.
                if not filename:
                    continue
                # Download the picture into this entry's directory.
                urllib.urlretrieve(ori, str(count)+"/"+filename)
        count = count+1
print("over")
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值