#coding:utf8
import urllib2
import re
import os
import urllib
# Image-gallery crawler for pic.yxdown.com (Python 2: urllib2 / urllib).
#
# Flow:
#   1. Fetch the first listing page and keep a copy in down_1.html.
#   2. Split it into <div class="cbmiddle"> gallery entries.
#   3. For each entry: log its title and relative link to down_url.txt,
#      fetch the gallery page, and download every "original" image URL found
#      in its inline <script> blocks into a directory named after a running
#      counter (1, 2, 3, ...).
#
# Single-argument print(...) is used throughout; on Python 2 it prints the
# same text as the print statement.

# Patterns are compiled once instead of being rebuilt for every entry.
# Gallery entry, e.g. <div class="cbmiddle"> ... </div>
div_re = re.compile(r'<div class="cbmiddle">(.*?)</div>', re.S | re.M)
# Entry title, e.g. <b class="imgname">...</b>
title_re = re.compile(r'<b class="imgname">(.*?)</b>')
# Entry link, e.g. <a target="_blank" href="/html/7018.html" class="proimg">
link_re = re.compile(r'<a target="_blank" href="(.*?)" class="proimg">')
# Inline scripts of a gallery page and the full-size image URLs inside them.
script_re = re.compile(r'<script>(.*?)</script>', re.S | re.M)
ori_re = re.compile(r'"original":"(.*?)"')

# Record every title / URL that is visited. The with-block guarantees the log
# is flushed and closed even if a request or download raises part-way through.
with open("down_url.txt", "w") as fileurl:
    # Trailing newline keeps the banner on its own line instead of fusing it
    # with the first logged title.
    fileurl.write("************start**************\n")

    # Listing page that the crawl starts from.
    temp = "http://pic.yxdown.com/list/0_0_1.html"
    content = urllib2.urlopen(temp).read()

    # Keep a copy of the listing page. Binary mode stores the response bytes
    # exactly as received, and the handle is closed right away.
    with open("down_1.html", "wb") as listing_copy:
        listing_copy.write(content)

    m_div = div_re.findall(content)
    print(len(m_div))

    # Running gallery number; also the name of the download directory.
    count = 1
    for line in m_div:
        # The counter only advances for entries that reach the end of the
        # loop body, so a skipped entry's directory is reused by the next one.
        if not os.path.exists(str(count)):
            os.mkdir(str(count))

        # Get the title; entries without one are skipped.
        titles = title_re.findall(line)
        if not titles:
            continue
        title = titles[0]
        # The original decoded the title with unicode(title, 'utf-8') and
        # discarded the result; the raw bytes are what gets logged either way.
        fileurl.write(title + '\n')

        # Get the link. The guard must test the link matches themselves: the
        # original re-tested the title here, so an entry with a title but no
        # matching anchor crashed with IndexError.
        links = link_re.findall(line)
        if not links:
            continue
        url_a = links[0]
        # Only site-relative links are followed; startswith() also copes with
        # an empty href, where url_a[0] would raise IndexError.
        if not url_a.startswith('/'):
            continue
        fileurl.write(url_a + '\n')

        # Fetch the gallery page behind the link.
        print(url_a)
        html_url = 'http://pic.yxdown.com' + url_a
        print(html_url)
        html_content = urllib2.urlopen(html_url).read()

        for script in script_re.findall(html_content):
            for ori in ori_re.findall(script):
                fileurl.write(ori + '\n')
                filename = os.path.basename(ori)
                print(ori)
                # Download the picture into this gallery's directory.
                urllib.urlretrieve(ori, str(count) + "/" + filename)

        count = count + 1

print("over")
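# [Example] Web crawler: downloading images (original title: 【实例】爬虫:下载图片)
# Web-page residue: "latest recommended article published 2024-07-22 20:44:36"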