仅作学习交流之用。
对于动态网页,可以分析其请求数据,模拟POST,只请求目的数据,占用资源较少,在网络带宽有限的情况下比webbrowser有效。
网站:点击打开链接(http://www.lofter.com/art/print)
下载该网站里的图片。
其中图片列表是动态加载的,从中获得图片的ID,请求对应的网址,下载图片。
主模块:
#-*- coding:utf-8 -*-
#lofterart爬虫
#author:windroid
#15/3/5
import getPage
import downPic
import re
import time
# Crawl configuration.
MAXPAGE = 184            # last page number to download (inclusive)
SPAGE = 19               # first page number to download
PATH = 'D:\\lofter\\'    # prefix prepended to every saved file name

print(PATH)
print(SPAGE)
print('downloading...')
for downpage in range(SPAGE, MAXPAGE + 1):
    pagelist = getPage.getPage(downpage)
    for v in pagelist:
        # NOTE(review): the author had a disabled time.sleep(1) here with the
        # remark "404" - presumably a throttle against HTTP 404 replies;
        # unconfirmed.
        # Entries look like 'productId=<digits>'; drop the 10-character
        # 'productId=' prefix so only the numeric ID is passed on.
        downPic.downPic(v[10:], PATH)
    # Record the last finished page; 'with' guarantees the handle is flushed
    # and closed even if the write fails.
    with open('set.ini', 'w') as progress:
        progress.write(str(downpage))
    print('download page: ' + str(downpage) + ' over.')
print('download is over.')
getPage
获得图片列表
#-*- coding: utf-8 -*-
import urllib2
import gzip
import StringIO
import re
def getPage(page):
    """Load one page of the product list and return the product IDs in it.

    Sends a DWR plain-call POST to SaleBean.getSaleRecommendItemList and
    scans the response body for 'productId=<digits>' tokens.

    Args:
        page: page number (int).

    Returns:
        A list of strings in the form 'productId=23123720'.  If the request
        fails, a message is printed and an empty list is returned.
    """
    BATCHID = 196800 - page * 13
    # Value sent as c0-param2; c0-param3 is sent as PARAM2 * page
    # (presumably page size and item offset - TODO confirm against the site).
    # Earlier, now-disabled variants used 32 for page 1 (with page reset to
    # 0) and 16 for pages up to 50; a fixed 8 is used for every page now.
    PARAM2 = 8
    # c0-param0: 1 = framed print, 2 = postcard
    postdata = '''callCount=1
scriptSessionId=${scriptSessionId}187
httpSessionId=
c0-scriptName=SaleBean
c0-methodName=getSaleRecommendItemList
c0-id=0
c0-param0=number:2
c0-param1=number:-1
c0-param2=number:%d
c0-param3=number:%d
batchId=%d''' % (PARAM2, PARAM2 * page, BATCHID)
    url = 'http://www.lofter.com/dwr/call/plaincall/SaleBean.getSaleRecommendItemList.dwr'
    myheaders = {
        'Host': 'www.lofter.com',
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'text/plain; charset=UTF-8',
        'Referer': 'http://www.lofter.com/art/print',
    }
    opener = urllib2.build_opener()
    req = urllib2.Request(url, data=postdata, headers=myheaders)
    try:
        f = opener.open(req)
    except Exception as err:
        # Exception (not BaseException) so Ctrl-C still stops the crawl.
        # page is an int, so it must be converted before concatenation.
        print('getPage: ' + str(page) + ' failed.' + 'Error: ' + str(err))
        # Same type as the success path, so callers can always iterate.
        return []
    try:
        rawdata = f.read()
    finally:
        f.close()
    # The request also accepts non-gzip encodings, so only gunzip when the
    # body really starts with the gzip magic number.
    if rawdata[:2] == '\x1f\x8b':
        resdata = gzip.GzipFile(fileobj=StringIO.StringIO(rawdata)).read()
    else:
        resdata = rawdata
    return re.findall(r'productId=\d*', resdata)
#getPage(1)#184
downPic
下载图片
# -*- coding: utf-8 -*-
import urllib
import urllib2
import cookielib
import re
# Browser-like User-agent sent with every product-page request.
headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0'}
# Shared opener that keeps cookies in an in-memory jar across requests.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))


def downPic(picid, path):
    """Download every preview image shown on one product page.

    Fetches http://www.lofter.com/art/product-<picid>, extracts the src of
    each <img ... class="card showimgtag"> tag and saves it as
    path + picid + '-<n>.jpg', with n counting from 1.

    Args:
        picid: product ID as a string of digits.
        path: prefix prepended verbatim to the file name, so a directory
            must include its trailing separator.

    Returns:
        0 if the page and all of its images were fetched, 1 if the page
        request or any image download failed.
    """
    req = urllib2.Request('http://www.lofter.com/art/product-' + picid, headers=headers)
    try:
        content = opener.open(req)
    except Exception as err:
        # Exception (not BaseException) so Ctrl-C still stops the crawl.
        print('download: ' + picid + '.jpg failed. Error: ' + str(err))
        return 1
    try:
        html = content.read()
    finally:
        content.close()
    # Images are identified by their "showimgtag" class.
    result = re.findall('<img src="(.*?)" class="card showimgtag">', html)
    status = 0
    for x, item in enumerate(result, 1):
        filename = picid + '-' + str(x) + '.jpg'
        try:
            urllib.urlretrieve(item, path + filename)
        except Exception as err:
            # One broken image must not abort the whole crawl; report it and
            # carry on with the remaining images.
            print('download: ' + filename + ' failed. Error: ' + str(err))
            status = 1
            continue
        print('download: ' + filename + ' over.')
    return status