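# A friend of mine entered an app contest and I wrote this scraper for him, but it seems to fetch rather little data and I am not sure why, so I am posting it here for everyone to look into.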
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 26 10:50:20 2014
@author: lifeix
"""
import urllib2
from HTMLParser import HTMLParser
import simplejson as json
import traceback
class Spyder(HTMLParser):
    """Collect app records from ``<li class="card">`` elements of an HTML page.

    The parser is a flag-driven state machine: every start tag may advance the
    state of the card currently being read, and the record under construction
    is appended to ``data`` once the ``href`` of the install link is seen.
    End tags are not tracked.

    Each record is a dict that may hold the keys ``app_icon_url``,
    ``app_name``, ``app_data_install``, ``app_size``, ``app_desc`` and
    ``app_download_url``; a key is only present if the matching markup was
    encountered for that card.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.data = []  # collected app records
        # Entry div of the app list; kept for compatibility, the handlers
        # below never read or write it.
        self.isStartTag = False
        self._reset_card_state()

    def _reset_card_state(self):
        """Drop the record under construction and clear every per-card flag."""
        self.temp = {}  # record of the single app currently being parsed
        self.singleLi = False  # inside an app's entry <li class="card">
        self.isAppDescFlag = False  # inside the app's <div class="app-desc">
        self.isMeta = False  # the <div class="meta"> has been seen
        self.isInstallCount = False  # the install-count <span> has been seen
        self.isComment = False  # next text chunk is the comment text
        self.isInstallBtn = False  # an install-btn link has been seen
        self.isDot = False  # the "dot" <span> after the install count was seen
        self.isIconUrl = False  # inside <div class="icon-wrap">, awaiting <img>
        self.icon = False  # the icon URL has already been captured
        self.recordInstallCount = False  # install count has already been stored

    @staticmethod
    def _has_class(attrs, expected):
        """Return True if *attrs* holds a ``class`` attribute equal to *expected*."""
        for key, value in attrs:
            if key == 'class' and value == expected:
                return True
        return False

    def handle_starttag(self, tag, attrs):
        """Advance the state machine for one start tag.

        *tag* is the tag name and *attrs* the list of ``(name, value)`` pairs
        supplied by ``HTMLParser``.
        """
        if tag == 'li':
            if self._has_class(attrs, 'card'):
                # A new card always starts from a clean slate, so a previous
                # card that never reached its install link cannot leak flags
                # or half-filled fields into this record.
                self._reset_card_state()
                self.singleLi = True
        elif self.singleLi:
            self._handle_card_header(tag, attrs)
        elif self.isAppDescFlag:
            self._handle_app_desc(tag, attrs)

    def _handle_card_header(self, tag, attrs):
        """Handle tags between the card's <li> and its app-desc <div>."""
        if not self.icon and not self.isIconUrl and tag == 'div':
            if self._has_class(attrs, 'icon-wrap'):
                self.isIconUrl = True
        elif self.isIconUrl and tag == 'img':
            for key, value in attrs:
                if key == 'src':
                    self.temp['app_icon_url'] = value
                    self.isIconUrl = False
                    self.icon = True
                    break
        # Checked independently of the icon handling above: the app-desc div
        # switches the parser from the card header to the description state.
        if tag == 'div' and self._has_class(attrs, 'app-desc'):
            self.singleLi = False
            self.isAppDescFlag = True

    def _handle_app_desc(self, tag, attrs):
        """Handle tags seen after the app-desc <div> was opened."""
        if not self.isMeta and tag == 'a':
            # Before the meta div, a link's title attribute is the app name.
            for key, value in attrs:
                if key == 'title':
                    self.temp['app_name'] = value
                    break
        elif tag == 'div':
            for key, value in attrs:
                if not self.isMeta and key == 'class' and value == 'meta':
                    self.isMeta = True
                    break
                elif not self.isComment and key == 'class' and value == 'comment':
                    self.isComment = True
        elif tag == 'span' and self.isMeta:
            # Expected order: install-count span, "dot" span, then a span
            # whose title attribute carries the package size.
            for key, value in attrs:
                if key == 'class':
                    if not self.isInstallCount and value == 'install-count':
                        self.isInstallCount = True
                    elif not self.isDot and self.isInstallCount and value == 'dot':
                        self.isDot = True
                elif key == 'title' and self.isInstallCount and self.isDot:
                    self.isInstallCount = False
                    self.isDot = False
                    self.temp['app_size'] = value
        if self.isMeta and tag == 'a':
            self._handle_install_link(attrs)

    def _handle_install_link(self, attrs):
        """Detect the install link and finish the record when its href is seen."""
        if not self.isInstallBtn:
            for key, value in attrs:
                # value is None for a valueless attribute such as <a class>.
                if key == 'class' and value is not None and value.strip() == 'install-btn':
                    self.isInstallBtn = True
                    break
        if self.isInstallBtn:
            for key, value in attrs:
                if key == 'href':
                    self.temp['app_download_url'] = value
                    self.data.append(self.temp)
                    # Rebinds self.temp to a fresh dict; the appended record
                    # is left untouched.
                    self._reset_card_state()
                    break

    def handle_data(self, data):
        """Capture the install count or the comment text from a text chunk.

        The first chunk seen once the install-count span is open supplies
        ``app_data_install`` (the text before the first space); otherwise the
        first chunk seen after a comment div supplies ``app_desc``.
        """
        if not self.recordInstallCount and self.isInstallCount:
            self.recordInstallCount = True
            self.temp['app_data_install'] = data.split(' ')[0]
        elif self.isComment:
            self.temp['app_desc'] = data
            self.isComment = False

    def getResult(self):
        """Return the list of finished app records (the parser's own list)."""
        return self.data
# Download the app landing page. The response is closed explicitly so the
# connection is released even if reading fails.
request = urllib2.urlopen("http://www.wandoujia.com/apps")
try:
    content = request.read()
finally:
    request.close()

# Extract the app cards embedded in the landing page HTML.
spyder = Spyder()
spyder.feed(content)

# Holds every app record. getResult() hands back the parser's own list, so
# records appended to allData later end up in that same list.
allData = spyder.getResult()

# Step by which loadMore() advances the feed offset after each page; the
# "max=12" query parameter in moreUrl below uses the same number.
maxData = 12
# Feed API URL template; "%d" is filled with the start offset, and the
# response is wrapped in a JSONP callback named "jsonp1".
moreUrl = 'http://apps.wandoujia.com/api/v1/feeds?max=12&start=%d&opt_fields=data.app.tags.*,data.app.editorComment,data.app.likesCount,data.app.reason,data.app.ad,data.app.title,data.app.packageName,data.app.apks.size,data.app.icons.px68,data.app.apks.superior,data.app.installedCountStr,data.app.snippet,data.app.apks.versionCode&callback=jsonp1'
def _strip_jsonp(payload):
    """Return the JSON text wrapped in a JSONP response such as ``jsonp1({...});``.

    Raises ValueError if *payload* contains no ``(...)`` wrapper.
    """
    opening = payload.find('(')
    closing = payload.rfind(')')
    if opening == -1 or closing <= opening:
        raise ValueError('response is not a JSONP payload')
    return payload[opening + 1:closing]


def _feed_entry_to_record(entry):
    """Convert one feed entry into an app record dict.

    Raises KeyError, IndexError, TypeError or AttributeError if the entry
    lacks one of the fields read here.
    """
    app = entry['app']
    package_name = app['packageName']
    return {
        'app_name': app['title'],
        'app_size': app['apks'][0]['size'],
        'app_icon_url': app['icons']['px68'],
        # Keep only the text before the first space of the install-count string.
        'app_data_install': app['installedCountStr'].split(' ')[0],
        'app_desc': app['editorComment'],
        'app_download_url': 'http://apps.wandoujia.com/apps/%s/download' % package_name,
    }


def loadMore():
    """Page through the feed API and append every app record to ``allData``.

    Pages are requested from ``moreUrl`` with an increasing start offset until
    a page comes back empty or fetching/decoding a page fails (the error and
    its traceback are printed and paging stops).  An individual entry that
    lacks an expected field is reported and skipped, so one malformed entry
    no longer ends the whole crawl.
    """
    # NOTE(review): the offset starts at 24, presumably because the HTML
    # landing page already lists the first 24 apps - confirm.
    start = 24
    while True:
        try:
            moreRequest = urllib2.urlopen(moreUrl % start)
            try:
                moreData = moreRequest.read()
            finally:
                # Release the connection even if the read fails.
                moreRequest.close()
            targetData = json.loads(_strip_jsonp(moreData))['data']
            # Single-argument print: same output as the Python 2 statement
            # form, and also valid on Python 3.
            print('%d %d %d' % (len(targetData), start, len(allData)))
        except Exception as e:
            # A page that cannot be fetched or decoded ends the crawl.
            print(e)
            print(traceback.format_exc())
            break
        if len(targetData) < 1:
            break
        for obj in targetData:
            try:
                record = _feed_entry_to_record(obj)
            except (KeyError, IndexError, TypeError, AttributeError) as e:
                print('skipping malformed feed entry: %r' % (e,))
                continue
            allData.append(record)
        start = start + maxData
    print('loadmore is finished........')
# Pull the remaining pages from the feed API into allData ...
loadMore()
# ... then report how many app records were gathered in total.
print(len(allData))