使用 Python 抓取豌豆荚 App 数据信息

一个朋友参加 App 大赛，我帮他写了这个抓取程序，但抓取到的数据似乎偏少，暂时还不清楚原因，先贴出来供大家一起研究。


# -*- coding: utf-8 -*-
"""
Created on Sat Apr 26 10:50:20 2014

@author: lifeix
"""
import urllib2
from HTMLParser import HTMLParser
import simplejson as json
import traceback

class Spyder(HTMLParser):
    """Collect app records from the ``<li class="card">`` items of a page.

    The parser is a small state machine driven by start tags and text:

    1. ``<li class="card">`` opens a card (``singleLi``).
    2. Inside the card, the first ``<img src>`` found after a
       ``<div class="icon-wrap">`` gives ``app_icon_url``.
    3. ``<div class="app-desc">`` switches to the description state
       (``isAppDescFlag``), where the name, install count, size and comment
       are read.
    4. The first ``href`` seen on or after an ``<a class="install-btn">``
       gives ``app_download_url`` and finalizes the record.

    Each finished record is a dict that may contain the keys
    ``app_icon_url``, ``app_name``, ``app_data_install``, ``app_size``,
    ``app_desc`` and ``app_download_url``; a key is absent when the matching
    element was not found in the card.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.data = []  # finished app records, in document order
        # Not read anywhere in this class; kept so that code accessing the
        # attribute keeps working.
        self.isStartTag = False
        self.singleLi = False  # inside a card, before its app-desc div
        self._resetCardState()

    def _resetCardState(self):
        """Forget everything that belongs to the card currently being read.

        Called when a card starts and when a record is finalized, so that a
        card lacking some element (size span, install button, ...) cannot
        leak flags or fields into the next card.
        """
        self.temp = {}  # record under construction
        self.isAppDescFlag = False  # inside the app-desc div
        self.isMeta = False  # the meta div has been seen
        self.isInstallCount = False  # the install-count span has been seen
        self.isComment = False  # next text chunk is the comment
        self.isInstallBtn = False  # the install-btn link has been seen
        self.isDot = False  # the "dot" separator span has been seen
        self.isIconUrl = False  # the icon-wrap div has been seen
        self.icon = False  # icon URL already captured for this card
        self.recordInstallCount = False  # install count already captured

    def _findAttr(self, attrs, name):
        """Return ``(True, value)`` for the first attribute ``name``.

        Returns ``(False, None)`` when the tag has no such attribute. The
        flag is needed because a present attribute may itself carry the
        value ``None`` (attribute written without a value).
        """
        for key, value in attrs:
            if key == name:
                return True, value
        return False, None

    def handle_starttag(self, tag, attrs):
        """Advance the state machine on a start tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``.
        """
        if tag == 'li':
            if ('class', 'card') in attrs:
                # Start from a clean slate: drops a half-read previous card
                # instead of merging its fields into this one.
                self._resetCardState()
                self.singleLi = True
        elif self.singleLi:
            self._handleCardTag(tag, attrs)
        elif self.isAppDescFlag:
            self._handleAppDescTag(tag, attrs)

    def _handleCardTag(self, tag, attrs):
        """Handle a tag inside a card, before the app-desc div is reached."""
        if tag == 'img':
            if self.isIconUrl:
                found, src = self._findAttr(attrs, 'src')
                if found:
                    self.temp['app_icon_url'] = src
                    self.isIconUrl = False
                    self.icon = True
        elif tag == 'div':
            if (not self.icon and not self.isIconUrl
                    and ('class', 'icon-wrap') in attrs):
                self.isIconUrl = True
            if ('class', 'app-desc') in attrs:
                self.singleLi = False
                self.isAppDescFlag = True

    def _handleAppDescTag(self, tag, attrs):
        """Handle a tag inside the app-desc div of the current card."""
        if tag == 'a' and not self.isMeta:
            # Links before the meta div carry the app name in their title.
            found, title = self._findAttr(attrs, 'title')
            if found:
                self.temp['app_name'] = title
        elif tag == 'div':
            for key, value in attrs:
                if key != 'class':
                    continue
                if not self.isMeta and value == 'meta':
                    self.isMeta = True
                    break
                if not self.isComment and value == 'comment':
                    self.isComment = True
        elif tag == 'span' and self.isMeta:
            self._handleMetaSpan(attrs)

        if tag == 'a' and self.isMeta:
            self._handleInstallLink(attrs)

    def _handleMetaSpan(self, attrs):
        """Track the install-count / dot / size spans of the meta div.

        Attributes are processed in document order on purpose: the size is
        only taken from a ``title`` attribute once both the install-count
        span and the dot span have been seen.
        """
        for key, value in attrs:
            if key == 'class':
                if not self.isInstallCount and value == 'install-count':
                    self.isInstallCount = True
                elif (not self.isDot and self.isInstallCount
                        and value == 'dot'):
                    self.isDot = True
            elif key == 'title' and self.isInstallCount and self.isDot:
                self.isInstallCount = False
                self.isDot = False
                self.temp['app_size'] = value

    def _handleInstallLink(self, attrs):
        """Finalize the record on the install button's ``href``."""
        if not self.isInstallBtn:
            # ``value`` is None for a class attribute written without a
            # value; treat it as an empty class instead of crashing.
            self.isInstallBtn = any(
                key == 'class' and (value or '').strip() == 'install-btn'
                for key, value in attrs)
        if self.isInstallBtn:
            found, href = self._findAttr(attrs, 'href')
            if found:
                self.temp['app_download_url'] = href
                self.data.append(self.temp)
                # Clears every per-card flag, not just some of them, so the
                # text following this tag is not mistaken for card data.
                self._resetCardState()

    def handle_data(self, data):
        """Capture the text following the install-count span or comment div.

        The install count is the part of the text before the first space.
        """
        if self.isInstallCount and not self.recordInstallCount:
            self.recordInstallCount = True
            self.temp['app_data_install'] = data.split(' ')[0]
        elif self.isComment:
            self.temp['app_desc'] = data
            self.isComment = False

    def getResult(self):
        """Return the list of finished app records."""
        return self.data
        

# Download the app listing page and parse the app cards embedded in it.
request = urllib2.urlopen("http://www.wandoujia.com/apps")
try:
    content = request.read()
finally:
    # Release the connection even when reading the body fails.
    request.close()
spyder = Spyder()
spyder.feed(content)
# All app records collected so far; loadMore() appends further records here.
allData = spyder.getResult()

# Amount added to the `start` offset after every request made by loadMore().
maxData  = 12
# URL template of the feed API; the single %d is filled with the `start`
# offset. NOTE(review): the page size `max=12` is hard-coded in the query
# string independently of maxData -- keep the two in sync when changing either.
# The response is requested with callback=jsonp1; loadMore() drops the first
# 7 and the last 2 characters of the body before JSON-decoding it (presumably
# the JSONP wrapper around the payload -- confirm against a live response).
moreUrl = 'http://apps.wandoujia.com/api/v1/feeds?max=12&start=%d&opt_fields=data.app.tags.*,data.app.editorComment,data.app.likesCount,data.app.reason,data.app.ad,data.app.title,data.app.packageName,data.app.apks.size,data.app.icons.px68,data.app.apks.superior,data.app.installedCountStr,data.app.snippet,data.app.apks.versionCode&callback=jsonp1'
def loadMore():
    """Page through the feed API and append every app record to ``allData``.

    Requests ``moreUrl`` with ``start`` = 24, 24 + maxData, ... until a page
    comes back with no entries or a request/decoding error occurs. Each
    appended record uses the keys ``app_name``, ``app_size``,
    ``app_icon_url``, ``app_data_install``, ``app_desc`` and
    ``app_download_url``.

    An entry that lacks an expected field is reported and skipped; it no
    longer aborts the whole crawl, which previously made a single malformed
    entry discard all following pages.
    """
    start = 24
    while True:
        try:
            moreRequest = urllib2.urlopen(moreUrl % start)
            try:
                moreData = moreRequest.read()
            finally:
                # Always release the connection, even if the read fails.
                moreRequest.close()
            # Drop the first 7 and the last 2 characters (the wrapper added
            # around the JSON payload by callback=jsonp1 in moreUrl).
            data = moreData[7:len(moreData) - 2]
            targetData = json.JSONDecoder().decode(data)['data']
        except Exception as e:
            # Network or decoding failure: report it and stop paging.
            print(e)
            print(traceback.format_exc())
            break

        if not targetData:
            # An empty page marks the end of the feed.
            break
        print('%d %d %d' % (len(targetData), start, len(allData)))

        for obj in targetData:
            try:
                app = obj['app']
                packageName = app['packageName']
                temp = {
                    'app_name': app['title'],
                    'app_size': app['apks'][0]['size'],
                    'app_icon_url': app['icons']['px68'],
                    'app_data_install':
                        app['installedCountStr'].split(' ')[0],
                    'app_desc': app['editorComment'],
                    'app_download_url':
                        'http://apps.wandoujia.com/apps/%s/download'
                        % packageName,
                }
            except (KeyError, IndexError, TypeError, AttributeError) as e:
                # Skip only this entry; the rest of the page is still usable.
                print('skip malformed app entry: %r' % (e,))
                continue
            allData.append(temp)
        start = start + maxData

    print('loadmore is finished........')
        
# Fetch the remaining pages through the feed API, then report how many app
# records were collected in total.
loadMore()
print(len(allData))



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值