#-*-coding:utf-8-*-
from pyquery import PyQuery as pq
import CommonUtils
'''
搞趣网抓取
'''
def parseGao7(url='http://www.gao7.com/free/1-0-0-3-0-0-1'):
    """Scrape one gao7.com app-listing page into a list of record dicts.

    ``url`` is passed straight to PyQuery, which downloads and parses the
    page.  Every ``.app-list li`` element yields one dict with the keys
    ``img``, ``title``, ``category``, ``updtime``, ``comment``,
    ``oldprice``, ``newproce``, ``newprice`` and ``desc``.

    ``newproce`` is the historical (misspelled) key for the new price; it
    is kept so existing readers keep working, and ``newprice`` carries the
    same value under the correct spelling.

    Returns an empty list when the parsed document is empty.  Errors raised
    by PyQuery while fetching or parsing are not caught here.
    """
    result = []
    data = pq(url)
    if not data:
        return result

    # .items() yields each <li> already wrapped as a PyQuery object.
    for item in data('.app-list li').items():
        paragraphs = item('p')
        right_col = item('div.app-list-r')
        new_price = right_col('p span').text()
        result.append({
            'img': item('a img').attr('src'),
            'title': item('div.app-list-main')('h3').text(),
            'category': paragraphs.eq(0)('span').text(),
            'updtime': paragraphs.eq(1)('span.upd-time').text(),
            'comment': right_col('a').text(),
            'oldprice': right_col('p del').text(),
            'newproce': new_price,  # legacy misspelling, kept for compatibility
            'newprice': new_price,
            'desc': paragraphs.eq(2).text(),
        })
    return result
def getNextPageUrl(url, domain='http://www.gao7.com'):
    """Return the URL of the page that follows ``url``, or None.

    The page is downloaded with ``CommonUtils.getUrlContent`` and the
    anchors inside ``div.ui-page`` are searched for one whose text is
    '下一页'.  Its ``href`` is returned unchanged when it is already an
    absolute http(s) URL; otherwise ``domain`` is prepended.

    None is returned when no such anchor exists, when the anchor has no
    ``href``, or when fetching/parsing fails.  In the failure case the
    error is printed rather than raised, so a crawl loop simply stops.
    """
    try:
        content = pq(CommonUtils.getUrlContent(url))
        next_links = [a for a in content('div.ui-page a').items()
                      if a.text() == '下一页']
        if not next_links:
            # Normal end of pagination: nothing to report.
            return None
        # Same choice list.pop() made: the last matching anchor wins.
        href = next_links[-1].attr('href')
        if not href:
            return None
        if href.startswith(('http://', 'https://')):
            # Already absolute; prefixing the domain would corrupt it.
            return href
        return domain + href
    except Exception as err:
        # Best-effort: report the problem and signal "no next page".
        print(err)
        return None
if __name__ == '__main__':
    # Walk the free-app listing from its first page, following the
    # next-page link until getNextPageUrl returns nothing, then print
    # everything collected (one list of records per page).
    pages = []
    current_url = 'http://www.gao7.com/free/1-0-0-3-0-0-1'
    while True:
        if not current_url:
            break
        print(current_url)
        pages.append(parseGao7(current_url))
        current_url = getNextPageUrl(current_url)
    print(pages)
CommonUtils.py
#-*-coding:utf-8-*-
'''
常用方法工具类
'''
import urllib.request
import gzip
'''
打印分隔符
'''
def printSplitLine(dchar='*', dnum=30):
    """Print a separator line made of ``dchar`` repeated ``dnum`` times."""
    separator = dchar * dnum
    print(separator)
'''
格式化打印字典
'''
def printDict(dict):
    """Print every entry of ``dict`` as ``key=<key>; value=<value>``.

    Nothing is printed when ``dict`` is empty or None.
    """
    if not dict:
        return
    for name, item in dict.items():
        print(f'key={name!s}; value={item!s}')
'''
格式化打印列表
'''
def printList(list):
    """Print each element of ``list`` on its own line.

    Nothing is printed when ``list`` is empty or None.
    """
    if not list:
        return
    for element in list:
        print(element)
'''
根据URL返回内容,有些页面可能需要gzip解压缩
'''
def getUrlContent(url):
    """Fetch ``url`` and return its body decoded as UTF-8 text.

    Some pages arrive gzip-compressed, so the body is gunzipped first when
    gzip can decompress it; otherwise the raw bytes are decoded as-is.

    Raises whatever ``urllib.request.urlopen`` raises for a failed request,
    and ``UnicodeDecodeError`` when the final bytes are not valid UTF-8.
    """
    import zlib  # function scope: only needed to name zlib.error below

    # Close the response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(url) as response:
        doc = response.read()

    try:
        doc = gzip.decompress(doc)
    except (OSError, EOFError, zlib.error):
        # Not (valid) gzip data: fall back to the bytes as received.
        # Only decompression failures are handled here, so Ctrl-C and
        # SystemExit are no longer swallowed.
        pass

    return doc.decode("utf-8")
if __name__ == '__main__':
    # Manual smoke test when run as a script: download one listing page
    # and dump its HTML.
    #
    # Disabled sample calls for the print helpers, kept for reference:
    #   printSplitLine()
    #   printDict({'a': 'aaa', 'b': 'bbb', 'c': 'ccc'})
    #   printList(['1', '2', '3'])
    sample_url = 'http://www.app111.com/free/'
    print(getUrlContent(sample_url))