python google play_python google play

#!/usr/bin/env python

#-*- coding: utf-8  -*-

import urllib

import urllib2

import random

import requests

import os,sys

import MySQLdb

from sgmllib import SGMLParser

from BeautifulSoup import BeautifulSoup

import re

num=0

# Crawl the Google Play category pages, print the scraped details of every
# app linked from them, and download images referenced on each app page into
# the local "googlemarket/" directory.
# This is Python 2 code (print statements, `except Exception,e`, `file()`).
# NOTE(review): this copy of the script has lost its indentation and several
# regex literals are cut off (unterminated quotes), so it does not parse
# as-is; the original patterns cannot be recovered from what is shown here.
def main():

# Open the MySQL connection; on any failure print the error and exit.
try:

conn=MySQLdb.connect(host='localhost',user='root',passwd='123456',db='googlemarket',charset="utf8")

conn.query("set names utf8")

except Exception,e:

print e

sys.exit()

# The cursor is only used by the commented-out INSERT further down.
cursor=conn.cursor()

# Category identifiers appended to the store's category URL (27 entries).
category=['PERSONALIZATION','TRANSPORTATION','SPORTS','HEALTH_AND_FITNESS','APP_WALLPAPER','COMICS','MEDICAL','BUSINESS','BOOKS_AND_REFERENCE','WEATHER','ENTERTAINMENT','MEDIA_AND_VIDEO','APP_WIDGETS','TOOLS','PHOTOGRAPHY','PRODUCTIVITY','EDUCATION','NEWS_AND_MAGAZINES','TRAVEL_AND_LOCAL','LIFESTYLE','SOCIAL','FINANCE','SHOPPING','LIBRARIES_AND_DEMO','COMMUNICATION','MUSIC_AND_AUDIO','GAME']

# 27 is the hard-coded length of `category`; keep the two in sync.
for k in range(0,27):

t="https://play.google.com/store/apps/category/"+category[k]

# Fetch the category page and re-serialise it as UTF-8 text for regex matching.
html=requests.get(t)

preresult=html.content

soup=BeautifulSoup(preresult)

result=soup.prettify("utf-8")

# NOTE(review): pattern literal is truncated in this copy (no closing quote).
# Presumably it captured the app-detail links on the category page -- confirm
# against the original script.
pattern=re.compile('

dataresult=re.findall(pattern,result)

# Drop duplicate matches (ordering is not preserved by set()).
dataresult=list(set(dataresult))

for i in dataresult:

# Each match is treated as a site-relative path and prefixed with the host.
url="https://play.google.com"+i

print url

#url="https://play.google.com/store/apps/details?id=com.androidesk&hl=zh_CNhttps%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fdetails%3Fid%3Dcom.androidesk"

# Fetch the app detail page; `html`, `preresult`, `soup` and `result` are reused.
html=requests.get(url)

preresult=html.content

soup=BeautifulSoup(preresult)

result=soup.prettify("utf-8")

# Name
# NOTE(review): the markup inside this pattern appears to have been stripped.

pattern=re.compile('

[\s\S]*?

([\s\S]*?)

')

data0=re.findall(pattern,result)

for items in data0:

print items

# Manufacturer (developer)

pattern=re.compile('itemprop="name">([\s\S]*?)')

data1=re.findall(pattern,result)

# Splits the first match into lines and prints the 9th one; raises IndexError
# when there is no match or the match has fewer than 9 lines.
make=data1[0].split("\n")

print make[8]

# Version
# The [0] lookups below raise IndexError when the page has no such field.

pattern=re.compile('itemprop="softwareVersion">([\s\S]*?)

')

data2=re.findall(pattern,result)

print data2[0]

# Update time

pattern=re.compile('itemprop="datePublished">([\s\S]*?)

')

data3=re.findall(pattern,result)

print data3[0]

# File size

pattern=re.compile('itemprop="fileSize">([\s\S]*?)

')

data4=re.findall(pattern,result)

print data4[0]

# Supported firmware

pattern=re.compile('itemprop="operatingSystems">([\s\S]*?)

')

data5=re.findall(pattern,result)

print data5[0]

# Description

pattern=re.compile('itemprop="description">[\s\S]*?

([\s\S]*?)

')

data6=re.findall(pattern,result)

for items in data6:

# NOTE(review): this statement is cut off in this copy.
print re.sub('[

# Parameterised INSERT statement; its execution is commented out below, so
# nothing is written to the database in this version.
sql="insert into address(name,version,developer,pubtime,filesize,support,introduction) values(%s,%s,%s,%s,%s,%s,%s)"

for items in data6:

# The whole body of this branch is commented out.
if(data5):

#values=(data0[0],data1[0],data2[0],data3[0],data4[0],data5[0],re.sub('
',' ',items))

#else:

#values=(data0[0],data1[0],data2[0],data3[0],data4[0],'NULL',re.sub('
',' ',items))

#print values

#print sql % values

#cursor.execute(sql,values)

#conn.commit()

# NOTE(review): as shown this matches only the literal text 'Cover art'; the
# j[1:-2] slice below suggests the original pattern captured a quoted image
# URL -- confirm against the original script.
   pattern=re.compile('Cover art')

data=re.findall(pattern,result)

# Module-level counter used to name the downloaded files.
global num

for j in data:

print j

print type(j)

# Send a browser-like User-Agent with the image request.
headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'}

# requests.get raises requests.exceptions.ConnectionError when the host is
# unreachable; it is not caught here, so one failed download aborts the crawl.
temp=requests.get(j[1:-2], headers=headers)

# `file()` is the Python 2 built-in; the handle is opened in text mode ("w+")
# and is never closed explicitly.
f=file("googlemarket/"+str(num),"w+")

num=num+1

print num

f.write(temp.content)

# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()

Traceback (most recent call last):

File "crawler0729.py", line 103, in <module>

main()

File "crawler0729.py", line 91, in main

temp=requests.get(j[1:-2], headers=headers)

File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 55, in get

return request('get', url, **kwargs)

File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 44, in request

return session.request(method=method, url=url, **kwargs)

File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 335, in request

resp = self.send(prep, **send_kwargs)

File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 438, in send

r = adapter.send(request, **kwargs)

File "/usr/local/lib/python2.7/dist-packages/requests/adapters.py", line 327, in send

raise ConnectionError(e)

requests.exceptions.ConnectionError: HTTPSConnectionPool(host='lh3.ggpht.com', port=443): Max retries exceeded with url: /RBld17rLw4Ik0JtOaKk4bZB2RiGJ2R8H5Q8Rjw3Hh6BAM694fOzzKj1TJFr7R02ZS_40=w30 (Caused by <class 'socket.error'>: [Errno 101] Network is unreachable)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值