Python crawler for Wandoujia (豌豆荚) apps — source code download

【Overview】Crawls apps from the Wandoujia app store, stores each app's information in a database, and names each downloaded app file by its MD5 hash.

【Screenshot】

(screenshot image: aaf8ed23c2302e5ff7f186f90fd2b5e2.png)
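The overview notes that each downloaded app file is named by its MD5 hash. Below is a minimal sketch of that naming scheme, written in the same Python 2 style as the listing that follows; the save_apk helper and its directory argument are illustrative, not part of the original source:

#coding:utf-8
import hashlib
import os

def save_apk(content, save_dir):
    # Name the file by the MD5 of its bytes, so repeated downloads
    # of the same APK collapse onto a single file on disk.
    md5 = hashlib.md5(content).hexdigest()
    path = os.path.join(save_dir, md5 + '.apk')
    if not os.path.exists(path):
        f = open(path, 'wb')
        f.write(content)
        f.close()
    return md5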

【Core code】

#coding:utf-8

import database
import crawler
import download
import downjudge
import check_update
import context

success_num = 0
fail_num = 0

def is_unique(md5, db):
    # True if an app with this md5 is already stored in the database
    return db.data_jud(md5)

def update(db):
    global success_num
    global fail_num
    success_num = 0
    fail_num = 0
    print u'开始更新'  # "starting update"
    past = open('log_file/urls.txt', 'r+')   # URLs handled in earlier runs
    now = open('log_file/update.txt', 'r')   # URLs found by the update check
    past_list = past.readlines()
    now_list = now.readlines()
    exist_num = 0
    totle_num = len(now_list) - len(past_list)
    for url in now_list:
        if url not in past_list and url:  # only process newly found apps
            # progress: handled/total, then success and failure counts
            print u'已下载:%d/%d 成功个数:%d 失败个数:%d' % (exist_num, totle_num, success_num, fail_num)
            exist_num += 1
            date = context.get_information(url[0:-1])  # strip the trailing newline
            if date[6] == 0:  # md5 is 0 when the download failed
                fail_num += 1
            else:
                success_num += 1
                try:
                    print u'\n%s下载完成' % date[0].decode('utf-8')  # "<name> finished"
                except Exception:
                    pass
                if not is_unique(date[6], db):
                    db.data_memory(date[0], date[1], date[2], date[3],
                                   date[4], date[5], date[6])
                past.write(url)  # append the URL so it is not fetched again
    past.close()
    now.close()
    print u'更新完成!'  # "update finished"
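# Note: the context module is not included in this listing. From the way its
# result is indexed above, get_information(url) appears to return a 7-item
# tuple whose first element is the app name and whose last element is the
# APK's md5 (or 0 when the download failed). A hypothetical stub with that
# shape; the field names are guesses, not the original code:
#
#     def get_information(url):
#         # (name, version, size, category, downloads, description, md5)
#         return ('demo', '1.0', '10M', 'tools', '1000+', 'about', 0)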

def continue_downall(db):
    global success_num
    global fail_num
    success_num = 0
    fail_num = 0
    downd = open('log_file/downed.txt', 'r+')  # URLs finished before the interruption
    urls = open('log_file/urls.txt', 'r')      # full URL list from the crawler
    g_exist = downd.readlines()
    g_totle = urls.readlines()
    totle_num = len(g_totle)
    exist_num = len(g_exist)
    for url in g_totle:
        if url not in g_exist and url:  # skip apps already downloaded
            print u'已下载:%d/%d 成功个数:%d 失败个数:%d' % (exist_num, totle_num, success_num, fail_num)
            exist_num += 1
            date = context.get_information(url[0:-1])
            if date[6] == 0:
                fail_num += 1
            else:
                success_num += 1
                try:
                    print u'\n%s下载完成' % date[0].decode('utf-8')
                except Exception:
                    pass
                if not is_unique(date[6], db):
                    db.data_memory(date[0], date[1], date[2], date[3],
                                   date[4], date[5], date[6])
                downd.write(url)  # record progress so the run can be resumed
    print u'所有应用下载完毕'  # "all apps downloaded"
    downd.close()
    urls.close()

def downall(db):
    global success_num
    global fail_num
    success_num = 0
    fail_num = 0
    downd = open('log_file/downed.txt', 'w')  # fresh run: start a new progress log
    urls = open('log_file/urls.txt', 'r')
    g_totle = urls.readlines()
    totle_num = len(g_totle)
    exist_num = 0
    for url in g_totle:
        if url:
            print u'已下载:%d/%d 成功个数:%d 失败个数:%d' % (exist_num, totle_num, success_num, fail_num)
            exist_num += 1
            date = context.get_information(url[0:-1])
            if date[6] == 0:
                fail_num += 1
            else:
                success_num += 1
                try:
                    print u'\n%s下载完成' % date[0].decode('utf-8')
                except Exception:
                    pass
                if not is_unique(date[6], db):
                    db.data_memory(date[0], date[1], date[2], date[3],
                                   date[4], date[5], date[6])
                downd.write(url)
    print u'所有应用下载完毕'
    downd.close()
    urls.close()
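# Note: downjudge is not shown either; judge_cont() presumably reports whether
# a previous run left progress behind that can be resumed. A plausible sketch,
# offered as an assumption rather than the original module:
#
#     import os
#     def judge_cont():
#         # resume if an earlier run left a non-empty downed.txt behind
#         return (os.path.exists('log_file/downed.txt')
#                 and os.path.getsize('log_file/downed.txt') > 0)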

db = database.db()
db.create_db()
bool_check_update = crawler.start('log_file/urls.txt')  # crawl the app list
bool_continue_down = downjudge.judge_cont()  # can an interrupted run be resumed?
if bool_continue_down:
    continue_downall(db)
else:
    downall(db)
if bool_check_update:
    bool_update = check_update.ch_update()
    if bool_update:
        # prompt "update now? (y/n)", encoded to gbk for the Windows console
        judge = raw_input('是否更新(y/n):\n'.decode('utf-8').encode('gbk'))
        while True:
            judge = judge.lower()
            if judge == 'y':
                update(db)
                break
            elif judge == 'n':
                break
            else:
                # "invalid input, please enter again (y/n)"
                judge = raw_input('输入错误,请重新输入(y/n):\n'.decode('utf-8').encode('gbk'))
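The database module is not part of this listing; the script relies on exactly three methods of database.db: create_db, data_jud, and data_memory. A minimal SQLite-backed sketch of that interface follows. The table layout and column names are assumptions inferred from the seven-field calls above; only the class and method names come from the source:

#coding:utf-8
import sqlite3

class db:
    def __init__(self, path='apps.db'):
        self.conn = sqlite3.connect(path)

    def create_db(self):
        # one row per app, keyed by the APK's md5
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS apps ('
            'name TEXT, version TEXT, size TEXT, category TEXT, '
            'downloads TEXT, description TEXT, md5 TEXT PRIMARY KEY)')
        self.conn.commit()

    def data_jud(self, md5):
        # True if an app with this md5 is already stored
        cur = self.conn.execute('SELECT 1 FROM apps WHERE md5 = ?', (md5,))
        return cur.fetchone() is not None

    def data_memory(self, name, version, size, category,
                    downloads, description, md5):
        self.conn.execute('INSERT INTO apps VALUES (?,?,?,?,?,?,?)',
                          (name, version, size, category,
                           downloads, description, md5))
        self.conn.commit()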
