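#coding=utf-8
# 爬取“好孩子”app的所有绘本、以及配套文件、音频(纯属自己娱乐)
# (Crawls every picture book of the "好孩子" app, plus the companion files and audio; purely for personal fun.)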
import urllib
import random
import urllib2
import json
import os
import re


# Root directory; one sub-folder per picture book is created beneath it.
FOLDER_PATH = r"C:\Users\guanjia\Desktop\goodboy"
# Append-mode log file shared by the whole script for progress and error lines.
log = open(r'C:\Users\guanjia\Desktop\log.log', 'a')
def _parse_remote_literal(text):
    """Decode server text as JSON, falling back to a plain Python literal."""
    import ast
    try:
        return json.loads(text)
    except ValueError:
        # Not strict JSON. literal_eval accepts the literal syntax the former
        # eval() call handled, but cannot execute code sent by the server.
        return ast.literal_eval(text)


def _extract_pic_list(response_read):
    """Return the 'list' entry of the doubly-encoded GetPicList body, or None."""
    # The outer object carries a 'd' key whose value is itself encoded text.
    inner = _parse_remote_literal(response_read).get('d')
    if not inner:
        return None
    return _parse_remote_literal(inner).get('list')


def loadPicList(jcId):
    """Download the page pictures, audio files and click regions of one book.

    The book title is looked up with ``getHuibenName`` and used as the name
    of the book folder under ``FOLDER_PATH``.  Pictures are saved as
    ``huibenshow/<page id>.jpg``, audio files as ``<n>.mp3`` directly in the
    book folder (numbered in first-seen order of the distinct audio URLs) and
    click regions are written into ``huibenclick`` by ``loadAnchor``.
    Progress is appended to the module-level ``log`` file.

    Args:
        jcId: id of the book to download.

    Returns:
        ``False`` when the server response is empty or contains no page
        list, ``True`` once all downloads for the book have been attempted.

    Raises:
        Whatever the network, parsing or file-system calls raise; nothing is
        caught here.
    """
    one_huiben_name = getHuibenName(jcId)
    print >> log, "Message: 开始下载        "+str(jcId)+"绘本名字:"+ one_huiben_name

    url = 'http://m.xhzapp.com/Dian/Illustrated.aspx/GetPicList'
    # POST payload; 'x' is a random number (presumably a cache buster - the
    # server's use of it is not visible from this file).
    data = {'jcId': jcId, 'kwType': 7, 'x': random.random()}
    headers = {'Content-Type': 'application/json'}
    request = urllib2.Request(url=url, headers=headers, data=json.dumps(data))
    response = urllib2.urlopen(request)
    try:
        response_read = response.read()
    finally:
        response.close()
    if not response_read:
        print >> log, "Message: 获取绘本失败    "+str(jcId)+"绘本名字:"+ one_huiben_name
        return False

    load_pic_list = _extract_pic_list(response_read)
    if not load_pic_list:
        print >> log, "Message: 获取图片音频url失败    "+str(jcId)+"绘本名字:"+ one_huiben_name
        return False

    # Keep every page id together with its own picture URL.  Collecting ids
    # and URLs into separate sets and pairing them by position would save
    # pictures under arbitrary ids, because sets have no defined order.
    pages = []
    audio_urls = []
    seen_ids = set()
    seen_audio = set()
    for one in load_pic_list:
        pic_id = one.get('Id')
        if pic_id not in seen_ids:
            seen_ids.add(pic_id)
            pages.append((pic_id, one.get('PagePic')))
        audio_url = one.get('PageUrl')
        if audio_url and audio_url not in seen_audio:
            seen_audio.add(audio_url)
            audio_urls.append(audio_url)

    print >> log, "Message: 创建三个文件夹    bookid:"+str(jcId)+"绘本名字:"+ one_huiben_name
    one_huiben_dir_path = FOLDER_PATH + os.sep + one_huiben_name
    one_huiben_pic_path = one_huiben_dir_path + os.sep + 'huibenshow'
    one_huiben_location_path = one_huiben_dir_path + os.sep + 'huibenclick'
    # Check each sub-folder on its own so that a partially created book
    # folder from an earlier, interrupted run is completed.
    for folder in (one_huiben_pic_path, one_huiben_location_path):
        if not os.path.isdir(folder):
            os.makedirs(folder)

    print >> log, "Message: 开始下载数据     bookid:"+str(jcId)+"绘本名字:"+ one_huiben_name
    for pic_id, pic_url in pages:
        if pic_url:  # a page without a picture URL has nothing to fetch
            urllib.urlretrieve(pic_url, one_huiben_pic_path + os.sep + str(pic_id) + '.jpg')
    print >> log, "Message: 图片下载结束     bookid:"+str(jcId)+"绘本名字:"+ one_huiben_name
    for index, audio_url in enumerate(audio_urls):
        urllib.urlretrieve(audio_url, one_huiben_dir_path + os.sep + str(index) + '.mp3')
    print >> log, "Message: mp3下载结束     bookid:"+str(jcId)+"绘本名字:"+ one_huiben_name
    for pic_id, _ in pages:
        loadAnchor(pic_id, one_huiben_location_path)
    print >> log, "Message: 点读区下载结束     bookid:"+str(jcId)+"绘本名字:"+ one_huiben_name
    return True


def loadAnchor(picId, one_huiben_location_path):
    """Fetch the click regions of one page and save them as a JSON text file.

    Calls the ``GetPicAnchor`` handler for ``picId``, takes a fixed set of
    fields from every entry of the response's ``list`` and writes them, as a
    JSON array of arrays, to ``<one_huiben_location_path>/<picId>.txt``.

    Args:
        picId: id of the page whose regions are requested.
        one_huiben_location_path: existing directory the file is written to.

    Raises:
        Whatever the network, parsing or file-system calls raise; nothing is
        caught here.
    """
    import ast

    # Order of the values inside each saved row.
    fields = ("Id", "top", "left", "width", "height",
              "stime", "endtime", "Translation")

    url = ("http://m.xhzapp.com/Handler/PicAnchorHandler.ashx?guid=" + str(random.random())
           + "&option=GetPicAnchor&picId=" + str(picId))
    res_data = urllib2.urlopen(urllib2.Request(url))
    try:
        raw = res_data.read()
    finally:
        res_data.close()

    # The response used to be passed to eval(), which runs whatever the
    # server sends.  Parse it as data instead; literal_eval covers responses
    # that are Python-literal shaped but not strict JSON.
    try:
        res = json.loads(raw)
    except ValueError:
        res = ast.literal_eval(raw)

    # A response without a 'list' yields an empty file content of "[]".
    tracks = [[obj.get(field) for field in fields]
              for obj in (res.get('list') or [])]

    payload = json.dumps(tracks, ensure_ascii=False)
    # With ensure_ascii=False the result may be unicode text; the file is
    # opened in binary mode, so encode explicitly instead of relying on the
    # implicit ASCII conversion (which fails on non-ASCII translations).
    if not isinstance(payload, bytes):
        payload = payload.encode('utf-8')
    with open(one_huiben_location_path + os.sep + str(picId) + '.txt', 'wb') as out_file:
        out_file.write(payload)






def getHuibenName(bookId):
    """Return the name of a picture book, read from its share page's <title>.

    The share page for ``bookId`` is downloaded and the text of the first
    ``<title>`` element is extracted with a regular expression; carriage
    returns, newlines and tabs are removed from it.

    Args:
        bookId: id of the book.

    Returns:
        The cleaned title, or ``str(bookId)`` when the page has no title or
        the title is empty after cleaning, so that callers always get a
        usable, book-specific folder name.

    Raises:
        IOError: propagated from ``urllib.urlopen`` when the page cannot be
            fetched.
    """
    url = 'http://m.xhzapp.com/Share/index.aspx?bookId='+str(bookId)+'&IsPointRead=1&bookType=7'
    page = urllib.urlopen(url)
    try:
        html = page.read()
    finally:
        page.close()

    # DOTALL lets the title span several lines; the non-greedy group stops
    # at the first closing tag instead of the last one in the document.
    match = re.search(r'<title>(.+?)</title>', html, re.DOTALL)
    name = ''
    if match:
        name = match.group(1).replace('\r', '').replace('\n', '').replace('\t', '')
    return name or str(bookId)





if __name__ == '__main__':
    # Original note here read "bookshow, bookclick" - presumably the two
    # kinds of per-book data that get downloaded (TODO confirm).
    # Walk the hard-coded id range; a failure on one book is logged and the
    # run continues with the next id.
    try:
        for jcId in range(1, 3489):
            try:
                print >> log, "*"*100
                loadPicList(jcId)
            except Exception as e:
                # repr() always yields a plain string, unlike e.message,
                # which is empty for many exceptions and may not be a str.
                print >> log, "Error: " + str(jcId) + " " + repr(e)
            # Make progress visible in the log file while the run is going.
            log.flush()
    finally:
        log.close()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值