多线程小说下载

# -*- coding: utf-8 -*-
# @Author: LI
# @Date:   2016-12-26 20:50:50
# @Last Modified by:   Administrator
# @Last Modified time: 2016-12-27 15:07:06


import urllib2,re,datetime,time
from pyquery import PyQuery as pq
from  multiprocessing import Pool
import codecs


def getBookList(url):
    """Fetch a book's index page and list its chapter links.

    The page at ``url`` is downloaded, decoded as GBK, and every ``<dd>``
    element under ``#list`` is inspected for an ``<a>`` link.

    :param url: address of the index page. It is also the prefix that each
        chapter ``href`` is appended to.
    :returns: list of ``[number, chapter_url, title, dd_html]`` entries.
        ``number`` counts every ``<dd>`` that carries an ``href``, including
        the ones rejected as malformed, so the numbers may contain gaps.
    """
    # Use the ``url`` argument rather than a module-level name, so the
    # function works for any caller and any index page.
    page = urllib2.urlopen(url)
    try:
        text = unicode(page.read(), 'gbk')
    finally:
        page.close()

    d = pq(text)
    lst = d('#list').find('dd')

    cnt = 0
    urllst = []
    for itm in lst:
        entry = pq(itm)
        dt = entry.find('a')
        txt_href = dt.attr('href')
        txt_title = dt.text()
        if txt_href is None:
            # No link inside this <dd>; it is not counted.
            continue
        cnt += 1

        if len(txt_href) < 5 or len(txt_title) < 1:
            # Too short to be a real chapter link/title: dump the markup
            # for inspection and leave it out of the result.
            print('--------------------------------------')
            print(entry.html())
            print('--------------------------------------')
        else:
            # [counter, absolute chapter URL, chapter title, raw <dd> markup]
            urllst.append([cnt, url + txt_href, txt_title, entry.html()])

    return urllst


def getContent(lsts,num_retries=5):
    """Download one chapter and save it as ``<number>.txt``.

    :param lsts: a ``[number, url, title, html]`` entry as produced by
        ``getBookList``; the ``html`` item is not used here.
    :param num_retries: how many more attempts to make after a
        ``urllib2.URLError``.
    :returns: ``[number, title, start_time]`` for the attempt that
        succeeded, or ``None`` once every attempt has failed with
        ``URLError``.
    """
    start_time = time.time()
    chapter_no, url, title, _html = lsts
    try:
        newpage = urllib2.urlopen(url)
        try:
            newtext = newpage.read()
        finally:
            # Always release the connection, even if the read fails.
            newpage.close()
    except urllib2.URLError as e:
        # ``reason`` may be an exception object rather than a string, so
        # format it instead of concatenating.
        print('Download error | %s' % (getattr(e, 'reason', e),))
        print('读取页面失败')
        if num_retries > 0:
            # Propagate the retry's outcome to the caller.
            return getContent(lsts, num_retries - 1)
        return None

    newd = pq(newtext)
    # NOTE(review): html() presumably yields None when the page has no
    # #content element, in which case re.sub below raises TypeError - confirm.
    content = newd('#content').html()
    # Paragraph breaks become CRLF; the inline reader script is stripped.
    content = re.sub(r'<br/><br/>', "\r\n", content)
    content = re.sub(r'<script>readx\(\);', '', content)
    content = re.sub(r'</script>', '', content)

    filename = str(chapter_no) + '.txt'
    with codecs.open(filename, 'w', 'utf-8') as fps:
        fps.write(title + "\r\n")
        fps.write(content)
    return [chapter_no, title, start_time]


def callbackfun(arg):
    """Completion hook for one finished chapter download.

    ``arg`` is read at indexes 0, 1 and 2 (chapter number, title and start
    timestamp - the same shape ``getContent`` returns). The finish time is
    sampled, but the progress line that would report these values is
    currently disabled, so nothing is printed and ``None`` is returned.
    """
    chapter_no, chapter_title, started_at = arg[0], arg[1], arg[2]
    finished_at = time.time()
    # Disabled progress output:
    # print "%s \t %s \t %-40s \t %-10ss" % (datetime.datetime.now(), chapter_no, chapter_title, finished_at-started_at)


import os
def num_file(lsts_arr,name):
    """Merge the per-chapter text files into ``<name>.txt`` and delete them.

    For every entry in ``lsts_arr`` the file ``<entry[0]>.txt`` in the
    current directory is read as UTF-8, appended to the merged file followed
    by a CRLF, and then removed. Chapters whose file is missing or unreadable
    are skipped. An empty chapter file adds nothing to the output but is
    still removed.

    :param lsts_arr: iterable of sequences whose first item is the chapter
        number, in the order the chapters should appear.
    :param name: base name of the merged file, either text or UTF-8 encoded
        bytes. The file is opened in append mode, so existing content is
        kept.
    """
    # Only decode when given bytes; decoding text that is already unicode
    # would raise.
    if isinstance(name, bytes):
        name = name.decode('utf-8')

    with codecs.open(name + '.txt', 'a', 'utf-8') as merged:
        for lsts in lsts_arr:
            chapter_path = str(lsts[0]) + '.txt'
            try:
                with codecs.open(chapter_path, 'r', 'utf-8') as chapter:
                    chapter_text = chapter.read()
                if chapter_text:
                    merged.write(chapter_text)
                    # CRLF, matching the line endings written per chapter.
                    merged.write('\r\n')
                os.remove(chapter_path)
            except (IOError, OSError, UnicodeError):
                # Best effort: a chapter that failed to download has no
                # file; skip it and carry on with the rest.
                continue
if __name__ == '__main__':
    # Script entry point: list the chapters, download them in parallel,
    # then stitch the per-chapter files into one book file.

    # Index page of the book to download.
    urls = 'http://www.qu.la/book/401/'
    chapters = getBookList(urls)

    # Fan the downloads out over a pool of 50 worker processes. The async
    # results are not collected; each worker writes its own chapter file.
    workers = Pool(50)
    for chapter in chapters:
        workers.apply_async(getContent, (chapter,))

    # Stop accepting work and wait for every download to finish.
    workers.close()
    workers.join()

    # Merge the chapter files, in index order, into a single text file.
    num_file(chapters, '测试一下')
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值