# -*- coding: utf-8 -*-
# @Author: LI
# @Date: 2016-12-26 20:50:50
# @Last Modified by: Administrator
# @Last Modified time: 2016-12-27 15:07:06
import urllib2,re,datetime,time
from pyquery import PyQuery as pq
from multiprocessing import Pool
import codecs
def getBookList(url):
page = urllib2.urlopen(urls)
text = unicode(page.read(),'gbk')
d = pq(text)
lst = d('#list').find('dd')
cnt = 0
urllst = []
for itm in lst:
dt = pq(itm).find('a')
txt_href = dt.attr('href')
txt_title = dt.text()
if txt_href!=None:
cnt += 1
if len(txt_href)<5 or len(txt_title)<1:
print '--------------------------------------'
print pq(itm).html()
print '--------------------------------------'
else:
urllst.append([cnt,urls+txt_href,txt_title,pq(itm).html()])#cnt计数,urls+txt_href链接,txt_title标题,pq(itm).html()
#print urllst
return urllst
def getContent(lsts,num_retries=5):
start_time = time.time()
id,url,title,html = lsts
#print lsts
try:
newpage = urllib2.urlopen(url)
#newtext = unicode(newpage.read(),'utf-8')
newtext = newpage.read()
newd = pq(newtext)
content = newd('#content').html()
content = re.sub('<br/><br/>',"\r\n",content)
content = re.sub('<script>readx\(\);', '', content)
content = re.sub('</script>', '', content)
filename = str(id)+'.txt'
with codecs.open(filename,'w','utf-8') as fps:
fps.write(title+"\r\n")
fps.write(content)
return [id,title,start_time]
except urllib2.URLError as e:
print 'Download error | '+e.reason
print '读取页面失败'
if num_retries>0:
getContent(lsts,num_retries-1)
def callbackfun(arg):
    """Pool success callback for getContent().

    Unpacks the ``[id, title, start_time]`` result triple and records the
    completion time.  Progress reporting is currently disabled (see the
    commented-out print below).
    """
    chapter_id, chapter_title, started_at = arg[0], arg[1], arg[2]
    finished_at = time.time()
    #print "%s \t %s \t %-40s \t %-10ss" % (datetime.datetime.now(),id, title, end_time-start_time)
import os
def num_file(lsts_arr, name):
    """Merge the per-chapter '<id>.txt' files into a single '<name>.txt'.

    Chapters are appended in list order, each followed by a blank-line
    separator, and each chapter file is deleted after being merged.
    Chapters whose file is missing or unreadable (e.g. a failed download)
    are skipped — this is a deliberate best-effort merge.

    :param lsts_arr: chapter entries from getBookList(); only ``entry[0]``
                     (the chapter id) is used to build the filename.
    :param name: UTF-8 encoded byte string used as the output filename.
    """
    name = unicode(name, 'utf-8')
    # Bug fix: the output handle was never closed; `with` guarantees the
    # buffered UTF-8 data is flushed even on an exception.
    with codecs.open(name + '.txt', 'a', 'utf-8') as fp:
        for lsts in lsts_arr:
            chapter_file = str(lsts[0]) + '.txt'
            try:
                with codecs.open(chapter_file, 'r', 'utf-8') as fi:
                    fi_txt = fi.read()
                if fi_txt:
                    fp.write(fi_txt)
                    fp.write('\n\r')
                os.remove(chapter_file)
            except (IOError, OSError):
                # Bug fix: was a bare `except:` that also hid programming
                # errors; only missing/unreadable files should be skipped.
                continue
if __name__ == '__main__':
    # Entry point: crawl the book's index page, download every chapter in
    # parallel with a process pool, then merge the per-chapter files into
    # one text file.
    urls = 'http://www.qu.la/book/401/'  # base book URL; NOTE: also read as a global by getBookList
    urllst = getBookList(urls)
    pool = Pool(50)  # 50 concurrent download workers
    for obj in urllst:
        #getContent(obj)
        pool.apply_async(func=getContent, args=(obj,))  # fire-and-forget; failures are silently dropped (no error_callback)
    pool.close()
    pool.join()  # wait for every download before merging
    num_file(urllst,'测试一下')
# @Author: LI
# @Date: 2016-12-26 20:50:50
# @Last Modified by: Administrator
# @Last Modified time: 2016-12-27 15:07:06
import urllib2,re,datetime,time
from pyquery import PyQuery as pq
from multiprocessing import Pool
import codecs
def getBookList(url):
page = urllib2.urlopen(urls)
text = unicode(page.read(),'gbk')
d = pq(text)
lst = d('#list').find('dd')
cnt = 0
urllst = []
for itm in lst:
dt = pq(itm).find('a')
txt_href = dt.attr('href')
txt_title = dt.text()
if txt_href!=None:
cnt += 1
if len(txt_href)<5 or len(txt_title)<1:
print '--------------------------------------'
print pq(itm).html()
print '--------------------------------------'
else:
urllst.append([cnt,urls+txt_href,txt_title,pq(itm).html()])#cnt计数,urls+txt_href链接,txt_title标题,pq(itm).html()
#print urllst
return urllst
def getContent(lsts,num_retries=5):
start_time = time.time()
id,url,title,html = lsts
#print lsts
try:
newpage = urllib2.urlopen(url)
#newtext = unicode(newpage.read(),'utf-8')
newtext = newpage.read()
newd = pq(newtext)
content = newd('#content').html()
content = re.sub('<br/><br/>',"\r\n",content)
content = re.sub('<script>readx\(\);', '', content)
content = re.sub('</script>', '', content)
filename = str(id)+'.txt'
with codecs.open(filename,'w','utf-8') as fps:
fps.write(title+"\r\n")
fps.write(content)
return [id,title,start_time]
except urllib2.URLError as e:
print 'Download error | '+e.reason
print '读取页面失败'
if num_retries>0:
getContent(lsts,num_retries-1)
def callbackfun(arg):
    """Pool success callback for getContent().

    Unpacks the ``[id, title, start_time]`` result triple and records the
    completion time.  Progress reporting is currently disabled (see the
    commented-out print below).
    """
    chapter_id, chapter_title, started_at = arg[0], arg[1], arg[2]
    finished_at = time.time()
    #print "%s \t %s \t %-40s \t %-10ss" % (datetime.datetime.now(),id, title, end_time-start_time)
import os
def num_file(lsts_arr, name):
    """Merge the per-chapter '<id>.txt' files into a single '<name>.txt'.

    Chapters are appended in list order, each followed by a blank-line
    separator, and each chapter file is deleted after being merged.
    Chapters whose file is missing or unreadable (e.g. a failed download)
    are skipped — this is a deliberate best-effort merge.

    :param lsts_arr: chapter entries from getBookList(); only ``entry[0]``
                     (the chapter id) is used to build the filename.
    :param name: UTF-8 encoded byte string used as the output filename.
    """
    name = unicode(name, 'utf-8')
    # Bug fix: the output handle was never closed; `with` guarantees the
    # buffered UTF-8 data is flushed even on an exception.
    with codecs.open(name + '.txt', 'a', 'utf-8') as fp:
        for lsts in lsts_arr:
            chapter_file = str(lsts[0]) + '.txt'
            try:
                with codecs.open(chapter_file, 'r', 'utf-8') as fi:
                    fi_txt = fi.read()
                if fi_txt:
                    fp.write(fi_txt)
                    fp.write('\n\r')
                os.remove(chapter_file)
            except (IOError, OSError):
                # Bug fix: was a bare `except:` that also hid programming
                # errors; only missing/unreadable files should be skipped.
                continue
if __name__ == '__main__':
    # Entry point: crawl the book's index page, download every chapter in
    # parallel with a process pool, then merge the per-chapter files into
    # one text file.
    urls = 'http://www.qu.la/book/401/'  # base book URL; NOTE: also read as a global by getBookList
    urllst = getBookList(urls)
    pool = Pool(50)  # 50 concurrent download workers
    for obj in urllst:
        #getContent(obj)
        pool.apply_async(func=getContent, args=(obj,))  # fire-and-forget; failures are silently dropped (no error_callback)
    pool.close()
    pool.join()  # wait for every download before merging
    num_file(urllst,'测试一下')