1. 单进程版本 (single-process version):
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests, sys
"""
类说明:下载《笔趣看》网小说《一念永恒》
"""
class downloader(object):
    """Download the novel "一念永恒" from the biqukan.com site.

    Usage: call get_download_url() to collect chapter links, then
    writer() to fetch each chapter and append it to a local text file.
    """

    def __init__(self):
        self.server = 'http://www.biqukan.com/'          # site root, prepended to relative chapter links
        self.target = 'http://www.biqukan.com/1_1094/'   # novel index page
        self.names = []  # chapter titles
        self.urls = []   # absolute chapter URLs
        self.nums = 0    # number of chapters

    def get_download_url(self):
        """Fetch the index page and populate self.names, self.urls, self.nums."""
        html = requests.get(self.target).text
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find_all('div', class_='listmain')
        # Re-parse just the chapter-list <div> and collect its anchors.
        anchors = BeautifulSoup(str(div[0]), 'html.parser').find_all('a')
        # The first 15 anchors are "latest chapter" duplicates on this page -- skip them.
        # (Renamed from `list`, which shadowed the builtin.)
        chapters = anchors[15:]
        self.nums = len(chapters)
        for each in chapters:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_content(self, target):
        """Fetch one chapter page at *target* and return its body text.

        Eight consecutive non-breaking spaces mark a paragraph break on
        this site; they are replaced with blank lines.
        """
        html = requests.get(url=target).text
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find_all('div', class_='showtxt')
        return div[0].text.replace('\xa0' * 8, '\n\n')

    def writer(self):
        """Append every collected chapter to '一念永恒.txt', printing progress.

        Uses a context manager so the file is closed even if a download fails
        (the original leaked the handle on exception).
        """
        with open('一念永恒.txt', 'a', encoding='utf-8') as f:
            for i in range(self.nums):
                f.write(self.names[i])
                f.write('\n')
                f.writelines(self.get_content(self.urls[i]))
                f.write('\n\n')
                print(i)
if __name__ == "__main__":
    # Collect the chapter list, then download and write every chapter to disk.
    dl = downloader()
    dl.get_download_url()
    dl.writer()
2. 多进程版本 (multi-process version)
Note: dl.names[i] is a bs4 NavigableString, which keeps a reference to the whole parse tree and cannot be pickled when passed through Pool's args; wrapping it in str() converts it to a plain, picklable string.
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests, sys
from multiprocessing import Pool
import string
# HTTP headers sent with the index-page request; the Cookie and User-Agent
# mimic a browser session so the site serves the page normally.
# NOTE(review): the Cookie is a captured session value and will likely
# expire -- confirm it is still needed/valid.
headers={
'Cookie':r'UM_distinctid=164fd71debe478-0aed9f594fffa9-3c604504-1fa400-164fd71debf4c2; bcolor=; font=; size=; fontcolor=; width=; CNZZDATA1260938422=2084872231-1533260238-%7C1533551125',
'Host':'www.biqukan.com',
#'Referer':r'http://www.biqukan.com/1_1094/',
'User-Agent':r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
class downloader(object):
    """Collect chapter titles and URLs for the novel at ``self.target``."""

    def __init__(self):
        self.server = 'http://www.biqukan.com/'          # site root for building absolute URLs
        self.target = 'http://www.biqukan.com/1_1094/'   # novel index page
        self.names = []  # chapter titles
        self.urls = []   # absolute chapter URLs
        self.nums = 0    # chapter count

    def get_download_url(self):
        """Scrape the index page and populate names, urls and nums."""
        page = requests.get(url=self.target, headers=headers).text
        index_soup = BeautifulSoup(page, "html.parser")
        listmain = index_soup.find_all('div', class_='listmain')
        link_soup = BeautifulSoup(str(listmain[0]), "html.parser")
        # The first 15 anchors are "latest chapter" duplicates -- skip them.
        for anchor in link_soup.find_all('a')[15:]:
            self.nums += 1
            self.names.append(anchor.string)
            self.urls.append(self.server + anchor.get('href'))
def get_contents(target):
    """Fetch the chapter page at *target* and return its body text.

    Eight consecutive non-breaking spaces mark a paragraph break on
    this site; they are replaced with blank lines.
    """
    page = requests.get(url=target).text
    soup = BeautifulSoup(page, "html.parser")
    body = soup.find_all('div', class_='showtxt')[0].text
    return body.replace('\xa0' * 8, '\n\n')
def writer(name, path, texts):
    """Download one chapter and append it to a file.

    *texts* is actually the chapter URL (name kept for interface
    compatibility with existing callers); *name* is the chapter title
    and *path* the output file, opened in append mode.
    """
    chapter_body = get_contents(texts)
    with open(path, 'a', encoding='utf-8') as f:
        f.write(name + '\n')
        f.writelines(chapter_body)
        f.write('\n\n')
if __name__ == "__main__":
    # Worker pool: download up to 15 chapters concurrently.
    pool = Pool(15)
    dl = downloader()
    dl.get_download_url()
    # Fixed title: prints said 《一年永恒》 but the novel (and output file)
    # is 《一念永恒》.
    print('《一念永恒》开始下载:')
    for i in range(dl.nums):
        # str() unwraps bs4's NavigableString, which cannot be pickled
        # when passed through the pool's args.
        pool.apply_async(writer, args=(str(dl.names[i]), '一念永恒.txt', dl.urls[i]))
    pool.close()
    pool.join()
    print('《一念永恒》下载完成')