python3爬虫实战之一
下载顶点小说的小说,有单线程和多线程两种方式,自行体验两种方式快慢
环境先安装requests库、beautifulsoup4库(解析器使用lxml,也需要一并安装)
看心情,啥时候补个详细步骤介绍,如果我有动力的话= =
dingdian_novel_download.py
import requests
from bs4 import BeautifulSoup
import re
import sys,io
from multiprocessing.dummy import Pool as threadpool
class Novel(object):
    """Download a novel from www.booktxt.net and save it as a UTF-8 text file.

    Intended call order: ``set_url`` -> ``get_details`` ->
    ``get_content_all`` (sequential) or ``get_content_all_pool`` (threaded)
    -> ``downloadToTxt``.

    Attributes:
        title: Novel title scraped by ``get_details``.
        author: Text of the first info paragraph scraped by ``get_details``.
        content: One ``[num, section_title, payload]`` list per section.
            ``payload`` is the section URL until one of the
            ``get_content_all*`` methods replaces it with the section text.
        url: Index-page URL accepted by ``set_url``.
        allLength: Number of sections still waiting to be reported as done.
    """

    # Immutable class-level defaults, kept so ``Novel.title`` etc. still
    # resolve as before.  The mutable ``content`` list lives on the instance
    # (see ``__init__``) so that two Novel objects never share sections.
    title = ''
    author = ''
    url = ''

    # Seconds to wait on a single HTTP request before treating it as failed;
    # without a timeout ``requests`` may block forever on a stalled server.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Create an empty novel with its own, unshared section list."""
        self.title = ''
        self.author = ''
        self.content = []
        self.url = ''
        self.allLength = 0

    def set_url(self, headers):
        """Prompt on stdin until a usable index-page URL is entered.

        A URL is accepted when it starts with ``https://www.booktxt.net/``
        and a GET request to it answers with status 200.  Anything else,
        including a network error, prints an error line and prompts again.

        Args:
            headers: HTTP headers sent with the probing request.
        """
        while True:
            # Surrounding whitespace from a pasted URL would fail the prefix
            # check, so drop it first.
            url = input('Your url:').strip()
            if url.startswith('https://www.booktxt.net/'):
                try:
                    r1 = requests.get(url, headers=headers,
                                      timeout=self.REQUEST_TIMEOUT)
                except requests.RequestException:
                    # Unreachable host, timeout, ...: same as a bad URL.
                    r1 = None
                if r1 is not None and r1.status_code == 200:
                    break
            print('Error!Input again~')
            sys.stdout.flush()
        self.url = url

    def add_section(self, num, titleThis, content):
        """Append one section record ``[num, titleThis, content]``.

        Args:
            num: Ordering key of the section (used by ``downloadToTxt``).
            titleThis: Section heading.
            content: Section URL, or the section text once downloaded.
        """
        self.content.append([num, titleThis, content])

    def get_details(self, headers):
        """Scrape title, author and the section list from ``self.url``.

        Every ``<dd>`` following the second ``#list dl dt`` element becomes
        one section, numbered from 1, whose payload is the absolute URL of
        the section page.

        Args:
            headers: HTTP headers sent with the request.

        Raises:
            IndexError: If the page lacks the expected title, info paragraph
                or second ``dt`` element.
        """
        req = requests.get(self.url, headers=headers,
                           timeout=self.REQUEST_TIMEOUT)
        # The page is decoded as GBK; undecodable bytes are skipped so one
        # stray byte does not abort the whole run.
        so1 = BeautifulSoup(req.content.decode('gbk', errors='ignore'), 'lxml')
        info = '#maininfo #info '
        self.title = so1.select(info + 'h1')[0].get_text()
        self.author = so1.select(info + 'p')[0].get_text()
        # NOTE(review): presumably the first <dt> heads a "latest chapters"
        # teaser and the second one starts the full list - confirm on site.
        startTag = so1.select('#list dl dt')[1]
        for index, one in enumerate(startTag.find_all_next("dd")):
            link = one.find('a')
            self.add_section(num=index + 1,
                             titleThis=link.get_text(),
                             content='https://www.booktxt.net' + link.get('href'))

    def _fetch_section(self, one, headers):
        """Replace ``one[2]`` (a section URL) with the section text, in place.

        The placeholder '内容错误!' is stored when the request fails, the
        status is not 200, or the page has no ``#content`` element.

        Returns:
            The same ``one`` list, for use with ``map``-style helpers.
        """
        text = '内容错误!'
        try:
            r2 = requests.get(one[2], headers=headers,
                              timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            r2 = None
        if r2 is not None and r2.status_code == 200:
            so2 = BeautifulSoup(r2.content.decode('gbk', errors='ignore'),
                                'lxml')
            nodes = so2.select('#content')
            if nodes:
                text = nodes[0].get_text()
        one[2] = text
        return one

    def _report_progress(self, one):
        """Count section ``one`` as finished and print the remaining total."""
        self.allLength = self.allLength - 1
        print(str(one[0]) + ' is ok! ' + str(self.allLength) + ' left!')
        sys.stdout.flush()

    def get_content_all(self, headers):
        """Download every section one after another.

        Args:
            headers: HTTP headers sent with each section request.
        """
        self.allLength = len(self.content)
        print('all is ' + str(self.allLength))
        sys.stdout.flush()
        finished = []
        for one in self.content:
            self._report_progress(self._fetch_section(one, headers))
            finished.append(one)
        self.content = finished

    def get_content_all_pool(self, headers):
        """Download every section using a pool of four worker threads.

        Args:
            headers: HTTP headers sent with each section request.
        """
        self.allLength = len(self.content)
        print('all is ' + str(self.allLength))
        sys.stdout.flush()

        def fetch(one):
            return self._fetch_section(one, headers)

        pool = threadpool(4)
        try:
            result = []
            # imap hands results back to this (the calling) thread in input
            # order, so the progress counter is only ever touched by a single
            # thread and needs no lock.
            for one in pool.imap(fetch, self.content):
                self._report_progress(one)
                result.append(one)
        finally:
            # Always release the worker threads, even if a fetch raised.
            pool.close()
            pool.join()
        self.content = result

    def downloadToTxt(self):
        """Write the novel to ``<title>.txt`` (UTF-8) in the working directory.

        Sections are sorted by their number first.  Characters that cannot
        appear in a file name (``\\ / : * ? " < > |``) are replaced by ``_``
        in the file name only; the header line keeps the original title.
        """
        filename = re.sub(r'[\\/:*?"<>|]', '_', self.title) + '.txt'
        self.content.sort(key=lambda x: x[0])
        # ``with`` guarantees the file is closed even if a write fails.
        with open(filename, 'w', encoding='utf-8') as file:
            file.write('*****' + self.title + '-' + self.author + '*****\n\n\n')
            for one in self.content:
                file.write('### ' + one[1] + '\n\n')
                file.write(one[2] + '\n\n')
        print('End Downloads and start enjoying!!')
        sys.stdout.flush()
if __name__ == '__main__':
    # Script entry point: ask for a novel URL, scrape it, save it as text.

    # Re-wrap stdout so that everything printed is encoded as GBK.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gbk')

    # Browser-like headers handed to every Novel method called below.
    request_headers = dict([
        ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'),
        ('Referer', 'https://www.booktxt.net/'),
    ])

    book = Novel()
    book.set_url(headers=request_headers)
    book.get_details(headers=request_headers)

    # Only an explicit 'Y' or 'y' selects the thread-pool downloader; any
    # other answer (including an empty one) uses the sequential downloader.
    answer = input('Y/n to choose whether use multithreading:')
    if answer in ('Y', 'y'):
        fetch_all = book.get_content_all_pool
    else:
        fetch_all = book.get_content_all
    fetch_all(headers=request_headers)

    book.downloadToTxt()