Python爬取东方文学小说【多线程】【正则表达式】【re】
用到的模块
import os
import re
from concurrent.futures import ThreadPoolExecutor  # thread pool class

import requests
获取目录和每一章url
def book_url(url, base_url='https://www.jcdf99.com/', timeout=30):
    """Fetch a book's index page and extract chapter titles and chapter URLs.

    Args:
        url: URL of the book's table-of-contents page.
        base_url: Prefix prepended to every chapter ``href`` found on the page.
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        A ``(charter_name, charter_url)`` tuple of two parallel lists: the
        chapter titles (taken from each link's ``title`` attribute) and the
        matching chapter URLs, in the order they appear on the page. Both
        lists are empty when the expected markup is not found.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    charter_name = []
    charter_url = []

    # Without a timeout a stalled connection would block the caller forever.
    r = requests.get(url, timeout=timeout)
    # Use the encoding detected from the body rather than the header default.
    r.encoding = r.apparent_encoding

    # Section that follows the "正文" (main text) heading and holds the chapter list.
    section_re = re.compile('<dt class="jcdf_lline">正文</dt>(?P<part>.*?)</dl></div>', re.S)
    # One chapter entry: relative link in `url`, chapter title in `name`.
    link_re = re.compile('''<dd><a href="(?P<url>.*?)" title="(?P<name>.*?)">.*?</a></dd>''', re.S)

    for section in section_re.finditer(r.text):
        # Distinct loop names: the inner loop no longer shadows the outer one.
        for link in link_re.finditer(section.group('part')):
            charter_name.append(link.group('name'))
            charter_url.append(base_url + link.group('url'))

    return charter_name, charter_url
保存和获取每章内容的函数
def cha_data_save(name, url, save_dir='F:\\test\\txt\\无敌神龙养成系统', timeout=30):
    """Download one chapter and save its plain text as ``<save_dir>/<name>.txt``.

    Args:
        name: Chapter title; used as the output file name (without extension).
        url: URL of the chapter page.
        save_dir: Directory the text file is written to; created if missing.
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        None. The chapter text is written to disk as UTF-8, one extracted
        paragraph per line. An empty file is written when the expected markup
        is not found on the page.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
        OSError: if the directory or file cannot be created.
    """
    # Without a timeout a stalled connection would tie up a worker forever.
    r = requests.get(url, timeout=timeout)
    # Use the encoding detected from the body rather than the header default.
    r.encoding = r.apparent_encoding

    # Chapter body: everything between the content div and the first <BR><BR>.
    content_re = re.compile('<div id="content">(?P<part>.*?)<BR><BR>', re.S)
    # One paragraph: text between a leading space and the next '<br />'.
    paragraph_re = re.compile(' (?P<data>.*?)<br />', re.S)

    paragraphs = []
    for content in content_re.finditer(r.text):
        # A '<br />' is appended so a final paragraph without its own
        # terminator is still matched by paragraph_re.
        for paragraph in paragraph_re.finditer(content.group('part') + '<br />'):
            paragraphs.append(paragraph.group('data') + '\n')
    cha_data = ''.join(paragraphs)

    # Chapter titles come from the web page and may contain characters that
    # are not allowed in Windows file names; replace them so open() succeeds.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
    # A missing target directory would otherwise make open() fail.
    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, f'{safe_name}.txt')
    with open(path, mode='w', encoding='utf-8') as f:
        f.write(cha_data)
多线程池
if __name__ == '__main__':
    # Entry point: fetch the chapter list, then download chapters concurrently.
    # Change this URL to download a different book.
    (cha_name, cha_url) = book_url("https://www.jcdf99.com/book/14/14329/")

    # Keep each future with its chapter title so failures can be reported;
    # an exception raised inside a worker is otherwise stored on the future
    # and never surfaces.
    jobs = {}
    with ThreadPoolExecutor(100) as t:  # up to 100 worker threads
        for x, y in zip(cha_name, cha_url):  # pair each title with its URL
            jobs[t.submit(cha_data_save, name=x, url=y)] = x

    # Leaving the with-block waits for all submitted tasks, so every future
    # is finished by the time it is inspected here.
    for job, title in jobs.items():
        error = job.exception()
        if error is not None:
            print(f'{title} failed: {error!r}')

    print('---all_finish---')