from gevent import monkey, joinall
monkey.patch_all()  # key step: patch blocking I/O before importing requests
from random import randint
from urllib.parse import quote
from gevent.pool import Pool
import requests
from lxml import etree
import os
import time
from re import match, findall
def download_function(index, page_url, error_number=None):
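    """Download one chapter and save it as <index>_<title>.txt under book_dir.

    error_number marks which retry pass this is (None on the first pass);
    failures are queued on the matching error_download_list for a slower retry.
    """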
global error_download_list
global error_download_list_1
global error_download_list_2
user_Agent = {
'1': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
'2': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
'3': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'4': 'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
'5': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
"6": 'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
"7": 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
"8": 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
"9": 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
"10": 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
"11": 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
"12": 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
"13": 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'
    }  # rotating user agents, though it doesn't seem to help much???
if error_number != 3:
        time.sleep(randint(1, 20))  # the site throttles requests, such annoying rate limiting
user_agent1 = user_Agent[str(randint(1, 13))]
page_result = requests.get(url=f'https://www.xiaoshuowangzhan.com{page_url}',
headers={'user-agent': user_agent1})
if page_result.status_code != 200:
        time.sleep(randint(1, 10))  # site rate limiting
user_agent2 = user_Agent[str(randint(1, 13))]
page_result = requests.get(url=f'https://www.xiaoshuowangzhan.com{page_url}',
headers={'user-agent': user_agent2})
if page_result.status_code != 200:
            time.sleep(randint(1, 5))  # site rate limiting
user_agent3 = user_Agent[str(randint(1, 13))]
page_result = requests.get(url=f'https://www.xiaoshuowangzhan.com{page_url}',
headers={'user-agent': user_agent3})
            if page_result.status_code != 200:
if not error_number:
error_download_list.append([index, page_url])
elif error_number == 1:
error_download_list_1.append([index, page_url])
elif error_number == 2:
error_download_list_2.append([index, page_url])
                print(f'Download failed, rate-limited, error number: {error_number}')
return
page_html = etree.HTML(page_result.content)
title = page_html.xpath('//div[@class="text-center"]/h1/text()')[0]
messages = ''
content = page_html.xpath('//div[@id="txtContent"]/text()')
    for line in content:
line = str(line).replace("\t", "")
line = line.replace("\r", "")
line = line.replace("\n", "")
line = line.replace(' ', ' ')
line = line.replace(' ', ' ')
messages += f'{line}\n'
    with open(f'{book_dir}{index}_{title}.txt', 'w', encoding='utf-8') as fw:
fw.write(f'{title}\n')
fw.write(f'{messages}')
print(f'{title} Download Success')
def merge_files(file_name):
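    """Stitch the per-chapter files back together, in numeric prefix order, into <file_name>.txt."""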
file_list = os.listdir(f'{file_name}/')
    with open(f'{file_name}.txt', 'w', encoding='utf-8') as fw:
        for x in range(len(file_list) + 1):  # +1 guards against a gap shifting the max prefix past len-1
            for file in file_list:
                if match(f'^{x}_.*$', file):
                    with open(f'{file_name}/{file}', 'r', encoding='utf-8') as fr:
                        fw.write(fr.read())
                    fw.write('\n')
def str_to_headers(base_str):
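    """Parse a raw header block (as copied from the browser's devtools) into a dict."""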
    dic = {}
    for line in base_str.split('\n'):
        if line.strip():
            # partition on the first ': ' so header values containing colons stay intact
            key, _, value = line.strip().partition(': ')
            dic[key] = value
    return dic
if __name__ == '__main__':
    print('Enter the book title: ', end='')
    file_name = input()
    print(f'Title: {file_name}, starting search...')
    search_url = f'https://www.xiaoshuowangzhan.com/search.htm?keyword={quote(file_name)}'  # percent-encode the title so non-ASCII queries are safe
headers_str = """
authority: www.xiaoshuowangzhan.com
method: GET
scheme: https
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9,en;q=0.8
cache-control: max-age=0
cookie: cacheAC6=122919; cids_AC6=122919
sec-fetch-dest: document
sec-fetch-mode: navigate
sec-fetch-site: none
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36
"""
PoolSize = 50
pool = Pool(PoolSize)
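    # the Pool caps how many chapter downloads run concurrently; greenlets beyond PoolSize wait their turn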
error_download_list = []
error_download_list_1 = []
error_download_list_2 = []
headers = str_to_headers(headers_str)
headers['path'] = f'/search.htm?keyword={quote(file_name)}'
    search_result = requests.get(url=search_url)  # this silly site doesn't even check cookies, so the headers built above go unused
if search_result.status_code != 200:
        raise Exception('Search request failed')
    # parse the search results and extract the novel's id
search_html = etree.HTML(search_result.content)
FictionId = search_html.xpath('//div[@class="panel-body"]/ul/li[2]/div[2]/a/@href')[0].split('/')[2]
    # create the output directory
if not os.path.isdir(f'{file_name}'):
os.mkdir(f'{file_name}')
    book_dir = f'{file_name}/'
menu_url = f'https://www.xiaoshuowangzhan.com/book/{FictionId}/'
menu_result = requests.get(url=menu_url)
    print('Entering the chapter list')
menu_html = etree.HTML(menu_result.content)
page_list = menu_html.xpath('//ul[@id="chapters-list"]/li/a/@href')
    pool_lis = []
    for index, page in enumerate(page_list):
        pool_lis.append(pool.spawn(download_function, index, page))
        # download_function(index, page)  # serial fallback kept for debugging
    joinall(pool_lis)
    PoolSize = 30  # slow down to dodge the rate limit
pool = Pool(PoolSize)
error_download_re_download = []
    print(f'error download list length: {len(error_download_list)}')
for page in error_download_list:
error_download_re_download.append(pool.spawn(download_function, page[0], page[1], 1))
joinall(error_download_re_download)
    PoolSize = 20  # slow down further to dodge the rate limit
pool = Pool(PoolSize)
error_download_re_download = []
    print(f'error download list length1: {len(error_download_list_1)}')
for page in error_download_list_1:
error_download_re_download.append(pool.spawn(download_function, page[0], page[1], 2))
joinall(error_download_re_download)
for g in error_download_list_2:
        download_function(g[0], g[1], 3)  # one at a time, to stay under the rate limit
    print(f'Chapters still not downloaded: {error_download_list_2}')
merge_files(file_name)
    print('Download complete')
    print('Checking for missing chapters')
    li = os.listdir(book_dir)
li1 = []
for i in li:
        li1.append(int(findall(r'^(\d+)_.*$', i)[0]))
    li2 = list(range(len(li)))
missing_file = list(set(li1) ^ set(li2))
    if missing_file:
        print(f'Number of missing chapters: {len(missing_file)}, chapter numbers: {missing_file}')
    else:
        print('No missing chapters found')
    print('Program finished. END!')
Simple crawlers are still a lot of fun; I write one every now and then for practice and to brush up on gevent's coroutine-pool pattern.
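For reference, the coroutine-pool pattern the whole script is built on boils down to a few lines. A minimal sketch, with a placeholder URL and worker function:

from gevent import monkey
monkey.patch_all()  # must run before importing requests, or sockets stay blocking
from gevent.pool import Pool
import requests

def fetch(url):
    # each greenlet yields to the others while it waits on network I/O
    return requests.get(url).status_code

pool = Pool(10)  # at most 10 greenlets in flight at once
jobs = [pool.spawn(fetch, 'https://example.com') for _ in range(20)]
pool.join()  # block until every spawned greenlet has finished
print([job.value for job in jobs])  # .value is each greenlet's return value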