import re
import requests
import random
import os
def getHTMLText(url):
    """Download *url* and return the page text decoded with the apparent encoding.

    A random User-Agent is chosen per call and actually sent with the request
    (the original built the header pool but never passed it to requests.get).
    Returns None on any request failure instead of leaking the exception
    object to callers; callers str() the result and regex-scan it, so None
    simply yields no matches.
    """
    # Rotating User-Agent pool so requests look less uniform.
    headers = [
        {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko)"
                       " Chrome/35.0.1916.153 Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"},
    ]
    head = random.choice(headers)
    # HTTP proxy pool.  NOTE(review): these hard-coded proxies are almost
    # certainly stale; they are deliberately NOT passed to requests.get so a
    # dead proxy cannot break the download.  Pass proxies=random.choice(proxies)
    # here once a verified pool exists.
    proxies = [
        {"http": "123.206.25.108:808"},
        {"http": "61.150.96.27:36880"},
        {"http": "1.198.73.42:9999"},
    ]
    try:
        r = requests.get(url, headers=head, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx into an exception handled below
        # Let requests guess the real charset; the site is not plain ASCII.
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        # Best-effort download: any network/HTTP error degrades to "no page".
        return None
def get_chapter_name(url, link_list, title_list):
    """Scan the novel's index page and collect every chapter link and title.

    Appends to *link_list* and *title_list* in place AND returns them, so
    both the mutate-in-place and the returned-tuple calling styles work.

    Bug fix: a <li> entry that does not contain an href/title pair is now
    skipped; previously re.search returned None and g.group(1) raised
    AttributeError, aborting the whole scrape.
    """
    # str() guards against getHTMLText returning a non-string on failure.
    html = str(getHTMLText(url))
    chapter_infoes = re.findall(r'<li>.*</li>', html)
    for chapter_info in chapter_infoes:
        # raw string: the original '[\s]' relied on an invalid-escape fallback
        g = re.search(r'href="([^>"]*)"[\s]*title="([^>"]*)"', chapter_info)
        if g is None:
            continue  # navigation/decoration <li> without a chapter link
        link_list.append('http://www.doupoxs.com' + g.group(1))
        title_list.append(g.group(2))
    return link_list, title_list
def remove_punctuation(text: str) -> str:
    """Strip characters that are illegal or awkward in Windows file names.

    Bug fix: the original set ('!,;:?"\\'') missed most of the characters
    Windows actually forbids (\\ / * < > |); the new set is a superset, so
    every previously-removed character is still removed.  Leading/trailing
    whitespace is trimmed from the result.
    """
    # Windows-forbidden: \ / : * ? " < > |  plus the original cosmetic set.
    punctuation = '!,;:?"\'\\/*<>|'
    # re.escape makes the backslash and friends safe inside the char class.
    text = re.sub('[{}]+'.format(re.escape(punctuation)), '', text)
    return text.strip()
def get_chapter_text(url, title, root):
    """Fetch one chapter page and append its paragraphs to <root><title>.txt.

    Args:
        url:   chapter page URL.
        title: chapter title; sanitised before use as a file name.
        root:  output directory path, expected to end with '/'.
    """
    # str() so the regex scan never raises if the download failed.
    html = str(getHTMLText(url))
    # findall with one capture group returns the group text directly,
    # so no .group(1) is needed on the results.
    contents = re.findall("<p>(.*?)</p>", html)
    title = remove_punctuation(title)  # drop filename-illegal characters
    path = root + str(title) + '.txt'
    # Context manager guarantees the handle is closed even if a write fails
    # (the original closed manually); explicit utf-8 avoids UnicodeEncodeError
    # under a non-UTF-8 locale such as default Windows code pages.
    with open(path, 'a', encoding='utf-8') as f:
        for content in contents:
            f.write(content + "\n")
def main() -> None:
    """Entry point: scrape every chapter of the novel into per-chapter TXT files."""
    url = "http://www.doupoxs.com/nalanwudi/"
    root = "D://NALANWUDI/"
    if not os.path.exists(root):  # create the output directory on first run
        os.makedirs(root)
    link_list = []
    title_list = []
    # Bug fix: a bare getHTMLText(url) call whose result was discarded has
    # been removed -- it downloaded the index page a second time for nothing.
    get_chapter_name(url, link_list, title_list)
    total = len(link_list)
    # enumerate from 1 so progress reads 1/n .. n/n (was 0/n .. n-1/n).
    for i, (link, title) in enumerate(zip(link_list, title_list), start=1):
        get_chapter_text(link, title, root)
        print("下载进度:" + str(i) + "/" + str(total))
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
# Scrapes novels from the Doupo novel site (www.doupoxs.com) using regular expressions.
# First published 2020-05-06 23:32:33 (blog footer, kept as a comment so the file parses).