# Below is some of my code — I'm a beginner, go easy on me; a study check-in diary entry.
import os.path
import requests
import re
'''
对某网站某小说进行内容爬取
先获取目录1-33页章节列表
然后对每一页对应的每一个章节进行获取url
再对章节对应的两页进行循环输出写入文件
'''
if __name__ == '__main__':
    # Browser-like User-Agent so the site serves normal pages to the scraper.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.60'
    }
    # Catalog URL template; %d is the catalog page number (pages 1-33).
    url = 'http://www.mbiquge.org/indexlist/109_109086/%d/'
    if not os.path.exists("./xiao_shou"):
        os.mkdir("./xiao_shou")
    filename = './xiao_shou/从海贼开始燃烧的世界.txt'
    # Running chapter counter used to label the output.
    contain_num = 1
    # Compile the regexes once, outside the loops (raw strings: '\d' in a
    # plain string is a DeprecationWarning in modern Python).
    chapter_link_re = re.compile(r'<a href="(/109_109086/\d.*?)" rel="chapter"><dd>(.*?)</dd></a>', re.S)
    paragraph_re = re.compile(r'<p>(.*?)</p>', re.S)
    # 1. Iterate catalog pages 1..33 inclusive.
    #    (Original used range(1, 33), which silently skipped page 33.)
    for url_page in range(1, 34):
        new_url = url % url_page
        # Catalog fetch; timeout prevents the script hanging on a dead server.
        resp = requests.get(new_url, headers=head, timeout=10).text
        # Each match is a (chapter-path, chapter-title) pair.
        data_re_content = chapter_link_re.findall(resp)
        # 2. Build the per-chapter URL template from each catalog entry.
        for book_content in data_re_content:
            data_url = "http://www.mbiquge.org" + book_content[0].split('.')[0] + '_%d.html'
            # 3. Each chapter is split across two pages (1 and 2).
            for contain_page in range(1, 3):
                new_data_url = data_url % contain_page
                # Plain page fetch: GET, not POST (original used POST by mistake).
                data_page = requests.get(new_data_url, headers=head, timeout=10).text
                book_data = paragraph_re.findall(data_page)
                # Chapter/page label, e.g. "第1章第1页".
                file_book_page = "第{}章第{}页".format(contain_num, contain_page)
                print(file_book_page)
                # Append label + paragraphs. Original wrote str(list) — the raw
                # Python list repr — into the .txt; join the paragraphs instead.
                # [1:-2] trims boilerplate paragraphs — presumably header/footer;
                # TODO(review): confirm slice bounds against the site's markup.
                with open(filename, 'a', encoding='utf-8') as ws:
                    ws.write(file_book_page + "\n".join(book_data[1:-2]) + "\n")
            # Advance the chapter counter after both pages are written.
            contain_num += 1
# The run-result screenshot follows: