import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup
if __name__ =='__main__':
url = 'http://ajnnan.com/88_88846/'
text = requests.get(url=url)
text.encoding='utf-8'
page_text =text.text
soup = BeautifulSoup(page_text, 'lxml')
dd_list = soup.find('div',id='list')
a_list = dd_list.find_all('a')
del a_list[0:12]
print(a_list)
fp=open('./南明第一狠人.txt', 'w', encoding='utf-8')
path = r'C:\爬虫实验\xiaoshuo\ '
for a in a_list:
time.sleep(6.66)#固定间隔时长,括号内数值可以自定议,下一行同可自定义。不过间隔时长短会导致访问频繁被网站 KO
time.sleep(random.random()*3.24)#随机间隔时长 代码基本完善,不过存在方法不够完美
title_1 = a.string
title2 = re.sub(u'\\(.*?\\)','',title_1)#去除了作者求月票的行为!!!!但是作者内藏吐槽章节未去除
title = re.sub(u'\\(.*?\\)','', title2)#标题内的有两种括号,
print(title)#打印章节的名字
detail_url='http://ajnnan.com'+a['href']
print(detail_url)#打印章节的url
detail_page_text = requests.get(url=detail_url,).text #, headers=headers
detail_page_text = detail_page_text.encode("ISO-8859-1")
detail_page_text = detail_page_text.decode("utf-8")
detail_soup = BeautifulSoup(detail_page_text, 'lxml')
div_tag = detail_soup.find('div',attrs={'id':'content'})
content = div_tag.text
fp.write(title + ':' + content + '\n')
#print(content)
with open(path + title + '.txt', 'w', encoding='utf-8') as f:
f.write(title + ':' + content + '\n')
print('返回!!!'+title,'爬取成功!!!')
# NOTE: site review/censorship issue — previously published chapters could not be viewed.