# 1. Third-party libraries required: requests, bs4
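#    They can be installed with pip (bs4 is published on PyPI as beautifulsoup4):
#        pip install requests beautifulsoup4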
import os
import re

import requests
from bs4 import BeautifulSoup
from random import uniform
from time import sleep


# Fetch a page and return its decoded HTML
def url_open(url):
    headers = {}
    headers["User-Agent"] = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36")
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"  # the site is served as utf-8
    html = response.text
    return html
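
# For example, fetching the site's index of completed novels (the same URL main() uses):
#   html = url_open("https://www.suyouge.com/full.html")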
# Collect the links of every completed novel on the site
def collect_url(html, root_url):
    print("Collecting links to all completed novels on the site...")
    novel_name_all = []
    novel_url_all = []
    soup = BeautifulSoup(html, "html.parser")
    total_pages = int((soup.find("div", class_="pagelink").em.text).split("/")[1])  # total number of listing pages
    # print(total_pages)
    # Open the listing pages one by one and collect the novel links
    for page in range(1, total_pages + 1):
        url = root_url + 'modules/article/articlelist.php?fullflag=1&page={}'.format(page)
        # print(url)
        html = url_open(url)
        # Collect the novel links on the current page.
        # NOTE: the original pattern was lost when the source was rendered as HTML;
        # the pattern below is an assumed reconstruction of what it matched.
        p_novel_url = fr'<a href="({root_url}book/\d+/)">'
        novel_url_temp = re.findall(p_novel_url, html)
        # Append each link to the master list and grab the novel's title
        for novel_url in novel_url_temp:
            novel_url_all.append(novel_url)
            # Get the novel's title (the '<a href="' prefix and '</a>' suffix of this
            # pattern were also stripped in the source and are reconstructed here)
            p_novel_name = fr'<a href="{novel_url}">(.+?)</a>'
            novel_name_temp = re.findall(p_novel_name, html)[0]
            novel_name_all.append(novel_name_temp)
        break  # keeps the run short; delete this line to crawl the whole site
    data = [novel_name_all, novel_url_all]  # pack both lists so a single value can be returned
    print("Collection finished, moving on to downloading the novels...")
    sleep(1)
    return data
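
# A quick offline sanity check of the assumed link pattern (the sample HTML below
# is hypothetical; the real listing markup may differ):
#   >>> sample = '<a href="https://www.suyouge.com/book/123/">Some Novel</a>'
#   >>> re.findall(r'<a href="(https://www\.suyouge\.com/book/\d+/)">', sample)
#   ['https://www.suyouge.com/book/123/']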
# Fetch each novel's content and save it to a text file
def get_and_save_data(data):
    novel_name_all = data[0]
    novel_url_all = data[1]
    i = -1  # index used to look up each novel's title
    for novel_url in novel_url_all:
        i += 1
        novel_name = novel_name_all[i]  # the novel's title
        print()
        print("Downloading novel: 《%s》" % novel_name)
        print()
        html_1 = url_open(novel_url)
        soup_1 = BeautifulSoup(html_1, "html.parser")
        chapters_url = soup_1.find("p", class_="btnlinks").a["href"]
        # Get the URLs of all chapters
        html_2 = url_open(chapters_url)
        soup_2 = BeautifulSoup(html_2, "html.parser")
        chapters_url_all = soup_2.find_all("td", class_="L")
        # Open the chapter pages one by one, extract the text, and save it
        for each in chapters_url_all:
            chapters_url = each.a["href"]
            html = url_open(chapters_url)
            soup = BeautifulSoup(html, "html.parser")
            chapters_name = soup.find("dd").h1.text  # chapter title
            print("Downloading 《%s》: %s" % (novel_name, chapters_name))
            # Extract the chapter text
            contents = soup.find("dd", id="contents").text
            with open("%s.txt" % novel_name, "a", encoding="utf-8") as g:
                g.write("\n" * 3 + " " + chapters_name + "\n" * 3)
                g.write(" " + contents)
            sleep_time = uniform(0.35, 0.75)  # random pause so the requests don't hammer the server
            sleep(sleep_time)
        print("Novel 《%s》 has been downloaded" % novel_name)
        print("Moving on to the next novel")
        sleep(2)
        break  # keeps the run short; delete this line to crawl the whole site
# Main program
def main():
    # Set the working directory, creating it if necessary
    path = r'C:\Users\Administrator\Desktop\test'
    if os.getcwd() != path:
        if not os.path.exists(path):
            os.mkdir(path)
        os.chdir(path)
    root_url = "https://www.suyouge.com/"
    target_url = root_url + "full.html"
    data = collect_url(url_open(target_url), root_url)
    get_and_save_data(data)


if __name__ == "__main__":
    main()