"""Scrape a web novel from xiaoshuo.sogou.com and save it to a text file.

Workflow: fetch the chapter-index page, then loop over each chapter link,
parse the article body, and append it to a local file.

NOTE: VIP (paywalled) chapters cannot be parsed, and some articles fail to
match with BeautifulSoup for unknown reasons.
"""
import urllib.request
import bs4
import time
def getURL(url):
    """Fetch *url* and return the raw response body as bytes.

    A mobile-browser User-Agent header is sent so the site serves the
    mobile markup that the parsing code in this script expects.
    """
    head = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Mobile Safari/537.36 Edg/85.0.564.44"
            }
    req = urllib.request.Request(url=url, headers=head)
    # Close the connection deterministically instead of leaking it until GC.
    with urllib.request.urlopen(req) as res:
        return res.read()
def main():
    """Entry point: download every chapter of the hard-coded book."""
    index_url = "https://xiaoshuo.sogou.com/list/926423625/"
    zhangjie(index_url)
    print('\n' + "爬取完成!")
def zhangjie(url):
    """Download every chapter listed on the index page at *url*.

    Saves the book title first, then each chapter (title + body) via save().
    Raises urllib.error.URLError on network failure.
    """
    # Fetch the index page and parse it into a soup object.
    html = getURL(url)
    time.sleep(1)  # throttle: be polite between requests
    soup = bs4.BeautifulSoup(html, "html.parser")
    # The anchor that links back to the book page carries the book title.
    # Guard against None: soup.find returns None when nothing matches,
    # and the original code would crash with AttributeError on .text.
    book_link = soup.find('a', href='/book/926423625/')
    if book_link is not None:
        save('\t\t' + book_link.text)
    # Collect every chapter link inside the chapter list box.
    a_list = soup.select(".chapter-box ul li a")
    print("一共有" + str(len(a_list)) + "章")
    for item in a_list:
        # Chapter title text lives in a nested <span>.
        title = item.find('span').text
        # hrefs on the page are site-relative; prefix the host.
        href = "https://xiaoshuo.sogou.com" + item['href']
        print('正在下载 %s...' % title)
        # Fetch and parse the chapter body.
        neirong = get_neirong(href)
        print('下载完成')
        time.sleep(1)  # throttle between chapter downloads
        save("%s\n\n%s" % (title, neirong))
def get_neirong(url):
    """Fetch the chapter page at *url* and return its text content.

    Returns a single string in which every <p> paragraph (including the
    last one) is followed by a newline, matching the original output.
    """
    # Fetch the chapter page and parse it.
    html = getURL(url)
    time.sleep(2)  # throttle: be polite between requests
    soup = bs4.BeautifulSoup(html, "html.parser")
    # All paragraphs live inside the content wrapper div.
    paragraphs = soup.find('div', id="contentWp").find_all('p')
    # ''.join is O(n); the original `neirong += text + '\n'` loop is
    # potentially quadratic in the number of paragraphs.
    return ''.join(p.text + '\n' for p in paragraphs)
def save(data):
    """Append *data* plus a three-newline separator to the output file."""
    with open("小说.txt", 'a+', encoding="utf-8") as out:
        out.write("%s\n\n\n" % data)
if __name__ == '__main__':
    # Run the scraper only when executed as a script, not when imported.
    main()