python-爬虫小说
这段时间正在学习 Python,正好写了一个爬虫的代码。话不多说,直接上代码。(不足之处,请多多指教!)
在这里爬取了56书库网站的小说。
# -*- coding:utf-8 -*-
"""
爬取《牧神记》
网址:http://www.56shuku.org/files/article/html/138/138911/
"""
import requests
from bs4 import BeautifulSoup
import re
import os
import threading
import random
import time
url_1 = []  # module-level list: absolute URL of every chapter, filled by get_url()
url_2 = []  # NOTE(review): appears unused — get_url() shadows this name with a local; verify before removing
# 获取每章节小说的url
def get_url(URL):
    """Collect the URL of every chapter listed on the novel's index page.

    Parameters
    ----------
    URL : str
        Index page of the novel, e.g.
        "http://www.56shuku.org/files/article/html/138/138911/".

    Returns
    -------
    list
        The module-level list ``url_1``, with one absolute chapter URL
        appended per chapter link found.  (``load_text`` reads the global
        ``url_1`` afterwards, so this function must keep appending to it.)
    """
    res = requests.get(url=URL)
    # Decode the response once; the original decoded and then re-encoded
    # back to bytes, which was redundant — BeautifulSoup accepts str.
    soup = BeautifulSoup(res.content.decode(), "html.parser")
    for container in soup.find_all("div", class_="dccss"):
        for anchor in container.find_all("a"):
            # Read the href attribute directly instead of regex-matching
            # str(tag): the old pattern '<a href="...">' silently missed
            # anchors carrying any extra attribute.
            href = anchor.get("href")
            if href:
                url_1.append(URL + href)  # hrefs are relative to the index page
    return url_1
# 将每章小说的内容保存到本地文本中
def get_text(url, path="G:/小说/剑来_副本.txt"):
    """Download one chapter page and append its title and body to *path*.

    Parameters
    ----------
    url : str
        URL of a single chapter page.
    path : str, optional
        Output text file, opened in append mode.  NOTE(review): the
        default names a different novel (《剑来》) than the module
        docstring (《牧神记》); kept unchanged for backward compatibility.
    """
    # Pool of User-Agent strings; one is picked at random per request to
    # reduce the chance of the site blocking repeated identical requests.
    # NOTE(review): entries 3 and 4 look like one UA string accidentally
    # split in two — kept byte-identical to the original.
    agent_list = [
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)'
    ]
    headers = {
        "User-Agent": random.choice(agent_list)
    }
    response = requests.get(url=url, headers=headers)
    # Decode once; the original decoded and re-encoded back to bytes for
    # no effect before handing the content to BeautifulSoup.
    soup = BeautifulSoup(response.content.decode(), "lxml")
    name = soup.find("div", align="center")
    title = name.h1.string  # chapter title sits in the centered <div>'s <h1>
    text = soup.find_all("div", id="content")
    # Replace each run of four non-breaking spaces (the site's paragraph
    # indent) with a blank line.
    chapter = text[0].text.replace("\xa0" * 4, "\n\n")
    with open(path, "a", encoding="utf-8") as f:
        # write(), not writelines(): writelines() on a plain str iterated
        # it character by character.  The explicit f.close() the original
        # called inside the with-block was redundant and is removed.
        f.write(title)
        f.write(' ' * 2 + chapter + "\n\n\n")
# 创建存放小说的文本,并打印小说下载进度以及耗时
def load_text(path):
    """Download the whole novel to *path*, printing progress and timing.

    Removes any existing file at *path* first (``get_text`` appends, so a
    stale file would be extended rather than replaced).  Relies on the
    module-level ``URL`` and on ``get_url`` filling the global ``url_1``.

    Parameters
    ----------
    path : str
        Destination text file for the novel.
    """
    if os.path.exists(path):
        os.remove(path)  # start clean: get_text() opens in append mode
    get_url(URL)
    total = len(url_1)
    start_time = time.time()
    # enumerate replaces the original hand-maintained chapter counter.
    for chapter_no, chapter_url in enumerate(url_1, start=1):
        print("共%d章,正在下载第%d章" % (total, chapter_no))
        get_text(chapter_url, path)
    run_time = time.time() - start_time
    print("下载完成!")
    print("此本小说下载耗时%d秒!" % run_time)
if __name__ == '__main__':
    # Index page of the novel; read as a global by load_text()/get_url().
    URL = "http://www.56shuku.org/files/article/html/138/138911/"
    path = r"F:\workspace\小说\牧神记.txt"
    # Run the download on a worker thread and wait for it explicitly.
    # (With a single non-daemon thread the interpreter waited implicitly
    # at exit anyway; join() just makes that intent visible.)
    thread1 = threading.Thread(target=load_text, args=(path,))
    thread1.start()
    thread1.join()
程序运行的截图如下: