代码简介
代码由来:闲来无聊,写了一个笔趣阁(https://www.xuliehao.org)的小说爬取
爬取到的小说直接存储在同文件目录下
使用前提
在同目录下需要有一个存储有ip的“代理.txt”文件,ip格式如下:每行一条代理,各字段以空格分隔,顺序为“协议 端口 型号 ip”(例如:HTTP 8080 高匿 1.2.3.4),只有型号为“高匿”的代理会被使用
此爬虫采取了IP代理的方式防止被服务器发现
使用方式
手动将第一个url改为封面链接即可
代码如下:
# coding=utf-8
"""
timeout=3 #单次请求的最长等待时间(秒)
使用前将小说封面页链接同时赋给url和last_url
"""
import requests
from lxml import etree
import random
import time
# Start of the run; the elapsed time is reported once the download finishes.
t1 = time.time()
header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}

# Cover (table-of-contents) page of the novel to download; edit this to pick
# another book.
url = "https://www.xuliehao.org/novel12602/"
# The chapter loop compares each "next chapter" URL against this value and
# stops when they are equal.
last_url = url

# The notes in this file give 3 seconds as the maximum request time; without
# an explicit timeout a stalled connection would block forever.
ret = requests.get(url=url, headers=header, verify=False, timeout=3)
url = "https://www.xuliehao.org"
data = ret.content
html = etree.HTML(data)

# The output file is named after the page's first <h1> text.
File_name = "./" + html.xpath('//h1/text()')[0] + ".txt"
# Create (or truncate) the output file. utf-8 matches the encoding used when
# the chapters are appended later; the `with` block closes the file itself.
with open(File_name, "w", encoding="utf-8") as f:
    f.write("")

url1 = html.xpath('//div[@id="list"]/dl')
# NOTE(review): index 9 presumably skips a "latest chapters" section listed
# before the full chapter list - confirm against the site's markup.
url += url1[0].xpath('./dd/a/@href')[9]
print(File_name, url)
# url = 'https://www.xuliehao.org/novel5/1970.html'
# Proxy pool: one dict per usable line of the proxy list file.
x = []
with open("./代理.txt", "r", encoding="utf-8") as f:
    for c in f:
        # Fields are whitespace-separated in the order:
        # protocol, port, anonymity type, ip.
        # split() without arguments also drops the trailing newline / "\r"
        # and tolerates repeated spaces.
        ret = c.split()
        if len(ret) < 4:
            # Blank or truncated line: skip it instead of raising IndexError.
            continue
        items = {
            "协议": ret[0],
            "端口": ret[1],
            "型号": ret[2],
            "ip": ret[3],
        }
        x.append(items)
def model(x):
    """Pick a random high-anonymity proxy and format it for ``requests``.

    Args:
        x: List of proxy records. Each record is a dict with the keys
            "协议" (protocol), "端口" (port), "型号" (anonymity type)
            and "ip".

    Returns:
        A one-entry dict mapping the lower-cased protocol to "ip:port",
        suitable for the ``proxies`` argument of ``requests.get``.

    Raises:
        ValueError: If ``x`` contains no record whose type is "高匿"
            (previously this case looped forever, or raised ValueError
            from ``random.randint`` when ``x`` was empty).
    """
    # Filter first so that an unusable list fails fast instead of spinning.
    candidates = [record for record in x if record["型号"] == "高匿"]
    if not candidates:
        raise ValueError("no high-anonymity (高匿) proxy available")

    chosen = random.choice(candidates)
    # requests looks proxies up by the lower-case URL scheme, so a key such
    # as "HTTP" would never match and the proxy would be silently ignored.
    return {chosen["协议"].lower(): chosen["ip"] + ":" + chosen["端口"]}
def _format_heading(title):
    """Return the heading text written in front of a chapter body.

    A title containing neither "第" nor "章" and made of at least two
    space-separated parts is rewritten as "第<part 1>章 <remaining parts>".
    Any other title is used unchanged. The heading is surrounded by three
    CRLF line breaks on each side.
    """
    blank = "\r\n\r\n\r\n"
    parts = title.split(" ")
    if "第" not in title and "章" not in title and len(parts) >= 2:
        # parts[1] is used as the chapter number; everything after it is
        # appended, each piece preceded by a single space.
        rest = "".join(" " + piece for piece in parts[2:])
        return blank + "第" + str(parts[1]) + "章" + rest + blank
    # Titles without a space used to raise IndexError; fall back to the raw
    # title for them.
    return blank + title + blank


# Number of characters written to the output file so far.
size = 0
# How many different proxies to try for one page before giving up.
max_attempts = 5

while url != last_url:
    # Fetch the chapter page through a randomly chosen proxy, retrying with
    # another proxy when the request fails or exceeds the 3 second limit.
    ret = None
    for attempt in range(1, max_attempts + 1):
        proxies = model(x)
        try:
            ret = requests.get(url, headers=header, proxies=proxies, verify=False, timeout=3)
            break
        except requests.RequestException as exc:
            print("request failed ({}/{}): {}".format(attempt, max_attempts, exc))
    if ret is None:
        print("giving up on {}".format(url))
        break

    data = ret.content
    html = etree.HTML(data)
    if html is None:
        print("empty page at {}; stopping".format(url))
        break

    tittle = html.xpath('//h1/text()')
    div = html.xpath('//div[@id="content"]/text()')
    next_href = html.xpath('//a[text()="下一章"]/@href')
    if not tittle:
        # Without a title the page is not a chapter page; stop cleanly so
        # the size footer is still written.
        print("no chapter title found at {}; stopping".format(url))
        break

    print("***"+tittle[0]+"***")
    print(div)
    if next_href:
        print(next_href[0])

    s = _format_heading(tittle[0])
    print(tittle[0])
    print(s)

    # Append the heading followed by the chapter text; the `with` block
    # closes the file on exit.
    body = "".join(str(piece) for piece in div)
    with open(File_name, "a", encoding="utf-8") as f:
        f.write(s)
        f.write(body)
    size += len(s) + len(body)

    if not next_href:
        # No "next chapter" link: the chapter above has been saved, so end
        # the loop instead of crashing on an empty result.
        print("no next-chapter link at {}; stopping".format(url))
        break
    url = "https://www.xuliehao.org" + next_href[0]

t2 = time.time()
# Footer: blank lines followed by the total number of characters written.
with open(File_name, "a", encoding="utf-8") as f:
    f.write("\r\n\r\n\r\n")
    f.write(str(size))
print("大小为:{}".format(size))
print("时间:{}s".format(t2-t1))
(注:此代码为本人学习所创,不可用于其他用途,如用于其他用途,与本人无关)
此文为本人在学习中所创,如有错误请多多指教
此文为本人学习所创,如有侵犯请联系本人删除
联系QQ1135999353