咳,寒假无聊看起小说,收费章节,日常盗版。。。
然后一搜一堆广告看着就烦人噢
py爬虫系列
import requests
import time
from bs4 import BeautifulSoup
# Request headers: an (empty) cookie slot plus a desktop-Chrome User-Agent so
# the site serves the normal page instead of blocking an obvious bot client.
header = {
    'cookie': '',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ),
}
def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or "" on any request failure.

    Fix: the module-level ``header`` dict (with the browser User-Agent) was
    defined but never sent; it is now passed on the request. The bare
    ``except:`` is narrowed to ``requests.RequestException`` so programming
    errors (e.g. NameError) are no longer silently swallowed.
    """
    try:
        r = requests.get(url=url, headers=header, timeout=30)
        r.raise_for_status()  # raise on 4xx/5xx responses
        # site charsets are often mis-declared; trust the sniffed encoding
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("网页爬取失败")
        return ""
def getStochList(lst, stockURL):
    """Fetch one chapter page and append its body text and title to *lst*.

    Appends two items in order: first the chapter body (with the page's
    indent characters turned into newlines), then the page title with the
    leading "<title>" tag and the 12-character site suffix sliced off.

    Raises ValueError if the chapter <div id="content"> is missing (layout
    change or failed fetch) instead of dying with an opaque AttributeError.
    Returns "" to keep the original call contract.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find(id='content')  # the novel-text div
    if content is None:
        raise ValueError("未找到正文 div#content: " + stockURL)
    body = content.text.replace(" ", "\n")
    # str(soup.title) looks like "<title>NAME - SITE</title>"; the slice
    # drops "<title>" (7 chars) and the trailing site suffix (12 chars)
    title = "\n" + str(soup.title)[7:-12] + "\n"
    lst.append(body)
    lst.append(title)
    return ""
def getStochInfo(lst, path):
    """Append one chapter to the text file at *path* and return "".

    lst[1] holds the chapter title, lst[0] the body text; the title is
    written first so chapters are separated by their headings in the file.
    """
    title_then_body = (lst[1], lst[0])
    with open(path, 'a', encoding="utf-8") as out:
        out.writelines(title_then_body)
    return ""
def getNextChapter(url):
    """Return the file name (e.g. "19137772.html") of the last chapter link on *url*.

    On these chapter pages the last anchor whose href contains '.html' is the
    "next chapter" navigation link.

    Fixes: no longer shadows the builtin ``list``; anchors without an href
    attribute no longer raise KeyError (skipped via ``.get``).

    Raises IndexError if the page has no '.html' links (e.g. fetch failed).
    """
    html = getHTMLText(url)
    soup = BeautifulSoup(html, 'html.parser')
    pages = [
        a.attrs['href'].split("/")[-1]
        for a in soup.find_all('a')
        if '.html' in a.attrs.get('href', '')
    ]
    return pages[-1]
def main(url, path):
    """Scrape the chapter at *url*, append it to *path*, and return the
    file name of the following chapter."""
    chapter = []
    getStochList(chapter, url)   # parse the page into [body, title]
    getStochInfo(chapter, path)  # append title + body to the output file
    return getNextChapter(url)   # next chapter's file name
if __name__ == '__main__':
    a = '19137771.html'  # file name of the first chapter to fetch
    url = 'https://www.*****.com/files/article/html/33/33333/'
    depth = 214  # number of chapters to crawl
    path = 'D://deskpe//小说//我的细胞监狱.txt'  # output file
    for i in range(depth):
        try:
            a = main(url + a, path)
            # overwrite the same console line with a percentage progress bar
            print('\r当前速度:{:.2f}%'.format((i / depth) * 100), end='')
            time.sleep(3)  # throttle: be polite to the server
        except Exception as e:
            # was `except ArithmeticError`, which can never be raised here —
            # any real network/parse failure would have crashed the whole run.
            # Report the error and keep going with the next iteration.
            print("出错了", e)