# 适用于某分区内的所有小说(两级网站),并简单地根据关键字赋权值筛选
# Crawls all novels in one site section (two-level site) and filters them by simple keyword weighting.
import requests
import re
from bs4 import BeautifulSoup
# 爬取网站
def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or None on failure.

    Uses apparent_encoding so pages that misreport their charset still
    decode correctly. Catches only requests' own exceptions instead of
    the original bare ``except:`` (which also swallowed KeyboardInterrupt),
    and makes the failure return value explicit.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        r.encoding = r.apparent_encoding  # guess encoding from the body
        return r.text
    except requests.RequestException:
        print("获取Url失败")
        return None
# Parse the first-level (section index) page source
def parsePage1(text1):
    """Return the list of novel URLs extracted from an index page.

    The regex pattern is a redacted placeholder (``r''``) — TODO: restore
    the real anchor pattern for the target site.

    Fix: the original did ``n.split('"')[1]`` unconditionally, which
    raises IndexError for any matched snippet that contains no double
    quote; such snippets are now skipped instead.
    """
    url1 = re.findall(r'', text1)  # raw anchor snippets for every novel on the page
    url1List = []
    for n in url1:
        parts = n.split('"')
        if len(parts) > 1:  # guard: only keep snippets with a quoted href
            url1List.append(parts[1])
    return url1List
# Crawl each novel's pages and run the keyword filter on its full text
def parsePage2(url1List):
    # NOTE(review): several site-specific values in this function were
    # redacted to placeholders ("" URL template, re.findall() with no
    # arguments, empty `remove` list); as written it cannot run until
    # they are restored — confirm against the original site.
    urlList = []
    for n in url1List:
        urlList.append("")  # URL preprocessing — placeholder: real URL template removed
    nameNum = len(urlList)
    count = 1
    for url2 in urlList:
        judge1 = 0  # flag: 1 means this novel's title is in the reject list
        print("\t第{}/{}个".format(count, nameNum))
        count += 1
        text2 = getHTMLText(url2)  # fetch the novel's first page HTML
        pageN = pageNum(text2)  # total number of pages for this novel
        name = re.findall()  # extract the novel title — NOTE(review): pattern/argument missing, raises TypeError as-is
        remove = []  # titles to exclude — placeholder, originally held unwanted names
        for n in remove:
            if n in name:
                judge1 = 1
        if judge1 == 1:
            continue  # skip rejected titles entirely
        print("\t\t{}".format(name))
        print("\t\t第1/{}页".format(pageN))
        text = content(text2)  # accumulate the novel's body text, page by page
        if pageN > 1:
            # Build the URLs of the remaining pages: the site appends
            # "_<n>" before the final extension (e.g. foo.html -> foo_2.html).
            url = url2
            sl = url.split(".")
            sp = ""
            for i in sl[:-2]:
                sp += i + "."
            # cap at 14 extra pages — presumably to bound crawl time; TODO confirm
            for n in range(2, min(pageN+1, 15)):
                print("\t\t第{}/{}页".format(n, pageN))
                url2 = sp + sl[-2] + "_" + str(n) + "." + sl[-1]
                text2 = getHTMLText(url2)
                text += content(text2)
        # Score the accumulated text; record the title if it passes
        if judge(text):
            write(name)
# Append a matching novel title to the results file
def write(name):
    """Append *name* plus a newline to ``1.txt`` (UTF-8).

    Fix: use a ``with`` statement so the file handle is closed even if
    the write raises, instead of the manual open/close pair.
    """
    with open('1.txt', 'a', encoding='utf-8') as f:
        f.write(name + "\n")
# Extract the readable body text from a novel page
def content(text2):
    """Return all stripped text found inside the selected element of *text2*.

    The CSS selector is a redacted placeholder (``''``) — TODO: restore
    the real selector for the target site.
    """
    soup = BeautifulSoup(text2, "html.parser")
    node = soup.select('')[0]  # first element matching the (placeholder) selector
    # Concatenate every whitespace-stripped text fragment in document order.
    return "".join(node.stripped_strings)
# Get the novel's page count from its first page HTML
def pageNum(text2):
    """Return the page count parsed from *text2*, defaulting to 1.

    Fixes the original's unbalanced parenthesis (a syntax error) and
    converts the first regex match — ``re.findall`` returns a list, and
    ``int`` was being applied to the whole list. The regex pattern is a
    redacted placeholder (``r''``) — TODO: restore the real pattern.
    """
    try:
        return int(re.findall(r'', text2)[0])
    except (IndexError, ValueError, TypeError):
        # no match / non-numeric match / non-string input -> single page
        return 1
# Keyword-weight scoring filter
def judge(text):
    """Return True when *text* scores a rate of at least 1 against the keyword lists.

    Scoring: +4 per matched "favorite" keyword, +1 per matched "liked"
    keyword, -10 per matched "rejected" keyword, normalized by the total
    number of positive keywords.

    Fixes: the original divided by ``length`` unconditionally, which is a
    ZeroDivisionError whenever both positive lists are empty (as they are
    in this redacted version); it also shadowed the builtin ``str``.
    The keyword lists are placeholders — TODO: restore the real keywords.
    """
    search1 = []  # strongly preferred keywords (+4 each)
    search2 = []  # mildly preferred keywords (+1 each)
    search3 = []  # rejected keywords (-10 each)
    length = len(search1) + len(search2)
    count = 0
    for word in search1:
        if word in text:
            count += 4
    for word in search2:
        if word in text:
            count += 1
    for word in search3:
        if word in text:
            count -= 10
    # Guard: with no positive keywords configured nothing can qualify.
    if length == 0:
        return False
    rate = count / length
    print("\t\t\trate: {}".format(rate))
    return rate >= 1
def main():
    """Crawl every index page of the section and filter all its novels.

    The section URL and per-page URL template are redacted placeholders
    (``""``) — TODO: restore them for the target site.
    """
    url1 = ""  # section index URL — placeholder
    text1 = getHTMLText(url1)
    # Total number of index pages.
    # Fix: the original line had an unbalanced parenthesis (syntax error)
    # and applied int() to the findall list instead of its first match.
    indexPageNum = int(re.findall(r'', text1)[0])
    url1List = parsePage1(text1)  # novels listed on the landing page
    parsePage2(url1List)
    # NOTE(review): the loop starts at 1, which may re-crawl the first
    # index page via the numbered template — confirm against the site.
    for indexN in range(1, indexPageNum+1):
        print("第{}/{}页".format(indexN, indexPageNum))
        url1 = "" + str(indexN) + ".html"  # numbered index page URL — placeholder template
        text1 = getHTMLText(url1)
        url1List = parsePage1(text1)
        parsePage2(url1List)

# Guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    main()