import re
import openpyxl
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import etree
from html import unescape
# 获取HTML源码,普通方式无法抓取到源码,所以需要使用代理访问后在提取内容
def getHtml(url):
    """Download ``url`` with a browser-like User-Agent and return its body.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as text, or ``None`` when the request fails
        (connection error, timeout, invalid URL, or a non-2xx status).
    """
    # Per the original author's note the page source cannot be fetched with
    # the default client, so a desktop Chrome User-Agent is presented.
    # The header name must be spelled "User-Agent"; a "User_Agent" key is
    # sent as an unrelated header and the default python-requests UA still
    # goes out.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
    try:
        r = requests.get(url, timeout=5, headers=headers)
        # Turn 4xx/5xx responses into exceptions so error pages are not parsed.
        r.raise_for_status()
    except requests.RequestException as e:
        # Best-effort fetch: report what failed and let the caller decide.
        print(f"getHtml: request for {url!r} failed: {e!r}")
        return None
    return r.text
# 解析内容
def getInfo(text):
# 获取xpath解析对象
e = etree.HTML(text)
html = etree.tostring(e, encoding="utf-8").decode("utf-8") # 防止中文类似中国的乱码。
# print(html)
# 品类一 (<meta name = "description"content = "老鹰吃小鸡创作的玄幻小说《星门:时光之主》,已更新638章,最新章节:完本感言。
# 传说,在那古老的星空深处,伫立着一道血与火侵染的红色之门。传奇与神话,黑暗与光明,无尽传说皆在这古老的门户中流淌。
# 俯瞰星门,热血照耀天地,黑暗终将离去!《星门》漫画11月18日上线!!!对漫画感兴趣的朋友可以去看看。…" > )
nofollow = e.xpath('//meta[@name="description"]')
nofollow = etree.tostring(nofollow[0], encoding="utf-8").decode("utf-8")
nofollow = re.findall('创作的(.+?)小说', unescape(nofollow))
print(nofollow)
# 品类二 (<meta property = "og:novel:category"content = "高武世界" / >)
category = e.xpath('//meta[@property="og:novel:category"]')
category = etree.tostring(category[0], encoding="utf-8").decode("utf-8")
category = re.findall('content="(.+?)"/>', unescape(category))
print(category)
# 书名 (<meta property = "og:title"content = "星门:时光之主" / >)
title = e.xpath('//meta[@property="og:title"]')
title = etree.tostring(title[0], encoding="utf-8").decode("utf-8")
title = re.findall('content="(.+?)"/>', unescape(title))
print(title)
# 作者 (<meta property = "og:novel:author"content = "老鹰吃小鸡" / >)
author = e.xpath('//meta[@property="og:novel:author"]')
author = etree.tostring(author[0], encoding="utf-8").decode("utf-8")
author = re.findall('content="(.+?)"/>', unescape(author))
print(author)
# 状态 (<meta property = "og:novel:status"content = "完本" / >)
status = e.xpath('//meta[@property="og:novel:status"]')
status = etree.tostring(status[0], encoding="utf-8").decode("utf-8")
status = re.findall('content="(.+?)"/>', unescape(status))
print(status)
# 字数 (<p class="count"><em>647.82万</em> ……</p>)
count = e.xpath('//p[@class="count"]/em[1]/text()')
print(count)
# 简介 (<p id="book-intro-detail"> 传说,在那古老的星空深处,伫立着一道血与火侵染的红色之门。<br>)
intro = e.xpath('//p[@id="book-intro-detail"]/text()')
intro = "".join(intro)
intro = [intro]
print(intro)
return zip(nofollow, category, title, author, status, count, intro)
# 保存数据
def save(info, text, y, data, ws):
    """Append one worksheet row per scraped record.

    Args:
        info: Label written to the first column of every row.
        text: Value written to the second column of every row.
        y: Accepted for call compatibility; not used by this function.
        data: Iterable of field tuples; each tuple's items follow ``info``
            and ``text`` in the row, in order.
        ws: Object exposing ``append(list)``, such as an openpyxl worksheet.
    """
    for fields in data:
        ws.append([info, text, *fields])
RANK_URL = "https://www.qidian.com/rank/yuepiao/year2022/"


def _scrape_ranked_book(x, y, z, ws):
    """Scrape the z-th book on page y of month x's ranking and append it to ``ws``.

    A fresh Chrome session is started for the book and is always shut down
    again, even when a lookup or click fails.

    Args:
        x: Month selector; the combobox item at index ``-x`` is clicked.
        y: 1-based ranking page number.
        z: 1-based position of the book on that page.
        ws: Worksheet the resulting row is appended to.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("disable-blink-features=AutomationControlled")
    browser = webdriver.Chrome(options=options)
    try:
        browser.maximize_window()
        browser.get(RANK_URL)
        # Choose the month. Per the original author's note, index -x of the
        # combobox items corresponds to month x (verify if the page changes).
        browser.find_element(by=By.ID, value='month').click()
        months = browser.find_elements(by=By.CLASS_NAME, value='lbf-combobox-item')
        months[-x].click()
        print(len(months))
        # Choose the ranking page. The class name is given without the
        # trailing space the selector previously carried.
        pagination = browser.find_elements(by=By.CLASS_NAME, value='lbf-pagination-page')
        print(len(pagination))
        pagination[y - 1].click()
        # Choose the book on that page.
        element = browser.find_elements(by=By.CLASS_NAME, value='book-img-box')
        print(len(element))
        element[z - 1].click()
        # Switch to the second window handle to read the book page URL
        # (assumes the click opened the book page in a new tab).
        browser.switch_to.window(browser.window_handles[1])
        currentPageUrl = browser.current_url
        print(currentPageUrl)
        text = getHtml(currentPageUrl)
        if text is None:
            # Download failed; there is nothing to parse for this book.
            print(f"skipping {currentPageUrl}: page could not be downloaded")
            return
        data = getInfo(text)
        label = f"year2022-month{x}-page{y}-option{(y - 1) * 20 + z}"
        save(label, currentPageUrl, y, data, ws)
    finally:
        # quit() ends the whole session (all windows and the driver process);
        # close() would only close the currently focused tab.
        browser.quit()


if __name__ == '__main__':
    # Create the workbook and the sheet that receives the results.
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = '起点中文网月票榜'
    # Header row.
    ws.append(['标题', "链接", '品类一', '品类二', '书名', '作者', '状态', '字数', '简介'])
    # Months 10-12, pages 1-3, up to 20 books per page.
    for x in range(10, 13):
        for y in range(1, 4):
            for z in range(1, 21):
                # Only the first 10 entries of page 3 are collected, i.e. at
                # most 50 books per month.
                if y == 3 and z > 10:
                    continue
                _scrape_ranked_book(x, y, z, ws)
                # Save after every book so progress survives a later failure.
                wb.save('qidian.xlsx')
# Selenium automation for scraping the Qidian ranking list
# First published 2023-08-24 20:50:05