Selenium 自动化获取起点中文网排行榜

import re
import openpyxl
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import etree
from html import unescape

def getHtml(url):
    """Download a page with ``requests`` and return its body as text.

    A desktop-browser ``User-Agent`` header is sent with the request; the
    original author's note says a plain request did not return the page
    source.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or ``None`` when the request fails
        (connection problem, no answer within the 5 second timeout, or an
        HTTP error status reported by ``raise_for_status``).
    """
    # The header name is spelled with a hyphen. With an underscore the value
    # is sent under a different header name and is not a User-Agent at all.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    }
    try:
        r = requests.get(url, timeout=5, headers=headers)
        r.raise_for_status()
    except requests.RequestException as e:
        # Best effort: report the actual error and let the caller decide
        # what to do with a missing page.
        print(f"getHtml: request for {url!r} failed: {e}")
        return None
    return r.text


def getInfo(text):
    """Extract one book's metadata from the HTML of a book detail page.

    The values are read from these elements of the page:

    * ``<meta name="description" content="...">`` -- the text between
      "创作的" and "小说" is taken as the first category.
    * ``<meta property="og:novel:category" content="...">`` -- second category.
    * ``<meta property="og:title" content="...">`` -- book title.
    * ``<meta property="og:novel:author" content="...">`` -- author.
    * ``<meta property="og:novel:status" content="...">`` -- status.
    * ``<p class="count"><em>...</em></p>`` -- word count.
    * ``<p id="book-intro-detail">`` -- introduction (its text nodes joined).

    Args:
        text: HTML source of the page. ``None`` or an empty string is
            accepted and produces no record.

    Returns:
        A ``zip`` iterator yielding a single 7-tuple
        ``(nofollow, category, title, author, status, count, intro)``.
        A field that cannot be found is an empty string, so one missing tag
        no longer discards the whole record. The iterator is empty when
        there is nothing to parse.
    """
    if not text:
        return zip()

    tree = etree.HTML(text)
    if tree is None:
        return zip()

    def meta_content(attr, value):
        # Read the attribute value directly instead of serializing the tag
        # and regex-matching it, so attribute order and quoting do not matter.
        hits = tree.xpath('//meta[@%s="%s"]/@content' % (attr, value))
        return str(hits[0]) if hits else ""

    description = meta_content("name", "description")
    genre = re.search('创作的(.+?)小说', description)
    nofollow = genre.group(1) if genre else ""

    category = meta_content("property", "og:novel:category")
    title = meta_content("property", "og:title")
    author = meta_content("property", "og:novel:author")
    status = meta_content("property", "og:novel:status")

    counts = tree.xpath('//p[@class="count"]/em[1]/text()')
    count = str(counts[0]) if counts else ""

    intro = "".join(tree.xpath('//p[@id="book-intro-detail"]/text()'))

    record = (nofollow, category, title, author, status, count, intro)
    print(record)

    # Keep returning a zip of one-element columns, as callers iterate over
    # the result and unpack seven values per item.
    return zip(
        [nofollow], [category], [title], [author], [status], [count], [intro]
    )


def save(info, text, y, data, ws):
    """Append one row to ``ws`` for every parsed record in ``data``.

    Args:
        info: Label written to the first column of each row.
        text: Value written to the second column of each row.
        y: Accepted for interface compatibility; not used here.
        data: Iterable of 7-item records
            ``(nofollow, category, title, author, status, count, intro)``.
        ws: Object with an ``append(row)`` method that receives each row
            as a list.
    """
    for record in data:
        nofollow, category, title, author, status, count, intro = record
        row = [info, text]
        row.extend((nofollow, category, title, author, status, count, intro))
        ws.append(row)


if __name__ == '__main__':
    # Workbook that collects one row per visited book.
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = '起点中文网月票榜'
    # Header row.
    ws.append(['标题', "链接", '品类一', '品类二', '书名', '作者', '状态', '字数', '简介'])

    url = "https://www.qidian.com/rank/yuepiao/year2022/"

    # x picks an entry of the month drop-down counted from its end (10..12),
    # y is the ranking page (1..3) and z the position on that page (1..20).
    for x in range(10, 13):
        for y in range(1, 4):
            for z in range(1, 21):
                # On page 3 only the first 10 entries are visited.
                if y == 3 and z > 10:
                    continue

                options = webdriver.ChromeOptions()
                options.add_argument("disable-blink-features=AutomationControlled")
                browser = webdriver.Chrome(options=options)
                try:
                    browser.maximize_window()
                    browser.get(url)

                    # Open the month drop-down and pick the month.
                    # NOTE(review): the original comment said entries 4, 5
                    # and 6 are December, November and October, which does
                    # not obviously match x in 10..12 -- confirm which months
                    # months[-x] actually selects.
                    browser.find_element(by=By.ID, value='month').click()
                    months = browser.find_elements(by=By.CLASS_NAME, value='lbf-combobox-item')
                    months[-x].click()
                    print(len(months))

                    # Go to ranking page y.
                    pagination = browser.find_elements(by=By.CLASS_NAME, value='lbf-pagination-page  ')
                    print(len(pagination))
                    pagination[y - 1].click()

                    # Click entry z of the current page.
                    element = browser.find_elements(by=By.CLASS_NAME, value='book-img-box')
                    print(len(element))
                    element[z - 1].click()

                    # Continue in the second window handle to read the URL
                    # of the book page that the click opened.
                    browser.switch_to.window(browser.window_handles[1])
                    currentPageUrl = browser.current_url
                    print(currentPageUrl)

                    text = getHtml(currentPageUrl)
                    if text:
                        data = getInfo(text)
                        label = f"year2022-month{x}-page{y}-option{(y - 1) * 20 + z}"
                        save(label, currentPageUrl, y, data, ws)
                    else:
                        # getHtml returns None on failure; skip this book
                        # instead of handing None to the parser.
                        print(f"skipped {currentPageUrl}: page could not be downloaded")

                    # Save after every book so progress survives a later failure.
                    wb.save('qidian.xlsx')
                finally:
                    # quit() ends the whole browser session. close() would
                    # only close the current window and leave the other
                    # window and the driver process running each iteration.
                    browser.quit()

在这里插入图片描述

参考:【爬虫实战】起点中文网排行榜(XPath)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值