使用 Selenium 的爬虫

最新推荐文章于 2024-05-13 10:12:28 发布

stark-car

最新推荐文章于 2024-05-13 10:12:28 发布

阅读量451

点赞数 1

文章标签：爬虫 python 开发语言

本文链接：https://blog.csdn.net/m0_73421142/article/details/129280065

版权

这次我们要用 Selenium 打开一个动漫网站，然后通过搜索查看里面有哪些动漫。

from selenium.webdriver import Firefox
from selenium.webdriver.common import by
import time
from selenium.webdriver.common.keys import Keys
import requests
from urllib.parse import quote    # 可以将汉字转化为编码
import re

导入这些库

# Start a Firefox browser session controlled through Selenium.
web = Firefox()

# Load the site's entry URL in that browser.
web.get('https://www.omofun.top/?ref=lanrenao.com')

初始化浏览器，用 Selenium（自动化测试工具）打开这个网址

# Locate an element by its absolute XPath and click it.
web.find_element(by.By.XPATH, '/html/body/div/div[1]/strong/div/ul/li[5]/a/span').click()

使用 Selenium 中的 By，通过 XPath 定位目标元素并点击

        # Percent-encode the search term (e.g. Chinese characters) so it can be used in a URL
        keyword = thing
        keywords = quote(keyword) # percent-encode

因为网址中的汉字需要先进行 URL 编码，所以从 urllib.parse 导入 quote 函数，把汉字转换为百分号编码

具体源码如下

# Interactive scraper: opens the site in Firefox, then repeatedly asks the
# user for a query. The special query '今日更新' clicks a navigation entry and
# prints regex matches from the resulting page; any other query is typed into
# the site's search box and result pages 1-9 are fetched with requests.
web = Firefox()

# Loop invariants: built once instead of on every page request.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76"
}
# NOTE(review): this pattern only matches anchors whose opening tag carries
# nothing but the class attribute -- verify against the live markup.
obj2 = re.compile('<a class="module-poster-item module-item">(?P<href>.*?)title="(?P<movies>.*?)">')
obj = re.compile('<strong>(?P<movie>.*?)</strong>')

try:
    web.get('https://www.omofun.top/?ref=lanrenao.com')
    time.sleep(5)
    # Dismiss the notice shown after the page loads.
    web.find_element(by.By.XPATH, '/html/body/div/div[5]/div/strong/div/div').click()

    while True:
        thing = input('请输入你想看的动漫>>>')
        if thing == '今日更新':
            try:
                web.find_element(by.By.XPATH, '/html/body/div/div[1]/strong/div/ul/li[5]/a/span').click()
                web.switch_to.window(web.window_handles[-1])
                # Wait BEFORE reading the page source so the target page has
                # time to load; waiting afterwards (as before) had no effect.
                time.sleep(5)
                todaytext = web.page_source
                for today in obj2.findall(todaytext):
                    print(today)
            except Exception as exc:
                # Catch Exception rather than using a bare except, so that
                # Ctrl-C (KeyboardInterrupt) and SystemExit still propagate.
                print('404 系统出错了')
                print(type(exc).__name__)
        else:
            web.find_element(by.By.XPATH, '/html/body/div/strong/div/div/div[2]/div/form/div[1]/input').send_keys(thing, Keys.ENTER)

            # Switch to the most recently opened window (the search results).
            web.switch_to.window(web.window_handles[-1])

            # Percent-encode the query so it can be embedded in the URL path.
            keywords = quote(thing)

            for i in range(1, 10):
                urls = f'https://www.omofun.top/index.php/vod/search/page/{i}/wd/{keywords}.html'
                # headers must be passed by keyword: the second positional
                # parameter of requests.get is `params` (query string).
                respon = requests.get(urls, headers=headers)
                items = obj.findall(respon.text)
                # Skip the first three <strong> matches -- presumably page
                # chrome rather than titles; confirm against the page.
                item = items[3:]
                for f in item:
                    print(f)
                print(f'共有{len(item)}部动漫在这页')

            repeat = input("是否要继续查询？（输入y/n）")
            if repeat != 'y':
                break
    print('感谢你的查询！')
finally:
    # Always close the browser, even if an unexpected error ends the loop.
    web.quit()