第六周学习内容总结(爬虫复习，js基础)

最新推荐文章于 2024-07-12 14:10:05 发布

weixin_43602233

最新推荐文章于 2024-07-12 14:10:05 发布

阅读量118

点赞数

分类专栏：初学爬虫　文章标签：javascript

本文链接：https://blog.csdn.net/weixin_43602233/article/details/112703256

版权

第六周学习内容总结(爬虫复习，js基础)

使用第三方服务进行验证码自动识别

使用超级鹰进行验证码识别自动登录b站，并且获取当日排行榜前一百

import io
import time

import xlwt
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from chaojiying import ChaojiyingClient # 导入超级鹰示例代码模块

# Open the bilibili login page in a new Chrome session.
driver = webdriver.Chrome()
driver.get('https://passport.bilibili.com/login')
wait = WebDriverWait(driver, 10)

# Fill in the credentials (left blank here on purpose) and submit the form.
driver.find_element_by_id('login-username').send_keys('')  # account name
driver.find_element_by_id('login-passwd').send_keys('')  # password
driver.find_element_by_class_name('btn.btn-login').click()
time.sleep(5)  # fixed delay before the captcha panel is looked up

captcha = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div[6]/div/div/div[2]')

# Read the element's position and dimensions from the WebElement's
# `location` and `size` properties; they define the region to cut out of
# the full-page screenshot.
origin = captcha.location
extent = captcha.size
print(origin, extent)

# The numeric offsets below are hand-tuned adjustments to the element box;
# the source does not explain them.
crop_width = extent['width']
crop_height = extent['height'] - 32
crop_box = (
    origin['x'],
    origin['y'] - 46,
    origin['x'] + crop_width - 40,
    origin['y'] + crop_height + 20,
)

# Take a screenshot of the page, keep a copy on disk, then crop the captcha.
page_shot = Image.open(io.BytesIO(driver.get_screenshot_as_png()))
page_shot.save('screenshot.png')
captcha_shot = page_shot.crop(crop_box)

# Serialise the cropped captcha to PNG bytes in memory and preview it.
png_bytes = io.BytesIO()
captcha_shot.save(png_bytes, format='png')
captcha_shot.show()

# Send the captcha to the Chaojiying recognition service. The first two
# arguments are the Chaojiying account name and password (blank here).
# NOTE(review): '900260' and '9004' look like a software id and a captcha
# type code -- confirm against the ChaojiyingClient sample module.
result = ChaojiyingClient('', '', '900260').post_pic(png_bytes.getvalue(), '9004')
print(result)
# Act on the recognition result: on success, click the captcha points, open
# the ranking page and export the ranking table to an .xls file; otherwise
# just report the failure.
if result['err_no'] == 0:
    # Simulate the mouse clicks with an ActionChains object.
    ac = ActionChains(driver)
    pic_str = result['pic_str']
    # `pic_str` is split on '|' into points, each point being "x,y".
    for item in pic_str.split('|'):
        x, y = map(int, item.split(','))
        # Move the mouse onto the captcha element at the given offset and
        # click there.
        ac.move_to_element_with_offset(captcha, x, y).click()
        # Queue the delay inside the chain: a time.sleep() here would only
        # slow down building the chain, not space out the clicks that are
        # executed later by perform().
        ac.pause(0.5)
    ac.perform()
    sure_button = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div[6]/div/div/div[3]/a/div')
    sure_button.click()

    # Poll for the entry instead of looking it up immediately, so the page
    # has up to 10 seconds to load after the captcha is confirmed.
    # WebDriverWait ignores NoSuchElementException while polling and raises
    # TimeoutException if the element never appears.
    wait = WebDriverWait(driver, 10)
    hot = wait.until(
        lambda d: d.find_element_by_xpath('/html/body/div[2]/div/div[1]/div[3]/div/div[1]/ul/li[3]/a/div/i')
    )
    hot.click()
    time.sleep(3)
    rank = driver.find_element_by_xpath('/html/body/div[3]/div[1]/div[4]/div/span')
    rank.click()
    time.sleep(3)

    # Collect one list of elements per exported column.
    titles = driver.find_elements_by_xpath('/html/body/div[3]/div[2]/div[2]/ul/li/div[2]/div[2]/a')
    plays = driver.find_elements_by_xpath('/html/body/div[3]/div[2]/div[2]/ul/li/div[2]/div[2]/div[1]/span[1]')
    ups = driver.find_elements_by_xpath('/html/body/div[3]/div[2]/div[2]/ul/li/div[2]/div[2]/div[1]/a/span')
    scores = driver.find_elements_by_xpath('/html/body/div[3]/div[2]/div[2]/ul/li/div[2]/div[2]/div[2]/div')

    wb = xlwt.Workbook()  # create the workbook
    sheet = wb.add_sheet('Python')  # create the worksheet
    col_name = ('排名', '标题', '播放量', 'up主', '综合得分')
    for index, name in enumerate(col_name):
        sheet.write(0, index, name)  # (row index, column index, value)

    # Column 0 holds the rank numbers 1..100.
    for row in range(1, 101):
        sheet.write(row, 0, row)

    # Remaining columns: (column index, elements, attribute to read).
    # Data rows start at 1 because row 0 is the header.
    columns = (
        (1, titles, 'text'),
        (2, plays, 'textContent'),
        (3, ups, 'textContent'),
        (4, scores, 'innerHTML'),
    )
    for col, elements, attribute in columns:
        for row, element in enumerate(elements, start=1):
            sheet.write(row, col, element.get_attribute(attribute))

    wb.save('bilibili排行.xls')
    print('结束！')
else:
    print('出错')

js语言基础

学习一些前端知识，便于我们进行爬虫工作。

把浏览器窗口封装成一个对象（BOM），把HTML页面封装成一个对象（DOM）。

JavaScript —> ECMAScript（ES，语法规范）+ BOM（Browser Object Model，对应 window 对象）+ DOM（Document Object Model，对应 document 对象）

// number - 整数和浮点数

// string - 字符串(单引号或双引号)

// boolean - true/false

// null - 空类型

// undefined - 未定义

// Symbol - 符号类型

// object - 对象类型(function/array)

用弹窗做猜数字游戏

<!DOCTYPE html>
<html lang = "en">
    <head>
        <meta charset="UTF-8">
        <meta name="view" content="width=device-width, initial-scale=1.0">
        <tit

最低0.47元/天解锁文章

weixin_43602233

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
第六周学习内容总结(爬虫复习，js基础)

第六周学习内容总结(爬虫复习，js基础)使用第三方服务进行验证码自动识别使用超级鹰进行验证码识别自动登录b站，并且获取当日排行榜前一百import ioimport timeimport xlwtfrom PIL import Imagefrom selenium import webdriverfrom selenium.webdriver import ActionChainsfrom selenium.webdriver.support.wait import WebDrive
复制链接

扫一扫