第六周学习内容总结(爬虫复习,js基础)
- 使用第三方服务进行验证码自动识别
使用超级鹰识别验证码,自动登录B站,并获取当日排行榜前一百名(结果保存到Excel文件)
import io
import time
import xlwt
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from chaojiying import ChaojiyingClient # 导入超级鹰示例代码模块
from selenium.webdriver.common.by import By  # locator strategies for find_element(s)

# Script: log in to bilibili through the click-captcha, using the Chaojiying
# recognition service to locate the click points, then open the ranking page
# and export rank / title / play count / uploader / score to an .xls file.

# XPath prefix shared by the four per-video fields read from the ranking page.
RANK_ITEM_XPATH = '/html/body/div[3]/div[2]/div[2]/ul/li/div[2]/div[2]'

driver = webdriver.Chrome()
try:
    driver.get('https://passport.bilibili.com/login')
    wait = WebDriverWait(driver, 10)

    username_input = driver.find_element(By.ID, 'login-username')
    username_input.send_keys('')  # account
    password_input = driver.find_element(By.ID, 'login-passwd')
    password_input.send_keys('')  # password
    # The button carries two classes, so it is matched with a CSS selector
    # rather than a (single) class-name locator.
    login_button = driver.find_element(By.CSS_SELECTOR, '.btn.btn-login')
    login_button.click()

    # Fixed pause kept from the original; presumably it gives the captcha
    # image time to render before the screenshot is taken -- TODO confirm.
    time.sleep(5)
    captcha = driver.find_element(
        By.XPATH, '/html/body/div[2]/div[2]/div[6]/div/div/div[2]')

    # Use the element's location and size to cut the captcha out of a
    # full-page screenshot.
    location, size = captcha.location, captcha.size
    print(location, size)
    # The numeric adjustments below look hand-tuned for this page layout --
    # NOTE(review): re-check them if the login page changes.
    width, height = size['width'], size['height'] - 32
    left = location['x']
    top = location['y'] - 46
    right = location['x'] + width - 40
    bottom = location['y'] + height + 20

    screen_image_data = driver.get_screenshot_as_png()
    image = Image.open(io.BytesIO(screen_image_data))
    image.save('screenshot.png')
    captcha_image = image.crop((left, top, right, bottom))
    buffer = io.BytesIO()
    captcha_image.save(buffer, format='png')
    captcha_image.show()

    # First two arguments are the Chaojiying account and password.
    client = ChaojiyingClient('', '', '900260')
    result = client.post_pic(buffer.getvalue(), '9004')
    print(result)

    if result['err_no'] == 0:
        # Replay the recognised click points with the mouse. pic_str is parsed
        # as "x1,y1|x2,y2|...".
        ac = ActionChains(driver)
        pic_str = result['pic_str']
        for item in pic_str.split('|'):
            x, y = map(int, item.split(','))
            # Move to the given offset on the captcha element and click.
            # pause() queues the delay inside the chain; a time.sleep() here
            # would run before perform() and would not space the clicks.
            # NOTE(review): the origin of this offset (element centre vs.
            # top-left corner) differs between Selenium versions -- confirm.
            ac.move_to_element_with_offset(captcha, x, y).click().pause(0.5)
        ac.perform()

        sure_button = driver.find_element(
            By.XPATH, '/html/body/div[2]/div[2]/div[6]/div/div/div[3]/a/div')
        sure_button.click()

        # Actually wait (up to 10 s) for the post-login page instead of
        # looking the link up immediately after the click.
        hot = wait.until(lambda d: d.find_element(
            By.XPATH,
            '/html/body/div[2]/div/div[1]/div[3]/div/div[1]/ul/li[3]/a/div/i'))
        hot.click()
        time.sleep(3)
        rank = driver.find_element(
            By.XPATH, '/html/body/div[3]/div[1]/div[4]/div/span')
        rank.click()
        time.sleep(3)

        titles = driver.find_elements(By.XPATH, RANK_ITEM_XPATH + '/a')
        plays = driver.find_elements(
            By.XPATH, RANK_ITEM_XPATH + '/div[1]/span[1]')
        ups = driver.find_elements(By.XPATH, RANK_ITEM_XPATH + '/div[1]/a/span')
        scores = driver.find_elements(By.XPATH, RANK_ITEM_XPATH + '/div[2]/div')

        wb = xlwt.Workbook()  # create the workbook
        sheet = wb.add_sheet('Python')  # create the worksheet
        col_name = ('排名', '标题', '播放量', 'up主', '综合得分')
        for index, name in enumerate(col_name):
            sheet.write(0, index, name)  # (row index, column index, value)

        # Column 0: rank numbers 1..100.
        for rank_no in range(1, 101):
            sheet.write(rank_no, 0, rank_no)

        # Remaining columns are filled one column at a time; row 0 is the
        # header, so data rows start at 1.
        columns = (
            (1, titles, 'text'),
            (2, plays, 'textContent'),
            (3, ups, 'textContent'),
            (4, scores, 'innerHTML'),
        )
        for col, elements, attribute in columns:
            for row, element in enumerate(elements, start=1):
                sheet.write(row, col, element.get_attribute(attribute))

        wb.save('bilibili排行.xls')
        print('结束!')
    else:
        print('出错')
finally:
    # Always close the browser and its driver process, even on failure.
    driver.quit()
- js语言基础
学习一些前端知识,便于我们进行爬虫工作。
JavaScript —> ECMAScript(ES)+ BOM(Browser Object Model)+ DOM(Document Object Model)
  ECMAScript:语法规范
  BOM:把浏览器窗口封装成一个对象,即 window
  DOM:把HTML页面封装成一个对象,即 document
// number - 整数和浮点数
// string - 字符串(单引号或双引号)
// boolean - true/false
// null - 空类型
// undefined - 未定义
// Symbol - 符号类型
// object - 对象类型(function/array)
- 用弹窗做猜数字游戏
<!DOCTYPE html>
<html lang = "en">
<head>
<meta charset="UTF-8">
<meta name="view" content="width=device-width, initial-scale=1.0">
<tit