from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title">asdfasdf<b id="bbb" class="boldest">The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

"""
Searching the document tree:
find: return the first match
find_all: return every match
Five kinds of filters: string, regular expression, list, True, function
"""

# 1. String filter: match tag names / attribute values literally
res = soup.find(name='a', id='link2')  # the <a> whose id is link2
print(res)  # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

res = soup.find(href='http://example.com/tillie')  # tag whose href equals the given value
print(res)  # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

res = soup.find(class_='story')  # first tag whose class is story
print(res)

res = soup.body.find('p')  # searching composes with tree navigation
print(res)

res = soup.body.find(string='Elsie')
print(res)  # Elsie

res = soup.find(attrs={'class': 'sister'})  # search by attribute dict
print(res)  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

# 2. Regular expression: tag names and attribute values may be regex-matched
import re

res = soup.find_all(name=re.compile('^b'))  # tags whose name starts with "b"
print(res)

res = soup.find_all(href=re.compile('^http'))  # e.g. collect every link on a page
for item in res:
    url = item.attrs.get('href')
    print(url)

res = soup.find(attrs={'href': re.compile('^a')})
print(res)

# 3. List filter: tag name / attribute equals ANY element of the list (OR)
res = soup.find_all(class_=['story', 'sister'])  # class is story OR sister
print(res)

res = soup.find_all(name=['a', 'p'])  # <a> OR <p> tags
print(res)

# 4. True filter: the tag simply has that name/attribute
res = soup.find_all(name=True)  # every tag that has a name
print(res)

# e.g. pull every image out of a page
res = soup.find_all(src=True)
for item in res:
    url = item.attrs.get('src')  # fixed: original read 'href', but these tags were matched on src
    print(url)

# 5. Function filter: pass a callable returning True for wanted tags
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

print(soup.find_all(has_class_but_no_id))

"""
Other find_all parameters:
limit: cap the number of results
recursive: False searches direct children only (one level down)
"""
res = soup.find_all(name='a', limit=2)  # find is essentially find_all + limit=1
print(res)

res = soup.body.find(name='p', id=False).find_all(name='a', recursive=False)
print(res)

"""bs4 can also modify the document tree"""
# css选择器 — CSS selectors
"""bs4 can also select tags with CSS selectors."""
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title">asdfasdf<b id="bbb" class="boldest">The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

res = soup.select('a')  # select by tag name
print(res)

res = soup.select('#link1')  # select by id
print(res)

res = soup.select('.sister')  # select by class
print(res)

res = soup.select('body>p>a')  # <a> under <p> under <body>
print(res)

# Knowing CSS selection is enough: nearly every parser (bs4, lxml, ...)
# supports CSS and XPath.
res = soup.select('body>p>a:nth-child(2)')  # the second matching <a>
print(res)

res = soup.select('body>p>a:nth-last-child(1)')  # the last matching <a>
print(res)

res = soup.select('a[href="http://example.com/tillie"]')  # select by attribute value
print(res)

"""
CSS selector cheat sheet:
1. tag       by tag name
2. .cls      by class name
3. #id       by id
4. body a    any <a> descendant of body (any depth)
5. body>a    direct <a> children of body only
6. everything else follows standard CSS selectors
"""
# Execute JavaScript inside the page
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# NOTE(review): executable_path is removed in Selenium 4 (use Service instead) —
# confirm the selenium version these notes target.
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.jd.com/')

# 1. JS can do many things, e.g. show the cookie in an alert:
# bro.execute_script('alert(document.cookie)')

# 2. Scroll the page to the very bottom.
# Step by step:
# for i in range(10):
#     y = 400 * (i + 1)
#     bro.execute_script('scrollTo(0,%s)' % y)
#     time.sleep(1)
# Or jump straight to the bottom in one call:
bro.execute_script('scrollTo(0,document.body.scrollHeight)')
time.sleep(3)
bro.close()
# 切换选项卡 — switching browser tabs
# Switch between browser tabs
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.jd.com/')

# Open a new tab with JS
bro.execute_script('window.open()')
# Switch to the new tab; the one we started on is window_handles[0]
bro.switch_to.window(bro.window_handles[1])
bro.get('http://www.taobao.com')
time.sleep(2)
bro.switch_to.window(bro.window_handles[0])
time.sleep(3)
bro.close()  # close the current tab
bro.quit()   # quit the whole browser
# 浏览器前进后退 — browser history: back and forward
# Navigate browser history: back and forward
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.jd.com/')
time.sleep(2)
bro.get('https://www.taobao.com/')
time.sleep(2)
bro.get('https://www.baidu.com/')

# Go back one page
bro.back()
time.sleep(1)
# Go forward one page
bro.forward()
time.sleep(3)
bro.close()
# 异常处理 — exception handling
# Selenium's own exception types; catch these (or Exception at the boundary)
# and always release the browser in finally.
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException

try:
    pass  # driver actions go here
except Exception as e:
    print(e)
finally:
    # NOTE(review): `bro` is not defined in this snippet — it assumes a driver
    # created earlier, as in the surrounding examples.
    bro.close()
# 小案例 — small example: Baidu login form
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('http://www.baidu.com')
bro.implicitly_wait(10)  # implicit wait: element lookups retry until the element appears
bro.maximize_window()  # full screen

# Locate a link by its visible text
a = bro.find_element(by=By.LINK_TEXT, value='登录')
a.click()

# ids are unique within a page — prefer By.ID when one exists
input_name = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__userName')
input_name.send_keys('33333@qq.com')
time.sleep(1)
input_password = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__password')
input_password.send_keys('123456')
time.sleep(1)
input_submit = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__submit')
input_submit.click()
time.sleep(5)
bro.close()
# selenium登录获取cookie — log in with selenium and capture the cookies
# Drive the browser; after a successful login grab the session cookies and
# save them locally. With many throwaway accounts this builds a cookie pool.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import json

# --- Login flow (kept commented out in the original notes) ---
# bro = webdriver.Chrome(executable_path='./chromedriver.exe')
# bro.get('https://www.cnblogs.com/')
# bro.implicitly_wait(10)
# try:
#     # Find the login button
#     submit_btn = bro.find_element(By.LINK_TEXT, value='登录')
#     submit_btn.click()
#     time.sleep(1)
#     username = bro.find_element(By.ID, value='mat-input-0')
#     password = bro.find_element(By.ID, value='mat-input-1')
#     username.send_keys("616564099@qq.com")
#     password.send_keys('sadfasdfads')
#     submit = bro.find_element(
#         By.CSS_SELECTOR,
#         value='body > app-root > app-sign-in-layout > div > div > app-sign-in > app-content-container > div > div > div > form > div > button')
#     time.sleep(20)
#     submit.click()
#     # A captcha/slider appears: solve it by hand, press Enter, the script continues
#     input()
#     # Logged in now
#     cookie = bro.get_cookies()
#     print(cookie)
#     with open('cnblogs.json', 'w', encoding='utf-8') as f:
#         json.dump(cookie, f)
#     time.sleep(5)
# except Exception as e:
#     print(e)
# finally:
#     bro.close()

# Open cnblogs and inject the saved cookies — that restores the logged-in state
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.cnblogs.com/')
bro.implicitly_wait(10)
time.sleep(3)
# Load the locally saved cookies into the browser
with open('cnblogs.json', 'r', encoding='utf-8') as f:
    cookie = json.load(f)
for item in cookie:
    bro.add_cookie(item)
# Refresh so the page reflects the login
bro.refresh()
time.sleep(10)
bro.close()
# 抽屉半自动点赞 — semi-automatic Chouti upvoting
# Log in to Chouti with selenium to obtain the cookies, then upvote with
# requests. Logging in directly with requests is very hard because of the
# captcha.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import json
import requests

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://dig.chouti.com/')
bro.implicitly_wait(10)

# Hoisted out of the try block: `l` is read after the finally, so an early
# exception must not also leave it unbound (the original bound it inside try).
l = []
try:
    submit = bro.find_element(by=By.ID, value='login_btn')
    # Some buttons can be located but raise on .click(); clicking via JS works.
    bro.execute_script("arguments[0].click()", submit)
    # submit.click()
    time.sleep(2)
    username = bro.find_element(by=By.NAME, value='phone')
    username.send_keys('18953675221')
    password = bro.find_element(by=By.NAME, value='password')
    password.send_keys('lqz123')
    time.sleep(3)
    submit_button = bro.find_element(
        By.CSS_SELECTOR,
        'body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')
    submit_button.click()
    # Captcha: solve it by hand, then press Enter to let the script continue
    input()
    cookie = bro.get_cookies()
    print(cookie)
    with open('chouti.json', 'w', encoding='utf-8') as f:
        json.dump(cookie, f)
    # Collect the id of every article on the page
    div_list = bro.find_elements(By.CLASS_NAME, 'link-item')
    for div in div_list:
        article_id = div.get_attribute('data-id')
        l.append(article_id)
except Exception as e:
    print(e)
finally:
    bro.close()

# selenium's job is done (login -> cookies); replay the upvotes with requests.
print(l)
with open('chouti.json', 'r', encoding='utf-8') as f:
    cookie = json.load(f)
# Detail: selenium cookies can't be passed to requests directly — convert the
# list of dicts into a plain {name: value} mapping first.
request_cookies = {}
for item in cookie:
    request_cookies[item['name']] = item['value']
print(request_cookies)

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
for i in l:
    data = {'linkId': i}
    res = requests.post('https://dig.chouti.com/link/vote', data=data, headers=header, cookies=request_cookies)
    print(res.text)