pip install selenium
pip install requests
pip install beautifulsoup4
pip install lxml
# 1. Scrape page content with lxml
import requests
ua="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
html=requests.get('https://yjsy.hunnu.edu.cn/zsks/sszk1.htm',headers={'User-Agent':ua})
html.encoding='utf-8'
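# Sketch (my addition, not in the original notes): fail fast if the request
# did not return a 2xx status, instead of parsing an error page.
html.raise_for_status()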
# print(html.text)  # all the page text is in html.text
# Find the matching nodes
xpath="//a[@target='_blank']"
from lxml import etree
page =etree.HTML(html.text)
# Run the XPath query
result =page.xpath(xpath)
for i in result:
    print(i.text)
print()
print("Results matching the filter:")
for i in result:
    # i.text can be None for anchors without direct text, so guard first
    if i.text and '湖南' in i.text and '录取' in i.text:
        print(i.text)
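# A follow-up sketch (my addition, not in the original notes): each matched
# <a> element also carries an href attribute, usually relative on this site;
# urljoin resolves it against the page URL.
from urllib.parse import urljoin

for a in result:
    href = a.get('href')  # lxml elements expose attributes via .get()
    if href:
        print(a.text, urljoin('https://yjsy.hunnu.edu.cn/zsks/sszk1.htm', href))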
# 2. Take a screenshot with Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# PhantomJS and find_element_by_id were removed in Selenium 4;
# a headless Edge with By.ID locators works instead
options = webdriver.EdgeOptions()
options.add_argument('--headless')
driver = webdriver.Edge(options=options)
driver.get('https://www.baidu.com/')
driver.find_element(By.ID, 'kw').send_keys("湘潭理工学院")
driver.find_element(By.ID, 'su').click()
time.sleep(5)
driver.save_screenshot('b1.png')
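# Sketch (my addition): Selenium 4 can also screenshot a single element
# rather than the whole viewport; here the search box located above.
driver.find_element(By.ID, 'kw').screenshot('kw.png')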
# 3. Hover and click menu items with Selenium ActionChains
from selenium import webdriver
# keep the browser open after the script exits
options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)
# Chrome = Google Chrome; ChromiumEdge = the new (Chromium-based) Edge; Edge = the legacy Edge
driver = webdriver.ChromiumEdge(options=options)
driver.get('https://yjsy.hunnu.edu.cn/')
# the window stays open thanks to detach=True
from selenium.webdriver.common.by import By
import time
xpath1="//ul[@class='menu']/li[4]/a"
xpath2="//ul[@class='menu']/li[4]/ul/li[2]/a"
b1=driver.find_element(By.XPATH,xpath1)
b2=driver.find_element(By.XPATH,xpath2)
from selenium.webdriver import ActionChains
ActionChains(driver).move_to_element(b1).perform()
time.sleep(3)
ActionChains(driver).move_to_element(b2).perform()
time.sleep(3)
ActionChains(driver).move_to_element(b2).click().perform()
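# Sketch (my addition): the fixed time.sleep() pauses can be replaced by an
# explicit wait that polls until the submenu entry is actually clickable.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)
ActionChains(driver).move_to_element(b1).perform()
wait.until(EC.element_to_be_clickable((By.XPATH, xpath2))).click()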
# 4. Find elements By.ID and click with Selenium
from selenium import webdriver
# keep the browser open after the script exits
options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)
# Chrome = Google Chrome; ChromiumEdge = the new (Chromium-based) Edge
driver = webdriver.ChromiumEdge(options=options)
driver.get('https://www.baidu.com')
# the window stays open thanks to detach=True
from selenium.webdriver.common.by import By
import time
driver.find_element(By.ID,'kw').send_keys('湖南师范大学研究生院')
time.sleep(3)
# click the search button
driver.find_element(By.ID,'su').click()
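# Sketch (my addition): pressing Enter in the input box submits the search
# without having to locate the button at all.
from selenium.webdriver.common.keys import Keys
driver.find_element(By.ID, 'kw').send_keys(Keys.ENTER)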
# 5. Scrape the page with BeautifulSoup
import requests
ua="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
html=requests.get('https://yjsy.hunnu.edu.cn/zsks/sszk1.htm',headers={'User-Agent':ua})
html.encoding='utf-8'
# import BeautifulSoup
from bs4 import BeautifulSoup
soup = BeautifulSoup(html.text, 'lxml')
# find_all returns every node matching the tag and attribute filter
result = soup.find_all('a',target='_blank')
#print(result)
for i in result:
    print(i.text)
print()
print("Matching results:")
for i in result:
    if '大学' in i.text and '2024' in i.text:
        print(i.text)
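# Sketch (my addition): the same query written as a CSS selector via
# soup.select(), which returns the matching tags directly.
for a in soup.select("a[target='_blank']"):
    print(a.get_text(strip=True), a.get('href'))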
# 6. Render the page with Selenium, then parse it with bs4
from selenium import webdriver
url = 'https://www.bilibili.com/video/BV1iN4y1a7KJ'
options = webdriver.ChromeOptions()
options.add_experimental_option('detach', True)
driver=webdriver.Chrome(options=options)
driver.get(url)
import time
time.sleep(5)
html = driver.page_source
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
title = soup.find('h1', class_="video-title")
count = soup.find('span', class_="view item")
dm = soup.find('span', class_="dm item")
pub_date = soup.find('span', class_="pubdate-text")  # 'pub_date' avoids shadowing the datetime module
comments = soup.find_all('div', class_="content-warp")
comments_text = []
for comment in comments:
    name = comment.find('div', class_="user-info").text
    text = comment.find('span', class_="reply-content").text
    comments_text.append({
        'name': name,
        'text': text,
    })
# Print the results
print(f"Title: {title.text}, views: {count.text.strip()}, danmaku: {dm.text.strip()}")
for comment in comments_text:
    print(f"Comment:\nID: {comment['name']}, content: {comment['text']}")
driver.close()
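# Sketch (my addition): persist the scraped comments to CSV with the
# standard-library csv module; utf-8-sig keeps the Chinese text readable in Excel.
import csv

with open('comments.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=['name', 'text'])
    writer.writeheader()
    writer.writerows(comments_text)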