Python Web Scraping Guide

bili (condensed version of test1)

# bili: teacher's source code
import time

from bs4 import BeautifulSoup
from selenium import webdriver

url = 'https://www.bilibili.com/video/BV1iN4y1a7KJ'

# Keep the browser window open after the script exits
options = webdriver.ChromeOptions()
options.add_experimental_option('detach', True)
driver = webdriver.Chrome(options=options)

driver.get(url)
time.sleep(5)  # give the page's JavaScript time to render

html = driver.page_source
soup = BeautifulSoup(html, 'lxml')

title = soup.find('h1', class_="video-title")
count = soup.find('span', class_="view item")
dm = soup.find('span', class_="dm item")
pub_date = soup.find('span', class_="pubdate-text")

# "content-warp" (not "wrap") is the class name as it appears in Bilibili's markup
comments = soup.find_all('div', class_="content-warp")
comments_text = []

for comment in comments:
    name = comment.find('div', class_="user-info").text
    text = comment.find('span', class_="reply-content").text
    comments_text.append({
        'name': name,
        'text': text
    })

# Print the results
print(f"Title: {title.text}, Views: {count.text.strip()}, Danmaku: {dm.text.strip()}, Published: {pub_date.text.strip()}")
for comment in comments_text:
    print(f"Comment:\nID: {comment['name']}, Content: {comment['text']}")

driver.close()
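
The fixed time.sleep(5) either wastes time or, on a slow connection, grabs the page before it has rendered. A minimal sketch of replacing it with an explicit wait, reusing the 'video-title' class from the script above (the 10-second timeout is an arbitrary choice):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.bilibili.com/video/BV1iN4y1a7KJ')

# Block until the title element is present (up to 10 s) instead of sleeping blindly
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'video-title'))
)
html = driver.page_source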





test1 

# Use Edge to scrape a Bilibili video page: title, publish date, view count,
# danmaku count, plus the name and text of each top-level comment
import time

from bs4 import BeautifulSoup
from selenium import webdriver

# Keep the browser from closing automatically
options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)

driver = webdriver.Edge(options=options)  # EdgeOptions must be paired with the Edge driver, not Chrome

# Load the page and grab its source
url = 'https://www.bilibili.com/video/BV1va411v7zE/'
driver.get(url)
time.sleep(5)

html = driver.page_source

# Filter the data with BeautifulSoup
soup = BeautifulSoup(html, 'lxml')

title = soup.find('h1', class_='video-title')
count = soup.find('span', class_='view item')
dm = soup.find('span', class_='dm item')
pub_date = soup.find('span', class_='pubdate-ip item')

comments = soup.find_all('div', class_='content-warp')
comments_text = []
for comment in comments:
    name = comment.find('div', class_='user-name').text
    text = comment.find('div', class_='root-reply').text
    comments_text.append({
        'name': name,
        'text': text
    })

print(f"Video: {title.text}, Date: {pub_date.text}, Views: {count.text.strip()}, Danmaku: {dm.text}")
for comment in comments_text:
    print(f"Comment:\nID: {comment['name']}, Content: {comment['text']}")

driver.close()
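
Bilibili loads the comment area lazily, so on a slow connection the snapshot above can come back with no comments at all. A hedged sketch of scrolling before reading page_source, reusing the driver from the script above (the step size and pauses are rough guesses, not values from the original):

import time

# Scroll down in steps so the comment section gets requested and rendered
# before page_source is read; distances and pauses are rough guesses
for _ in range(5):
    driver.execute_script('window.scrollBy(0, 1500);')
    time.sleep(1)
html = driver.page_source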

test2

# Open Baidu in Edge, then automatically type 湘潭理工学院 into the search box and run the search
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)

driver = webdriver.ChromiumEdge(options=options)  # Edge (Chromium) driver in newer Selenium versions

driver.get('https://www.baidu.com')
time.sleep(3)  # pause three seconds

driver.find_element(By.ID, 'kw').send_keys('湘潭理工学院')  # 'kw' is the id of Baidu's search box
time.sleep(3)
driver.find_element(By.ID, 'su').click()  # 'su' is the id of the search button
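
Once the search button is clicked, the result titles can be scraped from the new page the same way as in the other scripts. A minimal sketch that reuses the driver above and assumes each result title sits in an <h3> tag; Baidu's markup changes often, so treat the selector as a guess:

import time
from bs4 import BeautifulSoup

time.sleep(3)  # let the results page render
soup = BeautifulSoup(driver.page_source, 'lxml')
# Assumption: Baidu wraps each result title in an <h3>; adjust if the markup differs
for h3 in soup.find_all('h3'):
    print(h3.get_text(strip=True))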

test3

# Fetch the Hunan Normal University graduate admissions page directly with requests
# and print the 2024-related entries (no browser window is opened)
import requests
from bs4 import BeautifulSoup

url = 'https://yjsy.hunnu.edu.cn/zsks/sszk1.htm'
# A desktop Edge User-Agent string so the request looks like a normal browser
ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.81'
headers = {
    'User-Agent': ua
}
html = requests.get(url, headers=headers)
html.encoding = 'utf-8'  # set the encoding explicitly so Chinese text is not garbled

soup = BeautifulSoup(html.text, 'lxml')
result = soup.find_all('a', target='_blank')

# Print only the announcements that mention 2024
for item in result:
    if '2024' in item.text:
        print(item.text)
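
The anchors collected above usually carry relative href values, so following them later needs absolute URLs. A small sketch using urllib.parse.urljoin on top of the script above (the full_url name is mine):

from urllib.parse import urljoin

for item in result:
    if '2024' in item.text:
        # href is often relative; resolve it against the page URL
        full_url = urljoin(url, item.get('href', ''))
        print(item.text, full_url)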

test4

# Run the whole flow in Chrome automatically: open the graduate school homepage,
# hover over the menu to reveal the dropdown, click through to the admissions
# list, then scrape the titles that mention 2023
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

# Keep the browser from closing automatically
options = webdriver.ChromeOptions()
options.add_experimental_option('detach', True)
driver = webdriver.Chrome(options=options)
driver.get('https://yjsy.hunnu.edu.cn')

time.sleep(5)

# 4th top-level menu item and the 2nd entry in its dropdown
xpath_1 = "//ul[@class='menu']/li[4]/a"
xpath_2 = "//ul[@class='menu']/li[4]/ul/li[2]/a"

button_1 = driver.find_element(By.XPATH, xpath_1)
button_2 = driver.find_element(By.XPATH, xpath_2)

# Hover over the menu so the dropdown becomes visible
ActionChains(driver).move_to_element(button_1).perform()
time.sleep(5)

# Then move to the dropdown entry and click it
ActionChains(driver).move_to_element(button_2).click().perform()
time.sleep(5)

html = driver.page_source

soup = BeautifulSoup(html, 'lxml')
result = soup.find_all('a', target='_blank')

for item in result:
    if '2023' in item.text:
        print(item.text)
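
The five-second sleeps around the hover and click can be replaced by waiting until the dropdown entry is actually clickable. A sketch of the same hover-then-click with an explicit wait, using the XPaths and driver from the script above (the 10-second timeout is arbitrary):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

ActionChains(driver).move_to_element(button_1).perform()
# Wait until the dropdown entry is visible and clickable, then click it
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, xpath_2))
).click()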

test5

# Use Edge to scrape the Bilibili popular page: video title, uploader (UP), and view count
import time

from bs4 import BeautifulSoup
from selenium import webdriver

# Keep the browser from closing automatically
options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)

driver = webdriver.Edge(options=options)

# Load the page and grab its source
url = 'https://www.bilibili.com/v/popular/all/'
driver.get(url)
time.sleep(5)  # the list is rendered by JavaScript; wait like the other scripts do

# Filter the data with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'lxml')

result = soup.find_all('div', class_='video-card')

for item in result:
    title = item.find('p', class_='video-name')
    up = item.find('span', class_='up-name__text')
    count = item.find('span', class_='play-text')
    print(f"Video: {title.text}, UP: {up.text}, Views: {count.text.strip()}")
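
Printing is fine for a quick check, but the rows are easier to reuse once saved. A minimal sketch that writes the same fields to a CSV file with the standard library, building on the result list above (the file name is my choice; utf-8-sig keeps Chinese readable in Excel):

import csv

with open('bili_popular.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'up', 'views'])  # header row
    for item in result:
        writer.writerow([
            item.find('p', class_='video-name').text,
            item.find('span', class_='up-name__text').text,
            item.find('span', class_='play-text').text.strip(),
        ])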

yjsy

# yjsy: condensed version of the teacher's source code
import time

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_experimental_option('detach', True)
driver = webdriver.Chrome(options=options)

driver.get('https://yjsy.hunnu.edu.cn')
time.sleep(5)

xpath_1 = "//ul[@class='menu']/li[4]/a"
xpath_2 = "//ul[@class='menu']/li[4]/ul/li[2]/a"

button_1 = driver.find_element(By.XPATH, xpath_1)
button_2 = driver.find_element(By.XPATH, xpath_2)

# Hover to open the dropdown, then click the admissions entry
ActionChains(driver).move_to_element(button_1).perform()
time.sleep(5)

ActionChains(driver).move_to_element(button_2).click().perform()
