Web Scraping (Review)

1. Fetching web page data (urllib and requests)

》 urllib

import urllib.request
# Call the urlopen() method from the library and pass in a URL
response = urllib.request.urlopen('http://www.baidu.com')
# Read the fetched page content with read()
html = response.read().decode('UTF-8')
print(html)
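Some sites reject urllib's default User-Agent. A minimal sketch (the UA string here is only a placeholder example) that attaches a browser-like header by wrapping the URL in urllib.request.Request:

import urllib.request

# Wrap the URL in a Request object so custom headers can be attached
req = urllib.request.Request(
    'http://www.baidu.com',
    headers={'User-Agent': 'Mozilla/5.0'}
)
response = urllib.request.urlopen(req)
html = response.read().decode('UTF-8')
print(html[:200])   # print only the first 200 characters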

》requests

Installation (if the download is slow, you can use a domestic PyPI mirror)

# Run this in a terminal / command prompt (not inside a Python file)
python.exe -m pip install requests
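The install command is all the original notes show for requests; a minimal usage sketch, fetching the same page as the urllib example above:

import requests

# Send a GET request and read the response body as text
response = requests.get('http://www.baidu.com')
response.encoding = 'utf-8'
print(response.text[:200])   # print only the first 200 characters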

2. Parsing data (XPath, lxml, BeautifulSoup, jsonpath)

》XPath

# Fetch the page source
import requests       # XPath-based crawler

ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'

html = requests.get('http://gra.hnu.edu.cn/index/zsxx.htm',headers={'User-Agent':ua})
html.encoding = 'utf-8'

# Filter the data with XPath
#print(html.text)

xpath = "//a[@target='_blank']"

from lxml import etree
page = etree.HTML(html.text)

result = page.xpath(xpath)

# Print every result matched by the XPath expression
print(result)

for item in result:
    print(item.text)

# Narrow the results down to what we want (the 2024 Hunan University graduate admissions guide)
print()
print("Matching results:")

for item in result:
    if "2024" in item.text and '招生简章' in item.text:
        print(item.text) 
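Besides the link text, the href attribute can be pulled out directly by ending the XPath with /@href. A small self-contained sketch under the same assumptions as above (same page, same User-Agent string):

import requests
from lxml import etree

ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
html = requests.get('http://gra.hnu.edu.cn/index/zsxx.htm', headers={'User-Agent': ua})
html.encoding = 'utf-8'

page = etree.HTML(html.text)
# //a[@target='_blank']/@href returns the attribute values as plain strings
for href in page.xpath("//a[@target='_blank']/@href"):
    print(href)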

》lxml

Installation

pip install lxml

Usage
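The XPath example above already uses lxml.etree to parse a downloaded page; as a standalone illustration, here is a minimal sketch that parses a hard-coded HTML string (the HTML is made up for the example):

from lxml import etree

html_text = '<html><body><a href="/a">first</a><a href="/b">second</a></body></html>'
page = etree.HTML(html_text)

# Each match is an Element; .text gives the link text, .get() reads an attribute
for a in page.xpath('//a'):
    print(a.text, a.get('href'))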

》beautifulsoup

Installation

pip install beautifulsoup4

Usage

import requests
from bs4 import BeautifulSoup   # BeautifulSoup-based crawler

ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'

# Build the request headers
headers = {
    'User-Agent': ua
}

# Send the request (pass the headers so the site sees a browser User-Agent)
html = requests.get('http://gra.hnu.edu.cn/index/zsxx.htm', headers=headers)
html.encoding = 'utf-8'
#print(html.text)

# Parse the data
soup = BeautifulSoup(html.text, 'lxml')
result = soup.find_all('a', target='_blank')

for item in result:
    if "2024" in item.text and '招生简章' in item.text:
        print(item.text)
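》jsonpath

jsonpath is listed in the section heading but not covered above. A minimal sketch using the third-party jsonpath-ng package (an assumption; other jsonpath libraries exist), applied to a made-up JSON document:

# pip install jsonpath-ng
from jsonpath_ng import parse

data = {
    "store": {
        "book": [
            {"title": "Python", "price": 10},
            {"title": "Crawler", "price": 20},
        ]
    }
}

# $.store.book[*].title matches the title field of every book
for match in parse('$.store.book[*].title').find(data):
    print(match.value)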

3. Scraping dynamic content (Selenium, PhantomJS)

》selenium

Driving the mouse and keyboard

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Keep the browser window open after the script finishes
options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)

driver = webdriver.ChromiumEdge(options=options)  # new (Chromium-based) Edge browser

driver.get('https://www.baidu.com')

time.sleep(3)
driver.find_element(By.ID, 'kw').send_keys('湘潭理工学院')

time.sleep(3)
driver.find_element(By.ID, 'su').click()
# Or press a key on the keyboard instead of clicking:
# from selenium.webdriver.common.keys import Keys
# driver.find_element(By.ID, 'kw').send_keys(Keys.ENTER)
Mouse hover (hover over a drop-down menu, then click one of its entries)

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time

# Keep the browser window open after the script finishes
options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)

driver = webdriver.ChromiumEdge(options=options)  # new (Chromium-based) Edge browser

driver.get('https://yjsy.hunnu.edu.cn/')

time.sleep(3)
el = driver.find_element(By.XPATH, "//ul[@class='menu']/li[4]/a")
ActionChains(driver).move_to_element(el).perform()  # hover over the top-level menu item
time.sleep(3)
ek = driver.find_element(By.XPATH, "//ul[@class='menu']/li[4]/ul/li[2]/a")
ActionChains(driver).move_to_element(ek).perform()  # hover over the sub-menu item
time.sleep(3)
ActionChains(driver).move_to_element(ek).click().perform()
time.sleep(3)

Dynamic scraping

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)

driver = webdriver.ChromiumEdge(options=options)

# Load the page and get its rendered source
url = 'https://www.bilibili.com/v/popular/all/'
driver.get(url)

# Parse the rendered source with BeautifulSoup and filter the data
from bs4 import BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'lxml')

result = soup.find_all('div', class_='video-card')

for item in result:
    title = item.find('p', class_='video-name')
    up = item.find('span', class_='up-name__text')
    count = item.find('span', class_='play-text')
    print(f'Video: {title.text}, uploader: {up.text}, views: {count.text.strip()}')

Dynamic scraping (upgraded version)

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)

driver = webdriver.ChromiumEdge(options=options)

url = 'https://www.bilibili.com/video/BV1SC4y1F7Vj/'
driver.get(url)

# Give the dynamic content time to load
import time
time.sleep(5)

from bs4 import BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'lxml')

# Video information (title, view count, danmaku count, publish date)
result = soup.find_all('div', class_='app-v1')

for item in result:
    title = item.find('h1', class_='video-title')
    count = item.find('span', class_='view item')
    abd = item.find('span', class_='dm item')
    datetime = item.find('span', class_='pubdate-text')

# Comments (commenter name and comment text)
contents = soup.find_all('div', class_='content-warp')
contents_text = []

for content in contents:
    ad = content.find('div', class_='user-info').text
    ac = content.find('span', class_='reply-content').text
    contents_text.append({
        'name': ad,
        'text': ac
    })

print(f'Video: {title.text}, views: {count.text.strip()}, danmaku: {abd.text.strip()}, date: {datetime.text}')
for content in contents_text:
    print(f"Comment:\nID: {content['name']}, text: {content['text']}")


driver.close()

Regular expressions

import re
# String to search
title = "你好, hello, 世界"
# Compile a regular expression that matches Chinese characters only
pattern = re.compile(r"[\u4e00-\u9fa5]+")
# Scan the whole string and collect the Chinese substrings in a list
result = pattern.findall(title)
print(result)
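A capture group can also pull a structured piece out of scraped text, for example the year in a notice title like the ones from the XPath example (a small sketch; the title string here is made up):

import re

title = "湖南大学2024年硕士研究生招生简章"
# (\d{4})年 captures the four-digit year that precedes 年
match = re.search(r"(\d{4})年", title)
if match:
    print(match.group(1))   # 2024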

Weather lookup

# -*- coding: utf-8 -*-
from __future__ import print_function

import ssl, hmac, base64, hashlib
from datetime import datetime as pydatetime

try:
    from urllib import urlencode
    from urllib.request import Request, urlopen
except ImportError:
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

# SecretId issued by the cloud marketplace
secretId = "AKID1c6k2R0Duhg3eCnZP3ljQ8asmIe25sYnfUot"
# SecretKey issued by the cloud marketplace
secretKey = "Grb2a7e3UCjuHqLk03ub9nLAalmmc2bStAGI0dq3"
source = "market"

# Signature
datetime = pydatetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
signStr = "x-date: %s\nx-source: %s" % (datetime, source)
sign = base64.b64encode(hmac.new(secretKey.encode('utf-8'), signStr.encode('utf-8'), hashlib.sha1).digest())
auth = 'hmac id="%s", algorithm="hmac-sha1", headers="x-date x-source", signature="%s"' % (secretId, sign.decode('utf-8'))

# Request method
method = 'GET'
# Request headers
headers = {
    'X-Source': source,
    'X-Date': datetime,
    'Authorization': auth,
}
# Query parameters
queryParams = {
    "areaCn": "湘潭",
}
# Body parameters (only present for POST-style requests)
bodyParams = {
}
# Append the query parameters to the URL
url = 'https://service-6drgk6su-1258850945.gz.apigw.tencentcs.com/release/lundear/weather1d'
if len(queryParams.keys()) > 0:
    url = url + '?' + urlencode(queryParams)

request = Request(url, headers=headers)
request.get_method = lambda: method
if method in ('POST', 'PUT', 'PATCH'):
    request.data = urlencode(bodyParams).encode('utf-8')
    request.add_header('Content-Type', 'application/x-www-form-urlencoded')
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
response = urlopen(request, context=ctx)
content = response.read()
if content:
    #print(content.decode('utf-8'))
    import json
    weather = json.loads(content.decode('utf-8'))
    print(f"Weather in Xiangtan: {weather['data']['now']['weather']}, temperature: {weather['data']['now']['temp']}")
