robots.txt
案例:https://www.baidu.com/robots.txt
robots.txt 文件是一个放置在网站根目录的文本文件,用于告诉搜索引擎的爬虫哪些页面或目录可以或不可以被爬取。
# 表示以下规则适用于哪些爬虫。
User-agent: *
# 禁止爬取该目录下的页面。
Disallow: /tmp/
# 允许爬取该目录下的页面。
Allow: /
requests
requests 官方文档:https://requests.readthedocs.io/projects/cn/zh-cn/latest/
import requests
# GET request; params becomes the URL query string (?wd=python).
r = requests.get("http://www.baidu.com/s", params={"wd": "python"})
# POST request; data is sent as a form-encoded request body.
r = requests.post("http://www.baidu.com/s", data={"wd": "python"})
# Custom request headers.
r = requests.get("http://www.baidu.com",headers={"User-Agent":"MyClient"})
print(r.content)# raw response body as bytes
print(r.text)# response body decoded as text (page source)
print(r.json()["data"])# response body parsed as JSON (raises if not JSON)
print(r.url)
print(r.headers)
print(r.cookies)
会话对象
会话对象让你能够跨请求保持某些参数。
import requests
s = requests.Session()
# Attributes set on the session object act as default values for every
# request made through it.
s.auth = ('user', 'pass')
# Method-level parameters (passed after the url) override the session
# defaults, but are NOT persisted across requests.
r = s.get('http://httpbin.org/cookies', cookies={'from-my': 'browser'})
r = s.get('http://httpbin.org/cookies')
上下文会话管理器
确保 with 区块退出后会话能被关闭,即使发生了异常也一样。
# Context-manager form: guarantees the session is closed when the
# with-block exits, even if an exception was raised inside it.
# (The body line had lost its indentation in the original, which made
# the block invalid Python.)
with requests.Session() as s:
    s.get('http://httpbin.org/cookies')
SSL 证书验证
将 verify(默认为True) 设置为 False,Requests 也能忽略对 SSL 证书的验证。
# verify defaults to True; verify=False skips SSL certificate validation
# (requests will emit an InsecureRequestWarning — use only for testing).
requests.get('https://kennethreitz.org', verify=False)
代理
import requests
# Plain HTTP/HTTPS proxies.
proxies = {
    "http": "http://10.10.1.10:3128",
    "https": "http://10.10.1.10:1080",
}
# Proxy with HTTP Basic Auth credentials embedded in the URL.
proxies = {
    "http": "http://user:pass@10.10.1.10:3128/",
}
# SOCKS proxy (requires the requests[socks] extra to be installed).
proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port'
}
# Note: only the last assignment to proxies above is in effect here.
requests.get("http://example.org", proxies=proxies)
页面解析
Beautiful Soup
中文文档:https://beautifulsoup.cn/
Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库。
通过 CSS 选择器定位
from bs4 import BeautifulSoup
r = requests.get('https://www.baidu.com')
# Parse the HTML source with the lxml parser backend.
soup = BeautifulSoup(r.text,'lxml')
# CSS selector: <font> elements anywhere under a direct <div> child of #id.
fonts = soup.select('#id > div font')
# Read an attribute of the first matched tag; '属性名' is a placeholder
# for the actual attribute name.
print(fonts[0]['属性名'])
XPath
XPath 是一门在 XML 文档中查找信息的语言。XPath 可用来在 XML 文档中对元素和属性进行遍历。
教程:https://www.w3school.com.cn/xpath/index.asp
表达式 | 描述 |
---|---|
nodename | 选取此节点的所有子节点。 |
/ | 从根节点选取。 |
// | 从匹配选择的当前节点选择文档中的节点,而不考虑它们的位置。 |
. | 选取当前节点。 |
.. | 选取当前节点的父节点。 |
@ | 选取属性。 |
from lxml import etree
html = requests.get('https://www.baidu.com')
# Build an XPath-queryable element tree from the HTML source.
soup = etree.HTML(html.text)
# All href attribute values of <a> elements anywhere in the document.
content = soup.xpath('//a/@href')
正则表达式
https://docs.python.org/3/library/re.html
import re
# re.I ignores case; re.S makes . also match newlines (by default, . in a
# regular expression matches any single character except the newline \n).
compiles = re.compile(r'\n',re.I|re.S)
# The sample string contains no newline, so this returns [] — the line
# only demonstrates the findall API.
compiles.findall('''<div class="test">''')
案例
爬图片
import os
import requests
from bs4 import BeautifulSoup

# The target site paginates via the URL path: baseurl + page number.
baseurl = 'https://xxx.com/'
headers = {
    # Fill in a real browser User-Agent so the site does not reject requests.
    "User-Agent":""
}
# Crawl 10 pages.
for i in range(1, 11):
    path = '图片/第' + str(i) + '页/'
    # makedirs creates the parent folder ('图片') as well; the original
    # os.mkdir would fail when the parent does not exist yet.
    os.makedirs(path, exist_ok=True)
    html = requests.get(baseurl + str(i),headers=headers)
    soup = BeautifulSoup(html.text,'lxml')
    # Lazily-loaded images carry the real URL in the data-original attribute.
    imgs = soup.select('img.lazy')
    for img in imgs:
        pictureUrl = img['data-original']
        # Derive the file name from the URL. The original left this as an
        # empty placeholder, which made open() target the directory itself.
        pictureName = pictureUrl.rsplit('/', 1)[-1]
        # Download and write the image bytes.
        with open(path + pictureName, 'wb') as file:
            file.write(requests.get('https://' + pictureUrl).content)
爬音频
import hashlib
import json
import time
import requests
from bs4 import BeautifulSoup

headers = {
    # Fill in a real browser User-Agent.
    "User-Agent": ""
}
timestamp = str(int(time.time()))
# Items concatenated (in this exact order) and MD5-hashed to build the
# request signature; changing the order invalidates the signature.
signature = [
    "NVP5eb63bbbe01eeed093cb22bb8f5acdc3",
    "appid=0000",
    "clienttime=" + timestamp,
    "clientver=20000",
    "dfid=5eb63bbbe01eeed093cb22bb8f5acdc3",
    "encode_album_audio_id=",
    "mid=5eb63bbbe01eeed093cb22bb8f5acdc3",
    "platid=0",
    "srcappid=0000",
    "token=",
    "userid=0",
    "uuid=5eb63bbbe01eeed093cb22bb8f5acdc3",
    "NVP5eb63bbbe01eeed093cb22bb8f5acdc3",
]
# Query-string parameters for the song-info API (placeholder credentials).
params = {
    "srcappid": 0000,
    "clientver": 00000,
    "mid": "5eb63bbbe01eeed093cb22bb8f5acdc3",
    "uuid": "5eb63bbbe01eeed093cb22bb8f5acdc3",
    "dfid": "5eb63bbbe01eeed093cb22bb8f5acdc3",
    "appid": 0000,
    "platid": 0,
    "token": "",
    "userid": 0,
    "clienttime": timestamp,
}
html = requests.get('https://www.xxx.com', headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
# Links to individual songs inside the ranking list.
songLinks = soup.select('#rankWrap li>a[href]')
baseurl = 'https://www.xxx.com/'
for songLink in songLinks:
    # Song id is the file-name part of the link, without the extension.
    songId = songLink['href'].split('/')[-1].split('.')[0]
    # Element order must not change, otherwise the signature is invalid.
    signature[5] = "encode_album_audio_id=" + songId
    # Fresh MD5 per song: hash the concatenation of all signature parts.
    md5 = hashlib.md5()
    for part in signature:
        md5.update(part.encode("utf-8"))
    params['signature'] = md5.hexdigest()
    # BUG FIX: the original line ended with a trailing comma, which turned
    # the value into a one-element tuple instead of a plain string.
    params['encode_album_audio_id'] = songId
    res = requests.get(baseurl,params=params,headers=headers)
    data = json.loads(res.text)["data"]
    # Save the audio stream under the song's name.
    with open(data["song_name"] + ".mp3", 'wb') as file:
        file.write(requests.get(data["play_url"]).content)
爬视频
import json
import os
import re
from pprint import pprint
# Progress bar.
from tqdm import tqdm
import requests

# Mimic a real browser request.
baseUrl = "https://vd6.l.qq.com/proxyhttp"
headers = {
    # Copy Cookie / User-Agent from a logged-in browser session.
    "Cookie": "",
    "User-Agent": ""
}
# BUG FIX: the original used a set literal ({"请求数据"}); the payload must
# be a dict so requests can serialize it as JSON. Fill in the request body
# captured from the browser's developer tools.
data = {}
res = requests.post(url=baseUrl,json=data,headers=headers)
# Response body as JSON.
json_res = res.json()
# 'vinfo' is itself a JSON string containing the m3u8 links; decode it.
vinfo = json.loads(json_res['vinfo'])
# Pretty-print for inspection while developing.
pprint(vinfo)
# Video title.
title = vinfo['vl']['vi'][0]['ti']
# Base URL of the site's media resource.
# NOTE(review): src_url uses ui[-1] but the playlist path uses ui[0] —
# kept as in the original; confirm both indexes target the same mirror.
src_url = vinfo['vl']['vi'][0]['ul']['ui'][-1]['url']
# Full m3u8 playlist URL (base URL + playlist path).
m3u8_url = src_url + vinfo['vl']['vi'][0]['ul']['ui'][0]['hls']['pt']
# Download the playlist and extract the ts segment names.
m3u8 = requests.get(m3u8_url).text
ts = re.findall(',\n(.*?)\n#', m3u8)
# Ensure the output folder exists (original never created it), then write
# every segment into one file opened once in 'wb' mode — the original
# reopened in 'ab' per segment, so a re-run appended duplicate data.
os.makedirs('腾讯视频', exist_ok=True)
with open('腾讯视频/' + title + '.mp4', mode='wb') as f:
    for seg in tqdm(ts):
        f.write(requests.get(src_url + seg).content)