# A simple wrapper around common crawler utilities, covering urllib, requests, and selenium.
"""
@Title: spider
@Time: 2023/10/8 9:19
@Author: Michael
"""
import urllib.parse
import urllib.request
import urllib.error
import requests
from selenium import webdriver
# URL handling helpers.
class MjSpiderUrl:
    """Build, encode, and parse URLs for crawler requests."""

    def __init__(self, base_url, datas):
        """
        :param base_url: root URL (without query string)
        :param datas: dict of query parameters
        """
        self.base_url = base_url
        self.datas = datas
        self.complete_url = ""  # filled in by join_url()
        self.encoding = "utf-8"

    # Percent-encode a single string.
    def translate_string(self, string: str) -> str:
        """
        :param string: raw string
        :return: percent-encoded string (e.g. "a b" -> "a%20b")
        """
        return urllib.parse.quote(string=string, encoding=self.encoding)

    # Encode the parameter dict into a query string.
    def translate_datas(self) -> str:
        """
        :return: "k1=v1&k2=v2" style query string
        """
        return urllib.parse.urlencode(query=self.datas, encoding=self.encoding)

    # Join base URL and encoded parameters.
    def join_url(self) -> str:
        """
        :return: complete URL with the encoded query string appended
        """
        # Fix: reuse translate_datas() so the configured encoding is honored
        # here too (the original called urlencode without an encoding).
        self.complete_url = self.base_url + "?" + self.translate_datas()
        return self.complete_url

    # Parse a URL into its components.
    @staticmethod
    def parse_url(url: str):
        """
        :param url: URL to split
        :return: urllib.parse.ParseResult
        """
        return urllib.parse.urlparse(url)
# Crawl pages with urllib.
class MjSpiderUrllib:
    """Thin wrapper around urllib.request for simple GET/POST crawling."""

    def __init__(self, url, datas, headers):
        """
        :param url: request URL
        :param datas: dict of POST parameters (used when with_data=True)
        :param headers: dict of request headers
        """
        self.url = url
        self.datas = datas
        self.headers = headers
        self.request = None   # last Request built by create_request()
        self.response = None  # last response obtained by spider_data()
        self.encoding = "utf-8"

    # Build the request object (GET by default, POST when with_data=True).
    def create_request(self, with_data: bool = False):
        """
        :param with_data: False -> GET, True -> POST with url-encoded body
        :return: urllib.request.Request
        """
        if with_data:
            # Fix: encode the parameters directly with the stdlib instead of
            # constructing a throwaway MjSpiderUrl; output bytes are identical.
            body = urllib.parse.urlencode(self.datas, encoding=self.encoding).encode(self.encoding)
            request = urllib.request.Request(url=self.url, headers=self.headers, data=body)
        else:
            request = urllib.request.Request(url=self.url, headers=self.headers)
        self.request = request
        return request

    # Fetch the page and return its decoded body.
    def spider_data(self, proxies: dict = None, timeout: int = 5):
        """
        :param proxies: optional proxy mapping for ProxyHandler
        :param timeout: socket timeout in seconds
        :return: decoded response body, or "" on failure
        """
        content = ""
        try:
            if proxies:
                handler = urllib.request.ProxyHandler(proxies=proxies)
                opener = urllib.request.build_opener(handler)
                response = opener.open(self.request, timeout=timeout)
            else:
                response = urllib.request.urlopen(self.request, timeout=timeout)
            self.response = response
            # Fix: close the connection even if decoding raises (the original
            # only closed on the success path, leaking the socket).
            try:
                content = response.read().decode(self.encoding)
            finally:
                response.close()
        except urllib.error.URLError as e:
            print("爬取失败!", e)
        return content

    # Download the resource at self.url straight to a local file.
    def download_data(self, path: str):
        """
        :param path: local file path to write to
        """
        urllib.request.urlretrieve(url=self.url, filename=path)

    # NOTE(review): spider_data() closes and fully consumes the response, so
    # the accessors below only yield data on a response obtained elsewhere.

    # Response body as text.
    def find_text(self):
        return self.response.read().decode(self.encoding)

    # Response body as raw bytes.
    def find_content(self):
        return self.response.read()

    # HTTP status code.
    def find_status_code(self):
        return self.response.getcode()

    # Final URL (after redirects).
    def find_url(self):
        return self.response.geturl()

    # Response headers as a list of (name, value) pairs.
    def find_headers(self):
        return self.response.getheaders()
# Crawl pages with requests.
class MjSpiderRequests:
    """Thin wrapper around the requests library."""

    install = r"pip3 install requests"

    def __init__(self, url: str, datas: dict, headers: dict, proxies: dict, timeout: int):
        """
        :param url: request URL
        :param datas: dict of parameters (query string for GET, body for POST)
        :param headers: dict of request headers
        :param proxies: proxy mapping for requests
        :param timeout: timeout in seconds, applied to every request
        """
        self.url = url
        self.datas = datas
        self.headers = headers
        self.proxies = proxies
        self.timeout = timeout
        self.response = None  # last requests.Response

    # Plain requests.
    # Fix: self.timeout was stored but never used, so every call could hang
    # forever; it is now passed to each request.
    def get(self):
        self.response = requests.get(self.url, timeout=self.timeout)

    def post(self):
        self.response = requests.post(self.url, timeout=self.timeout)

    # Requests with parameters.
    # Fix: GET parameters belong in the query string (params=), not in the
    # request body (data=) as the original sent them.
    def get_with_datas(self):
        self.response = requests.get(self.url, params=self.datas, timeout=self.timeout)

    def post_with_datas(self):
        self.response = requests.post(self.url, data=self.datas, timeout=self.timeout)

    # Requests with headers.
    def get_with_headers(self):
        self.response = requests.get(self.url, params=self.datas, headers=self.headers, timeout=self.timeout)

    def post_with_headers(self):
        self.response = requests.post(self.url, data=self.datas, headers=self.headers, timeout=self.timeout)

    # Requests through a proxy.
    def get_with_proxies(self):
        self.response = requests.get(self.url, params=self.datas, headers=self.headers, proxies=self.proxies, timeout=self.timeout)

    def post_with_proxies(self):
        self.response = requests.post(self.url, data=self.datas, headers=self.headers, proxies=self.proxies, timeout=self.timeout)

    # Response body as text.
    def find_text(self):
        return self.response.text

    # Response body as raw bytes.
    def find_content(self):
        return self.response.content

    # HTTP status code.
    def find_status_code(self):
        return self.response.status_code

    # Final URL (after redirects).
    def find_url(self):
        return self.response.url

    # Cookie jar.
    def find_cookies(self):
        return self.response.cookies

    # Cookies as (name, value) pairs.
    def find_cookies_items(self):
        return self.response.cookies.items()

    # Response headers.
    def find_headers(self):
        return self.response.headers

    # Redirect history.
    def find_history(self):
        return self.response.history
# Browser automation tool.
class MjSpiderSelenium:
    """Open a Chrome window on a target URL through Selenium WebDriver."""

    install = r"pip install -i https://pypi.douban.com/simple selenium"
    version_map = r"https://blog.csdn.net/huilan_same/article/details/51896672"
    drive_install1 = r"https://chromedriver.storage.googleapis.com/index.html"
    drive_install2 = r"https://chromedriver.chromium.org/downloads"

    def __init__(self, path: str, url: str):
        """
        :param path: local path of the browser binary
        :param url: URL to open
        """
        self.path = path
        self.url = url
        self.driver = None  # set by open_windows()

    # Launch the browser, navigate to the URL, and maximize the window.
    def open_windows(self):
        """
        :return: selenium WebDriver for the opened browser
        """
        # Configure Chrome: point Selenium at the local browser binary and
        # keep the window open after the script exits.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = self.path
        chrome_options.add_experimental_option("detach", True)
        # Start the browser with those options and load the target page.
        browser = webdriver.Chrome(options=chrome_options)
        browser.get(self.url)
        browser.maximize_window()
        self.driver = browser
        return browser