目标:
- 增量爬虫
- 异步redis
- 日志,邮件
- 重定向请求
现阶段:
- 爬虫基础模板
- 异步操作mysql
下阶段:
- 日志,邮件
- 异步redis
码云地址
https://gitee.com/wenwenc9/sdpier_frame.git
项目结构:
一、应用采集(crawl_main.py)
# -*- coding: utf-8 -*-
# @Time : 2021-07-01 14:32
# @Author : XuGuangJun
# @FileName: crawl_main.py
# @Software: PyCharm
import aiohttp
import asyncio
from lxml import etree
import nest_asyncio
import ujson
import time
import queue # 队列
import traceback # 错误日志
class AsyncSpider_request:
    """Asynchronous HTTP crawler built on aiohttp.

    Fetches a batch of URLs concurrently. Responses with HTTP 200 are
    collected in ``self.sucess`` as ``(status, html, redirected_url)``
    tuples; everything else goes into ``self.fail`` as
    ``(status, redirected_url)``. (Attribute names keep the original
    spelling for backward compatibility with existing callers.)
    """

    def __init__(self, url_list, headers=None, cookies=None,
                 json=None, data=None, params=None, method='GET',
                 timeout=9, binary=False):
        """
        :param url_list: a single URL string or a list of URLs
        :param headers: e.g. {"Authorization": "Basic bG9naW46cGFzcw=="}
        :param cookies: e.g. {'cookies_are': 'working'}
        :param json: JSON body for POST, e.g. {'test': 'object'}
        :param data: form or binary body, e.g. {'key1': 'value1'} / b'\x00Binary-data\x00'
        :param params: query string, dict or list of (key, value) tuples
        :param method: 'GET' or 'POST' (case-insensitive)
        :param timeout: per-request timeout in seconds
        :param binary: True to download raw bytes instead of decoded text
        :raises ValueError: if *method* is neither GET nor POST
        """
        # Needed in Jupyter, where an event loop is already running.
        nest_asyncio.apply()
        self.headers = headers
        self.cookies = cookies
        self.json = json
        self.data = data
        self.params = params
        self.method = method.lower()
        self.timeout = timeout
        self.binary = binary
        # Fail fast on an unsupported verb. The original deferred this to
        # fetch() via ``raise print(...)`` — print() returns None, raising
        # None is a TypeError, and the broad except then swallowed it.
        if self.method not in ('get', 'post'):
            raise ValueError("method must be 'get' or 'post', got %r" % method)
        # Result collections (names kept for backward compatibility).
        self.sucess = []
        self.sucessNum = 0
        self.fail = []
        self.failNum = 0
        # Accept both a single URL and a list of URLs.
        if isinstance(url_list, str):
            self.url_list = [url_list]
        else:
            self.url_list = url_list

    async def judge_binary(self, response):
        """Extract ``(status, payload, final_url)`` from an aiohttp response.

        Returns raw bytes when ``self.binary`` is set, otherwise decoded text.
        """
        status = response.status
        if self.binary:
            # Binary download: return the raw byte stream.
            html = await response.read()
        else:
            # Not every page is utf-8. gb2312 is a subset of gbk, so the
            # wider codec decodes strictly more pages without error.
            encoding = response.get_encoding()
            if encoding == 'gb2312':
                encoding = 'gbk'
            html = await response.text(encoding=encoding)
        # response.url reflects any redirects that were followed.
        redirected_url = str(response.url)
        return status, html, redirected_url

    async def fetch(self, session, url):
        """Request one URL and record the outcome in sucess/fail."""
        try:
            if self.method == 'get':
                async with session.get(url, timeout=self.timeout, verify_ssl=False,
                                       params=self.params, cookies=self.cookies) as response:
                    status, html, redirected_url = await self.judge_binary(response)
            else:
                # __init__ validated the method, so this must be POST.
                async with session.post(url, timeout=self.timeout, verify_ssl=False,
                                        cookies=self.cookies, json=self.json,
                                        data=self.data) as response:
                    status, html, redirected_url = await self.judge_binary(response)
        except Exception as e:
            # Network/timeout/decoding failures are recorded, not raised,
            # so one bad URL cannot abort the whole batch.
            msg = '请求失败:{},错误原因:{}==={}'.format(url, str(type(e)), str(e))
            print(msg)
            html = ''
            status = 0
            redirected_url = url
        # Partition by status code.
        if status == 200:
            self.sucessNum += 1
            self.sucess.append((status, html, redirected_url))
        else:
            self.failNum += 1
            self.fail.append((status, redirected_url))

    async def create_urls_session(self):
        """Open one ClientSession and fetch all URLs concurrently."""
        _headers = {
            'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                           'Windows NT 6.1; Win64; x64; Trident/5.0)'),
        }
        if self.headers:
            _headers = self.headers
        async with aiohttp.ClientSession(json_serialize=ujson.dumps, headers=_headers) as session:
            # gather() schedules on the running loop — no dependency on a
            # module-global ``loop`` existing at import time.
            await asyncio.gather(*(self.fetch(session, url) for url in self.url_list))

    def run(self):
        """Blocking entry point: drive the crawl to completion."""
        asyncio.get_event_loop().run_until_complete(self.create_urls_session())
async def main():
    """Usage examples for AsyncSpider_request; uncomment one section to run.

    The original body contained only comments, which is a SyntaxError
    (a ``def`` needs at least one statement) — this docstring fixes that
    while keeping all examples intact.
    """
    # ▲▲▲▲▲ Basic example, default GET request ▲▲▲▲▲
    # url_list = ['https://www.baidu.com/', 'https://www.baidu.com/', 'https://www.baidu.com/']  # target URLs
    # P = AsyncSpider_request(url_list=url_list)
    # P.run()           # start crawling
    # print(P.sucess)   # successful responses
    # print(P.fail)     # failed responses
    # ▲▲▲▲▲ veryeast.cn example, GET with query params ▲▲▲▲▲
    # url_zjdf_list = 'https://interface-mobile.veryeast.cn/v1/job/recommends'  # target URL
    # params_1 = {'city': '070000', 'location': '070000', 'company_industry': '1', 'page': '1'}
    # params_2 = [('city', '070000'), ('location', '070000'), ('company_industry', '1'), ('page', '1')]
    # P = AsyncSpider_request(url_list=url_zjdf_list, method='get', params=params_2)
    # P.run()           # start crawling
    # print(P.sucess)   # successful responses
    # print(P.fail)     # failed responses
    # ▲▲▲▲▲ liepin.com example, POST with JSON body ▲▲▲▲▲
    # url_lp_list = 'https://app-tongdao.liepin.com/a/n/job/search.json'
    # true = True
    # index_json = {
    #     "data": {"keyword": "", "dq": "010", "industry": "000", "salaryLow": 0, "salaryHigh": 999,
    #              "refreshTime": "000",
    #              "sortType": 0, "compKind": ["000"], "compScale": "000", "currentPage": 0, "pageSize": 100,
    #              "isCampus": true}, "client_id": 80001, "version": "3.1.0", "version_code": 30100,
    #     "dev_type": 1, 'dq': '', 'industry': ''}
    # index_headers = {
    #     'Host': 'app-tongdao.liepin.com',
    #     'Connection': 'keep-alive',
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
    #     'content-type': 'application/json',
    #     'cookie': '_mscid=cx_wx_01; openid=opSQK0Q5_ywuMc9BH54rjO4roGGM; acw_tc=2760829516167265918576629ead62b6e99406247dc701b6117bfc651fb581; UniqueKey=9706777a3386324c1a667e00ad0d9493; lt_auth=7rtbOXVXxl3x5yLajWNY46ge2o35VGqd8HgLhEoG0tbuWfyz4PbkSgKGqrUExAMhwxl1dMULN7H5%0D%0APe73y3VI60QXwG2uiZiyo%2FK4z3wJdvRcN8W2vfT%2Bk8zRe50clUAC8mNb%0D%0A; app_auth=7rtbOXVXxl3x5yLajWNY46ge2o35VGqd8HgLhEoG0tbuWfyz4PbkSgKGqrUExAMhwxl1dMULN7H5%0D%0APe73y3VI60QXwG2uiZiyo%2FK4z3wJdvRcN8W2vfT%2Bk8zRe50clUAC8mNb%0D%0A; FE-hasUserLogin=true',
    #     'x-client-type': 'wxa',
    #     'Referer': 'https://servicewechat.com/wx4d70579bfaefd959/142/page-frame.html',
    #     'Accept-Encoding': 'gzip, deflate, br',
    # }
    # P = AsyncSpider_request(url_list=url_lp_list, headers=index_headers, method='POST', json=index_json)
    # P.run()           # start crawling
    # print(P.sucess)   # successful responses
    # print(P.fail)     # failed responses
if __name__ == '__main__':
    # Time the whole crawl.
    start = time.time()
    # Create the event loop. NOTE(review): ``loop`` is deliberately a
    # module-level global — AsyncSpider_request.create_urls_session and
    # .run reference it by name, so do not rename or scope it.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    loop.close()
    # "总耗时" = total elapsed time.
    print('总耗时:', time.time() - start)
二、异步mysql(asyncio_mysql.py)
# -*- coding: utf-8 -*-
# @Time : 2021/6/23 15:46
# @Author : XuGuangJun
# @FileName: asyncio_mysql.py
# @Software: PyCharm
# -*- coding: utf-8 -*-
import asyncio
import aiomysql
# from ..settings import MYSQL_SETTINGS
# 数据库设置
# Database settings for the aiomysql connection pool.
MYSQL_SETTINGS = {
    'MINSIZE': 5,  # minimum number of pooled MySQL connections
    # NOTE(review): 'MAXSIZ' is a typo for 'MAXSIZE', but initpool() reads
    # this exact key — fix both together or not at all.
    'MAXSIZ': 10,  # maximum number of pooled MySQL connections
    'PORT': 3306,
    'HOST': '172.16.10.23',
    'USER': 'root',
    'PASSWORD': '..',
}
class Pmysql:
    """Async MySQL helper backed by an aiomysql connection pool.

    ``aiomysql.connect`` is a coroutine and ``__init__`` cannot be async,
    so the pool is created separately via ``initpool()`` and attached by
    the caller afterwards (see ``createMysqlObj``). The pool keeps a set
    of live connections so each query avoids a fresh MySQL handshake.
    """

    def __init__(self, database):
        # Connection checked out for the current operation.
        # (Original had ``self.coon`` — a dead typo attribute; fixed.)
        self.conn = None
        # aiomysql pool; attached by the caller after ``await initpool()``.
        self.pool = None
        self.database = database

    async def initpool(self):
        """Create and return the aiomysql connection pool."""
        __pool = await aiomysql.create_pool(
            minsize=MYSQL_SETTINGS['MINSIZE'],
            maxsize=MYSQL_SETTINGS['MAXSIZ'],  # settings key spelled without the final E
            host=MYSQL_SETTINGS['HOST'],
            port=MYSQL_SETTINGS['PORT'],
            user=MYSQL_SETTINGS['USER'],
            password=MYSQL_SETTINGS['PASSWORD'],
            db=self.database,
            # autocommit is off: write operations must commit explicitly.
            autocommit=False,
        )
        return __pool

    async def getCurosr(self):
        """Acquire a pooled connection and open a dict cursor.

        (Misspelled name kept for backward compatibility with callers.)
        :return: (connection, cursor) pair
        """
        self.conn = await self.pool.acquire()
        cur = await self.conn.cursor(aiomysql.cursors.DictCursor)
        return self.conn, cur

    async def query(self, query_sql, param=None):
        """Run a SELECT and return all rows as a list of dicts.

        :param query_sql: SQL statement, optionally with %s placeholders
        :param param: parameters bound to the placeholders
        :return: list of row dicts, or None if the statement failed
        """
        self.conn, cur = await self.getCurosr()
        try:
            await cur.execute(query_sql, param)
            return await cur.fetchall()
        except Exception as e:
            # Report the actual error, not just a generic message.
            print('执行查询命令出错', e)
        finally:
            # Close the cursor and return the connection to the pool.
            if cur:
                await cur.close()
            await self.pool.release(self.conn)

    async def insert(self, query_sql, param=None):
        """Execute an INSERT/UPDATE/DELETE and commit it.

        Bug fix: the pool is created with autocommit=False, so without an
        explicit commit every write was silently discarded.

        :param query_sql: SQL statement, optionally with %s placeholders
        :param param: parameters bound to the placeholders
        """
        self.conn, cur = await self.getCurosr()
        try:
            await cur.execute(query_sql, param)
            await self.conn.commit()
        except Exception as e:
            # Undo the partial transaction, then report the actual error.
            await self.conn.rollback()
            print('执行插入命令出错', e)
        finally:
            # Close the cursor and return the connection to the pool.
            if cur:
                await cur.close()
            await self.pool.release(self.conn)
async def createMysqlObj(db):
    """Build a Pmysql instance with an initialised connection pool.

    :param db: database (schema) name to connect to
    :return: ready-to-use Pmysql object
    """
    client = Pmysql(database=db)
    client.pool = await client.initpool()
    return client
async def query(sql, databases):
    """Module-level convenience wrapper: run a SELECT against *databases*."""
    db_client = await createMysqlObj(databases)
    return await db_client.query(sql)
async def insert(sql, databases):
    """Module-level convenience wrapper: execute a write statement.

    Bug fix: the original delegated to ``mysqlobj.query`` — writes went
    through the read path and were never committed. Route through
    ``Pmysql.insert`` instead.
    """
    mysqlobj = await createMysqlObj(databases)
    datas = await mysqlobj.insert(sql)
    return datas
if __name__ == '__main__':
    demo_sql = "select * from tb_job"
    # Spyder / Jupyter already run a loop; there, use:
    #   res_df = pd.DataFrame(list(await query(demo_sql)))
    # Python 3.7+: asyncio.run drives the coroutine to completion.
    rows = asyncio.run(query(demo_sql, 'bigdata_final'))  # SQL statement, database name
    print(rows)
    # Python 3.4+ equivalent:
    # loop = asyncio.get_event_loop()
    # datas = loop.run_until_complete(query(demo_sql, 'bigdata_final'))
    # print(datas)