任务执行器设计图
Scheduler + Executor + Data module
Scheduler 调度器
调度器可用当前流行的Azkaban, 也可以写一个定时启动的脚本。
创建task, 设置调用时间间隔周期period
Executor 执行器
1. 获取一条数:
获取status=0 的数据, status设为1(查询该数据时并设置),设置operation_time为当前时间
或者
获取status=1 and (present_time当前时间 - operation_time历史操作时间按) > period 周期 and end_time 为空 的数据
2. 执行用户自定义函数
执行用户自定义函数
3. 返回自定义函数结果和数据是否使用成功的状态
如果使用成功status设为2,如果使用失败status设为0.
Data module 数据模块
-
原数据表source_data_table,必须包含ID,CATEGORY字段,ID作为每条字段的唯一识别符,CATEGORY字段不做强要求,如果需要再一个表中存多种类型的数据,需要该字段,其他字段自定义
-
状态表status_table, 复制原数据表source_data_table中的数据ID, 记录每条数据的使用情况
status的取值:0未使用,1使用中,2使用成功
数据库表结构设计(PostgreSQL)
1. source_data_table 原始数据表设计
# PG中需实现id的自增长
CREATE SEQUENCE IF NOT EXISTS upms_area_id_seq START 101;
CREATE TABLE SOURCE_DATA_TABLE(
ID INT NOT NULL DEFAULT nextval('upms_area_id_seq'::regclass),
CATEGORY VARCHAR(50),
...
);
2 status_table 状态表设计
CREATE TABLE STATUS_TABLE(
ID INT NOT NULL,
STATUS INT NOT NULL,
START_TIME INT NOT NULL,
END_TIME INT,
TASK_NAME VARCHAR(50) NOT NULL,
unique(ID,TASK_NAME)
);
代码:task_executor.py
import time
import psycopg2
# from configuration import (PG_DATABASE, PG_HOST, PG_PASSWORD, PG_PORT, PG_USER)
INSERT_LIMIT_NUM = 60000
class Case(object):
def source_data_table_case(self):
"""
源数据表中必须包含ID字段,ID作为每条字段的唯一识别符。PG中需实现id的自增长
CATEGORY字段不做强要求,如果需要再一个表中存多种类型的数据,需要该字段
:return:
"""
text = """
# PG中需实现id的自增长
CREATE SEQUENCE IF NOT EXISTS upms_area_id_seq START 101;
1)
CREATE TABLE SOURCE_DATA_TABLE(
ID INT NOT NULL DEFAULT nextval('upms_area_id_seq'::regclass),
CATEGORY VARCHAR(50) NOT NULL,
...
);
2)
CREATE TABLE SOURCE_DATA_TABLE(
ID INT NOT NULL DEFAULT nextval('upms_area_id_seq'::regclass),
CATEGORY VARCHAR(50),
PROVINCE VARCHAR(50) NOT NULL,
CITY VARCHAR(50),
DISTRICT VARCHAR(50)
);
"""
print(text)
def status_table_case(self):
"""
状态表表结构统一如下,只需更改表名即可。
:return:
"""
text = """
CREATE TABLE STATUS_TABLE_CASE(
ID INT NOT NULL,
STATUS INT NOT NULL,
START_TIME INT NOT NULL,
END_TIME INT,
TASK_NAME VARCHAR(50) NOT NULL,
unique(ID,TASK_NAME)
);
"""
print(text)
def use_case(self):
def udf(data):
print(data)
return data[0][0], 1
pg_database = "db"
pg_user = "user"
pg_password = "pd"
pg_host = "127.0.0.1"
pg_port = 5432
task_name = "test6"
source_data_table = "source_data_table"
status_table = "status_table_case"
period = 60 * 10
cycle_index = 10
order_by_field_list = ["PROVINCE"] # []
output_fields = ["ID", "PROVINCE"]
category = ""
run_task(udf, pg_database, pg_user, pg_password, pg_host, pg_port, task_name, source_data_table,
status_table, period, cycle_index, category, order_by_field_list, output_fields)
class PGInitialize(object):
def __init__(self, pg_database, pg_user, pg_password, pg_host, pg_port):
self.conn = psycopg2.connect(database=pg_database, user=pg_user, password=pg_password,
host=pg_host, port=pg_port)
self.cursor = self.conn.cursor()
def select(self, sql):
self.cursor.execute(sql)
return self.cursor.fetchall()
def update_returning(self, sql):
self.cursor.execute(sql)
self.conn.commit()
return self.cursor.fetchall()
def update(self, sql):
self.cursor.execute(sql)
self.conn.commit()
def insert(self, sql):
self.cursor.execute(sql)
self.conn.commit()
class TaskExecutor(object):
def __init__(self, task_name, source_data_table, status_table, pg):
"""
:param task_name: task
:param source_data_table:
:param status_table:
:param pg: postgresql db
"""
self.task_name = task_name
self.source_data_table = source_data_table
self.status_table = status_table
self.pg = pg
self.category = None
self.order_by_field_list = None
self.output_fields = None
def set_sort_fields(self, fields):
self.order_by_field_list = fields
def set_output_fields(self, fields):
if fields:
if not any([True for f in fields if f.lower() == "id"]):
raise ValueError("Missing ID field")
self.output_fields = fields
def set_category(self, category):
self.category = category
def get_unprocessed_data(self, period, limit_num=1):
"""
:param limit_num:
:param output_fields:
:param period: 时间周期,单位:秒
:return:
"""
current_time = time.time()
time_interval = current_time - period
output_fields = ','.join([f"{f}" for f in self.output_fields]) if self.output_fields else f"*"
order_by_text = "ORDER BY id" if self.order_by_field_list else ""
sql = f"UPDATE {self.status_table} t1 SET status=1, start_time={current_time} " \
f"from (SELECT * FROM {self.status_table} " \
f"WHERE task_name='{self.task_name}' AND status=0 OR (status=1 and {time_interval} > start_time " \
f"AND end_time=0) " \
f"{order_by_text} LIMIT {limit_num}) t2 " \
f"WHERE t1.id=t2.id and t1.task_name=t2.task_name " \
f"RETURNING t1.id"
data = self.pg.update_returning(sql)
if data:
return self.pg.select(f"select {output_fields} from {self.source_data_table} where id = {data[0][0]}")
def update_status(self, data_id):
"""
:param data_id: data id in table
:return:
"""
sql = f"UPDATE {self.status_table} SET status=2, end_time={time.time()} " \
f"WHERE id={data_id} AND task_name='{self.task_name}'"
self.pg.update(sql)
def update_basic_data(self):
start_time = time.time()
category = f"WHERE category= '{self.category}'" if self.category else ''
select_mun_sql = f"select count(1) from {self.source_data_table}"
order_by_text = ("ORDER BY " + ', '.join(self.order_by_field_list)) if self.order_by_field_list else ""
data_sum = self.pg.select(select_mun_sql)[0][0]
for i in range(int(data_sum / INSERT_LIMIT_NUM) + 1):
select_sql = f"select id from {self.source_data_table} {category} " \
f"{order_by_text} LIMIT {INSERT_LIMIT_NUM} OFFSET {i}"
print(select_sql)
data_ids = self.pg.select(select_sql)
if data_ids:
values_list = [(index[0], 0, start_time, 0, self.task_name) for index in data_ids]
values_text = str(values_list)[1:-1]
insert_sql = f"INSERT INTO {self.status_table} (ID, STATUS, START_TIME, END_TIME, TASK_NAME) VALUES " \
f"{values_text} ON CONFLICT ON CONSTRAINT {self.status_table}_id_task_name_key DO NOTHING"
self.pg.insert(insert_sql)
def run_job(self, user_defined_functions, period, cycle_index):
"""
:param user_defined_functions:
:param period:
:param cycle_index:
:return:
"""
for i in range(cycle_index):
data = self.get_unprocessed_data(period)
if data:
data_id, status = user_defined_functions(data)
if status:
self.update_status(data_id)
else:
break
def run_task(udf, pg_database, pg_user, pg_password, pg_host, pg_port, task_name, source_data_table,
status_table, period, cycle_index, category, order_by_field_list=[], output_fields=[]):
"""
:param output_fields: Output source_data_table field to User-defined functions
:param category:
:param udf: User-defined functions
:param pg_database: postgresql info
:param pg_user:
:param pg_password:
:param pg_host:
:param pg_port:
:param task_name: task name
:param source_data_table: source data table name
:param status_table: status table name
:param order_by_field_list: order by field list like: ["PROVINCE"]
:param period: time period, Unit s
:param cycle_index: cycle index
:return:
"""
PG = PGInitialize(pg_database, pg_user, pg_password, pg_host, pg_port)
task_executor = TaskExecutor(task_name, source_data_table, status_table, PG)
task_executor.set_sort_fields(order_by_field_list)
task_executor.set_output_fields(output_fields)
task_executor.set_category(category)
task_executor.update_basic_data()
task_executor.run_job(udf, period, cycle_index)
Instructions 字段描述
source_data_table 中CATEGORY字段:原始数据表中可以存多种类型的数据,使用时指定CATEGORY进行筛选。
如对全国城市打点,CATEGORY记录该点所在的省市。使用时可通过该字段单独筛选数据使用。
接上,程序中可先查询CATEGORY的去重后的所有值,for 循环使用
set_sort_fields
数据库中排序的字段,可按该排序顺序输出使用数据,如set_sort_fields为[“city”], 任务中数据将按城市排序后输出。
set_output_fields
字段集为原始数据库中的字段,这里是为指定用户自定义函数的内部输入字段,必须包含id字段。
如原始数据表中太多字段,而用户自定义函数中不需要那么多字段,可选择性输入。
set_category
筛选原始数据表中CATEGORY=“指定category”的数据。
get_unprocessed_data
获取未处理的一条数据
update_status
更新某条数据的状态
update_basic_data
根据原始数据表中的数据生成数据状态表
run_job
核心方法,包含 get_unprocessed_data(),user_defined_functions用户自定义函数,update_status() 函数
run_task 启动任务