from urllib.parse import quote_plus
import pandas as pd
from sqlalchemy import create_engine, text
"""
原始表:
create table ori_event
(
id int auto_increment
primary key,
begin_date datetime null,
end_date datetime null,
cnt int null
);
结果表:
create table target_event
(
id int auto_increment
primary key,
begin_date datetime null,
end_date datetime null,
cnt int null
);
"""
db_username = "user1"
db_pwd = ""
db_host = ""
db_port = "3306"
db_name = "tmp_data"

def get_mysql_conn():
    """Build a SQLAlchemy engine for the MySQL database configured above."""
    mysql_conn = "mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4"
    # quote_plus escapes special characters (e.g. '@', '#') in the password
    mysql_conn_pro = mysql_conn.format(db_username, quote_plus(db_pwd), db_host, db_port, db_name)
    return create_engine(mysql_conn_pro, pool_pre_ping=True)
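

# A minimal connectivity check, handy before running the merge (a sketch; assumes
# the credentials above have been filled in):
#
#   with get_mysql_conn().connect() as c:
#       print(c.execute(text("select 1")).scalar())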

def ori_batch_query(batch_size=10) -> pd.DataFrame:
    """
    Query the source rows that still need time-range merging, one batch at a time.
    :param batch_size: number of rows per batch
    :return: the next batch as a DataFrame (empty when nothing is left)
    """
    engine = get_mysql_conn()
    # coalesce(...) keeps the subquery usable while target_event is still empty;
    # resuming from id > max(id) works because merge() writes the source ids back
    sql = ("select * from ori_event "
           "where begin_date is not null and end_date is not null "
           "and id > (select coalesce(max(id), 0) from target_event) "
           "order by id limit {}").format(int(batch_size))
    result = pd.read_sql(sql, engine)
    # Normalize to pandas datetimes so the interval comparisons in merge() work
    result["begin_date"] = pd.to_datetime(result["begin_date"])
    result["end_date"] = pd.to_datetime(result["end_date"])
    return result
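

# Usage sketch (assumes a populated ori_event):
#
#   batch = ori_batch_query(batch_size=100)
#   while len(batch) > 0:
#       ...  # merge the batch
#       batch = ori_batch_query(batch_size=100)
#
# Resumption works because merge() writes the source ids back into target_event,
# so id > max(id) always points just past the last processed row.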

def load_latest_target() -> pd.DataFrame:
    """
    Load the already-merged rows from the result table.
    :return: current contents of target_event as a DataFrame
    """
    # TODO replace the placeholder 1 = 1 with a real filter so that only rows that
    #  can actually take part in the merge are loaded (e.g. only today's data);
    #  otherwise memory usage grows with the whole result table.
    sql = ("select * from target_event where 1 = 1 "
           "and begin_date is not null and end_date is not null order by id")
    engine = get_mysql_conn()
    result = pd.read_sql(sql, engine)
    result["begin_date"] = pd.to_datetime(result["begin_date"])
    result["end_date"] = pd.to_datetime(result["end_date"])
    return result
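

# If the result table grows large before the TODO above is addressed, pd.read_sql
# can also stream it in chunks (a sketch; the chunk size is arbitrary):
#
#   for chunk in pd.read_sql(sql, engine, chunksize=10_000):
#       ...  # process each partial DataFrame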

def del_target_by_ids(del_ids):
    """Delete the target rows that were absorbed into a merged interval."""
    if del_ids is None or len(del_ids) == 0:
        return
    engine = get_mysql_conn()
    # ids come from our own result table, so casting to int keeps this injection-safe
    sql = "delete from target_event where id in ({})".format(
        ",".join(str(int(i)) for i in del_ids))
    with engine.begin() as conn:  # begin() opens a transaction and commits on success
        conn.execute(text(sql))
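

# An equivalent parameterised form (a sketch; needs `from sqlalchemy import
# bindparam`) that avoids building the IN list by hand:
#
#   stmt = text("delete from target_event where id in :ids").bindparams(
#       bindparam("ids", expanding=True))
#   with engine.begin() as conn:
#       conn.execute(stmt, {"ids": [int(i) for i in del_ids]})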

def save_target(result):
    """Insert the freshly merged rows into target_event."""
    if result is None or len(result) == 0:
        return
    engine = get_mysql_conn()
    # Bound parameters let the driver handle quoting of the datetime values
    sql = text("insert into target_event (id, begin_date, end_date, cnt) "
               "values (:id, :begin_date, :end_date, :cnt)")
    with engine.begin() as conn:
        for _, row in result.iterrows():
            conn.execute(sql, {"id": int(row["id"]),
                               "begin_date": row["begin_date"],
                               "end_date": row["end_date"],
                               "cnt": int(row["cnt"])})
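

# A faster variant (a sketch, same semantics): pass all rows at once and let
# SQLAlchemy issue an executemany instead of one round trip per row:
#
#   with engine.begin() as conn:
#       conn.execute(sql, result[["id", "begin_date", "end_date", "cnt"]]
#                         .to_dict(orient="records"))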

def merge(event: pd.DataFrame, latest: pd.DataFrame):
    """Merge each source row into the result set, coalescing overlapping time ranges."""
    if event is None or len(event) == 0:
        return
    del_ids = []
    for _, row in event.iterrows():
        print("Processing the following row:")
        print("----------------------------")
        print(row.to_frame().T)
        print("----------------------------")
        # A row overlaps an existing interval unless it ends before that interval
        # starts or begins after it ends
        less_than_min = row["end_date"] < latest["begin_date"]
        more_than_max = row["begin_date"] > latest["end_date"]
        match_data = latest[~(less_than_min | more_than_max)]
        if len(match_data) > 0:
            # Combine the matched result rows and the current row into one new row
            print("Action: merging, rows =", len(match_data))
            tmp = pd.concat([match_data, row.to_frame().T], ignore_index=True)
            new_min = tmp["begin_date"].min()
            new_max = tmp["end_date"].max()
            # TODO adjust this if plain summing is not the desired merge logic
            new_cnt = tmp["cnt"].sum()
            tmp_data = [
                [row["id"], new_min, new_max, new_cnt]
            ]
            tmp_df = pd.DataFrame(data=tmp_data, columns=latest.columns)
            # Drop the matched rows from the result set and replace them with the merged row
            del_ids.extend(match_data["id"].values)
            latest = latest.drop(labels=match_data.index)
            latest = pd.concat([latest, tmp_df], ignore_index=True)
        else:
            # ignore_index=True keeps the index unique, which the drop() above relies on
            latest = pd.concat([latest, row.to_frame().T], ignore_index=True)
            print("Action: no overlap, appended to the result set as-is")
        print("Row processed; current result set:\n", latest)
    # Remove the rows that were merged away ...
    del_target_by_ids(del_ids)
    # ... and insert the rows produced by this batch
    save_target(latest[latest["id"].isin(event["id"])])
    return latest
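

# A self-contained sketch (hypothetical helper, not part of the pipeline and never
# called by it): shows on toy data how the vectorised overlap test in merge()
# classifies intervals.
def _overlap_demo():
    latest = pd.DataFrame({
        "id": [1, 2],
        "begin_date": pd.to_datetime(["2024-01-01 09:00", "2024-01-01 12:00"]),
        "end_date": pd.to_datetime(["2024-01-01 10:30", "2024-01-01 13:00"]),
        "cnt": [1, 1],
    })
    row = pd.Series({"id": 3,
                     "begin_date": pd.Timestamp("2024-01-01 10:00"),
                     "end_date": pd.Timestamp("2024-01-01 11:00"),
                     "cnt": 1})
    less_than_min = row["end_date"] < latest["begin_date"]  # [False, True]
    more_than_max = row["begin_date"] > latest["end_date"]  # [False, False]
    # Only the 09:00-10:30 interval overlaps [10:00, 11:00]
    print(latest[~(less_than_min | more_than_max)])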

if __name__ == '__main__':
    # Pull source rows in small batches and fold each batch into the result set
    d = ori_batch_query(2)
    while len(d) > 0:
        r = load_latest_target()
        merge(d, r)
        d = ori_batch_query(2)