The main.py file
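This script copies yesterday's not-yet-uploaded rows from several source SQL Server databases into one target database, tagging each row with its source branch, and runs once a day on a schedule.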
import schedule
import time
import pyodbc
import pandas as pd
from datetime import datetime, timedelta
import warnings
import logging
import json
import os
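# Load connection settings from db_config.json located next to this script;
# utf-8-sig tolerates the BOM that Windows editors often prepend.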
def load_db_config():
    base_dir = os.path.dirname(os.path.abspath(__file__))
    config_file_path = os.path.join(base_dir, 'db_config.json')
    with open(config_file_path, 'r', encoding='utf-8-sig') as file:
        config = json.load(file)
    return config
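# Build a pyodbc connection for one database entry from the config
# (requires the "ODBC Driver 17 for SQL Server" ODBC driver to be installed).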
def connect_to_db(config):
    connection_string = (
        f"DRIVER={{ODBC Driver 17 for SQL Server}};"
        f"SERVER={config['server']};DATABASE={config['database']};"
        f"UID={config['username']};PWD={config['password']}"
    )
    return pyodbc.connect(connection_string)
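# Stamp every row with the source branch id so the target can tell which
# database the data came from.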
def process_data(df, branch_id, fendian_field):
    df[fendian_field] = branch_id
    return df
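# Pull yesterday's rows that have not been uploaded yet (upload != 5 or NULL).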
def get_yesterday_data(connection, table, date_field):
    # Use a half-open interval [yesterday 00:00, today 00:00) so timestamps in
    # the final second of the day are not silently dropped.
    today_start = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    yesterday_start = today_start - timedelta(days=1)
    query = (f"SELECT * FROM {table} WHERE {date_field} >= ? AND {date_field} < ? "
             f"AND (upload != 5 OR upload IS NULL)")
    warnings.filterwarnings('ignore', category=UserWarning,
                            message="pandas only supports SQLAlchemy connectable")
    return pd.read_sql(query, connection, params=[yesterday_start, today_start])
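# List the target table's column names so the insert only uses columns that
# exist on both sides.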
def get_target_columns(connection, table):
    cursor = connection.cursor()
    cursor.execute(f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{table}'")
    columns = [row.COLUMN_NAME for row in cursor.fetchall()]
    cursor.close()
    return columns
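# Normalize pandas values for pyodbc: NaN/NaT become None and Timestamps
# become plain datetime objects; everything else passes through unchanged.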
def convert_data_types(row):
    new_row = []
    for value in row:
        if pd.isnull(value):
            new_row.append(None)
        elif isinstance(value, pd.Timestamp):
            new_row.append(value.to_pydatetime())
        else:
            new_row.append(value)
    return tuple(new_row)
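# Insert the DataFrame into the target table in batches of batch_size; after
# each committed batch, mark the copied rows as uploaded on both sides.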
def batch_insert_data_to_target(source_connection, target_connection, table, unique_field, df, db_config,
                                batch_size=1000):
    print(f"{time.ctime()} —— {db_config['database_name']}: table {table}: batch insert in progress...")
    source_cursor = source_connection.cursor()
    target_cursor = target_connection.cursor()
    # Only insert columns that exist in both the DataFrame and the target table.
    target_columns = get_target_columns(target_connection, table)
    df_columns = df.columns.tolist()
    common_columns = [col for col in df_columns if col in target_columns]
    columns = ', '.join(common_columns)
    placeholders = ', '.join(['?' for _ in common_columns])
    insert_query = f"INSERT INTO {table} ({columns}) VALUES ({placeholders})"
    success_count = 0
    failure_count = 0
    error = ''
    for start in range(0, len(df), batch_size):
        batch_df = df.iloc[start:start + batch_size]
        data_to_insert = [convert_data_types(row[common_columns]) for _, row in batch_df.iterrows()]
        try:
            target_cursor.executemany(insert_query, data_to_insert)
            target_connection.commit()
            # Flag the copied rows as uploaded (upload = 5) in the source...
            unique_values = batch_df[unique_field].tolist()
            update_source_query = f"UPDATE {table} SET upload = 5 WHERE {unique_field} IN ({', '.join(['?' for _ in unique_values])})"
            source_cursor.execute(update_source_query, unique_values)
            source_connection.commit()
            # ...and in the target, so neither side picks them up again.
            update_target_query = f"UPDATE {table} SET upload = 5 WHERE {unique_field} IN ({', '.join(['?' for _ in unique_values])})"
            target_cursor.execute(update_target_query, unique_values)
            target_connection.commit()
            success_count += len(data_to_insert)
        except Exception as e:
            failure_count += len(data_to_insert)
            error = str(e)
            target_connection.rollback()
            source_connection.rollback()
    source_cursor.close()
    target_cursor.close()
    message = (f"{time.ctime()} —— {db_config['database_name']}: table {table}: "
               f"{success_count} rows inserted, {failure_count} rows failed")
    if failure_count > 0:
        message += f", failure reason: {error}"
    print(message)
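# One scheduled run: connect to the target, then walk every source database
# and every configured table, copying yesterday's pending rows.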
def main_task():
    db_config_list = load_db_config()
    source_databases = db_config_list['source_databases']
    target_database = db_config_list['target_database']
    tables = db_config_list['tables']
    target_conn = None
    try:
        target_conn = connect_to_db(target_database)
        print(f"{time.ctime()} —— {target_database['database_name']}: database connected!")
        for db_config in source_databases:
            source_conn = None
            try:
                source_conn = connect_to_db(db_config)
                print(f"{time.ctime()} —— {db_config['database_name']}: database connected!")
                for table, fields in tables.items():
                    unique_field = fields['unique_field']
                    date_field = fields['date_field']
                    fendian_field = fields['fendian_field']
                    try:
                        df = get_yesterday_data(source_conn, table, date_field)
                        if not df.empty:
                            processed_df = process_data(df, db_config['branch_id'], fendian_field)
                            batch_insert_data_to_target(source_conn, target_conn, table, unique_field,
                                                        processed_df, db_config)
                        else:
                            print(f"{time.ctime()} —— {db_config['database_name']}: table {table}: no pending data from yesterday!")
                    except Exception as e:
                        print(f"{time.ctime()} —— {db_config['database_name']}: table {table}: failed to process data! error: {e}")
            except Exception as e:
                logging.error(f"{time.ctime()} —— {db_config['database_name']}: database connection failed! error: {e}")
            finally:
                try:
                    if source_conn is not None:
                        source_conn.close()
                        print(f"{time.ctime()} —— closed database connection {db_config['database_name']}")
                except Exception as e:
                    logging.error(f"{time.ctime()} —— error while closing connection {db_config['database_name']}: {e}")
    except Exception as e:
        logging.error(f"{time.ctime()} —— {target_database['database_name']}: database connection failed! error: {e}")
    finally:
        try:
            if target_conn is not None:
                target_conn.close()
                print(f"{time.ctime()} —— closed database connection {target_database['database_name']}")
        except Exception as e:
            logging.error(f"{time.ctime()} —— error while closing connection {target_database['database_name']}: {e}")
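# Register the daily job and poll for due jobs roughly every 40 seconds.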
db_config = load_db_config()
schedule_time = db_config.get('schedule_time', '03:00')
schedule.every().day.at(schedule_time).do(main_task)
print(f"The program is running and will fetch data daily at {schedule_time}!")
while True:
    schedule.run_pending()
    time.sleep(40)
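Run the script directly (python main.py): it registers main_task with schedule and then blocks in the polling loop until the process is stopped. Because the scheduling code sits at module level, the easiest one-off test is to call the task yourself, e.g. (a sketch, not part of the original file):

main_task()  # fire one sync immediately instead of waiting for schedule_time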
The db_config.json configuration file
{
    "schedule_time": "03:00",
    "source_databases": [
        {
            "database_name": "display name (custom)",
            "server": "host name",
            "database": "database name",
            "username": "username",
            "password": "password",
            "branch_id": "branch identifier (custom)"
        }
    ],
    "target_database": {
        "database_name": "display name (custom)",
        "server": "host name",
        "database": "database name",
        "username": "username",
        "password": "password"
    },
    "tables": {
        "table name": {
            "unique_field": "unique key field",
            "date_field": "date field",
            "fendian_field": "branch id field (records which source database the row came from)"
        }
    }
}
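The keys here must match what main.py reads: source_databases is a list with one entry per branch, target_database is a single object, and tables maps each table name to its unique-key, date, and branch-id fields. A minimal sanity check you could run before the first scheduled sync is sketched below; the check_config helper is hypothetical and not part of the original code.

import json

def check_config(cfg):
    # Connection details required by connect_to_db() in main.py.
    conn_keys = {"server", "database", "username", "password"}
    for src in cfg["source_databases"]:
        missing = (conn_keys | {"branch_id"}) - set(src)
        assert not missing, f"source {src.get('database_name')}: missing {missing}"
    assert not conn_keys - set(cfg["target_database"]), "target: incomplete connection info"
    for table, fields in cfg["tables"].items():
        for key in ("unique_field", "date_field", "fendian_field"):
            assert key in fields, f"table {table}: missing {key}"

with open("db_config.json", encoding="utf-8-sig") as f:
    check_config(json.load(f))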