假设关系型数据库中存在一张记录资金流向的表:
假设字段为:客户号 party_num、下游客户号 next_party_num、客户性别 gender、客户日均金额 avg_amount、客户逾期情况 overdue。
1、模拟生成一万条该表数据:
import pandas as pd
import random
import numpy as np
# Seed both the stdlib and NumPy random generators so the output is reproducible
random.seed(42)
np.random.seed(42)
# Generate a random customer number (assumed to be a 6-digit integer)
def generate_party_num():
    """Return a random integer customer number in [100000, 999999]."""
    return random.randint(100000, 999999)
# Generate a random customer gender
def generate_gender():
    """Return 'M' or 'F', chosen uniformly at random."""
    return random.choice(['M', 'F'])
# Generate a random daily-average amount (assumed to lie between 100 and 50000)
def generate_avg_amount():
    """Return a random amount drawn from [100, 50000], rounded to 2 decimals."""
    return round(random.uniform(100, 50000), 2)
# Generate a random overdue flag
def generate_overdue():
    """Return 0 (not overdue) or 1 (overdue), chosen uniformly at random."""
    return random.choice([0, 1])
# Build the customer fund-flow table
def generate_data(num_records):
    """Return a DataFrame holding ``num_records`` synthetic fund-flow rows.

    The columns are ``party_num``, ``next_party_num``, ``gender``,
    ``avg_amount`` and ``overdue``. Each column is generated in full before
    the next one starts, so the sequence of random draws (and therefore the
    output for a given seed) depends on this column order.
    """
    data = {
        'party_num': [generate_party_num() for _ in range(num_records)],       # customer number
        'next_party_num': [generate_party_num() for _ in range(num_records)],  # downstream customer number
        'gender': [generate_gender() for _ in range(num_records)],             # gender
        'avg_amount': [generate_avg_amount() for _ in range(num_records)],     # daily-average amount
        'overdue': [generate_overdue() for _ in range(num_records)],           # overdue flag
    }
    return pd.DataFrame(data)
# Generate 10,000 records
num_records = 10000
df = generate_data(num_records)
# Save the table as a CSV file (index=False leaves out the DataFrame index column)
df.to_csv('financial_flow_data.csv', index=False)
print(f"生成了 {num_records} 条资金流向数据,并保存为 financial_flow_data.csv 文件")
2、Python 脚本:连接 Neo4j 并导入数据
from neo4j import GraphDatabase
import pandas as pd
# Wrapper class for a connection to the Neo4j database
class Neo4jConnection:
    """Hold a Neo4j driver and run Cypher queries through short-lived sessions."""

    def __init__(self, uri, user, pwd):
        """Create the driver for ``uri`` with ``(user, pwd)`` authentication.

        A failure while creating the driver is printed rather than raised;
        the driver then stays ``None`` and ``query`` raises ``AssertionError``.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        """Close the driver if one was created; otherwise do nothing."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None):
        """Run ``query`` with ``parameters`` and return its records as a list.

        Returns ``None`` when the query fails; the error is printed, not raised.

        Raises:
            AssertionError: if the driver was never initialised.
        """
        # Raised explicitly (instead of using ``assert``) so the guard still
        # works when Python runs with -O, while keeping the exception type.
        if self.__driver is None:
            raise AssertionError("Driver not initialized!")
        response = None
        try:
            with self.__driver.session() as session:
                # Materialise the records while the session is still open: a
                # result cannot be read any more once its session is closed.
                response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        return response
# Import the CSV data and insert it into the Neo4j database
def import_data_to_neo4j(file_path, neo4j_conn):
    """Load the fund-flow CSV at ``file_path`` and write it to Neo4j.

    For every row, one ``neo4j_conn.query`` call merges the ``Customer`` node
    for ``party_num``, stores that customer's attributes on it, merges the
    ``Customer`` node for ``next_party_num`` and merges a ``FLOWS_TO``
    relationship between the two.

    Args:
        file_path: path of a CSV with the columns ``party_num``,
            ``next_party_num``, ``gender``, ``avg_amount`` and ``overdue``.
        neo4j_conn: object exposing ``query(query, parameters=...)``.
    """
    df = pd.read_csv(file_path)

    # gender / avg_amount / overdue in a row describe ``party_num`` only, so
    # they are written to p1 and never copied onto the downstream node p2.
    # Plain SET (rather than ON CREATE SET) is needed because p1 may already
    # exist as the attribute-less downstream node of an earlier row; when a
    # customer occurs in several rows the last row's values are kept.
    query = """
    MERGE (p1:Customer {party_num: $party_num})
    SET p1.gender = $gender, p1.avg_amount = $avg_amount, p1.overdue = $overdue
    MERGE (p2:Customer {party_num: $next_party_num})
    MERGE (p1)-[:FLOWS_TO]->(p2)
    """

    for row in df.to_dict("records"):
        # Cast the numeric fields to native Python types before sending them
        neo4j_conn.query(query, parameters={
            "party_num": int(row['party_num']),
            "next_party_num": int(row['next_party_num']),
            "gender": row['gender'],
            "avg_amount": float(row['avg_amount']),
            "overdue": int(row['overdue']),
        })
    print("数据导入完成")
# Entry point
def main():
    """Import ``financial_flow_data.csv`` into the local Neo4j instance."""
    # Neo4j connection parameters
    uri = "bolt://localhost:7687"  # Bolt URL of the Neo4j server
    user = "neo4j"                 # Neo4j user name
    pwd = "your_password"          # Neo4j password

    neo4j_conn = Neo4jConnection(uri=uri, user=user, pwd=pwd)

    # Path of the CSV file to import
    file_path = "financial_flow_data.csv"

    try:
        import_data_to_neo4j(file_path, neo4j_conn)
    finally:
        # Close the connection even when the import raises
        neo4j_conn.close()


if __name__ == "__main__":
    main()
3、从 Neo4j 中提取客户(节点)及其上下游关系(边),并将性别、日均金额作为节点的特征属性,同时将逾期情况作为标签。
以下是从 Neo4j 中提取数据并构建 NetworkX 图的代码:
from py2neo import Graph
import networkx as nx
from node2vec import Node2Vec
import numpy as np
import pandas as pd
# Connect to the Neo4j database
graph_db = Graph("bolt://localhost:7687", auth=("neo4j", "your_password"))

# Query every fund-flow relationship together with the attributes of BOTH
# endpoints; a customer that only ever appears as a downstream node would
# otherwise enter the graph without any attributes.
query = """
MATCH (p1:Customer)-[:FLOWS_TO]->(p2:Customer)
RETURN p1.party_num AS party_num, p2.party_num AS next_party_num,
       p1.gender AS gender, p1.avg_amount AS avg_amount, p1.overdue AS overdue,
       p2.gender AS next_gender, p2.avg_amount AS next_avg_amount, p2.overdue AS next_overdue
"""
data = graph_db.run(query).data()

# Build the NetworkX graph
G = nx.Graph()

# Add both endpoints (with their attributes) and the edge between them
for row in data:
    G.add_node(row['party_num'], gender=row['gender'],
               avg_amount=row['avg_amount'], overdue=row['overdue'])
    G.add_node(row['next_party_num'], gender=row['next_gender'],
               avg_amount=row['next_avg_amount'], overdue=row['next_overdue'])
    G.add_edge(row['party_num'], row['next_party_num'])

print(f"图中共有 {len(G.nodes)} 个节点和 {len(G.edges)} 条边")

# Extract the node attributes (gender, daily-average amount, overdue flag).
# The rows stay in G.nodes order and every node gets a row, so these arrays
# line up with anything else computed per node. A missing attribute becomes
# NaN in the features and -1 in the labels instead of a made-up value.
node_features = []
labels = []
for node, attr in G.nodes(data=True):
    raw_gender = attr.get('gender')
    raw_amount = attr.get('avg_amount')
    raw_overdue = attr.get('overdue')

    if raw_gender is None:
        gender = np.nan
    else:
        gender = 1 if raw_gender == 'M' else 0  # male -> 1, female -> 0
    avg_amount = np.nan if raw_amount is None else raw_amount
    overdue = -1 if raw_overdue is None else int(raw_overdue)

    # Node features
    node_features.append([gender, avg_amount])
    # Overdue label
    labels.append(overdue)

node_features = np.array(node_features, dtype=float).reshape(-1, 2)
labels = np.array(labels, dtype=int)
4、使用 Node2Vec 生成节点嵌入
使用 node2vec 库来生成节点的嵌入特征向量。Node2Vec 是一种基于随机游走的图嵌入算法,它能捕捉节点的结构特性并生成特征向量。
# Generate embeddings with Node2Vec
node2vec = Node2Vec(
    G,
    dimensions=64,
    walk_length=30,
    num_walks=200,
    workers=4,
)
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# Collect one embedding vector per node, in G.nodes order; the vectors are
# looked up by the string form of the node id
node_embeddings = np.array([model.wv[key] for key in map(str, G.nodes)])
print(f"生成的嵌入向量形状为: {node_embeddings.shape}")
5、合并节点特征与嵌入向量
将 Node2Vec 生成的嵌入向量与原始的节点特征(性别和日均金额)合并,形成最终的节点特征矩阵,之后可以将其用于机器学习任务,如分类或回归模型。
# Put the node attribute columns to the right of the embedding columns
final_features = np.hstack([node_embeddings, node_features])

# Report the shape of the final feature matrix
print(f"最终的特征矩阵形状为: {final_features.shape}")

# Table for later machine-learning use: the feature columns plus a 'label' column
df = pd.DataFrame(final_features).assign(label=labels)

# Write the table to a CSV file without the index column
df.to_csv('customer_node_embeddings.csv', index=False)
print("嵌入和特征已保存为 'customer_node_embeddings.csv'")
6、 使用机器学习模型
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Train only on rows that have a usable label and complete features: a
# negative label is treated as "unknown", and the estimator rejects NaN input.
usable = (labels >= 0) & ~np.isnan(final_features).any(axis=1)

# Split the data set
X_train, X_test, y_train, y_test = train_test_split(
    final_features[usable], labels[usable], test_size=0.3, random_state=42
)

# Train the logistic-regression model. The iteration cap is raised above the
# default because the unscaled amount column sits next to small embedding
# values, which slows the solver's convergence.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))
- 节点嵌入:通过 Node2Vec 生成节点嵌入特征向量,捕捉客户在图中的结构信息。
- 特征组合:将节点的嵌入向量与节点的属性(性别、日均金额)合并,生成完整的特征矩阵。
- 标签:客户的逾期情况 overdue 被作为分类任务的目标值,1 表示逾期,0 表示未逾期。