在HarmonyOS智能编程助手项目中,我们开发了一个知识图谱模块,用于构建、检索和可视化HarmonyOS API文档的知识结构。本文将介绍这个模块的设计思路、核心功能和实现细节。
1. 知识图谱概述
知识图谱是一种结构化的知识表示方式,通过实体和关系来描述领域知识。在我们的项目中,知识图谱用于表示HarmonyOS API之间的各种关系,如调用关系、继承关系等,以及API与其方法、参数、返回值等元素之间的关系。
知识图谱模块包含以下核心组件:
- 知识图谱构建器:从API文档中提取实体和关系,构建知识图谱
- 知识图谱检索器:基于用户查询,检索相关API及其关联知识
- 知识图谱可视化器:将知识图谱以可视化方式展示
- Web应用:提供用户友好的界面,支持知识图谱的查询和展示
2. 知识图谱构建
2.1 实体和关系设计
知识图谱中的基本元素是实体和关系。我们定义了多种实体类型和关系类型:
# Entity types: entity type name -> human-readable description.
self.entity_types = {
    "API": "API接口或函数",
    "Parameter": "API参数",
    "ReturnValue": "API返回值",
    "DataType": "数据类型",
    "Module": "模块或包",
    "Class": "类",
    "Method": "方法",
    "Constant": "常量",
    "Feature": "功能特性",
}
# Relation types: relation type name -> human-readable description.
# NOTE(review): the rule-based extractor shown in this file emits
# 'HAS_METHOD', 'HAS_PARAMETER' and 'HAS_RETURN', none of which is a key
# of this table -- confirm which spelling is canonical.
self.relation_types = {
    "has_parameter": "API具有参数",
    "returns": "API返回值",
    "throws": "API抛出异常",
    "belongs_to": "属于某个模块/类",
    "depends_on": "依赖于",
    "calls": "调用",
    "implements": "实现",
    "extends": "继承",
    "has_feature": "具有特性",
    "has_constraint": "具有约束条件",
}
2.2 实体和关系的数据结构
实体和关系的数据结构如下:
# Entity class
class APIEntity:
    """A node of the API knowledge graph.

    The constructor stores its four arguments unchanged as attributes of
    the same names.
    """

    def __init__(self, entity_id: str, entity_type: str, name: str, properties: Dict[str, Any]):
        self.entity_id = entity_id  # entity ID
        self.entity_type = entity_type  # entity type
        self.name = name  # entity name
        self.properties = properties  # entity properties

    def to_dict(self) -> Dict[str, Any]:
        """Return the entity as a plain dict keyed by attribute name.

        ``save_knowledge_graph`` calls ``to_dict()`` on every entity before
        dumping to JSON, so the class has to provide it. ``properties`` is
        shallow-copied so that callers mutating the result do not alter
        the entity itself; a ``None`` value is exported as ``{}``.
        """
        return {
            "entity_id": self.entity_id,
            "entity_type": self.entity_type,
            "name": self.name,
            "properties": dict(self.properties or {}),
        }
# Relation class
class APIRelation:
    """A directed, typed edge between two entities of the knowledge graph.

    The constructor stores its four arguments unchanged as attributes of
    the same names.
    """

    def __init__(self, source_id: str, target_id: str, relation_type: str, properties: Dict[str, Any]):
        self.source_id = source_id  # source entity ID
        self.target_id = target_id  # target entity ID
        self.relation_type = relation_type  # relation type
        self.properties = properties  # relation properties

    def to_dict(self) -> Dict[str, Any]:
        """Return the relation as a plain dict keyed by attribute name.

        ``save_knowledge_graph`` calls ``to_dict()`` on every relation
        before dumping to JSON, so the class has to provide it.
        ``properties`` is shallow-copied so that callers mutating the
        result do not alter the relation itself; ``None`` is exported as
        ``{}``.
        """
        return {
            "source_id": self.source_id,
            "target_id": self.target_id,
            "relation_type": self.relation_type,
            "properties": dict(self.properties or {}),
        }
2.3 从API文档提取实体和关系
知识图谱构建的核心是从API文档中提取实体和关系。以下代码提取API实体及其方法、参数和返回值实体:函数返回提取到的实体列表,对应的关系则作为副作用直接追加到self.relations中:
def extract_entities_from_doc(self, doc: Dict[str, Any]) -> List[APIEntity]:
    """Extract the API entity and its section-derived entities from *doc*.

    The first returned entity is the API itself, built from the
    ``doc_id``, ``title`` and ``overview`` keys of *doc*. Every section
    whose title contains one of the rule keywords below yields one more
    entity; a title matching several rules yields one entity per rule.
    For each such entity a relation from the API to it is appended to
    ``self.relations`` (side effect); only the entities are returned.
    """
    api_id = doc.get('doc_id', '')
    found = [
        APIEntity(
            api_id,
            'API',
            doc.get('title', ''),
            {'overview': doc.get('overview', ''), 'type': 'API'},
        )
    ]

    # (title keywords, entity type, ID infix, relation type), checked in order.
    # NOTE(review): these upper-case relation names are not keys of the
    # relation_types table declared in this file -- confirm the intended spelling.
    rules = (
        (('方法', '函数'), 'Method', 'method', 'HAS_METHOD'),
        (('参数',), 'Parameter', 'param', 'HAS_PARAMETER'),
        (('返回值',), 'ReturnValue', 'return', 'HAS_RETURN'),
    )

    for section in doc.get('sections', []):
        heading = section.get('title', '')
        body = section.get('content', '')
        for keywords, kind, infix, relation_name in rules:
            if not any(word in heading for word in keywords):
                continue
            # The running entity count keeps IDs unique within one document.
            child_id = f"{api_id}_{infix}_{len(found)}"
            found.append(
                APIEntity(child_id, kind, heading, {'description': body, 'type': kind})
            )
            self.relations.append(APIRelation(api_id, child_id, relation_name, {}))

    return found
2.4 使用大语言模型增强知识提取
为了提高知识提取的质量,我们还使用了DeepSeek大语言模型来分析API文档,提取更丰富的实体和关系:
def _extract_entities_and_relations_with_llm(self, doc: Dict[str, Any]) -> Tuple[List[Dict], List[Dict]]:
    """Ask the LLM to extract entities and relations from *doc*.

    The prompt comes from ``self._build_extraction_prompt(doc)`` and is
    sent through ``self.ds_client.chat_completion``; errors raised by
    those two calls propagate to the caller. The reply is then reduced to
    its JSON part by ``self._extract_json_from_response`` and parsed.

    Returns:
        ``(entities, relations)``. Both are always lists: if the reply
        cannot be parsed, is not a JSON object, or a field is missing,
        ``null`` or not a list, the affected part is returned as ``[]``.
    """
    prompt = self._build_extraction_prompt(doc)
    response = self.ds_client.chat_completion([
        {"role": "system", "content": "你是一个专业的知识图谱构建助手,擅长从文本中提取实体和关系。"},
        {"role": "user", "content": prompt}
    ])

    # Best-effort parsing: a malformed reply must not abort graph building.
    try:
        json_str = self._extract_json_from_response(response)
        result = json.loads(json_str)
    except Exception as e:
        print(f"解析LLM响应时出错: {str(e)}")
        return [], []

    # json.loads may legitimately return a list, string, number or None.
    if not isinstance(result, dict):
        print(f"解析LLM响应时出错: 期望JSON对象,实际得到 {type(result).__name__}")
        return [], []

    entities = result.get("entities")
    relations = result.get("relations")
    # Guard against `"entities": null` or other non-list values so callers
    # can always iterate over the result.
    return (
        entities if isinstance(entities, list) else [],
        relations if isinstance(relations, list) else [],
    )
2.5 保存知识图谱
构建完成的知识图谱会保存为JSON格式,便于后续使用:
def save_knowledge_graph(self):
    """Write the knowledge graph to JSON files under ``KN_GRAPH_DATA_DIR``.

    Three files are written, in this order: ``entities.json``,
    ``relations.json`` and ``knowledge_graph.json``; the last one bundles
    the type tables together with the entities and relations.
    """
    def _dump(filename, payload):
        # UTF-8 with ensure_ascii=False keeps non-ASCII text readable on disk.
        target = os.path.join(KN_GRAPH_DATA_DIR, filename)
        with open(target, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    os.makedirs(KN_GRAPH_DATA_DIR, exist_ok=True)

    entities_data = [item.to_dict() for item in self.entities]
    _dump('entities.json', entities_data)

    relations_data = [item.to_dict() for item in self.relations]
    _dump('relations.json', relations_data)

    _dump('knowledge_graph.json', {
        "entity_types": self.entity_types,
        "relation_types": self.relation_types,
        "entities": entities_data,
        "relations": relations_data,
    })
3. 知识图谱检索
知识图谱检索器负责根据用户查询,检索相关API及其关联知识。
3.1 检索器初始化
def __init__(self):
    """Create the RAG retriever, load the graph data and index entities by ID."""
    self.rag_retriever = PureRAGRetriever()

    self.entities = self.load_entities()
    self.relations = self.load_relations()

    # entity_id -> entity record; a later duplicate ID replaces an earlier one.
    self.entity_index = {record['entity_id']: record for record in self.entities}
3.2 搜索API及关联知识
def search(self, query: str, top_k: int = 3) -> List[Dict[str, Any]]:
    """Retrieve APIs for *query* and attach knowledge related to each hit.

    Each hit returned by ``self.rag_retriever.retrieve`` becomes one
    result dict with two keys: ``'api'`` (the hit itself) and
    ``'related_knowledge'`` (one item per related Method or Parameter
    entity reported by ``self.get_related_entities`` for the hit's
    ``doc_id``).
    """
    # Value of the 'type' field for each entity type handled here.
    type_labels = {'Method': '方法', 'Parameter': '参数'}

    enriched_results = []
    for hit in self.rag_retriever.retrieve(query, top_k=top_k):
        neighbours = self.get_related_entities(hit['doc_id'])

        knowledge_items = []
        for neighbour in neighbours:
            entity = neighbour['entity']
            relation = neighbour['relation']
            # Not used by the two types handled below; the lookup is kept so
            # a missing key still raises -- presumably the elided handlers
            # for other entity types need it.
            direction = neighbour['direction']

            label = type_labels.get(entity['entity_type'])
            if label is not None:
                knowledge_items.append({
                    'type': label,
                    'name': entity['name'],
                    'description': entity['properties'].get('description', ''),
                    'relation': relation,
                })
            # ... handling of other entity types ...

        enriched_results.append({
            'api': hit,                           # original API hit
            'related_knowledge': knowledge_items  # associated knowledge
        })

    return enriched_results
4. 知识图谱可视化
知识图谱可视化器使用NetworkX和Matplotlib绘制知识图谱,并将结果保存为PNG图像。当节点数超过max_nodes时,只绘制度数最高的那部分节点:
def visualize_knowledge_graph(self, max_nodes: int = 100):
    """Draw the knowledge graph and save it as ``api_knowledge_graph.png``.

    Args:
        max_nodes: When the graph has more nodes than this, only the
            ``max_nodes`` highest-degree nodes (and the edges among them)
            are drawn.

    Nodes are coloured by their ``entity_type`` attribute and labelled
    with their ``name`` attribute; edges are labelled with
    ``relation_type``. Missing attributes no longer raise ``KeyError``:
    the type falls back to ``'unknown'``, the label to ``str(node)`` and
    the edge label to ``''``. The image is written below
    ``KN_DS_EXTRACT_OUTPUT_DIR``. Prints a message and returns without
    drawing when the graph is empty.
    """
    if len(self.G) == 0:
        print("知识图谱为空,无法可视化")
        return

    # Too many nodes make the picture unreadable: keep the best-connected ones.
    if len(self.G) > max_nodes:
        ranked = sorted(self.G.degree(), key=lambda pair: pair[1], reverse=True)
        G_sub = self.G.subgraph([node for node, _ in ranked[:max_nodes]])
    else:
        G_sub = self.G

    node_types = {
        node: attrs.get('entity_type', 'unknown')
        for node, attrs in G_sub.nodes(data=True)
    }
    # Sorted so that the type -> colour assignment is the same on every run
    # (plain set iteration order is not stable across interpreter runs).
    unique_types = sorted(set(node_types.values()), key=str)

    palette = list(mcolors.TABLEAU_COLORS)
    color_map = {t: palette[i % len(palette)] for i, t in enumerate(unique_types)}
    node_colors = [color_map[node_types[node]] for node in G_sub.nodes()]

    node_labels = {
        node: attrs.get('name', str(node))
        for node, attrs in G_sub.nodes(data=True)
    }
    edge_labels = {
        (u, v): label
        for u, v, label in G_sub.edges(data='relation_type', default='')
    }

    # assumes KN_DS_EXTRACT_OUTPUT_DIR is a pathlib.Path (it is combined
    # with "/" in the original code) -- TODO confirm
    output_path = KN_DS_EXTRACT_OUTPUT_DIR / "api_knowledge_graph.png"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    plt.figure(figsize=(20, 16))
    try:
        # Fixed seed keeps the layout reproducible between runs.
        pos = nx.spring_layout(G_sub, seed=42)

        nx.draw_networkx_nodes(G_sub, pos, node_color=node_colors, node_size=700, alpha=0.8)
        nx.draw_networkx_edges(G_sub, pos, width=1.0, alpha=0.5, arrows=True, arrowsize=20)
        nx.draw_networkx_labels(G_sub, pos, labels=node_labels, font_size=10)
        nx.draw_networkx_edge_labels(G_sub, pos, edge_labels=edge_labels, font_size=8)

        # One legend entry per entity type, using the node colour.
        legend_elements = [
            plt.Line2D([0], [0], marker='o', color='w',
                       label=f"{t}", markerfacecolor=color_map[t], markersize=10)
            for t in unique_types
        ]
        plt.legend(handles=legend_elements, loc='upper right')

        plt.title("API知识图谱")
        plt.axis('off')

        plt.savefig(output_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close()
5. Web应用
为了方便用户使用知识图谱,我们开发了一个Web应用,提供API搜索和知识展示功能。下面是其中根据检索到的上下文调用DeepSeek生成回答的函数:
def generate_answer(query: str, contexts: List[Dict[str, Any]]) -> str:
    """Build a prompt from *contexts* and ask the LLM to answer *query*.

    Returns a fixed apology when *contexts* is empty. Otherwise the prompt
    lists every context's API (label plus the first 300 characters of its
    text) followed by at most three related-knowledge items, and the
    model's reply is returned after ``ds_client.remove_think_tags``. Any
    exception raised while calling the model is turned into an error
    message string instead of propagating.
    """
    if not contexts:
        return "抱歉,我没有找到与您问题相关的API信息。请尝试使用其他关键词。"

    # Collect prompt fragments and join once at the end.
    pieces = [
        "基于以下HarmonyOS API信息,回答用户的问题。如果无法从提供的信息中找到答案,请明确说明。\n",
        f"用户问题: {query}\n",
        "相关API信息:\n",
    ]
    for number, context in enumerate(contexts, start=1):
        api = context['api']
        pieces.append(f"\n{number}. {api.get('label', '')}: {api.get('text', '')[:300]}...")

        related = context['related_knowledge']
        if related:
            pieces.append("\n 相关知识:")
            # Cap at three items per API to keep the prompt short.
            pieces.extend(
                f"\n - {item['type']}: {item['name']} - {item['description']}"
                for item in related[:3]
            )
    prompt = "".join(pieces)

    messages = [
        {"role": "system", "content": "你是HarmonyOS API专家,请基于提供的API信息回答用户问题。回答应该准确、简洁、易于理解,并尽可能包含代码示例。"},
        {"role": "user", "content": prompt}
    ]
    try:
        raw_answer = ds_client.chat_completion(messages, temperature=0.3)
        # Strip the model's "think" tags before showing the answer.
        return ds_client.remove_think_tags(raw_answer)
    except Exception as e:
        return f"抱歉,生成回答时出现错误: {str(e)}"
6. 知识图谱统计
为了了解知识图谱的规模和结构,我们生成了HTML格式的统计报告,其中列出了节点和边的总数以及各类型的数量:
<!DOCTYPE html>
<!-- Knowledge-graph statistics report: node/edge totals plus per-type counts. -->
<html lang="en">
<head>
    <!-- Declare the encoding explicitly so browsers do not have to guess it. -->
    <meta charset="utf-8">
    <title>HarmonyOS Knowledge Graph Statistics</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        h1, h2 { color: #333; }
        .stats-container { display: flex; flex-wrap: wrap; }
        .stats-box {
            background-color: #f5f5f5;
            border-radius: 5px;
            padding: 15px;
            margin: 10px;
            flex: 1;
            min-width: 300px;
        }
        table { border-collapse: collapse; width: 100%; }
        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        th { background-color: #f2f2f2; }
        tr:nth-child(even) { background-color: #f9f9f9; }
    </style>
</head>
<body>
    <h1>HarmonyOS Knowledge Graph Statistics</h1>
    <div class="stats-container">
        <div class="stats-box">
            <h2>Basic Statistics</h2>
            <p>Total Nodes: 14714</p>
            <p>Total Edges: 14559</p>
        </div>
    </div>
    <div class="stats-container">
        <div class="stats-box">
            <h2>Node Types</h2>
            <table>
                <tr>
                    <th>Type</th>
                    <th>Count</th>
                </tr>
                <tr><td>method</td><td>2931</td></tr>
                <tr><td>document</td><td>2283</td></tr>
                <tr><td>scenario</td><td>48</td></tr>
                <tr><td>unknown</td><td>398</td></tr>
                <tr><td>code_example</td><td>8023</td></tr>
                <tr><td>api</td><td>1031</td></tr>
            </table>
        </div>
        <div class="stats-box">
            <h2>Edge Types</h2>
            <table>
                <tr>
                    <th>Type</th>
                    <th>Count</th>
                </tr>
                <tr><td>has_example</td><td>8023</td></tr>
                <tr><td>contains</td><td>4007</td></tr>
                <tr><td>documents</td><td>2283</td></tr>
                <tr><td>applies_to</td><td>246</td></tr>
            </table>
        </div>
    </div>
</body>
</html>
7. 总结
HarmonyOS API知识图谱模块通过构建、检索和可视化API之间的关系,为开发者提供了更丰富、更结构化的API文档访问方式。知识图谱不仅包含API的基本信息,还包含API之间的各种关系,如调用关系、继承关系等,以及API与其方法、参数、返回值等元素之间的关系。
通过知识图谱,开发者可以更全面地了解API的使用方式、约束条件和相关功能,从而提高开发效率和代码质量。同时,知识图谱也为智能编程助手提供了更丰富的知识基础,使其能够生成更准确、更有用的回答。