Script for exporting a Dify knowledge base
import sys

import requests
import pandas as pd


def fetch_json(session, url, headers):
    """GET a URL and return the parsed JSON, or None on any request error."""
    try:
        # A timeout is added so a dropped connection fails fast instead of hanging.
        response = session.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        print(f"Error requesting {url}: {e}")
        return None


base_url = 'http://xxx/v1/datasets'
headers = {
    'Authorization': 'Bearer <your API key>'
}

with requests.Session() as session:
    # Fetch all datasets (knowledge bases)
    datasets_url = f"{base_url}?page=1&limit=100"
    data = fetch_json(session, datasets_url, headers)
    if not data:
        print('Failed to fetch the dataset list')
        sys.exit(1)
    datasets = {item['id']: item['name'] for item in data.get('data', [])}

    # For every dataset, fetch its documents, then the segments of each document
    document_split_list = []
    for dataset_id, dataset_name in datasets.items():
        documents_url = f"{base_url}/{dataset_id}/documents"
        documents_data = fetch_json(session, documents_url, headers)
        if not documents_data:
            print(f"Failed to fetch documents for dataset {dataset_id}")
            continue
        for doc in documents_data.get('data', []):
            doc_id = doc['id']
            doc_name = doc['name']
            segments_url = f"{base_url}/{dataset_id}/documents/{doc_id}/segments"
            segments_data = fetch_json(session, segments_url, headers)
            if not segments_data:
                print(f"Failed to fetch segments for document {doc_id}")
                continue
            for segment in segments_data.get('data', []):
                document_split_list.append({
                    "name": doc_name,
                    "content": segment.get('content', ''),
                    "answer": segment.get('answer', ''),
                    "keywords": segment.get('keywords', [])
                })

# Convert to a DataFrame and save as CSV
document_split_df = pd.DataFrame(document_split_list)
document_split_df.to_csv('document_split.csv', index=False)
print('Done')
Possible issue: requests may occasionally be dropped mid-run. Re-running the script a few times usually resolves it; if it still fails, run each step (datasets, documents, segments) separately. A retry sketch is shown below.
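If manual retries become tedious, one option is to let the session retry transient failures automatically. The following is only a minimal sketch, assuming urllib3 1.26+ and the standard requests adapter mechanism; the helper name make_session_with_retries, the retry counts, backoff factor, and status codes are illustrative choices, not part of the original script.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_session_with_retries():
    # Retry GET requests up to 5 times with exponential backoff on common
    # transient failures. All numbers here are illustrative defaults.
    retry = Retry(
        total=5,
        backoff_factor=1,  # wait roughly 1s, 2s, 4s, ... between attempts
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    adapter = HTTPAdapter(max_retries=retry)
    session = requests.Session()
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


# Usage: swap `requests.Session()` in the export script for this helper, e.g.
#     with make_session_with_retries() as session:
#         data = fetch_json(session, datasets_url, headers)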