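# Adapted from Microsoft's API example. It works around the chat model's text-length limit by using embeddings, but this is expensive: at the published pricing it costs roughly 1 USD per 5,000 characters, so do not casually try it on large files! First generate a .csv, then use that csv directly as the input data (strictly speaking a vector database should be used, but this is how the example does it).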
# coding:utf-8
# imports
import typing
import mwclient # for downloading example Wikipedia articles
import mwparserfromhell # for splitting Wikipedia articles into sections
import openai # for generating embeddings
import pandas as pd # for DataFrames to store article sections and embeddings
import re # for cutting <ref> links out of Wikipedia articles
import tiktoken # for counting tokens
from Tools.scripts.dutree import display
# load the source text from a local Word (.docx) document via python-docx
from docx import Document
from typing import Tuple, List, Any
# Open the Word document whose paragraphs are iterated below.
# The path is relative, so it is resolved against the current working directory.
doc = Document(r"data/m.docx")
# Starts empty; presumably collects the sections extracted from the document
# (the name looks like a holdover from the Wikipedia-based example) - TODO confirm.
wikipedia_sections: List[Any] = []
for para in doc.paragraphs:
title = para.text.strip()
if title != "" and