前言
- 我们在使用ChatGPT进行询问的时候,得到的回答都基于通用的知识,对于自己私有领域的知识却无法很好地覆盖。现在虽然也有一些方案可以把私有领域的知识以先验信息的形式加到ChatGPT上面,但是毕竟OpenAI不open,你发给它的任何数据,都会被OpenAI收集过去用来作为之后训练的材料。既然是私有领域的知识,那么肯定不想被OpenAI记录,所以我们这里考虑使用本地部署的ChatGLM。
效果如下
思路
代码
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer and the model from the local checkpoint directory.
# trust_remote_code=True is passed to both loaders, as in the original script.
tokenizer = AutoTokenizer.from_pretrained(
    ".\\models\\chatglm-6b-int4",
    trust_remote_code=True,
    revision="",
)
model = AutoModel.from_pretrained(
    ".\\models\\chatglm-6b-int4",
    trust_remote_code=True,
    revision="",
)
model = model.half().cuda()

# Alternatives the author experimented with (kept here as notes only):
#   * load "THUDM/chatglm-6b-int4" by hub id instead of the local directory,
#     with or without the .half().cuda() chain;
#   * quantize with an explicit kernel file:
#       model.quantize(bits=4,
#                      kernel_file="./models/chatglm-6b-int4/quantization_kernels.so")

# Quantize to 4 bits, then switch the resulting model to evaluation mode.
model = model.quantize(bits=4).eval()
# Characters that are escaped on lines inside a fenced code block so that
# whatever renders the returned HTML shows them literally instead of
# interpreting them as markup.  The backtick is backslash-escaped; everything
# else becomes an HTML character reference.
_CODE_ESCAPES = str.maketrans({
    "`": "\\`",
    "<": "&lt;",
    ">": "&gt;",
    " ": "&nbsp;",
    "*": "&ast;",
    "_": "&lowbar;",
    "-": "&#45;",
    ".": "&#46;",
    "!": "&#33;",
    "(": "&#40;",
    ")": "&#41;",
    "$": "&#36;",
})


def parse_text(text):
    """Convert text containing triple-backtick fences into an HTML fragment.

    Blank lines are dropped.  A line containing a triple backtick toggles
    "inside a code block": an opening fence becomes
    ``<pre><code class="language-X">`` (X is whatever follows the last
    backtick on that line) and a closing fence becomes ``<br></code></pre>``.
    Every other line except the very first one is prefixed with ``<br>``;
    lines inside a code block additionally have their markup-significant
    characters escaped.

    Args:
        text: The raw text to convert.

    Returns:
        The converted lines joined into a single string with no separators.
    """
    lines = [line for line in text.split("\n") if line != ""]
    fence_count = 0
    for i, line in enumerate(lines):
        if "```" in line:
            fence_count += 1
            if fence_count % 2 == 1:
                # Odd fence: opening.  The language tag follows the backticks.
                language = line.split("`")[-1]
                lines[i] = f'<pre><code class="language-{language}">'
            else:
                lines[i] = "<br></code></pre>"
        elif i > 0:
            # The first line is left untouched so the fragment does not start
            # with a line break.
            if fence_count % 2 == 1:
                # One pass over the line; the replacement strings contain none
                # of the escaped characters, so this matches a chain of
                # sequential str.replace calls.
                line = line.translate(_CODE_ESCAPES)
            lines[i] = "<br>" + line
    return "".join(lines)
def predict(input, chatbot, max_length, top_p, temperature, history):
    """Stream a reply from the module-level model, updating ``chatbot`` in place.

    A new ``(rendered_input, "")`` entry is appended to ``chatbot``; each time
    ``model.stream_chat`` yields a ``(response, history)`` pair, that last
    entry is replaced with the rendered input and the rendered response so far.

    Args:
        input: The user's message.
        chatbot: List of ``(rendered_input, rendered_response)`` tuples; it is
            mutated and the same list object is yielded every time.
        max_length: Forwarded to ``model.stream_chat``.
        top_p: Forwarded to ``model.stream_chat``.
        temperature: Forwarded to ``model.stream_chat``.
        history: Conversation history forwarded to ``model.stream_chat``.

    Yields:
        ``(chatbot, history)`` after every streamed update, where ``history``
        is the value most recently produced by ``model.stream_chat``.
    """
    rendered_input = parse_text(input)
    chatbot.append((rendered_input, ""))
    reply_stream = model.stream_chat(
        tokenizer,
        input,
        history,
        max_length=max_length,
        top_p=top_p,
        temperature=temperature,
    )
    for response, history in reply_stream:
        chatbot[-1] = (rendered_input, parse_text(response))
        yield chatbot, history
def text2ver_search(file_name, search_text, limit=1):
    """Semantic search over the lines of a single text file using text2vec.

    The file is read as UTF-8 and split on newlines; every non-blank line
    (stripped) is embedded with the "shibing624/text2vec-base-chinese"
    sentence model on CPU.  The query is embedded the same way and matched
    against the lines with the 'cos' metric.

    Args:
        file_name: Path of the text file to search.
        search_text: The query text.
        limit: Passed to ``Document.match`` as the match limit.

    Returns:
        A list of ``(search_text, matched_line_text)`` tuples, one per match.
    """
    from docarray import Document, DocumentArray
    from text2vec import SentenceModel, EncoderType
    from tqdm import tqdm

    with open(file_name, encoding='utf-8') as source_file:
        raw_text = source_file.read()

    whole_doc = Document(text=raw_text)
    # Split the text on newlines: one Document per non-blank line.
    candidates = DocumentArray(
        Document(text=piece.strip())
        for piece in whole_doc.text.split('\n')
        if piece.strip()
    )

    encoder = SentenceModel(
        "shibing624/text2vec-base-chinese",
        encoder_type=EncoderType.FIRST_LAST_AVG,
        device='cpu',
    )
    for candidate in tqdm(candidates):
        candidate.embedding = encoder.encode(candidate.text)

    # The text to match against the candidate lines.
    query_doc = Document(text=search_text)
    query_doc.embedding = encoder.encode(query_doc.text)

    # Find the sentences most similar to the input text.
    matched = query_doc.match(
        candidates,
        limit=limit,
        exclude_self=True,
        metric='cos',
        use_scipy=True,
    )
    matched_texts = matched.matches[:, 'text']
    return [(search_text, hit) for hit in matched_texts]
# Demo driver: retrieve the line of the knowledge file closest to the question,
# hand it to the model as chat history, and stream the answer to stdout.
file_name = './data/test.txt'
search_text = '安心的老婆是谁?'
querys_list = text2ver_search(file_name, search_text, 1)
print("querys_list:", querys_list)

response_new = ''
# The retrieved (question, matched_text) pairs are passed as the chat history.
history = querys_list
for chatbot, history in predict(search_text, chatbot=[], max_length=10000, top_p=0.5,
                                temperature=0.5, history=history):
    response_old = response_new
    response_new = chatbot[0][1]
    if response_new.startswith(response_old):
        # Print only the newly generated tail.  Slicing off the prefix avoids
        # str.replace, which would also delete later repeats of the earlier
        # text from the output.
        new_single = response_new[len(response_old):]
    else:
        # The rendered text no longer extends the previous one (for example
        # the markup of an unfinished line changed); print it in full.
        new_single = response_new
    # Flush so each chunk shows up immediately even though no newline is written.
    print(new_single, end='', flush=True)
结语
- 这里的文档可以替换成其他的PDF、Word文档,也可以先对这些文件批量建立索引保存下来,之后再读取对应的索引,输入到ChatGLM中,具体玩法就看各位自己的了。