通过Llamaindex分析用户舆情信息技术调研

两行省略号

已于 2023-09-08 22:07:55 修改

阅读量379

点赞数

分类专栏：大模型　文章标签：llama

于 2023-09-08 21:44:49 首次发布

本文链接：https://blog.csdn.net/qq_40830548/article/details/132767872

版权

大模型专栏收录该内容

1 篇文章 0 订阅

订阅专栏

代码功能介绍

1.加载业务方jsonarray数据，构造为一堆node，其中每个node的text部分是一条json数据

2. 通过node构造 GPTListIndex

3.使用开源大模型通过tree_summarize的方式，获得最终的结果。

4.xxLLM 表示基于 llamaindex 的 CustomLLM 自定义的模型类，此模型是内部部署的开源模型，非 OpenAI 模型

5.下面代码仅仅是技术调研阶段写的可行性分析，并不是生产环境最终的代码。

需要注意：

1） ServiceContext中定义了整个算法的全局字典属性，包括使用的大模型、embedding、上下文限制等。

2） get_response_synthesizer 通过此函数定义合成对象的行为，包括定义prompt。

3）如果安装时出现 Failed building wheel for llama-cpp-python 报错，请参照官方解决方案：https://github.com/abetlen/llama-cpp-python/issues/233

import logging
import sys
from llama_index import (
    ServiceContext,
    GPTListIndex,
    MockEmbedding
)
from  LlamaIndex.customllm.xxLLM import xxLLM
from LlamaIndex.customllm.JsonArrayConstruct import DataConstruct
from llama_index.indices.prompt_helper import PromptHelper
from llama_index.prompts import PromptTemplate, SelectorPromptTemplate
from llama_index.prompts.prompt_type import PromptType
from llama_index.response_synthesizers import ResponseMode, get_response_synthesizer

# 功能介绍：
#   1.加载业务方投递的jsonarray数据，构造为一堆nodes，每个node的text是一条用户json数据
#   2.通过node构造 GPTListIndex
#   3.使用开源大模型通过tree_summarize的方式，获得最终的结果。
# 需要注意：
#  1） ServiceContext中定义了整个算法的全局字典属性，包括使用的大模型、embedding、上下文限制等。
#  2） get_response_synthesizer  通过此函数定义合成对象的行为，包括定义prompt。
#  3） 如果安装时出现 Failed building wheel for llama-cpp-python 报错，请参照官方解决方案：https://github.com/abetlen/llama-cpp-python/issues/233

# Route INFO-level (and above) log records to stdout.
# basicConfig(stream=...) installs a stdout StreamHandler on the root logger
# by itself; attaching a second stdout handler would print every record twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Question handed to the query engine: a role description ("职责") followed by
# the output requirements ("要求"). The adjacent literals are concatenated by
# the parser into one single-line string.
query_str = (
    "职责："
    "你是公司的舆情处理人员，每天主要工作是根据用户真实的反馈中，分析目前有哪些重要的反馈信息需要关注或者紧急介入，以免爱奇艺公司名誉受损。"
    "要求："
    "请给出最重要的10条用户反馈的问题，尽可能多一些细节。不要概括性太强。"
)

# Copied from default_prompt_selectors and rewritten in Chinese.
# The template exposes two placeholders, {context_str} and {query_str}; it
# instructs the model to answer using only the text between the dashed lines.
DEFAULT_TREE_SUMMARIZE_TMPL = (
    " 上下文信息如下:\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "仅使用上面---------------------包裹的信息，不要使用先验知识，回答下面问题:\n"
    "问题是: {query_str}\n"
    "请回答: "
)
# Wrap the raw template text as a SUMMARY-type prompt.
DEFAULT_TREE_SUMMARIZE_PROMPT = PromptTemplate(
    DEFAULT_TREE_SUMMARIZE_TMPL, prompt_type=PromptType.SUMMARY
)

# Selector with only a default template (no conditional alternatives given);
# this is the object later passed as `summary_template`.
DEFAULT_TREE_SUMMARIZE_PROMPT_SEL = SelectorPromptTemplate(default_template=DEFAULT_TREE_SUMMARIZE_PROMPT)

# Context length available to a prompt (context_window) and the amount
# reserved for the model's output (num_output).
# NOTE(review): 10000 presumably matches the internal model's limit — confirm.
prompt_helper = PromptHelper(
    context_window=10000, num_output=512
)
# Replace the default LLM with the custom one. embed_model must be set
# explicitly (here to a MockEmbedding), otherwise a real embedding model is
# loaded by default.
customLLm = xxLLM()
mockEmbedding = MockEmbedding(0)
# Bundle the LLM, prompt limits and embedding model into one ServiceContext
# that is shared by the index and the response synthesizer below.
service_context = ServiceContext.from_defaults(
    llm=customLLm,
    prompt_helper=prompt_helper,
    embed_model=mockEmbedding
)

# Assemble the business data: DataConstruct yields one node per feedback
# record, and the nodes are put into a list index.
dataConstruct = DataConstruct()
nodes = dataConstruct.throughJsonArrayGetNodes()
index = GPTListIndex(nodes, service_context=service_context)

# Customize the engine's behavior: tree_summarize response mode with the
# Chinese summary prompt defined above, async execution and verbose output.
response_synthesizer = get_response_synthesizer(service_context=service_context,
                                                response_mode=ResponseMode.TREE_SUMMARIZE,
                                                summary_template=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL,
                                                use_async=True,
                                                verbose=True)
query_engine = index.as_query_engine(response_synthesizer=response_synthesizer)
# See the TreeSummarize source code (the "repack" step) for how this is processed.
response = query_engine.query(query_str)
# Prints the final answer, prefixed with a Chinese "final answer" label.
print("最终答案：" + str(response))

数据处理函数

import json
import random
import os
from datetime import datetime, timedelta
from llama_index import (
    Document,
)
from llama_index.data_structs import Node


class DataConstruct:
    """Load user-feedback lines from a text file and wrap them for LlamaIndex.

    Every line of the input file is treated as one feedback message. Each
    message is turned into a ``{"time": ..., "text": ...}`` record whose
    timestamp is synthetic (a random offset of 1-7 days before "now"), and
    the records are exposed either as grouped ``Document`` objects or as
    individual ``Node`` objects.
    """

    # Sample feedback file used when no explicit path is supplied.
    DEFAULT_FILE_PATH = 'D:/aigc_dev/aigc/LlamaIndex/customllm/yuqing.txt'

    def __init__(self, file_path=None):
        """Read all feedback lines from *file_path* into ``self.text_pool``.

        Args:
            file_path: Path of a UTF-8 text file with one feedback message
                per line. Defaults to ``DEFAULT_FILE_PATH``.

        Raises:
            OSError: If the file cannot be opened.
        """
        current_directory = os.getcwd()
        print(current_directory)

        if file_path is None:
            file_path = self.DEFAULT_FILE_PATH

        # text_pool must be a per-instance list: a class-level list would be
        # shared by all instances and grow again on every construction.
        # Lines keep their trailing newline, exactly as read from the file.
        with open(file_path, 'r', encoding="UTF-8") as file:
            self.text_pool = list(file)

    @staticmethod
    def _build_feedback(text, current_time):
        """Return one feedback record with a random timestamp in the last week."""
        random_days = random.randint(1, 7)
        feedback_time = current_time - timedelta(days=random_days)
        return {
            "time": feedback_time.isoformat(),
            "text": text
        }

    def throughJsonArrayGetDocument(self):
        """Return Documents, each holding a JSON array of up to 10 records.

        Returns:
            A list of ``Document`` objects; the text of each one is the JSON
            serialization (non-ASCII kept as-is) of one group of records.
        """
        current_time = datetime.now()
        feedback_list = [
            self._build_feedback(item, current_time) for item in self.text_pool
        ]
        return [
            Document(text=json.dumps(group, ensure_ascii=False))
            for group in self.split_array_into_groups(feedback_list, 10)
        ]

    def split_array_into_groups(self, arr, group_size=10):
        """Split *arr* into consecutive slices of at most *group_size* items.

        Args:
            arr: Sequence to split; an empty sequence yields ``[]``.
            group_size: Maximum number of items per group (must be >= 1).

        Returns:
            A list of slices of *arr*; the last one may be shorter.

        Raises:
            ValueError: If *group_size* is smaller than 1.
        """
        if group_size < 1:
            # A negative step would otherwise silently drop all the data.
            raise ValueError("group_size must be a positive integer")
        return [arr[i:i + group_size] for i in range(0, len(arr), group_size)]

    def throughJsonArrayGetNodes(self):
        """Return one Node per feedback line.

        Returns:
            A list of ``Node`` objects; the text of each one is the JSON
            serialization (non-ASCII kept as-is) of a single record.
        """
        current_time = datetime.now()
        return [
            Node(text=json.dumps(self._build_feedback(item, current_time),
                                 ensure_ascii=False))
            for item in self.text_pool
        ]