声明：该版代码截至 2024 年 8 月 23 日有效。
代码如下：
from langchain_community.document_loaders import PyPDFLoader
import getpass
import os
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
class QA:
    """Answer one question about a PDF with a retrieval-augmented chain.

    The steps, in the order ``run()`` executes them: configure the OpenAI
    environment variables, load the PDF, split it into chunks and embed them
    into a Chroma vector store, then query a retrieval chain with the
    question.

    Attributes:
        question (str): The question to be answered about the PDF.
        pdf_path (str): Path to the PDF file.
        model_name (str): Chat model name handed to ``ChatOpenAI``. A blank
            value falls back to ``DEFAULT_MODEL``.
        docs (list | None): Documents loaded from the PDF; ``None`` until
            ``load_file()`` has run.
        vecstore (Chroma | None): Vector store holding the chunk embeddings;
            ``None`` until ``split_and_store()`` has run.
    """

    # Used when the caller supplies a blank model name. This is the value
    # that used to be hard-coded in retrieve_pdf().
    DEFAULT_MODEL = "gpt-4o"
    # Proxy applied only when OPENAI_PROXY is not already configured.
    DEFAULT_PROXY = "http://127.0.0.1:20171"

    def __init__(self, question, pdf_path, model_name):
        """Store the question, PDF path and model name; nothing is loaded yet.

        Parameters:
            question (str): The question to be answered about the PDF.
            pdf_path (str): Path to the PDF file.
            model_name (str): Name of the chat model used to answer.
        """
        self.question = question
        self.pdf_path = pdf_path
        self.model_name = model_name
        self.docs = None
        self.vecstore = None

    def _resolve_model_name(self):
        """Return the configured model name, or DEFAULT_MODEL when it is blank."""
        chosen = (self.model_name or "").strip()
        return chosen or self.DEFAULT_MODEL

    def set_environ(self):
        """Set the environment variables needed for OpenAI API access.

        Values that are already present in the environment are kept. The API
        key is only requested when OPENAI_API_KEY is missing or empty, and the
        proxy default is only applied when OPENAI_PROXY is unset.
        """
        if not os.environ.get('OPENAI_API_KEY'):
            # getpass keeps the secret from being echoed to the terminal.
            os.environ['OPENAI_API_KEY'] = getpass.getpass("your api:")
        os.environ.setdefault('OPENAI_PROXY', self.DEFAULT_PROXY)

    def load_file(self):
        """Load the PDF at ``pdf_path`` with PyPDFLoader into ``self.docs``."""
        loader = PyPDFLoader(self.pdf_path)
        self.docs = loader.load()

    def split_and_store(self):
        """Split the loaded documents into chunks and embed them into Chroma.

        Chunks are 1000 characters long with a 200-character overlap. The
        resulting vector store is kept in ``self.vecstore``.

        Raises:
            RuntimeError: If ``load_file()`` has not been called yet.
        """
        if self.docs is None:
            raise RuntimeError("No documents loaded; call load_file() first.")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(self.docs)
        self.vecstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

    def retrieve_pdf(self):
        """Answer ``self.question`` from the vector store and print the answer.

        Builds a retrieval chain over ``self.vecstore`` using the chat model
        named by ``model_name`` (``DEFAULT_MODEL`` when blank).

        Returns:
            The ``'answer'`` entry of the chain result, which is also printed.

        Raises:
            RuntimeError: If ``split_and_store()`` has not been called yet.
        """
        if self.vecstore is None:
            raise RuntimeError("No vector store built; call split_and_store() first.")
        retriever = self.vecstore.as_retriever()
        # Honor the caller-supplied model instead of a hard-coded one.
        llm = ChatOpenAI(model=self._resolve_model_name())
        system_prompt = (
            "You are an assistant for question-answering tasks. "
            "Use the following pieces of retrieved context to answer "
            "the question. If you don't know the answer, say that you "
            "don't know. Use three sentences maximum and keep the "
            "answer concise."
            "\n\n"
            "{context}"
        )
        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                ("human", "{input}"),
            ]
        )
        question_answer_chain = create_stuff_documents_chain(llm, prompt)
        rag_chain = create_retrieval_chain(retriever, question_answer_chain)
        results = rag_chain.invoke({"input": self.question})
        answer = results['answer']
        print(answer)
        return answer

    def run(self):
        """Run the full pipeline and return the answer that was printed."""
        self.set_environ()
        self.load_file()
        self.split_and_store()
        return self.retrieve_pdf()
def __main__():
    """Collect the question, PDF path and model name, then run the QA flow.

    The three values are read from standard input in that order and passed
    to a new QA object, whose run() method does the actual work.
    """
    prompts = (
        "Your question:",
        "Enter the path of the pdf file:",
        "Enter the model name:",
    )
    # The generator is consumed left to right, so the prompts appear in order.
    question, pdf_path, model_name = (input(text) for text in prompts)
    QA(question, pdf_path, model_name).run()
# Start the interactive flow only when this file is executed as a script,
# so importing the module does not prompt for input.
if __name__ == "__main__":
    __main__()