# Note: this version of the code was verified working as of 2024-08-23.
# The code follows:
import getpass
import os

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplateclass QA:"""A class to handle question-answering tasks on a given PDF document.Attributes:question (str): The question to be answered about the PDF.pdf_path (str): Path to the PDF file.model_name (str): Name of the model used for analysis.docs (list): Loaded PDF documents.vecstore (Chroma): The vector store object for storing document embeddings.Methods:set_environ(): Set environment variables for the OpenAI API.load_file(): Load a PDF file using PyPDFLoader.split_and_store(): Split the PDF text and store embeddings using Chroma.retrieve_pdf(): Retrieve and answer questions based on the PDF content."""def __init__(self, question, pdf_path, model_name):"""Initializes the QA object with provided question, PDF path, and model name.Parameters:question (str): The question to be answered about the PDF.pdf_path (str): Path to the PDF file.model_name (str): Name of the model used for analysis."""self.question = questionself.pdf_path = pdf_pathself.model_name = model_nameself.docs = Noneself.vecstore = Nonedef set_environ(self):"""Sets the environment variables necessary for OpenAI API authentication."""os.environ['OPENAI_API_KEY'] = input("your api:")os.environ['OPENAI_PROXY'] = 'http://127.0.0.1:20171'def load_file(self):"""Loads the PDF file specified by the pdf_path attribute using PyPDFLoader."""loader = PyPDFLoader(self.pdf_path)self.docs = loader.load()def split_and_store(self):"""Splits the loaded PDF text into manageable chunks and stores the embeddings in a vector store."""text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)splits = text_splitter.split_documents(self.docs)self.vecstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())def retrieve_pdf(self):"""Retrieves context from the vector store and generates an answer to the input questionusing a retrieval-augmented generation chain."""retriever = self.vecstore.as_retriever()llm = 
ChatOpenAI(model="gpt-4o")system_prompt = ("You are an assistant for question-answering tasks. ""Use the following pieces of retrieved context to answer ""the question. If you don't know the answer, say that you ""don't know. Use three sentences maximum and keep the ""answer concise.""\n\n""{context}")prompt = ChatPromptTemplate.from_messages([("system", system_prompt),("human", "{input}"),])question_answer_chain = create_stuff_documents_chain(llm, prompt)rag_chain = create_retrieval_chain(retriever, question_answer_chain)results = rag_chain.invoke({"input": self.question})print(results['answer'])def run(self):self.set_environ()self.load_file()self.split_and_store()self.retrieve_pdf()def __main__():"""Main function to execute the QA class functionality.Prompts user for input parameters, creates a QA object, and processes the specified PDF."""question = input("Your question:")pdf_path = input("Enter the path of the pdf file:")model_name = input("Enter the model name:")qa = QA(question, pdf_path, model_name)qa.run()if __name__ == "__main__":__main__()