LangChain has now been updated to V3. I had been using V1, where many methods had to be wrapped by hand. After going through the V3 API docs again, I found many of the methods very convenient, with simple and clear calling conventions. Below is my optimized handling of PDF file loading:
import time
from langchain.chains.summarize import load_summarize_chain
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Record the start time
start_time = time.time()
# Load the PDF. Two modes are available: "single" reads the whole file in one
# stream (good for small files), while "page" loads page by page (better for
# large files). This can be refined further; choose whichever fits your case.
loader = PyMuPDFLoader(file_path="your online pdf url", mode="page")
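# Illustrative sketch (my addition, not part of the original script): the
# "single" mode mentioned above returns the whole PDF as one Document,
# which is simpler for small files.
# small_loader = PyMuPDFLoader(file_path="your small pdf url", mode="single")
# whole_doc = small_loader.load()  # a list containing a single Document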
pages = loader.load_and_split()
print(f"总共加载了 {len(pages)} 页")
merged_docs = []
chunk_size = 15  # merge every 15 pages into one Document object
for i in range(0, len(pages), chunk_size):
    # Concatenate the text of this group of pages
    combined_text = ""
    for j in range(i, min(i + chunk_size, len(pages))):  # handles a final group shorter than chunk_size
        combined_text += pages[j].page_content + "\n"  # append each page's text
    # Create a new Document object covering this page range
    merged_docs.append(Document(
        page_content=combined_text.strip(),
        metadata={"start_page": i + 1, "end_page": min(i + chunk_size, len(pages))}
    ))
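# Optional sanity check (my addition): each merged Document records the page
# range it covers in its metadata.
# for d in merged_docs[:2]:
#     print(d.metadata)  # e.g. {'start_page': 1, 'end_page': 15}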
# Check how many merged document chunks we have
print(f"Number of merged document chunks: {len(merged_docs)}")
# print(merged_docs)
# ############ Split the merged text further with a text splitter
# Define the splitter (note: chunk_size here is measured in characters, not tokens)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=6000,    # as large as practical, to pack more information into each request
    chunk_overlap=1000  # generous overlap to preserve context across chunks
)
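# If you would rather size chunks in tokens than characters, a hedged
# alternative (values are illustrative) is the tiktoken-based constructor:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     encoding_name="cl100k_base", chunk_size=1500, chunk_overlap=200
# )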
# Split the merged page text
docs = text_splitter.split_documents(merged_docs)
print(f"拆分后共有 {len(docs)} 个文本块")OPENAI_API_KEY = "your api key"
llm = ChatOpenAI(model_name="gpt-3.5-turbo", streaming=False, openai_api_key=OPENAI_API_KEY)

map_prompt = PromptTemplate(
    input_variables=["text"],
    template="""Summarize this content in English in approximately 50 words, keeping technical terms unchanged. Be concise and to the point. Use simple language and avoid unnecessary details:\n\n{text}"""
)
combine_prompt = PromptTemplate(
    input_variables=["text"],
    template="""Summarize this content: {text}
Use the following criteria:
1. Create an attention-grabbing title under 10 words with emojis.
2. Break down complex ideas into bite-sized, memorable points, keeping it under 200 words total."""
)
final_map_prompt = PromptTemplate(
    input_variables=["text"],
    template="""Summarize this content: {text}
Use the following criteria:
Create:
1. Title (requirements):
- Create a clear, descriptive title under 10 words
- Capture the main topic or key finding
- Use professional language
- Format as "Title: [Your Title]"
2. Summary content (requirements):
- Main argument or central theme (1-2 sentences)
- 3-4 most important supporting points
- Key conclusions
- Output must be in English regardless of source language
- Keep the total summary under 200 words
- Use professional language but avoid unnecessary jargon
- Preserve all key technical terms in original form when appropriate"""
)
# Build the LangChain summarization chain
print("\nStarting summarization:")
# chain = load_summarize_chain(llm, chain_type="map_reduce", map_prompt=map_prompt,
# combine_prompt=combine_prompt)
# summary = chain.invoke(docs, config=None)

# Summarize a single chunk with its own map_reduce chain
def summarize_document(doc):
    chain = load_summarize_chain(llm, chain_type="map_reduce", map_prompt=map_prompt, combine_prompt=combine_prompt)
    return chain.invoke([doc])
import concurrent.futures
# Summarize the chunks in parallel using a thread pool
with concurrent.futures.ThreadPoolExecutor() as executor:
    summaries = list(executor.map(summarize_document, docs))

# Join all the chunk-level summaries together
combined_summaries = "\n\n".join([summary['output_text'] for summary in summaries])
print(f"多少个总结:{combined_summaries}")
final_chain = load_summarize_chain(llm, chain_type="map_reduce", map_prompt=final_map_prompt, combine_prompt=combine_prompt)
final_summary = final_chain.invoke([Document(page_content=combined_summaries)])

# Print the final summary
print("\nFinal summary:")
print(final_summary['output_text'])
# print("\nFinal summary:")
# print(summary['output_text'])
# Record the end time
end_time = time.time()
# Compute the total elapsed time
elapsed_time = end_time - start_time
print(f"Total execution time: {elapsed_time:.2f} seconds")
I hope the above is helpful. If you see anything worth improving, please let me know in the comments. Many thanks!