LangChain RAG 实战
文档准备
# Paths of every source document that will be indexed into the vector store.
path_list = [
    "../note/太上老君的炉子.txt",
    "../note/中国第一颗原子弹.txt",
    "../note/大明王朝.txt",
    "../note/人工智能发展史.txt",
]
文档加载和分割
# Accumulates the chunks produced from all source documents.
docs_list = []

# Character-based splitter. NOTE(fix): in the pasted original the inline
# comment swallowed `chunk_overlap=10`, silently dropping the overlap;
# restored here.
ext_splitter = CharacterTextSplitter(
    chunk_size=100,    # max characters per chunk
    chunk_overlap=10,  # characters shared between adjacent chunks
)

# Load each file, split it into chunks, and collect the chunks.
for path in path_list:
    documents = TextLoader(path, encoding="utf-8").load()
    docs = ext_splitter.split_documents(documents)
    docs_list += docs
初始化 Embedding 模型
# Embedding model. NOTE(fix): in the pasted original the inline comment
# swallowed `dashscope_api_key=key`, so the API key was never passed;
# restored here. `key` is assumed to be defined earlier in the file.
embeddings = DashScopeEmbeddings(
    model="text-embedding-v1",  # Tongyi Qianwen official embedding model
    dashscope_api_key=key,
)
初始化 向量存储数据库实例
# Chroma vector store instance. NOTE(fix): in the pasted original the
# inline comment swallowed `embedding_function` and `persist_directory`;
# restored here.
vector_store = Chroma(
    collection_name="my_docs",         # collection name
    embedding_function=embeddings,     # embedding model used to vectorize docs
    persist_directory="../chroma_db",  # on-disk persistence directory
)
文档列表存入向量数据库
# Assign one stable UUID per chunk so entries can be addressed later,
# then store every chunk (with its id) in the vector database.
uuids = [str(uuid4()) for _ in docs_list]
vector_store.add_documents(documents=docs_list, ids=uuids)
创建检索器
retriever = vector_store.as_retriever()
处理检索结果
def format_docs(docs):# 将文档中的page_content属性以换行符连接起来return "\\n\\n".join(doc.page_content for doc in docs)
创建一个数据处理管道
用于获取context内容
- 输入:
{"question": "问题内容", ...}
- 输出:
文档1:xxxx...\n\n文档2:xxxx...
get_context = itemgetter("question") | retriever | format_docs
数据流整合
将context拼接到question之后
- 输入:
{"question": "问题内容", ...}
- 输出:
{"question": "问题内容", "context": "文档1:xxxx...\n\n文档2:xxxx..." ...}
context_gen = RunnablePassthrough.assign(context=get_context)
创建提示词
# Prompt layout: system message carrying the retrieved context, then the
# running conversation history, then the user's current question.
prompt = ChatPromptTemplate.from_messages([
    ("system", "你是一个友好的助手,这是{context}。"),
    MessagesPlaceholder(variable_name="history"),
    ("human", "{question}"),
])
初始化大模型
# Chat model: Qwen-Max served through DashScope, with streaming enabled.
llm = ChatTongyi(
    model_name="qwen-max",
    dashscope_api_key=key,
    streaming=True,
)
创建基础处理链
base_chain = context_gen | prompt | llm
创建全局存储字典
- 键:会话ID(session_id)
- 值:
InMemoryHistory
实例
store = {}
实现内存历史存储类
class InMemoryHistory(BaseChatMessageHistory, BaseModel):messages: List[BaseMessage] = Field(default_factory=list)def add_messages(self, messages: List[BaseMessage]) -> None:self.messages.extend(messages)def clear(self) -> None:self.messages = []
获取会话历史
def get_session_history(session_id):if session_id not in store:store[session_id] = InMemoryHistory()return store[session_id]
创建带有历史的对话链
# Wrap the base chain so every invocation reads and appends per-session
# history: "question" is the incoming message, "history" is the prompt
# placeholder that receives past messages.
chain_with_history = RunnableWithMessageHistory(
    base_chain,
    get_session_history,
    input_messages_key="question",
    history_messages_key="history",
)
测试
# Smoke test: two turns in the same session ("test") so the second
# question can be answered from the recorded history.
# NOTE(fix): the pasted original fused `print(r4)` with the next
# assignment on one line (a syntax error); restored here.
r4 = chain_with_history.invoke(
    {"question": "太上老君炼丹炉的起源传说,出自哪里?"},
    config={"configurable": {"session_id": "test"}},
)
print(r4)

r5 = chain_with_history.invoke(
    {"question": "我刚才问的什么?"},
    config={"configurable": {"session_id": "test"}},
)
print(r5)