Getting Started with RAG: A Complete Code Walkthrough
Introduction to RAG
Retrieval-Augmented Generation (RAG) is a hybrid approach that combines information retrieval with generative models. By incorporating external knowledge, it improves a language model's output, boosting accuracy and factual correctness.
Implementation steps (wired together in the roadmap sketch right after this list):
- Data Ingestion: load and preprocess the text data.
- Chunking: split the data into smaller chunks to improve retrieval performance.
- Embedding Creation: convert the text chunks into numerical representations with an embedding model.
- Semantic Search: retrieve relevant chunks based on the user's query.
- Response Generation: use a language model to generate a response from the retrieved text.
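As a roadmap, here is the whole pipeline end to end. The function names (and `system_prompt`) are exactly the ones defined in the sections below; the file path and question are placeholders:

```python
# Roadmap: the five steps wired together (each piece is defined below)
text = extract_text_from_pdf("document.pdf")    # 1. Data Ingestion
chunks = chunk_text(text, n=500, overlap=100)   # 2. Chunking
embeddings = create_embeddings(chunks).data     # 3. Embedding Creation
question = "your question here"
relevant = semantic_search(question, chunks, embeddings, k=2)  # 4. Semantic Search
answer = generate_response(                     # 5. Response Generation
    system_prompt,
    "\n".join(relevant) + f"\n问题: {question}"
)
```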
Setting Up the Environment
```python
import fitz
import os
import numpy as np
import json
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
```
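`load_dotenv()` reads configuration from a local `.env` file. A minimal example of the variables this walkthrough expects — the names match the `os.getenv` calls used below, and the values are placeholders to replace with your provider's settings:

```
LLM_BASE_URL=https://your-provider.example.com/v1
LLM_API_KEY=your-api-key
LLM_MODEL_ID=your-chat-model-id
EMBEDDING_MODEL_ID=your-embedding-model-id
```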
Extracting Text from a PDF File
Use the PyMuPDF library to extract text from the PDF file.
```python
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text from the PDF.
    """
    # Open the PDF file
    mypdf = fitz.open(pdf_path)
    all_text = ""  # Initialize an empty string to store the extracted text

    # Iterate through each page in the PDF
    for page_num in range(mypdf.page_count):
        page = mypdf[page_num]  # Get the page
        text = page.get_text("text")  # Extract text from the page
        all_text += text  # Append the extracted text to the all_text string

    return all_text  # Return the extracted text
```
Chunking the Extracted Text
Split the text into smaller, overlapping chunks to improve retrieval accuracy.
```python
def chunk_text(text, n, overlap):
    """
    Chunks the given text into segments of n characters with overlap.

    Args:
        text (str): The text to be chunked.
        n (int): Number of characters per chunk.
        overlap (int): Number of overlapping characters between consecutive chunks.

    Returns:
        List[str]: A list of text chunks.
    """
    chunks = []  # Initialize an empty list to store the chunks

    # Loop through the text with a step size of (n - overlap)
    for i in range(0, len(text), n - overlap):
        # Append a chunk of text from index i to i + n to the chunks list
        chunks.append(text[i:i + n])

    return chunks
```
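To see how the step size `n - overlap` plays out, here is a quick check with toy values (the string and sizes are purely illustrative):

```python
# Step size is n - overlap = 2, so consecutive chunks share 2 characters
print(chunk_text("abcdefghij", n=4, overlap=2))
# ['abcd', 'cdef', 'efgh', 'ghij', 'ij']
```

Note that the last chunk can be shorter than n when the text runs out.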
Setting Up the OpenAI API Client
Initialize the OpenAI client to generate embeddings and responses.
```python
# Any OpenAI-compatible API will do; I use Volcano Engine's,
# which needs the corresponding base_url and api_key
client = OpenAI(
    base_url=os.getenv("LLM_BASE_URL"),
    api_key=os.getenv("LLM_API_KEY")
)
```
Extracting and Chunking Text from the PDF
Load the PDF, extract its text, and split it into chunks.
```python
# PDF file
pdf_path = "../../data/AI_Information.en.zh-CN.pdf"

# Extract text
extracted_text = extract_text_from_pdf(pdf_path)

# Chunk the text with a chunk size of 500 and an overlap of 100
text_chunks = chunk_text(extracted_text, 500, 100)

# Number of text chunks
print("Number of text chunks:", len(text_chunks))

# First text chunk
print("\nFirst text chunk:")
print(text_chunks[0])
```
Creating Embeddings for the Text Chunks
Embeddings convert text into numerical vectors, which enable efficient similarity search.
```python
# Alternative: a local embedding model via sentence-transformers
# from sentence_transformers import SentenceTransformer
# from typing import List
#
#
# def create_embeddings(text: List[str], model_path: str = "../rag_naive/model/gte-base-zh") -> List[List[float]]:
#     """
#     Creates embeddings for the given text using a local embedding model,
#     e.g. gte-base-zh from ModelScope.
#     """
#     # Create embeddings for the input text using the specified model
#     st_model = SentenceTransformer(model_name_or_path=model_path)
#     st_embeddings = st_model.encode(text, normalize_embeddings=True)
#     return [embedding.tolist() for embedding in st_embeddings]


def create_embeddings(text):
    # Create embeddings for the input text using the specified model
    response = client.embeddings.create(
        model=os.getenv("EMBEDDING_MODEL_ID"),
        input=text
    )
    return response  # Return the response containing the embeddings
```
Compute embedding vectors for all text chunks:

```python
response = create_embeddings(text_chunks)
```
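One caveat: most providers cap how many inputs a single embeddings request may carry. If the document produces more chunks than your provider accepts, the call can be split into batches — a minimal sketch, with a hypothetical limit of 64 per request (check your provider's actual limit):

```python
def create_embeddings_batched(texts, batch_size=64):
    """Embed texts in batches; returns a flat list of embedding data objects."""
    all_data = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        resp = client.embeddings.create(
            model=os.getenv("EMBEDDING_MODEL_ID"),
            input=batch
        )
        all_data.extend(resp.data)  # each item exposes an .embedding attribute
    return all_data
```

The returned list is a drop-in replacement for the `response.data` used in the search step below.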
Semantic Search
Implement cosine similarity to find the text chunks most relevant to the user's query.
```python
def cosine_similarity(vec1, vec2):
    """
    Calculates the cosine similarity between two vectors.

    Args:
        vec1 (np.ndarray): The first vector.
        vec2 (np.ndarray): The second vector.

    Returns:
        float: The cosine similarity between the two vectors.
    """
    # Compute the dot product of the two vectors and divide by the product of their norms
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
```
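`semantic_search` below scores chunks one by one, which is fine at this scale. For larger corpora, the same scores come out of a single matrix-vector product — a sketch, not part of the original pipeline, assuming the chunk embeddings are stacked into one NumPy matrix:

```python
def cosine_similarity_batch(query_vec, embedding_matrix):
    """Cosine similarity between one query vector and every row of a matrix."""
    query_vec = np.asarray(query_vec)
    embedding_matrix = np.asarray(embedding_matrix)
    # One dot product yields all scores; divide by the norms to normalize
    row_norms = np.linalg.norm(embedding_matrix, axis=1)
    query_norm = np.linalg.norm(query_vec)
    return embedding_matrix @ query_vec / (row_norms * query_norm)
```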
```python
def semantic_search(query, text_chunks, embeddings, k=5):
    """
    Performs semantic search on the text chunks using the given query and embeddings.

    Args:
        query (str): The query for the semantic search.
        text_chunks (List[str]): A list of text chunks to search through.
        embeddings (List[dict]): A list of embeddings for the text chunks.
        k (int): The number of top relevant text chunks to return. Default is 5.

    Returns:
        List[str]: A list of the top k most relevant text chunks based on the query.
    """
    # Create an embedding for the query
    query_embedding = create_embeddings(query).data[0].embedding
    similarity_scores = []  # Initialize a list to store similarity scores

    # Calculate similarity scores between the query embedding and each text chunk embedding
    for i, chunk_embedding in enumerate(embeddings):
        similarity_score = cosine_similarity(np.array(query_embedding), np.array(chunk_embedding.embedding))
        similarity_scores.append((i, similarity_score))  # Append the index and similarity score

    # Sort the similarity scores in descending order
    similarity_scores.sort(key=lambda x: x[1], reverse=True)

    # Get the indices of the top k most similar text chunks
    top_indices = [index for index, _ in similarity_scores[:k]]

    # Return the top k most relevant text chunks
    return [text_chunks[index] for index in top_indices]
```
Run a semantic search over the extracted text chunks.
```python
# Load the validation data from a JSON file
with open('../../data/val.json', encoding="utf-8") as f:
    data = json.load(f)

# Extract the first query from the validation data
query = data[0]['question']

# Perform semantic search to find the top 2 most relevant text chunks for the query
top_chunks = semantic_search(query, text_chunks, response.data, k=2)

# Print the query
print("Query:", query)

# Print the top 2 most relevant text chunks
for i, chunk in enumerate(top_chunks):
    print(f"Context {i + 1}:\n{chunk}\n=====================================")
```
Generating a Response from the Retrieved Chunks
```python
# Define the system prompt for the AI assistant
system_prompt = "你是一个AI助手,严格根据给定的上下文进行回答。如果无法直接从提供的上下文中得出答案,请回复:'我没有足够的信息来回答这个问题。'"


def generate_response(system_prompt, user_message):
    """
    Generates a response from the AI model based on the system prompt and user message.

    Args:
        system_prompt (str): The system prompt to guide the AI's behavior.
        user_message (str): The user's message or query.

    Returns:
        str: The text content of the model's response.
    """
    response = client.chat.completions.create(
        model=os.getenv("LLM_MODEL_ID"),
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ],
        temperature=0.1,
        top_p=0.8,
        presence_penalty=1.05,
        max_tokens=4096,
    )
    return response.choices[0].message.content


# Create the user prompt based on the top chunks
user_prompt = "\n".join([f"上下文内容 {i + 1}:\n{chunk}\n=====================================\n" for i, chunk in enumerate(top_chunks)])
user_prompt = f"{user_prompt}\n问题: {query}"

# Generate AI response
ai_response = generate_response(system_prompt, user_prompt)
print(ai_response)
```
Evaluating Response Quality
```python
# Define the system prompt for the evaluation system
evaluate_system_prompt = "你是一个智能评估系统,负责评估AI助手的回答。如果AI助手的回答与真实答案非常接近,则评分为1。如果回答错误或与真实答案不符,则评分为0。如果回答部分符合真实答案,则评分为0.5。"

# Create the evaluation prompt by combining the user query, AI response, true response, and evaluation system prompt
evaluation_prompt = f"用户问题: {query}\nAI回答:\n{ai_response}\nTrue Response: {data[0]['ideal_answer']}\n{evaluate_system_prompt}"

# Generate the evaluation response using the evaluation system prompt and evaluation prompt
evaluation_response = generate_response(evaluate_system_prompt, evaluation_prompt)
print(evaluation_response)
```
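This scores a single validation item. To get an aggregate number over the whole `val.json` set, the same retrieve-generate-judge loop can be run per item — a minimal sketch, which assumes (brittly) that the judge's reply contains the numeric score verbatim; in practice a stricter output format would be safer:

```python
import re


def evaluate_all(data, text_chunks, embeddings, k=2):
    """Run retrieval, generation, and LLM-as-judge scoring over every item."""
    scores = []
    for item in data:
        question = item['question']
        chunks = semantic_search(question, text_chunks, embeddings, k=k)
        context = "\n".join(f"上下文内容 {i + 1}:\n{c}" for i, c in enumerate(chunks))
        answer = generate_response(system_prompt, f"{context}\n问题: {question}")
        judgment = generate_response(
            evaluate_system_prompt,
            f"用户问题: {question}\nAI回答:\n{answer}\nTrue Response: {item['ideal_answer']}"
        )
        match = re.search(r"0\.5|1|0", judgment)  # fragile: assumes the score text appears
        if match:
            scores.append(float(match.group()))
    return sum(scores) / len(scores) if scores else 0.0
```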