当前位置：首页 > news >正文

RAG初筛混合方案 - bm25+vector

news 2025/9/2 7:32:14

RAG初筛混合方案，在初筛阶段，不仅使用向量检索，同时使用bm25全文检索，以提高覆盖率。

milvus自2.5版本起已支持bm25，这里以milvus为例演示混合检索过程，代码修改自网络资料。

1 milvus包安装

milvus自2.5版本起支持bm25，这里选择2.6版本。

由于linux系统较老，milvus-lite可能遇到glibc不兼容问题，这里采用docker compose方式。

假设docker compose已安装，milvus安装命令如下

# Download the Milvus v2.6.0 standalone docker-compose file and save it as docker-compose.yml
wget https://github.com/milvus-io/milvus/releases/download/v2.6.0/milvus-standalone-docker-compose.yml -O docker-compose.yml
# Start Milvus in the background (detached mode)
sudo docker compose up -d

https://milvus.io/docs/install_standalone-docker-compose.md

2 milvus库构建

1）连接milvus库

from pymilvus import MilvusClient, DataType

# Connect to the Milvus standalone instance started via docker compose.
# The token has the form "<user>:<password>".
client = MilvusClient(
    uri="http://localhost:19530",
    token="root:Milvus",
)

2）计算向量

假设向量维度为1024

def query_embedding_compute(queries=None):
    """Return the embeddings for a list of query strings.

    ``emb_model_cal`` stands for the actual call into the embedding model
    and must be supplied by the surrounding environment.

    Args:
        queries: Texts to embed. ``None`` is treated as an empty list.

    Returns:
        Whatever ``emb_model_cal`` returns for ``queries``; the code below
        indexes it per input text.
    """
    if queries is None:
        queries = []
    return emb_model_cal(queries)


# ``data`` caches each document's vector together with its raw text;
# ``docs`` is the list of document strings to index (defined by the caller).
data = [
    {"vector": query_embedding_compute([doc])[0], "text": doc}
    for doc in docs
]
print(f"data: {len(data)}")

3）数据导入

from pymilvus import DataType, Function, FunctionType

collection_name = "hybrid_cases"
vector_dim = 1024

schema = MilvusClient.create_schema(
    enable_dynamic_field=True,
)

analyzer_params = {
    "type": "chinese",  # use the Chinese analyzer for tokenization
}

# Add fields to schema
schema.add_field(field_name="vid", datatype=DataType.INT64, is_primary=True, auto_id=True)
schema.add_field(
    field_name="text",
    datatype=DataType.VARCHAR,
    max_length=65535,
    enable_analyzer=True,
    analyzer_params=analyzer_params,
    enable_match=True,
)
# Output field of the BM25 function below; it is not supplied on insert.
schema.add_field(field_name="sparse_bm25", datatype=DataType.SPARSE_FLOAT_VECTOR)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=vector_dim)
# The rows inserted below carry only "vector" and "text", so "subject" is
# declared nullable; otherwise the insert fails on the missing field.
schema.add_field(
    field_name="subject",
    datatype=DataType.VARCHAR,
    max_length=65535,
    enable_analyzer=True,
    enable_match=True,
    nullable=True,
)

# BM25 function: derives the sparse vector from the raw "text" field.
bm25_function = Function(
    name="bm25",
    function_type=FunctionType.BM25,
    input_field_names=["text"],
    output_field_names=["sparse_bm25"],
)
schema.add_function(bm25_function)

index_params = client.prepare_index_params()

# Add indexes
index_params.add_index(
    field_name="vector",
    index_name="vector_index",
    index_type="IVF_FLAT",
    metric_type="IP",
    params={"nlist": 128},
)
index_params.add_index(
    field_name="sparse_bm25",
    index_name="sparse_bm25_index",
    index_type="SPARSE_WAND",
    metric_type="BM25",
)

# Create collection
client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params,
)

# Insert data (the list of {"vector", "text"} rows built earlier)
res = client.insert(
    collection_name=collection_name,
    data=data,
)
# Single quotes inside the f-string keep it valid on Python < 3.12.
print(f"生成 {len(data)} 个向量，维度：{len(data[0]['vector'])}")

3 milvus测试

1）BM25全文搜索

from pymilvus import MilvusClient

client = MilvusClient(
    uri="http://localhost:19530",
    token="root:Milvus",
    db_name="default",
)

search_params = {
    "params": {"drop_ratio_search": 0.2},
}

# Full-text search: the raw query string is passed as data and searched
# against the BM25 sparse field.
full_text_res = client.search(
    collection_name=collection_name,
    data=["中国首都在哪里?"],
    anns_field="sparse_bm25",
    limit=3,
    search_params=search_params,
    output_fields=["text"],
)

for hits in full_text_res:
    for hit in hits:
        print(hit)

2）向量搜索+文本匹配

# Dense vector search restricted to rows whose text matches both keywords.
# Named text_filter to avoid shadowing the builtin ``filter``.
text_filter = "TEXT_MATCH(text, '中国') and TEXT_MATCH(text, '首都')"

query = "中国首都在哪里"
query_embeddings = query_embedding_compute([query])
print(query_embeddings)

text_match_res = client.search(
    collection_name=collection_name,
    anns_field="vector",
    data=query_embeddings,
    filter=text_filter,
    search_params={"params": {"nprobe": 10}},
    limit=10,
    output_fields=["text"],
)
print(text_match_res)

3）纯文本匹配

# Scalar query using keyword matching only (no vector search involved).
# Named text_filter to avoid shadowing the builtin ``filter``.
text_filter = "TEXT_MATCH(text, '首都')"

text_match_res = client.query(
    collection_name=collection_name,
    filter=text_filter,
    output_fields=["text"],
)
# Print explicitly so the result is also visible outside a notebook.
print(text_match_res)

4）混合搜索 - 向量+BM25全文

from pymilvus import AnnSearchRequest, RRFRanker

# Define the query
query = "中国首都在哪里"

# Embed the query with the same helper used when indexing the documents
query_embeddings = query_embedding_compute([query])

# Number of results to retrieve per request and after fusion
top_k = 8

# Define the parameters for the dense vector search
search_params_dense = {
    "metric_type": "IP",
    "params": {"nprobe": 2},
}

# Create a dense vector search request
request_dense = AnnSearchRequest([query_embeddings[0]], "vector", search_params_dense, limit=top_k)

# Define the parameters for the BM25 text search
search_params_bm25 = {
    "metric_type": "BM25",
}

# Create a BM25 text search request (the raw query text is the search data)
request_bm25 = AnnSearchRequest([query], "sparse_bm25", search_params_bm25, limit=top_k)

# Combine the two requests
reqs = [request_dense, request_bm25]

# Initialize the RRF ranking algorithm
ranker = RRFRanker(100)

# Perform the hybrid search
hybrid_search_res = client.hybrid_search(
    collection_name=collection_name,
    reqs=reqs,
    ranker=ranker,
    limit=top_k,
    output_fields=["text"],
)

# Extract the context from hybrid search results
context = []
print("Top K Results:")
for hits in hybrid_search_res:
    for hit in hits:
        text = hit["entity"]["text"]
        context.append(text)  # collect the text content as context
        print(text)  # output each retrieved document

附录: milvus-lite问题

问题1: 创建混合索引失败