模型下载
from huggingface_hub import snapshot_download
# Download the complete jinaai/jina-embeddings-v3 repository snapshot into a
# local directory and report where it was stored.
download_dir = r"../model/V2/jina-embeddings-v3"
local_dir = snapshot_download(
    repo_id="jinaai/jina-embeddings-v3",
    local_dir=download_dir,
    # NOTE(review): recent huggingface_hub releases appear to deprecate this
    # argument -- confirm against the installed version before removing it.
    local_dir_use_symlinks=False,
)
print("模型保存路径:", local_dir)
- 使用该模型跑数据集时,仍可能报错,提示找不到若干文件;原因是该模型还依赖其他仓库中的文件
- 依赖文件所在地址: https://huggingface.co/jinaai/xlm-roberta-flash-implementation/tree/main
文本向量化
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
# Step 1: load the review CSV; its first column is used as the index.
input_datapath = './data/fine_food_reviews_1k.csv'
df = pd.read_csv(input_datapath, index_col=0)

# Keep only the columns used downstream and drop rows with missing values,
# so every remaining row has both a Summary and a Text to embed.
df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
df = df.dropna()

# Build a single text per review: whitespace-stripped summary and body.
df["combined"] = (
    "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
)

# Step 2: load the embedding model from a local snapshot directory.
# trust_remote_code=True lets the model's own Python code be executed.
model_path = 'model/jina-embeddings-v3/models--jinaai--jina-embeddings-v3/snapshots/f1944de8402dcd5f2b03f822a4bc22a7f2de2eb9'
model = SentenceTransformer(model_path, trust_remote_code=True)

# Step 3: encode every combined text into a vector (returned as a NumPy array).
texts = df["combined"].tolist()
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

# Store each vector as a plain Python list in the "embedding" column.
df["embedding"] = embeddings.tolist()

# Step 4: write the result to parquet, creating the output directory if needed.
output_path = './output/fine_food_reviews_with_embeddings.parquet'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_parquet(output_path, index=False)
print(f"✅ 向量化完成,共生成 {len(df)} 条记录,已保存至:{output_path}")
Faiss检索
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
# Step 1: load the reviews together with their precomputed embeddings.
df = pd.read_parquet('./output/fine_food_reviews_with_embeddings.parquet')

# Stack the per-row vectors into one 2-D matrix. Faiss operates on float32
# data, while the vectors were stored as Python floats, so cast explicitly.
embeddings = np.array(df['embedding'].to_list(), dtype=np.float32)

# Step 2: load the same local model snapshot to embed the query text.
model_path = 'model/jina-embeddings-v3/models--jinaai--jina-embeddings-v3/snapshots/f1944de8402dcd5f2b03f822a4bc22a7f2de2eb9'
model = SentenceTransformer(model_path, trust_remote_code=True)

# Step 3: build an exact (brute-force) L2 index sized to the vector dimension.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Step 4: embed the query and fetch the 3 nearest reviews.
query = "蜂蜜"
query_vec = np.asarray(
    model.encode([query], convert_to_numpy=True), dtype=np.float32
)
# D holds the distances, I the row positions of the hits (one row per query).
D, I = index.search(query_vec, k=3)

print("🔍 查询结果:")
for idx in I[0]:
    # Faiss pads with -1 when fewer than k neighbours exist; skip those so
    # iloc[-1] does not silently print the last row.
    if idx < 0:
        continue
    row = df.iloc[idx]
    print(f"\n📌 Score: {row['Score']}")
    print(f"📄 Text: {row['combined']}")