当前位置: 首页 > news >正文

milvus+flask山寨复刻《从零构建向量数据库》第7章

常规练手,图片搜索山寨版。拜读罗云大佬著作,结果只有操作层的东西可以上上手。

书中是自己写的向量数据库,这边直接用python拼个现成的milvus向量数据库。

1. 创建一个向量数据库以及对应的相应数据表:

# Milvus Setup Arguments
COLLECTION_NAME = 'animal_search'
DIMENSION = 2048
MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"# Inference Arguments
BATCH_SIZE = 128from pymilvus import connections# Connect to the instance
connections.connect(host=MILVUS_HOST,port=MILVUS_PORT)from pymilvus import utility# Remove any previous collection with the same name
if utility.has_collection(COLLECTION_NAME):utility.drop_collection(COLLECTION_NAME)#创建保存ID、图片文件路径及Embeddings的Collection。
from pymilvus import FieldSchema, CollectionSchema, DataType, Collectionfields = [FieldSchema(name='id',dtype=DataType.INT64, is_primary=True, auto_id=True),FieldSchema(name='filepath', dtype=DataType.VARCHAR,max_length=200),FieldSchema(name='image_embedding',dtype=DataType.FLOAT_VECTOR,dim=DIMENSION)]
schema = CollectionSchema(fields=fields)
collection = Collection(name=COLLECTION_NAME, schema=schema)index_params = {'metric_type':'L2','index_type': "IVF_FLAT",'params':{'nlist':16384}
}
collection.create_index(field_name="image_embedding",index_params=index_params)
collection.load()

2. 写一堆图片进去存着,向量其实就是各种像素间的维度特征,

# Milvus Setup Arguments
COLLECTION_NAME = 'animal_search'
DIMENSION = 2048
MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"# Inference Arguments
BATCH_SIZE = 128from pymilvus import connections# Connect to the instance
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)import globpaths = glob.glob('/mcm/vectorDB_training/animals_db/*',recursive=True)#分批预处理数据
import torch
# Load the embedding model with the last layer removed
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', pretrained=True)
model = torch.nn.Sequential(*(list(model.children())[:-1]))
model.eval()from torchvision import transforms
# Preprocessing for images
preprocess = transforms.Compose([transforms.Resize(256),transforms.CenterCrop(224),transforms.ToTensor(),transforms.Normalize(mean=[0.485,0.456,0.406],std=[0.229,0.224,0.225])
])#插入数据
from PIL import Image
from tqdm import tqdm# Embed function that embeds the batch and inserts it
def embed(data):from pymilvus import FieldSchema, CollectionSchema, DataType, Collectionfields = [FieldSchema(name='id',dtype=DataType.INT64, is_primary=True, auto_id=True),FieldSchema(name='filepath', dtype=DataType.VARCHAR,max_length=200),FieldSchema(name='image_embedding',dtype=DataType.FLOAT_VECTOR,dim=DIMENSION)]schema = CollectionSchema(fields=fields)collection = Collection(name=COLLECTION_NAME, schema=schema)with torch.no_grad():output = model(torch.stack(data[0])).squeeze()collection.insert([data[1],output.tolist()])collection.flush()data_batch = [[],[]]# Read the images into batches for embedding and insertion
for path in tqdm(paths):im = Image.open(path).convert('RGB')data_batch[0].append(preprocess(im))data_batch[1].append(path)if len(data_batch[0]) % BATCH_SIZE == 0:embed(data_batch)data_batch = [[],[]]# Embed and insert the remainder
if len(data_batch[0]) != 0:embed(data_batch)

3. 向量化图片的函数要单独拎出来,做搜索功能的时候用它。

import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Imagedef extract_features(image_path):# 加载预训练的 ResNet-50 模型model = resnet50(pretrained=True)model = torch.nn.Sequential(*list(model.children())[:-1])  #移除fc层,不移除,向量最后就是1000层,而不是2048model.eval()# 图像预处理preprocess = transforms.Compose([transforms.Resize(256),transforms.CenterCrop(224),transforms.ToTensor(),transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),])# 读取图像img = Image.open(image_path)img_t = preprocess(img)batch_t = torch.unsqueeze(img_t, 0)# 提取特征with torch.no_grad():out = model(batch_t)# 将特征向量转换为一维数组并返回return out.flatten().numpy()

4. 用flask做的界面

from flask import Flask,request,jsonify
from flask import render_template
from image_eb import extract_features
#from pymilvus import connections
from pymilvus import MilvusClientimport logging
import os
import shutilMILVUS_HOST = "localhost"
MILVUS_PORT = "19530"
COLLECTION_NAME = 'animal_search'
TOP_K = 3app = Flask(__name__)
milvus_client = MilvusClient(uri="http://localhost:19530")@app.route("/")
def index():return render_template("index.html")@app.route("/upload",methods=["POST"])
def upload_image():image_file = request.files["image"]image_id_str = request.form.get("image_id")data = []#检查image_id是否存在。if not image_id_str:return jsonify({"message": "Image ID is required"}),400#image id转化为整型try:image_id = int(image_id_str)data.append(image_id)except ValueError:return jsonify({"message": "Invalid image ID. It must be an integer"}),400filename = image_file.filenameimage_path = os.path.join("static/images",image_id_str)image_file.save(image_path)image_features = extract_features(image_path)data.append(image_features)data_dict = dict(filepath=image_path,image_embedding=data[1])#更新数据库中记录milvus_client.insert(collection_name=COLLECTION_NAME,data=[data_dict])return jsonify({"message": "Image uploaded successfully", "id": image_id})@app.route("/search",methods=["POST"])
def search_image():image_file = request.files["image"]image_path = os.path.join("static/images","temp_image.jpg")image_file.save(image_path)image_features = extract_features(image_path)data_li = [extract_features(image_path).tolist()]search_result = milvus_client.search(collection_name=COLLECTION_NAME,data=data_li,output_fields=["filepath"],limit=TOP_K,search_params={'metric_type': 'L2', 'params': {}},)dict_search_result = search_result[0]arr_search_result = []destination_folder = '/mcm/vectorDB_training/static/images'for index,value in enumerate(dict_search_result):source_file = value["entity"]["filepath"]base_file_name = os.path.basename(source_file)destination_file = os.path.join(destination_folder, base_file_name)shutil.copy(source_file, destination_file)key_file_name = os.path.join("/static/images",base_file_name)arr_search_result.append(key_file_name)        image_urls = [f"{filepath}" for filepath in arr_search_result]return jsonify({"image_urls":image_urls})if __name__=="__main__":app.run(host='0.0.0.0',port=5020,debug=True)

小网站结构,以及其他杂代码,可以查看以及直接下载:https://www.ituring.com.cn/book/3305

相关文章:

  • 常见的降维算法
  • 右值引用的剖析
  • OpenCv实战笔记(4)基于opencv实现ORB特征匹配检测
  • SpringCloud之Ribbon基础认识-服务负载均衡
  • MySQL索引与事务
  • 游戏引擎学习第268天:合并调试链表与分组
  • windows 环境下 python环境安装与配置
  • KEIL_C51和KEIL_MDK共存
  • AI大模型学习十七、利用Dify搭建 AI 图片生成应用
  • average per-pixel disparity error: EPE及不同距离值下的误差曲线
  • Python函数式编程入门:闭包与装饰器详解
  • 开疆智能Canopen转Profinet网关连接工博士GBS20机器人配置案例
  • 栈应用:辅助站(c++)
  • 使用 CST 软件增强工业设备的电磁兼容性设计:原理、方法和应用
  • SpringBoot框架开发网络安全科普系统开发实现
  • SAP CEO Christian Klein专访:企业AI、云转型与未来战略
  • MySQL进阶篇2_SQL优化、锁
  • JDBC工具类
  • 基于Spring Boot + Vue的母婴商城系统( 前后端分离)
  • PHP框架在分布式系统中的应用!
  • 武汉旅游体育集团有限公司原党委书记、董事长董志向被查
  • 韩国执政党总统候选人更换方案被否决,金文洙候选人资格即刻恢复
  • 贵州省总工会正厅级副主席梁伟被查,曾任贵州省纪委副书记
  • 当创业热土遇上年轻气息,上海南汇新城发展如何再发力?
  • 上海:企业招用高校毕业生可享受1500元/人一次性扩岗补助
  • 上海楼市“银四”兑现:新房市场高端改善领跑,二手房量价企稳回升