瑞莎星睿(Radxa Orion O6) 基于 Android OS 使用 NPU 的图片模糊查找 APP 开发
安谋科技、此芯科技与瑞莎计算机共同推出"星睿O6"开发套件,专为AI PC、边缘计算及机器人等应用场景设计。该套件创新性地整合了Arm®v9 CPU核心、Arm Immortalis™ GPU以及安谋科技自主研发的"周易"NPU。
在Android操作系统环境下,开发者可利用这套开发套件,通过原生JNI方式对开源项目进行优化改造,实现基于NPU加速的CLIP技术,从而提升图片模糊搜索的性能表现。
🔍 Search local images with natural language on Android, powered by OpenAI's CLIP model. / 在 Android 上用自然语言搜索本地图片 (基于 OpenAI 的 CLIP 模型)
https://github.com/greyovo/PicQuery
下面我们开始改造之路:
- 导入项目到 AndroidStudio
- 导入 O6 NPU的 native lib 库
- 新增 JNI C++ 代码
- 改造项目 kotlin 代码
- 编译
- 上板执行!
下面给出具体的操作细节:
- O6 NPU 的 native lib 库是在 Android 镜像的如下位置
.
└── vendor
    ├── include
    │   └── npu
    │       ├── kmd
    │       │   ├── armchina_aipu.h
    │       │   └── tcb.h
    │       └── standard_api.h
    └── lib64
        └── libaipudrv.so
这里还需要注意一点:我们还需要把 libc++ 的库一并打包进 APK 的 jniLibs 中。因为系统里提供的 libaipudrv.so 是动态链接的,并不自带 libc++ 这个 C++ 运行库,缺少它的话运行时会报错。
vendor/lib64/libc++.so
- 主要是用到的 API 可以参考 CIX NPU 开发指导手册
首先使用方法和 Linux 端的使用基本一致,只是需要根据 JNI 的方式做一些符合 JNI 要求的修改即可。
大家也可以参考我这边的代码来组织自己的code,这边应该可以说是通用的。
/*
 * Load the AIPU model binary passed from Java, initialize the NPU driver
 * context, query the partition/cluster/core topology, load the graph, and
 * cache input/output tensor descriptors plus output buffers in file-scope
 * globals (ctx, graph_id, input_desc, output_desc, output_data, ...) for
 * later use by processNpuInference.
 *
 * @param model      Java byte[] holding the compiled .cix/.bin model blob.
 * @param model_size Size of the model blob in bytes.
 * @return 0 on success (errors are currently logged but non-fatal, matching
 *         the original best-effort behavior).
 */
extern "C" JNIEXPORT jint JNICALL
Java_me_grey_picquery_NpuInference_preprocessNpuInference(JNIEnv *env, jobject thiz,
                                                          jbyteArray model, jint model_size)
{
    initTestBench(&opt);

    /* Copy the model out of the Java heap into a plain C buffer so the
     * driver can consume it after the JNI array is released. */
    jbyte *model_bin = env->GetByteArrayElements(model, NULL);
    char *buffer1 = new char[model_size];
    memcpy(buffer1, model_bin, model_size);
    opt.model_bin = buffer1;
    opt.bin_size = model_size;
    LOGE("[TEST INFO] preprocessNpuInference\n");

    memset(&sim_glb_config, 0, sizeof(sim_glb_config));
    memset(&sim_job_config, 0, sizeof(sim_job_config));
    memset(&mem_dump_config, 0, sizeof(mem_dump_config));
    mem_dump_config.dump_dir = opt.dump_dir;

    /* BUG FIX: the original discarded the status of aipu_init_context()
     * and then tested a stale `ret`; capture the return value first. */
    ret = aipu_init_context(&ctx);
    if (ret != AIPU_STATUS_SUCCESS) {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] AIPU_init_ctx: %s\n", msg);
    }

    ret = aipu_config_global(ctx, AIPU_CONFIG_TYPE_SIMULATION, &sim_glb_config);
    if (ret != AIPU_STATUS_SUCCESS) {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] AIPU_config_simulation: %s\n", msg);
    }

    /* Enumerate the NPU topology once (part_cnt != 0 means already done). */
    if (part_cnt == 0) {
        ret = aipu_get_partition_count(ctx, &part_cnt);
        if (ret != AIPU_STATUS_SUCCESS) {
            aipu_get_error_message(ctx, ret, &msg);
            LOGE("aipu_get_partition_count: %s \n", msg);
        }
        for (uint32_t i = 0; i < part_cnt; i++) {
            ret = aipu_get_cluster_count(ctx, i, &cluster_cnt);
            if (ret != AIPU_STATUS_SUCCESS) {
                aipu_get_error_message(ctx, ret, &msg);
                LOGE("aipu_get_cluster_count: %s \n", msg);
            }
            for (uint32_t j = 0; j < cluster_cnt; j++) {
                ret = aipu_get_core_count(ctx, i, j, &core_cnt);
                if (ret != AIPU_STATUS_SUCCESS) {
                    aipu_get_error_message(ctx, ret, &msg);
                    LOGE("aipu_get_core_count: %s \n", msg);
                }
                LOGE("[TEST INFO] <part_idx, cluster_idx, core_cnt> = <%u, %u, %u>\n",
                     i, j, core_cnt);
            }
        }
    }

    ret = aipu_load_graph_helper(ctx, opt.model_bin, opt.bin_size, &graph_id);
    if (ret != AIPU_STATUS_SUCCESS) {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] AIPU_load_graph_helper: %s\n", msg);
    }
    LOGE("[TEST INFO] AIPU load graph successfully.\n");

    /* Cache input tensor descriptors. */
    ret = aipu_get_tensor_count(ctx, graph_id, AIPU_TENSOR_TYPE_INPUT, &input_cnt);
    if (ret != AIPU_STATUS_SUCCESS) {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] INPUT: aipu_get_tensor_count: %s\n", msg);
    }
    LOGE("[TEST INFO] INPUT: aipu_get_tensor_count success: input_cnt = %d\n", input_cnt);
    for (uint32_t i = 0; i < input_cnt; i++) {
        aipu_tensor_desc_t desc;
        ret = aipu_get_tensor_descriptor(ctx, graph_id, AIPU_TENSOR_TYPE_INPUT, i, &desc);
        if (ret != AIPU_STATUS_SUCCESS) {
            aipu_get_error_message(ctx, ret, &msg);
            LOGE("[TEST ERROR] INPUT: aipu_get_tensor_descriptor: %s\n", msg);
        }
        LOGE("[TEST INFO] INPUT[%d]: desc.size: %u\n", i, desc.size);
        LOGE("[TEST INFO] INPUT[%d]: desc.scale: %f\n", i, desc.scale);
        LOGE("[TEST INFO] INPUT[%d]: desc.zero_point: %f\n", i, desc.zero_point);
        LOGE("[TEST INFO] INPUT[%d]: desc.data_type: %u\n", i, desc.data_type);
        LOGE("[TEST INFO] INPUT[%d]: desc.id: %u\n", i, desc.id);
        input_desc.push_back(desc);
    }

    /* Cache output tensor descriptors. */
    ret = aipu_get_tensor_count(ctx, graph_id, AIPU_TENSOR_TYPE_OUTPUT, &output_cnt);
    if (ret != AIPU_STATUS_SUCCESS) {
        aipu_get_error_message(ctx, ret, &msg);
        fprintf(stderr, "[TEST ERROR] aipu_get_tensor_count: %s\n", msg);
    }
    LOGE("[TEST INFO] OUTPUT: aipu_get_tensor_count success: output_cnt = %d\n", output_cnt);
    for (uint32_t i = 0; i < output_cnt; i++) {
        aipu_tensor_desc_t desc;
        /* BUG FIX: the original dropped this call's status and tested a
         * stale `ret` from the previous API call. */
        ret = aipu_get_tensor_descriptor(ctx, graph_id, AIPU_TENSOR_TYPE_OUTPUT, i, &desc);
        if (ret != AIPU_STATUS_SUCCESS) {
            aipu_get_error_message(ctx, ret, &msg);
            LOGE("[TEST ERROR] aipu_get_tensor_descriptor: %s\n", msg);
        }
        LOGE("[TEST INFO] OUTPUT[%d]: desc.size: %u\n", i, desc.size);
        LOGE("[TEST INFO] OUTPUT[%d]: desc.scale: %f\n", i, desc.scale);
        LOGE("[TEST INFO] OUTPUT[%d]: desc.zero_point: %f\n", i, desc.zero_point);
        LOGE("[TEST INFO] OUTPUT[%d]: desc.data_type: %u\n", i, desc.data_type);
        LOGE("[TEST INFO] OUTPUT[%d]: desc.id: %u\n", i, desc.id);
        output_desc.push_back(desc);
    }

    /* Pre-allocate one host buffer per output tensor; freed at teardown. */
    for (uint32_t i = 0; i < output_cnt; i++) {
        char *output = new char[output_desc[i].size];
        output_data.push_back(output);
    }

    env->ReleaseByteArrayElements(model, model_bin, 0);
    delete[] buffer1;
    /* BUG FIX: opt.model_bin pointed at the buffer just freed above; null
     * it out so later code cannot dereference a dangling pointer. */
    opt.model_bin = nullptr;
    return 0;
}
/*
 * Run one NPU inference: copy the preprocessed input from Java, create a
 * high-QoS job on the graph loaded by preprocessNpuInference, load the input
 * tensor, run the job, and copy output tensor 0 back into the Java `output`
 * byte[].
 *
 * @param inputBin    Java int[] holding the (already preprocessed) input data.
 * @param inputLength Number of ints in inputBin.
 * @param output      Java byte[] that receives output tensor 0
 *                    (must be at least output_desc[0].size bytes).
 * @return 0 on success (errors are logged but non-fatal, as in the original).
 */
extern "C" JNIEXPORT jint JNICALL
Java_me_grey_picquery_NpuInference_processNpuInference(JNIEnv *env, jobject thiz,
                                                       jintArray inputBin, jint inputLength,
                                                       /* BUG FIX: `output` was accidentally
                                                        * commented out along with the unused
                                                        * golden-output parameters, leaving the
                                                        * signature uncompilable. */
                                                       jbyteArray output)
{
    jint *inputData = env->GetIntArrayElements(inputBin, NULL);
    jbyte *outputData = env->GetByteArrayElements(output, NULL);

    /* Copy the Java ints into a driver-owned buffer (malloc'd, freed below). */
    void *voidInputData = malloc(inputLength * sizeof(jint));
    if (voidInputData != nullptr) {
        memcpy(voidInputData, inputData, inputLength * sizeof(jint));
    }
    opt.inputs.push_back(voidInputData);
    opt.inputs_size.push_back(inputLength * sizeof(int));
    LOGE("[TEST INFO] NpuInference void* type inputLength= %lu \n",
         inputLength * sizeof(int));

    LOGE("[TEST INFO] do npu inference now\n");
    create_job_cfg.partition_id = 0;
    create_job_cfg.qos_level = AIPU_JOB_QOS_HIGH;
    ret = aipu_create_job(ctx, graph_id, &job_id, &create_job_cfg);
    if (ret != AIPU_STATUS_SUCCESS) {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] aipu_create_job: %s\n", msg);
    }
    LOGE("[TEST INFO] aipu_create_job success\n");

    /* BUG FIX: the original dropped aipu_config_job()'s status and tested a
     * stale `ret`; capture it before checking. */
    ret = aipu_config_job(ctx, job_id, AIPU_CONFIG_TYPE_SIMULATION, &sim_job_config);
    if (ret != AIPU_STATUS_SUCCESS) {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] aipu_config_job: %s\n", msg);
    }
    LOGE("[TEST INFO] set job simulation config success\n");

    if (opt.inputs.size() != input_cnt) {
        LOGE("[TEST WARN] input file count (%u) != input tensor count (%u)\n",
             (uint32_t)opt.inputs.size(), input_cnt);
    }
    for (uint32_t i = 0; i < min((uint32_t)opt.inputs.size(), input_cnt); i++) {
        if (input_desc[i].size > opt.inputs_size[i]) {
            LOGE("[TEST INFO] input file %s len 0x%x < input tensor %u size 0x%x\n",
                 opt.input_files[i].c_str(), opt.inputs_size[i], i, input_desc[i].size);
        }
        ret = aipu_load_tensor(ctx, job_id, i, opt.inputs[i]);
        if (ret != AIPU_STATUS_SUCCESS) {
            aipu_get_error_message(ctx, ret, &msg);
            LOGE("[TEST ERROR] aipu_load_tensor: %s\n", msg);
        }
        LOGE("[TEST INFO] load input tensor %d from (%u/%u)\n", i, i + 1, input_cnt);
    }

    /* BUG FIX: `gettimeofday(&timestart, ...)` was garbled to `×tart`, and
     * aipu_finish_job()'s status was discarded before the `ret` check. */
    gettimeofday(&timestart, NULL);
    ret = aipu_finish_job(ctx, job_id, -1);   /* -1: block until completion */
    if (ret != AIPU_STATUS_SUCCESS) {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] aipu_finish_job: %s\n", msg);
        pass = -1;
    }
    LOGE("[TEST INFO] aipu_finish_job success\n");
    gettimeofday(&timeend, NULL);

    /* Drop the per-call input entries again; guard against underflow in case
     * the counts ever disagree (the original popped blindly input_cnt times). */
    for (uint32_t i = 0; i < input_cnt && !opt.inputs.empty(); i++) {
        opt.inputs.pop_back();
        opt.inputs_size.pop_back();
    }

    for (uint32_t i = 0; i < output_cnt; i++) {
        ret = aipu_get_tensor(ctx, job_id, AIPU_TENSOR_TYPE_OUTPUT, i, output_data[i]);
        if (ret != AIPU_STATUS_SUCCESS) {
            aipu_get_error_message(ctx, ret, &msg);
            LOGE("[TEST ERROR] aipu_get_tensor: %s\n", msg);
        }
        LOGE("[TEST INFO] get output tensor %u success (%u/%u)\n", i, i + 1, output_cnt);
    }

    LOGE("[TEST INFO] output_desc[0].size 0x%x\n", output_desc[0].size);
    LOGE("[TEST INFO] npu post process\n");
    /* Copy output tensor 0 into the caller's Java byte[]. Assumes the Kotlin
     * side sized `output` to at least output_desc[0].size — TODO confirm. */
    memcpy(outputData, output_data[0], output_desc[0].size);

    ret = aipu_clean_job(ctx, job_id);
    if (ret != AIPU_STATUS_SUCCESS) {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] AIPU_clean_job: %s\n", msg);
    }
    LOGE("[TEST INFO] aipu_clean_job success\n");

    /* Release JNI arrays (mode 0: copy back and free the native copy). */
    env->ReleaseIntArrayElements(inputBin, inputData, 0);
    env->ReleaseByteArrayElements(output, outputData, 0);
    /* BUG FIX: voidInputData came from malloc(), so it must be released with
     * free() — the original used delete[], which is undefined behavior. */
    free(voidInputData);
    return 0;
}
CMakeLists.txt 也可以参考我的代码:
# Build the JNI bridge as a shared library loadable via System.loadLibrary("npu_inference").
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

add_library(npu_inference SHARED
    ${CMAKE_CURRENT_SOURCE_DIR}/npu_inference.cpp)

# BUG FIX: the original had the target and library names fused together
# ("npu_inferenceaipudrvandroidlog"), which is not valid CMake.
#   aipudrv - CIX/Arm China NPU user-mode driver (vendor/lib64/libaipudrv.so)
#   android - Android NDK native API
#   log     - Android logging (__android_log_print used by LOGE)
target_link_libraries(npu_inference
    aipudrv
    android
    log)