Vortex GPGPU的github流程跑通与功能模块波形探索(四)
文章目录
- 前言
- 一、demo的输入文件
- 二、trace_csv
- 三、2个值得注意的点
- 3.1 csv指令表格里面的tmask?
- 3.2 rtlsim和simx的log文件?
- 总结
前言
跟着前面那篇最后留下的几个问题接着把输出波形文件和csv文件的输入、输出搞明白!
一、demo的输入文件
该文件夹下的内容包括:
dention@dention-virtual-machine:~/Desktop/vortex/vortex/tests/regression/demo$ tree
.
├── common.h
├── kernel.cpp
├── main.cpp
└── Makefile0 directories, 4 files
Makefile
的内容中规中矩,没有很复杂的写法!主要是完成了编译的配置,其编译的命令出现在../Makefile
内。
因此,核心文件就main.cpp
和kernel.cpp
:
kernel.cpp
如下:
#include <vx_spawn.h>
#include "common.h"void kernel_body(kernel_arg_t* __UNIFORM__ arg) {auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);auto dst_ptr = reinterpret_cast<TYPE*>(arg->dst_addr);uint32_t count = arg->task_size;uint32_t offset = blockIdx.x * count;for (uint32_t i = 0; i < count; ++i) {dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];}
}int main() {kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}
粗略看功能实现是:执行向量加法,将两个源向量的元素相加并存储到目标向量中。通过reinterpret_cast
将地址转换为相应的指针类型,然后进行内存访问和计算。这里的count
和offset
有点CUDA
的blockIdx
、thread
的感觉了。其中的vx_spawn_threads
是(见于/src/vx_spawn.c
):
int vx_spawn_threads(uint32_t dimension,const uint32_t* grid_dim,const uint32_t * block_dim,vx_kernel_func_cb kernel_func,const void* arg) {// calculate number of groups and group sizeuint32_t num_groups = 1;uint32_t group_size = 1;for (uint32_t i = 0; i < 3; ++i) {uint32_t gd = (grid_dim && (i < dimension)) ? grid_dim[i] : 1;uint32_t bd = (block_dim && (i < dimension)) ? block_dim[i] : 1;num_groups *= gd;group_size *= bd;gridDim.m[i] = gd;blockDim.m[i] = bd;}
这一段与CUDA
编程的相似度可能更加明显,不同于NVIDIA GPU
,这段用在Vortex平台
上启动并行线程。根据给定的grid
和block
维度,计算线程组的数量和大小,并启动内核函数。
然后是main.cpp
的测试程序:
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"#define FLOAT_ULP 6#define RT_CHECK(_expr) \do { \int _ret = _expr; \if (0 == _ret) \break; \printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \cleanup(); \exit(-1); \} while (false)///template <typename Type>
class Comparator {};template <>
class Comparator<int> {
public:static const char* type_str() {return "integer";}static int generate() {return rand();}static bool compare(int a, int b, int index, int errors) {if (a != b) {if (errors < 100) {printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);}return false;}return true;}
};template <>
class Comparator<float> {
private:union Float_t { float f; int i; };
public:static const char* type_str() {return "float";}static int generate() {return static_cast<float>(rand()) / RAND_MAX;}static bool compare(float a, float b, int index, int errors) {union fi_t { float f; int32_t i; };fi_t fa, fb;fa.f = a;fb.f = b;auto d = std::abs(fa.i - fb.i);if (d > FLOAT_ULP) {if (errors < 100) {printf("*** error: [%d] expected=%f(0x%x), actual=%f(0x%x), ulp=%d\n", index, b, fb.i, a, fa.i, d);}return false;}return true;}
};const char* kernel_file = "kernel.vxbin";
uint32_t count = 16;vx_device_h device = nullptr;
vx_buffer_h src0_buffer = nullptr;
vx_buffer_h src1_buffer = nullptr;
vx_buffer_h dst_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
kernel_arg_t kernel_arg = {};static void show_usage() {std::cout << "Vortex Test." << std::endl;std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}static void parse_args(int argc, char **argv) {int c;while ((c = getopt(argc, argv, "n:k:h?")) != -1) {switch (c) {case 'n':count = atoi(optarg);break;case 'k':kernel_file = optarg;break;case 'h':case '?': {show_usage();exit(0);} break;default:show_usage();exit(-1);}}
}void cleanup() {if (device) {vx_mem_free(src0_buffer);vx_mem_free(src1_buffer);vx_mem_free(dst_buffer);vx_mem_free(krnl_buffer);vx_mem_free(args_buffer);vx_dev_close(device);}
}int main(int argc, char *argv[]) {// parse command argumentsparse_args(argc, argv);std::srand(50);// open device connectionstd::cout << "open device connection" << std::endl;RT_CHECK(vx_dev_open(&device));uint64_t num_cores, num_warps, num_threads;RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));uint32_t total_threads = num_cores * num_warps * num_threads;uint32_t num_points = count * total_threads;uint32_t buf_size = num_points * sizeof(TYPE);std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;std::cout << "number of points: " << num_points << std::endl;std::cout << "buffer size: " << buf_size << " bytes" << std::endl;kernel_arg.num_tasks = total_threads;kernel_arg.task_size = count;// allocate device memorystd::cout << "allocate device memory" << std::endl;RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;// allocate host buffersstd::cout << "allocate host buffers" << std::endl;std::vector<TYPE> h_src0(num_points);std::vector<TYPE> h_src1(num_points);std::vector<TYPE> h_dst(num_points);// generate source datafor (uint32_t i = 0; i < num_points; ++i) {h_src0[i] = Comparator<TYPE>::generate();h_src1[i] = Comparator<TYPE>::generate();}// upload source buffer0std::cout << "upload source buffer0" << std::endl;RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));// upload source buffer1std::cout << "upload source buffer1" << std::endl;RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));// upload programstd::cout << "upload program" << std::endl;RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));// upload kernel argumentstd::cout << "upload kernel argument" << std::endl;RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));// start devicestd::cout << "start device" << std::endl;RT_CHECK(vx_start(device, krnl_buffer, args_buffer));// wait for completionstd::cout << "wait for completion" << std::endl;RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));// download destination bufferstd::cout << "download destination buffer" << std::endl;RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));// verify resultstd::cout << "verify result" << std::endl;int errors = 0;for (uint32_t i = 0; i < num_points; ++i) {auto ref = h_src0[i] + h_src1[i];auto cur = h_dst[i];if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {++errors;}}// cleanupstd::cout << "cleanup" << std::endl;cleanup();if (errors != 0) {std::cout << "Found " << std::dec << errors << " errors!" << std::endl;std::cout << "FAILED!" << std::endl;return errors;}std::cout << "PASSED!" << std::endl;return 0;
}
大致介绍其功能:
1、首先通过parse_args
函数解析命令行参数,设置每个任务的count
和内核文件路径kernel_file
。
2、通过vx_dev_open
打开Vortex
设备连接,并使用vx_dev_caps
获取设备的内核数量、线程数量等信息,进而根据count
和数据类型大小计算缓冲区大小。
3、在内存分配阶段,程序通过vx_mem_alloc
分配设备内存用于存储源数据和目标数据,并通过vx_mem_address
获取设备内存地址。
4、随后,分配主机缓冲区,并通过vx_copy_to_dev
将源数据上传到设备内存。
5、在内核启动阶段,通过vx_upload_kernel_file
上传内核文件,通过vx_upload_bytes
上传内核参数,然后调用vx_start
启动设备执行内核函数,并通过vx_ready_wait
等待内核函数执行完成。
6、在结果下载和验证阶段,通过vx_copy_from_dev
将目标数据下载到主机缓冲区,并使用Comparator<TYPE>::compare
验证目标数据是否正确。
7、最后,释放分配的资源并关闭设备连接。
二、trace_csv
主要分为这么几个模块:
parse_args:参数解析,包括了-t(--type)用于指定类型,可选包括rtlsim或者simx;包括-o(--csv),默认输出trace.csv;包括log参数,表示输入日志文件名。load_config:加载日志文件。parse_simx和parse_rtlsim:日志解析函数。
# parse_simx:解析simx类型的日志,并使用正则表达式匹配日志中的关键信息(如PC、core_id、warp_id、instr等)。
# parse_rtlsim:解析rtlsim类型的日志,并使用正则表达式匹配日志中的关键信息(如PC、core_id、warp_id、instr、opcode等),根据日志的阶段(decode、issue、commit),逐步提取和更新指令信息。write_csv以及其他:辅助写入csv或者数据格式处理的函数。
三、2个值得注意的点
3.1 csv指令表格里面的tmask?
csv
里面的tmask
的值要么就是1111
和1000
,但很明显出现了J
型指令,所以不可能不出现thread mask
为其他值的情况!
dention@dention-virtual-machine:~/Desktop/vortex/vortex/build$ ./cal_line_csv.sh trace_rtlsim.csv
tmask=tmask: 1
tmask=1111: 3275
tmask=1000: 402
dention@dention-virtual-machine:~/Desktop/vortex/vortex/build$ ./cal_line_csv.sh trace_simx.csv
tmask=tmask: 1
tmask=1111: 3275
tmask=1000: 402
该shell
程序如下:
#!/bin/bashif [ -z "$1" ]; thenecho "Usage: $0 <filename>"exit 1
fifilename="$1"if [ ! -f "$filename" ]; thenecho "Error: File '$filename' not found."exit 1
fitmask_values=("0000" "0001" "0010" "0011" "0100" "0101" "0110" "0111" "1000" "1001" "1010" "1011" "1100" "1101" "1110" "1111")declare -A countsfor tmask in "${tmask_values[@]}"; docounts[$tmask]=0
doneawk -F, 'NR==1 {for(i=1;i<=NF;i++) if($i=="tmask") col=i} col {counts[$col]++} END {for(tmask in counts) print "tmask="tmask": "counts[tmask]}' "$filename"
翻了翻,发现某个run.log
里面的tmask
除了1111
和1000
,还包含其他值。
dention@dention-virtual-machine:~/Desktop/vortex/vortex/build$ ./cal_line.sh rtlsim_run.log
tmask=0000: 1
tmask=0001: 2583
tmask=0010: 572
tmask=0011: 1
tmask=0100: 574
tmask=1000: 572
tmask=1010: 2
tmask=1011: 1
tmask=1100: 2
tmask=1111: 15804
dention@dention-virtual-machine:~/Desktop/vortex/vortex/build$ ./cal_line.sh simx_run.log
tmask=1000: 402
tmask=1111: 3275
计数的shell
程序如下:
#!/bin/bashif [ -z "$1" ]; thenecho "Usage: $0 <filename>"exit 1
fifilename="$1"if [ ! -f "$filename" ]; thenecho "Error: File '$filename' not found."exit 1
fitmask_values=("0000" "0001" "0010" "0011" "0100" "0101" "0110" "0111" "1000" "1001" "1010" "1011" "1100" "1101" "1110" "1111")for tmask in "${tmask_values[@]}"; docount=$(grep -c "tmask=$tmask" "$filename")if [ $count -gt 0 ]; thenecho "tmask=$tmask: $count"fi
done
看样子,为了看明白架构设计,tmask
部分得仔细看rtlsim_run.log
。
3.2 rtlsim和simx的log文件?
除了tmask
上的差异外,还包括若干点:
粘贴一部分simx_run.log
的开头和结尾内容:
# 开头
make: Entering directory '/home/dention/Desktop/vortex/vortex/build/tests/regression/demo'
LD_LIBRARY_PATH=/home/dention/Desktop/vortex/vortex/build/runtime: VORTEX_DRIVER=simx ./demo -n64
open device connection
CONFIGS: num_threads=4, num_warps=4, num_cores=1, num_clusters=1, socket_size=1, local_mem_base=0xffff0000, num_barriers=2
[VXDRV] DEV_OPEN: hdevice=0x588eaa0c6b00
[VXDRV] DCR_WRITE: hdevice=0x588eaa0c6b00, addr=0x1, value=0x80000000
[VXDRV] DCR_WRITE: hdevice=0x588eaa0c6b00, addr=0x2, value=0x0
[VXDRV] DCR_WRITE: hdevice=0x588eaa0c6b00, addr=0x3, value=0x0
[VXDRV] DCR_WRITE: hdevice=0x588eaa0c6b00, addr=0x4, value=0x0
[VXDRV] DCR_WRITE: hdevice=0x588eaa0c6b00, addr=0x5, value=0x0
[VXDRV] DEV_CAPS: hdevice=0x588eaa0c6b00, caps_id=3, value=1
[VXDRV] DEV_CAPS: hdevice=0x588eaa0c6b00, caps_id=2, value=4
[VXDRV] DEV_CAPS: hdevice=0x588eaa0c6b00, caps_id=1, value=4
data type: integer
number of points: 1024
buffer size: 4096 bytes
allocate device memory
[VXDRV] [RT:mem_alloc] size: 0x1000, asize, 0x1000,flag : 0x1
[VXDRV] MEM_ALLOC: hdevice=0x588eaa0c6b00, size=4096, flags=0x1, hbuffer=0x588eaa1d2fe0
[VXDRV] MEM_ADDRESS: hbuffer=0x588eaa1d2fe0, address=0x10000
[VXDRV] [RT:mem_alloc] size: 0x1000, asize, 0x1000,flag : 0x1
[VXDRV] MEM_ALLOC: hdevice=0x588eaa0c6b00, size=4096, flags=0x1, hbuffer=0x588eaa1d30d0
[VXDRV] MEM_ADDRESS: hbuffer=0x588eaa1d30d0, address=0x11000
[VXDRV] [RT:mem_alloc] size: 0x1000, asize, 0x1000,flag : 0x2
[VXDRV] MEM_ALLOC: hdevice=0x588eaa0c6b00, size=4096, flags=0x2, hbuffer=0x588eaa1d3180
[VXDRV] MEM_ADDRESS: hbuffer=0x588eaa1d3180, address=0x12000
dev_src0=0x10000
dev_src1=0x11000
dev_dst=0x12000
allocate host buffers
upload source buffer0
[VXDRV] COPY_TO_DEV: hbuffer=0x588eaa1d2fe0, host_addr=0x588eaa1d31a0, dst_offset=0, size=4096
upload source buffer1
[VXDRV] COPY_TO_DEV: hbuffer=0x588eaa1d30d0, host_addr=0x588eaa1d41b0, dst_offset=0, size=4096
upload program
[VXDRV] [RT:mem_reserve] addr: 0x80000000, asize:0x8000, size: 0x724c
[VXDRV] MEM_RESERVE: hdevice=0x588eaa0c6b00, address=0x80000000, size=29260, flags=0x0, hbuffer=0x588eaa1da520
[VXDRV] MEM_ACCESS: hbuffer=0x588eaa1da520, offset=0, size=29232, flags=1
[VXDRV] MEM_ACCESS: hbuffer=0x588eaa1da520, offset=29232, size=28, flags=3
[VXDRV] COPY_TO_DEV: hbuffer=0x588eaa1da520, host_addr=0x588eaa1dc3d0, dst_offset=0, size=29232
upload kernel argument
[VXDRV] [RT:mem_alloc] size: 0x20, asize, 0x1000,flag : 0x1
[VXDRV] MEM_ALLOC: hdevice=0x588eaa0c6b00, size=32, flags=0x1, hbuffer=0x588eaa1db7a0
[VXDRV] COPY_TO_DEV: hbuffer=0x588eaa1db7a0, host_addr=0x588e9665b1a0, dst_offset=0, size=32
start device
[VXDRV] START: hdevice=0x588eaa0c6b00, hkernel=0x588eaa1da520, harguments=0x588eaa1db7a0
wait for completion
[VXDRV] READY_WAIT: hdevice=0x588eaa0c6b00, timeout=86400000
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000000 (#0)
DEBUG Instr 0xfc1022f3: CSRRS x5, x0, 0xfc1
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000004 (#1)
DEBUG Instr 0x317: AUIPC x6, 0x0
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000008 (#2)
DEBUG Instr 0x15c30313: ADDI x6, x6, 0x15c
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x8000000c (#3)
DEBUG Instr 0x62900b: WSPAWN x5, x6
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000010 (#4)
DEBUG Instr 0xfff00293: ADDI x5, x0, 0xffffffff
DEBUG Fetch: cid=0, wid=1, tmask=1000, PC=0x80000160 (#4294967296)
DEBUG Instr 0xfff00293: ADDI x5, x0, 0xffffffff
DEBUG Fetch: cid=0, wid=2, tmask=1000, PC=0x80000160 (#8589934592)
DEBUG Instr 0xfff00293: ADDI x5, x0, 0xffffffff
DEBUG Fetch: cid=0, wid=3, tmask=1000, PC=0x80000160 (#12884901888)
DEBUG Instr 0xfff00293: ADDI x5, x0, 0xffffffff
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000014 (#5)
DEBUG Instr 0x2800b: TMC x5
DEBUG Fetch: cid=0, wid=0, tmask=1111, PC=0x80000018 (#6)
DEBUG Instr 0x118000ef: JAL x1, 0x118
DEBUG Fetch: cid=0, wid=1, tmask=1000, PC=0x80000164 (#4294967297)
DEBUG Instr 0x2800b: TMC x5# 结尾
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000124 (#1196)
DEBUG Instr 0xa2a023: SW x5, x10, 0x0
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x80000128 (#1197)
DEBUG Instr 0xff0000f: FENCE 0xff
DEBUG Fetch: cid=0, wid=0, tmask=1000, PC=0x8000012c (#1198)
DEBUG Instr 0xb: TMC x0
download destination buffer
[VXDRV] COPY_FROM_DEV: hbuffer=0x588eaa1d3180, host_addr=0x588eaa1d51c0, src_offset=0, size=4096
verify result
cleanup
[VXDRV] MEM_FREE: hbuffer=0x588eaa1d2fe0
[VXDRV] MEM_FREE: hbuffer=0x588eaa1d30d0
[VXDRV] MEM_FREE: hbuffer=0x588eaa1d3180
[VXDRV] MEM_FREE: hbuffer=0x588eaa1da520
[VXDRV] MEM_FREE: hbuffer=0x588eaa1db7a0
[VXDRV] DEV_CAPS: hdevice=0x588eaa0c6b00, caps_id=3, value=1
[VXDRV] DEV_CAPS: hdevice=0x588eaa0c6b00, caps_id=7, value=220126515488
[VXDRV] DEV_CAPS: hdevice=0x588eaa0c6b00, caps_id=8, value=2
[VXDRV] MPM_QUERY: hdevice=0x588eaa0c6b00, addr=0xb00, core_id=0, value=0x51e5
[VXDRV] MPM_QUERY: hdevice=0x588eaa0c6b00, addr=0xb02, core_id=0, value=0x343c
PERF: instrs=13372, cycles=20965, IPC=0.637825
[VXDRV] DEV_CLOSE: hdevice=0x588eaa0c6b00
PASSED!
make: Leaving directory '/home/dention/Desktop/vortex/vortex/build/tests/regression/demo'
再粘贴点rtlsim_run.log
的开头内容:
# 开头
make: Entering directory '/home/dention/Desktop/vortex/vortex/build/tests/regression/demo'
LD_LIBRARY_PATH=/home/dention/Desktop/vortex/vortex/build/runtime: VORTEX_DRIVER=rtlsim ./demo -n64
open device connection
CONFIGS: num_threads=4, num_warps=4, num_cores=1, num_clusters=1, socket_size=1, local_mem_base=0xffff0000, num_barriers=21: cluster0-socket0-core0-commit: wid=0, PC=0x5f107a52, ex=ALU, tmask=0001, wb=0, rd=3, sop=1, eop=1, data={0xb7565521, 0x9bc48e7e, 0xa32ad15d, 0x9389602e} (#13683927122444)3: cluster0-socket0-core0-fetch req: wid=3, PC=0xe433879e, tmask=0011 (#12884901888)3: cluster0-socket0-core0-fetch rsp: wid=3, PC=0x71e4448, tmask=1100, instr=0x349268a (#17094256310868)3: cluster0-socket0-core0-commit: wid=1, PC=0x52773234, ex=ALU, tmask=0100, wb=1, rd=36, sop=0, eop=1, data={0x4b5e9eb4, 0x2caf1927, 0xfe7aa429, 0xf3fe8f6c} (#12548972853897)3: cluster0-socket0-core0-issue0: wid=2, PC=0xe48211bc, ex=LSU, op=?, tmask=1111, wb=0, rd=19, rs1_data={0x9836776f, 0xc8641052, 0xd0a1def8, 0x6cc767e9}, rs2_data={0x723c9207, 0x31d09f01, 0xd7686059, 0xeb86a82f}, rs3_data={0xb75bff11, 0xd2952bed, 0x5f988735, 0x302f3caa}, offset=0x1b1 (#2627431648627)3: cluster0-socket0-core0-decode: wid=3, PC=0x71e4448, instr=0x349268a, ex=ALU, op=MULW, tmask=1100, wb=0, rd=0, rs1=0, rs2=0, rs3=0, opds=0000, use_PC=0, use_imm=0, imm=0xad7bb945 (#17094256310868)5: cluster0-socket0-core0-commit: wid=3, PC=0x1ac94c6, ex=ALU, tmask=1010, wb=0, rd=17, sop=0, eop=0, data={0x622ca31d, 0x3c0cd2a3, 0xdf57daf0, 0x4a2db2c0} (#1048219273081)5: cluster0-socket0-core0-issue0: wid=0, PC=0xe63d2e26, ex=LSU, op=SB, tmask=0000, wb=0, rd=48, rs1_data={0x192f8f69, 0x44286d54, 0x9ea4bb06, 0xd50866bc}, rs2_data={0xc96de59a, 0x84899e44, 0xf711e234, 0x6a373f12}, rs3_data={0x56c1c3b6, 0x4b37021d, 0x80b8d4f3, 0xbd771324}, offset=0x654 (#11721782109298)7: cluster0-socket0-core0-fetch req: wid=3, PC=0xe43387a2, tmask=1010 (#12884901889)7: cluster0-socket0-core0-commit: wid=2, PC=0xc93aa, ex=ALU, tmask=0100, wb=1, rd=43, sop=0, eop=1, data={0xc0967ef, 0x20438730, 0x291eaa1d, 0xa71be0fe} (#5228876286358)7: cluster0-socket0-core0-issue0: wid=1, PC=0xea70b098, ex=FPU, op=FSQRT.S, tmask=1011, wb=1, rd=56, rs1_data={0x7e4a3e0b, 0xd9f00800, 0xc6c9ee49, 0xfc9a9d2d}, rs2_data={0x0, 0x0, 0x0, 0x0}, rs3_data={0x603da36e, 0x8be4ac57, 0x1dbfe827, 0x5f107de0}, fmt=0x0, frm=0x3 (#4099303258324)
[VXDRV] DEV_OPEN: hdevice=0x57d9ef008d90
[VXDRV] DCR_WRITE: hdevice=0x57d9ef008d90, addr=0x1, value=0x80000000
STARTUP_ADDR0[VXDRV] DCR_WRITE: hdevice=0x57d9ef008d90, addr=0x2, value=0x0
STARTUP_ADDR1[VXDRV] DCR_WRITE: hdevice=0x57d9ef008d90, addr=0x3, value=0x0
STARTUP_ARG0[VXDRV] DCR_WRITE: hdevice=0x57d9ef008d90, addr=0x4, value=0x0
STARTUP_ARG1[VXDRV] DCR_WRITE: hdevice=0x57d9ef008d90, addr=0x5, value=0x0
MPM_CLASS[VXDRV] DEV_CAPS: hdevice=0x57d9ef008d90, caps_id=3, value=1
[VXDRV] DEV_CAPS: hdevice=0x57d9ef008d90, caps_id=2, value=4
[VXDRV] DEV_CAPS: hdevice=0x57d9ef008d90, caps_id=1, value=4
data type: integer
number of points: 1024
buffer size: 4096 bytes
allocate device memory
[VXDRV] MEM_ALLOC: hdevice=0x57d9ef008d90, size=4096, flags=0x1, hbuffer=0x57d9ef13e4b0
[VXDRV] MEM_ADDRESS: hbuffer=0x57d9ef13e4b0, address=0x10000
[VXDRV] MEM_ALLOC: hdevice=0x57d9ef008d90, size=4096, flags=0x1, hbuffer=0x57d9ef13e510
[VXDRV] MEM_ADDRESS: hbuffer=0x57d9ef13e510, address=0x11000
[VXDRV] MEM_ALLOC: hdevice=0x57d9ef008d90, size=4096, flags=0x2, hbuffer=0x57d9ef13e5c0
[VXDRV] MEM_ADDRESS: hbuffer=0x57d9ef13e5c0, address=0x12000
dev_src0=0x10000
dev_src1=0x11000
dev_dst=0x12000
allocate host buffers
upload source buffer0
[VXDRV] COPY_TO_DEV: hbuffer=0x57d9ef13e4b0, host_addr=0x57d9ef13e5e0, dst_offset=0, size=4096
upload source buffer1
[VXDRV] COPY_TO_DEV: hbuffer=0x57d9ef13e510, host_addr=0x57d9ef13f5f0, dst_offset=0, size=4096
upload program
[VXDRV] MEM_RESERVE: hdevice=0x57d9ef008d90, address=0x80000000, size=29260, flags=0x0, hbuffer=0x57d9ef14cb80
[VXDRV] MEM_ACCESS: hbuffer=0x57d9ef14cb80, offset=0, size=29232, flags=1
[VXDRV] MEM_ACCESS: hbuffer=0x57d9ef14cb80, offset=29232, size=28, flags=3
[VXDRV] COPY_TO_DEV: hbuffer=0x57d9ef14cb80, host_addr=0x57d9ef145870, dst_offset=0, size=29232
upload kernel argument
[VXDRV] MEM_ALLOC: hdevice=0x57d9ef008d90, size=32, flags=0x1, hbuffer=0x57d9ef143970
[VXDRV] COPY_TO_DEV: hbuffer=0x57d9ef143970, host_addr=0x57d9eb3fc1a0, dst_offset=0, size=32
start device
[VXDRV] START: hdevice=0x57d9ef008d90, hkernel=0x57d9ef14cb80, harguments=0x57d9ef143970
STARTUP_ADDR0STARTUP_ADDR1STARTUP_ARG0STARTUP_ARG1wait for completion
[VXDRV] READY_WAIT: hdevice=0x57d9ef008d90, timeout=86400000
52: [sim] run()77: cluster0-socket0-core0-fetch req: wid=0, PC=0x80000000, tmask=0001 (#0)265: cluster0-socket0-core0-fetch rsp: wid=0, PC=0x80000000, tmask=0001, instr=0xfc1022f3 (#0)265: cluster0-socket0-core0-decode: wid=0, PC=0x80000000, instr=0xfc1022f3, ex=SFU, op=CSRRS, tmask=0001, wb=1, rd=5, rs1=0, rs2=0, rs3=0, opds=1100, addr=0xfc1, use_imm=0, imm=0x5 (#0)269: cluster0-socket0-core0-fetch req: wid=0, PC=0x80000004, tmask=0001 (#1)277: cluster0-socket0-core0-fetch rsp: wid=0, PC=0x80000004, tmask=0001, instr=0x317 (#1)277: cluster0-socket0-core0-issue0: wid=0, PC=0x80000000, ex=SFU, op=CSRRS, tmask=0001, wb=1, rd=5, rs1_data={0x0, 0x0, 0x0, 0x0}, rs2_data={0x0, 0x0, 0x0, 0x0}, rs3_data={0x0, 0x0, 0x0, 0x0}, addr=0xfc1, use_imm=0, imm=0x5 (#0)277: cluster0-socket0-core0-decode: wid=0, PC=0x80000004, instr=0x317, ex=ALU, op=AUIPC, tmask=0001, wb=1, rd=6, rs1=0, rs2=0, rs3=0, opds=1000, use_PC=1, use_imm=1, imm=0x0 (#1)281: cluster0-socket0-core0-fetch req: wid=0, PC=0x80000008, tmask=0001 (#2)287: cluster0-socket0-core0-commit: wid=0, PC=0x80000000, ex=SFU, tmask=0001, wb=1, rd=5, sop=1, eop=1, data={0x4, 0x4, 0x4, 0x4} (#0)289: cluster0-socket0-core0-fetch rsp: wid=0, PC=0x80000008, tmask=0001, instr=0x15c30313 (#2)289: cluster0-socket0-core0-issue0: wid=0, PC=0x80000004, ex=ALU, op=AUIPC, tmask=0001, wb=1, rd=6, rs1_data={0x0, 0x0, 0x0, 0x0}, rs2_data={0x0, 0x0, 0x0, 0x0}, rs3_data={0x0, 0x0, 0x0, 0x0}, use_PC=1, use_imm=1, imm=0x0 (#1)289: cluster0-socket0-core0-decode: wid=0, PC=0x80000008, instr=0x15c30313, ex=ALU, op=ADDI, tmask=0001, wb=1, rd=6, rs1=6, rs2=0, rs3=0, opds=1100, use_PC=0, use_imm=1, imm=0x15c (#2)293: cluster0-socket0-core0-fetch req: wid=0, PC=0x8000000c, tmask=0001 (#3)295: cluster0-socket0-core0-commit: wid=0, PC=0x80000004, ex=ALU, tmask=0001, wb=1, rd=6, sop=1, eop=1, data={0x80000004, 0x80000004, 0x80000004, 0x80000004} (#1)301: cluster0-socket0-core0-fetch rsp: wid=0, PC=0x8000000c, tmask=0001, instr=0x62900b (#3)301: cluster0-socket0-core0-decode: wid=0, PC=0x8000000c, instr=0x62900b, ex=SFU, op=WSPAWN, tmask=0001, wb=0, rd=0, rs1=5, rs2=6, rs3=0, opds=0110 (#3)307: cluster0-socket0-core0-issue0: wid=0, PC=0x80000008, ex=ALU, op=ADDI, tmask=0001, wb=1, rd=6, rs1_data={0x0, 0x0, 0x0, 0x80000004}, rs2_data={0x0, 0x0, 0x0, 0x0}, rs3_data={0x0, 0x0, 0x0, 0x0}, use_PC=0, use_imm=1, imm=0x15c (#2)313: cluster0-socket0-core0-commit: wid=0, PC=0x80000008, ex=ALU, tmask=0001, wb=1, rd=6, sop=1, eop=1, data={0x15c, 0x15c, 0x15c, 0x80000160} (#2)325: cluster0-socket0-core0-issue0: wid=0, PC=0x8000000c, ex=SFU, op=WSPAWN, tmask=0001, wb=0, rd=0, rs1_data={0x0, 0x0, 0x0, 0x4}, rs2_data={0x0, 0x0, 0x0, 0x80000160}, rs3_data={0x0, 0x0, 0x0, 0x0} (#3)335: cluster0-socket0-core0-commit: wid=0, PC=0x8000000c, ex=SFU, tmask=0001, wb=0, rd=0, sop=1, eop=1, data={0x0, 0x0, 0x0, 0x0} (#3)337: cluster0-socket0-core0-fetch req: wid=0, PC=0x80000010, tmask=0001 (#4)# 结尾43883: cluster0-socket0-core0-fetch rsp: wid=0, PC=0x8000012c, tmask=0001, instr=0xb (#1198)43883: cluster0-socket0-core0-issue0: wid=0, PC=0x80000128, ex=LSU, op=FENCE, tmask=0001, wb=0, rd=0, rs1_data={0x0, 0x0, 0x0, 0x0}, rs2_data={0x0, 0x0, 0x0, 0x0}, rs3_data={0x0, 0x0, 0x0, 0x0}, offset=0x0 (#1197)43883: cluster0-socket0-core0-decode: wid=0, PC=0x8000012c, instr=0xb, ex=SFU, op=TMC, tmask=0001, wb=0, rd=0, rs1=0, rs2=0, rs3=0, opds=0100 (#1198)43887: cluster0-socket0-core0-commit: wid=0, PC=0x80000124, ex=LSU, tmask=0001, wb=0, rd=0, sop=1, eop=1, data={0x0, 0x0, 0x0, 0x0} (#1196)43895: cluster0-socket0-core0-issue0: wid=0, PC=0x8000012c, ex=SFU, op=TMC, tmask=0001, wb=0, rd=0, rs1_data={0x0, 0x0, 0x0, 0x0}, rs2_data={0x0, 0x0, 0x0, 0x0}, rs3_data={0x0, 0x0, 0x0, 0x0} (#1198)43905: cluster0-socket0-core0-commit: wid=0, PC=0x8000012c, ex=SFU, tmask=0001, wb=0, rd=0, sop=1, eop=1, data={0x0, 0x0, 0x0, 0x0} (#1198)44093: cluster0-socket0-core0-commit: wid=0, PC=0x80000128, ex=LSU, tmask=0001, wb=0, rd=0, sop=1, eop=1, data={0xf58212ff, 0xc1377584, 0x1a1de1da, 0xb1e383e0} (#1197)
download destination buffer
[VXDRV] COPY_FROM_DEV: hbuffer=0x57d9ef13e5c0, host_addr=0x57d9ef140600, src_offset=0, size=4096
verify result
cleanup
[VXDRV] MEM_FREE: hbuffer=0x57d9ef13e4b0
[VXDRV] MEM_FREE: hbuffer=0x57d9ef13e510
[VXDRV] MEM_FREE: hbuffer=0x57d9ef13e5c0
[VXDRV] MEM_FREE: hbuffer=0x57d9ef14cb80
[VXDRV] MEM_FREE: hbuffer=0x57d9ef143970
[VXDRV] DEV_CAPS: hdevice=0x57d9ef008d90, caps_id=3, value=1
[VXDRV] DEV_CAPS: hdevice=0x57d9ef008d90, caps_id=7, value=220126515488
[VXDRV] DEV_CAPS: hdevice=0x57d9ef008d90, caps_id=8, value=2
[VXDRV] MPM_QUERY: hdevice=0x57d9ef008d90, addr=0xb00, core_id=0, value=0x5114
[VXDRV] MPM_QUERY: hdevice=0x57d9ef008d90, addr=0xb02, core_id=0, value=0x3440
PERF: instrs=13376, cycles=20756, IPC=0.644440
[VXDRV] DEV_CLOSE: hdevice=0x57d9ef008d90
PASSED!
make: Leaving directory '/home/dention/Desktop/vortex/vortex/build/tests/regression/demo'
这俩log
文件的差异确实很大,也能看得出来基于模拟器的方案和基于rtl
的方案存在多个方面的差异!
simx_run.log:
1、日志文件从编译和运行命令开始,记录了程序的启动过程。
2、包含设备配置信息(CONFIGS)。
3、详细记录了设备内存分配、数据上传、程序上传、设备启动、指令执行等过程。
4、包含指令执行的详细信息,如 DEBUG Fetch、DEBUG Instr 等。rtlsim_run.log:
1、日志文件同样从编译和运行命令开始,记录了程序的启动过程。同前
2、包含设备配置信息(CONFIGS)。同前
3、详细记录了设备内存分配、数据上传、程序上传、设备启动、指令执行等过程。同前
4、包含指令执行的详细信息,如 cluster0-socket0-core0-fetch req、cluster0-socket0-core0-fetch rsp 等。仅标志不同!
总结
对上一篇遗留的几个任务进行了回答,顺带梳理了2种log的差异,为后续继续解读rtl代码做了大量准备工作。后面就一点一点慢慢看吧,任重而道远!