mmap映射物理内存之四内核cache同步
目录
物理地址的映射
phys_to_virt
remap
总结
驱动实现
内核配置
同步汇编代码
内嵌汇编
驱动实现
makefile
用户态测试程序
cache clean
invalidate cache
测试时间汇总
FT2000+
D2000
3588
总结
通过cmdline的mem参数预留的内存,mmap到用户空间访问时的cache同步,在前述的系列文章中已经有说明,但是在用户态执行invalidate会失败。本文描述在内核空间对此段内存进行cache同步的实现。
物理地址的映射
物理地址到虚拟地址的映射。由于对cache的同步操作,最终用的是虚拟地址,需要先将物理地址映射为虚拟地址。内核有几种接口完成此项任务。
phys_to_virt
-
物理地址 0x151200000 超出了内核的直接映射范围
-
__va()
只能用于直接映射区域的物理地址转换 -
通过
mem=5G
预留的内存不在直接映射区域中
/*
 * Linear-map address translation macros quoted from the arm64 kernel.
 * Kernel configuration assumed by the worked example in the text:
 *   CONFIG_ARM64_VA_BITS=39
 */

/* Physical -> kernel virtual address; only meaningful inside the linear map. */
#define __phys_to_virt(x)	((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET)

#define VA_BITS			(CONFIG_ARM64_VA_BITS)
/* Start of the linear map: all address bits above bit (va - 1) set. */
#define _PAGE_OFFSET(va)	(-(UL(1) << (va)))
#define PAGE_OFFSET		(_PAGE_OFFSET(VA_BITS))
/* Physical address of the start of RAM as recorded in memstart_addr. */
#define PHYS_OFFSET		({ VM_BUG_ON(memstart_addr & 1); memstart_addr; })
-
CONFIG_ARM64_VA_BITS = 39
虚拟地址空间大小为 39 位。 -
计算 PAGE_OFFSET:PAGE_OFFSET = -(1 << 39) = 0xFFFF_FF80_0000_0000,其最高 25 位全是 1(即以 0xFFFF_FF8 开头),低 39 位全是 0。这正是 39 位地址空间下内核空间的起始地址。
-
假设
PHYS_OFFSET
:假设一个常见的值,例如0x4000_0000
(1GB)。这是很多 ARM64 开发板物理内存的起始地址 -
计算物理地址对应的虚拟地址
x
:0x4008_0000
(这是一个物理地址,比如是某块内存或设备的地址)
(x) - PHYS_OFFSET
= 0x4008_0000 - 0x4000_0000
= 0x80000
偏移量 | PAGE_OFFSET
= 0x80000 | 0xFFFF_FF80_0000_0000
__phys_to_virt(0x4008_0000) = 0xFFFF_FF80_0008_0000
特性 | 前 5GB 物理内存 | 5GB 以上的物理内存 |
---|---|---|
内核管理 | 是,由 buddy allocator 等管理 | 否,内核完全忽略其存在 |
页表映射 | 已建立有效映射 | 无有效页表映射 |
__phys_to_virt() | 可计算出有效虚拟地址 | 同样可计算出虚拟地址 |
地址访问 | 可正常读写 | 触发页错误 -> 段错误/内核恐慌 |
remap
/*
 * Translate an address inside a vmalloc-area mapping (such as one created by
 * ioremap()) back to the physical address it maps.
 *
 * Returns 0 when vmalloc_to_page() finds no page for @vaddr.
 */
phys_addr_t ioremap_to_phys(void __iomem *vaddr)
{
	struct page *pg = vmalloc_to_page(vaddr);

	if (!pg)
		return 0;

	/* Physical base of the page plus the offset of vaddr within that page. */
	return page_to_phys(pg) + offset_in_page(vaddr);
}
总结
ioremap()
创建的映射位于vmalloc区域,而 virt_to_phys()
只能正确转换直接映射区域(线性映射区域)的地址。
-
直接映射区域:物理地址和虚拟地址有固定的偏移(PAGE_OFFSET)
-
vmalloc区域:使用动态的页表映射,没有固定的偏移关系
-
当代码(如内核模块、驱动程序)试图访问
0xFFFF_FF94_0000_0000
时,CPU 的 MMU 会自动查询页表。 -
MMU 发现这个虚拟地址没有有效的页表映射。
-
MMU 会触发一个页面错误 (Page Fault) 异常。
-
内核的页面错误处理程序会接管,它检查地址后会发现这个地址超出了它已知的内存范围。
-
最终,内核会向试图访问的进程发送一个
SIGSEGV
(段错误)信号(如果是在用户态访问),或者直接触发内核恐慌 (Kernel Panic)(如果是在内核态非法访问)。
驱动实现
内核配置
/*
 * The two DMA cache-maintenance entry points are exported to modules only
 * when the kernel is built with CONFIG_NO_GKI defined.
 */
#ifdef CONFIG_NO_GKI
EXPORT_SYMBOL(__dma_map_area);
EXPORT_SYMBOL(__dma_unmap_area);
#endif
当内核配置了CONFIG_NO_GKI ,则在驱动模块中可以直接调用上述两个接口,进行cache的同步,但是在麒麟V10系统中,并没有此项的配置。因而不能在驱动中直接调用两个接口。
-
禁用 (
=n
或 未设置): 这是正确且必须的配置,用于构建一个符合 Android GKI 要求的内核。它启用了 GKI 所需的框架:-
强制要求某些关键的核心功能必须内置。
-
强制要求某些硬件相关的代码必须编译为模块而不是内置。
-
确保内核遵守稳定的内核模块接口(KMI)。
-
同步汇编代码
arm64的汇编代码实现在\kernel\arch\arm64\mm\cache.S,如果将其拷贝出来和我们的驱动编译成一个模块也可以。但是在编译时,由于此文件依赖其他的汇编文件,导致比较麻烦。因而简单的方式参考此文件生成一个独立的文件。如下:
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Cache maintenance
 *
 * Copyright (C) 2001 Deep Blue Solutions Ltd.
 * Copyright (C) 2012 ARM Ltd.
 *
 * Stand-alone copies of the arm64 DMA cache-maintenance helpers, renamed
 * with a "1" suffix so they can be linked into a module without clashing
 * with the kernel's own symbols.
 */
#include <linux/errno.h>
#include <linux/linkage.h>
#include <linux/init.h>
#include <asm/assembler.h>
#include <asm/cpufeature.h>
#include <asm/alternative.h>
#include <asm/asm-uaccess.h>

/*
 * Simple cache line operations.
 * The D-cache line size is assumed to be 64 bytes.
 */
#define CACHE_LINE_SIZE 64

/*
 * __dma_map_area1(start, size, dir)
 *	start: x0, size: x1, dir: w2
 *
 * Invalidates the range for DMA_FROM_DEVICE (2), cleans it otherwise.
 */
SYM_FUNC_START(__dma_map_area1)
	cmp	w2, #2				// DMA_FROM_DEVICE
	b.eq	__dma_inv_area1
	b	__dma_clean_area1
SYM_FUNC_END(__dma_map_area1)

/*
 * __dma_unmap_area1(start, size, dir)
 *	start: x0, size: x1, dir: w2
 *
 * Invalidates the range unless dir is DMA_TO_DEVICE (1).
 */
SYM_FUNC_START(__dma_unmap_area1)
	cmp	w2, #1				// DMA_TO_DEVICE
	b.ne	__dma_inv_area1
	ret
SYM_FUNC_END(__dma_unmap_area1)

/*
 * __dma_inv_area1(start, size)
 *	start: x0, size: x1
 *
 * Invalidates every cache line touched by [start, start + size).
 * A line that is only partially covered (unaligned start or end) is
 * cleaned and invalidated (civac) instead of just invalidated, so dirty
 * data belonging to a neighbouring buffer in the same line is written
 * back rather than discarded.  An empty range is a no-op.
 */
SYM_FUNC_START(__dma_inv_area1)
	cbz	x1, 4f				// empty range: nothing to do
	mov	x2, #CACHE_LINE_SIZE
	sub	x3, x2, #1			// x3 = line offset mask
	add	x1, x0, x1			// x1 = end address (exclusive)
	tst	x1, x3				// end cache line aligned?
	bic	x1, x1, x3			// (bic leaves the flags untouched)
	b.eq	1f
	dc	civac, x1			// partial last line
1:	tst	x0, x3				// start cache line aligned?
	bic	x0, x0, x3
	b.eq	2f
	dc	civac, x0			// partial first line
	b	3f
2:	dc	ivac, x0			// fully covered line
3:	add	x0, x0, x2
	cmp	x0, x1
	b.lo	2b
	dsb	sy
4:	ret
SYM_FUNC_END(__dma_inv_area1)

/*
 * __dma_clean_area1(start, size)
 *	start: x0, size: x1
 *
 * Cleans (writes back) every cache line touched by [start, start + size).
 * Only the start address is rounded down; the end is compared unaligned so
 * that a trailing partial line is still cleaned.  An empty range is a no-op.
 */
SYM_FUNC_START(__dma_clean_area1)
	cbz	x1, 2f				// empty range: nothing to do
	mov	x2, #CACHE_LINE_SIZE
	sub	x3, x2, #1			// x3 = line offset mask
	add	x1, x0, x1			// x1 = end address (exclusive)
	bic	x0, x0, x3			// round start down to a line
1:	dc	cvac, x0			// clean D line
	add	x0, x0, x2
	cmp	x0, x1
	b.lo	1b
	dsb	sy
2:	ret
SYM_FUNC_END(__dma_clean_area1)
内嵌汇编
/*
 * Cache maintenance implemented with inline assembly (arm64, kernel mode).
 *
 * The D-cache line size is assumed to be 64 bytes.  Every asm statement
 * carries a "memory" clobber so the compiler cannot move ordinary loads or
 * stores across the maintenance operation or the barrier that completes it.
 *
 * NOTE(review): these static functions reuse the names of the kernel's own
 * __dma_map_area()/__dma_unmap_area(); if a header declaring those is in
 * scope the definitions may conflict - confirm against the target kernel.
 */

/* Clean (write back) every cache line touched by [start, start + size). */
static void __dma_clean_area(void *start, size_t size)
{
	unsigned long end = (unsigned long)start + size;
	unsigned long line = (unsigned long)start & ~(64UL - 1);

	for (; line < end; line += 64)
		asm volatile("dc cvac, %0" : : "r" (line) : "memory");

	asm volatile("dsb sy" : : : "memory");
}

/*
 * Invalidate every cache line touched by [start, start + size).
 *
 * A line that is only partially covered by the range is cleaned and
 * invalidated (civac) so that dirty data of a neighbouring buffer sharing
 * the line is written back instead of being thrown away.
 */
static void __dma_inv_area(void *start, size_t size)
{
	unsigned long first = (unsigned long)start;
	unsigned long end = first + size;
	unsigned long line = first & ~(64UL - 1);

	for (; line < end; line += 64) {
		if (line < first || line + 64 > end)
			asm volatile("dc civac, %0" : : "r" (line) : "memory");
		else
			asm volatile("dc ivac, %0" : : "r" (line) : "memory");
	}

	asm volatile("dsb sy" : : : "memory");
}

/*
 * Prepare a buffer before the device accesses it: invalidate for
 * DMA_FROM_DEVICE (2), clean for every other direction.
 */
static void __dma_map_area(void *start, size_t size, int dir)
{
	if (dir == 2)	/* DMA_FROM_DEVICE */
		__dma_inv_area(start, size);
	else
		__dma_clean_area(start, size);
}

/*
 * Finish a transfer before the CPU accesses the buffer: invalidate unless
 * the direction is DMA_TO_DEVICE (1), in which case nothing is done.
 */
static void __dma_unmap_area(void *start, size_t size, int dir)
{
	if (dir != 1)	/* not DMA_TO_DEVICE */
		__dma_inv_area(start, size);
}
驱动实现
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/dma-direction.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

#define DEVICE_NAME "cache_sync"
#define CLASS_NAME "cache_sync"

/* Cache maintenance routines implemented in the companion assembly file. */
extern void __dma_map_area1(void *start, size_t size, int dir);
extern void __dma_unmap_area1(void *start, size_t size, int dir);

/* ioctl command definitions */
#define CACHE_SYNC_IOCTL_MAGIC 'C'

#define CACHE_SYNC_MAP _IOW(CACHE_SYNC_IOCTL_MAGIC, 1, struct cache_sync_cmd)
#define CACHE_SYNC_UNMAP _IOW(CACHE_SYNC_IOCTL_MAGIC, 2, struct cache_sync_cmd)
#define CACHE_SYNC_FLUSH _IOW(CACHE_SYNC_IOCTL_MAGIC, 3, struct cache_sync_cmd)

/* Physical window reserved through the kernel command line (mem=). */
#define RESERVED_PHYS_ADDR 0x151200000
#define RESERVED_SIZE (128 * 1024 * 1024) /* 128MB */

/*
 * Argument block of every ioctl.  User space declares the same layout
 * under the name struct dma_sync_cmd; the two must stay in sync.
 */
struct cache_sync_cmd {
	unsigned long long addr;	/* physical address of the range to sync */
	size_t size;			/* length of the range in bytes */
	int direction;			/* enum dma_data_direction value */
};

/* Per-driver state; a single instance is allocated at module init. */
struct cache_sync_dev {
	struct cdev cdev;
	struct class *class;
	struct device *device;
	dev_t devno;
	void __iomem *vaddr;		/* kernel virtual address returned by ioremap */
	phys_addr_t phys_addr;		/* physical base of the mapped window */
	size_t total_size;		/* size of the mapped window in bytes */
};

static struct cache_sync_dev *cache_dev;

/* DMA sync interface */
static void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,enum dma_data_direction dir)
{struct cache_sync_dev *dev = cache_dev;void *sync_vaddr;sync_vaddr = dev->vaddr + (paddr - dev->phys_addr);__dma_map_area1(sync_vaddr, size, dir);
}static void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,enum dma_data_direction dir)
{struct cache_sync_dev *dev = cache_dev;void *sync_vaddr;sync_vaddr = dev->vaddr + (paddr - dev->phys_addr);__dma_unmap_area1(sync_vaddr, size, dir);
}static int cache_sync_open(struct inode *inode, struct file *filp)
{filp->private_data = cache_dev;return 0;
}static int cache_sync_release(struct inode *inode, struct file *filp)
{return 0;
}static long cache_sync_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{struct cache_sync_cmd cmd_data;//void __user *user_addr;phys_addr_t phys_addr;//void *vaddr=NULL;if (copy_from_user(&cmd_data, (void __user *)arg, sizeof(cmd_data)))return -EFAULT;// user_addr = (void __user *)cmd_data.addr;phys_addr = cmd_data.addr;switch (cmd) {case CACHE_SYNC_MAP:// DMA映射:CPU到设备方向arch_sync_dma_for_device(phys_addr, cmd_data.size, cmd_data.direction);// vaddr = ioremap(cmd_data.addr, cmd_data.size); // size是你需要映射的大小//if (!vaddr) {// pr_err("Failed to ioremap physical address 0x151200000\n");// return -ENOMEM;// }///pr_info("Cache sync MAP: addr=0x%llx, size=%zu, dir=%d,viraddr=%p,viraddr1=%p,phyaddr=0x%llx, phyaddr1=0x%llx,viraddr2=%p\n",// cmd_data.addr, cmd_data.size, cmd_data.direction,phys_to_virt(cmd_data.addr),vaddr,virt_to_phys(phys_to_virt(cmd_data.addr)),ioremap_to_phys(vaddr), __va(cmd_data.addr));//iounmap(vaddr);break;case CACHE_SYNC_UNMAP:// DMA取消映射:设备到CPU方向arch_sync_dma_for_cpu(phys_addr, cmd_data.size, cmd_data.direction);//pr_info("Cache sync UNMAP: addr=0x%llx, size=%zu, dir=%d\n",// cmd_data.addr, cmd_data.size, cmd_data.direction);break;case CACHE_SYNC_FLUSH:// 刷新cache(双向)arch_sync_dma_for_device(phys_addr, cmd_data.size, DMA_BIDIRECTIONAL);pr_info("Cache sync FLUSH: addr=0x%llx, size=%zu\n",cmd_data.addr, cmd_data.size);break;default:return -ENOTTY;}return 0;
}static const struct file_operations cache_sync_fops = {.owner = THIS_MODULE,.open = cache_sync_open,.release = cache_sync_release,.unlocked_ioctl = cache_sync_ioctl,
};static int __init cache_sync_init(void)
{int retval;cache_dev = kzalloc(sizeof(struct cache_sync_dev), GFP_KERNEL);if (!cache_dev)return -ENOMEM;//add cache_dev->phys_addr = RESERVED_PHYS_ADDR;cache_dev->total_size = RESERVED_SIZE;// 使用ioremap_wc映射预留内存(适合设备内存)cache_dev->vaddr = ioremap_wc(cache_dev->phys_addr, cache_dev->total_size);if (!cache_dev->vaddr) {pr_err("Failed to ioremap reserved memory at 0x%llx\n", cache_dev->phys_addr);retval = -ENOMEM;goto err_free_dev;}pr_info("Reserved memory mapped successfully\n");pr_info(" Physical address: 0x%llx\n", cache_dev->phys_addr);pr_info(" Virtual address: %p\n", cache_dev->vaddr);pr_info(" Size: %zu bytes\n", cache_dev->total_size);//add end retval = alloc_chrdev_region(&cache_dev->devno, 0, 1, DEVICE_NAME);if (retval < 0)goto err_free_dev;cdev_init(&cache_dev->cdev, &cache_sync_fops);cache_dev->cdev.owner = THIS_MODULE;retval = cdev_add(&cache_dev->cdev, cache_dev->devno, 1);if (retval < 0)goto err_unregister;cache_dev->class = class_create(THIS_MODULE, CLASS_NAME);if (IS_ERR(cache_dev->class)) {retval = PTR_ERR(cache_dev->class);goto err_cdev_del;}cache_dev->device = device_create(cache_dev->class, NULL, cache_dev->devno, NULL, DEVICE_NAME);if (IS_ERR(cache_dev->device)) {retval = PTR_ERR(cache_dev->device);goto err_class_destroy;}pr_info("Cache sync driver loaded successfully\n");pr_info("Device major: %d\n", MAJOR(cache_dev->devno));return 0;err_class_destroy:class_destroy(cache_dev->class);
err_cdev_del:cdev_del(&cache_dev->cdev);
err_unregister:unregister_chrdev_region(cache_dev->devno, 1);
err_free_dev:kfree(cache_dev);return retval;
}static void __exit cache_sync_exit(void)
{if (cache_dev) {device_destroy(cache_dev->class, cache_dev->devno);class_destroy(cache_dev->class);cdev_del(&cache_dev->cdev);unregister_chrdev_region(cache_dev->devno, 1);kfree(cache_dev);}pr_info("Cache sync driver unloaded\n");
}module_init(cache_sync_init);
module_exit(cache_sync_exit);MODULE_LICENSE("GPL");
MODULE_AUTHOR("Your Name");
MODULE_DESCRIPTION("Cache synchronization driver");
MODULE_VERSION("1.0");
makefile
# Out-of-tree build of the cache_sync kernel module plus its user-space test.
# CROSS_COMPILE and KERNEL_DIR can be overridden on the make command line.
CROSS_COMPILE ?= /gcc/linux-x86/aarch64/gcc-arm-10.3-2021.07-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-
KERNEL_DIR ?= /soft/kernelbuild

# The module is linked from the C driver and the assembly cache helpers.
obj-m += cache_sync_driver.o
cache_sync_driver-objs := cache_sync_main.o cache.o

.PHONY: all clean

all:
	$(MAKE) ARCH=arm64 CROSS_COMPILE=$(CROSS_COMPILE) -C $(KERNEL_DIR) M=$(PWD) modules

clean:
	$(MAKE) ARCH=arm64 CROSS_COMPILE=$(CROSS_COMPILE) -C $(KERNEL_DIR) M=$(PWD) clean

# User-space test program (produces the binary test_cache_sync).
test: test_cache_sync.c
	$(CROSS_COMPILE)gcc -o test_cache_sync test_cache_sync.c
用户态测试程序
主要测试 clean 及 invalidate 的功能是否正常,测试逻辑参考前述的相关文章,这里不再赘述。
cache clean
/*
 * Cache clean test.
 *
 * The reserved window is mapped twice: uncached through /dev/mem and cached
 * through /dev/selfmem.  Each round writes a pattern through the cached
 * mapping, optionally asks the cache_sync driver to clean the range (-c),
 * and then checks through the uncached mapping that the pattern reached
 * DDR.  The test runs until a mismatch is found or it is killed.
 */
#define _GNU_SOURCE /* MAP_POPULATE, MAP_LOCKED */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <linux/ioctl.h>
#include <linux/types.h>

#define RESERVED_MEM_TAG "Reserved" /* line tag of reserved memory in /proc/iomem */

struct timeval start_time;
struct timeval end_time;
double elapsed_time = 0.0;

#define MAP_MAX_SIZE (0x8000000) /* 128M */
#define CACHE_LINE_SIZE (64)

#define TEST_BLOCK_MAX_SIZE (2 << 20U) /* at most 2MB, must be 64-byte aligned */
#define TEST_BLOCK_ALIGNMENT CACHE_LINE_SIZE /* the cache line is 64 bytes */
#define TEST_BLOCK_ALIGNMENT_MASK (~(TEST_BLOCK_ALIGNMENT - 1))
#define TEST_BLOCK_ADDR_ALIGNMENT CACHE_LINE_SIZE /* addresses must be 64B aligned too */

/* Physical base of the reserved window and the fixed test geometry. */
#define RESERVED_PHYS_ADDR 0x151200000UL
#define TEST_BLOCK_SIZE 18038592U
#define TEST_BLOCK_STRIDE (18U * 1024 * 1024)
#define TEST_BLOCK_COUNT 6U

/* Must match struct cache_sync_cmd in the kernel driver. */
struct dma_sync_cmd {
    unsigned long long addr;
    size_t size;
    int dir;
};
typedef struct dma_sync_cmd dma_sync_cmd_t;

/* Same definitions as in the kernel driver. */
#define CACHE_SYNC_IOCTL_MAGIC 'C'
#define CACHE_SYNC_MAP _IOW(CACHE_SYNC_IOCTL_MAGIC, 1, dma_sync_cmd_t)
#define CACHE_SYNC_UNMAP _IOW(CACHE_SYNC_IOCTL_MAGIC, 2, dma_sync_cmd_t)
#define CACHE_SYNC_FLUSH _IOW(CACHE_SYNC_IOCTL_MAGIC, 3, dma_sync_cmd_t)

enum dma_data_direction {
    DMA_BIDIRECTIONAL = 0,
    DMA_TO_DEVICE = 1,
    DMA_FROM_DEVICE = 2,
    DMA_NONE = 3,
};

#define DEVICE_PATH "/dev/cache_sync"

/* Difference between two timestamps in microseconds (also stored in elapsed_time). */
static double elapsed(struct timeval start_time, struct timeval end_time)
{
    elapsed_time = (end_time.tv_sec * 1000000 + end_time.tv_usec
                    - start_time.tv_sec * 1000000 - start_time.tv_usec);
    return elapsed_time;
}

static int g_dmafd = 0;

/*
 * Ask the driver to clean (write back) [addr, addr + size), where addr is a
 * physical address.  Returns 0 on success, -1 when the ioctl fails.
 */
static int cache_clean(unsigned long long addr, size_t size)
{
    struct dma_sync_cmd cmd = {
        .addr = addr,
        .size = size,
        .dir = DMA_TO_DEVICE,
    };

    if (ioctl(g_dmafd, CACHE_SYNC_MAP, &cmd) < 0) {
        perror("DMA同步到设备失败");
        return -1;
    }
    return 0;
}

/* mmap size bytes at physical offset phys_addr of the given memory device. */
static unsigned int *map_phys_range(const char *dev_path, const char *open_err,
                                    unsigned long phys_addr, unsigned long size)
{
    unsigned int *mem;
    int fd = open(dev_path, O_RDWR);

    if (fd < 0) {
        perror(open_err);
        return NULL;
    }

    mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
               MAP_SHARED | MAP_POPULATE | MAP_LOCKED, fd, phys_addr);
    close(fd); /* the mapping stays valid after the descriptor is closed */
    if (mem == MAP_FAILED) {
        perror("mmap failed");
        return NULL;
    }
    return mem;
}

/* Map the reserved memory into user space through /dev/mem. */
unsigned int *map_reserved_memory(unsigned long phys_addr, unsigned long size)
{
    return map_phys_range("/dev/mem", "Failed to open /dev/mem", phys_addr, size);
}

/* Map the reserved memory into user space through /dev/selfmem. */
static unsigned int *map_reserved_memory_cache(unsigned long phys_addr, unsigned long size)
{
    return map_phys_range("/dev/selfmem", "Failed to open /dev/selfmem", phys_addr, size);
}

/* Index of the first word that differs from expect, or -1 when all match. */
static long first_mismatch(const volatile unsigned int *words, size_t count,
                           unsigned int expect)
{
    size_t i;

    for (i = 0; i < count; i++) {
        if (words[i] != expect)
            return (long)i;
    }
    return -1;
}

int main(int argc, char *argv[])
{
    const unsigned long reserved_start = RESERVED_PHYS_ADDR;
    const unsigned int gen_test_size = TEST_BLOCK_SIZE;
    unsigned int *write_addr_cache; /* cached mapping */
    unsigned int *mem_write_addr;   /* mapping used to check DDR content */
    unsigned int count = 0;
    int clean_cache = 0;
    int ch;

    /* Open the cache sync device. */
    g_dmafd = open(DEVICE_PATH, O_RDWR);
    if (g_dmafd < 0) {
        perror("Failed to open DMA sync device");
        return -1;
    }

    printf("Reserved Memory: 0x%lx \n", reserved_start);

    /* Map the reserved memory through both devices. */
    mem_write_addr = map_reserved_memory(reserved_start, MAP_MAX_SIZE);
    if (!mem_write_addr) {
        close(g_dmafd);
        return -1;
    }
    printf("Mapped reserved memory at virtual address: %p\n", (void *)mem_write_addr);

    write_addr_cache = map_reserved_memory_cache(reserved_start, MAP_MAX_SIZE);
    if (!write_addr_cache) {
        munmap(mem_write_addr, MAP_MAX_SIZE);
        close(g_dmafd);
        return -1;
    }
    printf("Mapped reserved cache memory at virtual address: %p\n", (void *)write_addr_cache);

    while ((ch = getopt(argc, argv, "c")) != -1) {
        switch (ch) {
        case 'c':
            printf("You have specified the -c option\n");
            clean_cache = 1;
            break;
        default:
            break;
        }
    }

    for (;;) {
        /* Walk the window in fixed 18MB steps: offsets 0, 18MB, ..., 90MB. */
        unsigned int gen_addr_off = count * TEST_BLOCK_STRIDE;
        unsigned long long phy_addr = reserved_start + gen_addr_off;
        unsigned char *test_virt_addr_w = (unsigned char *)write_addr_cache + gen_addr_off;
        unsigned int *check_mem_ptr_w = mem_write_addr + gen_addr_off / 4;
        long bad;

        count = (count + 1) % TEST_BLOCK_COUNT;

        if ((unsigned long)gen_addr_off + gen_test_size > MAP_MAX_SIZE) {
            printf("READ/WRITE is out of the range!\n");
            continue;
        }

        /* Round 1: the timed section covers the cached write plus the clean. */
        gettimeofday(&start_time, NULL);
        memset(test_virt_addr_w, 0x5a, gen_test_size);
        if (clean_cache && cache_clean(phy_addr, gen_test_size) < 0)
            exit(-1);
        gettimeofday(&end_time, NULL);
        elapsed_time = elapsed(start_time, end_time);

        bad = first_mismatch(check_mem_ptr_w, gen_test_size >> 2, 0x5a5a5a5a);
        if (bad >= 0) {
            printf("After flush, the data for clean is not correct!\n");
            printf("index:%d,data: 0x%x\n", (int)bad, check_mem_ptr_w[bad]);
            exit(-7);
        }
        printf("addr: 0x%llx, test size: %d KB,cost time : %f us\n",
               phy_addr, (int)(gen_test_size / 1024), elapsed_time);

        /* Round 2: inverted pattern, not timed. */
        memset(test_virt_addr_w, 0xa5, gen_test_size);
        if (clean_cache && cache_clean(phy_addr, gen_test_size) < 0)
            exit(-1);

        bad = first_mismatch(check_mem_ptr_w, gen_test_size >> 2, 0xa5a5a5a5);
        if (bad >= 0) {
            printf("After flush, the data for clean is ~ not correct!\n");
            printf("index:%d,data: 0x%x,except : 0xa5a5a5a5\n", (int)bad,
                   check_mem_ptr_w[bad]);
            exit(-7);
        }
    }
}
invalidate cache
/*
 * Cache invalidate test.
 *
 * The reserved window is mapped twice: uncached through /dev/mem and cached
 * through /dev/selfmem.  Each round writes a pattern to DDR through the
 * uncached mapping, optionally asks the cache_sync driver to invalidate the
 * range (-c), and then reads the pattern back through the cached mapping.
 * The test runs until the second round finds a mismatch or it is killed.
 */
#define _GNU_SOURCE /* MAP_POPULATE, MAP_LOCKED */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <linux/ioctl.h>
#include <linux/types.h>

#define RESERVED_MEM_TAG "Reserved" /* line tag of reserved memory in /proc/iomem */

struct timeval start_time;
struct timeval end_time;
double elapsed_time = 0.0;

#define MAP_MAX_SIZE (0x8000000) /* 128M */
#define CACHE_LINE_SIZE (64)

#define TEST_BLOCK_MAX_SIZE (2 << 20U) /* at most 2MB, must be 64-byte aligned */
#define TEST_BLOCK_ALIGNMENT CACHE_LINE_SIZE /* the cache line is 64 bytes */
#define TEST_BLOCK_ALIGNMENT_MASK (~(TEST_BLOCK_ALIGNMENT - 1))
#define TEST_BLOCK_ADDR_ALIGNMENT CACHE_LINE_SIZE /* addresses must be 64B aligned too */

/* Physical base of the reserved window and the fixed test geometry. */
#define RESERVED_PHYS_ADDR 0x151200000UL
#define TEST_BLOCK_SIZE 18038592U
#define TEST_BLOCK_STRIDE (18U * 1024 * 1024)
#define TEST_BLOCK_COUNT 6U

/* Must match struct cache_sync_cmd in the kernel driver. */
struct dma_sync_cmd {
    unsigned long long addr;
    size_t size;
    int dir;
};
typedef struct dma_sync_cmd dma_sync_cmd_t;

/* Same definitions as in the kernel driver. */
#define CACHE_SYNC_IOCTL_MAGIC 'C'
#define CACHE_SYNC_MAP _IOW(CACHE_SYNC_IOCTL_MAGIC, 1, dma_sync_cmd_t)
#define CACHE_SYNC_UNMAP _IOW(CACHE_SYNC_IOCTL_MAGIC, 2, dma_sync_cmd_t)
#define CACHE_SYNC_FLUSH _IOW(CACHE_SYNC_IOCTL_MAGIC, 3, dma_sync_cmd_t)

enum dma_data_direction {
    DMA_BIDIRECTIONAL = 0,
    DMA_TO_DEVICE = 1,
    DMA_FROM_DEVICE = 2,
    DMA_NONE = 3,
};

#define DEVICE_PATH "/dev/cache_sync"

static int g_dmafd = 0;

/*
 * Ask the driver to invalidate [addr, addr + size), where addr is a
 * physical address.  Returns 0 on success, -1 when the ioctl fails.
 */
static int cache_invalidate(unsigned long long addr, size_t size)
{
    struct dma_sync_cmd cmd = {
        .addr = addr,
        .size = size,
        .dir = DMA_FROM_DEVICE,
    };

    if (ioctl(g_dmafd, CACHE_SYNC_UNMAP, &cmd) < 0) {
        perror("DMA同步到设备失败");
        return -1;
    }
    return 0;
}

/* Difference between two timestamps in microseconds (also stored in elapsed_time). */
static double elapsed(struct timeval start_time, struct timeval end_time)
{
    elapsed_time = (end_time.tv_sec * 1000000 + end_time.tv_usec
                    - start_time.tv_sec * 1000000 - start_time.tv_usec);
    return elapsed_time;
}

/* mmap size bytes at physical offset phys_addr of the given memory device. */
static unsigned int *map_phys_range(const char *dev_path, const char *open_err,
                                    unsigned long phys_addr, unsigned long size)
{
    unsigned int *mem;
    int fd = open(dev_path, O_RDWR);

    if (fd < 0) {
        perror(open_err);
        return NULL;
    }

    mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
               MAP_SHARED | MAP_POPULATE | MAP_LOCKED, fd, phys_addr);
    close(fd); /* the mapping stays valid after the descriptor is closed */
    if (mem == MAP_FAILED) {
        perror("mmap failed");
        return NULL;
    }
    return mem;
}

/* Map the reserved memory into user space through /dev/mem. */
unsigned int *map_reserved_memory(unsigned long phys_addr, unsigned long size)
{
    return map_phys_range("/dev/mem", "Failed to open /dev/mem", phys_addr, size);
}

/* Map the reserved memory into user space through /dev/selfmem. */
static unsigned int *map_reserved_memory_cache(unsigned long phys_addr, unsigned long size)
{
    return map_phys_range("/dev/selfmem", "Failed to open /dev/selfmem", phys_addr, size);
}

/* Index of the first word that differs from expect, or -1 when all match. */
static long first_mismatch(const volatile unsigned int *words, size_t count,
                           unsigned int expect)
{
    size_t i;

    for (i = 0; i < count; i++) {
        if (words[i] != expect)
            return (long)i;
    }
    return -1;
}

int main(int argc, char *argv[])
{
    const unsigned long reserved_start = RESERVED_PHYS_ADDR;
    const unsigned int gen_test_size = TEST_BLOCK_SIZE;
    unsigned int *read_addr_cache; /* cached mapping, read side */
    unsigned int *mem_read_addr;   /* mapping used to modify DDR directly */
    unsigned int count = 0;
    int clean_cache = 0;
    int ch;

    /* Open the cache sync device. */
    g_dmafd = open(DEVICE_PATH, O_RDWR);
    if (g_dmafd < 0) {
        perror("Failed to open DMA sync device");
        return -1;
    }

    printf("Reserved Memory: 0x%lx \n", reserved_start);

    /* Map the reserved memory through both devices. */
    mem_read_addr = map_reserved_memory(reserved_start, MAP_MAX_SIZE);
    if (!mem_read_addr) {
        close(g_dmafd);
        return -1;
    }
    printf("Mapped reserved memory at virtual address: %p\n", (void *)mem_read_addr);

    read_addr_cache = map_reserved_memory_cache(reserved_start, MAP_MAX_SIZE);
    if (!read_addr_cache) {
        munmap(mem_read_addr, MAP_MAX_SIZE);
        close(g_dmafd);
        return -1;
    }
    printf("Mapped reserved cache memory at virtual address: %p\n", (void *)read_addr_cache);

    while ((ch = getopt(argc, argv, "c")) != -1) {
        switch (ch) {
        case 'c':
            printf("You have specified the -c option\n");
            clean_cache = 1;
            break;
        default:
            break;
        }
    }

    for (;;) {
        /* Walk the window in fixed 18MB steps: offsets 0, 18MB, ..., 90MB. */
        unsigned int gen_addr_off = count * TEST_BLOCK_STRIDE;
        unsigned long long phy_addr = reserved_start + gen_addr_off;
        unsigned int *test_virt_addr_r =
            (unsigned int *)((unsigned char *)read_addr_cache + gen_addr_off);
        unsigned int *modify_mem_ptr_r = mem_read_addr + gen_addr_off / 4;
        long bad;

        count = (count + 1) % TEST_BLOCK_COUNT;

        if ((unsigned long)gen_addr_off + gen_test_size > MAP_MAX_SIZE) {
            printf("READ/WRITE is out of the range!\n");
            continue;
        }

        /* Round 1: the timed section covers the DDR write plus the invalidate. */
        gettimeofday(&start_time, NULL);
        memset(modify_mem_ptr_r, 0xa5, gen_test_size);
        if (clean_cache && cache_invalidate(phy_addr, gen_test_size) < 0)
            exit(-1);
        gettimeofday(&end_time, NULL);
        elapsed_time = elapsed(start_time, end_time);

        /*
         * Read back through the cached mapping.  A mismatch in this round
         * is reported but does not stop the test; only round 2 is fatal.
         */
        bad = first_mismatch(test_virt_addr_r, gen_test_size >> 2, 0xa5a5a5a5);
        if (bad >= 0)
            printf("index:%d,data: 0x%x,except : 0xa5a5a5a5\n", (int)bad,
                   test_virt_addr_r[bad]);

        printf("addr :0x%llx,test size: %d KB,cost time : %f us\n",
               phy_addr, (int)(gen_test_size / 1024), elapsed_time);

        /* Round 2: inverted pattern, not timed. */
        memset(modify_mem_ptr_r, 0x5a, gen_test_size);
        if (clean_cache && cache_invalidate(phy_addr, gen_test_size) < 0)
            exit(-1);

        bad = first_mismatch(test_virt_addr_r, gen_test_size >> 2, 0x5a5a5a5a);
        if (bad >= 0) {
            printf("After flush, the data for clean is ~ not correct!\n");
            printf("index:%d,data: 0x%x,except : 0x5a5a5a5a\n", (int)bad,
                   test_virt_addr_r[bad]);
            exit(-7);
        }
    }
}
测试时间汇总
测试的地址为物理地址起始地址,然后每次加18M,连续加5次,测试时间包括写内存及clean时间;invalid及读内存时间示例如下:
FT2000+
D2000
3588
3588(5.10内核) | FT2000+(麒麟V10) | D2000(麒麟V10) | |
clean | 1-7ms,其中第一个地址是7ms,后续是1ms多 | 4-6ms | 3-7ms,其中起始是7ms,后续减少为3ms |
invalid |
总结
1) 为了兼容不同系统内核版本及配置项的差异,将汇编代码一起作为驱动的一部分
2) 虚拟地址的映射方式多样,需要明确各自的适用范围。用户空间mmap时,仅仅是在对应的进程中建立了页表映射,并没有在内核态建立对应的页表,因此直接在内核态访问该地址会出错。
3)除了本身内存属性的差异,各个soc对cache的同步时间差异要区别对待。尤其时延敏感的业务。