当前位置: 首页 > news >正文

Linux:文件 mmap 读写流程简析

文章目录

  • 1. 前言
  • 2. 文件 mmap 读写流程
    • 2.1 分配 mmap 映射虚拟地址区间
    • 2.2 页表 和 page cache 分配
  • 3. 推荐阅读

1. 前言

限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。

2. 文件 mmap 读写流程

文件 read() / write() 操作,需要在用户空间缓冲和内核空间文件 page cache 之间进行数据拷贝:

         读/写
用户缓冲 <----> page cache <-----> 磁盘
         拷贝

而文件的 mmap() 对文件的读写操作,是直接作用于内核空间文件 page cache:

         读/写
映射地址 <----> page cache <-----> 磁盘

这是文件的 mmap() 读写操作在较大数据量读写场合上性能优于普通 read() / write() 操作的原因。当然,也不是所有的场合 mmap() 的性能都优于 read() / write(),譬如小文件的场景。

同时,在 32 位架构下,由于(1G/3G 布局下)进程地址空间中只有不到 2G 的空间可用于 mmap(),所以能映射的文件大小也受到了限制。下图是一个 32 位架构下,典型的 1G/3G 布局下进程地址空间分布:

0xFFFF_FFFF  -------------  \
             |   Kernel    |  |
             |             |   } 1G
             |   Space     |  |
             |-------------| /
             |   modules   | \
             |-------------| |
             |             | |
             |   STACK     | |
             |             | |
             |-------------| |
             |             | |
             |             |  } 2G
             |    MMAP     | |
             |             | |
             |             | |
             |-------------| |
             |             | |
             |     HEAP    | |
             |             | |
             |-------------|/
             |   PROGRAM   |\
             |  CODE/DATA  | |
             |-------------|  } 1G
             |   reserved  | |
0x0000_0000  ------------- /

不同于 32 位系统,64 位系统下可用的 mmap 虚拟地址空间很大,接近 128T。

另外,和普通的 read() / write() 一样,通过 mmap() 写入的数据同样先落在 page cache,而不是直接落入磁盘;可以在调用 munmap() 之前,调用 msync() 来将数据同步到磁盘。

文件的 mmap() 读写操作,主要分为如下 2 步:

1. 从进程虚拟地址空间(mmap 区间),找出一段可以建立映射的空间,然后用 vm_area_struct 数据记录相关信息,并返回映射区间虚拟地址到用户空间;
2. 用户空间首次对映射区间进行读写时,将产生 page fault;在 page fault 处理中为映射区间建立页表,并分配文件的 page cache 页面,此后用户空间的读写将落到 page cache 页面中。

也可以通过 MAP_POPULATE 标志位在第 1 步就进行页表和 page cache 页面的预分配。

下面分别分析上面 2 步的代码实现细节。

2.1 分配 mmap 映射虚拟地址区间

从系统调用 mmap() 开始:

/* mm/mmap.c */SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,unsigned long, prot, unsigned long, flags,unsigned long, fd, unsigned long, pgoff)
{if (!(flags & MAP_ANONYMOUS)) { /* 非匿名映射, 即文件映射 */...file = fget(fd); /* 找到 @fd 对应的文件对象 @file */if (!file) /* 找不到 @fd 对应的文件对象 */return -EBADF;...} else if (flags & MAP_HUGETLB) {...}flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);...return retval; /* 返回分配的 mmap 虚拟地址映射区间地址 或 错误码 */
}
/* mm/util.c */unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,unsigned long len, unsigned long prot,unsigned long flag, unsigned long pgoff)
{unsigned long ret;struct mm_struct *mm = current->mm; /* 当前进程的地址空间管理对象 */unsigned long populate;LIST_HEAD(uf);ret = security_mmap_file(file, prot, flag);if (!ret) {if (down_write_killable(&mm->mmap_sem)) /* 锁住当前进程地址空间 */return -EINTR;ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,&populate, &uf);up_write(&mm->mmap_sem); /* 下锁当前进程地址空间 */userfaultfd_unmap_complete(mm, &uf);if (populate) /* (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) */mm_populate(ret, populate); /* 返回前建立页表并分配好物理内存 */}return ret;
}
do_mmap_pgoff() -> do_mmap()
/* mm/mmap.c *//** The caller must hold down_write(&current->mm->mmap_sem).*/
unsigned long do_mmap(struct file *file, unsigned long addr,unsigned long len, unsigned long prot,unsigned long flags, vm_flags_t vm_flags,unsigned long pgoff, unsigned long *populate,struct list_head *uf)
{struct mm_struct *mm = current->mm;int pkey = 0;.../* Obtain the address to map to. we verify (or select) it and ensure* that it represents a valid section of the address space.*//* 从进程地址空间找一块空闲的 mmap 虚拟地址段 */addr = get_unmapped_area(file, addr, len, pgoff, flags);...addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);...return addr;
}unsigned long mmap_region(struct file *file, unsigned long addr,unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,struct list_head *uf)
{struct mm_struct *mm = current->mm;struct vm_area_struct *vma, *prev;int error;struct rb_node **rb_link, *rb_parent;unsigned long charged = 0;.../* 新建一个 vm_area_struct 来表示 mmap 映射区间 */vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);if (!vma) {error = -ENOMEM;goto unacct_error;}vma->vm_mm = mm;vma->vm_start = addr;vma->vm_end = addr + len;vma->vm_flags = vm_flags;vma->vm_page_prot = vm_get_page_prot(vm_flags);vma->vm_pgoff = pgoff;INIT_LIST_HEAD(&vma->anon_vma_chain);if (file) { /* 文件 mmap 映射 */.../* ->mmap() can change vma->vm_file, but must guarantee that* vma_link() below can deny write-access if VM_DENYWRITE is set* and map writably if VM_SHARED is set. This usually means the* new file must not have been exposed to user-space, yet.*/vma->vm_file = get_file(file);error = call_mmap(file, vma); /* 文件系统的 mmap 操作 */...addr = vma->vm_start; /* 返回分配的 mmap 的虚拟地址 */...} else if (vm_flags & VM_SHARED) {...}/* 将新的 VMA 插入到进程地址空间 VMA 红黑树 */vma_link(mm, vma, prev, rb_link, rb_parent);...return addr; /* 返回分配的 mmap 的虚拟地址 */...
}
/* include/linux/fs.h */ /* 文件系统的 mmap 操作 */
static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{/** ext4: fs/ext4/file.c, ext4_file_mmap()* ...*/return file->f_op->mmap(file, vma);
}

每个文件系统的 mmap 操作不同,这里以 ext4 文件系统为例进行分析:

/* fs/ext4/file.c */static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{struct inode *inode = file->f_mapping->host;if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))return -EIO;file_accessed(file);if (IS_DAX(file_inode(file))) {vma->vm_ops = &ext4_dax_vm_ops;vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;} else {vma->vm_ops = &ext4_file_vm_ops;}return 0;
}

到此,mmap 的第 1 步分配 mmap 虚拟地址空间已经完成,接下来看读写时 page fault 中 mmap 相关的处理细节。

2.2 页表 和 page cache 分配

当对 未建立页表 和 未分配 page cache 的文件 mmap 地址进行读写时,将产生 page fault,这里以 ARM32 的处理流程为例进行分析:

@ arch/arm/kernel/entry-armv.Svector_stub	dabt, ABT_MODE, 8.long	__dabt_usr			@  0  (USR_26 / USR_32)....long	__dabt_svc			@  3  (SVC_26 / SVC_32)....globl	vector_fiq/* 各个CPU模式下的中断向量表指针 */.section .vectors, "ax", %progbits
.L__vectors_start:...W(b)	vector_dabt /* DataAbort模式的中断向量表指针 */.../* 缺页(DataAbort)中断可产生于[SVC、用户]两种模式下 */
__dabt_usr: /* 用户模式缺页中断 */...dabt_helper // bl	CPU_DABORT_HANDLER -> bl v7_early_abort....align	5
__dabt_svc: /* SVC模式缺页中断 */...dabt_helper // bl	CPU_DABORT_HANDLER  -> bl v7_early_abort...
@ arch/arm/mm/abort-ev7.S.align	5
ENTRY(v7_early_abort)mrc	p15, 0, r1, c5, c0, 0		@ get FSRmrc	p15, 0, r0, c6, c0, 0		@ get FARuaccess_disable ip			@ disable userspace access...b	do_DataAbort
ENDPROC(v7_early_abort)
/** 以 3 级页表举例。* arch/arm/mm/fsr-3level.c */
static struct fsr_info fsr_info[] = {.../* 缺页中断处理接口 */{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 1 translation fault"	}, /* 1级页目录转换接口 */{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 2 translation fault"	}, /* 2级页目录转换接口 */{ do_page_fault,	SIGSEGV, SEGV_MAPERR,	"level 3 translation fault"	}, /* 3级页表项转换接口 */...
};/** arch/arm/mm/fault.c */asmlinkage void __exception
do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{const struct fsr_info *inf = fsr_info + fsr_fs(fsr);struct siginfo info;/* 调用具体类型缺页中断的入口: do_translation_fault() 或 do_page_fault() */if (!inf->fn(addr, fsr & ~FSR_LNX_PF, regs))return;...
}#ifdef CONFIG_MMU
static int __kprobes
do_translation_fault(unsigned long addr, unsigned int fsr,struct pt_regs *regs)
{...if (addr < TASK_SIZE) /* 用户空间地址 */return do_page_fault(addr, fsr, regs);...
}static int __kprobes
do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{struct task_struct *tsk;struct mm_struct *mm;...tsk = current;mm  = tsk->mm;/* Enable interrupts if they were enabled in the parent context. */if (interrupts_enabled(regs))local_irq_enable();...if (fsr & FSR_WRITE) /* 写入引发的 Data Abort (如 mmap 文件写,COW: Copy-On-Write) */flags |= FAULT_FLAG_WRITE;...fault = __do_page_fault(mm, addr, fsr, flags, tsk);...
}static int __kprobes
__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,unsigned int flags, struct task_struct *tsk)
{struct vm_area_struct *vma;int fault;/** 查看 addr 是否存在对应的 vma ? * 如果没有的话,意味着非法地址访问.*/vma = find_vma(mm, addr);fault = VM_FAULT_BADMAP;if (unlikely(!vma))goto out;.../** Ok, we have a good vm_area for this* memory access, so we can handle it.*//* 验证对 addr 的访问(读/写/执行)是否合法? */ 
good_area:if (access_error(fsr, vma)) {fault = VM_FAULT_BADACCESS;goto out;}return handle_mm_fault(vma, addr & PAGE_MASK, flags);check_stack:...
out:...
}
/** mm/memory.c*//* 建立 [页表 + page cache 页面] */
int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,unsigned int flags)
{int ret;...if (unlikely(is_vm_hugetlb_page(vma)))...elseret = __handle_mm_fault(vma, address, flags);...return ret;
}static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,unsigned int flags)
{struct vm_fault vmf = {.vma = vma,.address = address & PAGE_MASK,.flags = flags,.pgoff = linear_page_index(vma, address),.gfp_mask = __get_fault_gfp_mask(vma),};unsigned int dirty = flags & FAULT_FLAG_WRITE;struct mm_struct *mm = vma->vm_mm;pgd_t *pgd;p4d_t *p4d;int ret;pgd = pgd_offset(mm, address);p4d = p4d_alloc(mm, pgd, address);if (!p4d)return VM_FAULT_OOM;vmf.pud = pud_alloc(mm, p4d, address);if (!vmf.pud)return VM_FAULT_OOM;...vmf.pmd = pmd_alloc(mm, vmf.pud, address);if (!vmf.pmd)return VM_FAULT_OOM;...return handle_pte_fault(&vmf);
}static int handle_pte_fault(struct vm_fault *vmf)
{pte_t entry;if (unlikely(pmd_none(*vmf->pmd))) {/** Leave __pte_alloc() until later: because vm_ops->fault may* want to allocate huge page, and if we expose page table* for an instant, it will be difficult to retract from* concurrent faults and from rmap lookups.*/vmf->pte = NULL;} else {...}if (!vmf->pte) {if (vma_is_anonymous(vmf->vma))return do_anonymous_page(vmf); /* 匿名映射 */elsereturn do_fault(vmf); /* 场景一: 文件 mmap 映射 */}...
}static int do_fault(struct vm_fault *vmf)
{struct vm_area_struct *vma = vmf->vma;int ret;/** The VMA was not fully populated on mmap() or missing VM_DONTEXPAND*/if (!vma->vm_ops->fault) {...} else if (!(vmf->flags & FAULT_FLAG_WRITE))ret = do_read_fault(vmf);else if (!(vma->vm_flags & VM_SHARED))ret = do_cow_fault(vmf); /* COW fault */elseret = do_shared_fault(vmf);/* preallocated pagetable is unused: free it */if (vmf->prealloc_pte) {pte_free(vma->vm_mm, vmf->prealloc_pte);vmf->prealloc_pte = NULL;}return ret;
}static int do_read_fault(struct vm_fault *vmf)
{struct vm_area_struct *vma = vmf->vma;int ret = 0;.../** mmap 场景: * 分配 page cache 页面,读取数据到 page cache 页面,* 然后将 page cache 页面对象从 @vmf 返回。*/ret = __do_fault(vmf);.../** 将新建的 page 页面设定到 @vmf 的虚拟地址区间 的 PTE 页表项,* 即将 @vmf 的虚拟地址区间 映射到 page 页面。*/ret |= finish_fault(vmf);...return ret;
}static int __do_fault(struct vm_fault *vmf)
{struct vm_area_struct *vma = vmf->vma;int ret;...ret = vma->vm_ops->fault(vmf); /* ext4_filemap_fault() */...return ret;
}
/* fs/ext4/inode.c */int ext4_filemap_fault(struct vm_fault *vmf)
{struct inode *inode = file_inode(vmf->vma->vm_file);int err;down_read(&EXT4_I(inode)->i_mmap_sem);err = filemap_fault(vmf);up_read(&EXT4_I(inode)->i_mmap_sem);return err;
}
/* mm/filemap.c *//*** filemap_fault - read in file data for page fault handling* @vmf:	struct vm_fault containing details of the fault* ......*/
int filemap_fault(struct vm_fault *vmf)
{int error;struct file *file = vmf->vma->vm_file;struct address_space *mapping = file->f_mapping;struct file_ra_state *ra = &file->f_ra;struct inode *inode = mapping->host;pgoff_t offset = vmf->pgoff;pgoff_t max_off;struct page *page;int ret = 0;.../** Do we have something in the page cache already?*/page = find_get_page(mapping, offset);if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { /* 找到了文件 @file 对应的 page cache *//** We found the page, so try async readahead before* waiting for the lock.*/do_async_mmap_readahead(vmf->vma, ra, file, page, offset);} else if (!page) { /* 没有找到文件 @file 对应的 page cache, 分配 page cache, 并预读指定 @offset 的内容到 page cache *//* No page in the page cache at all */do_sync_mmap_readahead(vmf->vma, ra, file, offset);count_vm_event(PGMAJFAULT);count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);ret = VM_FAULT_MAJOR;
retry_find:page = find_get_page(mapping, offset);if (!page)goto no_cached_page;}.../** 返回 page cache 页面。* 后续在 finish_fault() 中设定到 mmap 映射的虚拟地址区间 的 PTE * 页表项,即将 mmap 映射的虚拟地址区间 映射到 page cache 页面*/vmf->page = page;return ret | VM_FAULT_LOCKED;no_cached_page:/** We're only likely to ever get here if MADV_RANDOM is in* effect.*/error = page_cache_read(file, offset, vmf->gfp_mask);...
}/*** page_cache_read - adds requested page to the page cache if not already there* @file:	file to read* @offset:	page index* @gfp_mask:	memory allocation flags** This adds the requested page to the page cache if it isn't already there,* and schedules an I/O to read in its contents from disk.*/
static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
{struct address_space *mapping = file->f_mapping;struct page *page;int ret;do {/* 分配 page cache 页面 */page = __page_cache_alloc(gfp_mask|__GFP_COLD);if (!page)return -ENOMEM;/* 添加 read page cache 页面到 地址空间 @mapping 的 LRU 链表 */ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);if (ret == 0)/** 请求从磁盘读取数据到 page cache* ext4: ext4_readpage()* ...*/ret = mapping->a_ops->readpage(file, page);else if (ret == -EEXIST)ret = 0; /* losing race to add is OK */put_page(page);} while (ret == AOP_TRUNCATED_PAGE);return ret;
}

最后,finish_fault() 将 mmap 虚拟地址区间映射到分配的 page cache 页面:

int finish_fault(struct vm_fault *vmf)
{struct page *page;int ret = 0;/* Did we COW the page? */if ((vmf->flags & FAULT_FLAG_WRITE) &&!(vmf->vma->vm_flags & VM_SHARED))page = vmf->cow_page;elsepage = vmf->page;/** check even for read faults because we might have lost our CoWed* page*/if (!(vmf->vma->vm_flags & VM_SHARED))ret = check_stable_address_space(vmf->vma->vm_mm);if (!ret)ret = alloc_set_pte(vmf, vmf->memcg, page); /* PTE 页表项映射到 @page 页面 */if (vmf->pte)pte_unmap_unlock(vmf->pte, vmf->ptl);return ret;
}

到此,mmap 读写操作 page fault 过程中建立 mmap 映射区间页表以及 page cache 的过程已经分析完毕。

3. 推荐阅读

从内核世界透视 mmap 内存映射的本质(源码实现篇)

http://www.dtcms.com/a/596881.html

相关文章:

  • Ros1 Noetic(本地)和Ros2 Humble(docker)之间相互通信及设置初始位姿
  • 使用 Docker Compose 部署 Redis 单节点 和 主从架构
  • 群晖NAS上使用最新版WordPress安装部署个人的博客或网站
  • 在Visio中保存PDF时去除空白区域
  • AI学习路线图2025:从入门到进阶的完整指南
  • 电气工程师求职问答-初级篇
  • Learn Git Branching
  • 凡科快图网站中医院网站源码
  • 太原网站快速排名提升河北婚庆网站建设定制
  • [Spring 注解详解]为何 @Service 不仅仅是 @Component?
  • 前端高频面试题之Vue(初、中级篇)
  • 谷歌云发布 Axion Arm 处理器与 TPU v5p,加速 AI 基础设施闭环
  • STM32H743-ARM例程43-SD_IAP_FPGA
  • 甘肃做网站找谁网上帮人卖东西的平台
  • 数据分析笔记01:数据分析概述
  • 瑞利信道下PSK水声通信系统均衡技术
  • 网站建设存在的问题及对策软文代写发布
  • Note:汽车轮胎的电阻测量-目的是减少静电对新能源汽车电气件的损坏风险
  • RabbitMQ 核心知识点
  • Python使用消息队列rabbitmq
  • GBD调试KingSCADA详细步骤
  • 做美妆的网站南昌优化网站分析
  • 上海个人医疗网站备案尖扎县公司网站建设
  • 多端统一的教育系统源码开发详解:Web、小程序与APP的无缝融合
  • uniapp小程序 订阅消息推送
  • 微信小程序管理系统,代运营3600+医院小程序
  • 重庆论坛网站建设在网站开发中应该避免哪些漏洞
  • Spring Boot整合Redis注解,实战Redis注解使用
  • 数学分析简明教程——3.5
  • php网站500错误电子商务网站建设的作用