Linux:文件 mmap 读写流程简析
文章目录
- 1. 前言
- 2. 文件 mmap 读写流程
- 2.1 分配 mmap 映射虚拟地址区间
- 2.2 页表 和 page cache 分配
- 3. 推荐阅读
1. 前言
限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。
2. 文件 mmap 读写流程
文件 read() / write() 操作,需要在用户空间缓冲和内核空间文件 page cache 之间进行数据拷贝:
读/写
用户缓冲 <--拷贝--> page cache <-----> 磁盘
而文件的 mmap() 对文件的读写操作,是直接作用于内核空间文件 page cache:
读/写
映射地址 <----> page cache <-----> 磁盘
这是文件的 mmap() 读写操作在较大数据量读写场合上性能优于普通 read() / write() 操作的原因。当然,也不是所有的场合 mmap() 的性能都优于 read() / write(),譬如小文件的场景。
同时,在 32 位架构下,由于(1G/3G 布局下)进程地址空间只有不到 2G 的空间可用于 mmap(),所以能映射的文件大小也受到了限制。下图是一个 32 位架构下,典型的 1G/3G 布局下进程地址空间分布:
0xFFFF_FFFF  -------------  \
             |   Kernel    |  |
             |   Space     |  } 1G
             |-------------|  /
             |   modules   |  \
             |-------------|  |
             |             |  |
             |    STACK    |  |
             |             |  |
             |-------------|  |
             |             |  |
             |             |  } 2G
             |    MMAP     |  |
             |             |  |
             |             |  |
             |-------------|  |
             |             |  |
             |    HEAP     |  |
             |             |  |
             |-------------|  /
             |   PROGRAM   |  \
             |  CODE/DATA  |  |
             |-------------|  } 1G
             |   reserved  |  |
0x0000_0000  -------------  /
不同于 32 位系统,64 位系统下可用的 mmap 虚拟地址空间很大,大概接近 128T。
另外,和普通的 read() / write() 一样,mmap() 的数据仍然也是在 page cache,而不是直接落入磁盘,可以在调用 munmap() 之前,调用 msync() 来将数据同步到磁盘。
文件的 mmap() 读写操作,主要分为如下 2 步:
1. 从进程虚拟地址空间(mmap 区间),找出一段可以建立映射的空间,然后用 vm_area_struct 数据记录相关信息,并返回映射区间虚拟地址到用户空间;
2. 用户空间进行读写,将产生 page fault,在 page fault 处理中为映射区间建立页表,并分配文件的 page cache 页面,然后用户空间的读写将落到 page cache 页面中。
也可以通过 MAP_POPULATE 标志位在第 1 步就进行页表和 page cache 页面的预分配。
下面分别分析上面 2 步的代码实现细节。
2.1 分配 mmap 映射虚拟地址区间
从系统调用 mmap() 开始:
/* mm/mmap.c */
/*
 * mmap() system call entry point: for a file-backed (non-anonymous)
 * mapping, resolve @fd to its struct file, then delegate the actual
 * mapping work to vm_mmap_pgoff().
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, pgoff)
{
	if (!(flags & MAP_ANONYMOUS)) { /* not anonymous, i.e. a file mapping */
		...
		file = fget(fd); /* look up the file object @file for @fd */
		if (!file) /* no file object found for @fd */
			return -EBADF;
		...
	} else if (flags & MAP_HUGETLB) {
		...
	}

	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
	...
	return retval; /* start address of the mapped virtual range, or an error code */
}
/* mm/util.c */
/*
 * Take the mm write lock, perform the mapping via do_mmap_pgoff(),
 * and optionally pre-populate page tables and pages (MAP_POPULATE).
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	unsigned long ret;
	struct mm_struct *mm = current->mm; /* address space of the current process */
	unsigned long populate;
	LIST_HEAD(uf);

	ret = security_mmap_file(file, prot, flag);
	if (!ret) {
		if (down_write_killable(&mm->mmap_sem)) /* lock the current process address space */
			return -EINTR;
		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
				    &populate, &uf);
		up_write(&mm->mmap_sem); /* unlock the current process address space */
		userfaultfd_unmap_complete(mm, &uf);
		if (populate) /* ((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) */
			mm_populate(ret, populate); /* build page tables and allocate pages before returning */
	}
	return ret;
}
do_mmap_pgoff() 只是对 do_mmap() 的简单封装,接下来看 do_mmap() 的实现:
/* mm/mmap.c */
/*
 * The caller must hold down_write(&current->mm->mmap_sem).
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, vm_flags_t vm_flags,
			unsigned long pgoff, unsigned long *populate,
			struct list_head *uf)
{
	struct mm_struct *mm = current->mm;
	int pkey = 0;
	...
	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 */
	/* find a free virtual address range in the process mmap area */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
	...
	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
	...
	return addr;
}

/*
 * Create the vm_area_struct describing the new mapping, let the
 * filesystem set up its vm_operations, and link the VMA into the
 * process address space.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
		struct list_head *uf)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	int error;
	struct rb_node **rb_link, *rb_parent;
	unsigned long charged = 0;
	...
	/* allocate a new vm_area_struct to describe the mmap'ed range */
	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma) {
		error = -ENOMEM;
		goto unacct_error;
	}

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags;
	vma->vm_page_prot = vm_get_page_prot(vm_flags);
	vma->vm_pgoff = pgoff;
	INIT_LIST_HEAD(&vma->anon_vma_chain);

	if (file) { /* file-backed mmap mapping */
		...
		/* ->mmap() can change vma->vm_file, but must guarantee that
		 * vma_link() below can deny write-access if VM_DENYWRITE is set
		 * and map writably if VM_SHARED is set. This usually means the
		 * new file must not have been exposed to user-space, yet.
		 */
		vma->vm_file = get_file(file);
		error = call_mmap(file, vma); /* filesystem-specific mmap operation */
		...
		addr = vma->vm_start; /* start address of the new mmap mapping */
		...
	} else if (vm_flags & VM_SHARED) {
		...
	}

	/* insert the new VMA into the process address space VMA rbtree */
	vma_link(mm, vma, prev, rb_link, rb_parent);
	...
	return addr; /* start address of the new mmap mapping */
	...
}
/* include/ */
/* invoke the filesystem's ->mmap() file operation */
static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
	/*
	 * ext4: fs/ext4/file.c, ext4_file_mmap()
	 * ...
	 */
	return file->f_op->mmap(file, vma);
}
每个文件系统的 mmap 操作不同,这里以 ext4 文件系统为例进行分析:
/* fs/ext4/file.c */
/*
 * ext4 ->mmap(): installs the vm_operations_struct whose ->fault
 * callback (ext4_filemap_fault) will service later page faults
 * on the mapped range.
 */
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file->f_mapping->host;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	file_accessed(file);
	if (IS_DAX(file_inode(file))) {
		vma->vm_ops = &ext4_dax_vm_ops;
		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	} else {
		vma->vm_ops = &ext4_file_vm_ops;
	}
	return 0;
}
到此,mmap 的第 1 步分配 mmap 虚拟地址空间已经完成,接下来看读写时 page fault 中 mmap 相关的处理细节。
2.2 页表 和 page cache 分配
当对 未建立页表 和 未分配 page cache 的文件 mmap 地址进行读写时,将产生 page fault,这里以 ARM32 的处理流程为例进行分析:
@ arch/arm/kernel/entry-armv.S

	vector_stub	dabt, ABT_MODE, 8

	.long	__dabt_usr			@ 0 (USR_26 / USR_32)
	...
	.long	__dabt_svc			@ 3 (SVC_26 / SVC_32)
	...
	.globl	vector_fiq

/* vector table pointers for each CPU mode */
	.section .vectors, "ax", %progbits
.L__vectors_start:
	...
	W(b)	vector_dabt	/* vector table pointer for Data Abort mode */
	...

/* a page-fault (Data Abort) exception can be taken in both SVC and user mode */
__dabt_usr:	/* page fault taken in user mode */
	...
	dabt_helper	// bl CPU_DABORT_HANDLER -> bl v7_early_abort
	...
	.align	5
__dabt_svc:	/* page fault taken in SVC mode */
	...
	dabt_helper	// bl CPU_DABORT_HANDLER -> bl v7_early_abort
	...

@ arch/arm/mm/abort-ev7.S
	.align	5
ENTRY(v7_early_abort)
	mrc	p15, 0, r1, c5, c0, 0		@ get FSR
	mrc	p15, 0, r0, c6, c0, 0		@ get FAR
	uaccess_disable ip			@ disable userspace access
	...
	b	do_DataAbort
ENDPROC(v7_early_abort)
/*
 * Example with 3-level page tables.
 * arch/arm/mm/fsr-3level.c
 */
static struct fsr_info fsr_info[] = {
	...
	/* page fault handler entries */
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, /* level-1 page directory translation */
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, /* level-2 page directory translation */
	{ do_page_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, /* level-3 page table entry translation */
	...
};

/*
 * arch/arm/mm/fault.c
 */
asmlinkage void __exception
do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
	const struct fsr_info *inf = fsr_info + fsr_fs(fsr);
	struct siginfo info;

	/*
	 * dispatch to the handler for this fault type:
	 * do_translation_fault() or do_page_fault()
	 */
	if (!inf->fn(addr, fsr & ~FSR_LNX_PF, regs))
		return;
	...
}

#ifdef CONFIG_MMU
static int __kprobes
do_translation_fault(unsigned long addr, unsigned int fsr,
		     struct pt_regs *regs)
{
	...
	if (addr < TASK_SIZE) /* user space address */
		return do_page_fault(addr, fsr, regs);
	...
}

static int __kprobes
do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	...
	tsk = current;
	mm = tsk->mm;

	/* Enable interrupts if they were enabled in the parent context. */
	if (interrupts_enabled(regs))
		local_irq_enable();
	...
	if (fsr & FSR_WRITE) /* Data Abort caused by a write (e.g. mmap file write, COW: Copy-On-Write) */
		flags |= FAULT_FLAG_WRITE;
	...
	fault = __do_page_fault(mm, addr, fsr, flags, tsk);
	...
}

static int __kprobes
__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
		unsigned int flags, struct task_struct *tsk)
{
	struct vm_area_struct *vma;
	int fault;

	/*
	 * Is there a VMA covering @addr?
	 * If not, this is an illegal memory access.
	 */
	vma = find_vma(mm, addr);
	fault = VM_FAULT_BADMAP;
	if (unlikely(!vma))
		goto out;
	...
	/*
	 * Ok, we have a good vm_area for this
	 * memory access, so we can handle it.
	 */
	/* verify that this access (read/write/execute) to @addr is legal */
good_area:
	if (access_error(fsr, vma)) {
		fault = VM_FAULT_BADACCESS;
		goto out;
	}

	return handle_mm_fault(vma, addr & PAGE_MASK, flags);

check_stack:
	...
out:
	...
}
/*
 * mm/memory.c
 */
/* set up [page tables + page cache pages] */
int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
		unsigned int flags)
{
	int ret;
	...
	if (unlikely(is_vm_hugetlb_page(vma)))
		...
	else
		ret = __handle_mm_fault(vma, address, flags);
	...
	return ret;
}

/*
 * Walk/allocate the upper page table levels (p4d/pud/pmd) for
 * @address, then hand the fault to the PTE-level handler.
 */
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
		unsigned int flags)
{
	struct vm_fault vmf = {
		.vma = vma,
		.address = address & PAGE_MASK,
		.flags = flags,
		.pgoff = linear_page_index(vma, address),
		.gfp_mask = __get_fault_gfp_mask(vma),
	};
	unsigned int dirty = flags & FAULT_FLAG_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	p4d_t *p4d;
	int ret;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return VM_FAULT_OOM;
	vmf.pud = pud_alloc(mm, p4d, address);
	if (!vmf.pud)
		return VM_FAULT_OOM;
	...
	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
	if (!vmf.pmd)
		return VM_FAULT_OOM;
	...
	return handle_pte_fault(&vmf);
}

static int handle_pte_fault(struct vm_fault *vmf)
{
	pte_t entry;

	if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate huge page, and if we expose page table
		 * for an instant, it will be difficult to retract from
		 * concurrent faults and from rmap lookups.
		 */
		vmf->pte = NULL;
	} else {
		...
	}

	if (!vmf->pte) {
		if (vma_is_anonymous(vmf->vma))
			return do_anonymous_page(vmf); /* anonymous mapping */
		else
			return do_fault(vmf); /* our case: file mmap mapping */
	}
	...
}

static int do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret;

	/*
	 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
	 */
	if (!vma->vm_ops->fault) {
		...
	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
		ret = do_read_fault(vmf);
	else if (!(vma->vm_flags & VM_SHARED))
		ret = do_cow_fault(vmf); /* COW fault */
	else
		ret = do_shared_fault(vmf);

	/* preallocated pagetable is unused: free it */
	if (vmf->prealloc_pte) {
		pte_free(vma->vm_mm, vmf->prealloc_pte);
		vmf->prealloc_pte = NULL;
	}
	return ret;
}

static int do_read_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret = 0;
	...
	/*
	 * mmap case:
	 * allocate a page cache page, read the file data into it,
	 * then return the page cache page through @vmf.
	 */
	ret = __do_fault(vmf);
	...
	/*
	 * Install the new page into the PTE covering @vmf's virtual
	 * address range, i.e. map that virtual range to the page.
	 */
	ret |= finish_fault(vmf);
	...
	return ret;
}

static int __do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret;
	...
	ret = vma->vm_ops->fault(vmf); /* ext4_filemap_fault() */
	...
	return ret;
}
/* fs/ext4/inode.c */
/*
 * ext4 ->fault handler: takes the per-inode i_mmap_sem in read mode
 * and delegates to the generic filemap_fault().
 */
int ext4_filemap_fault(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	int err;

	down_read(&EXT4_I(inode)->i_mmap_sem);
	err = filemap_fault(vmf);
	up_read(&EXT4_I(inode)->i_mmap_sem);

	return err;
}
/* mm/filemap.c */
/**
 * filemap_fault - read in file data for page fault handling
 * @vmf: struct vm_fault containing details of the fault
 * ......
 */
int filemap_fault(struct vm_fault *vmf)
{
	int error;
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct inode *inode = mapping->host;
	pgoff_t offset = vmf->pgoff;
	pgoff_t max_off;
	struct page *page;
	int ret = 0;
	...
	/*
	 * Do we have something in the page cache already?
	 */
	page = find_get_page(mapping, offset);
	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { /* found a page cache page for @file */
		/*
		 * We found the page, so try async readahead before
		 * waiting for the lock.
		 */
		do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
	} else if (!page) { /* no page cache page for @file: allocate one, and read ahead the data at @offset into the page cache */
		/* No page in the page cache at all */
		do_sync_mmap_readahead(vmf->vma, ra, file, offset);
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
retry_find:
		page = find_get_page(mapping, offset);
		if (!page)
			goto no_cached_page;
	}
	...
	/*
	 * Return the page cache page.
	 * finish_fault() will later install it into the PTE covering the
	 * mmap'ed virtual address range, i.e. map that range to the
	 * page cache page.
	 */
	vmf->page = page;
	return ret | VM_FAULT_LOCKED;

no_cached_page:
	/*
	 * We're only likely to ever get here if MADV_RANDOM is in
	 * effect.
	 */
	error = page_cache_read(file, offset, vmf->gfp_mask);
	...
}

/**
 * page_cache_read - adds requested page to the page cache if not already there
 * @file:	file to read
 * @offset:	page index
 * @gfp_mask:	memory allocation flags
 *
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
{
	struct address_space *mapping = file->f_mapping;
	struct page *page;
	int ret;

	do {
		/* allocate a page cache page */
		page = __page_cache_alloc(gfp_mask|__GFP_COLD);
		if (!page)
			return -ENOMEM;

		/* add the new page cache page to @mapping's LRU list */
		ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
		if (ret == 0)
			/*
			 * schedule a read of the data from disk into the page cache
			 * ext4: ext4_readpage()
			 * ...
			 */
			ret = mapping->a_ops->readpage(file, page);
		else if (ret == -EEXIST)
			ret = 0; /* losing race to add is OK */

		put_page(page);
	} while (ret == AOP_TRUNCATED_PAGE);

	return ret;
}
最后,finish_fault() 将 mmap 虚拟地址区间映射到分配的 page cache 页面:
/*
 * Map the faulting virtual address to the page prepared by the
 * ->fault handler (or the COW copy), by installing the PTE.
 */
int finish_fault(struct vm_fault *vmf)
{
	struct page *page;
	int ret = 0;

	/* Did we COW the page? */
	if ((vmf->flags & FAULT_FLAG_WRITE) &&
	    !(vmf->vma->vm_flags & VM_SHARED))
		page = vmf->cow_page;
	else
		page = vmf->page;

	/*
	 * check even for read faults because we might have lost our CoWed
	 * page
	 */
	if (!(vmf->vma->vm_flags & VM_SHARED))
		ret = check_stable_address_space(vmf->vma->vm_mm);
	if (!ret)
		ret = alloc_set_pte(vmf, vmf->memcg, page); /* point the PTE at @page */
	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
}
到此,mmap 读写操作 page fault 过程中建立 mmap 映射区间页表以及 page cache 的过程已经分析完毕。
3. 推荐阅读
从内核世界透视 mmap 内存映射的本质(源码实现篇)
