
Linux Memory Management (5): A Brief Walkthrough of buddy Page Allocation

Table of Contents

  • 1. Preface
  • 2. The buddy allocation flow
    • 2.1 The fast allocation path
    • 2.2 The slow allocation path

Previous post: Linux Memory Management (4): Setting Up the buddy Management System

1. Preface

Due to the limits of the author's knowledge, this article may contain mistakes; the author accepts no liability for any losses readers may incur as a result.

2. The buddy allocation flow

The header include/linux/gfp.h exports a number of interfaces for allocating pages from the buddy allocator:

static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
{
    return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
}

/*
 * Allocate pages, preferring the node given as nid. The node must be valid and
 * online. For more general interface, see alloc_pages_node().
 */
static inline struct page *
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
    VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
    VM_WARN_ON(!node_online(nid));

    return __alloc_pages(gfp_mask, order, nid);
}

/*
 * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
 * prefer the current CPU's closest node. Otherwise node must be valid and
 * online.
 */
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                        unsigned int order)
{
    if (nid == NUMA_NO_NODE)
        nid = numa_mem_id();

    return __alloc_pages_node(nid, gfp_mask, order);
}

#ifdef CONFIG_NUMA
extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);

static inline struct page *
alloc_pages(gfp_t gfp_mask, unsigned int order)
{
    return alloc_pages_current(gfp_mask, order);
}
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
            struct vm_area_struct *vma, unsigned long addr,
            int node, bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order)	\
    alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
#else
#define alloc_pages(gfp_mask, order) \
        alloc_pages_node(numa_node_id(), gfp_mask, order)
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
    alloc_pages(gfp_mask, order)
#define alloc_hugepage_vma(gfp_mask, vma, addr, order)	\
    alloc_pages(gfp_mask, order)
#endif

#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_page_vma(gfp_mask, vma, addr)			\
    alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
#define alloc_page_vma_node(gfp_mask, vma, addr, node)		\
    alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)

extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);

#define __get_free_page(gfp_mask) \
        __get_free_pages((gfp_mask), 0)

#define __get_dma_pages(gfp_mask, order) \
        __get_free_pages((gfp_mask) | GFP_DMA, (order))
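Before diving into the internals, here is a minimal usage sketch of these interfaces as a driver might call them. It is not from the kernel source or from this article; the demo_buddy_alloc() name and the order-2 request are purely illustrative:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>

/* Illustrative only: allocate 4 physically contiguous pages (order 2),
 * touch them through their kernel virtual address, then free them. */
static int demo_buddy_alloc(void)
{
    struct page *page;
    void *vaddr;
    unsigned long buf;

    /* struct page based interface */
    page = alloc_pages(GFP_KERNEL, 2);      /* 2^2 = 4 contiguous pages */
    if (!page)
        return -ENOMEM;
    vaddr = page_address(page);             /* kernel virtual address of the block */
    memset(vaddr, 0, 4 * PAGE_SIZE);
    __free_pages(page, 2);                  /* order must match the allocation */

    /* virtual-address based interface, built on the same allocation path */
    buf = __get_free_page(GFP_KERNEL);      /* one page, returned as an address */
    if (!buf)
        return -ENOMEM;
    free_page(buf);

    return 0;
}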

All of these interfaces ultimately call the same function, __alloc_pages_nodemask(), whose main logic can be summarized as:

  • First, try to allocate from the free lists of a NUMA node/zone that satisfies the request; this is called the fast allocation path.
  • If the fast path fails, try memory reclaim and then memory compaction before allocating again; this is called the slow allocation path.

Now for the details:

/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
                            nodemask_t *nodemask)
{
    struct page *page;
    unsigned int alloc_flags = ALLOC_WMARK_LOW; /* by default, reclaim is triggered at WMARK_LOW */
    gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
    struct alloc_context ac = { };

    ...

    /*
     * Prepare for the allocation:
     * set up the allocation context (alloc_context), the allocation mask, etc.
     */
    gfp_mask &= gfp_allowed_mask;
    alloc_mask = gfp_mask;
    if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
        return NULL;

    finalise_ac(gfp_mask, order, &ac);

    /* First allocation attempt */
    /*
     * Fast allocation path:
     * first try to allocate from the currently free pages of a NUMA node/zone
     * that matches the allocation constraints.
     */
    page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
    if (likely(page)) /* success: the currently free pages satisfied the request */
        goto out;

    /*
     * The currently free pages cannot satisfy the request; memory
     * compaction and reclaim (swap) work is needed before the
     * allocation can be retried.
     */

    /*
     * Apply scoped allocation constraints. This is mainly about GFP_NOFS
     * resp. GFP_NOIO which has to be inherited for all allocation requests
     * from a particular context which has been marked by
     * memalloc_no{fs,io}_{save,restore}.
     */
    alloc_mask = current_gfp_context(gfp_mask);
    ac.spread_dirty_pages = false;

    /*
     * Restore the original nodemask if it was potentially replaced with
     * &cpuset_current_mems_allowed to optimize the fast-path attempt.
     */
    if (unlikely(ac.nodemask != nodemask))
        ac.nodemask = nodemask;

    /*
     * Slow allocation path:
     * the fast path failed; memory compaction and reclaim may be needed
     * to satisfy the request.
     */
    page = __alloc_pages_slowpath(alloc_mask, order, &ac);

    ...

out:
    ...
    return page;
}
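The allocation parameters that both paths pass around live in struct alloc_context. For reference, on the 4.x kernels this walkthrough is based on it looks roughly like this (the exact field set may differ slightly between versions):

/* mm/internal.h (roughly, 4.x kernels) */
struct alloc_context {
    struct zonelist *zonelist;          /* zones to try, in preference order */
    nodemask_t *nodemask;               /* allowed NUMA nodes; NULL means all */
    struct zoneref *preferred_zoneref;  /* first zone in the zonelist to try */
    int migratetype;                    /* free-list type derived from gfp_mask */
    enum zone_type high_zoneidx;        /* highest zone type usable for this request */
    bool spread_dirty_pages;            /* spread __GFP_WRITE allocations across nodes */
};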

2.1 The fast allocation path

Let's look at the fast allocation path first. It splits into two cases:

  • Single-page allocations are served from the per-CPU PCP free lists (see the per_cpu_pages sketch below).
  • Multi-page allocations are served from the free lists of a NUMA node/zone that satisfies the request.
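The PCP lists mentioned in the first case are small per-CPU caches of order-0 pages kept in struct per_cpu_pages; roughly (4.x kernels, simplified):

/* include/linux/mmzone.h (roughly, 4.x kernels) */
struct per_cpu_pages {
    int count;      /* number of pages in the lists */
    int high;       /* high watermark, emptying needed */
    int batch;      /* chunk size for buddy add/remove */

    /* Lists of pages, one per migrate type stored on the pcp-lists */
    struct list_head lists[MIGRATE_PCPTYPES];
};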

The details:

static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                        const struct alloc_context *ac)
{
    struct zoneref *z = ac->preferred_zoneref;
    struct zone *zone;
    struct pglist_data *last_pgdat_dirty_limit = NULL;

    /*
     * Scan zonelist, looking for a zone with enough free.
     * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
     */
    for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                                ac->nodemask) {
        struct page *page;
        unsigned long mark;

        ...

        /* check whether @zone's free page count has dropped to the watermark selected by @alloc_flags */
        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
        if (!zone_watermark_fast(zone, order, mark,
                       ac_classzone_idx(ac), alloc_flags)) {
            /* @zone is below the watermark; on NUMA, try to reclaim pages from @zone */
            int ret;

            ...

            /*
             * The request does not require a watermark check on @zone:
             * attempt the allocation even though the zone is below the watermark.
             */
            if (alloc_flags & ALLOC_NO_WATERMARKS)
                goto try_this_zone;

            /*
             * Decide whether to reclaim memory from @zone:
             * NUMA distinguishes several cases; UMA never reclaims here.
             */
            if (node_reclaim_mode == 0 ||
                !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
                continue; /* @zone has too few pages and does not allow reclaim; try the next zone (struct zone) */

            /* reclaim from @zone (NUMA only) */
            ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
            ...
        }

try_this_zone:
        /* allocate from @zone's free list for migrate type ac->migratetype */
        page = rmqueue(ac->preferred_zoneref->zone, zone, order,
                gfp_mask, alloc_flags, ac->migratetype);
        if (page) { /* allocation succeeded */
            prep_new_page(page, order, gfp_mask, alloc_flags);

            /*
             * If this is a high-order atomic allocation then check
             * if the pageblock should be reserved for the future
             */
            if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
                reserve_highatomic_pageblock(page, zone, order);

            return page; /* return the allocated page(s) */
        }
    }

    return NULL; /* allocation failed */
}

/*
 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
 */
static inline
struct page *rmqueue(struct zone *preferred_zone,
            struct zone *zone, unsigned int order,
            gfp_t gfp_flags, unsigned int alloc_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;

    if (likely(order == 0)) { /* [single page] allocate from the per-CPU PCP lists */
        page = rmqueue_pcplist(preferred_zone, zone, order,
                gfp_flags, migratetype);
        goto out;
    }

    /* [more than one page] allocate from the zone's free_area lists */
    ...
    spin_lock_irqsave(&zone->lock, flags);

    do {
        page = NULL;
        if (alloc_flags & ALLOC_HARDER) {
            page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
            ...
        }
        if (!page) /* allocation failed; try a fallback allocation */
            page = __rmqueue(zone, order, migratetype);
    } while (page && check_new_pages(page, order));
    spin_unlock(&zone->lock);
    if (!page)
        goto failed;
    ...
    local_irq_restore(flags);

out:
    ...
    return page; /* success: return the allocated page(s) */

failed:
    local_irq_restore(flags);
    return NULL; /* allocation failed */
}

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
/* allocate from the free lists of orders [order, MAX_ORDER) */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                        int migratetype)
{
    unsigned int current_order;
    struct free_area *area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
        area = &(zone->free_area[current_order]);
        page = list_first_entry_or_null(&area->free_list[migratetype],
                            struct page, lru);
        if (!page)
            continue;
        list_del(&page->lru); /* remove the allocated page from the free list */
        rmv_page_order(page);
        area->nr_free--;
        /* when allocating from a higher-order list, return the remainder to the appropriate lower-order lists */
        expand(zone, page, order, current_order, area, migratetype);
        set_pcppage_migratetype(page, migratetype);
        return page;
    }

    return NULL;
}

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
                int migratetype)
{
    struct page *page;

retry:
    page = __rmqueue_smallest(zone, order, migratetype); /* allocate from orders [order, MAX_ORDER) */
    if (unlikely(!page)) { /* failed; try CMA or a migratetype fallback */
        if (migratetype == MIGRATE_MOVABLE) /* try CMA */
            page = __rmqueue_cma_fallback(zone, order);

        /*
         * Fallback allocation:
         * find a suitable fallback migratetype (compatible with the requested
         * one), then retry the allocation.
         */
        if (!page && __rmqueue_fallback(zone, order, migratetype))
            goto retry;
    }

    trace_mm_page_alloc_zone_locked(page, order, migratetype);
    return page;
}

#ifdef CONFIG_CMA
static struct page *__rmqueue_cma_fallback(struct zone *zone,
                    unsigned int order)
{
    return __rmqueue_smallest(zone, order, MIGRATE_CMA);
}
#else
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
                    unsigned int order) { return NULL; }
#endif

/*
 * Try finding a free buddy page on the fallback list and put it on the free
 * list of requested migratetype, possibly along with other pages from the same
 * block, depending on fragmentation avoidance heuristics. Returns true if
 * fallback was found so that __rmqueue_smallest() can grab it.
 *
 * The use of signed ints for order and current_order is a deliberate
 * deviation from the rest of this file, to make the for loop
 * condition simpler.
 */
/*
 * Fallback allocation:
 * find a suitable fallback migratetype (compatible with the requested one),
 * then retry the allocation.
 */
static inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
    struct free_area *area;
    int current_order;
    struct page *page;
    int fallback_mt;
    bool can_steal;

    /*
     * Find the largest available free page in the other list. This roughly
     * approximates finding the pageblock with the most free pages, which
     * would be too costly to do exactly.
     */
    for (current_order = MAX_ORDER - 1; current_order >= order;
                --current_order) {
        area = &(zone->free_area[current_order]);
        fallback_mt = find_suitable_fallback(area, current_order,
                start_migratetype, false, &can_steal);
        if (fallback_mt == -1)
            continue;

        /*
         * We cannot steal all free pages from the pageblock and the
         * requested migratetype is movable. In that case it's better to
         * steal and split the smallest available page instead of the
         * largest available page, because even if the next movable
         * allocation falls back into a different pageblock than this
         * one, it won't cause permanent fragmentation.
         */
        if (!can_steal && start_migratetype == MIGRATE_MOVABLE
                    && current_order > order)
            goto find_smallest;

        goto do_steal;
    }

    return false;

find_smallest:
    for (current_order = order; current_order < MAX_ORDER;
                            current_order++) {
        area = &(zone->free_area[current_order]);
        fallback_mt = find_suitable_fallback(area, current_order,
                start_migratetype, false, &can_steal);
        if (fallback_mt != -1)
            break;
    }

    ...

do_steal:
    page = list_first_entry(&area->free_list[fallback_mt],
                            struct page, lru);

    steal_suitable_fallback(zone, page, start_migratetype, can_steal);

    ...

    return true;
}
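The expand() call in __rmqueue_smallest() is what actually splits a higher-order block: the low half is handed to the caller and each leftover buddy half is pushed back onto the next lower order's free list on the way down. A simplified sketch of that splitting loop (debug/guard-page handling omitted; details vary by kernel version):

/* simplified: split a block of order @high down to order @low,
 * returning the unused buddy halves to the free lists as we go */
static inline void expand(struct zone *zone, struct page *page,
    int low, int high, struct free_area *area, int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) {
        area--;             /* move to the next lower order's free_area */
        high--;
        size >>= 1;         /* the buddy half that stays free */

        list_add(&page[size].lru, &area->free_list[migratetype]);
        area->nr_free++;
        set_page_order(&page[size], high);  /* record the order of the free buddy */
    }
}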

This concludes our look at the fast allocation path; we will not dig any deeper here. The fallback rules are defined by the fallbacks[] array:

/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 */
static int fallbacks[MIGRATE_TYPES][4] = {
    [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
    [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
    [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
    [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
    [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
#endif
};
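find_suitable_fallback(), called by __rmqueue_fallback() above, consumes this table by walking the row for the requested migratetype until it reaches the MIGRATE_TYPES sentinel and returning the first fallback type whose free list is not empty. A simplified sketch of that lookup (the real function also decides whether the whole pageblock may be stolen; details vary by kernel version):

/* simplified: pick the first non-empty fallback free list for @migratetype */
static int find_suitable_fallback(struct free_area *area, unsigned int order,
                  int migratetype, bool only_stealable, bool *can_steal)
{
    int i, fallback_mt;

    if (area->nr_free == 0)
        return -1;

    *can_steal = false;
    for (i = 0;; i++) {
        fallback_mt = fallbacks[migratetype][i];
        if (fallback_mt == MIGRATE_TYPES)   /* sentinel: no fallback left */
            break;

        if (list_empty(&area->free_list[fallback_mt]))
            continue;

        if (can_steal_fallback(order, migratetype))
            *can_steal = true;

        if (!only_stealable || *can_steal)
            return fallback_mt;
    }

    return -1;
}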

2.2 The slow allocation path

The slow allocation path can be summarized as:

  • Adjust the allocation flags and zone list, then try allocating again.
  • If that still fails, perform memory reclaim and then try allocating again.
  • If allocation still fails after reclaim, perform memory compaction (compact) and then try allocating again.
  • If allocation still fails after compaction, trigger the OOM killer and then try allocating again.

The details:

static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                        struct alloc_context *ac)
{
    ...
    if (gfp_mask & __GFP_KSWAPD_RECLAIM)
        wake_all_kswapds(order, ac); /* wake kswapd on every NUMA node to start background reclaim */

    /*
     * The adjusted alloc_flags might result in immediate success, so try
     * that first
     */
    /* retry the allocation with the adjusted allocation flags */
    page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    if (page)
        goto got_pg;

    ...

retry:
    /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
    if (gfp_mask & __GFP_KSWAPD_RECLAIM)
        wake_all_kswapds(order, ac);

    ...

    /* Attempt with potentially adjusted zonelist and alloc_flags */
    /* retry the allocation with the adjusted allocation flags and zone list */
    page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    if (page)
        goto got_pg;

    /* Try direct reclaim and then allocating */
    /*
     * Perform direct reclaim, wait for it to finish, then retry the allocation.
     * Unlike the indirect reclaim done by waking kswapd, this is more
     * deterministic in time.
     */
    page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
                            &did_some_progress);
    if (page)
        goto got_pg;

    /* Try direct compaction and then allocating */
    /* reclaim did not help; try memory compaction, then retry the allocation */
    page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
                    compact_priority, &compact_result);
    if (page)
        goto got_pg;

    ...

    /* Reclaim has failed us, start killing things */
    /* trigger the OOM killer, then retry the allocation */
    page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
    if (page)
        goto got_pg;

    ...

got_pg:
    return page; /* done: return the allocated page(s), or NULL on failure */
}
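wake_all_kswapds(), called at the top of the slow path and again on every retry, simply pokes the kswapd kernel thread of each NUMA node covered by the request so that background reclaim runs in parallel with the retries. Roughly (simplified, 4.x kernels):

/* simplified: wake the kswapd thread of every node reachable through the zonelist */
static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
{
    struct zoneref *z;
    struct zone *zone;
    pg_data_t *last_pgdat = NULL;

    for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
                    ac->high_zoneidx, ac->nodemask) {
        if (last_pgdat != zone->zone_pgdat)     /* wake each node's kswapd only once */
            wakeup_kswapd(zone, order, ac->high_zoneidx);
        last_pgdat = zone->zone_pgdat;
    }
}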

This concludes our analysis of the slow allocation path. The memory reclaim (shrink), memory compaction (compact), and OOM-killer work it may trigger along the way is beyond the scope of this article.

