How Linux initializes the memory-related parameters of NUMA nodes
NUMA memory initialization: setup_memory
unsigned long __init setup_memory(void)
{
    int nid;
    unsigned long bootmap_size, system_start_pfn, system_max_low_pfn;
    unsigned long reserve_pages, pfn;

    /*
     * When mapping a NUMA machine we allocate the node_mem_map arrays
     * from node local memory. They are then mapped directly into KVA
     * between zone normal and vmalloc space. Calculate the size of
     * this space and use it to adjust the boundry between ZONE_NORMAL
     * and ZONE_HIGHMEM.
     */
    get_memcfg_numa();

    /* Fill in the physnode_map */
    for (nid = 0; nid < numnodes; nid++) {
        printk("Node: %d, start_pfn: %ld, end_pfn: %ld\n",
                nid, node_start_pfn[nid], node_end_pfn[nid]);
        printk(" Setting physnode_map array to node %d for pfns:\n ", nid);
        for (pfn = node_start_pfn[nid]; pfn < node_end_pfn[nid];
                pfn += PAGES_PER_ELEMENT) {
            physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
            printk("%ld ", pfn);
        }
        printk("\n");
    }

    reserve_pages = calculate_numa_remap_pages();

    /* partially used pages are not usable - thus round upwards */
    system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);

    find_max_pfn();
    system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages;
    printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n",
            reserve_pages, max_low_pfn + reserve_pages);
    printk("max_pfn = %ld\n", max_pfn);
#ifdef CONFIG_HIGHMEM
    highstart_pfn = highend_pfn = max_pfn;
    if (max_pfn > system_max_low_pfn)
        highstart_pfn = system_max_low_pfn;
    printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
            pages_to_mb(highend_pfn - highstart_pfn));
#endif
    printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
            pages_to_mb(system_max_low_pfn));
    printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
            min_low_pfn, max_low_pfn, highstart_pfn);

    printk("Low memory ends at vaddr %08lx\n",
            (ulong) pfn_to_kaddr(max_low_pfn));
    for (nid = 0; nid < numnodes; nid++) {
        node_remap_start_vaddr[nid] = pfn_to_kaddr(
            (highstart_pfn + reserve_pages) - node_remap_offset[nid]);
        allocate_pgdat(nid);
        printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
            (ulong) node_remap_start_vaddr[nid],
            (ulong) pfn_to_kaddr(highstart_pfn + reserve_pages
                - node_remap_offset[nid] + node_remap_size[nid]));
    }
    printk("High memory starts at vaddr %08lx\n",
            (ulong) pfn_to_kaddr(highstart_pfn));
    vmalloc_earlyreserve = reserve_pages * PAGE_SIZE;
    for (nid = 0; nid < numnodes; nid++)
        find_max_pfn_node(nid);

    NODE_DATA(0)->bdata = &node0_bdata;

    /*
     * Initialize the boot-time allocator (with low memory only):
     */
    bootmap_size = init_bootmem_node(NODE_DATA(0), min_low_pfn, 0, system_max_low_pfn);

    register_bootmem_low_pages(system_max_low_pfn);

    /*
     * Reserve the bootmem bitmap itself as well. We do this in two
     * steps (first step was init_bootmem()) because this catches
     * the (very unlikely) case of us accidentally initializing the
     * bootmem allocator with an invalid RAM area.
     */
    reserve_bootmem_node(NODE_DATA(0), HIGH_MEMORY, (PFN_PHYS(min_low_pfn) +
            bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));

    /*
     * reserve physical page 0 - it's a special BIOS page on many boxes,
     * enabling clean reboots, SMP operation, laptop functions.
     */
    reserve_bootmem_node(NODE_DATA(0), 0, PAGE_SIZE);

    /*
     * But first pinch a few for the stack/trampoline stuff
     * FIXME: Don't need the extra page at 4K, but need to fix
     * trampoline before removing it. (see the GDT stuff)
     */
    reserve_bootmem_node(NODE_DATA(0), PAGE_SIZE, PAGE_SIZE);

    /* reserve EBDA region, it's a 4K region */
    reserve_ebda_region_node();

#ifdef CONFIG_ACPI_SLEEP
    /*
     * Reserve low memory region for sleep support.
     */
    acpi_reserve_bootmem();
#endif

    /*
     * Find and reserve possible boot-time SMP configuration:
     */
    find_smp_config();

#ifdef CONFIG_BLK_DEV_INITRD
    if (LOADER_TYPE && INITRD_START) {
        if (INITRD_START + INITRD_SIZE <= (system_max_low_pfn << PAGE_SHIFT)) {
            reserve_bootmem_node(NODE_DATA(0), INITRD_START, INITRD_SIZE);
            initrd_start =
                INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
            initrd_end = initrd_start+INITRD_SIZE;
        }
        else {
            printk(KERN_ERR "initrd extends beyond end of memory "
                "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
                INITRD_START + INITRD_SIZE,
                system_max_low_pfn << PAGE_SHIFT);
            initrd_start = 0;
        }
    }
#endif
    return system_max_low_pfn;
}
1. Detailed walkthrough of the code
1.1. Variable declarations
int nid;
unsigned long bootmap_size, system_start_pfn, system_max_low_pfn;
unsigned long reserve_pages, pfn;
Variables:
- nid: NUMA node ID
- bootmap_size: size of the bootmem allocator's bitmap
- system_start_pfn: first usable page frame number (pfn) of the system
- system_max_low_pfn: highest low-memory page frame number
- reserve_pages: number of pages reserved for the NUMA remap area
- pfn: page frame number used as the loop variable
1.2. Obtaining the NUMA memory configuration
/*
 * When mapping a NUMA machine we allocate the node_mem_map arrays
 * from node local memory. They are then mapped directly into KVA
 * between zone normal and vmalloc space. Calculate the size of
 * this space and use it to adjust the boundry between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
get_memcfg_numa();
- Reads the NUMA memory configuration of the machine
- Determines the memory range of every node
- Prepares for the zone-boundary adjustment that follows

NUMA specifics:
- On a NUMA machine each node's mem_map array is allocated from that node's local memory
- These arrays are then mapped into a dedicated window of kernel virtual address space (between ZONE_NORMAL and the vmalloc area)
- The size of this window shifts the boundary between ZONE_NORMAL and ZONE_HIGHMEM
1.3. Building the physical-node map
/* Fill in the physnode_map */
for (nid = 0; nid < numnodes; nid++) {
    printk("Node: %d, start_pfn: %ld, end_pfn: %ld\n",
            nid, node_start_pfn[nid], node_end_pfn[nid]);
    printk(" Setting physnode_map array to node %d for pfns:\n ", nid);
    for (pfn = node_start_pfn[nid]; pfn < node_end_pfn[nid];
            pfn += PAGES_PER_ELEMENT) {
        physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
        printk("%ld ", pfn);
    }
    printk("\n");
}
The physical-node map:
- physnode_map: a global array that maps a page frame number to the NUMA node that owns it
- With this table the kernel can quickly tell which node any physical page belongs to

Mapping rule: physnode_map[pfn / PAGES_PER_ELEMENT] = nid
- The node ID is not stored per page (that would waste far too much memory)
- It is stored per block instead: PAGES_PER_ELEMENT pages share one map entry, as the lookup sketch below shows
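To make the lookup concrete, here is a minimal userspace sketch of the same idea. The pfn_to_nid() helper and the constants are illustrative assumptions (on real hardware PAGES_PER_ELEMENT is derived from the maximum supported physical address space), not the kernel's exact definitions:

/*
 * Illustrative sketch of the physnode_map lookup (not kernel code).
 * Assumption: one map entry per 256 MB chunk of physical memory.
 */
#include <stdio.h>

#define PAGES_PER_ELEMENT 65536          /* 256 MB / 4 KB pages per entry  */
#define MAX_ELEMENTS      256            /* covers 64 GB of physical space */

static signed char physnode_map[MAX_ELEMENTS];

/* Same shape as the kernel's pfn_to_nid(): one array index per chunk. */
static int pfn_to_nid(unsigned long pfn)
{
    return physnode_map[pfn / PAGES_PER_ELEMENT];
}

int main(void)
{
    unsigned long pfn;

    /* Pretend node 0 owns the first 512 MB and node 1 the next 512 MB. */
    for (pfn = 0; pfn < 2UL * PAGES_PER_ELEMENT; pfn += PAGES_PER_ELEMENT)
        physnode_map[pfn / PAGES_PER_ELEMENT] = 0;
    for (; pfn < 4UL * PAGES_PER_ELEMENT; pfn += PAGES_PER_ELEMENT)
        physnode_map[pfn / PAGES_PER_ELEMENT] = 1;

    printf("pfn 0x10000 -> node %d\n", pfn_to_nid(0x10000)); /* 256 MB mark */
    printf("pfn 0x30000 -> node %d\n", pfn_to_nid(0x30000)); /* 768 MB mark */
    return 0;
}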
1.4. Computing the pages reserved for NUMA remapping
reserve_pages = calculate_numa_remap_pages();
Computes how many pages must be reserved for the NUMA remap area, i.e. the space needed for each node's page-management structures (mem_map) and node descriptor (pg_data_t).
1.5. Determining the memory boundaries
/* partially used pages are not usable - thus round upwards */
system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);

find_max_pfn();
system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages;
Boundary calculation:
- system_start_pfn: starts right after the end of the initial page tables, rounded up to a page boundary
- find_max_pfn(): finds the highest physical page frame number in the system
- find_max_low_pfn(): finds the highest low-memory page frame number
- Subtracting reserve_pages leaves room at the top of low memory for the NUMA remap area
1.6. Configuring high memory
#ifdef CONFIG_HIGHMEM
    highstart_pfn = highend_pfn = max_pfn;
    if (max_pfn > system_max_low_pfn)
        highstart_pfn = system_max_low_pfn;
    printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
            pages_to_mb(highend_pfn - highstart_pfn));
#endif
High-memory setup:
- highstart_pfn: first page frame of high memory
- highend_pfn: page frame just past the end of high memory
- If physical memory extends beyond the low-memory limit, the excess becomes high memory (see the arithmetic sketch below)
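A hedged numeric illustration of the clamping (all figures are invented; pages_to_mb() is modelled as a shift by 8 because 1 MB equals 256 pages of 4 KB):

/*
 * Illustrative arithmetic only: assume 2 GB of RAM and an 896 MB
 * low-memory limit (already reduced by the NUMA remap reservation).
 */
#include <stdio.h>

int main(void)
{
    unsigned long max_pfn            = 2048UL << 8;  /* 2 GB  = 524288 pages  */
    unsigned long system_max_low_pfn =  896UL << 8;  /* 896 MB = 229376 pages */
    unsigned long highstart_pfn, highend_pfn;

    highstart_pfn = highend_pfn = max_pfn;
    if (max_pfn > system_max_low_pfn)
        highstart_pfn = system_max_low_pfn;

    printf("%luMB HIGHMEM available.\n", (highend_pfn - highstart_pfn) >> 8);
    printf("%luMB LOWMEM available.\n", system_max_low_pfn >> 8);
    return 0;
}

With these numbers the boot log would report 1152 MB of HIGHMEM and 896 MB of LOWMEM.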
1.7. Reporting the configuration
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
        pages_to_mb(system_max_low_pfn));
printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
        min_low_pfn, max_low_pfn, highstart_pfn);
Prints the resulting memory layout, which is useful for debugging and for checking the configuration at boot.
1.8. Computing the per-node remap addresses
printk("Low memory ends at vaddr %08lx\n",
        (ulong) pfn_to_kaddr(max_low_pfn));
for (nid = 0; nid < numnodes; nid++) {
    node_remap_start_vaddr[nid] = pfn_to_kaddr(
        (highstart_pfn + reserve_pages) - node_remap_offset[nid]);
    allocate_pgdat(nid);
    printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
        (ulong) node_remap_start_vaddr[nid],
        (ulong) pfn_to_kaddr(highstart_pfn + reserve_pages
            - node_remap_offset[nid] + node_remap_size[nid]));
}
NUMA remapping:
- Computes each node's remapped virtual address range
- Allocates each node's pg_data_t structure
- Establishes the mapping from node-local memory into kernel virtual address space
1.9. Initializing the bootmem allocator
printk("High memory starts at vaddr %08lx\n",
        (ulong) pfn_to_kaddr(highstart_pfn));
vmalloc_earlyreserve = reserve_pages * PAGE_SIZE;
for (nid = 0; nid < numnodes; nid++)
    find_max_pfn_node(nid);

NODE_DATA(0)->bdata = &node0_bdata;

/*
 * Initialize the boot-time allocator (with low memory only):
 */
bootmap_size = init_bootmem_node(NODE_DATA(0), min_low_pfn, 0, system_max_low_pfn);
bootmem initialization:
- Sets the early vmalloc reservation (vmalloc_earlyreserve) to cover the remap area
- Determines the highest page frame number of every node
- Initializes node 0's bootmem allocator over low memory (a rough estimate of the bitmap size follows below)
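For a sense of scale, here is a rough estimate under the usual "one bit per managed page" bitmap layout; the amount of low memory is an assumption, and the kernel's own rounding may differ slightly:

/* Rough bootmem-bitmap size estimate, not the kernel's exact computation. */
#include <stdio.h>

int main(void)
{
    unsigned long pages     = 896UL << 8;                /* 896 MB of lowmem   */
    unsigned long map_bytes = (pages + 7) / 8;           /* one bit per page   */
    unsigned long map_pages = (map_bytes + 4095) / 4096; /* round up to pages  */

    printf("bitmap: %lu bytes (%lu pages)\n", map_bytes, map_pages);
    return 0;
}

For 896 MB of low memory this works out to 28 KB (seven 4 KB pages), which is part of what the first reserve_bootmem_node() call in step 1.11 protects.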
1.10. Registering the usable low-memory pages
register_bootmem_low_pages(system_max_low_pfn);
Registers all usable low-memory pages as free in node 0's bootmem allocator.
1.11. Reserving critical memory regions
/*
 * Reserve the bootmem bitmap itself as well. We do this in two
 * steps (first step was init_bootmem()) because this catches
 * the (very unlikely) case of us accidentally initializing the
 * bootmem allocator with an invalid RAM area.
 */
reserve_bootmem_node(NODE_DATA(0), HIGH_MEMORY, (PFN_PHYS(min_low_pfn) +
        bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));

/*
 * reserve physical page 0 - it's a special BIOS page on many boxes,
 * enabling clean reboots, SMP operation, laptop functions.
 */
reserve_bootmem_node(NODE_DATA(0), 0, PAGE_SIZE);

/*
 * But first pinch a few for the stack/trampoline stuff
 * FIXME: Don't need the extra page at 4K, but need to fix
 * trampoline before removing it. (see the GDT stuff)
 */
reserve_bootmem_node(NODE_DATA(0), PAGE_SIZE, PAGE_SIZE);

/* reserve EBDA region, it's a 4K region */
reserve_ebda_region_node();

#ifdef CONFIG_ACPI_SLEEP
/*
 * Reserve low memory region for sleep support.
 */
acpi_reserve_bootmem();
#endif

/*
 * Find and reserve possible boot-time SMP configuration:
 */
find_smp_config();
Regions that get reserved:
- The bootmem bitmap itself (the reserved range runs from HIGH_MEMORY up past min_low_pfn, so it also covers the kernel image): prevents the allocator's own data from being overwritten
- Physical page 0: a special BIOS page on many machines, needed for clean reboots, SMP operation and laptop functions
- The trampoline page at 4 KB: startup code needed to bring up the other processors
- The EBDA region: the Extended BIOS Data Area (4 KB)
- The ACPI sleep region: low memory needed for suspend/resume support
- The SMP configuration table: multiprocessor configuration data found by find_smp_config()
1.12. Handling the initial RAM disk
#ifdef CONFIG_BLK_DEV_INITRD
if (LOADER_TYPE && INITRD_START) {
    if (INITRD_START + INITRD_SIZE <= (system_max_low_pfn << PAGE_SHIFT)) {
        reserve_bootmem_node(NODE_DATA(0), INITRD_START, INITRD_SIZE);
        initrd_start =
            INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
        initrd_end = initrd_start+INITRD_SIZE;
    }
    else {
        printk(KERN_ERR "initrd extends beyond end of memory "
            "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
            INITRD_START + INITRD_SIZE,
            system_max_low_pfn << PAGE_SHIFT);
        initrd_start = 0;
    }
}
#endif
Initial RAM disk handling:
- Checks whether the initrd fits inside usable low memory
- If it fits, its region is reserved and the initrd start/end addresses are recorded
- If it does not fit, an error is printed and the initrd is disabled
1.13. Return value
return system_max_low_pfn;
Returns the highest low-memory page frame number, which the rest of the memory initialization builds on.
Handling different kinds of NUMA systems: get_memcfg_numa
static inline void get_memcfg_numa(void)
{
#ifdef CONFIG_X86_NUMAQ
    if (get_memcfg_numaq())
        return;
#elif CONFIG_ACPI_SRAT
    if (get_memcfg_from_srat())
        return;
#endif

    get_memcfg_numa_flat();
}
1. Overview of the configuration options
1.1. What flat mode is for:
- Compatibility: it provides the same interface on systems without real NUMA support, and acts as the fallback when neither NUMA-Q nor ACPI SRAT information is available (a sketch of the flat fallback follows below)
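A minimal userspace model of what the flat fallback amounts to: the whole machine is described as a single node 0. The function body below is an illustration only (in the real kernel, get_memcfg_numa_flat() probes max_pfn itself and marks node 0 online), not a verbatim copy:

/* Userspace model of the flat fallback; variable names mirror the kernel's. */
#include <stdio.h>

#define MAX_NUMNODES 16

static unsigned long node_start_pfn[MAX_NUMNODES];
static unsigned long node_end_pfn[MAX_NUMNODES];
static int numnodes;

static void get_memcfg_numa_flat(unsigned long max_pfn)
{
    node_start_pfn[0] = 0;       /* node 0 starts at physical page 0  */
    node_end_pfn[0]   = max_pfn; /* ...and covers every physical page */
    numnodes = 1;                /* only one node is reported         */
}

int main(void)
{
    get_memcfg_numa_flat(262144); /* assume 1 GB of RAM with 4 KB pages */
    printf("numnodes=%d, node 0 = [%lu, %lu)\n",
           numnodes, node_start_pfn[0], node_end_pfn[0]);
    return 0;
}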
Memory configuration on the NUMA-Q architecture: get_memcfg_numaq
int __init get_memcfg_numaq(void)
{
    smp_dump_qct();
    return 1;
}
static void __init smp_dump_qct(void)
{
    int node;
    struct eachquadmem *eq;
    struct sys_cfg_data *scd =
        (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);

    numnodes = 0;
    for(node = 0; node < MAX_NUMNODES; node++) {
        if(scd->quads_present31_0 & (1 << node)) {
            node_set_online(node);
            numnodes++;
            eq = &scd->eq[node];
            /* Convert to pages */
            node_start_pfn[node] = MB_TO_PAGES(
                eq->hi_shrd_mem_start - eq->priv_mem_size);
            node_end_pfn[node] = MB_TO_PAGES(
                eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
        }
    }
}
1. The get_memcfg_numaq() function
int __init get_memcfg_numaq(void)
{
    smp_dump_qct();
    return 1;
}
- The __init marker means the function is only called during kernel initialization and is freed once boot has finished; it returns an int.
- smp_dump_qct();: calls the internal static function smp_dump_qct(), which does the actual probing of the NUMA node memory configuration.
2. The smp_dump_qct() function
Variable definitions:
- int node;: loop variable used to walk over the NUMA nodes
- struct eachquadmem *eq;: pointer to an eachquadmem structure, which describes the memory of one NUMA node (one "quad")
- struct sys_cfg_data *scd = (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);:
  - SYS_CFG_DATA_PRIV_ADDR: the physical address where the firmware/BIOS stores the system configuration data
  - __va(phys_addr): kernel macro that converts a physical address into a virtual address
  - struct sys_cfg_data: the system configuration structure, containing which nodes are present and how their memory is laid out

Resetting the node count:
- numnodes = 0;: numnodes is the global variable that records how many NUMA nodes actually exist; it is reset to 0 before probing
Walking over the possible NUMA nodes:
- for(node = 0; node < MAX_NUMNODES; node++): loops over every possible node (MAX_NUMNODES is the kernel's compile-time maximum)

Checking whether a node is present:
- if(scd->quads_present31_0 & (1 << node)):
  - scd->quads_present31_0: a bitmask in which each bit says whether the corresponding node exists (bit 0 set means node 0 is present)
  - 1 << node: shifts 1 left by node positions so the test picks out exactly that node's bit

Marking the node online:
- node_set_online(node);: kernel macro that marks node node as online (available) in the kernel's node state map

Updating the node count:
- numnodes++;: the global counter is incremented for every node that is found

Fetching the node's memory descriptor:
- eq = &scd->eq[node];: scd->eq is an array of eachquadmem structures, one per node; this takes a pointer to the current node's entry
Computing the node's first page frame number:
- node_start_pfn[node] = MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);:
  - eq->priv_mem_size: size of the node's private memory
  - eq->hi_shrd_mem_start: start address of the node's high shared memory
  - So the node's memory starts at: high shared memory start - private memory size
  - MB_TO_PAGES(x): macro that converts megabytes into page frames (with 4 KB pages, 1 MB = 256 pages)
  - node_start_pfn[node]: global array holding each node's first page frame number

Computing the node's last page frame number:
- node_end_pfn[node] = MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);:
  - eq->hi_shrd_mem_size: size of the node's high shared memory
  - So the node's memory ends at: high shared memory start + high shared memory size
  - node_end_pfn[node]: global array holding each node's end page frame number (a worked example follows below)
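A worked example with invented numbers; only the formulas come from the code above, the per-quad field values are assumptions:

/*
 * Illustration of the start/end pfn arithmetic in smp_dump_qct().
 * MB_TO_PAGES assumes 4 KB pages, so 1 MB = 256 pages.
 */
#include <stdio.h>

#define MB_TO_PAGES(mb) ((mb) << 8)

int main(void)
{
    unsigned long quads_present = 0x3; /* bits 0 and 1: nodes 0 and 1 exist */
    int node = 1;

    /* Invented values for node 1's quad descriptor. */
    unsigned long hi_shrd_mem_start = 1024; /* MB */
    unsigned long priv_mem_size     = 64;   /* MB */
    unsigned long hi_shrd_mem_size  = 960;  /* MB */

    if (quads_present & (1UL << node)) {
        unsigned long start_pfn = MB_TO_PAGES(hi_shrd_mem_start - priv_mem_size);
        unsigned long end_pfn   = MB_TO_PAGES(hi_shrd_mem_start + hi_shrd_mem_size);

        printf("node %d: pfns [%lu, %lu) = [%lu MB, %lu MB)\n",
               node, start_pfn, end_pfn, start_pfn >> 8, end_pfn >> 8);
    }
    return 0;
}

With these values node 1 spans pfns [245760, 507904), i.e. physical addresses 960 MB to 1984 MB.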
Reserving kernel virtual address space for the node memory maps: calculate_numa_remap_pages
static unsigned long calculate_numa_remap_pages(void)
{
    int nid;
    unsigned long size, reserve_pages = 0;

    for (nid = 1; nid < numnodes; nid++) {
        /* calculate the size of the mem_map needed in bytes */
        size = (node_end_pfn[nid] - node_start_pfn[nid] + 1)
            * sizeof(struct page) + sizeof(pg_data_t);
        /* convert size to large (pmd size) pages, rounding up */
        size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
        /* now the roundup is correct, convert to PAGE_SIZE pages */
        size = size * PTRS_PER_PTE;
        printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
                size, nid);
        node_remap_size[nid] = size;
        reserve_pages += size;
        node_remap_offset[nid] = reserve_pages;
        printk("Shrinking node %d from %ld pages to %ld pages\n",
                nid, node_end_pfn[nid], node_end_pfn[nid] - size);
        node_end_pfn[nid] -= size;
        node_remap_start_pfn[nid] = node_end_pfn[nid];
    }
    printk("Reserving total of %ld pages for numa KVA remap\n",
            reserve_pages);
    return reserve_pages;
}
1. Step-by-step walkthrough
1.1. Variable declarations
int nid;
unsigned long size, reserve_pages = 0;
- nid: node ID used to walk over the NUMA nodes
- size: temporary variable holding the remap size needed by the current node
- reserve_pages: running total of pages to reserve across all nodes, initialized to 0
1.2. The node loop
for (nid = 1; nid < numnodes; nid++) {
- The loop runs from node 1 up to numnodes - 1 (node 0's memory is managed by the boot allocator directly)
- In other words, node 0 does not need its memory map remapped
1.3. Computing the memory-map size in bytes
size = (node_end_pfn[nid] - node_start_pfn[nid] + 1)
    * sizeof(struct page) + sizeof(pg_data_t);
- node_end_pfn[nid] - node_start_pfn[nid] + 1: number of page frames in the node
- * sizeof(struct page): multiplied by the size of struct page, giving the bytes needed for the node's mem_map array
- + sizeof(pg_data_t): plus the size of the node descriptor pg_data_t
- The result is the total number of bytes needed for the node's memory-management structures
1.4. Rounding up to large pages
size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
- LARGE_PAGE_BYTES: the size of one large (PMD-sized) page
- (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES: rounds up to the next whole large page
- This keeps the remap area aligned to large-page boundaries
1.5. Converting back to normal pages
size = size * PTRS_PER_PTE;
- PTRS_PER_PTE: the number of entries in one page table, i.e. how many 4 KB pages one large page covers (1024 on x86 without PAE, 512 with PAE)
- Multiplying converts the large-page count back into ordinary 4 KB pages (a worked example follows below)
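Putting steps 1.3 to 1.5 together with invented numbers (the node size, sizeof(struct page), sizeof(pg_data_t) and the non-PAE large-page parameters are all assumptions chosen for illustration):

/* Worked example of the size calculation in calculate_numa_remap_pages(). */
#include <stdio.h>

int main(void)
{
    unsigned long node_pages       = 262144;        /* 1 GB node, 4 KB pages       */
    unsigned long sizeof_page      = 32;            /* assumed sizeof(struct page) */
    unsigned long sizeof_pgdat     = 3328;          /* assumed sizeof(pg_data_t)   */
    unsigned long LARGE_PAGE_BYTES = 1024UL * 4096; /* 4 MB large page, non-PAE    */
    unsigned long PTRS_PER_PTE     = 1024;          /* 4 KB pages per large page   */
    unsigned long size;

    /* bytes for mem_map plus the node descriptor: 8391968 bytes (~8 MB) */
    size = (node_pages + 1) * sizeof_page + sizeof_pgdat;
    /* round up to whole large pages: 3 large pages */
    size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
    /* convert back to 4 KB pages: 3072 pages */
    size = size * PTRS_PER_PTE;

    printf("reserve %lu pages (%lu MB) of KVA for this node\n", size, size >> 8);
    return 0;
}

So under these assumptions a 1 GB node would cost roughly 12 MB of kernel virtual address space for its remapped mem_map and pg_data_t.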
1.6. Debug output
printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
        size, nid);
- Prints how many pages of KVA (kernel virtual address space) are being reserved for node nid's lmem_map
1.7. Recording the per-node remap information
node_remap_size[nid] = size;
reserve_pages += size;
node_remap_offset[nid] = reserve_pages;
- node_remap_size[nid] = size: records this node's remap size
- reserve_pages += size: accumulates the total number of reserved pages
- node_remap_offset[nid] = reserve_pages: records this node's offset into the remap area (the running total; see the example below)
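For instance, with invented sizes: if node 1 needs 3072 remap pages and node 2 needs 2048, the loop records node_remap_size[1] = 3072 and node_remap_size[2] = 2048, reserve_pages grows 0 -> 3072 -> 5120, and node_remap_offset becomes 3072 for node 1 and 5120 for node 2. setup_memory() later subtracts these offsets from the top of the reserved area (highstart_pfn + reserve_pages) to place each node's remap window.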
1.8. Shrinking the node's memory
printk("Shrinking node %d from %ld pages to %ld pages\n",
        nid, node_end_pfn[nid], node_end_pfn[nid] - size);
node_end_pfn[nid] -= size;
node_remap_start_pfn[nid] = node_end_pfn[nid];
- Prints how the node is shrunk
- node_end_pfn[nid] -= size: lowers the node's end page frame, carving the remap pages out of the top of the node's own memory
- node_remap_start_pfn[nid] = node_end_pfn[nid]: records where that carved-out remap region starts
1.9. End of the loop and the grand total
}
printk("Reserving total of %ld pages for numa KVA remap\n",
        reserve_pages);
return reserve_pages;
- After the loop the total number of reserved pages is printed
- The accumulated total is returned to the caller (setup_memory)
Allocating and initializing the node descriptors: allocate_pgdat
static void __init allocate_pgdat(int nid)
{
    if (nid)
        NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
    else {
        NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
        min_low_pfn += PFN_UP(sizeof(pg_data_t));
        memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
    }
}
1. Step-by-step walkthrough
1.1. Function declaration
static void __init allocate_pgdat(int nid)
- static: the function is only visible inside this file
- void: no return value
- __init: initialization-only code that is freed after boot
- nid: the node ID to allocate a descriptor for
1.2. Handling non-zero nodes (nid > 0)
if (nid)
    NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
- if (nid): checks whether this is a node other than node 0
- NODE_DATA(nid): kernel macro that yields the pg_data_t pointer of node nid
- node_remap_start_vaddr[nid]: the remapped virtual address computed earlier for this node
- The node's pg_data_t structure is placed directly at the start of its pre-reserved remap region
1.3. Handling node 0 (nid == 0): the else branch
else {
    NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
- else: handles the special case of node 0
- __va(): kernel macro that converts a physical address into a virtual address
- min_low_pfn << PAGE_SHIFT:
  - min_low_pfn: the lowest currently free page frame number
  - << PAGE_SHIFT: shifting by the page-size bits (12 for 4 KB pages) turns the page frame number into a byte address
- This yields the physical address for node 0's descriptor, which is then converted to a virtual address
- Node 0's pg_data_t is therefore allocated straight from low memory
1.4. Advancing the low-memory boundary
min_low_pfn += PFN_UP(sizeof(pg_data_t));
- PFN_UP(sizeof(pg_data_t)): converts the size of pg_data_t into a page count, rounded up
- min_low_pfn += ...: advances the lowest free page frame number past the descriptor, marking that memory as used
- This keeps later allocations from overwriting node 0's descriptor (a sketch of PFN_UP follows below)
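PFN_UP rounds a byte address or byte count up to whole 4 KB pages. The macro body below matches the usual kernel definition but is reproduced here purely for illustration:

/* Sketch of PFN_UP with 4 KB pages. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_UP(x)  (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int main(void)
{
    /* A descriptor of, say, 3328 bytes still costs one full 4 KB page. */
    printf("PFN_UP(3328) = %lu\n", PFN_UP(3328UL)); /* 1 */
    printf("PFN_UP(4096) = %lu\n", PFN_UP(4096UL)); /* 1 */
    printf("PFN_UP(4097) = %lu\n", PFN_UP(4097UL)); /* 2 */
    return 0;
}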
1.5. Zero-initializing the descriptor
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
- memset(): fills a memory region with a given byte value
- NODE_DATA(nid): node 0's pg_data_t pointer
- 0: the fill value (everything zeroed)
- sizeof(pg_data_t): the number of bytes to clear
- This initializes every field of node 0's descriptor to zero