start_kernel
目录
[TOC]
主核的执行入口(PC 寄存器的初始值)是编译内核时决定的,运行时由 BIOS 或者 BootLoader 传递给内核。内核的初始入口是kernel_entry
。LA 的 kernel_entry
和 mips 的类似,进行.bss 段的清 0(包括之后加载用户态进程也需要清 0,为什么要清 0?),保存 a0~a3 等操作。之后就进入到第二入口 start_kernel
。
通过 gdb 单步调试看 LA 内核是怎样初始化的。但是遇到一个问题,内核使用 -O2
优化项,在单步调试时很多值都是 optimized out
,同时设置断点也不会顺序执行,是跳着执行的,给阅读代码带来困难。后来请教师兄,这是正常的,start_kernel
部分的代码可以直接看源码,不用单步调试。
start_kernel
start_kernel
的一级节点中,架构相关的重要函数有 setup_arch
, trap_init
, init_IRQ
, time_init
。
start_kernel
| -- local_irq_disable(); // 关中断,中断处理程序没有准备好,通过标志位
| -- boot_cpu_init(); // 如果是 smp,设置启动的 CPU,主核
| -- page_address_init(); // do nothing
| -- pr_notice("%s", linux_banner); // 输出一些 kernel 信息
| -- setup_arch(&command_line); // 下面详细分析
| -- add_device_randomness(command_line, strlen(command_line)); // ?
| -- mm_init_cpumask(&init_mm);
| -- setup_command_line(command_line); // 设置 kernel 启动参数,可写在 grub.cfg 中,由 grub 传递
| -- setup_nr_cpu_ids(); // 获取 cpu_possible_mask 中最大的编号
| -- boot_cpu_hotplug_init(); // 将 boot_once 设为 true
| -- pr_notice("Kernel command line: %s\n", boot_command_line);
| -- jump_label_init(); // 设置一个变量
| -- parse_early_param(); // 早期参数检查
| -- vfs_caches_init_early(); // 初始化 VFS
| -- sort_main_extable();
| -- trap_init(); // 下面详细分析
| -- mm_init(); // Set up kernel memory allocators
| |-- page_ext_init_flatmem(); // la 采用的是 SPARSEMEM,这个函数为空
| | -- mem_init(); // 建立内存分布图,将 BootMem/MemBlock 内存分布图转换为伙伴系统的内存分布图
| | -- free_all_bootmem(); // 释放所有 bootmem 所用的 page
| | -- setup_zero_pages(); // buddy 开始接管所有的 page
| | -- kmem_cache_init(); // SLAB 内存对象管理器初始化 kmem_cache: Slab cache management
| | -- create_boot_cache; // 分配两个 kmem_cache 变量:boot_kmem_cache, boot_kmem_cache_node
| | -- pgtable_init(); // empty
| | -- vmalloc_init(); // 非连续内存区管理器的初始化,将不连续的页面碎片映射到连续的虚拟地址空间
| | -- vmap_block_queue; // 非连续内存块队列管理结构
| | -- vfree_deferred; // 内存延迟释放管理
| | -- ioremap_huge_init(); // empty
| | -- init_espfix_bsp(); // empty
| | -- pti_init(); // empty
|
| -- ftrace_init();
| -- sched_init(); // 调度器初始化,完成后 kernel 可以进行任务调度
| -- preempt_disable(); // 禁止抢占
| -- radix_tree_init();
| -- housekeeping_init();
| -- workqueue_init_early(); // 工作队列初始化
| -- rcu_init();
| -- trace_init(); // Trace events are available after this
| -- context_tracking_init();
| -- early_irq_init();
| -- init_IRQ(); // 下面详细分析
| -- tick_init();
| -- rcu_init_nohz();
| -- init_timers(); // percpu 基本定时器的初始化,并且设置时间软中断的回调函数
| -- hrtimers_init(); // 高分辨率定时器初始化
| -- softirq_init(); // 上下半部中的下半部,属于不紧急,可以延迟完成的中断
| -- timekeeping_init(); // 初始化各种时间相关的变量,维护系统时间
| -- time_init();
| -- perf_event_init(); // kernel 性能剖析工具
| -- profile_init(); // kernel 性能剖析工具
| -- call_function_init();
|
| -- early_boot_irqs_disabled = false;
| -- local_irq_enable(); // 第一阶段完成,开中断
| -- kmem_cache_init_late(); // 需要进一步分析
| -- console_init(); // 控制台初始化,VTConsole, SerialConsole, NetConsole
| -- lockdep_init();
| -- locking_selftest();
| -- mem_encrypt_init(); // This needs to be called before any devices perform DMA
// operations that might use the SWIOTLB bounce buffers
| -- kmemleak_init(); // 内存泄漏扫描器
| -- debug_objects_mem_init();
| -- setup_per_cpu_pageset();
| -- numa_policy_init(); // 内存分配策略初始化
| -- acpi_early_init(); // Initialize ACPICA and populate the ACPI namespace
| -- if (late_time_init)
| | -- late_time_init();
| -- sched_clock_init();
| -- calibrate_delay(); // 设置每个时钟节拍对应的空循环数,用于 delay
| -- pid_idr_init();
| -- anon_vma_init();
| -- thread_stack_cache_init();
| -- cred_init();
| -- fork_init(); // fork 系统调用创建新进程,这里初始化 fork 用到的数据结构
| -- proc_caches_init(); // 应该是进程空间初始化,还调用了 mmap_init
| -- uts_ns_init(); // 基本都是用 kmem_cache_create 分配空间
| -- buffer_init();
| -- key_init();
| -- security_init();
| -- dbg_late_init();
| -- vfs_caches_init(); // 为上面初始化的 dcache 和 inode 分配空间
| -- pagecache_init();
| -- signals_init(); // 初始化信号相关的数据结构
| -- seq_file_init();
| -- proc_root_init();
| -- nsfs_init();
| -- cpuset_init();
| -- cgroup_init(); // Control Group,内核控制资源分配的机制,可以和 namespace 配合使用
| -- taskstats_init_early();
| -- delayacct_init();
| -- check_bugs();
| -- acpi_subsystem_init(); // Finalize the early initialization of ACPI
| -- arch_post_acpi_subsys_init();
| -- sfi_init_late();
| -- if (efi_enabled(EFI_RUNTIME_SERVICES)) {
| -- efi_free_boot_services();
| -- }
| -- rest_init(); // 第三阶段,通过 kernel_thread 创建 1 号进程 kernel_init 和 2 号进程 kthreadd
\
setup_arch
架构相关,代码和 mips 类似,下为代码树展开。
setup_arch
| -- cpu_probe; // 探测 cpu 类型,写入 cputype 中
|
| -- plat_early_init; // 解析 bios 传入的参数
| | -- fw_init_cmdline; // 获取参数
| | -- prom_init_env; // 根据参数设置环境变量
| | -- memblock_and_maxpfn_init // 挂载 memblock
| | -- memblock_add; // loongson_mem_map 和 boot_mem_map 是什么关系
|
| -- init_initrd; // 主要是检查 initrd_start 和 initrd_end 是否正确,将其映射到虚拟地址
|
| -- prom_init; // 初始化 io 空间的基址、ACPI 表、loongarch 使用的 numa 存储等
| | -- set_io_port_base; // 设置 IO 空间的基址
| | -- if(efi_bp){} // efi_bp 是在 prom_init_env 中用 bios 传递的_fw_envp 赋值的
| // 之后进行 ACPI 初始化,主要是初始化各种表
| | -- acpi_table_upgrade; // 通过 CPIO 获取或 bios 收集的数据,对各个表进行初始化
| | -- acpi_boot_table_init;
| | -- acpi_initialize_tables; // Initialize the table manager, get the RSDP and RSDT/XSDT.
| | -- acpi_boot_init; // 主要是解析 MADT
| | -- prom_init_numa_memory;
| | -- numa_mem_init; // 初始化 numa
| | -- numa_default_distance; // 初始化 numa 节点的距离矩阵
| | -- init_node_memblock; // 逐个分析内存分布图并将结果通过 add_mem_region 保存到 loongson_mem_map 中
| | -- loongson_acpi_init; // ACPI 初始化始终是个大问题,需要进一步了解 ACPI 才能看懂
|
| -- cpu_report; // 打印一些初始化后 CPU 的信息
|
| -- arch_mem_init; // 主要是初始化设备树和 bootmem
| | -- plat_mem_setup; // detects the memory configuration and
| // will record detected memory areas using add_memory_region.
| | -- early_init_dt_scan_memory; // 早期读取 bios 传入的信息,最终通过 memblock_add 挂载
| | -- early_init_dt_scan; // 早期初始化设备树
| | -- dt_bootmem_init; // 建立 boot_mem_map 内存映射图,boot_mem_map 主要给 BootMem 内存分配器用,只包含系统内存
| // 这里不是初始化 bootmem 的地方,而只是确定其上下界,
| // 然后通过 memblock_add_range(核心函数)将其挂载
| | -- device_tree_init; // 用 bios 传递的信息初始化设备树节点
| | -- unflatten_and_copy_device_tree;
| | -- early_init_dt_alloc_memory_arch; // 先在初始化好的 bootmem 中分配物理空间
| | -- unflatten_device_tree; // create tree of device_nodes from flat blob
|
| | -- sparse_init; // 初始化稀疏型内存模型
|
| | -- plat_swiotlb_setup; // swiotlb 为软件中转站,用于让任意设备能够对任意内存地址发起 DMA 访问
| // 要保证弱寻址能力的设备能够访问,所以尽早初始化
|
| | -- resource_init; // 在已经初始化的 bootmem 中为 code, date, bss 段分配空间
|
| | -- plat_smp_setup; // smp 是多对称处理器,这里先配置主核,主要是主核编号,核间中断等
|
| | -- prefill_possible_map; // 建立合理的逻辑 CPU 的 possible 值,possible 和 present 的区别是 CPU 物理热拔插,
| // 如果物理上移除一个 CPU,present 就会减 1,默认两者相等
|
| | -- cpu_cache_init; // 三级 cache 初始化,主要是 ways, sets, size
| | -- setup_protection_map; // 建立进程 VMA 权限到页表权限的映射表(为什么是 16 个页表?)
|
| | -- paging_init; // 初始化各个内存页面管理区。设置不同的页面管理区是为访问能力有限的设备服务
| | -- free_area_init_nodes; // Initialise all pg_data_t and zone data, the start_pfn, end_pfn.
\
cpu_probe
void cpu_probe(void) // probe CPU type, LOONGARCH's processor_id should be 0
{
struct cpuinfo_loongarch *c = ¤t_cpu_data; // current_cpu_data指向当前cpu信息
unsigned int cpu = smp_processor_id(); // 获取当前cpu编号
/*
* Set a default elf platform, cpu probe may later
* overwrite it with a more precise value
*/
set_elf_platform(cpu, "loongarch");
c->cputype = CPU_UNKNOWN; // 初始化当前cpu的信息
// 有多个CPUCFG,这些CFG是干嘛用的,同时read_cpucfd好像返回的都是0,怎么回事
c->processor_id = read_cpucfg(LOONGARCH_CPUCFG0);
c->fpu_vers = (read_cpucfg(LOONGARCH_CPUCFG2) >> 3) & 0x3;
c->writecombine = _CACHE_SUC;
c->fpu_csr31 = FPU_CSR_RN;
c->fpu_msk31 = FPU_CSR_RSVD | FPU_CSR_ABS2008 | FPU_CSR_NAN2008;
switch (c->processor_id & PRID_COMP_MASK) {
case PRID_COMP_LOONGSON:
cpu_probe_loongson(c, cpu); // 通过这个函数探测CPU类型
break;
}
BUG_ON(!__cpu_family[cpu]);
BUG_ON(c->cputype == CPU_UNKNOWN);
/*
* Platform code can force the cpu type to optimize code
* generation. In that case be sure the cpu type is correctly
* manually setup otherwise it could trigger some nasty bugs.
*/
BUG_ON(current_cpu_type() != c->cputype);
if (loongarch_fpu_disabled)
c->options &= ~LOONGARCH_CPU_FPU;
if (c->options & LOONGARCH_CPU_FPU)
cpu_set_fpu_opts(c);
else
cpu_set_nofpu_opts(c);
if (cpu_has_lsx)
elf_hwcap |= HWCAP_LOONGARCH_LSX;
if (cpu_has_lasx)
elf_hwcap |= HWCAP_LOONGARCH_LASX;
if (cpu_has_lvz && IS_ENABLED(CONFIG_KVM_LOONGARCH_VZ)) {
cpu_probe_lvz(c);
elf_hwcap |= HWCAP_LOONGARCH_LVZ;
}
elf_hwcap |= HWCAP_LOONGARCH_CRC32;
cpu_probe_vmbits(c);
#ifdef CONFIG_64BIT
if (cpu == 0)
__ua_limit = ~((1ull << cpu_vmbits) - 1);
#endif
}
plat_early_init
/*
 * fw_init_cmdline() - capture the firmware-provided command line.
 *
 * The BIOS/BootLoader passes three raw values in fw_arg0..fw_arg2:
 * the argument count, the argument string array and the environment
 * pointer.  The argv entries (skipping argv[0]) are joined with single
 * spaces into arcs_cmdline, bounded by COMMAND_LINE_SIZE.
 */
void __init fw_init_cmdline(void)
{
	int i;

	fw_argc = fw_arg0;		/* number of arguments */
	_fw_argv = (long *)fw_arg1;	/* array of argument strings */
	_fw_envp = (long *)fw_arg2;	/* environment variables */

	arcs_cmdline[0] = '\0';
	for (i = 1; i < fw_argc; i++) {
		strlcat(arcs_cmdline, fw_argv(i), COMMAND_LINE_SIZE);
		if (i < (fw_argc - 1))
			/* separator between arguments, none after the last */
			strlcat(arcs_cmdline, " ", COMMAND_LINE_SIZE);
	}
}
/*
 * prom_init_env() - interpret the firmware environment pointer.
 *
 * _fw_envp (captured in fw_init_cmdline) actually points to the EFI-style
 * boot parameter interface.  This routine records the chip register base
 * addresses, fills in the static parts of loongson_sysconf (IRQ ranges,
 * MSI window, DMA mask, PCIe wake state) and finally walks the extension
 * list the BIOS attached to the boot parameters.
 */
void __init prom_init_env(void)
{
	efi_bp = (struct bootparamsinterface *)_fw_envp;

	/*
	 * Record the per-node register block bases: smp_group, HT control,
	 * chip config and the temperature/frequency control registers.
	 * NOTE(review): the precise role of each register block is not
	 * visible here -- the addresses correspond to the Loongson chip
	 * configuration space; confirm against the chip manual.
	 */
	loongson_regaddr_set(smp_group, 0x800000001fe01000, 16);

	loongson_sysconf.ht_control_base = 0x80000EFDFB000000;

	loongson_regaddr_set(loongson_chipcfg, 0x800000001fe00180, 16);

	loongson_regaddr_set(loongson_chiptemp, 0x800000001fe0019c, 16);
	loongson_regaddr_set(loongson_freqctrl, 0x800000001fe001d0, 16);
	loongson_regaddr_set(loongson_tempinthi, 0x800000001fe01460, 16);
	loongson_regaddr_set(loongson_tempintlo, 0x800000001fe01468, 16);
	loongson_regaddr_set(loongson_tempintsta, 0x800000001fe01470, 16);
	loongson_regaddr_set(loongson_tempintup, 0x800000001fe01478, 16);

	/* Static interrupt and MSI window layout for the platform. */
	loongson_sysconf.io_base_irq = LOONGSON_PCH_IRQ_BASE;
	loongson_sysconf.io_last_irq = LOONGSON_PCH_IRQ_BASE + 256;
	loongson_sysconf.msi_base_irq = LOONGSON_PCI_MSI_IRQ_BASE;
	loongson_sysconf.msi_last_irq = LOONGSON_PCI_MSI_IRQ_BASE + 192;
	loongson_sysconf.msi_address_hi = 0;
	loongson_sysconf.msi_address_lo = 0x2FF00000;
	loongson_sysconf.dma_mask_bits = LOONGSON_DMA_MASK_BIT;

	loongson_sysconf.pcie_wake_enabled =
		!(readw(LS7A_PM1_ENA_REG) & ACPI_PCIE_WAKEUP_STATUS);

	/* Parse the BIOS-provided extension list (memory map etc.). */
	if (list_find(efi_bp->extlist))
		printk("Scan bootparm failed\n");
}
这个函数本来以为只是解析 bios 传入的参数,但后来看 bootmem 的过程中发现,bootmem 用的是 memblock 实现的,不是之前的位图,所以对这个函数详细分析。
重要的数据结构:
// 这个应该就是bootmem的数据结构,书上说是用位图的方式,但这里改用 mem_start 和 mem_size 表示内存空间
// 这个是 BIOS 内存分布图,记录了包括 NUMA 节点和多种类型在内的内存信息。
/*
 * BIOS memory map delivered via the boot parameter extension list.
 * Each entry records one region's type (e.g. ADDRESS_TYPE_SYSRAM),
 * physical start address and size; map_count gives the number of
 * valid entries in map[].
 */
struct loongsonlist_mem_map {
	struct _extention_list_hdr header;	/* signature: {"M", "E", "M"} */
	u8 map_count;				/* valid entries in map[] */
	struct _loongson_mem_map {
		u32 mem_type;			/* region type (system RAM, reserved, ...) */
		u64 mem_start;			/* physical start address */
		u64 mem_size;			/* region size in bytes */
	}__attribute__((packed))map[LOONGSON3_BOOT_MEM_MAP_MAX];
}__attribute__((packed));
/*
 * memblock_and_maxpfn_init() - feed the BIOS memory map into memblock.
 *
 * Walks loongson_mem_map, adds every ADDRESS_TYPE_SYSRAM region to the
 * memblock allocator, and tracks the highest page frame number seen in
 * max_low_pfn, which then becomes the memblock allocation limit.
 * NOTE(review): where loongson_mem_map itself is first populated is not
 * visible in this file -- presumably from the BIOS extension list.
 */
void __init memblock_and_maxpfn_init(void)
{
	int i;
	u32 mem_type;
	u64 mem_start, mem_end, mem_size;

	/* parse memory information */
	for (i = 0; i < loongson_mem_map->map_count; i++) {
		mem_type = loongson_mem_map->map[i].mem_type;
		mem_start = loongson_mem_map->map[i].mem_start;
		mem_size = loongson_mem_map->map[i].mem_size;
		mem_end = mem_start + mem_size;

		switch (mem_type) {
		case ADDRESS_TYPE_SYSRAM:
			memblock_add(mem_start, mem_size);	/* register usable RAM */
			if (max_low_pfn < (mem_end >> PAGE_SHIFT))
				max_low_pfn = mem_end >> PAGE_SHIFT;
			break;
		}
	}
	/* Do not let memblock allocate above the highest RAM pfn found. */
	memblock_set_current_limit(PFN_PHYS(max_low_pfn));
}
memblock_add_range
就是 bootmem 的 allocator,初始化过程中,所有的内存挂载,物理页的 reserved,都是通过这个函数进行。
/**
* memblock_add_range - add new memblock region
* @type: memblock type to add new region into
* @base: base address of the new region
* @size: size of the new region
* @nid: nid of the new region
* @flags: flags of the new region
*
* Add new memblock region [@base, @base + @size) into @type. The new region
* is allowed to overlap with existing ones - overlaps don't affect already
* existing regions. @type is guaranteed to be minimal (all neighbouring
* compatible regions are merged) after the addition.
*
* Return:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_add_range(struct memblock_type *type,
				phys_addr_t base, phys_addr_t size,
				int nid, enum memblock_flags flags)
{
	bool insert = false;
	phys_addr_t obase = base;
	/* memblock_cap_size() clamps @size so base + size cannot overflow */
	phys_addr_t end = base + memblock_cap_size(base, &size);
	int idx, nr_new;
	/* each memblock_region describes one contiguous range, not pages */
	struct memblock_region *rgn;

	if (!size)
		return 0;

	/* special case for empty array */
	if (type->regions[0].size == 0) {	/* very first region of this type */
		WARN_ON(type->cnt != 1 || type->total_size);
		type->regions[0].base = base;
		type->regions[0].size = size;
		type->regions[0].flags = flags;
		memblock_set_region_node(&type->regions[0], nid);
		type->total_size = size;
		return 0;
	}
repeat:
	/*
	 * The following is executed twice. Once with %false @insert and
	 * then with %true. The first counts the number of regions needed
	 * to accommodate the new area. The second actually inserts them.
	 */
	base = obase;
	nr_new = 0;

	for_each_memblock_type(idx, type, rgn) {
		phys_addr_t rbase = rgn->base;
		phys_addr_t rend = rbase + rgn->size;	/* regions are address-ordered */

		if (rbase >= end)	/* existing region entirely above new one: done */
			break;
		if (rend <= base)	/* existing region entirely below new one: keep looking */
			continue;
		/*
		 * @rgn overlaps. If it separates the lower part of new
		 * area, insert that portion.
		 */
		if (rbase > base) {
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
			WARN_ON(nid != memblock_get_region_node(rgn));
#endif
			WARN_ON(flags != rgn->flags);
			nr_new++;
			if (insert)
				memblock_insert_region(type, idx++, base,
						       rbase - base, nid,
						       flags);
		}
		/* area below @rend is dealt with, forget about it */
		base = min(rend, end);
	}

	/* insert the remaining portion */
	if (base < end) {
		nr_new++;
		if (insert)
			memblock_insert_region(type, idx, base, end - base,
					       nid, flags);
	}

	if (!nr_new)
		return 0;

	/*
	 * If this was the first round, resize array and repeat for actual
	 * insertions; otherwise, merge and return.
	 */
	if (!insert) {
		while (type->cnt + nr_new > type->max)
			if (memblock_double_array(type, obase, size) < 0)
				return -ENOMEM;
		insert = true;
		goto repeat;
	} else {
		memblock_merge_regions(type);
		return 0;
	}
}
prom_init
/*
 * prom_init() - early firmware/platform initialization.
 *
 * Sets the I/O port base, initializes EFI/ACPI when boot parameters were
 * provided by the BIOS, selects the PCI ops (bare metal vs. virtualized),
 * sets up (NUMA) memory, scans DMI/SMBIOS data and registers the SMP ops.
 */
void __init prom_init(void)
{
	/* init base address of io space */
	set_io_port_base((unsigned long)	/* ioremap() maps the legacy I/O window; */
		/* set_io_port_base() stores the result in loongarch_io_port_base */
		ioremap(LOONGSON_LIO_BASE, LOONGSON_LIO_SIZE));

	if (efi_bp) {	/* efi_bp was set from the BIOS envp in prom_init_env() */
		efi_init();
#if defined(CONFIG_ACPI) && defined(CONFIG_BLK_DEV_INITRD)
		/* ACPI table overrides may be shipped inside the initrd cpio */
		acpi_table_upgrade();
#endif
#ifdef CONFIG_ACPI
		acpi_gbl_use_default_register_widths = false;
		acpi_boot_table_init();
		acpi_boot_init();
#endif

		if (!cpu_has_hypervisor)
			loongarch_pci_ops = &ls7a_pci_ops;
		else
			loongarch_pci_ops = &virt_pci_ops;
	}

	/* Register a default PCH interrupt controller if ACPI found none. */
	if (nr_pch_pics == 0)
		register_pch_pic(0, LS7A_PCH_REG_BASE,
				LOONGSON_PCH_IRQ_BASE);

#ifdef CONFIG_NUMA
	prom_init_numa_memory();
#else
	prom_init_memory();
#endif

	if (efi_bp) {
		dmi_scan_machine();	/* scan SMBIOS/DMI tables */
		if (dmi_available) {
			dmi_set_dump_stack_arch_desc();
			smbios_parse();
		}
	}
	pr_info("The BIOS Version: %s\n", b_info.bios_version);

	efi_runtime_init();

	register_smp_ops(&loongson3_smp_ops);
	loongson_acpi_init();
}
对ACPI进一步分析:
首先分析重要的数据结构 RSDT,RSDT 分为 the header 和 data 两个部分,the header 是所有 SDT 共有的。
/* Common header shared by every ACPI System Description Table (SDT). */
struct acpi_table_header {
	/*
	 * All the ACPI tables have a 4 byte Signature field (except the RSDP
	 * which has an 8 byte one).  Using the signature, you can determine
	 * what table you are working with.
	 */
	char signature[ACPI_NAME_SIZE];	/* ASCII table signature */
	u32 length;		/* Length of table in bytes, including this header */
	u8 revision;		/* ACPI Specification minor version number */
	u8 checksum;		/* To make sum of entire table == 0 */
	char oem_id[ACPI_OEM_ID_SIZE];	/* ASCII OEM identification */
	char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];	/* ASCII OEM table identification */
	u32 oem_revision;	/* OEM revision number */
	char asl_compiler_id[ACPI_NAME_SIZE];	/* ASCII ASL compiler vendor ID */
	u32 asl_compiler_revision;	/* ASL compiler version */
};
这个函数并不是初始化 RSDT 的,而是初始化所有的 ACPI 表。
/*
 * acpi_table_upgrade() - allow ACPI tables to be overridden from the initrd.
 *
 * Scans "kernel/firmware/acpi/" inside the initrd cpio archive for files
 * whose ACPI header signature matches a known table, validates their
 * length and checksum, then copies all accepted tables into one reserved
 * physical range (acpi_tables_addr), mapping chunk by chunk with
 * early_memremap().
 */
void __init acpi_table_upgrade(void)
{
	void *data = (void *)initrd_start;
	size_t size = initrd_end - initrd_start;
	int sig, no, table_nr = 0, total_offset = 0;
	long offset = 0;
	struct acpi_table_header *table;
	char cpio_path[32] = "kernel/firmware/acpi/";	/* search path inside the initrd */
	struct cpio_data file;

	if (data == NULL || size == 0)
		return;

	for (no = 0; no < NR_ACPI_INITRD_TABLES; no++) {
		file = find_cpio_data(cpio_path, data, size, &offset);
		if (!file.data)
			break;

		data += offset;
		size -= offset;

		if (file.size < sizeof(struct acpi_table_header)) {
			pr_err("ACPI OVERRIDE: Table smaller than ACPI header [%s%s]\n",
				cpio_path, file.name);
			continue;
		}

		table = file.data;	/* the file starts with the table header */

		/* Match the 4-byte signature against the known table types. */
		for (sig = 0; table_sigs[sig]; sig++)
			if (!memcmp(table->signature, table_sigs[sig], 4))
				break;

		if (!table_sigs[sig]) {
			pr_err("ACPI OVERRIDE: Unknown signature [%s%s]\n",
				cpio_path, file.name);
			continue;
		}
		if (file.size != table->length) {
			pr_err("ACPI OVERRIDE: File length does not match table length [%s%s]\n",
				cpio_path, file.name);
			continue;
		}
		/*
		 * An 8-bit checksum field of the whole table, inclusive of the
		 * header.  All bytes of the table summed must be equal to 0
		 * (mod 0x100).
		 */
		if (acpi_table_checksum(file.data, table->length)) {
			pr_err("ACPI OVERRIDE: Bad table checksum [%s%s]\n",
				cpio_path, file.name);
			continue;
		}

		pr_info("%4.4s ACPI table found in initrd [%s%s][0x%x]\n",
			table->signature, cpio_path, file.name, table->length);

		all_tables_size += table->length;
		acpi_initrd_files[table_nr].data = file.data;	/* record accepted table */
		acpi_initrd_files[table_nr].size = file.size;
		table_nr++;
	}
	if (table_nr == 0)
		return;

	/* Find one contiguous physical range big enough for all tables. */
	acpi_tables_addr =
		memblock_find_in_range(0, ACPI_TABLE_UPGRADE_MAX_PHYS,
					all_tables_size, PAGE_SIZE);
	if (!acpi_tables_addr) {
		WARN_ON(1);
		return;
	}
	/*
	 * Only calling e820_add_reserve does not work and the
	 * tables are invalid (memory got used) later.
	 * memblock_reserve works as expected and the tables won't get modified.
	 * But it's not enough on X86 because ioremap will
	 * complain later (used by acpi_os_map_memory) that the pages
	 * that should get mapped are not marked "reserved".
	 * Both memblock_reserve and e820__range_add (via arch_reserve_mem_area)
	 * works fine.
	 */
	memblock_reserve(acpi_tables_addr, all_tables_size);	/* keep tables from being reused */
	arch_reserve_mem_area(acpi_tables_addr, all_tables_size);

	/*
	 * early_ioremap only can remap 256k one time. If we map all
	 * tables one time, we will hit the limit. Need to map chunks
	 * one by one during copying the same as that in relocate_initrd().
	 */
	for (no = 0; no < table_nr; no++) {	/* copy each table into the reserved range */
		unsigned char *src_p = acpi_initrd_files[no].data;
		phys_addr_t size = acpi_initrd_files[no].size;
		phys_addr_t dest_addr = acpi_tables_addr + total_offset;
		phys_addr_t slop, clen;
		char *dest_p;

		total_offset += size;
		/* Copy in MAP_CHUNK_SIZE pieces, honouring page alignment. */
		while (size) {
			slop = dest_addr & ~PAGE_MASK;
			clen = size;
			if (clen > MAP_CHUNK_SIZE - slop)
				clen = MAP_CHUNK_SIZE - slop;
			dest_p = early_memremap(dest_addr & PAGE_MASK,
						clen + slop);
			memcpy(dest_p + slop, src_p, clen);
			early_memunmap(dest_p, clen + slop);
			src_p += clen;
			dest_addr += clen;
			size -= clen;
		}
	}
}
按照注释,这个才是获取 RSDT 的,但为什么这里又要初始化一个各种表,和上一个函数有什么区别?
猜想:不是初始化其他表的,而是建立 RSDT 与其他表的关联,因为 RSDT 包含了所有指向其他系统表的指针。
/*******************************************************************************
*
* FUNCTION: acpi_initialize_tables
*
* PARAMETERS: initial_table_array - Pointer to an array of pre-allocated
* struct acpi_table_desc structures. If NULL, the
* array is dynamically allocated.
* initial_table_count - Size of initial_table_array, in number of
* struct acpi_table_desc structures
* allow_resize - Flag to tell Table Manager if resize of
* pre-allocated array is allowed. Ignored
* if initial_table_array is NULL.
*
* RETURN: Status
*
* DESCRIPTION: Initialize the table manager, get the RSDP and RSDT/XSDT.
*
* NOTE: Allows static allocation of the initial table array in order
* to avoid the use of dynamic memory in confined environments
* such as the kernel boot sequence where it may not be available.
*
* If the host OS memory managers are initialized, use NULL for
* initial_table_array, and the table will be dynamically allocated.
*
******************************************************************************/
/* See the ACPICA function header comment above for the full contract. */
acpi_status ACPI_INIT_FUNCTION
acpi_initialize_tables(struct acpi_table_desc *initial_table_array,
			u32 initial_table_count, u8 allow_resize)
{
	acpi_physical_address rsdp_address;
	acpi_status status;

	ACPI_FUNCTION_TRACE(acpi_initialize_tables);

	/*
	 * Setup the Root Table Array and allocate the table array
	 * if requested
	 */
	if (!initial_table_array) {
		status = acpi_allocate_root_table(initial_table_count);
		if (ACPI_FAILURE(status)) {
			return_ACPI_STATUS(status);
		}
	} else {
		/* Root Table Array has been statically allocated by the host */
		memset(initial_table_array, 0,
			(acpi_size)initial_table_count *
			sizeof(struct acpi_table_desc));

		acpi_gbl_root_table_list.tables = initial_table_array;
		acpi_gbl_root_table_list.max_table_count = initial_table_count;
		acpi_gbl_root_table_list.flags = ACPI_ROOT_ORIGIN_UNKNOWN;
		if (allow_resize) {
			acpi_gbl_root_table_list.flags |=
				ACPI_ROOT_ALLOW_RESIZE;
		}
	}

	/* Get the address of the RSDP */
	rsdp_address = acpi_os_get_root_pointer();
	if (!rsdp_address) {
		return_ACPI_STATUS(AE_NOT_FOUND);
	}

	/*
	 * Get the root table (RSDT or XSDT) and extract all entries to the local
	 * Root Table Array. This array contains the information of the RSDT/XSDT
	 * in a common, more useable format.
	 */
	status = acpi_tb_parse_root_table(rsdp_address);
	return_ACPI_STATUS(status);
}
/*
 * numa_mem_init() - discover and initialize the NUMA topology.
 * @init_func: firmware parser (e.g. SRAT/SLIT via ACPI) expected to fill
 *             numa_nodes_parsed; a negative return aborts NUMA setup.
 *
 * Return: 0 on success, a negative error when firmware data is missing
 * or does not cover the detected memory.
 */
static int __init numa_mem_init(int (*init_func)(void))
{
	int i;
	int ret;
	int node;

	for (i = 0; i < CONFIG_NR_CPUS; i++)
		set_cpuid_to_node(i, NUMA_NO_NODE);

	/* Start from a clean slate before parsing firmware data. */
	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	numa_default_distance();	/* default node distance matrix */

	/* Parse SRAT and SLIT if provided by firmware. */
	ret = init_func();
	if (ret < 0)
		return ret;

	node_possible_map = numa_nodes_parsed;
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	/* Walk the memory map; regions are recorded via add_mem_region(). */
	init_node_memblock();
	if (numa_meminfo_cover_memory(&numa_meminfo) == false)
		return -EINVAL;

	/* Bring each detected node online and bind its CPU mask. */
	for_each_node_mask(node, node_possible_map) {
		node_mem_init(node);
		node_set_online(node);
		__node_data[(node)]->cpumask = cpus_on_node[node];
	}
	max_low_pfn = PHYS_PFN(memblock_end_of_DRAM());
	return 0;
}
arch_mem_init
/*
* arch_mem_init - initialize memory management subsystem
*
* o plat_mem_setup() detects the memory configuration and will record detected
* memory areas using add_memory_region.
*
* At this stage the memory configuration of the system is known to the
* kernel but generic memory management system is still entirely uninitialized.
*
* o bootmem_init()
* o sparse_init()
* o paging_init()
* o dma_contiguous_reserve()
*
* At this stage the bootmem allocator is ready to use.
*
* NOTE: historically plat_mem_setup did the entire platform initialization.
* This was rather impractical because it meant plat_mem_setup had to
* get away without any kind of memory allocator. To keep old code from
* breaking plat_setup was just renamed to plat_mem_setup and a second platform
* initialization hook for anything else was introduced.
*/
static void __init arch_mem_init(char **cmdline_p)
{
	unsigned int node;
	unsigned long start_pfn, end_pfn;
	struct memblock_region *reg;
	extern void plat_mem_setup(void);
#ifdef CONFIG_MACH_LOONGSON64
	bool enable;
#endif

	/* call board setup routine */
	/* plat_mem_setup() also sets up the dummy console and performs the
	 * early FDT scan (early_init_dt_scan_nodes). */
	plat_mem_setup();
	memblock_set_bottom_up(true);

	early_init_fdt_reserve_self();
	early_init_fdt_scan_reserved_mem();

	if (loongson_fdt_blob)
		/* Attach the boot_mem_map ranges to memblock.  NOTE(review):
		 * where boot_mem_map itself is first populated is not visible
		 * here -- confirm in the platform code. */
		dt_bootmem_init();
	else
		bootmem_init();

	/*
	 * Prevent memblock from allocating high memory.
	 * This cannot be done before max_low_pfn is detected, so up
	 * to this point is possible to only reserve physical memory
	 * with memblock_reserve; memblock_virt_alloc* can be used
	 * only after this point
	 */
	memblock_set_current_limit(PFN_PHYS(max_low_pfn));
#ifdef CONFIG_PROC_VMCORE
	if (setup_elfcorehdr && setup_elfcorehdr_size) {
		printk(KERN_INFO "kdump reserved memory at %lx-%lx\n",
			setup_elfcorehdr, setup_elfcorehdr_size);
		memblock_reserve(setup_elfcorehdr, setup_elfcorehdr_size);
	}
#endif

	loongarch_parse_crashkernel();
#ifdef CONFIG_KEXEC
	if (crashk_res.start != crashk_res.end)
		memblock_reserve(crashk_res.start,
				crashk_res.end - crashk_res.start + 1);
#endif

	for_each_online_node(node) {
		get_pfn_range_for_nid(node, &start_pfn, &end_pfn);
		reserve_crashm_region(node, start_pfn, end_pfn);
		reserve_oldmem_region(node, start_pfn, end_pfn);
	}

	device_tree_init();	/* unflatten and parse the device tree */

#ifdef CONFIG_MACH_LOONGSON64
	/* Disable bottom-up allocation around sparse_init(), then restore. */
	enable = memblock_bottom_up();
	memblock_set_bottom_up(false);
#endif
	sparse_init();	/* initialize the SPARSEMEM memory model */
#ifdef CONFIG_MACH_LOONGSON64
	memblock_set_bottom_up(enable);
#endif

	/* swiotlb: software bounce buffers so that devices with limited
	 * DMA addressing can still reach any memory. */
	plat_swiotlb_setup();

	dma_contiguous_reserve(PFN_PHYS(max_low_pfn));

	/* Tell bootmem about cma reserved memblock section */
	for_each_memblock(reserved, reg)
		if (reg->size != 0)
			memblock_reserve(reg->base, reg->size);

	reserve_nosave_region();
}
初始化设备树的过程可以参考专门的设备树分析文章(原文此处为外部链接),分析得很详细。
plat_swiotlb_setup
/*
 * swiotlb_init() - allocate the software I/O TLB bounce buffer.
 * @verbose: passed through to swiotlb_init_with_tbl().
 *
 * The bounce buffer lets devices with limited addressing reach any
 * memory via DMA.  io_tlb_nslabs counts slabs of (1 << IO_TLB_SHIFT)
 * bytes each; if unset, a default total of IO_TLB_DEFAULT_SIZE is used.
 */
void __init
swiotlb_init(int verbose)
{
	size_t default_size = IO_TLB_DEFAULT_SIZE;
	unsigned char *vstart;
	unsigned long bytes;

	if (!io_tlb_nslabs) {	/* not configured: derive from the default size */
		io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
	}
	bytes = io_tlb_nslabs << IO_TLB_SHIFT;

	/* Get IO TLB memory from the low pages */
	vstart = memblock_virt_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE);
	if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
		return;

	/* Allocation or setup failed: release any partial buffer. */
	if (io_tlb_start) {
		memblock_free_early(io_tlb_start,
					PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
		io_tlb_start = 0;
	}
	pr_warn("Cannot allocate buffer");
	no_iotlb_memory = true;
}
plat_smp_setup
LoongArch 也使用 loongson3_smp_setup
进行初始化。
源码分析:
/* SMP operations for Loongson-3: IPI, boot-up and hotplug callbacks. */
const struct plat_smp_ops loongson3_smp_ops = {
	.send_ipi_single = loongson3_send_ipi_single,	/* inter-core IPI to one CPU */
	.send_ipi_mask = loongson3_send_ipi_mask,	/* inter-core IPI to a CPU mask */
	.smp_setup = loongson3_smp_setup,		/* boot-CPU side setup */
	.prepare_cpus = loongson3_prepare_cpus,
	.boot_secondary = loongson3_boot_secondary,	/* start a secondary core */
	.init_secondary = loongson3_init_secondary,
	.smp_finish = loongson3_smp_finish,
#ifdef CONFIG_HOTPLUG_CPU	/* physical CPU hotplug support */
	.cpu_disable = loongson3_cpu_disable,
	.cpu_die = loongson3_cpu_die,
#endif
};
/*
 * loongson3_smp_setup() - boot-CPU side SMP setup.
 *
 * Builds the logical<->physical CPU maps (skipping cores masked out in
 * loongson_sysconf.reserved_cpus_mask), initializes the IPI registers,
 * enables all IPIs on the boot CPU, and records the boot CPU's core,
 * cluster and package numbers.
 */
static void __init loongson3_smp_setup(void)
{
	int i = 0, num = 0; /* i: physical id, num: logical id */

	if (acpi_disabled) {
		init_cpu_possible(cpu_none_mask);

		while (i < MAX_CPUS) {
			if (loongson_sysconf.reserved_cpus_mask & (0x1UL << i)) {
				/* Reserved physical CPU cores */
				__cpu_number_map[i] = -1;
			} else {
				/* map logical id <-> physical id both ways */
				__cpu_number_map[i] = num;
				__cpu_logical_map[num] = i;
				set_cpu_possible(num, true);
				num++;
			}
			i++;
		}
		pr_info("Detected %i available CPU(s)\n", num);

		/* Mark the remaining logical slots as unused. */
		while (num < MAX_CPUS) {
			__cpu_logical_map[num] = -1;
			num++;
		}
	}

	/* Inter-processor interrupt (IPI) register setup. */
	ipi_method_init();

	ipi_set_regs_init();
	ipi_clear_regs_init();
	ipi_status_regs_init();
	ipi_en_regs_init();
	ipi_mailbox_buf_init();

	/* Enable all IPIs on the boot CPU. */
	if (cpu_has_csripi)
		iocsr_writel(0xffffffff, LOONGARCH_IOCSR_IPI_EN);
	else
		xconf_writel(0xffffffff, ipi_en_regs[cpu_logical_map(0)]);

	/* Derive the boot CPU's core, cluster and package numbers. */
	cpu_set_core(&cpu_data[0],
		cpu_logical_map(0) % loongson_sysconf.cores_per_package);
	cpu_set_cluster(&cpu_data[0],
		cpu_logical_map(0) / loongson_sysconf.cores_per_package);
	cpu_data[0].package = cpu_logical_map(0) / loongson_sysconf.cores_per_package;
}
paging_init
/**
 * free_area_init_nodes - initialise all pg_data_t and zone data
 * @max_zone_pfn: array of max PFNs for each zone
 *
 * Records the zone boundaries derived from @max_zone_pfn, determines
 * where ZONE_MOVABLE begins on each node, prints the resulting ranges,
 * and finally calls free_area_init_node() for every online node.
 */
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
	unsigned long start_pfn, end_pfn;
	int i, nid;

	/* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
		sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
		sizeof(arch_zone_highest_possible_pfn));

	start_pfn = find_min_pfn_with_active_regions();

	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;

		/* zones are stacked: each starts where the previous ended */
		end_pfn = max(max_zone_pfn[i], start_pfn);
		arch_zone_lowest_possible_pfn[i] = start_pfn;
		arch_zone_highest_possible_pfn[i] = end_pfn;

		start_pfn = end_pfn;
	}

	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
	find_zone_movable_pfns_for_nodes();

	/* Print out the zone ranges */
	pr_info("Zone ranges:\n");
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		pr_info("  %-8s ", zone_names[i]);
		if (arch_zone_lowest_possible_pfn[i] ==
				arch_zone_highest_possible_pfn[i])
			/* equal boundaries mean the zone is empty */
			pr_cont("empty\n");
		else
			pr_cont("[mem %#018Lx-%#018Lx]\n",
				(u64)arch_zone_lowest_possible_pfn[i]
					<< PAGE_SHIFT,
				((u64)arch_zone_highest_possible_pfn[i]
					<< PAGE_SHIFT) - 1);
	}

	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
	pr_info("Movable zone start for each node\n");
	for (i = 0; i < MAX_NUMNODES; i++) {
		if (zone_movable_pfn[i])
			pr_info("  Node %d: %#018Lx\n", i,
				(u64)zone_movable_pfn[i] << PAGE_SHIFT);
	}

	/* Print out the early node map */
	pr_info("Early memory node ranges\n");
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
			(u64)start_pfn << PAGE_SHIFT,
			((u64)end_pfn << PAGE_SHIFT) - 1);

	/* Initialise every node */
	mminit_verify_pageflags_layout();
	setup_nr_node_ids();
	zero_resv_unavail();
	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);
		free_area_init_node(nid, NULL,
				find_min_pfn_for_node(nid), NULL);

		/* Any memory on that node */
		if (pgdat->node_present_pages)
			node_set_state(nid, N_MEMORY);
		check_for_memory(pgdat, nid);
	}
}
trap_init
异常初始化。
trap_init
| -- set_handle // 将不同的trap_handler加载到对应的内存位置
| | -- memcpy // 每个handler大小为vec_size,所以要EXCCODE * vec_size
|
| -- cache_error_setup // Install uncached CPU exception handler
| | -- set_merr_handler // except_vec_cex就是cache exception handler
\
还有很多异常,如 PSI, HYP, GCM 等,为什么没有设置 handler?
应该是 set_handler
只负责设置 cpu exception handler,其他的都是由中断控制器接受中断,然后发送给 CPU。
/*
 * trap_init() - install the CPU exception vectors.
 *
 * First fills all 64 vector slots with handle_reserved, then installs the
 * real handler for each known exception code, installs the uncached cache
 * error handler, flushes the icache over the vector area, and sorts the
 * data-bus-error exception table.
 */
void __init trap_init(void)
{
	unsigned long i;

	/*
	 * Initialise exception handlers
	 */
	for (i = 0; i < 64; i++)
		set_handler(i * vec_size, handle_reserved, vec_size);

	set_handler(EXCCODE_TLBL * vec_size, handle_tlbl, vec_size);	/* TLB miss on a load */
	set_handler(EXCCODE_TLBS * vec_size, handle_tlbs, vec_size);	/* TLB miss on a store */
	set_handler(EXCCODE_TLBI * vec_size, handle_tlbl, vec_size);	/* TLB miss on an instruction fetch */
	set_handler(EXCCODE_TLBM * vec_size, handle_tlbm, vec_size);	/* TLB modified fault */
	set_handler(EXCCODE_TLBRI * vec_size, tlb_do_page_fault_rixi, vec_size);	/* TLB Read-Inhibit exception */
	set_handler(EXCCODE_TLBXI * vec_size, tlb_do_page_fault_rixi, vec_size);	/* TLB Execution-Inhibit exception */
	set_handler(EXCCODE_ADE * vec_size, handle_ade, vec_size);	/* Address Error */
	set_handler(EXCCODE_UNALIGN * vec_size, handle_unalign, vec_size);	/* Unaligned Access */
	set_handler(EXCCODE_SYS * vec_size, handle_sys_wrap, vec_size);	/* System call */
	set_handler(EXCCODE_BP * vec_size, handle_bp, vec_size);	/* Breakpoint */
	set_handler(EXCCODE_INE * vec_size, handle_ri, vec_size);	/* Inst. Not Exist */
	set_handler(EXCCODE_IPE * vec_size, handle_ri, vec_size);	/* Inst. Privileged Error */
	set_handler(EXCCODE_FPDIS * vec_size, handle_fpdis, vec_size);	/* FPU Disabled */
	set_handler(EXCCODE_LSXDIS * vec_size, handle_lsx, vec_size);	/* LSX Disabled */
	set_handler(EXCCODE_LASXDIS * vec_size, handle_lasx, vec_size);	/* LASX Disabled */
	set_handler(EXCCODE_FPE * vec_size, handle_fpe, vec_size);	/* Floating Point Exception */
	set_handler(EXCCODE_BTDIS * vec_size, handle_lbt, vec_size);	/* Binary Trans. Disabled */
	set_handler(EXCCODE_WATCH * vec_size, handle_watch, vec_size);	/* Watch address reference */

	cache_error_setup();	/* Install uncached CPU exception handler */

	local_flush_icache_range(ebase, ebase + 0x400);

	sort_extable(__start___dbe_table, __stop___dbe_table);
}
/* Install CPU exception handler */
/*
 * Install a CPU exception handler at ebase + offset: copy the handler
 * code into the vector area, then flush the icache over that range so
 * the CPU fetches the fresh instructions.
 */
void set_handler(unsigned long offset, void *addr, unsigned long size)
{
	unsigned long dst = ebase + offset;

	memcpy((void *)dst, addr, size);
	local_flush_icache_range(dst, dst + size);
}
/*
* Install uncached CPU exception handler.
* This is suitable only for the cache error exception which is the only
* exception handler that is being run uncached.
*/
/*
 * Install the uncached cache-error exception handler.  The handler code
 * is copied to the uncached alias of merror_ebase, because the cache
 * error exception must run uncached.  A NULL handler is a fatal bug.
 */
void set_merr_handler(unsigned long offset, void *addr, unsigned long size)
{
	unsigned long dst = TO_UNCAC(__pa(merror_ebase)) + offset;

	if (!addr)
		panic(panic_null_cerr);

	memcpy((void *)dst, addr, size);
}
init_IRQ
init_IRQ
| -- irq_set_noprobe;
|
| -- arch_init_irq;
| | -- setup_IRQ;
|
在 init_IRQ
之前还有一个函数——early_irq_init
,用于初始化中断描述符——irq_desc
,irq_desc
中包含了每个中断号(IRQ)的芯片数据 irq_data
和中断处理程序 irqaction
等信息。该函数只是设置默认信息,体系相关的设置有 init_IRQ
完成。
/*
 * early_irq_init() - set default state for every statically allocated
 * irq_desc before the architecture-specific setup (init_IRQ) runs.
 *
 * Return: the value of arch_early_irq_init().
 */
int __init early_irq_init(void)
{
	int count, i, node = first_online_node;
	struct irq_desc *desc;

	init_irq_default_affinity();

	printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS);

	desc = irq_desc;
	count = ARRAY_SIZE(irq_desc);

	for (i = 0; i < count; i++) {	/* per-descriptor defaults */
		desc[i].kstat_irqs = alloc_percpu(unsigned int);
		alloc_masks(&desc[i], node);
		raw_spin_lock_init(&desc[i].lock);
		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
		mutex_init(&desc[i].request_mutex);
		desc_set_defaults(i, &desc[i], node, NULL, NULL);
	}
	return arch_early_irq_init();
}