[PATCH] Reduce per_cpu allocations to the minimum needed for boot -V5.

From: Robin Holt <holt_at_sgi.com>
Date: 2008-02-12 05:09:02
This attached patch significantly shrinks boot memory allocation on ia64.
It does this by not allocating per_cpu areas for cpus that can never
exist.

In the case where acpi does not have any numa node description of the
cpus, I defaulted to assigning the first 32 round-robin on the known
nodes.  For the !CONFIG_ACPI case, I used for_each_possible_cpu().


Signed-off-by: Robin Holt <holt@sgi.com>

---

I tested all the different config options.  allyesconfig fails with
or without this patch so that was the one exception.  Otherwise,
allnoconfig, allmodconfig, defconfig, and configs/* all compiled.
Additionally, I booted the sn2- and defconfig both on altix and the
defconfig on a zx2000 with 2 cpus.  I would like it if somebody with
access to a simulator could build and boot this.  That is a different
code path which I have no means of checking.

Version 5:

I went too quickly.  Shortly after I sent the last email, I got a reply
from HP saying 16 was their largest non-numa box.  I will therefore go
back to the 32 Tony and I discussed last Friday.

Version 4:

Changed the reservation of additional per_cpu space to round-robin on
the known nodes.

Cleaned up a couple of other loops to use for_each_possible_early_cpu().

Changed the default number of cpus to 256 and also changed the lower
threshold to only apply when no early boot cpus are found.  This change
was prompted by a note from HP that they support 256 cpus.  They did
mention this is on a NUMA box, but I have not currently received a reply
as to whether the cpu locations are described in the ACPI tables.

Version 3:

I reworked this patch to use a cpumask to track the cpus we have seen.
It still initializes the .nid to NUMA_NO_NODE (-1).  The introduction of
a bitmask makes the scans much cleaner.

This patch could be using the cpu_possible_map instead of our own.
I was reluctant to do that, but there is nothing that prevents it.
Does anybody have an opinion?


Version 2 fixed a port bug.  It also introduces NUMA_NO_NODE for ia64.
This is a direct copy from x86.

One comment I have received is the hard-coded 4 described above should
probably be 8 or 16 to handle larger non-NUMA machines.  I originally
set it to 4 because my recollection was that, at most, you could have
four processors per FSB, but maybe that is just an SGI limitation.

How should this be set?  Should I be using a PAL call? processor model?
Limit by current FSB spec and adjust as new processors come along?


Using a patched SuSE SLES10 kernel with both the mca patch that Jack/Russ
submitted a couple days ago and the attached.

On a 2 cpu, 6GB system, NR_CPUS=4096:
Before the patch:
Memory: 5687728k/6234784k available (5777k code, 579632k reserved, 10450k data,
672k init)
After both patches:
Memory: 6211984k/6235040k available (5552k code, 55376k reserved, 10418k data, 656k init)
90% savings on reserved.

On a 1 cpu, 1GB system, NR_CPUS=4096 before 572,464K, after 37,456k for
a 93% savings.


Index: per_cpu_v4/arch/ia64/kernel/setup.c
===================================================================
--- per_cpu_v4.orig/arch/ia64/kernel/setup.c	2008-02-11 06:22:41.586019474 -0600
+++ per_cpu_v4/arch/ia64/kernel/setup.c	2008-02-11 12:05:29.030432470 -0600
@@ -45,6 +45,7 @@
 #include <linux/cpufreq.h>
 #include <linux/kexec.h>
 #include <linux/crash_dump.h>
+#include <linux/numa.h>
 
 #include <asm/ia32.h>
 #include <asm/machvec.h>
@@ -494,9 +495,12 @@ setup_arch (char **cmdline_p)
 # ifdef CONFIG_ACPI_NUMA
 	acpi_numa_init();
 # endif
+	per_cpu_scan_finalize((cpus_weight(early_cpu_possible_map) == 0 ?
+		32 : cpus_weight(early_cpu_possible_map)), additional_cpus);
 #else
 # ifdef CONFIG_SMP
 	smp_build_cpu_map();	/* happens, e.g., with the Ski simulator */
+	per_cpu_scan_finalize(num_possible_cpus(), additional_cpus);
 # endif
 #endif /* CONFIG_APCI_BOOT */
 
Index: per_cpu_v4/arch/ia64/mm/discontig.c
===================================================================
--- per_cpu_v4.orig/arch/ia64/mm/discontig.c	2008-02-11 06:22:41.610022488 -0600
+++ per_cpu_v4/arch/ia64/mm/discontig.c	2008-02-11 06:24:46.513705386 -0600
@@ -104,7 +104,7 @@ static int __meminit early_nr_cpus_node(
 {
 	int cpu, n = 0;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for_each_possible_early_cpu(cpu)
 		if (node == node_cpuid[cpu].nid)
 			n++;
 
@@ -142,7 +142,7 @@ static void *per_cpu_node_setup(void *cp
 #ifdef CONFIG_SMP
 	int cpu;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+	for_each_possible_early_cpu(cpu) {
 		if (node == node_cpuid[cpu].nid) {
 			memcpy(__va(cpu_data), __phys_per_cpu_start,
 			       __per_cpu_end - __per_cpu_start);
@@ -345,7 +345,7 @@ static void __init initialize_pernode_da
 
 #ifdef CONFIG_SMP
 	/* Set the node_data pointer for each per-cpu struct */
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+	for_each_possible_early_cpu(cpu) {
 		node = node_cpuid[cpu].nid;
 		per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data;
 	}
@@ -493,13 +493,9 @@ void __cpuinit *per_cpu_init(void)
 	int cpu;
 	static int first_time = 1;
 
-
-	if (smp_processor_id() != 0)
-		return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
-
 	if (first_time) {
 		first_time = 0;
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
+		for_each_possible_early_cpu(cpu)
 			per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
 	}
 
Index: per_cpu_v4/arch/ia64/kernel/acpi.c
===================================================================
--- per_cpu_v4.orig/arch/ia64/kernel/acpi.c	2008-02-11 06:22:41.538013446 -0600
+++ per_cpu_v4/arch/ia64/kernel/acpi.c	2008-02-11 09:10:49.016485958 -0600
@@ -482,6 +482,7 @@ acpi_numa_processor_affinity_init(struct
 	    (pa->apic_id << 8) | (pa->local_sapic_eid);
 	/* nid should be overridden as logical node id later */
 	node_cpuid[srat_num_cpus].nid = pxm;
+	cpu_set(srat_num_cpus, early_cpu_possible_map);
 	srat_num_cpus++;
 }
 
@@ -559,7 +560,7 @@ void __init acpi_numa_arch_fixup(void)
 	}
 
 	/* set logical node id in cpu structure */
-	for (i = 0; i < srat_num_cpus; i++)
+	for_each_possible_early_cpu(i)
 		node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid);
 
 	printk(KERN_INFO "Number of logical nodes in system = %d\n",
Index: per_cpu_v4/arch/ia64/kernel/numa.c
===================================================================
--- per_cpu_v4.orig/arch/ia64/kernel/numa.c	2008-02-11 06:22:41.578018469 -0600
+++ per_cpu_v4/arch/ia64/kernel/numa.c	2008-02-11 06:24:46.549709906 -0600
@@ -73,7 +73,7 @@ void __init build_cpu_to_node_map(void)
 	for(node=0; node < MAX_NUMNODES; node++)
 		cpus_clear(node_to_cpu_mask[node]);
 
-	for(cpu = 0; cpu < NR_CPUS; ++cpu) {
+	for_each_possible_early_cpu(cpu) {
 		node = -1;
 		for (i = 0; i < NR_CPUS; ++i)
 			if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) {
Index: per_cpu_v4/include/asm-ia64/acpi.h
===================================================================
--- per_cpu_v4.orig/include/asm-ia64/acpi.h	2008-02-11 06:22:51.167222639 -0600
+++ per_cpu_v4/include/asm-ia64/acpi.h	2008-02-11 06:24:46.569712417 -0600
@@ -115,7 +115,11 @@ extern unsigned int is_cpu_cpei_target(u
 extern void set_cpei_target_cpu(unsigned int cpu);
 extern unsigned int get_cpei_target_cpu(void);
 extern void prefill_possible_map(void);
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
 extern int additional_cpus;
+#else
+#define additional_cpus 0
+#endif
 
 #ifdef CONFIG_ACPI_NUMA
 #if MAX_NUMNODES > 256
Index: per_cpu_v4/include/asm-ia64/numa.h
===================================================================
--- per_cpu_v4.orig/include/asm-ia64/numa.h	2008-02-11 06:22:51.183224648 -0600
+++ per_cpu_v4/include/asm-ia64/numa.h	2008-02-11 11:39:05.266138236 -0600
@@ -22,6 +22,8 @@
 
 #include <asm/mmzone.h>
 
+#define NUMA_NO_NODE	-1
+
 extern u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned;
 extern cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;
 extern pg_data_t *pgdat_list[MAX_NUMNODES];
@@ -68,6 +70,31 @@ extern int paddr_to_nid(unsigned long pa
 extern void map_cpu_to_node(int cpu, int nid);
 extern void unmap_cpu_from_node(int cpu, int nid);
 
+extern cpumask_t early_cpu_possible_map;
+#define for_each_possible_early_cpu(cpu)  \
+	for_each_cpu_mask((cpu), early_cpu_possible_map)
+
+static inline void per_cpu_scan_finalize(int min_cpus, int reserve_cpus)
+{
+	int low_cpu, high_cpu;
+	int cpu;
+	int next_nid = 0;
+
+	low_cpu = cpus_weight(early_cpu_possible_map);
+
+	high_cpu = max(low_cpu, min_cpus);
+	high_cpu = min(high_cpu + reserve_cpus, NR_CPUS);	/* exclusive bound */
+
+	for (cpu = low_cpu; cpu < high_cpu; cpu++) {	/* "<": "<=" overran node_cpuid[NR_CPUS-1] */
+		cpu_set(cpu, early_cpu_possible_map);
+		if (node_cpuid[cpu].nid == NUMA_NO_NODE) {
+			node_cpuid[cpu].nid = next_nid;
+			next_nid++;
+			if (next_nid >= num_online_nodes())
+				next_nid = 0;
+		}
+	}
+}
 
 #else /* !CONFIG_NUMA */
 #define map_cpu_to_node(cpu, nid)	do{}while(0)
@@ -75,6 +102,7 @@ extern void unmap_cpu_from_node(int cpu,
 
 #define paddr_to_nid(addr)	0
 
+static inline void per_cpu_scan_finalize(int min_cpus, int reserve_cpus) { }
 #endif /* CONFIG_NUMA */
 
 #endif /* _ASM_IA64_NUMA_H */
Index: per_cpu_v4/arch/ia64/mm/numa.c
===================================================================
--- per_cpu_v4.orig/arch/ia64/mm/numa.c	2008-02-11 06:22:41.610022488 -0600
+++ per_cpu_v4/arch/ia64/mm/numa.c	2008-02-11 06:24:46.629719951 -0600
@@ -27,7 +27,10 @@
  */
 int num_node_memblks;
 struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
-struct node_cpuid_s node_cpuid[NR_CPUS];
+struct node_cpuid_s node_cpuid[NR_CPUS] =
+	{ [0 ... NR_CPUS-1] = { .phys_id = 0, .nid = NUMA_NO_NODE } };
+cpumask_t early_cpu_possible_map = CPU_MASK_NONE;
+
 /*
  * This is a matrix with "distances" between nodes, they should be
  * proportional to the memory access latency ratios.
-
To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Received on Tue Feb 12 05:09:17 2008

This archive was generated by hypermail 2.1.8 : 2008-02-12 05:09:34 EST