[PATCH 1/1] ia64: numa emulation

From: Lee Schermerhorn <lee.schermerhorn_at_hp.com>
Date: 2005-10-21 06:36:36
This patch subdivides an ia64 SMP platform into 2 or more emulated NUMA
nodes.  Applies to kernel 2.6.14-rc4.

Signed-off-by:  Lee Schermerhorn <lee.schermerhorn@hp.com>

---
This patch is a "work in progress" [sort of--I'm not really doing much
work on it recently].  You'll note a number of TODO's noting
questions/deferred decisions/...  

Also, the changes to mm/discontig.c could be eliminated.  Minor
"cleanup" [subjective, I know] that I left in.  

A few other changes to eliminate trailing whitespace in the files I
touched.

 arch/ia64/Kconfig        |   18 +
 arch/ia64/kernel/acpi.c  |  479 +++++++++++++++++++++++++++++++++++++++
+++++++-
 arch/ia64/kernel/efi.c   |  109 ++++++++++
 arch/ia64/mm/discontig.c |    9 
 fs/Kconfig               |    4 
 include/linux/efi.h      |    8 
 6 files changed, 616 insertions(+), 11 deletions(-)


--- fakenuma-2.6.14-rc4/arch/ia64/kernel/acpi.c~original	2005-10-17 11:56:51.000000000 -0400
+++ fakenuma-2.6.14-rc4/arch/ia64/kernel/acpi.c	2005-10-19 10:54:34.000000000 -0400
@@ -54,6 +54,10 @@
 #include <asm/sal.h>
 #include <asm/cyclone.h>
 
+#ifdef CONFIG_NUMA_EMU
+#include <asm/pgtable.h>	/* for IA64_GRANULE_SIZE */
+#endif
+
 #define BAD_MADT_ENTRY(entry, end) (                                        \
 		(!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \
 		((acpi_table_entry_header *)entry)->length != sizeof(*entry))
@@ -174,6 +178,10 @@ static int available_cpus __initdata;
 struct acpi_table_madt *acpi_madt __initdata;
 static u8 has_8259;
 
+#ifdef CONFIG_NUMA_EMU
+static int __initdata already_parsed_lsapic = 0;
+#endif
+
 static int __init
 acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header,
 			  const unsigned long end)
@@ -371,6 +379,12 @@ static void __init acpi_madt_oem_check(c
 
 static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
 {
+
+#ifdef CONFIG_NUMA_EMU
+	if (already_parsed_lsapic)
+		return 0;	/* been there, done that */
+#endif
+
 	if (!phys_addr || !size)
 		return -EINVAL;
 
@@ -485,20 +499,478 @@ acpi_numa_memory_affinity_init(struct ac
 	num_node_memblks++;
 }
 
+#ifdef CONFIG_NUMA_EMU
+
+#undef NUMA_EMU_DEBUG
+
+// TODO:  compute from page size and max order?
+#define NUMA_EMU_MIN_PER_NODE_MEM (1 << 30)	/* arbitrary:  1GB/node min */
+
+static int __initdata numa_fake = 0;	/* # of emulated nodes */
+
+struct acpi_table_slit_emu {
+	struct acpi_table_slit table;
+	u8                     entry[MAX_NUMNODES*MAX_NUMNODES];
+};
+static struct acpi_table_slit_emu __initdata acpi_table_slit_emu;
+
+/*
+ * Need a count of cpus to validate requested NUMA Emulation, but
+ * parse of lsapic doesn't happen until later.  So, count the
+ * cpus here, and let acpi_boot_init() know that we've already
+ * done it.
+ */
+static int __init
+acpi_numa_emu_count_cpus(void)
+{
+
+	if (acpi_table_parse(ACPI_APIC, acpi_parse_madt) < 1) {
+		printk(KERN_ERR PREFIX "Can't find MADT\n");
+		return 0;
+	}
+
+	if (acpi_table_parse_madt(ACPI_MADT_LSAPIC, acpi_parse_lsapic, NR_CPUS)
+		< 1) {
+		printk(KERN_ERR PREFIX
+			 "Error parsing MADT - no LAPIC entries\n");
+		return 0;
+	}
+	already_parsed_lsapic = 1;	/* skip it in acpi_boot_init() */
+
+#ifdef NUMA_EMU_DEBUG
+	printk("NUMA Emulation:  "
+		"%s found %d cpus\n", __FUNCTION__, available_cpus);
+#endif
+
+	return available_cpus;	/* counted by acpi_parse_lsapic() */
+}
+
+/*
+ * Callback for efi.c:efi_numa_emu_find_physmem()
+ * Add contiguous range of physical memory to node_memblk[].
+ * We'll assign affinity after all have been collected.
+ * Ranges arrive in address order from the efi memory map.
+ */
+static int __init
+acpi_numa_emu_add_memblk(unsigned long start, unsigned long end, void *arg)
+{
+	struct node_memblk_s *p = &node_memblk[num_node_memblks];
+
+	if (num_node_memblks >= NR_NODE_MEMBLKS)
+		return -1;	/* too many blocks */
+
+	p->start_paddr = start;
+	p->size        = end - start;
+	++num_node_memblks;
+
+	return 0;
+}
+
+/*
+ * acpi_numa_emu_memory_affinity():
+ *
+ * Use physical memory from SRAT [single node platform] or walk
+ * EFI memory map to find physical memory.  Distribute memory
+ * among emulated nodes.  Must distribute on "order boundary"
+ * to maintain sanity.
+ */
+//TODO:  make order boundary stuff conditional on VIRTUAL_MEM_MAP?
+#define ORDER_BOUNDARY (PAGE_SIZE << MAX_ORDER)
+#define ORDER_MASK     (ORDER_BOUNDARY-1)
+#define ORDERROUNDUP(n) (((n)+ORDER_MASK) & ~ORDER_MASK)
+
+static int __init
+acpi_numa_emu_memory_affinity(void)
+{
+	unsigned long total_mem = 0, per_node_mem, node_0_mem;
+	struct node_memblk_s *p, *pend;
+	int pxm = 0;
+
+	if (num_node_memblks > 0) {
+		/*
+		 * use info from SRAT
+		 */
+		for(p = &node_memblk[0]; p < &node_memblk[num_node_memblks];
+			 ++p) {
+			total_mem += p->size;
+		}
+	} else {
+		if(efi_numa_emu_find_physmem(acpi_numa_emu_add_memblk,
+						 &total_mem))
+			return -1;
+	}
+
+	pend = &node_memblk[num_node_memblks];
+	per_node_mem = GRANULEROUNDDOWN(total_mem / numa_fake);
+
+	if (per_node_mem < NUMA_EMU_MIN_PER_NODE_MEM)
+		return -1;
+
+	/*
+	 * give the left over to node 0
+	 */
+	node_0_mem = per_node_mem + (total_mem - (per_node_mem * numa_fake));
+
+#ifdef NUMA_EMU_DEBUG
+	printk("NUMA Emulation:  "
+		"total_mem %luMB, node_0_mem %luMB, per_node_mem %luMB\n"
+		"                 "
+		"before memblk affinitization:  num_node_memblks=%d\n",
+		(total_mem >> 20), (node_0_mem >> 20), (per_node_mem >> 20),
+		num_node_memblks);
+
+	for(p = &node_memblk[0]; p < pend; ++p) {
+		printk("NUMA Emulation:  "
+			"node_memblk[%lu]:  nid:  %d, range=[0x%016lx-0x%016lx)"
+			"(%luMB)\n",
+				p-node_memblk, p->nid, p->start_paddr,
+				p->start_paddr+p->size, p->size >> 20);
+	}
+#endif
+
+	/*
+	 * Now, distribute memblk's over nodes.  Splitting as needed.
+	 * re:  pxm:  we're assigning memory to [emulated] proximity domains
+	 */
+	for(p = &node_memblk[0]; p < pend && total_mem > 0; ++p, ++pxm) {
+		long need;
+
+		if(pxm == 0) {
+			need = node_0_mem;
+		} else {
+			if((need = min(per_node_mem, total_mem)) <= 0)
+				return -1; /* because of order alignment */
+		}
+		total_mem -= need;	/* remaining after this pxm */
+
+		p->nid = pxm; /* assign this memblk to node */
+		need  -= p->size;
+
+		/*
+		 * fulfill this pxm's need in this pass of the for loop
+		 */
+		while (need > 0) {
+			(++p)->nid = pxm; /* assign next block */
+			need  -= p->size;
+		}
+
+		if (need < 0) {
+			/*
+			 * may need to split p on "order boundary"
+			 * Needed because of funky phymem layout on
+			 * HP rx2600/rx46xx platforms. [maybe others?]
+			 * Note:  we reduce default CONFIG_FORCE_MAX_ZONEORDER
+			 * for NUMA Emulation so this works for < 8GB or so.
+			 */
+			unsigned long next_start, adjust;
+			long excess = 0 - need;
+
+			next_start = p->start_paddr + p->size - excess;
+			adjust = ORDERROUNDUP(next_start) - next_start;
+			next_start += adjust;
+			excess     -= adjust;
+			total_mem  -= min(adjust, total_mem);
+			if (excess > 0) {
+				/*
+				 * split memblk 'p'
+				 */
+				struct node_memblk_s *q;
+				for (q = pend; q > p; --q)
+					*q = *(q - 1);	/* make room; NOTE(review): writes *pend before the NR_NODE_MEMBLKS check below — confirm pend can never already be at the array end */
+
+				(++q)->start_paddr = next_start;
+				q->size =  excess;
+
+				p->size =  q->start_paddr - p->start_paddr;
+
+				if (++num_node_memblks > NR_NODE_MEMBLKS) {
+					printk(KERN_WARNING
+						"%s:  NUMA Emulation would "
+						"exceed NR_NODE_MEMBLKS %d\n",
+					   __FUNCTION__, NR_NODE_MEMBLKS);
+					num_node_memblks = 0;
+					return -1; /* abandons numa emulation */
+				}
+				++pend;
+				continue;	/* aligned on order boundary */
+			}
+			/*
+			 * else let this pxm/node have all of 'p'
+			 */
+		}
+
+		/*
+		 * TODO:
+		 * Technically, we should ensure that following memblks, if any,
+		 * [these will be assigned to next pxm/node] wouldn't cause
+		 * memmap overlap when rounded down to "order boundary".
+		 * ??? SPARSEMEM interaction?
+		 */
+
+	} /* for each memblk */
+
+	/*
+	 * TODO:
+	 * Should check that all fake nodes got some minimal memory after
+	 * all the order alignment.
+	 */
+
+#ifdef NUMA_EMU_DEBUG
+	printk("NUMA Emulation:  "
+		"after memblk affinitization:  num_node_memblks=%d\n",
+	        	num_node_memblks);
+	for(p = &node_memblk[0]; p < pend; ++p) {
+		printk("NUMA Emulation:  "
+			"node_memblk[%lu]:  nid:  %d, range=[0x%016lx-0x%016lx)"
+			" (%luMB)\n",
+				p-node_memblk, p->nid, p->start_paddr,
+				p->start_paddr+p->size, p->size >> 20);
+	}
+#endif
+	return 0;
+
+}
+
+/*
+ * acpi_numa_emu_processor_affinity() - assign cpus to fake nodes.
+ * VERY simple round robin algorithm [except cpu 0--see below].
+ * TODO:  will need rework for SMT/multi-core to ensure that siblings
+ *        end up on same node.
+ */
+static void __init
+acpi_numa_emu_processor_affinity(void)
+{
+	int cpu, pxm, i;
+
+	/*
+	 * distribute cpus over emulated proximity domains in a similar
+	 * fashion to acpi_boot_init() when srat_num_cpus == 0.
+	 * But first, boot cpu == logical id 0 on pxm/node 0.
+	 * Note:  the real acpi_numa_processor_affinity() function
+	 * doesn't do anything special for cpu/pxm 0.  Perhaps the
+	 * SRAT presents the boot pxm first?
+	 */
+	node_cpuid[0].phys_id = hard_smp_processor_id();
+	node_cpuid[0].nid     = 0;
+	pxm_bit_set(0);		/* emulated pxm/node 0 */
+
+#ifdef NUMA_EMU_DEBUG
+		printk("NUMA Emulation:  "
+			"cpu 0 [phys 0x%x] assigned to proximity domain 0\n",
+		        node_cpuid[0].phys_id);
+#endif
+
+	pxm = i = 1;
+	for(cpu=0; cpu < available_cpus; ++cpu) {
+		if (smp_boot_data.cpu_phys_id[cpu] == hard_smp_processor_id())
+			continue;	/* boot cpu is "special" */
+
+		if (!pxm_bit_test(pxm))
+			pxm_bit_set(pxm);
+
+		/*
+		 * Use phys_id from lsapic scan.
+		 * Only because the real acpi_numa_processor_affinity_init()
+		 * does so.
+		 */
+		node_cpuid[i].phys_id = smp_boot_data.cpu_phys_id[cpu];
+
+		/*
+		 * fake proximity domain id
+		 */
+		node_cpuid[i].nid = pxm;
+
+#ifdef NUMA_EMU_DEBUG
+		printk("NUMA Emulation:  "
+			"cpu %d [phys 0x%x] assigned to proximity domain %d\n",
+		        i, node_cpuid[i].phys_id, node_cpuid[i].nid);
+#endif
+
+		++i;
+		if (++pxm == numa_fake)
+			pxm = 0;	/* wrap */
+	}
+
+	/*
+	 * Mark any remaining [non-existent] cpus as on node 0.
+	 * That's where their [unused] per cpu data will be allocated.
+	 */
+	for (cpu=available_cpus; cpu < NR_CPUS; ++cpu) {
+		node_cpuid[cpu].nid   = 0;
+		node_cpuid[cpu].phys_id = 0;
+	}
+
+	srat_num_cpus = available_cpus;
+
+}
+
+#define NUMA_EMU_INTRANODE_DISTANCE 10
+#define NUMA_EMU_INTERNODE_DISTANCE 20  // TODO:  ???
+static void __init
+acpi_numa_emu_slit(void)
+{
+	struct acpi_table_slit* slit;
+	int ifrom, ito;
+
+	slit = (struct acpi_table_slit*)&acpi_table_slit_emu;
+
+	/*
+	 * We only need to initialize slit table members:
+	 * localities and the corresponding entry[]'s
+	 */
+	slit->localities = numa_fake;
+
+	for(ifrom = 0; ifrom < numa_fake; ++ifrom) {
+		for(ito = 0; ito < numa_fake; ++ito) {
+			slit->entry[ifrom*numa_fake + ito] =
+			      (ifrom == ito) ? NUMA_EMU_INTRANODE_DISTANCE
+			                     : NUMA_EMU_INTERNODE_DISTANCE;
+		}
+	}
+
+	slit_table = slit;
+}
+
+#define NUMA_FIXUP_CONTINUE 0 /* multi-node:  real or emulated */
+#define NUMA_FIXUP_DONE 1     /* single node */
+static int __init
+acpi_numa_emulation_init(void)
+{
+	char *cp;
+
+	/*
+	 * Don't attempt fake numa if SRAT exists and contains more than
+	 * one proximity domain.
+	 */
+	if (srat_num_cpus != 0) {
+		int i, pxm_id, npxm=0;
+
+		for (i = 0; i < MAX_PXM_DOMAINS; ++i) {
+			if (!pxm_bit_test(i))
+				continue;	/* only record set pxm bits */
+			if (++npxm > 1) break; else pxm_id = i;
+		}
+
+		if (npxm > 1) {
+			printk(KERN_INFO
+				"> 1 proximity domain => no NUMA emulation\n");
+			return NUMA_FIXUP_CONTINUE;
+		}
+
+		/*
+		 * Clear the pxm flag for the only pxm.
+		 * We'll reassign a fake one when we emulate processor affinity.
+	 * TODO:  will this adversely impact the SGI SN platform?
+		 *        See:  sn/kernel/setup.c:sn_init_pdas() which uses
+		 *        nid_to_pxm_map[].  Or is boot pxm always zero in SRAT?
+		 */
+		clear_bit(pxm_id, (void *)pxm_flag);
+	}
+
+	/*
+	 * Still too early to use the standard kernel command line support...
+	 */
+	for (cp = saved_command_line; *cp; ) {
+		if (memcmp(cp, "numa=fake", 9) == 0) {
+			cp += 9;
+			if (*(cp++) == '=') {
+				numa_fake = simple_strtoul(cp, NULL, 0);
+			} else {
+				numa_fake = 2;	/* default */
+			}
+			break;
+		} else {
+			while (*cp != ' ' && *cp)
+				++cp;
+			while (*cp == ' ')
+				++cp;
+		}
+	}
+
+	if (numa_fake < 2)
+		goto one_node;
+
+	printk(KERN_INFO "%s:  NUMA Emulation requested:  %d nodes\n",
+	       __FUNCTION__, numa_fake);
+
+	/*
+	 * Validate/sanitize numa_fake and setup numa emulation so that
+	 * the rest of acpi_numa_arch_fixup() "just works".
+	 */
+	if ((!srat_num_cpus && acpi_numa_emu_count_cpus() < 2)
+			|| srat_num_cpus == 1) {
+		printk(KERN_WARNING
+		   "%s:  abandoning NUMA Emulation because we have < 2 cpus\n",
+		   __FUNCTION__);
+		/*
+		 * could also be because parse of MADT failed...
+		 */
+		goto one_node;
+	}
+
+	if (numa_fake > available_cpus) {
+		numa_fake = available_cpus;
+		printk(KERN_INFO
+		   "%s:  reducing NUMA Emulation to available cpus: %d\n",
+		   __FUNCTION__, numa_fake);
+	}
+
+	if (numa_fake > MAX_NUMNODES) {  /* VERY unlikely, at this point */
+		numa_fake = MAX_NUMNODES;
+		printk(KERN_INFO
+		   "%s:  reducing NUMA Emulation to MAX_NUMNODES: %d\n",
+		   __FUNCTION__, numa_fake);
+	}
+
+	/*
+	 * Do memory affinity emulation before processors because
+	 * this can fail.  Don't want to touch srat_num_cpus nor
+	 * node_cpuid[] unless we're sure we're going to emulate
+	 * multiple nodes.  Else bad things happen later.
+	 */
+	if (acpi_numa_emu_memory_affinity()) {
+		printk(KERN_WARNING
+		   "%s:  abandoning NUMA Emulation because memory"
+		   " affinity emulation failed\n", __FUNCTION__);
+		goto one_node;
+	}
+
+	acpi_numa_emu_processor_affinity();
+
+	acpi_numa_emu_slit();
+
+	return NUMA_FIXUP_CONTINUE;
+
+one_node:
+	num_node_memblks = 0;  /* in case we've mucked with it */
+	node_set_online(0);
+	node_cpuid[0].phys_id = hard_smp_processor_id();
+	// TODO:  anything else?
+	return NUMA_FIXUP_DONE;
+
+}
+#endif /* ! CONFIG_NUMA_EMU */
+
 void __init acpi_numa_arch_fixup(void)
 {
 	int i, j, node_from, node_to;
 
+#ifndef CONFIG_NUMA_EMU
 	/* If there's no SRAT, fix the phys_id and mark node 0 online */
 	if (srat_num_cpus == 0) {
 		node_set_online(0);
 		node_cpuid[0].phys_id = hard_smp_processor_id();
 		return;
 	}
+#else
+	if (acpi_numa_emulation_init() == NUMA_FIXUP_DONE)
+		return;
+#endif
 
 	/*
-	 * MCD - This can probably be dropped now.  No need for pxm ID to node ID
-	 * mapping with sparse node numbering iff MAX_PXM_DOMAINS <= MAX_NUMNODES.
+	 * MCD - This can probably be dropped now.  No need for pxm ID to node
+	 * ID mapping with sparse node numbering iff MAX_PXM_DOMAINS <=
+	 * MAX_NUMNODES.
 	 */
 	/* calculate total number of nodes in system from PXM bitmap */
 	memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
@@ -649,6 +1121,9 @@ int __init acpi_boot_init(void)
 		printk(KERN_ERR PREFIX
 		       "Error parsing LAPIC address override entry\n");
 
+#ifdef CONFIG_NUMA_EMU
+	if (!already_parsed_lsapic)
+#endif
 	if (acpi_table_parse_madt(ACPI_MADT_LSAPIC, acpi_parse_lsapic, NR_CPUS)
 	    < 1)
 		printk(KERN_ERR PREFIX
--- fakenuma-2.6.14-rc4/arch/ia64/kernel/efi.c~original	2005-08-28 19:41:01.000000000 -0400
+++ fakenuma-2.6.14-rc4/arch/ia64/kernel/efi.c	2005-10-19 10:54:55.000000000 -0400
@@ -862,3 +862,112 @@ efi_uart_console_only(void)
 	printk(KERN_ERR "Malformed %s value\n", name);
 	return 0;
 }
+
+#ifdef CONFIG_NUMA_EMU
+/*
+ * efi_numa_emu_find_physmem()
+ *
+ * walk efi memory map to find contiguous ranges of physical memory to emulate
+ * SRAT info.  Can't use existing efi_memmap_walk() because it doesn't report
+ * all memory, and we don't want to be making assumptions about what physical
+ * memory REALLY exists, from the chunks passed to the callback.
+ *
+ * We'll use that same callback prototype as efi_memmap_walk() to avoid
+ * introducing new inter-module types for numa emulation.
+ */
+
+//TODO:  verify this:
+#define is_physmem(MD) \
+	((MD)->type != EFI_MEMORY_MAPPED_IO  && \
+	 (MD)->type != EFI_MEMORY_MAPPED_IO_PORT_SPACE )
+
+int __init
+efi_numa_emu_find_physmem(efi_freemem_callback_t callback, void *arg)
+{
+	void *efi_map_start, *efi_map_end, *p;
+	u64 efi_desc_size, start = 0, end, prev_end = 0;
+	unsigned long total_mem = 0, *total_mem_p = arg;
+	int prev_is_physmem = 0;
+
+	efi_map_start = __va(ia64_boot_param->efi_memmap);
+	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
+	efi_desc_size = ia64_boot_param->efi_memdesc_size;
+	*total_mem_p = 0;
+
+	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
+		efi_memory_desc_t *md = p;
+		int physmem_after_gap = 0;
+
+		if(is_physmem(md)) {
+			if(!prev_is_physmem) {
+				/*
+				 * start a new physmem segment
+				 */
+				start  = md->phys_addr;
+				prev_end = start +
+					(md->num_pages << EFI_PAGE_SHIFT);
+				prev_is_physmem = 1;
+				continue;
+			} else if(prev_end == md->phys_addr) {
+				/*
+				 * accumulate contiguous physmem
+				 */
+				prev_end += (md->num_pages << EFI_PAGE_SHIFT);
+				continue;
+			} else {
+				/*
+				 * report prev segment and start a new one
+				 */
+				physmem_after_gap = 1;
+			}
+		}
+
+		/*
+		 * md represents a non-physmem descriptor or
+		 * phys memory after a gap in the map
+		 */
+		if(prev_is_physmem) {
+
+			/*
+			 * no sense in reporting phys mem that
+			 * efi_memmap_walk() will trim
+			 */
+			start = GRANULEROUNDUP(start);
+			end   = GRANULEROUNDDOWN(prev_end);
+			if(start < end) {
+				total_mem += end - start;
+				if((*callback)(start, end, NULL))
+					return -1;
+			}
+
+			if (physmem_after_gap) {
+				start  = md->phys_addr;
+				prev_end = start +
+					(md->num_pages << EFI_PAGE_SHIFT);
+				prev_is_physmem = 1;
+			} else
+				prev_is_physmem = 0;
+
+		}
+
+	} /* for each map descriptor */
+
+	if(prev_is_physmem) {
+
+		/*
+		 * no sense in reporting phys mem that
+		 * efi_memmap_walk() will trim
+		 */
+		start = GRANULEROUNDUP(start);
+		end   = GRANULEROUNDDOWN(prev_end);
+		if(start < end) {
+			total_mem += end - start;
+			if ((*callback)(start, end, NULL))
+				return -1;
+		}
+	}
+
+	*total_mem_p = total_mem;
+	return 0;
+}
+#endif  /* CONFIG_NUMA_EMU */
--- fakenuma-2.6.14-rc4/arch/ia64/mm/discontig.c~original	2005-08-28 19:41:01.000000000 -0400
+++ fakenuma-2.6.14-rc4/arch/ia64/mm/discontig.c	2005-10-19 10:48:52.000000000 -0400
@@ -201,8 +201,8 @@ static void __init fill_pernode(int node
  *   |                        |
  *   |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
  *   |    PERCPU_PAGE_SIZE *  |     start and length big enough
- *   |    cpus_on_this_node   | Node 0 will also have entries for all non-existent cpus.
- *   |------------------------|
+ *   |    cpus_on_this_node   | Node 0 will also have entries for all
+ *   |------------------------| non-existent cpus.
  *   |   local pg_data_t *    |
  *   |------------------------|
  *   |  local ia64_node_data  |
@@ -224,9 +224,6 @@ static int __init find_pernode_space(uns
 
 	epfn = (start + len) >> PAGE_SHIFT;
 
-	pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT);
-	mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
-
 	/*
 	 * Make sure this memory falls within this node's usable memory
 	 * since we may have thrown some away in build_maps().
@@ -242,6 +239,8 @@ static int __init find_pernode_space(uns
 	 * Calculate total size needed, incl. what's necessary
 	 * for good alignment and alias prevention.
 	 */
+	pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT);
+	mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
 	pernodesize = compute_pernodesize(node);
 	pernode = NODEDATA_ALIGN(start, node);
 
--- fakenuma-2.6.14-rc4/arch/ia64/Kconfig~original	2005-10-17 11:56:51.000000000 -0400
+++ fakenuma-2.6.14-rc4/arch/ia64/Kconfig	2005-10-17 13:49:26.000000000 -0400
@@ -186,6 +186,15 @@ config NUMA
 	  Access).  This option is for configuring high-end multiprocessor
 	  server systems.  If in doubt, say N.
 
+# move to Kconfig.debug?
+config NUMA_EMU
+	bool "NUMA emulation support"
+	depends on NUMA
+	help
+	  Enable NUMA emulation. A flat machine will be split
+	  into virtual nodes when booted with "numa=fake=N", where N is the
+	  number of nodes. This is only useful for debugging.
+
 config VIRTUAL_MEM_MAP
 	bool "Virtual mem map"
 	default y if !IA64_HP_SIM
@@ -233,7 +242,14 @@ config IA64_SGI_SN_XP
 
 config FORCE_MAX_ZONEORDER
 	int
-	default "18"
+	range 11 20
+	default "18" if !NUMA_EMU
+	default "14" if NUMA_EMU
+	help
+	  This parameter affects the maximum page size of HugetlbFS and the SectionSize.
+	  Max pagesize of HugetlbFS is PAGE_SIZE << MAX_ORDER.
+	  If using SPARSEMEM, Min SectionSize is PAGESIZE << MAX_ORDER.
+	  SectionSize is a unit of Hotpluggable Memory Size.
 
 config SMP
 	bool "Symmetric multi-processing support"
--- fakenuma-2.6.14-rc4/include/linux/efi.h~original	2005-10-17 11:56:54.000000000 -0400
+++ fakenuma-2.6.14-rc4/include/linux/efi.h	2005-10-19 10:55:37.000000000 -0400
@@ -166,8 +166,8 @@ typedef efi_status_t efi_get_variable_t 
 					 unsigned long *data_size, void *data);
 typedef efi_status_t efi_get_next_variable_t (unsigned long *name_size, efi_char16_t *name,
 					      efi_guid_t *vendor);
-typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, 
-					 unsigned long attr, unsigned long data_size, 
+typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor,
+					 unsigned long attr, unsigned long data_size,
 					 void *data);
 typedef efi_status_t efi_get_next_high_mono_count_t (u32 *count);
 typedef void efi_reset_system_t (int reset_type, efi_status_t status,
@@ -324,6 +324,10 @@ static inline int efi_range_is_wc(unsign
 extern int __init efi_setup_pcdp_console(char *);
 #endif
 
+#ifdef CONFIG_NUMA_EMU
+extern int efi_numa_emu_find_physmem(efi_freemem_callback_t, void*);
+#endif
+
 /*
  * We play games with efi_enabled so that the compiler will, if possible, remove
  * EFI-related code altogether.
--- fakenuma-2.6.14-rc4/fs/Kconfig~original	2005-10-17 11:56:53.000000000 -0400
+++ fakenuma-2.6.14-rc4/fs/Kconfig	2005-10-17 13:49:26.000000000 -0400
@@ -808,9 +808,11 @@ config TMPFS
 
 	  See <file:Documentation/filesystems/tmpfs.txt> for details.
 
+# disallow HUGETLBFS when emulating numa because we reduce MAX_ORDER
+# eventually, may address by adjusting hpage_shift
 config HUGETLBFS
 	bool "HugeTLB file system support"
-	depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || X86_64 || BROKEN
+	depends (X86 || IA64 || PPC64 || SPARC64 || SUPERH || X86_64 || BROKEN) && !NUMA_EMU
 
 config HUGETLB_PAGE
 	def_bool HUGETLBFS


-
To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Received on Fri Oct 21 06:36:53 2005

This archive was generated by hypermail 2.1.8 : 2005-10-21 06:37:30 EST