Re: [Linux-ia64] [patch] 2.4.20-ia64-021210 new spinlock code

From: Keith Owens <kaos_at_sgi.com>
Date: 2003-03-15 13:36:01
On Fri, 14 Mar 2003 17:30:01 -0800, 
David Mosberger <davidm@napali.hpl.hp.com> wrote:
>I think it's correct, but more complicated than it has to be: you
>should be able to use the general directive ".spillreg rp, b7" instead
>of ".altrp".

Done ...

>Another suggestion: use #ifndef CONFIG_ITANIUM instead of #ifdef
>CONFIG_MCKINLEY.  This ensures that brl will be used on future CPUs as
>well (as it should be).

... and done.  Take 2.

Patch is against 2.4.20-ia64-021210.

Reduce the uncontended spinlock path from 5 bundles to 3 (McKinley) or
4 (Itanium); both variants also do one less memory access on the
uncontended path.  Move the contended path out of line so we can do
exponential backoff, kdb, lock metering etc. in one place.
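
In case the shape of the change is easier to see in C than in the asm,
here is a rough sketch (illustrative only, not part of the patch; the
demo_* names are made up) of what the inline fast path and the
out-of-line contention path amount to, written on top of the same
cmpxchg_acq() helper that spin_trylock() already uses:

/* Illustrative sketch only -- not the patch's code.  The real fast path
 * is the inline asm in spinlock.h and the real slow path is the asm in
 * head.S; this is just the same structure expressed in C.
 */
static void demo_spinlock_wait(spinlock_t *x)
{
	/* Read-only spin so we do not keep writing the cache line while
	 * the lock is held.  Exponential backoff, kdb, lockmeter etc.
	 * would hang off this loop.
	 */
	while (x->lock != 0)
		;
}

static inline void demo_spin_lock(spinlock_t *x)
{
	/* Uncontended case: a single cmpxchg and fall through.  On
	 * failure the real code branches (not calls) to
	 * ia64_spinlock_contention and comes back via b7 to retry; in C
	 * that is just a loop around an out-of-line wait.
	 */
	while (cmpxchg_acq(&x->lock, 0, 1) != 0)
		demo_spinlock_wait(x);
}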

David, closing the 1-bundle unwind window was a bit harder than I
expected: it turns out that altrp does not specify where it applies; it
is global to the prologue.  To get altrp to apply after mov b7=r28, I
needed multiple prologues and bodies.  AFAICT this will unwind correctly
on any instruction within ia64_spinlock_contention.  Could you verify
that on your simulator (assuming it supports .save ar.pfs,r0)?

Note 1.  The patch uses .save ar.pfs, r0, which requires the unwind code
         to be updated to allow reading from r0.  You need one of these
         unwind patches as well :-

	 2.4.20           http://external-lists.valinux.com/archives/linux-ia64/2003-March/004940.html
	 2.4.21-pre5 + bk http://external-lists.valinux.com/archives/linux-ia64/2003-March/004939.html

Note 2.  For (CONFIG_MODULES=y && CONFIG_MCKINLEY=y), you need modutils
         2.4.23.  If you get this message, upgrade your modutils :-

	   Unhandled relocation of type 72 for ia64_spinlock_contention

Index: 20.5/include/asm-ia64/spinlock.h
--- 20.5/include/asm-ia64/spinlock.h Fri, 01 Mar 2002 11:01:28 +1100 kaos (linux-2.4/s/28_spinlock.h 1.1.3.1.1.1.2.1 644)
+++ 20.5(w)/include/asm-ia64/spinlock.h Sat, 15 Mar 2003 12:45:54 +1100 kaos (linux-2.4/s/28_spinlock.h 1.1.3.1.1.1.2.1 644)
@@ -15,10 +15,6 @@
 #include <asm/bitops.h>
 #include <asm/atomic.h>
 
-#undef NEW_LOCK
-
-#ifdef NEW_LOCK
-
 typedef struct {
 	volatile unsigned int lock;
 } spinlock_t;
@@ -27,81 +23,74 @@ typedef struct {
 #define spin_lock_init(x)			((x)->lock = 0)
 
 /*
- * Streamlined test_and_set_bit(0, (x)).  We use test-and-test-and-set
- * rather than a simple xchg to avoid writing the cache-line when
- * there is contention.
+ * Try to get the lock.  If we fail to get the lock, branch (not call) to
+ * ia64_spinlock_contention.  We do not use call because that stamps on ar.pfs
+ * which has unwanted side effects on the routine using spin_lock().
+ *
+ * ia64_spinlock_contention is entered with :-
+ *   r28 - address of start of spin_lock code.  Used as a "return" address
+ *         from the contention path.  mov r28=ip must be in the first bundle.
+ *   r29 - available for use.
+ *   r30 - available for use.
+ *   r31 - address of lock.
+ *   b7  - available for use.
+ *   p15 - available for use.
+ *
+ * If you patch ia64_spinlock_contention to use more registers, do not forget to
+ * update the clobber lists below.
  */
-#define spin_lock(x)									\
-{											\
+
+#ifdef	CONFIG_ITANIUM
+#define spin_lock(x) 									\
+({											\
 	register char *addr __asm__ ("r31") = (char *) &(x)->lock;			\
 											\
 	__asm__ __volatile__ (								\
-		"mov r30=1\n"								\
+		"1:\n"		/* force a new bundle, r28 points here */		\
 		"mov ar.ccv=r0\n"							\
+		"mov r28=ip\n"								\
+		"mov r30=1\n"								\
 		";;\n"									\
 		"cmpxchg4.acq r30=[%0],r30,ar.ccv\n"					\
+		"movl r29=ia64_spinlock_contention\n"					\
 		";;\n"									\
 		"cmp.ne p15,p0=r30,r0\n"						\
-		"(p15) br.call.spnt.few b7=ia64_spinlock_contention\n"			\
+		"mov b7=r29\n"								\
 		";;\n"									\
-		"1:\n"				/* force a new bundle */		\
+		"(p15) br.cond.spnt.few b7\n"						\
+		";;\n"									\
+		"2:\n"		/* force a new bundle */				\
 		:: "r"(addr)								\
-		: "ar.ccv", "ar.pfs", "b7", "p15", "r28", "r29", "r30", "memory");	\
-}
-
-#define spin_trylock(x)									\
+		: "ar.ccv", "r28", "r29", "r30", "b7", "p15", "memory");		\
+})
+#else	/* !CONFIG_ITANIUM */
+#define spin_lock(x) 									\
 ({											\
-	register long result;								\
+	register char *addr __asm__ ("r31") = (char *) &(x)->lock;			\
 											\
 	__asm__ __volatile__ (								\
+		"1:\n"		/* force a new bundle, r28 points here */		\
 		"mov ar.ccv=r0\n"							\
+		"mov r28=ip\n"								\
+		"mov r30=1\n"								\
+		";;\n"									\
+		"cmpxchg4.acq r30=[%0],r30,ar.ccv\n"					\
+		";;\n"									\
+		"cmp.ne p15,p0=r30,r0\n"						\
+		";;\n"									\
+		"(p15) brl.cond.spnt.few ia64_spinlock_contention\n"			\
 		";;\n"									\
-		"cmpxchg4.acq %0=[%2],%1,ar.ccv\n"					\
-		: "=r"(result) : "r"(1), "r"(&(x)->lock) : "ar.ccv", "memory");		\
-	(result == 0);									\
+		"2:\n"		/* force a new bundle */				\
+		:: "r"(addr)								\
+		: "ar.ccv", "r28", "r29", "r30", "b7", "p15", "memory");		\
 })
-
-#define spin_is_locked(x)	((x)->lock != 0)
-#define spin_unlock(x)		do { barrier(); ((spinlock_t *) x)->lock = 0;} while (0)
-#define spin_unlock_wait(x)	do { barrier(); } while ((x)->lock)
-
-#else /* !NEW_LOCK */
-
-typedef struct {
-	volatile unsigned int lock;
-} spinlock_t;
-
-#define SPIN_LOCK_UNLOCKED			(spinlock_t) { 0 }
-#define spin_lock_init(x)			((x)->lock = 0)
-
-/*
- * Streamlined test_and_set_bit(0, (x)).  We use test-and-test-and-set
- * rather than a simple xchg to avoid writing the cache-line when
- * there is contention.
- */
-#define spin_lock(x) __asm__ __volatile__ (			\
-	"mov ar.ccv = r0\n"					\
-	"mov r29 = 1\n"						\
-	";;\n"							\
-	"1:\n"							\
-	"ld4 r2 = [%0]\n"					\
-	";;\n"							\
-	"cmp4.eq p0,p7 = r0,r2\n"				\
-	"(p7) br.cond.spnt.few 1b \n"				\
-	"cmpxchg4.acq r2 = [%0], r29, ar.ccv\n"			\
-	";;\n"							\
-	"cmp4.eq p0,p7 = r0, r2\n"				\
-	"(p7) br.cond.spnt.few 1b\n"				\
-	";;\n"							\
-	:: "r"(&(x)->lock) : "ar.ccv", "p7", "r2", "r29", "memory")
+#endif	/* CONFIG_ITANIUM */
 
 #define spin_is_locked(x)	((x)->lock != 0)
 #define spin_unlock(x)		do { barrier(); ((spinlock_t *) x)->lock = 0; } while (0)
 #define spin_trylock(x)		(cmpxchg_acq(&(x)->lock, 0, 1) == 0)
 #define spin_unlock_wait(x)	do { barrier(); } while ((x)->lock)
 
-#endif /* !NEW_LOCK */
-
 typedef struct {
 	volatile int read_counter:31;
 	volatile int write_lock:1;
Index: 20.5/arch/ia64/kernel/ia64_ksyms.c
--- 20.5/arch/ia64/kernel/ia64_ksyms.c Wed, 11 Dec 2002 20:58:53 +1100 kaos (linux-2.4/r/c/35_ia64_ksyms 1.1.3.1.3.1.1.1.1.3 644)
+++ 20.5(w)/arch/ia64/kernel/ia64_ksyms.c Fri, 14 Mar 2003 16:11:57 +1100 kaos (linux-2.4/r/c/35_ia64_ksyms 1.1.3.1.3.1.1.1.1.3 644)
@@ -165,3 +165,9 @@ EXPORT_SYMBOL(machvec_noop);
 EXPORT_SYMBOL(pfm_install_alternate_syswide_subsystem);
 EXPORT_SYMBOL(pfm_remove_alternate_syswide_subsystem);
 #endif
+
+/* Spinlock contention path is entered via direct branch, not using a function
+ * pointer.  Fudge the declaration so we do not generate a function descriptor.
+ */
+extern char ia64_spinlock_contention[];
+EXPORT_SYMBOL_NOVERS(ia64_spinlock_contention);
Index: 20.5/arch/ia64/kernel/head.S
--- 20.5/arch/ia64/kernel/head.S Wed, 11 Dec 2002 20:58:53 +1100 kaos (linux-2.4/s/c/11_head.S 1.1.4.1.3.1.1.1.1.3 644)
+++ 20.5(w)/arch/ia64/kernel/head.S Sat, 15 Mar 2003 12:47:57 +1100 kaos (linux-2.4/s/c/11_head.S 1.1.4.1.3.1.1.1.1.3 644)
@@ -742,69 +742,53 @@ SET_REG(b5);
 #ifdef CONFIG_SMP
 
 	/*
-	 * This routine handles spinlock contention.  It uses a simple exponential backoff
-	 * algorithm to reduce unnecessary bus traffic.  The initial delay is selected from
-	 * the low-order bits of the cycle counter (a cheap "randomizer").  I'm sure this
-	 * could use additional tuning, especially on systems with a large number of CPUs.
-	 * Also, I think the maximum delay should be made a function of the number of CPUs in
-	 * the system. --davidm 00/08/05
+	 * This routine handles spinlock contention, using non-standard entry
+	 * conventions.  To avoid converting leaf routines into non-leaf, the
+	 * inline spin_lock() code uses br.cond (not br.call) to enter this
+	 * code.  r28 contains the start of the inline spin_lock() code.
 	 *
-	 * WARNING: This is not a normal procedure.  It gets called from C code without
-	 * the compiler knowing about it.  Thus, we must not use any scratch registers
-	 * beyond those that were declared "clobbered" at the call-site (see spin_lock()
-	 * macro).  We may not even use the stacked registers, because that could overwrite
-	 * output registers.  Similarly, we can't use the scratch stack area as it may be
-	 * in use, too.
+	 * Do not use gp relative variables, this code is called from the kernel
+	 * and from modules, r1 is undefined.  Do not use stacked registers, the
+	 * caller owns them.  Do not use the scratch stack space, the caller
+	 * owns it.  Do not change ar.pfs, the caller owns it.
 	 *
 	 * Inputs:
-	 *	ar.ccv = 0 (and available for use)
-	 *	r28 = available for use
-	 *	r29 = available for use
-	 *	r30 = non-zero (and available for use)
-	 *	r31 = address of lock we're trying to acquire
-	 *	p15 = available for use
+	 *   ar.ccv - 0 (and available for use)
+	 *   r12    - kernel stack pointer, but see above.
+	 *   r13    - current process.
+	 *   r28    - address of start of spin_lock code.  Used as a "return"
+	 *            address from this contention path.  Available for use
+	 *            after it has been saved.
+	 *   r29    - available for use.
+	 *   r30    - available for use.
+	 *   r31    - address of lock.
+	 *   b7     - available for use.
+	 *   p15    - available for use.
+	 *   Rest   - caller's state, do not use, especially ar.pfs.
+	 *
+	 * If you patch this code to use more registers, do not forget to update
+	 * the clobber lists for spin_lock() in include/asm-ia64/spinlock.h.
 	 */
 
-#	define delay	r28
-#	define timeout	r29
-#	define tmp	r30
-
 GLOBAL_ENTRY(ia64_spinlock_contention)
-	mov tmp=ar.itc
-	;;
-	and delay=0x3f,tmp
-	;;
-
-.retry:	add timeout=tmp,delay
-	shl delay=delay,1
-	;;
-	dep delay=delay,r0,0,13	// limit delay to 8192 cycles
-	;;
-	// delay a little...
-.wait:	sub tmp=tmp,timeout
-	or delay=0xf,delay	// make sure delay is non-zero (otherwise we get stuck with 0)
-	;;
-	cmp.lt p15,p0=tmp,r0
-	mov tmp=ar.itc
-(p15)	br.cond.sptk .wait
-	;;
-	ld4 tmp=[r31]
-	;;
-	cmp.ne p15,p0=tmp,r0
-	mov tmp=ar.itc
-(p15)	br.cond.sptk .retry	// lock is still busy
-	;;
-	// try acquiring lock (we know ar.ccv is still zero!):
-	mov tmp=1
-	;;
-	cmpxchg4.acq tmp=[r31],tmp,ar.ccv
-	;;
-	cmp.eq p15,p0=tmp,r0
-
-	mov tmp=ar.itc
-(p15)	br.ret.sptk.many b7	// got lock -> return
-	br .retry		// still no luck, retry
+	// To get decent unwind data, lie about our state
+	.prologue
+	.save ar.pfs, r0	// this code effectively has a zero frame size
+	.spillreg rp, r28
+	mov b7=r28
+	.spillreg rp, b7
+	.body
+
+.retry:
+	// exponential backoff, kdb, lockmeter etc. go in here
+	//
+	;;
+	ld4 r28=[r31]
+	;;
+	cmp4.eq p15,p0=r28,r0
+(p15)	br.cond.spnt.few b7	// lock is now free, try to acquire
+	br.cond.sptk.few .retry
 
 END(ia64_spinlock_contention)
 
-#endif
+#endif	// CONFIG_SMP