[patch] Improve ia64_leave_syscall() for McKinley-type cores.

From: David Mosberger <davidm_at_napali.hpl.hp.com>
Date: 2005-01-19 16:02:07
Optimize ia64_leave_syscall() a bit better for McKinley-type cores.
The patch looks big, but that's mostly due to renaming r16/r17 to r2/r3.
Good for a 13-cycle improvement: a (normal) getpid() goes from 271 to 258 cycles.

	--david

Signed-off-by: David Mosberger-Tang <davidm@hpl.hp.com>

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2005/01/18 18:12:07-08:00 davidm@tiger.hpl.hp.com 
#   ia64: Improve ia64_leave_syscall() for McKinley-type cores.  Improves
#   	(normal) getpid() from 271 to 258 cycles.
# 
# arch/ia64/kernel/entry.S
#   2005/01/18 18:11:50-08:00 davidm@tiger.hpl.hp.com +51 -48
#   (ia64_leave_syscall): Make it a local function (it's not used anywhere
#   	else anymore).
#   	Use r2/r3 as base-pointers instead of r16/r17.
#   	Load b6 into r18 instead of r22 (frees up a register).
#   	Read ar.bsp early (_big_ savings!).
#   	Reschedule for McKinley-type cores.
#   	Do srlz.i _before_ restoring the stack-pointer or updating
#   	current->thread.on_ustack.
#   (skip_rbs_switch): Clear r2 if pLvSys is TRUE.
# 
diff -Nru a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
--- a/arch/ia64/kernel/entry.S	2005-01-18 21:00:24 -08:00
+++ b/arch/ia64/kernel/entry.S	2005-01-18 21:00:24 -08:00
@@ -633,10 +633,12 @@
  *		 r13: restored (user-level thread pointer)
  *		 r14: cleared
  *		 r15: restored (syscall #)
- *	     r16-r19: cleared
+ *	     r16-r17: cleared
+ *		 r18: user-level b6
+ *		 r19: cleared
  *		 r20: user-level ar.fpsr
  *		 r21: user-level b0
- *		 r22: user-level b6
+ *		 r22: cleared
  *		 r23: user-level ar.bspstore
  *		 r24: user-level ar.rnat
  *		 r25: user-level ar.unat
@@ -661,7 +663,7 @@
  *	      ar.csd: cleared
  *	      ar.ssd: cleared
  */
-GLOBAL_ENTRY(ia64_leave_syscall)
+ENTRY(ia64_leave_syscall)
 	PT_REGS_UNWIND_INFO(0)
 	/*
 	 * work.need_resched etc. mustn't get changed by this CPU before it returns to
@@ -690,79 +692,80 @@
 (pUStk)	cmp.eq.unc p6,p0=r0,r0		// p6 <- pUStk
 #endif
 .work_processed_syscall:
-	adds r16=PT(LOADRS)+16,r12
-	adds r17=PT(AR_BSPSTORE)+16,r12
+	adds r2=PT(LOADRS)+16,r12
+	adds r3=PT(AR_BSPSTORE)+16,r12
 	adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
 	;;
 (p6)	ld4 r31=[r18]				// load current_thread_info()->flags
-	ld8 r19=[r16],PT(B6)-PT(LOADRS)		// load ar.rsc value for "loadrs"
-	nop.i 0
+	ld8 r19=[r2],PT(B6)-PT(LOADRS)		// load ar.rsc value for "loadrs"
+	mov b7=r0		// clear b7
 	;;
-	ld8 r23=[r17],PT(R9)-PT(AR_BSPSTORE)	// load ar.bspstore (may be garbage)
-	ld8 r22=[r16],PT(R8)-PT(B6)		// load b6
+	ld8 r23=[r3],PT(R9)-PT(AR_BSPSTORE)	// load ar.bspstore (may be garbage)
+	ld8 r18=[r2],PT(R8)-PT(B6)		// load b6
 (p6)	and r15=TIF_WORK_MASK,r31		// any work other than TIF_SYSCALL_TRACE?
 	;;
-
-	mov.m ar.ccv=r0		// clear ar.ccv
+	mov r16=ar.bsp				// M2  get existing backing store pointer
 (p6)	cmp4.ne.unc p6,p0=r15, r0		// any special work pending?
 (p6)	br.cond.spnt .work_pending
 	;;
 	// start restoring the state saved on the kernel stack (struct pt_regs):
-	ld8.fill r8=[r16],16
-	ld8.fill r9=[r17],16
+	ld8.fill r8=[r2],16
+	ld8.fill r9=[r3],16
 	mov f6=f0		// clear f6
 	;;
-	ld8.fill r10=[r16],16
-	ld8.fill r11=[r17],16
+	invala			// M0|1 invalidate ALAT
+	rsm psr.i | psr.ic	// M2 initiate turning off of interrupt and interruption collection
+	mov f9=f0		// clear f9
+
+	ld8.fill r10=[r2],16
+	ld8.fill r11=[r3],16
 	mov f7=f0		// clear f7
 	;;
-	ld8 r29=[r16],16	// load cr.ipsr
-	ld8 r28=[r17],16	// load cr.iip
+	ld8 r29=[r2],16		// load cr.ipsr
+	ld8 r28=[r3],16			// load cr.iip
 	mov f8=f0		// clear f8
 	;;
-	ld8 r30=[r16],16	// load cr.ifs
-	ld8 r25=[r17],16	// load ar.unat
+	ld8 r30=[r2],16		// M0|1 load cr.ifs
+	mov.m ar.ssd=r0		// M2 clear ar.ssd
 	cmp.eq p9,p0=r0,r0	// set p9 to indicate that we should restore cr.ifs
 	;;
-	rsm psr.i | psr.ic	// initiate turning off of interrupt and interruption collection
-	invala			// invalidate ALAT
-	mov f9=f0		// clear f9
-
-	mov.m ar.ssd=r0		// clear ar.ssd
-	mov.m ar.csd=r0		// clear ar.csd
+	ld8 r25=[r3],16		// M0|1 load ar.unat
+	mov.m ar.csd=r0		// M2 clear ar.csd
+	mov r22=r0		// clear r22
+	;;
+	ld8 r26=[r2],PT(B0)-PT(AR_PFS)	// M0|1 load ar.pfs
+	nop.m 0
 	mov f10=f0		// clear f10
 	;;
-	ld8 r26=[r16],16	// load ar.pfs
-	ld8 r27=[r17],PT(PR)-PT(AR_RSC)	// load ar.rsc
+	ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // load b0
+	ld8 r27=[r3],PT(PR)-PT(AR_RSC)	// load ar.rsc
 	mov f11=f0		// clear f11
 	;;
-	ld8 r24=[r16],PT(B0)-PT(AR_RNAT)	// load ar.rnat (may be garbage)
-	ld8 r31=[r17],PT(R1)-PT(PR)		// load predicates
+	ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT)	// load ar.rnat (may be garbage)
+	ld8 r31=[r3],PT(R1)-PT(PR)		// load predicates
 (pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
 	;;
-	ld8 r21=[r16],PT(R12)-PT(B0) // load b0
-	ld8.fill r1=[r17],16	// load r1
-(pUStk) mov r3=1
-	;;
-	ld8.fill r12=[r16],16
-	ld8.fill r13=[r17],16
-	mov r2=r0		// clear r2
+	ld8 r20=[r2],PT(R12)-PT(AR_FPSR)	// load ar.fpsr
+	ld8.fill r1=[r3],16	// load r1
+(pUStk) mov r17=1
 	;;
-	ld8 r20=[r16]		// load ar.fpsr
-	ld8.fill r15=[r17]	// load r15
-	mov b7=r0		// clear b7
+	srlz.i			// M0  ensure interruption collection is off
+	ld8.fill r13=[r3],16
+	nop.i 0
 	;;
-(pUStk) st1 [r14]=r3
-	addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
+	ld8.fill r12=[r2]	// restore r12 (sp)
+	ld8.fill r15=[r3]	// restore r15
+	addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0
+	;;
+(pUStk)	ld4 r3=[r3]		// r3 = cpu_data->phys_stacked_size_p8
+(pUStk) st1 [r14]=r17
+	mov b6=r18		// I0  restore b6
 	;;
-	mov r16=ar.bsp		// get existing backing store pointer
-	srlz.i			// ensure interruption collection is off
+	shr.u r18=r19,16	// I0|1 get byte size of existing "dirty" partition
 	mov r14=r0		// clear r14
-	;;
-	ld4 r17=[r17]		// r17 = cpu_data->phys_stacked_size_p8
-	mov b6=r22				// restore b6
-	shr.u r18=r19,16	// get byte size of existing "dirty" partition
 (pKStk) br.cond.dpnt.many skip_rbs_switch
+
+	mov.m ar.ccv=r0		// clear ar.ccv
 (pNonSys) br.cond.dpnt.many dont_preserve_current_frame
 	br.cond.sptk.many rbs_switch
 END(ia64_leave_syscall)
@@ -1054,7 +1057,7 @@
 	;;
 (pUStk)	mov ar.rnat=r24		// M2 must happen with RSE in lazy mode
 	nop 0
-	nop 0
+(pLvSys)mov r2=r0
 
 	mov ar.rsc=r27		// M2
 	mov pr=r31,-1		// I0
-
To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Received on Wed Jan 19 00:05:07 2005

This archive was generated by hypermail 2.1.8 : 2005-08-02 09:20:34 EST