From b830ab665ad96c6b20d51a89b35cbc09ab5a2c29 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 28 Feb 2006 15:10:26 -0800
Subject: [PATCH] [SPARC64]: Fix bugs in SUN4V cpu mondo dispatch.

There were several bugs in the SUN4V cpu mondo dispatch code.

In fact, if we ever got a EWOULDBLOCK or other error from
the hypervisor call, we'd potentially send a cpu mondo multiple
times to the same cpu and even worse we could loop until the
timeout resending the same mondo over and over to such cpus.

So let's bulletproof this thing as follows:

1) Implement cpu_mondo_send() and cpu_state() hypervisor calls
   in arch/sparc64/kernel/entry.S, add prototypes to asm/hypervisor.h

2) Don't build and update the cpulist using inline functions, this
   was causing the cpu mask to not get updated in the caller.

3) Disable interrupts during the entire mondo send, otherwise our
   cpu list and/or mondo block could get overwritten if we take
   an interrupt and do a cpu mondo send on the current cpu.

4) Check for all possible error return types from the cpu_mondo_send()
   hypervisor call.  In particular:

   HV_EOK) Our work is done, all cpus have received the mondo.
   HV_CPUERROR) One or more of the cpus in the cpu list we passed
                to the hypervisor are in error state.  Use cpu_state()
                calls over the entries in the cpu list to see which
		ones.  Record them in "error_mask" and report this
		after we are done sending the mondo to cpus which are
		not in error state.
   HV_EWOULDBLOCK) We need to keep trying.

   Any other error we consider fatal, we report the event and exit
   immediately.

5) We only timeout if forward progress is not made.  Forward progress
   is defined as having at least one cpu get the mondo successfully
   in a given cpu_mondo_send() call.  Otherwise we bump a counter
   and delay a little.  If the counter hits a limit, we signal an
   error and report the event.

Also, smp_call_function_mask() error handling reports the number
of cpus incorrectly.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc64/kernel/entry.S      |  28 +++++
 arch/sparc64/kernel/smp.c        | 180 +++++++++++++++++++++----------
 include/asm-sparc64/hypervisor.h |  10 ++
 3 files changed, 161 insertions(+), 57 deletions(-)

diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S
index 9f3048e64e..6d0b3ed77a 100644
--- a/arch/sparc64/kernel/entry.S
+++ b/arch/sparc64/kernel/entry.S
@@ -1795,3 +1795,31 @@ sun4v_cpu_yield:
 	ta	HV_FAST_TRAP
 	retl
 	 nop
+
+	/* %o0:	num cpus in cpu list
+	 * %o1:	cpu list paddr
+	 * %o2:	mondo block paddr
+	 *
+	 * returns %o0: status
+	 */
+	.globl	sun4v_cpu_mondo_send
+sun4v_cpu_mondo_send:
+	mov	HV_FAST_CPU_MONDO_SEND, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* %o0:	CPU ID
+	 *
+	 * returns %o0:	-status if status non-zero, else
+	 *         %o0:	cpu state as HV_CPU_STATE_*
+	 */
+	.globl	sun4v_cpu_state
+sun4v_cpu_state:
+	mov	HV_FAST_CPU_STATE, %o5
+	ta	HV_FAST_TRAP
+	brnz,pn	%o0, 1f
+	 sub	%g0, %o0, %o0
+	mov	%o1, %o0
+1:	retl
+	 nop
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index eb7c0f855b..6bc7fd47e4 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -556,77 +556,144 @@ retry:
 }
 
 /* Multi-cpu list version.  */
-static int init_cpu_list(u16 *list, cpumask_t mask)
-{
-	int i, cnt;
-
-	cnt = 0;
-	for_each_cpu_mask(i, mask)
-		list[cnt++] = i;
-
-	return cnt;
-}
-
-static int update_cpu_list(u16 *list, int orig_cnt, cpumask_t mask)
-{
-	int i;
-
-	for (i = 0; i < orig_cnt; i++) {
-		if (list[i] == 0xffff)
-			cpu_clear(i, mask);
-	}
-
-	return init_cpu_list(list, mask);
-}
-
 static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
 {
-	int this_cpu = get_cpu();
-	struct trap_per_cpu *tb = &trap_block[this_cpu];
-	u64 *mondo = __va(tb->cpu_mondo_block_pa);
-	u16 *cpu_list = __va(tb->cpu_list_pa);
-	int cnt, retries;
+	struct trap_per_cpu *tb;
+	u16 *cpu_list;
+	u64 *mondo;
+	cpumask_t error_mask;
+	unsigned long flags, status;
+	int cnt, retries, this_cpu, i;
+
+	/* We have to do this whole thing with interrupts fully disabled.
+	 * Otherwise if we send an xcall from interrupt context it will
+	 * corrupt both our mondo block and cpu list state.
+	 *
+	 * One consequence of this is that we cannot use timeout mechanisms
+	 * that depend upon interrupts being delivered locally.  So, for
+	 * example, we cannot sample jiffies and expect it to advance.
+	 *
+	 * Fortunately, udelay() uses %stick/%tick so we can use that.
+	 */
+	local_irq_save(flags);
+
+	this_cpu = smp_processor_id();
+	tb = &trap_block[this_cpu];
 
+	mondo = __va(tb->cpu_mondo_block_pa);
 	mondo[0] = data0;
 	mondo[1] = data1;
 	mondo[2] = data2;
 	wmb();
 
+	cpu_list = __va(tb->cpu_list_pa);
+
+	/* Setup the initial cpu list.  */
+	cnt = 0;
+	for_each_cpu_mask(i, mask)
+		cpu_list[cnt++] = i;
+
+	cpus_clear(error_mask);
 	retries = 0;
-	cnt = init_cpu_list(cpu_list, mask);
 	do {
-		register unsigned long func __asm__("%o5");
-		register unsigned long arg0 __asm__("%o0");
-		register unsigned long arg1 __asm__("%o1");
-		register unsigned long arg2 __asm__("%o2");
-
-		func = HV_FAST_CPU_MONDO_SEND;
-		arg0 = cnt;
-		arg1 = tb->cpu_list_pa;
-		arg2 = tb->cpu_mondo_block_pa;
-
-		__asm__ __volatile__("ta	%8"
-				     : "=&r" (func), "=&r" (arg0),
-				       "=&r" (arg1), "=&r" (arg2)
-				     : "0" (func), "1" (arg0),
-				       "2" (arg1), "3" (arg2),
-				       "i" (HV_FAST_TRAP)
-				     : "memory");
-		if (likely(arg0 == HV_EOK))
-			break;
+		int forward_progress;
+
+		status = sun4v_cpu_mondo_send(cnt,
+					      tb->cpu_list_pa,
+					      tb->cpu_mondo_block_pa);
 
-		if (unlikely(++retries > 100)) {
-			printk("CPU[%d]: sun4v mondo error %lu\n",
-			       this_cpu, arg0);
+		/* HV_EOK means all cpus received the xcall, we're done.  */
+		if (likely(status == HV_EOK))
 			break;
+
+		/* First, clear out all the cpus in the mask that were
+		 * successfully sent to.  The hypervisor indicates this
+		 * by setting the cpu list entry of such cpus to 0xffff.
+		 */
+		forward_progress = 0;
+		for (i = 0; i < cnt; i++) {
+			if (cpu_list[i] == 0xffff) {
+				cpu_clear(i, mask);
+				forward_progress = 1;
+			}
 		}
 
-		cnt = update_cpu_list(cpu_list, cnt, mask);
+		/* If we get a HV_ECPUERROR, then one or more of the cpus
+		 * in the list are in error state.  Use the cpu_state()
+		 * hypervisor call to find out which cpus are in error state.
+		 */
+		if (unlikely(status == HV_ECPUERROR)) {
+			for (i = 0; i < cnt; i++) {
+				long err;
+				u16 cpu;
+
+				cpu = cpu_list[i];
+				if (cpu == 0xffff)
+					continue;
+
+				err = sun4v_cpu_state(cpu);
+				if (err >= 0 &&
+				    err == HV_CPU_STATE_ERROR) {
+					cpu_clear(cpu, mask);
+					cpu_set(cpu, error_mask);
+				}
+			}
+		} else if (unlikely(status != HV_EWOULDBLOCK))
+			goto fatal_mondo_error;
+
+		/* Rebuild the cpu_list[] array and try again.  */
+		cnt = 0;
+		for_each_cpu_mask(i, mask)
+			cpu_list[cnt++] = i;
 
-		udelay(2 * cnt);
+		if (unlikely(!forward_progress)) {
+			if (unlikely(++retries > 10000))
+				goto fatal_mondo_timeout;
+
+			/* Delay a little bit to let other cpus catch up
+			 * on their cpu mondo queue work.
+			 */
+			udelay(2 * cnt);
+		}
 	} while (1);
 
-	put_cpu();
+	local_irq_restore(flags);
+
+	if (unlikely(!cpus_empty(error_mask)))
+		goto fatal_mondo_cpu_error;
+
+	return;
+
+fatal_mondo_cpu_error:
+	printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
+	       "were in error state\n",
+	       this_cpu);
+	printk(KERN_CRIT "CPU[%d]: Error mask [ ", this_cpu);
+	for_each_cpu_mask(i, error_mask)
+		printk("%d ", i);
+	printk("]\n");
+	return;
+
+fatal_mondo_timeout:
+	local_irq_restore(flags);
+	printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
+	       " progress after %d retries.\n",
+	       this_cpu, retries);
+	goto dump_cpu_list_and_out;
+
+fatal_mondo_error:
+	local_irq_restore(flags);
+	printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
+	       this_cpu, status);
+	printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
+	       "mondo_block_pa(%lx)\n",
+	       this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
+
+dump_cpu_list_and_out:
+	printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
+	for (i = 0; i < cnt; i++)
+		printk("%u ", cpu_list[i]);
+	printk("]\n");
 }
 
 /* Send cross call to all processors mentioned in MASK
@@ -723,9 +790,8 @@ static int smp_call_function_mask(void (*func)(void *info), void *info,
 
 out_timeout:
 	spin_unlock(&call_lock);
-	printk("XCALL: Remote cpus not responding, ncpus=%ld finished=%ld\n",
-	       (long) num_online_cpus() - 1L,
-	       (long) atomic_read(&data.finished));
+	printk("XCALL: Remote cpus not responding, ncpus=%d finished=%d\n",
+	       cpus, atomic_read(&data.finished));
 	return 0;
 }
 
diff --git a/include/asm-sparc64/hypervisor.h b/include/asm-sparc64/hypervisor.h
index 726e2ea03c..612bf31975 100644
--- a/include/asm-sparc64/hypervisor.h
+++ b/include/asm-sparc64/hypervisor.h
@@ -342,6 +342,8 @@ extern unsigned long sun4v_cpu_qconf(unsigned long type,
  *		ENOCPU		Invalid cpu in CPU list
  *		EWOULDBLOCK	Some or all of the listed CPUs did not receive
  *				the mondo
+ *		ECPUERROR	One or more of the listed CPUs are in error
+ *				state, use HV_FAST_CPU_STATE to see which ones
  *		EINVAL		CPU list includes caller's CPU ID
  *
  * Send a mondo interrupt to the CPUs in the given CPU list with the
@@ -355,6 +357,10 @@ extern unsigned long sun4v_cpu_qconf(unsigned long type,
  */
 #define HV_FAST_CPU_MONDO_SEND		0x42
 
+#ifndef __ASSEMBLY__
+extern unsigned long sun4v_cpu_mondo_send(unsigned long cpu_count, unsigned long cpu_list_pa, unsigned long mondo_block_pa);
+#endif
+
 /* cpu_myid()
  * TRAP:	HV_FAST_TRAP
  * FUNCTION:	HV_FAST_CPU_MYID
@@ -382,6 +388,10 @@ extern unsigned long sun4v_cpu_qconf(unsigned long type,
 #define  HV_CPU_STATE_RUNNING		 0x02
 #define  HV_CPU_STATE_ERROR		 0x03
 
+#ifndef __ASSEMBLY__
+extern long sun4v_cpu_state(unsigned long cpuid);
+#endif
+
 /* cpu_set_rtba()
  * TRAP:	HV_FAST_TRAP
  * FUNCTION:	HV_FAST_CPU_SET_RTBA
-- 
2.39.5