Workaround for some SMP stability issues - request for testing

Wed Aug 6 09:06:58 EDT 2003

On 6 August 2003 06:12, Jouni Malinen wrote:
> Thanks to all the testing Michael Vallaly did with couple of patches to
> Host AP driver, I think I now have quite a bit more information about
> the stability issues with hostap_pci on SMP systems. It looks like there
> are two separate issues. 1) Something is corrupting the information
> transfered between card and driver which then results in frequent wlan
> hw resets. 2) hw_reset with hostap_pci on SMP system seems to hang the
> system completely in some cases (completely enough to even prevent NMI
> watchdog from detecting hang).

Thank you guys sooo much for attacking this!
 
> It looks like the major cause for frequent resets was in fid register
> (RX, TX, TX Error, AllocFid) getting corrupted. I do not have any
> explanation for this apart from hardware/firmware bug. It looks like
> consecutive reads of the fid registers results in different results even
> though that register should not change before the event is acknowledged.
> 
> I added code that will try its best to make sure that the fid register
> read will return correct fid number. This workaround will read the
> register three times and will return the value only if at least two of
> the reads returns the same value. This will be repeated up to five
> times.

You are obviously talking about this code:

/* Some SMP systems have reported number of odd errors with hostap_pci. fid
 * register has changed values between consecutive reads for an unknown reason.
 * This should really not happen, so more debugging is needed. This test
 * version is a big slower, but it will detect most of such register changes
 * and will try to get the correct fid eventually. */
#define EXTRA_FID_READ_TESTS

static inline u16 prism2_read_fid_reg(struct net_device *dev, u16 reg)
{
#ifdef EXTRA_FID_READ_TESTS
        u16 val, val2, val3;
        int i;

        for (i = 0; i < 10; i++) {
                val = HFA384X_INW(reg);
                val2 = HFA384X_INW(reg);
                val3 = HFA384X_INW(reg);

                if (val == val2 && val == val3)
                        return val;

                printk(KERN_DEBUG "%s: detected fid change (try=%d, reg=%04x):"
                       " %04x %04x %04x\n",
                       dev->name, i, reg, val, val2, val3);
                if ((val == val2 || val == val3) && val != 0)
                        return val;
                if (val2 == val3 && val2 != 0)
                        return val2;
        }
        printk(KERN_WARNING "%s: Uhhuh.. could not read good fid from reg "
               "%04x (%04x %04x %04x)\n", dev->name, reg, val, val2, val3);
        return val;
#else /* EXTRA_FID_READ_TESTS */
        return HFA384X_INW(reg);
#endif /* EXTRA_FID_READ_TESTS */
}

> The workaround seemed to eliminate more or less all corrupted fid values
> in the test system. Consequently, there was no need to reset the
> hardware and no host system hangs.

Corruption must be triggered by something. Most probably other CPU.

Maybe this patch will give us some insights. Tries to show irq
races and which processor did it.
Applies to CVS. Compile tested.
--
vda

--- hostap_hw.c.orig	Wed Aug  6 05:50:00 2003
+++ hostap_hw.c	Wed Aug  6 16:03:12 2003
@@ -2199,11 +2199,16 @@
  * and will try to get the correct fid eventually. */
 #define EXTRA_FID_READ_TESTS
 
+#ifdef EXTRA_FID_READ_TESTS
+static int interrupts_nested=0; 
+static int intr_cpu[16];
+#endif
+
 static inline u16 prism2_read_fid_reg(struct net_device *dev, u16 reg)
 {
 #ifdef EXTRA_FID_READ_TESTS
 	u16 val, val2, val3;
-	int i;
+	int i,j;
 
 	for (i = 0; i < 10; i++) {
 		val = HFA384X_INW(reg);
@@ -2214,8 +2219,14 @@
 			return val;
 
 		printk(KERN_DEBUG "%s: detected fid change (try=%d, reg=%04x):"
-		       " %04x %04x %04x\n",
-		       dev->name, i, reg, val, val2, val3);
+		       " %04x %04x %04x. cpu %d. interrupts nested: %d\n",
+		       dev->name, i, reg, val, val2, val3,
+		       smp_processor_id(),
+		       interrupts_nested
+		       );
+		for(j=0; j<interrupts_nested; j++)
+			printk(KERN_DEBUG "intr_cpu[%d]=%d\n", j, intr_cpu[j]);
+			
 		if ((val == val2 || val == val3) && val != 0)
 			return val;
 		if (val2 == val3 && val2 != 0)
@@ -2957,13 +2968,17 @@
 	local_info_t *local = (local_info_t *) dev->priv;
 	int events = 0;
 	u16 ev;
+	
+#ifdef EXTRA_FID_READ_TESTS
+	intr_cpu[interrupts_nested++] = smp_processor_id();
+#endif
 
 	prism2_io_debug_add(dev, PRISM2_IO_DEBUG_CMD_INTERRUPT, 0, 0);
 
 	if (local->func->card_present && !local->func->card_present(local)) {
 		printk(KERN_DEBUG "%s: Interrupt, but dev not OK\n",
 		       dev->name);
-		return IRQ_HANDLED;
+		goto return_IRQ_HANDLED;
 	}
 
 	prism2_check_magic(local);
@@ -2972,11 +2987,11 @@
 		ev = HFA384X_INW(HFA384X_EVSTAT_OFF);
 		if (ev == 0xffff) {
 			if (local->shutdown)
-				return IRQ_HANDLED;
+				goto return_IRQ_HANDLED;
 			HFA384X_OUTW(0xffff, HFA384X_EVACK_OFF);
 			printk(KERN_DEBUG "%s: prism2_interrupt: ev=0xffff\n",
 			       dev->name);
-			return IRQ_HANDLED;
+			goto return_IRQ_HANDLED;
 		}
 
 		ev &= HFA384X_INW(HFA384X_INTEN_OFF);
@@ -2997,7 +3012,7 @@
 			if (ev & HFA384X_EV_CMD)
 				goto next_event;
 			if ((ev & HFA384X_EVENT_MASK) == 0)
-				return IRQ_HANDLED;
+				goto return_IRQ_HANDLED;
 			if (local->dev_enabled && (ev & ~HFA384X_EV_TICK) &&
 			    net_ratelimit()) {
 				printk(KERN_DEBUG "%s: prism2_interrupt: hw "
@@ -3012,7 +3027,7 @@
 				       " (!dev_enabled)" : "");
 			}
 			HFA384X_OUTW(ev, HFA384X_EVACK_OFF);
-			return IRQ_HANDLED;
+			goto return_IRQ_HANDLED;
 		}
 
 		if (ev & HFA384X_EV_TICK) {
@@ -3071,7 +3086,17 @@
 		}
 	}
 	prism2_io_debug_add(dev, PRISM2_IO_DEBUG_CMD_INTERRUPT, 0, 1);
+
+#ifdef EXTRA_FID_READ_TESTS
+	interrupts_nested--;
+#endif
 	return IRQ_RETVAL(events);
+	
+return_IRQ_HANDLED:
+#ifdef EXTRA_FID_READ_TESTS
+	interrupts_nested--;
+#endif
+	return IRQ_HANDLED;
 }