x86: Fix UV BAU for non-consecutive nasids
authorCliff Wickman <cpw@sgi.com>
Tue, 10 May 2011 13:26:43 +0000 (08:26 -0500)
committerIngo Molnar <mingo@elte.hu>
Thu, 12 May 2011 21:45:42 +0000 (23:45 +0200)
This is a fix for the SGI Altix-UV Broadcast Assist Unit code,
which is used for TLB flushing.

Certain hardware configurations (that customers are ordering)
cause nasids (numa address space id's) to be non-consecutive.
Specifically, once you have more than 4 blades in a IRU
(Individual Rack Unit - or 1/2 rack) but less than the maximum
of 16, the nasid numbering becomes non-consecutive.  This
currently results in a 'catastrophic error' (CATERR) detected by
the firmware during OS boot.  The BAU is generating an 'INTD'
request that is targeting a non-existent nasid value. Such
configurations may also occur when a blade is configured off
because of hardware errors. (There is one UV hub per blade.)

This patch is required to support such configurations.

The problem with the tlb_uv.c code is that is using the
consecutive hub numbers as indices to the BAU distribution bit
map. These are simply the ordinal position of the hub or blade
within its partition.  It should be using physical node numbers
(pnodes), which correspond to the physical nasid values. Use of
the hub number only works as long as the nasids in the partition
are consecutive and increase with a stride of 1.

This patch changes the index to be the pnode number, thus
allowing nasids to be non-consecutive.
It also provides a table in local memory for each cpu to
translate target cpu number to target pnode and nasid.
And it improves naming to properly reflect 'node' and 'uvhub'
versus 'nasid'.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/E1QJmxX-0002Mz-Fk@eag09.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/x86/include/asm/uv/uv_bau.h
arch/x86/platform/uv/tlb_uv.c

index 3e094af443c396a6ccf9f08628ecafc2a5d974ab..130f1eeee5fed416bb41ed4fa54a97bdcc31a601 100644 (file)
@@ -94,6 +94,8 @@
 /* after this # consecutive successes, bump up the throttle if it was lowered */
 #define COMPLETE_THRESHOLD 5
 
+#define UV_LB_SUBNODEID 0x10
+
 /*
  * number of entries in the destination side payload queue
  */
  * The distribution specification (32 bytes) is interpreted as a 256-bit
  * distribution vector. Adjacent bits correspond to consecutive even numbered
  * nodeIDs. The result of adding the index of a given bit to the 15-bit
- * 'base_dest_nodeid' field of the header corresponds to the
+ * 'base_dest_nasid' field of the header corresponds to the
  * destination nodeID associated with that specified bit.
  */
 struct bau_target_uvhubmask {
@@ -176,7 +178,7 @@ struct bau_msg_payload {
 struct bau_msg_header {
        unsigned int dest_subnodeid:6;  /* must be 0x10, for the LB */
        /* bits 5:0 */
-       unsigned int base_dest_nodeid:15; /* nasid of the */
+       unsigned int base_dest_nasid:15; /* nasid of the */
        /* bits 20:6 */                   /* first bit in uvhub map */
        unsigned int command:8; /* message type */
        /* bits 28:21 */
@@ -378,6 +380,10 @@ struct ptc_stats {
        unsigned long d_rcanceled; /* number of messages canceled by resets */
 };
 
+struct hub_and_pnode {
+       short uvhub;
+       short pnode;
+};
 /*
  * one per-cpu; to locate the software tables
  */
@@ -399,10 +405,12 @@ struct bau_control {
        int baudisabled;
        int set_bau_off;
        short cpu;
+       short osnode;
        short uvhub_cpu;
        short uvhub;
        short cpus_in_socket;
        short cpus_in_uvhub;
+       short partition_base_pnode;
        unsigned short message_number;
        unsigned short uvhub_quiesce;
        short socket_acknowledge_count[DEST_Q_SIZE];
@@ -422,15 +430,16 @@ struct bau_control {
        int congested_period;
        cycles_t period_time;
        long period_requests;
+       struct hub_and_pnode *target_hub_and_pnode;
 };
 
 static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
 {
        return constant_test_bit(uvhub, &dstp->bits[0]);
 }
-static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp)
+static inline void bau_uvhub_set(int pnode, struct bau_target_uvhubmask *dstp)
 {
-       __set_bit(uvhub, &dstp->bits[0]);
+       __set_bit(pnode, &dstp->bits[0]);
 }
 static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp,
                                    int nbits)
index 7cb6424317f642bcbb44caabdd6c92cf0fbfa605..c58e0ea39ef5facdaa630fa8138cb77fb530aa06 100644 (file)
@@ -699,16 +699,17 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
                                          struct mm_struct *mm,
                                          unsigned long va, unsigned int cpu)
 {
-       int tcpu;
-       int uvhub;
        int locals = 0;
        int remotes = 0;
        int hubs = 0;
+       int tcpu;
+       int tpnode;
        struct bau_desc *bau_desc;
        struct cpumask *flush_mask;
        struct ptc_stats *stat;
        struct bau_control *bcp;
        struct bau_control *tbcp;
+       struct hub_and_pnode *hpp;
 
        /* kernel was booted 'nobau' */
        if (nobau)
@@ -750,11 +751,18 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
        bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
        bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
 
-       /* cpu statistics */
        for_each_cpu(tcpu, flush_mask) {
-               uvhub = uv_cpu_to_blade_id(tcpu);
-               bau_uvhub_set(uvhub, &bau_desc->distribution);
-               if (uvhub == bcp->uvhub)
+               /*
+                * The distribution vector is a bit map of pnodes, relative
+                * to the partition base pnode (and the partition base nasid
+                * in the header).
+                * Translate cpu to pnode and hub using an array stored
+                * in local memory.
+                */
+               hpp = &bcp->socket_master->target_hub_and_pnode[tcpu];
+               tpnode = hpp->pnode - bcp->partition_base_pnode;
+               bau_uvhub_set(tpnode, &bau_desc->distribution);
+               if (hpp->uvhub == bcp->uvhub)
                        locals++;
                else
                        remotes++;
@@ -855,7 +863,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
  * an interrupt, but causes an error message to be returned to
  * the sender.
  */
-static void uv_enable_timeouts(void)
+static void __init uv_enable_timeouts(void)
 {
        int uvhub;
        int nuvhubs;
@@ -1326,10 +1334,10 @@ static int __init uv_ptc_init(void)
 }
 
 /*
- * initialize the sending side's sending buffers
+ * Initialize the sending side's sending buffers.
  */
 static void
-uv_activation_descriptor_init(int node, int pnode)
+uv_activation_descriptor_init(int node, int pnode, int base_pnode)
 {
        int i;
        int cpu;
@@ -1352,11 +1360,11 @@ uv_activation_descriptor_init(int node, int pnode)
        n = pa >> uv_nshift;
        m = pa & uv_mmask;
 
+       /* the 14-bit pnode */
        uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
                              (n << UV_DESC_BASE_PNODE_SHIFT | m));
-
        /*
-        * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+        * Initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
         * cpu even though we only use the first one; one descriptor can
         * describe a broadcast to 256 uv hubs.
         */
@@ -1365,12 +1373,13 @@ uv_activation_descriptor_init(int node, int pnode)
                memset(bd2, 0, sizeof(struct bau_desc));
                bd2->header.sw_ack_flag = 1;
                /*
-                * base_dest_nodeid is the nasid of the first uvhub
-                * in the partition. The bit map will indicate uvhub numbers,
-                * which are 0-N in a partition. Pnodes are unique system-wide.
+                * The base_dest_nasid set in the message header is the nasid
+                * of the first uvhub in the partition. The bit map will
+                * indicate destination pnode numbers relative to that base.
+                * They may not be consecutive if nasid striding is being used.
                 */
-               bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode);
-               bd2->header.dest_subnodeid = 0x10; /* the LB */
+               bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
+               bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
                bd2->header.command = UV_NET_ENDPOINT_INTD;
                bd2->header.int_both = 1;
                /*
@@ -1442,7 +1451,7 @@ uv_payload_queue_init(int node, int pnode)
 /*
  * Initialization of each UV hub's structures
  */
-static void __init uv_init_uvhub(int uvhub, int vector)
+static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode)
 {
        int node;
        int pnode;
@@ -1450,11 +1459,11 @@ static void __init uv_init_uvhub(int uvhub, int vector)
 
        node = uvhub_to_first_node(uvhub);
        pnode = uv_blade_to_pnode(uvhub);
-       uv_activation_descriptor_init(node, pnode);
+       uv_activation_descriptor_init(node, pnode, base_pnode);
        uv_payload_queue_init(node, pnode);
        /*
-        * the below initialization can't be in firmware because the
-        * messaging IRQ will be determined by the OS
+        * The below initialization can't be in firmware because the
+        * messaging IRQ will be determined by the OS.
         */
        apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
        uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
@@ -1491,10 +1500,11 @@ calculate_destination_timeout(void)
 /*
  * initialize the bau_control structure for each cpu
  */
-static int __init uv_init_per_cpu(int nuvhubs)
+static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode)
 {
        int i;
        int cpu;
+       int tcpu;
        int pnode;
        int uvhub;
        int have_hmaster;
@@ -1528,6 +1538,15 @@ static int __init uv_init_per_cpu(int nuvhubs)
                bcp = &per_cpu(bau_control, cpu);
                memset(bcp, 0, sizeof(struct bau_control));
                pnode = uv_cpu_hub_info(cpu)->pnode;
+               if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) {
+                       printk(KERN_EMERG
+                               "cpu %d pnode %d-%d beyond %d; BAU disabled\n",
+                               cpu, pnode, base_part_pnode,
+                               UV_DISTRIBUTION_SIZE);
+                       return 1;
+               }
+               bcp->osnode = cpu_to_node(cpu);
+               bcp->partition_base_pnode = uv_partition_base_pnode;
                uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
                *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
                bdp = &uvhub_descs[uvhub];
@@ -1536,7 +1555,7 @@ static int __init uv_init_per_cpu(int nuvhubs)
                bdp->pnode = pnode;
                /* kludge: 'assuming' one node per socket, and assuming that
                   disabling a socket just leaves a gap in node numbers */
-               socket = (cpu_to_node(cpu) & 1);
+               socket = bcp->osnode & 1;
                bdp->socket_mask |= (1 << socket);
                sdp = &bdp->socket[socket];
                sdp->cpu_number[sdp->num_cpus] = cpu;
@@ -1585,6 +1604,20 @@ static int __init uv_init_per_cpu(int nuvhubs)
 nextsocket:
                        socket++;
                        socket_mask = (socket_mask >> 1);
+                       /* each socket gets a local array of pnodes/hubs */
+                       bcp = smaster;
+                       bcp->target_hub_and_pnode = kmalloc_node(
+                               sizeof(struct hub_and_pnode) *
+                               num_possible_cpus(), GFP_KERNEL, bcp->osnode);
+                       memset(bcp->target_hub_and_pnode, 0,
+                               sizeof(struct hub_and_pnode) *
+                               num_possible_cpus());
+                       for_each_present_cpu(tcpu) {
+                               bcp->target_hub_and_pnode[tcpu].pnode =
+                                       uv_cpu_hub_info(tcpu)->pnode;
+                               bcp->target_hub_and_pnode[tcpu].uvhub =
+                                       uv_cpu_hub_info(tcpu)->numa_blade_id;
+                       }
                }
        }
        kfree(uvhub_descs);
@@ -1637,21 +1670,22 @@ static int __init uv_bau_init(void)
        spin_lock_init(&disable_lock);
        congested_cycles = microsec_2_cycles(congested_response_us);
 
-       if (uv_init_per_cpu(nuvhubs)) {
-               nobau = 1;
-               return 0;
-       }
-
        uv_partition_base_pnode = 0x7fffffff;
-       for (uvhub = 0; uvhub < nuvhubs; uvhub++)
+       for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
                if (uv_blade_nr_possible_cpus(uvhub) &&
                        (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
                        uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
+       }
+
+       if (uv_init_per_cpu(nuvhubs, uv_partition_base_pnode)) {
+               nobau = 1;
+               return 0;
+       }
 
        vector = UV_BAU_MESSAGE;
        for_each_possible_blade(uvhub)
                if (uv_blade_nr_possible_cpus(uvhub))
-                       uv_init_uvhub(uvhub, vector);
+                       uv_init_uvhub(uvhub, vector, uv_partition_base_pnode);
 
        uv_enable_timeouts();
        alloc_intr_gate(vector, uv_bau_message_intr1);