/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (c) 2004-2005 Silicon Graphics, Inc. All Rights Reserved.
 */


/*
 * Cross Partition Communication (XPC) support - standard version.
 *
 * XPC provides a message passing capability that crosses partition
 * boundaries. This module is made up of two parts:
 *
 *     partition       This part detects the presence/absence of other
 *                     partitions. It provides a heartbeat and monitors
 *                     the heartbeats of other partitions.
 *
 *     channel         This part manages the channels and sends/receives
 *                     messages across them to/from other partitions.
 *
 * There are a couple of additional functions residing in XP, which
 * provide an interface to XPC for its users.
 *
 *
 * Caveats:
 *
 *   . We currently have no way to determine which nasid an IPI came
 *     from. Thus, xpc_IPI_send() does a remote AMO write followed by
 *     an IPI. The AMO indicates where data is to be pulled from, so
 *     after the IPI arrives, the remote partition checks the AMO word.
 *     The IPI can actually arrive before the AMO, however, so other code
 *     must periodically check for this case. Also, remote AMO operations
 *     do not reliably time out. Thus we do a remote PIO read solely to
 *     know whether the remote partition is down and whether we should
 *     stop sending IPIs to it. This remote PIO read operation is set up
 *     in a special nofault region so SAL knows to ignore (and clean up)
 *     any errors due to the remote AMO write, PIO read, and/or PIO
 *     write operations.
 *
 *     If/when new hardware solves this IPI problem, we should abandon
 *     the current approach.
 *
 */
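
/*
 * Illustrative sketch only (not part of this driver): the send side of
 * the AMO+IPI handshake described above, in skeletal C. The helpers
 * named here (remote_amo_write, send_ipi_to_nasid, nofault_pio_read)
 * are hypothetical stand-ins for the real sn2 primitives.
 *
 *      static int sketch_ipi_send(u64 *remote_amo, int nasid)
 *      {
 *              remote_amo_write(remote_amo);   // says where to pull from
 *              send_ipi_to_nasid(nasid);       // may outrun the AMO write!
 *
 *              // A nofault PIO read is the only reliable way to learn
 *              // that the target partition is down, since remote AMO
 *              // operations do not reliably time out.
 *              if (nofault_pio_read(remote_amo) != 0)
 *                      return -ENXIO;  // stop IPI'ng this partition
 *              return 0;
 *      }
 *
 * The receive side must tolerate the IPI arriving before the AMO word
 * is visible, which is why the heartbeat checker periodically rescans.
 */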


#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <asm/sn/intr.h>
#include <asm/sn/sn_sal.h>
#include <asm/uaccess.h>
#include "xpc.h"


/* define two XPC debug device structures to be used with dev_dbg() et al */

struct device_driver xpc_dbg_name = {
        .name = "xpc"
};

struct device xpc_part_dbg_subname = {
        .bus_id = {0},          /* set to "part" at xpc_init() time */
        .driver = &xpc_dbg_name
};

struct device xpc_chan_dbg_subname = {
        .bus_id = {0},          /* set to "chan" at xpc_init() time */
        .driver = &xpc_dbg_name
};

struct device *xpc_part = &xpc_part_dbg_subname;
struct device *xpc_chan = &xpc_chan_dbg_subname;


/* systune related variables for /proc/sys directories */

static int xpc_hb_min = 1;
static int xpc_hb_max = 10;

static int xpc_hb_check_min = 10;
static int xpc_hb_check_max = 120;

static ctl_table xpc_sys_xpc_hb_dir[] = {
        {
                1,
                "hb_interval",
                &xpc_hb_interval,
                sizeof(int),
                0644,
                NULL,
                &proc_dointvec_minmax,
                &sysctl_intvec,
                NULL,
                &xpc_hb_min, &xpc_hb_max
        },
        {
                2,
                "hb_check_interval",
                &xpc_hb_check_interval,
                sizeof(int),
                0644,
                NULL,
                &proc_dointvec_minmax,
                &sysctl_intvec,
                NULL,
                &xpc_hb_check_min, &xpc_hb_check_max
        },
        {0}
};
static ctl_table xpc_sys_xpc_dir[] = {
        {
                1,
                "hb",
                NULL,
                0,
                0555,
                xpc_sys_xpc_hb_dir
        },
        {0}
};
static ctl_table xpc_sys_dir[] = {
        {
                1,
                "xpc",
                NULL,
                0,
                0555,
                xpc_sys_xpc_dir
        },
        {0}
};
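
/*
 * Note (inferred from the table nesting above: a top-level "xpc" dir
 * containing an "hb" dir): registering xpc_sys_dir surfaces two
 * tunables under /proc/sys,
 *
 *      /proc/sys/xpc/hb/hb_interval            (1..10 seconds)
 *      /proc/sys/xpc/hb/hb_check_interval      (10..120 seconds)
 *
 * e.g. "echo 20 > /proc/sys/xpc/hb/hb_check_interval"; values outside
 * the min/max bounds are refused by proc_dointvec_minmax().
 */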
static struct ctl_table_header *xpc_sysctl;


/* #of IRQs received */
static atomic_t xpc_act_IRQ_rcvd;

/* IRQ handler notifies this wait queue on receipt of an IRQ */
static DECLARE_WAIT_QUEUE_HEAD(xpc_act_IRQ_wq);

static unsigned long xpc_hb_check_timeout;

/* xpc_hb_checker thread exited notification */
static DECLARE_MUTEX_LOCKED(xpc_hb_checker_exited);

/* xpc_discovery thread exited notification */
static DECLARE_MUTEX_LOCKED(xpc_discovery_exited);


static struct timer_list xpc_hb_timer;


static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);


/*
 * Notify the heartbeat check thread that an IRQ has been received.
 */
static irqreturn_t
xpc_act_IRQ_handler(int irq, void *dev_id, struct pt_regs *regs)
{
        atomic_inc(&xpc_act_IRQ_rcvd);
        wake_up_interruptible(&xpc_act_IRQ_wq);
        return IRQ_HANDLED;
}


/*
 * Timer to produce the heartbeat. The timer structure's function member
 * is already set when this is initially called. A tunable is used to
 * specify when the next timeout should occur.
 */
static void
xpc_hb_beater(unsigned long dummy)
{
        xpc_vars->heartbeat++;

        if (jiffies >= xpc_hb_check_timeout) {
                wake_up_interruptible(&xpc_act_IRQ_wq);
        }

        xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
        add_timer(&xpc_hb_timer);
}


/*
 * This thread is responsible for nearly all of the partition
 * activation/deactivation.
 */
static int
xpc_hb_checker(void *ignore)
{
        int last_IRQ_count = 0;
        int new_IRQ_count;
        int force_IRQ = 0;


        /* this thread was marked active by xpc_hb_init() */

        daemonize(XPC_HB_CHECK_THREAD_NAME);

        set_cpus_allowed(current, cpumask_of_cpu(XPC_HB_CHECK_CPU));

        xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);

        while (!(volatile int) xpc_exiting) {

                /* wait for IRQ or timeout */
                (void) wait_event_interruptible(xpc_act_IRQ_wq,
                        (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) ||
                                jiffies >= xpc_hb_check_timeout ||
                                        (volatile int) xpc_exiting));

                dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
                        "been received\n",
                        (int) (xpc_hb_check_timeout - jiffies),
                        atomic_read(&xpc_act_IRQ_rcvd) - last_IRQ_count);


                /* checking of remote heartbeats is skewed by IRQ handling */
                if (jiffies >= xpc_hb_check_timeout) {
                        dev_dbg(xpc_part, "checking remote heartbeats\n");
                        xpc_check_remote_hb();

                        /*
                         * We need to periodically recheck to ensure no
                         * IPI/AMO pairs have been missed. That check
                         * must always reset xpc_hb_check_timeout.
                         */
                        force_IRQ = 1;
                }


                new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd);
                if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) {
                        force_IRQ = 0;

                        dev_dbg(xpc_part, "found an IRQ to process; will be "
                                "resetting xpc_hb_check_timeout\n");

                        last_IRQ_count += xpc_identify_act_IRQ_sender();
                        if (last_IRQ_count < new_IRQ_count) {
                                /* retry once to help avoid missing AMO */
                                (void) xpc_identify_act_IRQ_sender();
                        }
                        last_IRQ_count = new_IRQ_count;

                        xpc_hb_check_timeout = jiffies +
                                        (xpc_hb_check_interval * HZ);
                }
        }

        dev_dbg(xpc_part, "heartbeat checker is exiting\n");


        /* mark this thread as inactive */
        up(&xpc_hb_checker_exited);
        return 0;
}


/*
 * This thread will attempt to discover other partitions to activate
 * based on info provided by SAL. This new thread is short-lived and
 * will exit once discovery is complete.
 */
static int
xpc_initiate_discovery(void *ignore)
{
        daemonize(XPC_DISCOVERY_THREAD_NAME);

        xpc_discovery();

        dev_dbg(xpc_part, "discovery thread is exiting\n");

        /* mark this thread as inactive */
        up(&xpc_discovery_exited);
        return 0;
}


/*
 * Establish first contact with the remote partition. This involves pulling
 * the XPC per partition variables from the remote partition and waiting for
 * the remote partition to pull ours.
 */
static enum xpc_retval
xpc_make_first_contact(struct xpc_partition *part)
{
        enum xpc_retval ret;


        while ((ret = xpc_pull_remote_vars_part(part)) != xpcSuccess) {
                if (ret != xpcRetry) {
                        XPC_DEACTIVATE_PARTITION(part, ret);
                        return ret;
                }

                dev_dbg(xpc_chan, "waiting to make first contact with "
                        "partition %d\n", XPC_PARTID(part));

                /* wait a 1/4 second or so */
                msleep_interruptible(250);

                if (part->act_state == XPC_P_DEACTIVATING) {
                        return part->reason;
                }
        }

        return xpc_mark_partition_active(part);
}


/*
 * The first kthread assigned to a newly activated partition is the one
 * created by XPC HB with which it calls xpc_partition_up(). XPC hangs on to
 * that kthread until the partition is brought down, at which time that kthread
 * returns back to XPC HB. (The return of that kthread will signify to XPC HB
 * that XPC has dismantled all communication infrastructure for the associated
 * partition.) This kthread becomes the channel manager for that partition.
 *
 * Each active partition has a channel manager, which, besides connecting and
 * disconnecting channels, will ensure that each of the partition's connected
 * channels has the required number of assigned kthreads to get the work done.
 */
static void
xpc_channel_mgr(struct xpc_partition *part)
{
        while (part->act_state != XPC_P_DEACTIVATING ||
                        atomic_read(&part->nchannels_active) > 0) {

                xpc_process_channel_activity(part);


                /*
                 * Wait until we've been requested to activate kthreads or
                 * all of the channel's message queues have been torn down or
                 * a signal is pending.
                 *
                 * The channel_mgr_requests count is set to 1 after the
                 * channel mgr is awakened. This is done to prevent the
                 * channel mgr from making one pass through the loop for
                 * each request, since it will be servicing all the requests
                 * in one pass. The reason it's set to 1 instead of 0 is so
                 * that other kthreads will know that the channel mgr is
                 * running and won't bother trying to wake it up.
                 */
                atomic_dec(&part->channel_mgr_requests);
                (void) wait_event_interruptible(part->channel_mgr_wq,
                        (atomic_read(&part->channel_mgr_requests) > 0 ||
                        (volatile u64) part->local_IPI_amo != 0 ||
                        ((volatile u8) part->act_state ==
                                                XPC_P_DEACTIVATING &&
                        atomic_read(&part->nchannels_active) == 0)));
                atomic_set(&part->channel_mgr_requests, 1);

                // >>> Does it need to wake up periodically as well? In case we
                // >>> miscalculated the #of kthreads to wake up or create?
        }
}
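

/*
 * Illustrative sketch only (hypothetical helper, not in this file): a
 * waker of the channel manager would pair with the request counter
 * above roughly like this, relying on the "set to 1 while running"
 * convention so that redundant wake_up() calls are skipped.
 *
 *      static void sketch_wake_channel_mgr(struct xpc_partition *part)
 *      {
 *              // A result > 1 means a request was already pending and
 *              // the channel mgr is known to be running; no wakeup is
 *              // needed, the pending count alone keeps it looping.
 *              if (atomic_inc_return(&part->channel_mgr_requests) == 1)
 *                      wake_up(&part->channel_mgr_wq);
 *      }
 */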


/*
 * When XPC HB determines that a partition has come up, it will create a new
 * kthread and that kthread will call this function to attempt to set up the
 * basic infrastructure used for Cross Partition Communication with the newly
 * upped partition.
 *
 * The kthread that was created by XPC HB and which set up the XPC
 * infrastructure will remain assigned to the partition until the partition
 * goes down, at which time the kthread will tear down the XPC infrastructure
 * and then exit.
 *
 * XPC HB will put the remote partition's XPC per partition specific variables
 * physical address into xpc_partitions[partid].remote_vars_part_pa prior to
 * calling xpc_partition_up().
 */
static void
xpc_partition_up(struct xpc_partition *part)
{
        DBUG_ON(part->channels != NULL);

        dev_dbg(xpc_chan, "activating partition %d\n", XPC_PARTID(part));

        if (xpc_setup_infrastructure(part) != xpcSuccess) {
                return;
        }

        /*
         * The kthread that XPC HB called us with will become the
         * channel manager for this partition. It will not return
         * back to XPC HB until the partition's XPC infrastructure
         * has been dismantled.
         */

        (void) xpc_part_ref(part);      /* this will always succeed */

        if (xpc_make_first_contact(part) == xpcSuccess) {
                xpc_channel_mgr(part);
        }

        xpc_part_deref(part);

        xpc_teardown_infrastructure(part);
}

static int
xpc_activating(void *__partid)
{
        partid_t partid = (u64) __partid;
        struct xpc_partition *part = &xpc_partitions[partid];
        unsigned long irq_flags;
        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
        int ret;


        DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);

        spin_lock_irqsave(&part->act_lock, irq_flags);

        if (part->act_state == XPC_P_DEACTIVATING) {
                part->act_state = XPC_P_INACTIVE;
                spin_unlock_irqrestore(&part->act_lock, irq_flags);
                part->remote_rp_pa = 0;
                return 0;
        }

        /* indicate the thread is activating */
        DBUG_ON(part->act_state != XPC_P_ACTIVATION_REQ);
        part->act_state = XPC_P_ACTIVATING;

        XPC_SET_REASON(part, 0, 0);
        spin_unlock_irqrestore(&part->act_lock, irq_flags);

        dev_dbg(xpc_part, "bringing partition %d up\n", partid);

        daemonize("xpc%02d", partid);

        /*
         * This thread needs to run at a realtime priority to prevent a
         * significant performance degradation.
         */
        ret = sched_setscheduler(current, SCHED_FIFO, &param);
        if (ret != 0) {
                dev_warn(xpc_part, "unable to set pid %d to a realtime "
                        "priority, ret=%d\n", current->pid, ret);
        }

        /* allow this thread and its children to run on any CPU */
        set_cpus_allowed(current, CPU_MASK_ALL);

        /*
         * Register the remote partition's AMOs with SAL so it can handle
         * and cleanup errors within that address range should the remote
         * partition go down. We don't unregister this range because it is
         * difficult to tell when outstanding writes to the remote partition
         * are finished and thus when it is safe to unregister. This should
         * not result in wasted space in the SAL xp_addr_region table because
         * we should get the same page for remote_amos_page_pa after module
         * reloads and system reboots.
         */
        if (sn_register_xp_addr_region(part->remote_amos_page_pa,
                                                        PAGE_SIZE, 1) < 0) {
                dev_warn(xpc_part, "xpc_partition_up(%d) failed to register "
                        "xp_addr region\n", partid);

                spin_lock_irqsave(&part->act_lock, irq_flags);
                part->act_state = XPC_P_INACTIVE;
                XPC_SET_REASON(part, xpcPhysAddrRegFailed, __LINE__);
                spin_unlock_irqrestore(&part->act_lock, irq_flags);
                part->remote_rp_pa = 0;
                return 0;
        }

        XPC_ALLOW_HB(partid, xpc_vars);
        xpc_IPI_send_activated(part);


        /*
         * xpc_partition_up() holds this thread and marks this partition as
         * XPC_P_ACTIVE by calling xpc_hb_mark_active().
         */
        (void) xpc_partition_up(part);

        xpc_mark_partition_inactive(part);

        if (part->reason == xpcReactivating) {
                /* interrupting ourselves results in activating partition */
                xpc_IPI_send_reactivate(part);
        }

        return 0;
}


void
xpc_activate_partition(struct xpc_partition *part)
{
        partid_t partid = XPC_PARTID(part);
        unsigned long irq_flags;
        pid_t pid;


        spin_lock_irqsave(&part->act_lock, irq_flags);

        pid = kernel_thread(xpc_activating, (void *) ((u64) partid), 0);

        DBUG_ON(part->act_state != XPC_P_INACTIVE);

        if (pid > 0) {
                part->act_state = XPC_P_ACTIVATION_REQ;
                XPC_SET_REASON(part, xpcCloneKThread, __LINE__);
        } else {
                XPC_SET_REASON(part, xpcCloneKThreadFailed, __LINE__);
        }

        spin_unlock_irqrestore(&part->act_lock, irq_flags);
}


/*
 * Handle the receipt of a SGI_XPC_NOTIFY IRQ by seeing whether the specified
 * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more
 * than one partition, we use an AMO_t structure per partition to indicate
 * whether a partition has sent an IPI or not.  >>> If it has, then wake up the
 * associated kthread to handle it.
 *
 * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IPIs sent by XPC
 * running on other partitions.
 *
 * Noteworthy Arguments:
 *
 *      irq - Interrupt ReQuest number. NOT USED.
 *
 *      dev_id - partid of IPI's potential sender.
 *
 *      regs - processor's context before the processor entered
 *             interrupt code. NOT USED.
 */
irqreturn_t
xpc_notify_IRQ_handler(int irq, void *dev_id, struct pt_regs *regs)
{
        partid_t partid = (partid_t) (u64) dev_id;
        struct xpc_partition *part = &xpc_partitions[partid];


        DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);

        if (xpc_part_ref(part)) {
                xpc_check_for_channel_activity(part);

                xpc_part_deref(part);
        }
        return IRQ_HANDLED;
}


/*
 * Check to see if xpc_notify_IRQ_handler() dropped any IPIs on the floor
 * because the write to their associated IPI amo completed after the IRQ/IPI
 * was received.
 */
void
xpc_dropped_IPI_check(struct xpc_partition *part)
{
        if (xpc_part_ref(part)) {
                xpc_check_for_channel_activity(part);

                part->dropped_IPI_timer.expires = jiffies +
                                                XPC_P_DROPPED_IPI_WAIT;
                add_timer(&part->dropped_IPI_timer);
                xpc_part_deref(part);
        }
}


void
xpc_activate_kthreads(struct xpc_channel *ch, int needed)
{
        int idle = atomic_read(&ch->kthreads_idle);
        int assigned = atomic_read(&ch->kthreads_assigned);
        int wakeup;


        DBUG_ON(needed <= 0);

        if (idle > 0) {
                wakeup = (needed > idle) ? idle : needed;
                needed -= wakeup;

                dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
                        "channel=%d\n", wakeup, ch->partid, ch->number);

                /* only wakeup the requested number of kthreads */
                wake_up_nr(&ch->idle_wq, wakeup);
        }

        if (needed <= 0) {
                return;
        }

        if (needed + assigned > ch->kthreads_assigned_limit) {
                needed = ch->kthreads_assigned_limit - assigned;
                // >>> should never be less than 0
                if (needed <= 0) {
                        return;
                }
        }

        dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
                needed, ch->partid, ch->number);

        xpc_create_kthreads(ch, needed);
}


/*
 * This function is where XPC's kthreads wait for messages to deliver.
 */
static void
xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
{
        do {
                /* deliver messages to their intended recipients */

                while ((volatile s64) ch->w_local_GP.get <
                                (volatile s64) ch->w_remote_GP.put &&
                                        !((volatile u32) ch->flags &
                                                XPC_C_DISCONNECTING)) {
                        xpc_deliver_msg(ch);
                }

                if (atomic_inc_return(&ch->kthreads_idle) >
                                                ch->kthreads_idle_limit) {
                        /* too many idle kthreads on this channel */
                        atomic_dec(&ch->kthreads_idle);
                        break;
                }

                dev_dbg(xpc_chan, "idle kthread calling "
                        "wait_event_interruptible_exclusive()\n");

                (void) wait_event_interruptible_exclusive(ch->idle_wq,
                                ((volatile s64) ch->w_local_GP.get <
                                        (volatile s64) ch->w_remote_GP.put ||
                                ((volatile u32) ch->flags &
                                                XPC_C_DISCONNECTING)));

                atomic_dec(&ch->kthreads_idle);

        } while (!((volatile u32) ch->flags & XPC_C_DISCONNECTING));
}


static int
xpc_daemonize_kthread(void *args)
{
        partid_t partid = XPC_UNPACK_ARG1(args);
        u16 ch_number = XPC_UNPACK_ARG2(args);
        struct xpc_partition *part = &xpc_partitions[partid];
        struct xpc_channel *ch;
        int n_needed;


        daemonize("xpc%02dc%d", partid, ch_number);

        dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
                partid, ch_number);

        ch = &part->channels[ch_number];

        if (!(ch->flags & XPC_C_DISCONNECTING)) {
                DBUG_ON(!(ch->flags & XPC_C_CONNECTED));

                /* let the registerer know that connection was established */

                if (atomic_read(&ch->kthreads_assigned) == 1) {
                        xpc_connected_callout(ch);

                        /*
                         * It is possible that while the callout was being
                         * made that the remote partition sent some messages.
                         * If that is the case, we may need to activate
                         * additional kthreads to help deliver them. We only
                         * need one less than total #of messages to deliver.
                         */
                        n_needed = ch->w_remote_GP.put - ch->w_local_GP.get - 1;
                        if (n_needed > 0 &&
                                        !(ch->flags & XPC_C_DISCONNECTING)) {
                                xpc_activate_kthreads(ch, n_needed);
                        }
                }

                xpc_kthread_waitmsgs(part, ch);
        }

        if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
                        ((ch->flags & XPC_C_CONNECTCALLOUT) ||
                                (ch->reason != xpcUnregistering &&
                                        ch->reason != xpcOtherUnregistering))) {
                xpc_disconnected_callout(ch);
        }


        xpc_msgqueue_deref(ch);

        dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
                partid, ch_number);

        xpc_part_deref(part);
        return 0;
}


/*
 * For each partition that XPC has established communications with, there is
 * a minimum of one kernel thread assigned to perform any operation that
 * may potentially sleep or block (basically the callouts to the asynchronous
 * functions registered via xpc_connect()).
 *
 * Additional kthreads are created and destroyed by XPC as the workload
 * demands.
 *
 * A kthread is assigned to one of the active channels that exists for a given
 * partition.
 */
void
xpc_create_kthreads(struct xpc_channel *ch, int needed)
{
        unsigned long irq_flags;
        pid_t pid;
        u64 args = XPC_PACK_ARGS(ch->partid, ch->number);


        while (needed-- > 0) {
                pid = kernel_thread(xpc_daemonize_kthread, (void *) args, 0);
                if (pid < 0) {
                        /* the fork failed */

                        if (atomic_read(&ch->kthreads_assigned) <
                                                ch->kthreads_idle_limit) {
                                /*
                                 * Flag this as an error only if we have an
                                 * insufficient #of kthreads for the channel
                                 * to function.
                                 *
                                 * No xpc_msgqueue_ref() is needed here since
                                 * the channel mgr is doing this.
                                 */
                                spin_lock_irqsave(&ch->lock, irq_flags);
                                XPC_DISCONNECT_CHANNEL(ch, xpcLackOfResources,
                                                                &irq_flags);
                                spin_unlock_irqrestore(&ch->lock, irq_flags);
                        }
                        break;
                }

                /*
                 * The following is done on behalf of the newly created
                 * kthread. That kthread is responsible for doing the
                 * counterpart to the following before it exits.
                 */
                (void) xpc_part_ref(&xpc_partitions[ch->partid]);
                xpc_msgqueue_ref(ch);
                atomic_inc(&ch->kthreads_assigned);
                ch->kthreads_created++; // >>> temporary debug only!!!
        }
}
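

/*
 * Illustrative sketch only: XPC_PACK_ARGS()/XPC_UNPACK_ARG1()/
 * XPC_UNPACK_ARG2() (defined in xpc.h, not shown here) evidently fold a
 * partid and a channel number into the single pointer-sized value that
 * kernel_thread() hands to xpc_daemonize_kthread(). A hypothetical
 * encoding consistent with that usage:
 *
 *      #define SKETCH_PACK_ARGS(partid, ch_n) \
 *                      (((u64)(partid) & 0xff) | ((u64)(ch_n) << 8))
 *      #define SKETCH_UNPACK_ARG1(args)  ((u64)(args) & 0xff)
 *      #define SKETCH_UNPACK_ARG2(args)  (((u64)(args) >> 8) & 0xffff)
 *
 * The actual bit layout lives in xpc.h; only the pack/unpack round trip
 * matters to the code above.
 */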


void
xpc_disconnect_wait(int ch_number)
{
        partid_t partid;
        struct xpc_partition *part;
        struct xpc_channel *ch;


        /* now wait for all callouts to the caller's function to cease */
        for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
                part = &xpc_partitions[partid];

                if (xpc_part_ref(part)) {
                        ch = &part->channels[ch_number];

                        // >>> how do we keep from falling into the window
                        // >>> between our check and going down and coming
                        // >>> back up where sema is re-inited?
                        if (ch->flags & XPC_C_SETUP) {
                                (void) down(&ch->teardown_sema);
                        }

                        xpc_part_deref(part);
                }
        }
}


static void
xpc_do_exit(void)
{
        partid_t partid;
        int active_part_count;
        struct xpc_partition *part;


        /* now it's time to eliminate our heartbeat */
        del_timer_sync(&xpc_hb_timer);
        xpc_vars->heartbeating_to_mask = 0;

        /* indicate to others that our reserved page is uninitialized */
        xpc_rsvd_page->vars_pa = 0;

        /*
         * Ignore all incoming interrupts. Without interrupts the heartbeat
         * checker won't activate any new partitions that may come up.
         */
        free_irq(SGI_XPC_ACTIVATE, NULL);

        /*
         * Cause the heartbeat checker and the discovery threads to exit.
         * We don't want them attempting to activate new partitions as we
         * try to deactivate the existing ones.
         */
        xpc_exiting = 1;
        wake_up_interruptible(&xpc_act_IRQ_wq);

        /* wait for the heartbeat checker thread to mark itself inactive */
        down(&xpc_hb_checker_exited);

        /* wait for the discovery thread to mark itself inactive */
        down(&xpc_discovery_exited);


        msleep_interruptible(300);


        /* wait for all partitions to become inactive */

        do {
                active_part_count = 0;

                for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
                        part = &xpc_partitions[partid];
                        if (part->act_state != XPC_P_INACTIVE) {
                                active_part_count++;

                                XPC_DEACTIVATE_PARTITION(part, xpcUnloading);
                        }
                }

                if (active_part_count)
                        msleep_interruptible(300);
        } while (active_part_count > 0);


        /* close down protections for IPI operations */
        xpc_restrict_IPI_ops();


        /* clear the interface to XPC's functions */
        xpc_clear_interface();

        if (xpc_sysctl) {
                unregister_sysctl_table(xpc_sysctl);
        }
}


int __init
xpc_init(void)
{
        int ret;
        partid_t partid;
        struct xpc_partition *part;
        pid_t pid;


        if (!ia64_platform_is("sn2")) {
                return -ENODEV;
        }

        /*
         * xpc_remote_copy_buffer is used as a temporary buffer for bte_copy'ng
         * both a partition's reserved page and its XPC variables. Its size was
         * based on the size of a reserved page, so we need to ensure that the
         * XPC variables will fit as well.
         */
        if (XPC_VARS_ALIGNED_SIZE > XPC_RSVD_PAGE_ALIGNED_SIZE) {
                dev_err(xpc_part, "xpc_remote_copy_buffer is not big enough\n");
                return -EPERM;
        }
        DBUG_ON((u64) xpc_remote_copy_buffer !=
                                L1_CACHE_ALIGN((u64) xpc_remote_copy_buffer));

        snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part");
        snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan");

        xpc_sysctl = register_sysctl_table(xpc_sys_dir, 1);

        /*
         * The first few fields of each entry of xpc_partitions[] need to
         * be initialized now so that calls to xpc_connect() and
         * xpc_disconnect() can be made prior to the activation of any remote
         * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
         * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
         * PARTITION HAS BEEN ACTIVATED.
         */
        for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
                part = &xpc_partitions[partid];

                DBUG_ON((u64) part != L1_CACHE_ALIGN((u64) part));

                part->act_IRQ_rcvd = 0;
                spin_lock_init(&part->act_lock);
                part->act_state = XPC_P_INACTIVE;
                XPC_SET_REASON(part, 0, 0);
                part->setup_state = XPC_P_UNSET;
                init_waitqueue_head(&part->teardown_wq);
                atomic_set(&part->references, 0);
        }

        /*
         * Open up protections for IPI operations (and AMO operations on
         * Shub 1.1 systems).
         */
        xpc_allow_IPI_ops();

        /*
         * Interrupts being processed will increment this atomic variable and
         * awaken the heartbeat thread which will process the interrupts.
         */
        atomic_set(&xpc_act_IRQ_rcvd, 0);

        /*
         * This is safe to do before the xpc_hb_checker thread has started
         * because the handler simply wakes a wait queue. If an interrupt is
         * received before the thread is waiting, it will not go to sleep,
         * but rather immediately process the interrupt.
         */
        ret = request_irq(SGI_XPC_ACTIVATE, xpc_act_IRQ_handler, 0,
                                                        "xpc hb", NULL);
        if (ret != 0) {
                dev_err(xpc_part, "can't register ACTIVATE IRQ handler, "
                        "errno=%d\n", -ret);

                xpc_restrict_IPI_ops();

                if (xpc_sysctl) {
                        unregister_sysctl_table(xpc_sysctl);
                }
                return -EBUSY;
        }

        /*
         * Fill the partition reserved page with the information needed by
         * other partitions to discover we are alive and establish initial
         * communications.
         */
        xpc_rsvd_page = xpc_rsvd_page_init();
        if (xpc_rsvd_page == NULL) {
                dev_err(xpc_part, "could not setup our reserved page\n");

                free_irq(SGI_XPC_ACTIVATE, NULL);
                xpc_restrict_IPI_ops();

                if (xpc_sysctl) {
                        unregister_sysctl_table(xpc_sysctl);
                }
                return -EBUSY;
        }


        /*
         * Set the beating to other partitions into motion. This is
         * the last requirement for other partitions' discovery to
         * initiate communications with us.
         */
        init_timer(&xpc_hb_timer);
        xpc_hb_timer.function = xpc_hb_beater;
        xpc_hb_beater(0);


        /*
         * The real work-horse behind xpc. This processes incoming
         * interrupts and monitors remote heartbeats.
         */
        pid = kernel_thread(xpc_hb_checker, NULL, 0);
        if (pid < 0) {
                dev_err(xpc_part, "failed while forking hb check thread\n");

                /* indicate to others that our reserved page is uninitialized */
                xpc_rsvd_page->vars_pa = 0;

                del_timer_sync(&xpc_hb_timer);
                free_irq(SGI_XPC_ACTIVATE, NULL);
                xpc_restrict_IPI_ops();

                if (xpc_sysctl) {
                        unregister_sysctl_table(xpc_sysctl);
                }
                return -EBUSY;
        }


        /*
         * Start up a thread that will attempt to discover other partitions
         * to activate based on info provided by SAL. This new thread is
         * short-lived and will exit once discovery is complete.
         */
        pid = kernel_thread(xpc_initiate_discovery, NULL, 0);
        if (pid < 0) {
                dev_err(xpc_part, "failed while forking discovery thread\n");

                /* mark this new thread as a non-starter */
                up(&xpc_discovery_exited);

                xpc_do_exit();
                return -EBUSY;
        }


        /* set the interface to point at XPC's functions */
        xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
                          xpc_initiate_allocate, xpc_initiate_send,
                          xpc_initiate_send_notify, xpc_initiate_received,
                          xpc_initiate_partid_to_nasids);

        return 0;
}
module_init(xpc_init);


void __exit
xpc_exit(void)
{
        xpc_do_exit();
}
module_exit(xpc_exit);


MODULE_AUTHOR("Silicon Graphics, Inc.");
MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
MODULE_LICENSE("GPL");

module_param(xpc_hb_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
                "heartbeat increments.");

module_param(xpc_hb_check_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
                "heartbeat checks.");