/*
 * kernel/power/tuxonice_cluster.c
 *
 * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net)
 *
 * This file is released under the GPLv2.
 *
 * This file contains routines for cluster hibernation support.
 *
 * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
 *
 * How does it work?
 *
 * There is no 'master' node that tells everyone else what to do. All nodes
 * send messages to the broadcast address/port, maintain a list of peers
 * and figure out when to progress to the next step in hibernating or resuming.
 * This makes us more fault tolerant when it comes to nodes coming and going
 * (which may be more of an issue if we're hibernating when power supplies
 * are being unreliable).
 *
 * At boot time, we start a ktuxonice thread that handles communication with
 * other nodes. This node maintains a state machine that controls our progress
 * through hibernating and resuming, keeping us in step with other nodes. Nodes
 * are identified by their hw address.
 *
 * On startup, the node sends CLUSTER_PING on the configured interface's
 * broadcast address, port $toi_cluster_port (see below) and begins to listen
 * for other broadcast messages. CLUSTER_PING messages are repeated at
 * intervals of 5 minutes, with a random offset to spread traffic out.
 *
 * A hibernation cycle is initiated from any node via
 *
 *	echo > /sys/power/tuxonice/do_hibernate
 *
 * and (possibly) the hibernate script. At each step of the process, the node
 * completes its work, and waits for all other nodes to signal completion of
 * their work (or timeout) before progressing to the next step.
 *
 * Request/state	Action before reply	Possible reply	Next state
 * HIBERNATE		capable, pre-script	HIBERNATE|ACK	NODE_PREP
 *						HIBERNATE|NACK	INIT_0
 *
 * PREP			prepare_image		PREP|ACK	IMAGE_WRITE
 *
 * IO			write image		IO|ACK		power off
 *
 * (Boot time)		check for image		IMAGE|ACK	RESUME_PREP
 *
 * PREP			prepare read image	PREP|ACK	IMAGE_READ
 *						PREP|NACK	(As NACK_IMAGE)
 *
 * IO			read image		IO|ACK		POST_RESUME
 *
 * POST_RESUME		thaw, post-script			RUNNING
 *
 * Messages:
 *
 * - PING: Request for all other live nodes to send a PONG. Used at startup to
 *   announce presence, when a node is suspected dead and periodically, in case
 *   segments of the network are [un]plugged.
 *
 * - PONG: Response to a PING.
 *
 * - ABORT: Request to cancel writing an image.
 *
 * - BYE: Notification that this node is shutting down.
 *
 * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that
 * nodes which are slower to start up can get state synchronised. If a node
 * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send
 * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it
 * must invalidate its image (if any) and boot normally.
 *
 * Note 2: May occur when one node lost power or powered off while others
 * hibernated. This node waits for others to complete resuming (ACK_READ)
 * before completing its boot, so that it appears as a failed node restarting.
 *
 * If any node has an image, then it also has a list of nodes that hibernated
 * in synchronisation with it. The node will wait for other nodes to appear
 * or timeout before beginning its restoration.
 *
 * If a node has no image, it needs to wait, in case other nodes which do have
 * an image are going to resume, but are taking longer to announce their
 * presence. For this reason, the user can specify a timeout value and a number
 * of nodes detected before we just continue. (We might want to assume in a
 * cluster of, say, 15 nodes, if 8 others have booted without finding an image,
 * the remaining nodes will too. This might help in situations where some nodes
 * are much slower to boot, or more subject to hardware failures or such like).
 */
#include <linux/suspend.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/if.h>
#include <linux/rtnetlink.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/in.h>
#include <linux/if_arp.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/netdevice.h>

#include "tuxonice.h"
#include "tuxonice_modules.h"
#include "tuxonice_sysfs.h"
#include "tuxonice_alloc.h"
#include "tuxonice_io.h"
/*
 * Debug output. The two definitions below conflicted (an unconditional
 * redefinition of a function-like macro is a constraint violation); restore
 * the compile-time switch. Flip the #if to 1 for verbose tracing.
 */
#if 0
#define PRINTK(a, b...) do { printk(a, ##b); } while (0)
#else
#define PRINTK(a, b...) do { } while (0)
#endif
/* Non-zero when testing against the loopback device (multiple local nodes). */
static int loopback_mode;
/* Number of node state machines running on this host (8 in loopback mode). */
static int num_local_nodes = 1;
#define MAX_LOCAL_NODES 8
/* Source address of the current packet: fake id in loopback, else IP saddr. */
#define SADDR (loopback_mode ? b->sid : h->saddr)

#define MYNAME "TuxOnIce Clustering"
/*
 * Cluster protocol messages. The low two bits are reply flags (see
 * MSG_ACK_MASK below); the remaining bits identify the request/state.
 * NOTE(review): member values reconstructed from mask usage elsewhere in
 * this file — confirm against the original TuxOnIce source.
 */
enum cluster_message {
	MSG_ACK = 1,
	MSG_NACK = 2,
	MSG_PING = 4,
	MSG_ABORT = 8,
	MSG_BYE = 16,
	MSG_HIBERNATE = 32,
	MSG_IMAGE = 64,
	MSG_IO = 128,
	MSG_RUNNING = 256
};
143 static char *str_message(int message
)
151 return "Abort acked";
153 return "Abort nacked";
161 return "Hibernate request";
163 return "Hibernate ack";
165 return "Hibernate nack";
167 return "Image exists?";
169 return "Image does exist";
171 return "No image here";
181 printk(KERN_ERR
"Unrecognised message %d.\n", message
);
182 return "Unrecognised message (see dmesg)";
/* Reply flags live in the low bits; everything else is the state/request. */
#define MSG_ACK_MASK (MSG_ACK | MSG_NACK)
#define MSG_STATE_MASK (~MSG_ACK_MASK)
190 struct list_head member_list
;
191 wait_queue_head_t member_events
;
192 spinlock_t member_list_lock
;
193 spinlock_t receive_lock
;
194 int peer_count
, ignored_peer_count
;
195 struct toi_sysfs_data sysfs_data
;
196 enum cluster_message current_message
;
199 struct node_info node_array
[MAX_LOCAL_NODES
];
201 struct cluster_member
{
203 enum cluster_message message
;
204 struct list_head list
;
208 #define toi_cluster_port_send 3501
209 #define toi_cluster_port_recv 3502
211 static struct net_device
*net_dev
;
212 static struct toi_module_ops toi_cluster_ops
;
214 static int toi_recv(struct sk_buff
*skb
, struct net_device
*dev
,
215 struct packet_type
*pt
, struct net_device
*orig_dev
);
217 static struct packet_type toi_cluster_packet_type
= {
218 .type
= __constant_htons(ETH_P_IP
),
222 struct toi_pkt
{ /* BOOTP packet format */
223 struct iphdr iph
; /* IP header */
224 struct udphdr udph
; /* UDP header */
225 u8 htype
; /* HW address type */
226 u8 hlen
; /* HW address length */
227 __be32 xid
; /* Transaction ID */
228 __be16 secs
; /* Seconds since we started */
229 __be16 flags
; /* Just what it says */
230 u8 hw_addr
[16]; /* Sender's HW address */
231 u16 message
; /* Message */
232 unsigned long sid
; /* Source ID for loopback testing */
235 static char toi_cluster_iface
[IFNAMSIZ
] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE
;
237 static int added_pack
;
239 static int others_have_image
;
241 /* Key used to allow multiple clusters on the same lan */
242 static char toi_cluster_key
[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY
;
243 static char pre_hibernate_script
[255] = CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE
;
244 static char post_hibernate_script
[255] = CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE
;
246 /* List of cluster members */
247 static unsigned long continue_delay
= 5 * HZ
;
248 static unsigned long cluster_message_timeout
= 3 * HZ
;
250 /* === Membership list === */
252 static void print_member_info(int index
)
254 struct cluster_member
*this;
256 printk(KERN_INFO
"==> Dumping node %d.\n", index
);
258 list_for_each_entry(this, &node_array
[index
].member_list
, list
)
259 printk(KERN_INFO
"%d.%d.%d.%d last message %s. %s\n",
261 str_message(this->message
), this->ignore
? "(Ignored)" : "");
262 printk(KERN_INFO
"== Done ==\n");
265 static struct cluster_member
*__find_member(int index
, __be32 addr
)
267 struct cluster_member
*this;
269 list_for_each_entry(this, &node_array
[index
].member_list
, list
) {
270 if (this->addr
!= addr
)
279 static void set_ignore(int index
, __be32 addr
, struct cluster_member
*this)
282 PRINTK("Node %d already ignoring %d.%d.%d.%d.\n", index
, NIPQUAD(addr
));
286 PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n", index
, NIPQUAD(addr
));
288 node_array
[index
].ignored_peer_count
++;
291 static int __add_update_member(int index
, __be32 addr
, int message
)
293 struct cluster_member
*this;
295 this = __find_member(index
, addr
);
297 if (this->message
!= message
) {
298 this->message
= message
;
299 if ((message
& MSG_NACK
) &&
300 (message
& (MSG_HIBERNATE
| MSG_IMAGE
| MSG_IO
)))
301 set_ignore(index
, addr
, this);
302 PRINTK("Node %d sees node %d.%d.%d.%d now sending "
303 "%s.\n", index
, NIPQUAD(addr
), str_message(message
));
304 wake_up(&node_array
[index
].member_events
);
309 this = (struct cluster_member
*)toi_kzalloc(36, sizeof(struct cluster_member
), GFP_KERNEL
);
315 this->message
= message
;
317 INIT_LIST_HEAD(&this->list
);
319 node_array
[index
].peer_count
++;
321 PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index
,
322 NIPQUAD(addr
), str_message(message
));
324 if ((message
& MSG_NACK
) && (message
& (MSG_HIBERNATE
| MSG_IMAGE
| MSG_IO
)))
325 set_ignore(index
, addr
, this);
326 list_add_tail(&this->list
, &node_array
[index
].member_list
);
330 static int add_update_member(int index
, __be32 addr
, int message
)
334 spin_lock_irqsave(&node_array
[index
].member_list_lock
, flags
);
335 result
= __add_update_member(index
, addr
, message
);
336 spin_unlock_irqrestore(&node_array
[index
].member_list_lock
, flags
);
338 print_member_info(index
);
340 wake_up(&node_array
[index
].member_events
);
345 static void del_member(int index
, __be32 addr
)
347 struct cluster_member
*this;
350 spin_lock_irqsave(&node_array
[index
].member_list_lock
, flags
);
351 this = __find_member(index
, addr
);
354 list_del_init(&this->list
);
355 toi_kfree(36, this, sizeof(*this));
356 node_array
[index
].peer_count
--;
359 spin_unlock_irqrestore(&node_array
[index
].member_list_lock
, flags
);
/* === Message transmission === */

static void toi_send_if(int message, unsigned long my_id);
367 * Process received TOI packet.
369 static int toi_recv(struct sk_buff
*skb
, struct net_device
*dev
,
370 struct packet_type
*pt
, struct net_device
*orig_dev
)
374 int len
, result
, index
;
375 unsigned long addr
, message
, ack
;
377 /* Perform verifications before taking the lock. */
378 if (skb
->pkt_type
== PACKET_OTHERHOST
)
384 skb
= skb_share_check(skb
, GFP_ATOMIC
);
388 if (!pskb_may_pull(skb
, sizeof(struct iphdr
) + sizeof(struct udphdr
)))
391 b
= (struct toi_pkt
*)skb_network_header(skb
);
394 if (h
->ihl
!= 5 || h
->version
!= 4 || h
->protocol
!= IPPROTO_UDP
)
397 /* Fragments are not supported */
398 if (h
->frag_off
& htons(IP_OFFSET
| IP_MF
)) {
400 printk(KERN_ERR
"TuxOnIce: Ignoring fragmented " "cluster message.\n");
404 if (skb
->len
< ntohs(h
->tot_len
))
407 if (ip_fast_csum((char *)h
, h
->ihl
))
410 if (b
->udph
.source
!= htons(toi_cluster_port_send
) ||
411 b
->udph
.dest
!= htons(toi_cluster_port_recv
))
414 if (ntohs(h
->tot_len
) < ntohs(b
->udph
.len
) + sizeof(struct iphdr
))
417 len
= ntohs(b
->udph
.len
) - sizeof(struct udphdr
);
419 /* Ok the front looks good, make sure we can get at the rest. */
420 if (!pskb_may_pull(skb
, skb
->len
))
423 b
= (struct toi_pkt
*)skb_network_header(skb
);
427 PRINTK(">>> Message %s received from " NIPQUAD_FMT
".\n",
428 str_message(b
->message
), NIPQUAD(addr
));
430 message
= b
->message
& MSG_STATE_MASK
;
431 ack
= b
->message
& MSG_ACK_MASK
;
433 for (index
= 0; index
< num_local_nodes
; index
++) {
434 int new_message
= node_array
[index
].current_message
, old_message
= new_message
;
436 if (index
== SADDR
|| !old_message
) {
437 PRINTK("Ignoring node %d (offline or self).\n", index
);
441 /* One message at a time, please. */
442 spin_lock(&node_array
[index
].receive_lock
);
444 result
= add_update_member(index
, SADDR
, b
->message
);
446 printk(KERN_INFO
"Failed to add new cluster member "
447 NIPQUAD_FMT
".\n", NIPQUAD(addr
));
451 switch (b
->message
& MSG_STATE_MASK
) {
459 /* Can I hibernate? */
460 new_message
= MSG_HIBERNATE
| ((index
& 1) ? MSG_NACK
: MSG_ACK
);
464 new_message
= MSG_IMAGE
| ((index
& 1) ? MSG_NACK
: MSG_ACK
);
465 if (new_message
!= old_message
)
466 printk(KERN_ERR
"Setting whether I can resume "
467 "to %d.\n", new_message
);
470 new_message
= MSG_IO
| MSG_ACK
;
476 printk(KERN_ERR
"Unrecognised TuxOnIce cluster"
477 " message %d from " NIPQUAD_FMT
".\n",
478 b
->message
, NIPQUAD(addr
));
481 if (old_message
!= new_message
) {
482 node_array
[index
].current_message
= new_message
;
483 printk(KERN_INFO
">>> Sending new message for node " "%d.\n", index
);
484 toi_send_if(new_message
, index
);
486 printk(KERN_INFO
">>> Resending message for node %d.\n", index
);
487 toi_send_if(new_message
, index
);
490 spin_unlock(&node_array
[index
].receive_lock
);
494 /* Throw the packet out. */
501 * Send cluster message to single interface.
503 static void toi_send_if(int message
, unsigned long my_id
)
507 int hh_len
= LL_RESERVED_SPACE(net_dev
);
510 /* Allocate packet */
511 skb
= alloc_skb(sizeof(struct toi_pkt
) + hh_len
+ 15, GFP_KERNEL
);
514 skb_reserve(skb
, hh_len
);
515 b
= (struct toi_pkt
*)skb_put(skb
, sizeof(struct toi_pkt
));
516 memset(b
, 0, sizeof(struct toi_pkt
));
518 /* Construct IP header */
519 skb_reset_network_header(skb
);
523 h
->tot_len
= htons(sizeof(struct toi_pkt
));
524 h
->frag_off
= htons(IP_DF
);
526 h
->protocol
= IPPROTO_UDP
;
527 h
->daddr
= htonl(INADDR_BROADCAST
);
528 h
->check
= ip_fast_csum((unsigned char *)h
, h
->ihl
);
530 /* Construct UDP header */
531 b
->udph
.source
= htons(toi_cluster_port_send
);
532 b
->udph
.dest
= htons(toi_cluster_port_recv
);
533 b
->udph
.len
= htons(sizeof(struct toi_pkt
) - sizeof(struct iphdr
));
534 /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
536 /* Construct message */
537 b
->message
= message
;
539 b
->htype
= net_dev
->type
; /* can cause undefined behavior */
540 b
->hlen
= net_dev
->addr_len
;
541 memcpy(b
->hw_addr
, net_dev
->dev_addr
, net_dev
->addr_len
);
542 b
->secs
= htons(3); /* 3 seconds */
544 /* Chain packet down the line... */
546 skb
->protocol
= htons(ETH_P_IP
);
547 if ((dev_hard_header(skb
, net_dev
, ntohs(skb
->protocol
),
548 net_dev
->broadcast
, net_dev
->dev_addr
, skb
->len
) < 0) ||
549 dev_queue_xmit(skb
) < 0)
550 printk(KERN_INFO
"E");
553 /* ========================================= */
557 static atomic_t num_cluster_threads
;
558 static DECLARE_WAIT_QUEUE_HEAD(clusterd_events
);
560 static int kTOICluster(void *data
)
564 my_id
= atomic_add_return(1, &num_cluster_threads
) - 1;
565 node_array
[my_id
].current_message
= (unsigned long)data
;
567 PRINTK("kTOICluster daemon %lu starting.\n", my_id
);
569 current
->flags
|= PF_NOFREEZE
;
571 while (node_array
[my_id
].current_message
) {
572 toi_send_if(node_array
[my_id
].current_message
, my_id
);
573 sleep_on_timeout(&clusterd_events
, cluster_message_timeout
);
574 PRINTK("Link state %lu is %d.\n", my_id
, node_array
[my_id
].current_message
);
577 toi_send_if(MSG_BYE
, my_id
);
578 atomic_dec(&num_cluster_threads
);
579 wake_up(&clusterd_events
);
581 PRINTK("kTOICluster daemon %lu exiting.\n", my_id
);
582 __set_current_state(TASK_RUNNING
);
586 static void kill_clusterd(void)
590 for (i
= 0; i
< num_local_nodes
; i
++) {
591 if (node_array
[i
].current_message
) {
592 PRINTK("Seeking to kill clusterd %d.\n", i
);
593 node_array
[i
].current_message
= 0;
596 wait_event(clusterd_events
, !atomic_read(&num_cluster_threads
));
597 PRINTK("All cluster daemons have exited.\n");
600 static int peers_not_in_message(int index
, int message
, int precise
)
602 struct cluster_member
*this;
606 spin_lock_irqsave(&node_array
[index
].member_list_lock
, flags
);
607 list_for_each_entry(this, &node_array
[index
].member_list
, list
) {
611 PRINTK("Peer %d.%d.%d.%d sending %s. "
613 NIPQUAD(this->addr
), str_message(this->message
), str_message(message
));
614 if ((precise
? this->message
: this->message
& MSG_STATE_MASK
) != message
)
617 spin_unlock_irqrestore(&node_array
[index
].member_list_lock
, flags
);
618 PRINTK("%d peers in sought message.\n", result
);
622 static void reset_ignored(int index
)
624 struct cluster_member
*this;
627 spin_lock_irqsave(&node_array
[index
].member_list_lock
, flags
);
628 list_for_each_entry(this, &node_array
[index
].member_list
, list
)
630 node_array
[index
].ignored_peer_count
= 0;
631 spin_unlock_irqrestore(&node_array
[index
].member_list_lock
, flags
);
634 static int peers_in_message(int index
, int message
, int precise
)
636 return node_array
[index
].peer_count
-
637 node_array
[index
].ignored_peer_count
- peers_not_in_message(index
, message
, precise
);
640 static int time_to_continue(int index
, unsigned long start
, int message
)
642 int first
= peers_not_in_message(index
, message
, 0);
643 int second
= peers_in_message(index
, message
, 1);
645 PRINTK("First part returns %d, second returns %d.\n", first
, second
);
647 if (!first
&& !second
) {
648 PRINTK("All peers answered message %d.\n", message
);
652 if (time_after(jiffies
, start
+ continue_delay
)) {
653 PRINTK("Timeout reached.\n");
657 PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies
, start
+ continue_delay
);
661 void toi_initiate_cluster_hibernate(void)
666 result
= do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE
);
670 toi_send_if(MSG_HIBERNATE
, 0);
673 wait_event(node_array
[0].member_events
, time_to_continue(0, start
, MSG_HIBERNATE
));
675 if (test_action_state(TOI_FREEZER_TEST
)) {
676 toi_send_if(MSG_ABORT
, 0);
679 wait_event(node_array
[0].member_events
, time_to_continue(0, start
, MSG_RUNNING
));
681 do_toi_step(STEP_QUIET_CLEANUP
);
685 toi_send_if(MSG_IO
, 0);
687 result
= do_toi_step(STEP_HIBERNATE_SAVE_IMAGE
);
691 /* This code runs at resume time too! */
692 if (toi_in_hibernate
)
693 result
= do_toi_step(STEP_HIBERNATE_POWERDOWN
);
695 EXPORT_SYMBOL_GPL(toi_initiate_cluster_hibernate
);
697 /* toi_cluster_print_debug_stats
699 * Description: Print information to be recorded for debugging purposes into a
701 * Arguments: buffer: Pointer to a buffer into which the debug info will be
703 * size: Size of the buffer.
704 * Returns: Number of characters written to the buffer.
706 static int toi_cluster_print_debug_stats(char *buffer
, int size
)
710 if (strlen(toi_cluster_iface
))
711 len
= scnprintf(buffer
, size
, "- Cluster interface is '%s'.\n", toi_cluster_iface
);
713 len
= scnprintf(buffer
, size
, "- Cluster support is disabled.\n");
/* cluster_memory_needed
 *
 * Description:	Tell the caller how much memory we need to operate during
 *		hibernate/resume.
 * Returns:	Unsigned long. Maximum number of bytes of memory required for
 *		operation. (This module needs none.)
 */
static int toi_cluster_memory_needed(void)
{
	return 0;
}
729 static int toi_cluster_storage_needed(void)
731 return 1 + strlen(toi_cluster_iface
);
734 /* toi_cluster_save_config_info
736 * Description: Save informaton needed when reloading the image at resume time.
737 * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE.
738 * Returns: Number of bytes used for saving our data.
740 static int toi_cluster_save_config_info(char *buffer
)
742 strcpy(buffer
, toi_cluster_iface
);
743 return strlen(toi_cluster_iface
+ 1);
746 /* toi_cluster_load_config_info
748 * Description: Reload information needed for declustering the image at
750 * Arguments: Buffer: Pointer to the start of the data.
751 * Size: Number of bytes that were saved.
753 static void toi_cluster_load_config_info(char *buffer
, int size
)
755 strncpy(toi_cluster_iface
, buffer
, size
);
759 static void cluster_startup(void)
761 int have_image
= do_check_can_resume(), i
;
762 unsigned long start
= jiffies
, initial_message
;
763 struct task_struct
*p
;
765 initial_message
= MSG_IMAGE
;
769 for (i
= 0; i
< num_local_nodes
; i
++) {
770 PRINTK("Starting ktoiclusterd %d.\n", i
);
771 p
= kthread_create(kTOICluster
, (void *)initial_message
, "ktoiclusterd/%d", i
);
773 printk(KERN_ERR
"Failed to start ktoiclusterd.\n");
780 /* Wait for delay or someone else sending first message */
781 wait_event(node_array
[0].member_events
, time_to_continue(0, start
, MSG_IMAGE
));
783 others_have_image
= peers_in_message(0, MSG_IMAGE
| MSG_ACK
, 1);
785 printk(KERN_INFO
"Continuing. I %shave an image. Peers with image:"
786 " %d.\n", have_image
? "" : "don't ", others_have_image
);
791 /* Start to resume */
792 printk(KERN_INFO
" === Starting to resume === \n");
793 node_array
[0].current_message
= MSG_IO
;
794 toi_send_if(MSG_IO
, 0);
796 /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
801 * Atomic restore - we'll come back in the hibernation
805 /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
808 /* do_toi_step(STEP_QUIET_CLEANUP); */
811 node_array
[0].current_message
|= MSG_NACK
;
813 /* For debugging - disable for real life? */
814 wait_event(node_array
[0].member_events
, time_to_continue(0, start
, MSG_IO
));
817 if (others_have_image
) {
818 /* Wait for them to resume */
819 printk(KERN_INFO
"Waiting for other nodes to resume.\n");
821 wait_event(node_array
[0].member_events
, time_to_continue(0, start
, MSG_RUNNING
));
822 if (peers_not_in_message(0, MSG_RUNNING
, 0))
823 printk(KERN_INFO
"Timed out while waiting for other " "nodes to resume.\n");
826 /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
829 * If we don't have an image:
830 * - Wait until someone else says they have one, or conditions are met
831 * for continuing to boot (n machines or t seconds).
832 * - If anyone has an image, wait for them to resume before continuing
835 * If we have an image:
836 * - Wait until conditions are met before continuing to resume (n
837 * machines or t seconds). Send RESUME_PREP and freeze processes.
838 * NACK_PREP if freezing fails (shouldn't) and follow logic for
839 * us having no image above. On success, wait for [N]ACK_PREP from
840 * other machines. Read image (including atomic restore) until done.
841 * Wait for ACK_READ from others (should never fail). Thaw processes
842 * and do post-resume. (The section after the atomic restore is done
843 * via the code for hibernating).
846 node_array
[0].current_message
= MSG_RUNNING
;
849 /* toi_cluster_open_iface
851 * Description: Prepare to use an interface.
854 static int toi_cluster_open_iface(void)
856 struct net_device
*dev
;
860 for_each_netdev(&init_net
, dev
) {
861 if (/* dev == &init_net.loopback_dev || */
862 strcmp(dev
->name
, toi_cluster_iface
))
872 printk(KERN_ERR MYNAME
": Device %s not found.\n", toi_cluster_iface
);
876 dev_add_pack(&toi_cluster_packet_type
);
879 loopback_mode
= (net_dev
== init_net
.loopback_dev
);
880 num_local_nodes
= loopback_mode
? 8 : 1;
882 PRINTK("Loopback mode is %s. Number of local nodes is %d.\n",
883 loopback_mode
? "on" : "off", num_local_nodes
);
889 /* toi_cluster_close_iface
891 * Description: Stop using an interface.
894 static int toi_cluster_close_iface(void)
898 dev_remove_pack(&toi_cluster_packet_type
);
904 static void write_side_effect(void)
906 if (toi_cluster_ops
.enabled
) {
907 toi_cluster_open_iface();
908 set_toi_state(TOI_CLUSTER_MODE
);
910 toi_cluster_close_iface();
911 clear_toi_state(TOI_CLUSTER_MODE
);
915 static void node_write_side_effect(void)
920 * data for our sysfs entries.
922 static struct toi_sysfs_data sysfs_params
[] = {
923 SYSFS_STRING("interface", SYSFS_RW
, toi_cluster_iface
, IFNAMSIZ
, 0,
925 SYSFS_INT("enabled", SYSFS_RW
, &toi_cluster_ops
.enabled
, 0, 1, 0,
927 SYSFS_STRING("cluster_name", SYSFS_RW
, toi_cluster_key
, 32, 0, NULL
),
928 SYSFS_STRING("pre-hibernate-script", SYSFS_RW
, pre_hibernate_script
,
930 SYSFS_STRING("post-hibernate-script", SYSFS_RW
, post_hibernate_script
,
932 SYSFS_UL("continue_delay", SYSFS_RW
, &continue_delay
, HZ
/ 2, 60 * HZ
,
940 static struct toi_module_ops toi_cluster_ops
= {
941 .type
= FILTER_MODULE
,
943 .directory
= "cluster",
944 .module
= THIS_MODULE
,
945 .memory_needed
= toi_cluster_memory_needed
,
946 .print_debug_info
= toi_cluster_print_debug_stats
,
947 .save_config_info
= toi_cluster_save_config_info
,
948 .load_config_info
= toi_cluster_load_config_info
,
949 .storage_needed
= toi_cluster_storage_needed
,
951 .sysfs_data
= sysfs_params
,
952 .num_sysfs_entries
= sizeof(sysfs_params
) / sizeof(struct toi_sysfs_data
),
/* ---- Registration ---- */

/* Built-in: init/exit are normal __init/__exit; as a module they stay
 * resident for module_init/module_exit. */
#ifdef MODULE
#define INIT static __init
#define EXIT static __exit
#else
#define INIT
#define EXIT
#endif
965 INIT
int toi_cluster_init(void)
967 int temp
= toi_register_module(&toi_cluster_ops
), i
;
968 struct kobject
*kobj
= toi_cluster_ops
.dir_kobj
;
970 for (i
= 0; i
< MAX_LOCAL_NODES
; i
++) {
971 node_array
[i
].current_message
= 0;
972 INIT_LIST_HEAD(&node_array
[i
].member_list
);
973 init_waitqueue_head(&node_array
[i
].member_events
);
974 spin_lock_init(&node_array
[i
].member_list_lock
);
975 spin_lock_init(&node_array
[i
].receive_lock
);
977 /* Set up sysfs entry */
978 node_array
[i
].sysfs_data
.attr
.name
= toi_kzalloc(8,
979 sizeof(node_array
[i
].sysfs_data
.
980 attr
.name
), GFP_KERNEL
);
981 sprintf((char *)node_array
[i
].sysfs_data
.attr
.name
, "node_%d", i
);
982 node_array
[i
].sysfs_data
.attr
.mode
= SYSFS_RW
;
983 node_array
[i
].sysfs_data
.type
= TOI_SYSFS_DATA_INTEGER
;
984 node_array
[i
].sysfs_data
.flags
= 0;
985 node_array
[i
].sysfs_data
.data
.integer
.variable
=
986 (int *)&node_array
[i
].current_message
;
987 node_array
[i
].sysfs_data
.data
.integer
.minimum
= 0;
988 node_array
[i
].sysfs_data
.data
.integer
.maximum
= INT_MAX
;
989 node_array
[i
].sysfs_data
.write_side_effect
= node_write_side_effect
;
990 toi_register_sysfs_file(kobj
, &node_array
[i
].sysfs_data
);
993 toi_cluster_ops
.enabled
= (strlen(toi_cluster_iface
) > 0);
995 if (toi_cluster_ops
.enabled
)
996 toi_cluster_open_iface();
1001 EXIT
void toi_cluster_exit(void)
1004 toi_cluster_close_iface();
1006 for (i
= 0; i
< MAX_LOCAL_NODES
; i
++)
1007 toi_unregister_sysfs_file(toi_cluster_ops
.dir_kobj
, &node_array
[i
].sysfs_data
);
1008 toi_unregister_module(&toi_cluster_ops
);
1011 static int __init
toi_cluster_iface_setup(char *iface
)
1013 toi_cluster_ops
.enabled
= (*iface
&& strcmp(iface
, "off"));
1015 if (toi_cluster_ops
.enabled
)
1016 strncpy(toi_cluster_iface
, iface
, strlen(iface
));
1019 __setup("toi_cluster=", toi_cluster_iface_setup
);
1022 MODULE_LICENSE("GPL");
1023 module_init(toi_cluster_init
);
1024 module_exit(toi_cluster_exit
);
1025 MODULE_AUTHOR("Nigel Cunningham");
1026 MODULE_DESCRIPTION("Cluster Support for TuxOnIce");