/*
 * kernel/power/tuxonice_cluster.c
 *
 * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net)
 *
 * This file is released under the GPLv2.
 *
 * This file contains routines for cluster hibernation support.
 *
 * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
 *
 * How does it work?
 *
 * There is no 'master' node that tells everyone else what to do. All nodes
 * send messages to the broadcast address/port, maintain a list of peers
 * and figure out when to progress to the next step in hibernating or resuming.
 * This makes us more fault tolerant when it comes to nodes coming and going
 * (which may be more of an issue if we're hibernating when power supplies
 * are being unreliable).
 *
 * At boot time, we start a ktuxonice thread that handles communication with
 * other nodes. This node maintains a state machine that controls our progress
 * through hibernating and resuming, keeping us in step with other nodes. Nodes
 * are identified by their hw address.
 *
 * On startup, the node sends CLUSTER_PING on the configured interface's
 * broadcast address, port $toi_cluster_port (see below) and begins to listen
 * for other broadcast messages. CLUSTER_PING messages are repeated at
 * intervals of 5 minutes, with a random offset to spread traffic out.
 *
 * A hibernation cycle is initiated from any node via
 *
 *	echo > /sys/power/tuxonice/do_hibernate
 *
 * and (possibly) the hibernate script. At each step of the process, the node
 * completes its work, and waits for all other nodes to signal completion of
 * their work (or timeout) before progressing to the next step.
 *
 * Request/state	Action before reply	Possible reply	Next state
 * HIBERNATE		capable, pre-script	HIBERNATE|ACK	NODE_PREP
 *						HIBERNATE|NACK	INIT_0
 *
 * PREP			prepare_image		PREP|ACK	IMAGE_WRITE
 *
 * IO			write image		IO|ACK		power off
 *
 * (Boot time)		check for image		IMAGE|ACK	RESUME_PREP
 *
 * PREP			prepare read image	PREP|ACK	IMAGE_READ
 *						PREP|NACK	(As NACK_IMAGE)
 *
 * IO			read image		IO|ACK		POST_RESUME
 *
 * POST_RESUME		thaw, post-script			RUNNING
 *
 * Messages:
 *
 * - PING: Request for all other live nodes to send a PONG. Used at startup to
 *   announce presence, when a node is suspected dead and periodically, in case
 *   segments of the network are [un]plugged.
 *
 * - PONG: Response to a PING.
 *
 * - ABORT: Request to cancel writing an image.
 *
 * - BYE: Notification that this node is shutting down.
 *
 * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that
 * nodes which are slower to start up can get state synchronised. If a node
 * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send
 * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it
 * must invalidate its image (if any) and boot normally.
 *
 * Note 2: May occur when one node lost power or powered off while others
 * hibernated. This node waits for others to complete resuming (ACK_READ)
 * before completing its boot, so that it appears as a failed node restarting.
 *
 * If any node has an image, then it also has a list of nodes that hibernated
 * in synchronisation with it. The node will wait for other nodes to appear
 * or timeout before beginning its restoration.
 *
 * If a node has no image, it needs to wait, in case other nodes which do have
 * an image are going to resume, but are taking longer to announce their
 * presence. For this reason, the user can specify a timeout value and a number
 * of nodes detected before we just continue. (We might want to assume in a
 * cluster of, say, 15 nodes, if 8 others have booted without finding an image,
 * the remaining nodes will too. This might help in situations where some nodes
 * are much slower to boot, or more subject to hardware failures or such like).
 */
#include <linux/suspend.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/if.h>
#include <linux/rtnetlink.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/in.h>
#include <linux/if_arp.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/netdevice.h>

#include "tuxonice.h"
#include "tuxonice_modules.h"
#include "tuxonice_sysfs.h"
#include "tuxonice_alloc.h"
#include "tuxonice_io.h"
/*
 * Debug output. The two definitions below conflicted (an unconditional
 * redefinition of a function-like macro is a constraint violation); restore
 * the compile-time switch. Flip the #if to 1 for verbose tracing.
 */
#if 0
#define PRINTK(a, b...) do { printk(a, ##b); } while (0)
#else
#define PRINTK(a, b...) do { } while (0)
#endif
/* Non-zero when testing against the loopback device (multiple local nodes). */
static int loopback_mode;
/* Number of node state machines running on this host (8 in loopback mode). */
static int num_local_nodes = 1;
#define MAX_LOCAL_NODES 8
/* Source address of the current packet: fake id in loopback, else IP saddr. */
#define SADDR (loopback_mode ? b->sid : h->saddr)

#define MYNAME "TuxOnIce Clustering"
/*
 * Cluster protocol messages. The low two bits are reply flags (see
 * MSG_ACK_MASK below); the remaining bits identify the request/state.
 * NOTE(review): member values reconstructed from mask usage elsewhere in
 * this file — confirm against the original TuxOnIce source.
 */
enum cluster_message {
	MSG_ACK = 1,
	MSG_NACK = 2,
	MSG_PING = 4,
	MSG_ABORT = 8,
	MSG_BYE = 16,
	MSG_HIBERNATE = 32,
	MSG_IMAGE = 64,
	MSG_IO = 128,
	MSG_RUNNING = 256
};
143 static char *str_message(int message
)
151 return "Abort acked";
153 return "Abort nacked";
161 return "Hibernate request";
163 return "Hibernate ack";
165 return "Hibernate nack";
167 return "Image exists?";
169 return "Image does exist";
171 return "No image here";
181 printk(KERN_ERR
"Unrecognised message %d.\n", message
);
182 return "Unrecognised message (see dmesg)";
/* Reply flags live in the low bits; everything else is the state/request. */
#define MSG_ACK_MASK (MSG_ACK | MSG_NACK)
#define MSG_STATE_MASK (~MSG_ACK_MASK)
190 struct list_head member_list
;
191 wait_queue_head_t member_events
;
192 spinlock_t member_list_lock
;
193 spinlock_t receive_lock
;
194 int peer_count
, ignored_peer_count
;
195 struct toi_sysfs_data sysfs_data
;
196 enum cluster_message current_message
;
199 struct node_info node_array
[MAX_LOCAL_NODES
];
201 struct cluster_member
{
203 enum cluster_message message
;
204 struct list_head list
;
208 #define toi_cluster_port_send 3501
209 #define toi_cluster_port_recv 3502
211 static struct net_device
*net_dev
;
212 static struct toi_module_ops toi_cluster_ops
;
214 static int toi_recv(struct sk_buff
*skb
, struct net_device
*dev
,
215 struct packet_type
*pt
, struct net_device
*orig_dev
);
217 static struct packet_type toi_cluster_packet_type
= {
218 .type
= __constant_htons(ETH_P_IP
),
222 struct toi_pkt
{ /* BOOTP packet format */
223 struct iphdr iph
; /* IP header */
224 struct udphdr udph
; /* UDP header */
225 u8 htype
; /* HW address type */
226 u8 hlen
; /* HW address length */
227 __be32 xid
; /* Transaction ID */
228 __be16 secs
; /* Seconds since we started */
229 __be16 flags
; /* Just what it says */
230 u8 hw_addr
[16]; /* Sender's HW address */
231 u16 message
; /* Message */
232 unsigned long sid
; /* Source ID for loopback testing */
235 static char toi_cluster_iface
[IFNAMSIZ
] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE
;
237 static int added_pack
;
239 static int others_have_image
;
241 /* Key used to allow multiple clusters on the same lan */
242 static char toi_cluster_key
[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY
;
243 static char pre_hibernate_script
[255] = CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE
;
244 static char post_hibernate_script
[255] = CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE
;
246 /* List of cluster members */
247 static unsigned long continue_delay
= 5 * HZ
;
248 static unsigned long cluster_message_timeout
= 3 * HZ
;
250 /* === Membership list === */
252 static void print_member_info(int index
)
254 struct cluster_member
*this;
256 printk(KERN_INFO
"==> Dumping node %d.\n", index
);
258 list_for_each_entry(this, &node_array
[index
].member_list
, list
)
259 printk(KERN_INFO
"%d.%d.%d.%d last message %s. %s\n",
261 str_message(this->message
), this->ignore
? "(Ignored)" : "");
262 printk(KERN_INFO
"== Done ==\n");
265 static struct cluster_member
*__find_member(int index
, __be32 addr
)
267 struct cluster_member
*this;
269 list_for_each_entry(this, &node_array
[index
].member_list
, list
) {
270 if (this->addr
!= addr
)
279 static void set_ignore(int index
, __be32 addr
, struct cluster_member
*this)
282 PRINTK("Node %d already ignoring %d.%d.%d.%d.\n", index
, NIPQUAD(addr
));
286 PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n", index
, NIPQUAD(addr
));
288 node_array
[index
].ignored_peer_count
++;
291 static int __add_update_member(int index
, __be32 addr
, int message
)
293 struct cluster_member
*this;
295 this = __find_member(index
, addr
);
297 if (this->message
!= message
) {
298 this->message
= message
;
299 if ((message
& MSG_NACK
) &&
300 (message
& (MSG_HIBERNATE
| MSG_IMAGE
| MSG_IO
)))
301 set_ignore(index
, addr
, this);
302 PRINTK("Node %d sees node %d.%d.%d.%d now sending "
303 "%s.\n", index
, NIPQUAD(addr
), str_message(message
));
304 wake_up(&node_array
[index
].member_events
);
309 this = (struct cluster_member
*)toi_kzalloc(36, sizeof(struct cluster_member
), GFP_KERNEL
);
315 this->message
= message
;
317 INIT_LIST_HEAD(&this->list
);
319 node_array
[index
].peer_count
++;
321 PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index
,
322 NIPQUAD(addr
), str_message(message
));
324 if ((message
& MSG_NACK
) && (message
& (MSG_HIBERNATE
| MSG_IMAGE
| MSG_IO
)))
325 set_ignore(index
, addr
, this);
326 list_add_tail(&this->list
, &node_array
[index
].member_list
);
330 static int add_update_member(int index
, __be32 addr
, int message
)
334 spin_lock_irqsave(&node_array
[index
].member_list_lock
, flags
);
335 result
= __add_update_member(index
, addr
, message
);
336 spin_unlock_irqrestore(&node_array
[index
].member_list_lock
, flags
);
338 print_member_info(index
);
340 wake_up(&node_array
[index
].member_events
);
345 static void del_member(int index
, __be32 addr
)
347 struct cluster_member
*this;
350 spin_lock_irqsave(&node_array
[index
].member_list_lock
, flags
);
351 this = __find_member(index
, addr
);
354 list_del_init(&this->list
);
355 toi_kfree(36, this, sizeof(*this));
356 node_array
[index
].peer_count
--;
359 spin_unlock_irqrestore(&node_array
[index
].member_list_lock
, flags
);
/* === Message transmission === */

static void toi_send_if(int message, unsigned long my_id);
367 * Process received TOI packet.
369 static int toi_recv(struct sk_buff
*skb
, struct net_device
*dev
,
370 struct packet_type
*pt
, struct net_device
*orig_dev
)
374 int len
, result
, index
;
375 unsigned long addr
, message
, ack
;
377 /* Perform verifications before taking the lock. */
378 if (skb
->pkt_type
== PACKET_OTHERHOST
)
384 skb
= skb_share_check(skb
, GFP_ATOMIC
);
388 if (!pskb_may_pull(skb
, sizeof(struct iphdr
) + sizeof(struct udphdr
)))
391 b
= (struct toi_pkt
*)skb_network_header(skb
);
394 if (h
->ihl
!= 5 || h
->version
!= 4 || h
->protocol
!= IPPROTO_UDP
)
397 /* Fragments are not supported */
398 if (h
->frag_off
& htons(IP_OFFSET
| IP_MF
)) {
400 printk(KERN_ERR
"TuxOnIce: Ignoring fragmented " "cluster message.\n");
404 if (skb
->len
< ntohs(h
->tot_len
))
407 if (ip_fast_csum((char *)h
, h
->ihl
))
410 if (b
->udph
.source
!= htons(toi_cluster_port_send
) ||
411 b
->udph
.dest
!= htons(toi_cluster_port_recv
))
414 if (ntohs(h
->tot_len
) < ntohs(b
->udph
.len
) + sizeof(struct iphdr
))
417 len
= ntohs(b
->udph
.len
) - sizeof(struct udphdr
);
419 /* Ok the front looks good, make sure we can get at the rest. */
420 if (!pskb_may_pull(skb
, skb
->len
))
423 b
= (struct toi_pkt
*)skb_network_header(skb
);
427 PRINTK(">>> Message %s received from " NIPQUAD_FMT
".\n",
428 str_message(b
->message
), NIPQUAD(addr
));
430 message
= b
->message
& MSG_STATE_MASK
;
431 ack
= b
->message
& MSG_ACK_MASK
;
433 for (index
= 0; index
< num_local_nodes
; index
++) {
434 int new_message
= node_array
[index
].current_message
, old_message
= new_message
;
436 if (index
== SADDR
|| !old_message
) {
437 PRINTK("Ignoring node %d (offline or self).\n", index
);
441 /* One message at a time, please. */
442 spin_lock(&node_array
[index
].receive_lock
);
444 result
= add_update_member(index
, SADDR
, b
->message
);
446 printk(KERN_INFO
"Failed to add new cluster member "
447 NIPQUAD_FMT
".\n", NIPQUAD(addr
));
451 switch (b
->message
& MSG_STATE_MASK
) {
459 /* Can I hibernate? */
460 new_message
= MSG_HIBERNATE
| ((index
& 1) ? MSG_NACK
: MSG_ACK
);
464 new_message
= MSG_IMAGE
| ((index
& 1) ? MSG_NACK
: MSG_ACK
);
465 if (new_message
!= old_message
)
466 printk(KERN_ERR
"Setting whether I can resume "
467 "to %d.\n", new_message
);
470 new_message
= MSG_IO
| MSG_ACK
;
476 printk(KERN_ERR
"Unrecognised TuxOnIce cluster"
477 " message %d from " NIPQUAD_FMT
".\n",
478 b
->message
, NIPQUAD(addr
));
481 if (old_message
!= new_message
) {
482 node_array
[index
].current_message
= new_message
;
483 printk(KERN_INFO
">>> Sending new message for node " "%d.\n", index
);
484 toi_send_if(new_message
, index
);
486 printk(KERN_INFO
">>> Resending message for node %d.\n", index
);
487 toi_send_if(new_message
, index
);
490 spin_unlock(&node_array
[index
].receive_lock
);
494 /* Throw the packet out. */
501 * Send cluster message to single interface.
503 static void toi_send_if(int message
, unsigned long my_id
)
507 int hh_len
= LL_RESERVED_SPACE(net_dev
);
510 /* Allocate packet */
511 skb
= alloc_skb(sizeof(struct toi_pkt
) + hh_len
+ 15, GFP_KERNEL
);
514 skb_reserve(skb
, hh_len
);
515 b
= (struct toi_pkt
*)skb_put(skb
, sizeof(struct toi_pkt
));
516 memset(b
, 0, sizeof(struct toi_pkt
));
518 /* Construct IP header */
519 skb_reset_network_header(skb
);
523 h
->tot_len
= htons(sizeof(struct toi_pkt
));
524 h
->frag_off
= htons(IP_DF
);
526 h
->protocol
= IPPROTO_UDP
;
527 h
->daddr
= htonl(INADDR_BROADCAST
);
528 h
->check
= ip_fast_csum((unsigned char *)h
, h
->ihl
);
530 /* Construct UDP header */
531 b
->udph
.source
= htons(toi_cluster_port_send
);
532 b
->udph
.dest
= htons(toi_cluster_port_recv
);
533 b
->udph
.len
= htons(sizeof(struct toi_pkt
) - sizeof(struct iphdr
));
534 /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
536 /* Construct message */
537 b
->message
= message
;
539 b
->htype
= net_dev
->type
; /* can cause undefined behavior */
540 b
->hlen
= net_dev
->addr_len
;
541 memcpy(b
->hw_addr
, net_dev
->dev_addr
, net_dev
->addr_len
);
542 b
->secs
= htons(3); /* 3 seconds */
544 /* Chain packet down the line... */
546 skb
->protocol
= htons(ETH_P_IP
);
547 if ((dev_hard_header(skb
, net_dev
, ntohs(skb
->protocol
),
548 net_dev
->broadcast
, net_dev
->dev_addr
, skb
->len
) < 0) ||
549 dev_queue_xmit(skb
) < 0)
550 printk(KERN_INFO
"E");
553 /* ========================================= */
557 static atomic_t num_cluster_threads
;
558 static DECLARE_WAIT_QUEUE_HEAD(clusterd_events
);
560 static int kTOICluster(void *data
)
564 my_id
= atomic_add_return(1, &num_cluster_threads
) - 1;
565 node_array
[my_id
].current_message
= (unsigned long)data
;
567 PRINTK("kTOICluster daemon %lu starting.\n", my_id
);
569 current
->flags
|= PF_NOFREEZE
;
571 while (node_array
[my_id
].current_message
) {
572 toi_send_if(node_array
[my_id
].current_message
, my_id
);
573 sleep_on_timeout(&clusterd_events
, cluster_message_timeout
);
574 PRINTK("Link state %lu is %d.\n", my_id
, node_array
[my_id
].current_message
);
577 toi_send_if(MSG_BYE
, my_id
);
578 atomic_dec(&num_cluster_threads
);
579 wake_up(&clusterd_events
);
581 PRINTK("kTOICluster daemon %lu exiting.\n", my_id
);
582 __set_current_state(TASK_RUNNING
);
586 static void kill_clusterd(void)
590 for (i
= 0; i
< num_local_nodes
; i
++) {
591 if (node_array
[i
].current_message
) {
592 PRINTK("Seeking to kill clusterd %d.\n", i
);
593 node_array
[i
].current_message
= 0;
596 wait_event(clusterd_events
, !atomic_read(&num_cluster_threads
));
597 PRINTK("All cluster daemons have exited.\n");
600 static int peers_not_in_message(int index
, int message
, int precise
)
602 struct cluster_member
*this;
606 spin_lock_irqsave(&node_array
[index
].member_list_lock
, flags
);
607 list_for_each_entry(this, &node_array
[index
].member_list
, list
) {
611 PRINTK("Peer %d.%d.%d.%d sending %s. "
613 NIPQUAD(this->addr
), str_message(this->message
), str_message(message
));
614 if ((precise
? this->message
: this->message
& MSG_STATE_MASK
) != message
)
617 spin_unlock_irqrestore(&node_array
[index
].member_list_lock
, flags
);
618 PRINTK("%d peers in sought message.\n", result
);
622 static void reset_ignored(int index
)
624 struct cluster_member
*this;
627 spin_lock_irqsave(&node_array
[index
].member_list_lock
, flags
);
628 list_for_each_entry(this, &node_array
[index
].member_list
, list
)
630 node_array
[index
].ignored_peer_count
= 0;
631 spin_unlock_irqrestore(&node_array
[index
].member_list_lock
, flags
);
634 static int peers_in_message(int index
, int message
, int precise
)
636 return node_array
[index
].peer_count
-
637 node_array
[index
].ignored_peer_count
- peers_not_in_message(index
, message
, precise
);
640 static int time_to_continue(int index
, unsigned long start
, int message
)
642 int first
= peers_not_in_message(index
, message
, 0);
643 int second
= peers_in_message(index
, message
, 1);
645 PRINTK("First part returns %d, second returns %d.\n", first
, second
);
647 if (!first
&& !second
) {
648 PRINTK("All peers answered message %d.\n", message
);
652 if (time_after(jiffies
, start
+ continue_delay
)) {
653 PRINTK("Timeout reached.\n");
657 PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies
, start
+ continue_delay
);
661 void toi_initiate_cluster_hibernate(void)
666 result
= do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE
);
670 toi_send_if(MSG_HIBERNATE
, 0);
673 wait_event(node_array
[0].member_events
, time_to_continue(0, start
, MSG_HIBERNATE
));
675 if (test_action_state(TOI_FREEZER_TEST
)) {
676 toi_send_if(MSG_ABORT
, 0);
679 wait_event(node_array
[0].member_events
, time_to_continue(0, start
, MSG_RUNNING
));
681 do_toi_step(STEP_QUIET_CLEANUP
);
685 toi_send_if(MSG_IO
, 0);
687 result
= do_toi_step(STEP_HIBERNATE_SAVE_IMAGE
);
691 /* This code runs at resume time too! */
692 if (toi_in_hibernate
)
693 result
= do_toi_step(STEP_HIBERNATE_POWERDOWN
);
695 EXPORT_SYMBOL_GPL(toi_initiate_cluster_hibernate
);
697 /* toi_cluster_print_debug_stats
699 * Description: Print information to be recorded for debugging purposes into a
701 * Arguments: buffer: Pointer to a buffer into which the debug info will be
703 * size: Size of the buffer.
704 * Returns: Number of characters written to the buffer.
706 static int toi_cluster_print_debug_stats(char *buffer
, int size
)
710 if (strlen(toi_cluster_iface
))
711 len
= scnprintf(buffer
, size
, "- Cluster interface is '%s'.\n", toi_cluster_iface
);
713 len
= scnprintf(buffer
, size
, "- Cluster support is disabled.\n");
/* cluster_memory_needed
 *
 * Description:	Tell the caller how much memory we need to operate during
 *		hibernate/resume.
 * Returns:	Unsigned long. Maximum number of bytes of memory required for
 *		operation. (This module needs none.)
 */
static int toi_cluster_memory_needed(void)
{
	return 0;
}
729 static int toi_cluster_storage_needed(void)
731 return 1 + strlen(toi_cluster_iface
);
734 /* toi_cluster_save_config_info
736 * Description: Save informaton needed when reloading the image at resume time.
737 * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE.
738 * Returns: Number of bytes used for saving our data.
740 static int toi_cluster_save_config_info(char *buffer
)
742 strcpy(buffer
, toi_cluster_iface
);
743 return strlen(toi_cluster_iface
+ 1);
746 /* toi_cluster_load_config_info
748 * Description: Reload information needed for declustering the image at
750 * Arguments: Buffer: Pointer to the start of the data.
751 * Size: Number of bytes that were saved.
753 static void toi_cluster_load_config_info(char *buffer
, int size
)
755 strncpy(toi_cluster_iface
, buffer
, size
);
759 static void cluster_startup(void)
761 int have_image
= do_check_can_resume(), i
;
762 unsigned long start
= jiffies
, initial_message
;
763 struct task_struct
*p
;
765 initial_message
= MSG_IMAGE
;
769 for (i
= 0; i
< num_local_nodes
; i
++) {
770 PRINTK("Starting ktoiclusterd %d.\n", i
);
771 p
= kthread_create(kTOICluster
, (void *)initial_message
, "ktoiclusterd/%d", i
);
773 printk(KERN_ERR
"Failed to start ktoiclusterd.\n");
780 /* Wait for delay or someone else sending first message */
781 wait_event(node_array
[0].member_events
, time_to_continue(0, start
, MSG_IMAGE
));
783 others_have_image
= peers_in_message(0, MSG_IMAGE
| MSG_ACK
, 1);
785 printk(KERN_INFO
"Continuing. I %shave an image. Peers with image:"
786 " %d.\n", have_image
? "" : "don't ", others_have_image
);
791 /* Start to resume */
792 printk(KERN_INFO
" === Starting to resume === \n");
793 node_array
[0].current_message
= MSG_IO
;
794 toi_send_if(MSG_IO
, 0);
796 /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
801 * Atomic restore - we'll come back in the hibernation
805 /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
808 /* do_toi_step(STEP_QUIET_CLEANUP); */
811 node_array
[0].current_message
|= MSG_NACK
;
813 /* For debugging - disable for real life? */
814 wait_event(node_array
[0].member_events
, time_to_continue(0, start
, MSG_IO
));
817 if (others_have_image
) {
818 /* Wait for them to resume */
819 printk(KERN_INFO
"Waiting for other nodes to resume.\n");
821 wait_event(node_array
[0].member_events
, time_to_continue(0, start
, MSG_RUNNING
));
822 if (peers_not_in_message(0, MSG_RUNNING
, 0))
823 printk(KERN_INFO
"Timed out while waiting for other " "nodes to resume.\n");
826 /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
829 * If we don't have an image:
830 * - Wait until someone else says they have one, or conditions are met
831 * for continuing to boot (n machines or t seconds).
832 * - If anyone has an image, wait for them to resume before continuing
835 * If we have an image:
836 * - Wait until conditions are met before continuing to resume (n
837 * machines or t seconds). Send RESUME_PREP and freeze processes.
838 * NACK_PREP if freezing fails (shouldn't) and follow logic for
839 * us having no image above. On success, wait for [N]ACK_PREP from
840 * other machines. Read image (including atomic restore) until done.
841 * Wait for ACK_READ from others (should never fail). Thaw processes
842 * and do post-resume. (The section after the atomic restore is done
843 * via the code for hibernating).
846 node_array
[0].current_message
= MSG_RUNNING
;
849 /* toi_cluster_open_iface
851 * Description: Prepare to use an interface.
854 static int toi_cluster_open_iface(void)
856 struct net_device
*dev
;
860 for_each_netdev(&init_net
, dev
) {
861 if (/* dev == &init_net.loopback_dev || */
862 strcmp(dev
->name
, toi_cluster_iface
))
872 printk(KERN_ERR MYNAME
": Device %s not found.\n", toi_cluster_iface
);
876 dev_add_pack(&toi_cluster_packet_type
);
879 loopback_mode
= (net_dev
== init_net
.loopback_dev
);
880 num_local_nodes
= loopback_mode
? 8 : 1;
882 PRINTK("Loopback mode is %s. Number of local nodes is %d.\n",
883 loopback_mode
? "on" : "off", num_local_nodes
);
889 /* toi_cluster_close_iface
891 * Description: Stop using an interface.
894 static int toi_cluster_close_iface(void)
898 dev_remove_pack(&toi_cluster_packet_type
);
904 static void write_side_effect(void)
906 if (toi_cluster_ops
.enabled
) {
907 toi_cluster_open_iface();
908 set_toi_state(TOI_CLUSTER_MODE
);
910 toi_cluster_close_iface();
911 clear_toi_state(TOI_CLUSTER_MODE
);
915 static void node_write_side_effect(void)
920 * data for our sysfs entries.
922 static struct toi_sysfs_data sysfs_params
[] = {
923 SYSFS_STRING("interface", SYSFS_RW
, toi_cluster_iface
, IFNAMSIZ
, 0,
925 SYSFS_INT("enabled", SYSFS_RW
, &toi_cluster_ops
.enabled
, 0, 1, 0,
927 SYSFS_STRING("cluster_name", SYSFS_RW
, toi_cluster_key
, 32, 0, NULL
),
928 SYSFS_STRING("pre-hibernate-script", SYSFS_RW
, pre_hibernate_script
,
930 SYSFS_STRING("post-hibernate-script", SYSFS_RW
, post_hibernate_script
,
932 SYSFS_UL("continue_delay", SYSFS_RW
, &continue_delay
, HZ
/ 2, 60 * HZ
,
940 static struct toi_module_ops toi_cluster_ops
= {
941 .type
= FILTER_MODULE
,
943 .directory
= "cluster",
944 .module
= THIS_MODULE
,
945 .memory_needed
= toi_cluster_memory_needed
,
946 .print_debug_info
= toi_cluster_print_debug_stats
,
947 .save_config_info
= toi_cluster_save_config_info
,
948 .load_config_info
= toi_cluster_load_config_info
,
949 .storage_needed
= toi_cluster_storage_needed
,
951 .sysfs_data
= sysfs_params
,
952 .num_sysfs_entries
= sizeof(sysfs_params
) / sizeof(struct toi_sysfs_data
),
/* ---- Registration ---- */

/* Built-in: init/exit are normal __init/__exit; as a module they stay
 * resident for module_init/module_exit. */
#ifdef MODULE
#define INIT static __init
#define EXIT static __exit
#else
#define INIT
#define EXIT
#endif
965 INIT
int toi_cluster_init(void)
967 int temp
= toi_register_module(&toi_cluster_ops
), i
;
968 struct kobject
*kobj
= toi_cluster_ops
.dir_kobj
;
970 for (i
= 0; i
< MAX_LOCAL_NODES
; i
++) {
971 node_array
[i
].current_message
= 0;
972 INIT_LIST_HEAD(&node_array
[i
].member_list
);
973 init_waitqueue_head(&node_array
[i
].member_events
);
974 spin_lock_init(&node_array
[i
].member_list_lock
);
975 spin_lock_init(&node_array
[i
].receive_lock
);
977 /* Set up sysfs entry */
978 node_array
[i
].sysfs_data
.attr
.name
= toi_kzalloc(8,
979 sizeof(node_array
[i
].sysfs_data
.
980 attr
.name
), GFP_KERNEL
);
981 sprintf((char *)node_array
[i
].sysfs_data
.attr
.name
, "node_%d", i
);
982 node_array
[i
].sysfs_data
.attr
.mode
= SYSFS_RW
;
983 node_array
[i
].sysfs_data
.type
= TOI_SYSFS_DATA_INTEGER
;
984 node_array
[i
].sysfs_data
.flags
= 0;
985 node_array
[i
].sysfs_data
.data
.integer
.variable
=
986 (int *)&node_array
[i
].current_message
;
987 node_array
[i
].sysfs_data
.data
.integer
.minimum
= 0;
988 node_array
[i
].sysfs_data
.data
.integer
.maximum
= INT_MAX
;
989 node_array
[i
].sysfs_data
.write_side_effect
= node_write_side_effect
;
990 toi_register_sysfs_file(kobj
, &node_array
[i
].sysfs_data
);
993 toi_cluster_ops
.enabled
= (strlen(toi_cluster_iface
) > 0);
995 if (toi_cluster_ops
.enabled
)
996 toi_cluster_open_iface();
1001 EXIT
void toi_cluster_exit(void)
1004 toi_cluster_close_iface();
1006 for (i
= 0; i
< MAX_LOCAL_NODES
; i
++)
1007 toi_unregister_sysfs_file(toi_cluster_ops
.dir_kobj
, &node_array
[i
].sysfs_data
);
1008 toi_unregister_module(&toi_cluster_ops
);
1011 static int __init
toi_cluster_iface_setup(char *iface
)
1013 toi_cluster_ops
.enabled
= (*iface
&& strcmp(iface
, "off"));
1015 if (toi_cluster_ops
.enabled
)
1016 strncpy(toi_cluster_iface
, iface
, strlen(iface
));
1019 __setup("toi_cluster=", toi_cluster_iface_setup
);
1022 MODULE_LICENSE("GPL");
1023 module_init(toi_cluster_init
);
1024 module_exit(toi_cluster_exit
);
1025 MODULE_AUTHOR("Nigel Cunningham");
1026 MODULE_DESCRIPTION("Cluster Support for TuxOnIce");