bonding: add some slack to arp monitoring time limits
authorJiri Bohac <jbohac@suse.cz>
Thu, 30 Aug 2012 12:02:47 +0000 (12:02 +0000)
committerDavid S. Miller <davem@davemloft.net>
Fri, 31 Aug 2012 20:37:12 +0000 (16:37 -0400)
Currently, all the time limits in the bonding ARP monitor are in
multiples of arp_interval -- the time interval at which the ARP
monitor is periodically scheduled.

With a fast network round-trip and a little scheduling latency
of the ARP monitor work, a limit of n*delta_in_ticks may
effectively mean (n-1)*delta_in_ticks.

This is fatal in case of n==1  (the link will stay down
forever) and makes the behaviour non-deterministic in all the
other cases.

Add a delta_in_ticks/2 time slack to all the time limits.

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/bonding/bond_main.c

index b24ce257ac7b7dccce0c9a0745865aed872e1a6f..7858c58df4a3a12c47de7c1a015b613781352c19 100644 (file)
@@ -2811,12 +2811,13 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
                                            arp_work.work);
        struct slave *slave, *oldcurrent;
        int do_failover = 0;
-       int delta_in_ticks;
+       int delta_in_ticks, extra_ticks;
        int i;
 
        read_lock(&bond->lock);
 
        delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
+       extra_ticks = delta_in_ticks / 2;
 
        if (bond->slave_cnt == 0)
                goto re_arm;
@@ -2839,10 +2840,10 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
                if (slave->link != BOND_LINK_UP) {
                        if (time_in_range(jiffies,
                                trans_start - delta_in_ticks,
-                               trans_start + delta_in_ticks) &&
+                               trans_start + delta_in_ticks + extra_ticks) &&
                            time_in_range(jiffies,
                                slave->dev->last_rx - delta_in_ticks,
-                               slave->dev->last_rx + delta_in_ticks)) {
+                               slave->dev->last_rx + delta_in_ticks + extra_ticks)) {
 
                                slave->link  = BOND_LINK_UP;
                                bond_set_active_slave(slave);
@@ -2872,10 +2873,10 @@ void bond_loadbalance_arp_mon(struct work_struct *work)
                         */
                        if (!time_in_range(jiffies,
                                trans_start - delta_in_ticks,
-                               trans_start + 2 * delta_in_ticks) ||
+                               trans_start + 2 * delta_in_ticks + extra_ticks) ||
                            !time_in_range(jiffies,
                                slave->dev->last_rx - delta_in_ticks,
-                               slave->dev->last_rx + 2 * delta_in_ticks)) {
+                               slave->dev->last_rx + 2 * delta_in_ticks + extra_ticks)) {
 
                                slave->link  = BOND_LINK_DOWN;
                                bond_set_backup_slave(slave);
@@ -2933,6 +2934,14 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
        struct slave *slave;
        int i, commit = 0;
        unsigned long trans_start;
+       int extra_ticks;
+
+       /* All the time comparisons below need some extra time. Otherwise, on
+        * fast networks the ARP probe/reply may arrive within the same jiffy
+        * as it was sent.  Then, the next time the ARP monitor is run, one
+        * arp_interval will already have passed in the comparisons.
+        */
+       extra_ticks = delta_in_ticks / 2;
 
        bond_for_each_slave(bond, slave, i) {
                slave->new_link = BOND_LINK_NOCHANGE;
@@ -2940,7 +2949,7 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
                if (slave->link != BOND_LINK_UP) {
                        if (time_in_range(jiffies,
                                slave_last_rx(bond, slave) - delta_in_ticks,
-                               slave_last_rx(bond, slave) + delta_in_ticks)) {
+                               slave_last_rx(bond, slave) + delta_in_ticks + extra_ticks)) {
 
                                slave->new_link = BOND_LINK_UP;
                                commit++;
@@ -2956,7 +2965,7 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
                 */
                if (time_in_range(jiffies,
                                  slave->jiffies - delta_in_ticks,
-                                 slave->jiffies + 2 * delta_in_ticks))
+                                 slave->jiffies + 2 * delta_in_ticks + extra_ticks))
                        continue;
 
                /*
@@ -2976,7 +2985,7 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
                    !bond->current_arp_slave &&
                    !time_in_range(jiffies,
                        slave_last_rx(bond, slave) - delta_in_ticks,
-                       slave_last_rx(bond, slave) + 3 * delta_in_ticks)) {
+                       slave_last_rx(bond, slave) + 3 * delta_in_ticks + extra_ticks)) {
 
                        slave->new_link = BOND_LINK_DOWN;
                        commit++;
@@ -2992,10 +3001,10 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
                if (bond_is_active_slave(slave) &&
                    (!time_in_range(jiffies,
                        trans_start - delta_in_ticks,
-                       trans_start + 2 * delta_in_ticks) ||
+                       trans_start + 2 * delta_in_ticks + extra_ticks) ||
                     !time_in_range(jiffies,
                        slave_last_rx(bond, slave) - delta_in_ticks,
-                       slave_last_rx(bond, slave) + 2 * delta_in_ticks))) {
+                       slave_last_rx(bond, slave) + 2 * delta_in_ticks + extra_ticks))) {
 
                        slave->new_link = BOND_LINK_DOWN;
                        commit++;
@@ -3027,7 +3036,7 @@ static void bond_ab_arp_commit(struct bonding *bond, int delta_in_ticks)
                        if ((!bond->curr_active_slave &&
                             time_in_range(jiffies,
                                           trans_start - delta_in_ticks,
-                                          trans_start + delta_in_ticks)) ||
+                                          trans_start + delta_in_ticks + delta_in_ticks / 2)) ||
                            bond->curr_active_slave != slave) {
                                slave->link = BOND_LINK_UP;
                                if (bond->current_arp_slave) {