Skip to content

Commit

Permalink
Optimize neigh restore process during warm-reboot (sonic-net#722)
Browse files Browse the repository at this point in the history
* Optimize neigh restore process
* set stale timer bigger to avoid testbed difference related timing issues
  • Loading branch information
zhenggen-xu authored and qiluo-msft committed Dec 19, 2018
1 parent 3d60b3e commit a7a68c1
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 37 deletions.
2 changes: 1 addition & 1 deletion neighsyncd/restore_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac):
dst=dst_ip,
lladdr=dmac,
ifindex=intf_idx,
state=ndmsg.states['reachable'])
state=ndmsg.states['stale'])
# If neigh exists, log it but no exception raise, other exceptions, raise
except NetlinkError as e:
if e[0] == errno.EEXIST:
Expand Down
113 changes: 77 additions & 36 deletions tests/test_warm_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,22 @@ def check_kernel_reachable_v6_neigh_num(dvs, number):
neigh_num = int(output.strip())
assert neigh_num == number

def check_kernel_stale_neigh_num(dvs, number):
(exitcode, output) = dvs.runcmd(['sh', '-c', "ip neigh show nud stale | grep -v 'dev lo' | wc -l"])
neigh_num = int(output.strip())
assert neigh_num == number

def check_kernel_stale_v4_neigh_num(dvs, number):
(exitcode, output) = dvs.runcmd(['sh', '-c', "ip -4 neigh show nud stale | grep -v 'dev lo' | wc -l"])
neigh_num = int(output.strip())
assert neigh_num == number

def check_kernel_stale_v6_neigh_num(dvs, number):
(exitcode, output) = dvs.runcmd(['sh', '-c', "ip -6 neigh show nud stale | grep -v 'dev lo' | wc -l"])
neigh_num = int(output.strip())
assert neigh_num == number


def kernel_restore_neighs_done(restoretbl):
keys = restoretbl.getKeys()
return (len(keys) > 0)
Expand Down Expand Up @@ -1510,13 +1526,52 @@ def test_routing_WarmRestart(dvs, testlog):
assert rt_key['dest'] == "192.168.100.0/24"


# macros for number of interfaces and number of neighbors
# TBD: NUM_NEIGH_PER_INTF >= 128 ips will cause test framework to hang by default kernel settings
# TBD: Need tune gc_thresh1/2/3 at host side of vs docker to support this.
NUM_INTF = 8
NUM_NEIGH_PER_INTF = 16 #128
NUM_OF_NEIGHS = (NUM_INTF*NUM_NEIGH_PER_INTF)

# 'ip neigh flush all' won't remove failed entries if number of neighs less than gc_threshold1
# Also it takes time to remove them completly.
# We use arp off/on to do it
def flush_neigh_entries(dvs):
dvs.runcmd("ip link set group default arp off")
dvs.runcmd("ip link set group default arp on")

# Add neighbor entries on servers connecting to SONiC ports
# ping them to get the neighbor entries
def setup_initial_neighbors(dvs):
for i in range(8, 8+NUM_INTF):
for j in range(NUM_NEIGH_PER_INTF):
dvs.servers[i].runcmd("ip addr add {}.0.0.{}/24 dev eth0".format(i*4, j+2))
dvs.servers[i].runcmd("ip -6 addr add {}00::{}/64 dev eth0".format(i*4,j+2))

time.sleep(1)

for i in range(8, 8+NUM_INTF):
for j in range(NUM_NEIGH_PER_INTF):
dvs.runcmd(['sh', '-c', "ping -c 1 -W 0 -q {}.0.0.{} > /dev/null 2>&1".format(i*4,j+2)])
dvs.runcmd(['sh', '-c', "ping6 -c 1 -W 0 -q {}00::{} > /dev/null 2>&1".format(i*4,j+2)])

# Del half of the ips and a new half of the ips
# note: the first ipv4 can not be deleted only
def del_and_add_neighbors(dvs):
for i in range(8, 8+NUM_INTF):
for j in range(NUM_NEIGH_PER_INTF/2):
dvs.servers[i].runcmd("ip addr del {}.0.0.{}/24 dev eth0".format(i*4, j+NUM_NEIGH_PER_INTF/2+2))
dvs.servers[i].runcmd("ip -6 addr del {}00::{}/64 dev eth0".format(i*4,j+NUM_NEIGH_PER_INTF/2+2))
dvs.servers[i].runcmd("ip addr add {}.0.0.{}/24 dev eth0".format(i*4, j+NUM_NEIGH_PER_INTF+2))
dvs.servers[i].runcmd("ip -6 addr add {}00::{}/64 dev eth0".format(i*4,j+NUM_NEIGH_PER_INTF+2))

#ping new IPs
def ping_new_ips(dvs):
for i in range(8, 8+NUM_INTF):
for j in range(NUM_NEIGH_PER_INTF/2):
dvs.runcmd(['sh', '-c', "ping -c 1 -W 0 -q {}.0.0.{} > /dev/null 2>&1".format(i*4,j+NUM_NEIGH_PER_INTF+2)])
dvs.runcmd(['sh', '-c', "ping6 -c 1 -W 0 -q {}00::{} > /dev/null 2>&1".format(i*4,j+NUM_NEIGH_PER_INTF+2)])

def test_system_warmreboot_neighbor_syncup(dvs, testlog):

appl_db = swsscommon.DBConnector(swsscommon.APPL_DB, dvs.redis_sock, 0)
Expand All @@ -1536,18 +1591,16 @@ def test_system_warmreboot_neighbor_syncup(dvs, testlog):
# Ethernet32/36/.../60, with ip: 32.0.0.1/24... 60.0.0.1/24
# ipv6: 3200::1/64...6000::1/64
# bring up the servers'interfaces and assign NUM_NEIGH_PER_INTF (e,g 128) ips per interface
# TBD: NUM_NEIGH_PER_INTF >= 128 ips will cause test framework to hang by default settings
# TBD: Need tune gc_thresh1/2/3 at host side of vs docker to support this.
NUM_INTF = 8
NUM_NEIGH_PER_INTF = 64 #128
NUM_OF_NEIGHS = (NUM_INTF*NUM_NEIGH_PER_INTF)
macs = []
for i in range(8, 8+NUM_INTF):
# set timeout to be the same as real HW
# set stale timer bigger to avoid testbed difference related timing issues.
# set ip on server facing interfaces
# bring servers' interface up, save the macs
dvs.runcmd("sysctl -w net.ipv4.neigh.Ethernet{}.base_reachable_time_ms=1800000".format(i*4))
dvs.runcmd("sysctl -w net.ipv6.neigh.Ethernet{}.base_reachable_time_ms=1800000".format(i*4))
dvs.runcmd("sysctl -w net.ipv4.neigh.Ethernet{}.gc_stale_time=180".format(i*4))
dvs.runcmd("sysctl -w net.ipv6.neigh.Ethernet{}.gc_stale_time=180".format(i*4))
dvs.runcmd("ip addr flush dev Ethernet{}".format(i*4))
dvs.runcmd("ifconfig Ethernet{} {}.0.0.1/24 up".format(i*4, i*4))
dvs.runcmd("ip -6 addr add {}00::1/64 dev Ethernet{}".format(i*4,i*4))
Expand All @@ -1559,21 +1612,8 @@ def test_system_warmreboot_neighbor_syncup(dvs, testlog):

#
# Testcase 1:
# Add neighbor entries on servers connecting to SONiC ports
# 128 ipv4 and 128 ipv6 on each server
# total: 1024 ipv4 and 1024 ipv6
# ping them to get the neighbor entries
for i in range(8, 8+NUM_INTF):
for j in range(NUM_NEIGH_PER_INTF):
dvs.servers[i].runcmd("ip addr add {}.0.0.{}/24 dev eth0".format(i*4, j+2))
dvs.servers[i].runcmd("ip -6 addr add {}00::{}/64 dev eth0".format(i*4,j+2))

time.sleep(1)

for i in range(8, 8+NUM_INTF):
for j in range(NUM_NEIGH_PER_INTF):
dvs.runcmd(['sh', '-c', "ping -c 1 -W 0 -q {}.0.0.{} > /dev/null 2>&1".format(i*4,j+2)])
dvs.runcmd(['sh', '-c', "ping6 -c 1 -W 0 -q {}00::{} > /dev/null 2>&1".format(i*4,j+2)])
# Setup initial neigbors
setup_initial_neighbors(dvs)

# Check the neighbor entries are inserted correctly
db = swsscommon.DBConnector(0, dvs.redis_sock, 0)
Expand Down Expand Up @@ -1660,15 +1700,9 @@ def test_system_warmreboot_neighbor_syncup(dvs, testlog):
# stop neighsyncd and sairedis.rec
stop_neighsyncd(dvs)
del_entry_tbl(state_db, "NEIGH_RESTORE_TABLE", "Flags")
time.sleep(3)

# Del half of the ips and a new half of the ips
# note: the first ipv4 can not be deleted only
for i in range(8, 8+NUM_INTF):
for j in range(NUM_NEIGH_PER_INTF/2):
dvs.servers[i].runcmd("ip addr del {}.0.0.{}/24 dev eth0".format(i*4, j+NUM_NEIGH_PER_INTF/2+2))
dvs.servers[i].runcmd("ip -6 addr del {}00::{}/64 dev eth0".format(i*4,j+NUM_NEIGH_PER_INTF/2+2))
dvs.servers[i].runcmd("ip addr add {}.0.0.{}/24 dev eth0".format(i*4, j+NUM_NEIGH_PER_INTF+2))
dvs.servers[i].runcmd("ip -6 addr add {}00::{}/64 dev eth0".format(i*4,j+NUM_NEIGH_PER_INTF+2))
del_and_add_neighbors(dvs)

flush_neigh_entries(dvs)
time.sleep(3)
Expand All @@ -1684,8 +1718,11 @@ def test_system_warmreboot_neighbor_syncup(dvs, testlog):
# should finish the store within 10 seconds
time.sleep(10)

check_kernel_reachable_v4_neigh_num(dvs, NUM_OF_NEIGHS)
check_kernel_reachable_v6_neigh_num(dvs, NUM_OF_NEIGHS)
check_kernel_reachable_v4_neigh_num(dvs, NUM_OF_NEIGHS/2)
check_kernel_reachable_v6_neigh_num(dvs, NUM_OF_NEIGHS/2)

check_kernel_stale_v4_neigh_num(dvs, NUM_OF_NEIGHS/2)
check_kernel_stale_v6_neigh_num(dvs, NUM_OF_NEIGHS/2)

# check syslog and sairedis.rec file for activities
check_syslog_for_neighbor_entry(dvs, marker, 0, 0, "ipv4")
Expand All @@ -1701,14 +1738,14 @@ def test_system_warmreboot_neighbor_syncup(dvs, testlog):
# ping the new ips, should get it into appDB
marker = dvs.add_log_marker()

for i in range(8, 8+NUM_INTF):
for j in range(NUM_NEIGH_PER_INTF/2):
dvs.runcmd(['sh', '-c', "ping -c 1 -W 0 -q {}.0.0.{} > /dev/null 2>&1".format(i*4,j+NUM_NEIGH_PER_INTF+2)])
dvs.runcmd(['sh', '-c', "ping6 -c 1 -W 0 -q {}00::{} > /dev/null 2>&1".format(i*4,j+NUM_NEIGH_PER_INTF+2)])
ping_new_ips(dvs)

check_kernel_reachable_v4_neigh_num(dvs, NUM_OF_NEIGHS)
check_kernel_reachable_v6_neigh_num(dvs, NUM_OF_NEIGHS)

check_kernel_stale_v4_neigh_num(dvs, NUM_OF_NEIGHS/2)
check_kernel_stale_v6_neigh_num(dvs, NUM_OF_NEIGHS/2)

check_kernel_reachable_v4_neigh_num(dvs, NUM_OF_NEIGHS+NUM_OF_NEIGHS/2)
check_kernel_reachable_v6_neigh_num(dvs, NUM_OF_NEIGHS+NUM_OF_NEIGHS/2)
check_redis_neigh_entries(dvs, tbl, 2*(NUM_OF_NEIGHS+NUM_OF_NEIGHS/2))

(nadd, ndel) = dvs.CountSubscribedObjects(pubsub)
Expand All @@ -1725,6 +1762,10 @@ def test_system_warmreboot_neighbor_syncup(dvs, testlog):

check_kernel_reachable_v4_neigh_num(dvs, NUM_OF_NEIGHS)
check_kernel_reachable_v6_neigh_num(dvs, NUM_OF_NEIGHS)

check_kernel_stale_v4_neigh_num(dvs, 0)
check_kernel_stale_v6_neigh_num(dvs, 0)

check_redis_neigh_entries(dvs, tbl, 2*NUM_OF_NEIGHS)

(nadd, ndel) = dvs.CountSubscribedObjects(pubsub)
Expand Down

0 comments on commit a7a68c1

Please sign in to comment.