Bond接口最简化的创建命令如下,任何参数都使用默认:
ip link add bond1 type bond 或者:
ip link add type bond 由内核决定接口名称,格式为:bond%d。
以上命令,创建的bond1设备,默认参数可使用如下命令查看。
$ ip -d link show dev bond1
5: bond1: <BROADCAST,MULTICAST,MASTER> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
link/ether 8e:cf:e6:a3:de:82 brd ff:ff:ff:ff:ff:ff promiscuity 0
bond mode balance-rr miimon 0 updelay 0 downdelay 0 use_carrier 1 arp_interval 0 arp_validate none arp_all_targets any primary_reselect always fail_over_mac none xmit_hash_policy layer2 resend_igmp 1 num_grat_arp 1 all_slaves_active 0 min_links 0 lp_interval 1 packets_per_slave 1 lacp_rate slow ad_select stable tlb_dynamic_lb 1 addrgenmode eui64 numtxqueues 16 numrxqueues 16 gso_max_size 65536 gso_max_segs 65535
参数mode
由内核中函数bond_check_params可知,如果不指定模式mode参数,默认情况下为BOND_MODE_ROUNDROBIN,即以上显示的balance-rr。
static int bond_check_params(struct bond_params *params)
{
int bond_mode = BOND_MODE_ROUNDROBIN;
目前bond支持的模式如下:
值 | 模式 | 描述 |
---|---|---|
0 | balance-rr | 在可用的从设备之间按顺序轮询 |
1 | active-backup | 固定某一个从设备,不可用时选择另一从设备 |
2 | balance-xor | 依据对报文信息的hash(xor)结果选择发送使用的从设备 |
3 | broadcast | 每个报文都在所有的从设备上执行发送 |
4 | 802.3ad | 802.3ad标准 |
5 | balance-tlb | 可调整的发送负载均衡,根据从设备的负载选择使用从设备 |
6 | balance-alb | 可调整的负载均衡,包含发送tlb和接收rlb |
MII相关选项
选项miimon指定通过MII检查链路状态的间隔时长,单位是毫秒。默认情况下miimon值为零,对于802.3ad、tlb和alb三种不使用以下介绍的arp链路检测方式的bond模式,内核要求启用miimon检查方式,默认检测时长为BOND_DEFAULT_MIIMON(100ms),否则,bond系统将不能检测链路的状况,如下bond_check_params函数中代码所示。
static int bond_check_params(struct bond_params *params)
{
if (miimon < 0) {
pr_warn("Warning: miimon module parameter (%d), not in range 0-%d, so it was reset to 0\n", miimon, INT_MAX);
miimon = 0;
}
/* reset values for 802.3ad/TLB/ALB */
if (!bond_mode_uses_arp(bond_mode)) {
if (!miimon) {
pr_warn("Warning: miimon must be specified, otherwise bonding will not detect link failure, speed and duplex which are essential for 802.3ad operation\n");
pr_warn("Forcing miimon to 100msec\n");
miimon = BOND_DEFAULT_MIIMON;
}
}
对于选项updelay和downdelay,前者表示延迟多长时间系统认为链路为UP状态;后者表示延迟多久认为链路DOWN,单位都是毫秒。如下所示,如果miimon选项未开启,updelay和downdelay选项将不起作用。
如果开启miimon选项,将关闭arp_interval选项(稍后介绍),updelay和downdelay的值应为miimon值的整数倍,系统会强制这一点,最终,updelay和downdelay中保存的为miimon的倍数,而不是最初的毫秒值。
if (!miimon) {
if (updelay || downdelay) {
/* just warn the user the up/down delay will have no effect since miimon is zero...
*/
pr_warn("Warning: miimon module parameter not set and updelay (%d) or downdelay (%d) module parameter is set; updelay and downdelay have no effect unless miimon is set\n", updelay, downdelay);
}
} else {
if (arp_interval) { /* don't allow arp monitoring */
pr_warn("Warning: miimon (%d) and arp_interval (%d) can't be used simultaneously, disabling ARP monitoring\n", miimon, arp_interval);
arp_interval = 0;
}
if ((updelay % miimon) != 0) {
pr_warn("Warning: updelay (%d) is not a multiple of miimon (%d), updelay rounded to %d ms\n", updelay, miimon, (updelay / miimon) * miimon);
}
updelay /= miimon;
if ((downdelay % miimon) != 0) {
pr_warn("Warning: downdelay (%d) is not a multiple of miimon (%d), downdelay rounded to %d ms\n", downdelay, miimon, (downdelay / miimon) * miimon);
}
downdelay /= miimon;
}
bond系统为每个实例启动一个delayed work调用函数bond_mii_monitor执行MII检查,要求网卡支持ethtool的get_link调用,或者MII的ioctl接口,参见函数bond_check_dev_link。
static int bond_check_dev_link(struct bonding *bond, struct net_device *slave_dev, int reporting)
{
const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
if (!reporting && !netif_running(slave_dev))
return 0;
if (bond->params.use_carrier)
return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
if (slave_dev->ethtool_ops->get_link) /* Try to get link status using Ethtool first. */
return slave_dev->ethtool_ops->get_link(slave_dev) ? BMSR_LSTATUS : 0;
ioctl = slave_ops->ndo_do_ioctl; /* Ethtool can't be used, fallback to MII ioctls. */
if (ioctl) {
strncpy(ifr.ifr_name, slave_dev->name, IFNAMSIZ);
mii = if_mii(&ifr);
if (ioctl(slave_dev, &ifr, SIOCGMIIPHY) == 0) {
mii->reg_num = MII_BMSR;
if (ioctl(slave_dev, &ifr, SIOCGMIIREG) == 0)
return mii->val_out & BMSR_LSTATUS;
}
选项use_carrier定义了在MII进行链路检测时,所使用的方式。值为0表示使用以上介绍的ethtool或者MII ioctl方式;值为1表示使用驱动程序提供的netif_carrier_ok函数,如以上的bond_check_dev_link函数所示。默认情况下,use_carrier选项为1。
static int use_carrier = 1;
static int bond_check_params(struct bond_params *params)
{
if ((use_carrier != 0) && (use_carrier != 1)) {
pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n", use_carrier);
use_carrier = 1;
}
ARP相关选项
arp_interval选项指定bond系统进行ARP链路检测的间隔时长,也就是bond_arp_monitor函数对应的delayed work执行的间隔(对于active-backup模式会略有出入)。在函数bond_arp_monitor中,检查从设备接收或者发送流量的最近时间,来决定其up或者down的状态。ARP流量在bond_arp_monitor函数中通过bond_arp_send_all等函数发送,其目的地址为选项arp_ip_target中所指定地址,最多可指定16(BOND_MAX_ARP_TARGETS)个目标地址。arp链路检测功能不能和miimon功能同时使用。
如下函数bond_check_params中的初始化代码,如果arp_ip_target选项中未指定地址,禁用arp_interval,默认情况下arp_interval为0。
for (arp_ip_count = 0, i = 0; (arp_ip_count < BOND_MAX_ARP_TARGETS) && arp_ip_target[i]; i++) {
/* not a complete check, but good enough to catch mistakes */
if (!in4_pton(arp_ip_target[i], -1, (u8 *)&ip, -1, NULL) || !bond_is_ip_target_ok(ip)) {
pr_warn("Warning: bad arp_ip_target module parameter (%s), ARP monitoring will not be performed\n", arp_ip_target[i]);
arp_interval = 0;
} else {
if (bond_get_targets_ip(arp_target, ip) == -1)
arp_target[arp_ip_count++] = ip;
else
pr_warn("Warning: duplicate address %pI4 in arp_ip_target, skipping\n", &ip);
}
}
if (arp_interval && !arp_ip_count) {
/* don't allow arping if no arp_ip_target given... */
pr_warn("Warning: arp_interval module parameter (%d) specified without providing an arp_ip_target parameter, arp_inter val was reset to 0\n",
arp_interval);
arp_interval = 0;
}
选项arp_validate决定是否对ARP流量(请求和应答)进行验证,以及是否过滤非ARP流量,其取值如下:
数值 | 值 | 描述 |
---|---|---|
0 | none | No validation or filtering |
1 | active | Validation is performed only for the active slave. |
2 | backup | Validation is performed only for backup slaves. |
3 | all | Validation is performed for all slaves. |
4 | filter | Filtering is applied to all slaves. No validation is performed. |
5 | filter_active | Filtering is applied to all slaves, validation is performed only for the active slave. |
6 | filter_backup | Filtering is applied to all slaves, validation is performed only for backup slaves. |
对于活动的从设备,开启arp_validate之后,将检查arp响应报文是否由arp_ip_target中的地址所产生。对于非活动从设备,其可能由对端交换机接收到活动设备发送的广播ARP请求,从而执行arp_validate操作,如果由于对端交换机的行为,接收不到ARP报文,需要禁止对非活动从设备的验证。对于非活动从设备接收到ARP请求的情况,应判断报文的源IP为bond接口的IP地址,而目的IP为arp_ip_target中的地址。对于接收到ARP应答的情况,判断条件相反。
对于多个bond实例连接在同一个交换机的情况,其中某个bond实例可接收到其它实例发送的ARP报文,将引起链路状态判断的错误,arp_validate正是解决此问题。filter过滤功能仅依据接收的ARP报文进行链路状态判断,忽略非ARP报文,这对于网络拓扑中存在大量其它的广播报文的情况,可屏蔽这类报文对链路状态判断的影响。
如果未指定arp_interval,不应开启arp_validate功能,默认情况下arp_validate为0。
if (arp_validate) {
if (!arp_interval) {
pr_err("arp_validate requires arp_interval\n");
return -EINVAL;
}
bond_opt_initstr(&newval, arp_validate);
valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_VALIDATE), &newval);
if (!valptr) {
pr_err("Error: invalid arp_validate \"%s\"\n", arp_validate);
return -EINVAL;
}
arp_validate_value = valptr->value;
} else {
arp_validate_value = 0;
}
选项arp_all_targets仅对active-backup模式的bond实例,在开启arp_validate的情况下有作用,其取值为0(any)或1(all)。为0时,只要arp_ip_target中的任意一个目标地址可达,就认为从设备可用(up);如果取值为1,仅当arp_ip_target中指定的所有目标地址全部可达时,才判断从设备为up状态。默认情况下arp_all_targets选项为0。
if (arp_all_targets) {
bond_opt_initstr(&newval, arp_all_targets);
valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_ALL_TARGETS), &newval);
if (!valptr) {
pr_err("Error: invalid arp_all_targets_value \"%s\"\n", arp_all_targets);
arp_all_targets_value = 0;
} else {
arp_all_targets_value = valptr->value;
primary和primary_reselect
这两个选项仅在模式为active-backup(1), balance-tlb (5) 和 balance-alb (6)时有效,primary选项指定首选从设备,只要其可用,总是活动的从设备。只有当指定的首选从设备离线时,才选择其它从设备为活动设备。应用于某一从设备在带宽等方面优于其它从设备的情况下。
primary_reselect选项规定在当前活动设备失效或者首选设备恢复时,首选设备是否转变为活动设备的策略,其可选值如下:
数值 | 策略 | 描述 |
---|---|---|
0 | always | 默认值,首选设备总是转变为活动设备 |
1 | better | 首选设备的速率和双工参数优于当前的活动设备时,才转变为活动设备 |
2 | failure | 仅在当前活动设备失效时,首选设备转变为活动设备 |
但是,当所有从设备都为失效状态时,第一个恢复的从设备转变为活动设备。另外,当首选设备首次添加到bond中时,其总是活动设备。重选策略可通过修改sysfs文件实现(/sys/devices/virtual/net/bond1/bonding/primary_reselect),此文件的修改将触发一次活动设备的选择操作。
$ cat /sys/devices/virtual/net/bond1/bonding/primary_reselect
always 0
如下函数bond_check_params中的代码,函数bond_mode_uses_primary判断模式是否为ACTIVEBACKUP/TLB/ALB,如果不成立,将primary设置为空。primary_reselect选项的值默认设置为宏BOND_PRI_RESELECT_ALWAYS(0)。
if (primary && !bond_mode_uses_primary(bond_mode)) {
/* currently, using a primary only makes sense in active backup, TLB or ALB modes
*/
pr_warn("Warning: %s primary device specified but has no effect in %s mode\n", primary, bond_mode_name(bond_mode));
primary = NULL;
}
if (primary && primary_reselect) {
bond_opt_initstr(&newval, primary_reselect);
valptr = bond_opt_parse(bond_opt_get(BOND_OPT_PRIMARY_RESELECT), &newval);
if (!valptr) {
pr_err("Error: Invalid primary_reselect \"%s\"\n", primary_reselect);
return -EINVAL;
}
primary_reselect_value = valptr->value;
} else {
primary_reselect_value = BOND_PRI_RESELECT_ALWAYS;
}
fail_over_mac
选项fail_over_mac仅对active-backup模式有效,其取值如下:
数值 | 表示 | 描述 |
---|---|---|
0 | none | 默认值,将bond内所有从设备MAC地址设为同一地址 |
1 | active | 从设备MAC地址不改变,bond的MAC地址等于活动从设备的MAC |
2 | follow | bond的MAC地址等于第一个加入的从设备的MAC地址, 当其它从设备变为活动设备时,将活动设备MAC修改为bond的MAC地址; 而上一个活动设备的MAC地址修改为当前活动设备的MAC地址。 |
如下函数bond_check_params所示,默认情况下fail_over_mac取值为BOND_FOM_NONE(0)。
if (fail_over_mac) {
bond_opt_initstr(&newval, fail_over_mac);
valptr = bond_opt_parse(bond_opt_get(BOND_OPT_FAIL_OVER_MAC), &newval);
if (!valptr) {
pr_err("Error: invalid fail_over_mac \"%s\"\n", fail_over_mac);
return -EINVAL;
}
fail_over_mac_value = valptr->value;
if (bond_mode != BOND_MODE_ACTIVEBACKUP)
pr_warn("Warning: fail_over_mac only affects active-backup mode\n");
} else {
fail_over_mac_value = BOND_FOM_NONE;
}
可通过以下sysfs文件修改fail_over_mac的值,但是此操作要求在bond没有任何从设备的情况下进行。
$ cat /sys/devices/virtual/net/bond1/bonding/fail_over_mac
none 0
xmit_hash_policy
用于在balance-xor, 802.3ad, 和 tlb模式下,选择发送使用的hash算法,可选值如下:
数值 | 策略 | 算法 |
---|---|---|
0 | layer 2 | 默认值, hash = source MAC XOR destination MAC XOR packet type ID hash mod slave count |
1 | layer 3+4 | hash = source port, destination port (as in the header); hash = hash XOR source IP XOR destination IP; hash = hash XOR (hash RSHIFT 16); hash = hash XOR (hash RSHIFT 8); hash modulo slave count. |
2 | layer 2+3 | hash = source MAC XOR destination MAC XOR packet type ID; hash = hash XOR source IP XOR destination IP; hash = hash XOR (hash RSHIFT 16); hash = hash XOR (hash RSHIFT 8); hash mod slave count |
3 | encap layer 2+3 | 算法与layer 2+3相同,但是对于隧道封装报文,使用内部报文头部数据做计算 |
4 | encap layer 3+4 | 算法与layer 3+4相同,但是对于隧道封装报文,使用内部报文头部数据做计算 |
内核中默认的值为BOND_XMIT_POLICY_LAYER2(0),对于ROUNDROBIN / ACTIVEBACKUP / BROADCAST三种模式,此选项没有效果。
int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
if (xmit_hash_policy) {
if (bond_mode == BOND_MODE_ROUNDROBIN || bond_mode == BOND_MODE_ACTIVEBACKUP || bond_mode == BOND_MODE_BROADCAST) {
pr_info("xmit_hash_policy param is irrelevant in mode %s\n", bond_mode_name(bond_mode));
} else {
bond_opt_initstr(&newval, xmit_hash_policy);
valptr = bond_opt_parse(bond_opt_get(BOND_OPT_XMIT_HASH), &newval);
if (!valptr) {
pr_err("Error: Invalid xmit_hash_policy \"%s\"\n", xmit_hash_policy);
return -EINVAL;
}
xmit_hashtype = valptr->value;
}
}
resend_igmp
选项resend_igmp指定在发生故障切换时,发送IGMP membership报告的数量,取值范围[0, 255],默认值为BOND_DEFAULT_RESEND_IGMP(1)。在发生failover时,如果resend_igmp不为零,立即发送IGMP报告,如果resend_igmp大于1,后续的报文间隔200ms发送。
static int resend_igmp = BOND_DEFAULT_RESEND_IGMP;
static int bond_check_params(struct bond_params *params)
{
if (resend_igmp < 0 || resend_igmp > 255) {
pr_warn("Warning: resend_igmp (%d) should be between 0 and 255, resetting to %d\n", resend_igmp, BOND_DEFAULT_RESEND_IGMP);
resend_igmp = BOND_DEFAULT_RESEND_IGMP;
}
如下的delayed work执行函数bond_resend_igmp_join_requests_delayed,每隔HZ/5(200ms)执行一次。
static void bond_resend_igmp_join_requests_delayed(struct work_struct *work)
{
struct bonding *bond = container_of(work, struct bonding, mcast_work.work);
if (!rtnl_trylock()) {
queue_delayed_work(bond->wq, &bond->mcast_work, 1);
return;
}
call_netdevice_notifiers(NETDEV_RESEND_IGMP, bond->dev);
if (bond->igmp_retrans > 1) {
bond->igmp_retrans--;
queue_delayed_work(bond->wq, &bond->mcast_work, HZ/5);
}
num_grat_arp
选项num_grat_arp与IPv6的选项num_unsol_na意义相同,指定在故障转移时,发送免费ARP和unsolicited IPv6 NA邻居通告的数量,取值范围:[0, 255],默认值为1。此选项仅对active-backup模式有效。
static int num_peer_notif = 1;
static int bond_check_params(struct bond_params *params)
{
if (num_peer_notif < 0 || num_peer_notif > 255) {
pr_warn("Warning: num_grat_arp/num_unsol_na (%d) not in range 0-255 so it was reset to 1\n", num_peer_notif);
num_peer_notif = 1;
}
首先在failover处理函数bond_change_active_slave中,如果num_grat_arp启用,发送NETDEV_NOTIFY_PEERS通知。
void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
{
if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) {
if (should_notify_peers)
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev);
其次,在bond系统的两个delayed work处理MII和ARP链路检测中,如果满足发送条件,其中包括num_grat_arp指定次数未达到,继续发送NETDEV_NOTIFY_PEERS通知。
static void bond_mii_monitor(struct work_struct *work)
{
if (should_notify_peers) {
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev);
static void bond_activebackup_arp_mon(struct bonding *bond)
{
if (should_notify_peers || should_notify_rtnl) {
if (should_notify_peers)
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev);
在bond函数注册的netdevice通知链处理函数bond_master_netdev_event中,每接收到一次NETDEV_NOTIFY_PEERS通知,将发送次数递减一。
static int bond_master_netdev_event(unsigned long event, struct net_device *bond_dev)
{
struct bonding *event_bond = netdev_priv(bond_dev);
switch (event) {
case NETDEV_NOTIFY_PEERS:
if (event_bond->send_peer_notif)
event_bond->send_peer_notif--;
最后,对于IPv4协议,在函数inetdev_event中,接收到NETDEV_NOTIFY_PEERS事件,发送免费ARP。
static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct in_device *in_dev = __in_dev_get_rtnl(dev);
switch (event) {
case NETDEV_NOTIFY_PEERS:
/* Send gratuitous ARP to notify of link change */
inetdev_send_gratuitous_arp(dev, in_dev);
对于IPv6协议,在函数ndisc_netdev_event中处理NETDEV_NOTIFY_PEERS通知事件,发送unsolicited邻居通告。
static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
switch (event) {
case NETDEV_NOTIFY_PEERS:
ndisc_send_unsol_na(dev);
break;
另外,对于bond接口上创建的VLAN子接口,在VLAN子系统处理函数vlan_device_event中,将遍历所有的VLAN子接口,向其发送NETDEV_NOTIFY_PEERS事件,每个VLAN子设备的处理函数也是上面介绍的inetdev_event和ndisc_netdev_event函数。
static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
{
struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
switch (event) {
case NETDEV_NOTIFY_PEERS:
case NETDEV_BONDING_FAILOVER:
case NETDEV_RESEND_IGMP:
/* Propagate to vlan devices */
vlan_group_for_each_dev(grp, i, vlandev)
call_netdevice_notifiers(event, vlandev);
break;
all_slaves_active
选项all_slaves_active决定是否为每个从设备设定active标志,开启之后,所有从设备都可正常转发报文,这将导致重复报文的处理。默认情况下all_slaves_active值为0(dropped),非活动的从设备将丢弃重复报文;可能在某些情况下all_slaves_active需要设置为1(delivered)。
如下bond_set_slave_inactive_flags函数,如果all_slaves_active为1,将不会为从设备设置inactive标志。
/* Mark a slave as inactive.
 *
 * slave:  the slave whose state/flag is updated.
 * notify: passed through unchanged to bond_set_slave_state().
 */
static inline void bond_set_slave_inactive_flags(struct slave *slave, bool notify)
{
/* Only when bond_is_lb() is false is the slave moved to BOND_STATE_BACKUP. */
if (!bond_is_lb(slave->bond))
bond_set_slave_state(slave, BOND_STATE_BACKUP, notify);
/* With all_slaves_active set, the inactive flag is deliberately left untouched. */
if (!slave->bond->params.all_slaves_active)
slave->inactive = 1;
}
在bond系统处理函数bond_handle_frame中,将调用如下函数bond_should_deliver_exact_match判断是否要跳过报文处理,如果所有的从设备都没有设置inactive,返回false,正常处理接收的报文。
static bool bond_should_deliver_exact_match(struct sk_buff *skb, struct slave *slave, struct bonding *bond)
{
if (bond_is_slave_inactive(slave)) {
if (BOND_MODE(bond) == BOND_MODE_ALB &&
skb->pkt_type != PACKET_BROADCAST &&
skb->pkt_type != PACKET_MULTICAST)
return false;
return true;
}
return false;
min_links
选项min_links规定了bond接口转变为carrier on前至少要有的可用链路数量,此选项仅在802.3ad模式下生效,默认值为0,即只要有可用的链路,bond接口就设置为carrier on。如下函数bond_3ad_set_carrier所示,如果active的端口数量小于设定值min_links,将bond设备设置为carrier off。
int bond_3ad_set_carrier(struct bonding *bond)
{
active = __get_active_agg(&(SLAVE_AD_INFO(first_slave)->aggregator));
if (active) {
/* are enough slaves available to consider link up? */
if (__agg_active_ports(active) < bond->params.min_links) {
if (netif_carrier_ok(bond->dev)) {
netif_carrier_off(bond->dev);
goto out;
}
} else if (!netif_carrier_ok(bond->dev)) {
netif_carrier_on(bond->dev);
goto out;
}
lp_interval
选项lp_interval定义了bond实例向对端交换机发送Learning Packets报文的时间间隔,此选项仅在balance-tlb 和 balance-alb模式下生效,取值范围:[1, 0x7fffffff],单位为秒,默认值为BOND_ALB_DEFAULT_LP_INTERVAL(1秒)。
static int lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL;
static int bond_check_params(struct bond_params *params)
{
if (lp_interval == 0) {
pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n", INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL);
lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL;
}
内核中处理函数bond_alb_monitor的运行时间间隔是alb_delta_in_ticks,即1/10秒,每次运行,将lp计数lp_counter自加一,选项lp_interval换算为次数的话,值为BOND_ALB_LP_TICKS,当lp_counter大于等于BOND_ALB_LP_TICKS时,发送Learning报文。
/* Number of ALB timer ticks per second. */
#define ALB_TIMER_TICKS_PER_SEC 10 /* should be a divisor of HZ */
/* Reads the lp_interval parameter stored in the bond's params. */
#define BOND_ALB_LP_INTERVAL(bond) (bond->params.lp_interval)
/* lp_interval expressed in timer ticks: interval multiplied by ticks per second. */
#define BOND_ALB_LP_TICKS(bond) (BOND_ALB_LP_INTERVAL(bond) * ALB_TIMER_TICKS_PER_SEC)
/* Duration of one ALB timer tick: HZ divided by ALB_TIMER_TICKS_PER_SEC. */
static const int alb_delta_in_ticks = HZ / ALB_TIMER_TICKS_PER_SEC;
void bond_alb_monitor(struct work_struct *work)
{
struct bonding *bond = container_of(work, struct bonding, alb_work.work);
struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
bond_info->lp_counter++;
if (bond_info->lp_counter >= BOND_ALB_LP_TICKS(bond)) {/* send learning packets */
bond_for_each_slave_rcu(bond, slave, iter) {
/* If updating current_active, use all currently user mac addreses (!strict_match). Otherwise, only
* use mac of the slave device. In RLB mode, we always use strict matches.
*/
strict_match = (slave != rcu_access_pointer(bond->curr_active_slave) || bond_info->rlb_enabled);
alb_send_learning_packets(slave, slave->dev->dev_addr, strict_match);
}
bond_info->lp_counter = 0;
}
以上函数bond_alb_monitor仅在ALB模式或者TLB模式下并且打开tlb_dynamic_lb选项的情况下调用。
static int bond_open(struct net_device *bond_dev)
{
struct bonding *bond = netdev_priv(bond_dev);
if (bond_is_lb(bond)) {
/* bond_alb_initialize must be called before the timer is started.
*/
if (bond_alb_initialize(bond, (BOND_MODE(bond) == BOND_MODE_ALB)))
return -ENOMEM;
if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == BOND_MODE_ALB)
queue_delayed_work(bond->wq, &bond->alb_work, 0);
}
packets_per_slave
选项packets_per_slave指定在转换到下一个从设备之前,当前从设备发送的报文数量,仅在balance-rr模式下生效。取值范围:[0 - 65535],值为0时,随机选择下一个从设备;默认值为1,每个从设备发送一个报文。
在初始化函数bond_check_params中,如果选项packets_per_slave大于零,这里先计算出其倒数值(reciprocal_value)。
static int packets_per_slave = 1;
static int bond_check_params(struct bond_params *params)
{
if (packets_per_slave > 0) {
params->reciprocal_packets_per_slave = reciprocal_value(packets_per_slave);
}
如下函数bond_rr_gen_slave_id用于选择balance-rr模式下的下一个发送从设备ID。当选项packets_per_slave为0时,使用随机函数prandom_u32进行选择。当为1时,slave_id等于rr_tx_counter的值,后者在每发送一个报文后递增一。其它情况下,使用统计出来的发送报文数量rr_tx_counter除以选项中定义的报文数量packets_per_slave的结果作为从设备ID,即每个设备发送packets_per_slave数量的报文。
static u32 bond_rr_gen_slave_id(struct bonding *bond)
{
int packets_per_slave = bond->params.packets_per_slave;
switch (packets_per_slave) {
case 0:
slave_id = prandom_u32();
break;
case 1:
slave_id = bond->rr_tx_counter;
break;
default:
reciprocal_packets_per_slave = bond->params.reciprocal_packets_per_slave;
slave_id = reciprocal_divide(bond->rr_tx_counter, reciprocal_packets_per_slave);
break;
}
bond->rr_tx_counter++;
return slave_id;
lacp_rate
选项lacp_rate指定请求对端执行的LACPDU报文的发送速率,此选项仅在802.3ad模式下生效。默认值为0(slow),其它有效值为1(fast)。如下函数ad_initialize_port,如果lacp_fast为真,将设置Actor状态的AD_STATE_LACP_TIMEOUT比特位,根据802.3ad协议,此位位于LACPDU报文结构的Actor_State中的第二位(bit 1),表示长短两种超时时间,说明此LACPDU报文中的信息的有效时长。
void bond_3ad_bind_slave(struct slave *slave)
{
if (SLAVE_AD_INFO(slave)->port.slave != slave) {
port = &(SLAVE_AD_INFO(slave)->port);
ad_initialize_port(port, bond->params.lacp_fast);
static void ad_initialize_port(struct port *port, int lacp_fast)
{
if (port) {
if (lacp_fast)
port->actor_oper_port_state |= AD_STATE_LACP_TIMEOUT;
802.3ad定义的两种超时时长,如下宏定义AD_FAST_PERIODIC_TIME和AD_SLOW_PERIODIC_TIME,单位为秒。如下函数__update_lacpdu_from_port,根据端口port的actor_oper_port_state值,更新LACPDU报文中的actor_state字段。
#define AD_FAST_PERIODIC_TIME 1
#define AD_SLOW_PERIODIC_TIME 30
static inline void __update_lacpdu_from_port(struct port *port)
{
struct lacpdu *lacpdu = &port->lacpdu;
lacpdu->actor_state = port->actor_oper_port_state;
以下为802.3ad的Periodic Transmission machine实现函数ad_periodic_machine,在sm_periodic_timer_counter时间超时之后,将根据AD_STATE_LACP_TIMEOUT标志,设置此状态机的状态为AD_SLOW_PERIODIC或者AD_FAST_PERIODIC,前者意味着下一次超时时长将设置为AD_SLOW_PERIODIC_TIME;而后者意味着超时时长设置为AD_FAST_PERIODIC_TIME。
static void ad_periodic_machine(struct port *port)
{
if...
else if (port->sm_periodic_timer_counter) {/* check if state machine should change state */
} else {
switch (port->sm_periodic_state) {
case AD_NO_PERIODIC:
port->sm_periodic_state = AD_FAST_PERIODIC;
break;
case AD_PERIODIC_TX:
if (!(port->partner_oper.port_state & AD_STATE_LACP_TIMEOUT))
port->sm_periodic_state = AD_SLOW_PERIODIC;
else
port->sm_periodic_state = AD_FAST_PERIODIC;
break;
if (port->sm_periodic_state != last_state) {
switch (port->sm_periodic_state) {
case AD_FAST_PERIODIC:
port->sm_periodic_timer_counter = __ad_timer_to_ticks(AD_PERIODIC_TIMER, (u16)(AD_FAST_PERIODIC_TIME))-1;
break;
case AD_SLOW_PERIODIC:
port->sm_periodic_timer_counter = __ad_timer_to_ticks(AD_PERIODIC_TIMER, (u16)(AD_SLOW_PERIODIC_TIME))-1;
break;
case AD_PERIODIC_TX:
port->ntt = true;
LACPDU报文的发送是通过设置ntt(Need To Transmit)标志实现。具体发送在TX状态机函数ad_tx_machine中。
static void ad_tx_machine(struct port *port)
{
/* check if tx timer expired, to verify that we do not send more than 3 packets per second
*/
if (port->sm_tx_timer_counter && !(--port->sm_tx_timer_counter)) {
if (port->ntt && (port->sm_vars & AD_PORT_LACP_ENABLED)) {
__update_lacpdu_from_port(port);
if (ad_lacpdu_send(port) >= 0) {
ad_select
选项ad_select定义802.3ad模式的聚合选择逻辑,取值如下:
数值 | 字符串 | 描述 |
---|---|---|
0 | stable | 默认值,活动aggregator依据最大带宽选择,当所有从设备down时,进行重新选择 |
1 | bandwidth | 同上,按照最大带宽选择。重选择时机如下: 从设备添加/移除 任一从设备链路变化 任一从设备的802.3ad关联状态变化 bond的管理状态变为UP |
2 | count | 活动aggregator依据最大port(从设备)数量选择,重选择机制如同以上的bandwidth |
如下函数ad_agg_selection_test所示,子函数__agg_active_ports用于在BOND_AD_COUNT选择模式时,获取aggregator的活动port数量。子函数__get_agg_bandwidth用于在其它两个选择模式时,获取带宽值。
static struct aggregator *ad_agg_selection_test(struct aggregator *best, struct aggregator *curr)
{
switch (__get_agg_selection_mode(curr->lag_ports)) {
case BOND_AD_COUNT:
if (__agg_active_ports(curr) > __agg_active_ports(best))
return curr;
if (__agg_active_ports(curr) < __agg_active_ports(best))
return best;
/*FALLTHROUGH*/
case BOND_AD_STABLE:
case BOND_AD_BANDWIDTH:
if (__get_agg_bandwidth(curr) > __get_agg_bandwidth(best))
return curr;
如下函数ad_agg_selection_logic,对于BOND_AD_STABLE选择模式,如果当前活动的aggregator还有活动的port及应答的partner,或者当前活动的aggregator和刚刚选择的aggregator都没有应答的partner时,仍然使用当前的活动aggregator。
static void ad_agg_selection_logic(struct aggregator *agg, bool *update_slave_arr)
{
struct aggregator *best, *active, *origin;
bond_for_each_slave_rcu(bond, slave, iter) {
agg = &(SLAVE_AD_INFO(slave)->aggregator);
agg->is_active = 0;
if (__agg_active_ports(agg) && agg_device_up(agg))
best = ad_agg_selection_test(best, agg);
}
if (best && __get_agg_selection_mode(best->lag_ports) == BOND_AD_STABLE) {
if (active && active->lag_ports && __agg_active_ports(active) && (__agg_has_partner(active) ||
(!__agg_has_partner(active) && !__agg_has_partner(best)))) {
if (!(!active->actor_oper_aggregator_key && best->actor_oper_aggregator_key)) {
best = NULL;
active->is_active = 1;
tlb_dynamic_lb
选项tlb_dynamic_lb决定是否开启动态负载均衡,此选项仅使用在TLB模式。默认值为1,意味着根据负荷情况在从设备之间动态分配流量;如果设置为0,表示根据对报文部分字段的HASH结果选择从设备,参见选项xmit_hash_policy。前者,根据负荷进行的负载均衡,副作用是会引起报文乱序。
参见以下TLB模式发送函数bond_tlb_xmit,选项tlb_dynamic_lb没有开启时,根据hash值选择发送使用的从设备。相反,使用函数tlb_choose_channel进行选择。
netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
{
struct bonding *bond = netdev_priv(bond_dev);
if (!is_multicast_ether_addr(eth_data->h_dest)) {
switch (skb->protocol) {
case htons(ETH_P_IP):
case htons(ETH_P_IPX):
case htons(ETH_P_IPV6):
hash_index = bond_xmit_hash(bond, skb);
if (bond->params.tlb_dynamic_lb) {
tx_slave = tlb_choose_channel(bond, hash_index & 0xFF, skb->len);
} else {
slaves = rcu_dereference(bond->slave_arr);
count = slaves ? READ_ONCE(slaves->count) : 0;
if (likely(count))
tx_slave = slaves->arr[hash_index % count];
如下__tlb_choose_channel函数,如果报文对应的hash值已经有对应的处理设备,使用此设备,否则选取负荷最轻从设备。此函数还负责更新每个hash值所对应的发送报文数量。
static struct slave *__tlb_choose_channel(struct bonding *bond, u32 hash_index, u32 skb_len)
{
struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
struct tlb_client_info *hash_table;
hash_table = bond_info->tx_hashtbl;
assigned_slave = hash_table[hash_index].tx_slave;
if (!assigned_slave) {
assigned_slave = tlb_get_least_loaded_slave(bond);
if (assigned_slave) {
struct tlb_slave_info *slave_info = &(SLAVE_TLB_INFO(assigned_slave));
u32 next_index = slave_info->head;
hash_table[hash_index].tx_slave = assigned_slave;
hash_table[hash_index].next = next_index;
hash_table[hash_index].prev = TLB_NULL_INDEX;
if (next_index != TLB_NULL_INDEX) hash_table[next_index].prev = hash_index;
slave_info->head = hash_index;
slave_info->load += hash_table[hash_index].load_history;
}
}
if (assigned_slave) hash_table[hash_index].tx_bytes += skb_len;
numtxqueues和numrxqueues
函数bond_get_num_tx_queues获取bond系统默认的发送队列数值,为BOND_DEFAULT_TX_QUEUES(16),并且接收队列的数值与发送相同,也是16,由以上的两个选项(numtxqueues/numrxqueues)可见。
struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type,
const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack)
{
if (tb[IFLA_NUM_TX_QUEUES])
num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]);
else if (ops->get_num_tx_queues)
num_tx_queues = ops->get_num_tx_queues();
if (tb[IFLA_NUM_RX_QUEUES])
num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]);
else if (ops->get_num_rx_queues)
num_rx_queues = ops->get_num_rx_queues();
gso_max_size和gso_max_segs选项
选项gso_max_size和gso_max_segs定义了对于bond接口GSO最大的长度和数量,选项gso_max_size默认值为GSO_MAX_SIZE(65536);选项gso_max_segs的默认值为GSO_MAX_SEGS(65535)。如下函数bond_compute_features所示,其在从设备加入和删除时调用,计算bond接口的GSO相关数值,取所有从设备中最小的值。
static void bond_compute_features(struct bonding *bond)
{
struct net_device *bond_dev = bond->dev;
unsigned int gso_max_size = GSO_MAX_SIZE;
u16 gso_max_segs = GSO_MAX_SEGS;
bond_for_each_slave(bond, slave, iter) {
...
gso_max_size = min(gso_max_size, slave->dev->gso_max_size);
gso_max_segs = min(gso_max_segs, slave->dev->gso_max_segs);
}
bond_dev->hard_header_len = max_hard_header_len;
done:
bond_dev->gso_max_segs = gso_max_segs;
netif_set_gso_max_size(bond_dev, gso_max_size);
选项ad_user_port_key
此选项仅用于802.3ad模式,定义端口key值的高10位,即[6, 15],默认情况下为0。
--------------------------------------------------------------
| User key (10 bits) | Speed (5 bits) | Duplex|
--------------------------------------------------------------
|15 6|5 1|0
在如下函数bond_3ad_bind_slave中,端口初始化时,先行将端口key值的[6,15]位赋为选项ad_user_port_key的值,之后再添加接口速率和双工值。
void bond_3ad_bind_slave(struct slave *slave)
{
struct bonding *bond = bond_get_bond_by_slave(slave);
struct port *port;
if (SLAVE_AD_INFO(slave)->port.slave != slave) {
port = &(SLAVE_AD_INFO(slave)->port);
...
/* key is determined according to the link speed, duplex and
* user key
*/
port->actor_admin_port_key = bond->params.ad_user_port_key << 6;
内核版本 5.0