必现用例:[2013/6/20 10:37:10] 方: Bug 637 - 网卡配置绑定导致系统不停重启问题
这个bug找到复现的关键点了。
主机和阵列通过交换机相连,并且交换机上连一根外网的网线,以rr模式添加bond0,alb模式添加bond1,就开始打堆栈,无限重启
[2013/6/20 10:37:36] 方: [2013年6月20日 10:37] 方:
<<< 交换机上连一根外网的网线还有rr和alb两个不同的模式,
[2013/6/20 10:37:51] 方: 不连外网网线的话出不了
[2013/6/20 10:38:13] 方: 而且无限重启的时候,把外网网线拔掉就能停下了
[2013/6/20 10:39:21] 方: 这个现象怎么解释
查找过程
1 首先根据oops定位到出错行
[2013/6/20 10:40:43] 方: CPU 0 Unable to handle kernel paging request at virtual address 0000000000000318, epc == ffffffffc0445a10, ra == ffffffffc04459dc
Oops[#1]:
Cpu 0
$ 0 : 0000000000000000 ffffffff808b1da0 0000000000000300 0000000000000030
$ 4 : 0000000000000000 a8000000029d2160 000000000000002e a800000002559000
$ 8 : a8000000029d2140 0000000000000001 0000000000000000 0000000000000018
$12 : 0000000000000000 000000001000001f a800000031180000 0000000000000000
$16 : a8000000029d214e 0000000000000300 a8000000012d1600 a8000000029c8580
$20 : a8000000012d1870 ffffffff812408e8 0000000000000806 0000000000000000
$24 : 00000000000002b1 000000555d5887b0
$28 : ffffffff811c4000 ffffffff811c7970 ffffffff811c7970 ffffffffc04459dc
Hi : 0000000000000000
Lo : 0000000000000000
epc : ffffffffc0445a10 rlb_arp_recv+0x128/0x228 [bonding]
Tainted: P
ra : ffffffffc04459dc rlb_arp_recv+0xf4/0x228 [bonding]
Status: 1010cce3 KX SX UX KERNEL EXL IE
Cause : 00800008
BadVA : 0000000000000318
PrId : 000d9202 (Cavium Octeon II)
Modules linked in: bonding run(P) raid vscsih iscsitgt disk vdisk cache(P) service gmeta mpt2sas netlink bubble platform octeon_ethernet at24
Process swapper (pid: 0, threadinfo=ffffffff811c4000, task=ffffffff811e5280, tls=0000000000000000)
Stack : 0000000000000003 ffffffff81241498 ffffffff812414d8 a8000000029c8580
a8000000029c8644 a800000002559000 ffffffff811c79b0 ffffffff807a7648
000d0300000d0300 ffffffff808b22e0 000000000000003c a800000002559600
a8000000029c8580 a800000002b7d280 0000000000000000 0000000000000001
0000000000000001 0000000000000001 ffffffff811c7a10 ffffffffc0010154
ffffffff811c7b80 ffffffff802d22e8 0000000000000000 ffffffff80356140
0000000000000000 0000000000000000 8001670000000000 0000000000000001
0000000000000003 0000000000000001 0000000000000000 000000000000ffff
0000000000000000 ffffffffc001ac00 0000000000000020 000000011000001f
a800000031180000 0000000000000000 ffffffff811d2a00 8001670000000100
...
Call Trace:
[<ffffffffc0445a10>] rlb_arp_recv+0x128/0x228 [bonding]
[<ffffffff807a7648>] netif_receive_skb+0x3f0/0x4d8
[<ffffffffc0010154>] cvm_oct_napi_poll_38+0x7ac/0x10e8 [octeon_ethernet]
[<ffffffff807a8218>] net_rx_action+0x128/0x280
[<ffffffff80314018>] __do_softirq+0x130/0x248
[<ffffffff803141b8>] do_softirq+0x88/0x90
[<ffffffff80314418>] irq_exit+0x70/0x88
[<ffffffff808b22e0>] do_IRQ+0x48/0x60
[<ffffffff80104cd4>] octeon_irq_ip2_ciu+0x94/0xb8
[<ffffffff80103348>] plat_irq_dispatch+0x80/0xd0
[<ffffffff802d22e8>] ret_from_irq+0x0/0x4
[<ffffffff802d24e0>] r4k_wait+0x20/0x40
[<ffffffff802d4794>] cpu_idle+0x84/0xa0
[<ffffffff808a3270>] rest_init+0x80/0x98
[<ffffffff81243b5c>] start_kernel+0x37c/0x4c4
Code: de440268 70431003 0082882d <92230018> 10600007 3c02808b 8a020018 8e230000 9a02001b
Kernel panic - not syncing: Fatal exception in interrupt
*** NMI Watchdog interrupt on Core 0x01 ***
$0 0x0000000000000000 at 0xffffffff803471bc
v0 0xffffffff802d24c0 v1 0x0000000000000001
a0 0xfffffffffffffffd a1 0x0000000000000000
a2 0xffffffff812403c8 a3 0x0000000000000001
a4 0x0000000000000800 a5 0x0000000000000020
a6 0x0000000000000000 a7 0x000000aaab43b498
t0 0x0000000000000000 t1 0x000000001000001f
t2 0xa800000031188000 t3 0x0000000000000000
s0 0xffffffff853e0000 s1 0xffffffff853f0000
s2 0xffffffff811c8980 s3 0x0000000000000000
s4 0x0000000000000002 s5 0x0000000000200200
s6 0xffffffff811c8990 s7 0xffffffff811287d0
t8 0x0000000000000000 t9 0x0000005561b7f7b0
k0 0x0000000000000000 k1 0x0000000000000000
gp 0xa8000000310fc000 sp 0xa8000000310ffb10
s8 0xa8000000310ffb10 ra 0xffffffff802dbc18
err_epc 0xffffffff802d24e0 epc 0xffffffff802d24e0
status 0x000000001058cce4 cause 0x0000000040808800
sum0 0x0000000000000000 en0 0x0000000000000000
*** Chip soft reset soon ***
关键的一行应该是这个:epc : ffffffffc0445a10 rlb_arp_recv+0x128/0x228 [bonding]
2 反汇编一下
mips64-octeon-linux-gnu-objdump -S bonding.ko
先搜索一下 rlb_arp_recv的基址:
000000000000e8e8 <rlb_arp_recv>:
e8e8 + 0x128 = ea10
_lock_rx_hashtbl(bond);
hash_index = _simple_hash((u8*)&(arp->ip_src), sizeof(arp->ip_src));
client_info = &(bond_info->rx_hashtbl[hash_index]);
e9fc: 7c82f803 dext v0,a0,0x0,0x20
ea00: 24030030 li v1,48
ea04: de440268 ld a0,616(s2)
ea08: 70431003 dmul v0,v0,v1
ea0c: 0082882d daddu s1,a0,v0
if ((client_info->assigned) &&
ea10: 92230018 lbu v1,24(s1)
ea14: 10600007 beqz v1,ea34 <rlb_arp_recv+0x14c>
ea18: 3c020000 lui v0,0x0
ea1c: 8a020018 lwl v0,24(s0)
ea20: 8e230000 lw v1,0(s1)
ea24: 9a02001b lwr v0,27(s0)
ea28: 10620019 beq v1,v0,ea90 <rlb_arp_recv+0x1a8>
ea2c: 00000000 nop
spin_lock_bh(&(BOND_ALB_INFO(bond).rx_hashtbl_lock));
也就是说,正确的出错位置是 if ((client_info->assigned) &&
为什么这个会出错呢?
3 源代码分析过程
出错位置在 bond_alb.c 340,2-9
即rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp)函数(从上面的反汇编可以看到,这个函数的代码被编译器内联进了rlb_arp_recv,所以oops里显示的符号是rlb_arp_recv)
把代码贴出来
329 static void rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp)
330 {
331 struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
332 struct rlb_client_info *client_info;
333 u32 hash_index;
334
335 _lock_rx_hashtbl(bond);
336
337 hash_index = _simple_hash((u8*)&(arp->ip_src), sizeof(arp->ip_src));
338 client_info = &(bond_info->rx_hashtbl[hash_index]);
339
340 if ((client_info->assigned) &&
341 (client_info->ip_src == arp->ip_dst) &&
342 (client_info->ip_dst == arp->ip_src)) {
343 /* update the clients MAC address */
344 memcpy(client_info->mac_dst, arp->mac_src, ETH_ALEN);
345 client_info->ntt = 1;
346 bond_info->rx_ntt = 1;
347 }
348
349 _unlock_rx_hashtbl(bond);
350 }
L340里client_info是一个指针,那么应该跟踪一下它的值,于是printk出来,发现在oops的时候它的值比较怪异:
hugui: client_info: 300
很明显这个值是有问题的,不可能有这么小的值,内存的低端地址早就被占用了。
跟踪一下L338看看这个值怎么来的?又见两个指针!于是继续printk出来:
hugui: bond: a80000000d70b600
hugui: hash_index: 10
hugui: rx_hashtbl: 0
眼前一亮,0指针呀!于是 client_info: 300是咋回事也就很简单了(注意这里打印出来的值都是十六进制),看代码就知道,把rx_hashtbl表的第0x10项的地址取出来,0x10*0x30=0x300,sizeof(struct rlb_client_info)=0x30(即48字节,对应反汇编里的 li v1,48)。再加上assigned成员的偏移24(0x18),正好就是oops里的BadVA 0x318。
接下来就是查找为啥是0指针了。
在同一个文件以这个字符串“rx_hashtbl =”搜索一下,得到两处结果,浏览一下代码,我觉得这处相关性更大一些:
kernel_2.6/linux/drivers/net/bonding/bond_alb.c 803
803 static int rlb_initialize(struct bonding *bond)
804 {
805 struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
806 struct packet_type *pk_type = &(BOND_ALB_INFO(bond).rlb_pkt_type);
807 struct rlb_client_info *new_hashtbl;
808 int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info);
809 int i;
810
811 spin_lock_init(&(bond_info->rx_hashtbl_lock));
812
813 new_hashtbl = kmalloc(size, GFP_KERNEL);
814 if (!new_hashtbl) {
815 pr_err(DRV_NAME
816 ": %s: Error: Failed to allocate RLB hash table\n",
817 bond->dev->name);
818 return -1;
819 }
820 _lock_rx_hashtbl(bond);
821
822 bond_info->rx_hashtbl = new_hashtbl;
823
824 bond_info->rx_hashtbl_head = RLB_NULL_INDEX;
825
826 for (i = 0; i < RLB_HASH_TABLE_SIZE; i++) {
827 rlb_init_table_entry(bond_info->rx_hashtbl + i);
828 }
829
830 _unlock_rx_hashtbl(bond);
831
832 /*initialize packet type*/
833 pk_type->type = cpu_to_be16(ETH_P_ARP);
834 pk_type->dev = NULL;
835 pk_type->func = rlb_arp_recv;
836
837 /* register to receive ARPs */
838 dev_add_pack(pk_type);
839
840 return 0;
841 }
从L822看,指针是存在的!
看看谁call了rlb_initialize: kernel_2.6/linux/drivers/net/bonding/bond_alb.c 1255
1255 int bond_alb_initialize(struct bonding *bond, int rlb_enabled)
1256 {
1257 int res;
1258
1259 res = tlb_initialize(bond);
1260 if (res) {
1261 return res;
1262 }
1263
1264 if (rlb_enabled) {
1265 bond->alb_info.rlb_enabled = 1;
1266 /* initialize rlb */
1267 res = rlb_initialize(bond);
1268 if (res) {
1269 tlb_deinitialize(bond);
1270 return res;
1271 }
1272 } else {
1273 bond->alb_info.rlb_enabled = 0;
1274 }
1275
1276 return 0;
1277 }
注意这个rlb_enabled的问题,再往上级查找,谁call了bond_alb_initialize?
drivers/net/bonding/bond_main.c
3714 static int bond_open(struct net_device *bond_dev)
3715 {
3716 struct bonding *bond = netdev_priv(bond_dev);
3717
3718 bond->kill_timers = 0;
3719
3720 if (bond_is_lb(bond)) {
3721 /* bond_alb_initialize must be called before the timer
3722 * is started.
3723 */
3724 if (bond_alb_initialize(bond, (bond->params.mode == BOND_MODE_ALB))) {
3725 /* something went wrong - fail the open operation */
3726 return -1;
3727 }
3728
3729 INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor);
3730 queue_delayed_work(bond->wq, &bond->alb_work, 0);
3731 }
3732
3733 if (bond->params.miimon) { /* link check interval, in milliseconds. */
3734 INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor);
3735 queue_delayed_work(bond->wq, &bond->mii_work, 0);
3736 }
3737
3738 if (bond->params.arp_interval) { /* arp interval, in milliseconds. */
3739 if (bond->params.mode == BOND_MODE_ACTIVEBACKUP)
3740 INIT_DELAYED_WORK(&bond->arp_work,
3741 bond_activebackup_arp_mon);
3742 else
3743 INIT_DELAYED_WORK(&bond->arp_work,
3744 bond_loadbalance_arp_mon);
3745
3746 queue_delayed_work(bond->wq, &bond->arp_work, 0);
3747 if (bond->params.arp_validate)
3748 bond_register_arp(bond);
3749 }
3750
3751 if (bond->params.mode == BOND_MODE_8023AD) {
3752 INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
3753 queue_delayed_work(bond->wq, &bond->ad_work, 0);
3754 /* register to receive LACPDUs */
3755 bond_register_lacpdu(bond);
3756 bond_3ad_initiate_agg_selection(bond, 1);
3757 }
3758
3759 return 0;
3760 }
L3720-L3727是关键代码!
从以上两个代码片段可知,调用rlb_initialize是有条件的,即在条件满足的时候才会去对rx_hashtbl 分配资源。
于是我猜测,oops的原因在于条件不满足的时候也去访问了rx_hashtbl ,分析了一下代码,然后加入了一些调试语句来验证我的猜测:
即
d1. 在bond_open函数中把bond指针打印出来,
d2. 在bond_open函数L3720之后插入一条打印语句来跟踪流程
d3. 在rlb_update_entry_from_arp函数中把bond指针打印出来
具有决定意义的信息出来了,分析信息后可以总结出这么几点:
a. 不管是rr,还是alb,在d1都有动作,且打印出了bond指针。
b. 和rr有关的,在d2没有动作.
c. 不管是rr,还是alb,在d3都有动作,且打印出了bond指针。
d. 经过比较指针内容,得知oops时,是和rr有关的,而根据b可知,其rx_hashtbl并未分配资源。
进一步分析,既然未分配资源,那么程序怎么就去访问它了呢?继续分析代码:
a. 看看是谁调用了rlb_update_entry_from_arp()?
b. 是它:rlb_arp_recv()
c. 再往上已经没有调用关系了,不过有一个引用:
803 static int rlb_initialize(struct bonding *bond)
804 {
805 struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
806 struct packet_type *pk_type = &(BOND_ALB_INFO(bond).rlb_pkt_type);
807 struct rlb_client_info *new_hashtbl;
808 int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info);
809 int i;
810
811 spin_lock_init(&(bond_info->rx_hashtbl_lock));
812
813 new_hashtbl = kmalloc(size, GFP_KERNEL);
814 if (!new_hashtbl) {
815 pr_err(DRV_NAME
816 ": %s: Error: Failed to allocate RLB hash table\n",
817 bond->dev->name);
818 return -1;
819 }
820 _lock_rx_hashtbl(bond);
821
822 bond_info->rx_hashtbl = new_hashtbl;
823
824 bond_info->rx_hashtbl_head = RLB_NULL_INDEX;
825
826 for (i = 0; i < RLB_HASH_TABLE_SIZE; i++) {
827 rlb_init_table_entry(bond_info->rx_hashtbl + i);
828 }
829
830 _unlock_rx_hashtbl(bond);
831
832 /*initialize packet type*/
833 pk_type->type = cpu_to_be16(ETH_P_ARP);
834 pk_type->dev = NULL;
835 pk_type->func = rlb_arp_recv;
836
837 /* register to receive ARPs */
838 dev_add_pack(pk_type);
839
840 return 0;
841 }
从L838可以看出,其在arp层注册了一个回调接口,即pk_type,它是和rlb_arp_recv有关的。
但是从L803可以看出,引用rlb_arp_recv的是rlb_initialize函数,这是一个初始化函数。
之前我们已经分析过了,只有在满足一定条件的情况下才会调用这个函数。显然和rr相关的是没有去调用这个初始化的,也就不会注册回调,而回调却产生了,岂不怪哉?
查看一下pk_type的数据结构:
include/linux/netdevice.h
1053 struct packet_type {
1054 __be16 type; /* This is really htons(ether_type). */
1055 struct net_device *dev; /* NULL is wildcarded here */
1056 int (*func) (struct sk_buff *,
1057 struct net_device *,
1058 struct packet_type *,
1059 struct net_device *);
1060 struct sk_buff *(*gso_segment)(struct sk_buff *skb,
1061 int features);
1062 int (*gso_send_check)(struct sk_buff *skb);
1063 struct sk_buff **(*gro_receive)(struct sk_buff **head,
1064 struct sk_buff *skb);
1065 int (*gro_complete)(struct sk_buff *skb);
1066 void *af_packet_priv;
1067 struct list_head list;
1068 };
眼前再次一亮,看这个关键信息:
1055 struct net_device *dev; /* NULL is wildcarded here */
看见 wildcarded这个单词了吧?翻译过来即“通配”!
我猜测其意思就是说当dev为NULL的时候就不区分是哪个网卡的arp包了(它也没法做匹配,因为没有上层的net_device结构信息),于是不管来自哪个网卡的数据,都调用这个回调接口,所以在rr模式下的网卡就悲剧了!
我再次猜测这应该是kernel的一个bug,鉴于修改起来并不麻烦:
832 /*initialize packet type*/
833 pk_type->type = cpu_to_be16(ETH_P_ARP);
834 pk_type->dev = NULL;
835 pk_type->func = rlb_arp_recv;
836
837 /* register to receive ARPs */
838 dev_add_pack(pk_type);
只需把L834修改成: pk_type->dev = bond->dev;
然后再验证一下,奇迹产生了,终于不再oops了!
总结研究
为了再次验证一下我的修改是否有问题,我查看了一下最新的内核代码,截止到本文写作时内核版本是3.9.7
drivers/net/bonding/bond_alb.c
910static int rlb_initialize(struct bonding *bond)
911{
912 struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
913 struct rlb_client_info *new_hashtbl;
914 int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info);
915 int i;
916
917 new_hashtbl = kmalloc(size, GFP_KERNEL);
918 if (!new_hashtbl)
919 return -1;
920
921 _lock_rx_hashtbl_bh(bond);
922
923 bond_info->rx_hashtbl = new_hashtbl;
924
925 bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX;
926
927 for (i = 0; i < RLB_HASH_TABLE_SIZE; i++) {
928 rlb_init_table_entry(bond_info->rx_hashtbl + i);
929 }
930
931 _unlock_rx_hashtbl_bh(bond);
932
933
934 bond->recv_probe = rlb_arp_recv;
935
936 return 0;
937}
看L934,把回调挂在了本身的数据结构里。
再看一段代码
drivers/net/bonding/bond_main.c
1448static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
1449{
1450 struct sk_buff *skb = *pskb;
1451 struct slave *slave;
1452 struct bonding *bond;
1453 int (*recv_probe)(const struct sk_buff *, struct bonding *,
1454 struct slave *);
1455 int ret = RX_HANDLER_ANOTHER;
1456
1457 skb = skb_share_check(skb, GFP_ATOMIC);
1458 if (unlikely(!skb))
1459 return RX_HANDLER_CONSUMED;
1460
1461 *pskb = skb;
1462
1463 slave = bond_slave_get_rcu(skb->dev);
1464 bond = slave->bond;
1465
1466 if (bond->params.arp_interval)
1467 slave->dev->last_rx = jiffies;
1468
1469 recv_probe = ACCESS_ONCE(bond->recv_probe);
1470 if (recv_probe) {
1471 ret = recv_probe(skb, bond, slave);
1472 if (ret == RX_HANDLER_CONSUMED) {
1473 consume_skb(skb);
1474 return ret;
1475 }
1476 }
1477
1478 if (bond_should_deliver_exact_match(skb, slave, bond)) {
1479 return RX_HANDLER_EXACT;
1480 }
1481
1482 skb->dev = bond->dev;
1483
1484 if (bond->params.mode == BOND_MODE_ALB &&
1485 bond->dev->priv_flags & IFF_BRIDGE_PORT &&
1486 skb->pkt_type == PACKET_HOST) {
1487
1488 if (unlikely(skb_cow_head(skb,
1489 skb->data - skb_mac_header(skb)))) {
1490 kfree_skb(skb);
1491 return RX_HANDLER_CONSUMED;
1492 }
1493 memcpy(eth_hdr(skb)->h_dest, bond->dev->dev_addr, ETH_ALEN);
1494 }
1495
1496 return ret;
1497}
L1470有对指针存在与否的判断。
drivers/net/bonding/bond_main.c
1888 res = netdev_rx_handler_register(slave_dev, bond_handle_frame,
1889 new_slave);
这里才是注册回调,也就是说,在中间插入了一个函数来检查recv_probe的存在与否,也就能够区分属于哪种bond模式。
netdev_rx_handler_register属于网络子系统的核心函数了。
可以看到新版本对其做了很大的修改,接下来我从SDK所用内核2.6.32.27开始往后跟踪,终于在2.6.35版本看到了和我一致的修改:
823
824 pk_type->type = cpu_to_be16(ETH_P_ARP);
825 pk_type->dev = bond->dev;
826 pk_type->func = rlb_arp_recv;
827
其它
对bond的创建操作、模式设置操作是在/sys下完成的
open是通过ifconfig来触发的。
=== 全文完 ====