日志

kernel的 bonding问题hack过程记录

热度 5已有 4397 次阅读2013-7-8 10:35 |个人分类:原创|系统分类:嵌入式系统| linux, kernel, bonding, bug, oops

必现用例：

[2013/6/20 10:37:10] 方: Bug 637 - 网卡配置绑定导致系统不停重启问题

这个bug找到复现的关键点了。

主机和阵列通过交换机相连，并且交换机上连一根外网的网线，以rr模式添加bond0，alb模式添加bond1，就开始打堆栈，无限重启

[2013/6/20 10:37:36] 方: [2013年6月20日 10:37] 方:

<<< 交换机上连一根外网的网线还有rr和alb两个不同的模式，

[2013/6/20 10:37:51] 方: 不连外网网线的话出不了

[2013/6/20 10:38:13] 方: 而且无限重启的时候，把外网网线拔掉就能停下了

[2013/6/20 10:39:21] 方: 这个现象怎么解释

查找过程

1 首先根据oops定位到出错行

[2013/6/20 10:40:43] 方: CPU 0 Unable to handle kernel paging request at virtual address 0000000000000318, epc == ffffffffc0445a10, ra == ffffffffc04459dc

Oops[#1]:

Cpu 0

$ 0 : 0000000000000000 ffffffff808b1da0 0000000000000300 0000000000000030

$ 4 : 0000000000000000 a8000000029d2160 000000000000002e a800000002559000

$ 8 : a8000000029d2140 0000000000000001 0000000000000000 0000000000000018

$12 : 0000000000000000 000000001000001f a800000031180000 0000000000000000

$16 : a8000000029d214e 0000000000000300 a8000000012d1600 a8000000029c8580

$20 : a8000000012d1870 ffffffff812408e8 0000000000000806 0000000000000000

$24 : 00000000000002b1 000000555d5887b0

$28 : ffffffff811c4000 ffffffff811c7970 ffffffff811c7970 ffffffffc04459dc

Hi : 0000000000000000

Lo : 0000000000000000

epc : ffffffffc0445a10 rlb_arp_recv+0x128/0x228 [bonding]

Tainted: P

ra : ffffffffc04459dc rlb_arp_recv+0xf4/0x228 [bonding]

Status: 1010cce3 KX SX UX KERNEL EXL IE

Cause : 00800008

BadVA : 0000000000000318

PrId : 000d9202 (Cavium Octeon II)

Modules linked in: bonding run(P) raid vscsih iscsitgt disk vdisk cache(P) service gmeta mpt2sas netlink bubble platform octeon_ethernet at24

Process swapper (pid: 0, threadinfo=ffffffff811c4000, task=ffffffff811e5280, tls=0000000000000000)

Stack : 0000000000000003 ffffffff81241498 ffffffff812414d8 a8000000029c8580

a8000000029c8644 a800000002559000 ffffffff811c79b0 ffffffff807a7648

000d0300000d0300 ffffffff808b22e0 000000000000003c a800000002559600

a8000000029c8580 a800000002b7d280 0000000000000000 0000000000000001

0000000000000001 0000000000000001 ffffffff811c7a10 ffffffffc0010154

ffffffff811c7b80 ffffffff802d22e8 0000000000000000 ffffffff80356140

0000000000000000 0000000000000000 8001670000000000 0000000000000001

0000000000000003 0000000000000001 0000000000000000 000000000000ffff

0000000000000000 ffffffffc001ac00 0000000000000020 000000011000001f

a800000031180000 0000000000000000 ffffffff811d2a00 8001670000000100

...

Call Trace:

[<ffffffffc0445a10>] rlb_arp_recv+0x128/0x228 [bonding]

[<ffffffff807a7648>] netif_receive_skb+0x3f0/0x4d8

[<ffffffffc0010154>] cvm_oct_napi_poll_38+0x7ac/0x10e8 [octeon_ethernet]

[<ffffffff807a8218>] net_rx_action+0x128/0x280

[<ffffffff80314018>] __do_softirq+0x130/0x248

[<ffffffff803141b8>] do_softirq+0x88/0x90

[<ffffffff80314418>] irq_exit+0x70/0x88

[<ffffffff808b22e0>] do_IRQ+0x48/0x60

[<ffffffff80104cd4>] octeon_irq_ip2_ciu+0x94/0xb8

[<ffffffff80103348>] plat_irq_dispatch+0x80/0xd0

[<ffffffff802d22e8>] ret_from_irq+0x0/0x4

[<ffffffff802d24e0>] r4k_wait+0x20/0x40

[<ffffffff802d4794>] cpu_idle+0x84/0xa0

[<ffffffff808a3270>] rest_init+0x80/0x98

[<ffffffff81243b5c>] start_kernel+0x37c/0x4c4

Code: de440268 70431003 0082882d <92230018> 10600007 3c02808b 8a020018 8e230000 9a02001b

Kernel panic - not syncing: Fatal exception in interrupt

*** NMI Watchdog interrupt on Core 0x01 ***

$0 0x0000000000000000 at 0xffffffff803471bc

v0 0xffffffff802d24c0 v1 0x0000000000000001

a0 0xfffffffffffffffd a1 0x0000000000000000

a2 0xffffffff812403c8 a3 0x0000000000000001

a4 0x0000000000000800 a5 0x0000000000000020

a6 0x0000000000000000 a7 0x000000aaab43b498

t0 0x0000000000000000 t1 0x000000001000001f

t2 0xa800000031188000 t3 0x0000000000000000

s0 0xffffffff853e0000 s1 0xffffffff853f0000

s2 0xffffffff811c8980 s3 0x0000000000000000

s4 0x0000000000000002 s5 0x0000000000200200

s6 0xffffffff811c8990 s7 0xffffffff811287d0

t8 0x0000000000000000 t9 0x0000005561b7f7b0

k0 0x0000000000000000 k1 0x0000000000000000

gp 0xa8000000310fc000 sp 0xa8000000310ffb10

s8 0xa8000000310ffb10 ra 0xffffffff802dbc18

err_epc 0xffffffff802d24e0 epc 0xffffffff802d24e0

status 0x000000001058cce4 cause 0x0000000040808800

sum0 0x0000000000000000 en0 0x0000000000000000

*** Chip soft reset soon ***

关键的一行应该是这个：epc : ffffffffc0445a10 rlb_arp_recv+0x128/0x228 [bonding]

2 反汇编一下

mips64-octeon-linux-gnu-objdump -S bonding.ko

先搜索一下 rlb_arp_recv的基址：

000000000000e8e8 <rlb_arp_recv>:

e8e8 + 0x128 = ea10

_lock_rx_hashtbl(bond);

hash_index = _simple_hash((u8*)&(arp->ip_src), sizeof(arp->ip_src));

client_info = &(bond_info->rx_hashtbl[hash_index]);

e9fc: 7c82f803 dext v0,a0,0x0,0x20

ea00: 24030030 li v1,48

ea04: de440268 ld a0,616(s2)

ea08: 70431003 dmul v0,v0,v1

ea0c: 0082882d daddu s1,a0,v0

if ((client_info->assigned) &&

ea10: 92230018 lbu v1,24(s1)

ea14: 10600007 beqz v1,ea34 <rlb_arp_recv+0x14c>

ea18: 3c020000 lui v0,0x0

ea1c: 8a020018 lwl v0,24(s0)

ea20: 8e230000 lw v1,0(s1)

ea24: 9a02001b lwr v0,27(s0)

ea28: 10620019 beq v1,v0,ea90 <rlb_arp_recv+0x1a8>

ea2c: 00000000 nop

spin_lock_bh(&(BOND_ALB_INFO(bond).rx_hashtbl_lock));

也就是说，正确的出错位置是 if ((client_info->assigned) &&

为什么这个会出错呢？

3 源代码分析过程

出错位置在 bond_alb.c 第340行（编辑器光标位置 340,2-9）

即rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp)函数（它是一个static函数，被编译器内联进了调用者rlb_arp_recv，所以oops和反汇编里看到的符号是rlb_arp_recv）

把代码贴出来

329 static void rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp)

330 {

331 struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));

332 struct rlb_client_info *client_info;

333 u32 hash_index;

334

335 _lock_rx_hashtbl(bond);

336

337 hash_index = _simple_hash((u8*)&(arp->ip_src), sizeof(arp->ip_src));

338 client_info = &(bond_info->rx_hashtbl[hash_index]);

339

340 if ((client_info->assigned) &&

341 (client_info->ip_src == arp->ip_dst) &&

342 (client_info->ip_dst == arp->ip_src)) {

343 /* update the clients MAC address */

344 memcpy(client_info->mac_dst, arp->mac_src, ETH_ALEN);

345 client_info->ntt = 1;

346 bond_info->rx_ntt = 1;

347 }

348

349 _unlock_rx_hashtbl(bond);

350 }

L340里client_info是一个指针，那么应该跟踪一下它的值，于是printk出来，发现在oops的时候它的值比较怪异：

hugui: client_info: 300

很明显这个值是有问题的，不可能有这么小的值，内存的低端地址早就被占用了。

跟踪一下L338看看这个值怎么来的？又见两个指针！于是继续printk出来：

hugui: bond: a80000000d70b600

hugui: hash_index: 10

hugui: rx_hashtbl: 0

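眼前一亮，0指针呀！于是 client_info: 300是咋回事也就很简单了，看代码就知道，是把rx_hashtbl表的第0x10项的地址取出来（printk打印的都是十六进制）：0x10*0x30=0x300，sizeof(struct rlb_client_info)=0x30（即48字节，和反汇编里的 li v1,48 一致）。再加上assigned成员的偏移24（0x18），正好就是oops里的BadVA 0x318。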

接下来就是查找为啥是0指针了。

在同一个文件以这个字符串“rx_hashtbl =”搜索一下，得到两处结果，浏览一下代码，我觉得这处相关性更大一些：

kernel_2.6/linux/drivers/net/bonding/bond_alb.c 803

803 static int rlb_initialize(struct bonding *bond)

804 {

805 struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));

806 struct packet_type *pk_type = &(BOND_ALB_INFO(bond).rlb_pkt_type);

807 struct rlb_client_info *new_hashtbl;

808 int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info);

809 int i;

810

811 spin_lock_init(&(bond_info->rx_hashtbl_lock));

812

813 new_hashtbl = kmalloc(size, GFP_KERNEL);

814 if (!new_hashtbl) {

815 pr_err(DRV_NAME

816 ": %s: Error: Failed to allocate RLB hash table\n",

817 bond->dev->name);

818 return -1;

819 }

820 _lock_rx_hashtbl(bond);

821

822 bond_info->rx_hashtbl = new_hashtbl;

823

824 bond_info->rx_hashtbl_head = RLB_NULL_INDEX;

825

826 for (i = 0; i < RLB_HASH_TABLE_SIZE; i++) {

827 rlb_init_table_entry(bond_info->rx_hashtbl + i);

828 }

829

830 _unlock_rx_hashtbl(bond);

831

832 /*initialize packet type*/

833 pk_type->type = cpu_to_be16(ETH_P_ARP);

834 pk_type->dev = NULL;

835 pk_type->func = rlb_arp_recv;

836

837 /* register to receive ARPs */

838 dev_add_pack(pk_type);

839

840 return 0;

841 }

从L822看，指针是存在的！

看看谁call了rlb_initialize： kernel_2.6/linux/drivers/net/bonding/bond_alb.c 1255

1255 int bond_alb_initialize(struct bonding *bond, int rlb_enabled)

1256 {

1257 int res;

1258

1259 res = tlb_initialize(bond);

1260 if (res) {

1261 return res;

1262 }

1263

1264 if (rlb_enabled) {

1265 bond->alb_info.rlb_enabled = 1;

1266 /* initialize rlb */

1267 res = rlb_initialize(bond);

1268 if (res) {

1269 tlb_deinitialize(bond);

1270 return res;

1271 }

1272 } else {

1273 bond->alb_info.rlb_enabled = 0;

1274 }

1275

1276 return 0;

1277 }

注意这个rlb_enabled的问题，再往上级查找，谁call了bond_alb_initialize？

drivers/net/bonding/bond_main.c

3714 static int bond_open(struct net_device *bond_dev)

3715 {

3716 struct bonding *bond = netdev_priv(bond_dev);

3717

3718 bond->kill_timers = 0;

3719

3720 if (bond_is_lb(bond)) {

3721 /* bond_alb_initialize must be called before the timer

3722 * is started.

3723 */

3724 if (bond_alb_initialize(bond, (bond->params.mode == BOND_MODE_ALB))) {

3725 /* something went wrong - fail the open operation */

3726 return -1;

3727 }

3728

3729 INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor);

3730 queue_delayed_work(bond->wq, &bond->alb_work, 0);

3731 }

3732

3733 if (bond->params.miimon) { /* link check interval, in milliseconds. */

3734 INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor);

3735 queue_delayed_work(bond->wq, &bond->mii_work, 0);

3736 }

3737

3738 if (bond->params.arp_interval) { /* arp interval, in milliseconds. */

3739 if (bond->params.mode == BOND_MODE_ACTIVEBACKUP)

3740 INIT_DELAYED_WORK(&bond->arp_work,

3741 bond_activebackup_arp_mon);

3742 else

3743 INIT_DELAYED_WORK(&bond->arp_work,

3744 bond_loadbalance_arp_mon);

3745

3746 queue_delayed_work(bond->wq, &bond->arp_work, 0);

3747 if (bond->params.arp_validate)

3748 bond_register_arp(bond);

3749 }

3750

3751 if (bond->params.mode == BOND_MODE_8023AD) {

3752 INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);

3753 queue_delayed_work(bond->wq, &bond->ad_work, 0);

3754 /* register to receive LACPDUs */

3755 bond_register_lacpdu(bond);

3756 bond_3ad_initiate_agg_selection(bond, 1);

3757 }

3758

3759 return 0;

3760 }

L3720-L3727是关键代码！

从以上两个代码片段可知，调用rlb_initialize是有条件的，即在条件满足的时候才会去对rx_hashtbl 分配资源。

于是我猜测，oops的原因在于条件不满足的时候也去访问了rx_hashtbl ，分析了一下代码，然后加入了一些调试语句来验证我的猜测：

即

d1. 在bond_open函数中把bond指针打印出来，

d2. 在bond_open函数L3720之后插入一条打印语句来跟踪流程

d3. 在rlb_update_entry_from_arp函数中把bond指针打印出来

具有决定意义的信息出来了，分析信息后可以总结出这么几点：

a. 不管是rr，还是alb，在d1都有动作，且打印出了bond指针。

b. 和rr有关的，在d2没有动作.

c. 不管是rr，还是alb，在d3都有动作，且打印出了bond指针。

d. 经过比较指针内容，得知oops时，是和rr有关的，而根据b可知，其rx_hashtbl并未分配资源。

进一步分析，既然未分配资源，那么程序怎么就去访问它了呢？继续分析代码：

a. 看看是谁调用了rlb_update_entry_from_arp（）？

b. 是它：rlb_arp_recv（）

c. 再往上已经没有调用关系了，不过有一个引用：

803 static int rlb_initialize(struct bonding *bond)

804 {

805 struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));

806 struct packet_type *pk_type = &(BOND_ALB_INFO(bond).rlb_pkt_type);

807 struct rlb_client_info *new_hashtbl;

808 int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info);

809 int i;

810

811 spin_lock_init(&(bond_info->rx_hashtbl_lock));

812

813 new_hashtbl = kmalloc(size, GFP_KERNEL);

814 if (!new_hashtbl) {

815 pr_err(DRV_NAME

816 ": %s: Error: Failed to allocate RLB hash table\n",

817 bond->dev->name);

818 return -1;

819 }

820 _lock_rx_hashtbl(bond);

821

822 bond_info->rx_hashtbl = new_hashtbl;

823

824 bond_info->rx_hashtbl_head = RLB_NULL_INDEX;

825

826 for (i = 0; i < RLB_HASH_TABLE_SIZE; i++) {

827 rlb_init_table_entry(bond_info->rx_hashtbl + i);

828 }

829

830 _unlock_rx_hashtbl(bond);

831

832 /*initialize packet type*/

833 pk_type->type = cpu_to_be16(ETH_P_ARP);

834 pk_type->dev = NULL;

835 pk_type->func = rlb_arp_recv;

836

837 /* register to receive ARPs */

838 dev_add_pack(pk_type);

839

840 return 0;

841 }

从L838可以看出，其在arp层注册了一个回调接口，即pk_type，它是和rlb_arp_recv有关的。

但是从L803可以看出，引用rlb_arp_recv的是rlb_initialize函数，这是一个初始化函数。

之前我们已经分析过了，只有在满足一定条件的情况下才会调用这个函数。显然和rr相关的是没有去调用这个初始化的，也就不会注册回调，而回调却产生了，岂不怪哉？

查看一下pk_type的数据结构:

include/linux/netdevice.h

1053 struct packet_type {

1054 __be16 type; /* This is really htons(ether_type). */

1055 struct net_device *dev; /* NULL is wildcarded here */

1056 int (*func) (struct sk_buff *,

1057 struct net_device *,

1058 struct packet_type *,

1059 struct net_device *);

1060 struct sk_buff *(*gso_segment)(struct sk_buff *skb,

1061 int features);

1062 int (*gso_send_check)(struct sk_buff *skb);

1063 struct sk_buff **(*gro_receive)(struct sk_buff **head,

1064 struct sk_buff *skb);

1065 int (*gro_complete)(struct sk_buff *skb);

1066 void *af_packet_priv;

1067 struct list_head list;

1068 };

眼前再次一亮，看这个关键信息：

1055 struct net_device *dev; /* NULL is wildcarded here */

看见 wildcarded这个单词了吧？翻译过来即“通配”！

我猜测其意思就是说当dev为NULL的时候就不区分是哪个网卡的arp包了（它也没法做匹配，因为没有上层的net_device结构信息），于是不管来自哪个网卡的数据，都调用这个回调接口，所以在rr模式下的网卡就悲剧了！

我再次猜测这应该是kernel的一个bug，鉴于修改起来并不麻烦：

832 /*initialize packet type*/

833 pk_type->type = cpu_to_be16(ETH_P_ARP);

834 pk_type->dev = NULL;

835 pk_type->func = rlb_arp_recv;

836

837 /* register to receive ARPs */

838 dev_add_pack(pk_type);

只需把L834修改成： pk_type->dev = bond->dev;

然后再验证一下，奇迹产生了，终于不再oops了！

总结研究

为了再次验证一下我的修改是否有问题，我查看了一下最新的内核代码，截止到本文写作时内核版本是3.9.7

drivers/net/bonding/bond_alb.c

 910static int rlb_initialize(struct bonding *bond)

 911{

 912        struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));

 913        struct rlb_client_info  *new_hashtbl;

 914        int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info);

 915        int i;

 916

 917        new_hashtbl = kmalloc(size, GFP_KERNEL);

 918        if (!new_hashtbl)

 919                return -1;

 920

 921        _lock_rx_hashtbl_bh(bond);

 922

 923        bond_info->rx_hashtbl = new_hashtbl;

 924

 925        bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX;

 926

 927        for (i = 0; i < RLB_HASH_TABLE_SIZE; i++) {

 928                rlb_init_table_entry(bond_info->rx_hashtbl + i);

 929        }

 930

 931        _unlock_rx_hashtbl_bh(bond);

 932

 933        /* register to receive ARPs */

 934        bond->recv_probe = rlb_arp_recv;

 935

 936        return 0;

 937}

看L934，把回调挂在了本身的数据结构里。

再看一段代码

drivers/net/bonding/bond_main.c

1448static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)

1449{

1450        struct sk_buff *skb = *pskb;

1451        struct slave *slave;

1452        struct bonding *bond;

1453        int (*recv_probe)(const struct sk_buff *, struct bonding *,

1454                          struct slave *);

1455        int ret = RX_HANDLER_ANOTHER;

1456

1457        skb = skb_share_check(skb, GFP_ATOMIC);

1458        if (unlikely(!skb))

1459                return RX_HANDLER_CONSUMED;

1460

1461        *pskb = skb;

1462

1463        slave = bond_slave_get_rcu(skb->dev);

1464        bond = slave->bond;

1465

1466        if (bond->params.arp_interval)

1467                slave->dev->last_rx = jiffies;

1468

1469        recv_probe = ACCESS_ONCE(bond->recv_probe);

1470        if (recv_probe) {

1471                ret = recv_probe(skb, bond, slave);

1472                if (ret == RX_HANDLER_CONSUMED) {

1473                        consume_skb(skb);

1474                        return ret;

1475                }

1476        }

1477

1478        if (bond_should_deliver_exact_match(skb, slave, bond)) {

1479                return RX_HANDLER_EXACT;

1480        }

1481

1482        skb->dev = bond->dev;

1483

1484        if (bond->params.mode == BOND_MODE_ALB &&

1485            bond->dev->priv_flags & IFF_BRIDGE_PORT &&

1486            skb->pkt_type == PACKET_HOST) {

1487

1488                if (unlikely(skb_cow_head(skb,

1489                                          skb->data - skb_mac_header(skb)))) {

1490                        kfree_skb(skb);

1491                        return RX_HANDLER_CONSUMED;

1492                }

1493                memcpy(eth_hdr(skb)->h_dest, bond->dev->dev_addr, ETH_ALEN);

1494        }

1495

1496        return ret;

1497}
L1470有对指针存在与否的判断。

drivers/net/bonding/bond_main.c
1888        res = netdev_rx_handler_register(slave_dev, bond_handle_frame,

1889                                         new_slave);
这里才是注册回调，也就是说，在中间插入了一个函数来检查recv_probe的存在与否，也就能够区分属于哪种bond模式。

netdev_rx_handler_register属于网络子系统的核心函数了。


可以看到新版本对其做了很大的修改，接下来我从SDK所用内核2.6.32.27开始往后跟踪，终于在2.6.35版本看到了和我一致的修改：
 823        /*initialize packet type*/

 824        pk_type->type = cpu_to_be16(ETH_P_ARP);

 825        pk_type->dev = bond->dev;

 826        pk_type->func = rlb_arp_recv;

 827


其它

对bond的创建操作、模式设置操作是在/sys下完成的
open是通过ifconfig来触发的。

=== 全文完 ====