环境
- Potentially all RHEL6 versions, seen at least on:
- Red Hat Enterprise Linux 6.6 (
kernel-2.6.32-504.23.4.el6
) - Red Hat Enterprise Linux 6.8 (
2.6.32-642.4.2.el6
)
- Red Hat Enterprise Linux 6.6 (
- Network interface with offloading enabled
问题
Machines are randomly rebooting/panicking with the following in the vmcore-dmesg.txt
and back trace:
<2>[259117.681597] kernel BUG at net/core/skbuff.c:1886!
<4>[259117.738957] invalid opcode: 0000 [#1] SMP
<4>[259117.789236] last sysfs file: /sys/devices/system/cpu/online
<4>[259117.857008] CPU 0
crash> bt
PID: 0 TASK: ffffffff81a8d020 CPU: 0 COMMAND: "swapper"
#0 [ffff880061a03190] machine_kexec at ffffffff8103b60b
#1 [ffff880061a031f0] crash_kexec at ffffffff810c99e2
#2 [ffff880061a032c0] oops_end at ffffffff8152e0d0
#3 [ffff880061a032f0] die at ffffffff81010f5b
#4 [ffff880061a03320] do_trap at ffffffff8152d934
#5 [ffff880061a03380] do_invalid_op at ffffffff8100cf55
#6 [ffff880061a03420] invalid_op at ffffffff8100c01b
[exception RIP: skb_checksum+725]
RIP: ffffffff81451ff5 RSP: ffff880061a034d0 RFLAGS: 00010206
RAX: ffff881062d2dd80 RBX: 0000000000000090 RCX: 00000000ff2a5251
RDX: 0000000000000180 RSI: 0000000000000000 RDI: ffff881062d2dd40
RBP: ffff880061a03530 R8: ffff880fe9624868 R9: ffff881062d2dd40
R10: 0000000000000020 R11: 0000000000000003 R12: 0000000000000042
R13: 0000000000000042 R14: 0000000000000000 R15: ffffffffa014a580
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffff880061a03538] skb_checksum_help at ffffffff8145d60a
#8 [ffff880061a03568] dev_hard_start_xmit at ffffffff81461f8d
#9 [ffff880061a035c8] sch_direct_xmit at ffffffff8147ec3a
#10 [ffff880061a03618] dev_queue_xmit at ffffffff81462368
#11 [ffff880061a03658] bond_dev_queue_xmit at ffffffffa024b93c [bonding]
#12 [ffff880061a03668] bond_start_xmit at ffffffffa024bc51 [bonding]
#13 [ffff880061a036b8] dev_hard_start_xmit at ffffffff81461e5c
#14 [ffff880061a03718] dev_queue_xmit at ffffffff814622fd
#15 [ffff880061a03758] ip_finish_output at ffffffff8149d618
#16 [ffff880061a037a8] ip_output at ffffffff8149d898
#17 [ffff880061a037d8] ip_local_out at ffffffff8149cb75
#18 [ffff880061a037f8] ip_queue_xmit at ffffffff8149d070
#19 [ffff880061a038a8] tcp_transmit_skb at ffffffff814b25a4
#20 [ffff880061a03928] tcp_write_xmit at ffffffff814b4aea
#21 [ffff880061a039b8] __tcp_push_pending_frames at ffffffff814b56d0
#22 [ffff880061a039d8] tcp_data_snd_check at ffffffff814accd3
#23 [ffff880061a039f8] tcp_rcv_established at ffffffff814b0941
#24 [ffff880061a03a58] tcp_v4_do_rcv at ffffffff814b8e13
#25 [ffff880061a03af8] tcp_v4_rcv at ffffffff814ba732
#26 [ffff880061a03b88] ip_local_deliver_finish at ffffffff814974ed
#27 [ffff880061a03bb8] ip_local_deliver at ffffffff81497778
#28 [ffff880061a03be8] ip_rcv_finish at ffffffff81496c3d
#29 [ffff880061a03c28] ip_rcv at ffffffff814971c5
#30 [ffff880061a03c68] __netif_receive_skb at ffffffff8145cdb8
#31 [ffff880061a03cb8] netif_receive_skb at ffffffff81460b78
#32 [ffff880061a03cf8] napi_skb_finish at ffffffff81460c80
#33 [ffff880061a03d18] napi_gro_receive at ffffffff81462539
#34 [ffff880061a03d38] ixgbe_clean_rx_irq at ffffffffa011bcca [ixgbe]
#35 [ffff880061a03db8] ixgbe_poll at ffffffffa0121583 [ixgbe]
#36 [ffff880061a03e68] net_rx_action at ffffffff81462653
#37 [ffff880061a03ec8] __do_softirq at ffffffff8107d901
#38 [ffff880061a03f38] call_softirq at ffffffff8100c38c
#39 [ffff880061a03f50] do_softirq at ffffffff8100fbd5
#40 [ffff880061a03f70] irq_exit at ffffffff8107d7b5
#41 [ffff880061a03f80] do_IRQ at ffffffff81533ba5
RIP: 0010:[<ffffffff8146b2fd>] [<ffffffff8146b2fd>] skb_over_panic+0x5d/0x70
...
Call Trace:
[<ffffffff814c2ee2>] ? sk_stream_alloc_skb+0xb2/0x110
[<ffffffff8146c0ec>] skb_put+0x5c/0x70
[<ffffffff814c348b>] tcp_sendmsg+0x54b/0xa20
[<ffffffff81464843>] sock_sendmsg+0x123/0x150
决议
Upgrade to RHEL7. Red Hat believe this bug is fixed by the following commit in RHEL7:
tcp: fix tcp_trim_head()
commit f07d960df3 (tcp: avoid frag allocation for small frames)
breaked assumption in tcp stack that skb is either linear (skb->data_len
== 0), or fully fragged (skb->data_len == skb->len)
tcp_trim_head() made this assumption, we must fix it.
Workaround
Disable adapter offloading features:
- If the relevant
/etc/sysconfig/network-scripts/ifcfg-*
files do not contain anETHTOOL_OPTS
parameter, add the line:
ETHTOOL_OPTS="-K eth1 rx off tx off tso off gso off gro off lro off"
- If the relevant
/etc/sysconfig/network-scripts/ifcfg-*
files already contain anETHTOOL_OPTS
line, then append the string; -K eth1 rx off tx off tso off gso off gro off lro off
as follows:
ETHTOOL_OPTS="-G ethX rx 4096; -G eth1 tx 4096; -K eth1 rx off tx off tso off gso off gro off lro off"
根源
A sk_buff
ended up with and invalid skb_shared_info
structure that did not tally with skb->data_len
. This caused the skb_checksum_help()
routine to reboot the system generating a vmcore in the process.
诊断步骤
From a vmcore:
CPUS: 24
LOAD AVERAGE: 2.23, 0.80, 0.56
NODENAME: hosta
RELEASE: 2.6.32-504.23.4.el6.x86_64
VERSION: #1 SMP Fri May 29 10:16:43 EDT 2015
MACHINE: x86_64 (2499 Mhz)
MEMORY: 63.9 GB
PANIC: "[191691.047083] kernel BUG at net/core/skbuff.c:1886!"
PID: 0
COMMAND: "swapper"
TASK: ffffffff81a8d020 (1 of 24) [THREAD_INFO: ffffffff81a00000]
CPU: 0
STATE: TASK_RUNNING (PANIC)
Vmore analysis follows:
.
crash> bt
PID: 0 TASK: ffffffff81a8d020 CPU: 0 COMMAND: "swapper"
#0 [ffff880061a03190] machine_kexec at ffffffff8103b60b
#1 [ffff880061a031f0] crash_kexec at ffffffff810c99e2
#2 [ffff880061a032c0] oops_end at ffffffff8152e0d0
#3 [ffff880061a032f0] die at ffffffff81010f5b
#4 [ffff880061a03320] do_trap at ffffffff8152d934
#5 [ffff880061a03380] do_invalid_op at ffffffff8100cf55
#6 [ffff880061a03420] invalid_op at ffffffff8100c01b
[exception RIP: skb_checksum+725]
RIP: ffffffff81451ff5 RSP: ffff880061a034d0 RFLAGS: 00010206
RAX: ffff881062d2dd80 RBX: 0000000000000090 RCX: 00000000ff2a5251
RDX: 0000000000000180 RSI: 0000000000000000 RDI: ffff881062d2dd40
RBP: ffff880061a03530 R8: ffff880fe9624868 R9: ffff881062d2dd40
R10: 0000000000000020 R11: 0000000000000003 R12: 0000000000000042
R13: 0000000000000042 R14: 0000000000000000 R15: ffffffffa014a580
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffff880061a03538] skb_checksum_help at ffffffff8145d60a
#8 [ffff880061a03568] dev_hard_start_xmit at ffffffff81461f8d
#9 [ffff880061a035c8] sch_direct_xmit at ffffffff8147ec3a
#10 [ffff880061a03618] dev_queue_xmit at ffffffff81462368
#11 [ffff880061a03658] bond_dev_queue_xmit at ffffffffa024b93c [bonding]
#12 [ffff880061a03668] bond_start_xmit at ffffffffa024bc51 [bonding]
#13 [ffff880061a036b8] dev_hard_start_xmit at ffffffff81461e5c
#14 [ffff880061a03718] dev_queue_xmit at ffffffff814622fd
#15 [ffff880061a03758] ip_finish_output at ffffffff8149d618
#16 [ffff880061a037a8] ip_output at ffffffff8149d898
#17 [ffff880061a037d8] ip_local_out at ffffffff8149cb75
#18 [ffff880061a037f8] ip_queue_xmit at ffffffff8149d070
#19 [ffff880061a038a8] tcp_transmit_skb at ffffffff814b25a4
#20 [ffff880061a03928] tcp_write_xmit at ffffffff814b4aea
#21 [ffff880061a039b8] __tcp_push_pending_frames at ffffffff814b56d0
#22 [ffff880061a039d8] tcp_data_snd_check at ffffffff814accd3
#23 [ffff880061a039f8] tcp_rcv_established at ffffffff814b0941
#24 [ffff880061a03a58] tcp_v4_do_rcv at ffffffff814b8e13
#25 [ffff880061a03af8] tcp_v4_rcv at ffffffff814ba732
#26 [ffff880061a03b88] ip_local_deliver_finish at ffffffff814974ed
#27 [ffff880061a03bb8] ip_local_deliver at ffffffff81497778
#28 [ffff880061a03be8] ip_rcv_finish at ffffffff81496c3d
#29 [ffff880061a03c28] ip_rcv at ffffffff814971c5
#30 [ffff880061a03c68] __netif_receive_skb at ffffffff8145cdb8
#31 [ffff880061a03cb8] netif_receive_skb at ffffffff81460b78
#32 [ffff880061a03cf8] napi_skb_finish at ffffffff81460c80
#33 [ffff880061a03d18] napi_gro_receive at ffffffff81462539
#34 [ffff880061a03d38] ixgbe_clean_rx_irq at ffffffffa011bcca [ixgbe]
#35 [ffff880061a03db8] ixgbe_poll at ffffffffa0121583 [ixgbe]
#36 [ffff880061a03e68] net_rx_action at ffffffff81462653
#37 [ffff880061a03ec8] __do_softirq at ffffffff8107d901
#38 [ffff880061a03f38] call_softirq at ffffffff8100c38c
#39 [ffff880061a03f50] do_softirq at ffffffff8100fbd5
#40 [ffff880061a03f70] irq_exit at ffffffff8107d7b5
#41 [ffff880061a03f80] do_IRQ at ffffffff81533ba5
So the softirq was in the process of clearing the pending frames xmit queue.
Extract the sk_buff that was being processed:
crash> dis -lr ffffffff8145d60a|tail
/usr/src/debug/kernel-2.6.32-504.23.4.el6/linux-2.6.32-504.23.4.el6.x86_64/net/core/dev.c: 1951
0xffffffff8145d5f1 <skb_checksum_help+113>: cmp %eax,%r12d
0xffffffff8145d5f4 <skb_checksum_help+116>: jae 0xffffffff8145d72a <skb_checksum_help+426>
/usr/src/debug/kernel-2.6.32-504.23.4.el6/linux-2.6.32-504.23.4.el6.x86_64/net/core/dev.c: 1952
0xffffffff8145d5fa <skb_checksum_help+122>: sub %r12d,%edx
0xffffffff8145d5fd <skb_checksum_help+125>: xor %ecx,%ecx
0xffffffff8145d5ff <skb_checksum_help+127>: mov %r12d,%esi
0xffffffff8145d602 <skb_checksum_help+130>: mov %rbx,%rdi <--- so %rbx and %rbi are the same skbuff is in %rdi
Next stack frame:
crash> dis -lr ffffffff81451ff5|grep push
0xffffffff81451d20 <skb_checksum>: push %rbp
0xffffffff81451d24 <skb_checksum+4>: push %r15
0xffffffff81451d26 <skb_checksum+6>: push %r14
0xffffffff81451d28 <skb_checksum+8>: push %r13
0xffffffff81451d2a <skb_checksum+10>: push %r12
0xffffffff81451d2c <skb_checksum+12>: push %rbx < 6th push on the stack:
crash> bt -F
ffff880061a03508: [skbuff_fclone_cache] 0000000000000022
ffff880061a03518: [size-65536] [size-8192]
ffff880061a03528: ixgbe_netdev_ops ffff880061a03560
ffff880061a03538: skb_checksum_help+138
crash> rd ffff880061a03508
ffff880061a03508: ffff880fe9624868 hHb.....
crash>
Here it is:
crash> sk_buff ffff880fe9624868
struct sk_buff {
next = 0x0,
prev = 0x0,
sk = 0xffff881063a669c0,
tstamp = {
tv64 = 1445352847620836094
},
dev = 0xffff881068560020,
_skb_dst = 18446612200945595328,
sp = 0x0,
cb = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\XXX\000\000\000\000\000\000\000\000\000\000",
len = 210,
data_len = 144,
mac_len = 0,
hdr_len = 320,
{
csum = 1048864,
{
csum_start = 288,
csum_offset = 16
}
},
priority = 0,
flags1_begin = 0xffff880fe96248e4,
local_df = 0 '\000',
cloned = 1 '\001',
ip_summed = 3 '\003',
nohdr = 0 '\000',
nfctinfo = 0 '\000',
pkt_type = 0 '\000',
fclone = 2 '\002',
ipvs_property = 0 '\000',
peeked = 1 '\001',
nf_trace = 0 '\000',
protocol = 8,
flags1_end = 0xffff880fe96248e8,
destructor = 0xffffffff814b3ed0 <tcp_wfree>,
nfct = 0x0,
nfct_reasm = 0x0,
nf_bridge = 0x0,
iif = 0,
tc_index = 0,
tc_verd = 8192,
flags2_begin = 0xffff880fe9624910,
queue_mapping = 5,
ndisc_nodetype = 1 '\001',
__RH_UNUSED = 0 '\000',
ooo_okay = 0 '\000',
encapsulation = 0 '\000',
flags2_end = 0xffff880fe9624914,
{
napi_id = 0,
dma_cookie = 0
},
secmark = 0,
{
mark = 0,
dropcount = 0
},
vlan_tci = 0,
rxhash = 0,
transport_header = 288,
network_header = 268,
mac_header = 4294967295,
tail = 320,
end = 384,
head = 0xffff881062d2dc00 "",
data = 0xffff881062d2dcfe "",
truesize = 856,
users = {
counter = 1
}
}
So the sk_buff belongs to tcp_socket 0xffff881063a669c0 ("sk =" at the top of the
structure). Lets find the owner:
To do this dump all sockets to a file and then search the file for the socket to find the PID.
crash> foreach net -s > sockets.out
crash> ps
29860 29591 9 ffff881061546040 IN 5.7 6181920 3977016 [proc1]
crash> net -s|grep ffff881063a669c0
7330 ffff880da69e6700 ffff881063a669c0 INET:STREAM 1.xx.xx.20-443 11.xx.xx.190-51000 <--
crash>
So here we can see the local IP Address belongs to the loopback device:
crash> net
NET_DEVICE NAME IP ADDRESS(ES)
ffff881068593020 lo 1.xx.xx.20
ffff881068560020 eth1
ffff881062192020 bond0
ffff881062345020 bond1
crash>
Back to the sk_buff:
crash> sk_buff ffff880fe9624868|grep len
len = 210,
data_len = 144, <---- indicates that there should be fragments
mac_len = 0,
hdr_len = 320,
crash>
Dump the skb_shared_info structure. Find the address 1st:
include/linux/skbuff.h
637 static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)↩
638 {↩
639 ▹ return skb->head + skb->end;↩
640 }↩
crash> sk_buff ffff880fe9624868|grep head
head = 0xffff881062d2dc00
crash> sk_buff ffff880fe9624868|grep end
end = 384,
crash> px 0xffff881062d2dc00+384
$2 = 0xffff881062d2dd80
crash> px 0xffff881062d2dc00+384
$2 = 0xffff881062d2dd80
crash> skb_shared_info ffff881062d2dd80
struct skb_shared_info {
dataref = {
counter = 65538
},
nr_frags = 0, <---- 0
gso_size = 0,
gso_segs = 1,
gso_type = 0,
ip6_frag_id = 0,
tx_flags = {
{
hardware = 0 '\000',
software = 0 '\000',
in_progress = 0 '\000',
reserved = 0 '\000',
dev_zerocopy = 0 '\000',
shared_frag = 0 '\000'
},
flags = 0 '\000'
},
frag_list = 0x0, <--- no list
hwtstamps = {
hwtstamp = {
tv64 = 0
},
syststamp = {
tv64 = 0
}
},
frags = {{
page = 0xffffea0038d05b58,
page_offset = 0,
size = 64
}, {
page = 0x24,
page_offset = 1657986776,
size = 4294936592
}, {
page = 0xffffffffa01def70,
page_offset = 0,
size = 0
}, {
page = 0x0,
page_offset = 2686314640,
size = 4294967295
}, {
page = 0xffff881062d2dcc8,
page_offset = 4205894280,
size = 4294936591
}, {
page = 0x600000000,
page_offset = 0,
size = 0
}, {
page = 0x0,
page_offset = 1657986352,
size = 4294936592
}, {
page = 0xffff881062d2dd40,
page_offset = 1457633567,
size = 1820893016
}, {
page = 0x98da4b7d16a72354,
page_offset = 0,
size = 4096
}, {
page = 0x6060972fa5c0a4,
page_offset = 2443502653,
size = 201326592
}, {
page = 0x0,
page_offset = 0,
size = 0
}, {
page = 0x0,
page_offset = 0,
size = 0
}, {
page = 0xffffea00393fe030,
page_offset = 50,
size = 304
}, {
page = 0xffff8809e505fdc0,
page_offset = 0,
size = 4294936588
}, {
page = 0xc5df21e59cece0e9,
page_offset = 401553718,
size = 2699719059
}, {
page = 0x0,
page_offset = 1740561163,
size = 533550841
}, {
page = 0x5000000ef1ef3d7,
page_offset = 906265306,
size = 4294961664
}, {
page = 0x800000ce0,
page_offset = 0,
size = 0
}},
destructor_arg = 0x0
}