Description
The hang does not happen right away; it occurs occasionally while the networking-vpp agent is programming ACLs.
vpp# show version verbose
Version: v17.04-rc0~81-g655fcc4
Compiled by: root
Compile host: sjo-smf-ubuntu-server-2
Compile date: Tue Jan 10 17:41:27 PST 2017
Compile location: /home/demo/src/vpp
Compiler: GCC 5.4.0 20160609
Current PID: 5845
DPDK Version: DPDK 16.11.0
DPDK EAL init args: -c 1 -n 4 --huge-dir /run/vpp/hugepages --file-prefix vpp -b 0000:08:00.0 -b 0000:09:00.0 --master-lcore 0 --socket-mem 256,256
Here's the console output and backtrace of all the threads:
VPP console output when it is hung:
vpp# 0: acl_hook_l2_input_classify:545: ACL enabling on interface sw_if_index 4, setting tables to the following: ip4: 0 ip6: 1
0: acl_hook_l2_input_classify:545: ACL enabling on interface sw_if_index 4, setting tables to the following: ip4: 1 ip6: 0
0: acl_hook_l2_output_classify:603: ACL enabling on interface sw_if_index 4, setting tables to the following: ip4: 2 ip6: 3
0: acl_hook_l2_output_classify:603: ACL enabling on interface sw_if_index 4, setting tables to the following: ip4: 3 ip6: 2
vpp#
vpp#
vpp# 0: send_acl_details:1538: Sending acl details for ACL index 0
0: send_acl_details:1538: Sending acl details for ACL index 1
0: send_acl_details:1538: Sending acl details for ACL index 2
0: send_acl_details:1538: Sending acl details for ACL index 3
0: send_acl_details:1538: Sending acl details for ACL index 4
0: send_acl_details:1538: Sending acl details for ACL index 5
0: send_acl_details:1538: Sending acl details for ACL index 6
0: send_acl_details:1538: Sending acl details for ACL index 7
0: acl_hook_l2_input_classify:545: ACL enabling on interface sw_if_index 6, setting tables to the following: ip4: 8 ip6: 9
0: acl_hook_l2_input_classify:545: ACL enabling on interface sw_if_index 6, setting tables to the following: ip4: 9 ip6: 8
0: acl_hook_l2_output_classify:603: ACL enabling on interface sw_if_index 6, setting tables to the following: ip4: 10 ip6: 11
0: acl_hook_l2_output_classify:603: ACL enabling on interface sw_if_index 6, setting tables to the following: ip4: 11 ip6: 10
vpp#
vpp# 0: svm_client_scan_this_region_nolock:1139: /vpe-api: cleanup ghost pid 30698
0: svm_client_scan_this_region_nolock:1139: /vpe-api: cleanup ghost pid 30873
0: acl_hook_l2_input_classify:545: ACL enabling on interface sw_if_index 6, setting tables to the following: ip4: 8 ip6: 9
0: acl_hook_l2_input_classify:545: ACL enabling on interface sw_if_index 6, setting tables to the following: ip4: 9 ip6: 8
0: acl_hook_l2_output_classify:603: ACL enabling on interface sw_if_index 6, setting tables to the following: ip4: 10 ip6: 11
0: acl_hook_l2_output_classify:603: ACL enabling on interface sw_if_index 6, setting tables to the following: ip4: 11 ip6: 10
GDB trace:
(gdb) info threads
Id Target Id Frame
* 1 Thread 0x7ffff7fe38c0 (LWP 30865) "vpp_main" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
2 Thread 0x7fff722a8700 (LWP 30867) "vpp_stats" 0x00007ffff6362c5d in nanosleep () at ../sysdeps/unix/syscall-template.S:84
3 Thread 0x7fff72aa9700 (LWP 30866) "eal-intr-thread" 0x00007ffff5e8c153 in epoll_wait () at ../sysdeps/unix/syscall-template.S:84
(gdb) bt
#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1 0x00007ffff79be5bb in unix_shared_memory_queue_add (q=q@entry=0x305fec80, elem=elem@entry=0x7fffb5a01e68 "<7G0", nowait=nowait@entry=0)
at /home/demo/src/vpp/build-data/../src/vlibmemory/unix_shared_memory_queue.c:216
#2 0x00007ffff79b14dd in vl_msg_api_send_shmem (q=q@entry=0x305fec80, elem=elem@entry=0x7fffb5a01e68 "<7G0")
at /home/demo/src/vpp/build-data/../src/vlibmemory/memory_shared.c:547
#3 0x00000000005d2ba6 in vl_api_control_ping_t_handler (mp=0x3052573c) at /home/demo/src/vpp/build-data/../src/vpp/api/api.c:1277
#4 0x00007ffff7bcc9c3 in vl_msg_api_handler_with_vm_node (am=am@entry=0x7ffff7dd62e0 <api_main>, the_msg=the_msg@entry=0x3052573c,
vm=vm@entry=0x7ffff79aa940 <vlib_global_main>, node=node@entry=0x7fffb59f9000)
at /home/demo/src/vpp/build-data/../src/vlibapi/api_shared.c:510
#5 0x00007ffff79b86b4 in memclnt_process (vm=, node=0x7fffb59f9000, f=)
at /home/demo/src/vpp/build-data/../src/vlibmemory/memory_vlib.c:487
#6 0x00007ffff774e146 in vlib_process_bootstrap (_a=) at /home/demo/src/vpp/build-data/../src/vlib/main.c:1218
#7 0x00007ffff68a1850 in clib_calljmp () at /home/demo/src/vpp/build-data/../src/vppinfra/longjmp.S:110
#8 0x00007fffb5b41e20 in ?? ()
#9 0x00007ffff774f0f9 in vlib_process_startup (f=0x0, p=0x7fffb59f9000, vm=0x7ffff79aa940 <vlib_global_main>)
at /home/demo/src/vpp/build-data/../src/vlib/main.c:1240
#10 dispatch_process (vm=0x7ffff79aa940 <vlib_global_main>, p=0x7fffb59f9000, last_time_stamp=6216603291757294, f=0x0)
at /home/demo/src/vpp/build-data/../src/vlib/main.c:1283
#11 0x0000000300000002 in ?? ()
#12 0x6361727400000005 in ?? ()
#13 0x0000000300000065 in ?? ()
#14 0x000000250000000c in ?? ()
#15 0x736c706d2d367069 in ?? ()
#16 0x692d6c6562616c2d in ?? ()
#17 0x6f697469736f706d in ?? ()
#18 0x6e72757465722d6e in ?? ()
#19 0x000000006425203a in ?? ()
#20 0x0000000c00000000 in ?? ()
#21 0x000000230000000c in ?? ()
#22 0x736c706d2d367069 in ?? ()
#23 0x692d6c6562616c2d in ?? ()
#24 0x6f697469736f706d in ?? ()
#25 0x203a6c6c61632d6e in ?? ()
#26 0x0000000000006425 in ?? ()
#27 0x0000000c00000000 in ?? ()
#28 0x0000001900000008 in ?? ()
#29 0x736c706d2d347069 in ?? ()
#30 0x692d6c6562616c2d in ?? ()
#31 0x6f697469736f706d in ?? ()
#32 0x000000080000006e in ?? ()
#33 0x0000001900000008 in ?? ()
#34 0x736c706d2d367069 in ?? ()
#35 0x692d6c6562616c2d in ?? ()
#36 0x6f697469736f706d in ?? ()
#37 0x000000080000006e in ?? ()
#38 0xf746494000000046 in ?? ()
#39 0xb5a0215000007fff in ?? ()
#40 0x0000032e00007fff in ?? ()
#41 0x0000000000000000 in ?? ()
(gdb) thread 2
[Switching to thread 2 (Thread 0x7fff722a8700 (LWP 30867))]
#0 0x00007ffff6362c5d in nanosleep () at ../sysdeps/unix/syscall-template.S:84
84 ../sysdeps/unix/syscall-template.S: No such file or directory.
(gdb) bt
#0 0x00007ffff6362c5d in nanosleep () at ../sysdeps/unix/syscall-template.S:84
#1 0x00000000005bc4a6 in ip46_fib_stats_delay (sec=sec@entry=10, nsec=nsec@entry=0, sm=0xbdada0 <stats_main>)
at /home/demo/src/vpp/build-data/../src/vpp/stats/stats.c:251
#2 0x00000000005c34a8 in stats_thread_fn (arg=) at /home/demo/src/vpp/build-data/../src/vpp/stats/stats.c:626
#3 0x00007ffff68a1850 in clib_calljmp () at /home/demo/src/vpp/build-data/../src/vppinfra/longjmp.S:110
#4 0x00007fff722a7da0 in ?? ()
#5 0x00007ffff63596fa in start_thread (arg=0xf200000004) at pthread_create.c:333
#6 0x0000000000000000 in ?? ()
(gdb) thread 3
[Switching to thread 3 (Thread 0x7fff72aa9700 (LWP 30866))]
#0 0x00007ffff5e8c153 in epoll_wait () at ../sysdeps/unix/syscall-template.S:84
84 ../sysdeps/unix/syscall-template.S: No such file or directory.
(gdb) bt
#0 0x00007ffff5e8c153 in epoll_wait () at ../sysdeps/unix/syscall-template.S:84
#1 0x00000000004fbf54 in eal_intr_handle_interrupts (totalfds=, pfd=13)
at /home/demo/src/vpp/build-root/build-vpp-native/dpdk/dpdk-16.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:768
#2 eal_intr_thread_main (arg=)
at /home/demo/src/vpp/build-root/build-vpp-native/dpdk/dpdk-16.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:852
#3 0x00007ffff63596fa in start_thread (arg=0x7fff72aa9700) at pthread_create.c:333
#4 0x00007ffff5e8bb5d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
Assignee
Unassigned
Reporter
Naveen Joy
Comments
- dbarach (Wed, 18 Jan 2017 18:00:44 +0000): Duplicate of VPP-593.
- christophefontaine (Mon, 16 Jan 2017 12:02:17 +0000): Thanks a lot for looking at this!
I was playing with this parameter last Saturday, but I had to increase the queue size much more to get something stable (with a dozen VirtualEthernet interfaces).
Also, this patch is not in stable/1701: could we backport it? (I cherry-picked the patch without any conflict.)
- dbarach (Mon, 16 Jan 2017 11:58:16 +0000): This seems like the same issue reported in VPP-593. I won't mark it as a duplicate just yet, but it's quite likely to have the same root cause.
- dbarach (Mon, 16 Jan 2017 11:54:14 +0000): The vpp -> agent api message queue is full.
This seems related to the following agent backtrace:
(gdb) bt
#0 0x00007f9cd98ff8f3 in select () from /lib64/libc.so.6
#1 0x00007f9cd2ee107b in time_sleep () from /usr/lib64/python2.7/lib-dynload/timemodule.so
#2 0x00007f9cda5d8bd4 in PyEval_EvalFrameEx () from /lib64/libpython2.7.so.1.0
#3 0x00007f9cda5da1ed in PyEval_EvalCodeEx () from /lib64/libpython2.7.so.1.0
#4 0x00007f9cda5d889f in PyEval_EvalFrameEx () from /lib64/libpython2.7.so.1.0
#5 0x00007f9cda5da1ed in PyEval_EvalCodeEx () from /lib64/libpython2.7.so.1.0
#6 0x00007f9cda5670c8 in function_call () from /lib64/libpython2.7.so.1.0
#7 0x00007f9cda5420c3 in PyObject_Call () from /lib64/libpython2.7.so.1.0
#8 0x00007f9cda5510b5 in instancemethod_call () from /lib64/libpython2.7.so.1.0
#9 0x00007f9cda5420c3 in PyObject_Call () from /lib64/libpython2.7.so.1.0
#10 0x00007f9cda5d4037 in PyEval_CallObjectWithKeywords () from /lib64/libpython2.7.so.1.0
#11 0x00007f9ccd7f1a9c in g_initialstub () from /usr/lib64/python2.7/site-packages/greenlet.so
#12 0x00007f9ccd7f13e6 in g_switch () from /usr/lib64/python2.7/site-packages/greenlet.so
#13 0x00007f9ccd7f1f46 in green_switch () from /usr/lib64/python2.7/site-packages/greenlet.so
#14 0x00007f9cda5d8bd4 in PyEval_EvalFrameEx () from /lib64/libpython2.7.so.1.0
#15 0x00007f9cda5d8990 in PyEval_EvalFrameEx () from /lib64/libpython2.7.so.1.0
#16 0x00007f9cda5da1ed in PyEval_EvalCodeEx () from /lib64/libpython2.7.so.1.0
#17 0x00007f9cda5d889f in PyEval_EvalFrameEx () from /lib64/libpython2.7.so.1.0
#18 0x00007f9cda5d8990 in PyEval_EvalFrameEx () from /lib64/libpython2.7.so.1.0
#19 0x00007f9cda5da1ed in PyEval_EvalCodeEx () from /lib64/libpython2.7.so.1.0
#20 0x00007f9cda5670c8 in function_call () from /lib64/libpython2.7.so.1.0
#21 0x00007f9cda5420c3 in PyObject_Call () from /lib64/libpython2.7.so.1.0
#22 0x00007f9cda5510b5 in instancemethod_call () from /lib64/libpython2.7.so.1.0
#23 0x00007f9cda5420c3 in PyObject_Call () from /lib64/libpython2.7.so.1.0
#24 0x00007f9cda5421a5 in call_function_tail () from /lib64/libpython2.7.so.1.0
#25 0x00007f9cda54228e in PyObject_CallFunction () from /lib64/libpython2.7.so.1.0
#26 0x00007f9cccbb15dc in wrap_pneum_callback (data=0x3048216c "\001\344", len=10) at vpp_papi/pneum_wrap.c:34
#27 0x00007f9ccc9ae2b5 in pneum_api_handler (msg=0x3048216c) at /w/workspace/vpp-verify-1701-centos7/vpp-api/python/pneum/pneum.c:90
#28 pneum_rx_thread_fn (arg=) at /w/workspace/vpp-verify-1701-centos7/vpp-api/python/pneum/pneum.c:109
#29 0x00007f9cda2e3dc5 in start_thread () from /lib64/libpthread.so.0
#30 0x00007f9cd99081cd in clone () from /lib64/libc.so.6
As in: the agent stops processing for a considerable amount of time. The vpp -> agent queue is full. Vpp is stuck in a mutex/condvar wait.
We can start dropping vpp -> agent messages, but at a certain level this condition should *never* happen. As in: if the agent isn’t prepared to handle replies (and stats messages!) as fast as they arrive, it’s game-over.
Note that I added the ability for python coders to set the vpp -> python-client queue size. If it is simply the case that more queue depth is needed, calling VPP(..., rx_q_len=128) or some such might fix the problem.
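A minimal sketch of that suggestion, assuming vpp_papi exposes the vpp -> python-client queue depth roughly as described above ("VPP(..., rx_q_len=128) or some such"); the exact parameter name, and whether it belongs on the constructor or on connect(), may differ by vpp_papi version, and the client name below is illustrative only:
from vpp_papi import VPP

# Hedged sketch, not networking-vpp's actual code: request a deeper
# vpp -> client rx queue so bursts of reply/details messages do not fill it
# while the agent is busy draining them. The parameter spelling follows the
# comment above ("rx_q_len"); it may differ in your vpp_papi version.
vpp = VPP(rx_q_len=128)
vpp.connect('acl-test-client')   # illustrative client name
# ... issue ACL configuration / dump requests here ...
vpp.disconnect()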
- christophefontaine (Fri, 13 Jan 2017 14:17:54 +0000): Also tested with two VPPs, 17.01-rc2 (stable branch) and 17.04-rc0 (current master); same hang:
- sh version
vpp v17.01-rc2~5-g257d5e2 built by root on celexa.ctooffice at Fri Jan 13 14:41:28 CET 2017
(gdb) bt
#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1 0x00007f87fbdaf092 in unix_shared_memory_queue_add (q=q@entry=0x305e0f40, elem=elem@entry=0x7f87b9862e68 "4", nowait=nowait@entry=0) at /root/vpp/build-data/../vlib-api/vlibmemory/unix_shared_memory_queue.c:216
#2 0x00007f87fbda13bd in vl_msg_api_send_shmem (q=q@entry=0x305e0f40, elem=elem@entry=0x7f87b9862e68 "4") at /root/vpp/build-data/../vlib-api/vlibmemory/memory_shared.c:526
#3 0x00000000005c0a59 in send_sw_interface_vhost_user_details (am=0xb64ee0 <vpe_api_main>, context=1224736768, vui=0x7f87baa731e8, q=0x305e0f40) at /root/vpp/vpp/vpp-api/api.c:2700
#4 vl_api_sw_interface_vhost_user_dump_t_handler (mp=0x30503f94) at /root/vpp/vpp/vpp-api/api.c:2725
#5 0x00007f87fbfbcb33 in vl_msg_api_handler_with_vm_node (am=am@entry=0x7f87fc1c62c0 <api_main>, the_msg=the_msg@entry=0x30503f94, vm=vm@entry=0x7f87fbb7d700 <vlib_global_main>, node=node@entry=0x7f87b985a000) at /root/vpp/build-data/../vlib-api/vlibapi/api_shared.c:510
#6 0x00007f87fbda8753 in memclnt_process (vm=, node=0x7f87b985a000, f=) at /root/vpp/build-data/../vlib-api/vlibmemory/memory_vlib.c:487
#7 0x00007f87fb939df6 in vlib_process_bootstrap (_a=) at /root/vpp/build-data/../vlib/vlib/main.c:1218
#8 0x00007f87fa8aed20 in clib_calljmp () at /root/vpp/build-data/../vppinfra/vppinfra/longjmp.S:110
#9 0x00007f87b99a0e30 in ?? ()
#10 0x00007f87fb93ae91 in vlib_process_startup (f=0x0, p=0x7f87b985a000, vm=0x7f87fbb7d700 <vlib_global_main>) at /root/vpp/build-data/../vlib/vlib/main.c:1240
#11 dispatch_process (vm=0x7f87fbb7d700 <vlib_global_main>, p=0x7f87b985a000, last_time_stamp=13293978469616576, f=0x0) at /root/vpp/build-data/../vlib/vlib/main.c:1283
- vppctl sh ver
vpp v17.04-rc0~98-g8bf68e8 built by root on effexor.ctooffice at Fri Jan 13 14:39:15 CET 2017
(gdb) bt
#1 0x00007f0b4c21b202 in unix_shared_memory_queue_add (q=q@entry=0x305e0f40, elem=elem@entry=0x7f0b0a0ade68 "\334DG0", nowait=nowait@entry=0) at /root/vpp/build-data/../src/vlibmemory/unix_shared_memory_queue.c:216
#2 0x00007f0b4c20d4ed in vl_msg_api_send_shmem (q=q@entry=0x305e0f40, elem=elem@entry=0x7f0b0a0ade68 "\334DG0") at /root/vpp/build-data/../src/vlibmemory/memory_shared.c:547
#3 0x00007f0b4bc456fa in send_sw_interface_flags_deleted (am=, sw_if_index=4, q=0x305e0f40) at /root/vpp/build-data/../src/vnet/devices/virtio/vhost_user_api.c:70
#4 vl_api_delete_vhost_user_if_t_handler (mp=) at /root/vpp/build-data/../src/vnet/devices/virtio/vhost_user_api.c:147
#5 0x00007f0b4c428b03 in vl_msg_api_handler_with_vm_node (am=am@entry=0x7f0b4c6322c0 <api_main>, the_msg=the_msg@entry=0x30566bd4, vm=vm@entry=0x7f0b4c206960 <vlib_global_main>, node=node@entry=0x7f0b0a0a5000) at /root/vpp/build-data/../src/vlibapi/api_shared.c:510
#6 0x00007f0b4c2148c3 in memclnt_process (vm=, node=0x7f0b0a0a5000, f=) at /root/vpp/build-data/../src/vlibmemory/memory_vlib.c:487
#7 0x00007f0b4bfa9446 in vlib_process_bootstrap (_a=) at /root/vpp/build-data/../src/vlib/main.c:1218
#8 0x00007f0b4af1ad70 in clib_calljmp () at /root/vpp/build-data/../src/vppinfra/longjmp.S:110
#9 0x00007f0b0a1eee30 in ?? ()
#10 0x00007f0b4bfaa4e1 in vlib_process_startup (f=0x0, p=0x7f0b0a0a5000, vm=0x7f0b4c206960 <vlib_global_main>) at /root/vpp/build-data/../src/vlib/main.c:1240
#11 dispatch_process (vm=0x7f0b4c206960 <vlib_global_main>, p=0x7f0b0a0a5000, last_time_stamp=40186761586983235, f=0x0) at /root/vpp/build-data/../src/vlib/main.c:1283
- christophefontaine (Fri, 13 Jan 2017 09:38:54 +0000): I also got this bug, and not only when setting the ACLs.
I also see this bug on vpp 17.01-rc2:
vpp version:
vpp# sh version
vpp v17.01-rc2~1-gb95a916b13 built by jenkins on centos7-basebuild-4c-4g-1999 at Thu Jan 5 16:13:53 UTC 2017
vpp conf file:
unix {
nodaemon
log /tmp/vpp.log
full-coredump
cli-listen localhost:5002
interactive
}
cpu {
main-core 7
}
dpdk {
socket-mem 1024
uio-driver uio_pci_generic
dev 0000:03:00.0
dev 0000:08:00.0
}
api-trace {
on
}
gdb backtrace:
#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1 0x00007f98732aa0e2 in unix_shared_memory_queue_add (q=q@entry=0x305dcf80, elem=elem@entry=0x7f9830d5de68 "$\253F0",
nowait=nowait@entry=0) at /w/workspace/vpp-merge-1701-centos7/vlib-api/vlibmemory/unix_shared_memory_queue.c:216
#2 0x00007f987329c40d in vl_msg_api_send_shmem (q=q@entry=0x305dcf80, elem=elem@entry=0x7f9830d5de68 "$\253F0")
at /w/workspace/vpp-merge-1701-centos7/vlib-api/vlibmemory/memory_shared.c:526
#3 0x00000000005bca46 in vl_api_control_ping_t_handler (mp=0x304daefc) at /w/workspace/vpp-merge-1701-centos7/vpp/vpp-api/api.c:2061
#4 0x00007f98734b7b83 in vl_msg_api_handler_with_vm_node (am=am@entry=0x7f98736c12c0 <api_main>, the_msg=the_msg@entry=0x304daefc,
vm=vm@entry=0x7f9873078700 <vlib_global_main>, node=node@entry=0x7f9830d55000)
at /w/workspace/vpp-merge-1701-centos7/vlib-api/vlibapi/api_shared.c:510
#5 0x00007f98732a37a3 in memclnt_process (vm=, node=0x7f9830d55000, f=)
at /w/workspace/vpp-merge-1701-centos7/vlib-api/vlibmemory/memory_vlib.c:487
#6 0x00007f9872e34e36 in vlib_process_bootstrap (_a=) at /w/workspace/vpp-merge-1701-centos7/vlib/vlib/main.c:1218
#7 0x00007f9871da9d20 in clib_calljmp () at /w/workspace/vpp-merge-1701-centos7/vppinfra/vppinfra/longjmp.S:110
#8 0x00007f9830e9be30 in ?? ()
#9 0x00007f9872e35ed1 in vlib_process_startup (f=0x0, p=0x7f9830d55000, vm=0x7f9873078700 <vlib_global_main>)
at /w/workspace/vpp-merge-1701-centos7/vlib/vlib/main.c:1240
#10 dispatch_process (vm=0x7f9873078700 <vlib_global_main>, p=0x7f9830d55000, last_time_stamp=12651934560102568, f=0x0)
at /w/workspace/vpp-merge-1701-centos7/vlib/vlib/main.c:1283
Original issue: https://jira.fd.io/browse/VPP-597