Article overview:
This article shows how to use crash to analyze a panic caused by a system I/O hang,
and along the way covers some key data structures of the mq-deadline scheduler.
1. Problem description:
A virtual machine triggered kdump and rebooted while running; monitoring shows a burst of I/O reads and writes right before the reboot.
2. Analysis
The kdump log shows process 56149 blocked in D state for more than 120 seconds:
crash> log
...
[519966.887618] INFO: task java:56149 blocked for more than 120 seconds.
[519966.888432] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[519966.889329] java D ffff92704798b0c0 0 56149 1 0x00000084
[519966.889334] Call Trace:
[519966.889637] [<ffffffffb2969df9>] schedule_preempt_disabled+0x29/0x70
[519966.890356] [<ffffffffb2967d77>] __mutex_lock_slowpath+0xc7/0x1d0
[519966.891046] [<ffffffffb296715f>] mutex_lock+0x1f/0x2f
[519966.891652] [<ffffffffc07a8a5c>] xfs_reclaim_inodes_ag+0x2dc/0x390 [xfs]
[519966.892412] [<ffffffffb2256eb2>] ? native_smp_send_reschedule+0x52/0x70
[519966.893165] [<ffffffffb22d234e>] ? resched_curr+0xae/0xc0
[519966.893783] [<ffffffffb22d3250>] ? check_preempt_curr+0x80/0xa0
[519966.894458] [<ffffffffb22d3289>] ? ttwu_do_wakeup+0x19/0xe0
[519966.895100] [<ffffffffb22d33bf>] ? ttwu_do_activate+0x6f/0x80
[519966.895759] [<ffffffffb22d68a0>] ? try_to_wake_up+0x190/0x390
[519966.896407] [<ffffffffb22d6ab5>] ? wake_up_process+0x15/0x20
[519966.897131] [<ffffffffc07a9bc3>] xfs_reclaim_inodes_nr+0x33/0x40 [xfs]
[519966.897892] [<ffffffffc07ba2c5>] xfs_fs_free_cached_objects+0x15/0x20 [xfs]
[519966.898684] [<ffffffffb24444c6>] prune_super+0xf6/0x190
[519966.899282] [<ffffffffb23ca085>] shrink_slab+0x175/0x340
[519966.899891] [<ffffffffb24378c1>] ? vmpressure+0x21/0x90
[519966.900486] [<ffffffffb23cd342>] do_try_to_free_pages+0x3c2/0x4e0
[519966.901182] [<ffffffffb23cd55c>] try_to_free_pages+0xfc/0x180
[519966.901842] [<ffffffffb295e4af>] __alloc_pages_slowpath+0x457/0x724
[519966.902561] [<ffffffffb23c1454>] __alloc_pages_nodemask+0x404/0x420
[519966.903273] [<ffffffffb240e308>] alloc_pages_current+0x98/0x110
[519966.903947] [<ffffffffb2419395>] new_slab+0x2c5/0x390
[519966.904533] [<ffffffffb241ae0c>] ___slab_alloc+0x3ac/0x4f0
[519966.905161] [<ffffffffb28251bd>] ? __alloc_skb+0x8d/0x2d0
[519966.905784] [<ffffffffb2889d13>] ? ip_output+0x73/0xe0
[519966.906373] [<ffffffffb28251bd>] ? __alloc_skb+0x8d/0x2d0
[519966.906992] [<ffffffffb295f8cc>] __slab_alloc+0x40/0x5c
[519966.907597] [<ffffffffb241f888>] __kmalloc_node_track_caller+0xb8/0x290
[519966.908339] [<ffffffffb28251bd>] ? __alloc_skb+0x8d/0x2d0
[519966.908958] [<ffffffffb2823f01>] __kmalloc_reserve.isra.32+0x31/0x90
[519966.909683] [<ffffffffb282518d>] ? __alloc_skb+0x5d/0x2d0
[519966.910319] [<ffffffffb28251bd>] __alloc_skb+0x8d/0x2d0
[519966.910918] [<ffffffffb2894922>] sk_stream_alloc_skb+0x52/0x1b0
[519966.911597] [<ffffffffb28957c8>] tcp_sendmsg+0x788/0xc60
[519966.913041] [<ffffffffb281fa00>] ? release_sock+0x120/0x170
[519966.914502] [<ffffffffb2892b2a>] ? tcp_recvmsg+0x62a/0xb30
[519966.915941] [<ffffffffb28c1199>] inet_sendmsg+0x69/0xb0
[519966.917351] [<ffffffffb28c1340>] ? inet_recvmsg+0x80/0xb0
[519966.918763] [<ffffffffb281957d>] sock_aio_write+0x15d/0x180
[519966.920168] [<ffffffffb2440da3>] do_sync_write+0x93/0xe0
[519966.921508] [<ffffffffb2441995>] vfs_write+0x1c5/0x1f0
[519966.922813] [<ffffffffb24426af>] SyS_write+0x7f/0xf0
[519966.924080] [<ffffffffb2975ddb>] system_call_fastpath+0x22/0x27
The stack shows that low free memory triggered slab reclaim, which in turn tried to reclaim (write back) XFS inodes, but the process hung because it could not acquire a mutex:
crash> set 56149
PID: 56149
COMMAND: "java"
TASK: ffff92704798b0c0 [THREAD_INFO: ffff926f32a1c000]
CPU: 38
STATE: TASK_UNINTERRUPTIBLE
crash> bt
PID: 56149 TASK: ffff92704798b0c0 CPU: 38 COMMAND: "java"
#0 [ffff926f32a1f318] __schedule at ffffffffb2968a32
#1 [ffff926f32a1f3a8] schedule_preempt_disabled at ffffffffb2969df9
#2 [ffff926f32a1f3b8] __mutex_lock_slowpath at ffffffffb2967d77
#3 [ffff926f32a1f418] mutex_lock at ffffffffb296715f
#4 [ffff926f32a1f430] xfs_reclaim_inodes_ag at ffffffffc07a8a5c [xfs]
#5 [ffff926f32a1f5c8] xfs_reclaim_inodes_nr at ffffffffc07a9bc3 [xfs]
#6 [ffff926f32a1f5e8] xfs_fs_free_cached_objects at ffffffffc07ba2c5 [xfs]
#7 [ffff926f32a1f5f8] prune_super at ffffffffb24444c6
#8 [ffff926f32a1f630] shrink_slab at ffffffffb23ca085
#9 [ffff926f32a1f6d0] do_try_to_free_pages at ffffffffb23cd342
#10 [ffff926f32a1f748] try_to_free_pages at ffffffffb23cd55c
#11 [ffff926f32a1f7e0] __alloc_pages_slowpath at ffffffffb295e4af
#12 [ffff926f32a1f8d0] __alloc_pages_nodemask at ffffffffb23c1454
#13 [ffff926f32a1f980] alloc_pages_current at ffffffffb240e308
#14 [ffff926f32a1f9c8] new_slab at ffffffffb2419395
#15 [ffff926f32a1fa00] ___slab_alloc at ffffffffb241ae0c
#16 [ffff926f32a1fad8] __slab_alloc at ffffffffb295f8cc
#17 [ffff926f32a1fb18] __kmalloc_node_track_caller at ffffffffb241f888
#18 [ffff926f32a1fb70] __kmalloc_reserve at ffffffffb2823f01
#19 [ffff926f32a1fbb0] __alloc_skb at ffffffffb28251bd
#20 [ffff926f32a1fc00] sk_stream_alloc_skb at ffffffffb2894922
#21 [ffff926f32a1fc28] tcp_sendmsg at ffffffffb28957c8
#22 [ffff926f32a1fcf8] inet_sendmsg at ffffffffb28c1199
#23 [ffff926f32a1fd28] sock_aio_write at ffffffffb281957d
#24 [ffff926f32a1fdf0] do_sync_write at ffffffffb2440da3
#25 [ffff926f32a1fec8] vfs_write at ffffffffb2441995
#26 [ffff926f32a1ff08] sys_write at ffffffffb24426af
#27 [ffff926f32a1ff50] system_call_fastpath at ffffffffb2975ddb
RIP: 00007f2aa697543d RSP: 00007f2a5619e278 RFLAGS: 00000202
RAX: 0000000000000001 RBX: 000000000d091320 RCX: 000000000000fbe0
RDX: 000000000000fe17 RSI: 000000000d091320 RDI: 0000000000000624
RBP: 00007f2a561a0070 R8: 000000000000fe17 R9: 00000007524d10f8
R10: 00000000000591aa R11: 0000000000000293 R12: 00007f2a845a69f8
R13: 000000000000fe17 R14: 00007f2a561a00b0 R15: 00007f2a845a6800
ORIG_RAX: 0000000000000001 CS: 0033 SS: 002b
crash>
Find the address of the mutex that process 56149 is waiting on:
crash> dis -r ffffffffc07a8a5c
0xffffffffc07a882b <xfs_reclaim_inodes_ag+171>: lea 0x88(%rbx),%rax
0xffffffffc07a8832 <xfs_reclaim_inodes_ag+178>: mov %rax,-0x180(%rbp) // rax and -0x180(%rbp) now hold the struct mutex *lock address
0xffffffffc07a8839 <xfs_reclaim_inodes_ag+185>: mov %rax,%rdi // the next instruction can jump to xfs_reclaim_inodes_ag+727 (the mutex_lock call), so rdi holds the struct mutex *lock address there
0xffffffffc07a883c <xfs_reclaim_inodes_ag+188>: je 0xffffffffc07a8a57 <xfs_reclaim_inodes_ag+727>
0xffffffffc07a8842 <xfs_reclaim_inodes_ag+194>: callq 0xffffffffb29671b0 <mutex_trylock>
....
....
0xffffffffc07a8a4a <xfs_reclaim_inodes_ag+714>: mov %rbx,%rdi
0xffffffffc07a8a4d <xfs_reclaim_inodes_ag+717>: callq 0xffffffffc0795730 <xfs_perag_put>
0xffffffffc07a8a52 <xfs_reclaim_inodes_ag+722>: jmpq 0xffffffffc07a87f9 <xfs_reclaim_inodes_ag+121>
0xffffffffc07a8a57 <xfs_reclaim_inodes_ag+727>: callq 0xffffffffb2967140 <mutex_lock>
0xffffffffc07a8a5c <xfs_reclaim_inodes_ag+732>: movq $0x0,-0x138(%rbp)
crash> whatis mutex_lock
void mutex_lock(struct mutex *);
crash>
crash> dis mutex_lock
0xffffffffb2967140 <mutex_lock>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffb2967145 <mutex_lock+5>: push %rbp // the caller's rbp is pushed onto mutex_lock's stack frame
0xffffffffb2967146 <mutex_lock+6>: mov %rsp,%rbp
0xffffffffb2967149 <mutex_lock+9>: push %rbx
0xffffffffb296714a <mutex_lock+10>: mov %rdi,%rbx
0xffffffffb296714d <mutex_lock+13>: callq 0xffffffffb2969140 <_cond_resched>
0xffffffffb2967152 <mutex_lock+18>: mov %rbx,%rdi
0xffffffffb2967155 <mutex_lock+21>: lock decl (%rdi)
0xffffffffb2967158 <mutex_lock+24>: jns 0xffffffffb296715f <mutex_lock+31>
0xffffffffb296715a <mutex_lock+26>: callq 0xffffffffb2967cb0 <__mutex_lock_slowpath>
0xffffffffb296715f <mutex_lock+31>: mov %gs:0x10e80,%rax
0xffffffffb2967168 <mutex_lock+40>: mov %rax,0x18(%rbx)
0xffffffffb296716c <mutex_lock+44>: pop %rbx
0xffffffffb296716d <mutex_lock+45>: pop %rbp
0xffffffffb296716e <mutex_lock+46>: retq
crash>
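A side note on the fastpath seen in this disassembly: lock decl (%rdi) atomically decrements mutex->count, jns skips the slowpath when the result is still non-negative (the lock was free), and the instructions after the slowpath store current (read from %gs:0x10e80) into the field at offset 0x18, which is why crash> struct mutex.owner works later. The userspace sketch below models only this decision logic; the struct layout and helper names are made up for illustration and are not the kernel's:

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative layout only: a counter plus an owner pointer, loosely mirroring
 * the 3.10 struct mutex that crash reads .owner from in this analysis. */
struct toy_mutex {
    atomic_int count;   /* 1 = unlocked, 0 = locked, negative = locked with waiters */
    void *owner;        /* task_struct of the holder in the real kernel */
};

static void toy_mutex_lock(struct toy_mutex *m, void *current_task)
{
    /* fastpath: atomic decrement, like "lock decl (%rdi)"; the old value was 1
     * only if the mutex was free, i.e. the new value is non-negative ("jns"). */
    int old = atomic_fetch_sub(&m->count, 1);
    if (old <= 0) {
        /* contended: the real kernel calls __mutex_lock_slowpath and may sleep
         * in TASK_UNINTERRUPTIBLE -- exactly where pid 56149 is stuck. */
        printf("contended: would sleep in the slowpath until the owner unlocks\n");
    }
    /* once the lock is actually held, the owner is recorded; this is the field
     * the analysis reads below with "crash> struct mutex.owner". */
    m->owner = current_task;
}

int main(void)
{
    struct toy_mutex m = { ATOMIC_VAR_INIT(1), NULL };
    int task56451;
    toy_mutex_lock(&m, &task56451);
    printf("owner = %p\n", m.owner);
    return 0;
}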
crash> bt 56149 -f
PID: 56149 TASK: ffff92704798b0c0 CPU: 38 COMMAND: "java"
...
...
#2 [ffff926f32a1f3b8] __mutex_lock_slowpath at ffffffffb2967d77
ffff926f32a1f3c0: ffff92ab45777030 ffff926f33dc33c0
ffff926f32a1f3d0: ffff92704798b0c0 00000000c030b888
ffff926f32a1f3e0: ffff926f32a1f428 ffff92adb8a36988
ffff926f32a1f3f0: 0000000000000000 ffff926f32a1f4a0
ffff926f32a1f400: ffff926f32a1f5d4 ffff92adbb416000
ffff926f32a1f410: ffff926f32a1f428 ffffffffb296715f
#3 [ffff926f32a1f418] mutex_lock at ffffffffb296715f
ffff926f32a1f420: ffff92adb8a36900 ffff926f32a1f5c0
ffff926f32a1f430: ffffffffc07a8a5c
#4 [ffff926f32a1f430] xfs_reclaim_inodes_ag at ffffffffc07a8a5c [xfs]
...
mutex_lock pushes the caller's rbp on entry, so the saved rbp value ffff926f32a1f5c0 can be read from the stack,
and therefore -0x180(%rbp) = 0xffff926f32a1f5c0 - 0x180 = 0xffff926f32a1f440:
crash> px 0xffff926f32a1f5c0-0x180
$2 = 0xffff926f32a1f440
crash>
The value stored at 0xffff926f32a1f440 is ffff92adb8a36988, so the struct mutex *lock address is ffff92adb8a36988:
crash> rd 0xffff926f32a1f440
ffff926f32a1f440: ffff92adb8a36988 .i......
crash>
Find the task_struct address of the process holding the mutex, which turns out to be 0xffff92722decc100:
crash> struct mutex.owner ffff92adb8a36988
owner = 0xffff92722decc100
crash>
So process 56451 is the current holder of the mutex, and process 56451 itself is also in UN (uninterruptible) state:
crash> ps | grep ffff92722decc100
56451 1 62 ffff92722decc100 UN 1.5 7260408 4173556 java
crash>
crash> bt 56451
PID: 56451 TASK: ffff92722decc100 CPU: 62 COMMAND: "java"
#0 [ffff926fa124f158] __schedule at ffffffffb2968a32
#1 [ffff926fa124f1e8] schedule at ffffffffb2968ed9
#2 [ffff926fa124f1f8] xlog_cil_force_lsn at ffffffffc07c5ae2 [xfs]
#3 [ffff926fa124f298] _xfs_log_force_lsn at ffffffffc07c3d30 [xfs]
#4 [ffff926fa124f320] xfs_log_force_lsn at ffffffffc07c4024 [xfs]
#5 [ffff926fa124f358] __xfs_iunpin_wait at ffffffffc07b19ec [xfs]
#6 [ffff926fa124f3d0] xfs_iunpin_wait at ffffffffc07b4dc9 [xfs]
#7 [ffff926fa124f3e0] xfs_reclaim_inode at ffffffffc07a8563 [xfs]
#8 [ffff926fa124f430] xfs_reclaim_inodes_ag at ffffffffc07a89e7 [xfs]
#9 [ffff926fa124f5c8] xfs_reclaim_inodes_nr at ffffffffc07a9bc3 [xfs]
#10 [ffff926fa124f5e8] xfs_fs_free_cached_objects at ffffffffc07ba2c5 [xfs]
#11 [ffff926fa124f5f8] prune_super at ffffffffb24444c6
#12 [ffff926fa124f630] shrink_slab at ffffffffb23ca085
#13 [ffff926fa124f6d0] do_try_to_free_pages at ffffffffb23cd342
#14 [ffff926fa124f748] try_to_free_pages at ffffffffb23cd55c
#15 [ffff926fa124f7e0] __alloc_pages_slowpath at ffffffffb295e4af
#16 [ffff926fa124f8d0] __alloc_pages_nodemask at ffffffffb23c1454
#17 [ffff926fa124f980] alloc_pages_current at ffffffffb240e308
#18 [ffff926fa124f9c8] new_slab at ffffffffb2419395
#19 [ffff926fa124fa00] ___slab_alloc at ffffffffb241ae0c
#20 [ffff926fa124fad8] __slab_alloc at ffffffffb295f8cc
#21 [ffff926fa124fb18] __kmalloc_node_track_caller at ffffffffb241f888
#22 [ffff926fa124fb70] __kmalloc_reserve at ffffffffb2823f01
#23 [ffff926fa124fbb0] __alloc_skb at ffffffffb28251bd
#24 [ffff926fa124fc00] sk_stream_alloc_skb at ffffffffb2894922
#25 [ffff926fa124fc28] tcp_sendmsg at ffffffffb28957c8
#26 [ffff926fa124fcf8] inet_sendmsg at ffffffffb28c1199
#27 [ffff926fa124fd28] sock_aio_write at ffffffffb281957d
#28 [ffff926fa124fdf0] do_sync_write at ffffffffb2440da3
#29 [ffff926fa124fec8] vfs_write at ffffffffb2441995
#30 [ffff926fa124ff08] sys_write at ffffffffb24426af
#31 [ffff926fa124ff50] system_call_fastpath at ffffffffb2975ddb
RIP: 00007f2aa697543d RSP: 00007f2a664a3cb0 RFLAGS: 00000293
RAX: 0000000000000001 RBX: 000000000d37fab0 RCX: 0000000000000010
RDX: 000000000000fe17 RSI: 000000000d37fab0 RDI: 00000000000005aa
RBP: 00007f2a664a3e70 R8: 000000000000fe17 R9: 00000007525610f8
R10: 0000000000059198 R11: 0000000000000293 R12: 00007f2a851fd1f8
R13: 000000000000fe17 R14: 00007f2a664a3eb0 R15: 00007f2a851fd000
ORIG_RAX: 0000000000000001 CS: 0033 SS: 002b
crash>
First look at the inode that process 56451 was operating on when the system crashed, and the disk it belongs to:
crash> whatis xfs_iunpin_wait
void xfs_iunpin_wait(struct xfs_inode *);
crash>
crash> dis -r ffffffffc07a8563
...
0xffffffffc07a855b <xfs_reclaim_inode+315>: mov %rbx,%rdi //struct xfs_inode
0xffffffffc07a855e <xfs_reclaim_inode+318>: callq 0xffffffffc07b4db0 <xfs_iunpin_wait>
0xffffffffc07a8563 <xfs_reclaim_inode+323>: mov %r12,%rdi
crash> dis xfs_iunpin_wait
0xffffffffc07b4db0 <xfs_iunpin_wait>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffc07b4db5 <xfs_iunpin_wait+5>: mov 0xf0(%rdi),%eax
0xffffffffc07b4dbb <xfs_iunpin_wait+11>: test %eax,%eax
0xffffffffc07b4dbd <xfs_iunpin_wait+13>: jne 0xffffffffc07b4dc0 <xfs_iunpin_wait+16>
0xffffffffc07b4dbf <xfs_iunpin_wait+15>: retq
0xffffffffc07b4dc0 <xfs_iunpin_wait+16>: push %rbp
0xffffffffc07b4dc1 <xfs_iunpin_wait+17>: mov %rsp,%rbp
0xffffffffc07b4dc4 <xfs_iunpin_wait+20>: callq 0xffffffffc07b1950 <__xfs_iunpin_wait>
0xffffffffc07b4dc9 <xfs_iunpin_wait+25>: pop %rbp
0xffffffffc07b4dca <xfs_iunpin_wait+26>: retq
crash> dis __xfs_iunpin_wait
0xffffffffc07b1950 <__xfs_iunpin_wait>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffc07b1955 <__xfs_iunpin_wait+5>: push %rbp
0xffffffffc07b1956 <__xfs_iunpin_wait+6>: mov $0x8,%esi
0xffffffffc07b195b <__xfs_iunpin_wait+11>: mov %rsp,%rbp
0xffffffffc07b195e <__xfs_iunpin_wait+14>: push %r14
0xffffffffc07b1960 <__xfs_iunpin_wait+16>: push %r13
0xffffffffc07b1962 <__xfs_iunpin_wait+18>: push %r12
0xffffffffc07b1964 <__xfs_iunpin_wait+20>: lea 0xf8(%rdi),%r12
0xffffffffc07b196b <__xfs_iunpin_wait+27>: push %rbx //struct xfs_inode
crash> bt 56451 -f
...
...
#5 [ffff926fa124f358] __xfs_iunpin_wait at ffffffffc07b19ec [xfs]
ffff926fa124f360: ffff9272340069f8 0000000000000008
ffff926fa124f370: 0000000000000000 0000000000000000
ffff926fa124f380: ffff92722decc100 ffffffffb22c2f30
ffff926fa124f390: ffff926fa124f390 ffff926fa124f390
ffff926fa124f3a0: 000000009d36ba3a ffff927234006900 //rbx
ffff926fa124f3b0: ffff9272340069f4 0000000000000001
ffff926fa124f3c0: ffff92adb8a36900 ffff926fa124f3d8
ffff926fa124f3d0: ffffffffc07b4dc9
#6 [ffff926fa124f3d0] xfs_iunpin_wait at ffffffffc07b4dc9 [xfs]
ffff926fa124f3d8: ffff926fa124f428 ffffffffc07a8563
#7 [ffff926fa124f3e0] xfs_reclaim_inode at ffffffffc07a8563 [xfs]
ffff926fa124f3e8: 0000000000000000 0000000000000000
ffff926fa124f3f8: 000000009d36ba3a ffff92adb8a36900
ffff926fa124f408: 0000000000000000 ffff926fa124f520
ffff926fa124f418: 0000000000000003 ffff926fa124f520
ffff926fa124f428: ffff926fa124f5c0 ffffffffc07a89e7
#8 [ffff926fa124f430] xfs_reclaim_inodes_ag at ffffffffc07a89e7 [xfs]
...
...
struct xfs_inode ffff927234006900
crash> struct xfs_inode.i_mount ffff927234006900
i_mount = 0xffff92adbb416000
crash> struct xfs_mount.m_log 0xffff92adbb416000
m_log = 0xffff92a544196c00
crash>
crash> struct xfs_mount.m_super 0xffff92adbb416000
m_super = 0xffff92adba246800
crash>
crash> struct super_block.s_id 0xffff92adba246800
s_id = "vdg\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
crash>
Why was process 56451 scheduled out in xlog_cil_force_lsn?

xfs_lsn_t
xlog_cil_force_lsn(
    struct xlog *log,
    xfs_lsn_t sequence)
{
    ...
    list_for_each_entry(ctx, &cil->xc_committing, committing) {
        if (!ctx->commit_lsn) {
            /*
             * commit_lsn is only updated to the log sequence number (LSN)
             * by xlog_cil_push, via xfs_log_done, after the log has been
             * written to disk; a journaling filesystem must persist its
             * log before overwriting on-disk structures.
             *
             * It is still being pushed! Wait for the push to
             * complete, then start again from the beginning.
             */
            xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
            ...
        }
    }
    ...
}

For process 56451 to block in xlog_wait here, ctx->commit_lsn must be 0. How do we find the value of ctx->commit_lsn?
crash> whatis xlog_cil_force_lsn
xfs_lsn_t xlog_cil_force_lsn(struct xlog *, xfs_lsn_t);
crash>
crash> dis -r ffffffffc07c3d30
...
...
0xffffffffc07c3d19 <_xfs_log_force_lsn+105>: mov %r15,%rdi // arg 1: struct xlog
0xffffffffc07c3d1c <_xfs_log_force_lsn+108>: add -0x4d0b4360(,%rdx,8),%rax
0xffffffffc07c3d24 <_xfs_log_force_lsn+116>: addl $0x1,0x90(%rax)
0xffffffffc07c3d2b <_xfs_log_force_lsn+123>: callq 0xffffffffc07c5900 <xlog_cil_force_lsn>
0xffffffffc07c3d30 <_xfs_log_force_lsn+128>: mov %rax,%rbx
crash> dis xlog_cil_force_lsn
0xffffffffc07c5900 <xlog_cil_force_lsn>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffc07c5905 <xlog_cil_force_lsn+5>: push %rbp
0xffffffffc07c5906 <xlog_cil_force_lsn+6>: mov %rsp,%rbp
0xffffffffc07c5909 <xlog_cil_force_lsn+9>: push %r15 // pushed onto the stack
crash> bt 56451 -f
PID: 56451 TASK: ffff92722decc100 CPU: 62 COMMAND: "java"
...
...
#2 [ffff926fa124f1f8] xlog_cil_force_lsn at ffffffffc07c5ae2 [xfs]
ffff926fa124f200: ffff92722decc100 ffff92adb8a36ca0
ffff926fa124f210: ffff92adb8a36c08 ffff92adb8a36cc0
ffff926fa124f220: ffff92adb8a36c80 ffff92a544196c00
ffff926fa124f230: ffff92adb8a36c80 0000000000000001
ffff926fa124f240: ffff92722decc100 ffffffffb22d6b60
ffff926fa124f250: ffff92708e247250 ffff92adb8a36ca8
ffff926fa124f260: 000000009d36ba3a ffff927234006900
ffff926fa124f270: 0000000000000000 0000000000000000
ffff926fa124f280: ffff92722decc100 ffff92a544196c00
ffff926fa124f290: ffff926fa124f318 ffffffffc07c3d30
#3 [ffff926fa124f298] _xfs_log_force_lsn at ffffffffc07c3d30 [xfs]
So the first argument to xlog_cil_force_lsn, the struct xlog, is at ffff92a544196c00. (The same address could also be obtained from the inode information found earlier.)
crash> struct xlog.l_cilp ffff92a544196c00
l_cilp = 0xffff92adb8a36c00
crash>
crash> struct xfs_cil.xc_committing 0xffff92adb8a36c00
xc_committing = {
next = 0xffff92701f982f60, // points to xfs_cil_ctx.committing
prev = 0xffff92701f982f60
}
crash>
0xffff92701f982f60 is the address of an xfs_cil_ctx.committing member:
crash> list 0xffff92701f982f60 -l xfs_cil_ctx.committing -s xfs_cil_ctx.committing
ffff92701f982f60
committing = {
next = 0xffff92adb8a36c90,
prev = 0xffff92adb8a36c90
}
ffff92adb8a36c90
committing = {
next = 0xffff92701f982f60,
prev = 0xffff92701f982f60
}
crash>
crash> struct xfs_cil_ctx.committing -xo
struct xfs_cil_ctx {
[0x60] struct list_head committing;
}
crash>
crash> px (0xffff92adb8a36c90-0x60)
$1 = 0xffff92adb8a36c30
crash> struct xfs_cil_ctx.commit_lsn 0xffff92adb8a36c30
commit_lsn = 0
crash>
crash> px (0xffff92701f982f60-0x60)
$5 = 0xffff92701f982f00
crash>
ctx->commit_lsn is indeed 0, so the precondition assumed above holds:
crash> struct xfs_cil_ctx.commit_lsn 0xffff92701f982f00
commit_lsn = 0
crash>
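The subtraction performed above (the list_head address minus the 0x60 member offset) is container_of done by hand: committing lives at offset 0x60 inside struct xfs_cil_ctx, so stepping back by that offset from a committing pointer yields the enclosing ctx. A small userspace illustration of the same arithmetic, using a stand-in structure rather than the real xfs_cil_ctx:

#include <stddef.h>
#include <stdio.h>

/* Stand-in structure: only the idea matters - a list_head-like member
 * embedded at some offset inside a larger context structure. */
struct list_head { struct list_head *next, *prev; };

struct toy_cil_ctx {
    char pad[0x60];              /* pretend the earlier members occupy 0x60 bytes */
    struct list_head committing; /* the member whose address the CIL list stores  */
    unsigned long long commit_lsn;
};

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
    struct toy_cil_ctx ctx = { .commit_lsn = 0 };
    struct list_head *p = &ctx.committing;  /* what xc_committing.next points at */
    struct toy_cil_ctx *base = container_of(p, struct toy_cil_ctx, committing);

    /* same arithmetic crash did: 0xffff92adb8a36c90 - 0x60 = 0xffff92adb8a36c30 */
    printf("member %p -> struct %p (offset 0x%zx)\n",
           (void *)p, (void *)base, offsetof(struct toy_cil_ctx, committing));
    printf("commit_lsn = %llu\n", base->commit_lsn);
    return 0;
}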
The disk behind this xlog structure is also vdg:
crash> struct xlog.l_mp ffff92a544196c00
l_mp = 0xffff92adbb416000
crash>
crash> struct xfs_mount.m_super 0xffff92adbb416000
m_super = 0xffff92adba246800
crash> struct super_block.s_id 0xffff92adba246800
s_id = "vdg\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
crash>
crash> mount | grep vdg
ffff92adbb4c1e00 ffff92adba246800 xfs /dev/vdg /data5
crash>
The system triggered the panic and rebooted after running for 519966 seconds:
[519966.887618] INFO: task java:56149 blocked for more than 120 seconds.
The last time process 56451 started running on a CPU was at second 519946 after boot:
crash> struct task_struct.sched_info 0xffff92722decc100
sched_info = {
last_arrival = 519946514637661,
}
crash>
PID: 56451 TASK: ffff92722decc100 CPU: 62 COMMAND: "java"
RUN TIME: 00:04:53
START TIME: 519668
UTIME: 65000000
STIME: 628000000
crash>
The difference is 20 seconds, so the most recent continuous stretch during which process 56451 could have been holding this mutex is at most 20 seconds:
crash> p 519966-519946
$3 = 20
crash>
crash> p sysctl_hung_task_timeout_secs
sysctl_hung_task_timeout_secs = $4 = 120
crash> p sysctl_hung_task_panic
sysctl_hung_task_panic = $5 = 1
crash>
hung_task_timeout_secs is set to 120 seconds, i.e. process 56149 had already been hung waiting for the mutex for 120 consecutive seconds. It follows that during the last 120 seconds the mutex was not held by process 56451 the whole time, which also indirectly rules out a deadlock as the reason process 56451 has not released the mutex.
Also check which other processes are, like process 56451, blocked waiting for the XFS log to reach disk:
xlog_cil_force_lsn()
    struct xfs_cil *cil = log->l_cilp;
    ...
    xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);

#define DECLARE_WAITQUEUE(name, tsk) \
    wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)

#define __WAITQUEUE_INITIALIZER(name, tsk) { \
    .private = tsk, \
    .func = default_wake_function, \
    .task_list = { NULL, NULL } }

static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
{
    DECLARE_WAITQUEUE(wait, current);

    add_wait_queue_exclusive(wq, &wait);
    __set_current_state(TASK_UNINTERRUPTIBLE);
    spin_unlock(lock);
    schedule();
    remove_wait_queue(wq, &wait);
}

void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
{
    unsigned long flags;

    wait->flags |= WQ_FLAG_EXCLUSIVE;
    spin_lock_irqsave(&q->lock, flags);
    __add_wait_queue_tail(q, wait);
    spin_unlock_irqrestore(&q->lock, flags);
}

static inline void __add_wait_queue_tail(wait_queue_head_t *head, wait_queue_t *new)
{
    list_add_tail(&new->task_list, &head->task_list);
}
crash> struct xlog.l_cilp ffff92a544196c00
l_cilp = 0xffff92adb8a36c00
crash>
crash> struct xfs_cil.xc_commit_wait 0xffff92adb8a36c00
xc_commit_wait = {
lock = {
{
rlock = {
raw_lock = {
val = {
counter = 0
}
}
}
}
},
task_list = {
next = 0xffff926fa124f250, // from the code above this points to a wait_queue_t.task_list; next is the first waiter, prev the last
prev = 0xffff92708e247250
}
}
crash>
crash> struct xfs_cil.xc_commit_wait 0xffff92adb8a36c00 -xo
struct xfs_cil {
[ffff92adb8a36ca0] wait_queue_head_t xc_commit_wait;
}
crash>
crash> wait_queue_head_t
typedef struct __wait_queue_head {
spinlock_t lock;
struct list_head task_list; // task_list.next here points to a wait_queue_t.task_list address
} wait_queue_head_t;
SIZE: 24
crash>
crash> wait_queue_t
typedef struct __wait_queue {
unsigned int flags;
void *private;
wait_queue_func_t func;
struct list_head task_list;
} wait_queue_t;
SIZE: 40
crash>
0xffff926fa124f250 is a wait_queue_t.task_list address; walk the list to dump the corresponding wait_queue_t entries:
crash> list 0xffff926fa124f250 -l wait_queue_t.task_list -s wait_queue_t
ffff926fa124f250 // corresponds to next
struct wait_queue_t {
flags = 1,
private = 0xffff92722decc100,
func = 0xffffffffb22d6b60,
task_list = {
next = 0xffff92708e247250,
prev = 0xffff92adb8a36ca8
}
}
ffff92708e247250 // corresponds to prev
struct wait_queue_t {
flags = 1,
private = 0xffff926f00184100,
func = 0xffffffffb22d6b60,
task_list = {
next = 0xffff92adb8a36ca8,
prev = 0xffff926fa124f250
}
}
ffff92adb8a36ca8
struct wait_queue_t {
flags = 530067296,
private = 0xffff92701f982f60,
func = 0x0,
task_list = {
next = 0xffff926fa124f250,
prev = 0xffff92708e247250
}
}
wait_queue_t.private holds the task_struct address; list all tasks on the wait queue:
crash> list 0xffff926fa124f250 -l wait_queue_t.task_list -s wait_queue_t.private
ffff926fa124f250 // corresponds to next
private = 0xffff92722decc100
ffff92708e247250 // corresponds to prev
private = 0xffff926f00184100 // the task_struct
ffff92adb8a36ca8
private = 0xffff92701f982f60
crash>
The list shows that besides 56451 there is one more process blocked waiting for the XFS log flush (the third entry, ffff92adb8a36ca8, is xc_commit_wait.task_list itself, i.e. the list head, not a waiter).
Process 57314 was likewise pushed into inode writeback by low free memory:
crash> bt 0xffff926f00184100
PID: 57314 TASK: ffff926f00184100 CPU: 52 COMMAND: "java"
#0 [ffff92708e247158] __schedule at ffffffffb2968a32
#1 [ffff92708e2471e8] schedule at ffffffffb2968ed9
#2 [ffff92708e2471f8] xlog_cil_force_lsn at ffffffffc07c5ae2 [xfs]
#3 [ffff92708e247298] _xfs_log_force_lsn at ffffffffc07c3d30 [xfs]
#4 [ffff92708e247320] xfs_log_force_lsn at ffffffffc07c4024 [xfs]
#5 [ffff92708e247358] __xfs_iunpin_wait at ffffffffc07b19ec [xfs]
#6 [ffff92708e2473d0] xfs_iunpin_wait at ffffffffc07b4dc9 [xfs]
#7 [ffff92708e2473e0] xfs_reclaim_inode at ffffffffc07a8563 [xfs]
#8 [ffff92708e247430] xfs_reclaim_inodes_ag at ffffffffc07a89e7 [xfs]
#9 [ffff92708e2475c8] xfs_reclaim_inodes_nr at ffffffffc07a9bc3 [xfs]
#10 [ffff92708e2475e8] xfs_fs_free_cached_objects at ffffffffc07ba2c5 [xfs]
#11 [ffff92708e2475f8] prune_super at ffffffffb24444c6
#12 [ffff92708e247630] shrink_slab at ffffffffb23ca085
#13 [ffff92708e2476d0] do_try_to_free_pages at ffffffffb23cd342
#14 [ffff92708e247748] try_to_free_pages at ffffffffb23cd55c
#15 [ffff92708e2477e0] __alloc_pages_slowpath at ffffffffb295e4af
#16 [ffff92708e2478d0] __alloc_pages_nodemask at ffffffffb23c1454
#17 [ffff92708e247980] alloc_pages_current at ffffffffb240e308
#18 [ffff92708e2479c8] new_slab at ffffffffb2419395
#19 [ffff92708e247a00] ___slab_alloc at ffffffffb241ae0c
#20 [ffff92708e247ad8] __slab_alloc at ffffffffb295f8cc
#21 [ffff92708e247b18] __kmalloc_node_track_caller at ffffffffb241f888
#22 [ffff92708e247b70] __kmalloc_reserve at ffffffffb2823f01
#23 [ffff92708e247bb0] __alloc_skb at ffffffffb28251bd
#24 [ffff92708e247c00] sk_stream_alloc_skb at ffffffffb2894922
#25 [ffff92708e247c28] tcp_sendmsg at ffffffffb28957c8
#26 [ffff92708e247cf8] inet_sendmsg at ffffffffb28c1199
#27 [ffff92708e247d28] sock_aio_write at ffffffffb281957d
#28 [ffff92708e247df0] do_sync_write at ffffffffb2440da3
#29 [ffff92708e247ec8] vfs_write at ffffffffb2441995
#30 [ffff92708e247f08] sys_write at ffffffffb24426af
#31 [ffff92708e247f50] system_call_fastpath at ffffffffb2975ddb
From the analysis above, process 56451 is stuck in xlog_cil_force_lsn because xfs_cil_ctx.commit_lsn is 0. So which process is supposed to call xfs_log_done, set commit_lsn to the LSN, and wake up process 56451?
Search all task stacks for references to the struct xfs_cil address 0xffff92adb8a36c00:
crash> search -t 0xffff92adb8a36c00
PID: 38953 TASK: ffff92aaf7ba4100 CPU: 60 COMMAND: "java"
ffff92ab3f732e40: ffff92adb8a36c00
ffff92ab3f732e58: ffff92adb8a36c00
PID: 56451 TASK: ffff92722decc100 CPU: 62 COMMAND: "java"
ffff926fa124f120: ffff92adb8a36c00
ffff926fa124f148: ffff92adb8a36c00
ffff926fa124f1d0: ffff92adb8a36c00
PID: 57314 TASK: ffff926f00184100 CPU: 52 COMMAND: "java"
ffff92708e247120: ffff92adb8a36c00
ffff92708e247148: ffff92adb8a36c00
ffff92708e2471d0: ffff92adb8a36c00
PID: 57573 TASK: ffff926f2d49a080 CPU: 56 COMMAND: "java"
ffff926f803f2e40: ffff92adb8a36c00
ffff926f803f2e58: ffff92adb8a36c00
PID: 57990 TASK: ffff9272680f1040 CPU: 44 COMMAND: "java"
ffff926f807471d0: ffff92adb8a36c00
ffff926f807471e8: ffff92adb8a36c00
PID: 61516 TASK: ffff92704da40000 CPU: 60 COMMAND: "kworker/60:1"
ffff9270cc167d28: ffff92adb8a36c00
crash>
Checking these tasks one by one, all of them except 61516 are, like 56451, blocked in xfs_log_force_lsn.
We can therefore infer that process 56451 is waiting for process 61516 to finish flushing the XFS log before it can write its data:
crash> bt 61516
PID: 61516 TASK: ffff92704da40000 CPU: 60 COMMAND: "kworker/60:1"
#0 [ffff9270cc167778] __schedule at ffffffffb2968a32
#1 [ffff9270cc167808] schedule at ffffffffb2968ed9
#2 [ffff9270cc167818] schedule_timeout at ffffffffb29669e1
#3 [ffff9270cc1678c8] io_schedule_timeout at ffffffffb29685ad
#4 [ffff9270cc1678f8] io_schedule at ffffffffb2968648
#5 [ffff9270cc167908] blk_mq_get_tag at ffffffffb25541b1
#6 [ffff9270cc167978] __blk_mq_alloc_request at ffffffffb254f83b
#7 [ffff9270cc1679a8] blk_mq_sched_get_request at ffffffffb255653b
#8 [ffff9270cc1679e8] blk_mq_make_request at ffffffffb2551759
#9 [ffff9270cc167a78] generic_make_request at ffffffffb25454e7
#10 [ffff9270cc167ad0] submit_bio at ffffffffb2545790
#11 [ffff9270cc167b28] _xfs_buf_ioapply at ffffffffc079e473 [xfs]
#12 [ffff9270cc167bd8] xfs_buf_submit at ffffffffc079fd4c [xfs]
#13 [ffff9270cc167c00] xlog_bdstrat at ffffffffc07c094b [xfs]
#14 [ffff9270cc167c20] xlog_sync at ffffffffc07c27d6 [xfs]
#15 [ffff9270cc167c68] xlog_state_release_iclog at ffffffffc07c295b [xfs]
#16 [ffff9270cc167c98] xlog_write at ffffffffc07c361a [xfs]
#17 [ffff9270cc167d40] xlog_cil_push at ffffffffc07c4fb8 [xfs] // this is where commit_lsn is set to the value returned by xfs_log_done
#18 [ffff9270cc167e10] xlog_cil_push_work at ffffffffc07c5155 [xfs]
#19 [ffff9270cc167e20] process_one_work at ffffffffb22b9ebf
#20 [ffff9270cc167e68] worker_thread at ffffffffb22bb0a8
#21 [ffff9270cc167ec8] kthread at ffffffffb22c1da1
crash>
Find the xlog address that process 61516 was working on at crash time and verify that it is the same xlog that process 56451 is waiting on,
further confirming that 56451 is waiting for 61516 to finish the XFS log write:
crash> whatis xlog_sync
int xlog_sync(struct xlog *, struct xlog_in_core *);
crash>
crash> dis -r ffffffffc07c295b
...
0xffffffffc07c2950 <xlog_state_release_iclog+112>: mov %rbx,%rsi
0xffffffffc07c2953 <xlog_state_release_iclog+115>: mov %r14,%rdi//struct xlog
0xffffffffc07c2956 <xlog_state_release_iclog+118>: callq 0xffffffffc07c24f0 <xlog_sync>
0xffffffffc07c295b <xlog_state_release_iclog+123>: pop %rbx
crash> dis xlog_sync
0xffffffffc07c24f0 <xlog_sync>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffc07c24f5 <xlog_sync+5>: push %rbp
0xffffffffc07c24f6 <xlog_sync+6>: mov %rsp,%rbp
0xffffffffc07c24f9 <xlog_sync+9>: push %r15
0xffffffffc07c24fb <xlog_sync+11>: push %r14 //struct xlog
0xffffffffc07c24fd <xlog_sync+13>: push %r13
crash> bt 61516 -f
...
...
#14 [ffff9270cc167c20] xlog_sync at ffffffffc07c27d6 [xfs]
ffff9270cc167c28: 0000000000000040 ffffaa9619b38000
ffff9270cc167c38: ffff92adb8a36e00 ffff92a544196d28
ffff9270cc167c48: 00000000fffffffb ffff92a544196c00 //r14 struct xlog
ffff9270cc167c58: ffff927177cf4000 ffff9270cc167c90
ffff9270cc167c68: ffffffffc07c295b
#15 [ffff9270cc167c68] xlog_state_release_iclog at ffffffffc07c295b [xfs]
...
...
struct xlog = ffff92a544196c00
At crash time, process 61516 and process 56451 were accessing the same xlog, ffff92a544196c00,
which confirms that 56451 is blocked on I/O because it is waiting for 61516 to finish flushing the xlog.
Next, find the bio that process 61516 is submitting, and from it the disk partition, request, and related information:
crash> whatis generic_make_request
void generic_make_request(struct bio *);
crash>
crash> dis -r ffffffffb2545790
...
...
0xffffffffb2545788 <submit_bio+104>: mov %rbx,%rdi //struct bio
0xffffffffb254578b <submit_bio+107>: callq 0xffffffffb25453a0 <generic_make_request>
0xffffffffb2545790 <submit_bio+112>: mov -0x20(%rbp),%rax
crash> dis generic_make_request
0xffffffffb25453a0 <generic_make_request>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffb25453a5 <generic_make_request+5>: push %rbp
0xffffffffb25453a6 <generic_make_request+6>: xor %esi,%esi
0xffffffffb25453a8 <generic_make_request+8>: mov %rsp,%rbp
0xffffffffb25453ab <generic_make_request+11>: push %r12
0xffffffffb25453ad <generic_make_request+13>: mov %rdi,%r12
0xffffffffb25453b0 <generic_make_request+16>: push %rbx //struct bio
crash> bt 61516 -f
...
...
#9 [ffff9270cc167a78] generic_make_request at ffffffffb25454e7
ffff9270cc167a80: 0000000000000000 0000000000000000
ffff9270cc167a90: 0000000000000000 0000000000000000
ffff9270cc167aa0: ffff92aebf7e3900 000000008f45ebc2
ffff9270cc167ab0: ffffffffb247e989 ffff9270a9d74500 //rbx
ffff9270cc167ac0: 0000000000000040 ffff9270cc167b20
ffff9270cc167ad0: ffffffffb2545790
#10 [ffff9270cc167ad0] submit_bio at ffffffffb2545790
...
...
struct bio = ffff9270a9d74500
Check which disk the bio belongs to:
crash> struct bio.bi_bdev ffff9270a9d74500
bi_bdev = 0xffff92adb3fd0000
crash>
crash> struct block_device.bd_disk 0xffff92adb3fd0000
bd_disk = 0xffff92adb202f800
crash>
crash> struct gendisk.disk_name 0xffff92adb202f800
disk_name = "vdg\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000g"
crash>
Check when disk vdg last received a request:
crash> struct gendisk 0xffff92adb202f800 | grep -w stamp
stamp = 4814629147,
crash> p jiffies
jiffies = $10 = 4814629154
crash> p (4814629154-4814629147)
$11 = 7
crash>
The last I/O request sent to disk vdg was issued 7 jiffies before the panic was triggered.
Check the request_queue information:
crash> struct gendisk.queue 0xffff92adb202f800
queue = 0xffff92adb34f2660
crash> struct request_queue.nr_requests 0xffff92adb34f2660
nr_requests = 256 // software queue depth; see cat /sys/block/vdg/queue/nr_requests
                  // blk_mq_init_allocated_queue->blk_queue_make_request sets the default
                  // BLKDEV_MAX_RQ (128); when a scheduler is attached, blk_mq_init_sched sets
                  // q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, BLKDEV_MAX_RQ);
                  // q->tag_set->queue_depth is the hardware queue depth (blk_mq_tag_set.queue_depth)
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
/*
* Default to double of smaller one between hw queue_depth and 128,
* since we don't split into sync/async like the old code did.
* Additionally, this is a per-hw queue depth.
*/
q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
BLKDEV_MAX_RQ);
}
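As a quick sanity check, plugging the hardware queue depth found further down (queue_depth = 128) into this formula reproduces the nr_requests = 256 read from the vmcore. A trivial verification, assuming BLKDEV_MAX_RQ = 128 as in this kernel:

#include <stdio.h>

#define BLKDEV_MAX_RQ 128   /* default request-queue depth in this kernel */
#define min_t(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))

int main(void)
{
    unsigned int hw_queue_depth = 128;  /* blk_mq_tag_set.queue_depth from the vmcore */
    unsigned int nr_requests = 2 * min_t(unsigned int, hw_queue_depth, BLKDEV_MAX_RQ);
    printf("nr_requests = %u\n", nr_requests);  /* prints 256, matching the crash output */
    return 0;
}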
crash> struct request_queue.nr_hw_queues 0xffff92adb34f2660
nr_hw_queues = 1 // number of hardware queues of the block device
crash>
The software-queue-related request_queue members (queue_hw_ctx, nr_queues, etc.) are initialized in blk_mq_init_allocated_queue:
crash> struct request_queue.nr_queues 0xffff92adb34f2660
nr_queues = 64 // number of software queues, one per CPU
crash>
crash> struct request_queue.queue_hw_ctx 0xffff92adb34f2660
queue_hw_ctx = 0xffff92adb9383e00
crash>
crash> struct request_queue.elevator 0xffff92adb34f2660
elevator = 0xffff92adb21c8400
crash>
crash> struct elevator_queue.type 0xffff92adb21c8400
type = 0xffffffffb2ed60c0
crash> struct elevator_type.elevator_name 0xffffffffb2ed60c0
elevator_name = "mq-deadline\000\000\000\000" //io调度器类型
crash>
# cat /sys/block/vdg/queue/scheduler
[mq-deadline] kyber none
crash> struct request_queue.tag_set 0xffff92adb34f2660
tag_set = 0xffff92adb26629d0
crash> struct blk_mq_tag_set.nr_hw_queues 0xffff92adb26629d0
nr_hw_queues = 1 // number of hardware queues, initialized in virtblk_probe
crash>
crash> struct blk_mq_tag_set.queue_depth 0xffff92adb26629d0 -d
queue_depth = 128 // hardware queue depth; this value is copied into struct blk_mq_tags.nr_tags
crash>
Check the mq-deadline scheduler state:
crash> struct request_queue.elevator 0xffff92adb34f2660
elevator = 0xffff92adb21c8400
crash>
crash> struct elevator_queue.elevator_data 0xffff92adb21c8400
elevator_data = 0xffff92adb65e0680
crash>
crash> struct deadline_data 0xffff92adb65e0680
struct deadline_data {
sort_list = {{ // two red-black trees that sort requests by starting sector;
               // deadline_add_rq_rb inserts requests into them
rb_node = 0xffff92adb22e5a90 // read tree
}, {
rb_node = 0xffff92adb22b4890 // write tree
}},
fifo_list = {{ // read I/O FIFO
next = 0xffff92adb22d6c00, // oldest request on the read FIFO
prev = 0xffff92adb22f4800 // most recently added request on the read FIFO
}, { // write I/O FIFO
next = 0xffff92adb22b4800,
prev = 0xffff92adb2310000
}}, // dd_insert_request links each request's &request.queuelist into fifo_list
next_rq = {0xffff92adb235c600, 0x0}, // 0xffff92adb235c600 is the next request to be dispatched to the
                                     // block driver; element 0 is the read slot, element 1 the write slot
batching = 12,
...
...
}
crash>
sort_list is a pair of red-black trees that sort I/O requests by starting sector;
deadline's elevator scan walks these two trees, one for reads and one for writes.
fifo_list is a pair of plain first-in-first-out lists, again one for reads and one for writes.
Every I/O request entering the scheduler is stamped with an expiry time (current time plus the timeout)
and appended to the read or write fifo_list according to its direction,
so the head of each fifo_list holds the request that will expire soonest.
When dispatching, the scheduler checks whether the request at the head of the fifo_list has already expired;
if so it must be dispatched. That is the literal meaning of "deadline": every I/O request has a deadline.
The read and write timeouts are given by the fifo_expire members, defaulting to 500 ms for reads
and 5 s for writes (see cat /sys/block/vdg/queue/iosched/write_expire).
Reads get higher priority than writes because most reads are synchronous and need to be satisfied as quickly as possible.
With the mq-deadline scheduler, requests are added to sort_list and fifo_list in dd_insert_requests (a simplified userspace model of the FIFO side is sketched below, after the next crash output):
crash> struct elevator_type_aux.ops.mq.insert_requests 0xffff92adbdb0bb00
ops.mq.insert_requests = 0xffffffffb2570570,
crash> sym 0xffffffffb2570570
ffffffffb2570570 (t) dd_insert_requests /usr/src/debug/kernel-3.10.0-957.21.3.el7/linux-3.10.0-957.21.3.el7.x86_64/block/mq-deadline.c: 427
crash>
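To make the fifo_list behaviour concrete, here is a minimal userspace model of the FIFO side, under the 500 ms read / 5 s write expiry defaults described above; it only mimics the timestamp-on-insert and expired-head check, it is not the real dd_insert_request/deadline_check_fifo code:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

enum dir { DD_READ = 0, DD_WRITE = 1 };

struct toy_request {
    long long fifo_time_ms;     /* deadline assigned when the request is queued */
    struct toy_request *next;   /* simple singly linked FIFO, newest at the tail */
};

static const long long fifo_expire_ms[2] = { 500, 5000 };  /* read, write defaults */
static struct toy_request *fifo_head[2], *fifo_tail[2];

static long long now_ms(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
}

/* insert side: stamp the request and append it to the per-direction FIFO */
static void toy_insert(struct toy_request *rq, enum dir d)
{
    rq->fifo_time_ms = now_ms() + fifo_expire_ms[d];
    rq->next = NULL;
    if (fifo_tail[d])
        fifo_tail[d]->next = rq;
    else
        fifo_head[d] = rq;
    fifo_tail[d] = rq;
}

/* dispatch side: has the oldest request in this direction passed its deadline? */
static bool toy_fifo_expired(enum dir d)
{
    return fifo_head[d] && fifo_head[d]->fifo_time_ms <= now_ms();
}

int main(void)
{
    struct toy_request r;
    toy_insert(&r, DD_READ);
    printf("read head expired: %d\n", toy_fifo_expired(DD_READ));  /* 0: just queued */
    return 0;
}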
Find the blk_mq_alloc_data address to get at the software queue ctx and hardware queue hctx:
crash> whatis blk_mq_sched_get_request
struct request *blk_mq_sched_get_request(struct request_queue *, struct bio *, unsigned int, struct blk_mq_alloc_data *);
crash>
crash> dis -r ffffffffb2551759
...
...
0xffffffffb255173e <blk_mq_make_request+766>: mov -0x68(%rbp),%rsi
0xffffffffb2551742 <blk_mq_make_request+770>: mov 0x20(%rsi),%rax
0xffffffffb2551746 <blk_mq_make_request+774>: mov %rax,-0x70(%rbp)
0xffffffffb255174a <blk_mq_make_request+778>: mov -0x70(%rbp),%edx
0xffffffffb255174d <blk_mq_make_request+781>: lea -0x50(%rbp),%rcx//struct blk_mq_alloc_data
0xffffffffb2551751 <blk_mq_make_request+785>: mov %rbx,%rdi
0xffffffffb2551754 <blk_mq_make_request+788>: callq 0xffffffffb2556380 <blk_mq_sched_get_request>
...
...
crash> dis blk_mq_sched_get_request
0xffffffffb2556380 <blk_mq_sched_get_request>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffb2556385 <blk_mq_sched_get_request+5>: push %rbp //struct blk_mq_alloc_data = rbp-0x50
0xffffffffb2556386 <blk_mq_sched_get_request+6>: mov %rsp,%rbp
0xffffffffb2556389 <blk_mq_sched_get_request+9>: push %r15
0xffffffffb255638b <blk_mq_sched_get_request+11>: push %r14
0xffffffffb255638d <blk_mq_sched_get_request+13>: mov %rsi,%r14
0xffffffffb2556390 <blk_mq_sched_get_request+16>: mov %edx,%esi
0xffffffffb2556392 <blk_mq_sched_get_request+18>: push %r13
0xffffffffb2556394 <blk_mq_sched_get_request+20>: push %r12
0xffffffffb2556396 <blk_mq_sched_get_request+22>: mov %rdi,%r12
0xffffffffb2556399 <blk_mq_sched_get_request+25>: push %rbx
...
...
crash> bt -f
PID: 61516 TASK: ffff92704da40000 CPU: 60 COMMAND: "kworker/60:1"
...
...
#7 [ffff9270cc1679a8] blk_mq_sched_get_request at ffffffffb255653b
ffff9270cc1679b0: 0000000000001c01 ffff92adb34f2660
ffff9270cc1679c0: 0000000000001c31 0000000000001800
ffff9270cc1679d0: 0000000000000000 ffff9270a9d74500
ffff9270cc1679e0: ffff9270cc167a70 ffffffffb2551759
#8 [ffff9270cc1679e8] blk_mq_make_request at ffffffffb2551759
...
...
rbp = ffff9270cc167a70, so rbp - 0x50 = 0xffff9270cc167a70 - 0x50 = 0xffff9270cc167a20:
crash> struct blk_mq_alloc_data 0xffff9270cc167a20
struct blk_mq_alloc_data {
q = 0xffff92adb34f2660,
flags = 4, // BLK_MQ_REQ_INTERNAL is set, so flags & BLK_MQ_REQ_INTERNAL is nonzero
shallow_depth = 0,
ctx = 0xffff92adbf525ec0, // software queue; ctx = (per-cpu base) + request_queue.queue_ctx
hctx = 0xffff92adb2028400 // hardware queue; points to an entry of request_queue.queue_hw_ctx
}
crash>
Before allocating a request, blk_mq_sched_get_request picks the software queue for the current CPU and the hardware queue it maps to, and stores them in ctx and hctx:

struct request *blk_mq_sched_get_request(struct request_queue *q,
                                         struct bio *bio,
                                         unsigned int op,
                                         struct blk_mq_alloc_data *data)
{
    ...
    data->q = q;
    if (likely(!data->ctx))
        data->ctx = blk_mq_get_ctx(q);
    if (likely(!data->hctx))
        data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
    ...
}

static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
                                                  unsigned int cpu)
{
    return per_cpu_ptr(q->queue_ctx, cpu);
}

/*
 * This assumes per-cpu software queueing queues. They could be per-node
 * as well, for instance. For now this is hardcoded as-is. Note that we don't
 * care about preemption, since we know the ctx's are persistent. This does
 * mean that we can't rely on ctx always matching the currently running CPU.
 */
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
    return __blk_mq_get_ctx(q, get_cpu());
}
crash> struct request_queue.queue_ctx 0xffff92adb34f2660
queue_ctx = 0x25ec0
crash> kmem -o
...
CPU 60: ffff92adbf500000
...
crash> px (0xffff92adbf500000+0x25ec0)
$3 = 0xffff92adbf525ec0
crash>
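The arithmetic above is just how per-CPU data works: queue_ctx holds a per-CPU offset (0x25ec0) rather than a normal pointer, and per_cpu_ptr() adds that offset to the target CPU's per-CPU base (ffff92adbf500000 for CPU 60, as shown by kmem -o). A rough userspace analogy, with made-up area sizes and names:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

/* Stand-in per-CPU areas: in the kernel these bases are what "kmem -o" prints. */
static char percpu_area[NR_CPUS][0x40000];

/* toy per_cpu_ptr(): per-CPU base of @cpu plus the variable's offset */
static void *toy_per_cpu_ptr(uintptr_t offset, int cpu)
{
    return percpu_area[cpu] + offset;
}

int main(void)
{
    uintptr_t queue_ctx_offset = 0x25ec0;  /* the "pointer" stored in request_queue.queue_ctx */
    void *ctx_cpu3 = toy_per_cpu_ptr(queue_ctx_offset, 3);
    printf("cpu3 base=%p ctx=%p\n", (void *)percpu_area[3], ctx_cpu3);
    return 0;
}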
crash> struct request_queue.queue_hw_ctx 0xffff92adb34f2660
queue_hw_ctx = 0xffff92adb9383e00
crash> rd 0xffff92adb9383e00
ffff92adb9383e00: ffff92adb2028400 ........
crash>
queue_ctx and queue_hw_ctx are initialized along virtblk_probe->blk_mq_init_queue->blk_mq_init_allocated_queue:

struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                                                  struct request_queue *q)
{
    ...
    q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
    ...
    q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
                                   GFP_KERNEL, set->numa_node);
    ...
    blk_mq_realloc_hw_ctxs(set, q);
    ...
}
Next look at sched_tags, which is allocated and initialized along blk_mq_realloc_hw_ctxs->blk_mq_init_hctx->blk_mq_sched_init_hctx->blk_mq_sched_alloc_tags:

static int blk_mq_sched_alloc_tags(struct request_queue *q,
                                   struct blk_mq_hw_ctx *hctx,
                                   unsigned int hctx_idx)
{
    ...
    hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
                                           set->reserved_tags);
    ...
    return ret;
}
crash> struct blk_mq_hw_ctx.sched_tags 0xffff92adb2028400
sched_tags = 0xffff92adb26638c0
crash>
crash> struct blk_mq_tags.nr_reserved_tags 0xffff92adb26638c0
nr_reserved_tags = 0
crash>
crash> struct blk_mq_tags.nr_tags 0xffff92adb26638c0
nr_tags = 256 // equals the software queue depth request_queue.nr_requests; initialized via blk_mq_sched_alloc_tags->blk_mq_alloc_rq_map->blk_mq_init_tags
crash>
crash> struct blk_mq_tags.bitmap_tags 0xffff92adb26638c0
bitmap_tags = {
sb = {
depth = 256,
shift = 6,
map_nr = 4,
map = 0xffff92adba7d7200
},
alloc_hint = 0x25e90,
wake_batch = 8,
wake_index = {
counter = 5
},
ws = 0xffff92adb9383800,
round_robin = false
}
crash> struct blk_mq_hw_ctx.tags 0xffff92adb2028400
tags = 0xffff92adb2663680
crash>
crash> struct blk_mq_tags.nr_reserved_tags 0xffff92adb2663680
nr_reserved_tags = 0
crash> struct blk_mq_tags.nr_tags 0xffff92adb2663680
nr_tags = 128 // equals the hardware queue depth struct blk_mq_tag_set.queue_depth; also visible at /sys/block/vdg/mq/0/nr_tags.
// initialized along blk_mq_alloc_rq_maps->__blk_mq_alloc_rq_maps->__blk_mq_alloc_rq_map->blk_mq_alloc_rq_map->blk_mq_init_tags
crash>
crash> struct blk_mq_tags.bitmap_tags 0xffff92adb2663680
bitmap_tags = {
sb = {
depth = 128,
shift = 5,
map_nr = 4,
map = 0xffff92adba7d7000
},
alloc_hint = 0x25d8c,
wake_batch = 8,
wake_index = {
counter = 0
},
ws = 0xffff92adba2d5800,
round_robin = false
}
crash>
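The sbitmap fields above are self-consistent: with shift = 6 each word tracks 1 << 6 = 64 tags, so depth = 256 needs map_nr = 4 words; with shift = 5 each word tracks 32 tags, so depth = 128 also needs 4 words. A one-line check of that relationship, written as DIV_ROUND_UP of depth by bits per word (the way lib/sbitmap.c computes map_nr):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned int map_nr(unsigned int depth, unsigned int shift)
{
    unsigned int bits_per_word = 1U << shift;
    return DIV_ROUND_UP(depth, bits_per_word);
}

int main(void)
{
    /* sched_tags: depth 256, shift 6 -> 4 words; hctx->tags: depth 128, shift 5 -> 4 words */
    printf("sched_tags map_nr = %u\n", map_nr(256, 6));
    printf("tags       map_nr = %u\n", map_nr(128, 5));
    return 0;
}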
Combining the stack of process 61516 with the code, let's look at why process 61516 ended up blocked on I/O:
blk_mq_get_tag
__blk_mq_get_tag
__sbitmap_queue_get
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
    struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
    ...
    tag = __blk_mq_get_tag(data, bt);
    ...
    io_schedule();
    ...
}

static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
    if (data->flags & BLK_MQ_REQ_INTERNAL) // true here
        return data->hctx->sched_tags;

    return data->hctx->tags;
}

Since blk_mq_alloc_data.flags is 4 (BLK_MQ_REQ_INTERNAL), data->flags & BLK_MQ_REQ_INTERNAL is nonzero:
crash> struct blk_mq_alloc_data.flags 0xffff9270cc167a20
flags = 4
crash>
So the tags obtained in blk_mq_get_tag via struct blk_mq_tags *tags = blk_mq_tags_from_data(data) point to sched_tags:
crash> struct blk_mq_hw_ctx.sched_tags 0xffff92adb2028400
sched_tags = 0xffff92adb26638c0
crash>
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
    struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
    struct sbitmap_queue *bt;
    ...
    bt = &tags->bitmap_tags;
    tag_offset = tags->nr_reserved_tags;
    ...
    tag = __blk_mq_get_tag(data, bt);
    ...
}
crash> struct blk_mq_alloc_data.shallow_depth 0xffff9270cc167a20
shallow_depth = 0
crash>
static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt)
{
    if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
        !hctx_may_queue(data->hctx, bt))  // BLK_MQ_REQ_INTERNAL is set, so this branch is not taken
        return -1;
    if (data->shallow_depth)              // shallow_depth is 0, so not taken either
        return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
    return __sbitmap_queue_get(bt);       // this is the path taken
}

int __sbitmap_queue_get(struct sbitmap_queue *sbq)
{
    unsigned int hint, depth;
    int nr;

    hint = this_cpu_read(*sbq->alloc_hint);
    depth = READ_ONCE(sbq->sb.depth);
    if (unlikely(hint >= depth)) {
        hint = depth ? prandom_u32() % depth : 0;
        this_cpu_write(*sbq->alloc_hint, hint);
    }
    nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin);

    if (nr == -1) {                       // sbitmap_get failed to allocate a tag
        /* If the map is full, a hint won't do us much good. */
        this_cpu_write(*sbq->alloc_hint, 0);
    } else if (nr == hint || unlikely(sbq->round_robin)) {
        // round_robin = false here (read from struct sbitmap_queue.round_robin at ffff92adb26638d0)
        /* Only update the hint if we used it. */
        hint = nr + 1;
        if (hint >= depth - 1)            // hint has passed the software queue depth
            hint = 0;
        this_cpu_write(*sbq->alloc_hint, hint);
    }

    return nr;
}