Description of problem: Running g2 baseline gfs2 tests on ia64 I hit the following soft lockup. I have not seen this prior to running the .52 + abhi 005. BUG: soft lockup detected on CPU#0! Call Trace: [<a000000100013b20>] show_stack+0x40/0xa0 sp=e00000000ddc7860 bsp=e00000000ddc1758 [<a000000100013bb0>] dump_stack+0x30/0x60 sp=e00000000ddc7a30 bsp=e00000000ddc1740 [<a0000001000e5fe0>] softlockup_tick+0x240/0x280 sp=e00000000ddc7a30 bsp=e00000000ddc1700 [<a000000100092df0>] run_local_timers+0x30/0x60 sp=e00000000ddc7a40 bsp=e00000000ddc16e8 [<a000000100092ea0>] update_process_times+0x80/0x100 sp=e00000000ddc7a40 bsp=e00000000ddc16b8 [<a000000100037220>] timer_interrupt+0x180/0x360 sp=e00000000ddc7a40 bsp=e00000000ddc1678 [<a0000001000e6650>] handle_IRQ_event+0x90/0x120 sp=e00000000ddc7a40 bsp=e00000000ddc1638 [<a0000001000e6810>] __do_IRQ+0x130/0x420 sp=e00000000ddc7a40 bsp=e00000000ddc15e8 [<a000000100011630>] ia64_handle_irq+0xf0/0x1a0 sp=e00000000ddc7a40 bsp=e00000000ddc15b8 [<a00000010000c020>] __ia64_leave_kernel+0x0/0x280 sp=e00000000ddc7a40 bsp=e00000000ddc15b8 [<a000000100152920>] kfree+0x540/0x580 sp=e00000000ddc7c10 bsp=e00000000ddc1558 [<a000000200f37270>] gfs2_log_flush+0xc50/0xca0 [gfs2] sp=e00000000ddc7c20 bsp=e00000000ddc1520 [<a000000200f38010>] gfs2_log_reserve+0x1b0/0x380 [gfs2] sp=e00000000ddc7c50 bsp=e00000000ddc14c8 [<a000000200f631a0>] gfs2_do_trans_begin+0x280/0x340 [gfs2] sp=e00000000ddc7c50 bsp=e00000000ddc1478 [<a000000200f41a30>] gfs2_prepare_write+0x430/0x780 [gfs2] sp=e00000000ddc7c50 bsp=e00000000ddc1400 [<a0000001001002a0>] generic_file_buffered_write+0x3c0/0xda0 sp=e00000000ddc7c60 bsp=e00000000ddc1330 [<a000000100101370>] __generic_file_aio_write_nolock+0x6f0/0x800 sp=e00000000ddc7cf0 bsp=e00000000ddc12b8 [<a000000100101780>] __generic_file_write_nolock+0x100/0x140 sp=e00000000ddc7d30 bsp=e00000000ddc1280 [<a0000001001019a0>] generic_file_write+0x80/0x1a0 sp=e00000000ddc7e10 bsp=e00000000ddc1230 [<a000000100161200>] vfs_write+0x200/0x3a0 sp=e00000000ddc7e20 bsp=e00000000ddc11d8 [<a000000100161d50>] sys_write+0x70/0xe0 sp=e00000000ddc7e20 bsp=e00000000ddc1160 [<a00000010000bdb0>] __ia64_trace_syscall+0xd0/0x110 sp=e00000000ddc7e30 bsp=e00000000ddc1160 [<a000000000010620>] __start_ivt_text+0xffffffff00010620/0x400 sp=e00000000ddc8000 bsp=e00000000ddc1160 BUG: soft lockup detected on CPU#0! Call Trace: [<a000000100013b20>] show_stack+0x40/0xa0 sp=e00000002cfff850 bsp=e00000002cff9748 [<a000000100013bb0>] dump_stack+0x30/0x60 sp=e00000002cfffa20 bsp=e00000002cff9730 [<a0000001000e5fe0>] softlockup_tick+0x240/0x280 sp=e00000002cfffa20 bsp=e00000002cff96f0 [<a000000100092df0>] run_local_timers+0x30/0x60 sp=e00000002cfffa30 bsp=e00000002cff96d8 [<a000000100092ea0>] update_process_times+0x80/0x100 sp=e00000002cfffa30 bsp=e00000002cff96a8 [<a000000100037220>] timer_interrupt+0x180/0x360 sp=e00000002cfffa30 bsp=e00000002cff9668 [<a0000001000e6650>] handle_IRQ_event+0x90/0x120 sp=e00000002cfffa30 bsp=e00000002cff9628 [<a0000001000e6810>] __do_IRQ+0x130/0x420 sp=e00000002cfffa30 bsp=e00000002cff95d8 [<a000000100011630>] ia64_handle_irq+0xf0/0x1a0 sp=e00000002cfffa30 bsp=e00000002cff95a8 [<a00000010000c020>] __ia64_leave_kernel+0x0/0x280 sp=e00000002cfffa30 bsp=e00000002cff95a8 [<a0000001006262d0>] _spin_lock+0x30/0x60 sp=e00000002cfffc00 bsp=e00000002cff95a0 [<a000000200f3b5d0>] databuf_lo_before_commit+0x130/0x4c0 [gfs2] sp=e00000002cfffc00 bsp=e00000002cff9558 [<a000000200f36c60>] gfs2_log_flush+0x640/0xca0 [gfs2] sp=e00000002cfffc20 bsp=e00000002cff9520 [<a000000200f38010>] gfs2_log_reserve+0x1b0/0x380 [gfs2] sp=e00000002cfffc50 bsp=e00000002cff94c8 [<a000000200f631a0>] gfs2_do_trans_begin+0x280/0x340 [gfs2] sp=e00000002cfffc50 bsp=e00000002cff9478 [<a000000200f41a30>] gfs2_prepare_write+0x430/0x780 [gfs2] sp=e00000002cfffc50 bsp=e00000002cff9400 [<a0000001001002a0>] generic_file_buffered_write+0x3c0/0xda0 sp=e00000002cfffc60 bsp=e00000002cff9330 [<a000000100101370>] __generic_file_aio_write_nolock+0x6f0/0x800 sp=e00000002cfffcf0 bsp=e00000002cff92b8 [<a000000100101780>] __generic_file_write_nolock+0x100/0x140 sp=e00000002cfffd30 bsp=e00000002cff9280 [<a0000001001019a0>] generic_file_write+0x80/0x1a0 sp=e00000002cfffe10 bsp=e00000002cff9230 [<a000000100161200>] vfs_write+0x200/0x3a0 sp=e00000002cfffe20 bsp=e00000002cff91d8 [<a000000100161d50>] sys_write+0x70/0xe0 sp=e00000002cfffe20 bsp=e00000002cff9160 [<a00000010000bdb0>] __ia64_trace_syscall+0xd0/0x110 sp=e00000002cfffe30 bsp=e00000002cff9160 [<a000000000010620>] __start_ivt_text+0xffffffff00010620/0x400 sp=e00000002d000000 bsp=e00000002cff9160 BUG: soft lockup detected on CPU#0! Call Trace: [<a000000100013b20>] show_stack+0x40/0xa0 sp=e000000023ccf990 bsp=e000000023cc9718 [<a000000100013bb0>] dump_stack+0x30/0x60 sp=e000000023ccfb60 bsp=e000000023cc9700 [<a0000001000e5fe0>] softlockup_tick+0x240/0x280 sp=e000000023ccfb60 bsp=e000000023cc96c0 [<a000000100092df0>] run_local_timers+0x30/0x60 sp=e000000023ccfb70 bsp=e000000023cc96a8 [<a000000100092ea0>] update_process_times+0x80/0x100 sp=e000000023ccfb70 bsp=e000000023cc9678 [<a000000100037220>] timer_interrupt+0x180/0x360 sp=e000000023ccfb70 bsp=e000000023cc9638 [<a0000001000e6650>] handle_IRQ_event+0x90/0x120 sp=e000000023ccfb70 bsp=e000000023cc95f0 [<a0000001000e6810>] __do_IRQ+0x130/0x420 sp=e000000023ccfb70 bsp=e000000023cc95a8 [<a000000100011630>] ia64_handle_irq+0xf0/0x1a0 sp=e000000023ccfb70 bsp=e000000023cc9578 [<a00000010000c020>] __ia64_leave_kernel+0x0/0x280 sp=e000000023ccfb70 bsp=e000000023cc9578 [<a000000200f34f60>] gfs2_ail1_start_one+0x120/0x420 [gfs2] sp=e000000023ccfd40 bsp=e000000023cc9508 [<a000000200f353f0>] gfs2_ail1_start+0x190/0x220 [gfs2] sp=e000000023ccfd40 bsp=e000000023cc94c8 [<a000000200f38030>] gfs2_log_reserve+0x1d0/0x380 [gfs2] sp=e000000023ccfd40 bsp=e000000023cc9470 [<a000000200f631a0>] gfs2_do_trans_begin+0x280/0x340 [gfs2] sp=e000000023ccfd40 bsp=e000000023cc9420 [<a000000200f3f750>] gfs2_write_cache_jdata+0x2d0/0x940 [gfs2] sp=e000000023ccfd40 bsp=e000000023cc9390 [<a000000200f410b0>] gfs2_jdata_writepages+0x50/0x100 [gfs2] sp=e000000023ccfdd0 bsp=e000000023cc9358 [<a00000010010d590>] do_writepages+0xb0/0x120 sp=e000000023ccfdd0 bsp=e000000023cc9330 [<a0000001000fefd0>] __filemap_fdatawrite_range+0xb0/0xe0 sp=e000000023ccfdd0 bsp=e000000023cc92f8 [<a0000001000ff0f0>] filemap_fdatawrite_range+0x30/0x60 sp=e000000023ccfe10 bsp=e000000023cc92c8 [<a0000001000ff290>] sync_page_range+0x70/0x140 sp=e000000023ccfe10 bsp=e000000023cc9280 [<a000000100101a70>] generic_file_write+0x150/0x1a0 sp=e000000023ccfe10 bsp=e000000023cc9230 [<a000000100161200>] vfs_write+0x200/0x3a0 sp=e000000023ccfe20 bsp=e000000023cc91d8 [<a000000100161d50>] sys_write+0x70/0xe0 sp=e000000023ccfe20 bsp=e000000023cc9160 [<a00000010000bdb0>] __ia64_trace_syscall+0xd0/0x110 sp=e000000023ccfe30 bsp=e000000023cc9160 [<a000000000010620>] __start_ivt_text+0xffffffff00010620/0x400 sp=e000000023cd0000 bsp=e000000023cc9160 kernel unaligned access to 0xe0000000344f912e, ip=0xa000000100040c50 kernel unaligned access to 0xe00000002801907d, ip=0xa000000100040c50 BUG: soft lockup detected on CPU#0! Call Trace: [<a000000100013b20>] show_stack+0x40/0xa0 sp=e0000000395a78a0 bsp=e0000000395a1770 [<a000000100013bb0>] dump_stack+0x30/0x60 sp=e0000000395a7a70 bsp=e0000000395a1758 [<a0000001000e5fe0>] softlockup_tick+0x240/0x280 sp=e0000000395a7a70 bsp=e0000000395a1718 [<a000000100092df0>] run_local_timers+0x30/0x60 sp=e0000000395a7a80 bsp=e0000000395a1700 [<a000000100092ea0>] update_process_times+0x80/0x100 sp=e0000000395a7a80 bsp=e0000000395a16d0 [<a000000100037220>] timer_interrupt+0x180/0x360 sp=e0000000395a7a80 bsp=e0000000395a1690 [<a0000001000e6650>] handle_IRQ_event+0x90/0x120 sp=e0000000395a7a80 bsp=e0000000395a1650 [<a0000001000e6810>] __do_IRQ+0x130/0x420 sp=e0000000395a7a80 bsp=e0000000395a1608 [<a000000100011630>] ia64_handle_irq+0xf0/0x1a0 sp=e0000000395a7a80 bsp=e0000000395a15d0 [<a00000010000c020>] __ia64_leave_kernel+0x0/0x280 sp=e0000000395a7a80 bsp=e0000000395a15d0 [<a000000200f35220>] gfs2_ail1_start_one+0x3e0/0x420 [gfs2] sp=e0000000395a7c50 bsp=e0000000395a1560 [<a000000200f353f0>] gfs2_ail1_start+0x190/0x220 [gfs2] sp=e0000000395a7c50 bsp=e0000000395a1520 [<a000000200f38030>] gfs2_log_reserve+0x1d0/0x380 [gfs2] sp=e0000000395a7c50 bsp=e0000000395a14c8 [<a000000200f631a0>] gfs2_do_trans_begin+0x280/0x340 [gfs2] sp=e0000000395a7c50 bsp=e0000000395a1478 [<a000000200f41a30>] gfs2_prepare_write+0x430/0x780 [gfs2] sp=e0000000395a7c50 bsp=e0000000395a1400 [<a0000001001002a0>] generic_file_buffered_write+0x3c0/0xda0 sp=e0000000395a7c60 bsp=e0000000395a1330 [<a000000100101370>] __generic_file_aio_write_nolock+0x6f0/0x800 sp=e0000000395a7cf0 bsp=e0000000395a12b8 [<a000000100101780>] __generic_file_write_nolock+0x100/0x140 sp=e0000000395a7d30 bsp=e0000000395a1280 [<a0000001001019a0>] generic_file_write+0x80/0x1a0 sp=e0000000395a7e10 bsp=e0000000395a1230 [<a000000100161200>] vfs_write+0x200/0x3a0 sp=e0000000395a7e20 bsp=e0000000395a11d8 [<a000000100161d50>] sys_write+0x70/0xe0 sp=e0000000395a7e20 bsp=e0000000395a1160 [<a00000010000bdb0>] __ia64_trace_syscall+0xd0/0x110 sp=e0000000395a7e30 bsp=e0000000395a1160 [<a000000000010620>] __start_ivt_text+0xffffffff00010620/0x400 sp=e0000000395a8000 bsp=e0000000395a1160 BUG: soft lockup detected on CPU#0! Call Trace: [<a000000100013b20>] show_stack+0x40/0xa0 sp=e000000003e8f870 bsp=e000000003e89720 [<a000000100013bb0>] dump_stack+0x30/0x60 sp=e000000003e8fa40 bsp=e000000003e89708 [<a0000001000e5fe0>] softlockup_tick+0x240/0x280 sp=e000000003e8fa40 bsp=e000000003e896c8 [<a000000100092df0>] run_local_timers+0x30/0x60 sp=e000000003e8fa50 bsp=e000000003e896b0 [<a000000100092ea0>] update_process_times+0x80/0x100 sp=e000000003e8fa50 bsp=e000000003e89680 [<a000000100037220>] timer_interrupt+0x180/0x360 sp=e000000003e8fa50 bsp=e000000003e89640 [<a0000001000e6650>] handle_IRQ_event+0x90/0x120 sp=e000000003e8fa50 bsp=e000000003e89600 [<a0000001000e6810>] __do_IRQ+0x130/0x420 sp=e000000003e8fa50 bsp=e000000003e895b0 [<a000000100011630>] ia64_handle_irq+0xf0/0x1a0 sp=e000000003e8fa50 bsp=e000000003e89580 [<a00000010000c020>] __ia64_leave_kernel+0x0/0x280 sp=e000000003e8fa50 bsp=e000000003e89580 [<a000000200f39380>] glock_lo_after_commit+0x0/0x1a0 [gfs2] sp=e000000003e8fc20 bsp=e000000003e89558 [<a000000200f37120>] gfs2_log_flush+0xb00/0xca0 [gfs2] sp=e000000003e8fc20 bsp=e000000003e89520 [<a000000200f38010>] gfs2_log_reserve+0x1b0/0x380 [gfs2] sp=e000000003e8fc50 bsp=e000000003e894c8 [<a000000200f631a0>] gfs2_do_trans_begin+0x280/0x340 [gfs2] sp=e000000003e8fc50 bsp=e000000003e89478 [<a000000200f41a30>] gfs2_prepare_write+0x430/0x780 [gfs2] sp=e000000003e8fc50 bsp=e000000003e89400 [<a0000001001002a0>] generic_file_buffered_write+0x3c0/0xda0 sp=e000000003e8fc60 bsp=e000000003e89330 [<a000000100101370>] __generic_file_aio_write_nolock+0x6f0/0x800 sp=e000000003e8fcf0 bsp=e000000003e892b8 [<a000000100101780>] __generic_file_write_nolock+0x100/0x140 sp=e000000003e8fd30 bsp=e000000003e89280 [<a0000001001019a0>] generic_file_write+0x80/0x1a0 sp=e000000003e8fe10 bsp=e000000003e89230 [<a000000100161200>] vfs_write+0x200/0x3a0 sp=e000000003e8fe20 bsp=e000000003e891d8 [<a000000100161d50>] sys_write+0x70/0xe0 sp=e000000003e8fe20 bsp=e000000003e89160 [<a00000010000bdb0>] __ia64_trace_syscall+0xd0/0x110 sp=e000000003e8fe30 bsp=e000000003e89160 [<a000000000010620>] __start_ivt_text+0xffffffff00010620/0x400 sp=e000000003e90000 bsp=e000000003e89160 BUG: soft lockup detected on CPU#0! Call Trace: [<a000000100013b20>] show_stack+0x40/0xa0 sp=e00000003d077870 bsp=e00000003d071748 [<a000000100013bb0>] dump_stack+0x30/0x60 sp=e00000003d077a40 bsp=e00000003d071730 [<a0000001000e5fe0>] softlockup_tick+0x240/0x280 sp=e00000003d077a40 bsp=e00000003d0716f0 [<a000000100092df0>] run_local_timers+0x30/0x60 sp=e00000003d077a50 bsp=e00000003d0716d8 [<a000000100092ea0>] update_process_times+0x80/0x100 sp=e00000003d077a50 bsp=e00000003d0716a8 [<a000000100037220>] timer_interrupt+0x180/0x360 sp=e00000003d077a50 bsp=e00000003d071668 [<a0000001000e6650>] handle_IRQ_event+0x90/0x120 sp=e00000003d077a50 bsp=e00000003d071628 [<a0000001000e6810>] __do_IRQ+0x130/0x420 sp=e00000003d077a50 bsp=e00000003d0715d8 [<a000000100011630>] ia64_handle_irq+0xf0/0x1a0 sp=e00000003d077a50 bsp=e00000003d0715a8 [<a00000010000c020>] __ia64_leave_kernel+0x0/0x280 sp=e00000003d077a50 bsp=e00000003d0715a8 [<a000000200f39500>] glock_lo_after_commit+0x180/0x1a0 [gfs2] sp=e00000003d077c20 bsp=e00000003d071558 [<a000000200f37120>] gfs2_log_flush+0xb00/0xca0 [gfs2] sp=e00000003d077c20 bsp=e00000003d071520 [<a000000200f38010>] gfs2_log_reserve+0x1b0/0x380 [gfs2] sp=e00000003d077c50 bsp=e00000003d0714c8 [<a000000200f631a0>] gfs2_do_trans_begin+0x280/0x340 [gfs2] sp=e00000003d077c50 bsp=e00000003d071478 [<a000000200f41a30>] gfs2_prepare_write+0x430/0x780 [gfs2] sp=e00000003d077c50 bsp=e00000003d071400 [<a0000001001002a0>] generic_file_buffered_write+0x3c0/0xda0 sp=e00000003d077c60 bsp=e00000003d071330 [<a000000100101370>] __generic_file_aio_write_nolock+0x6f0/0x800 sp=e00000003d077cf0 bsp=e00000003d0712b8 [<a000000100101780>] __generic_file_write_nolock+0x100/0x140 sp=e00000003d077d30 bsp=e00000003d071280 [<a0000001001019a0>] generic_file_write+0x80/0x1a0 sp=e00000003d077e10 bsp=e00000003d071230 [<a000000100161200>] vfs_write+0x200/0x3a0 sp=e00000003d077e20 bsp=e00000003d0711d8 [<a000000100161d50>] sys_write+0x70/0xe0 sp=e00000003d077e20 bsp=e00000003d071160 [<a00000010000bdb0>] __ia64_trace_syscall+0xd0/0x110 sp=e00000003d077e30 bsp=e00000003d071160 [<a000000000010620>] __start_ivt_text+0xffffffff00010620/0x400 sp=e00000003d078000 bsp=e00000003d071160 Version-Release number of selected component (if applicable): 2.6.18-52.el5 + kmod-gfs2-1.52-gfs2abhi.005 How reproducible: Every time. Steps to Reproduce: 1. Make/mount single node gfs2 2. Run g2 Actual results: Soft lockup warnings Expected results: No such warnings Additional info:
Since you appear to have a number of these warnings, I presume that the tests are still making progress, even if its a bit slow. My thought is that if the write is reasonably large, it is probably (given a log flush or two) taking long enough that it triggers the warning, even though its probably doing the right thing. We may well just need to add a suitable cond_reshed() to get rid of the warnings in that case. The slightly more long winded way to deal with the problem is via bugzilla #329321. I assume that the reason that this isn't seen for non-jdata files is that the volume of data for the log is much smaller, so that it doesn't reach a size where we stall for so long during log flushes.
Created attachment 245441 [details] Potential fix I wonder if this bug might potentially be fixed by the attached patch. It might possibly have some bearing on it anyway, so its worth a go.
I hit this again with abhi's kmod-gfs2-1.53-4.1: GFS2 Overlay (built Nov 16 2007 12:45:12) installed Red Hat Enterprise Linux Server release 5.1 (Tikanga) Kernel 2.6.18-53.el5 on an ia64 a1 login: BUG: soft lockup detected on CPU#0! Call Trace: [<a000000100013b20>] show_stack+0x40/0xa0 sp=e000004047e2f8a0 bsp=e000004047e29770 [<a000000100013bb0>] dump_stack+0x30/0x60 sp=e000004047e2fa70 bsp=e000004047e29758 [<a0000001000e5fe0>] softlockup_tick+0x240/0x280 sp=e000004047e2fa70 bsp=e000004047e29718 [<a000000100092df0>] run_local_timers+0x30/0x60 sp=e000004047e2fa80 bsp=e000004047e29700 [<a000000100092ea0>] update_process_times+0x80/0x100 sp=e000004047e2fa80 bsp=e000004047e296d0 [<a000000100037220>] timer_interrupt+0x180/0x360 sp=e000004047e2fa80 bsp=e000004047e29690 [<a0000001000e6650>] handle_IRQ_event+0x90/0x120 sp=e000004047e2fa80 bsp=e000004047e29650 [<a0000001000e6810>] __do_IRQ+0x130/0x420 sp=e000004047e2fa80 bsp=e000004047e29608 [<a000000100011630>] ia64_handle_irq+0xf0/0x1a0 sp=e000004047e2fa80 bsp=e000004047e295d0 [<a00000010000c020>] __ia64_leave_kernel+0x0/0x280 sp=e000004047e2fa80 bsp=e000004047e295d0 [<a000000202a88ef0>] gfs2_ail1_start_one+0xb0/0x420 [gfs2] sp=e000004047e2fc50 bsp=e000004047e29560 [<a000000202a893f0>] gfs2_ail1_start+0x190/0x220 [gfs2] sp=e000004047e2fc50 bsp=e000004047e29520 [<a000000202a8c050>] gfs2_log_reserve+0x1d0/0x380 [gfs2] sp=e000004047e2fc50 bsp=e000004047e294c8 [<a000000202ab7140>] gfs2_do_trans_begin+0x280/0x340 [gfs2] sp=e000004047e2fc50 bsp=e000004047e29478 [<a000000202a95a30>] gfs2_prepare_write+0x430/0x700 [gfs2] sp=e000004047e2fc50 bsp=e000004047e29400 [<a0000001001002a0>] generic_file_buffered_write+0x3c0/0xda0 sp=e000004047e2fc60 bsp=e000004047e29330 [<a000000100101370>] __generic_file_aio_write_nolock+0x6f0/0x800 sp=e000004047e2fcf0 bsp=e000004047e292b8 [<a000000100101780>] __generic_file_write_nolock+0x100/0x140 sp=e000004047e2fd30 bsp=e000004047e29280 [<a0000001001019a0>] generic_file_write+0x80/0x1a0 sp=e000004047e2fe10 bsp=e000004047e29230 [<a000000100161200>] vfs_write+0x200/0x3a0 sp=e000004047e2fe20 bsp=e000004047e291d8 [<a000000100161d50>] sys_write+0x70/0xe0 sp=e000004047e2fe20 bsp=e000004047e29160 [<a00000010000bdb0>] __ia64_trace_syscall+0xd0/0x110 sp=e000004047e2fe30 bsp=e000004047e29160 [<a000000000010620>] __start_ivt_text+0xffffffff00010620/0x400 sp=e000004047e30000 bsp=e000004047e29160 BUG: soft lockup detected on CPU#0! Call Trace: [<a000000100013b20>] show_stack+0x40/0xa0 sp=e000004050d7f8a0 bsp=e000004050d79770 [<a000000100013bb0>] dump_stack+0x30/0x60 sp=e000004050d7fa70 bsp=e000004050d79758 [<a0000001000e5fe0>] softlockup_tick+0x240/0x280 sp=e000004050d7fa70 bsp=e000004050d79718 [<a000000100092df0>] run_local_timers+0x30/0x60 sp=e000004050d7fa80 bsp=e000004050d79700 [<a000000100092ea0>] update_process_times+0x80/0x100 sp=e000004050d7fa80 bsp=e000004050d796d0 [<a000000100037220>] timer_interrupt+0x180/0x360 sp=e000004050d7fa80 bsp=e000004050d79690 [<a0000001000e6650>] handle_IRQ_event+0x90/0x120 sp=e000004050d7fa80 bsp=e000004050d79650 [<a0000001000e6810>] __do_IRQ+0x130/0x420 sp=e000004050d7fa80 bsp=e000004050d79608 [<a000000100011630>] ia64_handle_irq+0xf0/0x1a0 sp=e000004050d7fa80 bsp=e000004050d795d0 [<a00000010000c020>] __ia64_leave_kernel+0x0/0x280 sp=e000004050d7fa80 bsp=e000004050d795d0 [<a000000202a88ee0>] gfs2_ail1_start_one+0xa0/0x420 [gfs2] sp=e000004050d7fc50 bsp=e000004050d79560 [<a000000202a893f0>] gfs2_ail1_start+0x190/0x220 [gfs2] sp=e000004050d7fc50 bsp=e000004050d79520 [<a000000202a8c050>] gfs2_log_reserve+0x1d0/0x380 [gfs2] sp=e000004050d7fc50 bsp=e000004050d794c8 [<a000000202ab7140>] gfs2_do_trans_begin+0x280/0x340 [gfs2] sp=e000004050d7fc50 bsp=e000004050d79478 [<a000000202a95a30>] gfs2_prepare_write+0x430/0x700 [gfs2] sp=e000004050d7fc50 bsp=e000004050d79400 [<a0000001001002a0>] generic_file_buffered_write+0x3c0/0xda0 sp=e000004050d7fc60 bsp=e000004050d79330 [<a000000100101370>] __generic_file_aio_write_nolock+0x6f0/0x800 sp=e000004050d7fcf0 bsp=e000004050d792b8 [<a000000100101780>] __generic_file_write_nolock+0x100/0x140 sp=e000004050d7fd30 bsp=e000004050d79280 [<a0000001001019a0>] generic_file_write+0x80/0x1a0 sp=e000004050d7fe10 bsp=e000004050d79230 [<a000000100161200>] vfs_write+0x200/0x3a0 sp=e000004050d7fe20 bsp=e000004050d791d8 [<a000000100161d50>] sys_write+0x70/0xe0 sp=e000004050d7fe20 bsp=e000004050d79160 [<a00000010000bdb0>] __ia64_trace_syscall+0xd0/0x110 sp=e000004050d7fe30 bsp=e000004050d79160 [<a000000000010620>] __start_ivt_text+0xffffffff00010620/0x400 sp=e000004050d80000 bsp=e000004050d79160 BUG: soft lockup detected on CPU#0! Call Trace: [<a000000100013b20>] show_stack+0x40/0xa0 sp=e0000040ba4678a0 bsp=e0000040ba461770 [<a000000100013bb0>] dump_stack+0x30/0x60 sp=e0000040ba467a70 bsp=e0000040ba461758 [<a0000001000e5fe0>] softlockup_tick+0x240/0x280 sp=e0000040ba467a70 bsp=e0000040ba461718 [<a000000100092df0>] run_local_timers+0x30/0x60 sp=e0000040ba467a80 bsp=e0000040ba461700 [<a000000100092ea0>] update_process_times+0x80/0x100 sp=e0000040ba467a80 bsp=e0000040ba4616d0 [<a000000100037220>] timer_interrupt+0x180/0x360 sp=e0000040ba467a80 bsp=e0000040ba461690 [<a0000001000e6650>] handle_IRQ_event+0x90/0x120 sp=e0000040ba467a80 bsp=e0000040ba461650 [<a0000001000e6810>] __do_IRQ+0x130/0x420 sp=e0000040ba467a80 bsp=e0000040ba461608 [<a000000100011630>] ia64_handle_irq+0xf0/0x1a0 sp=e0000040ba467a80 bsp=e0000040ba4615d0 [<a00000010000c020>] __ia64_leave_kernel+0x0/0x280 sp=e0000040ba467a80 bsp=e0000040ba4615d0 [<a000000202a88ea0>] gfs2_ail1_start_one+0x60/0x420 [gfs2] sp=e0000040ba467c50 bsp=e0000040ba461560 [<a000000202a89320>] gfs2_ail1_start+0xc0/0x220 [gfs2] sp=e0000040ba467c50 bsp=e0000040ba461520 [<a000000202a8c050>] gfs2_log_reserve+0x1d0/0x380 [gfs2] sp=e0000040ba467c50 bsp=e0000040ba4614c8 [<a000000202ab7140>] gfs2_do_trans_begin+0x280/0x340 [gfs2] sp=e0000040ba467c50 bsp=e0000040ba461478 [<a000000202a95a30>] gfs2_prepare_write+0x430/0x700 [gfs2] sp=e0000040ba467c50 bsp=e0000040ba461400 [<a0000001001002a0>] generic_file_buffered_write+0x3c0/0xda0 sp=e0000040ba467c60 bsp=e0000040ba461330 [<a000000100101370>] __generic_file_aio_write_nolock+0x6f0/0x800 sp=e0000040ba467cf0 bsp=e0000040ba4612b8 [<a000000100101780>] __generic_file_write_nolock+0x100/0x140 sp=e0000040ba467d30 bsp=e0000040ba461280 [<a0000001001019a0>] generic_file_write+0x80/0x1a0 sp=e0000040ba467e10 bsp=e0000040ba461230 [<a000000100161200>] vfs_write+0x200/0x3a0 sp=e0000040ba467e20 bsp=e0000040ba4611d8 [<a000000100161d50>] sys_write+0x70/0xe0 sp=e0000040ba467e20 bsp=e0000040ba461160 [<a00000010000bdb0>] __ia64_trace_syscall+0xd0/0x110 sp=e0000040ba467e30 bsp=e0000040ba461160 [<a000000000010620>] __start_ivt_text+0xffffffff00010620/0x400 sp=e0000040ba468000 bsp=e0000040ba461160 BUG: soft lockup detected on CPU#0! Call Trace: [<a000000100013b20>] show_stack+0x40/0xa0 sp=e0000040932ef990 bsp=e0000040932e9718 [<a000000100013bb0>] dump_stack+0x30/0x60 sp=e0000040932efb60 bsp=e0000040932e9700 [<a0000001000e5fe0>] softlockup_tick+0x240/0x280 sp=e0000040932efb60 bsp=e0000040932e96c0 [<a000000100092df0>] run_local_timers+0x30/0x60 sp=e0000040932efb70 bsp=e0000040932e96a8 [<a000000100092ea0>] update_process_times+0x80/0x100 sp=e0000040932efb70 bsp=e0000040932e9678 [<a000000100037220>] timer_interrupt+0x180/0x360 sp=e0000040932efb70 bsp=e0000040932e9638 [<a0000001000e6650>] handle_IRQ_event+0x90/0x120 sp=e0000040932efb70 bsp=e0000040932e95f0 [<a0000001000e6810>] __do_IRQ+0x130/0x420 sp=e0000040932efb70 bsp=e0000040932e95a8 [<a000000100011630>] ia64_handle_irq+0xf0/0x1a0 sp=e0000040932efb70 bsp=e0000040932e9578 [<a00000010000c020>] __ia64_leave_kernel+0x0/0x280 sp=e0000040932efb70 bsp=e0000040932e9578 [<a000000202a88f60>] gfs2_ail1_start_one+0x120/0x420 [gfs2] sp=e0000040932efd40 bsp=e0000040932e9508 [<a000000202a893f0>] gfs2_ail1_start+0x190/0x220 [gfs2] sp=e0000040932efd40 bsp=e0000040932e94c8 [<a000000202a8c050>] gfs2_log_reserve+0x1d0/0x380 [gfs2] sp=e0000040932efd40 bsp=e0000040932e9470 [<a000000202ab7140>] gfs2_do_trans_begin+0x280/0x340 [gfs2] sp=e0000040932efd40 bsp=e0000040932e9420 [<a000000202a93760>] gfs2_write_cache_jdata+0x2c0/0x920 [gfs2] sp=e0000040932efd40 bsp=e0000040932e9390 [<a000000202a950b0>] gfs2_jdata_writepages+0x50/0x100 [gfs2] sp=e0000040932efdd0 bsp=e0000040932e9358 [<a00000010010d590>] do_writepages+0xb0/0x120 sp=e0000040932efdd0 bsp=e0000040932e9330 [<a0000001000fefd0>] __filemap_fdatawrite_range+0xb0/0xe0 sp=e0000040932efdd0 bsp=e0000040932e92f8 [<a0000001000ff0f0>] filemap_fdatawrite_range+0x30/0x60 sp=e0000040932efe10 bsp=e0000040932e92c8 [<a0000001000ff290>] sync_page_range+0x70/0x140 sp=e0000040932efe10 bsp=e0000040932e9280 [<a000000100101a70>] generic_file_write+0x150/0x1a0 sp=e0000040932efe10 bsp=e0000040932e9230 [<a000000100161200>] vfs_write+0x200/0x3a0 sp=e0000040932efe20 bsp=e0000040932e91d8 [<a000000100161d50>] sys_write+0x70/0xe0 sp=e0000040932efe20 bsp=e0000040932e9160 [<a00000010000bdb0>] __ia64_trace_syscall+0xd0/0x110 sp=e0000040932efe30 bsp=e0000040932e9160 [<a000000000010620>] __start_ivt_text+0xffffffff00010620/0x400 sp=e0000040932f0000 bsp=e0000040932e9160 BUG: soft lockup detected on CPU#0! Call Trace: [<a000000100013b20>] show_stack+0x40/0xa0 sp=e0000001f6f2f8a0 bsp=e0000001f6f29770 [<a000000100013bb0>] dump_stack+0x30/0x60 sp=e0000001f6f2fa70 bsp=e0000001f6f29758 [<a0000001000e5fe0>] softlockup_tick+0x240/0x280 sp=e0000001f6f2fa70 bsp=e0000001f6f29718 [<a000000100092df0>] run_local_timers+0x30/0x60 sp=e0000001f6f2fa80 bsp=e0000001f6f29700 [<a000000100092ea0>] update_process_times+0x80/0x100 sp=e0000001f6f2fa80 bsp=e0000001f6f296d0 [<a000000100037220>] timer_interrupt+0x180/0x360 sp=e0000001f6f2fa80 bsp=e0000001f6f29690 [<a0000001000e6650>] handle_IRQ_event+0x90/0x120 sp=e0000001f6f2fa80 bsp=e0000001f6f29650 [<a0000001000e6810>] __do_IRQ+0x130/0x420 sp=e0000001f6f2fa80 bsp=e0000001f6f29608 [<a000000100011630>] ia64_handle_irq+0xf0/0x1a0 sp=e0000001f6f2fa80 bsp=e0000001f6f295d0 [<a00000010000c020>] __ia64_leave_kernel+0x0/0x280 sp=e0000001f6f2fa80 bsp=e0000001f6f295d0 [<a000000202a88f60>] gfs2_ail1_start_one+0x120/0x420 [gfs2] sp=e0000001f6f2fc50 bsp=e0000001f6f29560 [<a000000202a893f0>] gfs2_ail1_start+0x190/0x220 [gfs2] sp=e0000001f6f2fc50 bsp=e0000001f6f29520 [<a000000202a8c050>] gfs2_log_reserve+0x1d0/0x380 [gfs2] sp=e0000001f6f2fc50 bsp=e0000001f6f294c8 [<a000000202ab7140>] gfs2_do_trans_begin+0x280/0x340 [gfs2] sp=e0000001f6f2fc50 bsp=e0000001f6f29478 [<a000000202a95a30>] gfs2_prepare_write+0x430/0x700 [gfs2] sp=e0000001f6f2fc50 bsp=e0000001f6f29400 [<a0000001001002a0>] generic_file_buffered_write+0x3c0/0xda0 sp=e0000001f6f2fc60 bsp=e0000001f6f29330 [<a000000100101370>] __generic_file_aio_write_nolock+0x6f0/0x800 sp=e0000001f6f2fcf0 bsp=e0000001f6f292b8 [<a000000100101780>] __generic_file_write_nolock+0x100/0x140 sp=e0000001f6f2fd30 bsp=e0000001f6f29280 [<a0000001001019a0>] generic_file_write+0x80/0x1a0 sp=e0000001f6f2fe10 bsp=e0000001f6f29230 [<a000000100161200>] vfs_write+0x200/0x3a0 sp=e0000001f6f2fe20 bsp=e0000001f6f291d8 [<a000000100161d50>] sys_write+0x70/0xe0 sp=e0000001f6f2fe20 bsp=e0000001f6f29160 [<a00000010000bdb0>] __ia64_trace_syscall+0xd0/0x110 sp=e0000001f6f2fe30 bsp=e0000001f6f29160 [<a000000000010620>] __start_ivt_text+0xffffffff00010620/0x400 sp=e0000001f6f30000 bsp=e0000001f6f29160
Created attachment 290856 [details] simple fix I haven't quite looked into this just yet, but I'm pretty sure its safe to drop the sd_log_lock during this loop and re-acquire it when we are rescheduled. This should stop the softlockups from happening when using jdata.
We already drop the log lock at this point in time. If we need to add a cond_resched() here thats ok, but there is no need for the rest of that patch. In fact I suspect that with spinlock debugging on, it will flag up as a bug in its current form.
Hit this on x86 w/ kernel-2.6.18-53.el5 and kmod-gfs2-1.62-1.4 installed. I was running d_io with flocks and tank-04 was running accordion_quick and lock2. BUG: soft lockup detected on CPU#0! [<c044d21c>] softlockup_tick+0x96/0xa4 [<c042ddb0>] update_process_times+0x39/0x5c [<c04196fb>] smp_apic_timer_interrupt+0x5b/0x6c [<c04059bf>] apic_timer_interrupt+0x1f/0x24 [<c042007b>] try_to_wake_up+0x145/0x37b [<c0605810>] _spin_lock+0x7/0xf [<f8d4f3a7>] gfs2_ail1_empty+0x13/0x7d [gfs2] [<f8d44539>] gfs2_logd+0x44/0x13f [gfs2] [<f8d444f5>] gfs2_logd+0x0/0x13f [gfs2] [<c0435f65>] kthread+0xc0/0xeb [<c0435ea5>] kthread+0x0/0xeb [<c0405c3b>] kernel_thread_helper+0x7/0x10 ======================= BUG: soft lockup detected on CPU#1! [<c044d21c>] softlockup_tick+0x96/0xa4 [<c042ddb0>] update_process_times+0x39/0x5c [<c04196fb>] smp_apic_timer_interrupt+0x5b/0x6c [<c04059bf>] apic_timer_interrupt+0x1f/0x24 [<c0605810>] _spin_lock+0x7/0xf [<f8d4f584>] gfs2_ail1_start_one+0x12b/0x15b [gfs2] [<f8d4f61c>] gfs2_ail1_start+0x68/0xe6 [gfs2] [<f8d504fe>] gfs2_log_reserve+0xc6/0x171 [gfs2] [<f8d5f710>] gfs2_do_trans_begin+0xe3/0x119 [gfs2] [<f8d53727>] gfs2_prepare_write+0x1c2/0x2d2 [gfs2] [<c0455c0a>] generic_file_buffered_write+0x23f/0x5f1 [<c042a1f1>] current_fs_time+0x4a/0x55 [<c0456462>] __generic_file_aio_write_nolock+0x4a6/0x52a [<f8ab510b>] gdlm_ast+0x0/0x2 [lock_dlm] [<c0456616>] __generic_file_write_nolock+0x86/0x9a [<c042da86>] lock_timer_base+0x15/0x2f [<c0436031>] autoremove_wake_function+0x0/0x2d [<c04e5231>] copy_to_user+0x31/0x48 [<c0604a93>] mutex_lock+0xb/0x19 [<c0456661>] generic_file_writev+0x37/0x96 [<c045662a>] generic_file_writev+0x0/0x96 [<c04566c0>] generic_file_write+0x0/0x94 [<c047001f>] do_readv_writev+0x166/0x277 [<f8d4c0dd>] gfs2_holder_uninit+0xb/0x1b [gfs2] [<f8d54f05>] gfs2_llseek+0x6f/0x92 [gfs2] [<c044abd9>] audit_syscall_entry+0x11c/0x14e [<c0470167>] vfs_writev+0x37/0x43 [<c04705ab>] sys_writev+0x3c/0x96 [<c0404eff>] syscall_call+0x7/0xb ======================= BUG: soft lockup detected on CPU#0! [<c044d21c>] softlockup_tick+0x96/0xa4 [<c042ddb0>] update_process_times+0x39/0x5c [<c04196fb>] smp_apic_timer_interrupt+0x5b/0x6c [<c04059bf>] apic_timer_interrupt+0x1f/0x24 [<c042007b>] try_to_wake_up+0x145/0x37b [<c0605813>] _spin_lock+0xa/0xf [<f8d4f3a7>] gfs2_ail1_empty+0x13/0x7d [gfs2] [<f8d44539>] gfs2_logd+0x44/0x13f [gfs2] [<f8d444f5>] gfs2_logd+0x0/0x13f [gfs2] [<c0435f65>] kthread+0xc0/0xeb [<c0435ea5>] kthread+0x0/0xeb [<c0405c3b>] kernel_thread_helper+0x7/0x10 ======================= BUG: soft lockup detected on CPU#1! [<c044d21c>] softlockup_tick+0x96/0xa4 [<c042ddb0>] update_process_times+0x39/0x5c [<c04196fb>] smp_apic_timer_interrupt+0x5b/0x6c [<c04059bf>] apic_timer_interrupt+0x1f/0x24 [<c0605810>] _spin_lock+0x7/0xf [<f8d4f584>] gfs2_ail1_start_one+0x12b/0x15b [gfs2] [<f8d4f61c>] gfs2_ail1_start+0x68/0xe6 [gfs2] [<f8d504fe>] gfs2_log_reserve+0xc6/0x171 [gfs2] [<f8d5f710>] gfs2_do_trans_begin+0xe3/0x119 [gfs2] [<f8d53727>] gfs2_prepare_write+0x1c2/0x2d2 [gfs2] [<c0455c0a>] generic_file_buffered_write+0x23f/0x5f1 [<c042a1f1>] current_fs_time+0x4a/0x55 [<c0456462>] __generic_file_aio_write_nolock+0x4a6/0x52a [<f8ab510b>] gdlm_ast+0x0/0x2 [lock_dlm] [<c0456616>] __generic_file_write_nolock+0x86/0x9a [<c042da86>] lock_timer_base+0x15/0x2f [<c0436031>] autoremove_wake_function+0x0/0x2d [<c04e5231>] copy_to_user+0x31/0x48 [<c0604a93>] mutex_lock+0xb/0x19 [<c0456661>] generic_file_writev+0x37/0x96 [<c045662a>] generic_file_writev+0x0/0x96 [<c04566c0>] generic_file_write+0x0/0x94 [<c047001f>] do_readv_writev+0x166/0x277 [<f8d4c0dd>] gfs2_holder_uninit+0xb/0x1b [gfs2] [<f8d54f05>] gfs2_llseek+0x6f/0x92 [gfs2] [<c044abd9>] audit_syscall_entry+0x11c/0x14e [<c0470167>] vfs_writev+0x37/0x43 [<c04705ab>] sys_writev+0x3c/0x96 [<c0404eff>] syscall_call+0x7/0xb ======================= BUG: soft lockup detected on CPU#0! [<c044d21c>] softlockup_tick+0x96/0xa4 [<c042ddb0>] update_process_times+0x39/0x5c [<c04196fb>] smp_apic_timer_interrupt+0x5b/0x6c [<c04059bf>] apic_timer_interrupt+0x1f/0x24 [<c042007b>] try_to_wake_up+0x145/0x37b [<c0605813>] _spin_lock+0xa/0xf [<f8d4f3a7>] gfs2_ail1_empty+0x13/0x7d [gfs2] [<f8d44539>] gfs2_logd+0x44/0x13f [gfs2] [<f8d444f5>] gfs2_logd+0x0/0x13f [gfs2] [<c0435f65>] kthread+0xc0/0xeb [<c0435ea5>] kthread+0x0/0xeb [<c0405c3b>] kernel_thread_helper+0x7/0x10 ======================= BUG: soft lockup detected on CPU#1! [<c044d21c>] softlockup_tick+0x96/0xa4 [<c042ddb0>] update_process_times+0x39/0x5c [<c04196fb>] smp_apic_timer_interrupt+0x5b/0x6c [<c04059bf>] apic_timer_interrupt+0x1f/0x24 [<c0605813>] _spin_lock+0xa/0xf [<f8d4f584>] gfs2_ail1_start_one+0x12b/0x15b [gfs2] [<f8d4f61c>] gfs2_ail1_start+0x68/0xe6 [gfs2] [<f8d504fe>] gfs2_log_reserve+0xc6/0x171 [gfs2] [<f8d5f710>] gfs2_do_trans_begin+0xe3/0x119 [gfs2] [<f8d53727>] gfs2_prepare_write+0x1c2/0x2d2 [gfs2] [<c0455c0a>] generic_file_buffered_write+0x23f/0x5f1 [<c042a1f1>] current_fs_time+0x4a/0x55 [<c0456462>] __generic_file_aio_write_nolock+0x4a6/0x52a [<f8ab510b>] gdlm_ast+0x0/0x2 [lock_dlm] [<c0456616>] __generic_file_write_nolock+0x86/0x9a [<c042da86>] lock_timer_base+0x15/0x2f [<c0436031>] autoremove_wake_function+0x0/0x2d [<c04e5231>] copy_to_user+0x31/0x48 [<c0604a93>] mutex_lock+0xb/0x19 [<c0456661>] generic_file_writev+0x37/0x96 [<c045662a>] generic_file_writev+0x0/0x96 [<c04566c0>] generic_file_write+0x0/0x94 [<c047001f>] do_readv_writev+0x166/0x277 [<f8d4c0dd>] gfs2_holder_uninit+0xb/0x1b [gfs2] [<f8d54f05>] gfs2_llseek+0x6f/0x92 [gfs2] [<c044abd9>] audit_syscall_entry+0x11c/0x14e [<c0470167>] vfs_writev+0x37/0x43 [<c04705ab>] sys_writev+0x3c/0x96 [<c0404eff>] syscall_call+0x7/0xb [<c04566c0>] generic_file_write+0x0/0x94 [<c047001f>] do_readv_writev+0x166/0x277 [<f8d4c0dd>] gfs2_holder_uninit+0xb/0x1b [gfs2] [<f8d54f05>] gfs2_llseek+0x6f/0x92 [gfs2] [<c044abd9>] audit_syscall_entry+0x11c/0x14e [<c0470167>] vfs_writev+0x37/0x43 [<c04705ab>] sys_writev+0x3c/0x96 [<c0404eff>] syscall_call+0x7/0xb
I have been running Dean's g2 tests on IA64 and Nate's tests but haven't hit this lockup yet. I'm moving this to NEEDINFO. If anybody sees this again, please post your test scenario.