zfsproxmox

ZFS stalled, task txg_sync blocked for more than 120 seconds


A scrub was running, which got stuck, I restarted the server to attempt to resolve the issue, however this caused ZFS to stall instantly (zpool, and zfs) commands just run indefinitely with the following errors in dmesg

[  363.672135] INFO: task zpool:1784 blocked for more than 120 seconds.
[  363.672201]       Tainted: P           O      5.4.203-1-pve #1
[  363.672264] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  363.672329] zpool           D    0  1784   1417 0x80004006
[  363.672330] Call Trace:
[  363.672335]  __schedule+0x2e6/0x6f0
[  363.672338]  ? try_to_wake_up+0x227/0x650
[  363.672340]  schedule+0x33/0xa0
[  363.672341]  schedule_preempt_disabled+0xe/0x10
[  363.672342]  __mutex_lock.isra.12+0x297/0x490
[  363.672343]  __mutex_lock_slowpath+0x13/0x20
[  363.672344]  mutex_lock+0x2c/0x30
[  363.672390]  spa_open_common+0x62/0x4d0 [zfs]
[  363.672418]  spa_open+0x13/0x20 [zfs]
[  363.672444]  dsl_pool_hold+0x33/0x80 [zfs]
[  363.672467]  dmu_objset_hold_flags+0x3b/0xd0 [zfs]
[  363.672470]  ? spl_kmem_free+0x33/0x40 [spl]
[  363.672493]  dmu_objset_hold+0x16/0x20 [zfs]
[  363.672518]  dsl_prop_get+0x44/0xb0 [zfs]
[  363.672544]  dsl_prop_get_integer+0x1e/0x20 [zfs]
[  363.672573]  zvol_create_minors_cb+0x3b/0x120 [zfs]
[  363.672595]  dmu_objset_find_impl+0x10d/0x3f0 [zfs]
[  363.672625]  ? zvol_add_clones+0x270/0x270 [zfs]
[  363.672647]  dmu_objset_find_impl+0x1d2/0x3f0 [zfs]
[  363.672676]  ? zvol_add_clones+0x270/0x270 [zfs]
[  363.672699]  dmu_objset_find_impl+0x1d2/0x3f0 [zfs]
[  363.672728]  ? zvol_add_clones+0x270/0x270 [zfs]
[  363.672751]  dmu_objset_find_impl+0x1d2/0x3f0 [zfs]
[  363.672779]  ? zvol_add_clones+0x270/0x270 [zfs]
[  363.672808]  ? zvol_add_clones+0x270/0x270 [zfs]
[  363.672831]  dmu_objset_find+0x58/0x90 [zfs]
[  363.672860]  zvol_create_minors_recursive+0x186/0x1c0 [zfs]
[  363.672861]  ? __mutex_unlock_slowpath.isra.20+0xc7/0x120
[  363.672889]  spa_import+0x507/0x810 [zfs]
[  363.672892]  ? nvpair_value_common.part.13+0x14d/0x170 [znvpair]
[  363.672921]  zfs_ioc_pool_import+0x12d/0x150 [zfs]
[  363.672950]  zfsdev_ioctl_common+0x5b2/0x820 [zfs]
[  363.672952]  ? __kmalloc_node+0x267/0x330
[  363.672953]  ? lru_cache_add_active_or_unevictable+0x39/0xb0
[  363.672981]  zfsdev_ioctl+0x54/0xe0 [zfs]
[  363.672983]  do_vfs_ioctl+0xa9/0x640
[  363.672984]  ? handle_mm_fault+0xc9/0x1f0
[  363.672985]  ksys_ioctl+0x67/0x90
[  363.672986]  __x64_sys_ioctl+0x1a/0x20
[  363.672988]  do_syscall_64+0x57/0x190
[  363.672989]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  363.672990] RIP: 0033:0x7fedcd739fc7
[  363.672993] Code: Bad RIP value.
[  363.672993] RSP: 002b:00007ffee2d843c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[  363.672995] RAX: ffffffffffffffda RBX: 00007ffee2d84440 RCX: 00007fedcd739fc7
[  363.672995] RDX: 00007ffee2d84440 RSI: 0000000000005a02 RDI: 0000000000000003
[  363.672996] RBP: 00007ffee2d88330 R08: 0000564e085427b0 R09: 00007fedcd805440
[  363.672996] R10: 0000564e0851c010 R11: 0000000000000246 R12: 0000564e0851c2e0
[  363.672997] R13: 0000564e08529de0 R14: 0000000000000000 R15: 0000000000000000
[  363.673001] INFO: task dp_sync_taskq:1968 blocked for more than 120 seconds.
[  363.673066]       Tainted: P           O      5.4.203-1-pve #1
[  363.673128] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  363.673205] dp_sync_taskq   D    0  1968      2 0x80004000
[  363.673206] Call Trace:
[  363.673207]  __schedule+0x2e6/0x6f0
[  363.673208]  schedule+0x33/0xa0
[  363.673210]  cv_wait_common+0x104/0x130 [spl]
[  363.673212]  ? __wake_up_pollfree+0x40/0x40
[  363.673214]  __cv_wait+0x15/0x20 [spl]
[  363.673240]  dsl_scan_prefetch_thread+0xad/0x290 [zfs]
[  363.673243]  taskq_thread+0x2f7/0x4e0 [spl]
[  363.673245]  ? wake_up_q+0x80/0x80
[  363.673247]  kthread+0x120/0x140
[  363.673249]  ? task_done+0xb0/0xb0 [spl]
[  363.673250]  ? kthread_park+0x90/0x90
[  363.673251]  ret_from_fork+0x35/0x40
[  363.673256] INFO: task txg_sync:2138 blocked for more than 120 seconds.
[  363.673325]       Tainted: P           O      5.4.203-1-pve #1
[  363.673393] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  363.673471] txg_sync        D    0  2138      2 0x80004000
[  363.673471] Call Trace:
[  363.673473]  __schedule+0x2e6/0x6f0
[  363.673474]  schedule+0x33/0xa0
[  363.673477]  vcmn_err.cold.1+0x92/0x94 [spl]
[  363.673499]  ? zfs_btree_insert_into_leaf+0x1c4/0x230 [zfs]
[  363.673520]  ? zfs_btree_add_idx+0xac/0x210 [zfs]
[  363.673540]  ? zfs_btree_find+0x160/0x2d0 [zfs]
[  363.673568]  zfs_panic_recover+0x6f/0x90 [zfs]
[  363.673596]  range_tree_adjust_fill+0x19c/0x5e0 [zfs]
[  363.673623]  range_tree_add_impl+0x31a/0x1030 [zfs]
[  363.673624]  ? _cond_resched+0x19/0x30
[  363.673626]  ? spl_kmem_cache_alloc+0x7c/0x770 [spl]
[  363.673653]  range_tree_add+0x11/0x20 [zfs]
[  363.673679]  scan_io_queue_insert_impl+0xd7/0xe0 [zfs]
[  363.673704]  dsl_scan_scrub_cb+0x430/0x770 [zfs]
[  363.673707]  ? spl_kmem_alloc+0xdc/0x130 [spl]
[  363.673732]  dsl_scan_visitbp+0x68d/0xcd0 [zfs]
[  363.673752]  ? arc_hdr_set_compress+0x50/0x50 [zfs]
[  363.673778]  dsl_scan_visitbp+0x2ac/0xcd0 [zfs]
[  363.673804]  dsl_scan_visitbp+0x5b6/0xcd0 [zfs]
[  363.673829]  dsl_scan_visitbp+0x2ac/0xcd0 [zfs]
[  363.673855]  dsl_scan_visitbp+0x2ac/0xcd0 [zfs]
[  363.673881]  dsl_scan_visitbp+0x2ac/0xcd0 [zfs]
[  363.673907]  dsl_scan_visitbp+0x2ac/0xcd0 [zfs]
[  363.673933]  dsl_scan_visitbp+0x2ac/0xcd0 [zfs]
[  363.673959]  dsl_scan_visitbp+0x826/0xcd0 [zfs]
[  363.673985]  dsl_scan_visit_rootbp+0xe8/0x150 [zfs]
[  363.674011]  dsl_scan_visitds+0x1ae/0x510 [zfs]
[  363.674012]  ? _cond_resched+0x19/0x30
[  363.674013]  ? __kmalloc_node+0x1e0/0x330
[  363.674016]  ? spl_kmem_alloc+0xdc/0x130 [spl]
[  363.674018]  ? spl_kmem_alloc+0xdc/0x130 [spl]
[  363.674020]  ? tsd_hash_search.isra.5+0x47/0xa0 [spl]
[  363.674023]  ? tsd_set+0x1a2/0x4f0 [spl]
[  363.674050]  ? rrw_enter_read_impl+0xaf/0x160 [zfs]
[  363.674076]  dsl_scan_sync+0x838/0x1320 [zfs]
[  363.674104]  spa_sync+0x5f0/0xff0 [zfs]
[  363.674105]  ? mutex_lock+0x12/0x30
[  363.674133]  ? spa_txg_history_init_io+0x104/0x110 [zfs]
[  363.674162]  txg_sync_thread+0x2e1/0x4a0 [zfs]
[  363.674191]  ? txg_thread_exit.isra.13+0x60/0x60 [zfs]
[  363.674193]  thread_generic_wrapper+0x74/0x90 [spl]
[  363.674194]  kthread+0x120/0x140
[  363.674197]  ? __thread_exit+0x20/0x20 [spl]
[  363.674197]  ? kthread_park+0x90/0x90
[  363.674199]  ret_from_fork+0x35/0x40
[  363.674201] INFO: task spa_async:2262 blocked for more than 120 seconds.
[  363.674271]       Tainted: P           O      5.4.203-1-pve #1
[  363.674340] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  363.674416] spa_async       D    0  2262      2 0x80004000
[  363.674417] Call Trace:
[  363.674418]  __schedule+0x2e6/0x6f0
[  363.674419]  schedule+0x33/0xa0
[  363.674420]  io_schedule+0x16/0x40
[  363.674422]  cv_wait_common+0xb5/0x130 [spl]
[  363.674423]  ? __wake_up_pollfree+0x40/0x40
[  363.674425]  __cv_wait_io+0x18/0x20 [spl]
[  363.674454]  txg_wait_synced_impl+0xc9/0x110 [zfs]
[  363.674483]  txg_wait_synced+0x10/0x40 [zfs]
[  363.674511]  spa_config_update+0xd6/0x180 [zfs]
[  363.674539]  spa_config_update+0x174/0x180 [zfs]
[  363.674567]  spa_async_thread+0x1c0/0x6d0 [zfs]
[  363.674595]  ? spa_async_autoexpand+0x90/0x90 [zfs]
[  363.674598]  thread_generic_wrapper+0x74/0x90 [spl]
[  363.674599]  kthread+0x120/0x140
[  363.674601]  ? __thread_exit+0x20/0x20 [spl]
[  363.674602]  ? kthread_park+0x90/0x90
[  363.674603]  ret_from_fork+0x35/0x40

On Boot the following is logged to dmesg

[  134.248706] PANIC: zfs: attempting to increase fill beyond max; probable double add in segment [71ac2134000:71b02136000]
[  134.248774] Showing stack for process 2138
[  134.248776] CPU: 2 PID: 2138 Comm: txg_sync Tainted: P           O      5.4.203-1-pve #1
[  134.248776] Hardware name: System manufacturer System Product Name/P8Z68-V GEN3, BIOS 0301 09/16/2011
[  134.248777] Call Trace:
[  134.248782]  dump_stack+0x6d/0x8b
[  134.248787]  spl_dumpstack+0x29/0x2b [spl]
[  134.248790]  vcmn_err.cold.1+0x60/0x94 [spl]
[  134.248831]  ? zfs_btree_insert_into_leaf+0x1c4/0x230 [zfs]
[  134.248852]  ? zfs_btree_add_idx+0xac/0x210 [zfs]
[  134.248872]  ? zfs_btree_find+0x160/0x2d0 [zfs]
[  134.248901]  zfs_panic_recover+0x6f/0x90 [zfs]
[  134.248928]  range_tree_adjust_fill+0x19c/0x5e0 [zfs]
[  134.248956]  range_tree_add_impl+0x31a/0x1030 [zfs]
[  134.248957]  ? _cond_resched+0x19/0x30
[  134.248960]  ? spl_kmem_cache_alloc+0x7c/0x770 [spl]
[  134.248987]  range_tree_add+0x11/0x20 [zfs]
[  134.249012]  scan_io_queue_insert_impl+0xd7/0xe0 [zfs]
[  134.249038]  dsl_scan_scrub_cb+0x430/0x770 [zfs]
[  134.249041]  ? spl_kmem_alloc+0xdc/0x130 [spl]
[  134.249066]  dsl_scan_visitbp+0x68d/0xcd0 [zfs]
[  134.249085]  ? arc_hdr_set_compress+0x50/0x50 [zfs]
[  134.249111]  dsl_scan_visitbp+0x2ac/0xcd0 [zfs]
[  134.249137]  dsl_scan_visitbp+0x5b6/0xcd0 [zfs]
[  134.249163]  dsl_scan_visitbp+0x2ac/0xcd0 [zfs]
[  134.249190]  dsl_scan_visitbp+0x2ac/0xcd0 [zfs]
[  134.249215]  dsl_scan_visitbp+0x2ac/0xcd0 [zfs]
[  134.249241]  dsl_scan_visitbp+0x2ac/0xcd0 [zfs]
[  134.249267]  dsl_scan_visitbp+0x2ac/0xcd0 [zfs]
[  134.249293]  dsl_scan_visitbp+0x826/0xcd0 [zfs]
[  134.249319]  dsl_scan_visit_rootbp+0xe8/0x150 [zfs]
[  134.249345]  dsl_scan_visitds+0x1ae/0x510 [zfs]
[  134.249346]  ? _cond_resched+0x19/0x30
[  134.249348]  ? __kmalloc_node+0x1e0/0x330
[  134.249350]  ? spl_kmem_alloc+0xdc/0x130 [spl]
[  134.249352]  ? spl_kmem_alloc+0xdc/0x130 [spl]
[  134.249355]  ? tsd_hash_search.isra.5+0x47/0xa0 [spl]
[  134.249357]  ? tsd_set+0x1a2/0x4f0 [spl]
[  134.249384]  ? rrw_enter_read_impl+0xaf/0x160 [zfs]
[  134.249410]  dsl_scan_sync+0x838/0x1320 [zfs]
[  134.249438]  spa_sync+0x5f0/0xff0 [zfs]
[  134.249439]  ? mutex_lock+0x12/0x30
[  134.249468]  ? spa_txg_history_init_io+0x104/0x110 [zfs]
[  134.249496]  txg_sync_thread+0x2e1/0x4a0 [zfs]
[  134.249525]  ? txg_thread_exit.isra.13+0x60/0x60 [zfs]
[  134.249528]  thread_generic_wrapper+0x74/0x90 [spl]
[  134.249529]  kthread+0x120/0x140
[  134.249532]  ? __thread_exit+0x20/0x20 [spl]
[  134.249533]  ? kthread_park+0x90/0x90
[  134.249534]  ret_from_fork+0x35/0x40

I have attempted to run "zdb -e -bc storage", This runs for about half an hour to an hour, (3.58TB scanned) and then fails with

zio_wait(zio_claim(NULL, zcb->zcb_spa, refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL
, ZIO_FLAG_CANFAIL)) == 0 (0x2 == 0x0)

I ran the above twice, with the same error returned, with the same amount scanned.

I have run a memory test, which returned okay.

I have also run a short SMART test on all 4 drives, which returned no errors.


Solution

  • I found that another memory test started returning hundreds of errors from one of the RAM stick, so replacing that resolved the issue.