Bug 1263200
Summary: | Data Tiering:Setting only promote frequency and no demote frequency causes crash | |||
---|---|---|---|---|
Product: | [Community] GlusterFS | Reporter: | Nag Pavan Chilakam <nchilaka> | |
Component: | tiering | Assignee: | Nithya Balachandran <nbalacha> | |
Status: | CLOSED DUPLICATE | QA Contact: | bugs <bugs> | |
Severity: | urgent | Docs Contact: | ||
Priority: | urgent | |||
Version: | 3.7.4 | CC: | bugs, dlambrig, nbalacha | |
Target Milestone: | --- | |||
Target Release: | --- | |||
Hardware: | Unspecified | |||
OS: | Unspecified | |||
Whiteboard: | ||||
Fixed In Version: | Doc Type: | Bug Fix | ||
Doc Text: | Story Points: | --- | ||
Clone Of: | ||||
: | 1263204 (view as bug list) | Environment: | ||
Last Closed: | 2015-09-16 15:28:30 UTC | Type: | Bug | |
Regression: | --- | Mount Type: | --- | |
Documentation: | --- | CRM: | ||
Verified Versions: | Category: | --- | ||
oVirt Team: | --- | RHEL 7.3 requirements from Atomic Host: | ||
Cloudforms Team: | --- | Target Upstream Version: | ||
Embargoed: | ||||
Bug Depends On: | ||||
Bug Blocks: | 1260923, 1263204, 1263746 |
Description
Nag Pavan Chilakam
2015-09-15 10:11:38 UTC
[root@rhsqe-repo sosreports]# hostname rhsqe-repo.lab.eng.blr.redhat.com [root@rhsqe-repo sosreports]# ls /home/repo/sosreports/bug.1263200 Analysis of the coredump: [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib64/libthread_db.so.1". Core was generated by `/usr/sbin/glusterfs -s localhost --volfile-id rebalance/9301 --xlator-option *d'. Program terminated with signal 11, Segmentation fault. #0 tier_build_migration_qfile (is_promotion=_gf_true, query_cbk_args=0x7f3afa854e70, args=0x7f3fee27bca0) at tier.c:607 607 list_for_each_entry (local_brick, args->brick_list, list) { #0 tier_build_migration_qfile (is_promotion=_gf_true, query_cbk_args=0x7f3afa854e70, args=0x7f3fee27bca0) at tier.c:607 #1 tier_promote (args=0x7f3fee27bca0) at tier.c:704 #2 0x00007f4003bd0df5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f40035171ad in clone () from /lib64/libc.so.6 (gdb) f 0 #0 tier_build_migration_qfile (is_promotion=_gf_true, query_cbk_args=0x7f3afa854e70, args=0x7f3fee27bca0) at tier.c:607 607 list_for_each_entry (local_brick, args->brick_list, list) { (gdb) p *args $1 = {this = 0x0, defrag = 0x0, brick_list = 0x0, freq_time = 0, return_value = 0} All members of the args structure are zeroed (NULL/0), causing the tier process to crash when it tries to access args->brick_list. (gdb) t a a bt Thread 16 (Thread 0x7f3fdbfff700 (LWP 2861)): #0 0x00007f4003bd4705 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007f3ff6da434a in gf_defrag_task (opaque=0x7f3ff0028c10) at dht-rebalance.c:1801 #2 0x00007f4003bd0df5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f40035171ad in clone () from /lib64/libc.so.6 ... 
Thread 6 (Thread 0x7f3ffa380700 (LWP 2843)): #0 0x00007f40034de48d in nanosleep () from /lib64/libc.so.6 #1 0x00007f40034de324 in sleep () from /lib64/libc.so.6 #2 0x00007f3ff6911f2a in tier_start (this=0x7f3ff0020920, defrag=0x7f3ff0028c10) at tier.c:860 #3 0x00007f3ff6da6d34 in gf_defrag_start_crawl (data=0x7f3ff0020920) at dht-rebalance.c:2841 #4 0x00007f4004da9d72 in synctask_wrap (old_task=<optimized out>) at syncop.c:380 #5 0x00007f40034680f0 in ?? () from /lib64/libc.so.6 #6 0x0000000000000000 in ?? () Thread 5 (Thread 0x7f3ffab81700 (LWP 2842)): #0 0x00007f4003bd4ab2 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007f4004dabe88 in syncenv_task (proc=proc@entry=0x7f4006a675a0) at syncop.c:607 #2 0x00007f4004dacbc0 in syncenv_processor (thdata=0x7f4006a675a0) at syncop.c:699 #3 0x00007f4003bd0df5 in start_thread () from /lib64/libpthread.so.0 #4 0x00007f40035171ad in clone () from /lib64/libc.so.6 ... Thread 2 (Thread 0x7f3ff7ca8700 (LWP 2844)): #0 0x00007f4003517783 in epoll_wait () from /lib64/libc.so.6 ---Type <return> to continue, or q <return> to quit--- #1 0x00007f4004dc9680 in event_dispatch_epoll_worker (data=0x7f4006aa45c0) at event-epoll.c:668 #2 0x00007f4003bd0df5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f40035171ad in clone () from /lib64/libc.so.6 Thread 1 (Thread 0x7f3afa855700 (LWP 13112)): #0 tier_build_migration_qfile (is_promotion=_gf_true, query_cbk_args=0x7f3afa854e70, args=0x7f3fee27bca0) at tier.c:607 #1 tier_promote (args=0x7f3fee27bca0) at tier.c:704 #2 0x00007f4003bd0df5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f40035171ad in clone () from /lib64/libc.so.6 Looking at thread 6: (gdb) t 6 [Switching to thread 6 (Thread 0x7f3ffa380700 (LWP 2843))] #0 0x00007f40034de48d in nanosleep () from /lib64/libc.so.6 (gdb) bt #0 0x00007f40034de48d in nanosleep () from /lib64/libc.so.6 #1 0x00007f40034de324 in sleep () from /lib64/libc.so.6 #2 0x00007f3ff6911f2a in tier_start 
(this=0x7f3ff0020920, defrag=0x7f3ff0028c10) at tier.c:860 #3 0x00007f3ff6da6d34 in gf_defrag_start_crawl (data=0x7f3ff0020920) at dht-rebalance.c:2841 #4 0x00007f4004da9d72 in synctask_wrap (old_task=<optimized out>) at syncop.c:380 #5 0x00007f40034680f0 in ?? () from /lib64/libc.so.6 #6 0x0000000000000000 in ?? () (gdb) f 2 #2 0x00007f3ff6911f2a in tier_start (this=0x7f3ff0020920, defrag=0x7f3ff0028c10) at tier.c:860 860 sleep(1); (gdb) l 855 856 defrag->defrag_status = GF_DEFRAG_STATUS_STARTED; 857 858 while (1) { 859 860 sleep(1); 861 ... 911 912 913 ret_promotion = -1; 914 ret_demotion = -1; (gdb) 915 916 if (is_demotion_triggered) { 917 demotion_args.this = this; 918 demotion_args.brick_list = &bricklist_hot; 919 demotion_args.defrag = defrag; 920 demotion_args.freq_time = freq_demote; 921 ret_demotion = pthread_create (&demote_thread, 922 NULL, &tier_demote, 923 &demotion_args); 924 if (ret_demotion) { (gdb) 925 gf_msg (this->name, GF_LOG_ERROR, 0, 926 DHT_MSG_LOG_TIER_ERROR, 927 "Failed starting Demotion " 928 "thread!"); 929 } 930 } 931 932 if (is_promotion_triggered) { 933 promotion_args.this = this; 934 promotion_args.brick_list = &bricklist_cold; (gdb) 935 promotion_args.defrag = defrag; 936 promotion_args.freq_time = freq_promote * PROMOTION_CYCLE_CNT; 937 ret_promotion = pthread_create (&promote_thread, 938 NULL, &tier_promote, 939 &promotion_args); 940 if (ret_promotion) { 941 gf_msg (this->name, GF_LOG_ERROR, 0, 942 DHT_MSG_LOG_TIER_ERROR, 943 "Failed starting Promotion " 944 "thread!"); } ... 958 if (is_demotion_triggered && (ret_promotion == 0)) { 959 pthread_join (promote_thread, NULL); 960 if (promotion_args.return_value) { 961 gf_msg (this->name, GF_LOG_ERROR, 0, 962 DHT_MSG_LOG_TIER_ERROR, 963 "Promotion failed!"); 964 } (gdb) 965 ret_promotion = promotion_args.return_value; 966 } 967 968 /* Collect previous and current cummulative status */ 969 /* If demotion was not triggered just pass 0 to ret */ 970 ret = (is_demotion_triggered) ? 
ret_demotion : 0; 971 /* If promotion was not triggered just pass 0 to ret */ 972 ret = ret | (is_promotion_triggered) ? 973 ret_promotion : 0; 974 (gdb) 975 /* reseting promotion and demotion arguments for 976 * next iteration*/ 977 memset (&demotion_args, 0, sizeof(demotion_args_t)); 978 memset (&promotion_args, 0, sizeof(promotion_args_t)); 979 980 } (gdb) p is_demotion_triggered $2 = _gf_false The check on line 958 that guards the join of the promote thread tests is_demotion_triggered instead of is_promotion_triggered. Because only the promote frequency was set, is_demotion_triggered is false, so the pthread_join (promote_thread, NULL) on line 959 is never called. The main thread therefore proceeds without waiting for the promote thread to complete and memsets promotion_args to 0 (line 978), which causes the process to crash when the promote thread subsequently accesses the args structure. *** This bug has been marked as a duplicate of bug 1263746 *** *** Bug 1263585 has been marked as a duplicate of this bug. *** |