Bug 1263200
| Summary: | Data Tiering:Setting only promote frequency and no demote frequency causes crash | |||
|---|---|---|---|---|
| Product: | [Community] GlusterFS | Reporter: | Nag Pavan Chilakam <nchilaka> | |
| Component: | tiering | Assignee: | Nithya Balachandran <nbalacha> | |
| Status: | CLOSED DUPLICATE | QA Contact: | bugs <bugs> | |
| Severity: | urgent | Docs Contact: | ||
| Priority: | urgent | |||
| Version: | 3.7.4 | CC: | bugs, dlambrig, nbalacha | |
| Target Milestone: | --- | |||
| Target Release: | --- | |||
| Hardware: | Unspecified | |||
| OS: | Unspecified | |||
| Whiteboard: | ||||
| Fixed In Version: | Doc Type: | Bug Fix | ||
| Doc Text: | Story Points: | --- | ||
| Clone Of: | ||||
| : | 1263204 (view as bug list) | Environment: | ||
| Last Closed: | 2015-09-16 15:28:30 UTC | Type: | Bug | |
| Regression: | --- | Mount Type: | --- | |
| Documentation: | --- | CRM: | ||
| Verified Versions: | Category: | --- | ||
| oVirt Team: | --- | RHEL 7.3 requirements from Atomic Host: | ||
| Cloudforms Team: | --- | Target Upstream Version: | ||
| Embargoed: | ||||
| Bug Depends On: | ||||
| Bug Blocks: | 1260923, 1263204, 1263746 | |||
|
Description
Nag Pavan Chilakam
2015-09-15 10:11:38 UTC
[root@rhsqe-repo sosreports]# hostname
rhsqe-repo.lab.eng.blr.redhat.com
[root@rhsqe-repo sosreports]# ls /home/repo/sosreports/bug.1263200
Analysis of the coredump:
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `/usr/sbin/glusterfs -s localhost --volfile-id rebalance/9301 --xlator-option *d'.
Program terminated with signal 11, Segmentation fault.
#0 tier_build_migration_qfile (is_promotion=_gf_true, query_cbk_args=0x7f3afa854e70, args=0x7f3fee27bca0) at tier.c:607
607 list_for_each_entry (local_brick, args->brick_list, list) {
#0 tier_build_migration_qfile (is_promotion=_gf_true, query_cbk_args=0x7f3afa854e70, args=0x7f3fee27bca0) at tier.c:607
#1 tier_promote (args=0x7f3fee27bca0) at tier.c:704
#2 0x00007f4003bd0df5 in start_thread () from /lib64/libpthread.so.0
#3 0x00007f40035171ad in clone () from /lib64/libc.so.6
(gdb) f 0
#0 tier_build_migration_qfile (is_promotion=_gf_true, query_cbk_args=0x7f3afa854e70, args=0x7f3fee27bca0) at tier.c:607
607 list_for_each_entry (local_brick, args->brick_list, list) {
(gdb) p *args
$1 = {this = 0x0, defrag = 0x0, brick_list = 0x0, freq_time = 0, return_value = 0}
All members of the args structure are zero (the pointer members this, defrag and brick_list are NULL), so the tier process crashes when list_for_each_entry at tier.c:607 dereferences args->brick_list.
(gdb) t a a bt
Thread 16 (Thread 0x7f3fdbfff700 (LWP 2861)):
#0 0x00007f4003bd4705 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f3ff6da434a in gf_defrag_task (opaque=0x7f3ff0028c10) at dht-rebalance.c:1801
#2 0x00007f4003bd0df5 in start_thread () from /lib64/libpthread.so.0
#3 0x00007f40035171ad in clone () from /lib64/libc.so.6
...
Thread 6 (Thread 0x7f3ffa380700 (LWP 2843)):
#0 0x00007f40034de48d in nanosleep () from /lib64/libc.so.6
#1 0x00007f40034de324 in sleep () from /lib64/libc.so.6
#2 0x00007f3ff6911f2a in tier_start (this=0x7f3ff0020920, defrag=0x7f3ff0028c10) at tier.c:860
#3 0x00007f3ff6da6d34 in gf_defrag_start_crawl (data=0x7f3ff0020920) at dht-rebalance.c:2841
#4 0x00007f4004da9d72 in synctask_wrap (old_task=<optimized out>) at syncop.c:380
#5 0x00007f40034680f0 in ?? () from /lib64/libc.so.6
#6 0x0000000000000000 in ?? ()
Thread 5 (Thread 0x7f3ffab81700 (LWP 2842)):
#0 0x00007f4003bd4ab2 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00007f4004dabe88 in syncenv_task (proc=proc@entry=0x7f4006a675a0) at syncop.c:607
#2 0x00007f4004dacbc0 in syncenv_processor (thdata=0x7f4006a675a0) at syncop.c:699
#3 0x00007f4003bd0df5 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f40035171ad in clone () from /lib64/libc.so.6
...
Thread 2 (Thread 0x7f3ff7ca8700 (LWP 2844)):
#0 0x00007f4003517783 in epoll_wait () from /lib64/libc.so.6
---Type <return> to continue, or q <return> to quit---
#1 0x00007f4004dc9680 in event_dispatch_epoll_worker (data=0x7f4006aa45c0) at event-epoll.c:668
#2 0x00007f4003bd0df5 in start_thread () from /lib64/libpthread.so.0
#3 0x00007f40035171ad in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f3afa855700 (LWP 13112)):
#0 tier_build_migration_qfile (is_promotion=_gf_true, query_cbk_args=0x7f3afa854e70, args=0x7f3fee27bca0) at tier.c:607
#1 tier_promote (args=0x7f3fee27bca0) at tier.c:704
#2 0x00007f4003bd0df5 in start_thread () from /lib64/libpthread.so.0
#3 0x00007f40035171ad in clone () from /lib64/libc.so.6
Looking at thread 6:
(gdb) t 6
[Switching to thread 6 (Thread 0x7f3ffa380700 (LWP 2843))]
#0 0x00007f40034de48d in nanosleep () from /lib64/libc.so.6
(gdb) bt
#0 0x00007f40034de48d in nanosleep () from /lib64/libc.so.6
#1 0x00007f40034de324 in sleep () from /lib64/libc.so.6
#2 0x00007f3ff6911f2a in tier_start (this=0x7f3ff0020920, defrag=0x7f3ff0028c10) at tier.c:860
#3 0x00007f3ff6da6d34 in gf_defrag_start_crawl (data=0x7f3ff0020920) at dht-rebalance.c:2841
#4 0x00007f4004da9d72 in synctask_wrap (old_task=<optimized out>) at syncop.c:380
#5 0x00007f40034680f0 in ?? () from /lib64/libc.so.6
#6 0x0000000000000000 in ?? ()
(gdb) f 2
#2 0x00007f3ff6911f2a in tier_start (this=0x7f3ff0020920, defrag=0x7f3ff0028c10) at tier.c:860
860 sleep(1);
(gdb) l
855
856 defrag->defrag_status = GF_DEFRAG_STATUS_STARTED;
857
858 while (1) {
859
860 sleep(1);
861
...
911
912
913 ret_promotion = -1;
914 ret_demotion = -1;
(gdb)
915
916 if (is_demotion_triggered) {
917 demotion_args.this = this;
918 demotion_args.brick_list = &bricklist_hot;
919 demotion_args.defrag = defrag;
920 demotion_args.freq_time = freq_demote;
921 ret_demotion = pthread_create (&demote_thread,
922 NULL, &tier_demote,
923 &demotion_args);
924 if (ret_demotion) {
(gdb)
925 gf_msg (this->name, GF_LOG_ERROR, 0,
926 DHT_MSG_LOG_TIER_ERROR,
927 "Failed starting Demotion "
928 "thread!");
929 }
930 }
931
932 if (is_promotion_triggered) {
933 promotion_args.this = this;
934 promotion_args.brick_list = &bricklist_cold;
(gdb)
935 promotion_args.defrag = defrag;
936 promotion_args.freq_time = freq_promote * PROMOTION_CYCLE_CNT;
937 ret_promotion = pthread_create (&promote_thread,
938 NULL, &tier_promote,
939 &promotion_args);
940 if (ret_promotion) {
941 gf_msg (this->name, GF_LOG_ERROR, 0,
942 DHT_MSG_LOG_TIER_ERROR,
943 "Failed starting Promotion "
944 "thread!");
}
...
958 if (is_demotion_triggered && (ret_promotion == 0)) {
959 pthread_join (promote_thread, NULL);
960 if (promotion_args.return_value) {
961 gf_msg (this->name, GF_LOG_ERROR, 0,
962 DHT_MSG_LOG_TIER_ERROR,
963 "Promotion failed!");
964 }
(gdb)
965 ret_promotion = promotion_args.return_value;
966 }
967
968 /* Collect previous and current cummulative status */
969 /* If demotion was not triggered just pass 0 to ret */
970 ret = (is_demotion_triggered) ? ret_demotion : 0;
971 /* If promotion was not triggered just pass 0 to ret */
972 ret = ret | (is_promotion_triggered) ?
973 ret_promotion : 0;
974
(gdb)
975 /* reseting promotion and demotion arguments for
976 * next iteration*/
977 memset (&demotion_args, 0, sizeof(demotion_args_t));
978 memset (&promotion_args, 0, sizeof(promotion_args_t));
979
980 }
(gdb) p is_demotion_triggered
$2 = _gf_false
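The condition guarding the promote-thread join on line 958 tests is_demotion_triggered rather than is_promotion_triggered. As is_demotion_triggered is false here, the pthread_join (promote_thread, NULL) on line 959 is never called. The main thread therefore proceeds without waiting for the promote thread to complete and memsets promotion_args to 0 on line 978, which causes the process to crash when the promote thread subsequently accesses the args structure.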
*** This bug has been marked as a duplicate of bug 1263746 ***
*** Bug 1263585 has been marked as a duplicate of this bug. ***