Red Hat Bugzilla – Attachment 251121 Details for Bug 360401: rgmanager is stuck in a loop while rebooting a node.
Description: Incremental fix (need both patches for completeness)
Filename:    rgmanager-360401-2.patch
MIME Type:   text/plain
Creator:     Lon Hohberger
Created:     2007-11-08 01:38:25 UTC
Size:        9.89 KB
Flags:       patch, obsolete
diff -ur rgmanager-1.9.69/include/resgroup.h rgmanager-1.9.69-lhh/include/resgroup.h
--- rgmanager-1.9.69/include/resgroup.h	2007-08-17 16:23:01.000000000 -0400
+++ rgmanager-1.9.69-lhh/include/resgroup.h	2007-11-07 18:12:57.000000000 -0500
@@ -156,6 +156,7 @@
 cluster_member_list_t *member_list(void);
 uint64_t my_id(void);

+#define RG_ERELO	-9	/* Operation cannot complete here */
 #define RG_ENODEDEATH	-8	/* Processing node died */
 #define RG_ERUN		-7	/* Service is running already */
 #define RG_EAGAIN	-6	/* Try again */
diff -ur rgmanager-1.9.69/src/daemons/groups.c rgmanager-1.9.69-lhh/src/daemons/groups.c
--- rgmanager-1.9.69/src/daemons/groups.c	2007-08-17 16:23:01.000000000 -0400
+++ rgmanager-1.9.69-lhh/src/daemons/groups.c	2007-11-07 18:33:34.000000000 -0500
@@ -157,7 +157,8 @@
 		}

 		if (st.rs_state != RG_STATE_STARTED &&
-		    st.rs_state != RG_STATE_STARTING)
+		    st.rs_state != RG_STATE_STARTING &&
+		    st.rs_state != RG_STATE_STOPPING)
 			continue;

 		if (mp->cm_id != st.rs_owner)
diff -ur rgmanager-1.9.69/src/daemons/rg_state.c rgmanager-1.9.69-lhh/src/daemons/rg_state.c
--- rgmanager-1.9.69/src/daemons/rg_state.c	2007-11-07 20:04:51.000000000 -0500
+++ rgmanager-1.9.69-lhh/src/daemons/rg_state.c	2007-11-07 20:10:04.000000000 -0500
@@ -41,10 +41,13 @@
 int set_rg_state(char *servicename, rg_state_t *svcblk);
 int get_rg_state(char *servicename, rg_state_t *svcblk);
 void get_recovery_policy(char *rg_name, char *buf, size_t buflen);
-int have_exclusive_resources();
+int have_exclusive_resources(void);
 int check_exclusive_resources(cluster_member_list_t *membership, char *svcName);


+pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+
 uint64_t
 next_node_id(cluster_member_list_t *membership, uint64_t me)
 {
@@ -446,6 +449,7 @@
 		break;

 	case RG_STATE_DISABLED:
+		ret = 2;
 	case RG_STATE_UNINITIALIZED:
 		if (req == RG_DISABLE) {
 			clulog(LOG_NOTICE,
@@ -536,7 +540,7 @@
 		}

 		clulog(LOG_NOTICE,
-		       "Starting stopped service%s\n",
+		       "Starting stopped service %s\n",
 		       svcName);
 		ret = 1;
 		break;
@@ -557,7 +561,7 @@
 		snprintf(query,
 			 sizeof(query),
 			 "/cluster/clusternodes/clusternode[@nodeid=\"%d\"]/@name",
-			 svcStatus->rs_owner);
+			 (int)svcStatus->rs_owner);
 		ccs_get(fd, query, &nodename);
 		ccs_disconnect(fd);
 	}
@@ -650,42 +654,61 @@
 svc_start(char *svcName, int req)
 {
 	void *lockp = NULL;
-	int ret;
+	int ret, xret;
 	rg_state_t svcStatus;
+	int need_check = have_exclusive_resources();
+	cluster_member_list_t *membership;
+
+	if (need_check)
+		pthread_mutex_lock(&exclusive_mutex);
+
+	ret = RG_EFAIL;

 	if (rg_lock(svcName, &lockp) < 0) {
 		clulog(LOG_ERR, "#45: Unable to obtain cluster lock: %s\n",
 		       strerror(errno));
-		return FAIL;
+		goto out_nolock;
 	}

 	if (get_rg_state(svcName, &svcStatus) != 0) {
-		rg_unlock(svcName, lockp);
 		clulog(LOG_ERR, "#46: Failed getting status for RG %s\n",
 		       svcName);
-		return FAIL;
+		goto out_unlock;
+	}
+
+	if (need_check) {
+		membership = member_list();
+		xret = check_exclusive_resources(membership, svcName);
+		cml_free(membership);
+		if (xret != 0) {
+			if (xret > 0)
+				/* Exc. service running */
+				ret = RG_ERELO;
+			else
+				/* XXX */
+				ret = RG_ENOSERVICE;
+			goto out_unlock;
+		}
 	}

 	/* LOCK HELD */
 	switch (svc_advise_start(&svcStatus, svcName, req)) {
 	case 0: /* Don't start service, return FAIL */
-		rg_unlock(svcName, lockp);
-		return FAIL;
+		goto out_unlock;
 	case 2: /* Don't start service, return 0 */
-		rg_unlock(svcName, lockp);
-		return 0;
+		ret = 0;
+		goto out_unlock;
 	case 3:
-		rg_unlock(svcName, lockp);
-		return RG_EAGAIN;
+		ret = RG_EAGAIN;
+		goto out_unlock;
 	case 4:
-		rg_unlock(svcName, lockp);
-		return RG_ERUN;
+		ret = RG_ERUN;
+		goto out_unlock;
 	default:
 		break;
 	}

 	/* LOCK HELD if we get here */
-
 	svcStatus.rs_owner = my_id();
 	svcStatus.rs_state = RG_STATE_STARTING;
 	svcStatus.rs_transition = (uint64_t)time(NULL);
@@ -699,10 +722,17 @@
 		clulog(LOG_ERR,
 		       "#47: Failed changing service status\n");
 		rg_unlock(svcName, lockp);
-		return FAIL;
+		goto out_unlock;
 	}

 	rg_unlock(svcName, lockp);
+
+	/* release excl. mutex during start */
+	if (need_check) {
+		/* Also clear need_check so we don't double-unlock */
+		pthread_mutex_unlock(&exclusive_mutex);
+		need_check = 0;
+	}

 	ret = group_op(svcName, RG_START);
 	ret = !!ret; /* Either it worked or it didn't. Ignore all the
@@ -711,17 +741,17 @@
 	if (rg_lock(svcName, &lockp) < 0) {
 		clulog(LOG_ERR, "#74: Unable to obtain cluster lock: %s\n",
 		       strerror(errno));
-		return FAIL;
+		ret = RG_EFAIL;
+		goto out_nolock;
 	}

 	svcStatus.rs_state = RG_STATE_STARTED;
 	if (set_rg_state(svcName, &svcStatus) != 0) {
 		clulog(LOG_ERR,
 		       "#75: Failed changing service status\n");
-		rg_unlock(svcName, lockp);
-		return FAIL;
+		ret = RG_EFAIL;
+		goto out_unlock;
 	}
-	rg_unlock(svcName, lockp);

 	if (ret == 0)
 		clulog(LOG_NOTICE,
@@ -732,6 +762,11 @@
 		       "#68: Failed to start %s; return value: %d\n",
 		       svcName, ret);

+out_unlock:
+	rg_unlock(svcName, lockp);
+out_nolock:
+	if (need_check)
+		pthread_mutex_unlock(&exclusive_mutex);
 	return ret;
 }

@@ -1115,7 +1150,7 @@
 {
 	cluster_member_list_t *allowed_nodes, *backup = NULL;
 	uint64_t target = preferred_target, me = my_id();
-	int ret, x;
+	int ret, x, tried = 0;

 	/*
 	 * Stop the service - if we haven't already done so.
@@ -1181,6 +1216,7 @@
 		 * It's legal to start the service on the given
 		 * node. Try to do so.
 		 */
+		++tried;
 		if (relocate_service(svcName, request, target) == 0) {
 			*new_owner = target;
 			/*
@@ -1211,9 +1247,12 @@
 		if (target == me)
 			goto exhausted;

+		++tried;
+
+		/* Each node gets one try */
+		memb_mark_down(allowed_nodes, target);
 		switch (relocate_service(svcName, request, target)) {
 		case RG_EFAIL:
-			memb_mark_down(allowed_nodes, target);
 			continue;
 		case RG_EABORT:
 			svc_report_failure(svcName);
@@ -1255,9 +1294,10 @@
 	 */
 exhausted:
 	if (!rg_locked()) {
-		clulog(LOG_WARNING,
-		       "#70: Attempting to restart service %s locally.\n",
-		       svcName);
+		if (tried)
+			clulog(LOG_WARNING,
+			       "#70: Attempting to restart service %s locally.\n",
+			       svcName);
 		if (svc_start(svcName, RG_START_RECOVER) == 0) {
 			*new_owner = me;
 			return FAIL;
@@ -1276,9 +1316,9 @@
 int
 handle_fd_start_req(char *svcName, int request, uint64_t *new_owner)
 {
-	cluster_member_list_t *allowed_nodes, *backup = NULL;
+	cluster_member_list_t *allowed_nodes;
 	uint64_t target, me = my_id();
-	int ret, x;
+	int ret;

 	allowed_nodes = member_list();

@@ -1327,7 +1367,6 @@
 }


-pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER;
 /**
  * handle_start_req - Handle a generic start request from a user or during
  * service manager boot.
@@ -1343,7 +1382,6 @@
 {
 	int ret, tolerance = FOD_BEST;
 	cluster_member_list_t *membership = member_list();
-	int need_check = have_exclusive_resources();

 	/*
 	 * When a service request is from a user application (eg, clusvcadm),
@@ -1359,18 +1397,6 @@
 		cml_free(membership);
 		return FAIL;
 	}
-	if (need_check) {
-		pthread_mutex_lock(&exclusive_mutex);
-		ret = check_exclusive_resources(membership, svcName);
-		if (ret != 0) {
-			cml_free(membership);
-			pthread_mutex_unlock(&exclusive_mutex);
-			if (ret > 0)
-				goto relocate;
-			else
-				return FAIL;
-		}
-	}
 	cml_free(membership);

 	/*
@@ -1378,25 +1404,22 @@
 	 * mask here - so that we can try all nodes if necessary.
 	 */
 	ret = svc_start(svcName, req);
-	if (need_check)
-		pthread_mutex_unlock(&exclusive_mutex);
-
-	/*
-	   If services are locked, return the error
-	 */
-	if (ret == RG_EAGAIN || ret == RG_ERUN)
+	switch(ret) {
+	case RG_ERELO:
+		goto relocate;
+
+	case RG_EAGAIN:
+		/* If services are locked, return the error */
+	case RG_ENOSERVICE:
+		/* service doesn't exist? */
+	case RG_ERUN:
+		/* If service is already running, return that value */
 		return ret;

-	/*
-	 * If we succeeded, then we're done.
-	 */
-	if (ret == SUCCESS) {
+	case SUCCESS:
+		/* If we succeeded, then we're done. */
 		*new_owner = my_id();
-		return SUCCESS;
-	}
-
-	/* Already running? */
-	if (ret == NO) {
+	case NO:
 		return SUCCESS;
 	}

@@ -1419,13 +1442,13 @@
 		return RG_EABORT;
 	}

-relocate:
 	/*
 	 * OK, it failed to start - but succeeded to stop. Now,
 	 * we should relocate the service.
 	 */
 	clulog(LOG_WARNING, "#71: Relocating failed service %s\n",
 	       svcName);
+relocate:
 	ret = handle_relocate_req(svcName, RG_START_RECOVER, -1, new_owner);

 	/* If we leave the service stopped, instead of disabled, someone
@@ -1457,7 +1480,6 @@
 	int x;
 	uint64_t me = my_id();
 	cluster_member_list_t *membership = member_list();
-	int need_check = have_exclusive_resources();

 	/* XXX ok, so we need to say "should I start this if I was the
 	   only cluster member online */
@@ -1478,23 +1500,29 @@
 		cml_free(membership);
 		return FAIL;
 	}
-	if (need_check) {
-		pthread_mutex_lock(&exclusive_mutex);
-		if (check_exclusive_resources(membership, svcName) != 0) {
-			pthread_mutex_unlock(&exclusive_mutex);
-			cml_free(membership);
-			return FAIL;
-		}
-	}
 	cml_free(membership);

 	x = svc_start(svcName, req);
-	if (need_check)
-		pthread_mutex_unlock(&exclusive_mutex);
-	if (x == 0)
-		return 0;
-	if (x == RG_ERUN)
-		return RG_ERUN;
+	switch(x) {
+	case RG_ERELO:
+		/* Don't relocate from here; it was a remote start */
+		/* Return fail so the other node can go ahead and
+		   try the other nodes in the cluster */
+	case NO:
+		return RG_EFAIL;
+
+	case RG_EAGAIN:
+		/* If services are locked, return the error */
+	case RG_ENOSERVICE:
+		/* service doesn't exist? */
+	case RG_ERUN:
+		/* If service is already running, return that value */
+		return x;
+
+	case SUCCESS:
+		/* If we succeeded, then we're done. */
+		return SUCCESS;
+	}

 	if (svc_stop(svcName, RG_STOP_RECOVER) == 0)
 		return RG_EFAIL;
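A note on the locking rework above: the patch moves the exclusive-resource check inside svc_start(), takes exclusive_mutex before that check, funnels every failure path through the out_unlock/out_nolock labels so the cluster lock and the mutex are each released exactly once, and drops the mutex (clearing need_check) before the long-running group_op(svcName, RG_START) call so other start requests are not serialized behind a slow resource start. The following is a minimal, self-contained C sketch of that goto-cleanup shape only; acquire_cluster_lock(), release_cluster_lock(), check_exclusive(), and do_start() are hypothetical stand-ins for illustration, not rgmanager APIs.

/* Minimal sketch (not rgmanager code) of the goto-cleanup locking shape
 * used by the patched svc_start(). The four helpers below are hypothetical
 * stubs standing in for the real cluster primitives. */
#include <pthread.h>
#include <stdio.h>

#define RG_EFAIL   -1
#define RG_SUCCESS  0

static pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER;

static int  acquire_cluster_lock(const char *svc) { (void)svc; return 0; }
static void release_cluster_lock(const char *svc) { (void)svc; }
static int  check_exclusive(const char *svc) { (void)svc; return 0; }
static int  do_start(const char *svc) { printf("starting %s\n", svc); return 0; }

static int
start_service(const char *svc, int need_check)
{
	int ret = RG_EFAIL;

	if (need_check)
		pthread_mutex_lock(&exclusive_mutex);

	if (acquire_cluster_lock(svc) < 0)
		goto out_nolock;	/* cluster lock was never taken */

	if (need_check && check_exclusive(svc) != 0)
		goto out_unlock;	/* an exclusive service already runs */

	/* Drop the mutex before the (slow) start and clear need_check so
	 * the common exit path does not unlock the mutex a second time. */
	if (need_check) {
		pthread_mutex_unlock(&exclusive_mutex);
		need_check = 0;
	}

	ret = (do_start(svc) == 0) ? RG_SUCCESS : RG_EFAIL;

out_unlock:
	release_cluster_lock(svc);
out_nolock:
	if (need_check)
		pthread_mutex_unlock(&exclusive_mutex);
	return ret;
}

int
main(void)
{
	return (start_service("example_service", 1) == RG_SUCCESS) ? 0 : 1;
}

In the real patch the same shape additionally re-acquires the cluster lock after group_op() to record RG_STATE_STARTED, and returns RG_ERELO so handle_start_req() can relocate the service to another node instead of failing outright.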
Attachments on bug 360401: 244631 | 244641 | 250901 | 250981 | 251121