| Summary: | stat of file is hung with possible deadlock | |||
|---|---|---|---|---|
| Product: | Red Hat Gluster Storage | Reporter: | Nag Pavan Chilakam <nchilaka> | |
| Component: | core | Assignee: | Pranith Kumar K <pkarampu> | |
| Status: | CLOSED ERRATA | QA Contact: | Nag Pavan Chilakam <nchilaka> | |
| Severity: | urgent | Docs Contact: | ||
| Priority: | unspecified | |||
| Version: | rhgs-3.2 | CC: | amukherj, pkarampu, rcyriac, rhinduja, rhs-bugs, storage-qa-internal | |
| Target Milestone: | --- | |||
| Target Release: | RHGS 3.2.0 | |||
| Hardware: | Unspecified | |||
| OS: | Unspecified | |||
| Whiteboard: | ||||
| Fixed In Version: | glusterfs-3.8.4-5 | Doc Type: | If docs needed, set a value | |
| Doc Text: | Story Points: | --- | ||
| Clone Of: | ||||
| : | 1393259 (view as bug list) | Environment: | ||
| Last Closed: | 2017-03-23 06:17:17 UTC | Type: | Bug | |
| Regression: | --- | Mount Type: | --- | |
| Documentation: | --- | CRM: | ||
| Verified Versions: | Category: | --- | ||
| oVirt Team: | --- | RHEL 7.3 requirements from Atomic Host: | ||
| Cloudforms Team: | --- | Target Upstream Version: | ||
| Bug Depends On: | ||||
| Bug Blocks: | 1351528, 1393259, 1393677, 1393682 | |||
Description
Nag Pavan Chilakam
2016-11-08 12:48:13 UTC
Also note that two bricks were brought down.

[root@dhcp37-108 ~]# gluster v status
Status of volume: drvol
Gluster process                                         TCP Port  RDMA Port  Online  Pid
------------------------------------------------------------------------------
Brick 10.70.35.191:/rhs/brick1/drvol                    N/A       N/A        N       N/A
Brick 10.70.37.108:/rhs/brick1/drvol                    49154     0          Y       14087
Brick 10.70.35.3:/rhs/brick1/drvol                      49154     0          Y       4790
Brick 10.70.37.66:/rhs/brick1/drvol                     49154     0          Y       14602
Brick 10.70.35.191:/rhs/brick2/drvol                    49156     0          Y       15070
Brick 10.70.37.108:/rhs/brick2/drvol                    N/A       N/A        N       N/A
Brick 10.70.35.3:/rhs/brick2/drvol                      49155     0          Y       4810
Brick 10.70.37.66:/rhs/brick2/drvol                     49155     0          Y       14622
Snapshot Daemon on localhost                            49152     0          Y       14278
Self-heal Daemon on localhost                           N/A       N/A        Y       14128
Quota Daemon on localhost                               N/A       N/A        Y       14226
Snapshot Daemon on 10.70.35.3                           49152     0          Y       4974
Self-heal Daemon on 10.70.35.3                          N/A       N/A        Y       4835
Quota Daemon on 10.70.35.3                              N/A       N/A        Y       4928
Snapshot Daemon on 10.70.37.66                          49152     0          Y       14795
Self-heal Daemon on 10.70.37.66                         N/A       N/A        Y       14643
Quota Daemon on 10.70.37.66                             N/A       N/A        Y       14742
Snapshot Daemon on dhcp35-191.lab.eng.blr.redhat.com    49153     0          Y       15307
Self-heal Daemon on dhcp35-191.lab.eng.blr.redhat.com   N/A       N/A        Y       15091
Quota Daemon on dhcp35-191.lab.eng.blr.redhat.com       N/A       N/A        Y       15239

Task Status of Volume drvol
------------------------------------------------------------------------------
There are no active volume tasks

open-behind takes fd->lock and then inode->lock, whereas statedump takes inode->lock and then fd->lock, so it leads to a deadlock:
void
ob_fd_free (ob_fd_t *ob_fd)
{
        loc_wipe (&ob_fd->loc);                 <<--- this takes (inode->lock)

        if (ob_fd->xdata)
                dict_unref (ob_fd->xdata);

        if (ob_fd->open_frame)
                STACK_DESTROY (ob_fd->open_frame->root);

        GF_FREE (ob_fd);
}
int
ob_wake_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
             int op_ret, int op_errno, fd_t *fd_ret, dict_t *xdata)
{
        fd_t             *fd = NULL;
        struct list_head  list;
        ob_fd_t          *ob_fd = NULL;
        call_stub_t      *stub = NULL, *tmp = NULL;

        fd = frame->local;
        frame->local = NULL;

        INIT_LIST_HEAD (&list);

        LOCK (&fd->lock);                       <<---- fd->lock
        {
                ob_fd = __ob_fd_ctx_get (this, fd);
                list_splice_init (&ob_fd->list, &list);

                if (op_ret < 0) {
                        /* mark fd BAD for ever */
                        ob_fd->op_errno = op_errno;
                } else {
                        __fd_ctx_del (fd, this, NULL);
                        ob_fd_free (ob_fd);     <<---- takes inode->lock while fd->lock is still held
                }
        }
        UNLOCK (&fd->lock);
==============================================================
void
inode_dump (inode_t *inode, char *prefix)
{
        int                ret       = -1;
        xlator_t          *xl        = NULL;
        int                i         = 0;
        fd_t              *fd        = NULL;
        struct _inode_ctx *inode_ctx = NULL;
        struct list_head   fd_list;

        if (!inode)
                return;

        INIT_LIST_HEAD (&fd_list);

        ret = TRY_LOCK(&inode->lock);           <<---- takes inode->lock
        if (ret != 0) {
                return;
        }

        {
                gf_proc_dump_write("gfid", "%s", uuid_utoa (inode->gfid));
                gf_proc_dump_write("nlookup", "%ld", inode->nlookup);
                gf_proc_dump_write("fd-count", "%u", inode->fd_count);
                gf_proc_dump_write("ref", "%u", inode->ref);
                gf_proc_dump_write("ia_type", "%d", inode->ia_type);

                if (inode->_ctx) {
                        inode_ctx = GF_CALLOC (inode->table->ctxcount,
                                               sizeof (*inode_ctx),
                                               gf_common_mt_inode_ctx);
                        if (inode_ctx == NULL) {
                                goto unlock;
                        }

                        for (i = 0; i < inode->table->ctxcount; i++) {
                                inode_ctx[i] = inode->_ctx[i];
                        }
                }

                if (dump_options.xl_options.dump_fdctx != _gf_true)
                        goto unlock;

                list_for_each_entry (fd, &inode->fd_list, inode_list) {
                        fd_ctx_dump (fd, prefix);   <<<----------------- called with inode->lock held
                }
        }
void
fd_ctx_dump (fd_t *fd, char *prefix)
{
        struct _fd_ctx *fd_ctx = NULL;
        xlator_t       *xl     = NULL;
        int             i      = 0;

        if ((fd == NULL) || (fd->_ctx == NULL)) {
                goto out;
        }

        LOCK (&fd->lock);                       <<<------------------- takes fd->lock while inode->lock is held: inverted order
        {
                if (fd->_ctx != NULL) {
                        fd_ctx = GF_CALLOC (fd->xl_count, sizeof (*fd_ctx),
                                            gf_common_mt_fd_ctx);
                        if (fd_ctx == NULL) {
                                goto unlock;
                        }

                        for (i = 0; i < fd->xl_count; i++) {
                                fd_ctx[i] = fd->_ctx[i];
                        }
                }
        }
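
The inversion above is a classic ABBA pattern: ob_wake_cbk() holds fd->lock and then needs inode->lock (via loc_wipe() inside ob_fd_free()), while the statedump path holds inode->lock (inode_dump()) and then needs fd->lock (fd_ctx_dump()). Below is a minimal, self-contained pthread sketch of the usual way to break such a cycle: detach the entry while the fd lock is held, and run the cleanup that needs the inode lock only after the fd lock has been released. All types and helper names here are simplified stand-ins for illustration; this is not the actual GlusterFS patch that shipped in glusterfs-3.8.4-5.

/* Stand-alone sketch: "detach under the fd lock, clean up outside it".
 * Hypothetical simplified types, not GlusterFS code. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct inode {
        pthread_mutex_t lock;
};

struct loc {
        struct inode *inode;
};

struct ob_fd {
        struct loc loc;
};

struct fd {
        pthread_mutex_t lock;
        struct ob_fd   *ob_fd;
};

/* Needs inode->lock, so it must not be called while fd->lock is held. */
static void
loc_wipe_sketch (struct loc *loc)
{
        struct inode *inode = loc->inode;

        pthread_mutex_lock (&inode->lock);
        loc->inode = NULL;
        pthread_mutex_unlock (&inode->lock);
}

static void
ob_fd_free_sketch (struct ob_fd *ob_fd)
{
        loc_wipe_sketch (&ob_fd->loc);          /* takes inode->lock */
        free (ob_fd);
}

/* Deadlock-prone shape (as in the report): the free runs inside fd->lock,
 * so this thread acquires inode->lock while holding fd->lock; it deadlocks
 * if another thread holds inode->lock and is waiting for fd->lock, which is
 * exactly what the statedump path does. */
void
wake_deadlock_prone (struct fd *fd)
{
        pthread_mutex_lock (&fd->lock);
        {
                struct ob_fd *ob_fd = fd->ob_fd;

                fd->ob_fd = NULL;
                ob_fd_free_sketch (ob_fd);      /* fd->lock + inode->lock: ABBA */
        }
        pthread_mutex_unlock (&fd->lock);
}

/* Fixed shape: only detach while fd->lock is held; free afterwards. */
void
wake_fixed (struct fd *fd)
{
        struct ob_fd *ob_fd = NULL;

        pthread_mutex_lock (&fd->lock);
        {
                ob_fd = fd->ob_fd;
                fd->ob_fd = NULL;
        }
        pthread_mutex_unlock (&fd->lock);

        if (ob_fd)
                ob_fd_free_sketch (ob_fd);      /* inode->lock taken with fd->lock dropped */
}

int
main (void)
{
        struct inode  inode;
        struct fd     fd;
        struct ob_fd *ob_fd = calloc (1, sizeof (*ob_fd));

        if (!ob_fd)
                return 1;

        pthread_mutex_init (&inode.lock, NULL);
        pthread_mutex_init (&fd.lock, NULL);
        ob_fd->loc.inode = &inode;
        fd.ob_fd = ob_fd;

        wake_fixed (&fd);
        printf ("ob_fd freed without holding fd->lock and inode->lock together\n");
        return 0;
}

With this ordering no thread ever holds both locks at once in the inverted order, so the statedump's inode->lock -> fd->lock path can no longer wait on a thread that is itself waiting for inode->lock. (Compile with cc -pthread sketch.c as a quick sanity check.)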
I have run this case for two days on my systemic setup and did not hit any hang/deadlock. Hence moving to verified. Tested version: 3.8.4-5

Since the problem described in this bug report should be resolved in a recent advisory, it has been closed with a resolution of ERRATA. For information on the advisory, and where to find the updated files, follow the link below. If the solution does not work for you, open a new bug report.

https://rhn.redhat.com/errata/RHSA-2017-0486.html