DescriptionMarcel Holtmann
2007-06-26 17:24:57 UTC
The root cause of this problem is not specifically that the inode pointer is
null, it's actually that a sysfs_dirent was "touched" by someone else while we
were working on behalf of it in sysfs_readdir(). At the time of the crash, the
s_dentry pointer in the sysfs_dirent struct has actually been zeroed. The code
specifically tests to ensure this cannot be the case before trying to walk the
s_dentry to the inode to the inode number.
PID: 6185 TASK: 10151a207f0 CPU: 1 COMMAND: "java"
#0 [1014d819c30] panic at ffffffff80137977
#1 [1014d819ce8] oops_end at ffffffff80111aec
#2 [1014d819d10] oops_end at ffffffff80111b07
#3 [1014d819d20] do_page_fault at ffffffff80124148
#4 [1014d819d60] dput at ffffffff8018f019
#5 [1014d819d80] link_path_walk at ffffffff80186dc0
#6 [1014d819e00] error_exit at ffffffff80110d91
[exception RIP: sysfs_readdir+333]
RIP: ffffffff801b3581 RSP: 000001014d819eb8 RFLAGS: 00010202
RAX: 0000000000000000 RBX: 000001009fd406c8 RCX: 0000000000000003
RDX: 000001009fd40688 RSI: ffffffff8033c9f6 RDI: ffffffff803245b3
RBP: 000001009fd406c0 R8: 000000000ca8d3d4 R9: 0000000000000005
R10: 0000002a95da61e8 R11: 0000000000000246 R12: ffffffff803245af
R13: 000001014a1e0548 R14: 0000010046662580 R15: 000001009fd40700
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [1014d819eb0] sysfs_readdir at ffffffff801b355c
#8 [1014d819f00] vfs_readdir at ffffffff8018aef7
#9 [1014d819f30] sys_getdents64 at ffffffff8018b2e8
#10 [1014d819f50] sys_fcntl at ffffffff8018a675
#11 [1014d819f80] system_call at ffffffff8011026a
RIP: 0000003f6d68be3e RSP: 0000000041b6a7b0 RFLAGS: 00010283
RAX: 00000000000000d9 RBX: ffffffff8011026a RCX: 0000000000000000
RDX: 0000000000001000 RSI: 0000002b23ab3188 RDI: 000000000000000f
RBP: 0000002b1f3b4360 R8: 0000002b1d7efd28 R9: 0000002b1d7efcf0
R10: 0000002a95da61e8 R11: 0000000000000246 R12: 0000002b1dfeccc0
R13: 0000000041b6a760 R14: 0000002b23ab3150 R15: 0000000000000001
ORIG_RAX: 00000000000000d9 CS: 0033 SS: 002b
crash>
Source code of failing routine
static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
struct dentry *dentry = filp->f_dentry;
struct sysfs_dirent * parent_sd = dentry->d_fsdata;
struct sysfs_dirent *cursor = filp->private_data;
struct list_head *p, *q = &cursor->s_sibling;
ino_t ino;
int i = filp->f_pos;
switch (i) {
case 0:
ino = dentry->d_inode->i_ino;
if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
break;
filp->f_pos++;
i++;
/* fallthrough */
case 1:
ino = parent_ino(dentry);
if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
break;
filp->f_pos++;
i++;
/* fallthrough */
default:
if (filp->f_pos == 2) {
list_del(q);
list_add(q, &parent_sd->s_children);
}
for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
struct sysfs_dirent *next;
const char * name;
int len;
next = list_entry(p, struct sysfs_dirent,
s_sibling);
if (!next->s_element)
continue;
name = sysfs_get_name(next);
len = strlen(name);
if (next->s_dentry)
ino = next->s_dentry->d_inode->i_ino;
else
ino = iunique(sysfs_sb, 2);
if (filldir(dirent, name, len, filp->f_pos, ino,
dt_type(next)) < 0)
return 0;
list_del(q);
list_add(q, p);
p = q;
filp->f_pos++;
}
}
return 0;
}
Here's the portion of disassembled code:
0xffffffff801b3538 : lea 0x18(%r15),%rax
0xffffffff801b353c : cmp %rax,%rbx
0xffffffff801b353f : je 0xffffffff801b35f9
0xffffffff801b3545 : lea 0xfffffffffffffff8(%rbx),%rbp
0xffffffff801b3549 : cmpq $0x0,0x28(%rbp)
0xffffffff801b354e : je 0xffffffff801b35f1
0xffffffff801b3554 : mov %rbp,%rdi
0xffffffff801b3557 : callq 0xffffffff801b210a
0xffffffff801b355c : cld
0xffffffff801b355d : mov %rax,%r12
0xffffffff801b3560 : or $0xffffffffffffffff,%rcx
0xffffffff801b3564 : xor %eax,%eax
0xffffffff801b3566 : mov %r12,%rdi
0xffffffff801b3569 : repnz scas %es:(%edi),%al
0xffffffff801b356b : not %rcx
0xffffffff801b356e : dec %ecx
0xffffffff801b3570 : mov %ecx,0x4(%rsp)
0xffffffff801b3574 : mov 0x38(%rbp),%rax
0xffffffff801b3578 : test %rax,%rax
0xffffffff801b357b : je 0xffffffff801b3587
0xffffffff801b357d : mov 0x10(%rax),%rax
0xffffffff801b3581 : mov 0x30(%rax),%rax
0xffffffff801b3585 : jmp 0xffffffff801b3598
0xffffffff801b3587 : mov 3021882(%rip),%rdi # 0xffffffff804951
c8
0xffffffff801b358e : mov $0x2,%esi
0xffffffff801b3593 : callq 0xffffffff80191f99
At +320 we can tell that 0x38+%rbp was stored into%rax
Then we test is %rax is null
A couple of instructions later we see a call to iunique.
The source code shows us one call to iunique:
name = sysfs_get_name(next);
len = strlen(name);
if (next->s_dentry)
ino = next->s_dentry->d_inode->i_ino;
else
ino = iunique(sysfs_sb, 2);
The 'if' test the next->s_dentry for true (not null). That matches
the test at +324. So the 2 x mov's look like the instruction:
ino = next->s_dentry->d_inode->i_ino;
To verify if %rax is 0x38 off %rbp then that should mean that
%rbp is next
+0x38 is the s_dentry offset within the structure pointed to by next
A few lines earlier in the source code we can see next defined:
struct sysfs_dirent *next;
shows the structure of sysfs_direct. Hex 38 is decimal 56
struct sysfs_dirent {
[0] atomic_t s_count;
[8] struct list_head s_sibling;
[24] struct list_head s_children;
[40] void *s_element;
[48] int s_type;
[52] umode_t s_mode;
[56] struct dentry *s_dentry; <============
So now we have the confirmation that the disassembled code is
executing the source code we found.
Therefore, lets now display the structure pointed to by %rbp using
the address we have in the stack for that register:
crash> struct sysfs_dirent 1009fd406c0
struct sysfs_dirent {
s_count = {
counter = 1
},
s_sibling = {
next = 0x1009fd40718,
prev = 0x1014a1e0548
},
s_children = {
next = 0x1009fd406d8,
prev = 0x1009fd406d8
},
s_element = 0xffffffff80404a20,
s_type = 4,
s_mode = 33060,
s_dentry = 0x0
}
The s_dentry is null
Now the problem. The disassembled code shows:
0xffffffff801b3574 : mov 0x38(%rbp)
,%rax
0xffffffff801b3578 : test %rax,%rax
0xffffffff801b357b : je 0xffffffff801b3587
Well if the s_dentry is null, we should have taken the jump but
we did not. This suggests that the sysfs_dentry had a valid entry
when we started to execute this code and someone cleared it after.
So the problem is we have a dentry that has a nulled inode pointer but yet the
s_dentry pointer itself in the sysfs_dirent struct is now also zero which
is specifically tested in the code to ensure this cannot happen. This therefore
suggests someone else touched the structure while we accessing it.
An advisory has been issued which should help the problem
described in this bug report. This report is therefore being
closed with a resolution of ERRATA. For more information
on the solution and/or where to find the updated files,
please follow the link below. You may reopen this bug report
if the solution does not work for you.
http://rhn.redhat.com/errata/RHSA-2008-0089.html
The root cause of this problem is not specifically that the inode pointer is null, it's actually that a sysfs_dirent was "touched" by someone else while we were working on behalf of it in sysfs_readdir(). At the time of the crash, the s_dentry pointer in the sysfs_dirent struct has actually been zeroed. The code specifically tests to ensure this cannot be the case before trying to walk the s_dentry to the inode to the inode number. PID: 6185 TASK: 10151a207f0 CPU: 1 COMMAND: "java" #0 [1014d819c30] panic at ffffffff80137977 #1 [1014d819ce8] oops_end at ffffffff80111aec #2 [1014d819d10] oops_end at ffffffff80111b07 #3 [1014d819d20] do_page_fault at ffffffff80124148 #4 [1014d819d60] dput at ffffffff8018f019 #5 [1014d819d80] link_path_walk at ffffffff80186dc0 #6 [1014d819e00] error_exit at ffffffff80110d91 [exception RIP: sysfs_readdir+333] RIP: ffffffff801b3581 RSP: 000001014d819eb8 RFLAGS: 00010202 RAX: 0000000000000000 RBX: 000001009fd406c8 RCX: 0000000000000003 RDX: 000001009fd40688 RSI: ffffffff8033c9f6 RDI: ffffffff803245b3 RBP: 000001009fd406c0 R8: 000000000ca8d3d4 R9: 0000000000000005 R10: 0000002a95da61e8 R11: 0000000000000246 R12: ffffffff803245af R13: 000001014a1e0548 R14: 0000010046662580 R15: 000001009fd40700 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [1014d819eb0] sysfs_readdir at ffffffff801b355c #8 [1014d819f00] vfs_readdir at ffffffff8018aef7 #9 [1014d819f30] sys_getdents64 at ffffffff8018b2e8 #10 [1014d819f50] sys_fcntl at ffffffff8018a675 #11 [1014d819f80] system_call at ffffffff8011026a RIP: 0000003f6d68be3e RSP: 0000000041b6a7b0 RFLAGS: 00010283 RAX: 00000000000000d9 RBX: ffffffff8011026a RCX: 0000000000000000 RDX: 0000000000001000 RSI: 0000002b23ab3188 RDI: 000000000000000f RBP: 0000002b1f3b4360 R8: 0000002b1d7efd28 R9: 0000002b1d7efcf0 R10: 0000002a95da61e8 R11: 0000000000000246 R12: 0000002b1dfeccc0 R13: 0000000041b6a760 R14: 0000002b23ab3150 R15: 0000000000000001 ORIG_RAX: 00000000000000d9 CS: 0033 SS: 002b crash> Source code of failing routine static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) { struct dentry *dentry = filp->f_dentry; struct sysfs_dirent * parent_sd = dentry->d_fsdata; struct sysfs_dirent *cursor = filp->private_data; struct list_head *p, *q = &cursor->s_sibling; ino_t ino; int i = filp->f_pos; switch (i) { case 0: ino = dentry->d_inode->i_ino; if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) break; filp->f_pos++; i++; /* fallthrough */ case 1: ino = parent_ino(dentry); if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) break; filp->f_pos++; i++; /* fallthrough */ default: if (filp->f_pos == 2) { list_del(q); list_add(q, &parent_sd->s_children); } for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct sysfs_dirent *next; const char * name; int len; next = list_entry(p, struct sysfs_dirent, s_sibling); if (!next->s_element) continue; name = sysfs_get_name(next); len = strlen(name); if (next->s_dentry) ino = next->s_dentry->d_inode->i_ino; else ino = iunique(sysfs_sb, 2); if (filldir(dirent, name, len, filp->f_pos, ino, dt_type(next)) < 0) return 0; list_del(q); list_add(q, p); p = q; filp->f_pos++; } } return 0; } Here's the portion of disassembled code: 0xffffffff801b3538 : lea 0x18(%r15),%rax 0xffffffff801b353c : cmp %rax,%rbx 0xffffffff801b353f : je 0xffffffff801b35f9 0xffffffff801b3545 : lea 0xfffffffffffffff8(%rbx),%rbp 0xffffffff801b3549 : cmpq $0x0,0x28(%rbp) 0xffffffff801b354e : je 0xffffffff801b35f1 0xffffffff801b3554 : mov %rbp,%rdi 0xffffffff801b3557 : callq 0xffffffff801b210a 0xffffffff801b355c : cld 0xffffffff801b355d : mov %rax,%r12 0xffffffff801b3560 : or $0xffffffffffffffff,%rcx 0xffffffff801b3564 : xor %eax,%eax 0xffffffff801b3566 : mov %r12,%rdi 0xffffffff801b3569 : repnz scas %es:(%edi),%al 0xffffffff801b356b : not %rcx 0xffffffff801b356e : dec %ecx 0xffffffff801b3570 : mov %ecx,0x4(%rsp) 0xffffffff801b3574 : mov 0x38(%rbp),%rax 0xffffffff801b3578 : test %rax,%rax 0xffffffff801b357b : je 0xffffffff801b3587 0xffffffff801b357d : mov 0x10(%rax),%rax 0xffffffff801b3581 : mov 0x30(%rax),%rax 0xffffffff801b3585 : jmp 0xffffffff801b3598 0xffffffff801b3587 : mov 3021882(%rip),%rdi # 0xffffffff804951 c8 0xffffffff801b358e : mov $0x2,%esi 0xffffffff801b3593 : callq 0xffffffff80191f99 At +320 we can tell that 0x38+%rbp was stored into%rax Then we test is %rax is null A couple of instructions later we see a call to iunique. The source code shows us one call to iunique: name = sysfs_get_name(next); len = strlen(name); if (next->s_dentry) ino = next->s_dentry->d_inode->i_ino; else ino = iunique(sysfs_sb, 2); The 'if' test the next->s_dentry for true (not null). That matches the test at +324. So the 2 x mov's look like the instruction: ino = next->s_dentry->d_inode->i_ino; To verify if %rax is 0x38 off %rbp then that should mean that %rbp is next +0x38 is the s_dentry offset within the structure pointed to by next A few lines earlier in the source code we can see next defined: struct sysfs_dirent *next; shows the structure of sysfs_direct. Hex 38 is decimal 56 struct sysfs_dirent { [0] atomic_t s_count; [8] struct list_head s_sibling; [24] struct list_head s_children; [40] void *s_element; [48] int s_type; [52] umode_t s_mode; [56] struct dentry *s_dentry; <============ So now we have the confirmation that the disassembled code is executing the source code we found. Therefore, lets now display the structure pointed to by %rbp using the address we have in the stack for that register: crash> struct sysfs_dirent 1009fd406c0 struct sysfs_dirent { s_count = { counter = 1 }, s_sibling = { next = 0x1009fd40718, prev = 0x1014a1e0548 }, s_children = { next = 0x1009fd406d8, prev = 0x1009fd406d8 }, s_element = 0xffffffff80404a20, s_type = 4, s_mode = 33060, s_dentry = 0x0 } The s_dentry is null Now the problem. The disassembled code shows: 0xffffffff801b3574 : mov 0x38(%rbp) ,%rax 0xffffffff801b3578 : test %rax,%rax 0xffffffff801b357b : je 0xffffffff801b3587 Well if the s_dentry is null, we should have taken the jump but we did not. This suggests that the sysfs_dentry had a valid entry when we started to execute this code and someone cleared it after. So the problem is we have a dentry that has a nulled inode pointer but yet the s_dentry pointer itself in the sysfs_dirent struct is now also zero which is specifically tested in the code to ensure this cannot happen. This therefore suggests someone else touched the structure while we accessing it.