Bug 1074023 - list dir with more than N files results in Input/output error
Summary: list dir with more than N files results in Input/output error
Keywords:
Status: CLOSED CURRENTRELEASE
Alias: None
Product: GlusterFS
Classification: Community
Component: fuse
Version: mainline
Hardware: All
OS: Linux
unspecified
high
Target Milestone: ---
Assignee: Niels de Vos
QA Contact:
URL:
Whiteboard:
: 1085425 (view as bug list)
Depends On:
Blocks: 1089934
TreeView+ depends on / blocked
 
Reported: 2014-03-07 16:45 UTC by Luis Flores
Modified: 2014-07-11 19:17 UTC (History)
6 users (show)

Fixed In Version: glusterfs-3.5.1
Doc Type: Bug Fix
Doc Text:
Clone Of:
: 1089934 (view as bug list)
Environment:
Last Closed: 2014-07-11 19:17:40 UTC
Regression: ---
Mount Type: ---
Documentation: ---
CRM:
Verified Versions:
Embargoed:


Attachments (Terms of Use)
systemtap script for debugging readdir returning -EIO (1.86 KB, text/plain)
2014-03-09 12:38 UTC, Niels de Vos
no flags Details
Initial patch: write max PAGESIZE bytes to /dev/fuse (1.74 KB, patch)
2014-03-09 13:03 UTC, Niels de Vos
no flags Details | Diff
call send_fuse_data multiple times in readdir callback if neccessary (1.66 KB, patch)
2014-03-25 21:38 UTC, thomas higdon
no flags Details | Diff

Description Luis Flores 2014-03-07 16:45:21 UTC
Tried on Fedora 20 server and client, updated (on 2014/03/07), with bluster 3.4.2.

Create a Gluster filesystem (fails on distributed and replicated fs)
mount -t glusterfs <GLUSTER SHARE> /mnt
cd /mnt
mkdir test
d=1
while [ $d -le 24 ]; do touch test/$d; d=$(( $d + 1 )); done
ls test
   ls: reading directory test: Input/output error
cd /
umount /mnt
mount -t nfs <GLUSTER SHARE> /mnt
cd /mnt
ls test
   success
cd /
umount /mnt
mount -t glusterfs <GLUSTER SHARE> /mnt
cd /mnt
rm -f test/1
ls test
   success

The number of maximum files depends on the filename size.

Comment 1 Luis Flores 2014-03-07 16:47:07 UTC
glusters package version: 3.4.2-1.fc20
kernel version: 3.13.5-202.fc20.x86_64
fuse version: 2.9.3-2.fc20

Comment 2 Joe Julian 2014-03-07 16:59:40 UTC
Additional notes from the IRC channel:
* Client is an x86_64, server is an arm7hl (32-bit)
* Number of dirents before failure changes based on the length of the filenames, like it's hitting some buffer limit.
* Client and brick logs show no errors
* Can mount via nfs without issue

Comment 3 Luis Flores 2014-03-07 17:05:27 UTC
strace ls test
execve("/usr/bin/ls", ["ls", "test/"], [/* 22 vars */]) = 0
brk(0)                                  = 0x1807000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e20b000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=90898, ...}) = 0
mmap(NULL, 90898, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f487e1f4000
close(3)                                = 0
open("/lib64/libselinux.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240d@\223=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=144952, ...}) = 0
mmap(0x3d93400000, 2242712, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d93400000
mprotect(0x3d93421000, 2093056, PROT_NONE) = 0
mmap(0x3d93620000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x20000) = 0x3d93620000
mmap(0x3d93622000, 6296, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3d93622000
close(3)                                = 0
open("/lib64/libcap.so.2", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0 \26\300\251=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=21424, ...}) = 0
mmap(0x3da9c00000, 2114112, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3da9c00000
mprotect(0x3da9c04000, 2093056, PROT_NONE) = 0
mmap(0x3da9e03000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3000) = 0x3da9e03000
close(3)                                = 0
open("/lib64/libacl.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\200\37@\250=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=39200, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1f3000
mmap(0x3da8400000, 2130560, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3da8400000
mprotect(0x3da8407000, 2097152, PROT_NONE) = 0
mmap(0x3da8607000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x7000) = 0x3da8607000
close(3)                                = 0
open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0p\36\202\221=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2100672, ...}) = 0
mmap(0x3d91800000, 3924576, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d91800000
mprotect(0x3d919b4000, 2097152, PROT_NONE) = 0
mmap(0x3d91bb4000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1b4000) = 0x3d91bb4000
mmap(0x3d91bba000, 16992, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3d91bba000
close(3)                                = 0
open("/lib64/libpcre.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0p\27\0\223=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=421144, ...}) = 0
mmap(0x3d93000000, 2511368, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d93000000
mprotect(0x3d93065000, 2093056, PROT_NONE) = 0
mmap(0x3d93264000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x64000) = 0x3d93264000
close(3)                                = 0
open("/lib64/liblzma.so.5", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\3400\300\222=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=155400, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1f2000
mmap(0x3d92c00000, 2245240, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d92c00000
mprotect(0x3d92c24000, 2093056, PROT_NONE) = 0
mmap(0x3d92e23000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x23000) = 0x3d92e23000
close(3)                                = 0
open("/lib64/libdl.so.2", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\16\0\222=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=22440, ...}) = 0
mmap(0x3d92000000, 2109744, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d92000000
mprotect(0x3d92003000, 2093056, PROT_NONE) = 0
mmap(0x3d92202000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x3d92202000
close(3)                                = 0
open("/lib64/libattr.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\23\200\250=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=22160, ...}) = 0
mmap(0x3da8800000, 2113904, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3da8800000
mprotect(0x3da8804000, 2093056, PROT_NONE) = 0
mmap(0x3da8a03000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3000) = 0x3da8a03000
close(3)                                = 0
open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340m\300\221=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=150800, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1f1000
mmap(0x3d91c00000, 2213104, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d91c00000
mprotect(0x3d91c18000, 2093056, PROT_NONE) = 0
mmap(0x3d91e17000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x17000) = 0x3d91e17000
mmap(0x3d91e19000, 13552, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3d91e19000
close(3)                                = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1f0000
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1ee000
arch_prctl(ARCH_SET_FS, 0x7f487e1ee840) = 0
mprotect(0x3d91bb4000, 16384, PROT_READ) = 0
mprotect(0x3d91e17000, 4096, PROT_READ) = 0
mprotect(0x3da8a03000, 4096, PROT_READ) = 0
mprotect(0x3d92202000, 4096, PROT_READ) = 0
mprotect(0x3d92e23000, 4096, PROT_READ) = 0
mprotect(0x3d93264000, 4096, PROT_READ) = 0
mprotect(0x3da8607000, 4096, PROT_READ) = 0
mprotect(0x3da9e03000, 4096, PROT_READ) = 0
mprotect(0x3d93620000, 4096, PROT_READ) = 0
mprotect(0x61a000, 4096, PROT_READ)     = 0
mprotect(0x3d9161f000, 4096, PROT_READ) = 0
munmap(0x7f487e1f4000, 90898)           = 0
set_tid_address(0x7f487e1eeb10)         = 13298
set_robust_list(0x7f487e1eeb20, 24)     = 0
rt_sigaction(SIGRTMIN, {0x3d91c068c0, [], SA_RESTORER|SA_SIGINFO, 0x3d91c0f750}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {0x3d91c06950, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 0x3d91c0f750}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
statfs("/sys/fs/selinux", 0x7fff120969a0) = -1 ENOENT (No such file or directory)
statfs("/selinux", 0x7fff120969a0)      = -1 ENOENT (No such file or directory)
brk(0)                                  = 0x1807000
brk(0x1828000)                          = 0x1828000
open("/proc/filesystems", O_RDONLY)     = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e20a000
read(3, "nodev\tsysfs\nnodev\trootfs\nnodev\tr"..., 1024) = 390
read(3, "", 1024)                       = 0
close(3)                                = 0
munmap(0x7f487e20a000, 4096)            = 0
open("/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=106070960, ...}) = 0
mmap(NULL, 106070960, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f4877cc5000
close(3)                                = 0
ioctl(1, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, {B38400 opost isig icanon echo ...}) = 0
ioctl(1, TIOCGWINSZ, {ws_row=48, ws_col=164, ws_xpixel=0, ws_ypixel=0}) = 0
stat("test/", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
openat(AT_FDCWD, "test/", O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC) = 3
getdents(3, 0x180cc90, 32768)           = -1 EIO (Input/output error)
open("/usr/share/locale/locale.alias", O_RDONLY|O_CLOEXEC) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=2492, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e20a000
read(4, "# Locale name alias data base.\n#"..., 4096) = 2492
read(4, "", 4096)                       = 0
close(4)                                = 0
munmap(0x7f487e20a000, 4096)            = 0
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
write(2, "ls: ", 4ls: )                     = 4
write(2, "reading directory test/", 23reading directory test/) = 23
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
write(2, ": Input/output error", 20: Input/output error)    = 20
write(2, "\n", 1
)                       = 1
close(3)                                = 0
close(1)                                = 0
close(2)                                = 0
exit_group(2)                           = ?
+++ exited with 2 +++

Comment 4 Niels de Vos 2014-03-07 17:15:18 UTC
From the strace above, these are the most important lines:

stat("test/", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
openat(AT_FDCWD, "test/", O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC) = 3
getdents(3, 0x180cc90, 32768)           = -1 EIO (Input/output error)



Unfortunately, this happens to me too :-/

My environment:
- 32-bit armv7hl storage servers - glusterfs-3.4.2-1.fc20.armv7hl
- x86_64 workstation - glusterfs-3.4.0.57rhs-1.el6_5.x86_64

Mounting over glusterfs on the storage servers does not have a problem.
When mounting over glusterfs from the workstation, -EIO is returned.
When mounting over nfs from the workstation, all is working fine.

tcpdump/wireshark of the GlusterFS protocol when the error is seen, does not contain an error. This suggests that the problem is in the glusterfs-fuse client, in the kernel/fuse module or in the kernel/vfs.

Comment 5 Niels de Vos 2014-03-08 15:18:11 UTC
One way reproduce (same package versions as in comment #4):

1. install a Fedora 20 i386 Gluster server (hostname vm130-12)
2. create and start a volume with one brick
3. install a RHEL-6 x86_64 Gluster client (hostname vm130-11)
4. mount the volume on the client over NFS
   # mount -t nfs vm130-12:/bug-1074023 /mnt/nfs
5. populate the volume with some files
   # for I in {0..50} ; do touch /mnt/nfs/$(uuidgen) ; done
6. mount the volume over glusterfs
   # mount -t glusterfs vm130-12:/bug-1074023 /mnt/glusterfs
7. check for the contents over the glusterfs mountpoint
   # ls /mnt/glusterfs
   ls: reading directory /mnt/glusterfs: Input/output error


This seems to be a problem in the READDIRPLUS implementation. I can not reproduce the problem (yet) when I disable READDIRPLUS on the client side.

   # mount -t glusterfs -o use-readdirp=no vm130-12:/bug-1074023 /mnt/glusterfs

Comment 6 Niels de Vos 2014-03-09 12:38:30 UTC
Created attachment 872404 [details]
systemtap script for debugging readdir returning -EIO

Usage:
- written for RHEL6.5 with glusterfs-fuse-3.4.0.59rhs-1.el6_4.x86_64
- make sure to have the matching glusterfs+kernel -debuginfo packages installed
- make sure to have /usr/bin/stap available
- run ./bug-1074023.stp on the glusterfs-client (vm130-11)

When mounting from a 32-bit system:
[root@vm130-11 ~]# ./bug-1074023.stp
Starting probe, press CTRL+C to exit...
send_fuse_data: size = 24
send_fuse_data returns 0
send_fuse_data: size = 104
send_fuse_data returns 0
send_fuse_data: size = 16
send_fuse_data returns 0
send_fuse_data: size = 4160
fuse_dev_write returns -22
send_fuse_data returns 22
fuse_readdir:1231: err=-5
0xffffffff81196380 returns -5, stack follows:
Returning from:  0xffffffffa042e940 : fuse_readdir+0x0/0x5e0 [fuse]
Returning to  :  0xffffffff81196380
 0xffffffff81196509 (inexact)
 0xffffffff8100b229 (inexact)
 0xffffffff8100b288 (inexact)
^C...exiting!

When mounting from a 64-bit server:
[root@vm130-11 ~]# ./bug-1074023.stp
Starting probe, press CTRL+C to exit...
send_fuse_data: size = 24
send_fuse_data returns 0
send_fuse_data: size = 104
send_fuse_data returns 0
send_fuse_data: size = 16
send_fuse_data returns 0
send_fuse_data: size = 3808
send_fuse_data returns 0
fuse_readdir:1231: err=0
send_fuse_data: size = 104
send_fuse_data returns 0
send_fuse_data: size = 3808
send_fuse_data returns 0
fuse_readdir:1231: err=0
send_fuse_data: size = 104
send_fuse_data returns 0
send_fuse_data: size = 2496
send_fuse_data returns 0
fuse_readdir:1231: err=0
send_fuse_data: size = 104
send_fuse_data returns 0
send_fuse_data: size = 0
send_fuse_data returns 0
fuse_readdir:1231: err=0
^C...exiting!


Note:
64-bit: send_fuse_data: size = 3808 [several call,s 3808 bytes max]
32-bit: send_fuse_data: size = 4160 [4160 is bigger than PAGESIZE=4096]

This is very surprising! Why would a 32-bit server cause a bigger buffer than the 64-bit server?

Lets compare the replies that come in over the network.

Captured teo tcpdumps like this:
[root@vm130-11 ~]# tcpdump -i any -s 0 -w /tmp/bug-1074023_32bit.pcap \
    tcp and not port 22
[root@vm130-11 ~]# tcpdump -i any -s 0 -w /tmp/bug-1074023_64bit.pcap \
    tcp and not port 22


Filter only the READDIRP procedures, and we're only interested in the replies (rpc.msgtyp == 1). Combine this with an additional 'grep' to show only some of the lines that interest us:

$ tshark -r bug-1074023_32bit.pcap \
    -Y 'glusterfs.proc == READDIRP && rpc.msgtyp == 1' -V \
    | grep -e '^Frame' -e '^Remote Procedure' -e 'Fragment header' \
           -e 'Program' -e 'Procedure'
Frame 48: 1024 bytes on wire (8192 bits), 1024 bytes captured (8192 bits)
Remote Procedure Call, Type:Reply XID:0x0000000c
    Fragment header: Last fragment, 3848 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]

$ tshark -r /tmp/bug-1074023_64bit.pcap \
    -Y 'glusterfs.proc == READDIRP && rpc.msgtyp == 1' -V \
    | grep -e '^Frame' -e '^Remote Procedure' -e 'Fragment header' \
           -e 'Program' -e 'Procedure'
Frame 48: 704 bytes on wire (5632 bits), 704 bytes captured (5632 bits)
Remote Procedure Call, Type:Reply XID:0x0000000c
    Fragment header: Last fragment, 3528 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]
Frame 55: 704 bytes on wire (5632 bits), 704 bytes captured (5632 bits)
Remote Procedure Call, Type:Reply XID:0x0000000e
    Fragment header: Last fragment, 3528 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]
Frame 60: 952 bytes on wire (7616 bits), 952 bytes captured (7616 bits)
Remote Procedure Call, Type:Reply XID:0x00000010
    Fragment header: Last fragment, 2328 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]
Frame 65: 112 bytes on wire (896 bits), 112 bytes captured (896 bits)
Remote Procedure Call, Type:Reply XID:0x00000012
    Fragment header: Last fragment, 40 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]


Conclusions:
The 64-bit Gluster server returns multiple READDIRP replies, and the result always fits in one PAGESIZE. The 32-bit Gluster server sends a first reply which does not fit in the PAGESIZE.

In the glusterfs-fuse client, a writev() to /dev/fuse is done (through send_fuse_data()). I currently suspect that the 'struct iovec' passed to writev() is limited to buffers of PAGESIZE.

The underlying cause is probably related to the aligning of the structures returned by the local/posix readdir() call on the Gluster server. The 32-bit architecture can align the structure more efficiently than the 64-bit architecture (bigger gaps between the dentries). When receiving the READDIRP response, the structures get converted and aligned one-by-one (the network protocol does not use aligning, it's a stream).

Checking in the tcpdump again:
$ tshark -r /tmp/bug-1074023_32bit.pcap \
    -Y 'glusterfs.proc == READDIRP && rpc.msgtyp == 1' -V \
    | grep -e '^Frame' -e '^Remote Procedure' -e 'Fragment header' \
           -e 'Program' -e 'Procedure' -e 'Entries returned'
Frame 48: 1056 bytes on wire (8448 bits), 1056 bytes captured (8448 bits)
Remote Procedure Call, Type:Reply XID:0x0000000c
    Fragment header: Last fragment, 3880 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]
    Entries returned: 22

$ tshark -r /tmp/bug-1074023_64bit.pcap \
    -Y 'glusterfs.proc == READDIRP && rpc.msgtyp == 1' -V \
    | grep -e '^Frame' -e '^Remote Procedure' -e 'Fragment header' \
           -e 'Program' -e 'Procedure' -e 'Entries returned'
Frame 48: 704 bytes on wire (5632 bits), 704 bytes captured (5632 bits)
Remote Procedure Call, Type:Reply XID:0x0000000c
    Fragment header: Last fragment, 3528 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]
    Entries returned: 20
...

The 32-bit Gluster server returns 22 dentries in the 1st READDIPR, where the 64-bit Gluster server returns only 20. For both the READDIRP Call contains a maximum size that the Reply may contain (max size is set to 4096).


Planned solution to test:
PAGESIZE and alignments can differ per architecture. There is no reason to make the Gluster servers aware of these details from the client side. Gluster should send big replies where possible, which is a little more efficient than sending multiple smaller replies.

Splitting the list of 'struct iovec' that is passed to writev() into buffers with a maximum of PAGESIZE might work. This would also make it possible to receive more dentries on any architecture.

Comment 7 Niels de Vos 2014-03-09 13:03:20 UTC
Created attachment 872408 [details]
Initial patch: write max PAGESIZE bytes to /dev/fuse

This works for me. I'll do some more verifications later and probably propose it for inclusion over the next few days.

Comment 8 Anand Avati 2014-03-14 15:28:33 UTC
REVIEW: http://review.gluster.org/7278 (fuse: support bigger than PAGESIZE writes to /dev/fuse) posted (#1) for review on master by Niels de Vos (ndevos)

Comment 9 Anand Avati 2014-03-24 21:35:50 UTC
REVIEW: http://review.gluster.org/7278 (fuse: support bigger than PAGESIZE writes to /dev/fuse) posted (#2) for review on master by Niels de Vos (ndevos)

Comment 10 thomas higdon 2014-03-25 21:38:05 UTC
Created attachment 878702 [details]
call send_fuse_data multiple times in readdir callback if neccessary

ndevos's patch does not work for me. The attached patch does solve at least my immediate problem.

Comment 11 Niels de Vos 2014-03-27 20:11:21 UTC
(In reply to thomas higdon from comment #10)
> Created attachment 878702 [details]
> call send_fuse_data multiple times in readdir callback if neccessary

After re-checking, I do not think this is a correct fix. Without testing it myself, I expect it to result in missing directory entries.

My initial patch indeed does not correctly address the problem either. It seems that a fuse-request for readdir(p) includes a maximum size that the response may contain. Any bigger responses will return in writev()-to-"/dev/fuse" errors, which are relatively silently ignored (some log-messages only, no errors returned).

The next patch should solve the issue for readdir(p). Other FOPS could be affected in a similar way, so some logging was added to help in debugging future issues.

Comment 12 Anand Avati 2014-03-27 20:16:29 UTC
REVIEW: http://review.gluster.org/7278 (fuse: prevent READDIR(P) from writing to much data to /dev/fuse) posted (#3) for review on master by Niels de Vos (ndevos)

Comment 13 Anand Avati 2014-03-31 13:03:34 UTC
REVIEW: http://review.gluster.org/7278 (fuse: prevent READDIR(P) from writing to much data to /dev/fuse) posted (#4) for review on master by Niels de Vos (ndevos)

Comment 14 Niels de Vos 2014-04-09 11:44:11 UTC
*** Bug 1085425 has been marked as a duplicate of this bug. ***

Comment 15 Anand Avati 2014-04-20 15:44:46 UTC
REVIEW: http://review.gluster.org/7278 (fuse: prevent READDIR(P) from writing to much data to /dev/fuse) posted (#5) for review on master by Niels de Vos (ndevos)

Comment 16 Anand Avati 2014-04-21 17:25:27 UTC
COMMIT: http://review.gluster.org/7278 committed in master by Anand Avati (avati) 
------
commit 20e317011af7c0f075819bf0648b225f6dc42350
Author: Niels de Vos <ndevos>
Date:   Thu Mar 27 20:34:44 2014 +0100

    fuse: prevent READDIR(P) from writing to much data to /dev/fuse
    
    In an environment with mixed architectures (32-bit servers, 64-bit
    client), it is possible that the on-wire Reply on a READDIR(P) procedure
    contains more direntries than the client can fit in the maximum size
    that the fuse-request indicated.
    
    A direntry is a dynamically sized structure, because the structure
    contains the name of the entry. The client sends a maximum size in the
    READDIR(P) Call to the server, and the server uses this size to limit
    the number of direntries to return. In case the server can pack more
    direntries in the requested maximum size (due to alignment differences
    between the architectures), it can happen that the client unpacks the
    list of direntries into a buffer that exceeds the maximum size that was
    given in the initial fuse-request.
    
    This change introduces a check for the maximum requested size of the
    fuse-response in fuse_readdir_cbk() and fuse_readdirp_cbk(). When the
    conversion from gluster-direntries to the fuse-direntry format takes
    place, the maximum size is checked, and the 'extra' direntries are
    discarded. The next readdir()/getdents() that is done, will fetch the
    just discarded direntries again.
    
    In addition to this bugfix, some extra logging in send_fuse_iov() and
    send_fuse_data() has been added to help diagnosing similar issues.
    
    Change-Id: If2eecfcdf9c248f3820035601446d2c89ff9d1a1
    BUG: 1074023
    Signed-off-by: Niels de Vos <ndevos>
    Reviewed-on: http://review.gluster.org/7278
    Tested-by: Gluster Build System <jenkins.com>
    Reviewed-by: Xavier Hernandez <xhernandez>
    Reviewed-by: Anand Avati <avati>

Comment 17 Anand Avati 2014-04-24 11:50:19 UTC
REVIEW: http://review.gluster.org/7547 (fuse: minor improvements for readdir(plus)) posted (#1) for review on master by Niels de Vos (ndevos)

Comment 18 Anand Avati 2014-04-27 17:51:17 UTC
COMMIT: http://review.gluster.org/7547 committed in master by Anand Avati (avati) 
------
commit 76ab97169f63da78c9e83daf040d7b09766497cf
Author: Niels de Vos <ndevos>
Date:   Thu Apr 24 13:38:31 2014 +0200

    fuse: minor improvements for readdir(plus)
    
    Instead of using 'int' for the sizes, use a 'size_t' as it is more
    correct. Save the size of a fuse_dirent in a temporary variable so that
    strlen() on the filename is called fewer times.
    
    Also correcting some typos in comments.
    
    Change-Id: Ic62d9d729a86a1a6a53ed1354fce153bac01d860
    BUG: 1074023
    Reported-by: Kaleb S. KEITHLEY <kkeithle>
    Signed-off-by: Niels de Vos <ndevos>
    Reviewed-on: http://review.gluster.org/7547
    Tested-by: Gluster Build System <jenkins.com>
    Reviewed-by: Anand Avati <avati>


Note You need to log in before you can comment on or make changes to this bug.