First, compile the following program: #include <stdio.h> #include <dlfcn.h> int main(int argc, char *argv[]) { void *libcuda_handle = NULL; void *libudev_handle = NULL; libcuda_handle = dlopen("libcuda.so", RTLD_NOW); if (!libcuda_handle) { fprintf(stderr, "Could not open libcuda.so - %s\n", dlerror()); } libudev_handle = dlopen("libudev.so", RTLD_NOW); if (!libudev_handle) { fprintf(stderr, "Could not open libudev - %s\n", dlerror()); } return 0; } Then run sudo mv /usr/lib64/libnvidia-fatbinaryloader.so.384.90 /usr/lib64/libnvidia-fatbinaryloader.so.384.90-save so that it is out of the way. Loading /usr/lib64/libcuda.so.1 triggers a load of its dependency librt.so.1. librt.so.1 is mapped into memory and added to the link maps, but does not yet get relocations applied. libcuda's other dependency, libnvidia-fatbinaryloader.so.384.90, is not found because we moved it out of the way. libcuda.so is dlclose'd, but librt.so.1 is left loaded and unrelocated. The test then dlopens "libudev.so". libudev.so loads and tries to run the library constructors for its dependencies, including librt.so.1. The library constructor for librt.so.1 segfaults because librt.so.1 was never relocated. 
[ben@Mustang dl-bug]$ sudo mv /usr/lib64/libnvidia-fatbinaryloader.so.384.90 /usr/lib64/libnvidia-fatbinaryloader.so.384.90-save [sudo] password for ben: [ben@Mustang dl-bug]$ ./a.out Could not open libcuda.so - libnvidia-fatbinaryloader.so.384.90: cannot open shared object file: No such file or directory ./a.out: Relink `/lib64/libudev.so' with `/lib64/librt.so.1' for IFUNC symbol `clock_gettime' Segmentation fault (core dumped) [ben@Mustang dl-bug]$ sudo mv /usr/lib64/libnvidia-fatbinaryloader.so.384.90-save /usr/lib64/libnvidia-fatbinaryloader.so.384.90 [ben@Mustang dl-bug]$ ./a.out [ben@Mustang dl-bug]$ You can see that librt.so is loaded as part of the libcuda dlopen: open("/lib64/libcuda.so", O_RDONLY|O_CLOEXEC) = 3 mmap(NULL, 15192024, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7a98565000 mmap(0x7f7a99284000, 1376256, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0xb1f000) = 0x7f7a99284000 mmap(0x7f7a993d4000, 57304, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f7a993d4000 open("/lib64/libm.so.6", O_RDONLY|O_CLOEXEC) = 3 mmap(NULL, 3231896, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7a9824f000 mmap(0x7f7a98563000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x114000) = 0x7f7a98563000 open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3 mmap(NULL, 2220552, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7a98030000 mmap(0x7f7a98249000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x19000) = 0x7f7a98249000 mmap(0x7f7a9824b000, 12808, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f7a9824b000 open("/lib64/librt.so.1", O_RDONLY|O_CLOEXEC) = 3 mmap(NULL, 2128384, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7a97e28000 mmap(0x7f7a9802e000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x6000) = 0x7f7a9802e000 open("/lib64/libnvidia-fatbinaryloader.so.384.90", 
O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("/lib64/tls/x86_64/libnvidia-fatbinaryloader.so.384.90", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("/lib64/tls/libnvidia-fatbinaryloader.so.384.90", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("/lib64/x86_64/libnvidia-fatbinaryloader.so.384.90", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("/lib64/libnvidia-fatbinaryloader.so.384.90", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("/usr/lib64/tls/x86_64/libnvidia-fatbinaryloader.so.384.90", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("/usr/lib64/tls/libnvidia-fatbinaryloader.so.384.90", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("/usr/lib64/x86_64/libnvidia-fatbinaryloader.so.384.90", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("/usr/lib64/libnvidia-fatbinaryloader.so.384.90", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) write(2, "Could not open libcuda.so - libn"..., 123) = 123 open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3 mmap(NULL, 123176, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f7a99bbc000 open("/lib64/tls/libudev.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("/lib64/libudev.so", O_RDONLY|O_CLOEXEC) = 3 mmap(NULL, 139777, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7a99b96000 mmap(0x7f7a99bb6000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1f000) = 0x7f7a99bb6000 mmap(0x7f7a99bb8000, 513, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f7a99bb8000 open("/lib64/libresolv.so.2", O_RDONLY|O_CLOEXEC) = 3 mmap(NULL, 2197696, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7a991c9000 mmap(0x7f7a993de000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x15000) = 0x7f7a993de000 mmap(0x7f7a993e0000, 6336, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 
0x7f7a993e0000 open("/lib64/libselinux.so.1", O_RDONLY|O_CLOEXEC) = 3 mmap(NULL, 2258128, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7a98fa1000 mmap(0x7f7a991c5000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x24000) = 0x7f7a991c5000 mmap(0x7f7a991c7000, 5328, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f7a991c7000 open("/lib64/libcap.so.2", O_RDONLY|O_CLOEXEC) = 3 mmap(NULL, 2113848, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7a98d9c000 mmap(0x7f7a98f9f000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3000) = 0x7f7a98f9f000 open("/lib64/libgcc_s.so.1", O_RDONLY|O_CLOEXEC) = 3 mmap(NULL, 2188336, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7a98b85000 mmap(0x7f7a98d9a000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x15000) = 0x7f7a98d9a000 open("/lib64/libpcre.so.1", O_RDONLY|O_CLOEXEC) = 3 mmap(NULL, 2564360, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f7a98912000 mmap(0x7f7a98b83000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x71000) = 0x7f7a98b83000 and you can see that the library is not reloaded and the relocations are not being done by following through LD_DEBUG [ben@Mustang dl-bug]$ egrep "find lib|reloc" foo 8067: find library=libdl.so.2 [0]; searching 8067: find library=libc.so.6 [0]; searching 8067: relocation processing: /lib64/libc.so.6 8067: relocation processing: /lib64/libdl.so.2 8067: relocation processing: ./a.out (lazy) 8067: relocation processing: /lib64/ld-linux-x86-64.so.2 8067: find library=libcuda.so [0]; searching 8067: find library=libm.so.6 [0]; searching 8067: find library=libpthread.so.0 [0]; searching 8067: find library=librt.so.1 [0]; searching 8067: find library=libnvidia-fatbinaryloader.so.384.90 [0]; searching 8067: find library=libudev.so [0]; searching 8067: find library=libresolv.so.2 [0]; searching 8067: find library=libselinux.so.1 [0]; 
searching 8067: find library=libcap.so.2 [0]; searching 8067: find library=libgcc_s.so.1 [0]; searching 8067: find library=libpcre.so.1 [0]; searching 8067: relocation processing: /lib64/libpcre.so.1 8067: relocation processing: /lib64/libgcc_s.so.1 8067: relocation processing: /lib64/libcap.so.2 8067: relocation processing: /lib64/libselinux.so.1 8067: relocation processing: /lib64/libresolv.so.2 8067: relocation processing: /lib64/libudev.so Note that there is no "relocation processing" line for /lib64/librt.so.1, even though it was found and mapped during the failed libcuda.so load. It seems to me that one of two things should happen: a) If the dynamic linker fails to load a library for whatever reason, it should free up the address space and remove the library AND its dependencies from the address space. This seems like the preferred option, since it isn't clear that the library's dependencies will ever be needed. b) If we decide that it is worthwhile to keep the library mapped into the address space, then we need to remember that it hasn't been relocated yet and make sure that we fix up the relocations before trying to use it.
This problem also exists in F26 and so it is likely an upstream problem as well.
(In reply to Ben Woodard from comment #2) > This problem also exists in F26 and so it is likely an upstream problem as > well. I'm surprised this fails like this. Can you create a smaller example that I can use to reproduce this on RHEL7? That way we can talk about the smaller example and exclude any libudev-specific issues.
Created attachment 1336898 [details] non-functioning attempt at a reproducer
(In reply to Ben Woodard from comment #5) > Created attachment 1336898 [details] > non-functioning attempt at a reproducer Please also attach all the shared objects that are involved in the working reproducer, and a shell script to run them in the failure mode. That way we have both sides of the equation. I'll look at the original objects and see what's unique about them.
Created attachment 1336924 [details] self contained reproducer Just untar this and then cd into dl-repo and run the script ./runme.sh
I can reproduce a SIGSEGV. Program received signal SIGSEGV, Segmentation fault. 0x00007ffff6474b8b in __pthread_initialize_minimal_internal () from ./libpthread.so.0 (gdb) bt #0 0x00007ffff6474b8b in __pthread_initialize_minimal_internal () from ./libpthread.so.0 #1 0x00007ffff64745d1 in _init () from ./libpthread.so.0 #2 0x00007ffff7fd1f60 in ?? () from ./libudev.so #3 0x00007ffff7de65da in call_init (l=0x7ffff7fffd60, argc=1, argv=0x7fffffffdcc0, env=0x7fffffffdcd0) at dl-init.c:58 #4 0x00007ffff7de6795 in call_init (env=0x7fffffffdcd0, argv=0x7fffffffdcc0, argc=1, l=<optimized out>) at dl-init.c:103 #5 _dl_init (main_map=main_map@entry=0x7ffff7fff280, argc=1, argv=0x7fffffffdcc0, env=0x7fffffffdcd0) at dl-init.c:86 #6 0x00007ffff7dea9ca in dl_open_worker (a=a@entry=0x7fffffffd960) at dl-open.c:562 #7 0x00007ffff794926c in __GI__dl_catch_exception (exception=0x7fffffffd940, operate=0x7ffff7dea660 <dl_open_worker>, args=0x7fffffffd960) at dl-error-skeleton.c:198 #8 0x00007ffff7dea2ba in _dl_open (file=0x400740 "libudev.so", mode=-2147483646, caller_dlopen=0x400640, nsid=<optimized out>, argc=1, argv=<optimized out>, env=0x7fffffffdcd0) at dl-open.c:645 #9 0x00007ffff7bd3f76 in dlopen_doit (a=a@entry=0x7fffffffdb90) at dlopen.c:66 #10 0x00007ffff794926c in __GI__dl_catch_exception ( exception=exception@entry=0x7fffffffdb30, operate=0x7ffff7bd3f20 <dlopen_doit>, args=0x7fffffffdb90) at dl-error-skeleton.c:198 #11 0x00007ffff79492ef in __GI__dl_catch_error (objname=0x7ffff7dd60d0 <last_result+16>, errstring=0x7ffff7dd60d8 <last_result+24>, mallocedp=0x7ffff7dd60c8 <last_result+8>, operate=<optimized out>, args=<optimized out>) at dl-error-skeleton.c:217 #12 0x00007ffff7bd45a9 in _dlerror_run ( operate=operate@entry=0x7ffff7bd3f20 <dlopen_doit>, args=args@entry=0x7fffffffdb90) at dlerror.c:162 #13 0x00007ffff7bd4002 in __dlopen (file=<optimized out>, mode=<optimized out>) at dlopen.c:87 #14 0x0000000000400640 in ?? () #15 0x00007fffffffdcc0 in ?? 
() #16 0x0000000100400500 in ?? () #17 0x0000000000000000 in ?? () (gdb) ~/build/glibc/elf/ld.so --library-path /home/carlos/build/glibc:/home/carlos/build/glibc/elf:/home/carlos/build/glibc/rt:/home/carlos/build/glibc/dlfcn:/home/carlos/build/glibc/resolv/:. ./orig Could not open libcuda.so - libnvidia-fatbinaryloader.so.384.90: cannot open shared object file: No such file or directory ./orig: Relink `./libudev.so' with `/home/carlos/build/glibc/rt/librt.so.1' for IFUNC symbol `clock_gettime' Segmentation fault (core dumped) The error is interesting, because what the dynamic loader is saying is that librt.so.1 is not yet resolved, but that a reference to clock_gettime exists. The hint to relink against librt is not correct. [carlos@athas dl-repo]$ readelf -a -W libudev.so | grep librt 0x0000000000000001 (NEEDED) Shared library: [librt.so.1] 0x00c0: Version: 1 File: librt.so.1 Cnt: 1 As you can see libudev.so is already linked against librt.so.1. I assume this comes from the failure mode issue. This is present in upstream master.
Created attachment 1337016 [details] working reproducer The original reporter cracked the mystery: librt has the NODELETE flag set. Adding -Wl,-z,nodelete to libb.so's link line causes the problem to reproduce. [ben@Mustang dl-bug]$ make run LD_LIBRARY_PATH=. ./main d_fn x=12 inside b_fn rm libe.so LD_LIBRARY_PATH=. ./main Could not open liba.so - libe.so: cannot open shared object file: No such file or directory make: *** [Makefile:38: run] Segmentation fault (core dumped)
We have a report that this can also happen due to ENOMEM while opening a shared object which has a NODELETE shared object as a dependency.
*** This bug has been marked as a duplicate of bug 1410154 ***