Inconsistency detected by ld.so: dl-tls.c: 493: _dl_allocate_tls_init: Assertion `listp->slotinfo[cnt].gen <= GL(dl_tls_generation)' failed!
| Affects | Status | Importance | Assigned to | Milestone |
|---|---|---|---|---|
| GLibC | Fix Released | Medium | | |
| glibc (Ubuntu) | Confirmed | Undecided | Unassigned | |
Bug Description
When using glibc as part of our NSX product, we sometimes run into the above-mentioned glibc assertion failure.
Here is the relevant version information:
Ubuntu - 16.04
glibc version - 2.23.
This is a known issue, and a resolution has been identified in the thread linked below:
https:/
We have applied this patch in our product and it seems to be working fine.
Is there a way to get these changes upstreamed so that they become available in standard glibc?
Please let us know.
Here are the two patches:
PATCH1
=======
diff --git a/elf/dl-open.c b/elf/dl-open.c
index 6f178b3..2b97605 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -524,9 +524,16 @@ dl_open_worker (void *a)
}
/* Bump the generation number if necessary. */
- if (any_tls && __builtin_expect (++GL(dl_tls_generation) == 0, 0))
- _dl_fatal_printf (N_("\
+ if (any_tls)
+ {
+ size_t newgen = GL(dl_tls_generation) + 1;
+ if (__builtin_expect (newgen == 0, 0))
+ _dl_fatal_printf (N_("\
TLS generation counter wrapped! Please report this."));
+ /* Synchronize with the load acquire in _dl_allocate_tls_init.
+ See the CONCURRENCY NOTES there in dl-tls.c. */
+ atomic_store_release (&GL(dl_tls_generation), newgen);
+ }
/* We need a second pass for static tls data, because _dl_update_slotinfo
must not be run while calls to _dl_add_to_slotinfo are still pending. */
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index ed13fd9..7184a54 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -443,6 +443,48 @@ _dl_resize_dtv (dtv_t *dtv)
}
+/* CONCURRENCY NOTES:
+
+ During dynamic TLS and DTV allocation and setup various objects may be
+ accessed concurrently:
+
+ GL(dl_tls_
+ GL(dl_tls_
+ listp->
+ listp->
+ listp->next
+
+ where listp is a node in the GL(dl_tls_
+ APIs that may access them are
+
+ Writers: dlopen, dlclose and dynamic linker start up code.
+ Readers only: pthread_create and __tls_get_addr (TLS access).
+
+ The writers hold the GL(dl_load_lock), but the readers don't, so atomics
+ should be used when accessing these globals.
+
+ dl_open_worker (called from dlopen) for each loaded module increases
+ GL(dl_tls_
+ slotinfo entry to GL(dl_tls_
+ the next generation number GL(dl_tls_
+ GL(dl_tls_
+ This last write is release mo so previous writes can be synchronized.
+
+ GL(dl_tls_
+ entries. The slotinfo list might be shorter than that during dlopen.
+ Entries in the slotinfo list might have gen > GL(dl_tls_
+ map == NULL.
+
+ _dl_allocate_
+ the slotinfo list to do the dynamic TLS and DTV setup for the new thread.
+ It first loads the current GL(dl_tls_
+ considers modules up to that generation ignoring any later change to the
+ slotinfo list.
+
+ TODO: Entries might get changed and freed in dlclose without sync.
+ TODO: __tls_get_addr is not yet synchronized with dlopen and dlclose.
+*/
+
void *
internal_function
_dl_allocate_
@@ -455,9 +497,18 @@ _dl_allocate_
struct dtv_slotinfo_list *listp;
size_t total = 0;
size_t maxgen = 0;
-
- /* Check if the current dtv is big enough. */
- if (dtv[-1].counter < GL(dl_tls_
+ size_t gen_count;
+ size_t dtv_slots;
+
+ /* Synchronize with the release mo store in dl_open_worker, modules with
+ larger generation number are ignored. */
+ gen_count = atomic_load_acquire (&GL(dl_
+ /* Check if the current dtv is big enough. GL(dl_tls_
+ concurrently modified, but after the release mo store to
+ GL(dl_tls_
+ previously loaded modules so relaxed access is enough. */
+ dtv_slots = atomic_load_relaxed (&GL(dl_
+ if (dtv[-1].counter < dtv_slots)
{
/* Resize the dtv. */
dtv = _dl_resize_dtv (dtv);
@@ -480,18 +531,25 @@ _dl_allocate_
void *dest;
/* Check for the total number of used slots. */
- if (total + cnt > GL(dl_tls_
+ if (total + cnt > dtv_slots)
break;
- map = listp->
+ /* Synchronize with the release mo store in _dl_add_to_slotinfo in
+ dlopen, so the generation number read below is for a valid entry.
+ TODO: remove_slotinfo in dlclose is not synchronized. */
+ map = atomic_load_acquire (&listp-
if (map == NULL)
/* Unused entry. */
continue;
+ size_t gen = listp->
+ if (gen > gen_count)
+ /* New, concurrently loaded entry. */
+ continue;
+
/* Keep track of the maximum generation number. This might
not be the generation counter. */
- assert (listp-
- maxgen = MAX (maxgen, listp->
+ maxgen = MAX (maxgen, gen);
dtv[
dtv[
@@ -518,11 +576,14 @@ _dl_allocate_
}
total += cnt;
- if (total >= GL(dl_tls_
+ if (total >= dtv_slots)
break;
- listp = listp->next;
- assert (listp != NULL);
+ /* Synchronize with the release mo store in _dl_add_to_slotinfo
+ so only initialized slotinfo nodes are looked at. */
+ listp = atomic_load_acquire (&listp->next);
+ if (listp == NULL)
+ break;
}
/* The DTV version is up-to-date now. */
@@ -916,7 +977,7 @@ _dl_add_to_slotinfo (struct link_map *l)
the first slot. */
assert (idx == 0);
- listp = prevp->next = (struct dtv_slotinfo_list *)
+ listp = (struct dtv_slotinfo_list *)
malloc (sizeof (struct dtv_slotinfo_list)
+ TLS_SLOTINFO_
if (listp == NULL)
@@ -939,9 +1000,15 @@ cannot create TLS data structures"));
listp->next = NULL;
memset (listp->slotinfo, '\0',
+ /* _dl_allocate_
+ creation, it must only observe initialized nodes in the list.
+ See the CONCURRENCY NOTES there. */
+ atomic_
}
/* Add the information into the slotinfo data structure. */
- listp->
listp-
+ /* Synchronize with the acquire load in _dl_allocate_
+ See the CONCURRENCY NOTES there. */
+ atomic_
}
PATCH 2
========
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 073321c..2c9ad2a 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -571,7 +571,7 @@ _dl_allocate_
}
total += cnt;
- if (total >= dtv_slots)
+ if (total > dtv_slots)
break;
/* Synchronize with dl_add_to_slotinfo. */
Changed in glibc: | |
importance: | Unknown → Medium |
status: | Unknown → Confirmed |
Changed in glibc: | |
status: | Confirmed → Fix Released |
(this is a continuation of bug 17918, but it turns out to be a different
issue from the one that was originally reported there.)
failure:
Inconsistency detected by ld.so: dl-tls.c: 493: _dl_allocate_tls_init: Assertion `listp->slotinfo[cnt].gen <= _rtld_local._dl_tls_generation' failed!
caused by dlopen (in _dl_add_to_slotinfo and in dl_open_worker) doing
listp->slotinfo[idx].gen = GL(dl_tls_generation) + 1;
//...
if (any_tls && __builtin_expect (++GL(dl_tls_generation) == 0, 0))
while pthread_create (in _dl_allocate_tls_init) concurrently doing
assert (listp->slotinfo[cnt].gen <= GL(dl_tls_generation));
so
T1:
y = x + 1;
++x;
T2:
assert(y <= x);
this is hard to trigger as the race window is short compared to the time
dlopen and pthread_create take; however, if i add a usleep(1000) between
the two operations in T1, it is triggered all the time.
the slotinfo and tls generation accesses in _dl_allocate_tls_init lack any sort of synchronization or atomics (dlopen holds GL(dl_load_lock)).
on x86_64 with added usleep:
(gdb) p _rtld_local._dl_tls_dtv_slotinfo_list->slotinfo[0]@64
$11 = {{gen = 0, map = 0x7ffff7ff94e8}, {gen = 1, map = 0x7ffff7ff94e8}, {gen = 2, map = 0x7ffff0000910}, {gen = 0, map = 0x0} <repeats 61 times>}
(gdb) p _rtld_local._dl_tls_generation
$12 = 1
T1:
#0 0x00007ffff7df2097 in nanosleep () at ../sysdeps/unix/syscall-template.S:84
#1 0x00007ffff7df1f74 in usleep (useconds=<optimised out>) at ../sysdeps/posix/usleep.c:32
#2 0x00007ffff7decc6b in dl_open_worker (a=a@entry=0x7ffff7611c80) at dl-open.c:527
#3 0x00007ffff7de8314 in _dl_catch_error (objname=objname@entry=0x7ffff7611c70, errstring=errstring@entry=0x7ffff7611c78, mallocedp=mallocedp@entry=0x7ffff7611c6f,
operate=operate@entry=0x7ffff7dec720 <dl_open_worker>, args=args@entry=0x7ffff7611c80) at dl-error.c:187
#4 0x00007ffff7dec2a9 in _dl_open (file=0x7ffff7611ee0 "mod-0.so", mode=-2147483646, caller_dlopen=0x4007e2 <start+34>, nsid=-2, argc=<optimised out>,
argv=<optimised out>, env=0x7fffffffe378) at dl-open.c:652
#5 0x00007ffff7bd5ee9 in dlopen_doit (a=a@entry=0x7ffff7611eb0) at dlopen.c:66
#6 0x00007ffff7de8314 in _dl_catch_error (objname=0x7ffff00008d0, errstring=0x7ffff00008d8, mallocedp=0x7ffff00008c8, operate=0x7ffff7bd5e90 <dlopen_doit>,
args=0x7ffff7611eb0) at dl-error.c:187
#7 0x00007ffff7bd6521 in _dlerror_run (operate=operate@entry=0x7ffff7bd5e90 <dlopen_doit>, args=args@entry=0x7ffff7611eb0) at dlerror.c:163
#8 0x00007ffff7bd5f82 in __dlopen (file=file@entry=0x7ffff7611ee0 "mod-0.so", mode=mode@entry=2) at dlopen.c:87
#9 0x00000000004007e2 in start (a=<optimised out>) at a.c:19
#10 0x00007ffff79bf3d4 in start_thread (arg=0x7ffff7612700) at pthread_create.c:333
#11 0x00007ffff76feedd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
T2:
#0 __GI___assert_fail (assertion=0x7ffff7df8840 "listp->slotinfo[cnt].gen <= GL(dl_tls_generation)", file=0x7ffff7df68e6 "dl-tls.c", line=493,
function=0x7ffff7df9020 <__PRETTY_FUNCTION__.9528> "_dl_allocate_tls_init") at dl-minimal.c:220
#1 0x00007ffff7deb492 in __GI__dl_allocate_tls_init (result=0x7fffb7fff700) at dl-tls.c:493
#2 0x00007ffff79bff67 in allocate_stack (...