In C/C++ programs, global variables are shared by all threads by default, and developers need to deal with contention between threads. There are cases where each thread needs its own private copy of some data that other threads cannot access. A typical case is the errno global variable, which always stores the error code of the last call made by the current thread, without conflicts between threads. In such cases, thread-local storage (TLS) is the solution.
pthread’s memory structure
Before describing TLS, let's understand the memory structure of pthread. glibc/nptl/descr.h defines an important data structure for threads, struct pthread, which describes the complete information of a user-space thread. The pthread structure is very complex; it is tied to TLS through the specific_1stblock array and the two-level specific array, which are described later.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
/* Number of entries in one second-level block of thread-specific data. */
#define PTHREAD_KEY_2NDLEVEL_SIZE 32
/* Number of second-level blocks needed to cover PTHREAD_KEYS_MAX keys
(the division is rounded up). */
#define PTHREAD_KEY_1STLEVEL_SIZE \
((PTHREAD_KEYS_MAX + PTHREAD_KEY_2NDLEVEL_SIZE - 1) \
/ PTHREAD_KEY_2NDLEVEL_SIZE)
/* Excerpt of the thread descriptor. The "..." lines mark members that
are omitted from this excerpt. */
struct pthread
{
union
{
#if !TLS_DTV_AT_TP
/* This overlaps the TCB as used for TLS without threads (see tls.h). */
tcbhead_t header;
#else
struct
{
int multiple_threads;
int gscope_flag;
} header;
#endif
void *__padding[24];
};
list_t list;
pid_t tid;
...
/* One slot of thread-specific data. specific_1stblock is the first
second-level block and is embedded directly in the descriptor. */
struct pthread_key_data
{
/* Sequence number. We use uintptr_t to not require padding on
32- and 64-bit machines. On 64-bit machines it helps to avoid
wrapping, too. */
uintptr_t seq;
/* Data pointer. */
void *data;
} specific_1stblock[PTHREAD_KEY_2NDLEVEL_SIZE];
/* Two-level array for the thread-specific data. */
struct pthread_key_data *specific[PTHREAD_KEY_1STLEVEL_SIZE];
/* Flag which is set when specific data is set. */
bool specific_used;
...
}
|
__thread
The __thread keyword can be used to declare TLS variables when compiling with GCC or Clang. It is a compiler extension rather than standard C, and the spelling differs from compiler to compiler (the standard spelling is _Thread_local since C11, and MSVC uses __declspec(thread)).
In my tests with Xcode 13.2, only the i386 architecture does not support __thread.
1
2
3
4
5
|
/* Per-thread data pointer. __thread is used everywhere except on i386,
   where the variable falls back to an ordinary static shared by all
   threads. */
#if !defined(__i386__)
static __thread char *g_thread_data = NULL;
#else
static char *g_thread_data = NULL;
#endif
|
Variables declared with the __thread keyword are stored in the memory area between the stack space and the pthread structure. That is, in terms of memory layout, from high addresses to low addresses the order is: the pthread structure, the __thread variable area, and the stack area (the bottom of the stack and the top of the __thread variable area are contiguous).
This is illustrated below with a program running on Xcode 13.2/arm64.
1
2
3
4
5
6
7
8
9
10
|
/* Thread-local test variables: every thread gets its own copy, starting
   from these initial values. */
__thread uint64_t g_tls_int = 6;
__thread char *g_tls_string = "easeapi.com";

/*
 * Copies both thread-local variables into locals and prints them, so the
 * generated code for a TLS read can be inspected in the debugger.
 */
void tls_test(void)
{
    uint64_t value = g_tls_int;
    /* uint64_t is not unsigned long long on every platform; the cast keeps
       the argument type in agreement with the %llu conversion. */
    printf("%llu", (unsigned long long)value);
    char *string = g_tls_string;
    printf("%s", string);
}
|
Set a breakpoint at the entry of tls_test; the corresponding disassembly is as follows.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
0x104235240 <+0>: sub sp, sp, #0x40 ; =0x40
0x104235244 <+4>: stp x29, x30, [sp, #0x30]
0x104235248 <+8>: add x29, sp, #0x30 ; =0x30
0x10423524c <+12>: adrp x0, 529
0x104235250 <+16>: add x0, x0, #0xd70 ; =0xd70
0x104235254 <+20>: ldr x8, [x0]
0x104235258 <+24>: blr x8
0x10423525c <+28>: str x0, [sp, #0x10]
0x104235260 <+32>: adrp x0, 529
0x104235264 <+36>: add x0, x0, #0xd88 ; =0xd88
0x104235268 <+40>: ldr x8, [x0]
0x10423526c <+44>: blr x8
0x104235270 <+48>: mov x8, x0
0x104235274 <+52>: ldr x0, [sp, #0x10]
0x104235278 <+56>: str x8, [sp, #0x18]
0x10423527c <+60>: ldr x8, [x0]
0x104235280 <+64>: stur x8, [x29, #-0x8]
0x104235284 <+68>: ldur x8, [x29, #-0x8]
0x104235288 <+72>: adrp x0, 471
0x10423528c <+76>: add x0, x0, #0x7fc ; =0x7fc
0x104235290 <+80>: mov x9, sp
0x104235294 <+84>: str x8, [x9]
0x104235298 <+88>: bl 0x104403be0 ; symbol stub for: printf
0x10423529c <+92>: ldr x0, [sp, #0x18]
0x1042352a0 <+96>: ldr x8, [x0]
0x1042352a4 <+100>: stur x8, [x29, #-0x10]
0x1042352a8 <+104>: ldur x8, [x29, #-0x10]
0x1042352ac <+108>: adrp x0, 471
0x1042352b0 <+112>: add x0, x0, #0x801 ; =0x801
0x1042352b4 <+116>: mov x9, sp
0x1042352b8 <+120>: str x8, [x9]
0x1042352bc <+124>: bl 0x104403be0 ; symbol stub for: printf
0x1042352c0 <+128>: ldp x29, x30, [sp, #0x30]
0x1042352c4 <+132>: add sp, sp, #0x40 ; =0x40
0x1042352c8 <+136>: ret
|
The instruction at 0x104235274 loads the value stored at sp + 0x10 into x0. Stopping at 0x104235278 and reading x0 gives the address of g_tls_int:
1
2
3
4
|
(lldb) register read x0
x0 = 0x0000000281cf41a0
(lldb) memory read/1xg 0x0000000281cf41a0
0x281cf41a0: 0x0000000000000006
|
The instruction at 0x10423529c loads the value stored at sp + 0x18 into x0. Stopping at 0x1042352a0 and reading x0 gives the address of g_tls_string:
1
2
3
4
5
6
7
|
(lldb) register read x0
x0 = 0x0000000281cf41a8
(lldb) memory read/1xg 0x0000000281cf41a8
0x281cf41a8: 0x000000010440c7f0
(lldb) memory read 0x000000010440c7f0
0x10440c7f0: 65 61 73 65 61 70 69 2e 63 6f 6d 00 25 6c 6c 75 easeapi.com.%llu
0x10440c800: 00 25 73 00 4d 79 41 70 70 6c 69 63 61 74 69 6f .%s.MyApplicatio
|
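From the above test results, the address of each __thread variable is obtained by calling through a pointer loaded from a per-variable descriptor (the ldr x8, [x0] / blr x8 pairs at +20 and +40); the returned address is spilled to the stack frame (sp + 0x10 and sp + 0x18) and reloaded from there before the variable is read.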
Variables declared with __thread must be of POD (Plain Old Data) type; high-level language features such as classes are not supported. A __thread variable exists for the lifetime of the thread and is released when the thread is destroyed. Note that since __thread does not let you specify a destructor, when we define a __thread pointer variable and malloc memory for it in the thread, the end of the thread only releases the pointer variable itself; the developer must free the allocated memory manually.
1
2
3
4
5
6
7
|
/* Per-thread buffer pointer; stays NULL until tls_test() has run on the
   calling thread. */
__thread char *g_tls_string = NULL;

/* Allocates the calling thread's 1024-byte zeroed buffer on first use and
   leaves an existing buffer untouched. */
void tls_test(void)
{
    if (g_tls_string != NULL) {
        return;
    }
    g_tls_string = calloc(1024, 1);
    /* When the thread is destroyed, the malloc'ed memory must be freed
       manually. */
}
|
If you want the release of malloc memory to be done automatically when the thread ends, you need to use the pthread specific related API.
pthread specific API
pthread also provides the following APIs to implement the TLS functionality.
1
2
3
4
5
6
7
8
9
|
//nptl/bits/pthreadtypes.h
/* Keys for thread-specific data */
typedef unsigned int pthread_key_t;
/* Creates a key and stores it through the first argument; the second
argument is the destructor for the data and may be null. */
int pthread_key_create(pthread_key_t *, void (* _Nullable)(void *));
/* Deletes a key previously obtained from pthread_key_create. */
int pthread_key_delete(pthread_key_t);
/* Associates a value (possibly null) with the key for the calling thread. */
int pthread_setspecific(pthread_key_t , const void * _Nullable);
/* Returns the calling thread's value for the key; the result may be null. */
void* _Nullable pthread_getspecific(pthread_key_t);
|
The first parameter of pthread_key_create is a pointer to a pthread_key_t, which receives the key when creation succeeds; the second parameter is a pointer to the data destructor, which is executed when the thread is destroyed. Once pthread_key_create has succeeded, the resulting pthread_key_t can be used to read and write thread-private data. The sample code is as follows.
1
2
3
4
5
6
7
8
9
10
|
//create key
pthread_key_t key = 0;
pthread_key_create(&key, NULL);
//write
struct easeapi_struct data;
pthread_setspecific(key, &struct_data);
//read
struct easeapi_struct* = (struct easeapi_struct *)pthread_getspecific(key)
|
Each process has a global array __pthread_keys to manage pthread_key_t.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
//nptl/internaltypes.h:
/* Thread-local data handling. */
/* Element type of the __pthread_keys table defined at the end of this
excerpt. */
struct pthread_key_struct
{
/* Sequence numbers. Even numbers indicated vacant entries. Note
that zero is even. We use uintptr_t to not require padding on
32- and 64-bit machines. On 64-bit machines it helps to avoid
wrapping, too. */
uintptr_t seq;
/* Destructor for the data. */
void (*destr) (void *);
};
//sysdeps/unix/sysv/linux/bits/local_lim.h
/* This is the value this implementation supports. */
/* It is also the number of entries in the __pthread_keys table below. */
#define PTHREAD_KEYS_MAX 1024
//nptl/pthread_keys.c
/* Table of the key information. */
/* A pthread_key_t value is used as an index into this table. */
struct pthread_key_struct __pthread_keys[PTHREAD_KEYS_MAX];
|
The struct pthread_key_struct structure holds the seq and a pointer to the destructor that was passed in. A program can have at most PTHREAD_KEYS_MAX pthread_key_t keys in existence at the same time. pthread_key_t is global, but different threads actually operate on different memory when reading or writing through the same pthread_key_t.
When pthread_key_create is executed, it finds an unused pthread_key_struct in the __pthread_keys array and adds 1 to its seq. The returned pthread_key_t is simply the index of that pthread_key_struct in the __pthread_keys array, as the following code shows.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
//nptl/pthread_key_create.c:
/* Allocate a key for thread-specific data.
Scans __pthread_keys from index 0 upwards and claims the first slot that
passes the KEY_UNUSED and KEY_USABLE checks.
KEY receives the index of the claimed slot.
DESTR is stored in the slot as the destructor for the data.
Returns 0 on success, or EAGAIN if no slot could be claimed. */
int
___pthread_key_create (pthread_key_t *key, void (*destr) (void *))
{
/* Find a slot in __pthread_keys which is unused. */
for (size_t cnt = 0; cnt < PTHREAD_KEYS_MAX; ++cnt)
{
uintptr_t seq = __pthread_keys[cnt].seq;
if (KEY_UNUSED (seq) && KEY_USABLE (seq)
/* We found an unused slot. Try to allocate it. */
/* The slot is claimed by replacing seq with seq + 1; a zero result
from the compare-and-exchange is treated as success. */
&& ! atomic_compare_and_exchange_bool_acq (&__pthread_keys[cnt].seq,
seq + 1, seq))
{
/* Remember the destructor. */
__pthread_keys[cnt].destr = destr;
/* Return the key to the caller. */
*key = cnt;
/* The call succeeded. */
return 0;
}
}
/* Every slot was either in use, not usable, or claimed concurrently. */
return EAGAIN;
}
|
When pthread_key_delete is executed, the corresponding pthread_key_struct is looked up in __pthread_keys using the pthread_key_t as the index, and its seq is incremented by 1, as the following code shows.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
//nptl/pthread_key_delete.c
/* Delete a key for thread-specific data.
KEY is used as an index into __pthread_keys.
Returns 0 if the slot was in use and its sequence number was advanced,
EINVAL otherwise (key out of range, slot reported unused by KEY_UNUSED,
or the compare-and-exchange did not succeed). */
int
___pthread_key_delete (pthread_key_t key)
{
int result = EINVAL;
if (__glibc_likely (key < PTHREAD_KEYS_MAX))
{
unsigned int seq = __pthread_keys[key].seq;
/* Advance seq by one with a compare-and-exchange; a zero result is
treated as success. */
if (__builtin_expect (! KEY_UNUSED (seq), 1)
&& ! atomic_compare_and_exchange_bool_acq (&__pthread_keys[key].seq,
seq + 1, seq))
/* We deleted a valid key. */
result = 0;
}
return result;
}
|
Note that atomic_compare_and_exchange_bool_acq is used here to make the update of seq atomic.
The default value of seq is 0. Either pthread_key_create or pthread_key_delete adds 1 to seq. When the value of seq is even (including 0), it means that the current pthread_key_struct is not in use, and when it is odd, it means that it is in use.
The allocation of pthread_key_t via pthread_key_create is global, but the key-value association is per-thread. The following definition appears in the struct pthread structure.
1
2
3
4
5
6
7
8
9
10
11
12
13
|
/* Excerpt: these two declarations are members of struct pthread, so every
thread has its own copy. */
struct pthread_key_data
{
/* Sequence number. We use uintptr_t to not require padding on
32- and 64-bit machines. On 64-bit machines it helps to avoid
wrapping, too. */
uintptr_t seq;
/* Data pointer. */
void *data;
} specific_1stblock[PTHREAD_KEY_2NDLEVEL_SIZE];
/* Two-level array for the thread-specific data. */
struct pthread_key_data *specific[PTHREAD_KEY_1STLEVEL_SIZE];
|
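In the struct pthread_key_data structure, data is the pointer under which the current thread stores its TLS value, and seq is a copy of the seq of the corresponding struct pthread_key_struct taken when the value was stored. Comparing the two seq values later tells whether the stored value still belongs to the currently created key or is stale.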
The specific_1stblock array is not sized to PTHREAD_KEYS_MAX but to PTHREAD_KEY_2NDLEVEL_SIZE (32), presumably to save memory, since most programs use only a few TLS keys.
When pthread_setspecific is executed and the key is smaller than PTHREAD_KEY_2NDLEVEL_SIZE, the specific_1stblock array is used directly; otherwise a second-level block is allocated on demand and recorded in the specific array, and the value is stored in specific[idx1st][idx2nd].data.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
|
//nptl/pthread_setspecific.c
/* Store VALUE as the calling thread's data for KEY.
Keys below PTHREAD_KEY_2NDLEVEL_SIZE use the specific_1stblock array of
the thread descriptor; larger keys use a second-level block that is
allocated on first use.
Returns 0 on success, EINVAL if KEY is out of range or its sequence number
is reported unused by KEY_UNUSED, and ENOMEM if a second-level block
cannot be allocated. */
int
___pthread_setspecific (pthread_key_t key, const void *value)
{
struct pthread *self;
unsigned int idx1st;
unsigned int idx2nd;
struct pthread_key_data *level2;
unsigned int seq;
self = THREAD_SELF;
/* Special case access to the first 2nd-level block. This is the
usual case. */
if (__glibc_likely (key < PTHREAD_KEY_2NDLEVEL_SIZE))
{
/* Verify the key is sane. */
if (KEY_UNUSED ((seq = __pthread_keys[key].seq)))
/* Not valid. */
return EINVAL;
level2 = &self->specific_1stblock[key];
/* Remember that we stored at least one set of data. */
if (value != NULL)
THREAD_SETMEM (self, specific_used, true);
}
else
{
if (key >= PTHREAD_KEYS_MAX
|| KEY_UNUSED ((seq = __pthread_keys[key].seq)))
/* Not valid. */
return EINVAL;
/* Split the key into a block index and a slot index within the block. */
idx1st = key / PTHREAD_KEY_2NDLEVEL_SIZE;
idx2nd = key % PTHREAD_KEY_2NDLEVEL_SIZE;
/* This is the second level array. Allocate it if necessary. */
level2 = THREAD_GETMEM_NC (self, specific, idx1st);
if (level2 == NULL)
{
if (value == NULL)
/* We don't have to do anything. The value would in any case
be NULL. We can save the memory allocation. */
return 0;
level2
= (struct pthread_key_data *) calloc (PTHREAD_KEY_2NDLEVEL_SIZE,
sizeof (*level2));
if (level2 == NULL)
return ENOMEM;
THREAD_SETMEM_NC (self, specific, idx1st, level2);
}
/* Pointer to the right array element. */
level2 = &level2[idx2nd];
/* Remember that we stored at least one set of data. */
/* Unlike the first-block path, the flag is set here even when VALUE
is NULL. */
THREAD_SETMEM (self, specific_used, true);
}
/* Store the data and the sequence number so that we can recognize
stale data. */
level2->seq = seq;
level2->data = (void *) value;
return 0;
}
|
With the above analysis, the logic of executing pthread_getspecific is relatively clear.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
//nptl/pthread_getspecific.c
/* Return the calling thread's data for KEY.
Returns NULL if KEY is out of range, if the second-level block for KEY
has not been allocated, if no data is stored, or if the stored sequence
number no longer matches the one in __pthread_keys (in that case the
stored data pointer is also reset to NULL). */
void *
___pthread_getspecific (pthread_key_t key)
{
struct pthread_key_data *data;
/* Special case access to the first 2nd-level block. This is the
usual case. */
if (__glibc_likely (key < PTHREAD_KEY_2NDLEVEL_SIZE))
data = &THREAD_SELF->specific_1stblock[key];
else
{
/* Verify the key is sane. */
if (key >= PTHREAD_KEYS_MAX)
/* Not valid. */
return NULL;
/* Split the key into a block index and a slot index within the block. */
unsigned int idx1st = key / PTHREAD_KEY_2NDLEVEL_SIZE;
unsigned int idx2nd = key % PTHREAD_KEY_2NDLEVEL_SIZE;
/* If the sequence number doesn't match or the key cannot be defined
for this thread since the second level array is not allocated
return NULL, too. */
struct pthread_key_data *level2 = THREAD_GETMEM_NC (THREAD_SELF,
specific, idx1st);
if (level2 == NULL)
/* Not allocated, therefore no data. */
return NULL;
/* There is data. */
data = &level2[idx2nd];
}
void *result = data->data;
/* The sequence numbers are compared only when a non-NULL pointer is
stored. */
if (result != NULL)
{
uintptr_t seq = data->seq;
/* A mismatch means the slot holds stale data: clear it and
return NULL. */
if (__glibc_unlikely (seq != __pthread_keys[key].seq))
result = data->data = NULL;
}
return result;
}
|
According to glibc's implementation, the pthread_key_t returned by pthread_key_create should be a relatively small value, so that the specific_1stblock array is used preferentially. However, when I tested on macOS, the pthread_key_t values obtained were relatively large; presumably the macOS implementation differs from glibc's.
__thread and pthread specific API comparison
- Different storage areas/addressing methods
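The data defined by the pthread specific API is addressed through the specific_1stblock array and the two-level specific array of the struct pthread structure, while __thread variables live in the thread's own __thread variable area and are addressed directly within it (in the arm64 listing above, through the address returned by the call made via the variable's descriptor).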
- Performance/efficiency differences
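Since a __thread variable is reached by a direct address computation rather than by a key lookup that also validates a sequence number, its performance is higher than that of the pthread specific API.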
- Different data can be stored
__thread can only be applied to POD-type variables, and for pointer data that refers to allocated memory you must free that memory manually; the pthread specific API accepts a destructor and supports all data types.
- The number of supported data is different
Theoretically, as long as the stack is not full, an unlimited number of __thread variables can be defined (though this is doubtful). The pthread specific API can create at most PTHREAD_KEYS_MAX keys, but a single key can hold multiple values by storing a pointer to a structure, for example.