Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 55 additions & 5 deletions TSRM/TSRM.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ typedef struct {
ts_allocate_ctor ctor;
ts_allocate_dtor dtor;
size_t fast_offset;
/* When set, storage comes from __thread memory instead of being allocated by TSRM. */
void *(*tls_addr)(void);
int done;
} tsrm_resource_type;

Expand Down Expand Up @@ -163,14 +165,19 @@ TSRM_API bool tsrm_startup(int expected_threads, int expected_resources, int deb

static void ts_free_resources(tsrm_tls_entry *thread_resources)
{
bool own_thread = thread_resources->thread_id == tsrm_thread_id();
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here it's possible that a thread id was recycled, but the tls points to a now obsolete one


/* Need to destroy in reverse order to respect dependencies. */
for (int i = thread_resources->count - 1; i >= 0; i--) {
if (!resource_types_table[i].done) {
if (resource_types_table[i].tls_addr && !own_thread) {
continue;
}
if (resource_types_table[i].dtor) {
resource_types_table[i].dtor(thread_resources->storage[i]);
}

if (!resource_types_table[i].fast_offset) {
if (!resource_types_table[i].fast_offset && !resource_types_table[i].tls_addr) {
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can't manually free __thread storage

free(thread_resources->storage[i]);
}
}
Expand Down Expand Up @@ -256,7 +263,10 @@ static void tsrm_update_active_threads(void)

p->storage = (void *) realloc(p->storage, sizeof(void *)*id_count);
for (j=p->count; j<id_count; j++) {
if (resource_types_table[j].fast_offset) {
if (resource_types_table[j].tls_addr) {
TSRM_ASSERT(p->thread_id == tsrm_thread_id());
p->storage[j] = resource_types_table[j].tls_addr();
} else if (resource_types_table[j].fast_offset) {
p->storage[j] = (void *) (((char*)p) + resource_types_table[j].fast_offset);
} else {
p->storage[j] = (void *) malloc(resource_types_table[j].size);
Expand Down Expand Up @@ -301,6 +311,7 @@ TSRM_API ts_rsrc_id ts_allocate_id(ts_rsrc_id *rsrc_id, size_t size, ts_allocate
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = 0;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = NULL;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0;

tsrm_update_active_threads();
Expand Down Expand Up @@ -359,6 +370,7 @@ TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, siz
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = *offset;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = NULL;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0;

tsrm_update_active_threads();
Expand All @@ -368,6 +380,41 @@ TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, siz
return *rsrc_id;
}/*}}}*/

/* allocates a resource id whose per-thread storage is a native __thread block */
TSRM_API ts_rsrc_id ts_allocate_tls_id(ts_rsrc_id *rsrc_id, void *(*tls_addr)(void), size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor)
{/*{{{*/
TSRM_ERROR((TSRM_ERROR_LEVEL_CORE, "Obtaining a new TLS resource id, %d bytes", size));
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

function largely copied from above, looking at it now I see that size_t should be printed as %zu.


tsrm_mutex_lock(tsmm_mutex);

*rsrc_id = TSRM_SHUFFLE_RSRC_ID(id_count++);

if (resource_types_table_size < id_count) {
tsrm_resource_type *_tmp;
_tmp = (tsrm_resource_type *) realloc(resource_types_table, sizeof(tsrm_resource_type)*id_count);
if (!_tmp) {
TSRM_ERROR((TSRM_ERROR_LEVEL_ERROR, "Unable to allocate storage for resource"));
*rsrc_id = 0;
tsrm_mutex_unlock(tsmm_mutex);
return 0;
}
resource_types_table = _tmp;
resource_types_table_size = id_count;
}
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].size = size;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = 0;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = tls_addr;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0;

tsrm_update_active_threads();
tsrm_mutex_unlock(tsmm_mutex);

TSRM_ERROR((TSRM_ERROR_LEVEL_CORE, "Successfully allocated new TLS resource id %d", *rsrc_id));
return *rsrc_id;
}/*}}}*/

static void set_thread_local_storage_resource_to(tsrm_tls_entry *thread_resource)
{
tsrm_tls_set(thread_resource);
Expand Down Expand Up @@ -397,7 +444,9 @@ static void allocate_new_resource(tsrm_tls_entry **thread_resources_ptr, THREAD_
if (resource_types_table[i].done) {
(*thread_resources_ptr)->storage[i] = NULL;
} else {
if (resource_types_table[i].fast_offset) {
if (resource_types_table[i].tls_addr) {
(*thread_resources_ptr)->storage[i] = resource_types_table[i].tls_addr();
} else if (resource_types_table[i].fast_offset) {
(*thread_resources_ptr)->storage[i] = (void *) (((char*)(*thread_resources_ptr)) + resource_types_table[i].fast_offset);
} else {
(*thread_resources_ptr)->storage[i] = (void *) malloc(resource_types_table[i].size);
Expand Down Expand Up @@ -485,7 +534,8 @@ TSRM_API void *ts_resource_ex(ts_rsrc_id id, THREAD_T *th_id)
/* In case that extensions don't use the pointer passed from the dtor, but incorrectly
* use the global pointer, we need to setup the global pointer temporarily here. */
set_thread_local_storage_resource_to(thread_resources);
/* Free up the old resource from the old thread instance */
/* Dead thread, recycled id: already freed, so just zero it. */
thread_resources->thread_id = 0;
ts_free_resources(thread_resources);
free(thread_resources);
/* Allocate a new resource at the same point in the linked list, and relink the next pointer */
Expand Down Expand Up @@ -559,7 +609,7 @@ void ts_free_id(ts_rsrc_id id)
if (resource_types_table[rsrc_id].dtor) {
resource_types_table[rsrc_id].dtor(p->storage[rsrc_id]);
}
if (!resource_types_table[rsrc_id].fast_offset) {
if (!resource_types_table[rsrc_id].fast_offset && !resource_types_table[rsrc_id].tls_addr) {
free(p->storage[rsrc_id]);
}
}
Expand Down
4 changes: 3 additions & 1 deletion TSRM/TSRM.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ TSRM_API ts_rsrc_id ts_allocate_id(ts_rsrc_id *rsrc_id, size_t size, ts_allocate
/* Fast resource in reserved (pre-allocated) space */
TSRM_API void tsrm_reserve(size_t size);
TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor);
/* Must be called at startup before any other thread exists. */
TSRM_API ts_rsrc_id ts_allocate_tls_id(ts_rsrc_id *rsrc_id, void *(*tls_addr)(void), size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor);

/* fetches the requested resource for the current thread */
TSRM_API void *ts_resource_ex(ts_rsrc_id id, THREAD_T *th_id);
Expand Down Expand Up @@ -155,7 +157,7 @@ TSRM_API bool tsrm_is_managed_thread(void);
#if !__has_attribute(tls_model) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__MUSL__) || defined(__HAIKU__)
# define TSRM_TLS_MODEL_ATTR
# define TSRM_TLS_MODEL_DEFAULT
#elif __PIC__
#elif __PIC__ && !defined(__PIE__)
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a PIE program can use local exec if it's the main executable. Only shared libraries (embed, extensions) need to fall back to initial-exed.

This alone would already be a small speedup (one fewer instruction per access)

# define TSRM_TLS_MODEL_ATTR __attribute__((tls_model("initial-exec")))
# define TSRM_TLS_MODEL_INITIAL_EXEC
#else
Expand Down
5 changes: 5 additions & 0 deletions Zend/Zend.m4
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,11 @@ AC_MSG_RESULT([$ZEND_ZTS])
AS_VAR_IF([ZEND_ZTS], [yes], [
AC_DEFINE([ZTS], [1], [Define to 1 if thread safety (ZTS) is enabled.])
AS_VAR_APPEND([CFLAGS], [" -DZTS"])

dnl -mtls-size=12 drops the dead high-bits offset add from TLS access,
dnl valid while the thread-local block stays under 4 KiB.
AX_CHECK_COMPILE_FLAG([-mtls-size=12],
[AS_VAR_APPEND([CFLAGS], [" -mtls-size=12"])])
])

AC_MSG_CHECKING([whether to enable Zend debugging])
Expand Down
12 changes: 8 additions & 4 deletions Zend/zend.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,12 @@ static bool startup_done = false;
#ifdef ZTS
ZEND_API int compiler_globals_id;
ZEND_API int executor_globals_id;
ZEND_API size_t compiler_globals_offset;
ZEND_API size_t executor_globals_offset;
ZEND_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_executor_globals executor_globals_tls;
ZEND_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_compiler_globals compiler_globals_tls;
Comment on lines +55 to +56
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly these could be embeded in the main _tsrm_ls_cache (TSRMLS_MAIN_CACHE_DEFINE) :

ZEND_API TSRM_TLS TSRM_TLS_MODEL_ATTR struct {
    void *cache;
    zend_executor_globals eg;
    zend_compiler_globals cg;
} _tsrm_ls_cache;

This would simplify JIT changes as we can access eg/cg at an offset from jit->tls.

Also the compiler may generate better code when both EG and CG are used in the same function.

/* ts_allocate_tls_id takes a callback so each thread resolves its own block.
* A plain &..._tls would capture only the registering thread's address. */
static void *executor_globals_tls_addr(void) { return &executor_globals_tls; }
static void *compiler_globals_tls_addr(void) { return &compiler_globals_tls; }
static HashTable *global_function_table = NULL;
static HashTable *global_class_table = NULL;
static HashTable *global_constants_table = NULL;
Expand Down Expand Up @@ -1019,8 +1023,8 @@ void zend_startup(zend_utility_functions *utility_functions) /* {{{ */
zend_init_rsrc_list_dtors();

#ifdef ZTS
ts_allocate_fast_id(&compiler_globals_id, &compiler_globals_offset, sizeof(zend_compiler_globals), (ts_allocate_ctor) compiler_globals_ctor, (ts_allocate_dtor) compiler_globals_dtor);
ts_allocate_fast_id(&executor_globals_id, &executor_globals_offset, sizeof(zend_executor_globals), (ts_allocate_ctor) executor_globals_ctor, (ts_allocate_dtor) executor_globals_dtor);
ts_allocate_tls_id(&compiler_globals_id, compiler_globals_tls_addr, sizeof(zend_compiler_globals), (ts_allocate_ctor) compiler_globals_ctor, (ts_allocate_dtor) compiler_globals_dtor);
ts_allocate_tls_id(&executor_globals_id, executor_globals_tls_addr, sizeof(zend_executor_globals), (ts_allocate_ctor) executor_globals_ctor, (ts_allocate_dtor) executor_globals_dtor);
ts_allocate_fast_id(&language_scanner_globals_id, &language_scanner_globals_offset, sizeof(zend_php_scanner_globals), (ts_allocate_ctor) php_scanner_globals_ctor, NULL);
ts_allocate_fast_id(&ini_scanner_globals_id, &ini_scanner_globals_offset, sizeof(zend_ini_scanner_globals), (ts_allocate_ctor) ini_scanner_globals_ctor, NULL);
compiler_globals = ts_resource(compiler_globals_id);
Expand Down
2 changes: 0 additions & 2 deletions Zend/zend_globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@
BEGIN_EXTERN_C()
ZEND_API extern int compiler_globals_id;
ZEND_API extern int executor_globals_id;
ZEND_API extern size_t compiler_globals_offset;
ZEND_API extern size_t executor_globals_offset;
END_EXTERN_C()

#endif
Expand Down
6 changes: 4 additions & 2 deletions Zend/zend_globals_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ BEGIN_EXTERN_C()

/* Compiler */
#ifdef ZTS
# define CG(v) ZEND_TSRMG_FAST(compiler_globals_offset, zend_compiler_globals *, v)
extern ZEND_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_compiler_globals compiler_globals_tls;
# define CG(v) (compiler_globals_tls.v)
#else
# define CG(v) (compiler_globals.v)
extern ZEND_API struct _zend_compiler_globals compiler_globals;
Expand All @@ -40,7 +41,8 @@ ZEND_API int zendparse(void);

/* Executor */
#ifdef ZTS
# define EG(v) ZEND_TSRMG_FAST(executor_globals_offset, zend_executor_globals *, v)
extern ZEND_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_executor_globals executor_globals_tls;
# define EG(v) (executor_globals_tls.v)
#else
# define EG(v) (executor_globals.v)
extern ZEND_API zend_executor_globals executor_globals;
Expand Down
6 changes: 5 additions & 1 deletion ext/opcache/jit/ir/ir_aarch64.dasc
Original file line number Diff line number Diff line change
Expand Up @@ -5868,8 +5868,12 @@ static void ir_emit_tls(ir_ctx *ctx, ir_ref def, ir_insn *insn)
| ldr Rx(reg), [Rx(reg), #insn->op3]
|| }
||# else
|| /* op2 == 0 with no index requests the bare thread pointer (used to form
|| * &EG/&CG with an add); a real TLS var never sits at tprel offset 0. */
|| if (insn->op2 != 0 || insn->op3 != IR_NULL) {
||//??? IR_ASSERT(insn->op2 <= LDR_STR_PIMM64);
| ldr Rx(reg), [Rx(reg), #insn->op2]
| ldr Rx(reg), [Rx(reg), #insn->op2]
|| }
||# endif
||#endif
if (IR_REG_SPILLED(ctx->regs[def][0])) {
Expand Down
Loading
Loading