-
Notifications
You must be signed in to change notification settings - Fork 8.1k
perf: move EG() and CG() in ZTS builds into __thread storage #22231
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,6 +42,8 @@ typedef struct { | |
| ts_allocate_ctor ctor; | ||
| ts_allocate_dtor dtor; | ||
| size_t fast_offset; | ||
| /* When set, storage comes from __thread memory instead of being allocated by TSRM. */ | ||
| void *(*tls_addr)(void); | ||
| int done; | ||
| } tsrm_resource_type; | ||
|
|
||
|
|
@@ -163,14 +165,19 @@ TSRM_API bool tsrm_startup(int expected_threads, int expected_resources, int deb | |
|
|
||
| static void ts_free_resources(tsrm_tls_entry *thread_resources) | ||
| { | ||
| bool own_thread = thread_resources->thread_id == tsrm_thread_id(); | ||
|
|
||
| /* Need to destroy in reverse order to respect dependencies. */ | ||
| for (int i = thread_resources->count - 1; i >= 0; i--) { | ||
| if (!resource_types_table[i].done) { | ||
| if (resource_types_table[i].tls_addr && !own_thread) { | ||
| continue; | ||
| } | ||
| if (resource_types_table[i].dtor) { | ||
| resource_types_table[i].dtor(thread_resources->storage[i]); | ||
| } | ||
|
|
||
| if (!resource_types_table[i].fast_offset) { | ||
| if (!resource_types_table[i].fast_offset && !resource_types_table[i].tls_addr) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We can't manually free __thread storage. |
||
| free(thread_resources->storage[i]); | ||
| } | ||
| } | ||
|
|
@@ -256,7 +263,10 @@ static void tsrm_update_active_threads(void) | |
|
|
||
| p->storage = (void *) realloc(p->storage, sizeof(void *)*id_count); | ||
| for (j=p->count; j<id_count; j++) { | ||
| if (resource_types_table[j].fast_offset) { | ||
| if (resource_types_table[j].tls_addr) { | ||
| TSRM_ASSERT(p->thread_id == tsrm_thread_id()); | ||
| p->storage[j] = resource_types_table[j].tls_addr(); | ||
| } else if (resource_types_table[j].fast_offset) { | ||
| p->storage[j] = (void *) (((char*)p) + resource_types_table[j].fast_offset); | ||
| } else { | ||
| p->storage[j] = (void *) malloc(resource_types_table[j].size); | ||
|
|
@@ -301,6 +311,7 @@ TSRM_API ts_rsrc_id ts_allocate_id(ts_rsrc_id *rsrc_id, size_t size, ts_allocate | |
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = 0; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = NULL; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0; | ||
|
|
||
| tsrm_update_active_threads(); | ||
|
|
@@ -359,6 +370,7 @@ TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, siz | |
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = *offset; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = NULL; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0; | ||
|
|
||
| tsrm_update_active_threads(); | ||
|
|
@@ -368,6 +380,41 @@ TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, siz | |
| return *rsrc_id; | ||
| }/*}}}*/ | ||
|
|
||
| /* Allocates a resource id whose per-thread storage is a native __thread block. | ||
| * | ||
| * rsrc_id:  out parameter receiving the new (shuffled) resource id; set to 0 on failure. | ||
| * tls_addr: callback recorded in the resource type table as the source of the storage address. | ||
| * size, ctor, dtor: recorded in the resource type table entry for this id. | ||
| * | ||
| * Returns the new resource id, or 0 if the resource type table could not be grown. */ | ||
| TSRM_API ts_rsrc_id ts_allocate_tls_id(ts_rsrc_id *rsrc_id, void *(*tls_addr)(void), size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor) | ||
| {/*{{{*/ | ||
| TSRM_ERROR((TSRM_ERROR_LEVEL_CORE, "Obtaining a new TLS resource id, %zu bytes", size)); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. function largely copied from above, looking at it now I see that size_t should be printed as %zu. |
||
|
|
||
| tsrm_mutex_lock(tsmm_mutex); | ||
|
|
||
| *rsrc_id = TSRM_SHUFFLE_RSRC_ID(id_count++); | ||
|
|
||
| if (resource_types_table_size < id_count) { | ||
| tsrm_resource_type *_tmp; | ||
| _tmp = (tsrm_resource_type *) realloc(resource_types_table, sizeof(tsrm_resource_type)*id_count); | ||
| if (!_tmp) { | ||
| TSRM_ERROR((TSRM_ERROR_LEVEL_ERROR, "Unable to allocate storage for resource")); | ||
| /* Undo the reservation so id_count never exceeds resource_types_table_size; | ||
| * otherwise a later allocation would expose an uninitialized table entry. */ | ||
| id_count--; | ||
| *rsrc_id = 0; | ||
| tsrm_mutex_unlock(tsmm_mutex); | ||
| return 0; | ||
| } | ||
| resource_types_table = _tmp; | ||
| resource_types_table_size = id_count; | ||
| } | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].size = size; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor; | ||
| /* fast_offset stays 0: the storage is not part of the reserved per-thread area. */ | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = 0; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = tls_addr; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0; | ||
|
|
||
| tsrm_update_active_threads(); | ||
| tsrm_mutex_unlock(tsmm_mutex); | ||
|
|
||
| TSRM_ERROR((TSRM_ERROR_LEVEL_CORE, "Successfully allocated new TLS resource id %d", *rsrc_id)); | ||
| return *rsrc_id; | ||
| }/*}}}*/ | ||
|
|
||
| static void set_thread_local_storage_resource_to(tsrm_tls_entry *thread_resource) | ||
| { | ||
| tsrm_tls_set(thread_resource); | ||
|
|
@@ -397,7 +444,9 @@ static void allocate_new_resource(tsrm_tls_entry **thread_resources_ptr, THREAD_ | |
| if (resource_types_table[i].done) { | ||
| (*thread_resources_ptr)->storage[i] = NULL; | ||
| } else { | ||
| if (resource_types_table[i].fast_offset) { | ||
| if (resource_types_table[i].tls_addr) { | ||
| (*thread_resources_ptr)->storage[i] = resource_types_table[i].tls_addr(); | ||
| } else if (resource_types_table[i].fast_offset) { | ||
| (*thread_resources_ptr)->storage[i] = (void *) (((char*)(*thread_resources_ptr)) + resource_types_table[i].fast_offset); | ||
| } else { | ||
| (*thread_resources_ptr)->storage[i] = (void *) malloc(resource_types_table[i].size); | ||
|
|
@@ -485,7 +534,8 @@ TSRM_API void *ts_resource_ex(ts_rsrc_id id, THREAD_T *th_id) | |
| /* In case that extensions don't use the pointer passed from the dtor, but incorrectly | ||
| * use the global pointer, we need to setup the global pointer temporarily here. */ | ||
| set_thread_local_storage_resource_to(thread_resources); | ||
| /* Free up the old resource from the old thread instance */ | ||
| /* Dead thread, recycled id: already freed, so just zero it. */ | ||
| thread_resources->thread_id = 0; | ||
| ts_free_resources(thread_resources); | ||
| free(thread_resources); | ||
| /* Allocate a new resource at the same point in the linked list, and relink the next pointer */ | ||
|
|
@@ -559,7 +609,7 @@ void ts_free_id(ts_rsrc_id id) | |
| if (resource_types_table[rsrc_id].dtor) { | ||
| resource_types_table[rsrc_id].dtor(p->storage[rsrc_id]); | ||
| } | ||
| if (!resource_types_table[rsrc_id].fast_offset) { | ||
| if (!resource_types_table[rsrc_id].fast_offset && !resource_types_table[rsrc_id].tls_addr) { | ||
| free(p->storage[rsrc_id]); | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -93,6 +93,8 @@ TSRM_API ts_rsrc_id ts_allocate_id(ts_rsrc_id *rsrc_id, size_t size, ts_allocate | |
| /* Fast resource in reserved (pre-allocated) space */ | ||
| TSRM_API void tsrm_reserve(size_t size); | ||
| TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor); | ||
| /* Must be called at startup before any other thread exists. */ | ||
| TSRM_API ts_rsrc_id ts_allocate_tls_id(ts_rsrc_id *rsrc_id, void *(*tls_addr)(void), size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor); | ||
|
|
||
| /* fetches the requested resource for the current thread */ | ||
| TSRM_API void *ts_resource_ex(ts_rsrc_id id, THREAD_T *th_id); | ||
|
|
@@ -155,7 +157,7 @@ TSRM_API bool tsrm_is_managed_thread(void); | |
| #if !__has_attribute(tls_model) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__MUSL__) || defined(__HAIKU__) | ||
| # define TSRM_TLS_MODEL_ATTR | ||
| # define TSRM_TLS_MODEL_DEFAULT | ||
| #elif __PIC__ | ||
| #elif __PIC__ && !defined(__PIE__) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A PIE program can use local-exec if it's the main executable. Only shared libraries (embed, extensions) need to fall back to initial-exec. This alone would already be a small speedup (one fewer instruction per access). |
||
| # define TSRM_TLS_MODEL_ATTR __attribute__((tls_model("initial-exec"))) | ||
| # define TSRM_TLS_MODEL_INITIAL_EXEC | ||
| #else | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -52,8 +52,12 @@ static bool startup_done = false; | |
| #ifdef ZTS | ||
| ZEND_API int compiler_globals_id; | ||
| ZEND_API int executor_globals_id; | ||
| ZEND_API size_t compiler_globals_offset; | ||
| ZEND_API size_t executor_globals_offset; | ||
| ZEND_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_executor_globals executor_globals_tls; | ||
| ZEND_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_compiler_globals compiler_globals_tls; | ||
|
Comment on lines
+55
to
+56
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Possibly these could be embedded in the main […]. This would simplify JIT changes, as we can access eg/cg at an offset from jit->tls. Also, the compiler may generate better code when both EG and CG are used in the same function. |
||
| /* Address callbacks passed to ts_allocate_tls_id. A function is used instead of a | ||
| * plain &..._tls because the latter would capture only the registering thread's | ||
| * address; calling the function lets each thread resolve its own block. */ | ||
| static void *executor_globals_tls_addr(void) | ||
| { | ||
| return &executor_globals_tls; | ||
| } | ||
| /* Returns the address of compiler_globals_tls as seen by the calling thread. */ | ||
| static void *compiler_globals_tls_addr(void) | ||
| { | ||
| return &compiler_globals_tls; | ||
| } | ||
| static HashTable *global_function_table = NULL; | ||
| static HashTable *global_class_table = NULL; | ||
| static HashTable *global_constants_table = NULL; | ||
|
|
@@ -1019,8 +1023,8 @@ void zend_startup(zend_utility_functions *utility_functions) /* {{{ */ | |
| zend_init_rsrc_list_dtors(); | ||
|
|
||
| #ifdef ZTS | ||
| ts_allocate_fast_id(&compiler_globals_id, &compiler_globals_offset, sizeof(zend_compiler_globals), (ts_allocate_ctor) compiler_globals_ctor, (ts_allocate_dtor) compiler_globals_dtor); | ||
| ts_allocate_fast_id(&executor_globals_id, &executor_globals_offset, sizeof(zend_executor_globals), (ts_allocate_ctor) executor_globals_ctor, (ts_allocate_dtor) executor_globals_dtor); | ||
| ts_allocate_tls_id(&compiler_globals_id, compiler_globals_tls_addr, sizeof(zend_compiler_globals), (ts_allocate_ctor) compiler_globals_ctor, (ts_allocate_dtor) compiler_globals_dtor); | ||
| ts_allocate_tls_id(&executor_globals_id, executor_globals_tls_addr, sizeof(zend_executor_globals), (ts_allocate_ctor) executor_globals_ctor, (ts_allocate_dtor) executor_globals_dtor); | ||
| ts_allocate_fast_id(&language_scanner_globals_id, &language_scanner_globals_offset, sizeof(zend_php_scanner_globals), (ts_allocate_ctor) php_scanner_globals_ctor, NULL); | ||
| ts_allocate_fast_id(&ini_scanner_globals_id, &ini_scanner_globals_offset, sizeof(zend_ini_scanner_globals), (ts_allocate_ctor) ini_scanner_globals_ctor, NULL); | ||
| compiler_globals = ts_resource(compiler_globals_id); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here it's possible that a thread ID was recycled, but the TLS still points to a now-obsolete one.