Skip to content

Commit eba2336

Browse files
authored
[mypyc] Speed up generator allocation by using a per-type freelist (#19316)
Add support for per-type free "lists" that can cache up to one instance for quick allocation. If a free list is empty, fall back to regular object allocation. The per-type free list can be enabled for a class by setting a flag in ClassIR. Currently there is no way for users to control this, so free lists must be enabled based on heuristics. Use this free list for generator objects and coroutines, since they are often short-lived. Use a thread-local variable for the free list so that each thread in a free-threaded build has a separate free list. This way we need less synchronization, and the free list hit rate is higher for multithreaded workloads. This speeds up a microbenchmark that performs non-blocking calls of async functions in a loop by about 20%. The impact will become significantly bigger after some follow-up optimizations that I'm working on. This trades memory use for performance, which is often a good trade. It could use a lot of memory if many threads are calling async functions, but async functions are generally run on a single thread, so this case seems unlikely right now. Also, in my experience with large code bases, only a small fraction of functions are async functions or generators, so the overall memory use impact shouldn't be too bad. We can later look into making this profile-guided, so that only functions that are called frequently get the free list. We could also add a compile-time flag that optimizes for memory use and turns this off.
1 parent cbe28b2 commit eba2336

File tree

7 files changed

+281
-14
lines changed

7 files changed

+281
-14
lines changed

mypyc/codegen/emit.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1115,6 +1115,31 @@ def emit_gc_clear(self, target: str, rtype: RType) -> None:
11151115
else:
11161116
assert False, "emit_gc_clear() not implemented for %s" % repr(rtype)
11171117

1118+
def emit_reuse_clear(self, target: str, rtype: RType) -> None:
    """Emit C code that resets one attribute before its object enters the freelist.

    'target' is assumed to be a C expression that names a struct member,
    for example 'self->x'.

    In contrast to emit_gc_clear(), the attribute ends up holding the value
    a freshly allocated object would have.
    """
    if isinstance(rtype, RTuple):
        # Reset every tuple item through its 'f<i>' member.
        for index, field_type in enumerate(rtype.types):
            self.emit_reuse_clear(f"{target}.f{index}", field_type)
        return

    if not rtype.is_refcounted:
        # Nothing to release; just store the undefined marker.
        self.emit_line(f"{target} = {rtype.c_undefined};")
        return

    if not (isinstance(rtype, RPrimitive) and rtype.name == "builtins.int"):
        # All other reference counted types are delegated to emit_gc_clear().
        self.emit_gc_clear(target, rtype)
        return

    # builtins.int: the slot is reset in both branches; when
    # CPyTagged_CheckLong() holds, the old value's object is released as well.
    for c_line in (
        f"if (CPyTagged_CheckLong({target})) {{",
        f"CPyTagged __tmp = {target};",
        f"{target} = {self.c_undefined_value(rtype)};",
        "Py_XDECREF(CPyTagged_LongAsObject(__tmp));",
        "} else {",
        f"{target} = {self.c_undefined_value(rtype)};",
        "}",
    ):
        self.emit_line(c_line)
1142+
11181143
def emit_traceback(
11191144
self, source_path: str, module_name: str, traceback_entry: tuple[str, int]
11201145
) -> None:

mypyc/codegen/emitclass.py

Lines changed: 86 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,29 @@ def generate_class_type_decl(
186186
)
187187

188188

189+
def generate_class_reuse(
    cl: ClassIR, c_emitter: Emitter, external_emitter: Emitter, emitter: Emitter
) -> None:
    """Declare the single-object, per-class free "list" variable for 'cl'.

    Registers an exported header declaration of a CPyThreadLocal-qualified
    pointer named '<class prefix>_free_instance' in the context of 'c_emitter'.
    Caching one freed instance makes allocation and freeing cheaper when
    there are many short-lived objects.

    'external_emitter' and 'emitter' are not used by this function.

    TODO: Generalize to support a free list with up to N objects.
    """
    assert cl.reuse_freed_instance

    # Class hierarchies are not supported by the free list implementation.
    assert cl.is_final_class or cl.children == []

    names = c_emitter.names
    var_name = f"{cl.name_prefix(names)}_free_instance"
    c_decl = f"CPyThreadLocal {cl.struct_name(names)} *{var_name};"
    c_emitter.context.declarations[var_name] = HeaderDeclaration(c_decl, needs_export=True)
210+
211+
189212
def generate_class(cl: ClassIR, module: str, emitter: Emitter) -> None:
190213
"""Generate C code for a class.
191214
@@ -557,7 +580,22 @@ def generate_setup_for_class(
557580
emitter.emit_line("static PyObject *")
558581
emitter.emit_line(f"{func_name}(PyTypeObject *type)")
559582
emitter.emit_line("{")
560-
emitter.emit_line(f"{cl.struct_name(emitter.names)} *self;")
583+
struct_name = cl.struct_name(emitter.names)
584+
emitter.emit_line(f"{struct_name} *self;")
585+
586+
prefix = cl.name_prefix(emitter.names)
587+
if cl.reuse_freed_instance:
588+
# Attempt to use a per-type free list first (a free "list" with up to one object only).
589+
emitter.emit_line(f"if ({prefix}_free_instance != NULL) {{")
590+
emitter.emit_line(f"self = {prefix}_free_instance;")
591+
emitter.emit_line(f"{prefix}_free_instance = NULL;")
592+
emitter.emit_line("Py_SET_REFCNT(self, 1);")
593+
emitter.emit_line("PyObject_GC_Track(self);")
594+
if defaults_fn is not None:
595+
emit_attr_defaults_func_call(defaults_fn, "self", emitter)
596+
emitter.emit_line("return (PyObject *)self;")
597+
emitter.emit_line("}")
598+
561599
emitter.emit_line(f"self = ({cl.struct_name(emitter.names)} *)type->tp_alloc(type, 0);")
562600
emitter.emit_line("if (self == NULL)")
563601
emitter.emit_line(" return NULL;")
@@ -571,9 +609,7 @@ def generate_setup_for_class(
571609
else:
572610
emitter.emit_line(f"self->vtable = {vtable_name};")
573611

574-
for i in range(0, len(cl.bitmap_attrs), BITMAP_BITS):
575-
field = emitter.bitmap_field(i)
576-
emitter.emit_line(f"self->{field} = 0;")
612+
emit_clear_bitmaps(cl, emitter)
577613

578614
if cl.has_method("__call__"):
579615
name = cl.method_decl("__call__").cname(emitter.names)
@@ -590,19 +626,34 @@ def generate_setup_for_class(
590626

591627
# Initialize attributes to default values, if necessary
592628
if defaults_fn is not None:
593-
emitter.emit_lines(
594-
"if ({}{}((PyObject *)self) == 0) {{".format(
595-
NATIVE_PREFIX, defaults_fn.cname(emitter.names)
596-
),
597-
"Py_DECREF(self);",
598-
"return NULL;",
599-
"}",
600-
)
629+
emit_attr_defaults_func_call(defaults_fn, "self", emitter)
601630

602631
emitter.emit_line("return (PyObject *)self;")
603632
emitter.emit_line("}")
604633

605634

635+
def emit_clear_bitmaps(cl: ClassIR, emitter: Emitter) -> None:
    """Emit C code that zeroes the bitmaps recording which attributes are assigned.

    One 'self-><field> = 0;' statement is produced per bitmap field, i.e. for
    every BITMAP_BITS entries of cl.bitmap_attrs.
    """
    for first_bit in range(0, len(cl.bitmap_attrs), BITMAP_BITS):
        emitter.emit_line(f"self->{emitter.bitmap_field(first_bit)} = 0;")
640+
641+
642+
def emit_attr_defaults_func_call(defaults_fn: FuncIR, self_name: str, emitter: Emitter) -> None:
    """Emit C code to initialize attribute defaults by calling defaults_fn.

    Args:
        defaults_fn: Function that assigns the attribute defaults. The
            emitted code treats a result of 0 as failure.
        self_name: C expression for the object being initialized.
        emitter: Emitter that receives the generated lines.

    On failure the emitted code releases the object named by 'self_name'
    and returns NULL.
    """
    init_call = f"{NATIVE_PREFIX}{defaults_fn.cname(emitter.names)}((PyObject *){self_name})"
    emitter.emit_lines(
        f"if ({init_call} == 0) {{",
        # Release the same object that was passed to the defaults function;
        # this used to be hard-coded as 'self' regardless of 'self_name'.
        f"Py_DECREF({self_name});",
        "return NULL;",
        "}",
    )
655+
656+
606657
def generate_constructor_for_class(
607658
cl: ClassIR,
608659
fn: FuncDecl,
@@ -787,6 +838,8 @@ def generate_dealloc_for_class(
787838
emitter.emit_line("Py_TYPE(self)->tp_finalize((PyObject *)self);")
788839
emitter.emit_line("}")
789840
emitter.emit_line("PyObject_GC_UnTrack(self);")
841+
if cl.reuse_freed_instance:
842+
emit_reuse_dealloc(cl, emitter)
790843
# The trashcan is needed to handle deep recursive deallocations
791844
emitter.emit_line(f"CPy_TRASHCAN_BEGIN(self, {dealloc_func_name})")
792845
emitter.emit_line(f"{clear_func_name}(self);")
@@ -795,6 +848,27 @@ def generate_dealloc_for_class(
795848
emitter.emit_line("}")
796849

797850

851+
def emit_reuse_dealloc(cl: ClassIR, emitter: Emitter) -> None:
    """Emit the dealloc fast path that parks the object in the per-type free list.

    The generated C code only runs when the free "list" (which holds at most
    one object) is empty: it stores 'self' there, resets the attribute
    bitmaps and every attribute, and returns.
    """
    free_slot = f"{cl.name_prefix(emitter.names)}_free_instance"
    emitter.emit_line(f"if ({free_slot} == NULL) {{")
    emitter.emit_line(f"{free_slot} = self;")

    # Reset attribute state and release referenced objects.
    emit_clear_bitmaps(cl, emitter)

    for klass in reversed(cl.base_mro):
        for attr_name, attr_type in klass.attributes.items():
            member = f"self->{emitter.attr(attr_name)}"
            emitter.emit_reuse_clear(member, attr_type)

    emitter.emit_line("return;")
    emitter.emit_line("}")
870+
871+
798872
def generate_finalize_for_class(
799873
del_method: FuncIR, finalize_func_name: str, emitter: Emitter
800874
) -> None:

mypyc/codegen/emitmodule.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from mypy.util import hash_digest, json_dumps
3030
from mypyc.codegen.cstring import c_string_initializer
3131
from mypyc.codegen.emit import Emitter, EmitterContext, HeaderDeclaration, c_array_initializer
32-
from mypyc.codegen.emitclass import generate_class, generate_class_type_decl
32+
from mypyc.codegen.emitclass import generate_class, generate_class_reuse, generate_class_type_decl
3333
from mypyc.codegen.emitfunc import generate_native_function, native_function_header
3434
from mypyc.codegen.emitwrapper import (
3535
generate_legacy_wrapper_function,
@@ -609,6 +609,8 @@ def generate_c_for_modules(self) -> list[tuple[str, str]]:
609609
self.declare_finals(module_name, module.final_names, declarations)
610610
for cl in module.classes:
611611
generate_class_type_decl(cl, emitter, ext_declarations, declarations)
612+
if cl.reuse_freed_instance:
613+
generate_class_reuse(cl, emitter, ext_declarations, declarations)
612614
self.declare_type_vars(module_name, module.type_var_names, declarations)
613615
for fn in module.functions:
614616
generate_function_declaration(fn, declarations)

mypyc/ir/class_ir.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,12 @@ def __init__(
204204
# If this is a generator environment class, what is the actual method for it
205205
self.env_user_function: FuncIR | None = None
206206

207+
# If True, keep one freed, cleared instance available for immediate reuse to
208+
# speed up allocations. This helps if many objects are freed quickly, before
209+
# other instances of the same class are allocated. This is effectively a
210+
# per-type free "list" of up to length 1.
211+
self.reuse_freed_instance = False
212+
207213
def __repr__(self) -> str:
208214
return (
209215
"ClassIR("
@@ -403,6 +409,7 @@ def serialize(self) -> JsonDict:
403409
"_sometimes_initialized_attrs": sorted(self._sometimes_initialized_attrs),
404410
"init_self_leak": self.init_self_leak,
405411
"env_user_function": self.env_user_function.id if self.env_user_function else None,
412+
"reuse_freed_instance": self.reuse_freed_instance,
406413
}
407414

408415
@classmethod
@@ -458,6 +465,7 @@ def deserialize(cls, data: JsonDict, ctx: DeserMaps) -> ClassIR:
458465
ir.env_user_function = (
459466
ctx.functions[data["env_user_function"]] if data["env_user_function"] else None
460467
)
468+
ir.reuse_freed_instance = data["reuse_freed_instance"]
461469

462470
return ir
463471

mypyc/irbuild/generator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ def setup_generator_class(builder: IRBuilder) -> ClassIR:
156156
name = f"{builder.fn_info.namespaced_name()}_gen"
157157

158158
generator_class_ir = ClassIR(name, builder.module_name, is_generated=True, is_final_class=True)
159+
generator_class_ir.reuse_freed_instance = True
159160
if builder.fn_info.can_merge_generator_and_env_classes():
160161
builder.fn_info.env_class = generator_class_ir
161162
else:

mypyc/lib-rt/mypyc_util.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,31 @@
2323
#define CPy_NOINLINE
2424
#endif
2525

26+
// CPyThreadLocal: qualifier placed in front of global variable declarations
// that should get thread-local storage when Py_GIL_DISABLED is defined.
#ifndef Py_GIL_DISABLED
27+
28+
// Everything is running in the same thread, so no need for thread locals
// (CPyThreadLocal expands to nothing here, so such variables are plain globals)
29+
#define CPyThreadLocal
30+
31+
#else
32+
33+
// The checks below are tried in order; the first one that matches wins.
// 1. Use C11 standard thread_local storage, if available
34+
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
35+
#define CPyThreadLocal _Thread_local
36+
37+
// 2. Microsoft Visual Studio fallback
38+
#elif defined(_MSC_VER)
39+
#define CPyThreadLocal __declspec(thread)
40+
41+
// 3. GNU thread local storage for GCC/Clang targets that still need it
42+
#elif defined(__GNUC__) || defined(__clang__)
43+
#define CPyThreadLocal __thread
44+
45+
// 4. No known thread-local keyword: stop the build rather than guess
#else
46+
#error "Can't define CPyThreadLocal for this compiler/target (consider using a non-free-threaded Python build)"
47+
#endif
48+
49+
#endif // Py_GIL_DISABLED
50+
2651
// INCREF and DECREF that assert the pointer is not NULL.
2752
// asserts are disabled in release builds so there shouldn't be a perf hit.
2853
// I'm honestly kind of surprised that this isn't done by default.

0 commit comments

Comments
 (0)