From 0d22545a0f11e2cc0d4db77d037400b1edaf98e2 Mon Sep 17 00:00:00 2001 From: Edoardo Piroli Date: Thu, 26 Sep 2024 10:42:13 +0200 Subject: [PATCH] [IMP] util.explode_query_range: replace problematic `parallel_filter` formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The usage of `str.format` to inject the parallel filter used to explode queries is not robust to the presence of other curly braces. Examples: 1. `JSON` strings (typically to leverage their mapping capabilities): see 79f3d7184285e39b04a6c032aec0e05de6565fe8, where a query had to be modified to accomodate that. 2. Hardcoded sets of curly braces: ```python >>> "UPDATE t SET c = '{usage as literal characters}' WHERE {parallel_filter}".format(parallel_filter="…") Traceback (most recent call last): File "", line 1, in KeyError: 'usage as literal characters' ``` Which can be (unelegantly) solved adding even more braces, leveraging one side-effect of `.format`: ```python >>> "UPDATE t SET c = '{{usage as literal characters}}' WHERE {parallel_filter}".format(parallel_filter="…") "UPDATE t SET c = '{usage as literal characters}' WHERE …" ``` 3. Hardcoded curly braces (AFAICT no way to solve this): ```python >>> "UPDATE t SET c = 'this is an open curly brace = {' WHERE {parallel_filter}".format(parallel_filter="…") Traceback (most recent call last): File "", line 1, in ValueError: unexpected '{' in field name ``` ```python >>> "UPDATE t SET c = 'this is a close brace = }' WHERE {parallel_filter}".format(parallel_filter="…") Traceback (most recent call last): File "", line 1, in ValueError: Single '}' encountered in format string ``` --- src/base/tests/test_util.py | 28 ++++++++++++++++++++++++++++ src/util/pg.py | 23 ++++++++++++++++++----- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/src/base/tests/test_util.py b/src/base/tests/test_util.py index 0bf383b8a..dc5552c50 100644 --- a/src/base/tests/test_util.py +++ b/src/base/tests/test_util.py @@ -773,6 +773,34 @@ def _get_cr(self): self.addCleanup(cr.close) return cr + def test_explode_format_parallel_filter(self): + cr = self._get_cr() + + cr.execute("SELECT MIN(id) FROM res_users") + min_id = cr.fetchone()[0] + + q1 = "SELECT '{} {0} {x} {x:*^30d} {x.a.b} {x[0]}, {x[1]!s:*^30} {{x}}' FROM res_users" + q2 = "SELECT '{} {0} {x} {x:*^30d} {x.a.b} {x[0]}, {x[1]!s:*^30} {{x}}' FROM res_users WHERE {parallel_filter}" + expected_out = q1 + f" WHERE res_users.id BETWEEN {min_id} AND {min_id}" + + out1 = util.explode_query_range( + cr, + q1, + table="res_users", + bucket_size=1, + format=False, + )[0] + self.assertEqual(out1, expected_out) + + out2 = util.explode_query_range( + cr, + q2, + table="res_users", + bucket_size=1, + format=False, + )[0] + self.assertEqual(out2, expected_out) + def test_explode_mult_filters(self): cr = self._get_cr() queries = util.explode_query_range( diff --git a/src/util/pg.py b/src/util/pg.py index cd48a95c4..45279c71d 100644 --- a/src/util/pg.py +++ b/src/util/pg.py @@ -260,7 +260,7 @@ def explode_query(cr, query, alias=None, num_buckets=8, prefix=None): return [cr.mogrify(query, [num_buckets, index]).decode() for index in range(num_buckets)] -def explode_query_range(cr, query, table, alias=None, bucket_size=DEFAULT_BUCKET_SIZE, prefix=None): +def explode_query_range(cr, query, table, alias=None, bucket_size=DEFAULT_BUCKET_SIZE, prefix=None, format=True): """ Explode a query to multiple queries that can be executed in parallel. @@ -335,16 +335,27 @@ def explode_query_range(cr, query, table, alias=None, bucket_size=DEFAULT_BUCKET # Still, since the query may only be valid if there is no split, we force the usage of `prefix` in the query to # validate its correctness and avoid scripts that pass the CI but fail in production. parallel_filter = "{alias}.id IS NOT NULL".format(alias=alias) - return [query.format(parallel_filter=parallel_filter)] + return [ + ( + query.format(parallel_filter=parallel_filter) + if format + else query.replace("{parallel_filter}", parallel_filter) + ) + ] parallel_filter = "{alias}.id BETWEEN %(lower-bound)s AND %(upper-bound)s".format(alias=alias) - query = query.replace("%", "%%").format(parallel_filter=parallel_filter) + + query = query.replace("%", "%%") + query = ( + query.format(parallel_filter=parallel_filter) if format else query.replace("{parallel_filter}", parallel_filter) + ) + return [ cr.mogrify(query, {"lower-bound": ids[i], "upper-bound": ids[i + 1] - 1}).decode() for i in range(len(ids) - 1) ] -def explode_execute(cr, query, table, alias=None, bucket_size=DEFAULT_BUCKET_SIZE, logger=_logger): +def explode_execute(cr, query, table, alias=None, bucket_size=DEFAULT_BUCKET_SIZE, logger=_logger, format=True): """ Execute a query in parallel. @@ -376,6 +387,8 @@ def explode_execute(cr, query, table, alias=None, bucket_size=DEFAULT_BUCKET_SIZ :param int bucket_size: size of the buckets of ids to split the processing :param logger: logger used to report the progress :type logger: :class:`logging.Logger` + :param bool format: whether to use `.format` (instead of `.replace`) to replace the parallel filter, + setting it to `False` can prevent issues with hard-coded curly braces. :return: the sum of `cr.rowcount` for each query run :rtype: int @@ -387,7 +400,7 @@ def explode_execute(cr, query, table, alias=None, bucket_size=DEFAULT_BUCKET_SIZ """ return parallel_execute( cr, - explode_query_range(cr, query, table, alias=alias, bucket_size=bucket_size), + explode_query_range(cr, query, table, alias=alias, bucket_size=bucket_size, format=format), logger=logger, )