advisor: fix included column order in suggested indexes

If an index optimizes multiple quals, the previous code wasn't deterministic and could append the columns from the included quals in any order, which could lead to a non-optimal index suggestion. To fix, the included quals are now emitted in ascending order of the number of columns they contain, so there's a guarantee that the first columns are in the correct order. The result is still not fully deterministic in case of a single qual having multiple columns not included in a child qual, but this doesn't really matter as the order is not significant there (at least from the advisor's point of view). Thanks to Disqus user Sivan for the report.
powa-team · Aug 5, 2024 · e5c4ce7 · e5c4ce7
1 parent 97ea8bb
commit e5c4ce7
Show file tree

Hide file tree

Showing 2 changed files with 286 additions and 3 deletions.
diff --git a/pg_qualstats--2.1.0--2.1.1.sql b/pg_qualstats--2.1.0--2.1.1.sql
@@ -1,2 +1,282 @@
 -- complain if script is sourced in psql, rather than via CREATE EXTENSION
 \echo Use "ALTER EXTENSION pg_qualstats UPDATE" to load this file. \quit
+
+CREATE OR REPLACE FUNCTION @extschema@.pg_qualstats_index_advisor (
+    min_filter integer DEFAULT 1000,      -- lower bound applied to avg_filter
+    min_selectivity integer DEFAULT 30,   -- lower bound applied to avg_selectivity (percent)
+    forbidden_am text[] DEFAULT '{}')     -- access method names to exclude
+    RETURNS json                          -- {"indexes": [...], "unoptimised": [...]}
+AS $_$
+DECLARE
+    v_processed bigint[] = '{}';    -- qualnodeids to ignore in later rounds
+    v_indexes json[] = '{}';        -- one {"ddl", "queryids"} object per index
+    v_unoptimised json[] = '{}';    -- quals whose operator matches no access method
+
+    rec record;
+    v_nb_processed integer = 1;     -- paths handled in the last round; 0 ends the loop
+
+    v_ddl text;
+    v_col text;
+    v_qualnodeid bigint;
+    v_quals_todo bigint[];
+    v_quals_done bigint[];
+    v_quals_col_done text[];
+    v_queryids bigint[] = '{}';
+BEGIN
+    -- sanity checks and default values
+    SELECT coalesce(min_filter, 1000), coalesce(min_selectivity, 30),
+      coalesce(forbidden_am, '{}')
+    INTO min_filter, min_selectivity, forbidden_am;
+
+    -- don't try to generate hash indexes before pg 10, as those are only WAL
+    -- logged since pg 10.
+    IF pg_catalog.current_setting('server_version_num')::bigint < 100000 THEN
+        forbidden_am := array_append(forbidden_am, 'hash');
+    END IF;
+
+    -- first find out unoptimizable quals.
+    -- We need an array of json containing the per-qual info, and a single
+    -- array containing all the underlying qualnodeids, so we need to create
+    -- the wanted final object manually as we can't have two different grouping
+    -- approaches.
+    FOR rec IN WITH src AS (SELECT DISTINCT qualnodeid,
+        (coalesce(lrelid, rrelid), coalesce(lattnum, rattnum),
+          opno, eval_type)::@extschema@.qual AS qual,
+          queryid
+      FROM @extschema@.pg_qualstats() q
+      JOIN pg_catalog.pg_database d ON q.dbid = d.oid
+      LEFT JOIN pg_catalog.pg_operator op ON op.oid = q.opno
+      LEFT JOIN pg_catalog.pg_amop amop ON amop.amopopr = op.oid
+      LEFT JOIN pg_catalog.pg_am am ON am.oid = amop.amopmethod
+      WHERE d.datname = current_database()
+       AND eval_type = 'f'
+       AND coalesce(lrelid, rrelid) != 0
+       AND amname IS NULL
+    )
+    SELECT pg_catalog.json_build_object(
+            'qual', @extschema@.pg_qualstats_deparse_qual(qual),
+            -- be careful to generate an empty array if no queryid available
+            'queryids',
+            coalesce(pg_catalog.array_agg(DISTINCT queryid)
+                FILTER (WHERE queryid IS NOT NULL), '{}')
+        ) AS obj,
+        array_agg(qualnodeid) AS qualnodeids
+    FROM src
+    GROUP BY qual
+    LOOP
+        v_unoptimised := array_append(v_unoptimised, rec.obj);
+        v_processed := array_cat(v_processed, rec.qualnodeids);
+    END LOOP;
+
+    -- The index suggestion is done in multiple iterations, by scoring for each
+    -- relation containing interesting quals a path of possibly AND-ed quals
+    -- that contains other possibly AND-ed quals.  Only the highest-scoring path
+    -- will be used to create an index, so we can then compute another set of
+    -- paths ignoring the quals that are now optimized with an index.
+    WHILE v_nb_processed > 0 LOOP
+      v_nb_processed := 0;
+      FOR rec IN
+        -- first, find quals that seem worth optimizing along with the
+        -- possible access methods, discarding any qualnode that is marked as
+        -- already processed.  Also apply access method restriction.
+        WITH pgqs AS (
+          SELECT dbid, amname, qualid, qualnodeid,
+            (coalesce(lrelid, rrelid), coalesce(lattnum, rattnum),
+            opno, eval_type)::@extschema@.qual AS qual, queryid,
+            round(avg(execution_count)) AS execution_count,
+            sum(occurences) AS occurences,
+            round(sum(nbfiltered)::numeric / sum(occurences)) AS avg_filter,
+            CASE WHEN sum(execution_count) = 0
+              THEN 0
+              ELSE round(sum(nbfiltered::numeric) / sum(execution_count) * 100)
+            END AS avg_selectivity
+          FROM @extschema@.pg_qualstats() q
+          JOIN pg_catalog.pg_database d ON q.dbid = d.oid
+          JOIN pg_catalog.pg_operator op ON op.oid = q.opno
+          JOIN pg_catalog.pg_amop amop ON amop.amopopr = op.oid
+          JOIN pg_catalog.pg_am am ON am.oid = amop.amopmethod
+          WHERE d.datname = current_database()
+          AND eval_type = 'f'
+          AND amname != ALL (forbidden_am)
+          AND coalesce(lrelid, rrelid) != 0
+          AND qualnodeid != ALL(v_processed)
+          GROUP BY dbid, amname, qualid, qualnodeid, lrelid, rrelid,
+            lattnum, rattnum, opno, eval_type, queryid
+        ),
+        -- apply cardinality and selectivity restrictions
+        filtered AS (
+          SELECT (qual).relid, amname, coalesce(qualid, qualnodeid) AS parent,
+            count(*) AS weight,
+            (array_agg(DISTINCT qualnodeid),
+             array_agg(queryid)
+            )::@extschema@.adv_quals AS quals
+          FROM pgqs
+          WHERE avg_filter >= min_filter
+          AND avg_selectivity >= min_selectivity
+          GROUP BY (qual).relid, amname, parent
+        ),
+        -- for each possibly AND-ed qual, build the list of included qualnodeid
+        nodes AS (
+          SELECT p.relid, p.amname, p.parent, p.quals,
+            c.quals AS children
+          FROM filtered p
+          LEFT JOIN filtered c ON (p.quals).qualnodeids @> (c.quals).qualnodeids
+            AND p.amname = c.amname
+            AND p.parent != c.parent
+            AND (p.quals).qualnodeids != (c.quals).qualnodeids
+        ),
+        -- build the "paths", which are the lists of AND-ed quals that entirely
+        -- contain other possibly AND-ed quals, and give a score to each
+        -- path.  The scoring method used here is simply the number of
+        -- qualnodeids in the contained (children) quals.
+        paths AS (
+          SELECT DISTINCT *,
+            coalesce(pg_catalog.array_length((children).qualnodeids, 1),
+                     0) AS weight
+          FROM nodes
+          UNION
+          SELECT DISTINCT p.relid, p.amname, p.parent, p.quals, c.children,
+            coalesce(pg_catalog.array_length((c.children).qualnodeids, 1),
+                     0) AS weight
+          FROM nodes p
+          JOIN nodes c ON (p.children).qualnodeids @> (c.quals).qualnodeids
+            AND (c.quals).qualnodeids IS NOT NULL
+            AND (c.quals).qualnodeids != (p.quals).qualnodeids
+            AND p.amname = c.amname
+        ),
+        -- compute the final paths.
+        -- The scoring method used here is simply the sum of the total
+        -- number of columns in each possibly AND-ed qual, so that we can
+        -- later choose to create indexes that optimize as many queries as
+        -- possible with as few indexes as possible.
+        -- We also compute here an access method weight, so that we can later
+        -- choose a btree index rather than another access method if btree is
+        -- available.
+        computed AS (
+          SELECT relid, amname, parent, quals,
+            array_agg(to_json(children) ORDER BY weight)
+              FILTER (WHERE children IS NOT NULL) AS included,
+            pg_catalog.array_length((quals).qualnodeids, 1)
+                + sum(weight) AS path_weight,
+          CASE amname WHEN 'btree' THEN 1 ELSE 2 END AS amweight
+          FROM paths
+          GROUP BY relid, amname, parent, quals
+        ),
+        -- compute a rank for each final path, per relation.
+        final AS (
+          SELECT relid, amname, parent, quals, included, path_weight, amweight,
+          row_number() OVER (
+            PARTITION BY relid
+            ORDER BY path_weight DESC, amweight) AS rownum
+          FROM computed
+        )
+        -- and finally choose the best-ranked final path for each relation.
+        SELECT relid, amname, parent,
+            (quals).qualnodeids as quals, (quals).queryids as queryids,
+            included, path_weight
+        FROM final
+        WHERE rownum = 1
+      LOOP
+        v_nb_processed := v_nb_processed + 1;
+
+        v_ddl := '';
+        v_quals_todo := '{}';
+        v_quals_done := '{}';
+        v_quals_col_done := '{}';
+
+        -- put columns from included quals, if any, first (smallest qual first)
+        DECLARE
+            v_cur json;
+        BEGIN
+            IF rec.included IS NOT NULL THEN
+              FOR v_cur IN SELECT v->'qualnodeids'
+                    FROM (SELECT * FROM unnest(rec.included)) AS r(v)
+                    ORDER BY pg_catalog.json_array_length(v->'qualnodeids') ASC
+              LOOP
+                -- Direct cast from json to bigint is only possible since pg10
+                FOR v_qualnodeid IN
+                    SELECT pg_catalog.json_array_elements(v_cur)::text::bigint
+                LOOP
+                  v_quals_todo := v_quals_todo || v_qualnodeid;
+                END LOOP;
+              END LOOP;
+            END IF;
+        END;
+
+        -- and append qual's own columns
+        v_quals_todo := v_quals_todo || rec.quals;
+
+        -- generate the index DDL
+        FOREACH v_qualnodeid IN ARRAY v_quals_todo LOOP
+          -- skip quals already present in the index
+          CONTINUE WHEN v_quals_done @> ARRAY[v_qualnodeid];
+
+          -- skip other quals for the same column
+          v_col := @extschema@.pg_qualstats_get_idx_col(v_qualnodeid, false);
+          CONTINUE WHEN v_quals_col_done @> ARRAY[v_col];
+
+          -- mark this qual as present in a generated index so it's ignored at
+          -- the next round of best quals to optimize
+          v_processed := pg_catalog.array_append(v_processed, v_qualnodeid);
+
+          -- mark this qual and col as present in this index
+          v_quals_done := v_quals_done || v_qualnodeid;
+          v_quals_col_done := v_quals_col_done || v_col;
+
+          -- if underlying table has been dropped, skip this column
+          CONTINUE WHEN coalesce(v_col, '') = '';
+
+          -- append the column to the index
+          IF v_ddl != '' THEN v_ddl := v_ddl || ', '; END IF;
+          v_ddl := v_ddl || @extschema@.pg_qualstats_get_idx_col(v_qualnodeid, true);
+        END LOOP;
+
+        -- if underlying table has been dropped, skip this (broken) index
+        CONTINUE WHEN coalesce(v_ddl, '') = '';
+
+        -- generate the full CREATE INDEX ddl (v_qualnodeid: last qual visited)
+        v_ddl = pg_catalog.format('CREATE INDEX ON %s USING %I (%s)',
+          @extschema@.pg_qualstats_get_qualnode_rel(v_qualnodeid), rec.amname, v_ddl);
+
+        -- get the underlying queryid(s)
+        DECLARE
+            v_queryid text;
+            v_cur json;
+        BEGIN
+            v_queryids = rec.queryids;
+            IF rec.included IS NOT NULL THEN
+              FOREACH v_cur IN ARRAY rec.included LOOP
+                -- Direct cast from json to bigint is only possible since pg10
+                FOR v_queryid IN SELECT pg_catalog.json_array_elements(v_cur->'queryids')::text
+                LOOP
+                  CONTINUE WHEN v_queryid = 'null';
+                  v_queryids := v_queryids || v_queryid::text::bigint;
+                END LOOP;
+              END LOOP;
+            END IF;
+        END;
+
+        -- remove any duplicates
+        SELECT pg_catalog.array_agg(DISTINCT v) INTO v_queryids
+            FROM (SELECT unnest(v_queryids)) s(v);
+
+        -- sanitize the queryids
+        IF v_queryids IS NULL OR v_queryids = '{null}' THEN
+            v_queryids = '{}';
+        END IF;
+
+        -- and finally append the index to the list of generated indexes
+        v_indexes := pg_catalog.array_append(v_indexes,
+            pg_catalog.json_build_object(
+                'ddl', v_ddl,
+                'queryids', v_queryids
+            )
+        );
+      END LOOP;
+    END LOOP;
+
+    RETURN pg_catalog.json_build_object(
+        'indexes', v_indexes,
+        'unoptimised', v_unoptimised);
+END;
+$_$ LANGUAGE plpgsql;       /* end of pg_qualstats_index_advisor */
diff --git a/pg_qualstats--2.1.1.sql b/pg_qualstats--2.1.1.sql
@@ -546,7 +546,7 @@ BEGIN
         filtered AS (
           SELECT (qual).relid, amname, coalesce(qualid, qualnodeid) AS parent,
             count(*) AS weight,
-            (array_agg(qualnodeid),
+            (array_agg(DISTINCT qualnodeid),
              array_agg(queryid)
             )::@[email protected]_quals AS quals
           FROM pgqs
@@ -628,10 +628,13 @@ BEGIN
             v_cur json;
         BEGIN
             IF rec.included IS NOT NULL THEN
-              FOREACH v_cur IN ARRAY rec.included LOOP
+              FOR v_cur IN SELECT v->'qualnodeids'
+                    FROM (SELECT * FROM unnest(rec.included)) AS r(v)
+                    ORDER BY pg_catalog.json_array_length(v->'qualnodeids') ASC
+              LOOP
                 -- Direct cast from json to bigint is only possible since pg10
                 FOR v_qualnodeid IN
-                    SELECT pg_catalog.json_array_elements(v_cur->'qualnodeids')::text::bigint
+                    SELECT pg_catalog.json_array_elements(v_cur)::text::bigint
                 LOOP
                   v_quals_todo := v_quals_todo || v_qualnodeid;
                 END LOOP;