Tags: mysql, postgresql, query-optimization, database-performance

Why is a correlated scalar subquery 10x slower in MySQL compared to PG?


Let's create a table with 3000 rows:

create table tt(id int, txt text);

insert into tt
with recursive r(id) as
(select 1 union all select id + 1 from r where id < 3e3)
select id, concat('name', id)
from r;
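
Side note: this insert (and the 6,000,000-row one further down) recurses past MySQL's default cte_max_recursion_depth of 1000, so on MySQL the limit has to be raised first (PG has no such cap):

set session cte_max_recursion_depth = 10000000;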

The same query in both databases results in very different performance:

select sum(id), 
       sum((select count(*) 
            from tt t1 
            where t1.id = t2.id)) cnt
from tt t2

MYSQL

mysql> explain analyze
    -> select sum(id), sum((select count(*) from tt t1 where t1.id = t2.id)) cnt
    -> from tt t2\G
*************************** 1. row ***************************
EXPLAIN: -> Aggregate: sum(t2.id), sum((select #2))  (cost=602 rows=1) (actual time=7542..7542 rows=1 loops=1)
    -> Table scan on t2  (cost=302 rows=3000) (actual time=0.025..2.75 rows=3000 loops=1)
-> Select #2 (subquery in projection; dependent)
    -> Aggregate: count(0)  (cost=62.5 rows=1) (actual time=2.51..2.51 rows=1 loops=3000)
        -> Filter: (t1.id = t2.id)  (cost=32.5 rows=300) (actual time=1.25..2.51 rows=1 loops=3000)
            -> Table scan on t1  (cost=32.5 rows=3000) (actual time=0.00256..2.31 rows=3000 loops=3000)

1 row in set, 1 warning (7.54 sec)

PG

postgres=# explain analyze
postgres-# select sum(id), sum((select count(*) from tt t1 where t1.id = t2.id)) cnt
postgres-# from tt t2;
                                                   QUERY PLAN
-----------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=163599.50..163599.51 rows=1 width=40) (actual time=684.339..684.340 rows=1 loops=1)
   ->  Seq Scan on tt t2  (cost=0.00..47.00 rows=3000 width=4) (actual time=0.013..0.223 rows=3000 loops=1)
   SubPlan 1
     ->  Aggregate  (cost=54.50..54.51 rows=1 width=8) (actual time=0.227..0.227 rows=1 loops=3000)
           ->  Seq Scan on tt t1  (cost=0.00..54.50 rows=1 width=0) (actual time=0.113..0.223 rows=1 loops=3000)
                 Filter: (id = t2.id)
                 Rows Removed by Filter: 2999
 Planning Time: 0.663 ms
 Execution Time: 684.512 ms
(9 rows)

As you can see, the difference is ~7.5 seconds vs ~0.7 seconds.

So in both cases the optimizer does not unnest the subquery and executes it 3000 times.

But in MySQL one execution takes 2+ ms, while in PG it takes ~0.2 ms.
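
That per-loop gap accounts for the totals almost exactly: 3000 loops × ~2.5 ms ≈ 7.5 s in MySQL versus 3000 loops × ~0.23 ms ≈ 0.7 s in PG.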

The question is asked to understand this difference, not to make the query faster.

Obviously we could create an index or rewrite the query as an explicit join:

select sum(t1.id), sum(cnt) cnt
from tt t2
join (select id, sum(1) cnt from tt group by id) t1 on t1.id = t2.id;
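
For completeness, the index variant (not used in any of the measurements here) is a plain index on the correlation column; the index name is arbitrary and the statement works in both databases:

create index tt_id_idx on tt(id);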

PS. Both RDBMSs run with default settings.

mysql> select version();
+-----------+
| version() |
+-----------+
| 8.0.43    |
+-----------+
1 row in set (0.00 sec)

postgres=# select version();
                          version
------------------------------------------------------------
 PostgreSQL 15.1, compiled by Visual C++ build 1914, 64-bit
(1 row)

UPDATE

As requested in the comments, here is the same comparison on Ubuntu.

Still a ~10x difference.

MYSQL

mysql> explain analyze
    -> select sum(id), sum((select count(*) from tt t1 where t1.id = t2.id)) cnt
    -> from tt t2\G
*************************** 1. row ***************************
EXPLAIN: -> Aggregate: sum(t2.id), sum((select #2))  (cost=600 rows=1) (actual time=5416..5416 rows=1 loops=1)
    -> Table scan on t2  (cost=300 rows=3000) (actual time=0.0244..5.38 rows=3000 loops=1)
-> Select #2 (subquery in projection; dependent)
    -> Aggregate: count(0)  (cost=60.3 rows=1) (actual time=1.79..1.79 rows=1 loops=3000)
        -> Filter: (t1.id = t2.id)  (cost=30.3 rows=300) (actual time=0.906..1.79 rows=1 loops=3000)
            -> Table scan on t1  (cost=30.3 rows=3000) (actual time=0.00668..1.56 rows=3000 loops=3000)

1 row in set, 1 warning (5.41 sec)

PG

postgres=# explain analyze
select sum(id), sum((select count(*) from tt t1 where t1.id = t2.id)) cnt
from tt t2;
                                                   QUERY PLAN                                                    
-----------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=163599.50..163599.51 rows=1 width=40) (actual time=526.720..526.721 rows=1 loops=1)
   ->  Seq Scan on tt t2  (cost=0.00..47.00 rows=3000 width=4) (actual time=0.012..0.301 rows=3000 loops=1)
   SubPlan 1
     ->  Aggregate  (cost=54.50..54.51 rows=1 width=8) (actual time=0.175..0.175 rows=1 loops=3000)
           ->  Seq Scan on tt t1  (cost=0.00..54.50 rows=1 width=0) (actual time=0.087..0.173 rows=1 loops=3000)
                 Filter: (id = t2.id)
                 Rows Removed by Filter: 2999
 Planning Time: 0.078 ms
 Execution Time: 526.766 ms
(9 rows)

Solution

  • Well, answering my own question.

    TL;DR: InnoDB is super slow for full table scans.

    To demonstrate, let's create a bigger table and run some trivial queries.

    create table ttt(id int, txt text);
    
    insert into ttt
    with recursive r(id) as
    (select 1 union all select id + 1 from r where id < 6e6)
    select id, concat('name', id)
    from r;
    

    PG

    postgres=# select count(*) from ttt;
      count
    ---------
     6000000
    (1 row)
    
    
    Time: 454.881 ms
    postgres=# select sum(id) from ttt;
          sum
    ----------------
     18000003000000
    (1 row)
    
    
    Time: 544.896 ms
    postgres=# select sum(id), max(txt) from ttt;
          sum       |    max
    ----------------+------------
     18000003000000 | name999999
    (1 row)
    
    
    Time: 2541.881 ms (00:02.542)
    

    MYSQL

    mysql> select count(*) from ttt;
    +----------+
    | count(*) |
    +----------+
    |  6000000 |
    +----------+
    1 row in set (0.38 sec)
    
    mysql> select sum(id) from ttt;
    +----------------+
    | sum(id)        |
    +----------------+
    | 18000003000000 |
    +----------------+
    1 row in set (6.04 sec)
    
    mysql> select sum(id), max(txt) from ttt;
    +----------------+------------+
    | sum(id)        | max(txt)   |
    +----------------+------------+
    | 18000003000000 | name999999 |
    +----------------+------------+
    1 row in set (8.13 sec)
    

    Let's compare the size on disk.

    PG

    postgres=# select
    postgres-#   c.relname table_name,
    postgres-#   pg_relation_filepath(c.oid) file,
    postgres-#   pg_size_pretty(pg_total_relation_size(c.oid)) total_size
    postgres-# from pg_class c
    postgres-# where c.relname = 'ttt';
     table_name |     file     | total_size
    ------------+--------------+------------
     ttt        | base/5/98593 | 253 MB
    (1 row)
    

    MYSQL

    mysql> select
        ->   t.table_name,
        ->   f.file_name file,
        ->   round((t.data_length + t.index_length)/1024/1024, 2) total_mb
        -> from information_schema.tables as t
        -> join information_schema.files as f
        ->   on f.tablespace_name = concat(t.table_schema, '/', t.table_name)
        -> where t.table_schema = 'mydb'
        ->   and t.table_name   = 'ttt'
        ->   and t.engine = 'InnoDB'
        ->   and f.file_type = 'TABLESPACE';
    +------------+----------------+----------+
    | TABLE_NAME | file           | total_mb |
    +------------+----------------+----------+
    | ttt        | ./mydb/ttt.ibd |   255.80 |
    +------------+----------------+----------+
    

    So the on-disk size is almost the same, but the queries take 10x longer (except count(*)).

    It would be understandable if the full scan were 10% or 20% slower… or even 100% slower.

    But 1000+% slower is a bit too much! ((6.04 - 0.54) / 0.54 * 100 ≈ 1018)
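
    One quick way to sanity-check that the storage engine is the culprit is to copy the table into a different engine and rerun the scan. This is only a sketch, assuming MyISAM is available and CREATE TABLE ... SELECT is allowed by your replication settings; ttt_myisam is just a scratch name:

    -- copy the data out of InnoDB into MyISAM
    create table ttt_myisam engine=MyISAM
    as select * from ttt;

    -- rerun the aggregate against the non-InnoDB copy
    select sum(id) from ttt_myisam;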

    PS. For reference: PG is not parallelizing the scan, and the InnoDB table lives in its own tablespace file.

    postgres=# show max_parallel_workers_per_gather;
     max_parallel_workers_per_gather
    ---------------------------------
     0
    (1 row)
    
    mysql> show variables like 'innodb_file_per_table';
    +-----------------------+-------+
    | Variable_name         | Value |
    +-----------------------+-------+
    | innodb_file_per_table | ON    |
    +-----------------------+-------+
    1 row in set (0.00 sec)