sqldistributiongreenplumskew

Greenplum distribution


There is a table with a random distribution

CREATE TABLE schema.table (
    col1 int4 NULL,
    col2 int4 NULL,
    col3 int4 NULL
)
WITH (
    appendonly=true,
    compresstype=zstd,
    orientation=column
)
DISTRIBUTED RANDOMLY; 

We need to optimally (with minimal skew) distribute rows over one field. For this we can create test tables

CREATE TABLE schema.test_table (
    col_1 int4 NULL,
    col_2 int4 NULL,
    col_3 int4 NULL
)
WITH (
    appendonly=true,
    compresstype=zstd,
    orientation=column
)
DISTRIBUTED BY (col_i); 
INSERT INTO schema.test_table SELECT * FROM schema.table;

And then check them against skew, for example via

select * from gp_toolkit.gp_skew_coefficient('schema.test_table'::regclass);

The problem is that we want to check the table for skew without creating test tables. Can this be done, and if so, how?


Solution

  • Something like this will work if you really do not want to create a new table, but I do not know of a way to model the distribution without actually (re)distributing the data.

    foo=# create table foo(a int, b int, c int);
    NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
    HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
    CREATE TABLE
    
    foo=# insert into foo values (generate_series(1,100), generate_series(101,200), generate_series(2001, 2100));
    INSERT 0 100
    
    foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
     skcoid |         skccoeff
    --------+--------------------------
      76788 | 18.460769214742921763000
    (1 row)
    
    foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
     gp_segment_id | count
    ---------------+-------
                 0 |    17
                 1 |    18
                 2 |    23
                 3 |    17
                 4 |    15
                 5 |    10
    (6 rows)
    
    
    foo=# ALTER TABLE foo SET
    foo-# WITH (REORGANIZE=true)
    foo-# DISTRIBUTED BY (a);
    ALTER TABLE
    foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
     skcoid |         skccoeff
    --------+--------------------------
      76788 | 18.460769214742921763000
    (1 row)
    
    foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
     gp_segment_id | count
    ---------------+-------
                 0 |    21
                 1 |    18
                 2 |    12
                 3 |    15
                 4 |    18
                 5 |    16
    (6 rows)
    
    foo=#
    
    foo=# ALTER TABLE foo SET
    WITH (REORGANIZE=true)
    DISTRIBUTED BY (b);
    ALTER TABLE
    foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
     skcoid |         skccoeff
    --------+--------------------------
      76788 | 27.011108825814611346000
    (1 row)
    
    foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
     gp_segment_id | count
    ---------------+-------
                 0 |    12
                 1 |    14
                 2 |    20
                 3 |    24
                 4 |    16
                 5 |    14
    (6 rows)
    
    foo=#
    
    
    foo=# ALTER TABLE foo SET
    WITH (REORGANIZE=true)
    DISTRIBUTED BY (c);
    ALTER TABLE
    foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
     skcoid |         skccoeff
    --------+--------------------------
      76788 | 30.983866769659334938000
    (1 row)
    
    foo=#
    
    foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
     gp_segment_id | count
    ---------------+-------
                 0 |    19
                 1 |    10
                 2 |    20
                 3 |    23
                 4 |    11
                 5 |    17
    (6 rows)
    
    foo=#