There are transactions
table and logs
table. logs
are linked to transactions
by transaction_id
. I need to query logs
by address
, join it with transactions
, aggregate logs to array, LIMIT transactions (example is LIMIT 2
) and FETCH ALL LOGS that were in that transaction (but query only by one address
field). transactions.hash
is varchar
.
create table transactions
(hash varchar,
block_number integer,
t_value varchar
);
insert into transactions values
('h1',120,'v1'),
('h2',170,'v2'),
('h3',130,'v3'),
('h4',160,'v4'),
('h5',142,'v5')
;
create table logs
(transaction_hash varchar,
address varchar,
l_value varchar,
l_trans_block_number integer -- this value duplicates from transactions for any case
);
insert into logs values
('h1', 'a1', 'h1.a1.1', 120),
('h1', 'a1', 'h1.a1.2', 120),
('h1', 'a3', 'h1.a3.1', 120),
('h2', 'a1', 'h2.a1.1', 170),
('h2', 'a2', 'h2.a2.1', 170),
('h2', 'a2', 'h2.a2.2', 170),
('h2', 'a3', 'h2.a3.1', 170),
('h3', 'a2', 'h3.a2.1', 130),
('h4', 'a1', 'h4.a1.1', 160),
('h5', 'a2', 'h5.a2.1', 142),
('h5', 'a3', 'h5.a3.1', 142)
;
create index on transaction(hash);
create index on transaction(block_number);
create index on logs(address);
create index on logs(l_trans_block_number);
Result must be with query WHERE log.address='a2' LIMIT 2
:
hash block_number t_value logs_array
h2 170 v2 {"{"address" : "a1", "l_value" : "h2.a1.1", "l_trans_block_number" : 170}","{"address" : "a2", "l_value" : "h2.a2.1", "l_trans_block_number" : 170}","{"address" : "a2", "l_value" : "h2.a2.2", "l_trans_block_number" : 170}","{"address" : "a3", "l_value" : "h2.a3.1", "l_trans_block_number" : 170}"}
h5 142 v5 {"{"address" : "a2", "l_value" : "h5.a2.1", "l_trans_block_number" : 142}","{"address" : "a3", "l_value" : "h5.a3.1", "l_trans_block_number" : 142}"}
Problem: sql query below works correct, but on very high amount of logs (100k+ logs for 1 address) it can take many minutes for search. The solution would be set LIMIT
in MATERIALIZED
, but in that case I can get transactions with not fully correct list of logs. How to fix? Either rewrite query without MATERIALIZED
and use multiple SELECT
inside each other, but I don't know how, or fix with MATERIALIZED
.
So problem is that Postgres does not understand correct in MATERIALIZED
that I need limited number of transactions, it first searches all logs, only then append them to transactions with limit (as I guess). Index on logs(address)
is set.
WITH
b AS MATERIALIZED (
SELECT lg.transaction_hash
FROM logs lg
WHERE lg.address='a2'
-- this must be commented, otherwise not correct results, although fast execution
-- ORDER BY lg.l_trans_block_number DESC
-- LIMIT 2
)
SELECT
hash,
block_number,
t_value,
(SELECT array_agg(JSON_BUILD_OBJECT('address',address,'l_value',l_value,'l_trans_block_number',l_trans_block_number)) FROM logs WHERE transaction_hash = t.hash) logs_array
FROM transactions t
WHERE t.hash IN
(SELECT transaction_hash FROM b)
ORDER BY t.block_number DESC -- must be ordered by block_number
LIMIT 2
Real-world example, query was executing ~30 seconds (in db I have transaction_id
integer, but it's not increasing primary key):
EXPLAIN WITH
b AS MATERIALIZED (
SELECT lg.transaction_id
FROM _logs lg
WHERE lg.address in ('0xca530408c3e552b020a2300debc7bd18820fb42f', '0x68e78497a7b0db7718ccc833c164a18d8e626816')
)
SELECT
(SELECT array_agg(JSON_BUILD_OBJECT('address',address)) FROM _logs WHERE transaction_id = t.id) logs_array
FROM _transactions t
WHERE t.id IN
(SELECT transaction_id FROM b)
LIMIT 5000;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------
Limit (cost=87540.62..3180266.26 rows=5000 width=32)
CTE b
-> Index Scan using _logs_address_idx on _logs lg (cost=0.70..85820.98 rows=76403 width=8)
Index Cond: ((address)::text = ANY ('{0xca530408c3e552b020a2300debc7bd18820fb42f,0x68e78497a7b0db7718ccc833c164a18d8e626816}'::text[]))
-> Nested Loop (cost=1719.64..47260423.09 rows=76403 width=32)
-> HashAggregate (cost=1719.07..1721.07 rows=200 width=8)
Group Key: b.transaction_id
-> CTE Scan on b (cost=0.00..1528.06 rows=76403 width=8)
-> Index Only Scan using _transactions_pkey on _transactions t (cost=0.57..2.79 rows=1 width=8)
Index Cond: (id = b.transaction_id)
SubPlan 2
-> Aggregate (cost=618.53..618.54 rows=1 width=32)
-> Index Scan using _logs_transaction_id_idx on _logs (cost=0.57..584.99 rows=6707 width=43)
Index Cond: (transaction_id = t.id)
JIT:
Functions: 17
Options: Inlining true, Optimization true, Expressions true, Deforming true
(17 rows)
Adding DISTINCT
to query took ~10 seconds (but anyway also needs ORDER BY
inside MATERIALIZED
:
EXPLAIN WITH
b AS MATERIALIZED (
SELECT DISTINCT lg.transaction_id
FROM _logs lg
WHERE lg.address in ('0xca530408c3e552b020a2300debc7bd18820fb42f', '0x68e78497a7b0db7718ccc833c164a18d8e626816')
LIMIT 5000
)
SELECT
(SELECT array_agg(JSON_BUILD_OBJECT('address',address)) FROM _logs WHERE transaction_id = t.id) logs_array
FROM _transactions t
WHERE t.id IN
(SELECT transaction_id FROM b)
;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------------
Nested Loop (cost=87079.76..3211065.34 rows=5000 width=32)
CTE b
-> Limit (cost=86916.69..86966.69 rows=5000 width=8)
-> HashAggregate (cost=86916.69..87443.81 rows=52712 width=8)
Group Key: lg.transaction_id
-> Index Scan using _logs_address_idx on _logs lg (cost=0.70..86723.67 rows=77206 width=8)
Index Cond: ((address)::text = ANY ('{0xca530408c3e552b020a2300debc7bd18820fb42f,0x68e78497a7b0db7718ccc833c164a18d8e626816}'::text[]))
-> HashAggregate (cost=112.50..114.50 rows=200 width=8)
Group Key: b.transaction_id
-> CTE Scan on b (cost=0.00..100.00 rows=5000 width=8)
-> Index Only Scan using _transactions_pkey on _transactions t (cost=0.57..2.79 rows=1 width=8)
Index Cond: (id = b.transaction_id)
SubPlan 2
-> Aggregate (cost=624.68..624.69 rows=1 width=32)
-> Index Scan using _logs_transaction_id_idx on _logs (cost=0.57..590.78 rows=6778 width=43)
Index Cond: (transaction_id = t.id)
JIT:
Functions: 19
Options: Inlining true, Optimization true, Expressions true, Deforming true
(19 rows)
You should just add DISTINCT keyword to your CTE named b and use LIMIT 2 in the CTE. You don't need LIMIT in main sql anymore:
WITH
b AS MATERIALIZED
( Select DISTINCT lg.transaction_hash
From logs lg
Where lg.address='a2'
LIMIT 2
)
Select hash, t_value,
( Select ARRAY_AGG( JSON_BUILD_OBJECT('address', address, 'l_value', l_value) )
From logs
Where transaction_hash = t.hash
) logs_array
From transactions t
Where t.hash IN( Select transaction_hash From b)
Without DISTINCT keyword CTE b results with two rows - both with 'h2' value....
See the fiddle here.
Instead of DISTINCT keyword you could use Group By to get the same result...
WITH
b AS MATERIALIZED
( Select lg.transaction_hash
From logs lg
Where lg.address='a2'
Group By lg.transaction_hash
LIMIT 2
)
... ....
See it here.
UPDATED ANSWER (after question update):
WITH
b AS MATERIALIZED
( Select lg.l_trans_block_number, lg.transaction_hash
From logs lg
Where lg.address='a2'
Group By lg.l_trans_block_number, lg.transaction_hash
Order By lg.l_trans_block_number Desc, lg.transaction_hash
LIMIT 2
)
Select hash, block_number, t_value,
( Select ARRAY_AGG( JSON_BUILD_OBJECT('address', address, 'l_value', l_value, 'l_trans_block_number', l_trans_block_number) )
From logs
Where transaction_hash = t.hash And l_trans_block_number = t.block_number
) logs_array
From transactions t
Where t.hash IN( Select transaction_hash From b)
R e s u l t :
hash | block_number | t_value | logs_array |
---|---|---|---|
h2 | 170 | v2 | {"{"address" : "a1", "l_value" : "h2.a1.1", "l_trans_block_number" : 170}","{"address" : "a2", "l_value" : "h2.a2.1", "l_trans_block_number" : 170}","{"address" : "a2", "l_value" : "h2.a2.2", "l_trans_block_number" : 170}","{"address" : "a3", "l_value" : "h2.a3.1", "l_trans_block_number" : 170}"} |
h5 | 142 | v5 | {"{"address" : "a2", "l_value" : "h5.a2.1", "l_trans_block_number" : 142}","{"address" : "a3", "l_value" : "h5.a3.1", "l_trans_block_number" : 142}"} |
See it here.