I'm working on a project in which I have
The Celery tasks use the database pool through the following code:
import os
from contextlib import contextmanager

from psycopg.conninfo import make_conninfo
from psycopg_pool import ConnectionPool
def _require_env(name, description):
    """Return the value of environment variable *name*.

    Args:
        name: Name of the environment variable to read.
        description: Short label used in the error message.

    Raises:
        ValueError: If the variable is unset or empty. The message is
            ``"Invalid postgres <description>"``.
    """
    value = os.getenv(name)
    if not value:
        raise ValueError(f"Invalid postgres {description}")
    return value


PG_USERNAME = _require_env('PG_USERNAME', 'username')
PG_PASSWORD = _require_env('PG_PASSWORD', 'pass')
PG_HOST = _require_env('PG_HOST', 'host')
PG_PORT = _require_env('PG_PORT', 'port')

# Options used to prevent closed connections
# conn_options = f"-c statement_timeout=1800000 -c tcp_keepalives_idle=30 -c tcp_keepalives_interval=30"

# Build the connection string with make_conninfo() so that values containing
# spaces, quotes or backslashes (typically the password) are escaped instead
# of producing a malformed connection string.
conninfo = make_conninfo(
    host=PG_HOST,
    port=PG_PORT,
    dbname='postgres',
    user=PG_USERNAME,
    password=PG_PASSWORD,
)

# NOTE(review): the pool is created when this module is imported. If the
# worker processes are forked after that import, parent and children may end
# up using the same server connections -- confirm that each worker process
# creates/opens its own pool.
connection_pool = ConnectionPool(
    min_size=4,
    max_size=100,
    conninfo=conninfo,
    # Verify a connection is still usable before handing it to a client.
    check=ConnectionPool.check_connection,
    #options=conn_options,
)
@contextmanager
def get_db_conn():
    """Lend out one connection from the module-level ``connection_pool``.

    Intended for use in a ``with`` statement. The connection is handed back
    to the pool when the block exits, whether it finished normally or raised.

    Yields:
        The connection obtained from ``connection_pool.getconn()``.
    """
    borrowed = connection_pool.getconn()
    try:
        yield borrowed
    finally:
        # Always give the connection back, even if the caller's block failed.
        connection_pool.putconn(borrowed)
And an example celery task would be
@app.task(bind=True)
def example_task(self, id):
    """Process the ``test`` row identified by *id* and record the outcome.

    Everything runs on a single connection borrowed from the pool:

    1. Read the row and mark it ``'running'`` (committed immediately).
    2. Do the processing and read the row referenced by ``resource_id``.
    3. Store the result and mark the row ``'done'``.

    Any exception is logged, the open transaction is rolled back and the row
    is marked ``'error'`` together with the message and stack trace. The
    exception is not re-raised.

    Args:
        self: The bound task instance (not used in the body).
        id: Primary key of the ``test`` row to process.
    """
    with get_db_conn() as conn:
        try:
            with conn.cursor(row_factory=dict_row) as cursor:
                test = None
                cursor.execute('SELECT * FROM test WHERE id = %s', (id,))
                try:
                    test = cursor.fetchone()
                except psycopg.errors.ProgrammingError:
                    logger.warning('Test log msg')
                    conn.rollback()
                    return

                cursor.execute("UPDATE test SET status = 'running' WHERE id = %s", (id,))
                # Commit right away so the 'running' status is visible to others.
                conn.commit()

                # Some processing...

                # Fetch another resource needed
                cursor.execute('SELECT * FROM test WHERE id = %s', (test['resource_id'],))
                cursor.fetchone()

                # Update the entry with the result
                # (`properties` comes from the processing step elided above.)
                cursor.execute("""
                    UPDATE test
                    SET status = 'done', properties = %s
                    WHERE id = %s
                """, (Jsonb(properties), id))
                conn.commit()
        except Exception as e:
            logger.exception('Error: %s', e)
            # Discard whatever the failed transaction left behind before
            # writing the error status.
            conn.rollback()
            with conn.cursor(row_factory=dict_row) as cursor:
                # Update status to error with exception information.
                # The row is addressed by the task's own `id`; the previous
                # `webpage_id` was not defined anywhere in this function.
                cursor.execute("""
                    UPDATE test
                    SET status = 'error', error = %s
                    WHERE id = %s
                """, (Jsonb({'error': str(e), 'stacktrace': traceback.format_exc()}), id))
                conn.commit()
The code works most of the time, but sometimes, when multiple tasks of the same type are launched, I get errors of type psycopg.ProgrammingError: the last operation didn't produce a result
on the second fetchone() call.
Meanwhile, on the database I can see the following warning
WARNING: there is already a transaction in progress
I suspect there might be some problem with the way I'm working with connections, but I cannot find where.
As far as I know, once get_db_conn() is called that connection is not available for other tasks, so in theory there cannot be multiple tasks using the same connection, and therefore there should be no transaction already in progress when performing the second fetchone() call.
The resource exists, as every other task can access it, so that's not the problem.
If both the main target row of test
as well as the additional one selected based on its test.resource_id
foreign key aren't shareable, lock them. Otherwise, concurrent workers are likely bumping into each other, taking on the processing of the same row and altering its fields and the fields of the one it's associated with through resource_id
, at unpredictable points between subsequent steps of this operation.
Regular explicit locks get automatically released on commit
/rollback
so to keep your conn.commit()
after updating target's status
field, you can use session-level advisory locks to let them last multiple transactions instead:
@app.task(bind=True)
def example_task(self, id):
    """Process the ``test`` row *id* while holding session-level advisory locks.

    An exclusive advisory lock on ``id`` (taken with ``pg_try_advisory_lock``)
    marks the row as being processed; a shared advisory lock on its
    ``resource_id`` protects the referenced row. Both are session-level, so
    they outlive the intermediate ``commit()`` calls and are released
    explicitly by the final ``UPDATE`` (success or error path).

    If the row cannot be claimed (the query returns no row) the task returns
    without touching it. Any other exception is logged, rolled back and
    recorded on the row as ``'error'``; it is not re-raised.

    Args:
        self: The bound task instance (not used in the body).
        id: Primary key of the ``test`` row to process.
    """
    with get_db_conn() as conn:
        try:
            with conn.cursor(row_factory=dict_row) as cursor:
                test = None
                # NOTE(review): PostgreSQL does not guarantee the evaluation
                # order of WHERE conditions; confirm the try-lock is only
                # attempted for the row matching `id = %s`.
                cursor.execute("""SELECT *, pg_advisory_lock_shared(resource_id)
                                  FROM test
                                  WHERE id = %s
                                  AND pg_try_advisory_lock(id)
                               """, (id,))
                try:
                    test = cursor.fetchone()
                    #if it returns no row, someone else is already processing this `id`
                    #if it waits, someone else was altering the row behind `resource_id`
                    #in the 2nd case, it's best to wait for them to finish
                except psycopg.errors.ProgrammingError:
                    logger.warning('Test log msg')
                    conn.rollback()
                    return
                if test is None:
                    # The try-lock failed (or the row does not exist): no row
                    # comes back and no lock is held by us. Leave the row alone
                    # instead of falling into the error handler and marking a
                    # row that another worker owns as 'error'.
                    conn.rollback()
                    return

                cursor.execute("""UPDATE test
                                  SET status = 'running'
                                  WHERE id = %s
                               """, (id,))
                conn.commit()

                # Some processing...

                # Fetch another resource needed
                cursor.execute("""SELECT *
                                  FROM test
                                  WHERE id = %s
                                  /*AND probably more conditions here*/
                               """, (test['resource_id'],))
                cursor.fetchone()

                # Update the entry with the result and release both locks.
                # The lock on resource_id was taken in shared mode, so it must
                # be released with the *_shared variant.
                # (`properties` comes from the processing step elided above.)
                cursor.execute("""UPDATE test
                                  SET status = 'done'
                                    , properties = %s
                                  WHERE id = %s
                                  RETURNING pg_advisory_unlock(id)
                                          , pg_advisory_unlock_shared(resource_id)
                               """, (Jsonb(properties), id))
                conn.commit()
        except Exception as e:
            logger.exception('Error: %s', e)
            # Rolling back does not release session-level advisory locks, so
            # they are released explicitly by the UPDATE below.
            conn.rollback()
            with conn.cursor(row_factory=dict_row) as cursor:
                # Update status to error with exception information.
                # The row is addressed by the task's own `id`; the previous
                # `webpage_id` was not defined anywhere in this function.
                cursor.execute("""UPDATE test
                                  SET status = 'error', error = %s
                                  WHERE id = %s
                                  RETURNING pg_advisory_unlock(id)
                                          , pg_advisory_unlock_shared(resource_id)
                               """, (Jsonb({'error': str(e), 'stacktrace': traceback.format_exc()}), id))
                conn.commit()
The problem might also be in the part of the code that you did not share, where you pick and assign the id
you pass to example_task(self, id)
from outside. If that's more or less how workers find their next task:
-- Return the id of the first 'ready' row, ordered by priority and then by
-- created_at. No lock is taken on the row.
select id
from test
where status='ready'
order by priority
, created_at
limit 1;
Then there's nothing stopping two workers from picking the same one if the second one grabs it before the first one has the chance to conn.commit()
its status
change.
You could acquire the lock right there and make all following calls skip to the nearest row that's still free:
-- Same pick as before, but the chosen row is locked FOR UPDATE and rows that
-- are already locked by other transactions are skipped rather than waited on.
select id
from test
where status='ready'
order by priority
, created_at
for update skip locked--this
limit 1;
But to hold on to a lock like that you'd have to only conn.commit()
once you're done with the whole operation, without running commits between its sub-steps - otherwise you'd lose the lock along the way.
To guard the rest of the operation beyond the nearest .commit()
, use that lock to secure the query against immediate collisions but also add an advisory lock that survives multiple transactions.
Advisory locks don't offer a skip locked
but it can be emulated with a recursive CTE (walks the id
's and stops at the first one that doesn't return false
on locking attempt). Or, you can just look up which id
's are already advisory-locked according to pg_locks.objid
and exclude those
-- Pick one free 'ready' row, then try to take the advisory lock on it.
-- The pick happens in a subquery so that pg_try_advisory_lock() is called
-- only for the single row that survives the LIMIT: with the function in the
-- same SELECT list as the LIMIT, PostgreSQL does not guarantee that the
-- LIMIT is applied before the function is executed.
-- Rows whose id shows up as pg_locks.objid of an advisory lock are excluded,
-- and the row lock (FOR UPDATE SKIP LOCKED) guards against immediate
-- collisions between concurrent pickers.
select picked.id, pg_try_advisory_lock(picked.id)
from (select id
      from test
      where status='ready'
        and id not in(select objid
                      from pg_locks
                      where locktype='advisory')
      order by priority
             , created_at
      for update skip locked
      limit 1) as picked;
You could also get rid of that entirely and look up free id
's straight from the worker:
@app.task(bind=True)
def example_task(self, id):
    """Claim the next ``'ready'`` row of ``test`` and process it.

    A single statement picks a free row (``FOR UPDATE SKIP LOCKED``), marks
    it ``'running'``, takes a shared session-level advisory lock on the row
    referenced by its ``resource_id`` and returns both rows. The shared lock
    outlives the intermediate ``commit()`` and is released by the final
    ``UPDATE`` (success or error path).

    If no row could be claimed the task returns without changing anything.
    Any other exception is logged, rolled back and, when a row had been
    claimed, recorded on that row as ``'error'``; it is not re-raised.

    Args:
        self: The bound task instance (not used in the body).
        id: Kept for interface compatibility. This variant selects the row to
            process itself, so the argument is not used.
    """
    with get_db_conn() as conn:
        # Id of the row this worker claimed; stays None until the claim succeeds.
        claimed_id = None
        try:
            with conn.cursor(row_factory=dict_row) as cursor:
                test = None
                # The statement has no placeholders, so no parameters are
                # passed (psycopg rejects a parameter count mismatch).
                # `claimed_id` is selected under its own alias because
                # `target.*` and `resource.*` share column names, which
                # collide as keys of the dict row.
                cursor.execute("""WITH find_free_id_and_lock_it AS
                                    (UPDATE test
                                     SET status='running'
                                     WHERE id=(SELECT id
                                               FROM test
                                               WHERE status='ready'
                                               ORDER BY priority
                                                      , created_at
                                               FOR UPDATE SKIP LOCKED
                                               LIMIT 1)
                                     RETURNING *)
                                  ,lock_resource AS
                                    (SELECT *, pg_advisory_lock_shared(id)
                                     FROM test
                                     WHERE id=(SELECT resource_id
                                               FROM find_free_id_and_lock_it)
                                     FOR SHARE/*waits if necessary*/)
                                  SELECT target.*
                                       , resource.*--replace with alias list
                                       , target.id AS claimed_id
                                  FROM find_free_id_and_lock_it AS target
                                  JOIN lock_resource AS resource
                                    ON target.resource_id=resource.id;
                               """)
                try:
                    test = cursor.fetchone()
                except psycopg.errors.ProgrammingError:
                    logger.warning('Test log msg')
                    conn.rollback()
                    return
                if test is None:
                    # Nothing was claimed (no 'ready' row, or no matching
                    # resource row to join): undo the statement and stop.
                    conn.rollback()
                    return
                claimed_id = test['claimed_id']
                conn.commit()

                # Some processing...

                # Store the result and release the shared lock on the
                # resource. It was taken with pg_advisory_lock_shared(), so it
                # must be released with the *_shared variant.
                # (`properties` comes from the processing step elided above.)
                cursor.execute("""UPDATE test
                                  SET status = 'done'
                                    , properties = %s
                                  WHERE id = %s
                                  RETURNING pg_advisory_unlock_shared(resource_id)
                               """, (Jsonb(properties), claimed_id))
                conn.commit()
        except Exception as e:
            logger.exception('Error: %s', e)
            conn.rollback()
            if claimed_id is None:
                # The failure happened before any row was claimed, so there is
                # no row to flag.
                return
            with conn.cursor(row_factory=dict_row) as cursor:
                # Update status to error with exception information and
                # release the shared lock on the resource.
                cursor.execute("""UPDATE test
                                  SET status = 'error', error = %s
                                  WHERE id = %s
                                  RETURNING pg_advisory_unlock_shared(resource_id)
                               """, (Jsonb({'error': str(e), 'stacktrace': traceback.format_exc()}), claimed_id))
                conn.commit()
Both target and resource lookups, adequate locks as well as the status update are all applied within a single query and transaction. Depending on what you do in # Some processing...
and how long that takes, it might be preferable to acquire the shared lock on resource
later, just in time, like it was done originally.