I have a list of lists in the following way -
[['X_API', 'Y_API',....], ['Z_API', 'P_API', ...], [....], [...] .... ]
Here, each API name corresponds to a PythonOperator.
In Airflow, I would like to create task dependencies such that, from a starting dummy task, there is one parallel branch for each inner list of the main list, and the operators within each inner list execute in sequence:
How can I do this? I would appreciate any help with this!
Existing code:
# Default arguments applied to every task in the DAG.
args = {
    'depends_on_past': False,
    # NOTE(review): a dynamic start_date (datetime.now()) changes on every
    # DAG-file parse and is a well-known Airflow anti-pattern -- prefer a
    # fixed past date. Left as-is to preserve the author's configuration.
    'start_date': datetime.now(),
    'email': '',
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG(dag_id, default_args=args, schedule_interval=None)

with dag:
    Start = DummyOperator(task_id='Start', dag=dag)
    End = DummyOperator(task_id='End', dag=dag)

    # Build one ordered chain of operators per entry in `dags`.
    # BUG FIX: the original used set() accumulators (execution order lost,
    # never cleared between entries) and an extra `for j in i:` loop that
    # re-created the same operators once per dict key (duplicate task_ids).
    # Plain lists built once per entry preserve the required sequence.
    tasks = []
    for entry in dags:
        if 'APIs' in entry:
            chain = [
                DummyOperator(task_id=item['api'] + '_API', dag=dag)
                for item in entry['APIs']
            ]
        elif 'tables' in entry:
            chain = [
                DummyOperator(task_id=item['table'] + '_API', dag=dag)
                for item in entry['tables']
            ]
        else:
            continue
        if chain:
            tasks.append(chain)

    # Fan out from Start, run each chain strictly in sequence, fan in at End.
    # BUG FIX: the original wired Start upstream of *every* task and End
    # downstream of every non-first task inside the loop, which destroyed
    # the parallel-branches shape; it also skipped single-task chains
    # entirely because range(0, len-1) was empty for len == 1.
    for chain in tasks:
        Start.set_downstream(chain[0])
        for upstream, downstream in zip(chain, chain[1:]):
            upstream.set_downstream(downstream)
        chain[-1].set_downstream(End)
This was the solution I came up with -
with dag:
    # Gate: if create_postgres_schema returns falsy, skip all downstream
    # tasks, overriding their individual trigger rules.
    Start = ShortCircuitOperator(
        task_id='Create_Postgres_Schema',
        python_callable=create_postgres_schema,
        ignore_downstream_trigger_rules=True,
    )
    # Final join point; runs once every upstream chain has finished
    # (including skipped/failed branches) thanks to ALL_DONE.
    End = DummyOperator(
        task_id='End_of_Data_extraction',
        trigger_rule=TriggerRule.ALL_DONE,
    )

    # First and last operator of each sequential chain, used to fan
    # out from Start and fan back in at End.
    start_task_list = list(map(itemgetter(0), tasks_master_list))
    end_tasks_list = list(map(itemgetter(-1), tasks_master_list))

    Start >> start_task_list
    for task in tasks_master_list:
        # BUG FIX: the original iterated range(len(tasks_master_list) - 1)
        # -- the number of chains -- instead of the current chain's own
        # length, causing IndexError on short chains and missing links
        # (or none at all) on long ones.
        for op in range(len(task) - 1):
            task[op] >> task[op + 1]
    end_tasks_list >> End