I am trying to fetch data for one asset for reporting purposes which is dependent on multiple processes and start time should be the earliest job that starts among them and end time should be latest completion time for one of those assets. I prepared one query which is running fine if asset has only one process but need help for multiple processes. Below are the details:
Table Name: ABC.ASSET_CONFIG
ASSET_ID,APPLICATION,ASSET_NAME,MODULE_NAME,ASSET_DESCRIPTION,ASSET_TYPE
9999,Marketing,Activation Data Mart,ADM,NA,Business Necessary
Table Name: ABC.ASSET_DEPENDENCY_CONFIG
ASSET_ID,COMPLETING_PROCESS_ID,DEPENDENCY_LEVEL,DATA_DELAY,PRCS_CNTRL_STORE_TYPE
9999,206,1,-1,ABC.BATCH_CONTROL
9999,208,1,-1,ABC.BATCH_CONTROL
9999,508,1,-1,ABC.BATCH_CONTROL
9999,1024,1,-1,ABC.BATCH_CONTROL
One asset may be related to multiple processes(COMPLETING_PROCESS_ID) which is completing at different times, all needs to be completed(Status as 'C' in BATCH_CONTROL table) to be able to get the full dataset of that asset. We have table(ABC.BATCH_CONTROL) where completion status of each processes are tracked. Status: 'C' denotes - Process Completed.
Table Name: ABC.BATCH_CONTROL
DATA_SOURCE_ID,BATCH_RUN_ID,LOAD_START_TS,LOAD_END_TS,BATCH_START_TS,BATCH_END_TS,BATCH_STATUS_CD
206,784,'2023-05-28 00:00:00','2023-05-29 00:00:00','2023-05-29 13:22:00','2023-05-29 14:36:00','C'
206,783,'2023-05-27 00:00:00','2023-05-28 00:00:00','2023-05-28 13:23:00','2023-05-28 14:38:00','C'
206,782,'2023-05-26 00:00:00','2023-05-27 00:00:00','2023-05-27 13:22:00','2023-05-27 15:21:00','C'
206,781,'2023-05-25 00:00:00','2023-05-26 00:00:00','2023-05-26 13:25:00','2023-05-26 15:04:00','C'
1024,2224,'2023-05-28 00:00:00','2023-05-29 00:00:00','2023-05-29 17:50:00','2023-05-29 18:23:00','C'
1024,2223,'2023-05-27 00:00:00','2023-05-28 00:00:00','2023-05-28 17:50:00','2023-05-28 18:21:00','C'
1024,2222,'2023-05-26 00:00:00','2023-05-27 00:00:00','2023-05-27 17:46:00','2023-05-27 18:22:00','C'
1024,2221,'2023-05-25 00:00:00','2023-05-26 00:00:00','2023-05-26 17:48:00','2023-05-26 18:18:00','C'
I tried below code which is working fine for a asset which has only one process, if an asset data readiness depends on multiple multiple processes it won't work:
Output i want is:
9999,Marketing,Business Necessary,Activation Data Mart,ADM,1024,2023-05-28,2023-05-29 13:22:00,2023-05-29 17:50:00, 04:12:23,0 04:12:23.000000,0 00:04:12.0000,203.0000,Completed
Query:
SELECT
QRY3.ASSET_ID,
QRY3.APPLICATION,
QRY3.ASSET_TYP,
QRY3.ASSET_NAME,
QRY3.MODULE_NAME,
QRY3.COMPLETING_PROCESS_ID,
QRY3.DATA_DATE,
QRY3.START_TIME,
QRY3.END_TIME,
SUBSTRING((TRIM((CAST(((CAST(QRY3.END_TIME AS TIMESTAMP(0)) - CAST(QRY3.START_TIME AS TIMESTAMP(0))) DAY(4) TO SECOND)AS VARCHAR(50))))),2,9) AS DURATION,
(TRIM((CAST(((CAST(QRY3.END_TIME AS TIMESTAMP(0)) - CAST(QRY3.START_TIME AS TIMESTAMP(0))) DAY(4) TO SECOND)AS VARCHAR(50))))) AS DURATION_TIMESTAMP,
--((CAST(QRY3.END_TIME AS TIMESTAMP(0)) - CAST(QRY3.START_TIME AS TIMESTAMP(0))) Second(10,6)) AS DURATION_TIMESTAMP_SECOND,
((CAST(QRY3.END_TIME AS TIMESTAMP(0)) - QRY3.START_TIME) DAY(4) to SECOND(4)) AS t1,
(EXTRACT(DAY from t1)*(24*60*60) +
EXTRACT(HOUR from t1)*(60*60) +
EXTRACT(MINUTE from t1)*60 +
EXTRACT(SECOND from t1)
) AS DURATION_SECONDS,
CASE WHEN ETL1.BATCH_STATUS_CD = 'C' THEN 'Completed'
WHEN ETL1.BATCH_STATUS_CD = 'R' THEN 'Running'
WHEN ETL1.BATCH_STATUS_CD = 'N' THEN 'Not Started'
WHEN ETL1.BATCH_STATUS_CD = 'F' THEN 'Failed'
END AS STATUS
FROM
(
SELECT
QRY2.ASSET_ID,
QRY2.APPLICATION,
QRY2.ASSET_TYP,
QRY2.ASSET_NAME,
QRY2.MODULE_NAME,
QRY2.COMPLETING_PROCESS_ID,
QRY2.DATA_DATE
,MIN(QRY2.START_TIME) OVER(PARTITION BY QRY2.APPLICATION,QRY2.ASSET_TYP,QRY2.ASSET_NAME,QRY2.MODULE_NAME,QRY2.COMPLETING_PROCESS_ID,QRY2.DATA_DATE)AS START_TIME
,MAX(QRY2.END_TIME) OVER(PARTITION BY QRY2.APPLICATION,QRY2.ASSET_TYP,QRY2.ASSET_NAME,QRY2.MODULE_NAME,QRY2.COMPLETING_PROCESS_ID,QRY2.DATA_DATE)AS END_TIME
,MAX(QRY2.BATCH_RUN_ID) OVER(PARTITION BY QRY2.APPLICATION,QRY2.ASSET_TYP,QRY2.ASSET_NAME,QRY2.MODULE_NAME,QRY2.COMPLETING_PROCESS_ID,QRY2.DATA_DATE) AS BATCH_RUN_ID
FROM
(
SELECT
QRY1.ASSET_ID,
QRY1.APPLICATION,
QRY1.ASSET_TYP,
QRY1.MODULE_NAME,
QRY1.ASSET_NAME,
QRY1.COMPLETING_PROCESS_ID,
QRY1.DATA_DATE AS DATA_DATE,
ETL.BATCH_STATUS_CD,
CASE WHEN
QRY1.COMPL_SESSION_NAME IS NOT NULL
THEN SESS.SESSION_INSTANCE_START_TS
ELSE
ETL.BATCH_START_TS
END AS START_TIME,
CASE WHEN
QRY1.COMPL_SESSION_NAME IS NOT NULL
THEN SESS.SESSION_INSTANCE_END_TS
ELSE
ETL.BATCH_END_TS
END AS END_TIME,
ETL.BATCH_RUN_ID
FROM
(
select MASTER.ASSET_ID,
MASTER.APPLICATION,
MASTER.ASSET_TYP,
MASTER.MODULE_NAME,
MASTER.ASSET_NAME,
DEPEND.COMPLETING_PROCESS_ID,
CAL.CALENDAR_DAY_DT AS RUN_DATE,
cast(CAL.CALENDAR_DAY_DT as format 'YYYY-MM-DD')+ cast(DEPEND.DATA_DELAY as interval DAY) AS DATA_DATE,
DEPEND.COMPL_SESSION_NAME
from
ABC.ASSET_CONFIG MASTER
INNER JOIN
ABC.ASSET_DEPENDENCY_CONFIG DEPEND
ON
MASTER.ASSET_ID = DEPEND.ASSET_ID
inner join
ndw_base_views.fiscal_calendar cal
on
1=1
and
CALENDAR_DAY_DT BETWEEN '2023-05-27' AND '2023-05-27'
where
DEPEND.PRCS_CNTRL_STORE_TYP = 'NDW_PRCS_CNTRL_VIEWS.NDW_ETL_BATCH_CONTROL'
GROUP BY 1,2,3,4,5,6,7,8,9
)QRY1
LEFT JOIN
ABC.BATCH_CONTROL ETL
ON
ETL.DATA_SOURCE_ID= QRY1.COMPLETING_PROCESS_ID
AND ETL.INCREMENTAL_LOAD_START_TS = QRY1.DATA_DATE
--LEFT JOIN
--ABC.NDW_ETL_WORKFLOW_CONTROL WF
--ON WF.DATA_SOURCE_ID = ETL.DATA_SOURCE_ID
--AND ETL.BATCH_RUN_ID = WF.BATCH_RUN_ID
--LEFT JOIN
--ABC.NDW_ETL_SESSION_CONTROL SESS
--ON WF.WORKFLOW_RUN_ID = SESS.WORKFLOW_RUN_ID
--AND SESS.SESSION_INSTANCE_NM = QRY1.COMPL_SESSION_NAME
--WHERE QRY1.ASSET_ID = 10080
) QRY2
) QRY3
INNER JOIN
ABC.BATCH_CONTROL ETL1
ON
ETL1.DATA_SOURCE_ID = QRY3.COMPLETING_PROCESS_ID
AND ETL1.BATCH_RUN_ID = QRY3.BATCH_RUN_ID
GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14;
Read your Query code - You need to do inner join
with the ABC.ASSET_DEPENDENCY_CONFIG table on the asset ID to consider the dependency relationship between assets and processes.You need to use the MIN and MAX
aggregation functions to determine the earliest start time (MIN(B.BATCH_START_TS)) and the latest completion time (MAX(B.BATCH_END_TS)) among all the processes related to an asset. That takes into account multiple processes for an asset. It identifies the earliest start time and the latest completion time among all the processes for a given asset.
So your final code will be like this
PS the original answer-er gave a sudo query but another person took some extra time to rewrite the original query with my suggestions which is below
SELECT
MASTER.APPLICATION,
MASTER.ASSET_NAME,
MASTER.MODULE_NAME,
MASTER.ASSET_TYP,
CAST(CAL.CALENDAR_DAY_DT as format 'YYYY-MM-DD') + CAST(DEPEND.DATA_DELAY as interval DAY) AS DATA_DATE,
MIN(ETL.BATCH_START_TS) AS START_TIME,
MAX(ETL.BATCH_END_TS) AS FINAL_REFRESH_TIME,
CASE
WHEN DEPEND.DATA_DELAY = -1 AND FINAL_REFRESH_TIME < cast(current_date as timestamp(0)) + INTERVAL '9' HOUR THEN 'Yes'
WHEN DEPEND.DATA_DELAY = -2 AND FINAL_REFRESH_TIME < cast(current_date + INTERVAL '1' DAY as timestamp(0)) + INTERVAL '9' HOUR THEN 'Yes'
ELSE 'No' END AS REFRESHED_BEFORE_9_AM,
CASE
WHEN DEPEND.DATA_DELAY = -1 AND FINAL_REFRESH_TIME < cast(current_date as timestamp(0)) + INTERVAL '12' HOUR THEN 'Yes'
WHEN DEPEND.DATA_DELAY = -2 AND FINAL_REFRESH_TIME < cast(current_date + INTERVAL '1' DAY as timestamp(0)) + INTERVAL '12' HOUR THEN 'Yes'
ELSE 'No' END AS REFRESHED_BEFORE_12_PM,
CASE
WHEN FINAL_REFRESH_TIME < cast(current_date + INTERVAL '1' DAY as timestamp(0)) + INTERVAL '9' HOUR THEN 'No'
ELSE 'Yes' END AS SOURCE_DATA_ISSUE,
CASE
WHEN FINAL_REFRESH_TIME < cast(current_date + INTERVAL '1' DAY as timestamp(0)) + INTERVAL '9' HOUR THEN 'No'
ELSE 'Yes' END AS ETL_OPS_PROCESS_ISSUE,
CASE
WHEN FINAL_REFRESH_TIME < cast(current_date + INTERVAL '1' DAY as timestamp(0)) + INTERVAL '9' HOUR THEN 'No'
ELSE 'Yes' END AS PLATFORM_ISSUE,
CASE
WHEN SOURCE_DATA_ISSUE = 'No' AND ETL_OPS_PROCESS_ISSUE = 'No' AND PLATFORM_ISSUE = 'No' THEN 'NA'
WHEN SOURCE_DATA_ISSUE = 'Yes' OR ETL_OPS_PROCESS_ISSUE = 'Yes' OR PLATFORM_ISSUE = 'Yes'
AND DEPEND.DATA_DELAY = -1 AND FINAL_REFRESH_TIME < cast(current_date as timestamp(0)) + INTERVAL '9' HOUR THEN 'Yes'
WHEN SOURCE_DATA_ISSUE = 'Yes' OR ETL_OPS_PROCESS_ISSUE = 'Yes' OR PLATFORM_ISSUE = 'Yes'
AND DEPEND.DATA_DELAY = -2 AND FINAL_REFRESH_TIME < cast(current_date + INTERVAL '1' DAY as timestamp(0)) + INTERVAL '12' HOUR THEN 'Yes'
ELSE 'No' END AS ISSUE_RESOLVED_SAME_DAY,
SUBSTRING((TRIM((CAST(((CAST(MAX(ETL.BATCH_END_TS) AS TIMESTAMP(0)) - CAST(MIN(ETL.BATCH_START_TS) AS TIMESTAMP(0))) DAY(4) TO SECOND) AS VARCHAR(50))))), 2, 9) AS DURATION,
/* (TRIM((CAST(((CAST(MAX(ETL.BATCH_END_TS) AS TIMESTAMP(0)) - CAST(MIN(ETL.BATCH_START_TS) AS TIMESTAMP(0))) DAY(4) TO SECOND) AS VARCHAR(50))))) AS DURATION_TIMESTAMP,
((CAST(MAX(ETL.BATCH_END_TS) AS TIMESTAMP(0)) - MIN(ETL.BATCH_START_TS)) DAY(4) TO SECOND(4)) AS t1,
(EXTRACT(DAY FROM t1) * (24 * 60 * 60) +
EXTRACT(HOUR FROM t1) * (60 * 60) +
EXTRACT(MINUTE FROM t1) * 60 +
EXTRACT(SECOND FROM t1)) AS DURATION_SECONDS, */
CASE ETL.BATCH_STATUS_CD
WHEN 'C' THEN 'Completed'
WHEN 'R' THEN 'Running'
WHEN 'N' THEN 'Not Started'
WHEN 'F' THEN 'Failed'
END AS STATUS
FROM
ABC.ASSET_CONFIG MASTER
INNER JOIN ABC.ASSET_DEPENDENCY_CONFIG DEPEND ON MASTER.ASSET_ID = DEPEND.ASSET_ID
INNER JOIN ABC.ETL_BATCH_CONTROL ETL ON DEPEND.COMPLETING_PROCESS_ID = ETL.DATA_SOURCE_ID
INNER JOIN ABC.FISCAL_CALENDAR CAL ON 1=1
LEFT JOIN ABC.ETL_WORKFLOW_CONTROL WF ON WF.DATA_SOURCE_ID = ETL.DATA_SOURCE_ID AND WF.BATCH_RUN_ID = ETL.BATCH_RUN_ID
LEFT JOIN ABC.NDW_ETL_SESSION_CONTROL SESS ON WF.WORKFLOW_RUN_ID = SESS.WORKFLOW_RUN_ID AND SESS.SESSION_INSTANCE_NM = DEPEND.COMPL_SESSION_NAME
WHERE CAL.CALENDAR_DAY_DT BETWEEN '?ldt1' AND '?ldt2'
AND DEPEND.PRCS_CNTRL_STORE_TYP = 'NDW_PRCS_CNTRL_VIEWS.NDW_ETL_BATCH_CONTROL'
AND CAST(DATA_DATE AS DATE) = CAST(ETL.INCREMENTAL_LOAD_START_TS AS DATE)
GROUP BY
--1,2,3,4,5,8,9,10,11,12,13
MASTER.APPLICATION,
MASTER.ASSET_NAME,
MASTER.MODULE_NAME,
MASTER.ASSET_TYP,
DATA_DATE,
DATA_DELAY,
STATUS