Using snowflake for this:
I have a query that produces a very simple table union from 5 difference data sources:
WITH personal_info_workday AS (
SELECT
'Workday' AS source,
CAST(w.worker_code AS string) AS employee_id,
w.business_site_summary_name AS office_location
FROM
prod_raw.workday_prod.worker w
),
personal_info_bamboo AS (
SELECT
'Bamboo' AS source,
NULL AS employee_id,
NULL AS office_location
FROM
prod_raw.bamboohr.employee e
),
personal_info_adp AS (
SELECT
'ADP' AS source,
CAST(w.id AS string) AS employee_id,
NULL AS office_location
FROM
prod_raw.adp_workforce_now.worker w
),
personal_info_salesforce AS (
SELECT
'Salesforce' AS source,
CAST(c.workday_id__c AS string) AS employee_id,
c.office_location__c AS office_location
FROM
prod_raw.salesforce_corp_hub.contact c
WHERE
c.full_time_part_time__c = 'Contractor'
),
personal_info_mihi AS (
SELECT
'Mihi' AS source,
CAST(e.employee_number AS string) AS employee_id,
NULL AS office_location
FROM
prod_raw.mihi.employees e
)
SELECT * FROM personal_info_workday
UNION ALL
SELECT *
FROM personal_info_salesforce s
WHERE NOT EXISTS (
SELECT 1
FROM personal_info_workday w
WHERE s.employee_id = w.employee_id
)
UNION ALL
SELECT * FROM personal_info_bamboo
UNION ALL
SELECT * FROM personal_info_adp
UNION ALL
SELECT * FROM personal_info_mihi
ORDER BY employee_id asc
It produces this table:
SOURCE | EMPLOYEE_ID | Office Location |
---|---|---|
ADP | 57 | 1 |
Salesforce | 57 | 2 |
Workday | 57 | 2 |
Bamboo | 58 | 3 |
Salesforce | 59 | 4 |
Mihi | 1010013461 | 5 |
Anytime there's an instance of an employee number matching in the workday
and salesforce
[source] at the same time (ID 57), I want only the workday
row returned, filter out the salesforce
duplicate. Any other instance of the employee number showing (like with ADP also having id 57), just leave it be.
This is how I want the table to look:
SOURCE | EMPLOYEE_ID | Office Location |
---|---|---|
ADP | 57 | 1 |
Workday | 57 | 2 |
Bamboo | 58 | 3 |
Salesforce | 59 | 4 |
Mihi | 1010013461 | 5 |
I tried utilizing this, but it wasn't consolidating correctly:
SELECT *
FROM personal_info_salesforce s
WHERE NOT EXISTS (
SELECT 1
FROM personal_info_workday w
WHERE s.employee_id = w.employee_id
)
Here's a query to show the union data for something like DB fiddle:
CREATE TABLE employee_office_location (
source VARCHAR(50),
employee_id VARCHAR(50),
office_location INT,
PRIMARY KEY (source, employee_id) -- Ensures each (source, employee_id) pair is unique
);
INSERT INTO employee_office_location (source, employee_id, office_location) VALUES
('ADP', '1', 1),
('Salesforce', '57', 2),
('Workday', '57', 2),
('Bamboo', '58', 3),
('Salesforce', '59', 4),
('Mihi', '1010013461', 5);
Any ideas?
with main as (
SELECT 'ADP' AS source, '57' AS employee_id, 1 AS office_location
UNION ALL
SELECT 'Salesforce', '57', 2
UNION ALL
SELECT 'Workday', '57', 2
UNION ALL
SELECT 'Bamboo', '58', 3
UNION ALL
SELECT 'Salesforce', '59', 4
UNION ALL
SELECT 'Mihi', '1010013461', 5
),
test as (
select *, list_agg(source) over(partition by employee_id) as list_of_sources
from main
),
logic as (
select *,
if(list_of_sources like '%Salesforce%' and list_of_sources like '%Workday%', true,false) as is_list_salesforce_and_workday,
if(list_of_sources like '%Salesforce%' and list_of_sources like '%Workday%' and source = 'Salesforce', true,false) as is_list_salesforce_and_workday_and_source_is_salesforce
from test
)
select source, employee_id, office_location from logic where not is_list_salesforce_and_workday OR not is_list_salesforce_and_workday_and_source_is_salesforce
order by employee_id