I have my master table of sources:
CREATE TABLE masterSources (
pk int NOT NULL AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(500)
);
INSERT INTO masterSources (name) VALUES
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.step_count.delta:com.google.android.gms:estimated_steps'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:watch:f8df280c:session_activity_segment'),
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.calories.expended:com.google.android.gms:from_activities');
Then, I receive thousands and thousands of sources that I can bulk into this table:
CREATE TABLE receivedSources (
name VARCHAR(500)
);
INSERT INTO receivedSources (name) VALUES
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'), #repeated
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:gew8df280c:top_level'), #new
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:ged8df280c:low_level'), #new
('derived:com.google.step_count.delta:com.google.android.gms:estimated_steps'),
('derived:com.google.step_count.delta:com.google.android.gms:estimated_steps'), #repeated
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:watch:f8df280c:session_activity_segment'),
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:iphone:1213084:top_level'), #repeated
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.calories.expended:com.google.android.gms:fromx2_activities'), #new
('derived:com.google.calories.expended:com.google.android.gms:from_activities');
What I want is to add the "new" sources into my masterSources
table and finally generate a new table associating each source from receivedSources
with its corresponding pk.
I can do it with a procedure:
CREATE PROCEDURE my_procedure()
BEGIN
DECLARE newName VARCHAR(500);
DECLARE done BOOLEAN DEFAULT FALSE;
# receivedSources MINUS masterSources
DECLARE cur CURSOR FOR SELECT n.name
FROM receivedSources n LEFT JOIN masterSources m ON (n.name=m.name)
WHERE m.pk IS NULL;
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET done = TRUE;
OPEN cur;
loop1: LOOP
FETCH cur INTO newName;
IF done THEN
LEAVE loop1;
END IF;
# Inserting new Sources in master
INSERT INTO masterSources (name) VALUES (newName);
END LOOP loop1;
CLOSE cur;
# Getting my desired table
CREATE TABLE newReceivedSources AS
SELECT m.pk,m.name FROM receivedSources n INNER JOIN masterSources m ON (n.name=m.name);
END
call my_procedure()
select * from newReceivedSources
My question: Is there a faster/smarter way to do it, considering how expensive is the join by varchar
fields?
First add a unique key on masterSources
ALTER TABLE masterSources ADD UNIQUE KEY (name)
(Optional) to increase performance, also add a (non-unique) key on receivedSources
ALTER TABLE receivedSources ADD KEY (name)
Then insert any new unique values into masterSources
INSERT IGNORE INTO masterSources (name) SELECT name FROM receivedSources
Then retrieve your desired dataset
SELECT masterSources.pk, masterSources.name
FROM receivedSources
INNER JOIN masterSources ON receivedSources.name = masterSources.name