mysqlvarchar

Inserting into Mysql master table of very large varchar fields


I have my master table of sources:

CREATE TABLE masterSources (
pk int NOT NULL AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(500)
);

INSERT INTO masterSources (name) VALUES 
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.step_count.delta:com.google.android.gms:estimated_steps'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:watch:f8df280c:session_activity_segment'),
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.calories.expended:com.google.android.gms:from_activities');

Then, I receive thousands and thousands of sources that I can bulk into this table:

CREATE TABLE receivedSources (
name VARCHAR(500)
);

INSERT INTO receivedSources (name) VALUES 
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'), #repeated
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:gew8df280c:top_level'), #new
('derived:com.google.active_minutes:com.google.ios.fit:appleinc.:watch:ged8df280c:low_level'), #new
('derived:com.google.step_count.delta:com.google.android.gms:estimated_steps'),
('derived:com.google.step_count.delta:com.google.android.gms:estimated_steps'), #repeated
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.activity.segment:com.google.ios.fit:appleinc.:watch:f8df280c:session_activity_segment'),
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:iphone:1213084:top_level'),
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:iphone:1213084:top_level'), #repeated
('derived:com.google.calories.expended:com.google.ios.fit:appleinc.:watch:f8df280c:top_level'),
('derived:com.google.calories.expended:com.google.android.gms:fromx2_activities'), #new
('derived:com.google.calories.expended:com.google.android.gms:from_activities');

What I want is to add the "new" sources into my masterSources table and finally generate a new table associating each source from receivedSources with its corresponding pk.

I can do it with a procedure:

CREATE PROCEDURE my_procedure()
BEGIN
    DECLARE newName VARCHAR(500);
    DECLARE done BOOLEAN DEFAULT FALSE;                     


    # receivedSources MINUS masterSources 
    DECLARE cur CURSOR FOR  SELECT n.name 
                            FROM receivedSources n LEFT JOIN masterSources m ON (n.name=m.name)
                            WHERE m.pk IS NULL;
    DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET done = TRUE;
    
    OPEN cur;

    loop1: LOOP
    FETCH cur INTO newName;
   
        IF done THEN 
            LEAVE loop1;            
        END IF; 
    
        # Inserting new Sources in master
        INSERT INTO masterSources (name) VALUES (newName);     

    END LOOP loop1;
    CLOSE cur;
   
    # Getting my desired table
    CREATE TABLE newReceivedSources AS
    SELECT m.pk,m.name FROM receivedSources n INNER JOIN masterSources m ON (n.name=m.name);

END

call my_procedure()

select * from newReceivedSources

enter image description here

My question: Is there a faster/smarter way to do it, considering how expensive is the join by varchar fields?


Solution

  • First add a unique key on masterSources

    ALTER TABLE masterSources ADD UNIQUE KEY (name)
    

    (Optional) to increase performance, also add a (non-unique) key on receivedSources

    ALTER TABLE receivedSources ADD KEY (name)
    

    Then insert any new unique values into masterSources

    INSERT IGNORE INTO masterSources (name) SELECT name FROM receivedSources
    

    Then retrieve your desired dataset

    SELECT masterSources.pk, masterSources.name
    FROM receivedSources
    INNER JOIN masterSources ON receivedSources.name = masterSources.name