mysql, multivalue, u2, unidata, multivalue-database

MySQL procedure to load data from staging table to other tables. Need to split up multivalue field in the process


I'm trying to export data from a multivalue database (Unidata) into MySQL. Let's say my source data is a person's ID number, their first name and all the states they've lived in. The states field is a multivalue field, and I'm exporting it so that the different values within that field are separated by a ~. A sample extract looks like:

"1234","Sally","NY~NJ~CT"
"1235","Dave","ME~MA~FL"
"3245","Fred","UT~CA"
"2344","Sue","OR"

I've loaded this data into a staging table

Table:staging
Column 1: personId
Column 2: name
Column 3: states

What I want to do is split this data out into two tables using a procedure: a persons table and a states table. A person can have many entries in the states table:

Table 1: persons
Column 1: id
Column 2: name

Table 2: states
Column 1: personId
Column 2: state

My procedure takes the data from the staging table and dumps it over to table 1 just fine. However, I'm a little lost on how to split the data up and send it to table 2. Sally would need to have three entries in the states table (NY, NJ, CT), Dave would have 3, Fred would have 2 and Sue would have 1 (OR). Any ideas on how to accomplish this?


Solution

  • try something like this : http://pastie.org/1213943

    -- TABLES
    --
    -- staging       : raw import, one row per person, states still packed as 'NY~NJ~CT'
    -- persons       : one row per person
    -- states        : one row per distinct state code, with a surrogate integer key
    -- person_states : junction table, one row per (state, person) pair
    
    -- person_states references persons and states, so it must be dropped first;
    -- otherwise the parent drops fail on a re-run of this script.
    drop table if exists person_states;
    drop table if exists staging;
    drop table if exists persons;
    drop table if exists states;
    
    create table staging
    (
    person_id int unsigned not null primary key,
    name varchar(255) not null,
    states_csv varchar(1024) -- '~' separated state codes, nullable: a person may have none
    )
    engine=innodb;
    
    create table persons
    (
    person_id int unsigned not null primary key,
    name varchar(255) not null
    )
    engine=innodb;
    
    create table states
    (
    state_id tinyint unsigned not null auto_increment primary key, -- i want a nice new integer based PK (tinyint unsigned caps this at 255 states)
    state_code varchar(3) not null unique, -- original state code from staging
    name varchar(255) null
    )
    engine=innodb;
    
    /*
    you might want to make the person_states primary key (person_id, state_id) depending on 
    your queries as this is currently optimised for queries like - select all the people from NY
    */
    
    create table person_states
    (
    state_id tinyint unsigned not null,
    person_id int unsigned not null,
    primary key(state_id, person_id),
    key (person_id),
    -- referential integrity: a mapping row may only point at existing parents.
    -- restrict is spelled out so that deleting a parent never silently removes mappings.
    constraint person_states_state_id_fk foreign key (state_id)
      references states (state_id) on delete restrict on update restrict,
    constraint person_states_person_id_fk foreign key (person_id)
      references persons (person_id) on delete restrict on update restrict
    )
    engine=innodb;
    
    
    -- STORED PROCEDURES
    
    drop procedure if exists load_staging_data;
    
    delimiter #
    
    -- load_staging_data()
    -- Empties the staging table and reloads it with the sample extract.
    -- Takes no parameters and returns no result set.
    create procedure load_staging_data()
    proc_main:begin
    
    -- truncate causes an implicit commit in MySQL, so it stays outside the transaction
    truncate table staging;
    
    -- assume this is done by load data infile...
    
    -- start transaction suspends autocommit for this unit of work only;
    -- unlike "set autocommit = 0" it does not leave the caller's session changed
    start transaction;
    
    -- explicit column list so the load does not depend on the column order of staging
    insert into staging (person_id, name, states_csv) values
    (1234,'Sally','NY~NJ~CT'),
    (1235,'Dave','ME~MA~FL'),
    (3245,'Fred','UT~CA'),
    (2344,'Sue','OR'),
    (5555,'f00','OR~NY');
    
    commit;
    
    end proc_main #
    
    delimiter ;
    
    
    drop procedure if exists cleanse_map_staging_data;
    
    delimiter #
    
    -- cleanse_map_staging_data()
    -- Copies every staging row into persons, then splits each '~' separated
    -- states_csv value and, per state code:
    --   * adds the code to states if it is not there yet
    --   * adds the (state_id, person_id) pair to person_states
    -- All inserts use "insert ignore", so running the procedure again over the
    -- same staging data adds nothing new.
    -- Takes no parameters and returns no result set.
    create procedure cleanse_map_staging_data()
    proc_main:begin
    
    declare v_cursor_done tinyint unsigned default 0;
    
    -- watch out for variable names that have the same names as fields !!
    
    declare v_person_id int unsigned;
    
    declare v_states_csv varchar(1024);
    declare v_state_code varchar(3);
    declare v_state_id tinyint unsigned;
    
    declare v_states_len int unsigned; -- length of the cleaned csv, in characters
    declare v_states_idx int unsigned; -- 1-based start of the token being read
    declare v_delim_pos int unsigned;  -- position of the '~' that ends the token
    
    declare v_staging_cur cursor for select person_id, states_csv from staging order by person_id;
    declare continue handler for not found set v_cursor_done = 1;
    
    -- do the person data
    
    -- start transaction (rather than "set autocommit = 0") keeps the caller's
    -- session autocommit setting untouched once the procedure returns
    start transaction;
    
    insert ignore into persons (person_id, name)
      select person_id, name from staging order by person_id;
    
    commit;
    
    -- ok now we have to use the cursor !!
    
    start transaction;
    
    open v_staging_cur;
    
    staging_loop: loop
    
      fetch v_staging_cur into v_person_id, v_states_csv;
    
      -- test the flag straight after the fetch: when the fetch finds no row the
      -- variables still hold the previous row, which must not be processed again
      if v_cursor_done then
        leave staging_loop;
      end if;
    
      -- clean up the data (for example); a null csv is treated as "no states"
    
      set v_states_csv = upper(trim(coalesce(v_states_csv, '')));
      set v_states_len = char_length(v_states_csv);
      set v_states_idx = 1;
    
      -- split out the v_states_csv and insert
    
      while v_states_idx <= v_states_len do
    
        set v_delim_pos = locate('~', v_states_csv, v_states_idx);
    
        -- no further delimiter: the last token runs to the end of the string
        if v_delim_pos = 0 then
          set v_delim_pos = v_states_len + 1;
        end if;
    
        set v_state_code = trim(substring(v_states_csv, v_states_idx, v_delim_pos - v_states_idx));
    
        -- advance from the delimiter position, not from the trimmed token length,
        -- so padded tokens ('NY~ NJ') and empty tokens ('NY~~CT') cannot derail the scan
        set v_states_idx = v_delim_pos + 1;
    
        if char_length(v_state_code) > 0 then
    
          -- add the state if it doesnt already exist
          insert ignore into states (state_code) values (v_state_code);
    
          -- scalar subquery instead of "select ... into": an empty result gives
          -- null here and does not fire the cursor's not found handler
          set v_state_id = (select state_id from states where state_code = v_state_code);
    
          -- add the person state
          if v_state_id is not null then
            insert ignore into person_states (state_id, person_id) values (v_state_id, v_person_id);
          end if;
    
        end if;
    
      end while;
    
    end loop staging_loop;
    
    close v_staging_cur;
    
    commit;
    
    end proc_main #
    
    
    delimiter ;
    
    
    -- TESTING
    
    
    -- reload the sample extract into staging
    call load_staging_data();
    
    -- explicit columns and an order by keep the output stable across runs
    select person_id, name, states_csv from staging order by person_id;
    
    -- split staging out into persons, states and person_states
    call cleanse_map_staging_data();
    
    select state_id, state_code, name from states order by state_id;
    select person_id, name from persons order by person_id;
    select state_id, person_id from person_states order by state_id, person_id;