google-bigquery

BigQuery: Convert accented characters to their plain ascii equivalents


I have the following string:

brasília

And I need to convert to:

brasilia

Withou the ´ accent!

How can I do on BigQuery?

Thank you!


Solution

  • Try below as quick and simple option for you:

    #standardSQL
    WITH lookups AS (
      SELECT 
      'ç,æ,œ,á,é,í,ó,ú,à,è,ì,ò,ù,ä,ë,ï,ö,ü,ÿ,â,ê,î,ô,û,å,ø,Ø,Å,Á,À,Â,Ä,È,É,Ê,Ë,Í,Î,Ï,Ì,Ò,Ó,Ô,Ö,Ú,Ù,Û,Ü,Ÿ,Ç,Æ,Œ,ñ' AS accents,
      'c,ae,oe,a,e,i,o,u,a,e,i,o,u,a,e,i,o,u,y,a,e,i,o,u,a,o,O,A,A,A,A,A,E,E,E,E,I,I,I,I,O,O,O,O,U,U,U,U,Y,C,AE,OE,n' AS latins
    ),
    pairs AS (
      SELECT accent, latin FROM lookups, 
        UNNEST(SPLIT(accents)) AS accent WITH OFFSET AS p1, 
        UNNEST(SPLIT(latins)) AS latin WITH OFFSET AS p2
      WHERE p1 = p2
    ),
    yourTableWithWords AS (
      SELECT word FROM UNNEST(
            SPLIT('brasília,ångström,aperçu,barège, beau idéal, belle époque, béguin, bête noire, bêtise, Bichon Frisé, blasé, blessèd, bobèche, boîte, bombé, Bön, Boötes, boutonnière, bric-à-brac, Brontë Beyoncé,El Niño')
        ) AS word
    )
    SELECT 
      word AS word_with_accent, 
      (SELECT STRING_AGG(IFNULL(latin, char), '')
        FROM UNNEST(SPLIT(word, '')) char
        LEFT JOIN pairs
        ON char = accent) AS word_without_accent
    FROM yourTableWithWords   
    

    Output is

    word_with_accent word_without_accent     
    blessèd         blessed  
    El Niño         El Nino  
    belle époque    belle epoque     
    boîte           boite    
    Boötes          Bootes   
    blasé           blase    
    ångström        angstrom     
    bobèche         bobeche  
    barège          barege   
    bric-à-brac     bric-a-brac  
    bête noire      bete noire   
    Bichon Frisé    Bichon Frise     
    Brontë Beyoncé  Bronte Beyonce   
    bêtise          betise   
    beau idéal      beau ideal   
    bombé           bombe    
    brasília        brasilia     
    boutonnière     boutonniere  
    aperçu          apercu   
    béguin          beguin   
    Bön             Bon   
    

    UPDATE

    Below is how to pack this logic into SQL UDF - so accent2latin(word) can be called to make a "magic"

    #standardSQL
    CREATE TEMP FUNCTION accent2latin(word STRING) AS
    ((
      WITH lookups AS (
        SELECT 
        'ç,æ,œ,á,é,í,ó,ú,à,è,ì,ò,ù,ä,ë,ï,ö,ü,ÿ,â,ê,î,ô,û,å,ø,Ø,Å,Á,À,Â,Ä,È,É,Ê,Ë,Í,Î,Ï,Ì,Ò,Ó,Ô,Ö,Ú,Ù,Û,Ü,Ÿ,Ç,Æ,Œ,ñ' AS accents,
        'c,ae,oe,a,e,i,o,u,a,e,i,o,u,a,e,i,o,u,y,a,e,i,o,u,a,o,O,A,A,A,A,A,E,E,E,E,I,I,I,I,O,O,O,O,U,U,U,U,Y,C,AE,OE,n' AS latins
      ),
      pairs AS (
        SELECT accent, latin FROM lookups, 
          UNNEST(SPLIT(accents)) AS accent WITH OFFSET AS p1, 
          UNNEST(SPLIT(latins)) AS latin WITH OFFSET AS p2
        WHERE p1 = p2
      )
      SELECT STRING_AGG(IFNULL(latin, char), '')
      FROM UNNEST(SPLIT(word, '')) char
      LEFT JOIN pairs
      ON char = accent
    ));
    
    WITH yourTableWithWords AS (
      SELECT word FROM UNNEST(
            SPLIT('brasília,ångström,aperçu,barège, beau idéal, belle époque, béguin, bête noire, bêtise, Bichon Frisé, blasé, blessèd, bobèche, boîte, bombé, Bön, Boötes, boutonnière, bric-à-brac, Brontë Beyoncé,El Niño')
        ) AS word
    )
    SELECT 
      word AS word_with_accent, 
      accent2latin(word) AS word_without_accent
    FROM yourTableWithWords