google-bigquerygoogle-cloud-datalab

standard sql: casting strings in array as float64


I am trying to cast the fields GCAM_value and Themes_value as number such as float64.

I have tried CAST(regexp_extract(x, r'^(.*?):') as FLOAT64) but got an error message as the code stumbles over NULL values.

#standardSQL

SELECT
  GKGRECORDID,
  DATE, 
  ARRAY(
    SELECT regexp_extract(x, r'^(.*?)\:')
    FROM UNNEST(split(GCAM,',')) AS x
    WHERE regexp_extract(x, r'^(.*?)\:') IS NOT NULL
  ) AS GCAM_field,
   ARRAY(
    SELECT regexp_extract(x, r'[^:]*$')
    FROM UNNEST(split(GCAM,',')) AS x
    WHERE regexp_extract(x, r'[^:]*$') IS NOT NULL
  ) AS GCAM_value,

    ARRAY(
    SELECT regexp_extract(x, r'^(.*?)\,')
    FROM UNNEST(split(V2Themes,';')) AS x
    WHERE regexp_extract(x, r'^(.*?)\,') IS NOT NULL
  ) AS Theme_field,
   ARRAY(
    SELECT regexp_extract(x, r'[^,]*$')
    FROM UNNEST(split(V2Themes,';')) AS x
    WHERE regexp_extract(x, r'[^,]*$') IS NOT NULL
  ) AS Theme_value,

  TranslationInfo,
  V2Tone,
  SourceCollectionIdentifier,
  SourceCommonName,
  DocumentIdentifier
FROM `gdelt-bq.gdeltv2.gkg_partitioned`
WHERE _PARTITIONTIME BETWEEN TIMESTAMP('2019-02-02') AND TIMESTAMP('2019- 
02-02')

I would like the array items for GCAM_value and Themes_value to be in some numeric format like float64.


Solution

  • Below should be good starting point for you

    #standardSQL
    SELECT
      GKGRECORDID,
      DATE, 
      ARRAY(
        SELECT REGEXP_EXTRACT(x, r'^(.*?)\:')
        FROM UNNEST(SPLIT(GCAM,',')) AS x
        WHERE REGEXP_EXTRACT(x, r'^(.*?)\:') IS NOT NULL
      ) AS GCAM_field,
      ARRAY(
        SELECT CAST(REGEXP_EXTRACT(x, r'[^:]*$') AS FLOAT64)
        FROM UNNEST(SPLIT(GCAM,',')) AS x
        WHERE IFNULL(REGEXP_EXTRACT(x, r'[^:]*$'), '') != ''
      ) AS GCAM_value,
      ARRAY(
        SELECT REGEXP_EXTRACT(x, r'^(.*?)\,')
        FROM UNNEST(SPLIT(V2Themes,';')) AS x
        WHERE REGEXP_EXTRACT(x, r'^(.*?)\,') IS NOT NULL
      ) AS Theme_field,
      ARRAY(
        SELECT CAST(REGEXP_EXTRACT(x, r'[^,]*$') AS FLOAT64)
        FROM UNNEST(SPLIT(V2Themes,';')) AS x
        WHERE IFNULL(REGEXP_EXTRACT(x, r'[^,]*$'), '') != ''
      ) AS Theme_value,
      TranslationInfo,
      V2Tone,
      SourceCollectionIdentifier,
      SourceCommonName,
      DocumentIdentifier
    FROM `gdelt-bq.gdeltv2.gkg_partitioned`
    WHERE _PARTITIONTIME BETWEEN TIMESTAMP('2019-02-02') AND TIMESTAMP('2019-02-02')  
    

    It returns REPEATED FLOAT instead REPEATED STRING for respective fields