I am trying to cast the fields GCAM_value and Themes_value as number such as float64.
I have tried CAST(regexp_extract(x, r'^(.*?):') as FLOAT64) but got an error message as the code stumbles over NULL values.
#standardSQL
SELECT
GKGRECORDID,
DATE,
ARRAY(
SELECT regexp_extract(x, r'^(.*?)\:')
FROM UNNEST(split(GCAM,',')) AS x
WHERE regexp_extract(x, r'^(.*?)\:') IS NOT NULL
) AS GCAM_field,
ARRAY(
SELECT regexp_extract(x, r'[^:]*$')
FROM UNNEST(split(GCAM,',')) AS x
WHERE regexp_extract(x, r'[^:]*$') IS NOT NULL
) AS GCAM_value,
ARRAY(
SELECT regexp_extract(x, r'^(.*?)\,')
FROM UNNEST(split(V2Themes,';')) AS x
WHERE regexp_extract(x, r'^(.*?)\,') IS NOT NULL
) AS Theme_field,
ARRAY(
SELECT regexp_extract(x, r'[^,]*$')
FROM UNNEST(split(V2Themes,';')) AS x
WHERE regexp_extract(x, r'[^,]*$') IS NOT NULL
) AS Theme_value,
TranslationInfo,
V2Tone,
SourceCollectionIdentifier,
SourceCommonName,
DocumentIdentifier
FROM `gdelt-bq.gdeltv2.gkg_partitioned`
WHERE _PARTITIONTIME BETWEEN TIMESTAMP('2019-02-02') AND TIMESTAMP('2019-
02-02')
I would like the array items for GCAM_value and Themes_value to be in some numeric format like float64.
Below should be good starting point for you
#standardSQL
SELECT
GKGRECORDID,
DATE,
ARRAY(
SELECT REGEXP_EXTRACT(x, r'^(.*?)\:')
FROM UNNEST(SPLIT(GCAM,',')) AS x
WHERE REGEXP_EXTRACT(x, r'^(.*?)\:') IS NOT NULL
) AS GCAM_field,
ARRAY(
SELECT CAST(REGEXP_EXTRACT(x, r'[^:]*$') AS FLOAT64)
FROM UNNEST(SPLIT(GCAM,',')) AS x
WHERE IFNULL(REGEXP_EXTRACT(x, r'[^:]*$'), '') != ''
) AS GCAM_value,
ARRAY(
SELECT REGEXP_EXTRACT(x, r'^(.*?)\,')
FROM UNNEST(SPLIT(V2Themes,';')) AS x
WHERE REGEXP_EXTRACT(x, r'^(.*?)\,') IS NOT NULL
) AS Theme_field,
ARRAY(
SELECT CAST(REGEXP_EXTRACT(x, r'[^,]*$') AS FLOAT64)
FROM UNNEST(SPLIT(V2Themes,';')) AS x
WHERE IFNULL(REGEXP_EXTRACT(x, r'[^,]*$'), '') != ''
) AS Theme_value,
TranslationInfo,
V2Tone,
SourceCollectionIdentifier,
SourceCommonName,
DocumentIdentifier
FROM `gdelt-bq.gdeltv2.gkg_partitioned`
WHERE _PARTITIONTIME BETWEEN TIMESTAMP('2019-02-02') AND TIMESTAMP('2019-02-02')
It returns REPEATED FLOAT
instead REPEATED STRING
for respective fields