sqlrnetezza

SQL: Learning How to Use The Percentile Functions in SQL


I am working with Netezza SQL.

I have the following table:

CREATE TABLE MY_TABLE (
    country VARCHAR(50),
    gender CHAR(1),
    age INTEGER,
    height FLOAT,
    owns_bicycle VARCHAR(3)
);

INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('UK', 'M', 25, 175.99, 'No');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('UK', 'F', 63, 163.65, 'No');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('UK', 'F', 83, 166.01, 'Yes');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('Canada', 'M', 94, 178.92, 'Yes');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('UK', 'M', 63, 173.24, 'Yes');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('Canada', 'M', 87, 156.27, 'Yes');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('Canada', 'F', 89, 159.26, 'No');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('Canada', 'M', 61, 179.14, 'Yes');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('UK', 'F', 49, 167.55, 'No');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('USA', 'F', 53, 172.82, 'No');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('Canada', 'F', 62, 161.18, 'No');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('Canada', 'F', 31, 173.08, 'No');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('USA', 'F', 33, 166.13, 'Yes');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('Canada', 'F', 50, 177.42, 'No');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('UK', 'F', 57, 155.56, 'No');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('USA', 'M', 57, 158.47, 'Yes');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('UK', 'F', 27, 152.85, 'No');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('UK', 'F', 89, 156.31, 'Yes');
INSERT INTO MY_TABLE (country, gender, age, height, owns_bicycle)
VALUES ('USA', 'F', 99, 179.31, 'No');

My Problem:

The final result should look something like this (i.e. each person should only be assigned to one final subset):

  country gender height_group    age_group   bicycle_proportion counts
  <chr>   <chr>  <fct>           <fct>                    <dbl>  <int>
1 Canada  F      150.84 - 158.49 18 - 31                  0          2
2 Canada  F      150.84 - 158.49 31 - 45.2                0.333      3
3 Canada  F      150.84 - 158.49 62.4 - 78.4              0          2
4 Canada  F      150.84 - 158.49 78.4 - 99                0          1
5 Canada  F      158.49 - 169.33 18 - 31                  0          1
6 Canada  F      158.49 - 169.33 31 - 45.2                1          1

I know how to do this using the R programming language:

library(dplyr)



set.seed(123)
n <- 100
country <- sample(c("USA", "Canada", "UK"), n, replace = TRUE)
gender <- sample(c("M", "F"), n, replace = TRUE)
age <- sample(18:100, n, replace = TRUE)
height <- runif(n, min = 150, max = 180)
owns_bicycle <- sample(c("Yes", "No"), n, replace = TRUE)

df <- data.frame(country, gender, age, height, owns_bicycle)


height_breaks <- quantile(df$height, probs = seq(0, 1, by = 1/3))
age_breaks <- quantile(df$age, probs = seq(0, 1, by = 1/5))

height_breaks <- round(height_breaks, 2)

height_labels <- paste0(height_breaks[-length(height_breaks)], " - ", height_breaks[-1])
age_labels <- paste0(age_breaks[-length(age_breaks)], " - ", age_breaks[-1])


df$height_group <- cut(df$height, breaks = height_breaks, labels = height_labels, include.lowest = TRUE)
df$age_group <- cut(df$age, breaks = age_breaks, labels = age_labels, include.lowest = TRUE)

final = df %>%
  group_by(country, gender, height_group, age_group) %>%
  summarise(bicycle_proportion = mean(owns_bicycle == "Yes"),
            counts = n())

Now, I am trying to convert this into Netezza SQL.

I am not sure how to do this:

CREATE TABLE height_groups AS
  SELECT
    NTILE(3) OVER (ORDER BY height) AS height_group,
    MIN(height) AS min_height,
    MAX(height) AS max_height
  FROM MY_TABLE;

CREATE TABLE age_groups AS
  SELECT
    NTILE(5) OVER (ORDER BY age) AS age_group,
    MIN(age) AS min_age,
    MAX(age) AS max_age
  FROM MY_TABLE;

But I don't think I am doing this correctly. Can someone please show me how to do this?

Thanks!


Solution

  • Here is an example use of some functions to arrive at the equivalent of your R code. It uses percentile_cont to arrive at the 2 boundary values between quantiles which can then be used in case expressions to arrive at height or age labels. The "cross join" simply adds these values to each table row for ease of use in the case expressions. (This is displayed further in the referenced fiddle below.)

    SELECT
          t.country
        , t.gender
        , CASE WHEN t.height <= height_quantiles.q1 THEN 'short' 
               WHEN t.height <= height_quantiles.q2 THEN 'medium' 
               ELSE 'tall' 
          END AS height_group
        , CASE WHEN t.age <= age_quantiles.q1 THEN 'young' 
               WHEN t.age <= age_quantiles.q2 THEN 'middle_age' 
               ELSE 'old' 
          END AS age_group
        , AVG(CASE WHEN t.owns_bicycle = 'Yes' THEN 1 ELSE 0 END) AS bicycle_proportion
        , COUNT(*) AS counts
    FROM MY_TABLE t
    CROSS JOIN (
        SELECT
              PERCENTILE_CONT(0.33) WITHIN GROUP (ORDER BY height) AS q1
            , PERCENTILE_CONT(0.67) WITHIN GROUP (ORDER BY height) AS q2
        FROM MY_TABLE t
        ) height_quantiles
    CROSS JOIN (
        SELECT
              PERCENTILE_CONT(0.2) WITHIN GROUP (ORDER BY age) AS q1
            , PERCENTILE_CONT(0.4) WITHIN GROUP (ORDER BY age) AS q2
        FROM MY_TABLE t
        ) age_quantiles
    GROUP BY
          t.country
        , t.gender
        , height_group
        , age_group
    ORDER BY
          t.country
        , t.gender
        , height_group
        , age_group
    
    country gender height_group age_group bicycle_proportion counts
    Canada F medium old 0.00000000000000000000 1
    Canada F medium young 0.00000000000000000000 1
    Canada F short old 0.00000000000000000000 1
    Canada F tall middle_age 0.00000000000000000000 1
    Canada M short old 1.00000000000000000000 1
    Canada M tall old 1.00000000000000000000 2
    UK F medium middle_age 0.00000000000000000000 1
    UK F medium old 0.50000000000000000000 2
    UK F short middle_age 0.00000000000000000000 1
    UK F short old 1.00000000000000000000 1
    UK F short young 0.00000000000000000000 1
    UK M tall old 1.00000000000000000000 1
    UK M tall young 0.00000000000000000000 1
    USA F medium middle_age 0.00000000000000000000 1
    USA F medium young 1.00000000000000000000 1
    USA F tall old 0.00000000000000000000 1
    USA M short middle_age 1.00000000000000000000 1

    fiddle

    Re: An "adaptable" variant. SQL is not a programming language, it deals with sets of data, so mimicking the R code may be possible if you use a "procedural extension" to SQL (e.g. plsql in Postgres) but as I'm not able to run anything in Netazza the best I can suggest is something along these lines.

    with CTE as (
        SELECT
          *
         , NTILE(4) OVER (ORDER BY height) AS height_quantile
         , NTILE(5) OVER (ORDER BY age) AS age_quantile
        FROM my_table
     )
    , height_labels as (
      select
           height_quantile hq
         , concat(floor(min(round(height::decimal,2))) , ' to '
                , max(ceiling(round(height::decimal,2)) ) ) height_label
      from CTE
     group by
           height_quantile
      )
    , age_labels as (
      select
           age_quantile aq
         , concat(min(age ) , ' to '
                , max(age ) ) age_label
      from CTE
     group by
           age_quantile
      )
    select
      *
    from CTE 
    inner join height_labels h on cte.height_quantile=h.hq
    inner join age_labels    a on cte.age_quantile=a.aq
    order by
        height, age
    
    country gender age height owns_bicycle height_quantile age_quantile hq height_label aq age_label
    UK F 27 152.85 No 1 1 1 152 to 159 1 25 to 33
    UK F 57 155.56 No 1 3 1 152 to 159 3 57 to 63
    Canada M 87 156.27 Yes 1 4 1 152 to 159 4 63 to 89
    UK F 89 156.31 Yes 1 5 1 152 to 159 5 89 to 99
    USA M 57 158.47 Yes 1 2 1 152 to 159 2 49 to 57
    Canada F 89 159.26 No 2 4 2 159 to 167 4 63 to 89
    Canada F 62 161.18 No 2 3 2 159 to 167 3 57 to 63
    UK F 63 163.65 No 2 3 2 159 to 167 3 57 to 63
    UK F 83 166.01 Yes 2 4 2 159 to 167 4 63 to 89
    USA F 33 166.13 Yes 2 1 2 159 to 167 1 25 to 33
    UK F 49 167.55 No 3 2 3 167 to 176 2 49 to 57
    USA F 53 172.82 No 3 2 3 167 to 176 2 49 to 57
    Canada F 31 173.08 No 3 1 3 167 to 176 1 25 to 33
    UK M 63 173.24 Yes 3 4 3 167 to 176 4 63 to 89
    UK M 25 175.99 No 3 1 3 167 to 176 1 25 to 33
    Canada F 50 177.42 No 4 2 4 177 to 180 2 49 to 57
    Canada M 94 178.92 Yes 4 5 4 177 to 180 5 89 to 99
    Canada M 61 179.14 Yes 4 3 4 177 to 180 3 57 to 63
    USA F 99 179.31 No 4 5 4 177 to 180 5 89 to 99

    The third fiddle

    In this approach the labels are generated from the min/max pairs of each NTILE, and then those used in the final output. To simplify the label creation I introduced the functions floor and ceiling.