sqloracle-databaselistagg

Find modified file extensions


I'm trying to convert a query from postgres to Oracle and I'm running into a problem I can't seem to figure out and was hoping someone can help me.

The query should find for each day, how many times each file extension was modified. Then for each day, get the max modified file extension.

Below is my setup and attempt. If there is a more efficient way to accomplish this task I'm certainly open to any suggestions. Thanks in advance to all who answer and your expertise.

create table files
(
id              int primary key,
date_modified   date,
file_name       varchar(50)
);

insert into files values (1    ,   to_date('2021-06-03','yyyy-mm-dd'), 'thresholds.svg');
insert into files values (2    ,   to_date('2021-06-01','yyyy-mm-dd'), 'redrag.py');
insert into files values (3    ,   to_date('2021-06-03','yyyy-mm-dd'), 'counter.pdf');
insert into files values (4    ,   to_date('2021-06-06','yyyy-mm-dd'), 'reinfusion.py');
insert into files values (5    ,   to_date('2021-06-06','yyyy-mm-dd'), 'tonoplast.docx');
insert into files values (6    ,   to_date('2021-06-01','yyyy-mm-dd'), 'uranian.pptx');
insert into files values (7    ,   to_date('2021-06-03','yyyy-mm-dd'), 'discuss.pdf');
insert into files values (8    ,   to_date('2021-06-06','yyyy-mm-dd'), 'nontheologically.pdf');
insert into files values (9    ,   to_date('2021-06-01','yyyy-mm-dd'), 'skiagrams.py');
insert into files values (10,   to_date('2021-06-04','yyyy-mm-dd'), 'flavors.py');
insert into files values (11,   to_date('2021-06-05','yyyy-mm-dd'), 'nonv.pptx');
insert into files values (12,   to_date('2021-06-01','yyyy-mm-dd'), 'under.pptx');
insert into files values (13,   to_date('2021-06-02','yyyy-mm-dd'), 'demit.csv');
insert into files values (14,   to_date('2021-06-02','yyyy-mm-dd'), 'trailings.pptx');
insert into files values (15,   to_date('2021-06-04','yyyy-mm-dd'), 'asst.py');
insert into files values (16,   to_date('2021-06-03','yyyy-mm-dd'), 'pseudo.pdf');
insert into files values (17,   to_date('2021-06-03','yyyy-mm-dd'), 'unguarded.jpeg');
insert into files values (18,   to_date('2021-06-06','yyyy-mm-dd'), 'suzy.docx');
insert into files values (19,   to_date('2021-06-06','yyyy-mm-dd'), 'anitsplentic.py');
insert into files values (20,   to_date('2021-06-03','yyyy-mm-dd'), 'tallies.py');

with cte as
(select date_modified, substring(file_name, position('.' in file_name) + 1) as file_ext, count(1) as cnt
from files
group by date_modified,file_ext)
select date_modified, listagg(file_ext, ',' order by file_ext desc) as extension, max(cnt) as the_count
from cte c1
where cnt = (select max(cnt) from cte c2 where c1.date_modified=c2.date_modified )
group by date_modified
order by 1;


Solution

  • This is your code, modified so that it doesn't return an error:

    SQL> with cte as
      2  (select date_modified,
      3          regexp_substr(file_name, '\w+$') as file_ext,
      4          count(*) as cnt
      5   from files
      6   group by date_modified,
      7            regexp_substr(file_name, '\w+$')
      8  )
      9  select date_modified,
     10    listagg(file_ext, ',') within group (order by file_ext desc) as extension,
     11    max(cnt) as the_count
     12  from cte c1
     13  where cnt = (select max(cnt) from cte c2 where c1.date_modified=c2.date_modified )
     14  group by date_modified
     15  order by 1;
    
    DATE_MODIF EXTENSION        THE_COUNT
    ---------- --------------- ----------
    2021-06-01 py,pptx                  2
    2021-06-02 pptx,csv                 1
    2021-06-03 pdf                      3
    2021-06-04 py                       2
    2021-06-05 pptx                     1
    2021-06-06 py,docx                  2
    
    6 rows selected.
    
    SQL>