loopssasdataset

How to count the frequency of change of an attribute in any column in Sas


I have a dataset which has 100 columns. A simpler & smaller version is as follows:

As_of_Date Account Account_Number Request_Type Stage Amount Anticipated_Close_Date Need_by_Date Date_Accepted Date_Confirmed
01-01-2024 John ABC012345 New_Loan Early 1000 02-28-2024
01-20-2024 John ABC012345 New_Loan Pending Acceptance 1500 02-28-2024 02-28-2024
01-20-2024 John ABC345678 Existing_Loan Early 2000 02-28-2024
02-03-2024 John ABC012345 Existing_Loan Confirmed 1500 02-28-2024 02-28-2024 02-10-2024 02-20-2024
02-23-2024 John ABC345678 Existing_Loan Confirmed 2000 02-28-2024 03-10-2024 02-25-2024 02-25-2024
01-01-2024 Sam ABC135790 New_Loan Pending Acceptance 500 01-10-2024 01-10-2024 01-05-2024
01-02-2024 Sam ABC135790 New_Loan Confirmed 500 01-10-2024 01-10-2024 01-03-2024 01-05-2024
02-10-2024 Peter ABC246810 New_Loan Pending Acceptance 1000 03-01-2024 03-01-2024
02-15-2024 Peter ABC246810 Exisitng_Loan Confirmed 1000 03-01-2024 03-07-2024 02-12-2024 02-15-2024
02-20-2024 Peter ABC246810 Exisitng_Loan Confirmed 2000 03-01-2024 03-10-2024 02-18-2024 02-19-2024

The dataset is unique at the 'As_of_Date' and 'Account_Number'. Except for the first 3 columns, any cell of any column can change. I need to find out the number of times the content of a column changes for each Account Number.

I know:

proc sort data = database; by Account_Number Account As_of_Date; run;

data want;
set database;
format lag_value mmddyy10.;
by Account_Number;
lag_vlaue = lag(Anticipated_close_date);
if first.Account_Number then value_change= 0;
else value_change = (Anticipated_close_date^=lag_value);
run;

is giving me the desired result. I need to do it for all the columns so I have done this:

proc contents data = database out=contents noprint; run;

proc sql;
select cats('_',varnum), cats(quote(trim(name),"'"),'N')
into
  :temp_column_vars separated by ' '
  :column_vars separated by ' '
from contents
where (name like '%Date%' and name ne 'As_of_Date');
quit;

proc sort data = database; by Account_Number Account As_of_Date; run;

data final;
set database;
by Account_Number;
array column &column_vars;
array temp_column &temp_column_vars;
do over column;
    lag_vlaue_&temp_column. = lag(&temp_column);
    if first.Account_Number then value_change_&temp_column.= 0;
    else value_change_&temp_column. = (&temp_column^=lag_value_&temp_column.);
end;
run;    

I get an error saying "Statement is not valid or it is used out of proper order". Moreover, if I try to add 'Request_Type', 'Stage' etc in the array I get an error saying "All variables in array list must be the same type, i.e. all numeric or character."


Solution

  • You can track the account, variable name, and delta_count in a hash.

    Example:

    General purpose program to count number of changes in all variables within a by group.

    data have;
        input As_of_Date :mmddyy10. Account $ Account_Number $ Request_Type $ Stage $ Amount Anticipated_Close_Date :mmddyy10. Need_by_Date :mmddyy10. Date_Accepted :mmddyy10. Date_Confirmed :mmddyy10.;
        format As_of_Date mmddyy10. Anticipated_Close_Date mmddyy10. Need_by_Date mmddyy10. Date_Accepted mmddyy10. Date_Confirmed mmddyy10.;
        datalines;
    01-01-2024 John ABC012345 New_Loan Early 1000 02-28-2024 . . .
    01-20-2024 John ABC012345 New_Loan Pending_Acceptance 1500 02-28-2024 02-28-2024 . .
    01-20-2024 John ABC345678 Existing_Loan Early 2000 02-28-2024 . . .
    02-03-2024 John ABC012345 Existing_Loan Confirmed 1500 02-28-2024 02-28-2024 02-10-2024 02-20-2024
    02-23-2024 John ABC345678 Existing_Loan Confirmed 2000 02-28-2024 03-10-2024 02-25-2024 02-25-2024
    01-01-2024 Sam ABC135790 New_Loan Pending_Acceptance 500 01-10-2024 01-10-2024 . 01-05-2024
    01-02-2024 Sam ABC135790 New_Loan Confirmed 500 01-10-2024 01-10-2024 01-03-2024 01-05-2024
    02-10-2024 Peter ABC246810 New_Loan Pending_Acceptance 1000 03-01-2024 03-01-2024 . .
    02-15-2024 Peter ABC246810 Existing_Loan Confirmed 1000 03-01-2024 03-07-2024 02-12-2024 02-15-2024
    02-20-2024 Peter ABC246810 Existing_Loan Confirmed 2000 03-01-2024 03-10-2024 02-18-2024 02-19-2024
    ;
    run;
    
    proc contents noprint data=have out=have_c;
    
    proc sql noprint ;
      select 
        ifc(type=1, cats('_',put(varnum,z6.)) || ' ' || cats(length), '')
      , ifc(type=2, cats('_',put(varnum,z6.)) || ' $' || cats(length), '')
      into
        :num_vars separated by ' '
      , :char_vars separated by ' '
      from
        have_c
     order
        varnum
     ;
    
    proc sort data=have;
      by account as_of_date ;
    run ;
    
    data _null_ ;
      set have end=end ;
      by account ;
    
      retain _nsentinel . _csentinel '' ;
    
      array nums _numeric_ ;    
      array chars _character_ ;
    
      length _nsentinel2 8  &num_vars _nsentinel3 8 ;
      length _csentinel2 $1 &char_vars _csentinel3 $1 ;
    
      array nprev _nsentinel2 -- _nsentinel3 ;
      array cprev _csentinel2 -- _csentinel3 ;
      retain _nsentinel2 -- _nsentinel3 _csentinel2 -- _csentinel3 ;
    
      length name vorder $32 delta_count 8 ;
      if _n_ = 1 then do ;
        declare hash deltas (ordered:'a');
        deltas.defineKey ('account', 'vorder') ;
        deltas.defineData ('account', 'name', 'delta_count') ;
        deltas.defineDone() ;
        declare hiter hi ('deltas');
      end ;
    
      if not first.account then do ;
        do over nums ;
          if nums ne nprev[_i_+1] then do ;
            name = vname(nums) ;
            vorder = vname(nprev[_i_+1]) ;
    
            if deltas.find() ne 0 
              then delta_count = 1 ;
              else delta_count + 1 ;
            deltas.replace() ;
          end ;
        end ;
        do over chars ;
          if chars ne cprev[_i_+1] then do ;
            name = vname(chars) ;
            vorder = vname(cprev[_i_+1]) ;
            if deltas.find() ne 0 
              then delta_count = 1 ;
              else delta_count + 1 ;
            deltas.replace() ;
          end ;
        end ;
      end ;
    
      do over nums ;  nprev[_i_+1] = nums ; end ;
      do over chars ; cprev[_i_+1] = chars ; end ;
    
      if last.account then call missing (of nprev(*), of cprev(*)) ;
    
      if end then deltas.output(dataset:'deltas') ;
    run ;