I would like to apply the template method pattern to a data science project in which I need to select (identify) target subjects from a large pool of original subjects. I will create tags based on different characteristics of these subjects, e.g., age, sex, disease status, etc.
I would like this code to be reusable in future projects of a similar nature. However, every project is somewhat different, and the criteria for selecting subjects into the final filtered pool differ from one project to another. How do I structure subject_selection_steps
so that it is flexible and customizable based on project needs? Currently I have included only three tags in my code, but I may need more or fewer in different projects.
import sys
from abc import ABC, abstractmethod
import pandas as pd
import datetime
import ctypes
import numpy as np
import random
import pysnooper
import var_creator.var_creator as vc
import feature_tagger.feature_tagger as ft
import data_descriptor.data_descriptor as dd
import data_transformer.data_transformer as dt
import helper_functions.helper_functions as hf
import sec1_data_preparation as data_prep
import sec2_prepped_data_import as prepped_data_import
class SubjectGrouping(ABC):
    '''Template-method skeleton for narrowing a subject pool to a target group.

    subject_selection_steps() fixes the order of the workflow; subclasses
    provide the project-specific work by overriding the hook methods.
    '''

    def __init__(self):
        pass

    def subject_selection_steps(self):
        '''Run the whole selection workflow in its fixed order.

        The create-tag / filter stage is driven by tag_steps(), so a project
        can use more or fewer than three tags without overriding this method.
        '''
        self._pandas_output_setting()
        self.run_data_preparation()
        self.import_processed_main_data()
        self.inject_test_data()
        self.create_all_subject_list()
        # Each tag is created and then immediately used to filter, in order.
        for create_tag, filter_by_tag in self.tag_steps():
            create_tag()
            filter_by_tag()
        self.finalize_data()

    def tag_steps(self):
        '''Return the ordered (create_tag, filter_by_tag) callables to run.

        The default is the three-tag sequence CREATE_TAGn /
        FILTER_SUBJECT_BY_TAGn (n = 1..3). Override this in a subclass to
        return a different number of pairs for a given project.
        '''
        return [
            (self.CREATE_TAG1, self.FILTER_SUBJECT_BY_TAG1),
            (self.CREATE_TAG2, self.FILTER_SUBJECT_BY_TAG2),
            (self.CREATE_TAG3, self.FILTER_SUBJECT_BY_TAG3),
        ]

    def _pandas_output_setting(self):
        '''Set pandas output display setting'''
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 180)

    @abstractmethod
    def run_data_preparation(self):
        '''Run data_preparation_steps from base class'''
        pass

    @abstractmethod
    def import_processed_main_data(self):
        '''Import processed main data'''
        pass

    def inject_test_data(self):
        '''For unit tests: inject mock cases that are certain to fulfil/fail the defined subject selection criteria'''
        pass

    def create_all_subject_list(self):
        '''Gather all the unique subject ids from all datasets and create a full subject list'''
        pass

    # Optional hooks: each is a no-op unless a subclass overrides it.
    def CREATE_TAG1(self):
        pass

    def FILTER_SUBJECT_BY_TAG1(self):
        pass

    def CREATE_TAG2(self):
        pass

    def FILTER_SUBJECT_BY_TAG2(self):
        pass

    def CREATE_TAG3(self):
        pass

    def FILTER_SUBJECT_BY_TAG3(self):
        pass

    def finalize_data(self):
        '''Optional final hook, called last by subject_selection_steps(); no-op by default'''
        pass
class SubjectGrouping_Project1(SubjectGrouping, data_prep.DataPreparation_Project1):
    '''Project 1 implementation of the subject-selection workflow.

    Supplies the two abstract steps of SubjectGrouping: data preparation
    (delegated to data_preparation_steps()) and import of the prepped data.
    '''

    def __init__(self):
        # NOTE(review): no base-class __init__ is invoked here; confirm that
        # DataPreparation_Project1 needs no initialisation of its own.
        # Main data frames, filled in by import_processed_main_data().
        self.df_dad = None
        self.df_pc = None
        self.df_nacrs = None
        self.df_pin = None
        self.df_reg = None
        # Result frames for the final subject groups; not populated in this class.
        self.df_final_subject_group1 = None
        self.df_final_subject_group2 = None
        self.df_final_subject_group3 = None
        self.control_panel = {
            'save_file_switch': False,  # WARNING: Will overwrite existing files
            'df_subsampling_switch': True,  # WARNING: Only switch to True when testing
            'df_subsampling_n': 8999,
            'random_seed': 888,
            'df_remove_dup_switch': True,
            'parse_date_switch': True,
            'result_printout_switch': True,
            'comp_loc': 'office',
            'show_df_n_switch': False,  # To be implemented. Show df length before and after record removal
            'done_switch': False,
        }

    def run_data_preparation(self):
        '''Run data_preparation_steps from base class'''
        self.data_preparation_steps()

    def import_processed_main_data(self):
        '''Import processed main data and store each frame on this instance.

        The frames are stored under the attribute names declared in __init__.
        The short names the original code assigned (df_d, df_p, df_n, df_r)
        are kept as aliases so existing readers of those names still work.
        '''
        importer = prepped_data_import.PreppedDataImport_Project1()
        importer.data_preparation_steps()
        importer.prepped_data_import_steps()
        df_dict = importer.return_all_dfs()

        self.df_dad = df_dict['DF_D']
        self.df_pc = df_dict['DF_P']
        self.df_nacrs = df_dict['DF_N']
        # NOTE(review): the original reads 'DF_P' for both the second and the
        # fourth frame; confirm whether df_pin should come from another key.
        self.df_pin = df_dict['DF_P']
        self.df_reg = df_dict['DF_R']

        # Backward-compatible aliases for the names the original populated.
        self.df_d = self.df_dad
        self.df_p = self.df_pc
        self.df_n = self.df_nacrs
        self.df_r = self.df_reg
if __name__ == '__main__':
    # Script entry point: build the Project 1 grouping and run its workflow.
    project1_grouping = SubjectGrouping_Project1()
    project1_grouping.subject_selection_steps()
Consider the Filter pattern. It lets you filter a list of objects based on defined filters, and you can easily introduce a new filter later with minimal changes to your code.
Create a Criteria
interface or abstract class.
class Criteria:
    '''Common base for filters; each concrete filter overrides filter().'''

    def filter(self, request):
        '''Placeholder to be overridden; the base version always raises NotImplementedError.'''
        message = "Should have implemented this"
        raise NotImplementedError(message)
Then have each of your filters extend the Criteria
class. For example, suppose one of the filters is an age filter:
class AgeFilter(Criteria):
    '''Criteria that keeps only the items strictly older than a threshold age.'''

    def __init__(self, age=20):
        '''Store the exclusive lower age bound (20 unless given).'''
        self.age = age

    def filter(self, list):
        '''Return a new list of the items whose ``age`` is greater than ``self.age``.

        The parameter keeps its original name ``list`` (which shadows the
        builtin inside this method) so existing keyword callers still work.
        The input is not modified.
        '''
        # Iterate the argument itself: the instance has no ``list`` attribute.
        return [item for item in list if item.age > self.age]
Similarly, you can define other filters, such as DiseaseFilter
and GenderFilter,
by extending the Criteria
interface.
You can also combine filters with logical operations by defining And
or Or
filters. For example:
class AndFilter(Criteria):
    '''Criteria that keeps only the items accepted by both wrapped filters.'''

    def __init__(self, filter1, filter2):
        '''Store the two filters to apply; filter1 runs first.'''
        self.filter1 = filter1
        self.filter2 = filter2

    def filter(self, list):
        '''Return the result of applying filter1, then filter2 to its output.

        The parameter keeps its original name ``list`` (which shadows the
        builtin inside this method) so existing keyword callers still work.
        '''
        # The wrapped filters live on the instance, so reach them through self.
        passed_first = self.filter1.filter(list)
        return self.filter2.filter(passed_first)
Assuming you have already defined your filters, your subject_selection_steps
method will then look like this:
def subject_selection_steps(self):
    '''Apply every configured criteria in order and return what remains.'''
    # Ordered chain of filters; each one receives the output of the previous.
    criteria_chain = [ageFilter1, maleFilter, MalariaAndJaundiceFilter]
    remaining = personList
    for step in criteria_chain:
        remaining = step.filter(remaining)
    return remaining