python serialization multiprocessing pickle dill

How to serialize custom classes in Python?


I have a custom class that I want to serialize for multiprocessing, but neither pickle nor dill works correctly: both lose important data. How can I fix this?

My class:

import pickle
import dill
import pandas as pd
import numpy as np

class C(pd.Series):
    """A float64 ``pd.Series`` that carries one extra piece of state, ``value``.

    ``value`` is kept in the private ``_value`` attribute and exposed through
    a property. The class defines no pickling hooks of its own.
    """

    def __init__(self, value: str, *args, **kwargs):
        """Build the underlying series, then attach ``value``.

        Args:
            value: The extra attribute to store on the instance.
            *args: Positional arguments forwarded to ``pd.Series``.
            **kwargs: Keyword arguments forwarded to ``pd.Series``; ``dtype``
                is always supplied here as ``np.float64``.
        """
        super().__init__(*args, dtype=np.float64, **kwargs)
        self.value = value

    @property
    def value(self):
        """Return the stored value, creating it as ``None`` if it is missing."""
        if not hasattr(self, "_value"):
            # No backing attribute yet: fall back to None and remember it.
            self._value = None
        return self._value

    @value.setter
    def value(self, value: str):
        """Store ``value`` in the private ``_value`` attribute."""
        self._value = value


# Demonstration of the problem: build an instance with value=1, then
# round-trip it through pickle and through dill.
c1 = C(data=[0], index=[0], value=1)

# Round trip via the standard-library pickle module.
c2 = pickle.loads(pickle.dumps(c1))

print(c1.value) # prints 1
print(c2.value) # prints None -- the attribute was lost in the round trip

# Same round trip via dill.
c2 = dill.loads(dill.dumps(c1))

print(c1.value) # prints 1
print(c2.value) # prints None -- lost here as well

Solution

  • As @gog commented, you need to provide your own implementations of __getstate__ and __setstate__.

    The pandas implementation of __getstate__ returns a dictionary, but that dictionary does not contain user-set attributes such as _value. So we must explicitly add the extra key/value pair in __getstate__ and read it back in __setstate__:

    import pickle
    import dill
    import pandas as pd
    import numpy as np
    
    class C(pd.Series):
        """A float64 ``pd.Series`` with an extra ``value`` attribute that survives pickling.

        ``value`` is kept in the private ``_value`` attribute. ``__getstate__``
        and ``__setstate__`` extend the state dictionary handled by the parent
        class with a ``'_value'`` entry so the attribute is carried through a
        pickle (or dill) round trip.
        """

        def __init__(self, value: str, *args, **kwargs):
            """Build the underlying series, then attach ``value``.

            Args:
                value: The extra attribute to store on the instance.
                *args: Positional arguments forwarded to ``pd.Series``.
                **kwargs: Keyword arguments forwarded to ``pd.Series``;
                    ``dtype`` is always supplied here as ``np.float64``.
            """
            super().__init__(*args, **kwargs, dtype=np.float64)
            self.value = value

        @property
        def value(self):
            """Return the stored value, defaulting to ``None`` if it was never set."""
            try:
                return self._value
            except AttributeError:
                self._value = None
                return self._value

        @value.setter
        def value(self, value: str):
            """Store ``value`` in the private ``_value`` attribute."""
            self._value = value

        def __getstate__(self):
            """Return the parent's state dictionary with ``'_value'`` added."""
            the_dict = super().__getstate__()
            # Go through the property rather than ``self._value`` so that an
            # instance whose ``_value`` was never assigned is serialized with
            # the same ``None`` default the getter uses, instead of raising
            # AttributeError.
            the_dict['_value'] = self.value
            return the_dict

        def __setstate__(self, state):
            """Restore the parent's state, then ``_value``.

            A state dictionary without a ``'_value'`` entry (for example one
            produced before ``__getstate__`` was added) restores ``None``.
            """
            super().__setstate__(state)
            self._value = state.get('_value')
    
    # Same demonstration as in the question, now using the class that
    # defines __getstate__ and __setstate__.
    c1 = C(data=[0], index=[0], value=1)
    
    # Round trip via the standard-library pickle module.
    c2 = pickle.loads(pickle.dumps(c1))
    
    print(c1.value) # prints 1
    print(c2.value) # prints 1 -- the attribute is preserved
    
    # Same round trip via dill.
    c2 = dill.loads(dill.dumps(c1))
    
    print(c1.value) # prints 1
    print(c2.value) # prints 1 -- preserved here as well
    

    Prints:

    1
    1
    1
    1