azure-machine-learning-service

StreamClosed error when trying to work with an image imported from a datastore into an AzureML notebook


I'm following the approach described in this tutorial for loading an image from an Azure datastore into an AzureML notebook. The image loads, but when I try to convert the dictionary containing the image into a datasets Dataset object, I get a StreamClosed error. Note that I'm running this in an Azure Machine Learning Studio notebook instance.

from azureml.fsspec import AzureMachineLearningFileSystem
from datasets import Dataset
from PIL import Image

fs = AzureMachineLearningFileSystem('path_to_my_datastore')

file_list = ['file1', 'file2', 'file3']

img_lst = []

for file in file_list:
    with fs.open('data_direc/'+file+'.jpg') as f:
        img = Image.open(f)
        img_lst.append(img)

test_dict = {'image': img_lst}

Dataset.from_dict(test_dict)

I get an error on the last line:

---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
Cell In[88], line 1
----> 1 Dataset.from_dict(data_dict)

File /anaconda/envs/main-env/lib/python3.11/site-packages/datasets/arrow_dataset.py:912, in Dataset.from_dict(cls, mapping, features, info, split)
    910     arrow_typed_mapping[col] = data
    911 mapping = arrow_typed_mapping
--> 912 pa_table = InMemoryTable.from_pydict(mapping=mapping)
    913 if info is None:
    914     info = DatasetInfo()

File /anaconda/envs/main-env/lib/python3.11/site-packages/datasets/table.py:758, in InMemoryTable.from_pydict(cls, *args, **kwargs)
    742 @classmethod
    743 def from_pydict(cls, *args, **kwargs):
    744     """
    745     Construct a Table from Arrow arrays or columns.
    746 
   (...)
    756         `datasets.table.Table`
    757     """
--> 758     return cls(pa.Table.from_pydict(*args, **kwargs))

File /anaconda/envs/main-env/lib/python3.11/site-packages/pyarrow/table.pxi:1813, in pyarrow.lib._Tabular.from_pydict()

File /anaconda/envs/main-env/lib/python3.11/site-packages/pyarrow/table.pxi:5347, in pyarrow.lib._from_pydict()

File /anaconda/envs/main-env/lib/python3.11/site-packages/pyarrow/array.pxi:373, in pyarrow.lib.asarray()

File /anaconda/envs/main-env/lib/python3.11/site-packages/pyarrow/array.pxi:247, in pyarrow.lib.array()

File /anaconda/envs/main-env/lib/python3.11/site-packages/pyarrow/array.pxi:112, in pyarrow.lib._handle_arrow_array_protocol()

File /anaconda/envs/main-env/lib/python3.11/site-packages/datasets/arrow_writer.py:169, in TypedSequence.__arrow_array__(***failed resolving arguments***)
    167 # automatic type inference for custom objects
    168 if self.type is None and self.try_type is None:
--> 169     data, self._inferred_type = self._infer_custom_type_and_encode(data)
    170 if self._inferred_type is None:
    171     type = self.try_type if self.trying_type else self.type

File /anaconda/envs/main-env/lib/python3.11/site-packages/datasets/arrow_writer.py:157, in TypedSequence._infer_custom_type_and_encode(data)
    155     non_null_idx, non_null_value = first_non_null_value(data)
    156     if isinstance(non_null_value, PIL.Image.Image):
--> 157         return [Image().encode_example(value) if value is not None else None for value in data], Image()
    158 return data, None

File /anaconda/envs/main-env/lib/python3.11/site-packages/datasets/arrow_writer.py:157, in <listcomp>(.0)
    155     non_null_idx, non_null_value = first_non_null_value(data)
    156     if isinstance(non_null_value, PIL.Image.Image):
--> 157         return [Image().encode_example(value) if value is not None else None for value in data], Image()
    158 return data, None

File /anaconda/envs/main-env/lib/python3.11/site-packages/datasets/features/image.py:119, in Image.encode_example(self, value)
    116     return encode_np_array(value)
    117 elif isinstance(value, PIL.Image.Image):
    118     # convert the PIL image to bytes (default format is PNG/TIFF)
--> 119     return encode_pil_image(value)
    120 elif value.get("path") is not None and os.path.isfile(value["path"]):
    121     # we set "bytes": None to not duplicate the data if they're already available locally
    122     return {"bytes": None, "path": value.get("path")}

File /anaconda/envs/main-env/lib/python3.11/site-packages/datasets/features/image.py:308, in encode_pil_image(image)
    306     return {"path": image.filename, "bytes": None}
    307 else:
--> 308     return {"path": None, "bytes": image_to_bytes(image)}

File /anaconda/envs/main-env/lib/python3.11/site-packages/datasets/features/image.py:300, in image_to_bytes(image)
    298 else:
    299     format = "PNG" if image.mode in ["1", "L", "LA", "RGB", "RGBA"] else "TIFF"
--> 300 image.save(buffer, format=format)
    301 return buffer.getvalue()

File /anaconda/envs/main-env/lib/python3.11/site-packages/PIL/Image.py:2421, in Image.save(self, fp, format, **params)
   2418     filename = os.path.realpath(os.fspath(fp.name))
   2420 # may mutate self!
-> 2421 self._ensure_mutable()
   2423 save_all = params.pop("save_all", False)
   2424 self.encoderinfo = params

File /anaconda/envs/main-env/lib/python3.11/site-packages/PIL/Image.py:595, in Image._ensure_mutable(self)
    593 def _ensure_mutable(self) -> None:
    594     if self.readonly:
--> 595         self._copy()
    596     else:
    597         self.load()

File /anaconda/envs/main-env/lib/python3.11/site-packages/PIL/Image.py:588, in Image._copy(self)
    587 def _copy(self) -> None:
--> 588     self.load()
    589     self.im = self.im.copy()
    590     self.pyaccess = None

File /anaconda/envs/main-env/lib/python3.11/site-packages/PIL/ImageFile.py:258, in ImageFile.load(self)
    251 self.tile = [
    252     list(tiles)[-1]
    253     for _, tiles in itertools.groupby(
    254         self.tile, lambda tile: (tile[0], tile[1], tile[3])
    255     )
    256 ]
    257 for decoder_name, extents, offset, args in self.tile:
--> 258     seek(offset)
    259     decoder = Image._getdecoder(
    260         self.mode, decoder_name, args, self.decoderconfig
    261     )
    262     try:

File pystreaminfo_companion.py:23, in seek(self, offset, whence)

Exception: StreamClosed

Solution

  • Read each file into bytes first so that the image data is fully in memory before the file stream closes, then create the dataset. PIL opens images lazily: Image.open only reads the header, and the pixel data is decoded later, when Dataset.from_dict tries to encode the image. By that point the with block has already closed the underlying datastore stream, which raises StreamClosed. Wrapping the raw bytes in io.BytesIO avoids this.

    Use the code below:

    import io

    from azureml.fsspec import AzureMachineLearningFileSystem
    from datasets import Dataset
    from PIL import Image

    fs = AzureMachineLearningFileSystem(ds_p)  # ds_p is the datastore URI
    file_list = ['invoice.png']
    img_lst = []

    for file in file_list:
        with fs.open('pdf/' + file) as f:
            # f.read() pulls the whole file into memory, so PIL no longer
            # depends on the datastore stream after the with block exits
            img = Image.open(io.BytesIO(f.read()))
            img_lst.append(img)

    test_dict = {'image': img_lst}

    dataset = Dataset.from_dict(test_dict)
    dataset[0]['image']
    

    Output: the loaded PIL image is displayed.
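
  • Alternatively, you can keep passing the file object to Image.open and force PIL to decode the pixel data while the datastore stream is still open by calling img.load() inside the with block. This is a minimal sketch of that variant, assuming the same fs, file names, and data_direc path from the question:

    from azureml.fsspec import AzureMachineLearningFileSystem
    from datasets import Dataset
    from PIL import Image

    fs = AzureMachineLearningFileSystem('path_to_my_datastore')
    file_list = ['file1', 'file2', 'file3']
    img_lst = []

    for file in file_list:
        with fs.open('data_direc/' + file + '.jpg') as f:
            img = Image.open(f)
            img.load()  # decode the pixel data now, while the stream is still open
            img_lst.append(img)

    test_dict = {'image': img_lst}
    dataset = Dataset.from_dict(test_dict)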