pythonpandasyfinance

ValueError while saving a dataframe


I am facing hurdle while saving a pandas data frame to parquet file

Code I am using -

import pandas as pd
import yfinance as yf

start_date = "2022-08-06"
end_date = "2024-08-05"

ticker = 'RELIANCE.NS'
data = yf.download(tickers=ticker, start=start_date, end=end_date, interval="1h")

data.reset_index(inplace=True)

data['Date'] = data['Datetime'].dt.date
data['Time'] = data['Datetime'].dt.time

data.to_parquet('./RELIANCE.parquet')

The error it produces is - ValueError: Can't infer object conversion type: 0

Can someone tell me how to fix this.

PS: Detailed error below-

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 15
     12 data['Date'] = data['Datetime'].dt.date
     13 data['Time'] = data['Datetime'].dt.time
---> 15 data.to_parquet('./RELIANCE.parquet')

File ~/python_venv/lib/python3.10/site-packages/pandas/util/_decorators.py:333, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    327 if len(args) > num_allow_args:
    328     warnings.warn(
    329         msg.format(arguments=_format_argument_list(allow_args)),
    330         FutureWarning,
    331         stacklevel=find_stack_level(),
    332     )
--> 333 return func(*args, **kwargs)

File ~/python_venv/lib/python3.10/site-packages/pandas/core/frame.py:3113, in DataFrame.to_parquet(self, path, engine, compression, index, partition_cols, storage_options, **kwargs)
   3032 """
   3033 Write a DataFrame to the binary parquet format.
   3034 
   (...)
   3109 >>> content = f.read()
   3110 """
   3111 from pandas.io.parquet import to_parquet
-> 3113 return to_parquet(
   3114     self,
   3115     path,
   3116     engine,
   3117     compression=compression,
   3118     index=index,
   3119     partition_cols=partition_cols,
   3120     storage_options=storage_options,
   3121     **kwargs,
   3122 )

File ~/python_venv/lib/python3.10/site-packages/pandas/io/parquet.py:480, in to_parquet(df, path, engine, compression, index, storage_options, partition_cols, filesystem, **kwargs)
    476 impl = get_engine(engine)
    478 path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
--> 480 impl.write(
    481     df,
    482     path_or_buf,
    483     compression=compression,
    484     index=index,
    485     partition_cols=partition_cols,
    486     storage_options=storage_options,
    487     filesystem=filesystem,
    488     **kwargs,
    489 )
    491 if path is None:
    492     assert isinstance(path_or_buf, io.BytesIO)

File ~/python_venv/lib/python3.10/site-packages/pandas/io/parquet.py:349, in FastParquetImpl.write(self, df, path, compression, index, partition_cols, storage_options, filesystem, **kwargs)
    344     raise ValueError(
    345         "storage_options passed with file object or non-fsspec file path"
    346     )
    348 with catch_warnings(record=True):
--> 349     self.api.write(
    350         path,
    351         df,
    352         compression=compression,
    353         write_index=index,
    354         partition_on=partition_cols,
    355         **kwargs,
    356     )

File ~/python_venv/lib/python3.10/site-packages/fastparquet/writer.py:1304, in write(filename, data, row_group_offsets, compression, file_scheme, open_with, mkdirs, has_nulls, write_index, partition_on, fixed_text, append, object_encoding, times, custom_metadata, stats)
   1301 check_column_names(data.columns, partition_on, fixed_text,
   1302                    object_encoding, has_nulls)
   1303 ignore = partition_on if file_scheme != 'simple' else []
-> 1304 fmd = make_metadata(data, has_nulls=has_nulls, ignore_columns=ignore,
   1305                     fixed_text=fixed_text,
   1306                     object_encoding=object_encoding,
   1307                     times=times, index_cols=index_cols,
   1308                     partition_cols=partition_on, cols_dtype=cols_dtype)
   1309 if custom_metadata:
   1310     kvm = fmd.key_value_metadata or []

File ~/python_venv/lib/python3.10/site-packages/fastparquet/writer.py:904, in make_metadata(data, has_nulls, ignore_columns, fixed_text, object_encoding, times, index_cols, partition_cols, cols_dtype)
    902     se.name = column
    903 else:
--> 904     se, type = find_type(data[column], fixed_text=fixed,
    905                          object_encoding=oencoding, times=times,
    906                          is_index=is_index)
    907 col_has_nulls = has_nulls
    908 if has_nulls is None:

File ~/python_venv/lib/python3.10/site-packages/fastparquet/writer.py:122, in find_type(data, fixed_text, object_encoding, times, is_index)
    120 elif dtype == "O":
    121     if object_encoding == 'infer':
--> 122         object_encoding = infer_object_encoding(data)
    124     if object_encoding == 'utf8':
    125         type, converted_type, width = (parquet_thrift.Type.BYTE_ARRAY,
    126                                        parquet_thrift.ConvertedType.UTF8,
    127                                        None)

File ~/python_venv/lib/python3.10/site-packages/fastparquet/writer.py:357, in infer_object_encoding(data)
    355     s += 1
    356 else:
--> 357     raise ValueError("Can't infer object conversion type: %s" % data)
    358 if s > 10:
    359     break

ValueError: Can't infer object conversion type: 0       2022-08-08
1       2022-08-08
2       2022-08-08
3       2022-08-08
4       2022-08-08
           ...    
3398    2024-08-02
3399    2024-08-02
3400    2024-08-02
3401    2024-08-02
3402    2024-08-02
Name: Date, Length: 3403, dtype: object

Solution

  • Your code worked for me without any issues. I assume this issue is arising from the parquet engine that you are using to save the file as parquet( io.parquet.engine, pyarrow or fastparquet).

    Here are the versions of the libraries I have used:

    Pandas version: 2.2.2
    PyArrow version: 16.1.0
    

    To make sure you are using pyarrow or any desired engine make sure you include it like this:

    data.to_parquet('./RELIANCE.parquet', engine='pyarrow')
    

    Here's first two rows of the saved parquet file:

    {"Datetime":1659930300000,"Open":2532.25,"High":2572.75,"Low":2531.39990234375,"Close":2569,"Adj Close":2569,"Volume":0,"Date":"2022-08-08T00:00:00.000Z","Time":33300000000}
    {"Datetime":1659933900000,"Open":2568.60009765625,"High":2571,"Low":2562.10009765625,"Close":2567.300048828125,"Adj Close":2567.300048828125,"Volume":392815,"Date":"2022-08-08T00:00:00.000Z","Time":36900000000}
    

    If you would not consider changing your parquet engine, you should investigate what data type conversions might go wrong. As an example you can try converting time to string.