I am facing hurdle while saving a pandas data frame to parquet file
Code I am using -
import pandas as pd
import yfinance as yf
start_date = "2022-08-06"
end_date = "2024-08-05"
ticker = 'RELIANCE.NS'
data = yf.download(tickers=ticker, start=start_date, end=end_date, interval="1h")
data.reset_index(inplace=True)
data['Date'] = data['Datetime'].dt.date
data['Time'] = data['Datetime'].dt.time
data.to_parquet('./RELIANCE.parquet')
The error it produces is - ValueError: Can't infer object conversion type: 0
Can someone tell me how to fix this.
PS: Detailed error below-
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[1], line 15
12 data['Date'] = data['Datetime'].dt.date
13 data['Time'] = data['Datetime'].dt.time
---> 15 data.to_parquet('./RELIANCE.parquet')
File ~/python_venv/lib/python3.10/site-packages/pandas/util/_decorators.py:333, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
327 if len(args) > num_allow_args:
328 warnings.warn(
329 msg.format(arguments=_format_argument_list(allow_args)),
330 FutureWarning,
331 stacklevel=find_stack_level(),
332 )
--> 333 return func(*args, **kwargs)
File ~/python_venv/lib/python3.10/site-packages/pandas/core/frame.py:3113, in DataFrame.to_parquet(self, path, engine, compression, index, partition_cols, storage_options, **kwargs)
3032 """
3033 Write a DataFrame to the binary parquet format.
3034
(...)
3109 >>> content = f.read()
3110 """
3111 from pandas.io.parquet import to_parquet
-> 3113 return to_parquet(
3114 self,
3115 path,
3116 engine,
3117 compression=compression,
3118 index=index,
3119 partition_cols=partition_cols,
3120 storage_options=storage_options,
3121 **kwargs,
3122 )
File ~/python_venv/lib/python3.10/site-packages/pandas/io/parquet.py:480, in to_parquet(df, path, engine, compression, index, storage_options, partition_cols, filesystem, **kwargs)
476 impl = get_engine(engine)
478 path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
--> 480 impl.write(
481 df,
482 path_or_buf,
483 compression=compression,
484 index=index,
485 partition_cols=partition_cols,
486 storage_options=storage_options,
487 filesystem=filesystem,
488 **kwargs,
489 )
491 if path is None:
492 assert isinstance(path_or_buf, io.BytesIO)
File ~/python_venv/lib/python3.10/site-packages/pandas/io/parquet.py:349, in FastParquetImpl.write(self, df, path, compression, index, partition_cols, storage_options, filesystem, **kwargs)
344 raise ValueError(
345 "storage_options passed with file object or non-fsspec file path"
346 )
348 with catch_warnings(record=True):
--> 349 self.api.write(
350 path,
351 df,
352 compression=compression,
353 write_index=index,
354 partition_on=partition_cols,
355 **kwargs,
356 )
File ~/python_venv/lib/python3.10/site-packages/fastparquet/writer.py:1304, in write(filename, data, row_group_offsets, compression, file_scheme, open_with, mkdirs, has_nulls, write_index, partition_on, fixed_text, append, object_encoding, times, custom_metadata, stats)
1301 check_column_names(data.columns, partition_on, fixed_text,
1302 object_encoding, has_nulls)
1303 ignore = partition_on if file_scheme != 'simple' else []
-> 1304 fmd = make_metadata(data, has_nulls=has_nulls, ignore_columns=ignore,
1305 fixed_text=fixed_text,
1306 object_encoding=object_encoding,
1307 times=times, index_cols=index_cols,
1308 partition_cols=partition_on, cols_dtype=cols_dtype)
1309 if custom_metadata:
1310 kvm = fmd.key_value_metadata or []
File ~/python_venv/lib/python3.10/site-packages/fastparquet/writer.py:904, in make_metadata(data, has_nulls, ignore_columns, fixed_text, object_encoding, times, index_cols, partition_cols, cols_dtype)
902 se.name = column
903 else:
--> 904 se, type = find_type(data[column], fixed_text=fixed,
905 object_encoding=oencoding, times=times,
906 is_index=is_index)
907 col_has_nulls = has_nulls
908 if has_nulls is None:
File ~/python_venv/lib/python3.10/site-packages/fastparquet/writer.py:122, in find_type(data, fixed_text, object_encoding, times, is_index)
120 elif dtype == "O":
121 if object_encoding == 'infer':
--> 122 object_encoding = infer_object_encoding(data)
124 if object_encoding == 'utf8':
125 type, converted_type, width = (parquet_thrift.Type.BYTE_ARRAY,
126 parquet_thrift.ConvertedType.UTF8,
127 None)
File ~/python_venv/lib/python3.10/site-packages/fastparquet/writer.py:357, in infer_object_encoding(data)
355 s += 1
356 else:
--> 357 raise ValueError("Can't infer object conversion type: %s" % data)
358 if s > 10:
359 break
ValueError: Can't infer object conversion type: 0 2022-08-08
1 2022-08-08
2 2022-08-08
3 2022-08-08
4 2022-08-08
...
3398 2024-08-02
3399 2024-08-02
3400 2024-08-02
3401 2024-08-02
3402 2024-08-02
Name: Date, Length: 3403, dtype: object
Your code worked for me without any issues. I assume this issue is arising from the parquet engine that you are using to save the file as parquet( io.parquet.engine
, pyarrow
or fastparquet
).
Here are the versions of the libraries I have used:
Pandas version: 2.2.2
PyArrow version: 16.1.0
To make sure you are using pyarrow
or any desired engine make sure you include it like this:
data.to_parquet('./RELIANCE.parquet', engine='pyarrow')
Here's first two rows of the saved parquet file:
{"Datetime":1659930300000,"Open":2532.25,"High":2572.75,"Low":2531.39990234375,"Close":2569,"Adj Close":2569,"Volume":0,"Date":"2022-08-08T00:00:00.000Z","Time":33300000000}
{"Datetime":1659933900000,"Open":2568.60009765625,"High":2571,"Low":2562.10009765625,"Close":2567.300048828125,"Adj Close":2567.300048828125,"Volume":392815,"Date":"2022-08-08T00:00:00.000Z","Time":36900000000}
If you would not consider changing your parquet engine, you should investigate what data type conversions might go wrong. As an example you can try converting time
to string
.