I'm new here so I hope that I will put all needed information
After replacing wrong delimiter (replace '\n\t' by '\t' when necessary) in the csv file and define data type for each column, I'm trying to read it using the \t delimiter but encounter 2 errors.
1)
parse error on line 1715281 : expected 209 fields, saw 239 --> For that, I try to check issue by using getline and then split the line with delimiter='\t'. I found 'quantity of columns' = len(split(getline)) = 209
error : Unable to parse string "URL link". No issue with all the previous lines before this one
My questions are:
Please find hereunder my codes, comments and full error messages
import os.path
import pandas as pd
import numpy as np
import linecache
# data file available under: https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv.gz
# it's .csv but delimiter is TAB
# generate the path to the file
data_local_path = os.getcwd() + '\\'
csv_filename = 'en.openfoodfacts.org.products.csv'
csv_local_path = data_local_path + csv_filename
# generate the path to create the file corrected
clean_filename = 'en.off.corrected.csv'
clean_local_path = data_local_path + clean_filename
# check if the file is already existing, if not then proceed to wrong delimiter replacement
if not os.path.isfile(clean_local_path):
with open(csv_local_path, 'r',encoding='utf-8') as csv_file, open(clean_local_path, 'a', encoding='utf-8') as clean_file:
for row in csv_file:
clean_file.write(row.replace('\n\t', '\t'))
# columns type are defined under : https://static.openfoodfacts.org/data/data-fields.txt
column_names = pd.read_csv(clean_local_path, sep='\t', encoding = 'utf-8', nrows=0).columns.values
column_types = {col: 'Int64' for (col) in column_names if col.endswith (('_t', '_n'))}
column_types |= {col: float for (col) in column_names if col.endswith (('_100g', '_serving'))}
column_types |= {col: str for (col) in column_names if not col.endswith (('_t', '_n', '_100g', '_serving', '_tags'))}
print ("number of columns detected: ",len(column_names))
# output is "number of columns detected: 209"
print (column_names)
# Load the data
data = pd.read_csv(clean_local_path, sep='\t', encoding='utf_8',
dtype=column_types, parse_dates=[col for (col) in column_names if col.endswith('_datetime')],
on_bad_lines='warn'
)
# display info
data.info()
Message error at the line " data = pd.read_csv..." is:
...\AppData\Local\Temp\ipykernel_2824\611804071.py:2: ParserWarning: Skipping line 1715281: expected 209 fields, saw 239
data = pd.read_csv(clean_local_path, sep='\t', encoding='utf_8',
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File lib.pyx:2391, in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string "https://images.openfoodfacts.org/images/products/356/007/117/1049/front_fr.3.200.jpg"
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Cell In[6], line 2
1 # Load the data
----> 2 data = pd.read_csv(clean_local_path, sep='\t', encoding='utf_8',
3 dtype=column_types, parse_dates=[col for (col) in column_names if col.endswith('_datetime')],
4 on_bad_lines='warn'
5 )
6 # display info
7 data.info()
File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\io\parsers\readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...) 1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\io\parsers\readers.py:626, in _read(filepath_or_buffer, kwds)
623 return parser
625 with parser:
--> 626 return parser.read(nrows)
File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\io\parsers\readers.py:1923, in TextFileReader.read(self, nrows)
1916 nrows = validate_integer("nrows", nrows)
1917 try:
1918 # error: "ParserBase" has no attribute "read"
1919 (
1920 index,
1921 columns,
1922 col_dict,
-> 1923 ) = self._engine.read( # type: ignore[attr-defined]
1924 nrows
1925 )
1926 except Exception:
1927 self.close()
File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\io\parsers\c_parser_wrapper.py:234, in CParserWrapper.read(self, nrows)
232 try:
233 if self.low_memory:
--> 234 chunks = self._reader.read_low_memory(nrows)
235 # destructive to chunks
236 data = _concatenate_chunks(chunks)
File parsers.pyx:838, in pandas._libs.parsers.TextReader.read_low_memory()
File parsers.pyx:921, in pandas._libs.parsers.TextReader._read_rows()
File parsers.pyx:1066, in pandas._libs.parsers.TextReader._convert_column_data()
File parsers.pyx:1105, in pandas._libs.parsers.TextReader._convert_tokens()
File parsers.pyx:1211, in pandas._libs.parsers.TextReader._convert_with_dtype()
File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\core\arrays\numeric.py:283, in NumericArray._from_sequence_of_strings(cls, strings, dtype, copy)
277 @classmethod
278 def _from_sequence_of_strings(
279 cls, strings, *, dtype: Dtype | None = None, copy: bool = False
280 ) -> Self:
281 from pandas.core.tools.numeric import to_numeric
--> 283 scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
284 return cls._from_sequence(scalars, dtype=dtype, copy=copy)
File ~\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\core\tools\numeric.py:232, in to_numeric(arg, errors, downcast, dtype_backend)
230 coerce_numeric = errors not in ("ignore", "raise")
231 try:
--> 232 values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload]
233 values,
234 set(),
235 coerce_numeric=coerce_numeric,
236 convert_to_masked_nullable=dtype_backend is not lib.no_default
237 or isinstance(values_dtype, StringDtype)
238 and not values_dtype.storage == "pyarrow_numpy",
239 )
240 except (ValueError, TypeError):
241 if errors == "raise":
File lib.pyx:2433, in pandas._libs.lib.maybe_convert_numeric()
ValueError: Unable to parse string "https://images.openfoodfacts.org/images/products/356/007/117/1049/front_fr.3.200.jpg" at position 1963
'Getline' and 'split' used to check ParserWarning concerning line 1715281
#get the line where the first warning had occured
line = linecache.getline(csv_local_path,1715281)
print (line)
# Split the string using tab delimiter
split_list = line.split('\t')
# Output the result
print("concerning Parser Warning: Skipping line 1715281: expected 209 fields, saw 239")
print("number of data detected in the raw 1715281: ",len(split_list))
print ("number of columns detected in CSV: ",len(column_names))
# Output is:
# concerning Parser Warning: Skipping line 1715281: expected 209 fields, saw 239
# number of data detected in the raw 1715281: 209
# number of columns detected in CSV: 209
I try "on_bad_lines='skip'" but without success
The problem appears to be that the file uses the "
character not to quote items but to indicate a measurement in inches. For example, I found this in one cell of the file:
fluted shell round sweet 2.5"
The fix is straightforward: add quoting=csv.QUOTE_NONE
to your call to pd.read_csv
. (You'll need to add import csv
as well.)
With this fix in place, I would expect pandas to be able to read in the CSV file without any of the 'corrections' you have applied. (My machine doesn't have enough RAM to read the whole file in, but if I split the file into chunks of 100,000 rows, it reads each chunk in fine, provided I tell pandas to ignore quotes.)