I'm trying to write a csv parser to work with some very complex data and I'm having trouble making the csv-parse module ignore the delimiters inside quotes. I don't have control over how the data comes except maybe for the double quotes themselves, so I can't really put scape characters to solve it. The parser works by first parsing lines with '|', then '@', then ';'.
this is an example of the data I'm trying to process:
recognitionDate | training
"2021-08-17" | "01|0009";"random string";"2"@"01|0009";"random string 2";"2"
Currently i'ts breaking on the training column due the '|' at '01|0009' after the '@'.
This is the code for the parser:
const getData = (data, delimiter) => {
return new Promise((resolve, reject) => {
const rows = []
const rtrim = delimiter === '|' || delimiter === '@'? false : true
const parser = parseCsv(
data,
{
delimiter,
relax: true,
ltrim: true,
rtrim,
}
)
parser.on('readable', () => {
for(let data = parser.read(); data; data = parser.read()){
rows.push(data)
}
})
parser.on('end', () => {
resolve(
delimiter === '|' ?
rows :
rows.flat(1)
)
})
parser.on('error', err => {console.log(err)})
})
}
this is how I'm using it:
const buildRisk = async (row, header) => {
const riskObj = {}
// iterate through each element of each row
for(let i = 0; i < row.length; i++){
if(!row[i]) continue
let fields = await getData(row[i], '@')
const mainIndex = header[i].replace(/\s+/g, ' ').trim() + 's'
riskObj[mainIndex] = []
for(let j = 0; j < fields.length; j++){
const objData = await getData(fields[j], ';')
if(objData.length > 1){
riskObj[mainIndex].push({})
for(let k = 0; k < objData.length; k++){
riskObj[mainIndex][j]['index' + k] = objData[k].replace(/\s+/g, ' ').trim()
}
}
else{
riskObj[mainIndex] = objData[0].replace(/\s+/g, ' ').trim()
}
}
}
return riskObj
}
const main = async () => {
// first get all rows
const rows = await getData(data, '|')
if(!rows.length) return
// get header
const header = rows.shift()
console.log(rows)
// iterate through rows
for(const row of rows){
const risk = await buildRisk(row, header)
console.log(risk)
}
}
and this is the error from the module:
CsvError: Invalid Record Length: expect 2, got 3 on line 2
at Parser.__onRecord (/home/luders/teste/node_modules/csv-parse/lib/index.js:773:9)
at Parser.__parse (/home/luders/teste/node_modules/csv-parse/lib/index.js:662:38)
at Parser._flush (/home/luders/teste/node_modules/csv-parse/lib/index.js:483:22)
at Parser.prefinish (internal/streams/transform.js:147:10)
at Parser.emit (events.js:375:28)
at prefinish (internal/streams/writable.js:630:14)
at finishMaybe (internal/streams/writable.js:638:5)
at Parser.Writable.end (internal/streams/writable.js:582:5)
at Immediate._onImmediate (/home/luders/teste/node_modules/csv-parse/lib/index.js:1186:16)
at processImmediate (internal/timers.js:464:21) {
code: 'CSV_INCONSISTENT_RECORD_LENGTH',
comment_lines: 0,
empty_lines: 0,
invalid_field_length: 0,
lines: 2,
records: 1,
columns: false,
error: undefined,
header: false,
index: 3,
column: 3,
quoting: false,
record: [
'"2021-08-17" ',
'"01|0009";"random string";"2"@"01',
'0009";"random string 2";"2"'
]
}
As you can see, It's parsing three columns when it should be parsing 2.
How could I make it ignore whats inside the double quotes?
I believe it's not a valid way of escaping csv. A whole field needs to be enclosed in quotes and the quotes inside it should be also escaped as double quotes. You need to do some pre-processing here. The valid format of csv would be:
recognitionDate | training
"2021-08-17" | " ""01|0009"";""random string"";""2""@""01|0009"";""random string 2"";""2"" "