I have a .ndjson file with millions of rows. Each row has a field html which contains an HTML string. I would like to write all of these HTML strings into a .txt file, with each HTML string on its own line. I tried using polars.LazyFrame.sink_csv for its speed:
import polars as pl
import requests
from pathlib import Path
url = "https://raw.githubusercontent.com/leanhdung1994/files/main/processedStep1_enwiktionary_namespace_0_43.ndjson"
workingDir = r"E:\Personal Projects\tmp\tarFiles"
outNdjson = Path(workingDir, "wiktionary.ndjson")
outTxt = Path(workingDir, "wiktionary.txt")

# Download the sample NDJSON file; raise_for_status() raises on an HTTP error
# status instead of silently saving an error page.
resp = requests.get(url)
resp.raise_for_status()

# Save the raw response bytes to disk so Polars can scan the file.
with open(outNdjson, "wb") as f:
    f.write(resp.content)

# Read with Polars: scan_ndjson builds a lazy query, collect() materialises it.
df = pl.scan_ndjson(outNdjson)
print(df.select("html").collect())

# Write only the html column to the text file, without a header row.
# NOTE: the CSV writer's default quoting is still in effect for this call.
df.select("html").sink_csv(outTxt, include_header=False)
The column html is
shape: (23, 1)
┌─────────────────────────────────┐
│ html │
│ --- │
│ str │
╞═════════════════════════════════╡
│ playabilities <link href="enWi… │
│ plecopterans <link href="enWik… │
│ pleiotropies <link href="enWik… │
│ pleochroisms <link href="enWik… │
│ plicometry <link href="enWikti… │
│ … │
│ pontil marks <link href="enWik… │
│ poringly <link href="enWiktion… │
│ pornaholics <link href="enWikt… │
│ geronimo <link href="enWiktion… │
│ uncage <link href="enWiktionar… │
└─────────────────────────────────┘
But the resulting .txt file contains escaped quotation marks:
"playabilities <link href=""enWiktionary.css"" rel=""stylesheet"" type=""text/css""/> <script src=""enWiktionary.js"" type=""text/javascript""></script>
I tried passing the option quote_char='', but it raises an error:
TypeError: ord() expected string of length 1, but NoneType found
Could you explain how to do so?
The problem is that the data contains both double quotes " and embedded trailing newlines.
import polars as pl
# Load the sample file eagerly so a single value can be inspected.
url = "https://raw.githubusercontent.com/leanhdung1994/files/main/processedStep1_enwiktionary_namespace_0_43.ndjson"
df = pl.read_ndjson(url)
# Show the last 10 characters of the first row's html string.
df.head(1).select("html").item()[-10:]
# 'html> </>\n'
#           ^^  the value ends with a newline character
You want to strip the trailing newlines, and disable quoting in the CSV writer.
# Expression that removes trailing "\n" characters from each html string.
html_no_trailing_newline = pl.col("html").str.strip_chars_end("\n")

# Build the lazy query, then write it out with CSV quoting turned off so the
# double quotes inside the HTML are emitted verbatim.
stripped = pl.scan_ndjson(url).select(html_no_trailing_newline)
stripped.sink_csv(
    outTxt,
    include_header=False,
    line_terminator="\r\n",
    quote_style="never",
)
import io
import polars as pl
# Minimal example: one string containing double quotes and a trailing newline.
df = pl.DataFrame({"x": ['"foo"\n']})
# Write into an in-memory buffer so the exact bytes can be inspected.
f = io.BytesIO()
# No quote_style given, so the writer's default quoting applies.
df.write_csv(f, include_header=False, line_terminator="\r\n")
f.getvalue()
# b'"""foo""\n"\r\n'
# The field is wrapped in quotes and each embedded " is doubled.
quote_style="never" is how to disable quoting.
# Fresh buffer for the second attempt.
f = io.BytesIO()
# quote_style="never" writes the value verbatim: no wrapping, no doubling.
df.write_csv(f, include_header=False, line_terminator="\r\n", quote_style="never")
f.getvalue()
# b'"foo"\n\r\n'
Note that the trailing \n from the data is still there, so you end up with mixed line endings (\n\r\n).
.strip_chars_end("\n") can be used to remove it before sinking.
# Fresh buffer for the final version.
f = io.BytesIO()
# Strip the trailing "\n" from the data first, then write with quoting disabled.
(df.select(pl.col("x").str.strip_chars_end("\n"))
.write_csv(f, include_header=False, line_terminator="\r\n", quote_style="never"))
f.getvalue()
# b'"foo"\r\n'
# Only the configured line terminator remains at the end of the row.