This is my code. I am trying to write code that scans a folder for a bunch of text files, parses the data in each one, and writes the output to CSV files. Could you please give me some hints on how to do this? I am really struggling.
The first step in my code is to detect where the data is in the txt files. I found that all the data starts with 'Read', so I find the line on which the data starts in each file. After that, I am stuck on how to export the data to CSV files.
import os
import argparse
import csv
from typing import List
def validate_directory(path: str) -> str:
    """Return *path* unchanged if it names an existing directory.

    Args:
        path: Filesystem path to check.

    Returns:
        The same ``path`` value that was passed in.

    Raises:
        NotADirectoryError: If ``path`` is not an existing directory. The
            offending path is the exception's only argument.
    """
    if not os.path.isdir(path):
        raise NotADirectoryError(path)
    return path
def get_data_from_file(file) -> List[str]:
    """Report every line of *file* that starts a ``Read`` data section.

    A line counts as a match when, after stripping surrounding whitespace,
    it starts with ``"Read "`` but not with ``"Read ("``, and it does not
    contain any of the phrases in the ignore list.

    Each match is also echoed to stdout (location, then the raw line).

    Args:
        file: Path of the text file to scan. It is decoded as latin1.

    Returns:
        One message per match, of the form
        ``Found "Read" at line <n> in <file>`` with 1-based line numbers.
        An empty list is returned if the file cannot be opened or read.
    """
    ignore_list = ("Read Segment", "Read Disk", "Read a line", "Read in")

    # Opening the file is inside the try block so that a missing or
    # unreadable file is reported and skipped instead of aborting the run.
    try:
        with open(file, "r", encoding="latin1") as f:
            lines = f.readlines()
    except OSError as e:
        print(f"Unable to process {file}: {e}")
        return []

    data = []
    for line_number, line in enumerate(lines, start=1):
        if any(variation in line for variation in ignore_list):
            continue
        stripped = line.strip()
        # TODO: fix this with better regex
        if stripped.startswith("Read ") and not stripped.startswith("Read ("):
            data.append(f'Found "Read" at line {line_number} in {file}')
            print(f'Found "Read" at {file}:{line_number}')
            print(line)
    return data
def list_read_data(directory_path: str) -> List[str]:
    """Collect ``Read`` matches from every ``.txt`` file under a directory.

    The tree rooted at *directory_path* is walked recursively and each file
    whose name ends in ``.txt`` is passed to ``get_data_from_file``.

    Args:
        directory_path: Root directory to search.

    Returns:
        The concatenated results of ``get_data_from_file`` for all matching
        files, in sorted directory and file-name order.
    """
    total_data = []
    for root, dirs, files in os.walk(directory_path):
        # os.walk yields entries in filesystem order; sorting the directory
        # list in place (which also steers the descent) and the file names
        # makes the output order reproducible between runs and machines.
        dirs.sort()
        for file_name in sorted(files):
            if file_name.endswith(".txt"):
                total_data.extend(
                    get_data_from_file(os.path.join(root, file_name))
                )
    return total_data
def write_results_to_csv(output_file: str, data: List[str]):
    """Write *data* to a single-column CSV file with a ``Results`` header.

    Args:
        output_file: Path of the CSV file to create or overwrite. It is
            written as UTF-8.
        data: Strings to write, one per row, below the header row.
    """
    # newline="" lets the csv module control line endings itself.
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Results"])
        writer.writerows([line] for line in data)
def main(directory_path: str, output_file: str):
    """Scan *directory_path* for ``Read`` lines and save them as CSV.

    Args:
        directory_path: Root directory passed to ``list_read_data``.
        output_file: CSV path passed to ``write_results_to_csv``.
    """
    data = list_read_data(directory_path)
    write_results_to_csv(output_file, data)
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, then run the scan.
    parser = argparse.ArgumentParser(
        description="Process the 2020Model folder for input data."
    )
    # --directory is required: without it args.directory would be None and
    # os.path.abspath(None) below would fail with a TypeError.
    parser.add_argument(
        "--directory",
        type=validate_directory,
        required=True,
        help="folder to be processed",
    )
    parser.add_argument(
        "--output",
        type=str,
        help="Output file name (e.g., outputfile.csv)",
        default="outputfile.csv",
    )
    args = parser.parse_args()
    main(os.path.abspath(args.directory), args.output)
Below is my ideal csv output data:
1985 | 1986 | 1986 | 1987 | 1988 | 1989 | 1990 | 1991 | 1992 | 1993 | 1994 |
---|---|---|---|---|---|---|---|---|---|---|
37839 | 36962 | 37856 | 41971 | 40838 | 44640.87 | 42826.34 | 44883.03 | 43077.59 | 45006.49 | 46789 |
Could you please give me some hints on how to produce this output?
Below is a sample txt file:
Select Year(2007-2025)
Read TotPkSav
/2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
00 27 53 78 108 133 151 161 169 177 186 195 205 216 229 242 257 273 288
If all your files look like those 4 lines, then I'd recommend just turning the file into a list of lines up front, rather than trying to step/iterate over the lines. I also recommend just using glob with recursive=True instead of trying to walk the tree.
Because it reads the file inside the for-loop, any file with bad properties can be skipped by just `continue`-ing to the next file in the loop:
# Collect the year header and the data row from every 4-line "Read" file
# found under the current directory, then write them all to output.csv.
import csv
import glob

all_rows: list[list[str]] = []

for fname in glob.glob("**/*.txt", recursive=True):
    with open(fname, encoding="iso-8859-1") as f:
        print(f"reading {fname}")
        lines = [x.strip() for x in f]

    # Only files made of exactly four lines are processed.
    if len(lines) != 4:
        print(f'skipping {fname} with too few lines"')
        continue

    # The second line must be a "Read <name>" line, not "Read (<name>)".
    line2 = lines[1]
    if line2[:4] != "Read" or line2[:6] == "Read (":
        print(f'skipping {fname} with line2 = "{line2}"')
        continue

    line3, line4 = lines[2:4]
    # startswith() is safe on an empty line, unlike indexing line3[0].
    if line3.startswith("/"):
        line3 = line3[1:]

    # split() with no argument drops empty fields between repeated
    # whitespace, so no explicit filtering is needed.
    header = line3.split()
    data = line4.split()
    all_rows.append(header)
    all_rows.append(data)

with open("output.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Result"])
    writer.writerows(all_rows)
I mocked up a few more files and spread them throughout my tree:
- .
- a
input3.txt
- b
foo.txt
input1.txt
input2.txt
main.py
When I run that program from the root of that tree, I get:
reading input1.txt
reading input2.txt
skipping input2.txt with line2 = "Read (TotPkSav)"
reading a/input3.txt
reading b/foo.txt
skipping b/foo.txt with too few lines"
and output.csv looks like:
| Result |
|--------|
| 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | 2023 | 2024 | 2025 |
| 00 | 27 | 53 | 78 | 108 | 133 | 151 | 161 | 169 | 177 | 186 | 195 | 205 | 216 | 229 | 242 | 257 | 273 | 288 |
| 2099 | 2098 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | 2023 | 2024 | 2025 |
| 00 | 27 | 53 | 78 | 108 | 133 | 151 | 161 | 169 | 177 | 186 | 195 | 205 | 216 | 229 | 242 | 257 | 273 | 288 |