I have already implemented generating a directory tree using recursion:
import os
import re
import hashlib
PIPE = "│"
ELBOW = "└──"
TEE = "├──"
PIPE_PREFIX = "│ "
SPACE_PREFIX = " "
root_dir = ""
tree = []
def extract_number(file_name):
match = re.search(r"\d+", file_name)
return int(match.group()) if match else float("inf")
def get_md5(file_path):
hasher = hashlib.md5()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(8192), b""):
hasher.update(chunk)
return hasher.hexdigest()
def tree_head():
tree.append(f"{root_dir}{os.sep}")
tree.append(PIPE)
def get_entries(directory):
with os.scandir(directory) as entries:
entries = list(entries)
entries = [entry for entry in entries if entry.name != ".DS_Store"]
entries = sorted(entries, key = lambda entry: (entry.is_file(), extract_number(entry.name)))
return entries
def tree_body(directory, prefix = ""):
entries = get_entries(directory)
entries_count = len(entries)
for index, entry in enumerate(entries):
connector = ELBOW if index == entries_count - 1 else TEE
if entry.is_dir():
add_directory(entry, index, entries_count, prefix, connector)
else:
add_file(entry, prefix, connector)
def add_directory(directory, index, entries_count, prefix, connector):
tree.append(f"{prefix}{connector} {directory.name}{os.sep}")
if index != entries_count - 1:
prefix += PIPE_PREFIX
else:
prefix += SPACE_PREFIX
tree_body(directory, prefix)
def add_file(file, prefix, connector):
md5_hash = get_md5(file)
tree.append(f"{prefix}{connector} {file.name} -> {md5_hash}")
def build_tree():
tree_head()
tree_body(root_dir)
return tree
def generate():
tree = build_tree()
for entry in tree:
print(entry)
generate()
This code generates a directory tree with file and directory names, as well as their MD5 hashes. Here's an example of the output that this script might produce, assuming the specified directory structure:
root_dir/
│
├── folder1/
│ ├── file1.txt -> d41d8cd98f00b204e9800998ecf8427e
│ └── file2.txt -> 098f6bcd4621d373cade4e832627b4f6
│
└── folder2/
├── file3.txt -> 6dcd4ce23d88e2ee9568ba546c007c63
└── file4.txt -> d41d8cd98f00b204e9800998ecf8427e
How can the tree_body()
function be converted from a recursive implementation to an iterative one?
I have tried many methods and found it difficult to print in the correct order.
I don't know what the correct approach for designing the algorithm for this problem looks like.
I finally implemented it, using a stack to simulate recursion. The principle is to perform a preorder traversal of an n-ary tree, which actually allows control over any previous stack position I want.
This is also a great exercise in converting recursion to iteration.
Additionally, using os.path
and os.sep
helps abstract the differences between operating systems.
import os
import re
import hashlib
PIPE = "│"
ELBOW = "└──"
TEE = "├──"
PIPE_PREFIX = "│ "
SPACE_PREFIX = " "
class DirectoryTree:
def __init__(self, root_dir):
self._generator = _TreeGenerator(root_dir)
def generate(self):
tree = self._generator.build_tree()
for entry in tree:
print(entry)
class _TreeGenerator:
def __init__(self, root_dir):
self._root_dir = root_dir
self._tree = []
def build_tree(self):
self._tree_head()
self._tree_body(self._root_dir)
return self._tree
def _extract_number(self, file_name):
match = re.search(r"\d+", file_name)
return int(match.group()) if match else float("inf")
def _get_md5(self, file_path):
hasher = hashlib.md5()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(8192), b""):
hasher.update(chunk)
return hasher.hexdigest()
def _get_entries(self, folder):
with os.scandir(folder) as entries:
if entries is None:
return None
entries = list(entries)
entries = [entry for entry in entries if entry.name != ".DS_Store"]
entries = sorted(entries, key = lambda entry: self._extract_number(entry.name))
return entries
def _tree_head(self):
self._tree.append(f"{os.path.basename(self._root_dir)}{os.sep}")
self._tree.append(PIPE)
def _tree_body(self, folder):
folder, prefix, offset = folder, "", 0
stack = [[folder, prefix, offset]]
has_folder = False
while stack:
folder, prefix, offset = stack[-1]
entries = self._get_entries(folder)
if entries is None:
stack = stack[:-1]
break
has_folder = False
inner_folder = None
for index, entry in enumerate(entries[offset:]):
connector = ELBOW if i == len(entries[offset:]) - 1 else TEE
if entry.is_dir():
self._tree.append(f"{prefix}{connector} {entry.name}{os.sep}")
if index != len(entries[offset:]) - 1:
prefix += PIPE_PREFIX
else:
prefix += SPACE_PREFIX
has_folder = True
inner_folder = entry
stack[-1][2] = offset + index + 1
break
else:
md5_hash = self._get_md5(entry)
self._tree.append(f"{prefix}{connector} {entry.name} -> {md5_hash}")
if has_folder:
stack.append([inner_folder, prefix, 0])
else:
stack = stack[:-1]
if self._tree[-1] != prefix.rstrip():
self._tree.append(prefix.rstrip())
tree = DirectoryTree("")
tree.generate()
print:
XX/
│
├── XXX/
│ ├── XXX 1 -> 887100ba4373eebfed52ee1f973161de
│ ├── XXX 2 -> 22fbcce00751c1c51fbd94e83e124fbb
│ ├── XXXX/
│ │ ├── XXXXX.pdf -> f45621b992b922009f4db4422abd6fe4
│ │ └── XXXXX.pdf -> d9cab932175d1e69709291000fa8ffa3
│ │
│ └── XXXX/
│
└── XXX.txt
If you want to add a -L
feature, change the data structure to [[folder, prefix, offset, level]]
, then:
folder, prefix, offset, level = folder, "", 0, 1
stack = [[folder, prefix, offset, level]]
...
while stack:
folder, prefix, offset, level = stack[-1]
...
if entry.is_dir():
self._tree.append(f"{prefix}{connector} {entry.name}{os.sep}")
if level == x:
continue
else:
if index != len(entries[offset:]) - 1:
prefix += PIPE_PREFIX
...
if has_folder:
stack.append([inner_folder, prefix, 0, level])
else:
...
Where x is the desired level.