pythonloopsrecursiontree

How to generate a directory tree iteratively, implemented in Python?


I have already implemented generating a directory tree using recursion:

import os
import re
import hashlib

PIPE = "│"
ELBOW = "└──"
TEE = "├──"
PIPE_PREFIX = "│   "
SPACE_PREFIX = "    "

root_dir = ""
tree = []

def extract_number(file_name):
    match = re.search(r"\d+", file_name)
    return int(match.group()) if match else float("inf")

def get_md5(file_path):
    hasher = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

def tree_head():
    tree.append(f"{root_dir}{os.sep}")
    tree.append(PIPE)

def get_entries(directory):
    with os.scandir(directory) as entries:
        entries = list(entries)

        entries = [entry for entry in entries if entry.name != ".DS_Store"]

        entries = sorted(entries, key = lambda entry: (entry.is_file(), extract_number(entry.name)))

    return entries

def tree_body(directory, prefix = ""):
    entries = get_entries(directory)
    entries_count = len(entries)
    for index, entry in enumerate(entries):
        connector = ELBOW if index == entries_count - 1 else TEE
        if entry.is_dir():
            add_directory(entry, index, entries_count, prefix, connector)
        else:
            add_file(entry, prefix, connector)

def add_directory(directory, index, entries_count, prefix, connector):
    tree.append(f"{prefix}{connector} {directory.name}{os.sep}")

    if index != entries_count - 1:
        prefix += PIPE_PREFIX
    else:
        prefix += SPACE_PREFIX

    tree_body(directory, prefix)

def add_file(file, prefix, connector):
    md5_hash = get_md5(file)
    tree.append(f"{prefix}{connector} {file.name} -> {md5_hash}")

def build_tree():
    tree_head()
    tree_body(root_dir)
    return tree

def generate():
    tree = build_tree()

    for entry in tree:
        print(entry)

generate()

This code generates a directory tree with file and directory names, as well as their MD5 hashes. Here's an example of the output that this script might produce, assuming the specified directory structure:

root_dir/ 
│
├── folder1/
│   ├── file1.txt -> d41d8cd98f00b204e9800998ecf8427e
│   └── file2.txt -> 098f6bcd4621d373cade4e832627b4f6
│
└── folder2/
    ├── file3.txt -> 6dcd4ce23d88e2ee9568ba546c007c63
    └── file4.txt -> d41d8cd98f00b204e9800998ecf8427e

How can the tree_body() function be converted from a recursive implementation to an iterative one? I have tried many methods and found it difficult to print in the correct order. I don't know what the correct approach for designing the algorithm for this problem looks like.


Solution

  • I finally implemented it, using a stack to simulate recursion. The principle is to perform a preorder traversal of an n-ary tree, which actually allows control over any previous stack position I want. This is also a great exercise in converting recursion to iteration. Additionally, using os.path and os.sep helps abstract the differences between operating systems.

    import os
    import re
    import hashlib
    
    PIPE = "│"
    ELBOW = "└──"
    TEE = "├──"
    PIPE_PREFIX = "│   "
    SPACE_PREFIX = "    "
    
    class DirectoryTree:
        def __init__(self, root_dir):
            self._generator = _TreeGenerator(root_dir)
    
        def generate(self):
            tree = self._generator.build_tree()
            for entry in tree:
                print(entry)
    
    class _TreeGenerator:
        def __init__(self, root_dir):
            self._root_dir = root_dir
            self._tree = []
    
        def build_tree(self):
            self._tree_head()
            self._tree_body(self._root_dir)
            return self._tree
    
        def _extract_number(self, file_name):
            match = re.search(r"\d+", file_name)
            return int(match.group()) if match else float("inf")
    
        def _get_md5(self, file_path):
            hasher = hashlib.md5()
            with open(file_path, "rb") as f:
                for chunk in iter(lambda: f.read(8192), b""):
                    hasher.update(chunk)
            return hasher.hexdigest()
    
        def _get_entries(self, folder):
            with os.scandir(folder) as entries:
                if entries is None:
                    return None
                entries = list(entries)
                entries = [entry for entry in entries if entry.name != ".DS_Store"]
                entries = sorted(entries, key = lambda entry: self._extract_number(entry.name))
            return entries
    
        def _tree_head(self):
            self._tree.append(f"{os.path.basename(self._root_dir)}{os.sep}")
            self._tree.append(PIPE)
    
        def _tree_body(self, folder):
            folder, prefix, offset = folder, "", 0
            stack = [[folder, prefix, offset]]
            has_folder = False
    
            while stack:
                folder, prefix, offset = stack[-1]
                entries = self._get_entries(folder)
                if entries is None:
                    stack = stack[:-1]
                    break
                has_folder = False
                inner_folder = None
    
                for index, entry in enumerate(entries[offset:]):
                    connector = ELBOW if i == len(entries[offset:]) - 1 else TEE
                    if entry.is_dir():
                        self._tree.append(f"{prefix}{connector} {entry.name}{os.sep}")
                        if index != len(entries[offset:]) - 1:
                            prefix += PIPE_PREFIX
                        else:
                            prefix += SPACE_PREFIX
                        has_folder = True
                        inner_folder = entry
                        stack[-1][2] = offset + index + 1
                        break
                    else:
                        md5_hash = self._get_md5(entry)
                        self._tree.append(f"{prefix}{connector} {entry.name} -> {md5_hash}")
    
                if has_folder:
                    stack.append([inner_folder, prefix, 0])
                else:
                    stack = stack[:-1]
                    if self._tree[-1] != prefix.rstrip():
                        self._tree.append(prefix.rstrip())
    
    tree = DirectoryTree("")
    tree.generate()
    

    print:

    XX/
    │
    ├── XXX/
    │   ├── XXX 1 -> 887100ba4373eebfed52ee1f973161de
    │   ├── XXX 2 -> 22fbcce00751c1c51fbd94e83e124fbb
    │   ├── XXXX/
    │   │   ├── XXXXX.pdf -> f45621b992b922009f4db4422abd6fe4
    │   │   └── XXXXX.pdf -> d9cab932175d1e69709291000fa8ffa3
    │   │
    │   └── XXXX/
    │
    └── XXX.txt
    

    If you want to add a -L feature, change the data structure to [[folder, prefix, offset, level]], then:

    folder, prefix, offset, level = folder, "", 0, 1
    stack = [[folder, prefix, offset, level]]
    ...
    while stack:
        folder, prefix, offset, level = stack[-1]
        ...
        if entry.is_dir():
            self._tree.append(f"{prefix}{connector} {entry.name}{os.sep}")
            if level == x:
                continue
            else:
                if index != len(entries[offset:]) - 1:
                    prefix += PIPE_PREFIX
    ...
    if has_folder:
       stack.append([inner_folder, prefix, 0, level])
    else:
    ...
    

    Where x is the desired level.