Tags: enums, unions, recursive-datastructures, zig

I can't pinpoint why memory gets corrupted while working with a recursive tagged union in Zig


I am working on a small, feature-incomplete Lisp interpreter to learn a bit of Zig. My inspiration is https://norvig.com/lispy.html; this might not be the most idiomatic way to implement it in C/Zig, but it is quite small and simple, which is what I am looking for. I am now at the stage of representing the AST via a tagged union. However, I can't understand why the code below leaves me with corrupted/overwritten memory.

const std = @import("std");
// const test_string = "(begin (define r 10) (* pi (* r r)))";
const shorter_test_string = "(begin (define r 10))";
const Allocator = std.mem.Allocator;

const Tag = enum { String, Number, Bool, Symbol };
const Type = union(Tag) { String: std.ArrayList(u8), Number: i64, Bool: bool, Symbol: []const u8 };
const TokenType = enum { Expr, Single };
const Token = union(TokenType) {
    Expr: *std.ArrayList(*Token),
    Single: *Type,
};

pub fn finalizeWord(string_collector: *std.ArrayList(u8), outer_list: *std.ArrayList(std.ArrayList(u8))) !void {
    if (!std.mem.eql(u8, string_collector.items, "")) {
        try outer_list.append(try string_collector.clone());
        string_collector.clearRetainingCapacity();
    }
}

// TODO: I commented out all the deallocations for debugging purposes, will add them later
pub fn tokenize(alloca: *Allocator, parsed_input: *std.ArrayList(std.ArrayList(u8))) !*Token {
    var token: std.ArrayList(u8) = parsed_input.orderedRemove(0);
    if (std.mem.eql(u8, token.items, "(")) {
        std.debug.print("Open bracket encountered \n", .{});
        // defer token.deinit();
        var list = std.ArrayList(*Token).init(alloca.*);
        var local_token = try moveToHeap(alloca, Token{ .Expr = &list });
        while (!std.mem.eql(u8, parsed_input.items[0].items, ")")) {
            std.debug.print("While loop iteration \n", .{});
            var item_to_add = try tokenize(alloca, parsed_input);
            try list.append(item_to_add);
        }
        _ = parsed_input.orderedRemove(0);
        // defer closing_bracket.deinit();
        return local_token;
    } else if (std.mem.eql(u8, token.items, ")")) {
        // TODO: This is definitely not unreachable but a Syntax Error
        unreachable;
    } else {
        var item = try moveToHeap(alloca, Token{ .Single = try atomize(alloca, token) });
        std.debug.print("Single item encountered: {s} \n", .{item.*.Single.*.String.items});
        return item;
    }
}

pub fn atomize(alloca: *Allocator, item: std.ArrayList(u8)) !*Type {
    return try moveToHeap(alloca, Type{ .String = try item.clone() });
}

pub fn parse(alloca: *Allocator, input_string: []const u8) !std.ArrayList(std.ArrayList(u8)) {
    var outer_list = std.ArrayList(std.ArrayList(u8)).init(alloca.*);
    var string_collector = std.ArrayList(u8).init(alloca.*);
    defer string_collector.deinit();
    for (input_string) |character| {
        if (character == '(') {
            try finalizeWord(&string_collector, &outer_list);
            var opening_bracket = std.ArrayList(u8).init(alloca.*);
            try opening_bracket.append('(');
            try outer_list.append(opening_bracket);

            continue;
        } else if (character == ')') {
            try finalizeWord(&string_collector, &outer_list);
            var closing_bracket = std.ArrayList(u8).init(alloca.*);
            try closing_bracket.append(')');
            try outer_list.append(closing_bracket);

            continue;
        } else if (character == ' ') {
            try finalizeWord(&string_collector, &outer_list);
        } else {
            try string_collector.append(character);
        }
    }
    return outer_list;
}

pub fn moveToHeap(allocator: *Allocator, value: anytype) !*@TypeOf(value) {
    const T = @TypeOf(value);
    std.debug.assert(@typeInfo(T) != .Pointer);
    const ptr = try allocator.create(T);
    ptr.* = value;
    return ptr;
}

pub fn print_tokens(token: *Token) void {
    switch (token.*) {
        .Expr => |expr| {
            // TODO: deallocations should happen in separate functions
            // defer expr.deinit();
            std.debug.print("Processing Expression \n", .{});
            for (expr.items) |t| {
                print_tokens(t);
            }
        },
        .Single => |item| {
            // TODO: deallocations should happen in separate functions
            // defer item.*.String.deinit();
            std.debug.print("Processing Token: {s}\n", .{item.*.String.items});
        },
    }
}

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer {
        _ = gpa.deinit();
    }
    var allocator = gpa.allocator();
    var parsed_string = try parse(&allocator, shorter_test_string);
    defer parsed_string.deinit();
    defer {
        for (parsed_string.items) |item| {
            item.deinit();
        }
    }
    var tokens = try tokenize(&allocator, &parsed_string);
    print_tokens(tokens); // SEGFAULT happens here
}

The particular parts I am struggling with are the functions tokenize and print_tokens. If you run the code in main, you get a SEGFAULT while printing the tokens, which seems to occur because once the tokenize function exits, the memory for some of the nested Expr values gets corrupted/overwritten. I have a gut feeling that this form of representation

const Tag = enum { String, Number, Bool, Symbol };
const Type = union(Tag) { String: std.ArrayList(u8), Number: i64, Bool: bool, Symbol: []const u8 };
const TokenType = enum { Expr, Single };
const Token = union(TokenType) {
    Expr: *std.ArrayList(*Token),
    Single: *Type,
};

might have some inherent design problems, but at this point I am wondering what the actual cause of the memory overwriting might be, since everything is manually placed on the heap.

I tried debugging with LLDB, and it seems like even adding a print or making a heap allocation here

_ = parsed_input.orderedRemove(0);
std.debug.print("Sample print", .{});
// defer closing_bracket.deinit();
return local_token;

overwrites the inner Expr in local_token.*.Expr.*.items[1]. As I mentioned previously, I am now more interested in why the memory problems occur than in the final Lisp interpreter implementation :)

EDIT:

It seems like Zig's type system tricked me into thinking it was doing more magic than it actually does. std.ArrayList is a plain struct whose items buffer is allocated on the heap, and the problem was that I stored a pointer to the stack-allocated local struct list inside the heap-allocated union local_token.
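
To make that concrete, here is a minimal standalone sketch (separate from the interpreter code) showing that the ArrayList header is an ordinary value in the current stack frame, while only the buffer behind items comes from the allocator:

const std = @import("std");

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer {
        _ = gpa.deinit();
    }

    // `list` itself is a plain struct value living in main's stack frame...
    var list = std.ArrayList(u8).init(gpa.allocator());
    defer list.deinit();
    try list.append('x');

    // ...and only the buffer behind `list.items` comes from the heap.
    std.debug.print("list header at {*}, heap buffer at {*}\n", .{ &list, list.items });
}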

One alternative solution I found, aside from the one suggested in the comments, is storing the std.ArrayList(*Token) by value in Expr, making the union the de facto owner of the ArrayList struct.

const Token = union(TokenType) {
    Expr: std.ArrayList(*Token),
    Single: Type,
};

and the function tokenize:

// TODO: I commented out all the deallocations for debugging purposes, will add them later
pub fn tokenize(alloca: *Allocator, parsed_input: *std.ArrayList(std.ArrayList(u8))) !*Token {
    var token: std.ArrayList(u8) = parsed_input.orderedRemove(0);
    if (std.mem.eql(u8, token.items, "(")) {
        std.debug.print("Open bracket encountered \n", .{});
        // defer token.deinit();
        var local_token = try moveToHeap(alloca, Token{ .Expr = std.ArrayList(*Token).init(alloca.*) });
        while (!std.mem.eql(u8, parsed_input.items[0].items, ")")) {
            std.debug.print("While loop iteration \n", .{});
            var item_to_add = try tokenize(alloca, parsed_input);
            try local_token.*.Expr.append(item_to_add);
        }
        _ = parsed_input.orderedRemove(0);
        // defer closing_bracket.deinit();
        return local_token;
    } else if (std.mem.eql(u8, token.items, ")")) {
        // TODO: This is definitely not unreachable but a Syntax Error
        unreachable;
    } else {
        // NOTE: this assumes atomize is also reworked to take just the token and
        // return a Type by value, since Single now stores the Type directly
        var item = try moveToHeap(alloca, Token{ .Single = try atomize(token) });
        std.debug.print("Single item encountered: {s} \n", .{item.*.Single.String.items});
        return item;
    }
}

EDIT 2:

Apparently, I don't need the additional heap allocations at all. I was worried that I would end up with a recursive data structure that isn't properly sized, but an ArrayList that stores Token by value already gives the union a fixed size, since the list keeps its elements behind a pointer. Hence this code is more optimal (a sketch of the matching tokenize follows the union below):

const Token = union(TokenType) {
    Expr: std.ArrayList(Token),
    Single: Type,
};
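
For completeness, a sketch of how tokenize could then look with the fully by-value Token. This assumes atomize is also reworked to return a Type by value (e.g. fn atomize(item: std.ArrayList(u8)) !Type) and that print_tokens is adjusted to take the token by value or as *const Token:

// Sketch only: no per-token heap allocations are needed any more.
pub fn tokenize(alloca: *Allocator, parsed_input: *std.ArrayList(std.ArrayList(u8))) !Token {
    const token = parsed_input.orderedRemove(0);
    if (std.mem.eql(u8, token.items, "(")) {
        var expr = std.ArrayList(Token).init(alloca.*);
        while (!std.mem.eql(u8, parsed_input.items[0].items, ")")) {
            try expr.append(try tokenize(alloca, parsed_input));
        }
        _ = parsed_input.orderedRemove(0); // drop the ")"
        return Token{ .Expr = expr };
    } else if (std.mem.eql(u8, token.items, ")")) {
        // TODO: still a syntax error rather than unreachable
        unreachable;
    } else {
        return Token{ .Single = try atomize(token) };
    }
}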

Solution

  • The problem is in how you create the expression list in tokenize:

    var list = std.ArrayList(*Token).init(alloca.*);
    var local_token = try moveToHeap(alloca, Token{ .Expr = &list });
    

    You store a pointer to list, but list lives in tokenize's stack frame. Once tokenize returns, that pointer dangles, and any later stack activity (even a debug print, as you observed) overwrites the memory it points to. Create the list with the allocator instead. For example:

    var list = try alloca.create(std.ArrayList(*Token));
    list.* = std.ArrayList(*Token).init(alloca.*);
    var local_token = try moveToHeap(alloca, Token{ .Expr = list });
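
    Note that once you re-enable the deallocations, this heap-allocated list needs two cleanup steps: list.deinit() to free its items buffer and alloca.destroy(list) to free the ArrayList header itself, in addition to destroying the Token created by moveToHeap.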