parsingcompiler-constructiongrammarlambda-calculusll-grammar

Parsing extended lambda calculus using recursive descent


I'm writing interpreter for simple lambda calculus based language in C. EBNF for language is

S ::= E
E ::= 'fn' var '->' E | T {'+' T} | T {'-' T}
T ::= F {'*' F} | F {'/' F}
F ::= P {P}
P ::= var | number | '(' E ')'

I've already written parser for same grammar but without 4-th production rule. But how I could deal with {P}?

Of course, I can just check lookahead token, and if it's var, number or '(' then call procedure for parsing P (say parse_primitive), but in this case I will have same extra check of lookahead in parse_primitive, so i definitely doing something wrong.

EDIT: my code now looks like this:

Ast *parse_expr(Lexer *lexer) {
  if (match_token(lexer, TOK_KW_FN) {
    expect_token(lexer, TOK_VAR);
    char *lexeme = get_last_lexeme(lexer);
    Ast *var = make_var(lexeme);
    expect_token(lexer, TOK_ARROW);
    Ast *body = parse_expr(lexer);
    return make_ast(var, body);
  } else {
    Ast *expr = parse_term(lexer);
    for (;;) {
      int op;
      if (match_token(lexer, TOK_PLUS)) {
        op = OP_ADD;
      } else if (match_token(lexer, TOK_MINUS)) {
        op = OP_SUB;
      } else {
        return expr;
      }
      Ast *term = parse_term(lexer);
      expr = make_binop(op, expr, term);
    }
  }
}

Ast *parse_term(Lexer *lexer) {
  Ast *term = parse_factor(lexer);
  for (;;) {
    int op;
    if (match_token(lexer, TOK_STAR)) {
      op = OP_MUL;
    } else if (match_token(lexer, TOK_SLASH)) {
      op = OP_DIV;
    } else {
      return term;
    }
    Ast *factor = parse_factor(lexer);
    term = make_binop(op, term, factor);
  }
}

Ast *parse_factor(Lexer *lexer) {
  Ast *factor = parse_primitive(lexer);
  for (;;) {
    int tok = lexer_peek(lexer);
    if (tok != TOK_LPAR && tok != TOK_VAR && tok != TOK_NUMBER) { /* i want to remove this check as if is actually done by parse_primitive */
      return factor;
    }
    Ast *primitive= parse_primitive(lexer);
    factor = make_app(factor, primitive);
  }
}

Ast *parse_primitive(Lexer *lexer) {
  if (lexer_match(lexer, TOK_LPAR)) {
    Ast *expr = parse_expr(lexer);
    if (!lexer_match(lexer, TOK_RPAR)) {
      parser_error("expected ')'");
    }
    return expr;
  } else if (lexer_match(lexer, TOK_NUMBER)) {
    char *lexeme = get_last_lexeme(lexer);
    return make_number(lexeme);
  } else if (lexer_match(lexer, TOK_VAR)) {
    char *lexeme = get_last_lexeme(lexer);
    return make_var(lexeme);
  }
  parser_error("expected '(', number or var");
}

Solution

  • You could add an argument to parse_primitive to indicate whether the primitive is required or is optional. In case it is optional, don't raise an error when it is not matched, but return NULL:

    Ast *parse_primitive(Lexer *lexer, int mandatory) {
      if (lexer_match(lexer, TOK_LPAR)) {
        Ast *expr = parse_expr(lexer);
        if (!lexer_match(lexer, TOK_RPAR)) {
          // Here we unconditionally raise an error, as the opening bracket
          //    made the closing bracket mandatory.
          parser_error("expected ')'");
        }
        return expr;
      } else if (lexer_match(lexer, TOK_NUMBER)) {
        char *lexeme = get_last_lexeme(lexer);
        return make_number(lexeme);
      } else if (lexer_match(lexer, TOK_VAR)) {
        char *lexeme = get_last_lexeme(lexer);
        return make_var(lexeme);
      }
      // Conditionally raise an error
      if (mandatory) {
        parser_error("expected '(', number or var");
      }
      return NULL;
    }
    

    Now your parse_factor function can look like this:

    Ast *parse_factor(Lexer *lexer) {
      Ast *factor = parse_primitive(lexer, 1); // Mandatory
      for (;;) {
        Ast *primitive = parse_primitive(lexer, 0); // Not mandatory
        if (!primitive) {
          return factor;
        }
        factor = make_app(factor, primitive);
      }
    }
    

    Without extra parameter?

    ...then create two versions of the function:

    Ast *parse_if_primitive(Lexer *lexer) {
      if (lexer_match(lexer, TOK_LPAR)) {
        Ast *expr = parse_expr(lexer);
        if (!lexer_match(lexer, TOK_RPAR)) {
          parser_error("expected ')'");
        }
        return expr;
      } else if (lexer_match(lexer, TOK_NUMBER)) {
        char *lexeme = get_last_lexeme(lexer);
        return make_number(lexeme);
      } else if (lexer_match(lexer, TOK_VAR)) {
        char *lexeme = get_last_lexeme(lexer);
        return make_var(lexeme);
      }
      return NULL; // No primitive here
    }
    
    Ast *parse_primitive(Lexer *lexer) {
      Ast *primitive = parse_if_primitive(lexer);
      if (!primitive) {
        parser_error("expected '(', number or var");
      }
      return primitive;
    }
    

    And call the one you need:

    Ast *parse_factor(Lexer *lexer) {
      Ast *factor = parse_primitive(lexer); // Mandatory
      for (;;) {
        Ast *primitive = parse_if_primitive(lexer); // Not mandatory
        if (!primitive) {
          return factor;
        }
        factor = make_app(factor, primitive);
      }
    }