java, algorithm, lexical-analysis, lexical

Lexical analyzer in Java: multi-character operators like '++' or '>=' shouldn't be tokenized as individual characters, and any unlisted tokens shouldn't print anything


I am writing a lexical analyzer to tokenize some operators, conditions, and syntax. My approach is to check each character, and when it finds whitespace between characters, it tokenizes the characters accumulated so far; e.g. when it finds 'String' it tokenizes it as STR, and when it finds ';' it tokenizes it as SEMI. However, whenever it finds an operator like '++' it tokenizes it as ADD_OP ADD_OP, but I want it tokenized as one token, i.e. '++' should be INC. Likewise, for '>=' it prints RT ASSIGN rather than GE. Here is my code:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

/**
 * Whitespace-driven lexer: reads a file, cuts it into whitespace-separated chunks,
 * converts each chunk into token names and prints one name per line on standard output.
 */
public class Lexer {

/**
 * Reads {@code fileName} one character at a time and prints the token names found in it.
 *
 * <p>Non-whitespace characters are accumulated into a chunk; every whitespace character
 * ends the current chunk, which is handed to {@code tokenizeToken} and whose non-blank
 * results are printed. I/O failures are reported on standard error and not rethrown.
 *
 * @param fileName path of the source file to read
 */
public static void Tokenize(String fileName) {
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(fileName));
        int r;
        String token = "";
        while ((r = reader.read()) != -1) {
            char ch = (char) r;
            if (Character.isWhitespace(ch)) {
                // Whitespace ends the current chunk; consecutive whitespace leaves it blank.
                if (!token.isBlank()) {
                    String[] tokens = tokenizeToken(token);
                    for (String t : tokens) {
                        if (!t.isBlank()) {
                            System.out.println(t);
                        }
                    }
                }
                token = "";
            } else {
                token += ch;
            }
        }
        // Flush the last chunk when the file does not end with whitespace.
        if (!token.isBlank()) {
            String[] tokens = tokenizeToken(token);
            for (String t : tokens) {
                if (!t.isBlank()) {
                    System.out.println(t);
                }
            }
        }
    } catch (IOException e) {
        System.err.println("Error reading file: " + e.getMessage());
    } finally {
        try {
            if (reader != null) {
                reader.close();
            }
        } catch (IOException e) {
            System.err.println("Error closing file: " + e.getMessage());
        }
    }
}

/**
 * Splits one whitespace-free chunk into pieces and replaces each recognised piece with
 * its token name, in place.
 *
 * <p>A piece that matches no branch is left unchanged in the returned array (and an error
 * line is printed), so the caller ends up printing the raw text of that piece.
 *
 * @param token the whitespace-free chunk to analyse
 * @return the pieces of {@code token}, with recognised pieces replaced by token names
 */
private static String[] tokenizeToken(String token) {
    // Zero-width split before and after every character of the class, so each such
    // character becomes a piece of its own ("++" therefore yields "+" and "+").
    // Inside the class, "+-/" is a range from '+' to '/', which also covers ',', '-' and '.'.
    String[] tokens = token.split("(?=[\\[\\](){}<>=,;+-/*%|&!])|(?<=[\\[\\](){}<>=,;+-/*%|&!])");
    for (int i = 0; i < tokens.length; i++) {
        // t is the current piece; token is still the whole chunk.
        String t = tokens[i].trim();
        if (t.matches("procedure")) {
            tokens[i] = "PROC";
        } else if (t.matches("int")) {
            tokens[i] = "INT";
        } else if (t.matches("[0-9]+")) {
            tokens[i] = "INT_CONST";
        } else if (t.matches("end")) {
            tokens[i] = "END";
        } else if (t.matches("String") || t.matches("string")) {
            tokens[i] = "STR";
        } else if (t.matches("[(]")) {
            tokens[i] = "LP";
        } else if (t.matches("[)]")) {
            tokens[i] = "RP";
        } else if (t.matches("\".*\"")) {
            tokens[i] = "STR_CONST";
        } else if (t.matches("if")) {
            tokens[i] = "IF";
        } else if (t.matches("for")) {
            tokens[i] = "FOR";
        } else if (t.matches("while")) {
            tokens[i] = "WHILE";
        } else if (t.matches("return")) {
            tokens[i] = "RETURN";
        } else if (t.matches("[;]")) {
            tokens[i] = "SEMI";
        } else if (t.matches("do")) {
            tokens[i] = "DO";
        } else if (t.matches("break")) {
            tokens[i] = "BREAK";
        // The generic identifier test comes after all keyword tests, so keywords win.
        } else if (t.matches("[a-zA-Z][a-zA-Z0-9]*")) {
            tokens[i] = "IDENT";
        } else if (t.matches("[=]")) {
            tokens[i] = "ASSIGN";
        } else if (t.matches("[<]")) {
            tokens[i] = "LT";
        } else if (t.matches("[>]")) {
            tokens[i] = "RT";
        // NOTE: from here on several branches test `token` (the whole chunk), not `t`.
        // "[++]" is a character class, so this matches a chunk that is exactly one '+'.
        } else if (token.matches("[++]")) {
            tokens[i] = "INC";
        } else if (t.matches("[+]")) {
            tokens[i] = "ADD_OP";
        // These four only match when the whole chunk is that single character.
        } else if (token.matches("[{]")) {
            tokens[i] = "RB";
        } else if (token.matches("[}]")) {
            tokens[i] = "LB";
        } else if (token.matches("[*]")) {
            tokens[i] = "MUL_OP";
        } else if (token.matches("[/]")) {
            tokens[i] = "DIV_OP";
        // "[>=]" is a character class: a chunk that is exactly '>' or '='. Such a chunk
        // is a single piece already caught by the "[=]" / "[>]" branches above.
        } else if (token.matches("[>=]")) {
            tokens[i] = "GE";
        } else {
            // tokens[i] keeps its raw text, which the caller prints after this message.
            System.out.println("SYSTEM ERROR: INVALID IDENTIFIER NAME");
        }
    }

    return tokens;
}
}

and here is the output:

PROC
IDENT
LP
INT
IDENT
RP
FOR
LP
INT
IDENT
ASSIGN
INT_CONST
SEMI
IDENT
LT
IDENT
SEMI
IDENT
ASSIGN
IDENT
ADD_OP
ADD_OP
RP
RB
IDENT
ASSIGN
IDENT
MUL_OP
LP
IDENT
DIV_OP
INT_CONST
RP
SEMI
IF
LP
IDENT
RT
ASSIGN
INT_CONST
RP
BREAK
SEMI
LB
RETURN
IDENT
SEMI
END
IDENT
INT
SYSTEM ERROR: INVALID IDENTIFIER NAME
9user
ASSIGN
INT_CONST
SEMI

Also, any syntax or operator that isn't listed should not appear in the output; e.g. the 9user shouldn't be printed.


Solution

  • You are having issues with your regexes.

    1. In the one you are using for split
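      i) It splits wherever one of the listed operator characters (such as +, >, = or <) comes immediately before or after the position (lookbehind and lookahead), so ++ is split into + and +.
      ii) In +-/, the "-" has a special meaning inside [] in a regex: it denotes a range, so +-/ matches any character from the Unicode value of + up to the Unicode value of / (that is, +, the comma, -, . and /).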
    2. In the if-else section, the patterns passed to matches use unnecessary "[]", which means "any one of the characters inside", so "[++]" means "match the + character or the + character", i.e. a single +, not the two-character operator ++.

    Split regex can be modified like this : "([^+><]+?=[-\\[\\](){}<>=,;+/*%|&!])|((?<=[-\\[\\](){}<>=,;/+*%|&!])(?![+=]))"

    and the if-else section can be modified as shown in the working example

    A minimal working example is as below

    import java.io.BufferedReader;
    import java.io.FileReader;
    import java.io.IOException;
    
    /**
     * Small lexer that prints one token name per line for a whitespace-separated source file.
     *
     * <p>Each whitespace-free chunk is cut at every operator/punctuation character, adjacent
     * characters that form a two-character operator ({@code ++}, {@code >=}) are joined again,
     * and every resulting lexeme is mapped to a token name. Lexemes that are not in the token
     * table produce an error line and are otherwise dropped from the output.
     */
    public class Lexer {

        /**
         * Zero-width split points before and after every operator/punctuation character.
         * The '-' is placed first in the class so it is a literal and not a range operator.
         */
        private static final String DELIMITER_BOUNDARY =
                "(?=[-\\[\\](){}<>=,;+/*%|&!])|(?<=[-\\[\\](){}<>=,;+/*%|&!])";

        /**
         * Reads {@code fileName} character by character and prints the token names it contains.
         *
         * <p>I/O failures (including a failure to close the file) are reported on standard
         * error and not rethrown.
         *
         * @param fileName path of the source file to tokenize
         */
        public static void Tokenize(String fileName) {
            // try-with-resources closes the reader on every path.
            try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
                StringBuilder chunk = new StringBuilder();
                int r;
                while ((r = reader.read()) != -1) {
                    char ch = (char) r;
                    if (Character.isWhitespace(ch)) {
                        printTokens(chunk.toString());
                        chunk.setLength(0);
                    } else {
                        chunk.append(ch);
                    }
                }
                // The file may end without trailing whitespace.
                printTokens(chunk.toString());
            } catch (IOException e) {
                System.err.println("Error reading file: " + e.getMessage());
            }
        }

        /** Prints the token names of one chunk, one per line; blank chunks and blank names are skipped. */
        private static void printTokens(String chunk) {
            if (chunk.isBlank()) {
                return;
            }
            for (String name : tokenizeToken(chunk)) {
                if (!name.isBlank()) {
                    System.out.println(name);
                }
            }
        }

        /**
         * Converts one whitespace-free chunk into token names.
         *
         * <p>An unrecognised lexeme prints {@code SYSTEM ERROR: INVALID IDENTIFIER NAME}
         * immediately (so before the names of this chunk are printed by the caller) and its
         * slot is blanked, which keeps the raw text out of the output.
         *
         * @param token the chunk to analyse
         * @return one entry per piece of the chunk: a token name, or an empty string for a
         *         slot that should not be printed
         */
        private static String[] tokenizeToken(String token) {
            String[] tokens = token.split(DELIMITER_BOUNDARY);
            mergeTwoCharOperators(tokens);
            for (int i = 0; i < tokens.length; i++) {
                if (tokens[i].isEmpty()) {
                    continue; // second half of a merged operator
                }
                String name = classify(tokens[i].trim());
                if (name == null) {
                    System.out.println("SYSTEM ERROR: INVALID IDENTIFIER NAME");
                    tokens[i] = "";
                } else {
                    tokens[i] = name;
                }
            }
            return tokens;
        }

        /**
         * Joins adjacent single-character pieces that form {@code ++} or {@code >=}, scanning
         * left to right so that {@code +++} becomes {@code ++} followed by {@code +}. The
         * slot of the absorbed character is set to the empty string.
         */
        private static void mergeTwoCharOperators(String[] pieces) {
            for (int i = 0; i + 1 < pieces.length; i++) {
                String pair = pieces[i] + pieces[i + 1];
                if (pair.equals("++") || pair.equals(">=")) {
                    pieces[i] = pair;
                    pieces[i + 1] = "";
                    i++; // the absorbed slot must not start another pair
                }
            }
        }

        /**
         * Maps a lexeme to its token name.
         *
         * @return the token name, or {@code null} when the lexeme is not part of the language
         */
        private static String classify(String lexeme) {
            switch (lexeme) {
                case "procedure": return "PROC";
                case "int":       return "INT";
                case "end":       return "END";
                case "String":
                case "string":    return "STR";
                case "if":        return "IF";
                case "for":       return "FOR";
                case "while":     return "WHILE";
                case "return":    return "RETURN";
                case "do":        return "DO";
                case "break":     return "BREAK";
                case "(":         return "LP";
                case ")":         return "RP";
                // '{' is named RB and '}' is named LB, as in the question's token table.
                case "{":         return "RB";
                case "}":         return "LB";
                case ";":         return "SEMI";
                case "=":         return "ASSIGN";
                case "<":         return "LT";
                case ">":         return "RT";
                case ">=":        return "GE";
                case "++":        return "INC";
                case "+":         return "ADD_OP";
                case "*":         return "MUL_OP";
                case "/":         return "DIV_OP";
                default:          break;
            }
            // Keywords were handled above, so a remaining word is a constant or an identifier.
            if (lexeme.matches("[0-9]+")) {
                return "INT_CONST";
            }
            if (lexeme.matches("\".*\"")) {
                return "STR_CONST";
            }
            if (lexeme.matches("[a-zA-Z][a-zA-Z0-9]*")) {
                return "IDENT";
            }
            return null;
        }
    }