python, java, syntax-checking

How to safely verify a file contains valid Python syntax in java?


Question

After automatically modifying some Python comments in Java, I would like to verify that the file still contains valid Python syntax. How can I do that from Java, without actually running any Python code using an interpreter? (To be explicit: I am looking for a Java-only solution, not a solution that calls some other code from inside Java to compute whether the syntax is valid or not.)

I tried building the AST of the file using ANTLR; however, that seems like a non-trivial task for arbitrary Python files, as explained in this answer. Another suggestion would be to simply try to run the file and see whether it runs; however, that is unsafe for arbitrary files. Alternatively, one could call, from Java, some Python code that verifies the file contains runnable code (as shown in this answer); however, that also relies on executing external (controlled) code, which I would prefer not to do.

MWE

Below is an MWE that still requires/assumes you have Python installed somewhere in your system:

package com.something.is_valid_python_syntax;

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import org.antlr.v4.runtime.BaseErrorListener;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.RecognitionException;
import org.antlr.v4.runtime.Recognizer;
import org.antlr.v4.runtime.TokenSource;
import org.antlr.v4.runtime.tree.ParseTree;
import org.antlr.v4.runtime.tree.ParseTreeWalker;

import com.doctestbot.is_valid_python_syntax.generated.PythonLexer;
import com.doctestbot.is_valid_python_syntax.generated.PythonParser;

public class IsValidPythonSyntax {
  
  
  public static PythonParser getPythonParser(String pythonCode) {
    // Create a CharStream from the Python code
    CharStream charStream = CharStreams.fromString(pythonCode);

    // Create the lexer
    PythonLexer lexer = new PythonLexer(charStream);

    // Create a token stream from the lexer
    CommonTokenStream tokenStream = new CommonTokenStream((TokenSource) lexer);

    // Create the parser
    return new PythonParser(tokenStream);
  }

  public static boolean isValidPythonSyntax(String pythonCode) {

    PythonParser parser = getPythonParser(pythonCode);

    // Parse the input and get the tree

    // Redirect standard error stream
    PrintStream originalErr = System.err;
    ByteArrayOutputStream errStream = new ByteArrayOutputStream();
    System.setErr(new PrintStream(errStream));

    try {
      ParseTree tree = parser.file_input();

    } finally {
      // Restore the original standard error stream
      System.setErr(originalErr);
    }

    // Check if there were any errors in the error stream
    String errorOutput = errStream.toString();
    if (!errorOutput.isEmpty()) {
      System.out.println("Invalid Python syntax:");
      System.out.println(errorOutput);
      return false;
    } else {
      System.out.println("Valid Python syntax");
      return true;
    }
  }
}

However, that claims that the following Python code is invalid syntax:

def foo():
    print("hello world.")
foo()

Based on the following Antlr error message:

Invalid Python syntax:
line 1:3 extraneous input ' ' expecting {'match', '_', NAME}

Searching this error leads to suggestions on how to adapt the grammar, however, this was autogenerated from the Python 3.12 Antlr grammar.

Issue

It seems that the Antlr error message system does not distinguish between warnings and errors, for example, on (missing closing bracket in print statement):

def foo():
    print("hello world."
foo()

It outputs:

Invalid Python syntax:
line 1:3 extraneous input ' ' expecting {'match', '_', NAME}
line 2:9 no viable alternative at input '('
line 3:0 no viable alternative at input 'foo'

I do not know how many different error messages ANTLR can produce when parsing Python code, which of them I should take seriously, or whether deciding between valid and invalid Python syntax based on ANTLR parsing errors is context dependent.


Solution

  • I believe the issue is with the Python 3.12 grammar in the grammars-v4 repository. I used your code as a base, and was able to get it working properly using the grammars in the ANTLR4-parser-for-Python-3.12 repository.

    Here's the full working code:

    package playground;
    
    import org.antlr.v4.runtime.CharStream;
    import org.antlr.v4.runtime.CharStreams;
    import org.antlr.v4.runtime.CommonTokenStream;
    import playground.antlr.generated.PythonLexer;
    import playground.antlr.generated.PythonParser;
    
    import java.io.ByteArrayOutputStream;
    import java.io.PrintStream;
    import java.util.List;
    
    public class PythonValidator {
    
        public static void main(String[] args) {
            List<String> snippets = List.of(
                // Hello world as a function
                """
                def foo():
                    print("hello world.")
                foo()
                """,
    
                // Program to generate a random number between 0 and 0
                """
                # Program to generate a random number between 0 and 9
                import random
                            
                print(random.randint(0,9))
                """,
    
                // Reverse a number
                """
                num = 1234
                reversed_num = 0
                            
                while num != 0:
                    digit = num % 10
                    reversed_num = reversed_num * 10 + digit
                    num //= 10
                            
                print("Reversed Number: " + str(reversed_num))    
                """
            );
            PythonValidator validator = new PythonValidator();
            for (String snippet : snippets) {
                boolean valid = validator.isValidSyntax(snippet);
                System.out.println("Valid? " + valid);
            }
        }
    
        public boolean isValidSyntax(String pythonCode) {
            PythonParser parser = getPythonParser(pythonCode);
    
            // Redirect standard error stream
            PrintStream originalErr = System.err;
            ByteArrayOutputStream errStream = new ByteArrayOutputStream();
            System.setErr(new PrintStream(errStream));
    
            try {
                parser.file_input();
            } finally {
                // Restore the original standard error stream
                System.setErr(originalErr);
            }
    
            // Check if there were any errors in the error stream
            String errorOutput = errStream.toString();
            if (!errorOutput.isEmpty()) {
                System.out.println(errorOutput);
                return false;
            } else {
                return true;
            }
        }
    
        private PythonParser getPythonParser(String pythonCode) {
            // Create a CharStream from the Python code
            CharStream charStream = CharStreams.fromString(pythonCode);
    
            // Create the lexer
            PythonLexer lexer = new PythonLexer(charStream);
    
            // Create a token stream from the lexer
            CommonTokenStream tokenStream = new CommonTokenStream(lexer);
    
            // Create the parser
            return new PythonParser(tokenStream);
        }
    }
    

    Here is a working example.

    Just clone the repo and run:

    ./gradlew generateGrammarSource
    ./gradlew run