c++ clang libtooling

Getting the token sequence from clang AST node


I am writing a tool based on Clang libtooling that checks and warns against functions that are too similar. After obtaining a clang::FunctionDecl, I want to perform some similarity check on the source code.

Currently, I can get the source text following this question, but source-text-based similarity check is not precise enough and too slow. Is there a way of getting the source code in the form of a token sequence? It would be helpful if I can write something like this:

SomeContainer<Token> tokens = getTokenSequence(funcDecl);
for (const auto &t : tokens)
  // ...

Solution

  • The main way that Clang LibTooling offers for getting the token sequence is to first get the text by calling clang::SourceManager::getBufferOrNone, then run clang::Lexer on it to get the tokens. This runs the lexer in "raw" mode, meaning it does no preprocessing of its own (macros are not expanded, and directives such as #define are lexed as ordinary tokens), and it has no record of the preprocessing that took place during the original parse.

    Here is a visitor function that prints the tokens of every function definition (an excerpt of the complete program further below):

    // Print a summary line for 'functionDecl', followed by one line for
    // each raw (un-preprocessed) token in the file text that the
    // declaration spans.  Always returns true so the traversal continues.
    bool Visitor::VisitFunctionDecl(clang::FunctionDecl *functionDecl)
    {
      // Get the function start and end.  If either arises from a macro
      // expansion, get the location of that expansion, not the place in
      // the macro definition where the token was spelled.  Note that
      // 'endLoc' is the location of the *start* of the last token.
      clang::SourceManager &sm = m_astContext.getSourceManager();
      clang::SourceLocation beginLoc = sm.getExpansionLoc(functionDecl->getBeginLoc());
      clang::SourceLocation endLoc = sm.getExpansionLoc(functionDecl->getEndLoc());
    
      // Print some details about the function.
      cout << functionDecl->Decl::getDeclKindName()
           << " \"" << functionDecl->getQualifiedNameAsString()
           << "\" type=\"" << typeStr(functionDecl->getType())
           << "\" at "
           << declLocStr(functionDecl)
           << ": beginLoc=" << locStr(beginLoc)
           << ", endLoc=" << locStr(endLoc)
           << "\n";
    
      // Get the memory buffer containing the source code.
      clang::FileID fileID = sm.getFileID(beginLoc);
      std::optional<llvm::MemoryBufferRef> buffer = sm.getBufferOrNone(fileID);
      if (!buffer) {
        // I haven't seen this happen, but might be possible when the input
        // is a serialized AST file rather than source code.
        cout << "  No buffer for this source file.\n";
        return true;
      }
    
      // The offsets computed below are only comparable, and only valid
      // indices into 'buffer', if both ends lie in the same file.
      if (sm.getFileID(endLoc) != fileID) {
        cout << "  Begin and end are in different files.\n";
        return true;
      }
    
      // Range of text to lex, as offsets from the start of the file.
      unsigned const beginOffset = sm.getFileOffset(beginLoc);
      unsigned const endOffset = sm.getFileOffset(endLoc);
      if (beginOffset >= endOffset) {
        // Begin and end collapse to a single location.  That happens when
        // the entire function comes from one macro expansion, in which
        // case there is no range of file text belonging to the function.
        return true;
      }
    
      // Prepare to lex it in "raw" mode, meaning no preprocessing is done.
      //
      // The lexer is given the *whole* file buffer and merely told to
      // start at the function.  clang::Lexer requires its end pointer to
      // address the NUL terminator of the buffer; passing a pointer into
      // the middle of the text violates that and lets the lexer run past
      // the intended end into unrelated memory.
      char const *fileStartPtr = buffer->getBufferStart();
      clang::Lexer lexer(
        sm.getLocForStartOfFile(fileID),
        m_astContext.getLangOpts(),
        fileStartPtr,
        fileStartPtr + beginOffset,
        buffer->getBufferEnd());
    
      // Iterate over all of the tokens in the function range, stopping
      // after the token that starts at 'endOffset'.  The return value of
      // 'LexFromRawLexer' is deliberately not used as the loop condition:
      // it reports that the buffer is exhausted *after* lexing, so it is
      // also true for a valid final token that ends exactly at
      // end-of-file.  Checking for the 'eof' token kind is unambiguous.
      clang::Token token;
      for (;;) {
        lexer.LexFromRawLexer(token /*OUT*/);
        if (token.is(clang::tok::eof)) {
          break;
        }
    
        unsigned const tokenOffset = sm.getFileOffset(token.getLocation());
        if (tokenOffset > endOffset) {
          break;
        }
    
        cout << "  " << locStr(token.getLocation())
             << ": kind=" << token.getName()
             << ", len=" << token.getLength()
             << ", text: \""
             << std::string(fileStartPtr + tokenOffset, token.getLength())
             << "\"\n";
      }
    
      return true;
    }
    

    When this visitor is run on the input:

    // test.cc
    // Input for print-node-tokens.cc.
    
    // Easy case: no macros are involved.
    int f()
    {
      return 2;
    }
    
    // Macro expansions (and a #define directive) inside the body.
    #define PLUS +
    int g(int x)
    {
      #define ECKS x
      return ECKS PLUS 4;
    }
    
    // Macro expansions supply the first and the last token.
    #define INTEGER int
    #define RBRACE }
    INTEGER h()
    {
      return 5;
    RBRACE
    
    // The entire function is inside a single macro expansion.  For this
    // definition, we don't get any tokens, and it's not obvious how to
    // improve that.
    #define IDENT(x) x
    IDENT(int j() { return 6; })
    
    // EOF
    

    it prints:

    $ ./print-node-tokens.exe test.cc
    Function "f" type="int (void)" at test.cc:5:5: beginLoc=test.cc:5:1, endLoc=test.cc:8:1
      test.cc:5:1: kind=raw_identifier, len=3, text: "int"
      test.cc:5:5: kind=raw_identifier, len=1, text: "f"
      test.cc:5:6: kind=l_paren, len=1, text: "("
      test.cc:5:7: kind=r_paren, len=1, text: ")"
      test.cc:6:1: kind=l_brace, len=1, text: "{"
      test.cc:7:3: kind=raw_identifier, len=6, text: "return"
      test.cc:7:10: kind=numeric_constant, len=1, text: "2"
      test.cc:7:11: kind=semi, len=1, text: ";"
      test.cc:8:1: kind=r_brace, len=1, text: "}"
    Function "g" type="int (int)" at test.cc:12:5: beginLoc=test.cc:12:1, endLoc=test.cc:16:1
      test.cc:12:1: kind=raw_identifier, len=3, text: "int"
      test.cc:12:5: kind=raw_identifier, len=1, text: "g"
      test.cc:12:6: kind=l_paren, len=1, text: "("
      test.cc:12:7: kind=raw_identifier, len=3, text: "int"
      test.cc:12:11: kind=raw_identifier, len=1, text: "x"
      test.cc:12:12: kind=r_paren, len=1, text: ")"
      test.cc:13:1: kind=l_brace, len=1, text: "{"
      test.cc:14:3: kind=hash, len=1, text: "#"
      test.cc:14:4: kind=raw_identifier, len=6, text: "define"
      test.cc:14:11: kind=raw_identifier, len=4, text: "ECKS"
      test.cc:14:16: kind=raw_identifier, len=1, text: "x"
      test.cc:15:3: kind=raw_identifier, len=6, text: "return"
      test.cc:15:10: kind=raw_identifier, len=4, text: "ECKS"
      test.cc:15:15: kind=raw_identifier, len=4, text: "PLUS"
      test.cc:15:20: kind=numeric_constant, len=1, text: "4"
      test.cc:15:21: kind=semi, len=1, text: ";"
      test.cc:16:1: kind=r_brace, len=1, text: "}"
    Function "h" type="int (void)" at test.cc:21:9: beginLoc=test.cc:21:1, endLoc=test.cc:24:1
      test.cc:21:1: kind=raw_identifier, len=7, text: "INTEGER"
      test.cc:21:9: kind=raw_identifier, len=1, text: "h"
      test.cc:21:10: kind=l_paren, len=1, text: "("
      test.cc:21:11: kind=r_paren, len=1, text: ")"
      test.cc:22:1: kind=l_brace, len=1, text: "{"
      test.cc:23:3: kind=raw_identifier, len=6, text: "return"
      test.cc:23:10: kind=numeric_constant, len=1, text: "5"
      test.cc:23:11: kind=semi, len=1, text: ";"
      test.cc:24:1: kind=raw_identifier, len=6, text: "RBRACE"
    Function "j" type="int (void)" at test.cc:30:1 <Spelling=test.cc:30:11>: beginLoc=test.cc:30:1, endLoc=test.cc:30:1
    

    As an alternative to the "raw" lexer, if you want details about the preprocessor actions performed during the real parse, you could use clang::PPCallbacks to hook into it while it runs. But that seems like overkill for your intended purpose, so I won't elaborate in this answer.

    Tangent: The char* pointers that you pass to the clang::Lexer constructor can point to anything that you want to tokenize; they do not have to come from getBufferOrNone. However, that constructor needs a starting SourceLocation for BufStart that will be used to derive the locations of the found tokens. If the text came from original source code (as is the case when using getBufferOrNone), then that should be no problem. But if you wanted to lex a brand new string for which no existing SourceLocation is appropriate, you would have to use SourceManager to allocate a new range of locations to name positions within that string.


    Complete example program:

    // print-node-tokens.cc
    // Print the tokens in an AST node.
    
    #include "clang/AST/ASTContext.h"                          // clang::ASTContext
    #include "clang/AST/Decl.h"                                // clang::FunctionDecl
    #include "clang/AST/DeclBase.h"                            // clang::Decl
    #include "clang/AST/RecursiveASTVisitor.h"                 // clang::RecursiveASTVisitor
    #include "clang/AST/Type.h"                                // clang::QualType
    #include "clang/Basic/Diagnostic.h"                        // clang::DiagnosticsEngine
    #include "clang/Basic/DiagnosticOptions.h"                 // clang::DiagnosticOptions
    #include "clang/Basic/SourceLocation.h"                    // clang::{FileID, SourceLocation}
    #include "clang/Basic/SourceManager.h"                     // clang::SourceManager
    #include "clang/Basic/TokenKinds.h"                        // clang::tok::eof
    #include "clang/Frontend/ASTUnit.h"                        // clang::ASTUnit
    #include "clang/Frontend/CompilerInstance.h"               // clang::CompilerInstance
    #include "clang/Frontend/Utils.h"                          // clang::createInvocation
    #include "clang/Lex/Lexer.h"                               // clang::Lexer
    #include "clang/Lex/Token.h"                               // clang::Token
    #include "clang/Serialization/PCHContainerOperations.h"    // clang::PCHContainerOperations
    
    #include "llvm/Support/MemoryBufferRef.h"                  // llvm::MemoryBufferRef
    
    #include <iostream>                                        // std::cout
    #include <memory>                                          // std::make_shared, std::shared_ptr, std::unique_ptr
    #include <optional>                                        // std::optional
    #include <string>                                          // std::string
    #include <vector>                                          // std::vector
    
    using std::cout;
    using std::string;
    
    
    // Visitor to print the tokens of every function declaration it
    // encounters while walking the AST.
    class Visitor : public clang::RecursiveASTVisitor<Visitor> {
    public:      // data
      // The parsed translation unit.  Not owned by this object.
      clang::ASTUnit *m_astUnit;
    
      // AST context obtained from 'm_astUnit'; gives access to the source
      // manager and language options.
      clang::ASTContext &m_astContext;
    
    public:      // methods
      // 'astUnit' must not be null: it is dereferenced here to get the
      // AST context.  'explicit' prevents an ASTUnit pointer from being
      // silently converted into a Visitor.
      explicit Visitor(clang::ASTUnit *astUnit)
        : m_astUnit(astUnit),
          m_astContext(astUnit->getASTContext())
      {}
    
      // Convenience methods to stringify some things.
      string locStr(clang::SourceLocation loc);
      string declLocStr(clang::Decl const *decl);
      string typeStr(clang::QualType qualType);
    
      // Visitor methods.
      bool VisitFunctionDecl(clang::FunctionDecl *functionDecl);
    
      // Kick off the traversal.
      void traverseTU();
    };
    
    // Render 'loc' as text using the source manager of the AST context.
    string Visitor::locStr(clang::SourceLocation loc)
    {
      clang::SourceManager const &sourceManager =
        m_astContext.getSourceManager();
      return loc.printToString(sourceManager);
    }
    
    // Render the location reported by 'decl->getLocation()' as text.
    string Visitor::declLocStr(clang::Decl const *decl)
    {
      clang::SourceLocation const declLoc = decl->getLocation();
      return locStr(declLoc);
    }
    
    // Render 'qualType' as text.
    string Visitor::typeStr(clang::QualType qualType)
    {
      string typeText = qualType.getAsString();
      return typeText;
    }
    
    // Print a summary line for 'functionDecl', followed by one line for
    // each raw (un-preprocessed) token in the file text that the
    // declaration spans.  Always returns true so the traversal continues.
    bool Visitor::VisitFunctionDecl(clang::FunctionDecl *functionDecl)
    {
      // Get the function start and end.  If either arises from a macro
      // expansion, get the location of that expansion, not the place in
      // the macro definition where the token was spelled.  Note that
      // 'endLoc' is the location of the *start* of the last token.
      clang::SourceManager &sm = m_astContext.getSourceManager();
      clang::SourceLocation beginLoc = sm.getExpansionLoc(functionDecl->getBeginLoc());
      clang::SourceLocation endLoc = sm.getExpansionLoc(functionDecl->getEndLoc());
    
      // Print some details about the function.
      cout << functionDecl->Decl::getDeclKindName()
           << " \"" << functionDecl->getQualifiedNameAsString()
           << "\" type=\"" << typeStr(functionDecl->getType())
           << "\" at "
           << declLocStr(functionDecl)
           << ": beginLoc=" << locStr(beginLoc)
           << ", endLoc=" << locStr(endLoc)
           << "\n";
    
      // Get the memory buffer containing the source code.
      clang::FileID fileID = sm.getFileID(beginLoc);
      std::optional<llvm::MemoryBufferRef> buffer = sm.getBufferOrNone(fileID);
      if (!buffer) {
        // I haven't seen this happen, but might be possible when the input
        // is a serialized AST file rather than source code.
        cout << "  No buffer for this source file.\n";
        return true;
      }
    
      // The offsets computed below are only comparable, and only valid
      // indices into 'buffer', if both ends lie in the same file.
      if (sm.getFileID(endLoc) != fileID) {
        cout << "  Begin and end are in different files.\n";
        return true;
      }
    
      // Range of text to lex, as offsets from the start of the file.
      unsigned const beginOffset = sm.getFileOffset(beginLoc);
      unsigned const endOffset = sm.getFileOffset(endLoc);
      if (beginOffset >= endOffset) {
        // Begin and end collapse to a single location.  That happens when
        // the entire function comes from one macro expansion, in which
        // case there is no range of file text belonging to the function.
        return true;
      }
    
      // Prepare to lex it in "raw" mode, meaning no preprocessing is done.
      //
      // The lexer is given the *whole* file buffer and merely told to
      // start at the function.  clang::Lexer requires its end pointer to
      // address the NUL terminator of the buffer; passing a pointer into
      // the middle of the text violates that and lets the lexer run past
      // the intended end into unrelated memory.
      char const *fileStartPtr = buffer->getBufferStart();
      clang::Lexer lexer(
        sm.getLocForStartOfFile(fileID),
        m_astContext.getLangOpts(),
        fileStartPtr,
        fileStartPtr + beginOffset,
        buffer->getBufferEnd());
    
      // Iterate over all of the tokens in the function range, stopping
      // after the token that starts at 'endOffset'.  The return value of
      // 'LexFromRawLexer' is deliberately not used as the loop condition:
      // it reports that the buffer is exhausted *after* lexing, so it is
      // also true for a valid final token that ends exactly at
      // end-of-file.  Checking for the 'eof' token kind is unambiguous.
      clang::Token token;
      for (;;) {
        lexer.LexFromRawLexer(token /*OUT*/);
        if (token.is(clang::tok::eof)) {
          break;
        }
    
        unsigned const tokenOffset = sm.getFileOffset(token.getLocation());
        if (tokenOffset > endOffset) {
          break;
        }
    
        cout << "  " << locStr(token.getLocation())
             << ": kind=" << token.getName()
             << ", len=" << token.getLength()
             << ", text: \""
             << std::string(fileStartPtr + tokenOffset, token.getLength())
             << "\"\n";
      }
    
      return true;
    }
    
    
    void Visitor::traverseTU()
    {
      this->TraverseDecl(m_astContext.getTranslationUnitDecl());
    }
    
    
    // This is all boilerplate for a libclang program.
    int main(int argc, char const **argv)
    {
      // Copy the arguments into a vector of char pointers since that is
      // what 'createInvocationFromCommandLine' wants.
      std::vector<char const *> commandLine;
      {
        // Path to the 'clang' binary that I am behaving like.  This path is
        // used to compute the location of compiler headers like stddef.h.
        commandLine.push_back(CLANG_LLVM_INSTALL_DIR "/bin/clang");
    
        for (int i = 1; i < argc; ++i) {
          commandLine.push_back(argv[i]);
        }
      }
    
      // Parse the command line options.
      std::shared_ptr<clang::CompilerInvocation> compilerInvocation(
        clang::createInvocation(llvm::ArrayRef(commandLine)));
      if (!compilerInvocation) {
        // Command line parsing errors have already been printed.
        return 2;
      }
    
      // Boilerplate setup for 'LoadFromCompilerInvocationAction'.
      std::shared_ptr<clang::PCHContainerOperations> pchContainerOps(
        new clang::PCHContainerOperations());
      clang::IntrusiveRefCntPtr<clang::DiagnosticsEngine> diagnosticsEngine(
        clang::CompilerInstance::createDiagnostics(
          new clang::DiagnosticOptions));
    
      // Run the Clang parser to produce an AST.
      std::unique_ptr<clang::ASTUnit> ast(
        clang::ASTUnit::LoadFromCompilerInvocationAction(
          compilerInvocation,
          pchContainerOps,
          diagnosticsEngine));
    
      if (ast == nullptr ||
          diagnosticsEngine->getNumErrors() > 0) {
        // Error messages have already been printed.
        return 2;
      }
    
      Visitor visitor(ast.get());
      visitor.traverseTU();
    
      return 0;
    }
    
    
    // EOF
    

    Makefile:

    # Makefile
    # Builds print-node-tokens.exe against a binary clang+llvm distribution.
    
    # Default target.  It is declared first so that a bare 'make' builds
    # it; its prerequisite is added in the "Recipes" section below.
    all:
    .PHONY: all
    
    
    # ---- Configuration ----
    # Installation directory from a binary distribution.
    # Has five subdirectories: bin include lib libexec share.
    CLANG_LLVM_INSTALL_DIR = $(HOME)/opt/clang+llvm-16.0.0-x86_64-linux-gnu-ubuntu-18.04
    
    # ---- llvm-config query results ----
    # These use ':=' so that each $(shell ...) query is evaluated once,
    # when the variable is defined.
    
    # Program to query the various LLVM configuration options.
    LLVM_CONFIG := $(CLANG_LLVM_INSTALL_DIR)/bin/llvm-config
    
    # C++ compiler options to ensure ABI compatibility.
    LLVM_CXXFLAGS := $(shell $(LLVM_CONFIG) --cxxflags)
    
    # Directory containing the clang library files, both static and dynamic.
    LLVM_LIBDIR := $(shell $(LLVM_CONFIG) --libdir)
    
    # Other flags needed for linking, whether statically or dynamically.
    LLVM_LDFLAGS_AND_SYSTEM_LIBS := $(shell $(LLVM_CONFIG) --ldflags --system-libs)
    
    
    # ---- Compiler options ----
    # C++ compiler.
    CXX := $(CLANG_LLVM_INSTALL_DIR)/bin/clang++
    
    # Compiler options, including preprocessor options.
    CXXFLAGS =
    CXXFLAGS += -g
    CXXFLAGS += -Wall
    CXXFLAGS += -Werror
    
    # Get llvm compilation flags.
    CXXFLAGS += $(LLVM_CXXFLAGS)
    
    # Tell the source code where the clang installation directory is.
    # The nested quotes make the macro expand to a C string literal.
    CXXFLAGS += -DCLANG_LLVM_INSTALL_DIR='"$(CLANG_LLVM_INSTALL_DIR)"'
    
    # Linker options.
    LDFLAGS =
    
    LDFLAGS += -g -Wall
    
    # Pull in clang+llvm via libclang-cpp.so, which has everything, but is
    # only available as a dynamic library.
    LDFLAGS += -lclang-cpp
    
    # Arrange for the compiled binary to search the libdir for that library.
    # Otherwise, one can set the LD_LIBRARY_PATH envvar before running it.
    # Note: the -rpath switch does not work on Windows.
    LDFLAGS += -Wl,-rpath=$(LLVM_LIBDIR)
    
    # Get the needed -L search path, plus things like -ldl.
    LDFLAGS += $(LLVM_LDFLAGS_AND_SYSTEM_LIBS)
    
    
    # ---- Recipes ----
    # NOTE: make requires every recipe line to begin with a TAB character.
    # If the indentation of the recipe lines below appears as spaces (for
    # example after copying from a web page), replace it with a TAB, or
    # make will stop with a "missing separator" error.
    
    # Compile a C++ source file.
    %.o: %.cc
        $(CXX) -c -o $@ $(CXXFLAGS) $<
    
    # Executable.
    all: print-node-tokens.exe
    print-node-tokens.exe: print-node-tokens.o
        $(CXX) -o $@ $^ $(LDFLAGS)
    
    # Test.
    .PHONY: check
    check: print-node-tokens.exe
        ./print-node-tokens.exe test.cc
    
    # Remove build products.
    .PHONY: clean
    clean:
        $(RM) *.o *.exe
    
    
    # EOF