Tags: c++, python, clang, llvm, libclang

Retrieving comments using python libclang


In the following header file, I'd like to retrieve the `//+reflect` comments attached to the class and to the member variable:

#ifndef __HEADER_FOO
#define __HEADER_FOO

//+reflect
class Foo
{
    public:
    private:
        int m_int; //+reflect
};

#endif

I am using the Python bindings for libclang with the following script:

import sys
import clang.cindex

def dumpnode(node, indent):
    """Recursively print a cursor and its descendants as an indented tree.

    Each output line holds ``indent`` spaces, a separator space, the
    node's ``kind``, another space and the node's ``spelling``. Children
    are printed with two additional spaces of indentation per level.

    Args:
        node: Object exposing ``kind``, ``spelling`` and ``get_children()``.
        indent: Number of leading spaces for this node's line.
    """
    # A single pre-formatted argument in parentheses prints the same text
    # under both the Python 2 print statement and the Python 3 print function.
    print('%s %s %s' % (' ' * indent, node.kind, node.spelling))
    for child in node.get_children():
        dumpnode(child, indent + 2)

def main():
    """Parse the file named by ``sys.argv[1]`` as C++ and dump its cursor tree."""
    idx = clang.cindex.Index.create()
    source_path = sys.argv[1]
    # '-x c++' is passed so the input is parsed as C++.
    translation_unit = idx.parse(source_path, args=['-x', 'c++'])
    dumpnode(translation_unit.cursor, 0)

if __name__ == '__main__':
    main()

Running it gives me this output:

CursorKind.TRANSLATION_UNIT None
  CursorKind.TYPEDEF_DECL __builtin_va_list
  CursorKind.CLASS_DECL type_info
  CursorKind.CLASS_DECL Foo
    CursorKind.CXX_ACCESS_SPEC_DECL
    CursorKind.CXX_ACCESS_SPEC_DECL
    CursorKind.FIELD_DECL m_int

The problem is that the comments are missing. Are they stripped by the preprocessor? Is there any way to prevent that?


Solution

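  • To do this you need to iterate over the tokens, not the cursors: the comments do not show up as nodes in the cursor tree, but they are still reported as COMMENT tokens. If I run this script on the file above: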

    import sys
    import clang.cindex
    
    def srcrangestr(x):
        """Format a source range as 'file:line:column - file:line:column'.

        ``x`` must expose ``start`` and ``end`` attributes, each of which
        provides ``file``, ``line`` and ``column``.
        """
        begin = x.start
        finish = x.end
        fields = (
            begin.file, begin.line, begin.column,
            finish.file, finish.line, finish.column,
        )
        return '%s:%d:%d - %s:%d:%d' % fields
    
    def main():
        """Tokenize the file named by ``sys.argv[1]`` as C++ and print each token.

        For every token of the translation unit, three lines are printed:
        the token kind, its source extent (formatted by ``srcrangestr``) and
        its spelling wrapped in single quotes.
        """
        index = clang.cindex.Index.create()
        tu = index.parse(sys.argv[1], args=['-x', 'c++'])

        # Each print takes one parenthesised expression, so the output is the
        # same under the Python 2 print statement and the Python 3 function.
        for token in tu.cursor.get_tokens():
            print(token.kind)
            print("  " + srcrangestr(token.extent))
            print("  '" + str(token.spelling) + "'")

    if __name__ == '__main__':
        main()
    

    I get the following:

    TokenKind.PUNCTUATION
      test2.h:1:1 - test2.h:1:2
      '#'
    TokenKind.IDENTIFIER
      test2.h:1:2 - test2.h:1:8
      'ifndef'
    TokenKind.IDENTIFIER
      test2.h:1:9 - test2.h:1:21
      '__HEADER_FOO'
    TokenKind.PUNCTUATION
      test2.h:2:1 - test2.h:2:2
      '#'
    TokenKind.IDENTIFIER
      test2.h:2:2 - test2.h:2:8
      'define'
    TokenKind.IDENTIFIER
      test2.h:2:9 - test2.h:2:21
      '__HEADER_FOO'
    TokenKind.COMMENT
      test2.h:4:1 - test2.h:4:11
      '//+reflect'
    TokenKind.KEYWORD
      test2.h:5:1 - test2.h:5:6
      'class'
    TokenKind.IDENTIFIER
      test2.h:5:7 - test2.h:5:10
      'Foo'
    TokenKind.PUNCTUATION
      test2.h:6:1 - test2.h:6:2
      '{'
    TokenKind.KEYWORD
      test2.h:7:5 - test2.h:7:11
      'public'
    TokenKind.PUNCTUATION
      test2.h:7:11 - test2.h:7:12
      ':'
    TokenKind.KEYWORD
      test2.h:8:5 - test2.h:8:12
      'private'
    TokenKind.PUNCTUATION
      test2.h:8:12 - test2.h:8:13
      ':'
    TokenKind.KEYWORD
      test2.h:9:9 - test2.h:9:12
      'int'
    TokenKind.IDENTIFIER
      test2.h:9:13 - test2.h:9:18
      'm_int'
    TokenKind.PUNCTUATION
      test2.h:9:18 - test2.h:9:19
      ';'
    TokenKind.COMMENT
      test2.h:9:20 - test2.h:9:30
      '//+reflect'
    TokenKind.PUNCTUATION
      test2.h:10:1 - test2.h:10:2
      '}'
    TokenKind.PUNCTUATION
      test2.h:10:2 - test2.h:10:3
      ';'
    TokenKind.PUNCTUATION
      test2.h:12:1 - test2.h:12:2
      '#'
    TokenKind.IDENTIFIER
      test2.h:12:2 - test2.h:12:7
      'endif'
    

    That should be enough for me to work with.