Retrieving comments using python libclang

In the following header file I'd like to get the corresponding +reflect comment to the class and member variable:

#ifndef __HEADER_FOO
#define __HEADER_FOO

//+reflect
class Foo
{
    public:
    private:
        int m_int; //+reflect
};

#endif

Using the python bindings for libclang and the following script:

import sys
import clang.cindex

def dumpnode(node, indent):
    print ' ' * indent, node.kind, node.spelling
    for i in node.get_children():
        dumpnode(i, indent+2)

def main():
    index = clang.cindex.Index.create()
    tu = index.parse(sys.argv[1], args=['-x', 'c++'])

    dumpnode(tu.cursor, 0)

if __name__ == '__main__':
    main()

Gives me this output:

CursorKind.TRANSLATION_UNIT None
  CursorKind.TYPEDEF_DECL __builtin_va_list
  CursorKind.CLASS_DECL type_info
  CursorKind.CLASS_DECL Foo
    CursorKind.CXX_ACCESS_SPEC_DECL
    CursorKind.CXX_ACCESS_SPEC_DECL
    CursorKind.FIELD_DECL m_int

The problem is that the comments are missing. Are they stripped by the preprocessor? Is there any way to prevent that?

Solution

To do this you need to get the tokens, not the cursors. If I run this script on the file above:

import sys
import clang.cindex

def srcrangestr(x):
    return '%s:%d:%d - %s:%d:%d' % (x.start.file, x.start.line, x.start.column, x.end.file, x.end.line, x.end.column)

def main():
    index = clang.cindex.Index.create()
    tu = index.parse(sys.argv[1], args=['-x', 'c++'])

    for x in tu.cursor.get_tokens():
        print x.kind
        print "  " + srcrangestr(x.extent)
        print "  '" + str(x.spelling) + "'"

if __name__ == '__main__':
    main()

I get the following:

TokenKind.PUNCTUATION
  test2.h:1:1 - test2.h:1:2
  '#'
TokenKind.IDENTIFIER
  test2.h:1:2 - test2.h:1:8
  'ifndef'
TokenKind.IDENTIFIER
  test2.h:1:9 - test2.h:1:21
  '__HEADER_FOO'
TokenKind.PUNCTUATION
  test2.h:2:1 - test2.h:2:2
  '#'
TokenKind.IDENTIFIER
  test2.h:2:2 - test2.h:2:8
  'define'
TokenKind.IDENTIFIER
  test2.h:2:9 - test2.h:2:21
  '__HEADER_FOO'
TokenKind.COMMENT
  test2.h:4:1 - test2.h:4:11
  '//+reflect'
TokenKind.KEYWORD
  test2.h:5:1 - test2.h:5:6
  'class'
TokenKind.IDENTIFIER
  test2.h:5:7 - test2.h:5:10
  'Foo'
TokenKind.PUNCTUATION
  test2.h:6:1 - test2.h:6:2
  '{'
TokenKind.KEYWORD
  test2.h:7:5 - test2.h:7:11
  'public'
TokenKind.PUNCTUATION
  test2.h:7:11 - test2.h:7:12
  ':'
TokenKind.KEYWORD
  test2.h:8:5 - test2.h:8:12
  'private'
TokenKind.PUNCTUATION
  test2.h:8:12 - test2.h:8:13
  ':'
TokenKind.KEYWORD
  test2.h:9:9 - test2.h:9:12
  'int'
TokenKind.IDENTIFIER
  test2.h:9:13 - test2.h:9:18
  'm_int'
TokenKind.PUNCTUATION
  test2.h:9:18 - test2.h:9:19
  ';'
TokenKind.COMMENT
  test2.h:9:20 - test2.h:9:30
  '//+reflect'
TokenKind.PUNCTUATION
  test2.h:10:1 - test2.h:10:2
  '}'
TokenKind.PUNCTUATION
  test2.h:10:2 - test2.h:10:3
  ';'
TokenKind.PUNCTUATION
  test2.h:12:1 - test2.h:12:2
  '#'
TokenKind.IDENTIFIER
  test2.h:12:2 - test2.h:12:7
  'endif'

Which should be enough for me to work with.