
Script to remove Python comments/docstrings


Is there a Python script or tool available which can remove comments and docstrings from Python source?

It should take care of cases like:

"""
aas
"""
def f():
    m = {
        u'x':
            u'y'
        } # fake docstring ;)
    if 1:
        'string' >> m
    if 2:
        'string' , m
    if 3:
        'string' > m

In the end I have come up with a simple script that uses the tokenize module and removes comment tokens. It seems to work pretty well, except that I am not able to remove docstrings in all cases. See if you can improve it to also remove docstrings.

import cStringIO
import tokenize

def remove_comments(src):
    """
    This reads tokens using tokenize.generate_tokens and recombines them
    using tokenize.untokenize, skipping comment/docstring tokens in between.
    """
    f = cStringIO.StringIO(src)
    class SkipException(Exception): pass
    processed_tokens = []
    last_token = None
    # go through all the tokens and try to skip comments and docstrings
    for tok in tokenize.generate_tokens(f.readline):
        t_type, t_string, t_srow_scol, t_erow_ecol, t_line = tok

        try:
            if t_type == tokenize.COMMENT:
                raise SkipException()

            elif t_type == tokenize.STRING:

                if last_token is None or last_token[0] in [tokenize.INDENT]:
                    # FIXME: this may remove valid strings too?
                    #raise SkipException()
                    pass

        except SkipException:
            pass
        else:
            processed_tokens.append(tok)

        last_token = tok

    return tokenize.untokenize(processed_tokens)

Also, I would like to test it on a very large collection of scripts with good unit test coverage. Can you suggest such an open-source project?
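
For what it's worth, the kind of batch check I have in mind looks roughly like this (a rough sketch only: the helper name and path are placeholders, and it merely verifies that the stripped output still parses, it does not run any test suite):

import ast
import os

def check_tree(root):
    # Rough sketch: run remove_comments() over every .py file under `root`
    # and make sure the result is still syntactically valid Python.
    for dirpath, _dirs, files in os.walk(root):
        for name in files:
            if not name.endswith(".py"):
                continue
            path = os.path.join(dirpath, name)
            src = open(path).read()
            try:
                stripped = remove_comments(src)
                ast.parse(stripped)  # raises SyntaxError if stripping broke the file
            except Exception as e:
                print("FAILED: %s (%s)" % (path, e))

# check_tree("/path/to/some/project")   # placeholder path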


Solution

  • This does the job:

    """ Strip comments and docstrings from a file.
    """
    
    import sys, token, tokenize
    
    def do_file(fname):
        """ Run on just one file.
    
        """
        source = open(fname)
        mod = open(fname + ",strip", "w")
    
        prev_toktype = token.INDENT
        first_line = None
        last_lineno = -1
        last_col = 0
    
        tokgen = tokenize.generate_tokens(source.readline)
        for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
            if 0:   # Change to if 1 to see the tokens fly by.
                print("%10s %-14s %-20r %r" % (
                    tokenize.tok_name.get(toktype, toktype),
                    "%d.%d-%d.%d" % (slineno, scol, elineno, ecol),
                    ttext, ltext
                    ))
            if slineno > last_lineno:
                last_col = 0
            if scol > last_col:
                mod.write(" " * (scol - last_col))
            if toktype == token.STRING and prev_toktype == token.INDENT:
                # Docstring
                mod.write("#--")
            elif toktype == tokenize.COMMENT:
                # Comment
                mod.write("##\n")
            else:
                mod.write(ttext)
            prev_toktype = toktype
            last_col = ecol
            last_lineno = elineno
    
    if __name__ == '__main__':
        do_file(sys.argv[1])
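
    Saved as, say, strip_docstrings.py (the file name is arbitrary), the script takes a single source file as its only argument and writes the stripped result next to the original, so running it on module.py produces module.py,strip.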
    

    I'm leaving stub comments in place of docstrings and comments, since it simplifies the code. If you remove them completely, you also have to get rid of the indentation before them.
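
    If you do want to remove them outright, a rough sketch of the extra bookkeeping might look like the following (Python 3 spelling, using io.StringIO and returning a string instead of writing a file; treat it as an untested illustration, not a drop-in solution). It drops the docstring/comment tokens entirely and then filters out any lines that end up blank, which takes care of the leftover indentation. It keeps the same docstring heuristic as above, so a statement that merely begins with a string literal, or a function whose body is only its docstring, will still come out broken:

    import io, token, tokenize

    def strip_source(source_text):
        """ Sketch: remove comments and docstrings outright instead of stubbing them. """
        out = []
        prev_toktype = token.INDENT
        last_lineno = -1
        last_col = 0
        tokgen = tokenize.generate_tokens(io.StringIO(source_text).readline)
        for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
            if slineno > last_lineno:
                last_col = 0
            if scol > last_col:
                out.append(" " * (scol - last_col))
            if toktype == token.STRING and prev_toktype in (token.INDENT, token.NEWLINE):
                pass                      # docstring: emit nothing
            elif toktype == tokenize.COMMENT:
                pass                      # comment: emit nothing
            else:
                out.append(ttext)
            prev_toktype = toktype
            last_col = ecol
            last_lineno = elineno
        # throw away lines that are now blank; this removes the dangling indentation
        # (it also drops lines that were blank to begin with)
        return "".join(line for line in "".join(out).splitlines(True) if line.strip())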