python python-sphinx docutils

Convert reStructuredText to plain text programmatically in Python


Say I have some reStructuredText source in a string:

# Sample reStructuredText input: a section title, one paragraph, and a
# ``code-block`` directive. The literal begins and ends with blank lines.
source = """
============
Introduction
============

Hello world.

.. code-block:: bash

    $ echo Greetings.


"""

import sys

import docutils.nodes
import docutils.parsers.rst
import docutils.utils
import sphinx.writers.text
import sphinx.builders.text

def parse_rst(text: str) -> docutils.nodes.document:
    """Parse reStructuredText source into a docutils document tree.

    Args:
        text: reStructuredText markup to parse.

    Returns:
        The document node created for ``text`` and populated by the
        reStructuredText parser.
    """
    # Import explicitly instead of relying on another docutils module having
    # loaded ``docutils.frontend`` as a side effect.
    from docutils import frontend

    parser_class = docutils.parsers.rst.Parser
    # ``OptionParser`` is deprecated since Docutils 0.19; prefer its
    # replacement and fall back to the old API only on older releases.
    get_default_settings = getattr(frontend, "get_default_settings", None)
    if get_default_settings is not None:
        settings = get_default_settings(parser_class)
    else:
        settings = frontend.OptionParser(
            components=(parser_class,)
        ).get_default_values()
    document = docutils.utils.new_document('<rst-doc>', settings=settings)
    parser_class().parse(text, document)
    return document

# Script entry point: parse the sample reST string into a document tree.
# Nothing is rendered here; the parsed tree is only bound to ``document``.
if __name__ == '__main__':
    document = parse_rst(source)

I'd like to convert it into plain text without the reST markup using Python.

I tried to use sphinx.builders.text.TextBuilder but it seems to want an App object, not a string.



Solution

  • This code works, but it relies on some hacks, such as setting a fake config dir; there may be a better way.

    import sys
    import textwrap
    import types
    
    import docutils.nodes
    import docutils.parsers.rst
    import docutils.utils
    import sphinx.writers.text
    import sphinx.builders.text
    import sphinx.util.osutil
    
    
    def parse_rst(text: str) -> docutils.nodes.document:
        """Parse reStructuredText source into a docutils document tree.

        Args:
            text: reStructuredText markup to parse.

        Returns:
            The document node created for ``text`` and populated by the
            reStructuredText parser.
        """
        # Import explicitly instead of relying on another docutils module
        # having loaded ``docutils.frontend`` as a side effect.
        from docutils import frontend

        parser_class = docutils.parsers.rst.Parser
        # ``OptionParser`` is deprecated since Docutils 0.19; prefer its
        # replacement and fall back to the old API only on older releases.
        get_default_settings = getattr(frontend, "get_default_settings", None)
        if get_default_settings is not None:
            settings = get_default_settings(parser_class)
        else:
            settings = frontend.OptionParser(
                components=(parser_class,)
            ).get_default_values()
        document = docutils.utils.new_document("<rst-doc>", settings=settings)
        parser_class().parse(text, document)
        return document
    
    
    # Script entry point: parse a sample reST document, render it with
    # Sphinx's text translator, and print the result.
    if __name__ == "__main__":
        # Sample reST input; textwrap.dedent strips the common leading
        # indentation so the markup starts at column 0.
        source = textwrap.dedent(
            """\
        ============
        Introduction
        ============
    
        Hello world.
    
        .. code-block:: bash
    
            $ echo Greetings.
    
    
        """
        )
    
        document = parse_rst(source)
    
        # Stand-in for a Sphinx application object: a plain namespace exposing
        # only the attributes set below, so no real Sphinx project is needed.
        # NOTE(review): which attributes TextBuilder/TextTranslator actually
        # read depends on the installed Sphinx version -- confirm on upgrade.
        app = types.SimpleNamespace(
            srcdir=None,
            confdir=None,
            outdir=None,
            doctreedir="/",  # placeholder value (hack), not a real doctree dir
            config=types.SimpleNamespace(
                text_newlines="native",
                text_sectionchars="=",
                text_add_secnumbers=False,
                text_secnumber_suffix=".",
            ),
            tags=set(),
            registry=types.SimpleNamespace(
                # Ignores its first two arguments and always builds a
                # TextTranslator for the ``document`` parsed above.
                create_translator=lambda self, something, new_builder: sphinx.writers.text.TextTranslator(
                    document, new_builder
                )
            ),
        )
    
        builder = sphinx.builders.text.TextBuilder(app)
    
        # The translator is constructed directly here rather than through
        # app.registry.create_translator.
        translator = sphinx.writers.text.TextTranslator(document, builder)
    
        # Traverse the document with the translator; the rendered text is then
        # read from translator.body below.
        document.walkabout(translator)
    
        print(translator.body)
    

    Output:

        Introduction
        ============
    
        Hello world.
    
           $ echo Greetings.