filter lua pandoc

How do I include a markdown file in another markdown file using GLFM in pandoc via a Lua filter?


I am new to pandoc/Lua. I have a markdown file from our Gitlab repository I would like to locally convert using pandoc. The file contains ::include directives.

I have started doing a Lua filter for pandoc that replaces the ::include directives with the actual referred markdown. However, while the RawInline blocks I insert seem to be accepted, the --verbose output indicates that they are not rendered (the part I inserted is missing in the output HTML).

[INFO] Running filter gitlab-include.lua
[INFO] Completed filter gitlab-include.lua in 15 ms
[INFO] Not rendering RawInline (Format "markdown")[Markdown content included here]

The file in ::include{file=./my_file.md} is read correctly. If instead of RawInline I return the string, the unrendered markdown is included in the output. However, I did not manage to get the inserted content to be rendered as markdown.

Any clue as to what the filter is missing? Many thanks in advance.

The Lua filter is included below:

--- Pandoc Lua filter expanding GitLab-style `::include{file=PATH}` directives.
--
-- A directive that stands alone in a paragraph is replaced by the *parsed*
-- content of the referenced Markdown file.
--
-- Why not RawInline('markdown', ...)? Raw elements are only passed through
-- by a writer for the same format, so an HTML writer drops them
-- ("Not rendering RawInline"). The included text has to be turned into real
-- AST nodes with pandoc.read instead. Because a file usually contains block
-- content (headings, lists, ...), the replacement is done at paragraph level
-- rather than on a single Str.

-- A paragraph consisting solely of the directive; captures the file path.
local INCLUDE_PATTERN = "^::include%{file=([^}]+)%}$"

--- Read a whole file into a string.
-- @tparam string filepath path as written in the directive (relative paths
--   are resolved by io.open, i.e. against the current working directory)
-- @treturn[1] string file content
-- @treturn[2] nil
-- @treturn[2] string error message from io.open
local function read_file(filepath)
    local file, err = io.open(filepath, "r")
    if not file then
        return nil, err
    end
    local content = file:read("*a")
    file:close()
    return content
end

--- Replace an include paragraph by the blocks of the included file.
-- @param el a Para or Plain element
-- @return a list of blocks replacing `el`, or nil to leave `el` untouched
local function replace_include(el)
    -- stringify flattens the inlines, so paths containing spaces
    -- (Str, Space, Str ...) are matched as well.
    local filepath = pandoc.utils.stringify(el):match(INCLUDE_PATTERN)
    if not filepath then
        return nil
    end

    local content = read_file(filepath)
    if not content then
        io.stderr:write("Cannot open file: " .. filepath .. "\n")
        -- Keep a visible marker in the output instead of failing the run.
        return {
            pandoc.Para({
                pandoc.Strong({ pandoc.Str("Error: Cannot include file " .. filepath) })
            })
        }
    end

    -- Parse the included text as Markdown and splice its blocks in place of
    -- the directive. Directives inside the included file are not expanded.
    return pandoc.read(content, "markdown").blocks
end

return {
    { Para = replace_include, Plain = replace_include }
}

Solution

  • As far as I could find (and judging by the lack of responses), there does not seem to be a way for Lua filters to do this, so I decided to solve the issue with Python and mark this as solved.

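    The workaround I found is to pre-process the Markdown file in Python (expanding the ::include directives and generating the table of contents) and then convert the result with pandoc via pypandoc.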

    The code I used is provided below. Maybe someone finds a way to do something like this within pandoc, but as for now, this effectively solves my problem :)

    import os
    import re
    
    import pypandoc
    
    # Pre-processes a GitLab-flavored Markdown file such that
    #   - ::include directives are replaced by the content of the referenced file
    #   - [[_TOC_]] is replaced by a generated table of contents
    # The pre-processed file is then converted with pandoc.
    
    # Requires pandoc!!!
    # See https://pypi.org/project/pypandoc/
    
    pandoc_location = r'<pandoc_folder>\pandoc.exe'
    input_file = r'<path_to_your_file.md>'
    to_format = 'html5'
    
    print(f'Setting pandoc location to {pandoc_location}')
    # setdefault: a PYPANDOC_PANDOC value already present in the environment wins.
    os.environ.setdefault('PYPANDOC_PANDOC', pandoc_location)
    
    # The temporary, pre-processed Markdown file is written next to this script.
    current_path = __file__
    current_folder, current_filename = os.path.split(current_path)
    tmp_file = os.path.join(current_folder, 'tmp.md')
    print(f'Using tmp. file {tmp_file}')
    
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding (pandoc itself works with UTF-8 input).
    with open(input_file, 'r', encoding='utf-8') as f:
        input_md = f.read()
    
    print(f'Read {input_file}. Length={len(input_md)}')
    
    # From here on input_file holds only the file name; included paths are
    # resolved relative to input_folder.
    input_folder, input_file = os.path.split(input_file)
    input_base, input_ext = os.path.splitext(input_file)
    
    def _expand_include(include_match):
        """Return the content of the included file, or the untouched directive on failure."""
        include_path = include_match.group(1)
        abs_path = os.path.abspath(os.path.join(input_folder, include_path))
        print(f'Including {abs_path}')
        try:
            with open(abs_path, 'r', encoding='utf-8') as f:
                return f.read()
        except (OSError, UnicodeError) as e:
            print(f'Could not include file: {e}')
            # Leave the directive in place so the problem stays visible.
            return include_match.group(0)
    
    # Only directives at the start of a line are expanded. The path stops at the
    # first closing brace, and exactly the matched directive is replaced.
    input_md = re.sub(
        r'^::include\{file=([^}\n]+)\}',
        _expand_include,
        input_md,
        flags=re.MULTILINE)
    
    # Process ToC
    def slugify(text):
        """Converts heading text into the identifier pandoc generates for it.
    
        Follows the rules of pandoc's ``auto_identifiers`` extension, so that
        the generated ToC links match the anchors in the converted output:
    
        - lowercase everything,
        - drop every character that is not alphanumeric, ``_``, ``-`` or ``.``,
        - turn runs of whitespace into a single hyphen,
        - drop everything before the first letter (an identifier may not start
          with a digit or punctuation),
        - fall back to ``section`` when nothing is left.
    
        Suffixes for duplicate headings (``-1``, ``-2``, ...) are not generated.
        """
        text = text.strip().lower()
        text = re.sub(r'[^\w\s.-]', '', text)
        text = re.sub(r'\s+', '-', text)
        # [\W\d_] is "anything that is not a letter".
        text = re.sub(r'^[\W\d_]+', '', text)
        return text or 'section'
    
    def strip_markdown_links(text):
        """Replaces every inline link ``[label](target)`` by its label only."""
        inline_link = re.compile(r'\[([^\]]+)\]\([^)]+\)')
        return inline_link.sub(lambda found: found.group(1), text)
    
    def extract_headings(markdown):
        """Collects ``(level, text, slug)`` for each ATX heading outside ``` fences.
    
        Link markup in the heading text is reduced to its visible label before
        the slug is computed.
        """
        found = []
        inside_fence = False
    
        for raw_line in markdown.splitlines():
            if raw_line.strip().startswith("```"):
                # A fence line opens or closes a code block and is never a heading.
                inside_fence = not inside_fence
            elif not inside_fence:
                heading = re.match(r'^(#{1,6})\s+(.*)', raw_line)
                if heading is not None:
                    hashes, title = heading.groups()
                    label = strip_markdown_links(title.strip())
                    found.append((len(hashes), label, slugify(label)))
    
        return found
    
    def generate_toc(headings):
        """Renders headings as a nested Markdown bullet list of anchor links.
    
        Each ``(level, text, slug)`` entry becomes one line, indented by two
        spaces per heading level below the first.
        """
        return '\n'.join(
            f"{'  ' * (depth - 1)}- [{label}](#{anchor})"
            for depth, label, anchor in headings
        )
    
    # Replace Gitlab's [[_TOC_]] with the actual ToC
    print(f'Generating ToC from [[_TOC_]]')
    headings_input = extract_headings(input_md)
    toc = generate_toc(headings_input)
    
    # The HTML output seems NOT to like it if the anchor is "#3gppsa2".
    # The number "3" is lost in the HTML conversion. This should remedy this
    # Please note that this "hack" results in the navigation of tmp.md being broken. But the output HTML is OK
    toc = toc.replace('(#3gppsa2', '(#gppsa2')
    
    input_md = input_md.replace('[[_TOC_]]', toc)
    
    with open(tmp_file, 'w') as f:
        f.write(input_md)
    print(f'Wrote {tmp_file}')
    
    print(f'Converting {tmp_file} to {to_format}')
    # CSS from https://jez.io/pandoc-markdown-css-theme/#usage
    # https://github.com/jez/pandoc-markdown-css-theme
    # Fixed title with https://stackoverflow.com/questions/63928077/how-can-i-add-header-metadata-without-adding-the-h1
    # Using markdon-smart to fix wrongly-displayed single-quotes
    output = pypandoc.convert_file(
        source_file='tmp.md',
        to=f'{to_format}',
        extra_args=[
            '--from=markdown-smart',
            '--standalone',
            '--embed-resources=true',
            '--css=theme.css',
            '--html-q-tags=true',
            f'--metadata=title={input_base}',
            '--variable=title='
        ])
    
    match to_format:
        case 'html' | 'html5':
            output_ext = 'html'
        case _:
            output_ext = to_format
    
    output_file = os.path.join(input_folder, f'{input_base}.{output_ext}')
    
    with open(output_file, 'w') as f:
        f.write(output)
    print(f'PyPandoc output saved to: {output_file}')