
XSLT 2 or 3: How to convert <!-- COMMENT --> ... <!-- /COMMENT --> pairs into an encapsulating element in the output markup

I need to convert non-compliant HTML into XML so that I can process thousands of documents into the required JSON format with XSLT.

          <!-- IMAGECOUNT -->
      <table width="100%">
            <td colspan="1" rowspan="1">
2 Images
      <!-- /IMAGECOUNT -->
<!-- SECTION -->
The section 
<!-- /SECTION -->
<!-- COUNTRY -->
The country
<!-- /COUNTRY -->

      <!-- DATE -->
         <font size="-1">
            <b>Date Posted: 09-Dec-2019</b>
      <!-- /DATE -->
      <!-- TEXT -->
            <font size="-1">
Just some text
      <!-- /TEXT -->
      <!-- TOP THUMBNAILS -->
      <table class="tabletopbottom" width="100%">
            <td colspan="1" rowspan="1">
               <img src="images/s1353556.jpg" alt="Cat"/>
               <img src="images/s1164352.jpg" alt="Dog"/>
      <!-- /TOP THUMBNAILS -->

I need to add structure to the HTML so that I can use another XSLT to remove all the elements that are not important for the final JSON.

This seems like an xsl:for-each-group problem using group-starting-with, but I can't get the logic to gobble up the following siblings until the next matching end comment is found.

Start Comment

  <!-- IMAGECOUNT -->
    Lots of content that needs to be children of this new element
  <!-- /IMAGECOUNT -->

Here is my latest, unsuccessful attempt:

<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="3.0">

  <!-- Identity transform template: copies every attribute and node and
       recurses into its attributes and children. -->
  <xsl:template match="@* | node()">
    <xsl:copy>
      <xsl:apply-templates select="@* | node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- Match opening comments and create elements.
       A comment whose normalized text starts with '/' is treated as a closing
       marker and produces no output; any other comment becomes an element
       named after the comment text with single spaces removed. -->
  <xsl:template match="comment()" priority="10">
    <xsl:variable name="commentContent" select="normalize-space(.)"/>
    <xsl:message>Processing comment: <xsl:value-of select="$commentContent"/></xsl:message>
    <xsl:choose>
      <xsl:when test="starts-with($commentContent, '/')">
        <xsl:message>   Ignoring closing comment: <xsl:value-of select="$commentContent"/></xsl:message>
      </xsl:when>
      <xsl:otherwise>
        <xsl:message>Creating element for comment: <xsl:value-of select="$commentContent"/></xsl:message>
        <xsl:element name="{replace($commentContent, ' ', '')}">
          <!-- Only the immediately following sibling is processed here, and
               only when it is not itself a comment; later siblings up to the
               closing comment are not pulled into the new element. -->
          <xsl:apply-templates select="following-sibling::node()[1][not(self::comment())]"/>
        </xsl:element>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>

</xsl:stylesheet>


  • A first prototype to approach this with for-each-group group-starting-with/group-ending-with is

    <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
      version="3.0">

      <xsl:output method="html" indent="no" html-version="5"/>

      <!-- Anything not matched by an explicit template is copied as is. -->
      <xsl:mode on-no-match="shallow-copy"/>

      <!-- For every element that has comment children: wrap the nodes between
           an opening comment (e.g. IMAGECOUNT) and its closing comment
           (e.g. /IMAGECOUNT) in an element named after the opening comment. -->
      <xsl:template match="*[comment()]">
        <xsl:copy>
          <xsl:apply-templates select="@*"/>
          <!-- Start a new group at each comment that does not look like a
               closing marker (a slash followed by letters). -->
          <xsl:for-each-group select="node()" group-starting-with="comment()[not(matches(., '\s*/[a-z]+', 'i'))]">
            <xsl:choose>
              <xsl:when test="self::comment()">
                <!-- Comment text with leading and trailing whitespace removed. -->
                <xsl:variable name="comment-name" select="replace(., '^\s+|\s+$', '')"/>
                <!-- Drop the opening comment itself, then split the remainder
                     at the comment that contains '/' plus the opening name. -->
                <xsl:for-each-group select="tail(current-group())" group-ending-with="comment()[matches(., '/' || $comment-name)]">
                  <xsl:choose>
                    <xsl:when test="current-group()[last()][self::comment()]">
                      <!-- Group ends with a comment: emit the wrapper element
                           (whitespace stripped from the name) around everything
                           except that trailing comment. -->
                      <xsl:element name="{replace($comment-name, '\s+', '')}">
                        <xsl:apply-templates select="current-group()[not(position() = last())]"/>
                      </xsl:element>
                    </xsl:when>
                    <xsl:otherwise>
                      <!-- Group does not end with a comment: process its
                           nodes without a wrapper. -->
                      <xsl:apply-templates select="current-group()"/>
                    </xsl:otherwise>
                  </xsl:choose>
                </xsl:for-each-group>
              </xsl:when>
              <xsl:otherwise>
                <!-- Nodes before the first opening comment. -->
                <xsl:apply-templates select="current-group()"/>
              </xsl:otherwise>
            </xsl:choose>
          </xsl:for-each-group>
        </xsl:copy>
      </xsl:template>

    </xsl:stylesheet>

    Online fiddle with Saxon HE Java in the browser.