google-apps-script

Is there a function or example for converting an HTML string to plain text (without the HTML tags) using Google Apps Script?


In JavaScript this solution would do the job:

/**
 * Extracts the text of an HTML string by letting the browser DOM parse it.
 *
 * Relies on the global `document`, so it only works where a DOM is available.
 *
 * @param {string} html - Markup to reduce to its text.
 * @returns {string} The element's textContent, else its innerText, else "".
 */
function strip(html)
{
   var container = document.createElement("DIV");
   container.innerHTML = html;

   // Prefer textContent; fall back to innerText only when it is empty/missing.
   var text = container.textContent;
   if (!text) {
      text = container.innerText;
   }
   return text ? text : "";
}

However, to my knowledge, the `document` object is not available in Google Apps Script. Is there another way to parse HTML and display it as plain text in Google Apps Script?

I have tried using the

// Attempted approach: the returned content still contains the markup, tags included.
HtmlService.createHtmlOutput('<b>Hello, world!</b>').getContent();

However this just displays the text with all the tags.

My expectation would be that input of

'<b>Hello, world!</b>'

Would output

'Hello, world!'

Solution

  • The html tags can be removed in two different ways:

    1. Reg Exp - Regular Expression
    2. Converting the HTML to XML and using XmlService to get every element, and then getting the value of each element

    The Reg Exp is better because you don't need to find every HTML element, which requires a lot more code.

    The HTML must first be converted to XML so that XmlService.getPrettyFormat() can be used. If the html tags were removed first with a Regular Expression, then the code wouldn't know where the line breaks were supposed to be.

    Using XmlService.getPrettyFormat() will format the html with line breaks. But to use XmlService, the html string must first be converted to XML. And there are a couple of things that you need to do when converting the html string to XML in order to avoid errors.

    /**
     * Demonstrates turning a sample HTML fragment into plain text by parsing it
     * as XML, pretty-printing it, and then stripping every tag with a regular
     * expression. The resulting text is written to the Apps Script log.
     */
    function parseHtml() {

      var sampleMarkup = 'This is just a Test<br><br>Here is my List<br>\
        <ol><li>one</li><li>Two</li><li>Three</li></ol><br>And a bulleted one<br><ul>\
        <li>Bullet One</li><li>Bullet Two</li><li>Bullet Three</li></ul>';

      // A wrapping <div> avoids the "Content is not allowed in prolog." error,
      // and the <br> tags are dropped to avoid an error when parsing to XML.
      var xmlSource = ('<div>' + sampleMarkup + '</div>').replace(/<br>/g,"");

      var parsedDoc = XmlService.parse(xmlSource);
      var prettyXml = XmlService.getPrettyFormat().format(parsedDoc);

      // Remove everything that looks like a tag, leaving only the text.
      var plainText = prettyXml.replace(/<[^>]*>/g,"");
      Logger.log(plainText);
    }
    

    Another way to do it, provided purely as a learning example, is to parse the HTML as XML with XmlService and then loop through all the elements. The following code only goes down through a couple of layers of children.

    /**
     * Learning example: extracts the text of a sample HTML fragment by parsing
     * it as XML and walking the root element's content.
     *
     * Only a couple of layers are visited: each top-level content node, and for
     * element nodes their direct children. Each collected value is followed by
     * a newline.
     *
     * @returns {string} The collected text (also written to the Apps Script log).
     */
    function parseHtml() {

      var html = 'This is just a Test<br><br>Here is my List<br>\
        <ol><li>one</li><li>Two</li><li>Three</li></ol><br>And a bulleted one<br><ul>\
        <li>Bullet One</li><li>Bullet Two</li><li>Bullet Three</li></ul>';

      // A single wrapping element avoids the "Content is not allowed in prolog." error.
      html = '<div>' + html + '</div>';
      // <br> tags are removed to avoid an error when parsing to XML.
      html = html.replace(/<br>/g,"");

      var allText = "";

      var root = XmlService.parse(html).getRootElement();
      var content = root.getAllContent();

      for (var i = 0; i < content.length; i++) {
        var node = content[i];
        if (!node) {continue;}

        // Compare against the service's enum; the node's own type has no
        // reliable ELEMENT member to compare with.
        if (node.getType() !== XmlService.ContentTypes.ELEMENT) {
          allText = allText + node.getValue() + "\n";
          continue;
        }

        var element = node.asElement();
        // Recomputed for every node so a count from a previous element can
        // never be reused by mistake.
        var children = element.getChildren() || [];

        if (children.length === 0) {
          allText = allText + element.getValue() + "\n";
          continue;
        }

        for (var j = 0; j < children.length; j++) {
          var childText = children[j].getValue();
          if (!childText) {
            continue;
          }
          allText = allText + childText + "\n";
        }
      }

      // Surface the result; otherwise the collected text is discarded.
      Logger.log(allText);
      return allText;
    }