javascriptregexdomsearchhighlight

How to search for and highlight matching text/phrases within the entire document-body's text-content?


I want to run a regex replacement on only the text content (innerText) of an HTML document and, in the end, keep all of its HTML elements (or restore them as they were).

The regex must not check the HTML elements, but only the text content inside the HTML (innerText, textContent etc..)

Made-up example, a "dialogue highlighter"

string:

<html>
<body>
    <h1>Hello, World!</h1>
    <p id="aaaa";>"Omae wa moshindeiru."</p>
    <p id="aaaa";>"Naani!"</p>
</body>
</html>

Javascript:

// Look up the <body> element whose text is going to be processed.
element = document.querySelector('body');
// Wrap every quoted phrase of the element's text in temporary `€€` / `××`
// markers. NOTE: the markup is lost with this assignment to `innerText` -
// the listed "Actual output" no longer contains the <h1>/<p> elements.
element.innerText = element.innerText
.replace(/[\"“”]([^\"”“\n]+)[\"”“]/g, '\"€€$1××\"');
// Swap the temporary markers for the opening and closing <span> tags.
element.innerHTML = element.innerHTML
.replace(/€€/g, '<span style="color: red">')
.replace(/××/g, '<\/span>')

expected output:

<html>
<body>
    <h1>Hello, World!</h1>
    <p id="aaaa";>"<span style="color: red;">Omae wa moshindeiru.</span>"</p>
    <p id="aaaa";>"<span style="color: red;">Naani!</span>"</p>
</body>
</html>

Actual output:

<html>
<body>
Hello, World!<br><br>"<span style="color: red;">Omae wa moshindeiru.</span>"<br><br>"<span style="color: red;">Naani!</span>"
</body>
</html>

Yes, I know I could adapt the regex, but no. I just want to act on the text content and then restore the lost HTML elements.


Solution

  • Quoting myself from the above comment ...

    An exclusively text-only, regex/replace-based solution is not the right tool for this kind of task. What you need is a combination of tree-walking (on the DOM) and regex-based tests in order to capture all matching text-nodes. From there you need node-methods in order to create and insert the correct mix of matching text-content and each of its new, enclosing <span/> elements.

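    The provided example code implements exactly two functions: the tree-walker `collectEveryTextNode`, which collects every text-node, and `replaceWithMatchingMarkerFragment`, which replaces a matching text-node with a fragment of text-nodes and marker elements.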

    /**
     *  - Matches a phrase which is enclosed by straight or
     *    typographic double quotes.
     *  - The closing quote is matched by its own character class
     *    and not by a `\k<quote>` backreference. A backreference
     *    demands the very same character which opened the phrase
     *    and therefore never matches a typographic pair like “...”.
     */
    const regXQuotedPhrase = /(?<quote>["“”])(?<phrase>[^"“”]+)["“”]/;
    
    // every text-node beneath `<body>` which contains at least one quoted phrase.
    const matchingTextNodes = collectEveryTextNode(document.body)
     .filter(({ nodeValue }) => regXQuotedPhrase.test(nodeValue));
    
    // log the raw text of each matching text-node before the DOM gets changed.
    console.log({ 
      matchingTextContents: matchingTextNodes
        .map(({ nodeValue }) => nodeValue)
    });
    
    matchingTextNodes
      .forEach(replaceWithMatchingMarkerFragment, {
        /**
         *  - `forEach`'s 2nd `thisArg` parameter gets 
         *    used as config, where one can provide the
         *    matching criteria as regular expression and
         *    a custom element node too which wraps itself
         *    as marker around each matching text-fragment.
         */
        regX: regXQuotedPhrase,
        node: document.createElement('mark'),
      });
    /* Colors of every `mark` element: dark blue text on a yellow background. */
    mark {
      color: #006;
      background-color: #ff0;
    }
    /*
     * Overrides `left` and sizes the element to 55% width and full height.
     * NOTE(review): presumably targets the console panel of the snippet
     * runner this example was written for - confirm.
     */
    .as-console-wrapper {
      left: auto!important;
      width: 55%;
      min-height: 100%;
    }
    <h1>Hello, World!</h1>
    
    <p id="aaaa">   Foo ... "Omae wa moshindeiru." ... bar.   </p>
    <p id="aaaa">bar ... "Naani!" ... baz ... "Naani!" ... biz.</p>
    
    
    <script>
    /**
     *  - The **tree-walker**: gathers, in document order, every
     *    text-node found at or beneath the passed value.
     *  - Accepts a single (element/text) node as well as a
     *    node-list/collection; a falsy value yields an empty array.
     *  - Only element children are descended into, any other
     *    node type (e.g. a comment) gets ignored.
     *
     * @param {Node|NodeList|Array<Node>} nodeOrCollection
     * @returns {Array<Node>} the collected text-nodes.
     */
    function collectEveryTextNode(nodeOrCollection) {
      const { ELEMENT_NODE, TEXT_NODE } = Node;
      const source = nodeOrCollection || {};

      // a text-node is its own and only result.
      if (source.nodeType === TEXT_NODE) {
        return [source];
      }
      const textNodes = [];

      // a node contributes its children, a collection contributes itself.
      for (const child of Array.from(source.childNodes ?? source)) {
        const { nodeType } = child;

        if (nodeType === TEXT_NODE) {
          textNodes.push(child);
        } else if (nodeType === ELEMENT_NODE) {
          // self recursive call for the element's own subtree.
          for (const nestedTextNode of collectEveryTextNode(child)) {
            textNodes.push(nestedTextNode);
          }
        }
      }
      return textNodes;
    }
    
    /**
     *  - The `this` context-aware function which replaces the
     *    passed text-node with a document-fragment. The fragment
     *    holds, in original order, a clone of the marker node
     *    around each matching phrase and plain text-nodes for
     *    all the text in between.
     *  - The config is read from `this` (e.g. via `forEach`'s
     *    2nd `thisArg` parameter): `regX` is the matching
     *    criteria, `node` is the element which gets cloned as
     *    marker for every match.
     *  - Works for `g`/`y` flagged regular expressions too and
     *    stops at an empty match; the remaining text then is
     *    kept as plain text.
     *
     * @this {{ regX: RegExp, node: Node }}
     * @param {Text} textNode - the text-node which gets replaced.
     */
    function replaceWithMatchingMarkerFragment(textNode) {
      const { regX, node: markerNode } = this;
      const fragment = document.createDocumentFragment();

      let text = textNode.nodeValue;

      while (text) {
        // `exec` of a `g`/`y` flagged regex resumes at `lastIndex`,
        // but each iteration searches a freshly sliced remainder,
        // thus every search has to start at index zero.
        regX.lastIndex = 0;

        const regXResult = regX.exec(text);
        const matchingPhrase = regXResult?.[0];

        // either no further match, or an empty match which would
        // never consume any text and therefore would loop forever.
        if (!matchingPhrase) {
          break;
        }
        const { index } = regXResult;
        const prePhrase = text.slice(0, index);

        if (prePhrase) {
          fragment.appendChild(
            document.createTextNode(prePhrase),
          );
        }
        const elmNode = markerNode.cloneNode(true);

        elmNode.appendChild(
          document.createTextNode(matchingPhrase),
        );
        fragment.appendChild(elmNode);

        // continue with whatever follows the current match.
        text = text.slice(index + matchingPhrase.length);
      }
      // do not leak a search position into the caller's regex.
      regX.lastIndex = 0;

      if (text) {
        // equals a `postPhrase`.
        fragment.appendChild(
          document.createTextNode(text),
        );
      }
      textNode.replaceWith(fragment);
    }
    </script>