javascript html shadow-dom native-web-component

How would one extract HTML client-side from a child shadow DOM node, while including other shadow-root elements within?


Per the question, how should one go about completely extracting the client-side HTML code from an encapsulated shadow DOM node which also contains further nested child shadow nodes?

For reference: other questions on Stack Overflow I've visited which had answers that did not help: 1 2 3 4 5 6

I initially used the following code, but it failed to scrape any of the nested shadow DOMs:

/**
 * Builds an HTML string for `element`, descending into shadow roots.
 *
 * A node that hosts a shadow root is written as its own tag (with attributes)
 * wrapped around the serialized children of that shadow root; the host's
 * light-DOM children are not visited. Any other node contributes its
 * `nodeValue`, or failing that its `outerHTML` — and `outerHTML` does not
 * include shadow roots attached to descendants, so hosts nested below a
 * non-host element are not expanded.
 *
 * @param {Node} element - Node to serialize.
 * @returns {string} The collected markup.
 */
const getDeepShadowDomHtml = (element) => {
    const serialize = (node) => {
        // Non-host nodes: text value first, then the element's own markup
        if (!node.shadowRoot) {
            return `${node.nodeValue || node.outerHTML || ''}`;
        }

        const tag = node.tagName.toLowerCase();

        let attributeText = '';
        for (const { name, value } of Array.from(node.attributes)) {
            attributeText += ` ${name}="${value}"`;
        }

        // Only the shadow tree is walked for a host
        let shadowHtml = '';
        for (const shadowChild of Array.from(node.shadowRoot.childNodes)) {
            shadowHtml += serialize(shadowChild);
        }

        return `<${tag}${attributeText}>${shadowHtml}</${tag}>`;
    };

    return serialize(element);
};

/**
 * Walks down through nested shadow roots, one selector per level.
 *
 * `selector` is split on spaces; each part is looked up with `querySelector`
 * inside the shadow root of the element found by the previous part, starting
 * from `startElement`.
 *
 * @param {Element|null} startElement - Outermost shadow host to start from.
 * @param {string} selector - Space-separated selectors, one per shadow level.
 * @returns {Element|null} The element matched by the last selector, or `null`
 *   when the start element is missing, a level has no open shadow root, or a
 *   selector matches nothing.
 */
const findNestedShadowRoot = (startElement, selector) => {
    let element = startElement;
    const selectors = selector.split(' ').filter(Boolean);

    for (const sel of selectors) {
        // A missing element or a host without an open shadow root ends the
        // walk with "not found" instead of a TypeError on `.shadowRoot`.
        if (!element || !element.shadowRoot) {
            return null;
        }
        element = element.shadowRoot.querySelector(sel);
    }

    return element || null;
};

// Resolve the outermost host first: document.querySelector returns null when
// nothing matches, and the shadow-root walk dereferences its start element.
const shadowHost = document.querySelector('shadow-host-selector');
const behaviorTabElement = shadowHost
    ? findNestedShadowRoot(shadowHost, 'first-shadow-selector second-shadow-selector #behaviourtab')
    : null;

if (behaviorTabElement) {
    const shadowDomContent = getDeepShadowDomHtml(behaviorTabElement);
    console.log(shadowDomContent); // Verify content before download

    // Offer the collected markup as a file download through a temporary link
    const blob = new Blob([shadowDomContent], { type: 'text/html' });
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = 'behaviorTabShadowDOMContent.html';
    document.body.appendChild(a);
    a.click();

    // Clean up: take the temporary link back out of the page and release the blob URL
    document.body.removeChild(a);
    URL.revokeObjectURL(url);
} else {
    console.log("The #behaviourtab element was not found.");
}

Solution

  • The following code worked for me. Comments are included in the code.

    // jQuery for good measure: injects the CDN build into the page's <head>.
    // Nothing in the visible snippet calls into jQuery.
    const script = document.createElement('script');
    script.src = 'https://code.jquery.com/jquery-latest.min.js';
    document.head.appendChild(script);
    
    // Elements that are serialized without an end tag. Emitting e.g. `</br>`
    // would be re-parsed as an additional <br>.
    const VOID_ELEMENTS = new Set([
      'area', 'base', 'basefont', 'bgsound', 'br', 'col', 'embed', 'frame', 'hr',
      'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
    ]);

    // Elements whose text children are written literally; entity-escaping
    // inside them would corrupt CSS/JS (e.g. the `>` child combinator).
    const RAW_TEXT_ELEMENTS = new Set([
      'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript', 'plaintext',
    ]);

    // Escapes characters that would otherwise be parsed as markup in text content
    const escapeHtmlText = (text) =>
      String(text).replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');

    // Escapes a value for use inside a double-quoted attribute
    const escapeHtmlAttribute = (value) => escapeHtmlText(value).replace(/"/g, '&quot;');

    /**
     * Serializes the children of `element` to an HTML string, expanding every
     * open shadow root encountered along the way.
     *
     * For each element the output is: start tag with attributes, then the
     * serialized content of its shadow root (if any), then its light-DOM
     * children, then the end tag (omitted for void elements).
     *
     * @param {ShadowRoot|Element|null} element - Root whose child nodes are serialized.
     * @returns {string} The serialized markup; an empty string when `element` is null/undefined.
     */
    const captureShadowDom = (element) => {
      // A host without an open shadow root hands us null here
      if (!element) {
        return '';
      }

      let htmlContent = '';

      // `isRawText` is true when the node's parent is a raw-text element
      const processNode = (node, isRawText) => {
        if (node.nodeType === Node.ELEMENT_NODE) {
          const tagName = node.tagName.toLowerCase();
          let nodeHtml = `<${tagName}`;

          // Captures element attributes, escaped so quotes cannot end the value early
          for (const attr of node.attributes) {
            nodeHtml += ` ${attr.name}="${escapeHtmlAttribute(attr.value)}"`;
          }
          nodeHtml += '>';

          // Void elements have neither content nor an end tag
          if (VOID_ELEMENTS.has(tagName)) {
            return nodeHtml;
          }

          // Checks for shadow DOM to recursively capture its content
          if (node.shadowRoot) {
            nodeHtml += captureShadowDom(node.shadowRoot);
          }

          // Captures the children of this element
          const childIsRawText = RAW_TEXT_ELEMENTS.has(tagName);
          for (const child of node.childNodes) {
            nodeHtml += processNode(child, childIsRawText);
          }

          nodeHtml += `</${tagName}>`;
          return nodeHtml;
        }

        // Comments keep their markup instead of leaking into visible text
        if (node.nodeType === Node.COMMENT_NODE) {
          return `<!--${node.nodeValue || ''}-->`;
        }

        // Text or other node types
        const value = node.nodeValue || '';
        return isRawText ? value : escapeHtmlText(value);
      };

      // Starts processing the root element
      for (const child of element.childNodes) {
        htmlContent += processNode(child, false);
      }

      return htmlContent;
    };
    
    /**
     * Finds the further nested element by walking one shadow root per selector.
     *
     * `selector` is split on spaces; each part is looked up with `querySelector`
     * inside the shadow root of the element found by the previous part, starting
     * from `startElement`.
     *
     * @param {Element|null} startElement - Outermost shadow host to start from.
     * @param {string} selector - Space-separated selectors, one per shadow level.
     * @returns {Element|null} The element matched by the last selector, or `null`
     *   when the start element is missing, a level has no open shadow root, or a
     *   selector matches nothing.
     */
    const findNestedShadowRoot = (startElement, selector) => {
      let element = startElement;
      const selectors = selector.split(' ').filter(Boolean);

      for (const sel of selectors) {
        // A missing element or a host without an open shadow root ends the
        // walk with "not found" instead of a TypeError on `.shadowRoot`.
        if (!element || !element.shadowRoot) {
          return null;
        }
        element = element.shadowRoot.querySelector(sel);
      }

      return element || null;
    };
    
    // Update with the correct shadow DOM hierarchy.
    // The outermost host is resolved first: document.querySelector returns null
    // when nothing matches, and the shadow-root walk dereferences its start element.
    const shadowHost = document.querySelector('shadow-host-selector');
    const behaviorTabElement = shadowHost
      ? findNestedShadowRoot(shadowHost, 'first-shadow-selector second-shadow-selector main-selector')
      : null;

    console.log(behaviorTabElement); // Check if the element is found

    if (!behaviorTabElement) {
      console.log("The element was not found.");
    } else if (!behaviorTabElement.shadowRoot) {
      // Without an open shadow root there is nothing for captureShadowDom to walk
      console.log("The element has no open shadow root to capture.");
    } else {
      const shadowDomContent = captureShadowDom(behaviorTabElement.shadowRoot);
      console.log(shadowDomContent); // Debug output

      // Offer the collected markup as a file download through a temporary link
      const blob = new Blob([shadowDomContent], {
        type: 'text/html'
      });
      const url = URL.createObjectURL(blob);
      const a = document.createElement('a');
      a.href = url;
      a.download = 'ShadowDOMContent.html';
      document.body.appendChild(a);
      a.click();

      // Clean up: take the temporary link back out of the page and release the blob URL
      document.body.removeChild(a);
      URL.revokeObjectURL(url);
    }

    This may not be the most efficient code for this and I'm not doing this by profession but just for fun, so if anyone has a better working method, feel free to improve on this!