Tags: javascript, cloudflare, cloudflare-workers

Access nested elements in HTMLRewriter - Cloudflare Workers


I have to access a nested element using HTMLRewriter in a Cloudflare worker.

Example

<div data-code="ABC">
   <div class="title">Title</div>
   <div class="price">9,99</div>
</div>
<div data-code="XYZ">
   <div class="title">Title</div>
</div>

I was thinking about using multiple .on() calls, but the pairing between results is lost because some .price elements are missing, so I cannot correctly merge the results from a codeHandler and a priceHandler.

// Registers two independent handlers on a single rewriter: one for every
// element carrying a data-code attribute and one for every .price element.
// The transformed response body is then consumed via arrayBuffer().
await new HTMLRewriter().on("[data-code]", codeHandler)
                        .on(".price", priceHandler)
                        .transform(response).arrayBuffer()

I was thinking about iterating new HTMLRewriter() multiple times but the readable stream is locked.

Current code

Worker

/**
 * Handler object passed to HTMLRewriter's .on(); it collects one record per
 * matched element in `this.values`.
 */
class codeHandler {
    constructor() {
        // Accumulates one { code, title, price } object per element() call.
        this.values = []
    }

    // Receives the matched element, reads its data-code attribute and tries
    // to read the text of the nested .title / .price elements.
    // The trailing `<--` / `<--- HERE` arrows are the asker's annotations
    // pointing at the problematic lines, not JavaScript.
    // NOTE(review): HTMLRewriter elements reportedly do not expose child
    // access such as querySelector()/innerText - confirm against the docs.
    element(element) {
        let data = {
            code: element.getAttribute("data-code"),
            title: element.querySelector(".title").innerText, <--
            price: element.querySelector(".price").innerText, <--- HERE
        }
        this.values.push( data )
    }
}


const url = "https://www.example.com"

/**
 * Fetches `url`, streams the HTML through an HTMLRewriter that feeds a
 * `codeHandler` instance, and returns the collected values as JSON.
 *
 * @returns {Promise<Response>} JSON response holding the handler's `values`.
 */
async function handleRequest() {
  const response = await fetch(url)

  // The local must NOT be named `codeHandler`: a `const codeHandler` declared
  // here shadows the class for the whole function body, so
  // `new codeHandler()` would read the local binding inside its temporal dead
  // zone and throw a ReferenceError before any request is processed.
  const handler = new codeHandler()

  // Reading the transformed body to completion is what drives the handler
  // callbacks; the resulting buffer itself is not needed.
  await new HTMLRewriter().on("[data-code]", handler).transform(response).arrayBuffer()

  console.log(handler.values)

  const json = JSON.stringify(handler.values, null, 2)

  return new Response(json, {
    headers: {
      "content-type": "application/json;charset=UTF-8"
    }
  })
}

// Worker entry point: every incoming fetch event is answered with the
// promise produced by handleRequest().
addEventListener("fetch", (fetchEvent) => fetchEvent.respondWith(handleRequest()))

Solution

  • After quickly looking at the documentation, it seems the element objects in the HTMLRewriter API are currently limited, and can't access children the way that you'd like.

    It does seem that the handlers are run in the same order that they appear in the document, which means that for your use case, you can keep track of the current element and add data in that context. You can use different handlers to access the same data by using closures (in this case values), like so:

    // Worker entry point: hand the incoming request to handleRequest() and
    // respond with whatever it resolves to.
    addEventListener("fetch", (fetchEvent) => {
      const reply = handleRequest(fetchEvent.request);
      fetchEvent.respondWith(reply);
    });
    
    /**
     * Fetches the requested page and scrapes every [data-code] element into a
     * record, returning the list of records as pretty-printed JSON.
     *
     * The three handlers share `values` through the closure; each text chunk
     * is attached to the most recently opened [data-code] record.
     *
     * @param {Request} request - Incoming request, forwarded to fetch().
     * @returns {Promise<Response>} JSON array of the collected records.
     */
    async function handleRequest(request) {
      const values = [];
      const upstream = await fetch(request);

      // Appends `chunk` to property `key` of the newest record, if any.
      const appendToCurrent = (key, chunk) => {
        const current = values[values.length - 1];
        if (current === undefined) {
          // No [data-code] element has been opened yet, so there is no
          // record to attach this text to.
          return;
        }
        // Text can arrive split over several chunks, so concatenate onto
        // whatever has already been stored for this key.
        current[key] = (current[key] || '') + chunk;
      };

      // Opens a new record per [data-code] element and collects its text.
      const codeHandlers = {
        element(matched) {
          values.push({ code: matched.getAttribute("data-code") });
        },
        text(chunk) {
          appendToCurrent('body', chunk.text);
        },
      };
      const titleHandlers = {
        text(chunk) {
          appendToCurrent('title', chunk.text);
        },
      };
      const priceHandlers = {
        text(chunk) {
          appendToCurrent('price', chunk.text);
        },
      };

      const rewriter = new HTMLRewriter()
        .on("[data-code]", codeHandlers)
        .on("[data-code] .title", titleHandlers)
        .on("[data-code] .price", priceHandlers);

      // Draining the transformed body is what actually runs the handlers.
      await rewriter.transform(upstream).arrayBuffer();

      const headers = { "content-type": "application/json;charset=UTF-8" };
      return new Response(JSON.stringify(values, null, 2), { headers });
    }
    

    Also note that you can use a descendant combinator (e.g. [data-code] .title) to ensure that the only .title divs that are processed are descendants of the elements with [data-code].