javascript, html, dom, documentfragment

How to divide a DocumentFragment based on character offset


I have a string that (potentially) contains HTML tags.

I want to split it into smaller valid HTML strings based on (text) character length. The use case is essentially pagination. I know the length of text that can fit on a single page. So I want to divide the target string into "chunks" or pages based on that character length. But I need each of the resulting pages to contain valid HTML without unclosed tags, etc.

So for example:

// Illustrative usage: the page size is the number of text characters per page.
const pageCharacterSize = 10;
const testString = 'some <strong>text with HTML</strong> tags';
// Placeholder for the function this question asks about.
function paginate(string, pageSize) {
  // @TODO
}
const pages = paginate(testString, pageCharacterSize);
console.log(pages);
// Desired output:
// ['some <strong>text </strong>', '<strong>with HTML</strong> ', 'tags'] 

I think this is possible to do with a DocumentFragment or Range, but I can't figure out how to slice the pages based on character offsets.

This MDN page has a demo that does something close to what I need. But it uses caretPositionFromPoint() which takes X, Y coordinates as arguments.

Update

For the purposes of clarity, here are the tests I'm working with:

import { expect, test } from 'vitest'
import paginate from './paginate'

// 1
test('it should chunk plain text', () => {
  // Each row is one scenario: input string, page size, expected pages.
  // Rows are checked in order, so the first mismatch fails the test.
  const scenarios = [
    // a
    { input: 'aa bb cc dd ee', size: 2, pages: ['aa', 'bb', 'cc', 'dd', 'ee'] },
    // b
    { input: 'a a b b c c', size: 3, pages: ['a a', 'b b', 'c c'] },
    // c
    { input: 'aa aa bb bb cc cc', size: 5, pages: ['aa aa', 'bb bb', 'cc cc'] },
    // d
    { input: 'aa bb cc', size: 4, pages: ['aa', 'bb', 'cc'] },
    // e
    { input: 'a b c d e f g', size: 5, pages: ['a b c', 'd e f', 'g'] },
    // f
    { input: 'aa bb cc', size: 7, pages: ['aa bb', 'cc'] },
  ];

  for (const { input, size, pages } of scenarios) {
    expect(paginate(input, size)).toStrictEqual(pages);
  }
})

// 2
test('it should chunk an HTML string without stranding tags', () => {
  // Every page must carry its own complete open/close tag pairs.
  const actualPages = paginate('aa <strong>bb</strong> <em>cc dd</em>', 3);
  expect(actualPages).toStrictEqual([
    'aa',
    '<strong>bb</strong>',
    '<em>cc</em>',
    '<em>dd</em>',
  ]);
})

// 3
test('it should handle tags that straddle pages', () => {
  // A single element spanning three pages is re-wrapped on each page.
  const actualPages = paginate('<strong>aa bb cc</strong>', 2);
  expect(actualPages).toStrictEqual([
    '<strong>aa</strong>',
    '<strong>bb</strong>',
    '<strong>cc</strong>',
  ]);
})


Solution

  • Here is a solution. It assumes tags have no attributes (only bare tags such as <b> and </b> are recognised) and it supports nested tags:

    /**
     * Split an HTML string into pages whose text is at most `pageSize`
     * characters long, breaking on whitespace where possible.
     *
     * Every page is self-contained markup: tags still open at a page break are
     * closed at the end of that page and re-opened at the start of the next.
     * Only attribute-less tags such as `<b>` / `</b>` are recognised as tags;
     * anything else is counted as text.
     *
     * @param {string} html - Source markup to paginate.
     * @param {number} pageSize - Maximum text characters per page (positive integer).
     * @returns {string[]} Pages in document order; pages without content are omitted.
     */
    function paginate(html, pageSize) {
      const tagPattern = /<\/?[a-z][a-z0-9]*>/gi;
      // Void elements never take a closing tag, so they must not be carried over.
      const voidTagPattern = /^<(?:area|base|br|col|embed|hr|img|input|link|meta|source|track|wbr)>$/i;
      const emptyPairPattern = /<(\w+)><\/\1>/g;
      // First alternative: up to pageSize characters ending at a word boundary.
      // Second alternative: hard-split a word longer than pageSize, so the
      // matcher never skips text and the chunks stay contiguous.
      const splitRegex = new RegExp(
        `\\s*(?:[\\s\\S]{1,${pageSize}}(?!\\S)|\\S{${pageSize}})`,
        'g'
      );

      // Strip the tags, remembering each one with its offset in the plain text.
      const tags = [];
      let removedLength = 0;
      const plainText = html.replace(tagPattern, (tag, pos) => {
        tags.push({ tag, pos: pos - removedLength, isClosing: tag.startsWith('</') });
        removedLength += tag.length;
        return '';
      });

      // match() yields null for text-free input; a single empty chunk still
      // lets any tags be emitted and otherwise produces no page at all.
      const chunks = plainText.match(splitRegex) || [''];
      const openTags = []; // stack of opening tags carried over to the next page
      const pages = [];
      let pageStart = 0;
      let tagIndex = 0;

      for (let chunkIndex = 0; chunkIndex < chunks.length; chunkIndex += 1) {
        const chunk = chunks[chunkIndex];
        const pageEnd = pageStart + chunk.length;
        const isLastChunk = chunkIndex === chunks.length - 1;
        const firstNonSpace = chunk.search(/\S/);
        const leading = firstNonSpace === -1 ? chunk.length : firstNonSpace;
        const text = chunk.trim();
        const prefix = openTags.join('');

        let body = '';
        let copied = 0;
        while (tagIndex < tags.length) {
          const { tag, pos, isClosing } = tags[tagIndex];
          if (!isLastChunk) {
            if (pos > pageEnd) break;
            // A closing tag sitting exactly on the break belongs to this page;
            // the first non-closing tag there starts the next page.
            if (pos === pageEnd && !isClosing) break;
          }
          // Tags that fell into trimmed whitespace collapse onto the text edge.
          const at = Math.min(Math.max(pos - pageStart - leading, 0), text.length);
          body += text.slice(copied, at) + tag;
          copied = at;
          if (isClosing) {
            // Close the innermost matching element to keep the nesting order.
            const openIndex = openTags.lastIndexOf(tag.replace('</', '<'));
            if (openIndex >= 0) {
              openTags.splice(openIndex, 1);
            }
          } else if (!voidTagPattern.test(tag)) {
            openTags.push(tag);
          }
          tagIndex += 1;
        }
        body += text.slice(copied);

        const suffix = openTags
          .slice()
          .reverse()
          .map(tag => tag.replace('<', '</'))
          .join('');

        // Removing an empty pair can expose another one around it, so repeat.
        let page = prefix + body + suffix;
        let previous;
        do {
          previous = page;
          page = page.replace(emptyPairPattern, '');
        } while (page !== previous);
        page = page.trim();

        if (page !== '') {
          pages.push(page);
        }
        pageStart = pageEnd;
      }
      return pages;
    }
    
    // Demo: paginate each sample input and print the resulting pages.
    for (const { str, size } of [
      { str: 'some <strong>text <i>with</i> HTML</strong> tags, and <i>some <b>nested tags</b> sould be <b>supported</b> as well</i>.', size: 16 },
      { str: 'a a b b c c', size: 3 },
      { str: 'aa aa bb bb cc cc', size: 5 },
      { str: 'aa bb cc', size: 4 },
      { str: 'aa <strong>bb</strong> <em>cc dd</em>', size: 3 },
      { str: '<strong>aa bb cc</strong>', size: 2 }
    ]) {
      const result = paginate(str, size);
      console.log(result);
    }

    Output:

    [
      "some <strong>text <i>with</i></strong>",
      "<strong> HTML</strong> tags, and",
      "<i>some <b>nested tags</b></i>",
      "<i> sould be</i>",
      "<i><b>supported</b> as</i>",
      "<i>well</i>."
    ]
    [
      "a a",
      "b b",
      "c c"
    ]
    [
      "aa aa",
      "bb bb",
      "cc cc"
    ]
    [
      "aa",
      "bb",
      "cc"
    ]
    [
      "aa",
      "<strong>bb</strong>",
      " <em>cc</em>",
      "<em>dd</em>"
    ]
    [
      "<strong>aa</strong>",
      "<strong>bb</strong>",
      "<strong>cc</strong>"
    ]
    

    Update

    Based on a new request in the comments, I changed the split regex from '[\\s\\S]{1,' + pageSize + '}(?!\\S)' to '\\s*[\\s\\S]{1,' + pageSize + '}(?!\\S)', i.e. I added \\s* to catch leading spaces. I also added a page.trim() to remove leading spaces. Finally, I added a few of the OP's examples.