/**
 * Splits text that may contain simple HTML tags into pages holding at most
 * `pageSize` visible characters each. Tags do not count toward the page size
 * but are preserved in the resulting pages. Text is only broken at
 * whitespace. If a tag is still open at a page break, it is closed at the end
 * of that page and reopened at the start of the next one so that the markup
 * of every page stays balanced.
 *
 * Caveats worth noting:
 * 1. Tags with attributes are not recognised as tags; they are treated as
 *    ordinary text and count toward the page size.
 * 2. The input markup is assumed to be well-formed.
 * 3. There is no hyphenation. A word longer than `pageSize` is not split
 *    across pages: its leading characters are dropped and only its last
 *    `pageSize` characters are kept.
 * 4. Tag pairs that end up with no content between them are removed.
 *
 * @param {string} html Text that can contain HTML tags
 * @param {number} pageSize Maximum number of characters per page (expected
 *   to be a positive integer)
 * @returns {string[]} The pages in order. `['']` is returned when `pageSize`
 *   is below 1 or when the input contains no visible text.
 */
function paginate(html, pageSize) {
  if (pageSize < 1) {
    return ['']
  }

  // Pull every tag out of the text, remembering where each one sat in the
  // tag-free text so it can be put back after the text has been split.
  // `removedLength` converts positions in the tagged string into positions
  // in the tag-free string.
  const tags = []
  let removedLength = 0
  const text = stripBadStuff(html).replace(/<\/?[a-z][a-z0-9]*>/gi, (tag, pos) => {
    tags.push({ tag: tag, pos: pos - removedLength })
    removedLength += tag.length
    return ''
  })

  // \s = whitespace char, \S = non-whitespace char
  // (?!\S) = negative lookahead: do not stop in the middle of a word
  // Matches an optional leading whitespace char followed by 1..pageSize chars
  // that start with a non-whitespace char and end at a word boundary.
  const splitRegex = new RegExp('\\s?\\S[\\s\\S]{0,' + (pageSize - 1) + '}(?!\\S)', 'g')
  const emptyPairRegex = /<(\w+)><\/\1>/g

  const pages = []
  const openTags = [] // tags opened but not yet closed, outermost first
  let nextTag = 0 // index of the first tag not yet placed on a page

  // exec() is used instead of match() because the real start index of each
  // chunk is needed: the regex can skip characters (extra whitespace, the
  // head of an over-long word), so summing chunk lengths would drift away
  // from the recorded tag positions.
  let match = splitRegex.exec(text)
  while (match !== null) {
    const chunk = match[0]
    const start = match.index
    const end = start + chunk.length
    // Tags still open from earlier pages are reopened at the top of this one.
    const prefix = openTags.join('')

    // Weave this page's tags back into the chunk. `tags` is ordered by
    // position, so a single forward cursor is enough. A tag located at
    // exactly `end` is left for the next page (or, on the last page, is
    // covered by the closing postfix below).
    let body = ''
    let cursor = 0
    while (nextTag < tags.length && tags[nextTag].pos < end) {
      const tag = tags[nextTag].tag
      // Tags that sat in skipped text before this chunk go to its start.
      const at = Math.max(tags[nextTag].pos - start, cursor)
      body += chunk.substring(cursor, at) + tag
      cursor = at

      if (tag.startsWith('</')) {
        // A closing tag ends the innermost matching opener.
        const index = openTags.lastIndexOf('<' + tag.slice(2))
        if (index >= 0) {
          openTags.splice(index, 1)
        }
      } else {
        openTags.push(tag)
      }
      nextTag += 1
    }
    body += chunk.substring(cursor)

    // Close whatever is still open, innermost first.
    const postfix = openTags
      .slice()
      .reverse()
      .map((tag) => tag.replace('<', '</'))
      .join('')

    // Drop tag pairs with no content. Repeat until stable because removing
    // an inner pair (e.g. in <b><i></i></b>) can expose an empty outer pair.
    let page = prefix + body.trim() + postfix
    let previous
    do {
      previous = page
      page = page.replace(emptyPairRegex, '')
    } while (page !== previous)

    pages.push(page.trim())
    match = splitRegex.exec(text)
  }

  // Keep the "always at least one page" convention of the pageSize guard
  // instead of failing when there is no visible text to paginate.
  return pages.length > 0 ? pages : ['']
}

/**
 * Normalises text by running it through each clean-up helper in turn:
 * first stripLineBreaks, then stripNbsp.
 *
 * @param {string} text Text to clean up
 * @returns {string} The result of applying both helpers in that order
 */
function stripBadStuff(text) {
  const cleaners = [stripLineBreaks, stripNbsp]
  return cleaners.reduce((cleaned, clean) => clean(cleaned), text)
}

/**
 * Replaces every `<br>` tag with a single space.
 *
 * Matching is case-insensitive and tolerates any whitespace before the
 * optional slash (`<br>`, `<BR>`, `<br/>`, `<br />`, ...). paginate matches
 * tags case-insensitively, so an upper-case `<BR>` left in the text would be
 * treated there as an opening tag that is never closed.
 *
 * @param {string} text Text that may contain `<br>` tags
 * @returns {string} The text with each `<br>` tag replaced by a space
 */
function stripLineBreaks(text) {
  return text.replace(/<br\s*\/?>/gi, ' ')
}

/**
 * Collapses each run of one or more consecutive `&nbsp;` entities into a
 * single regular space.
 *
 * @param {string} text Text that may contain `&nbsp;` entities
 * @returns {string} The text with every `&nbsp;` run replaced by one space
 */
function stripNbsp(text) {
  const nbspRun = /(&nbsp;)+/g
  return text.replace(nbspRun, () => ' ')
}

// Expose paginate as this module's default export.
export default paginate
