
import { MsCommentNormalizer } from '@common/clipboard/ms-html/comments';
import { MsListNormalizer } from '@common/clipboard/ms-html/lists';
import { MsPageBreakNormalizer } from '@common/clipboard/ms-html/page-breaks';
import { MsWTagNormalizer } from '@common/clipboard/ms-html/w-tags';
import { HtmlPasteNormalizerData } from '@common/clipboard/types/html-paste-normalizer-data.type';
import { HtmlPasteNormalizerOptions } from '@common/clipboard/types/html-paste-normalizer-options.type';
import { HtmlPasteNormalizer } from '@common/clipboard/types/html-paste-normalizer.type';
import { NodeType } from '@common/html/enums/node-type.enum';
import { descendants, removeClasses, removeStyles } from '@common/html/util/dom';

// Matches a `<meta>` tag whose name is "generator" and whose content is "microsoft word" followed by one or more
// digits. Quotes around the attribute values and the self-closing slash are optional; matching is case-insensitive.
const msWordMetaRegex = /<meta\s*name="?generator"?\s*content="?microsoft\s*word\s*\d+"?\/?>/i;
// Matches the start of an `xmlns:o` attribute whose value begins with "urn:schemas-microsoft-com" (case-insensitive).
const msWordXmlnsRegex = /xmlns:o="urn:schemas-microsoft-com/i;

/**
 * Normalizes content pasted from MS Word.
 */
export class MsHtmlNormalizer implements HtmlPasteNormalizer {
  /**
   * Checks if the HTML is being pasted from MS Word.
   *
   * The HTML is treated as MS Word content when it contains either the MS Word "generator" `<meta>` tag or the
   * Microsoft Office `xmlns:o` namespace declaration.
   * @param html The HTML to check.
   * @returns `true` if the HTML is from MS Word, otherwise `false`.
   */
  matches(html: string): boolean {
    return msWordMetaRegex.test(html) || msWordXmlnsRegex.test(html);
  }

  /**
   * Converts proprietary MS Word HTML to standard HTML.
   *
   * The document in `data` is modified in place and the same instance is returned.
   * @param data The data to normalize; its `doc` is mutated and its `css` is passed to the list normalizer.
   * @param options The options to use when normalizing. Only `pageBreakReplacement` is read here.
   * @returns The normalized document (the same `Document` instance that was passed in via `data.doc`).
   */
  normalize(data: HtmlPasteNormalizerData, options?: HtmlPasteNormalizerOptions): Document {
    const { css, doc } = data;

    // Remove extra nodes.
    // NOTE(review): the callback is typed as receiving an `Element`, but the node type checks below imply that
    // `descendants` also visits non-element nodes (comments, CDATA sections) - confirm against its implementation.
    // NOTE(review): `reverse: true` presumably makes it safe to remove/replace nodes while traversing - confirm.
    descendants(doc.documentElement, (element: Element) => {
      // Remove comments and cdata sections
      if (element.nodeType === NodeType.COMMENT_NODE || element.nodeType === NodeType.CDATA_SECTION_NODE) {
        element.remove();
        return;
      }

      // Guard `nodeName` the same way for every check below
      const nodeName = element.nodeName?.toLowerCase();

      // Replace <o:p> tags with their child nodes
      if (nodeName === 'o:p') {
        element.replaceWith(...element.childNodes);
      }
      // Replace mso-spacerun spans with their text content because all we want is the whitespace and not the extra span
      else if (nodeName === 'span' && element.getAttribute('style')?.includes('spacerun')) {
        // Pass the text as a single argument. Spreading the string would hand `replaceWith` one argument per
        // character, producing a separate text node for each one (and `textContent` is typed as nullable).
        const text = element.textContent ?? '';
        if (text) {
          element.replaceWith(text);
        } else {
          // Nothing to keep, so drop the span without leaving an empty text node behind
          element.remove();
        }
      }
    }, { reverse: true });

    // Normalize `w:` tags
    const wTagNormalizer = new MsWTagNormalizer();
    wTagNormalizer.normalize(doc);

    // Normalize comments
    const commentNormalizer = new MsCommentNormalizer();
    commentNormalizer.normalize(doc);

    // Normalize page breaks
    const pageBreakNormalizer = new MsPageBreakNormalizer();
    pageBreakNormalizer.normalize(doc, options?.pageBreakReplacement);

    // Normalize lists
    const listNormalizer = new MsListNormalizer();
    listNormalizer.normalize(doc, css);

    // Remove classes and styles
    descendants(doc.documentElement, (element: Element) => {
      // Remove classes that start with 'Mso'
      removeClasses(element, className => className.startsWith('Mso'));

      // Remove styles that start with 'mso' and the proprietary 'tab-interval'
      removeStyles(element, name => name.startsWith('mso') || name === 'tab-interval');
    });

    return doc;
  }
}
