2
\$\begingroup\$

I've written a naive shortcode parser in JS that parses shortcodes akin to those found in WordPress. The intention is for this to be used in real time on the client in order to preview edits on the fly.

Excusing the strictness of the regular expressions, I'd be interested to find out about my implementation's:

  • Performance - i.e. will the use of regexes be problematic on large strings with a lot of tags
  • Security - any issues injecting shortcodes into regexes
  • Accuracy - are there any edge cases here that will fail that I haven't accounted for
  • General approach - am I missing something obvious and overcomplicating / misunderstanding some of the problems

before pushing too far with it.

// Regex source fragments shared by the tag and attribute patterns.
// They are always compiled with the `i` flag, so they match case-insensitively.
const ATTRIBUTE_PROPERTY = `[a-z]+`;
const ATTRIBUTE_VALUE    = `[a-z0-9\\.\\-]+`;
// A single `property="value"` pair; group 1 is the property, group 2 the value.
const ATTRIBUTE          = `(${ATTRIBUTE_PROPERTY})\\=\\"(${ATTRIBUTE_VALUE})\\"`;

/**
 * Parses WordPress-style shortcodes (`[tag attr="value"]...[/tag]` and
 * `[tag /]`) into a tree of plain-object nodes.
 *
 * Node shapes produced by `parse`:
 *   - root:      { type: 'ROOT ', children }
 *   - shortcode: { type: <UPPERCASED NAME>, children, parent, attributes }
 *   - text:      { type: 'TEXT', content, parent }
 *
 * `parent` links make the tree circular, so it cannot be passed to
 * `JSON.stringify` directly.
 */
class ShortcodeParser {

  constructor() {
    // Names of the shortcodes that `parse` recognises.
    this.allowedTags = [];
  }

  /**
   * Register a shortcode name so that `parse` recognises it.
   * The name is matched literally and case-insensitively.
   *
   * @param {string} name
   */
  addShortcode(name) {
    this.allowedTags.push(name);
  }

  /**
   * Unregister a shortcode name. Removes the first registration of `name`;
   * does nothing when the name is not registered.
   *
   * @param {string} name
   */
  removeShortcode(name) {
    const index = this.allowedTags.indexOf(name);
    if (index !== -1) {
      this.allowedTags.splice(index, 1);
    }
  }

  /**
   * Parse `str` into a node tree.
   *
   * Only registered shortcodes are treated as tags; everything else,
   * including closing tags that do not close the innermost open shortcode,
   * is kept as text.
   *
   * @param {string} str - The text to parse.
   * @returns {Object} The root node.
   * @throws {Error} When a shortcode is opened but never closed.
   */
  parse(str) {

    // NOTE(review): the trailing space in 'ROOT ' looks accidental, but
    // consumers may compare against it, so the value is kept as is.
    const root = {
      type: 'ROOT ',
      children: [],
    };

    // Without any registered shortcode the whole input is a single text node.
    if (!this.allowedTags.length) {
      root.children.push({
        type: 'TEXT',
        content: str,
        parent: root
      });

      return root;
    }

    // Innermost shortcode that is still open (or the root).
    let parentNode = root;

    // Index in `str` of the first character not yet consumed.
    let position = 0;

    let re = this._getTagRegex();
    let m;

    while ((m = re.exec(str)) !== null) {

      const tag               =  m[0];                      // The whole match
      const tagName           = (m[1] || '').toUpperCase(); // The tag name
      const attsString        =  m[2];                      // All the attributes
      const isSelfClosingTag  =  m[3];                      // Trailing slash, if any
      const closingTagName    = (m[4] || '').toUpperCase(); // Set for a closing tag

      // Text between the previous tag and this one belongs to the open node.
      if (m.index > position) {
        parentNode.children.push({
          type: 'TEXT',
          content: str.slice(position, m.index),
          parent: parentNode
        });
      }

      position = m.index + tag.length;

      if (closingTagName) {
        // The regex only ever looks for the closing tag of the innermost
        // open shortcode, so a closing match always closes `parentNode`.
        parentNode = parentNode.parent;

      } else {
        const node = {
          type: tagName,
          children: [],
          parent: parentNode,
        };

        node.attributes = this._parseAttributes(attsString);

        parentNode.children.push(node);

        if (isSelfClosingTag) {
          // Nesting is unchanged: keep the current regex, whose lastIndex
          // already sits right after this tag.
          continue;
        }

        parentNode = node;
      }

      // The nesting level changed, so the closing tag to look for changed
      // too. The root has no closing tag: passing it would make a literal
      // "[/ROOT ]" in the text pop past the top of the tree.
      re = this._getTagRegex(parentNode === root ? undefined : parentNode);
      re.lastIndex = position;
    }

    if (parentNode !== root) {
      throw new Error('Missing closing tags');
    }

    // Whatever follows the last tag is plain text at the top level.
    if (position < str.length) {
      root.children.push({
        type: 'TEXT',
        content: str.slice(position),
        parent: root
      });
    }

    return root;
  }

  /**
   * Parse the attribute portion of an opening tag.
   *
   * @param {string} [atts] - e.g. ` width="10" align="left"`.
   * @returns {Object} Map of property name to typecast value; when a
   *   property is repeated, the last value wins.
   */
  _parseAttributes(atts) {

    const _atts = {};

    if (!atts) {
      return _atts;
    }

    // A fresh regex per call: a shared global regex would carry lastIndex
    // state from one call into the next.
    const re = new RegExp(ATTRIBUTE, 'gi');
    let m;

    while ((m = re.exec(atts)) !== null) {
      _atts[m[1]] = this._typecastValue(m[2]);
    }

    return _atts;
  }


  /**
   * Convert an attribute's string value to a JavaScript value.
   * Unsigned integers and decimals become numbers, `true`/`false` become
   * booleans, `undefined` becomes undefined, `null` (any case) becomes null;
   * anything else is returned unchanged.
   * TODO: maybe add JSON?
   *
   * @param {string} value
   * @returns {number|boolean|null|undefined|string}
   */
  _typecastValue(value) {

    if (/^\d+$/.test(value)) {
      return parseInt(value, 10);
    }

    if (/^\d+\.\d+$/.test(value)) {
      return parseFloat(value);
    }

    if (/^(true|false)$/.test(value)) {
      return (value === 'true');
    }

    if (/^undefined$/.test(value)) {
      return undefined;
    }

    if (/^null$/i.test(value)) {
      return null;
    }

    return value;
  }

  /**
   * Escape regex metacharacters so `text` is matched literally when it is
   * interpolated into a pattern.
   *
   * @param {string} text
   * @returns {string}
   */
  _escapeForRegex(text) {
    return String(text).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Build the regex that finds the next tag.
   *
   * Capture groups: 1 = opening tag name, 2 = the whole attribute string,
   * 3 = the self-closing slash, 4 = closing tag name (only when `node` is
   * given).
   *
   * @param {Object} [node] - The innermost open shortcode; when given, its
   *   closing tag is matched as well.
   * @returns {RegExp} A global, case-insensitive regex.
   */
  _getTagRegex(node) {
    const allowedTags = this.allowedTags
      .map((name) => this._escapeForRegex(name))
      .join('|');

    // The repeated attribute group is non-capturing and wrapped in ONE
    // capturing group: a repeated capturing group only remembers its last
    // repetition, which would drop every attribute except the final one.
    const openingTagRegex = `\\[(${allowedTags})((?:\\s+${ATTRIBUTE_PROPERTY}\\=\\"${ATTRIBUTE_VALUE}\\")*)\\s*(\\/?)\\]`;

    if (node) {
      const closingTagName = this._escapeForRegex(node.type);
      return new RegExp( `(?:${openingTagRegex})|(?:\\[\\/(${closingTagName})\\])`, 'gi' );
    }

    return new RegExp( openingTagRegex, 'gi' );
  }
}

/* example usage

const str = `[Rich]rich[test attr="43.7" /][monkey]this is some content [monkey][/monkey][/monkey] more text [/Rich]more text`;

const parser = new ShortcodeParser();

parser.addShortcode('rich');
parser.addShortcode('monkey');
parser.addShortcode('test');

const tree = parser.parse(str);

*/
\$\endgroup\$
2
  • \$\begingroup\$ Is there any reason you're using a bitwise negation ~ in removeShortcode() instead of a normal logical negation !? \$\endgroup\$ Commented Jun 8, 2016 at 18:24
  • \$\begingroup\$ @gcampbell indexOf returns -1 if not found, or an index between 0 and array.length - 1 if it is found. Flipping the bits of -1 leaves you with a falsy 0, whereas any other number (including 0) will return a truthy integer. It's just a trick really. \$\endgroup\$ Commented Jun 8, 2016 at 18:33

0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.