I've written a naive shortcode parser in JS that parses shortcodes akin to those found in Wordpress. The intention is for this to be used in realtime on the client in order to preview edits on the fly.
Excusing the strictness of the regular expressions I'd be interested to find about my implementation's:
- Performance - i.e. will the use of RegEx's be problematic on large strings with a lot of tags
- Security - any issues injecting shortcodes into RegExs
- Accuracy - are there any edge cases here that will fail that I haven't accounted for
- General approach - am I missing something obvious and overcomplicating / misunderstanding some of the problems
before pushing too far with it.
const ATTRIBUTE_PROPERTY = `[a-z]+`;
const ATTRIBUTE_VALUE = `[a-z0-9\\.\\-]+`;
const ATTRIBUTE = `(${ATTRIBUTE_PROPERTY})\\=\\"(${ATTRIBUTE_VALUE})\\"`;
class ShortcodeParser {
constructor() {
this.allowedTags = [];
}
/*
* Add a shortcode
*/
addShortcode(name) {
this.allowedTags.push(name);
}
/*
* Remove a shortcode
*/
removeShortcode(name) {
const {allowedTags} = this;
const index = allowedTags.indexOf(name);
if (~index) {
allowedTags.splice(index, 1);
}
}
/*
* The main parsing function
*/
parse(str) {
// Setup root node
// Probably should use a Node class
const root = {
type: 'ROOT ',
children: [],
};
// parentNode will be used to keep track of the lowest open tag
let parentNode = root;
// Send back the root with one child if there are not tags to look for
if (!this.allowedTags.length) {
parentNode.children.push({
type: 'TEXT',
content: str,
parent: parentNode
});
return root;
}
let m;
let re = this._getTagRegex();
while (m = re.exec(str)) {
const tag = m[0]; // The whole match
const tagName = (m[1] || '').toUpperCase(); // The tag name
const attsString = m[2]; // The attributes
const isSelfClosingTag = m[3]; // Does this tag have a trailing slash
const closingTagName = (m[4] || '').toUpperCase(); // Is this tag a closing tag
// Get the from the last tag to this tag
const pre = str.slice(0, m.index);
// Get the rest of the string from this point
str = str.slice(m.index + tag.length);
// If there is any text preceding this add it to the parent
if (pre) {
parentNode.children.push({
type: 'TEXT',
content: pre,
parent: parentNode
});
}
if (closingTagName && closingTagName === parentNode.type) {
// If this is the closing tag of the parent
// Otherwise just leave it be
// Go up the heirarchy
parentNode = parentNode.parent;
// Start looking for another opening tag or the parent closing tag
re = this._getTagRegex(parentNode);
} else if (tag) {
// If we are at least on a tag
// Create a new node
const node = {
type: tagName,
children: [],
parent: parentNode,
};
// Get the attributes
const atts = this._parseAttributes(attsString);
node.attributes = atts;
// add this node to the parent
parentNode.children.push(node);
if (!isSelfClosingTag) {
// If this is not a self closing tag
// Start looking for another opening tag or this closing tag
re = this._getTagRegex(node);
// Set the parent as a reference to the new node
parentNode = node;
} else {
// If this is a closed tag
// We're goint to use the same regex but we've split the string
// ourselves so always look for the first match
re.lastIndex = 0;
}
}
}
if (str) {
// If there is still some string left
// add it to the root as a text node
root.children.push({
type: 'TEXT',
content: str,
});
}
if (parentNode !== root) {
throw new Error('Missing closing tags');
}
return root;
}
/*
* Parses the attributes match and returns an object
*/
_parseAttributes(atts) {
const _atts = {};
let m;
const re = new RegExp(ATTRIBUTE, 'gi');
while(m = re.exec(atts)) {
const property = m[1];
const value = m[2];
_atts[property] = this._typecastValue(value);
}
return _atts;
}
/*
* Typecasts an attribute
* TODO: maybe add JSON?
*/
_typecastValue(value) {
if (/^\d+$/.test(value)) {
return parseInt(value, 10);
} else if (/^\d+\.\d+$/.test(value)) {
return parseFloat(value);
} else if (/^(true|false)$/.test(value)) {
return (value === 'true');
} else if (/^undefined$/.test(value)) {
return undefined;
} else if (/^null$/i.test(value)) {
return null;
} else {
return value;
}
}
/*
* Generates a regex that searches for an opening tag or the close tag
* of the current nesting level
*/
_getTagRegex(node) {
const allowedTags = this.allowedTags.join('|');
const openingTagRegex = `\\[(${allowedTags})(\\s+${ATTRIBUTE_PROPERTY}\\=\\"${ATTRIBUTE_VALUE}\\")*\\s*(\\/?)\\]`;
if (node) {
return new RegExp( `(?:${openingTagRegex})|(?:\\[\\/(${node.type})\\])`, 'gi' );
} else {
return new RegExp( openingTagRegex, 'gi' );
}
}
}
/* example useage
const str = `[Rich]rich[test attr="43.7" /][monkey]this is some content [monkey][/monkey][/monkey] more text [/Rich]more text`;
const parser = new ShortcodeParser();
parser.addShortcode('rich');
parser.addShortcode('monkey');
parser.addShortcode('test');
const tree = parser.parse(str);
*/
~
in removeShortcode() instead of a normal logical negation!
? \$\endgroup\$indexOf
returns -1 if not found or between 0 andarray.length
if it is found. Flipping the bits of a -1 leaves you with a falsely 0 whereas any other number (including 0) will return a truthy integer. It's just a trick really. \$\endgroup\$