Command Tokenizer

Question

I've written some code to tokenize a command string into its tokens.

A token is either:

A block of any non-whitespace characters
A block of characters, which may include whitespace, wrapped in quotes

So, for the input:

This is some text "with information" quoted.

I'd expect the tokens:

This

is

some

text

with information

quoted.

The tokenizer

using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace MudCore
{
    /// <summary>
    /// Splits a command string into tokens. A token is either a run of
    /// non-whitespace, non-quote characters, or the text between a pair of
    /// double quotes (which may contain whitespace). An unclosed quote runs
    /// to the end of the input.
    /// </summary>
    public static class CommandTokenizer
    {
        // Repeats two alternatives, each surrounded by optional whitespace:
        //   "..."  -> the text between the quotes (or up to end of input) is captured as 'token'
        //   word   -> a run of characters that are neither whitespace nor a quote is captured as 'token'
        // ExplicitCapture keeps the unnamed groups from capturing, so only 'token' is recorded.
        private static readonly Regex _pattern = new Regex(
            @"((\s*""(?<token>[^""]*)(""|$)\s*)|(\s*(?<token>[^\s""]+)\s*))*",
            RegexOptions.Compiled | RegexOptions.ExplicitCapture);

        /// <summary>
        /// Tokenises <paramref name="input"/>.
        /// </summary>
        /// <param name="input">The command text to split. Must not be null.</param>
        /// <returns>
        /// The tokens in the order they appear, without surrounding quotes.
        /// Never null; empty when the input contains no tokens.
        /// </returns>
        /// <exception cref="System.ArgumentNullException">
        /// Thrown by <see cref="Regex.Match(string)"/> when <paramref name="input"/> is null.
        /// </exception>
        public static string[] Tokenise(string input)
        {
            // The whole pattern is a zero-or-more repetition, so it can match the
            // empty string; a match with no 'token' captures simply yields an
            // empty array and no separate Success check is required.
            CaptureCollection captures = _pattern.Match(input).Groups["token"].Captures;

            string[] tokens = new string[captures.Count];
            for (int i = 0; i < tokens.Length; i++)
            {
                tokens[i] = captures[i].Value;
            }

            return tokens;
        }
    }
}

The Tests

using MudCore;
using NUnit.Framework;

namespace MudCoreTests
{
    /// <summary>
    /// Unit tests for <c>CommandTokenizer.Tokenise</c>: plain words, whitespace
    /// handling, and quoted sections that must stay together as one token.
    /// </summary>
    [TestFixture]
    public class CommandTokenizerTests
    {
        [Test]
        public void SingleWordBecomesSingleToken()
        {
            var tokens = CommandTokenizer.Tokenise("single");

            CollectionAssert.AreEqual(new[] { "single" }, tokens);
        }

        [Test]
        public void MultipleWordsReturnMultipleTokens()
        {
            var tokens = CommandTokenizer.Tokenise("there are multiple tokens");

            CollectionAssert.AreEqual(new[] { "there", "are", "multiple", "tokens" }, tokens);
        }

        [Test]
        public void LeadingSpacesIgnored()
        {
            var tokens = CommandTokenizer.Tokenise(" there are multiple tokens");

            CollectionAssert.AreEqual(new[] { "there", "are", "multiple", "tokens" }, tokens);
        }

        [Test]
        public void TrailingSpacesIgnored()
        {
            var tokens = CommandTokenizer.Tokenise("there are multiple tokens  ");

            CollectionAssert.AreEqual(new[] { "there", "are", "multiple", "tokens" }, tokens);
        }

        [Test]
        public void TabsSeparateTokens()
        {
            var tokens = CommandTokenizer.Tokenise("there\tare");

            CollectionAssert.AreEqual(new[] { "there", "are" }, tokens);
        }

        [Test]
        public void EmptyInputReturnsNoTokens()
        {
            var tokens = CommandTokenizer.Tokenise("");

            Assert.AreEqual(0, tokens.Length);
        }

        [Test]
        public void WhitespaceOnlyInputReturnsNoTokens()
        {
            var tokens = CommandTokenizer.Tokenise("   ");

            Assert.AreEqual(0, tokens.Length);
        }

        [TestCase("There are \"some quoted tokens\" in the text", 
                  new string[] {
                      "There",
                      "are",
                      "some quoted tokens",
                      "in",
                      "the",
                      "text" }, "quoted in middle")]
        [TestCase("\"some quoted tokens\" There are in the text", 
                  new string[] {
                      "some quoted tokens",
                      "There",
                      "are",
                      "in",
                      "the",
                      "text" }, "quoted at start")]
        [TestCase(" \"some quoted tokens\" There are in the text",
                  new string[] {
                      "some quoted tokens",
                      "There",
                      "are",
                      "in",
                      "the",
                      "text" }, "space then quoted at start")]
        [TestCase("There are in the text \"some quoted tokens\"", 
                  new string[] {
                      "There",
                      "are",
                      "in",
                      "the",
                      "text",
                      "some quoted tokens" }, "quoted at end")]
        [TestCase("There \"are\" in the text \"some quoted tokens\"", 
                  new string[] {
                      "There",
                      "are",
                      "in",
                      "the",
                      "text",
                      "some quoted tokens" }, "multiple quotes")]
        [TestCase("There are in the text \"some quoted tokens, that have punctionation.  And other stuff\"", 
                  new string[] {
                      "There",
                      "are",
                      "in",
                      "the",
                      "text",
                      "some quoted tokens, that have punctionation.  And other stuff" }, "punctuation in quote")]
        [TestCase("There are, in the text \"some quoted tokens\".", 
                  new string[] {
                      "There",
                      "are,",
                      "in",
                      "the",
                      "text",
                      "some quoted tokens",
                      "." }, "punctuation outside of quotes")]
        [TestCase("; There are \"some quoted tokens\" in the text", 
                  new string[] {
                      ";",
                      "There",
                      "are",
                      "some quoted tokens",
                      "in",
                      "the",
                      "text" }, "semi-colon recognised")]
        [TestCase("\"Outer quote\" nested quote \"back out\" really out", 
                  new string[] {
                      "Outer quote",
                      "nested",
                      "quote",
                      "back out",
                      "really",
                      "out" }, "nested quote")]
        [TestCase("Mismatched quotes \"are ignored",
                  new string[]
                  {
                      "Mismatched",
                      "quotes",
                      "are ignored"}, "unclosed quotes run to end of line")]
        public void QuotedStringsTreatedAsSingleToken(string inputText, string[] expectedTokens, string testName)
        {
            var tokens = CommandTokenizer.Tokenise(inputText);

            // One collection assertion checks both length and order, and on
            // failure reports the first index at which the arrays differ.
            CollectionAssert.AreEqual(expectedTokens, tokens, testName);
        }

    }
}

It seems like this should be a fairly common task, so maybe there's a better approach I've missed. Is the regex legible / am I missing any optimisations? Or of course, any other feedback's welcome.

Or without regex with an Aggregate like in my similar question LINQish command line parser ;-) — t3chb0t, Dec 7 '16 at 19:21
Your regex does not work if the input string starts with whitespace, e.g.: " there are multiple tokens" — JanDotNet, Dec 12 '16 at 20:03
@JanDotNet Fixed, feel free to point out any other gaping holes I've missed. — forsvarir, Dec 12 '16 at 22:21
Unfortunately I don't think there is a nice/easy way to handle the delimited quotes problem using a Regular Expression. I have implemented something similar in the past in the form of a CSV parser. You can use a stack to keep track of opening double quotes, then pop from the stack when you find a closing quote. (Or just use a single field if you aren't using nesting.) — Mike Rocke, Dec 12 '16 at 22:42
@MikeRocke I almost tried to support nested quotes, but after thinking about it, I don't really need nested support. I'm just allowing quotes to be used to allow a token to contain spaces. — forsvarir, Dec 12 '16 at 22:46

Denis · Accepted Answer · 2016-12-25 23:37:37Z

You can shorten your Tokenise method using LINQ

// Loop-based version: runs the shared pattern once and copies every capture
// of the "token" group into a list, which is returned as an array.
public static string[] Tokenise(string input)
{
    List<string> matches = new List<string>();
    var match = _pattern.Match(input);

    if (match.Success)
    {
         // The "token" group sits inside a repeated group, so it holds one
         // Capture per token rather than a single value.
         foreach (Capture capture in match.Groups["token"].Captures)

         {
             matches.Add(capture.Value);
         }
     }
     // When the match did not succeed the list is still empty, so the caller
     // receives an empty array rather than null.
     return matches.ToArray();
}

Can become

/// <summary>
/// LINQ version: runs the shared pattern once and projects every capture of
/// the "token" group to its text.
/// </summary>
/// <param name="input">The command text to split.</param>
/// <returns>The captured tokens; an empty array (never null) when the match fails.</returns>
public static string[] Tokenise(string input)
{
    var match = _pattern.Match(input);
    if (match.Success)
    {
        // CaptureCollection is non-generic, hence the explicitly typed range variable.
        return (from Capture capture in match.Groups["token"].Captures select capture.Value).ToArray();
    }
    // Return an empty array instead of null so this stays a drop-in replacement
    // for the loop-based version and callers can use .Length without a null check.
    return new string[0];
}

Or even shorter with the ternary operator

/// <summary>
/// Conditional-expression version: runs the shared pattern once and projects
/// every capture of the "token" group to its text.
/// </summary>
/// <param name="input">The command text to split.</param>
/// <returns>The captured tokens; an empty array (never null) when the match fails.</returns>
public static string[] Tokenise(string input)
{
    var match = _pattern.Match(input);
    // The failure branch yields an empty array rather than null, so callers
    // can use .Length or enumerate the result without a null check.
    return match.Success
        ? (from Capture capture in match.Groups["token"].Captures select capture.Value).ToArray()
        : new string[0];
}

But if performance concerns you, you're better off with your own implementation instead of using regex, unless you are working with really long strings, in which case regex will probably win in performance.

I've made an alternative solution which runs ~4 times faster than your regex version over 1,000,000 iterations with this string:

"There are in the text \"some quoted tokens, that have punctionation. And other stuff\""

/// <summary>
/// Splits a command string into tokens without using a regular expression.
/// A token is either a run of non-whitespace characters or the text between
/// a pair of double quotes; an unclosed quote runs to the end of the
/// (trimmed) input.
/// </summary>
/// <param name="input">The command text to split. Must not be null.</param>
/// <returns>The tokens in the order they appear, without surrounding quotes. Never null.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="input"/> is null.</exception>
public static string[] Tokenise(string input)
{
    if (input == null)
    {
        throw new System.ArgumentNullException("input");
    }

    // Trimming first means an unclosed quoted token does not end in stray whitespace.
    input = input.Trim();
    List<string> matches = new List<string>();
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < input.Length; i++)
    {
        char current = input[i];
        if (current == '"')
        {
            // A quote also terminates any word in progress (e.g. abc"def ghi").
            // Emit that word first so tokens stay in input order and are not
            // lost when the quote turns out to be unclosed.
            if (builder.Length > 0)
            {
                matches.Add(builder.ToString());
                builder.Clear();
            }

            int nextQuoteIndex = input.IndexOf('"', i + 1);
            if (nextQuoteIndex == -1)
            {
                // Unclosed quote: everything after it is one token.
                matches.Add(input.Substring(i + 1));
                return matches.ToArray();
            }

            matches.Add(input.Substring(i + 1, nextQuoteIndex - i - 1));
            // Continue after the closing quote (the loop increment steps past it).
            i = nextQuoteIndex;
        }
        else if (!char.IsWhiteSpace(current))
        {
            builder.Append(current);
        }
        else if (builder.Length > 0)
        {
            // Any whitespace (space, tab, ...) ends the current word.
            matches.Add(builder.ToString());
            builder.Clear();
        }
    }
    if (builder.Length > 0)
    {
        matches.Add(builder.ToString());
    }
    return matches.ToArray();
}

I will leave that here too

Some people, when confronted with a problem, think "I know, I'll use regular expressions." Now they have two problems.

Jamie Zawinski

Thanks for the suggestions, I figured loops would probably be faster but went with regex initially to try and make it more expressive (although I don't think that was actually the end outcome). — forsvarir, Jan 4 '17 at 12:05
Regex is hard to read and maintain for most people. Short regular expressions are completely fine, but yours is a bit too long, for me at least. — Denis, Jan 4 '17 at 20:18

Stack Exchange Network

current community

your communities

more stack exchange communities

Command Tokenizer

1 Answer 1

Your Answer

Not the answer you're looking for? Browse other questions tagged c# unit-testing regex or ask your own question.

Linked

Hot Network Questions

Command Tokenizer

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Not the answer you're looking for? Browse other questions tagged c# unit-testing regex or ask your own question.

Linked

Related

Hot Network Questions