I've written some code to tokenize a command string into its tokens.
A token is either:
- A block of any non-whitespace characters
- A block of characters, which may include whitespace, wrapped in quotes
So, for the input:
This is some text "with information" quoted.
I'd expect the tokens:
- This
- is
- some
- text
- with information
- quoted.
The tokenizer
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace MudCore
{
    /// <summary>
    /// Splits a command string into tokens. A token is either a run of characters
    /// that are neither whitespace nor a double quote, or the text enclosed in
    /// double quotes (which may contain whitespace). The quotes themselves are
    /// not part of the token.
    /// </summary>
    public static class CommandTokenizer
    {
        // One match covers every token: the outer group repeats once per token,
        // and each repetition adds a capture to the named "token" group.
        // ExplicitCapture keeps the unnamed groups non-capturing. The (""|$)
        // alternative lets an unterminated quote run to the end of the input.
        private static readonly Regex _pattern = new Regex(
            @"((\s*""(?<token>[^""]*)(""|$)\s*)|(\s*(?<token>[^\s""]+)\s*))*",
            RegexOptions.Compiled | RegexOptions.ExplicitCapture);

        /// <summary>
        /// Breaks <paramref name="input"/> into its tokens, in order of appearance.
        /// </summary>
        /// <param name="input">The command text to tokenise.</param>
        /// <returns>
        /// The tokens found; an empty array when the input contains no tokens.
        /// </returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="input"/> is <c>null</c>.
        /// </exception>
        public static string[] Tokenise(string input)
        {
            if (input == null)
            {
                throw new System.ArgumentNullException("input");
            }

            // The pattern is wholly optional (outer '*'), so it matches at the
            // start of any input; with no tokens the capture list is empty.
            CaptureCollection captures = _pattern.Match(input).Groups["token"].Captures;

            var tokens = new string[captures.Count];
            for (var i = 0; i < tokens.Length; i++)
            {
                tokens[i] = captures[i].Value;
            }

            return tokens;
        }
    }
}
The Tests
using MudCore;
using NUnit.Framework;
namespace MudCoreTests
{
    /// <summary>
    /// Tests for <see cref="CommandTokenizer.Tokenise"/>: plain words, surrounding
    /// whitespace, inputs with no tokens, and quoted sections.
    /// </summary>
    [TestFixture]
    public class CommandTokenizerTests
    {
        [Test]
        public void SingleWordBecomesSingleToken()
        {
            var tokens = CommandTokenizer.Tokenise("single");

            CollectionAssert.AreEqual(new[] { "single" }, tokens);
        }

        [Test]
        public void MultipleWordsReturnMultipleTokens()
        {
            var tokens = CommandTokenizer.Tokenise("there are multiple tokens");

            CollectionAssert.AreEqual(new[] { "there", "are", "multiple", "tokens" }, tokens);
        }

        [Test]
        public void LeadingSpacesIgnored()
        {
            var tokens = CommandTokenizer.Tokenise(" there are multiple tokens");

            CollectionAssert.AreEqual(new[] { "there", "are", "multiple", "tokens" }, tokens);
        }

        [Test]
        public void TrailingSpacesIgnored()
        {
            var tokens = CommandTokenizer.Tokenise("there are multiple tokens   ");

            CollectionAssert.AreEqual(new[] { "there", "are", "multiple", "tokens" }, tokens);
        }

        [Test]
        public void EmptyInputReturnsNoTokens()
        {
            var tokens = CommandTokenizer.Tokenise("");

            Assert.AreEqual(0, tokens.Length);
        }

        [Test]
        public void WhitespaceOnlyInputReturnsNoTokens()
        {
            var tokens = CommandTokenizer.Tokenise("  \t  ");

            Assert.AreEqual(0, tokens.Length);
        }

        // The third argument of each case is a description that is passed to
        // the assertion as its failure message.
        [TestCase("There are \"some quoted tokens\" in the text",
            new string[] {
                "There",
                "are",
                "some quoted tokens",
                "in",
                "the",
                "text" }, "quoted in middle")]
        [TestCase("\"some quoted tokens\" There are in the text",
            new string[] {
                "some quoted tokens",
                "There",
                "are",
                "in",
                "the",
                "text" }, "quoted at start")]
        [TestCase(" \"some quoted tokens\" There are in the text",
            new string[] {
                "some quoted tokens",
                "There",
                "are",
                "in",
                "the",
                "text" }, "space then quoted at start")]
        [TestCase("There are in the text \"some quoted tokens\"",
            new string[] {
                "There",
                "are",
                "in",
                "the",
                "text",
                "some quoted tokens" }, "quoted at end")]
        [TestCase("There \"are\" in the text \"some quoted tokens\"",
            new string[] {
                "There",
                "are",
                "in",
                "the",
                "text",
                "some quoted tokens" }, "multiple quotes")]
        [TestCase("There are in the text \"some quoted tokens, that have punctionation. And other stuff\"",
            new string[] {
                "There",
                "are",
                "in",
                "the",
                "text",
                "some quoted tokens, that have punctionation. And other stuff" }, "punctuation in quote")]
        [TestCase("There are, in the text \"some quoted tokens\".",
            new string[] {
                "There",
                "are,",
                "in",
                "the",
                "text",
                "some quoted tokens",
                "." }, "punctuation outside of quotes")]
        [TestCase("; There are \"some quoted tokens\" in the text",
            new string[] {
                ";",
                "There",
                "are",
                "some quoted tokens",
                "in",
                "the",
                "text" }, "semi-colon recognised")]
        [TestCase("\"Outer quote\" nested quote \"back out\" really out",
            new string[] {
                "Outer quote",
                "nested",
                "quote",
                "back out",
                "really",
                "out" }, "nested quote")]
        [TestCase("Mismatched quotes \"are ignored",
            new string[] {
                "Mismatched",
                "quotes",
                "are ignored" }, "unclosed quotes run to end of line")]
        public void QuotedStringsTreatedAsSingleToken(string inputText, string[] expectedTokens, string testName)
        {
            var tokens = CommandTokenizer.Tokenise(inputText);

            // Compares length and every element in order, and reports the first
            // differing index on failure.
            CollectionAssert.AreEqual(expectedTokens, tokens, testName);
        }
    }
}
It seems like this should be a fairly common task, so maybe there's a better approach I've missed. Is the regex legible / am I missing any optimisations? Or of course, any other feedback's welcome.
You could use `Aggregate`, like in my similar question "LINQish command line parser" ;-) – t3chb0t, Dec 7 '16 at 19:21