One of my tools that renders HTML needs some rules about document formatting. The renderer can format the output so that it is indented and contains appropriate line-breaks. In the first version I used a hardcoded dictionary that looks like this:
/// <summary>
/// Markup formatting with a hardcoded set of per-element options for common HTML tags.
/// </summary>
public class HtmlFormatting : MarkupFormatting
{
    /// <summary>Indent width applied by the parameterless constructor.</summary>
    public const int DefaultIndentWidth = 4;

    /// <summary>
    /// Creates the formatting with <see cref="DefaultIndentWidth"/> and registers the
    /// built-in options for each supported element.
    /// </summary>
    public HtmlFormatting()
        : this(DefaultIndentWidth)
    {
        // Assignments are kept in their original order; consecutive elements that share
        // the same options are merely folded into a loop.
        this["body"] = MarkupFormattingOptions.PlaceClosingTagOnNewLine;
        this["br"] = MarkupFormattingOptions.IsVoid;

        // "span" is deliberately not registered (it was previously commented out with
        // MarkupFormattingOptions.None).

        foreach (var blockElement in new[] { "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6" })
        {
            this[blockElement] = MarkupFormattingOptions.PlaceOpeningTagOnNewLine;
        }

        foreach (var list in new[] { "ul", "ol" })
        {
            this[list] = MarkupFormattingOptions.PlaceBothTagsOnNewLine;
        }

        this["li"] = MarkupFormattingOptions.PlaceOpeningTagOnNewLine;
        this["table"] = MarkupFormattingOptions.PlaceClosingTagOnNewLine;
        this["caption"] = MarkupFormattingOptions.PlaceOpeningTagOnNewLine;

        foreach (var tablePart in new[] { "thead", "tbody", "tfoot", "tr" })
        {
            this[tablePart] = MarkupFormattingOptions.PlaceBothTagsOnNewLine;
        }

        foreach (var cell in new[] { "th", "td" })
        {
            this[cell] = MarkupFormattingOptions.PlaceOpeningTagOnNewLine;
        }
    }

    /// <summary>
    /// Creates the formatting with a custom indent width.
    /// </summary>
    /// <param name="indentWidth">Value assigned to <c>IndentWidth</c>.</param>
    /// <remarks>
    /// NOTE(review): unlike the parameterless constructor, this one registers no element
    /// options — confirm that this is intended.
    /// </remarks>
    public HtmlFormatting(int indentWidth)
    {
        IndentWidth = indentWidth;
    }
}
As with everything hardcoded, it's not very maintenance-friendly and doesn't allow me to change or add formatting rules without recompiling the application.
To fix this, I thought: why not derive the formatting from real HTML? That way I can already see the output, so everything starts with a template. This is what I expect the generated HTML to look like:
// Sample markup laid out the way the generated HTML is expected to look; the
// formatting options for each element are derived from where its tags sit.
var template = @"
<body>
<h1></h1>
<h2></h2>
<p><br><span></span></p>
<div> </div>
<hr>
<ol>
</ol>
<ul>
<li></li>
</ul>
<table>
<thead>
</thead>
<tbody>
<tr>
<th></th>
<td></td>
</tr>
</tbody>
<tfoot>
</tfoot>
</table>
</body>";
With a few patterns, groupings and conditions I then determine the formatting for each element. Because I'm not interested in parsing the HTML but only in finding the number of tags and their rows and columns, I used regex. A template will never be anything other than the example above. For the sake of this question, let's assume the HTML is always valid.
The code basically splits the template on line breaks and calculates the row and column numbers for each tag. Based on that I can tell:
- whether an element is a void element if it occurs only once in the template
- whether its opening tag should be placed on a new line if it doesn't have any predecessors (based on the column number)
- whether its closing tag should be placed on a new line if both its tags have different row numbers (or there are simply two different row numbers)
/// <summary>
/// Derives per-element <see cref="MarkupFormattingOptions"/> from an HTML template by
/// looking only at which line each tag sits on and whether it is the first tag there.
/// </summary>
/// <remarks>
/// The template is not parsed as HTML. Opening and closing tags are not distinguished,
/// so the rules below assume that every element occurs at most once in the template.
/// </remarks>
static class MarkupFormattingTemplate
{
    // One comparer for grouping, look-ups and the resulting dictionary so that all
    // steps agree on what "the same tag name" means.
    private static readonly StringComparer TagNameComparer = StringComparer.OrdinalIgnoreCase;

    // Matches an attribute-less opening or closing tag and captures its (lowercase) name.
    private static readonly Regex TagRegex =
        new Regex(@"</?(?<name>[a-z0-9]+)>", RegexOptions.ExplicitCapture);

    // "\r\n" is listed first so that it is consumed as a single line break.
    private static readonly string[] LineSeparators = { "\r\n", "\r", "\n" };

    /// <summary>
    /// Computes the formatting options for every tag name found in <paramref name="template"/>.
    /// </summary>
    /// <param name="template">Template markup; blank lines are ignored.</param>
    /// <returns>
    /// A dictionary with case-insensitive keys that maps each tag name to its combined
    /// opening-tag and closing-tag options.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="template"/> is <c>null</c>.</exception>
    public static IDictionary<string, MarkupFormattingOptions> Parse(string template)
    {
        if (template == null)
        {
            throw new ArgumentNullException(nameof(template));
        }

        // Materialize once: the tags are enumerated by both option passes.
        var tags = template.ToLines().ParseTags().ToList();

        var openingTagOptions = tags.DetermineOpeningTagOptions();
        var closingTagOptions = tags.DetermineClosingTagOptions();

        return Merge(openingTagOptions, closingTagOptions);
    }

    /// <summary>Splits the template into its non-blank lines.</summary>
    private static IEnumerable<string> ToLines(this string template)
    {
        return template
            .Split(LineSeparators, StringSplitOptions.RemoveEmptyEntries)
            // Lines that contain only whitespace count as empty too.
            .Where(line => !string.IsNullOrWhiteSpace(line));
    }

    /// <summary>
    /// Finds all tags in the given lines. <c>Line</c> is the zero-based index of the line
    /// within <paramref name="lines"/>; <c>Column</c> is the index of the tag name in that line.
    /// </summary>
    private static IEnumerable<Tag> ParseTags(this IEnumerable<string> lines)
    {
        return lines.SelectMany((line, lineNumber) =>
            TagRegex
                .Matches(line)
                .Cast<Match>()
                .Select(match => new Tag
                {
                    Name = match.Groups["name"].Value,
                    Line = lineNumber,
                    Column = match.Groups["name"].Index
                }));
    }

    /// <summary>
    /// Yields, per tag name, whether the closing tag goes on a new line and whether the
    /// element is void.
    /// </summary>
    private static IEnumerable<KeyValuePair<string, MarkupFormattingOptions>> DetermineClosingTagOptions(this IEnumerable<Tag> tags)
    {
        foreach (var sameName in tags.GroupBy(t => t.Name, TagNameComparer))
        {
            var occurrences = sameName.ToList();
            var options = MarkupFormattingOptions.None;

            // Tags of one element spread over more than one line: the closing tag
            // should be placed on a new line.
            if (occurrences.Select(t => t.Line).Distinct().Skip(1).Any())
            {
                options |= MarkupFormattingOptions.PlaceClosingTagOnNewLine;
            }

            // A tag that occurs only once has no closing counterpart, so it is void.
            if (occurrences.Count == 1)
            {
                options |= MarkupFormattingOptions.IsVoid;
            }

            yield return new KeyValuePair<string, MarkupFormattingOptions>(sameName.Key, options);
        }
    }

    /// <summary>
    /// Yields, per tag name (in order of first appearance), whether the opening tag goes
    /// on a new line: that is the case when the first line containing the tag also
    /// starts with a tag of that name.
    /// </summary>
    private static IEnumerable<KeyValuePair<string, MarkupFormattingOptions>> DetermineOpeningTagOptions(this IEnumerable<Tag> tags)
    {
        var seenNames = new HashSet<string>(TagNameComparer);

        // Group by line once instead of re-grouping all tags for every tag name.
        foreach (var line in tags.GroupBy(t => t.Line))
        {
            var firstNameOnLine = line.First().Name;

            foreach (var tag in line)
            {
                // Only the first line on which a name appears decides its option.
                if (!seenNames.Add(tag.Name))
                {
                    continue;
                }

                var options = TagNameComparer.Equals(tag.Name, firstNameOnLine)
                    ? MarkupFormattingOptions.PlaceOpeningTagOnNewLine
                    : MarkupFormattingOptions.None;

                yield return new KeyValuePair<string, MarkupFormattingOptions>(tag.Name, options);
            }
        }
    }

    /// <summary>
    /// Combines two option sequences into one dictionary by OR-ing the flags of equal keys.
    /// </summary>
    private static IDictionary<string, MarkupFormattingOptions> Merge(
        IEnumerable<KeyValuePair<string, MarkupFormattingOptions>> options1,
        IEnumerable<KeyValuePair<string, MarkupFormattingOptions>> options2)
    {
        var result = options1.ToDictionary(x => x.Key, x => x.Value, TagNameComparer);

        foreach (var item in options2)
        {
            // A key missing from options1 starts at the enum's default value (None)
            // instead of causing a KeyNotFoundException.
            result.TryGetValue(item.Key, out var existing);
            result[item.Key] = existing | item.Value;
        }

        return result;
    }

    /// <summary>A single tag occurrence found in the template.</summary>
    private class Tag
    {
        public string Name { get; set; }

        public int Line { get; set; }

        // Recorded for diagnostics; the option logic relies on tag order within a line.
        public int Column { get; set; }
    }
}
Formatting options are defined by an enum:
/// <summary>
/// Bit flags describing how an element's tags are laid out by the renderer.
/// </summary>
[Flags]
public enum MarkupFormattingOptions
{
    /// <summary>No flag set.</summary>
    None = 0,

    /// <summary>The opening tag is placed on a new line.</summary>
    PlaceOpeningTagOnNewLine = 1 << 0,

    /// <summary>The closing tag is placed on a new line.</summary>
    PlaceClosingTagOnNewLine = 1 << 1,

    /// <summary>Combination of both placement flags.</summary>
    PlaceBothTagsOnNewLine = PlaceOpeningTagOnNewLine | PlaceClosingTagOnNewLine,

    /// <summary>The element is a void element.</summary>
    IsVoid = 1 << 2,

    // NOTE(review): presumably controls how an element without content is closed;
    // the usage is not visible here — confirm against the renderer.
    CloseEmptyTag = 1 << 3
}
To visualize the steps here are some intermediate results:
Step one: split on new lines so this is actually the same as the template:
<body>
<h1></h1>
<h2></h2>
<p><br><span></span></p>
<div> </div>
<hr>
<ol>
</ol>
<ul>
<li></li>
</ul>
<table>
<thead>
</thead>
<tbody>
<tr>
<th></th>
<td></td>
</tr>
</tbody>
<tfoot>
</tfoot>
</table>
</body>
Step two: tag names and their row and column numbers:
name row column
body 0 3
h1 1 7
h1 1 12
h2 2 7
h2 2 12
p 3 7
br 3 10
span 3 14
span 3 21
p 3 28
div 4 4
div 4 11
hr 5 7
ol 6 7
ol 7 8
ul 8 7
li 9 11
li 9 16
ul 10 8
table 11 7
thead 12 11
thead 13 12
tbody 14 11
tr 15 15
th 16 10
th 16 15
td 17 19
td 17 24
tr 18 16
tbody 19 12
tfoot 20 11
tfoot 21 12
table 22 8
body 23 4
Step three: finding closing tag options:
body PlaceClosingTagOnNewLine
h1 None
h2 None
p None
br IsVoid
span None
div None
hr IsVoid
ol PlaceClosingTagOnNewLine
ul PlaceClosingTagOnNewLine
li None
table PlaceClosingTagOnNewLine
thead PlaceClosingTagOnNewLine
tbody PlaceClosingTagOnNewLine
tr PlaceClosingTagOnNewLine
th None
td None
tfoot PlaceClosingTagOnNewLine
Step four: finding the opening tag options and merging them with the results of the previous step, so this is also the final step:
body PlaceBothTagsOnNewLine
h1 PlaceOpeningTagOnNewLine
h2 PlaceOpeningTagOnNewLine
p PlaceOpeningTagOnNewLine
br IsVoid
span None
div PlaceOpeningTagOnNewLine
hr PlaceOpeningTagOnNewLine, IsVoid
ol PlaceBothTagsOnNewLine
ul PlaceBothTagsOnNewLine
li PlaceOpeningTagOnNewLine
table PlaceBothTagsOnNewLine
thead PlaceBothTagsOnNewLine
tbody PlaceBothTagsOnNewLine
tr PlaceBothTagsOnNewLine
th PlaceOpeningTagOnNewLine
td PlaceOpeningTagOnNewLine
tfoot PlaceBothTagsOnNewLine