I receive HTML-encoded text that includes named and numbered entities, and I need the decoded HTML.
There were just too many bad solutions for this online.
I use regex pattern &(#x|#){0,1}(\w{1,8});
to find the entities in an HTML text (capturing the numeric prefix and the numeric expression), and I use a Static
Scripting.Dictionary
to look up a (case-sensitive) named entity's decimal encoding. It's Static
because this is the type of function that can get called multiple times, so I only set up the dictionary once.
I then assemble the result using array assignments and a final Join
function to avoid unnecessary concatenation. I could probably eek better performance by using the Mid
statement to progressively fill a String
buffer.
First the function itself:
HTMLDecode
Public Function HTMLDecode(ByRef sourceText As String) As String
Const PROC_NAME As String = "HTMLDecode"
Const VB_HEX_PREFIX As String = "&h"
Const DEC_CHARACTER_REFERENCE_PREFIX As String = "#"
Const HEX_CHARACTER_REFERENCE_PREFIX As String = "#x"
Dim characterReferences As VBScript_RegExp_55.MatchCollection
Set characterReferences = ExtractCharacterReferences(sourceText)
Static namedEntities As Scripting.Dictionary
If namedEntities Is Nothing Then
Set namedEntities = CreateEntityDictionary
End If
Dim currentPosition As Long
Dim nextPosition As Long
Dim characterReference As Match
currentPosition = 1
Dim refCounter As Long
ReDim resultParts(characterReferences.Count * 2) As String
For Each characterReference In characterReferences
nextPosition = characterReference.FirstIndex
Dim character As String
Dim refText As String
refText = characterReference.SubMatches.Item(1)
If namedEntities.Exists(refText) Then
'characterReference is a named entity
character = ChrW$(namedEntities.Item(refText))
Else
'characterReference is probably a numeric entity
Select Case characterReference.SubMatches.Item(0)
Case DEC_CHARACTER_REFERENCE_PREFIX
If IsNumeric(refText) Then
'entity is a decimal codepoint
character = ChrW$(CLng(refText))
Else
Err.Raise ERROR_UNEXPECTED_ENTITY_FORMAT, PROC_NAME, "Unexpected decimal entity value '" & characterReference.Value & "'"
End If
Case HEX_CHARACTER_REFERENCE_PREFIX
Dim hexRefText As String
hexRefText = VB_HEX_PREFIX & refText
If IsNumeric(hexRefText) Then
'entity is a hex codepoint
character = ChrW$(CLng(hexRefText))
Else
Err.Raise ERROR_UNEXPECTED_ENTITY_FORMAT, PROC_NAME, "Unexpected hex entity value '" & characterReference.Value & "'"
End If
Case Else
'Unexpected entity text
Err.Raise ERROR_UNEXPECTED_ENTITY_FORMAT, PROC_NAME, "Unexpected entity name '" & characterReference.Value & "'"
End Select
End If
resultParts(refCounter) = Mid(sourceText, currentPosition, nextPosition - currentPosition + 1)
resultParts(refCounter + 1) = character
currentPosition = nextPosition + characterReference.Length + 1
refCounter = refCounter + 2
Next characterReference
'Catch any text trailing the last entity
If currentPosition <= Len(sourceText) Then
resultParts(refCounter) = Mid$(sourceText, currentPosition)
End If
HTMLDecode = Join(resultParts, vbNullString)
End Function
And the helper constants/functions in the same module:
HTMLHelpers.bas
Option Explicit
Private Const ERROR_UNEXPECTED_ENTITY_COUNT As Long = vbObjectError Or 1
Private Const ERROR_UNEXPECTED_ENTITY_DUPLICATE As Long = vbObjectError Or 2
Private Const ERROR_UNEXPECTED_ENTITY_VALUE As Long = vbObjectError Or 3
Private Const ERROR_UNEXPECTED_ENTITY_FORMAT As Long = vbObjectError Or 4
Private Function CreateEntityDictionary() As Scripting.Dictionary
Const PROC_NAME As String = "getEntityDictionary"
Const ENTITY_NAMES = _
"quot,amp,apos,lt,gt,nbsp,iexcl,cent,pound,curren,yen,brvbar,sect,uml,copy,ordf," & _
"laquo,not,shy,reg,macr,deg,plusmn,sup2,sup3,acute,micro,para,middot,cedil,sup1,ordm," & _
"raquo,frac14,frac12,frac34,iquest,Agrave,Aacute,Acirc,Atilde,Auml,Aring,AElig,Ccedil,Egrave,Eacute,Ecirc," & _
"Euml,Igrave,Iacute,Icirc,Iuml,ETH,Ntilde,Ograve,Oacute,Ocirc,Otilde,Ouml,times,Oslash,Ugrave,Uacute," & _
"Ucirc,Uuml,Yacute,THORN,szlig,agrave,aacute,acirc,atilde,auml,aring,aelig,ccedil,egrave,eacute,ecirc," & _
"euml,igrave,iacute,icirc,iuml,eth,ntilde,ograve,oacute,ocirc,otilde,ouml,divide,oslash,ugrave,uacute," & _
"ucirc,uuml,yacute,thorn,yuml,OElig,oelig,Scaron,scaron,Yuml,fnof,circ,tilde,Alpha,Beta,Gamma," & _
"Delta,Epsilon,Zeta,Eta,Theta,Iota,Kappa,Lambda,Mu,Nu,Xi,Omicron,Pi,Rho,Sigma,Tau," & _
"Upsilon,Phi,Chi,Psi,Omega,alpha,beta,gamma,delta,epsilon,zeta,eta,theta,iota,kappa,lambda," & _
"mu,nu,xi,omicron,pi,rho,sigmaf,sigma,tau,upsilon,phi,chi,psi,omega,thetasym,upsih," & _
"piv,ensp,emsp,thinsp,zwnj,zwj,lrm,rlm,ndash,mdash,lsquo,rsquo,sbquo,ldquo,rdquo,bdquo," & _
"dagger,Dagger,bull,hellip,permil,prime,Prime,lsaquo,rsaquo,oline,frasl,euro,image,weierp,real,trade," & _
"alefsym,larr,uarr,rarr,darr,harr,crarr,lArr,uArr,rArr,dArr,hArr,forall,part,exist,empty," & _
"nabla,isin,notin,ni,prod,sum,minus,lowast,radic,prop,infin,ang,and,or,cap,cup," & _
"int,there4,sim,cong,asymp,ne,equiv,le,ge,sub,sup,nsub,sube,supe,oplus,otimes," & _
"perp,sdot,lceil,rceil,lfloor,rfloor,lang,rang,loz,spades,clubs,hearts,diams"
Const ENTITY_VALUES = _
"34,38,39,60,62,160,161,162,163,164,165,166,167,168,169,170," & _
"171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186," & _
"187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202," & _
"203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218," & _
"219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234," & _
"235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250," & _
"251,252,253,254,255,338,339,352,353,376,402,710,732,913,914,915," & _
"916,917,918,919,920,921,922,923,924,925,926,927,928,929,931,932," & _
"933,934,935,936,937,945,946,947,948,949,950,951,952,953,954,955," & _
"956,957,958,959,960,961,962,963,964,965,966,967,968,969,977,978," & _
"982,8194,8195,8201,8204,8205,8206,8207,8211,8212,8216,8217,8218,8220,8221,8222," & _
"8224,8225,8226,8230,8240,8242,8243,8249,8250,8254,8260,8364,8465,8472,8476,8482," & _
"8501,8592,8593,8594,8595,8596,8629,8656,8657,8658,8659,8660,8704,8706,8707,8709," & _
"8711,8712,8713,8715,8719,8721,8722,8727,8730,8733,8734,8736,8743,8744,8745,8746," & _
"8747,8756,8764,8773,8776,8800,8801,8804,8805,8834,8835,8836,8838,8839,8853,8855," & _
"8869,8901,8968,8969,8970,8971,9001,9002,9674,9824,9827,9829,9830"
Const ENTITY_DELIMITER As String = ","
Set CreateEntityDictionary = New Scripting.Dictionary
'Entity names must be case sensitive
CreateEntityDictionary.CompareMode = BinaryCompare
Dim entityNames() As String
entityNames = Split(ENTITY_NAMES, ENTITY_DELIMITER)
Dim entityValues() As String
entityValues = Split(ENTITY_VALUES, ENTITY_DELIMITER)
If UBound(entityNames) = UBound(entityValues) Then
With CreateEntityDictionary
Dim entityCounter As Long
For entityCounter = LBound(entityNames) To UBound(entityNames)
If Not .Exists(entityNames(entityCounter)) Then
If IsNumeric(entityValues(entityCounter)) Then
.Add entityNames(entityCounter), CLng(entityValues(entityCounter))
Else
Err.Raise ERROR_UNEXPECTED_ENTITY_VALUE, PROC_NAME, "Unexpected entity value: " & entityValues(entityCounter)
End If
Else
Err.Raise ERROR_UNEXPECTED_ENTITY_DUPLICATE, PROC_NAME, "Unexpected duplicate entity name: " & entityNames(entityCounter)
End If
Next entityCounter
End With
Else
Err.Raise ERROR_UNEXPECTED_ENTITY_COUNT, PROC_NAME, "Unexpected number of entity names/values"
End If
End Function
Private Function ExtractCharacterReferences(ByRef sourceText As String) As VBScript_RegExp_55.MatchCollection
Const REFERENCE_PATTERN As String = "&(#{0,1}[x]{0,1})(\w{1,8});"
With New VBScript_RegExp_55.RegExp
.Global = True
.Pattern = REFERENCE_PATTERN
Set ExtractCharacterReferences = .Execute(sourceText)
End With
End Function
Example Usage
?htmlhelpers.HTMLDecode("",&,',<,>, ,¡,¢,£,¤")
",&,',<,>, ,¡,¢,£,¤
?htmlhelpers.HTMLDecode("Bill & Ted&#apos;s Excellent Adventure")
Bill & Ted's Excellent Adventure
More Information:
- Wikipedia : List of XML and HTML character entity references
- Wikipedia : Character Encodings in HTML
Related:
- Wikipedia : Escape Characters in JavaScript, etc
- Wikipedia : SGML entity
"ξ"
and it fails because my regex is careless about catching the numeric prefix. \$\endgroup\$&(#{0,1}[x]{0,1})(\w{1,8});
to&(#x|#){0,1}(\w{1,8});
and fixedξ
problem \$\endgroup\$&(#x|#)?(\w{1,8});
\$\endgroup\$Dim characterReference as Match
- what's a match type? \$\endgroup\$VBScript_RegExp_55.Match
\$\endgroup\$