#
# HTML entity encoding and decoding for Ruby
#
module HTMLEntities
  # Raised by encode_entities when it is handed an instruction that is not
  # one of :basic, :named, :decimal or :hexadecimal.
  class InstructionError < RuntimeError
  end

  module Data #:nodoc:
    #
    # MAP holds the named entities this library knows about, as taken from
    # the w3schools page on the subject:
    # http://www.w3schools.com/html/html_entitiesref.asp
    # The format is 'entity name' => codepoint where entity name is given
    # without the surrounding ampersand and semicolon.
    #
    # Names that are missing from this table (it has no Greek letters or
    # arrows, for instance) are neither decoded nor produced by :named.
    #
    MAP = {
      'quot' => 34, 'apos' => 39, 'amp' => 38,
      'lt' => 60, 'gt' => 62, 'nbsp' => 160,
      'iexcl' => 161, 'curren' => 164, 'cent' => 162,
      'pound' => 163, 'yen' => 165, 'brvbar' => 166,
      'sect' => 167, 'uml' => 168, 'copy' => 169,
      'ordf' => 170, 'laquo' => 171, 'not' => 172,
      'shy' => 173, 'reg' => 174, 'trade' => 8482,
      'macr' => 175, 'deg' => 176, 'plusmn' => 177,
      'sup2' => 178, 'sup3' => 179, 'acute' => 180,
      'micro' => 181, 'para' => 182, 'middot' => 183,
      'cedil' => 184, 'sup1' => 185, 'ordm' => 186,
      'raquo' => 187, 'frac14' => 188, 'frac12' => 189,
      'frac34' => 190, 'iquest' => 191, 'times' => 215,
      'divide' => 247, 'Agrave' => 192, 'Aacute' => 193,
      'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196,
      'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199,
      'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202,
      'Euml' => 203, 'Igrave' => 204, 'Iacute' => 205,
      'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208,
      'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211,
      'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214,
      'Oslash' => 216, 'Ugrave' => 217, 'Uacute' => 218,
      'Ucirc' => 219, 'Uuml' => 220, 'Yacute' => 221,
      'THORN' => 222, 'szlig' => 223, 'agrave' => 224,
      'aacute' => 225, 'acirc' => 226, 'atilde' => 227,
      'auml' => 228, 'aring' => 229, 'aelig' => 230,
      'ccedil' => 231, 'egrave' => 232, 'eacute' => 233,
      'ecirc' => 234, 'euml' => 235, 'igrave' => 236,
      'iacute' => 237, 'icirc' => 238, 'iuml' => 239,
      'eth' => 240, 'ntilde' => 241, 'ograve' => 242,
      'oacute' => 243, 'ocirc' => 244, 'otilde' => 245,
      'ouml' => 246, 'oslash' => 248, 'ugrave' => 249,
      'uacute' => 250, 'ucirc' => 251, 'uuml' => 252,
      'yacute' => 253, 'thorn' => 254, 'yuml' => 255,
      'OElig' => 338, 'oelig' => 339, 'Scaron' => 352,
      'scaron' => 353, 'Yuml' => 376, 'circ' => 710,
      'tilde' => 732, 'ensp' => 8194, 'emsp' => 8195,
      'thinsp' => 8201, 'zwnj' => 8204, 'zwj' => 8205,
      'lrm' => 8206, 'rlm' => 8207, 'ndash' => 8211,
      'mdash' => 8212, 'lsquo' => 8216, 'rsquo' => 8217,
      'sbquo' => 8218, 'ldquo' => 8220, 'rdquo' => 8221,
      'bdquo' => 8222, 'dagger' => 8224, 'Dagger' => 8225,
      'hellip' => 8230, 'permil' => 8240, 'lsaquo' => 8249,
      'rsaquo' => 8250, 'euro' => 8364
    }.freeze

    # Shortest and longest entity names in MAP; they bound the name pattern
    # so that overly long "&...;" runs are rejected without a hash lookup.
    name_lengths = MAP.keys.map { |name| name.length }
    MIN_LENGTH = name_lengths.min
    MAX_LENGTH = name_lengths.max

    # An entity name: a letter followed by letters or digits. Digits must be
    # allowed because MAP contains names such as 'sup2' and 'frac14'.
    NAME_PATTERN = "[a-z][a-z0-9]{#{MIN_LENGTH - 1},#{MAX_LENGTH - 1}}".freeze

    # Matches a named entity reference; capture 1 is the name. The match is
    # case-insensitive, but the MAP lookup that follows is case-sensitive
    # ('Agrave' and 'agrave' are different characters).
    NAMED_ENTITY_REGEXP = /&(#{NAME_PATTERN});/i

    # Matches any entity reference in a single pass. Exactly one capture is
    # set: 1 = entity name, 2 = decimal digits, 3 = hexadecimal digits.
    ENTITY_REGEXP = /&(?:(#{NAME_PATTERN})|#([0-9]{1,7})|#x([0-9a-f]{1,6}));/i

    # Highest code point that can be encoded as valid UTF-8.
    MAX_CODEPOINT = 0x10FFFF

    # UTF-16 surrogates; they cannot appear as characters in valid UTF-8.
    SURROGATE_CODEPOINTS = (0xD800..0xDFFF)

    # codepoint => 'entity name', used when encoding.
    REVERSE_MAP = MAP.invert.freeze

    # The five characters escaped by the :basic instruction.
    BASIC_ENTITY_REGEXP = /[<>'"&]/

    # Matches one character that is either an ASCII control character
    # (0x00-0x1f) or outside ASCII altogether. It is written with ASCII-only
    # escapes so that the literal is valid in a UTF-8 source file; raw
    # high-byte escapes such as \xc0 are rejected by Ruby 1.9+.
    UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[^\x00-\x7f]/

    # Execution order of the encode_entities instructions (lowest first).
    ENCODE_ENTITIES_COMMAND_ORDER = {
      :basic => 0,
      :named => 1,
      :decimal => 2,
      :hexadecimal => 3
    }.freeze
  end

  #
  # Decode XML and HTML 4.01 entities in a string into their UTF-8
  # equivalents. Obviously, if your string is not already in UTF-8, you'd
  # better convert it before using this method, or the output will be mixed
  # up.
  #
  # Named (&eacute;), decimal (&#233;) and hexadecimal (&#xe9;) references
  # are all handled in one pass over the input, so text produced by decoding
  # is never decoded a second time: "&amp;lt;" becomes "&lt;", not "<".
  #
  # References that cannot be decoded are left exactly as they were. That
  # covers unknown entity names as well as numeric references to surrogates
  # or to values above U+10FFFF.
  #
  # Returns a new string; the argument is not modified.
  #
  def decode_entities(string)
    string.gsub(Data::ENTITY_REGEXP) do |entity|
      match = Regexp.last_match
      codepoint =
        if match[1]
          Data::MAP[match[1]]
        elsif match[2]
          match[2].to_i
        else
          match[3].to_i(16)
        end
      decodable = codepoint &&
                  codepoint <= Data::MAX_CODEPOINT &&
                  !Data::SURROGATE_CODEPOINTS.cover?(codepoint)
      decodable ? [codepoint].pack('U') : entity
    end
  end

  #
  # Encode codepoints into their corresponding entities. Various operations
  # are possible, and may be specified in order:
  #
  # :basic :: Convert the five XML entities ('"<>&)
  # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
  # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
  # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. &#x1234;)
  #
  # You can specify the commands in any order, but they will be executed in
  # the order listed above to ensure that entity ampersands are not
  # clobbered and that named entities are replaced before numeric ones.
  #
  # If no instructions are specified, :basic will be used.
  #
  # The "non-ASCII" instructions also act on the ASCII control characters
  # 0x00-0x1f. :named leaves characters it has no name for untouched.
  #
  # Examples:
  # encode_entities(str) - XML-safe
  # encode_entities(str, :basic, :decimal) - XML-safe and 7-bit clean
  # encode_entities(str, :basic, :named, :decimal) - 7-bit clean, with all
  # non-ASCII characters replaced with their named entity where possible, and
  # decimal equivalents otherwise.
  #
  # Raises InstructionError if an unknown instruction is given.
  #
  # Returns a new string; the argument is not modified.
  #
  # Note: It is the program's responsibility to ensure that the string
  # contains valid UTF-8 before calling this method.
  #
  def encode_entities(string, *instructions)
    ordered =
      if instructions.empty?
        [:basic]
      else
        # sort_by evaluates the block for every element, so an unknown
        # instruction is rejected even when it is the only one given.
        instructions.sort_by do |instruction|
          Data::ENCODE_ENTITIES_COMMAND_ORDER[instruction] ||
            raise(InstructionError, "unknown encode_entities command `#{instruction.inspect}'")
        end
      end

    ordered.inject(string) do |output, instruction|
      case instruction
      when :basic
        output.gsub(Data::BASIC_ENTITY_REGEXP) do |char|
          "&#{Data::REVERSE_MAP[char.unpack('U').first]};"
        end
      when :named
        output.gsub(Data::UTF8_NON_ASCII_REGEXP) do |char|
          name = Data::REVERSE_MAP[char.unpack('U').first]
          name ? "&#{name};" : char
        end
      when :decimal
        output.gsub(Data::UTF8_NON_ASCII_REGEXP) do |char|
          "&##{char.unpack('U').first};"
        end
      when :hexadecimal
        output.gsub(Data::UTF8_NON_ASCII_REGEXP) do |char|
          "&#x#{char.unpack('U').first.to_s(16)};"
        end
      end
    end
  end

  extend self
end