REBOL [
Title: "XHTML Preformatter"
Author: "Christopher Ross-Gill"
Home: http://www.ross-gill.com/
Date: 28-Apr-2003
Rights: http://creativecommons.org/licenses/by-sa/3.0/
Purpose: {
Formats a line of Windows-1252 encoded text to valid XHTML
code, escaping malicious characters.
}
Comment: "Apologies for the lack of comments..."
]
ctx-format: context [
pos: marked: href: ""
ascii-charset: #[bitset! 64#{AAAAADue/6/+///3////fwAAAAAAAAAAAAAAAAAAAAA=}]
html-charset: #[bitset! 64#{/////8RhAFABAAAIAAAAgP////////////////////8=}]
special-charset: #[bitset! 64#{AAQAAABhAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAA=}]
space: #[bitset! 64#{AAIAAAEAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAA=}]
html-map: [
34 #quot 38 #amp 60 #lt 62 #gt
64 ##064 128 #euro ##8364 130 ##8218 131 ##402 132 ##8222 133 ##8230 134 ##8224 135 ##8225 136 ##710
137 ##8240 138 ##352 139 ##8249 140 ##338 145 ##8216 146 ##8217 147 ##8220
148 ##8221 149 ##8226 150 ##8211 151 ##8212 152 ##732 153 ##8482 154 ##353
155 ##8250 156 ##339 159 ##376 160 #nbsp 161 #iexcl 162 #cent 163 #pound
164 #curren 165 #yen 166 #brvbar 167 #sect 168 #uml 169 #copy 170 #ordf
171 #laquo 172 #not 173 #shy 174 #reg 175 #macr 176 #deg 177 #plusmn 178 #sup2
179 #sup3 180 #acute 181 #micro 182 #para 183 #middot 184 #cedil 185 #sup1
186 #ordm 187 #raquo 188 #frac14 189 #frac12 190 #frac34 191 #iquest
192 #Agrave 193 #Aacute 194 #Acirc 195 #Atilde 196 #Auml 197 #Aring
198 #AElig 199 #Ccedil 200 #Egrave 201 #Eacute 202 #Ecirc 203 #Euml
204 #Igrave 205 #Iacute 206 #Icirc 207 #Iuml 208 #ETH 209 #Ntilde
210 #Ograve 211 #Oacute 212 #Ocirc 213 #Otilde 214 #Ouml 215 #times
216 #Oslash 217 #Ugrave 218 #Uacute 219 #Ucirc 220 #Uuml 221 #Yacute
222 #THORN 223 #szlig 224 #agrave 225 #aacute 226 #acirc 227 #atilde
228 #auml 229 #aring 230 #aelig 231 #ccedil 232 #egrave 233 #eacute
234 #ecirc 235 #euml 236 #igrave 237 #iacute 238 #icirc 239 #iuml 240 #eth
241 #ntilde 242 #ograve 243 #oacute 244 #ocirc 245 #otilde 246 #ouml
247 #divide 248 #oslash 249 #ugrave 250 #uacute 251 #ucirc 252 #uuml
253 #yacute 254 #thorn 255 #yuml
]
to-entity: func [ent [string! issue!]][return rejoin ["&" ent ";"]]
to-encode: func [doc /local old new ent][
old: doc/1
new: switch/default to-integer old [
34 [to-entity either any [head? doc doc/-1 = #" " doc/-1 = #"^(A0)" doc/-1 = #"^/"][##8220][##8221]]
39 [to-entity either any [head? doc doc/-1 = #" " doc/-1 = #"^(A0)" doc/-1 = #"^/"][##8216][##8217]]
][
either ent: select html-map to-integer old [to-entity ent]["?"]
]
change/part doc new length? to-string old
]
to-pre: func [doc /local old new ent][
old: doc/1
new: either ent: select html-map to-integer old [to-entity ent]["?"]
change/part doc new 1
]
regular-rule: [
any [
some ascii-charset |
#"&" ["amp" | "copy" | "nbsp" | "quot"] #";" |
#"<" opt "/" pos: [
"em" | "strong" | "code" | "br /" |
"br/" (change/part pos "br /" 3) :pos "br /" |
"br" (change/part pos "br /" 2) :pos "br /" |
"b" (change/part pos "strong" 1) :pos "strong" |
"i" (change/part pos "em" 1) :pos "em"
] #">" |
#"." pos: [
2 space (change/part pos " " 2) skip |
#"." #"." (change/part back pos "… " 3)
] |
#"(" pos: [
"c)" (change/part back pos "©" 3) |
"r)" (change/part back pos "®" 3) |
"o)" (change/part back pos "°" 3) |
"tm)" (change/part back pos "™" 4) |
"br)" (change/part back pos <br /> 4) :pos 5 skip |
"e)" (change/part back pos "€" 3)
] |
#"-" pos: #"-" (change/part back pos "—" 2) |
#"[" pos: [
"TM]" (change/part back pos "™" 4) |
"break]" (change/part back pos "<br /> " 7) :pos 5 skip
] |
special-charset | #"^/" |
html-charset pos: (to-encode back pos) :pos
]
]
pre-rule: [
any [
some ascii-charset |
#"^/" pos: (change/part back pos <br /> 1) 5 skip |
#"'" | special-charset |
html-charset pos: (to-pre back pos) :pos
]
]
url-rule: [
"[url " copy href to #"]" #"]" copy marked to "[/url]"
(
replace pos rejoin ["[url " href "]" marked "[/url]"] rejoin [
{<a href="} href {" title="} marked {">} marked </a>
]
)
]
bold-rule: [
"[b]" copy marked to "[/b]"
(replace pos rejoin ["[b]" marked "[/b]"] rejoin ["" <strong> marked </strong>])
]
italic-rule: [
"[i]" copy marked to "[/i]"
(replace pos rejoin ["[i]" marked "[/i]"] rejoin ["" <em> marked </em>])
]
markup-rule: [
some [pos: to #"[" [url-rule | bold-rule | italic-rule | #"[" pos:]]
to end
]
set 'escape-html func [doc /tags][
pos: doc
parse/all pos either tags [pre-rule][regular-rule]
pos: doc
if not tags [parse/all pos markup-rule trim/lines doc]
doc
]
]