REBOL [
    Title: "XHTML Preformatter"
    Author: "Christopher Ross-Gill"
    Home: http://www.ross-gill.com/
    Date: 28-Apr-2003
    Rights: http://creativecommons.org/licenses/by-sa/3.0/
    Purpose: {
        Formats a line of Windows-1252 encoded text to valid XHTML
        code, escaping malicious characters.
    }
    Comment: "Apologies for the lack of comments..."
]

ctx-format: context [
    pos: marked: href: ""
    ascii-charset: #[bitset! 64#{AAAAADue/6/+///3////fwAAAAAAAAAAAAAAAAAAAAA=}]
    html-charset: #[bitset! 64#{/////8RhAFABAAAIAAAAgP////////////////////8=}]
    special-charset: #[bitset! 64#{AAQAAABhAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAA=}]
    space: #[bitset! 64#{AAIAAAEAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAA=}]
    html-map: [
        34 #quot 38 #amp 60 #lt 62 #gt
        64 ##064 128 #euro ##8364 130 ##8218 131 ##402 132 ##8222 133 ##8230 134 ##8224 135 ##8225 136 ##710
        137 ##8240 138 ##352 139 ##8249 140 ##338 145 ##8216 146 ##8217 147 ##8220
        148 ##8221 149 ##8226 150 ##8211 151 ##8212 152 ##732 153 ##8482 154 ##353
        155 ##8250 156 ##339 159 ##376 160 #nbsp 161 #iexcl 162 #cent 163 #pound
        164 #curren 165 #yen 166 #brvbar 167 #sect 168 #uml 169 #copy 170 #ordf
        171 #laquo 172 #not 173 #shy 174 #reg 175 #macr 176 #deg 177 #plusmn 178 #sup2
        179 #sup3 180 #acute 181 #micro 182 #para 183 #middot 184 #cedil 185 #sup1
        186 #ordm 187 #raquo 188 #frac14 189 #frac12 190 #frac34 191 #iquest
        192 #Agrave 193 #Aacute 194 #Acirc 195 #Atilde 196 #Auml 197 #Aring
        198 #AElig 199 #Ccedil 200 #Egrave 201 #Eacute 202 #Ecirc 203 #Euml
        204 #Igrave 205 #Iacute 206 #Icirc 207 #Iuml 208 #ETH 209 #Ntilde
        210 #Ograve 211 #Oacute 212 #Ocirc 213 #Otilde 214 #Ouml 215 #times
        216 #Oslash 217 #Ugrave 218 #Uacute 219 #Ucirc 220 #Uuml 221 #Yacute
        222 #THORN 223 #szlig 224 #agrave 225 #aacute 226 #acirc 227 #atilde
        228 #auml 229 #aring 230 #aelig 231 #ccedil 232 #egrave 233 #eacute
        234 #ecirc 235 #euml 236 #igrave 237 #iacute 238 #icirc 239 #iuml 240 #eth
        241 #ntilde 242 #ograve 243 #oacute 244 #ocirc 245 #otilde 246 #ouml
        247 #divide 248 #oslash 249 #ugrave 250 #uacute 251 #ucirc 252 #uuml
        253 #yacute 254 #thorn 255 #yuml
    ]

    to-entity: func [ent [string! issue!]][return rejoin ["&" ent ";"]]
    to-encode: func [doc /local old new ent][
        old: doc/1
        new: switch/default to-integer old [
            34 [to-entity either any [head? doc doc/-1 = #" " doc/-1 = #"^(A0)" doc/-1 = #"^/"][##8220][##8221]]
            39 [to-entity either any [head? doc doc/-1 = #" " doc/-1 = #"^(A0)" doc/-1 = #"^/"][##8216][##8217]]
        ][
            either ent: select html-map to-integer old [to-entity ent]["?"]
        ]
        change/part doc new length? to-string old
    ]

    to-pre: func [doc /local old new ent][
        old: doc/1
        new: either ent: select html-map to-integer old [to-entity ent]["?"]
        change/part doc new 1
    ]

    regular-rule: [
        any [
            some ascii-charset |
            #"&" ["amp" | "copy" | "nbsp" | "quot"] #";" |
            #"<" opt "/" pos: [
                "em" | "strong" | "code" | "br /" |
                "br/"  (change/part pos "br /" 3) :pos "br /" |
                "br"  (change/part pos "br /" 2) :pos "br /" |
                "b" (change/part pos "strong" 1) :pos "strong" |
                "i" (change/part pos "em" 1) :pos "em"
            ] #">" |
            #"." pos: [
                2 space (change/part pos "&nbsp; " 2) skip |
                #"." #"." (change/part back pos "&#8230; " 3)
            ] |
            #"(" pos: [
                "c)" (change/part back pos "&copy;" 3) |
                "r)" (change/part back pos "&reg;" 3) |
                "o)" (change/part back pos "&deg;" 3) |
                "tm)" (change/part back pos "&#8482;" 4) |
                "br)" (change/part back pos <br /> 4) :pos 5 skip |
                "e)" (change/part back pos "&#8364;" 3)
            ] |
            #"-" pos: #"-" (change/part back pos "&#8212;" 2) |
            #"[" pos: [
                "TM]" (change/part back pos "&#8482;" 4) |
                "break]" (change/part back pos "<br /> " 7) :pos 5 skip
            ] |
            special-charset | #"^/" |
            html-charset pos: (to-encode back pos) :pos
        ]
    ]
    pre-rule: [
        any [
            some ascii-charset |
            #"^/" pos: (change/part back pos <br /> 1) 5 skip |
            #"'" | special-charset |
            html-charset pos: (to-pre back pos) :pos
        ]
    ]

    url-rule: [
        "[url " copy href to #"]" #"]" copy marked to "[/url]"
        (
            replace pos rejoin ["[url " href "]" marked "[/url]"] rejoin [
                {<a href="} href {" title="} marked {">} marked </a>
            ]
        )
    ]
    bold-rule: [
        "[b]" copy marked to "[/b]"
        (replace pos rejoin ["[b]" marked "[/b]"] rejoin ["" <strong> marked </strong>])
    ]
    italic-rule: [
        "[i]" copy marked to "[/i]"
        (replace pos rejoin ["[i]" marked "[/i]"] rejoin ["" <em> marked </em>])
    ]
    markup-rule: [
        some [pos: to #"[" [url-rule | bold-rule | italic-rule | #"[" pos:]]
        to end
    ]

    set 'escape-html func [doc /tags][
        pos: doc
        parse/all pos either tags [pre-rule][regular-rule]
        pos: doc
        if not tags [parse/all pos markup-rule trim/lines doc]
        doc
    ]
]