REBOL [
    title: "Plain Text to XHTML"
    author: "Christopher Ross-Gill"
    home: http://www.ross-gill.com/make-doc/
    file: %xhtml.r
    date: 26-Aug-2003
    rights: http://creativecommons.org/licenses/by-sa/3.0/
]

;-- It is recommended that the #"^M" characters are removed from
;-- strings before processing.

xhtml-ctx: context [
;-- Source shifted left to eof

space: charset " ^-" chars: complement nochar: charset " ^-^/"
spaces: [any space]

format-ctx: context [
    pos: marked: href: element: ""
    encode: func [str len][
        change/part pos str len
        if tag? str [pos: skip pos (length? mold str)]
    ]
    encode-e: func [str len][change/part element str len]
    to-entity: func [ent [string! issue!]][return rejoin ["&" ent ";"]]

    ascii-charset: charset [#" " - #"~"]
    html-charset: exclude ascii-charset charset {"&<>}
    base-charset: exclude html-charset charset {(-.[\'}

    html-map: [
        34 #quot 38 #amp 60 #lt 62 #gt 64 ##064 128 #euro ##8364 130 ##8218
        131 ##402 132 ##8222 133 ##8230 134 ##8224 135 ##8225 136 ##710
        137 ##8240 138 ##352 139 ##8249 140 ##338 145 ##8216 146 ##8217 147 ##8220
        148 ##8221 149 ##8226 150 ##8211 151 ##8212 152 ##732 153 ##8482 154 ##353
        155 ##8250 156 ##339 159 ##376 160 #nbsp 161 #iexcl 162 #cent 163 #pound
        164 #curren 165 #yen 166 #brvbar 167 #sect 168 #uml 169 #copy 170 #ordf
        171 #laquo 172 #not 173 #shy 174 #reg 175 #macr 176 #deg 177 #plusmn 178 #sup2
        179 #sup3 180 #acute 181 #micro 182 #para 183 #middot 184 #cedil 185 #sup1
        186 #ordm 187 #raquo 188 #frac14 189 #frac12 190 #frac34 191 #iquest
        192 #Agrave 193 #Aacute 194 #Acirc 195 #Atilde 196 #Auml 197 #Aring
        198 #AElig 199 #Ccedil 200 #Egrave 201 #Eacute 202 #Ecirc 203 #Euml
        204 #Igrave 205 #Iacute 206 #Icirc 207 #Iuml 208 #ETH 209 #Ntilde
        210 #Ograve 211 #Oacute 212 #Ocirc 213 #Otilde 214 #Ouml 215 #times
        216 #Oslash 217 #Ugrave 218 #Uacute 219 #Ucirc 220 #Uuml 221 #Yacute
        222 #THORN 223 #szlig 224 #agrave 225 #aacute 226 #acirc 227 #atilde
        228 #auml 229 #aring 230 #aelig 231 #ccedil 232 #egrave 233 #eacute
        234 #ecirc 235 #euml 236 #igrave 237 #iacute 238 #icirc 239 #iuml 240 #eth
        241 #ntilde 242 #ograve 243 #oacute 244 #ocirc 245 #otilde 246 #ouml
        247 #divide 248 #oslash 249 #ugrave 250 #uacute 251 #ucirc 252 #uuml
        253 #yacute 254 #thorn 255 #yuml
    ]

    to-entity: func [char [char!] /local ent][
        encode either ent: select html-map to-integer char [rejoin ["&" ent ";"]]["?"] 1
    ]

    line-end: [newline (encode <br /> 1) :pos]
    period: [#"." opt [2 space (encode ".&nbsp; " 3) | #"." #"." (encode "&#8230; " 3)]]
    dashes: [#"-" #"-" (encode "&#8212;" 2)]
    codes: [#"[" "break]" (encode <br /> 6) :pos]

    quote-test: [head? pos pos/-1 = #" " pos/-1 = #"^(A0)" pos/-1 = #"^/" pos/-1 = #">"]
    quotes: [
        #"\" [#"\" (encode "\" 2) | #"^"" (encode {"} 2)] :pos skip |
        #"^"" (encode either any quote-test ["&#8220;"]["&#8221;"] 1) |
        #"'" (encode either any quote-test ["&#8216;"]["&#8217;"] 1)
    ]
    anchor: [#"a" some-space some [{href="} #"^""]]
    allowed-entities: [#"&" ["amp" | "copy" | "nbsp" | "quot" | "lt" | "gt"] #";"]
    allowed-tags: [
        #"<" opt "/" element: [
            "em" | "strong" | "code" | "br /" | "a" |
            ["br/" (encode-e "br /" 3) | "br" (encode-e "br /" 2)] :element "br /" |
            "b" (encode-e "strong" 1) :element "strong" | "i" (encode-e "em" 1) :element "em"
        ] #">" | #"<" (encode "&lt;" 1)
    ]
    shorthand: [
        #"(" [
            "c)" (encode "&copy;" 3) | "r)" (encode "&reg;" 3) |
            "o)" (encode "&deg;" 3) | "tm)" (encode "&#8482;" 4) |
            "br)" (encode <br /> 4) :pos | "e)" (encode "&#8364;" 3)
        ]
    ]

    regular-rule: [
        any [pos: [
            some base-charset | allowed-entities | allowed-tags |
            quotes | period | shorthand | dashes | codes | newline |
            #"(" | #"-" | #"." | #"[" | #"\" | skip (to-entity pos/1)
        ]]
    ]
    pre-rule: [any [pos: [some html-charset | line-end | skip (to-entity pos/1)]]]

    url-rule: [
        "[url " copy href to #"]" #"]" copy marked to "[/url]"
        (
            replace pos rejoin ["[url " href "]" marked "[/url]"] rejoin [
                {<a href="} href {" title="} marked {">} marked </a>
            ]
        )
    ]
    bold-rule: [
        "[b]" copy marked to "[/b]"
        (replace pos rejoin ["[b]" marked "[/b]"] rejoin ["" <strong> marked </strong>])
    ]
    italic-rule: [
        "[i]" copy marked to "[/i]"
        (replace pos rejoin ["[i]" marked "[/i]"] rejoin ["" <em> marked </em>])
    ]
    markup-rule: [
        some [pos: to #"[" [url-rule | bold-rule | italic-rule | #"[" pos:]]
        to end
    ]

    set 'escape-html func [doc /minimal][
        parse/all doc either minimal [pre-rule][regular-rule]
        trim/lines doc
    ]
]

scan-ctx: context [
    out: []
    emit: func ['style data /pre][if string? data [trim/tail data] repend out [style data]]
    text: para: title: none

    some-chars: [some space copy text some chars]
    text-line: [copy text thru newline (trim/head/tail text)]
    paragraph: [copy para some [chars thru newline]]
    example:   [copy code some [indented | some newline indented]]
    indented:  [some space chars thru newline]
    as-file: func [str][to-file escape-html/minimal str]

    ;--- Text Format Language:
    rules: [some parts]

    parts: [ ;here: (print here)
        newline |
        ;--Headers / End of file
        "===" text-line (emit sect1 text title: true) |
        "---" text-line (emit sect2 text title: true) |
        "###" to end (emit end none) |

        ;--Table sections; new row and new cell markers
        "-r-" (emit table-row none) |
        "-c-" (emit table-cell none) |
        "-h-" (emit table-head-cell none) |

        ;--Special common notations:
        "*" paragraph opt newline (emit bullet para) |
        "#" paragraph opt newline (emit enum para) |
        ";" paragraph |  ; comment

        ;--Commands:
        "=image" some-chars copy para to newline newline (emit image reduce [text para]) |
        "=url" some-chars copy para to newline newline (emit url reduce [text para]) |
        "=dialect" some-chars (emit dialect text) |

        ;--Special sections:
        "\define" text-line (emit define-in text) |
        "/define" text-line (emit define-out none)|
        "\table" (emit table-in none) |
        "/table" (emit table-out none) |

        ;--Defaults:
        example (emit code trim/auto code) |
        paragraph (either title [emit para para][emit title trim/lines title: para]) |
        skip
    ]

    set 'scan-doc func [str] [
        clear out title: false
        parse/all detab join str "^/^/" rules
        copy out
    ]
]

xhtml-ctx: context [
    out: make string! 10000
    emit: func [data] [repend out data]
    nsp: "^/ " ; nsp = newline-space
    emit-tag: func ['tagname data /space /local sp][
        sp: copy either space [" "][""]
        if emit-table/status [append sp " "]
        emit [nsp sp to-tag tagname data join </> tagname]
    ]
    heading?: toc?: title?: false

    emit-item: func [doc 'item tag] [
        if doc/-2 <> item [emit [nsp tag]]
        emit-tag/space li doc/2
        if doc/3 <> item [emit [nsp head insert copy tag #"/"]]
    ]
    emit-define: func [text][emit [nsp <dl><dt> text </dt><dd>]]
    emit-image: func [src [file! block!] /local left][
        left: false
        if block? src [if src/2 = 'left [left: true] src: src/1]
        emit [
            nsp either left [<div class="image left">][<div class="image">]
            build-tag compose [img src (src) alt "[ Image ]" /]
            </div>
        ]
    ]
    emit-url: func [doc /local tag href marked][
        if doc/-2 <> 'url [emit [nsp <ul class="links">]]

        href: doc/2/1
        marked: either doc/2/2 [doc/2/2][href]
        tag: reduce ['a 'href href]
        if doc/2/2 [repend tag ['title marked]]
        emit-tag/space li [build-tag tag marked </a>]

        if doc/3 <> 'url [emit [nsp </ul>]]
    ]

    emit-code: func [text] [
        if find text "<br /><br />" [replace/all text "<br /><br />" "<br />&nbsp;<br />"]
        emit-tag blockquote [<pre> text </pre>]
    ]

    emit-table: context [
        status: none
        start: does [
            if not status [status: 'in-table emit [nsp <table>]]
        ]
        row: does [
            if status [
                emit switch status [
                    in-table [<tr>]
                    in-row [[nsp </tr><tr>]]
                    in-cell [[nsp </td> nsp </tr><tr>]]
                    in-head [[nsp </th> nsp </tr><tr>]]
                ]
                status: 'in-row
            ]
        ]
        cell: func ['type][
            if status [
                emit switch status [
                    in-table [[<tr> nsp to-tag type]]
                    in-row [[nsp to-tag type]]
                    in-cell [[nsp </td> nsp to-tag type]]
                    in-head [[nsp </th> nsp to-tag type]]
                ]
                status: either type = 'td ['in-cell]['in-head]
            ]
        ]
        end: does [
            if status [
                emit switch status [
                    in-table [</table>]
                    in-row [[nsp </tr></table>]]
                    in-cell [[nsp </td> nsp </tr></table>]]
                    in-head [[nsp </th> nsp </tr></table>]]
                ]
                status: none
            ]
        ]
    ]

    set 'gen-doc func [doc [block! string! file!]][
        clear out
        switch type?/word doc [file! [doc: scan-doc read doc] string! [doc: scan-doc doc]]
        foreach [style content] doc [
            switch type?/word content [
                string! [
                    either find [code raw url] style [escape-html/minimal content][escape-html content]
                ]
                block! [
                    switch style [
                        image [escape-html/minimal content/1]
                        url [escape-html/minimal content/1 if content/2 [escape-html content/2]]
                        define [escape-html content/1 escape-html content/2]
                    ]
                ]
                file! [escape-html/minimal content]
            ]
        ]
        if doc/1 = 'title [emit-tag h1 doc/2 doc: skip doc 2 title?: true]
        if doc/1 = 'code [emit-tag p doc/2 doc: skip doc 2]

        forskip doc 2 [
            switch/default doc/1 [
                raw [emit-tag p doc/2] para [emit-tag p doc/2] code [emit-code doc/2]
                enum [emit-item doc enum <ol>] bullet [emit-item doc bullet <ul>]
                sect1 [emit-tag h2 doc/2] sect2 [emit-tag h3 doc/2]
                indent-in [emit [nsp <div class="indent">]] indent-out [emit [nsp </div>]]
                define-in [emit-define doc/2] define-out [emit [nsp </dd></dl>]]
                table-in [emit-table/start] table-out [emit-table/end]
                table-row [emit-table/row]
                table-cell [emit-table/cell td]
                table-head-cell [emit-table/cell th]
                image [emit-image doc/2] url [emit-url doc]
            ][if doc/2 [emit-tag p doc/2]]
        ]

        emit-table/end
        return out
    ]
]
]