REBOL [ title: "Plain Text to XHTML" author: "Christopher Ross-Gill" home: http://www.ross-gill.com/make-doc/ file: %xhtml.r date: 26-Aug-2003 rights: http://creativecommons.org/licenses/by-sa/3.0/ ] ;-- It is recommended that the #"^M" characters are removed from ;-- strings before processing. xhtml-ctx: context [ ;-- Source shifted left to eof space: charset " ^-" chars: complement nochar: charset " ^-^/" spaces: [any space] format-ctx: context [ pos: marked: href: element: "" encode: func [str len][ change/part pos str len if tag? str [pos: skip pos (length? mold str)] ] encode-e: func [str len][change/part element str len] to-entity: func [ent [string! issue!]][return rejoin ["&" ent ";"]] ascii-charset: charset [#" " - #"~"] html-charset: exclude ascii-charset charset {"&<>} base-charset: exclude html-charset charset {(-.[\'} html-map: [ 34 #quot 38 #amp 60 #lt 62 #gt 64 ##064 128 #euro ##8364 130 ##8218 131 ##402 132 ##8222 133 ##8230 134 ##8224 135 ##8225 136 ##710 137 ##8240 138 ##352 139 ##8249 140 ##338 145 ##8216 146 ##8217 147 ##8220 148 ##8221 149 ##8226 150 ##8211 151 ##8212 152 ##732 153 ##8482 154 ##353 155 ##8250 156 ##339 159 ##376 160 #nbsp 161 #iexcl 162 #cent 163 #pound 164 #curren 165 #yen 166 #brvbar 167 #sect 168 #uml 169 #copy 170 #ordf 171 #laquo 172 #not 173 #shy 174 #reg 175 #macr 176 #deg 177 #plusmn 178 #sup2 179 #sup3 180 #acute 181 #micro 182 #para 183 #middot 184 #cedil 185 #sup1 186 #ordm 187 #raquo 188 #frac14 189 #frac12 190 #frac34 191 #iquest 192 #Agrave 193 #Aacute 194 #Acirc 195 #Atilde 196 #Auml 197 #Aring 198 #AElig 199 #Ccedil 200 #Egrave 201 #Eacute 202 #Ecirc 203 #Euml 204 #Igrave 205 #Iacute 206 #Icirc 207 #Iuml 208 #ETH 209 #Ntilde 210 #Ograve 211 #Oacute 212 #Ocirc 213 #Otilde 214 #Ouml 215 #times 216 #Oslash 217 #Ugrave 218 #Uacute 219 #Ucirc 220 #Uuml 221 #Yacute 222 #THORN 223 #szlig 224 #agrave 225 #aacute 226 #acirc 227 #atilde 228 #auml 229 #aring 230 #aelig 231 #ccedil 232 #egrave 233 #eacute 234 #ecirc 235 #euml 236 #igrave 237 #iacute 238 #icirc 239 #iuml 240 #eth 241 #ntilde 242 #ograve 243 #oacute 244 #ocirc 245 #otilde 246 #ouml 247 #divide 248 #oslash 249 #ugrave 250 #uacute 251 #ucirc 252 #uuml 253 #yacute 254 #thorn 255 #yuml ] to-entity: func [char [char!] /local ent][ encode either ent: select html-map to-integer char [rejoin ["&" ent ";"]]["?"] 1 ] line-end: [newline (encode <br /> 1) :pos] period: [#"." opt [2 space (encode ". " 3) | #"." #"." (encode "… " 3)]] dashes: [#"-" #"-" (encode "—" 2)] codes: [#"[" "break]" (encode <br /> 6) :pos] quote-test: [head? pos pos/-1 = #" " pos/-1 = #"^(A0)" pos/-1 = #"^/" pos/-1 = #">"] quotes: [ #"\" [#"\" (encode "\" 2) | #"^"" (encode {"} 2)] :pos skip | #"^"" (encode either any quote-test ["“"]["”"] 1) | #"'" (encode either any quote-test ["‘"]["’"] 1) ] anchor: [#"a" some-space some [{href="} #"^""]] allowed-entities: [#"&" ["amp" | "copy" | "nbsp" | "quot" | "lt" | "gt"] #";"] allowed-tags: [ #"<" opt "/" element: [ "em" | "strong" | "code" | "br /" | "a" | ["br/" (encode-e "br /" 3) | "br" (encode-e "br /" 2)] :element "br /" | "b" (encode-e "strong" 1) :element "strong" | "i" (encode-e "em" 1) :element "em" ] #">" | #"<" (encode "<" 1) ] shorthand: [ #"(" [ "c)" (encode "©" 3) | "r)" (encode "®" 3) | "o)" (encode "°" 3) | "tm)" (encode "™" 4) | "br)" (encode <br /> 4) :pos | "e)" (encode "€" 3) ] ] regular-rule: [ any [pos: [ some base-charset | allowed-entities | allowed-tags | quotes | period | shorthand | dashes | codes | newline | #"(" | #"-" | #"." | #"[" | #"\" | skip (to-entity pos/1) ]] ] pre-rule: [any [pos: [some html-charset | line-end | skip (to-entity pos/1)]]] url-rule: [ "[url " copy href to #"]" #"]" copy marked to "[/url]" ( replace pos rejoin ["[url " href "]" marked "[/url]"] rejoin [ {<a href="} href {" title="} marked {">} marked </a> ] ) ] bold-rule: [ "[b]" copy marked to "[/b]" (replace pos rejoin ["[b]" marked "[/b]"] rejoin ["" <strong> marked </strong>]) ] italic-rule: [ "[i]" copy marked to "[/i]" (replace pos rejoin ["[i]" marked "[/i]"] rejoin ["" <em> marked </em>]) ] markup-rule: [ some [pos: to #"[" [url-rule | bold-rule | italic-rule | #"[" pos:]] to end ] set 'escape-html func [doc /minimal][ parse/all doc either minimal [pre-rule][regular-rule] trim/lines doc ] ] scan-ctx: context [ out: [] emit: func ['style data /pre][if string? data [trim/tail data] repend out [style data]] text: para: title: none some-chars: [some space copy text some chars] text-line: [copy text thru newline (trim/head/tail text)] paragraph: [copy para some [chars thru newline]] example: [copy code some [indented | some newline indented]] indented: [some space chars thru newline] as-file: func [str][to-file escape-html/minimal str] ;--- Text Format Language: rules: [some parts] parts: [ ;here: (print here) newline | ;--Headers / End of file "===" text-line (emit sect1 text title: true) | "---" text-line (emit sect2 text title: true) | "###" to end (emit end none) | ;--Table sections; new row and new cell markers "-r-" (emit table-row none) | "-c-" (emit table-cell none) | "-h-" (emit table-head-cell none) | ;--Special common notations: "*" paragraph opt newline (emit bullet para) | "#" paragraph opt newline (emit enum para) | ";" paragraph | ; comment ;--Commands: "=image" some-chars copy para to newline newline (emit image reduce [text para]) | "=url" some-chars copy para to newline newline (emit url reduce [text para]) | "=dialect" some-chars (emit dialect text) | ;--Special sections: "\define" text-line (emit define-in text) | "/define" text-line (emit define-out none)| "\table" (emit table-in none) | "/table" (emit table-out none) | ;--Defaults: example (emit code trim/auto code) | paragraph (either title [emit para para][emit title trim/lines title: para]) | skip ] set 'scan-doc func [str] [ clear out title: false parse/all detab join str "^/^/" rules copy out ] ] xhtml-ctx: context [ out: make string! 10000 emit: func [data] [repend out data] nsp: "^/ " ; nsp = newline-space emit-tag: func ['tagname data /space /local sp][ sp: copy either space [" "][""] if emit-table/status [append sp " "] emit [nsp sp to-tag tagname data join </> tagname] ] heading?: toc?: title?: false emit-item: func [doc 'item tag] [ if doc/-2 <> item [emit [nsp tag]] emit-tag/space li doc/2 if doc/3 <> item [emit [nsp head insert copy tag #"/"]] ] emit-define: func [text][emit [nsp <dl><dt> text </dt><dd>]] emit-image: func [src [file! block!] /local left][ left: false if block? src [if src/2 = 'left [left: true] src: src/1] emit [ nsp either left [<div class="image left">][<div class="image">] build-tag compose [img src (src) alt "[ Image ]" /] </div> ] ] emit-url: func [doc /local tag href marked][ if doc/-2 <> 'url [emit [nsp <ul class="links">]] href: doc/2/1 marked: either doc/2/2 [doc/2/2][href] tag: reduce ['a 'href href] if doc/2/2 [repend tag ['title marked]] emit-tag/space li [build-tag tag marked </a>] if doc/3 <> 'url [emit [nsp </ul>]] ] emit-code: func [text] [ if find text "<br /><br />" [replace/all text "<br /><br />" "<br /> <br />"] emit-tag blockquote [<pre> text </pre>] ] emit-table: context [ status: none start: does [ if not status [status: 'in-table emit [nsp <table>]] ] row: does [ if status [ emit switch status [ in-table [<tr>] in-row [[nsp </tr><tr>]] in-cell [[nsp </td> nsp </tr><tr>]] in-head [[nsp </th> nsp </tr><tr>]] ] status: 'in-row ] ] cell: func ['type][ if status [ emit switch status [ in-table [[<tr> nsp to-tag type]] in-row [[nsp to-tag type]] in-cell [[nsp </td> nsp to-tag type]] in-head [[nsp </th> nsp to-tag type]] ] status: either type = 'td ['in-cell]['in-head] ] ] end: does [ if status [ emit switch status [ in-table [</table>] in-row [[nsp </tr></table>]] in-cell [[nsp </td> nsp </tr></table>]] in-head [[nsp </th> nsp </tr></table>]] ] status: none ] ] ] set 'gen-doc func [doc [block! string! file!]][ clear out switch type?/word doc [file! [doc: scan-doc read doc] string! [doc: scan-doc doc]] foreach [style content] doc [ switch type?/word content [ string! [ either find [code raw url] style [escape-html/minimal content][escape-html content] ] block! [ switch style [ image [escape-html/minimal content/1] url [escape-html/minimal content/1 if content/2 [escape-html content/2]] define [escape-html content/1 escape-html content/2] ] ] file! [escape-html/minimal content] ] ] if doc/1 = 'title [emit-tag h1 doc/2 doc: skip doc 2 title?: true] if doc/1 = 'code [emit-tag p doc/2 doc: skip doc 2] forskip doc 2 [ switch/default doc/1 [ raw [emit-tag p doc/2] para [emit-tag p doc/2] code [emit-code doc/2] enum [emit-item doc enum <ol>] bullet [emit-item doc bullet <ul>] sect1 [emit-tag h2 doc/2] sect2 [emit-tag h3 doc/2] indent-in [emit [nsp <div class="indent">]] indent-out [emit [nsp </div>]] define-in [emit-define doc/2] define-out [emit [nsp </dd></dl>]] table-in [emit-table/start] table-out [emit-table/end] table-row [emit-table/row] table-cell [emit-table/cell td] table-head-cell [emit-table/cell th] image [emit-image doc/2] url [emit-url doc] ][if doc/2 [emit-tag p doc/2]] ] emit-table/end return out ] ] ]