Rebol [
Title: "HTML DOM"
File: %html.r
Purpose: "HTML Parser and Document API for Rebol"
Author: "Christopher Ross-Gill"
Home: http://www.ross-gill.com/page/XML_and_REBOL
Date: 6-Dec-2013
Version: 0.1.1
History: [
0.1.0 18-Apr-2013 "First Version"
0.1.1 6-Dec-2013 "Add GET-BY-CLASS"
]
Type: 'module
Exports: [load-html]
MezzModules: http://www.rebol.it/power-mezz/
; MezzModules: %/Volumes/Sandbox/Downloads/power-mezz-built-1.0.0/
]
do system/script/header/mezzmodules/mezz/module.r
load-module/from system/script/header/mezzmodules
module [
Title: "Load HTML"
Globals: [html-loader]
Imports: [%mezz/load-html.r]
][
html-loader: use [
html! doc make-node
name attribute element
][
html!: context [
name: space: value: tree: branch: position: none
flatten: use [xml path emit encode form-name element attribute tag attr text][
path: copy []
emit: func [data][repend xml data]
encode: use [ch tx][
ch: #[bitset! 64#{/////7v//+////////////////////////////////8=}]
; complement charset {<"&}
tx: [
some ch | text: skip (
text: change/part text switch text/1 [
#"<" ["<"] #"^"" ["""] #"&" ["&"]
] 1
)
]
func [text][parse/all text: copy text [some tx] head text]
]
form-name: func [name [tag! issue!]][
join "" [to-string copy/part head name name ":" to-string name]
]
attribute: [
set attr issue! set text [any-string! | number! | logic!] (
attr: either head? attr [to-string attr][form-name attr]
emit [" " attr {="} encode form text {"}]
)
]
element: [
set tag tag! (
insert path tag: either head? tag [to-string tag][form-name tag]
emit ["<" either head? tag [tag][]]
) [
none! (emit " />" remove path)
| set text string! (emit [">" encode text "</" tag ">"] remove path)
| into [
any attribute [
end (emit " />" remove path)
| (emit ">") some element end (emit ["</" take path ">"])
]
]
]
| %.txt set text string! (emit encode text)
| attribute
]
does [
xml: copy ""
if parse tree element [xml]
]
]
find-element: func [element [tag! issue!]][
find value element
]
get-by-tag: func [tag /local rule hit][
collect [
parse tree rule: [
some [
opt [hit: tag skip (keep make-node hit) :hit]
skip [into rule | skip]
]
]
]
]
get-by-id: func [id /local rule hit at][
parse tree rule: [
some [
at: tag! into [thru #id id to end] (hit: any [hit make-node at])
| skip [into rule | skip]
]
] hit
]
get-by-class: func [class /local rule classes at][
collect [
parse tree rule: [
some [
tag! at: into rule
| #class set classes string! (
if any [
class = classes
find parse/all classes " " class
][
keep make-node back at
]
)
| skip skip
]
]
]
]
text: has [rule text part][
case/all [
string? value [text: value]
block? value [
text: copy ""
parse value rule: [
any [[%.txt | tag!] set part string! (append text part) | skip into rule | 2 skip]
]
text: unless empty? text [trim/auto text]
]
string? text [trim/auto text]
]
]
get: func [name [issue! tag!] /local hit at][
if parse tree [
tag! into [
any [
at: name [block! (hit: make-node at) | set hit skip] to end
| [issue! | tag! | file!] skip
]
]
][hit]
]
sibling: func [/before /after][
case [
all [after find [tag! file!] type?/word position/3] [
make-node skip position 2
]
all [before find [tag! file!] type?/word position/-2] [
make-node skip position -2
]
]
]
parent: has [branch]["Need Branch" none]
children: has [at][
collect [
parse case [
block? value [value] string? value [reduce [%.txt value]] none? value [[]]
][
any [issue! skip]
any [at: [tag! | file!] skip (keep make-node at)]
]
]
]
attributes: has [at][
collect [
parse either block? value [value][[]] [
any [at: issue! skip (keep make-node at)] to end
]
]
]
clone: does [make-node tree]
append-child: func [name data /local at][
case [
none? position/2 [value: tree/2: position/2: copy []]
string? position/2 [
new-line value: tree/2: position/2: compose [%.txt (position/2)] true
]
]
either issue? name [
parse position/2 [any [issue! skip] at:]
][at: tail position/2]
insert at reduce [name data]
new-line at true
]
append-text: func [text][
case [
none? position/2 [value: tree/2: position/2: text]
string? position/2 [append position/2 text]
%.txt = pick tail position/2 -2 [append last position/2 text]
block? position/2 [append-child %.txt text]
]
]
append-attr: func [name value][
name: any [remove find name: to-issue name "/" name]
append-child name value
]
]
doc: make html! [
branch: make block! 16
document: true
dtd: none
new: does [
clear branch
tree: position: reduce ['document none]
]
set-dtd: func [dtd][self/dtd: load dtd]
open-tag: func [tag][
insert/only branch position
tag: any [remove find tag: to tag! tag "/" tag]
tree: position: append-child tag none
]
close-tag: does [
tree: position: take branch
]
]
make-node: func [here /base][
make either base [doc][html!][
position: here
name: here/1
space: all [any-string? name not head? name copy/part head name name]
value: here/2
tree: reduce [name value]
]
]
name: [word! | path!]
element: use [this][
[ 'declaration skip into ['value set this string! (doc/set-dtd this)]
| 'text skip into ['value set this string! (doc/append-text this)]
| 'comment skip skip
| set this name (doc/open-tag this)
block!
opt into [any attribute]
any [into [element]]
end (doc/close-tag)
]
]
attribute: use [attr value][
[ set attr name set value skip
(doc/append-attr attr value)
]
]
html-loader: func [
"Transform an HTML document to a REBOL block"
document [any-string!] "An HTML string/location to transform"
/dom "Returns an object with DOM-like methods to traverse the HTML tree"
/local root
][
if any [file? document url? document][document: read document]
root: doc/new
parse (load-html document) ['root skip skip some [into [element]]]
doc/tree: any [root/document []]
doc/value: doc/tree/2
either dom [make-node/base doc/tree][doc/tree]
]
]
]
load-html: :html-loader