ironhtml_parser/
dom.rs

1//! DOM types for representing parsed HTML.
2//!
3//! This module provides types that represent the Document Object Model (DOM)
4//! as defined by the WHATWG DOM Standard.
5//!
6//! ## Reference
7//!
8//! - [DOM Standard](https://dom.spec.whatwg.org/)
9//! - [Node interface](https://dom.spec.whatwg.org/#interface-node)
10
11use alloc::borrow::Cow;
12use alloc::string::String;
13use alloc::vec::Vec;
14
/// Discriminant naming the kind of DOM node a value represents.
///
/// The variants follow the node kinds of the WHATWG DOM Standard that this
/// module models.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NodeType {
    /// Element node, such as `<div>` or `<span>`.
    Element,
    /// Text node holding character data.
    Text,
    /// Comment node, such as `<!-- comment -->`.
    Comment,
    /// Document type node, such as `<!DOCTYPE html>`.
    DocumentType,
    /// The root node of a document.
    Document,
}
29
30/// An HTML document.
31#[derive(Debug, Clone)]
32pub struct Document {
33    /// The document type declaration, if present.
34    pub doctype: Option<DocumentType>,
35    /// The root element (usually `<html>`).
36    pub root: Element,
37}
38
39impl Document {
40    /// Create a new empty document.
41    #[must_use]
42    pub fn new() -> Self {
43        Self {
44            doctype: None,
45            root: Element::new("html"),
46        }
47    }
48
49    /// Get the document's title from `<head><title>`.
50    #[must_use]
51    pub fn title(&self) -> Option<String> {
52        self.root
53            .find_element("head")
54            .and_then(|head| head.find_element("title"))
55            .and_then(Element::text_content)
56            .map(Cow::into_owned)
57    }
58
59    /// Get the body element.
60    #[must_use]
61    pub fn body(&self) -> Option<&Element> {
62        self.root.find_element("body")
63    }
64
65    /// Get the head element.
66    #[must_use]
67    pub fn head(&self) -> Option<&Element> {
68        self.root.find_element("head")
69    }
70
71    /// Render the document back to an HTML string.
72    #[must_use]
73    pub fn to_html(&self) -> String {
74        let mut output = String::new();
75        if let Some(doctype) = &self.doctype {
76            output.push_str("<!DOCTYPE ");
77            output.push_str(&doctype.name);
78            output.push('>');
79        }
80        self.root.render_to(&mut output);
81        output
82    }
83}
84
85impl Default for Document {
86    fn default() -> Self {
87        Self::new()
88    }
89}
90
/// A document type declaration (`<!DOCTYPE ...>`).
///
/// Two doctypes compare equal when their name, public identifier and system
/// identifier are all exactly equal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentType {
    /// The document type name (usually "html").
    pub name: String,
    /// The public identifier, if any.
    pub public_id: Option<String>,
    /// The system identifier, if any.
    pub system_id: Option<String>,
}

impl DocumentType {
    /// Create the HTML5 doctype: name `"html"` with no public or system
    /// identifier.
    #[must_use]
    pub fn html5() -> Self {
        Self {
            name: String::from("html"),
            public_id: None,
            system_id: None,
        }
    }
}
112
113/// A node in the DOM tree.
114#[derive(Debug, Clone)]
115pub enum Node {
116    /// An element node.
117    Element(Element),
118    /// A text node.
119    Text(Text),
120    /// A comment node.
121    Comment(Comment),
122}
123
124impl Node {
125    /// Get the type of this node.
126    #[must_use]
127    pub const fn node_type(&self) -> NodeType {
128        match self {
129            Self::Element(_) => NodeType::Element,
130            Self::Text(_) => NodeType::Text,
131            Self::Comment(_) => NodeType::Comment,
132        }
133    }
134
135    /// Get this node as an element, if it is one.
136    #[must_use]
137    pub const fn as_element(&self) -> Option<&Element> {
138        match self {
139            Self::Element(e) => Some(e),
140            _ => None,
141        }
142    }
143
144    /// Get this node as a mutable element, if it is one.
145    pub fn as_element_mut(&mut self) -> Option<&mut Element> {
146        match self {
147            Self::Element(e) => Some(e),
148            _ => None,
149        }
150    }
151
152    /// Get this node as a text node, if it is one.
153    #[must_use]
154    pub const fn as_text(&self) -> Option<&Text> {
155        match self {
156            Self::Text(t) => Some(t),
157            _ => None,
158        }
159    }
160
161    /// Render this node to an HTML string.
162    #[must_use]
163    pub fn to_html(&self) -> String {
164        let mut output = String::new();
165        self.render_to(&mut output);
166        output
167    }
168
169    /// Render this node to an existing string buffer.
170    pub fn render_to(&self, output: &mut String) {
171        match self {
172            Self::Element(e) => e.render_to(output),
173            Self::Text(t) => output.push_str(&t.data),
174            Self::Comment(c) => {
175                output.push_str("<!--");
176                output.push_str(&c.data);
177                output.push_str("-->");
178            }
179        }
180    }
181}
182
183/// An HTML element.
184#[derive(Debug, Clone)]
185pub struct Element {
186    /// The tag name (lowercase).
187    pub tag_name: String,
188    /// The element's attributes.
189    pub attributes: Vec<Attribute>,
190    /// The element's child nodes.
191    pub children: Vec<Node>,
192}
193
194impl Element {
195    /// Create a new element with the given tag name.
196    pub fn new(tag_name: impl Into<String>) -> Self {
197        Self {
198            tag_name: tag_name.into().to_ascii_lowercase(),
199            attributes: Vec::new(),
200            children: Vec::new(),
201        }
202    }
203
204    /// Check if this is a void element (self-closing).
205    #[must_use]
206    pub fn is_void(&self) -> bool {
207        matches!(
208            self.tag_name.as_str(),
209            "area"
210                | "base"
211                | "br"
212                | "col"
213                | "embed"
214                | "hr"
215                | "img"
216                | "input"
217                | "link"
218                | "meta"
219                | "source"
220                | "track"
221                | "wbr"
222        )
223    }
224
225    /// Get an attribute value by name.
226    #[must_use]
227    pub fn get_attribute(&self, name: &str) -> Option<&str> {
228        self.attributes
229            .iter()
230            .find(|a| a.name.eq_ignore_ascii_case(name))
231            .map(|a| a.value.as_str())
232    }
233
234    /// Check if the element has an attribute.
235    #[must_use]
236    pub fn has_attribute(&self, name: &str) -> bool {
237        self.attributes
238            .iter()
239            .any(|a| a.name.eq_ignore_ascii_case(name))
240    }
241
242    /// Set an attribute value.
243    pub fn set_attribute(&mut self, name: impl Into<String>, value: impl Into<String>) {
244        let name = name.into();
245        let value = value.into();
246        if let Some(attr) = self
247            .attributes
248            .iter_mut()
249            .find(|a| a.name.eq_ignore_ascii_case(&name))
250        {
251            attr.value = value;
252        } else {
253            self.attributes.push(Attribute { name, value });
254        }
255    }
256
257    /// Get the element's id attribute.
258    #[must_use]
259    pub fn id(&self) -> Option<&str> {
260        self.get_attribute("id")
261    }
262
263    /// Get the element's class attribute.
264    #[must_use]
265    pub fn class(&self) -> Option<&str> {
266        self.get_attribute("class")
267    }
268
269    /// Get the text content of this element and its descendants.
270    #[must_use]
271    pub fn text_content(&self) -> Option<Cow<'_, str>> {
272        let mut text = String::new();
273        self.collect_text(&mut text);
274        if text.is_empty() {
275            None
276        } else {
277            Some(Cow::Owned(text))
278        }
279    }
280
281    fn collect_text(&self, output: &mut String) {
282        for child in &self.children {
283            match child {
284                Node::Text(t) => output.push_str(&t.data),
285                Node::Element(e) => e.collect_text(output),
286                Node::Comment(_) => {}
287            }
288        }
289    }
290
291    /// Find the first descendant element with the given tag name.
292    #[must_use]
293    pub fn find_element(&self, tag_name: &str) -> Option<&Self> {
294        for child in &self.children {
295            if let Node::Element(e) = child {
296                if e.tag_name.eq_ignore_ascii_case(tag_name) {
297                    return Some(e);
298                }
299                if let Some(found) = e.find_element(tag_name) {
300                    return Some(found);
301                }
302            }
303        }
304        None
305    }
306
307    /// Find all descendant elements with the given tag name.
308    #[must_use]
309    pub fn find_all_elements(&self, tag_name: &str) -> Vec<&Self> {
310        let mut results = Vec::new();
311        self.collect_elements(tag_name, &mut results);
312        results
313    }
314
315    fn collect_elements<'a>(&'a self, tag_name: &str, results: &mut Vec<&'a Self>) {
316        for child in &self.children {
317            if let Node::Element(e) = child {
318                if e.tag_name.eq_ignore_ascii_case(tag_name) {
319                    results.push(e);
320                }
321                e.collect_elements(tag_name, results);
322            }
323        }
324    }
325
326    /// Find elements by class name.
327    #[must_use]
328    pub fn find_by_class(&self, class_name: &str) -> Vec<&Self> {
329        let mut results = Vec::new();
330        self.collect_by_class(class_name, &mut results);
331        results
332    }
333
334    fn collect_by_class<'a>(&'a self, class_name: &str, results: &mut Vec<&'a Self>) {
335        if let Some(class) = self.class() {
336            if class.split_whitespace().any(|c| c == class_name) {
337                results.push(self);
338            }
339        }
340        for child in &self.children {
341            if let Node::Element(e) = child {
342                e.collect_by_class(class_name, results);
343            }
344        }
345    }
346
347    /// Find element by id.
348    #[must_use]
349    pub fn find_by_id(&self, id: &str) -> Option<&Self> {
350        if self.id() == Some(id) {
351            return Some(self);
352        }
353        for child in &self.children {
354            if let Node::Element(e) = child {
355                if let Some(found) = e.find_by_id(id) {
356                    return Some(found);
357                }
358            }
359        }
360        None
361    }
362
363    /// Render this element to an HTML string.
364    #[must_use]
365    pub fn to_html(&self) -> String {
366        let mut output = String::new();
367        self.render_to(&mut output);
368        output
369    }
370
371    /// Render this element to an existing string buffer.
372    pub fn render_to(&self, output: &mut String) {
373        output.push('<');
374        output.push_str(&self.tag_name);
375
376        for attr in &self.attributes {
377            output.push(' ');
378            output.push_str(&attr.name);
379            if !attr.value.is_empty() {
380                output.push_str("=\"");
381                // Escape attribute value
382                for c in attr.value.chars() {
383                    match c {
384                        '"' => output.push_str("&quot;"),
385                        '&' => output.push_str("&amp;"),
386                        '<' => output.push_str("&lt;"),
387                        '>' => output.push_str("&gt;"),
388                        _ => output.push(c),
389                    }
390                }
391                output.push('"');
392            }
393        }
394
395        if self.is_void() {
396            output.push_str(" />");
397        } else {
398            output.push('>');
399            for child in &self.children {
400                child.render_to(output);
401            }
402            output.push_str("</");
403            output.push_str(&self.tag_name);
404            output.push('>');
405        }
406    }
407}
408
/// An attribute on an element.
///
/// Equality is exact on both fields: names that differ only in ASCII case
/// compare as different attributes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Attribute {
    /// The attribute name.
    pub name: String,
    /// The attribute value.
    pub value: String,
}

impl Attribute {
    /// Create a new attribute from a name and a value.
    ///
    /// Both are stored as given; no case normalization is applied.
    #[must_use]
    pub fn new(name: impl Into<String>, value: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            value: value.into(),
        }
    }
}
427
/// A text node.
///
/// Two text nodes compare equal when their data is exactly equal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Text {
    /// The text content.
    pub data: String,
}

impl Text {
    /// Create a new text node holding `data` as given.
    #[must_use]
    pub fn new(data: impl Into<String>) -> Self {
        Self { data: data.into() }
    }
}
441
/// A comment node.
///
/// Two comment nodes compare equal when their data is exactly equal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Comment {
    /// The comment content.
    pub data: String,
}

impl Comment {
    /// Create a new comment node holding `data` as given.
    #[must_use]
    pub fn new(data: impl Into<String>) -> Self {
        Self { data: data.into() }
    }
}
455
#[cfg(test)]
mod tests {
    use super::*;

    /// Tag names are normalized to ASCII lowercase on construction.
    #[test]
    fn test_element_new() {
        let upper = Element::new("DIV");
        assert_eq!(upper.tag_name, "div");
    }

    /// Attributes set by name can be read back, including via the `class`
    /// and `id` shortcuts.
    #[test]
    fn test_element_attributes() {
        let mut div = Element::new("div");
        for (name, value) in &[("class", "container"), ("id", "main")] {
            div.set_attribute(*name, *value);
        }

        assert_eq!(div.get_attribute("class"), Some("container"));
        assert_eq!(div.get_attribute("id"), Some("main"));
        assert_eq!(div.class(), Some("container"));
        assert_eq!(div.id(), Some("main"));
    }

    /// Void-ness depends only on the tag name.
    #[test]
    fn test_element_is_void() {
        for tag in &["br", "img", "input"] {
            assert!(Element::new(*tag).is_void());
        }
        for tag in &["div", "span"] {
            assert!(!Element::new(*tag).is_void());
        }
    }

    /// Adjacent text children are concatenated in order.
    #[test]
    fn test_element_text_content() {
        let mut paragraph = Element::new("p");
        paragraph.children.push(Node::Text(Text::new("Hello, ")));
        paragraph.children.push(Node::Text(Text::new("World!")));

        let content = paragraph.text_content();
        assert_eq!(content.as_deref(), Some("Hello, World!"));
    }

    /// A regular element renders its attributes, children and closing tag.
    #[test]
    fn test_element_render() {
        let mut div = Element::new("div");
        div.set_attribute("class", "test");
        div.children.push(Node::Text(Text::new("Hello")));

        let rendered = div.to_html();
        assert_eq!(rendered, r#"<div class="test">Hello</div>"#);
    }

    /// A void element renders in self-closing form without a closing tag.
    #[test]
    fn test_void_element_render() {
        let mut image = Element::new("img");
        image.set_attribute("src", "test.jpg");

        let rendered = image.to_html();
        assert_eq!(rendered, r#"<img src="test.jpg" />"#);
    }

    /// The doctype, when present, is emitted before the root element.
    #[test]
    fn test_document() {
        let mut document = Document::new();
        document.doctype = Some(DocumentType::html5());

        let rendered = document.to_html();
        assert!(rendered.starts_with("<!DOCTYPE html>"));
    }
}