ironhtml_parser/
tree_builder.rs

1//! HTML5 tree builder.
2//!
3//! This module implements the tree construction stage of HTML parsing.
4//!
5//! ## Reference
6//!
7//! - [Tree Construction](https://html.spec.whatwg.org/multipage/parsing.html#tree-construction)
8
9use alloc::string::String;
10use alloc::vec::Vec;
11
12use crate::dom::{Attribute, Comment, Document, DocumentType, Element, Node, Text};
13use crate::tokenizer::Token;
14
/// HTML5 tree builder.
///
/// Constructs a DOM tree from a stream of tokens. Tokens are fed in one at a
/// time through `process_token`; the result is taken out with `finish`
/// (whole document) or `finish_fragment` (children of the virtual root).
pub struct TreeBuilder {
    /// The document being built.
    document: Document,
    /// Stack of open elements, stored as a path of child indices starting at
    /// `document.root` (consumed by `navigate_to_element`). Entry 0 stands
    /// for the root itself; every later entry is the index of the open
    /// element inside its parent's `children`.
    open_elements: Vec<usize>,
    /// Parallel stack of open element tag names (for `pop_until`). Pushed and
    /// popped together with `open_elements`, so both have the same length.
    open_element_names: Vec<String>,
    /// Whether we're in fragment mode.
    fragment_mode: bool,
    /// Current insertion mode.
    insertion_mode: InsertionMode,
    /// Characters received since the last non-character token; flushed into
    /// the tree as a single text node by `flush_pending_text`.
    pending_text: String,
}
32
/// Insertion mode for the tree builder.
///
/// Selects how start tags, end tags and whitespace-only text are handled at
/// the current point of the token stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum InsertionMode {
    /// Start state; the only mode in which a doctype token is accepted.
    Initial,
    /// Waiting for `<html>`; any other start tag creates it implicitly.
    BeforeHtml,
    /// The root exists; waiting for `<head>`, created implicitly if needed.
    BeforeHead,
    /// Inside `<head>`.
    InHead,
    /// `<head>` is closed; waiting for `<body>`, created implicitly if needed.
    AfterHead,
    /// Regular content; also the mode used for fragment parsing.
    InBody,
    /// A `</body>` or `</html>` end tag was seen while in `InBody`.
    AfterBody,
    /// A `</html>` end tag was seen while in `AfterBody`.
    AfterAfterBody,
}
45
46/// Navigate from the document root to the element described by `path`.
47///
48/// `path[0]` is the root sentinel (skipped); subsequent entries are
49/// child indices at each nesting level.
50fn navigate_to_element<'a>(root: &'a mut Element, path: &[usize]) -> &'a mut Element {
51    let mut current = root;
52
53    for &idx in path.iter().skip(1) {
54        if idx < current.children.len() && matches!(current.children[idx], Node::Element(_)) {
55            current = match &mut current.children[idx] {
56                Node::Element(elem) => elem,
57                _ => unreachable!(),
58            };
59        } else {
60            break;
61        }
62    }
63
64    current
65}
66
67impl TreeBuilder {
68    /// Create a new tree builder.
69    #[must_use]
70    pub fn new() -> Self {
71        Self {
72            document: Document::new(),
73            open_elements: Vec::new(),
74            open_element_names: Vec::new(),
75            fragment_mode: false,
76            insertion_mode: InsertionMode::Initial,
77            pending_text: String::new(),
78        }
79    }
80
81    /// Set fragment mode (for parsing HTML fragments).
82    ///
83    /// In fragment mode the tree builder skips implicit `<html>`,
84    /// `<head>`, and `<body>` creation and inserts directly into
85    /// `document.root` which acts as a virtual container.
86    pub fn set_fragment_mode(&mut self, fragment: bool) {
87        self.fragment_mode = fragment;
88        if fragment {
89            self.insertion_mode = InsertionMode::InBody;
90            // Push a sentinel so navigate_to_element starts from root.
91            self.open_elements.push(0);
92            self.open_element_names.push(String::new());
93        }
94    }
95
96    /// Process a token.
97    pub fn process_token(&mut self, token: Token) {
98        // Flush pending text before processing non-character tokens
99        match &token {
100            Token::Character(_) => {}
101            _ => self.flush_pending_text(),
102        }
103
104        match token {
105            Token::Doctype {
106                name,
107                public_id,
108                system_id,
109            } => {
110                self.process_doctype(name, public_id, system_id);
111            }
112            Token::StartTag {
113                name,
114                attributes,
115                self_closing,
116            } => {
117                self.process_start_tag(&name, attributes, self_closing);
118            }
119            Token::EndTag { name } => {
120                self.process_end_tag(&name);
121            }
122            Token::Comment(data) => {
123                self.process_comment(data);
124            }
125            Token::Character(c) => {
126                self.pending_text.push(c);
127            }
128            Token::Eof => {
129                self.flush_pending_text();
130            }
131        }
132    }
133
134    fn flush_pending_text(&mut self) {
135        if self.pending_text.is_empty() {
136            return;
137        }
138
139        let text = core::mem::take(&mut self.pending_text);
140
141        // Skip whitespace-only text in certain modes
142        if text.chars().all(|c| c.is_ascii_whitespace()) {
143            match self.insertion_mode {
144                InsertionMode::Initial
145                | InsertionMode::BeforeHtml
146                | InsertionMode::BeforeHead
147                | InsertionMode::AfterHead
148                | InsertionMode::AfterBody
149                | InsertionMode::AfterAfterBody => return,
150                _ => {}
151            }
152        }
153
154        self.insert_text(text);
155    }
156
157    fn process_doctype(
158        &mut self,
159        name: Option<String>,
160        public_id: Option<String>,
161        system_id: Option<String>,
162    ) {
163        if self.insertion_mode == InsertionMode::Initial {
164            self.document.doctype = Some(DocumentType {
165                name: name.unwrap_or_default(),
166                public_id,
167                system_id,
168            });
169            self.insertion_mode = InsertionMode::BeforeHtml;
170        }
171    }
172
173    #[allow(clippy::too_many_lines, clippy::only_used_in_recursion)]
174    fn process_start_tag(
175        &mut self,
176        name: &str,
177        attributes: Vec<(String, String)>,
178        self_closing: bool,
179    ) {
180        let name_lower = name.to_ascii_lowercase();
181
182        match self.insertion_mode {
183            InsertionMode::Initial => {
184                // Implicitly create doctype and html
185                self.insertion_mode = InsertionMode::BeforeHtml;
186                self.process_start_tag(name, attributes, self_closing);
187            }
188
189            InsertionMode::BeforeHtml => {
190                if name_lower == "html" {
191                    self.create_html_element(attributes);
192                    self.insertion_mode = InsertionMode::BeforeHead;
193                } else {
194                    // Implicitly create html element
195                    self.create_html_element(Vec::new());
196                    self.insertion_mode = InsertionMode::BeforeHead;
197                    self.process_start_tag(name, attributes, self_closing);
198                }
199            }
200
201            InsertionMode::BeforeHead => {
202                if name_lower == "head" {
203                    self.insert_element(&name_lower, attributes);
204                    self.insertion_mode = InsertionMode::InHead;
205                } else if name_lower == "html" {
206                    // Merge attributes with existing html element
207                    for (key, value) in attributes {
208                        self.document.root.set_attribute(key, value);
209                    }
210                } else {
211                    // Implicitly create head element
212                    self.insert_element("head", Vec::new());
213                    self.insertion_mode = InsertionMode::InHead;
214                    self.process_start_tag(name, attributes, self_closing);
215                }
216            }
217
218            InsertionMode::InHead => {
219                match name_lower.as_str() {
220                    "meta" | "link" | "base" => {
221                        self.insert_element(&name_lower, attributes);
222                        self.pop_element();
223                    }
224                    "title" | "style" | "script" | "noscript" => {
225                        self.insert_element(&name_lower, attributes);
226                    }
227                    "head" => {
228                        // Ignore duplicate head
229                    }
230                    "body" => {
231                        self.pop_element(); // pop head
232                        self.insertion_mode = InsertionMode::AfterHead;
233                        self.process_start_tag(name, attributes, self_closing);
234                    }
235                    _ => {
236                        // Implicitly close head and switch to after head
237                        self.pop_element();
238                        self.insertion_mode = InsertionMode::AfterHead;
239                        self.process_start_tag(name, attributes, self_closing);
240                    }
241                }
242            }
243
244            InsertionMode::AfterHead => {
245                if name_lower == "body" {
246                    self.insert_element(&name_lower, attributes);
247                    self.insertion_mode = InsertionMode::InBody;
248                } else if name_lower == "html" {
249                    // Merge attributes
250                    for (key, value) in attributes {
251                        self.document.root.set_attribute(key, value);
252                    }
253                } else {
254                    // Implicitly create body
255                    self.insert_element("body", Vec::new());
256                    self.insertion_mode = InsertionMode::InBody;
257                    self.process_start_tag(name, attributes, self_closing);
258                }
259            }
260
261            InsertionMode::InBody => {
262                let is_void = matches!(
263                    name_lower.as_str(),
264                    "area"
265                        | "base"
266                        | "br"
267                        | "col"
268                        | "embed"
269                        | "hr"
270                        | "img"
271                        | "input"
272                        | "link"
273                        | "meta"
274                        | "source"
275                        | "track"
276                        | "wbr"
277                );
278
279                self.insert_element(&name_lower, attributes);
280
281                if is_void {
282                    self.pop_element();
283                }
284            }
285
286            InsertionMode::AfterBody => {
287                if name_lower == "html" {
288                    // Merge attributes
289                    for (key, value) in attributes {
290                        self.document.root.set_attribute(key, value);
291                    }
292                } else {
293                    self.insertion_mode = InsertionMode::InBody;
294                    self.process_start_tag(name, attributes, self_closing);
295                }
296            }
297
298            InsertionMode::AfterAfterBody => {
299                self.insertion_mode = InsertionMode::InBody;
300                self.process_start_tag(name, attributes, self_closing);
301            }
302        }
303    }
304
305    fn process_end_tag(&mut self, name: &str) {
306        let name_lower = name.to_ascii_lowercase();
307
308        match self.insertion_mode {
309            InsertionMode::InHead => {
310                if name_lower == "head" {
311                    self.pop_element();
312                    self.insertion_mode = InsertionMode::AfterHead;
313                }
314            }
315
316            InsertionMode::InBody => {
317                if name_lower == "body" || name_lower == "html" {
318                    self.insertion_mode = InsertionMode::AfterBody;
319                } else {
320                    // Pop elements until we find the matching start tag
321                    self.pop_until(&name_lower);
322                }
323            }
324
325            InsertionMode::AfterBody => {
326                if name_lower == "html" {
327                    self.insertion_mode = InsertionMode::AfterAfterBody;
328                }
329            }
330
331            _ => {}
332        }
333    }
334
335    fn process_comment(&mut self, data: String) {
336        let comment = Node::Comment(Comment::new(data));
337        self.insert_into_current(comment);
338    }
339
340    fn create_html_element(&mut self, attributes: Vec<(String, String)>) {
341        self.document.root = Element::new("html");
342        for (key, value) in attributes {
343            self.document
344                .root
345                .attributes
346                .push(Attribute::new(key, value));
347        }
348        self.open_elements.push(0); // html is always index 0
349        self.open_element_names.push(String::from("html"));
350    }
351
352    fn insert_element(&mut self, tag_name: &str, attributes: Vec<(String, String)>) {
353        let mut element = Element::new(tag_name);
354        for (key, value) in attributes {
355            element.attributes.push(Attribute::new(key, value));
356        }
357
358        let node = Node::Element(element);
359        let idx = self.insert_into_current(node);
360        self.open_elements.push(idx);
361        self.open_element_names.push(String::from(tag_name));
362    }
363
364    fn insert_into_current(&mut self, node: Node) -> usize {
365        let parent = navigate_to_element(&mut self.document.root, &self.open_elements);
366        let idx = parent.children.len();
367        parent.children.push(node);
368        idx
369    }
370
371    fn insert_text(&mut self, text: String) {
372        let text_node = Node::Text(Text::new(text));
373        self.insert_into_current(text_node);
374    }
375
376    fn pop_element(&mut self) {
377        self.open_elements.pop();
378        self.open_element_names.pop();
379    }
380
381    /// Pop elements from the stack until one matching `tag_name` is found
382    /// and popped. Never pops the root sentinel.
383    fn pop_until(&mut self, tag_name: &str) {
384        while self.open_element_names.len() > 1 {
385            if self.open_element_names.last().map(String::as_str) == Some(tag_name) {
386                self.open_elements.pop();
387                self.open_element_names.pop();
388                return;
389            }
390            self.open_elements.pop();
391            self.open_element_names.pop();
392        }
393    }
394
395    /// Finish building and return the document.
396    #[must_use]
397    pub fn finish(mut self) -> Document {
398        self.flush_pending_text();
399        self.document
400    }
401
402    /// Finish building and return fragment nodes.
403    #[must_use]
404    pub fn finish_fragment(mut self) -> Vec<Node> {
405        self.flush_pending_text();
406        self.document.root.children
407    }
408}
409
410impl Default for TreeBuilder {
411    fn default() -> Self {
412        Self::new()
413    }
414}
415
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizer::Tokenizer;

    /// Tokenize `html` and build a full document from it.
    fn parse(html: &str) -> Document {
        let tokenizer = Tokenizer::new(html);
        let mut builder = TreeBuilder::new();
        for token in tokenizer {
            builder.process_token(token);
        }
        builder.finish()
    }

    /// Tokenize `html` and build it in fragment mode, returning the
    /// top-level nodes.
    fn parse_fragment(html: &str) -> Vec<Node> {
        let tokenizer = Tokenizer::new(html);
        let mut builder = TreeBuilder::new();
        builder.set_fragment_mode(true);
        for token in tokenizer {
            builder.process_token(token);
        }
        builder.finish_fragment()
    }

    #[test]
    fn test_simple_document() {
        let doc = parse(
            "<!DOCTYPE html><html><head><title>Test</title>\
             </head><body><p>Hello</p></body></html>",
        );
        assert!(doc.doctype.is_some());
        assert_eq!(doc.doctype.as_ref().unwrap().name, "html");
        assert_eq!(doc.root.tag_name, "html");
    }

    #[test]
    fn test_implicit_html() {
        let doc = parse("<p>Hello</p>");
        assert_eq!(doc.root.tag_name, "html");
        assert!(doc.body().is_some());
    }

    #[test]
    fn test_fragment() {
        let nodes = parse_fragment("<div><span>Hello</span></div>");
        assert_eq!(nodes.len(), 1);
        // Unwrap instead of `if let` so a wrong node kind fails the test.
        let div = nodes[0].as_element().unwrap();
        assert_eq!(div.tag_name, "div");
    }

    #[test]
    fn test_text_content() {
        let doc = parse("<p>Hello World</p>");
        let body = doc.body().unwrap();
        let p = body.find_element("p").unwrap();
        assert_eq!(p.text_content(), Some("Hello World".into()));
    }

    #[test]
    fn test_attributes() {
        let nodes = parse_fragment(r#"<div class="container" id="main"></div>"#);
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        assert_eq!(div.get_attribute("class"), Some("container"));
        assert_eq!(div.get_attribute("id"), Some("main"));
    }

    #[test]
    fn test_pop_until_nested() {
        let nodes = parse_fragment("<div><span>Hello</span> World</div>");
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        assert_eq!(div.tag_name, "div");
        assert_eq!(div.children.len(), 2);
        let span = div.children[0].as_element().unwrap();
        assert_eq!(span.tag_name, "span");
        assert_eq!(span.text_content(), Some("Hello".into()));
    }

    #[test]
    fn test_deeply_nested_fragment() {
        let nodes = parse_fragment("<div><ul><li><span>Deep</span></li></ul></div>");
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        let ul = div.find_element("ul").unwrap();
        let li = ul.find_element("li").unwrap();
        let span = li.find_element("span").unwrap();
        assert_eq!(span.text_content(), Some("Deep".into()));
    }

    #[test]
    fn test_fragment_void_elements() {
        let nodes = parse_fragment("<div><br><span>After</span></div>");
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        // br is void, should not nest span inside it
        assert_eq!(div.children.len(), 2);
        let br = div.children[0].as_element().unwrap();
        assert_eq!(br.tag_name, "br");
        assert!(br.children.is_empty());
    }

    #[test]
    fn test_fragment_multiple_top_level() {
        let nodes = parse_fragment("<p>One</p><p>Two</p><p>Three</p>");
        assert_eq!(nodes.len(), 3);
    }

    #[test]
    fn test_many_children_fragment() {
        use core::fmt::Write;
        let mut html = String::from("<div>");
        for i in 0..1100 {
            let _ = write!(html, "<span>{i}</span>");
        }
        html.push_str("</div>");
        let nodes = parse_fragment(&html);
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        assert_eq!(div.children.len(), 1100);
    }

    #[test]
    fn test_unmatched_end_tag() {
        // Unmatched </span> should not crash or empty the stack
        let nodes = parse_fragment("<div>Hello</span></div>");
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        assert_eq!(div.tag_name, "div");
    }

    // ── pop_until tests ──────────────────────────────────────────────

    #[test]
    fn test_pop_until_skips_intermediate() {
        // </div> should pop both <em> and <span>, closing at <div>
        let nodes = parse_fragment("<div><span><em>Text</div>");
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        assert_eq!(div.tag_name, "div");
        // span was opened, em was opened inside span, then </div>
        // pops em, span, div
        let span = div.find_element("span").unwrap();
        let em = span.find_element("em").unwrap();
        assert_eq!(em.text_content(), Some("Text".into()));
    }

    #[test]
    fn test_pop_until_no_match_preserves_root() {
        // </nonexistent> pops elements but stops at root sentinel
        let nodes = parse_fragment("<div><p>Hello</p></nonexistent></div>");
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        assert_eq!(div.tag_name, "div");
    }

    #[test]
    fn test_pop_until_closes_correct_level() {
        // Nested <div>s: inner </div> should only close the inner one
        let nodes = parse_fragment("<div><div><span>Inner</span></div><span>Outer</span></div>");
        assert_eq!(nodes.len(), 1);
        let outer = nodes[0].as_element().unwrap();
        assert_eq!(outer.tag_name, "div");
        assert_eq!(outer.children.len(), 2);
        // First child: inner div
        let inner = outer.children[0].as_element().unwrap();
        assert_eq!(inner.tag_name, "div");
        assert_eq!(
            inner.find_element("span").unwrap().text_content(),
            Some("Inner".into())
        );
        // Second child: outer span (after inner div was closed)
        let outer_span = outer.children[1].as_element().unwrap();
        assert_eq!(outer_span.tag_name, "span");
        assert_eq!(outer_span.text_content(), Some("Outer".into()));
    }

    #[test]
    fn test_pop_until_multiple_same_tag() {
        // Three nested <span>s, one </span> closes only the innermost
        let nodes = parse_fragment("<div><span><span><span>Deep</span>Mid</span>Top</span></div>");
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        let s1 = div.find_element("span").unwrap();
        let s2 = s1.find_element("span").unwrap();
        let s3 = s2.find_element("span").unwrap();
        assert_eq!(s3.text_content(), Some("Deep".into()));
        // "Mid" is text after inner span closes, inside middle span
        assert!(s2.children.len() >= 2);
        // "Top" is text after middle span closes, inside outer span
        assert!(s1.children.len() >= 2);
    }

    // ── fragment nesting tests ───────────────────────────────────────

    #[test]
    fn test_fragment_five_levels_deep() {
        let nodes = parse_fragment(
            "<div><section><article><header><h1>Title</h1>\
             </header></article></section></div>",
        );
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        let section = div.find_element("section").unwrap();
        let article = section.find_element("article").unwrap();
        let header = article.find_element("header").unwrap();
        let h1 = header.find_element("h1").unwrap();
        assert_eq!(h1.text_content(), Some("Title".into()));
    }

    #[test]
    fn test_fragment_text_at_every_level() {
        let nodes = parse_fragment("<div>A<span>B<em>C</em>D</span>E</div>");
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        // div has: text("A"), span, text("E")
        assert_eq!(div.children.len(), 3);
        assert_eq!(div.children[0].as_text().unwrap().data, "A");
        let span = div.children[1].as_element().unwrap();
        // span has: text("B"), em, text("D")
        assert_eq!(span.children.len(), 3);
        assert_eq!(span.children[0].as_text().unwrap().data, "B");
        let em = span.children[1].as_element().unwrap();
        assert_eq!(em.text_content(), Some("C".into()));
        assert_eq!(span.children[2].as_text().unwrap().data, "D");
        assert_eq!(div.children[2].as_text().unwrap().data, "E");
    }

    #[test]
    fn test_fragment_siblings_with_children() {
        let nodes = parse_fragment("<ul><li>One<em>!</em></li><li>Two</li><li>Three</li></ul>");
        assert_eq!(nodes.len(), 1);
        let ul = nodes[0].as_element().unwrap();
        assert_eq!(ul.children.len(), 3);
        // First li has text + em
        let li1 = ul.children[0].as_element().unwrap();
        assert_eq!(li1.children.len(), 2);
        assert_eq!(li1.children[0].as_text().unwrap().data, "One");
        assert_eq!(
            li1.children[1].as_element().unwrap().text_content(),
            Some("!".into())
        );
        // Second and third are simple
        let li2 = ul.children[1].as_element().unwrap();
        assert_eq!(li2.text_content(), Some("Two".into()));
        let li3 = ul.children[2].as_element().unwrap();
        assert_eq!(li3.text_content(), Some("Three".into()));
    }

    // ── fragment void element tests ──────────────────────────────────

    #[test]
    fn test_fragment_multiple_void_elements() {
        let nodes = parse_fragment("<div><br><hr><img><input></div>");
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        assert_eq!(div.children.len(), 4);
        assert_eq!(div.children[0].as_element().unwrap().tag_name, "br");
        assert_eq!(div.children[1].as_element().unwrap().tag_name, "hr");
        assert_eq!(div.children[2].as_element().unwrap().tag_name, "img");
        assert_eq!(div.children[3].as_element().unwrap().tag_name, "input");
        // None should have children
        for child in &div.children {
            assert!(child.as_element().unwrap().children.is_empty());
        }
    }

    #[test]
    fn test_fragment_void_between_text() {
        let nodes = parse_fragment("<p>Before<br>After</p>");
        assert_eq!(nodes.len(), 1);
        let p = nodes[0].as_element().unwrap();
        assert_eq!(p.children.len(), 3);
        assert_eq!(p.children[0].as_text().unwrap().data, "Before");
        assert_eq!(p.children[1].as_element().unwrap().tag_name, "br");
        assert_eq!(p.children[2].as_text().unwrap().data, "After");
    }

    #[test]
    fn test_fragment_void_with_attributes() {
        let nodes = parse_fragment(r#"<div><img src="a.png" alt="test"><br></div>"#);
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        let img = div.children[0].as_element().unwrap();
        assert_eq!(img.get_attribute("src"), Some("a.png"));
        assert_eq!(img.get_attribute("alt"), Some("test"));
        assert!(img.children.is_empty());
    }

    #[test]
    fn test_fragment_void_nested_inside() {
        // Void element inside a nested structure
        let nodes = parse_fragment("<table><tr><td><input></td></tr></table>");
        assert_eq!(nodes.len(), 1);
        let table = nodes[0].as_element().unwrap();
        let tr = table.find_element("tr").unwrap();
        let td = tr.find_element("td").unwrap();
        let input = td.find_element("input").unwrap();
        assert!(input.children.is_empty());
    }

    // ── fragment comment tests ───────────────────────────────────────

    #[test]
    fn test_fragment_comment_top_level() {
        let nodes = parse_fragment("<!-- top --><div>Hi</div>");
        assert_eq!(nodes.len(), 2);
        assert!(matches!(nodes[0], Node::Comment(_)));
        if let Node::Comment(c) = &nodes[0] {
            assert_eq!(c.data, " top ");
        }
        assert_eq!(nodes[1].as_element().unwrap().tag_name, "div");
    }

    #[test]
    fn test_fragment_comment_inside_element() {
        let nodes = parse_fragment("<div><!-- inside --></div>");
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        assert_eq!(div.children.len(), 1);
        assert!(matches!(div.children[0], Node::Comment(_)));
    }

    #[test]
    fn test_fragment_comment_between_elements() {
        let nodes = parse_fragment("<ul><li>A</li><!-- sep --><li>B</li></ul>");
        assert_eq!(nodes.len(), 1);
        let ul = nodes[0].as_element().unwrap();
        assert_eq!(ul.children.len(), 3);
        assert_eq!(ul.children[0].as_element().unwrap().tag_name, "li");
        assert!(matches!(ul.children[1], Node::Comment(_)));
        assert_eq!(ul.children[2].as_element().unwrap().tag_name, "li");
    }

    // ── fragment top-level variety tests ─────────────────────────────

    #[test]
    fn test_fragment_text_only() {
        let nodes = parse_fragment("Just text");
        assert_eq!(nodes.len(), 1);
        assert_eq!(nodes[0].as_text().unwrap().data, "Just text");
    }

    #[test]
    fn test_fragment_mixed_top_level() {
        let nodes = parse_fragment("Hello <em>world</em> and <strong>more</strong>!");
        // text, em, text, strong, text
        assert_eq!(nodes.len(), 5);
        assert_eq!(nodes[0].as_text().unwrap().data, "Hello ");
        assert_eq!(nodes[1].as_element().unwrap().tag_name, "em");
        assert_eq!(nodes[2].as_text().unwrap().data, " and ");
        assert_eq!(nodes[3].as_element().unwrap().tag_name, "strong");
        assert_eq!(nodes[4].as_text().unwrap().data, "!");
    }

    #[test]
    fn test_fragment_empty() {
        let nodes = parse_fragment("");
        assert!(nodes.is_empty());
    }

    #[test]
    fn test_fragment_whitespace_only() {
        // Whitespace in InBody mode is NOT skipped
        let nodes = parse_fragment("   ");
        assert_eq!(nodes.len(), 1);
        assert_eq!(nodes[0].as_text().unwrap().data, "   ");
    }

    // ── malformed input tests ────────────────────────────────────────

    #[test]
    fn test_malformed_only_end_tags() {
        let nodes = parse_fragment("</div></span></p>");
        // No start tags to match, nothing produced
        assert!(nodes.is_empty());
    }

    #[test]
    fn test_malformed_extra_end_tags() {
        let nodes = parse_fragment("<div>Hello</div></div></div></div>");
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        assert_eq!(div.text_content(), Some("Hello".into()));
    }

    #[test]
    fn test_malformed_unclosed_tags() {
        // Tags that are never closed
        let nodes = parse_fragment("<div><span><em>Text");
        assert_eq!(nodes.len(), 1);
        let div = nodes[0].as_element().unwrap();
        let span = div.find_element("span").unwrap();
        let em = span.find_element("em").unwrap();
        assert_eq!(em.text_content(), Some("Text".into()));
    }

    #[test]
    fn test_malformed_interleaved_tags() {
        // <b><i></b></i> - interleaved close order
        let nodes = parse_fragment("<b><i>Text</b>After</i>");
        // After </b> pops both i and b (pop_until finds b).
        // "After" goes to root since both are closed.
        // </i> is unmatched, ignored.
        assert!(!nodes.is_empty());
        let b = nodes[0].as_element().unwrap();
        assert_eq!(b.tag_name, "b");
    }

    #[test]
    fn test_malformed_deeply_mismatched() {
        let nodes = parse_fragment("<a><b><c><d><e>Text</a>");
        // </a> pops e, d, c, b, a
        assert_eq!(nodes.len(), 1);
        let a = nodes[0].as_element().unwrap();
        assert_eq!(a.tag_name, "a");
        assert!(a.find_element("e").is_some());
    }

    // ── full document tests ──────────────────────────────────────────

    #[test]
    fn test_document_head_elements() {
        let doc = parse(
            r#"<!DOCTYPE html><html><head>
            <title>Test</title>
            <meta charset="utf-8">
            <link rel="stylesheet" href="style.css">
            </head><body></body></html>"#,
        );
        let head = doc.head().unwrap();
        assert!(head.find_element("title").is_some());
        assert!(head.find_element("meta").is_some());
        assert!(head.find_element("link").is_some());
    }

    #[test]
    fn test_document_implicit_body() {
        // No explicit <body> tag, elements go into implicit body
        let doc = parse("<html><head></head><div>Content</div></html>");
        let body = doc.body().unwrap();
        let div = body.find_element("div").unwrap();
        assert_eq!(div.text_content(), Some("Content".into()));
    }

    #[test]
    fn test_document_implicit_head_and_body() {
        // No head or body, just content
        let doc = parse("<div>Content</div>");
        assert_eq!(doc.root.tag_name, "html");
        assert!(doc.head().is_some());
        assert!(doc.body().is_some());
        let body = doc.body().unwrap();
        let div = body.find_element("div").unwrap();
        assert_eq!(div.text_content(), Some("Content".into()));
    }

    #[test]
    fn test_document_title() {
        let doc = parse(
            "<!DOCTYPE html><html><head><title>Hello World</title></head>\
             <body></body></html>",
        );
        assert_eq!(doc.title(), Some(String::from("Hello World")));
    }

    #[test]
    fn test_document_round_trip() {
        let html = "<!DOCTYPE html><html><head><title>Test</title></head>\
                     <body><p>Hello</p></body></html>";
        let doc = parse(html);
        let output = doc.to_html();
        // Re-parse the output and verify structure
        let doc2 = parse(&output);
        assert_eq!(doc2.title(), Some(String::from("Test")));
        let body = doc2.body().unwrap();
        let p = body.find_element("p").unwrap();
        assert_eq!(p.text_content(), Some("Hello".into()));
    }
}