ironhtml_parser/
validator.rs

//! HTML validation.
//!
//! This module provides validation for parsed HTML documents,
//! checking for common issues such as missing required attributes,
//! deprecated elements, duplicate IDs, and invalid attribute values.
//!
//! ## Reference
//!
//! - [HTML Standard: content models](https://html.spec.whatwg.org/multipage/dom.html#content-models)
9
10use alloc::string::{String, ToString};
11use alloc::vec::Vec;
12
13use crate::dom::{Document, Element, Node};
14
15/// A validation error.
16#[derive(Debug, Clone, PartialEq, Eq)]
17pub struct ValidationError {
18    /// The type of error.
19    pub kind: ValidationErrorKind,
20    /// The tag name of the element with the error.
21    pub element: String,
22    /// A human-readable description of the error.
23    pub message: String,
24}
25
/// The category a [`ValidationError`] belongs to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ValidationErrorKind {
    /// An attribute the element needs is absent.
    MissingRequiredAttribute,
    /// An attribute is present but its value is not acceptable.
    InvalidAttributeValue,
    /// The element itself is deprecated.
    DeprecatedElement,
    /// The element appears under a parent that may not contain it.
    InvalidNesting,
    /// The same `id` value was already used by an earlier element.
    DuplicateId,
}
40
41/// Validation result containing all errors.
42pub type ValidationResult = Vec<ValidationError>;
43
44/// HTML validator.
45pub struct Validator {
46    /// Collected errors.
47    errors: Vec<ValidationError>,
48    /// Seen IDs for duplicate detection.
49    seen_ids: Vec<String>,
50}
51
52impl Validator {
53    /// Create a new validator.
54    #[must_use]
55    pub const fn new() -> Self {
56        Self {
57            errors: Vec::new(),
58            seen_ids: Vec::new(),
59        }
60    }
61
62    /// Validate a document.
63    #[must_use]
64    pub fn validate(mut self, doc: &Document) -> Vec<ValidationError> {
65        self.validate_element(&doc.root);
66        self.errors
67    }
68
69    /// Validate a list of nodes (for fragments).
70    #[must_use]
71    pub fn validate_nodes(mut self, nodes: &[Node]) -> Vec<ValidationError> {
72        for node in nodes {
73            if let Node::Element(elem) = node {
74                self.validate_element(elem);
75            }
76        }
77        self.errors
78    }
79
80    fn validate_element(&mut self, elem: &Element) {
81        // Check for deprecated elements
82        self.check_deprecated(elem);
83
84        // Check required attributes
85        self.check_required_attributes(elem);
86
87        // Check for duplicate IDs
88        self.check_duplicate_id(elem);
89
90        // Check attribute values
91        self.check_attribute_values(elem);
92
93        // Recursively validate children
94        for child in &elem.children {
95            if let Node::Element(child_elem) = child {
96                self.validate_element(child_elem);
97            }
98        }
99    }
100
101    fn check_deprecated(&mut self, elem: &Element) {
102        let deprecated = matches!(
103            elem.tag_name.as_str(),
104            "acronym"
105                | "applet"
106                | "basefont"
107                | "bgsound"
108                | "big"
109                | "blink"
110                | "center"
111                | "font"
112                | "frame"
113                | "frameset"
114                | "isindex"
115                | "keygen"
116                | "listing"
117                | "marquee"
118                | "menuitem"
119                | "multicol"
120                | "nextid"
121                | "nobr"
122                | "noembed"
123                | "noframes"
124                | "plaintext"
125                | "rb"
126                | "rtc"
127                | "spacer"
128                | "strike"
129                | "tt"
130                | "xmp"
131        );
132
133        if deprecated {
134            self.errors.push(ValidationError {
135                kind: ValidationErrorKind::DeprecatedElement,
136                element: elem.tag_name.clone(),
137                message: alloc::format!(
138                    "The <{}> element is deprecated and should not be used",
139                    elem.tag_name,
140                ),
141            });
142        }
143    }
144
145    #[allow(clippy::too_many_lines, clippy::match_same_arms)]
146    fn check_required_attributes(&mut self, elem: &Element) {
147        match elem.tag_name.as_str() {
148            "img" => {
149                if !elem.has_attribute("src") {
150                    self.errors.push(ValidationError {
151                        kind: ValidationErrorKind::MissingRequiredAttribute,
152                        element: elem.tag_name.clone(),
153                        message: "The <img> element requires a 'src' attribute".into(),
154                    });
155                }
156                if !elem.has_attribute("alt") {
157                    self.errors.push(ValidationError {
158                        kind: ValidationErrorKind::MissingRequiredAttribute,
159                        element: elem.tag_name.clone(),
160                        message:
161                            "The <img> element should have an 'alt' attribute for accessibility"
162                                .into(),
163                    });
164                }
165            }
166            "a" => {
167                // href is not strictly required (can be a placeholder link)
168                // but we could warn if it's missing
169            }
170            "input" => {
171                // Check for label association if not a hidden input
172                if elem.get_attribute("type") != Some("hidden") && !elem.has_attribute("id") {
173                    // This is a soft warning - input should have id for label association
174                }
175            }
176            "script" => {
177                // script requires either src or inline content
178            }
179            "link" => {
180                if elem.get_attribute("rel") == Some("stylesheet") && !elem.has_attribute("href") {
181                    self.errors.push(ValidationError {
182                        kind: ValidationErrorKind::MissingRequiredAttribute,
183                        element: elem.tag_name.clone(),
184                        message:
185                            "The <link rel=\"stylesheet\"> element requires an 'href' attribute"
186                                .into(),
187                    });
188                }
189            }
190            "form" => {
191                if !elem.has_attribute("action") {
192                    // action is technically optional in HTML5, defaults to current URL
193                }
194            }
195            "iframe" => {
196                if !elem.has_attribute("src") && !elem.has_attribute("srcdoc") {
197                    self.errors.push(ValidationError {
198                        kind: ValidationErrorKind::MissingRequiredAttribute,
199                        element: elem.tag_name.clone(),
200                        message: "The <iframe> element requires either 'src' or 'srcdoc' attribute"
201                            .into(),
202                    });
203                }
204            }
205            "video" | "audio" => {
206                // Should have src attribute or source children
207                if !elem.has_attribute("src")
208                    && !elem
209                        .children
210                        .iter()
211                        .any(|c| matches!(c, Node::Element(e) if e.tag_name == "source"))
212                {
213                    self.errors.push(ValidationError {
214                        kind: ValidationErrorKind::MissingRequiredAttribute,
215                        element: elem.tag_name.clone(),
216                        message: alloc::format!(
217                            "The <{}> element requires either 'src' attribute or <source> children",
218                            elem.tag_name
219                        ),
220                    });
221                }
222            }
223            "meta" => {
224                // meta should have either charset, name+content, http-equiv+content, or itemprop
225                let has_charset = elem.has_attribute("charset");
226                let has_name = elem.has_attribute("name");
227                let has_http_equiv = elem.has_attribute("http-equiv");
228                let has_content = elem.has_attribute("content");
229                let has_itemprop = elem.has_attribute("itemprop");
230
231                if !has_charset && !has_itemprop && (has_name || has_http_equiv) && !has_content {
232                    self.errors.push(ValidationError {
233                        kind: ValidationErrorKind::MissingRequiredAttribute,
234                        element: elem.tag_name.clone(),
235                        message: "The <meta> element with 'name' or 'http-equiv' requires a 'content' attribute".into(),
236                    });
237                }
238            }
239            "area" => {
240                if !elem.has_attribute("alt") {
241                    self.errors.push(ValidationError {
242                        kind: ValidationErrorKind::MissingRequiredAttribute,
243                        element: elem.tag_name.clone(),
244                        message: "The <area> element requires an 'alt' attribute".into(),
245                    });
246                }
247            }
248            "optgroup" => {
249                if !elem.has_attribute("label") {
250                    self.errors.push(ValidationError {
251                        kind: ValidationErrorKind::MissingRequiredAttribute,
252                        element: elem.tag_name.clone(),
253                        message: "The <optgroup> element requires a 'label' attribute".into(),
254                    });
255                }
256            }
257            "progress" => {
258                // value and max are optional but recommended
259            }
260            "time" => {
261                // datetime attribute is recommended if content is not machine-readable
262            }
263            _ => {}
264        }
265    }
266
267    fn check_duplicate_id(&mut self, elem: &Element) {
268        if let Some(id) = elem.id() {
269            if self.seen_ids.contains(&id.to_string()) {
270                self.errors.push(ValidationError {
271                    kind: ValidationErrorKind::DuplicateId,
272                    element: elem.tag_name.clone(),
273                    message: alloc::format!("Duplicate id '{id}' found"),
274                });
275            } else {
276                self.seen_ids.push(id.to_string());
277            }
278        }
279    }
280
281    fn check_attribute_values(&mut self, elem: &Element) {
282        // Check for empty required values
283        if let Some(id) = elem.get_attribute("id") {
284            if id.is_empty() {
285                self.errors.push(ValidationError {
286                    kind: ValidationErrorKind::InvalidAttributeValue,
287                    element: elem.tag_name.clone(),
288                    message: "The 'id' attribute must not be empty".into(),
289                });
290            } else if id.contains(char::is_whitespace) {
291                self.errors.push(ValidationError {
292                    kind: ValidationErrorKind::InvalidAttributeValue,
293                    element: elem.tag_name.clone(),
294                    message: "The 'id' attribute must not contain whitespace".into(),
295                });
296            }
297        }
298
299        // Check input type values
300        if elem.tag_name == "input" {
301            if let Some(input_type) = elem.get_attribute("type") {
302                let valid_types = [
303                    "button",
304                    "checkbox",
305                    "color",
306                    "date",
307                    "datetime-local",
308                    "email",
309                    "file",
310                    "hidden",
311                    "image",
312                    "month",
313                    "number",
314                    "password",
315                    "radio",
316                    "range",
317                    "reset",
318                    "search",
319                    "submit",
320                    "tel",
321                    "text",
322                    "time",
323                    "url",
324                    "week",
325                ];
326                if !valid_types.contains(&input_type) {
327                    self.errors.push(ValidationError {
328                        kind: ValidationErrorKind::InvalidAttributeValue,
329                        element: elem.tag_name.clone(),
330                        message: alloc::format!("Invalid input type '{input_type}'"),
331                    });
332                }
333            }
334        }
335
336        // Check target values for anchors
337        if elem.tag_name == "a" || elem.tag_name == "form" {
338            if let Some(target) = elem.get_attribute("target") {
339                let valid_targets = ["_self", "_blank", "_parent", "_top"];
340                if !target.starts_with('_') || valid_targets.contains(&target) {
341                    // Valid: either a frame name or a reserved keyword
342                } else if target.starts_with('_') && !valid_targets.contains(&target) {
343                    self.errors.push(ValidationError {
344                        kind: ValidationErrorKind::InvalidAttributeValue,
345                        element: elem.tag_name.clone(),
346                        message: alloc::format!("Invalid target '{target}'"),
347                    });
348                }
349            }
350        }
351    }
352}
353
354impl Default for Validator {
355    fn default() -> Self {
356        Self::new()
357    }
358}
359
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{parse, parse_fragment};

    /// Returns `true` when `errors` holds an error of `kind` whose message contains `needle`.
    fn has_error(errors: &[ValidationError], kind: ValidationErrorKind, needle: &str) -> bool {
        errors
            .iter()
            .any(|e| e.kind == kind && e.message.contains(needle))
    }

    #[test]
    fn test_missing_img_alt() {
        let doc = parse("<img src=\"test.jpg\">");
        let errors = Validator::new().validate(&doc);
        assert!(has_error(
            &errors,
            ValidationErrorKind::MissingRequiredAttribute,
            "'alt'"
        ));
    }

    #[test]
    fn test_missing_img_src() {
        let doc = parse("<img>");
        let errors = Validator::new().validate(&doc);
        assert!(has_error(
            &errors,
            ValidationErrorKind::MissingRequiredAttribute,
            "'src'"
        ));
    }

    #[test]
    fn test_valid_img() {
        let nodes = parse_fragment("<img src=\"test.jpg\" alt=\"Test image\">");
        let errors = Validator::new().validate_nodes(&nodes);
        assert!(errors.iter().all(|e| e.element != "img"));
    }

    #[test]
    fn test_deprecated_element() {
        let doc = parse("<center>Content</center>");
        let errors = Validator::new().validate(&doc);
        assert!(errors
            .iter()
            .any(|e| e.kind == ValidationErrorKind::DeprecatedElement && e.element == "center"));
    }

    #[test]
    fn test_duplicate_id() {
        let doc = parse(r#"<div id="same"></div><div id="same"></div>"#);
        let errors = Validator::new().validate(&doc);
        assert!(has_error(
            &errors,
            ValidationErrorKind::DuplicateId,
            "'same'"
        ));
    }

    #[test]
    fn test_empty_id() {
        let doc = parse(r#"<div id=""></div>"#);
        let errors = Validator::new().validate(&doc);
        // Match the full phrase: a bare "id" needle would also match any
        // message containing the word "Invalid".
        assert!(has_error(
            &errors,
            ValidationErrorKind::InvalidAttributeValue,
            "'id' attribute must not be empty"
        ));
    }

    #[test]
    fn test_id_with_whitespace() {
        let doc = parse(r#"<div id="a b"></div>"#);
        let errors = Validator::new().validate(&doc);
        assert!(has_error(
            &errors,
            ValidationErrorKind::InvalidAttributeValue,
            "'id' attribute must not contain whitespace"
        ));
    }

    #[test]
    fn test_invalid_input_type() {
        let doc = parse(r#"<input type="invalid">"#);
        let errors = Validator::new().validate(&doc);
        assert!(has_error(
            &errors,
            ValidationErrorKind::InvalidAttributeValue,
            "Invalid input type 'invalid'"
        ));
    }

    #[test]
    fn test_invalid_target() {
        let doc = parse(r#"<a target="_bogus">link</a>"#);
        let errors = Validator::new().validate(&doc);
        assert!(has_error(
            &errors,
            ValidationErrorKind::InvalidAttributeValue,
            "Invalid target '_bogus'"
        ));
    }
}