ironhtml_parser/
tokenizer.rs

1//! HTML5 tokenizer.
2//!
3//! This module implements the tokenization stage of HTML parsing as specified
4//! in the WHATWG HTML Living Standard.
5//!
6//! ## Reference
7//!
8//! - [Tokenization](https://html.spec.whatwg.org/multipage/parsing.html#tokenization)
9
10use alloc::string::String;
11use alloc::vec::Vec;
12
/// One unit of output from the [`Tokenizer`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A `<!DOCTYPE ...>` declaration.
    Doctype {
        /// The DOCTYPE name, or `None` when the declaration has no name.
        name: Option<String>,
        /// The public identifier, if any.
        public_id: Option<String>,
        /// The system identifier, if any.
        system_id: Option<String>,
    },
    /// An opening tag such as `<div class="x">`.
    StartTag {
        /// The tag name.
        name: String,
        /// `(name, value)` pairs in source order.
        attributes: Vec<(String, String)>,
        /// `true` when the tag was written with a trailing `/>`.
        self_closing: bool,
    },
    /// A closing tag such as `</div>`.
    EndTag {
        /// The tag name.
        name: String,
    },
    /// A comment; the payload is the comment text.
    Comment(String),
    /// A single character of text content.
    Character(char),
    /// Marks the end of the input.
    Eof,
}
37
/// Where the tokenizer's state machine currently is.
#[derive(Debug, Clone, Copy, PartialEq)]
enum State {
    /// Outside any markup; characters are emitted as text until `<` is seen.
    Data,
    /// Just after `<`; decides between a tag, a markup declaration, or literal text.
    TagOpen,
    /// Just after `</`.
    EndTagOpen,
    /// Collecting the characters of a tag name.
    TagName,
    /// Just after a `/` inside a tag; a following `>` makes the tag self-closing.
    SelfClosingStartTag,
    /// Skipping whitespace ahead of an attribute name.
    BeforeAttributeName,
    /// Collecting the characters of an attribute name.
    AttributeName,
    /// After an attribute name, before `=`, `/`, `>`, or the next attribute.
    AfterAttributeName,
    /// After `=`, before the attribute value starts.
    BeforeAttributeValue,
    /// Inside a `"`-delimited attribute value.
    AttributeValueDoubleQuoted,
    /// Inside a `'`-delimited attribute value.
    AttributeValueSingleQuoted,
    /// Inside an attribute value written without quotes.
    AttributeValueUnquoted,
    /// Just after the closing quote of an attribute value.
    AfterAttributeValueQuoted,
    /// Just after `<!`; looks for `--` or `DOCTYPE`.
    MarkupDeclarationOpen,
    /// Just after `<!--`.
    CommentStart,
    /// Collecting comment text.
    Comment,
    /// Inside a comment, after a `-` that may begin the closing `-->`.
    CommentEnd,
    /// Just after the `DOCTYPE` keyword.
    Doctype,
    /// Skipping whitespace ahead of the DOCTYPE name.
    BeforeDoctypeName,
    /// Collecting the DOCTYPE name.
    DoctypeName,
    /// After the DOCTYPE name; the rest of the declaration up to `>` is skipped.
    AfterDoctypeName,
    /// Malformed markup that is collected into a comment up to the next `>`.
    BogusComment,
}
64
65/// HTML5 tokenizer.
66pub struct Tokenizer<'a> {
67    input: &'a str,
68    chars: core::iter::Peekable<core::str::CharIndices<'a>>,
69    state: State,
70    current_tag_name: String,
71    current_tag_is_end: bool,
72    current_tag_self_closing: bool,
73    current_attr_name: String,
74    current_attr_value: String,
75    current_attrs: Vec<(String, String)>,
76    current_comment: String,
77    current_doctype_name: Option<String>,
78    pending_tokens: Vec<Token>,
79}
80
81impl<'a> Tokenizer<'a> {
82    /// Create a new tokenizer for the given input.
83    #[must_use]
84    pub fn new(input: &'a str) -> Self {
85        Self {
86            input,
87            chars: input.char_indices().peekable(),
88            state: State::Data,
89            current_tag_name: String::new(),
90            current_tag_is_end: false,
91            current_tag_self_closing: false,
92            current_attr_name: String::new(),
93            current_attr_value: String::new(),
94            current_attrs: Vec::new(),
95            current_comment: String::new(),
96            current_doctype_name: None,
97            pending_tokens: Vec::new(),
98        }
99    }
100
101    fn consume(&mut self) -> Option<char> {
102        self.chars.next().map(|(_, c)| c)
103    }
104
105    fn peek(&mut self) -> Option<char> {
106        self.chars.peek().map(|(_, c)| *c)
107    }
108
109    fn emit_current_tag(&mut self) -> Token {
110        let name = core::mem::take(&mut self.current_tag_name).to_ascii_lowercase();
111        let attrs = core::mem::take(&mut self.current_attrs);
112        let self_closing = self.current_tag_self_closing;
113        let is_end = self.current_tag_is_end;
114        self.current_tag_self_closing = false;
115        self.current_tag_is_end = false;
116
117        if is_end {
118            Token::EndTag { name }
119        } else {
120            Token::StartTag {
121                name,
122                attributes: attrs,
123                self_closing,
124            }
125        }
126    }
127
128    fn emit_current_attr(&mut self) {
129        if !self.current_attr_name.is_empty() {
130            let name = core::mem::take(&mut self.current_attr_name).to_ascii_lowercase();
131            let value = core::mem::take(&mut self.current_attr_value);
132            self.current_attrs.push((name, value));
133        }
134    }
135
136    #[allow(clippy::too_many_lines, clippy::match_same_arms)]
137    fn next_token(&mut self) -> Option<Token> {
138        // Return pending tokens first
139        if !self.pending_tokens.is_empty() {
140            return Some(self.pending_tokens.remove(0));
141        }
142
143        loop {
144            match self.state {
145                State::Data => match self.consume() {
146                    Some('<') => self.state = State::TagOpen,
147                    Some(c) => return Some(Token::Character(c)),
148                    None => return Some(Token::Eof),
149                },
150
151                State::TagOpen => match self.peek() {
152                    Some('!') => {
153                        self.consume();
154                        self.state = State::MarkupDeclarationOpen;
155                    }
156                    Some('/') => {
157                        self.consume();
158                        self.state = State::EndTagOpen;
159                    }
160                    Some(c) if c.is_ascii_alphabetic() => {
161                        self.state = State::TagName;
162                    }
163                    _ => {
164                        self.state = State::Data;
165                        return Some(Token::Character('<'));
166                    }
167                },
168
169                State::EndTagOpen => match self.peek() {
170                    Some(c) if c.is_ascii_alphabetic() => {
171                        self.current_tag_is_end = true;
172                        self.state = State::TagName;
173                    }
174                    Some('>') => {
175                        self.consume();
176                        self.state = State::Data;
177                    }
178                    _ => {
179                        self.state = State::BogusComment;
180                    }
181                },
182
183                State::TagName => match self.consume() {
184                    Some('\t' | '\n' | '\x0C' | ' ') => {
185                        self.state = State::BeforeAttributeName;
186                    }
187                    Some('/') => {
188                        self.state = State::SelfClosingStartTag;
189                    }
190                    Some('>') => {
191                        self.state = State::Data;
192                        return Some(self.emit_current_tag());
193                    }
194                    Some(c) => {
195                        self.current_tag_name.push(c);
196                    }
197                    None => {
198                        self.state = State::Data;
199                        return Some(Token::Eof);
200                    }
201                },
202
203                State::SelfClosingStartTag => match self.consume() {
204                    Some('>') => {
205                        self.current_tag_self_closing = true;
206                        self.state = State::Data;
207                        return Some(self.emit_current_tag());
208                    }
209                    _ => {
210                        self.state = State::BeforeAttributeName;
211                    }
212                },
213
214                State::BeforeAttributeName => match self.peek() {
215                    Some('\t' | '\n' | '\x0C' | ' ') => {
216                        self.consume();
217                    }
218                    Some('/' | '>') | None => {
219                        self.state = State::AfterAttributeName;
220                    }
221                    Some('=') => {
222                        self.consume();
223                        self.current_attr_name.push('=');
224                        self.state = State::AttributeName;
225                    }
226                    _ => {
227                        self.state = State::AttributeName;
228                    }
229                },
230
231                State::AttributeName => match self.peek() {
232                    Some('\t' | '\n' | '\x0C' | ' ' | '/' | '>') => {
233                        self.state = State::AfterAttributeName;
234                    }
235                    Some('=') => {
236                        self.consume();
237                        self.state = State::BeforeAttributeValue;
238                    }
239                    Some(c) => {
240                        self.consume();
241                        self.current_attr_name.push(c);
242                    }
243                    None => {
244                        self.state = State::AfterAttributeName;
245                    }
246                },
247
248                State::AfterAttributeName => match self.peek() {
249                    Some('\t' | '\n' | '\x0C' | ' ') => {
250                        self.consume();
251                    }
252                    Some('/') => {
253                        self.emit_current_attr();
254                        self.consume();
255                        self.state = State::SelfClosingStartTag;
256                    }
257                    Some('=') => {
258                        self.consume();
259                        self.state = State::BeforeAttributeValue;
260                    }
261                    Some('>') => {
262                        self.emit_current_attr();
263                        self.consume();
264                        self.state = State::Data;
265                        return Some(self.emit_current_tag());
266                    }
267                    _ => {
268                        self.emit_current_attr();
269                        self.state = State::AttributeName;
270                    }
271                },
272
273                State::BeforeAttributeValue => match self.peek() {
274                    Some('\t' | '\n' | '\x0C' | ' ') => {
275                        self.consume();
276                    }
277                    Some('"') => {
278                        self.consume();
279                        self.state = State::AttributeValueDoubleQuoted;
280                    }
281                    Some('\'') => {
282                        self.consume();
283                        self.state = State::AttributeValueSingleQuoted;
284                    }
285                    Some('>') => {
286                        self.emit_current_attr();
287                        self.consume();
288                        self.state = State::Data;
289                        return Some(self.emit_current_tag());
290                    }
291                    _ => {
292                        self.state = State::AttributeValueUnquoted;
293                    }
294                },
295
296                State::AttributeValueDoubleQuoted => match self.consume() {
297                    Some('"') => {
298                        self.emit_current_attr();
299                        self.state = State::AfterAttributeValueQuoted;
300                    }
301                    Some(c) => {
302                        self.current_attr_value.push(c);
303                    }
304                    None => {
305                        self.emit_current_attr();
306                        self.state = State::Data;
307                        return Some(Token::Eof);
308                    }
309                },
310
311                State::AttributeValueSingleQuoted => match self.consume() {
312                    Some('\'') => {
313                        self.emit_current_attr();
314                        self.state = State::AfterAttributeValueQuoted;
315                    }
316                    Some(c) => {
317                        self.current_attr_value.push(c);
318                    }
319                    None => {
320                        self.emit_current_attr();
321                        self.state = State::Data;
322                        return Some(Token::Eof);
323                    }
324                },
325
326                State::AttributeValueUnquoted => match self.peek() {
327                    Some('\t' | '\n' | '\x0C' | ' ') => {
328                        self.emit_current_attr();
329                        self.consume();
330                        self.state = State::BeforeAttributeName;
331                    }
332                    Some('>') => {
333                        self.emit_current_attr();
334                        self.consume();
335                        self.state = State::Data;
336                        return Some(self.emit_current_tag());
337                    }
338                    Some(c) => {
339                        self.consume();
340                        self.current_attr_value.push(c);
341                    }
342                    None => {
343                        self.emit_current_attr();
344                        self.state = State::Data;
345                        return Some(Token::Eof);
346                    }
347                },
348
349                State::AfterAttributeValueQuoted => match self.peek() {
350                    Some('\t' | '\n' | '\x0C' | ' ') => {
351                        self.consume();
352                        self.state = State::BeforeAttributeName;
353                    }
354                    Some('/') => {
355                        self.consume();
356                        self.state = State::SelfClosingStartTag;
357                    }
358                    Some('>') => {
359                        self.consume();
360                        self.state = State::Data;
361                        return Some(self.emit_current_tag());
362                    }
363                    _ => {
364                        self.state = State::BeforeAttributeName;
365                    }
366                },
367
368                State::MarkupDeclarationOpen => {
369                    // Check for DOCTYPE or comment
370                    let remaining =
371                        &self.input[self.chars.peek().map_or(self.input.len(), |(i, _)| *i)..];
372
373                    if remaining.starts_with("--") {
374                        self.consume(); // -
375                        self.consume(); // -
376                        self.state = State::CommentStart;
377                    } else if remaining.to_ascii_uppercase().starts_with("DOCTYPE") {
378                        for _ in 0..7 {
379                            self.consume();
380                        }
381                        self.state = State::Doctype;
382                    } else {
383                        self.state = State::BogusComment;
384                    }
385                }
386
387                State::CommentStart => match self.peek() {
388                    Some('-') => {
389                        self.consume();
390                        self.state = State::CommentEnd;
391                    }
392                    Some('>') => {
393                        self.consume();
394                        self.state = State::Data;
395                        return Some(Token::Comment(core::mem::take(&mut self.current_comment)));
396                    }
397                    _ => {
398                        self.state = State::Comment;
399                    }
400                },
401
402                State::Comment => match self.consume() {
403                    Some('-') => {
404                        self.state = State::CommentEnd;
405                    }
406                    Some(c) => {
407                        self.current_comment.push(c);
408                    }
409                    None => {
410                        self.state = State::Data;
411                        return Some(Token::Comment(core::mem::take(&mut self.current_comment)));
412                    }
413                },
414
415                State::CommentEnd => match self.consume() {
416                    Some('-') => {
417                        if self.peek() == Some('>') {
418                            self.consume();
419                            self.state = State::Data;
420                            return Some(Token::Comment(core::mem::take(
421                                &mut self.current_comment,
422                            )));
423                        }
424                        self.current_comment.push('-');
425                    }
426                    Some(c) => {
427                        self.current_comment.push('-');
428                        self.current_comment.push(c);
429                        self.state = State::Comment;
430                    }
431                    None => {
432                        self.state = State::Data;
433                        return Some(Token::Comment(core::mem::take(&mut self.current_comment)));
434                    }
435                },
436
437                State::Doctype => match self.peek() {
438                    Some('\t' | '\n' | '\x0C' | ' ') => {
439                        self.consume();
440                        self.state = State::BeforeDoctypeName;
441                    }
442                    Some('>') => {
443                        self.state = State::BeforeDoctypeName;
444                    }
445                    _ => {
446                        self.state = State::BeforeDoctypeName;
447                    }
448                },
449
450                State::BeforeDoctypeName => match self.peek() {
451                    Some('\t' | '\n' | '\x0C' | ' ') => {
452                        self.consume();
453                    }
454                    Some('>') => {
455                        self.consume();
456                        self.state = State::Data;
457                        return Some(Token::Doctype {
458                            name: self.current_doctype_name.take(),
459                            public_id: None,
460                            system_id: None,
461                        });
462                    }
463                    Some(_) => {
464                        self.current_doctype_name = Some(String::new());
465                        self.state = State::DoctypeName;
466                    }
467                    None => {
468                        self.state = State::Data;
469                        return Some(Token::Doctype {
470                            name: None,
471                            public_id: None,
472                            system_id: None,
473                        });
474                    }
475                },
476
477                State::DoctypeName => match self.consume() {
478                    Some('\t' | '\n' | '\x0C' | ' ') => {
479                        self.state = State::AfterDoctypeName;
480                    }
481                    Some('>') => {
482                        self.state = State::Data;
483                        return Some(Token::Doctype {
484                            name: self
485                                .current_doctype_name
486                                .take()
487                                .map(|s| s.to_ascii_lowercase()),
488                            public_id: None,
489                            system_id: None,
490                        });
491                    }
492                    Some(c) => {
493                        if let Some(ref mut name) = self.current_doctype_name {
494                            name.push(c);
495                        }
496                    }
497                    None => {
498                        self.state = State::Data;
499                        return Some(Token::Doctype {
500                            name: self.current_doctype_name.take(),
501                            public_id: None,
502                            system_id: None,
503                        });
504                    }
505                },
506
507                State::AfterDoctypeName => {
508                    match self.peek() {
509                        Some('\t' | '\n' | '\x0C' | ' ') => {
510                            self.consume();
511                        }
512                        Some('>') => {
513                            self.consume();
514                            self.state = State::Data;
515                            return Some(Token::Doctype {
516                                name: self
517                                    .current_doctype_name
518                                    .take()
519                                    .map(|s| s.to_ascii_lowercase()),
520                                public_id: None,
521                                system_id: None,
522                            });
523                        }
524                        None => {
525                            self.state = State::Data;
526                            return Some(Token::Doctype {
527                                name: self.current_doctype_name.take(),
528                                public_id: None,
529                                system_id: None,
530                            });
531                        }
532                        _ => {
533                            // Skip PUBLIC/SYSTEM identifiers for now
534                            while let Some(c) = self.peek() {
535                                if c == '>' {
536                                    break;
537                                }
538                                self.consume();
539                            }
540                            self.consume(); // consume >
541                            self.state = State::Data;
542                            return Some(Token::Doctype {
543                                name: self
544                                    .current_doctype_name
545                                    .take()
546                                    .map(|s| s.to_ascii_lowercase()),
547                                public_id: None,
548                                system_id: None,
549                            });
550                        }
551                    }
552                }
553
554                State::BogusComment => match self.consume() {
555                    Some('>') => {
556                        self.state = State::Data;
557                        return Some(Token::Comment(core::mem::take(&mut self.current_comment)));
558                    }
559                    Some(c) => {
560                        self.current_comment.push(c);
561                    }
562                    None => {
563                        self.state = State::Data;
564                        return Some(Token::Comment(core::mem::take(&mut self.current_comment)));
565                    }
566                },
567            }
568        }
569    }
570}
571
572impl Iterator for Tokenizer<'_> {
573    type Item = Token;
574
575    fn next(&mut self) -> Option<Self::Item> {
576        match self.next_token() {
577            Some(Token::Eof) => None,
578            token => token,
579        }
580    }
581}
582
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenizes `input` and returns at most its first `count` tokens.
    fn first_tokens(input: &str, count: usize) -> Vec<Token> {
        Tokenizer::new(input).take(count).collect()
    }

    /// Shorthand for an expected start tag.
    fn start_tag(name: &str, attributes: &[(&str, &str)], self_closing: bool) -> Token {
        Token::StartTag {
            name: name.into(),
            attributes: attributes
                .iter()
                .map(|&(key, value)| (key.into(), value.into()))
                .collect(),
            self_closing,
        }
    }

    /// Shorthand for an expected end tag.
    fn end_tag(name: &str) -> Token {
        Token::EndTag { name: name.into() }
    }

    #[test]
    fn test_simple_element() {
        let expected = vec![start_tag("div", &[], false), end_tag("div")];
        assert_eq!(first_tokens("<div></div>", expected.len()), expected);
    }

    #[test]
    fn test_element_with_text() {
        let mut expected = vec![start_tag("p", &[], false)];
        expected.extend("Hello".chars().map(Token::Character));
        expected.push(end_tag("p"));

        assert_eq!(first_tokens("<p>Hello</p>", expected.len()), expected);
    }

    #[test]
    fn test_attributes() {
        let expected = start_tag("div", &[("class", "container"), ("id", "main")], false);
        assert_eq!(
            first_tokens(r#"<div class="container" id="main">"#, 1),
            vec![expected]
        );
    }

    #[test]
    fn test_self_closing() {
        assert_eq!(first_tokens("<br/>", 1), vec![start_tag("br", &[], true)]);
    }

    #[test]
    fn test_doctype() {
        let expected = Token::Doctype {
            name: Some("html".into()),
            public_id: None,
            system_id: None,
        };
        assert_eq!(first_tokens("<!DOCTYPE html>", 1), vec![expected]);
    }

    #[test]
    fn test_comment() {
        let expected = Token::Comment(" This is a comment ".into());
        assert_eq!(first_tokens("<!-- This is a comment -->", 1), vec![expected]);
    }

    #[test]
    fn test_boolean_attribute() {
        // A valueless attribute is reported with an empty value.
        let expected = start_tag("input", &[("disabled", "")], false);
        assert_eq!(first_tokens("<input disabled>", 1), vec![expected]);
    }

    #[test]
    fn test_unquoted_attribute() {
        let expected = start_tag("div", &[("class", "container")], false);
        assert_eq!(first_tokens("<div class=container>", 1), vec![expected]);
    }
}