use alloc::string::String;
use alloc::vec::Vec;

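/// A token produced by [`Tokenizer`].
///
/// Character data is emitted one [`Token::Character`] at a time; `Eof` marks
/// the end of input and is mapped to `None` by the `Iterator` implementation.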
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    Doctype {
        name: Option<String>,
        public_id: Option<String>,
        system_id: Option<String>,
    },
    StartTag {
        name: String,
        attributes: Vec<(String, String)>,
        self_closing: bool,
    },
    EndTag { name: String },
    Comment(String),
    Character(char),
    Eof,
}

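/// Internal tokenizer states, a simplified subset of the tokenization states
/// described in the WHATWG HTML specification.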
#[derive(Debug, Clone, Copy, PartialEq)]
enum State {
    Data,
    TagOpen,
    EndTagOpen,
    TagName,
    SelfClosingStartTag,
    BeforeAttributeName,
    AttributeName,
    AfterAttributeName,
    BeforeAttributeValue,
    AttributeValueDoubleQuoted,
    AttributeValueSingleQuoted,
    AttributeValueUnquoted,
    AfterAttributeValueQuoted,
    MarkupDeclarationOpen,
    CommentStart,
    Comment,
    CommentEnd,
    Doctype,
    BeforeDoctypeName,
    DoctypeName,
    AfterDoctypeName,
    BogusComment,
}

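/// A streaming HTML tokenizer over a borrowed input string.
///
/// `Tokenizer` implements `Iterator<Item = Token>`, so tokens can be pulled
/// one at a time or collected. A usage sketch (the crate path
/// `html_tokenizer` is assumed here for illustration):
///
/// ```ignore
/// use html_tokenizer::{Token, Tokenizer};
///
/// let tokens: Vec<Token> = Tokenizer::new("<p>hi</p>").collect();
/// assert!(matches!(tokens.first(), Some(Token::StartTag { .. })));
/// ```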
pub struct Tokenizer<'a> {
    input: &'a str,
    chars: core::iter::Peekable<core::str::CharIndices<'a>>,
    state: State,
    current_tag_name: String,
    current_tag_is_end: bool,
    current_tag_self_closing: bool,
    current_attr_name: String,
    current_attr_value: String,
    current_attrs: Vec<(String, String)>,
    current_comment: String,
    current_doctype_name: Option<String>,
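    /// Tokens queued for emission; drained by `next_token` before the state
    /// machine advances.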
    pending_tokens: Vec<Token>,
}

impl<'a> Tokenizer<'a> {
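    /// Creates a tokenizer over `input`, starting in the data state.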
    #[must_use]
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            chars: input.char_indices().peekable(),
            state: State::Data,
            current_tag_name: String::new(),
            current_tag_is_end: false,
            current_tag_self_closing: false,
            current_attr_name: String::new(),
            current_attr_value: String::new(),
            current_attrs: Vec::new(),
            current_comment: String::new(),
            current_doctype_name: None,
            pending_tokens: Vec::new(),
        }
    }

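    /// Consumes and returns the next input character, if any.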
    fn consume(&mut self) -> Option<char> {
        self.chars.next().map(|(_, c)| c)
    }

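    /// Returns the next input character without consuming it.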
    fn peek(&mut self) -> Option<char> {
        self.chars.peek().map(|(_, c)| *c)
    }

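    /// Builds a start or end tag token from the accumulated tag state,
    /// lowercasing the tag name and resetting the per-tag flags.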
    fn emit_current_tag(&mut self) -> Token {
        let name = core::mem::take(&mut self.current_tag_name).to_ascii_lowercase();
        let attrs = core::mem::take(&mut self.current_attrs);
        let self_closing = self.current_tag_self_closing;
        let is_end = self.current_tag_is_end;
        self.current_tag_self_closing = false;
        self.current_tag_is_end = false;

        if is_end {
            Token::EndTag { name }
        } else {
            Token::StartTag {
                name,
                attributes: attrs,
                self_closing,
            }
        }
    }

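    /// Pushes the accumulated attribute name/value pair onto the current tag,
    /// lowercasing the name; pairs with an empty name are discarded.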
    fn emit_current_attr(&mut self) {
        if !self.current_attr_name.is_empty() {
            let name = core::mem::take(&mut self.current_attr_name).to_ascii_lowercase();
            let value = core::mem::take(&mut self.current_attr_value);
            self.current_attrs.push((name, value));
        }
    }

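    /// Runs the state machine until a token can be emitted; end of input
    /// yields `Token::Eof`.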
    #[allow(clippy::too_many_lines, clippy::match_same_arms)]
    fn next_token(&mut self) -> Option<Token> {
        if !self.pending_tokens.is_empty() {
            return Some(self.pending_tokens.remove(0));
        }

        loop {
            match self.state {
                State::Data => match self.consume() {
                    Some('<') => self.state = State::TagOpen,
                    Some(c) => return Some(Token::Character(c)),
                    None => return Some(Token::Eof),
                },

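                // A "<" was consumed in the data state; decide between a markup
                // declaration, an end tag, a start tag, or a literal "<".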
                State::TagOpen => match self.peek() {
                    Some('!') => {
                        self.consume();
                        self.state = State::MarkupDeclarationOpen;
                    }
                    Some('/') => {
                        self.consume();
                        self.state = State::EndTagOpen;
                    }
                    Some(c) if c.is_ascii_alphabetic() => {
                        self.state = State::TagName;
                    }
                    _ => {
                        self.state = State::Data;
                        return Some(Token::Character('<'));
                    }
                },

                State::EndTagOpen => match self.peek() {
                    Some(c) if c.is_ascii_alphabetic() => {
                        self.current_tag_is_end = true;
                        self.state = State::TagName;
                    }
                    Some('>') => {
                        self.consume();
                        self.state = State::Data;
                    }
                    _ => {
                        self.state = State::BogusComment;
                    }
                },

                State::TagName => match self.consume() {
                    Some('\t' | '\n' | '\x0C' | ' ') => {
                        self.state = State::BeforeAttributeName;
                    }
                    Some('/') => {
                        self.state = State::SelfClosingStartTag;
                    }
                    Some('>') => {
                        self.state = State::Data;
                        return Some(self.emit_current_tag());
                    }
                    Some(c) => {
                        self.current_tag_name.push(c);
                    }
                    None => {
                        self.state = State::Data;
                        return Some(Token::Eof);
                    }
                },

                State::SelfClosingStartTag => match self.consume() {
                    Some('>') => {
                        self.current_tag_self_closing = true;
                        self.state = State::Data;
                        return Some(self.emit_current_tag());
                    }
                    _ => {
                        self.state = State::BeforeAttributeName;
                    }
                },

                State::BeforeAttributeName => match self.peek() {
                    Some('\t' | '\n' | '\x0C' | ' ') => {
                        self.consume();
                    }
                    Some('/' | '>') | None => {
                        self.state = State::AfterAttributeName;
                    }
                    Some('=') => {
                        self.consume();
                        self.current_attr_name.push('=');
                        self.state = State::AttributeName;
                    }
                    _ => {
                        self.state = State::AttributeName;
                    }
                },

                State::AttributeName => match self.peek() {
                    Some('\t' | '\n' | '\x0C' | ' ' | '/' | '>') => {
                        self.state = State::AfterAttributeName;
                    }
                    Some('=') => {
                        self.consume();
                        self.state = State::BeforeAttributeValue;
                    }
                    Some(c) => {
                        self.consume();
                        self.current_attr_name.push(c);
                    }
                    None => {
                        self.state = State::AfterAttributeName;
                    }
                },

                State::AfterAttributeName => match self.peek() {
                    Some('\t' | '\n' | '\x0C' | ' ') => {
                        self.consume();
                    }
                    Some('/') => {
                        self.emit_current_attr();
                        self.consume();
                        self.state = State::SelfClosingStartTag;
                    }
                    Some('=') => {
                        self.consume();
                        self.state = State::BeforeAttributeValue;
                    }
                    Some('>') => {
                        self.emit_current_attr();
                        self.consume();
                        self.state = State::Data;
                        return Some(self.emit_current_tag());
                    }
                    _ => {
                        self.emit_current_attr();
                        self.state = State::AttributeName;
                    }
                },

                State::BeforeAttributeValue => match self.peek() {
                    Some('\t' | '\n' | '\x0C' | ' ') => {
                        self.consume();
                    }
                    Some('"') => {
                        self.consume();
                        self.state = State::AttributeValueDoubleQuoted;
                    }
                    Some('\'') => {
                        self.consume();
                        self.state = State::AttributeValueSingleQuoted;
                    }
                    Some('>') => {
                        self.emit_current_attr();
                        self.consume();
                        self.state = State::Data;
                        return Some(self.emit_current_tag());
                    }
                    _ => {
                        self.state = State::AttributeValueUnquoted;
                    }
                },

                State::AttributeValueDoubleQuoted => match self.consume() {
                    Some('"') => {
                        self.emit_current_attr();
                        self.state = State::AfterAttributeValueQuoted;
                    }
                    Some(c) => {
                        self.current_attr_value.push(c);
                    }
                    None => {
                        self.emit_current_attr();
                        self.state = State::Data;
                        return Some(Token::Eof);
                    }
                },

                State::AttributeValueSingleQuoted => match self.consume() {
                    Some('\'') => {
                        self.emit_current_attr();
                        self.state = State::AfterAttributeValueQuoted;
                    }
                    Some(c) => {
                        self.current_attr_value.push(c);
                    }
                    None => {
                        self.emit_current_attr();
                        self.state = State::Data;
                        return Some(Token::Eof);
                    }
                },

                State::AttributeValueUnquoted => match self.peek() {
                    Some('\t' | '\n' | '\x0C' | ' ') => {
                        self.emit_current_attr();
                        self.consume();
                        self.state = State::BeforeAttributeName;
                    }
                    Some('>') => {
                        self.emit_current_attr();
                        self.consume();
                        self.state = State::Data;
                        return Some(self.emit_current_tag());
                    }
                    Some(c) => {
                        self.consume();
                        self.current_attr_value.push(c);
                    }
                    None => {
                        self.emit_current_attr();
                        self.state = State::Data;
                        return Some(Token::Eof);
                    }
                },

                State::AfterAttributeValueQuoted => match self.peek() {
                    Some('\t' | '\n' | '\x0C' | ' ') => {
                        self.consume();
                        self.state = State::BeforeAttributeName;
                    }
                    Some('/') => {
                        self.consume();
                        self.state = State::SelfClosingStartTag;
                    }
                    Some('>') => {
                        self.consume();
                        self.state = State::Data;
                        return Some(self.emit_current_tag());
                    }
                    _ => {
                        self.state = State::BeforeAttributeName;
                    }
                },

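                // After "<!": "--" starts a comment, "DOCTYPE" (case-insensitive)
                // starts a doctype, and anything else becomes a bogus comment.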
                State::MarkupDeclarationOpen => {
                    let remaining =
                        &self.input[self.chars.peek().map_or(self.input.len(), |(i, _)| *i)..];

                    if remaining.starts_with("--") {
                        self.consume();
                        self.consume();
                        self.state = State::CommentStart;
                    } else if remaining.to_ascii_uppercase().starts_with("DOCTYPE") {
                        for _ in 0..7 {
                            self.consume();
                        }
                        self.state = State::Doctype;
                    } else {
                        self.state = State::BogusComment;
                    }
                }

                State::CommentStart => match self.peek() {
                    Some('-') => {
                        self.consume();
                        self.state = State::CommentEnd;
                    }
                    Some('>') => {
                        self.consume();
                        self.state = State::Data;
                        return Some(Token::Comment(core::mem::take(&mut self.current_comment)));
                    }
                    _ => {
                        self.state = State::Comment;
                    }
                },

                State::Comment => match self.consume() {
                    Some('-') => {
                        self.state = State::CommentEnd;
                    }
                    Some(c) => {
                        self.current_comment.push(c);
                    }
                    None => {
                        self.state = State::Data;
                        return Some(Token::Comment(core::mem::take(&mut self.current_comment)));
                    }
                },

                State::CommentEnd => match self.consume() {
                    Some('-') => {
                        if self.peek() == Some('>') {
                            self.consume();
                            self.state = State::Data;
                            return Some(Token::Comment(core::mem::take(
                                &mut self.current_comment,
                            )));
                        }
                        self.current_comment.push('-');
                    }
                    Some(c) => {
                        self.current_comment.push('-');
                        self.current_comment.push(c);
                        self.state = State::Comment;
                    }
                    None => {
                        self.state = State::Data;
                        return Some(Token::Comment(core::mem::take(&mut self.current_comment)));
                    }
                },

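                // "<!DOCTYPE" has been consumed; scan the optional doctype name.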
                State::Doctype => match self.peek() {
                    Some('\t' | '\n' | '\x0C' | ' ') => {
                        self.consume();
                        self.state = State::BeforeDoctypeName;
                    }
                    Some('>') => {
                        self.state = State::BeforeDoctypeName;
                    }
                    _ => {
                        self.state = State::BeforeDoctypeName;
                    }
                },

                State::BeforeDoctypeName => match self.peek() {
                    Some('\t' | '\n' | '\x0C' | ' ') => {
                        self.consume();
                    }
                    Some('>') => {
                        self.consume();
                        self.state = State::Data;
                        return Some(Token::Doctype {
                            name: self.current_doctype_name.take(),
                            public_id: None,
                            system_id: None,
                        });
                    }
                    Some(_) => {
                        self.current_doctype_name = Some(String::new());
                        self.state = State::DoctypeName;
                    }
                    None => {
                        self.state = State::Data;
                        return Some(Token::Doctype {
                            name: None,
                            public_id: None,
                            system_id: None,
                        });
                    }
                },

                State::DoctypeName => match self.consume() {
                    Some('\t' | '\n' | '\x0C' | ' ') => {
                        self.state = State::AfterDoctypeName;
                    }
                    Some('>') => {
                        self.state = State::Data;
                        return Some(Token::Doctype {
                            name: self
                                .current_doctype_name
                                .take()
                                .map(|s| s.to_ascii_lowercase()),
                            public_id: None,
                            system_id: None,
                        });
                    }
                    Some(c) => {
                        if let Some(ref mut name) = self.current_doctype_name {
                            name.push(c);
                        }
                    }
                    None => {
                        self.state = State::Data;
                        return Some(Token::Doctype {
                            name: self.current_doctype_name.take(),
                            public_id: None,
                            system_id: None,
                        });
                    }
                },

                State::AfterDoctypeName => {
                    match self.peek() {
                        Some('\t' | '\n' | '\x0C' | ' ') => {
                            self.consume();
                        }
                        Some('>') => {
                            self.consume();
                            self.state = State::Data;
                            return Some(Token::Doctype {
                                name: self
                                    .current_doctype_name
                                    .take()
                                    .map(|s| s.to_ascii_lowercase()),
                                public_id: None,
                                system_id: None,
                            });
                        }
                        None => {
                            self.state = State::Data;
                            return Some(Token::Doctype {
                                name: self.current_doctype_name.take(),
                                public_id: None,
                                system_id: None,
                            });
                        }
                        _ => {
                            while let Some(c) = self.peek() {
                                if c == '>' {
                                    break;
                                }
                                self.consume();
                            }
                            self.consume();
                            self.state = State::Data;
                            return Some(Token::Doctype {
                                name: self
                                    .current_doctype_name
                                    .take()
                                    .map(|s| s.to_ascii_lowercase()),
                                public_id: None,
                                system_id: None,
                            });
                        }
                    }
                }

                State::BogusComment => match self.consume() {
                    Some('>') => {
                        self.state = State::Data;
                        return Some(Token::Comment(core::mem::take(&mut self.current_comment)));
                    }
                    Some(c) => {
                        self.current_comment.push(c);
                    }
                    None => {
                        self.state = State::Data;
                        return Some(Token::Comment(core::mem::take(&mut self.current_comment)));
                    }
                },
            }
        }
    }
}

impl Iterator for Tokenizer<'_> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
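        // The explicit `Eof` token is mapped to `None`, ending iteration.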
        match self.next_token() {
            Some(Token::Eof) => None,
            token => token,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    #[test]
    fn test_simple_element() {
        let mut tokenizer = Tokenizer::new("<div></div>");
        assert_eq!(
            tokenizer.next(),
            Some(Token::StartTag {
                name: "div".into(),
                attributes: vec![],
                self_closing: false,
            })
        );
        assert_eq!(tokenizer.next(), Some(Token::EndTag { name: "div".into() }));
    }

    #[test]
    fn test_element_with_text() {
        let mut tokenizer = Tokenizer::new("<p>Hello</p>");
        assert_eq!(
            tokenizer.next(),
            Some(Token::StartTag {
                name: "p".into(),
                attributes: vec![],
                self_closing: false,
            })
        );
        assert_eq!(tokenizer.next(), Some(Token::Character('H')));
        assert_eq!(tokenizer.next(), Some(Token::Character('e')));
        assert_eq!(tokenizer.next(), Some(Token::Character('l')));
        assert_eq!(tokenizer.next(), Some(Token::Character('l')));
        assert_eq!(tokenizer.next(), Some(Token::Character('o')));
        assert_eq!(tokenizer.next(), Some(Token::EndTag { name: "p".into() }));
    }

    #[test]
    fn test_attributes() {
        let mut tokenizer = Tokenizer::new(r#"<div class="container" id="main">"#);
        assert_eq!(
            tokenizer.next(),
            Some(Token::StartTag {
                name: "div".into(),
                attributes: vec![
                    ("class".into(), "container".into()),
                    ("id".into(), "main".into()),
                ],
                self_closing: false,
            })
        );
    }

    #[test]
    fn test_self_closing() {
        let mut tokenizer = Tokenizer::new("<br/>");
        assert_eq!(
            tokenizer.next(),
            Some(Token::StartTag {
                name: "br".into(),
                attributes: vec![],
                self_closing: true,
            })
        );
    }

    #[test]
    fn test_doctype() {
        let mut tokenizer = Tokenizer::new("<!DOCTYPE html>");
        assert_eq!(
            tokenizer.next(),
            Some(Token::Doctype {
                name: Some("html".into()),
                public_id: None,
                system_id: None,
            })
        );
    }

    #[test]
    fn test_comment() {
        let mut tokenizer = Tokenizer::new("<!-- This is a comment -->");
        assert_eq!(
            tokenizer.next(),
            Some(Token::Comment(" This is a comment ".into()))
        );
    }

    #[test]
    fn test_boolean_attribute() {
        let mut tokenizer = Tokenizer::new("<input disabled>");
        assert_eq!(
            tokenizer.next(),
            Some(Token::StartTag {
                name: "input".into(),
                attributes: vec![("disabled".into(), String::new())],
                self_closing: false,
            })
        );
    }

    #[test]
    fn test_unquoted_attribute() {
        let mut tokenizer = Tokenizer::new("<div class=container>");
        assert_eq!(
            tokenizer.next(),
            Some(Token::StartTag {
                name: "div".into(),
                attributes: vec![("class".into(), "container".into())],
                self_closing: false,
            })
        );
    }
}