syntect/parsing/
yaml_load.rs

1use super::regex::{Regex, Region};
2use super::scope::*;
3use super::syntax_definition::*;
4use std::collections::HashMap;
5use std::error::Error;
6use std::ops::DerefMut;
7use std::path::Path;
8use yaml_rust::yaml::Hash;
9use yaml_rust::{ScanError, Yaml, YamlLoader};
10
/// Errors that can be returned while loading a `.sublime-syntax` YAML definition.
///
/// The enum is `#[non_exhaustive]`, so matches outside this crate need a wildcard arm.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum ParseSyntaxError {
    /// Invalid YAML file syntax, or at least something yaml_rust can't handle
    #[error("Invalid YAML file syntax: {0}")]
    InvalidYaml(#[from] ScanError),
    /// The file must contain at least one YAML document
    #[error("The file must contain at least one YAML document")]
    EmptyFile,
    /// Some keys are required for something to be a valid `.sublime-syntax`.
    /// The payload is the name of the missing key.
    #[error("Missing mandatory key in YAML file: {0}")]
    MissingMandatoryKey(&'static str),
    /// Invalid regex. Carries the regex text that failed to compile and the underlying error.
    #[error("Error while compiling regex '{0}': {1}")]
    RegexCompileError(String, #[source] Box<dyn Error + Send + Sync + 'static>),
    /// A scope that syntect's scope implementation can't handle
    #[error("Invalid scope: {0}")]
    InvalidScope(ParseScopeError),
    /// A reference to another file that is invalid
    #[error("Invalid file reference")]
    BadFileRef,
    /// Syntaxes must have a context named "main"
    #[error("Context 'main' is missing")]
    MainMissing,
    /// Some part of the YAML file is the wrong type (e.g. a string but should be a list).
    /// Sorry, this doesn't give you any way to narrow down where this is.
    /// Maybe use Sublime Text to figure it out.
    #[error("Type mismatch")]
    TypeMismatch,
}
41
42fn get_key<'a, R, F: FnOnce(&'a Yaml) -> Option<R>>(
43    map: &'a Hash,
44    key: &'static str,
45    f: F,
46) -> Result<R, ParseSyntaxError> {
47    map.get(&Yaml::String(key.to_owned()))
48        .ok_or(ParseSyntaxError::MissingMandatoryKey(key))
49        .and_then(|x| f(x).ok_or(ParseSyntaxError::TypeMismatch))
50}
51
52fn str_to_scopes(s: &str, repo: &mut ScopeRepository) -> Result<Vec<Scope>, ParseSyntaxError> {
53    s.split_whitespace()
54        .map(|scope| repo.build(scope).map_err(ParseSyntaxError::InvalidScope))
55        .collect()
56}
57
/// Mutable state threaded through the parsing of a single syntax definition.
struct ParserState<'a> {
    /// Repository used to build every `Scope` mentioned in the definition.
    scope_repo: &'a mut ScopeRepository,
    /// String entries of the top-level `variables` map, substituted into regexes.
    variables: HashMap<String, String>,
    /// Matches a `{{name}}` variable reference inside a regex.
    variable_regex: Regex,
    /// Matches a backslash followed by a digit, i.e. a backreference such as `\1`.
    backref_regex: Regex,
    /// Selects which newline rewrite `parse_regex` applies to each regex.
    lines_include_newline: bool,
}
65
// YAML source of the two synthetic contexts that `add_initial_contexts` parses and inserts
// alongside the contexts of the syntax itself: `__start` pushes `__main`, and `__main`
// includes the real `main` context.
//
// `__start` must not include prototypes from the actual syntax definition,
// otherwise it's possible that a prototype makes us pop out of `__start`.
static START_CONTEXT: &str = "
__start:
    - meta_include_prototype: false
    - match: ''
      push: __main
__main:
    - include: main
";
76
77impl SyntaxDefinition {
78    /// In case you want to create your own SyntaxDefinition's in memory from strings.
79    ///
80    /// Generally you should use a [`SyntaxSet`].
81    ///
82    /// `fallback_name` is an optional name to use when the YAML doesn't provide a `name` key.
83    ///
84    /// [`SyntaxSet`]: ../struct.SyntaxSet.html
85    pub fn load_from_str(
86        s: &str,
87        lines_include_newline: bool,
88        fallback_name: Option<&str>,
89    ) -> Result<SyntaxDefinition, ParseSyntaxError> {
90        let docs = match YamlLoader::load_from_str(s) {
91            Ok(x) => x,
92            Err(e) => return Err(ParseSyntaxError::InvalidYaml(e)),
93        };
94        if docs.is_empty() {
95            return Err(ParseSyntaxError::EmptyFile);
96        }
97        let doc = &docs[0];
98        let mut scope_repo = lock_global_scope_repo();
99        SyntaxDefinition::parse_top_level(
100            doc,
101            scope_repo.deref_mut(),
102            lines_include_newline,
103            fallback_name,
104        )
105    }
106
107    fn parse_top_level(
108        doc: &Yaml,
109        scope_repo: &mut ScopeRepository,
110        lines_include_newline: bool,
111        fallback_name: Option<&str>,
112    ) -> Result<SyntaxDefinition, ParseSyntaxError> {
113        let h = doc.as_hash().ok_or(ParseSyntaxError::TypeMismatch)?;
114
115        let mut variables = HashMap::new();
116        if let Ok(map) = get_key(h, "variables", |x| x.as_hash()) {
117            for (key, value) in map.iter() {
118                if let (Some(key_str), Some(val_str)) = (key.as_str(), value.as_str()) {
119                    variables.insert(key_str.to_owned(), val_str.to_owned());
120                }
121            }
122        }
123        let contexts_hash = get_key(h, "contexts", |x| x.as_hash())?;
124        let top_level_scope = scope_repo
125            .build(get_key(h, "scope", |x| x.as_str())?)
126            .map_err(ParseSyntaxError::InvalidScope)?;
127        let mut state = ParserState {
128            scope_repo,
129            variables,
130            variable_regex: Regex::new(r"\{\{([A-Za-z0-9_]+)\}\}".into()),
131            backref_regex: Regex::new(r"\\\d".into()),
132            lines_include_newline,
133        };
134
135        let mut contexts = SyntaxDefinition::parse_contexts(contexts_hash, &mut state)?;
136        if !contexts.contains_key("main") {
137            return Err(ParseSyntaxError::MainMissing);
138        }
139
140        SyntaxDefinition::add_initial_contexts(&mut contexts, &mut state, top_level_scope);
141
142        let mut file_extensions = Vec::new();
143        for extension_key in &["file_extensions", "hidden_file_extensions"] {
144            if let Ok(v) = get_key(h, extension_key, |x| x.as_vec()) {
145                file_extensions.extend(v.iter().filter_map(|y| y.as_str().map(|s| s.to_owned())))
146            }
147        }
148
149        let defn = SyntaxDefinition {
150            name: get_key(h, "name", |x| x.as_str())
151                .unwrap_or_else(|_| fallback_name.unwrap_or("Unnamed"))
152                .to_owned(),
153            scope: top_level_scope,
154            file_extensions,
155            // TODO maybe cache a compiled version of this Regex
156            first_line_match: get_key(h, "first_line_match", |x| x.as_str())
157                .ok()
158                .map(|s| s.to_owned()),
159            hidden: get_key(h, "hidden", |x| x.as_bool()).unwrap_or(false),
160
161            variables: state.variables,
162            contexts,
163        };
164        Ok(defn)
165    }
166
167    fn parse_contexts(
168        map: &Hash,
169        state: &mut ParserState<'_>,
170    ) -> Result<HashMap<String, Context>, ParseSyntaxError> {
171        let mut contexts = HashMap::new();
172        for (key, value) in map.iter() {
173            if let (Some(name), Some(val_vec)) = (key.as_str(), value.as_vec()) {
174                let is_prototype = name == "prototype";
175                let mut namer = ContextNamer::new(name);
176                SyntaxDefinition::parse_context(
177                    val_vec,
178                    state,
179                    &mut contexts,
180                    is_prototype,
181                    &mut namer,
182                )?;
183            }
184        }
185
186        Ok(contexts)
187    }
188
189    fn parse_context(
190        vec: &[Yaml],
191        // TODO: Maybe just pass the scope repo if that's all that's needed?
192        state: &mut ParserState<'_>,
193        contexts: &mut HashMap<String, Context>,
194        is_prototype: bool,
195        namer: &mut ContextNamer,
196    ) -> Result<String, ParseSyntaxError> {
197        let mut context = Context::new(!is_prototype);
198        let name = namer.next();
199
200        for y in vec.iter() {
201            let map = y.as_hash().ok_or(ParseSyntaxError::TypeMismatch)?;
202
203            let mut is_special = false;
204            if let Ok(x) = get_key(map, "meta_scope", |x| x.as_str()) {
205                context.meta_scope = str_to_scopes(x, state.scope_repo)?;
206                is_special = true;
207            }
208            if let Ok(x) = get_key(map, "meta_content_scope", |x| x.as_str()) {
209                context.meta_content_scope = str_to_scopes(x, state.scope_repo)?;
210                is_special = true;
211            }
212            if let Ok(x) = get_key(map, "meta_include_prototype", |x| x.as_bool()) {
213                context.meta_include_prototype = x;
214                is_special = true;
215            }
216            if let Ok(true) = get_key(map, "clear_scopes", |x| x.as_bool()) {
217                context.clear_scopes = Some(ClearAmount::All);
218                is_special = true;
219            }
220            if let Ok(x) = get_key(map, "clear_scopes", |x| x.as_i64()) {
221                context.clear_scopes = Some(ClearAmount::TopN(x as usize));
222                is_special = true;
223            }
224            if !is_special {
225                if let Ok(x) = get_key(map, "include", Some) {
226                    let reference =
227                        SyntaxDefinition::parse_reference(x, state, contexts, namer, false)?;
228                    context.patterns.push(Pattern::Include(reference));
229                } else {
230                    let pattern =
231                        SyntaxDefinition::parse_match_pattern(map, state, contexts, namer)?;
232                    if pattern.has_captures {
233                        context.uses_backrefs = true;
234                    }
235                    context.patterns.push(Pattern::Match(pattern));
236                }
237            }
238        }
239
240        contexts.insert(name.clone(), context);
241        Ok(name)
242    }
243
244    fn parse_reference(
245        y: &Yaml,
246        state: &mut ParserState<'_>,
247        contexts: &mut HashMap<String, Context>,
248        namer: &mut ContextNamer,
249        with_escape: bool,
250    ) -> Result<ContextReference, ParseSyntaxError> {
251        if let Some(s) = y.as_str() {
252            let parts: Vec<&str> = s.split('#').collect();
253            let sub_context = if parts.len() > 1 {
254                Some(parts[1].to_owned())
255            } else {
256                None
257            };
258            if parts[0].starts_with("scope:") {
259                Ok(ContextReference::ByScope {
260                    scope: state
261                        .scope_repo
262                        .build(&parts[0][6..])
263                        .map_err(ParseSyntaxError::InvalidScope)?,
264                    sub_context,
265                    with_escape,
266                })
267            } else if parts[0].ends_with(".sublime-syntax") {
268                let stem = Path::new(parts[0])
269                    .file_stem()
270                    .and_then(|x| x.to_str())
271                    .ok_or(ParseSyntaxError::BadFileRef)?;
272                Ok(ContextReference::File {
273                    name: stem.to_owned(),
274                    sub_context,
275                    with_escape,
276                })
277            } else {
278                Ok(ContextReference::Named(parts[0].to_owned()))
279            }
280        } else if let Some(v) = y.as_vec() {
281            let subname = SyntaxDefinition::parse_context(v, state, contexts, false, namer)?;
282            Ok(ContextReference::Inline(subname))
283        } else {
284            Err(ParseSyntaxError::TypeMismatch)
285        }
286    }
287
288    fn parse_match_pattern(
289        map: &Hash,
290        state: &mut ParserState<'_>,
291        contexts: &mut HashMap<String, Context>,
292        namer: &mut ContextNamer,
293    ) -> Result<MatchPattern, ParseSyntaxError> {
294        let raw_regex = get_key(map, "match", |x| x.as_str())?;
295        let regex_str = Self::parse_regex(raw_regex, state)?;
296        // println!("{:?}", regex_str);
297
298        let scope = get_key(map, "scope", |x| x.as_str())
299            .ok()
300            .map(|s| str_to_scopes(s, state.scope_repo))
301            .unwrap_or_else(|| Ok(vec![]))?;
302
303        let captures = if let Ok(map) = get_key(map, "captures", |x| x.as_hash()) {
304            Some(Self::parse_captures(map, &regex_str, state)?)
305        } else {
306            None
307        };
308
309        let mut has_captures = false;
310        let operation = if get_key(map, "pop", Some).is_ok() {
311            // Thanks @wbond for letting me know this is the correct way to check for captures
312            has_captures = state
313                .backref_regex
314                .search(&regex_str, 0, regex_str.len(), None);
315            MatchOperation::Pop
316        } else if let Ok(y) = get_key(map, "push", Some) {
317            MatchOperation::Push(SyntaxDefinition::parse_pushargs(y, state, contexts, namer)?)
318        } else if let Ok(y) = get_key(map, "set", Some) {
319            MatchOperation::Set(SyntaxDefinition::parse_pushargs(y, state, contexts, namer)?)
320        } else if let Ok(y) = get_key(map, "embed", Some) {
321            // Same as push so we translate it to what it would be
322            let mut embed_escape_context_yaml = vec![];
323            let mut commands = Hash::new();
324            commands.insert(
325                Yaml::String("meta_include_prototype".to_string()),
326                Yaml::Boolean(false),
327            );
328            embed_escape_context_yaml.push(Yaml::Hash(commands));
329            if let Ok(s) = get_key(map, "embed_scope", Some) {
330                commands = Hash::new();
331                commands.insert(Yaml::String("meta_content_scope".to_string()), s.clone());
332                embed_escape_context_yaml.push(Yaml::Hash(commands));
333            }
334            if let Ok(v) = get_key(map, "escape", Some) {
335                let mut match_map = Hash::new();
336                match_map.insert(Yaml::String("match".to_string()), v.clone());
337                match_map.insert(Yaml::String("pop".to_string()), Yaml::Boolean(true));
338                if let Ok(y) = get_key(map, "escape_captures", Some) {
339                    match_map.insert(Yaml::String("captures".to_string()), y.clone());
340                }
341                embed_escape_context_yaml.push(Yaml::Hash(match_map));
342                let escape_context = SyntaxDefinition::parse_context(
343                    &embed_escape_context_yaml,
344                    state,
345                    contexts,
346                    false,
347                    namer,
348                )?;
349                MatchOperation::Push(vec![
350                    ContextReference::Inline(escape_context),
351                    SyntaxDefinition::parse_reference(y, state, contexts, namer, true)?,
352                ])
353            } else {
354                return Err(ParseSyntaxError::MissingMandatoryKey("escape"));
355            }
356        } else {
357            MatchOperation::None
358        };
359
360        let with_prototype = if let Ok(v) = get_key(map, "with_prototype", |x| x.as_vec()) {
361            // should a with_prototype include the prototype? I don't think so.
362            let subname = Self::parse_context(v, state, contexts, true, namer)?;
363            Some(ContextReference::Inline(subname))
364        } else if let Ok(v) = get_key(map, "escape", Some) {
365            let subname = namer.next();
366
367            let mut context = Context::new(false);
368            let mut match_map = Hash::new();
369            match_map.insert(
370                Yaml::String("match".to_string()),
371                Yaml::String(format!("(?={})", v.as_str().unwrap())),
372            );
373            match_map.insert(Yaml::String("pop".to_string()), Yaml::Boolean(true));
374            let pattern =
375                SyntaxDefinition::parse_match_pattern(&match_map, state, contexts, namer)?;
376            if pattern.has_captures {
377                context.uses_backrefs = true;
378            }
379            context.patterns.push(Pattern::Match(pattern));
380
381            contexts.insert(subname.clone(), context);
382            Some(ContextReference::Inline(subname))
383        } else {
384            None
385        };
386
387        let pattern = MatchPattern::new(
388            has_captures,
389            regex_str,
390            scope,
391            captures,
392            operation,
393            with_prototype,
394        );
395
396        Ok(pattern)
397    }
398
399    fn parse_pushargs(
400        y: &Yaml,
401        state: &mut ParserState<'_>,
402        contexts: &mut HashMap<String, Context>,
403        namer: &mut ContextNamer,
404    ) -> Result<Vec<ContextReference>, ParseSyntaxError> {
405        // check for a push of multiple items
406        if y.as_vec().is_some_and(|v| {
407            !v.is_empty()
408                && (v[0].as_str().is_some()
409                    || (v[0].as_vec().is_some() && v[0].as_vec().unwrap()[0].as_hash().is_some()))
410        }) {
411            // this works because Result implements FromIterator to handle the errors
412            y.as_vec()
413                .unwrap()
414                .iter()
415                .map(|x| SyntaxDefinition::parse_reference(x, state, contexts, namer, false))
416                .collect()
417        } else {
418            let reference = SyntaxDefinition::parse_reference(y, state, contexts, namer, false)?;
419            Ok(vec![reference])
420        }
421    }
422
423    fn parse_regex(raw_regex: &str, state: &ParserState<'_>) -> Result<String, ParseSyntaxError> {
424        let regex = Self::resolve_variables(raw_regex, state);
425        let regex = replace_posix_char_classes(regex);
426        let regex = if state.lines_include_newline {
427            regex_for_newlines(regex)
428        } else {
429            // If the passed in strings don't include newlines (unlike Sublime) we can't match on
430            // them using the original regex. So this tries to rewrite the regex in a way that
431            // allows matching against lines without newlines (essentially replacing `\n` with `$`).
432            regex_for_no_newlines(regex)
433        };
434        Self::try_compile_regex(&regex)?;
435        Ok(regex)
436    }
437
438    fn resolve_variables(raw_regex: &str, state: &ParserState<'_>) -> String {
439        let mut result = String::new();
440        let mut index = 0;
441        let mut region = Region::new();
442        while state
443            .variable_regex
444            .search(raw_regex, index, raw_regex.len(), Some(&mut region))
445        {
446            let (begin, end) = region.pos(0).unwrap();
447
448            result.push_str(&raw_regex[index..begin]);
449
450            let var_pos = region.pos(1).unwrap();
451            let var_name = &raw_regex[var_pos.0..var_pos.1];
452            let var_raw = state
453                .variables
454                .get(var_name)
455                .map(String::as_ref)
456                .unwrap_or("");
457            let var_resolved = Self::resolve_variables(var_raw, state);
458            result.push_str(&var_resolved);
459
460            index = end;
461        }
462        if index < raw_regex.len() {
463            result.push_str(&raw_regex[index..]);
464        }
465        result
466    }
467
468    fn try_compile_regex(regex_str: &str) -> Result<(), ParseSyntaxError> {
469        // Replace backreferences with a placeholder value that will also appear in errors
470        let regex_str =
471            substitute_backrefs_in_regex(regex_str, |i| Some(format!("<placeholder_{}>", i)));
472
473        if let Some(error) = Regex::try_compile(&regex_str) {
474            Err(ParseSyntaxError::RegexCompileError(regex_str, error))
475        } else {
476            Ok(())
477        }
478    }
479
480    fn parse_captures(
481        map: &Hash,
482        regex_str: &str,
483        state: &mut ParserState<'_>,
484    ) -> Result<CaptureMapping, ParseSyntaxError> {
485        let valid_indexes = get_consuming_capture_indexes(regex_str);
486        let mut captures = Vec::new();
487        for (key, value) in map.iter() {
488            if let (Some(key_int), Some(val_str)) = (key.as_i64(), value.as_str()) {
489                if valid_indexes.contains(&(key_int as usize)) {
490                    captures.push((key_int as usize, str_to_scopes(val_str, state.scope_repo)?));
491                }
492            }
493        }
494        Ok(captures)
495    }
496
497    /// Sublime treats the top level context slightly differently from
498    /// including the main context from other syntaxes. When main is popped
499    /// it is immediately re-added and when it is `set` over the file level
500    /// scope remains. This behaviour is emulated through some added contexts
501    /// that are the actual top level contexts used in parsing.
502    /// See <https://github.com/trishume/syntect/issues/58> for more.
503    fn add_initial_contexts(
504        contexts: &mut HashMap<String, Context>,
505        state: &mut ParserState<'_>,
506        top_level_scope: Scope,
507    ) {
508        let yaml_docs = YamlLoader::load_from_str(START_CONTEXT).unwrap();
509        let yaml = &yaml_docs[0];
510
511        let start_yaml: &[Yaml] = yaml["__start"].as_vec().unwrap();
512        SyntaxDefinition::parse_context(
513            start_yaml,
514            state,
515            contexts,
516            false,
517            &mut ContextNamer::new("__start"),
518        )
519        .unwrap();
520        if let Some(start) = contexts.get_mut("__start") {
521            start.meta_content_scope = vec![top_level_scope];
522        }
523
524        let main_yaml: &[Yaml] = yaml["__main"].as_vec().unwrap();
525        SyntaxDefinition::parse_context(
526            main_yaml,
527            state,
528            contexts,
529            false,
530            &mut ContextNamer::new("__main"),
531        )
532        .unwrap();
533
534        let meta_include_prototype = contexts["main"].meta_include_prototype;
535        let meta_scope = contexts["main"].meta_scope.clone();
536        let meta_content_scope = contexts["main"].meta_content_scope.clone();
537
538        if let Some(outer_main) = contexts.get_mut("__main") {
539            outer_main.meta_include_prototype = meta_include_prototype;
540            outer_main.meta_scope = meta_scope;
541            outer_main.meta_content_scope = meta_content_scope;
542        }
543
544        // add the top_level_scope as a meta_content_scope to main so
545        // pushes from other syntaxes add the file scope
546        // TODO: this order is not quite correct if main also has a meta_scope
547        if let Some(main) = contexts.get_mut("main") {
548            main.meta_content_scope.insert(0, top_level_scope);
549        }
550    }
551}
552
/// Hands out names for a context and for the anonymous contexts nested inside it.
struct ContextNamer {
    /// Base name, returned as-is by the first call to `next`.
    name: String,
    /// `None` until the base name has been handed out, then the index of the next
    /// anonymous name.
    anonymous_index: Option<usize>,
}

impl ContextNamer {
    /// Creates a namer whose first generated name is `name` itself.
    fn new(name: &str) -> ContextNamer {
        ContextNamer {
            name: name.to_string(),
            anonymous_index: None,
        }
    }

    /// Returns the next name: the base name on the first call, then `#anon_<name>_0`,
    /// `#anon_<name>_1`, and so on.
    fn next(&mut self) -> String {
        match self.anonymous_index {
            None => {
                self.anonymous_index = Some(0);
                self.name.clone()
            }
            Some(index) => {
                self.anonymous_index = Some(index + 1);
                format!("#anon_{}_{}", self.name, index)
            }
        }
    }
}
577
/// Rewrites POSIX character classes into Unicode character classes.
///
/// In fancy-regex, POSIX character classes only match ASCII characters, whereas Sublime's
/// syntaxes expect them to match Unicode characters as well. Only `[:alpha:]`, `[:alnum:]`,
/// `[:lower:]`, `[:upper:]` and `[:digit:]` are rewritten; everything else is left alone.
fn replace_posix_char_classes(regex: String) -> String {
    const UNICODE_CLASSES: [(&str, &str); 5] = [
        ("[:alpha:]", r"\p{L}"),
        ("[:alnum:]", r"\p{L}\p{N}"),
        ("[:lower:]", r"\p{Ll}"),
        ("[:upper:]", r"\p{Lu}"),
        ("[:digit:]", r"\p{Nd}"),
    ];

    UNICODE_CLASSES
        .iter()
        .fold(regex, |text, &(posix, unicode)| text.replace(posix, unicode))
}
590
591/// Some of the regexes include `$` and expect it to match end of line,
592/// e.g. *before* the `\n` in `test\n`.
593///
594/// In fancy-regex, `$` means end of text by default, so that would
595/// match *after* `\n`. Using `(?m:$)` instead means it matches end of line.
596///
597/// Note that we don't want to add a `(?m)` in the beginning to change the
598/// whole regex because that would also change the meaning of `^`. In
599/// fancy-regex, that also matches at the end of e.g. `test\n` which is
600/// different from onig. It would also change `.` to match more.
601fn regex_for_newlines(regex: String) -> String {
602    if !regex.contains('$') {
603        return regex;
604    }
605
606    let rewriter = RegexRewriterForNewlines {
607        parser: Parser::new(regex.as_bytes()),
608    };
609    rewriter.rewrite()
610}
611
612struct RegexRewriterForNewlines<'a> {
613    parser: Parser<'a>,
614}
615
616impl RegexRewriterForNewlines<'_> {
617    fn rewrite(mut self) -> String {
618        let mut result = Vec::new();
619
620        while let Some(c) = self.parser.peek() {
621            match c {
622                b'$' => {
623                    self.parser.next();
624                    result.extend_from_slice(br"(?m:$)");
625                }
626                b'\\' => {
627                    self.parser.next();
628                    result.push(c);
629                    if let Some(c2) = self.parser.peek() {
630                        self.parser.next();
631                        result.push(c2);
632                    }
633                }
634                b'[' => {
635                    let (mut content, _) = self.parser.parse_character_class();
636                    result.append(&mut content);
637                }
638                _ => {
639                    self.parser.next();
640                    result.push(c);
641                }
642            }
643        }
644        String::from_utf8(result).unwrap()
645    }
646}
647
648/// Rewrite a regex that matches `\n` to one that matches `$` (end of line) instead.
649/// That allows the regex to be used to match lines that don't include a trailing newline character.
650///
651/// The reason we're doing this is because the regexes in the syntax definitions assume that the
652/// lines that are being matched on include a trailing newline.
653///
654/// Note that the rewrite is just an approximation and there's a couple of cases it can not handle,
655/// due to `$` being an anchor whereas `\n` matches a character.
656fn regex_for_no_newlines(regex: String) -> String {
657    if !regex.contains(r"\n") {
658        return regex;
659    }
660
661    // A special fix to rewrite a pattern from the `Rd` syntax that the RegexRewriter can not
662    // handle properly.
663    let regex = regex.replace("(?:\\n)?", "(?:$|)");
664
665    let rewriter = RegexRewriterForNoNewlines {
666        parser: Parser::new(regex.as_bytes()),
667    };
668    rewriter.rewrite()
669}
670
671struct RegexRewriterForNoNewlines<'a> {
672    parser: Parser<'a>,
673}
674
675impl RegexRewriterForNoNewlines<'_> {
676    fn rewrite(mut self) -> String {
677        let mut result = Vec::new();
678        while let Some(c) = self.parser.peek() {
679            match c {
680                b'\\' => {
681                    self.parser.next();
682                    if let Some(c2) = self.parser.peek() {
683                        self.parser.next();
684                        // Replacing `\n` with `$` in `\n?` or `\n+` would make parsing later fail
685                        // with "target of repeat operator is invalid"
686                        let c3 = self.parser.peek();
687                        if c2 == b'n' && c3 != Some(b'?') && c3 != Some(b'+') && c3 != Some(b'*') {
688                            result.extend_from_slice(b"$");
689                        } else {
690                            result.push(c);
691                            result.push(c2);
692                        }
693                    } else {
694                        result.push(c);
695                    }
696                }
697                b'[' => {
698                    let (mut content, matches_newline) = self.parser.parse_character_class();
699                    if matches_newline && self.parser.peek() != Some(b'?') {
700                        result.extend_from_slice(b"(?:");
701                        result.append(&mut content);
702                        result.extend_from_slice(br"|$)");
703                    } else {
704                        result.append(&mut content);
705                    }
706                }
707                _ => {
708                    self.parser.next();
709                    result.push(c);
710                }
711            }
712        }
713        String::from_utf8(result).unwrap()
714    }
715}
716
717fn get_consuming_capture_indexes(regex: &str) -> Vec<usize> {
718    let parser = ConsumingCaptureIndexParser {
719        parser: Parser::new(regex.as_bytes()),
720    };
721    parser.get_consuming_capture_indexes()
722}
723
/// Scans a regex to find which capture groups lie outside lookarounds.
struct ConsumingCaptureIndexParser<'a> {
    parser: Parser<'a>,
}

impl ConsumingCaptureIndexParser<'_> {
    /// Find capture groups which are not inside lookarounds.
    ///
    /// If, in a YAML syntax definition, a scope stack is applied to a capture group inside a
    /// lookaround, (i.e. "captures:\n x: scope.stack goes.here", where "x" is the number of a
    /// capture group in a lookahead/behind), those scopes are not applied, so no need to
    /// even parse them.
    ///
    /// The returned list always starts with index 0 (the whole match), followed by the
    /// numbers of the qualifying groups in the order their opening parens appear.
    fn get_consuming_capture_indexes(mut self) -> Vec<usize> {
        let mut result = Vec::new();
        // Lookaround state saved at each opening paren, restored at the matching `)`.
        let mut stack = Vec::new();
        // Number of the most recently opened capture group; 0 stands for the whole match.
        let mut cap_num = 0;
        let mut in_lookaround = false;
        stack.push(in_lookaround);
        result.push(cap_num);

        while let Some(c) = self.parser.peek() {
            match c {
                b'\\' => {
                    // Skip the backslash and the byte it escapes, so an escaped paren or
                    // bracket is not interpreted.
                    self.parser.next();
                    self.parser.next();
                }
                b'[' => {
                    // Skip the whole character class; its content is not inspected here.
                    self.parser.parse_character_class();
                }
                b'(' => {
                    self.parser.next();
                    // add the current lookaround state to the stack so we can just pop at a closing paren
                    stack.push(in_lookaround);
                    if let Some(c2) = self.parser.peek() {
                        if c2 != b'?' {
                            // simple numbered capture group
                            cap_num += 1;
                            // if we are not currently in a lookaround,
                            // add this capture group number to the valid ones
                            if !in_lookaround {
                                result.push(cap_num);
                            }
                        } else {
                            // `(?` followed by c3 selects the kind of group.
                            self.parser.next();
                            if let Some(c3) = self.parser.peek() {
                                self.parser.next();
                                if c3 == b'=' || c3 == b'!' {
                                    // lookahead
                                    in_lookaround = true;
                                } else if c3 == b'<' {
                                    // NOTE(review): `(?<` followed by anything other than
                                    // `=` or `!` (e.g. `(?<name>`) is not counted as a capture
                                    // group here. Confirm that this is intended.
                                    if let Some(c4) = self.parser.peek() {
                                        if c4 == b'=' || c4 == b'!' {
                                            self.parser.next();
                                            // lookbehind
                                            in_lookaround = true;
                                        }
                                    }
                                } else if c3 == b'P' {
                                    if let Some(c4) = self.parser.peek() {
                                        if c4 == b'<' {
                                            // named capture group
                                            cap_num += 1;
                                            // if we are not currently in a lookaround,
                                            // add this capture group number to the valid ones
                                            if !in_lookaround {
                                                result.push(cap_num);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                b')' => {
                    // Restore the lookaround state from before the matching `(`. An
                    // unbalanced `)` on an empty stack leaves the state unchanged.
                    if let Some(value) = stack.pop() {
                        in_lookaround = value;
                    }
                    self.parser.next();
                }
                _ => {
                    self.parser.next();
                }
            }
        }
        result
    }
}
811
/// A tiny byte-oriented cursor over a regex pattern.
///
/// It does not understand regex syntax in general; it only offers
/// single-byte lookahead plus a helper to step over a character class.
struct Parser<'a> {
    /// Raw bytes of the pattern being scanned.
    bytes: &'a [u8],
    /// Offset of the byte that `peek` reports next.
    index: usize,
}

impl Parser<'_> {
    /// Creates a cursor positioned on the first byte of `bytes`.
    fn new(bytes: &[u8]) -> Parser<'_> {
        Parser { index: 0, bytes }
    }

    /// Returns the byte under the cursor without consuming it,
    /// or `None` once the cursor is at or past the end of the input.
    fn peek(&self) -> Option<u8> {
        self.bytes.get(self.index).copied()
    }

    /// Moves the cursor one byte forward (no bounds check; `peek` handles the end).
    fn next(&mut self) {
        self.index += 1;
    }

    /// Consumes a character class, assuming the cursor sits on its opening `[`.
    ///
    /// Returns the bytes that were consumed (always starting with `[`) and a
    /// flag telling whether an escaped `\n` was seen at the top nesting level
    /// of a non-negated class. Scanning stops after the `]` that closes the
    /// outermost class, or at the end of the input if the class is unterminated.
    fn parse_character_class(&mut self) -> (Vec<u8>, bool) {
        let mut class_bytes = vec![b'['];
        let mut depth = 0;
        let mut saw_newline = false;

        // Step over the opening bracket; it is already recorded above.
        self.next();

        let is_negated = self.peek() == Some(b'^');
        if is_negated {
            self.next();
            class_bytes.push(b'^');
        }

        // A `]` directly after `[` or `[^` is a literal and does not close the class.
        if self.peek() == Some(b']') {
            self.next();
            class_bytes.push(b']');
        }

        while let Some(byte) = self.peek() {
            // Every byte is consumed and recorded; only the bookkeeping differs.
            self.next();
            class_bytes.push(byte);

            match byte {
                b'\\' => {
                    // Take the escaped byte together with its backslash.
                    if let Some(escaped) = self.peek() {
                        self.next();
                        if escaped == b'n' && !is_negated && depth == 0 {
                            saw_newline = true;
                        }
                        class_bytes.push(escaped);
                    }
                }
                b'[' => depth += 1,
                b']' => {
                    if depth == 0 {
                        break;
                    }
                    depth -= 1;
                }
                _ => {}
            }
        }

        (class_bytes, saw_newline)
    }
}
886
887#[cfg(test)]
888mod tests {
889    use super::*;
890    use crate::parsing::Scope;
891
892    #[test]
893    fn can_parse() {
894        let defn: SyntaxDefinition = SyntaxDefinition::load_from_str(
895            "name: C\nscope: source.c\ncontexts: {main: []}",
896            false,
897            None,
898        )
899        .unwrap();
900        assert_eq!(defn.name, "C");
901        assert_eq!(defn.scope, Scope::new("source.c").unwrap());
902        let exts_empty: Vec<String> = Vec::new();
903        assert_eq!(defn.file_extensions, exts_empty);
904        assert!(!defn.hidden);
905        assert!(defn.variables.is_empty());
906        let defn2: SyntaxDefinition = SyntaxDefinition::load_from_str(
907            "
908        name: C
909        scope: source.c
910        file_extensions: [c, h]
911        hidden_file_extensions: [k, l]
912        hidden: true
913        variables:
914          ident: '[QY]+'
915        contexts:
916          prototype:
917            - match: lol
918              scope: source.php
919          main:
920            - match: \\b(if|else|for|while|{{ident}})\\b
921              scope: keyword.control.c keyword.looping.c
922              captures:
923                  1: meta.preprocessor.c++
924                  2: keyword.control.include.c++
925              push: [string, 'scope:source.c#main', 'CSS.sublime-syntax#rule-list-body']
926              with_prototype:
927                - match: wow
928                  pop: true
929            - match: '\"'
930              push: string
931          string:
932            - meta_scope: string.quoted.double.c
933            - meta_include_prototype: false
934            - match: \\\\.
935              scope: constant.character.escape.c
936            - match: '\"'
937              pop: true
938        ",
939            false,
940            None,
941        )
942        .unwrap();
943        assert_eq!(defn2.name, "C");
944        let top_level_scope = Scope::new("source.c").unwrap();
945        assert_eq!(defn2.scope, top_level_scope);
946        let exts: Vec<String> = vec!["c", "h", "k", "l"]
947            .into_iter()
948            .map(String::from)
949            .collect();
950        assert_eq!(defn2.file_extensions, exts);
951        assert!(defn2.hidden);
952        assert_eq!(defn2.variables.get("ident").unwrap(), "[QY]+");
953
954        let n: Vec<Scope> = Vec::new();
955        println!("{:?}", defn2);
956        // unreachable!();
957        let main = &defn2.contexts["main"];
958        assert_eq!(main.meta_content_scope, vec![top_level_scope]);
959        assert_eq!(main.meta_scope, n);
960        assert!(main.meta_include_prototype);
961
962        assert_eq!(defn2.contexts["__main"].meta_content_scope, n);
963        assert_eq!(
964            defn2.contexts["__start"].meta_content_scope,
965            vec![top_level_scope]
966        );
967
968        assert_eq!(
969            defn2.contexts["string"].meta_scope,
970            vec![Scope::new("string.quoted.double.c").unwrap()]
971        );
972        let first_pattern: &Pattern = &main.patterns[0];
973        match *first_pattern {
974            Pattern::Match(ref match_pat) => {
975                let m: &CaptureMapping = match_pat.captures.as_ref().expect("test failed");
976                assert_eq!(
977                    &m[0],
978                    &(1, vec![Scope::new("meta.preprocessor.c++").unwrap()])
979                );
980                use crate::parsing::syntax_definition::ContextReference::*;
981
982                // this is sadly necessary because Context is not Eq because of the Regex
983                let expected = MatchOperation::Push(vec![
984                    Named("string".to_owned()),
985                    ByScope {
986                        scope: Scope::new("source.c").unwrap(),
987                        sub_context: Some("main".to_owned()),
988                        with_escape: false,
989                    },
990                    File {
991                        name: "CSS".to_owned(),
992                        sub_context: Some("rule-list-body".to_owned()),
993                        with_escape: false,
994                    },
995                ]);
996                assert_eq!(
997                    format!("{:?}", match_pat.operation),
998                    format!("{:?}", expected)
999                );
1000
1001                assert_eq!(
1002                    match_pat.scope,
1003                    vec![
1004                        Scope::new("keyword.control.c").unwrap(),
1005                        Scope::new("keyword.looping.c").unwrap()
1006                    ]
1007                );
1008
1009                assert!(match_pat.with_prototype.is_some());
1010            }
1011            _ => unreachable!(),
1012        }
1013    }
1014
1015    #[test]
1016    fn can_parse_embed_as_with_prototypes() {
1017        let old_def = SyntaxDefinition::load_from_str(r#"
1018        name: C
1019        scope: source.c
1020        file_extensions: [c, h]
1021        variables:
1022          ident: '[QY]+'
1023        contexts:
1024          main:
1025            - match: '(>)\s*'
1026              captures:
1027                1: meta.tag.style.begin.html punctuation.definition.tag.end.html
1028              push:
1029                - [{ meta_include_prototype: false }, { meta_content_scope: 'source.css.embedded.html' }, { match: '(?i)(?=</style)', pop: true }]
1030                - scope:source.css
1031              with_prototype:
1032                - match: (?=(?i)(?=</style))
1033                  pop: true
1034        "#,false, None).unwrap();
1035
1036        let mut def_with_embed = SyntaxDefinition::load_from_str(
1037            r#"
1038        name: C
1039        scope: source.c
1040        file_extensions: [c, h]
1041        variables:
1042          ident: '[QY]+'
1043        contexts:
1044          main:
1045            - match: '(>)\s*'
1046              captures:
1047                1: meta.tag.style.begin.html punctuation.definition.tag.end.html
1048              embed: scope:source.css
1049              embed_scope: source.css.embedded.html
1050              escape: (?i)(?=</style)
1051        "#,
1052            false,
1053            None,
1054        )
1055        .unwrap();
1056
1057        // We will soon do an `assert_eq!()`. But there is one difference we must expect, namely
1058        // that for `def_with_embed`, the value of `ContextReference::ByScope::with_escape` will be
1059        // `true`, whereas for `old_def` it will be `false`. So manually adjust `with_escape` to
1060        // `false` so that `assert_eq!()` will work.
1061        let def_with_embed_context = def_with_embed.contexts.get_mut("main").unwrap();
1062        if let Pattern::Match(ref mut match_pattern) = def_with_embed_context.patterns[0] {
1063            if let MatchOperation::Push(ref mut context_references) = match_pattern.operation {
1064                if let ContextReference::ByScope {
1065                    ref mut with_escape,
1066                    ..
1067                } = context_references[1]
1068                {
1069                    *with_escape = false;
1070                }
1071            }
1072        }
1073
1074        assert_eq!(old_def.contexts["main"], def_with_embed.contexts["main"]);
1075    }
1076
1077    #[test]
1078    fn errors_on_embed_without_escape() {
1079        let def = SyntaxDefinition::load_from_str(
1080            r#"
1081        name: C
1082        scope: source.c
1083        file_extensions: [c, h]
1084        variables:
1085          ident: '[QY]+'
1086        contexts:
1087          main:
1088            - match: '(>)\s*'
1089              captures:
1090                1: meta.tag.style.begin.html punctuation.definition.tag.end.html
1091              embed: scope:source.css
1092              embed_scope: source.css.embedded.html
1093        "#,
1094            false,
1095            None,
1096        );
1097        assert!(def.is_err());
1098        match def.unwrap_err() {
1099            ParseSyntaxError::MissingMandatoryKey(key) => assert_eq!(key, "escape"),
1100            _ => unreachable!("Got unexpected ParseSyntaxError"),
1101        }
1102    }
1103
1104    #[test]
1105    fn errors_on_regex_compile_error() {
1106        let def = SyntaxDefinition::load_from_str(
1107            r#"
1108        name: C
1109        scope: source.c
1110        file_extensions: [test]
1111        contexts:
1112          main:
1113            - match: '[a'
1114              scope: keyword.name
1115        "#,
1116            false,
1117            None,
1118        );
1119        assert!(def.is_err());
1120        match def.unwrap_err() {
1121            ParseSyntaxError::RegexCompileError(ref regex, _) => assert_eq!("[a", regex),
1122            _ => unreachable!("Got unexpected ParseSyntaxError"),
1123        }
1124    }
1125
1126    #[test]
1127    fn can_parse_ugly_yaml() {
1128        let defn: SyntaxDefinition = SyntaxDefinition::load_from_str(
1129            "
1130        name: LaTeX
1131        scope: text.tex.latex
1132        contexts:
1133          main:
1134            - match: '((\\\\)(?:framebox|makebox))\\b'
1135              captures:
1136                1: support.function.box.latex
1137                2: punctuation.definition.backslash.latex
1138              push:
1139                - [{meta_scope: meta.function.box.latex}, {match: '', pop: true}]
1140                - argument
1141                - optional-arguments
1142          argument:
1143            - match: '\\{'
1144              scope: punctuation.definition.group.brace.begin.latex
1145            - match: '(?=\\S)'
1146              pop: true
1147          optional-arguments:
1148            - match: '(?=\\S)'
1149              pop: true
1150        ",
1151            false,
1152            None,
1153        )
1154        .unwrap();
1155        assert_eq!(defn.name, "LaTeX");
1156        let top_level_scope = Scope::new("text.tex.latex").unwrap();
1157        assert_eq!(defn.scope, top_level_scope);
1158
1159        let first_pattern: &Pattern = &defn.contexts["main"].patterns[0];
1160        match *first_pattern {
1161            Pattern::Match(ref match_pat) => {
1162                let m: &CaptureMapping = match_pat.captures.as_ref().expect("test failed");
1163                assert_eq!(
1164                    &m[0],
1165                    &(1, vec![Scope::new("support.function.box.latex").unwrap()])
1166                );
1167
1168                //use parsing::syntax_definition::ContextReference::*;
1169                // TODO: check the first pushed reference is Inline(...) and has a meta_scope of meta.function.box.latex
1170                // TODO: check the second pushed reference is Named("argument".to_owned())
1171                // TODO: check the third pushed reference is Named("optional-arguments".to_owned())
1172
1173                assert!(match_pat.with_prototype.is_none());
1174            }
1175            _ => unreachable!(),
1176        }
1177    }
1178
1179    #[test]
1180    fn names_anonymous_contexts() {
1181        let def = SyntaxDefinition::load_from_str(
1182            r#"
1183            scope: source.c
1184            contexts:
1185              main:
1186                - match: a
1187                  push: a
1188              a:
1189                - meta_scope: a
1190                - match: x
1191                  push:
1192                    - meta_scope: anonymous_x
1193                    - match: anything
1194                      push:
1195                        - meta_scope: anonymous_x_2
1196                - match: y
1197                  push:
1198                    - meta_scope: anonymous_y
1199                - match: z
1200                  escape: 'test'
1201            "#,
1202            false,
1203            None,
1204        )
1205        .unwrap();
1206
1207        assert_eq!(def.contexts["a"].meta_scope, vec![Scope::new("a").unwrap()]);
1208        assert_eq!(
1209            def.contexts["#anon_a_0"].meta_scope,
1210            vec![Scope::new("anonymous_x").unwrap()]
1211        );
1212        assert_eq!(
1213            def.contexts["#anon_a_1"].meta_scope,
1214            vec![Scope::new("anonymous_x_2").unwrap()]
1215        );
1216        assert_eq!(
1217            def.contexts["#anon_a_2"].meta_scope,
1218            vec![Scope::new("anonymous_y").unwrap()]
1219        );
1220        assert_eq!(def.contexts["#anon_a_3"].patterns.len(), 1); // escape
1221    }
1222
1223    #[test]
1224    fn can_use_fallback_name() {
1225        let def = SyntaxDefinition::load_from_str(
1226            r#"
1227        scope: source.c
1228        contexts:
1229          main:
1230            - match: ''
1231        "#,
1232            false,
1233            Some("C"),
1234        );
1235        assert_eq!(def.unwrap().name, "C");
1236    }
1237
1238    #[test]
1239    fn can_rewrite_regex_for_newlines() {
1240        fn rewrite(s: &str) -> String {
1241            regex_for_newlines(s.to_string())
1242        }
1243
1244        assert_eq!(&rewrite(r"a"), r"a");
1245        assert_eq!(&rewrite(r"\b"), r"\b");
1246        assert_eq!(&rewrite(r"(a)"), r"(a)");
1247        assert_eq!(&rewrite(r"[a]"), r"[a]");
1248        assert_eq!(&rewrite(r"[^a]"), r"[^a]");
1249        assert_eq!(&rewrite(r"[]a]"), r"[]a]");
1250        assert_eq!(&rewrite(r"[[a]]"), r"[[a]]");
1251
1252        assert_eq!(&rewrite(r"^"), r"^");
1253        assert_eq!(&rewrite(r"$"), r"(?m:$)");
1254        assert_eq!(&rewrite(r"^ab$"), r"^ab(?m:$)");
1255        assert_eq!(&rewrite(r"\^ab\$"), r"\^ab\$");
1256        assert_eq!(&rewrite(r"(//).*$"), r"(//).*(?m:$)");
1257
1258        // Do not rewrite this `$` because it's in a char class and doesn't mean end of line
1259        assert_eq!(&rewrite(r"[a$]"), r"[a$]");
1260    }
1261
1262    #[test]
1263    fn can_rewrite_regex_for_no_newlines() {
1264        fn rewrite(s: &str) -> String {
1265            regex_for_no_newlines(s.to_string())
1266        }
1267
1268        assert_eq!(&rewrite(r"a"), r"a");
1269        assert_eq!(&rewrite(r"\b"), r"\b");
1270        assert_eq!(&rewrite(r"(a)"), r"(a)");
1271        assert_eq!(&rewrite(r"[a]"), r"[a]");
1272        assert_eq!(&rewrite(r"[^a]"), r"[^a]");
1273        assert_eq!(&rewrite(r"[]a]"), r"[]a]");
1274        assert_eq!(&rewrite(r"[[a]]"), r"[[a]]");
1275
1276        assert_eq!(&rewrite(r"\n"), r"$");
1277        assert_eq!(&rewrite(r"\[\n"), r"\[$");
1278        assert_eq!(&rewrite(r"a\n?"), r"a\n?");
1279        assert_eq!(&rewrite(r"a\n+"), r"a\n+");
1280        assert_eq!(&rewrite(r"a\n*"), r"a\n*");
1281        assert_eq!(&rewrite(r"[abc\n]"), r"(?:[abc\n]|$)");
1282        assert_eq!(&rewrite(r"[^\n]"), r"[^\n]");
1283        assert_eq!(&rewrite(r"[^]\n]"), r"[^]\n]");
1284        assert_eq!(&rewrite(r"[\n]?"), r"[\n]?");
1285        // Removing the `\n` might result in an empty character class, so we should leave it.
1286        assert_eq!(&rewrite(r"[\n]"), r"(?:[\n]|$)");
1287        assert_eq!(&rewrite(r"[]\n]"), r"(?:[]\n]|$)");
1288        // In order to properly understand nesting, we'd have to have a full parser, so ignore it.
1289        assert_eq!(&rewrite(r"[[a]&&[\n]]"), r"[[a]&&[\n]]");
1290
1291        assert_eq!(&rewrite(r"ab(?:\n)?"), r"ab(?:$|)");
1292        assert_eq!(&rewrite(r"(?<!\n)ab"), r"(?<!$)ab");
1293        assert_eq!(&rewrite(r"(?<=\n)ab"), r"(?<=$)ab");
1294    }
1295
1296    #[test]
1297    fn can_get_valid_captures_from_regex() {
1298        let regex = "hello(test)(?=(world))(foo(?P<named>bar))";
1299        println!("{:?}", regex);
1300        let valid_indexes = get_consuming_capture_indexes(regex);
1301        println!("{:?}", valid_indexes);
1302        assert_eq!(valid_indexes, [0, 1, 3, 4]);
1303    }
1304
1305    #[test]
1306    fn can_get_valid_captures_from_regex2() {
1307        let regex = "hello(test)[(?=tricked](foo(bar))";
1308        println!("{:?}", regex);
1309        let valid_indexes = get_consuming_capture_indexes(regex);
1310        println!("{:?}", valid_indexes);
1311        assert_eq!(valid_indexes, [0, 1, 2, 3]);
1312    }
1313
1314    #[test]
1315    fn can_get_valid_captures_from_nested_regex() {
1316        let regex = "hello(test)(?=(world(?!(te(?<=(st))))))(foo(bar))";
1317        println!("{:?}", regex);
1318        let valid_indexes = get_consuming_capture_indexes(regex);
1319        println!("{:?}", valid_indexes);
1320        assert_eq!(valid_indexes, [0, 1, 5, 6]);
1321    }
1322
1323    #[test]
1324    fn error_loading_syntax_with_unescaped_backslash() {
1325        let load_err = SyntaxDefinition::load_from_str(
1326            r#"
1327            name: Unescaped Backslash
1328            scope: source.c
1329            file_extensions: [test]
1330            contexts:
1331              main:
1332                - match: '\'
1333            "#,
1334            false,
1335            None,
1336        )
1337        .unwrap_err();
1338        match load_err {
1339            ParseSyntaxError::RegexCompileError(bad_regex, _) => assert_eq!(bad_regex, r"\"),
1340            _ => panic!("Unexpected error: {load_err}"),
1341        }
1342    }
1343}