syntect/parsing/
syntax_definition.rs

1//! Data structures for representing syntax definitions
2//!
//! Everything here is public because I want this library to be useful in super integrated cases
4//! like text editors and I have no idea what kind of monkeying you might want to do with the data.
5//! Perhaps parsing your own syntax format into this data structure?
6
7use super::regex::{Regex, Region};
8use super::{scope::*, ParsingError};
9use crate::parsing::syntax_set::SyntaxSet;
10use regex_syntax::escape;
11use serde::ser::{Serialize, Serializer};
12use serde_derive::{Deserialize, Serialize};
13use std::collections::{BTreeMap, HashMap};
14use std::hash::Hash;
15
/// A list of `(usize, Vec<Scope>)` pairs, used as the type of [`MatchPattern::captures`].
///
/// NOTE(review): presumably each `usize` is a regex capture-group index and the scopes are the
/// ones applied to that group's text — confirm against the parser.
pub type CaptureMapping = Vec<(usize, Vec<Scope>)>;
17
/// An opaque ID for a [`Context`].
///
/// It is a pair of indices: one selecting a syntax, one selecting a context within that syntax.
/// Both fields are crate-private, so code outside the crate can only copy, compare, hash and
/// (de)serialize IDs.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub struct ContextId {
    /// Index into [`SyntaxSet::syntaxes`]
    pub(crate) syntax_index: usize,

    /// Index into [`crate::parsing::LazyContexts::contexts`] for the [`Self::syntax_index`] syntax
    pub(crate) context_index: usize,
}
27
/// The main data structure representing a syntax definition loaded from a
/// `.sublime-syntax` file
///
/// You'll probably only need these as references to be passed around to parsing code.
///
/// Some useful public fields are the `name` field which is a human readable name to display in
/// syntax lists, and the `hidden` field which means hide this syntax from any lists because it is
/// for internal use.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct SyntaxDefinition {
    /// Human readable name to display in syntax lists.
    pub name: String,
    // NOTE(review): presumably the file extensions this syntax applies to — confirm whether
    // entries include the leading dot.
    pub file_extensions: Vec<String>,
    pub scope: Scope,
    // NOTE(review): presumably a regex source string tested against a file's first line —
    // confirm against the code that consumes it.
    pub first_line_match: Option<String>,
    /// When `true`, hide this syntax from any lists because it is for internal use.
    pub hidden: bool,
    /// Serialized through `ordered_map`, i.e. in key order rather than hash-map order.
    #[serde(serialize_with = "ordered_map")]
    pub variables: HashMap<String, String>,
    /// The contexts of this syntax, keyed by a `String`. Serialized through `ordered_map`,
    /// i.e. in key order rather than hash-map order.
    #[serde(serialize_with = "ordered_map")]
    pub contexts: HashMap<String, Context>,
}
48
/// One context of a syntax definition: its meta scopes, a few flags, and its ordered list of
/// patterns.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct Context {
    pub meta_scope: Vec<Scope>,
    pub meta_content_scope: Vec<Scope>,
    /// This being set false in the syntax file implies this field being set false,
    /// but it can also be set false for contexts that don't include the prototype for other reasons
    pub meta_include_prototype: bool,
    pub clear_scopes: Option<ClearAmount>,
    /// This is filled in by the linker at link time
    /// for contexts that have `meta_include_prototype==true`
    /// and are not included from the prototype.
    pub prototype: Option<ContextId>,
    // NOTE(review): presumably set when any pattern in this context needs text captured by an
    // earlier match substituted into its regex — confirm against the loader.
    pub uses_backrefs: bool,

    /// The patterns of this context, in order. Each entry is either a match or an include.
    pub patterns: Vec<Pattern>,
}
65
66impl Context {
67    pub fn new(meta_include_prototype: bool) -> Context {
68        Context {
69            meta_scope: Vec::new(),
70            meta_content_scope: Vec::new(),
71            meta_include_prototype,
72            clear_scopes: None,
73            uses_backrefs: false,
74            patterns: Vec::new(),
75            prototype: None,
76        }
77    }
78}
79
/// One entry in a [`Context`]'s pattern list.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub enum Pattern {
    /// A match pattern defined directly in the context.
    Match(MatchPattern),
    /// A reference to another context. [`MatchIter`] walks into it when the reference is
    /// `ContextReference::Direct` and skips it otherwise.
    Include(ContextReference),
}
85
/// Used to iterate over all the match patterns in a context
///
/// Basically walks the tree of patterns and include directives in the correct order.
#[derive(Debug)]
pub struct MatchIter<'a> {
    /// Used to look up the contexts that `Direct` includes point to.
    syntax_set: &'a SyntaxSet,
    /// The contexts currently being walked, outermost first and innermost last.
    ctx_stack: Vec<&'a Context>,
    /// For each entry of `ctx_stack`, the index of the next pattern to look at in that context.
    /// Entries are pushed and popped together with `ctx_stack`.
    index_stack: Vec<usize>,
}
95
/// A single match rule: a regex plus the scopes and operation attached to it.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct MatchPattern {
    // NOTE(review): presumably marks patterns whose regex contains backreferences that
    // `regex_with_refs` has to substitute — confirm against the loader.
    pub has_captures: bool,
    /// The regex of this pattern. Its source text is what `regex_with_refs` rewrites.
    pub regex: Regex,
    pub scope: Vec<Scope>,
    pub captures: Option<CaptureMapping>,
    pub operation: MatchOperation,
    pub with_prototype: Option<ContextReference>,
}
105
/// A reference to a [`Context`].
///
/// Only the `Direct` variant can be turned into a context or a [`ContextId`] by
/// `ContextReference::resolve` and `ContextReference::id`; every other variant makes those
/// methods return `ParsingError::UnresolvedContextReference`.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum ContextReference {
    #[non_exhaustive]
    Named(String),
    #[non_exhaustive]
    ByScope {
        scope: Scope,
        sub_context: Option<String>,
        /// `true` if this reference by scope is part of an `embed` for which
        /// there is an `escape`. In other words a reference for a context for
        /// which there "always is a way out". Enables falling back to `Plain
        /// Text` syntax in case the referenced scope is missing.
        with_escape: bool,
    },
    #[non_exhaustive]
    File {
        name: String,
        sub_context: Option<String>,
        /// Same semantics as for [`Self::ByScope::with_escape`].
        with_escape: bool,
    },
    #[non_exhaustive]
    Inline(String),
    /// A reference that already carries the [`ContextId`] of its target.
    #[non_exhaustive]
    Direct(ContextId),
}
133
/// The operation attached to a [`MatchPattern`].
///
/// NOTE(review): the variants presumably mirror the `push`, `set` and `pop` keys of the
/// `.sublime-syntax` format, with `None` meaning the match leaves the context stack alone —
/// confirm against the parser.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub enum MatchOperation {
    Push(Vec<ContextReference>),
    Set(Vec<ContextReference>),
    Pop,
    None,
}
141
142impl<'a> Iterator for MatchIter<'a> {
143    type Item = (&'a Context, usize);
144
145    fn next(&mut self) -> Option<(&'a Context, usize)> {
146        loop {
147            if self.ctx_stack.is_empty() {
148                return None;
149            }
150            // uncomment for debugging infinite recursion
151            // println!("{:?}", self.index_stack);
152            // use std::thread::sleep_ms;
153            // sleep_ms(500);
154            let last_index = self.ctx_stack.len() - 1;
155            let context = self.ctx_stack[last_index];
156            let index = self.index_stack[last_index];
157            self.index_stack[last_index] = index + 1;
158            if index < context.patterns.len() {
159                match context.patterns[index] {
160                    Pattern::Match(_) => {
161                        return Some((context, index));
162                    }
163                    Pattern::Include(ref ctx_ref) => {
164                        let ctx_ptr = match *ctx_ref {
165                            ContextReference::Direct(ref context_id) => {
166                                self.syntax_set.get_context(context_id).unwrap()
167                            }
168                            _ => return self.next(), // skip this and move onto the next one
169                        };
170                        self.ctx_stack.push(ctx_ptr);
171                        self.index_stack.push(0);
172                    }
173                }
174            } else {
175                self.ctx_stack.pop();
176                self.index_stack.pop();
177            }
178        }
179    }
180}
181
182/// Returns an iterator over all the match patterns in this context.
183///
184/// It recursively follows include directives. Can only be run on contexts that have already been
185/// linked up.
186pub fn context_iter<'a>(syntax_set: &'a SyntaxSet, context: &'a Context) -> MatchIter<'a> {
187    MatchIter {
188        syntax_set,
189        ctx_stack: vec![context],
190        index_stack: vec![0],
191    }
192}
193
194impl Context {
195    /// Returns the match pattern at an index
196    pub fn match_at(&self, index: usize) -> Result<&MatchPattern, ParsingError> {
197        match self.patterns[index] {
198            Pattern::Match(ref match_pat) => Ok(match_pat),
199            _ => Err(ParsingError::BadMatchIndex(index)),
200        }
201    }
202}
203
204impl ContextReference {
205    /// find the pointed to context
206    pub fn resolve<'a>(&self, syntax_set: &'a SyntaxSet) -> Result<&'a Context, ParsingError> {
207        match *self {
208            ContextReference::Direct(ref context_id) => syntax_set.get_context(context_id),
209            _ => Err(ParsingError::UnresolvedContextReference(self.clone())),
210        }
211    }
212
213    /// get the context ID this reference points to
214    pub fn id(&self) -> Result<ContextId, ParsingError> {
215        match *self {
216            ContextReference::Direct(ref context_id) => Ok(*context_id),
217            _ => Err(ParsingError::UnresolvedContextReference(self.clone())),
218        }
219    }
220}
221
/// Rewrites `regex_str`, replacing each single-digit backreference (`\0` to `\9`) with the text
/// returned by `substituter` for that digit.
///
/// * A backslash followed by an ASCII digit is replaced by `substituter(digit)`, or removed
///   entirely when the substituter returns `None`.
/// * A backslash followed by any other character, including another backslash, is copied through
///   unchanged as a two-character escape.
/// * A lone backslash at the very end of the input is kept.
/// * Every other character is copied as is.
///
/// Only one digit is consumed per backreference, so `\12` is backreference 1 followed by a
/// literal `2`.
pub(crate) fn substitute_backrefs_in_regex<F>(regex_str: &str, substituter: F) -> String
where
    F: Fn(usize) -> Option<String>,
{
    let mut output = String::with_capacity(regex_str.len());
    let mut remaining = regex_str.chars();

    while let Some(current) = remaining.next() {
        if current != '\\' {
            output.push(current);
            continue;
        }

        // `current` opens an escape; what follows decides how it is emitted.
        match remaining.next() {
            Some(escaped) => match escaped.to_digit(10) {
                Some(group) => {
                    if let Some(replacement) = substituter(group as usize) {
                        output.push_str(&replacement);
                    }
                }
                None => {
                    output.push('\\');
                    output.push(escaped);
                }
            },
            // Dangling backslash at the end of the input.
            None => output.push('\\'),
        }
    }

    output
}
249
250impl MatchPattern {
251    pub fn new(
252        has_captures: bool,
253        regex_str: String,
254        scope: Vec<Scope>,
255        captures: Option<CaptureMapping>,
256        operation: MatchOperation,
257        with_prototype: Option<ContextReference>,
258    ) -> MatchPattern {
259        MatchPattern {
260            has_captures,
261            regex: Regex::new(regex_str),
262            scope,
263            captures,
264            operation,
265            with_prototype,
266        }
267    }
268
269    /// Used by the parser to compile a regex which needs to reference
270    /// regions from another matched pattern.
271    pub fn regex_with_refs(&self, region: &Region, text: &str) -> Regex {
272        let new_regex = substitute_backrefs_in_regex(self.regex.regex_str(), |i| {
273            region.pos(i).map(|(start, end)| escape(&text[start..end]))
274        });
275
276        Regex::new(new_regex)
277    }
278
279    pub fn regex(&self) -> &Regex {
280        &self.regex
281    }
282}
283
284/// Serialize the provided map in natural key order, so that it's deterministic when dumping.
285pub(crate) fn ordered_map<K, V, S>(map: &HashMap<K, V>, serializer: S) -> Result<S::Ok, S::Error>
286where
287    S: Serializer,
288    K: Eq + Hash + Ord + Serialize,
289    V: Serialize,
290{
291    let ordered: BTreeMap<_, _> = map.iter().collect();
292    ordered.serialize(serializer)
293}
294
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that `regex_with_refs` substitutes backreferences with the escaped captured text.
    #[test]
    fn can_compile_refs() {
        // The pattern mixes an escaped backslash (`\\`), two backreferences that have a
        // matching group (`\2`, `\1`), one that has none (`\9`), and an ordinary escape (`\w`).
        let pat = MatchPattern {
            has_captures: true,
            regex: Regex::new(r"lol \\ \2 \1 '\9' \wz".into()),
            scope: vec![],
            captures: None,
            operation: MatchOperation::None,
            with_prototype: None,
        };
        // Five capture groups; the first one captures text full of regex metacharacters.
        let r = Regex::new(r"(\\\[\]\(\))(b)(c)(d)(e)".into());
        let s = r"\[]()bcde";
        let mut region = Region::new();
        let matched = r.search(s, 0, s.len(), Some(&mut region));
        assert!(matched);

        // `\2` becomes `b`, `\1` becomes the escaped form of group 1, `\9` disappears because
        // there is no ninth group, and `\\` and `\w` are left untouched.
        let regex_with_refs = pat.regex_with_refs(&region, s);
        assert_eq!(regex_with_refs.regex_str(), r"lol \\ b \\\[\]\(\) '' \wz");
    }
}
318}