logcheck_fluent_bit_filter/
regex_conversion.rs

//! Bidirectional conversion between modern regex syntax and logcheck POSIX format.
//!
//! This module handles conversion between:
//! - Modern regex shorthand (`\d`, `\w`, `\s`) used by grex
//! - POSIX character classes (`[[:digit:]]`, `[[:alnum:]_]`, `[[:space:]]`) used by logcheck
6use once_cell::sync::Lazy;
7
/// One shorthand-to-POSIX pairing used when translating patterns.
///
/// Each value couples a modern escape sequence with the POSIX bracket
/// expression that stands in for it, plus a short human-readable note.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct RegexConversion {
    /// Shorthand escape in modern regex syntax, such as `\d`.
    pub modern: &'static str,
    /// Corresponding POSIX bracket expression, such as `[[:digit:]]`.
    pub posix: &'static str,
    /// Plain-language summary of the characters the pair matches.
    pub description: &'static str,
}
18
19/// Complete list of regex conversions
20pub static REGEX_CONVERSIONS: Lazy<Vec<RegexConversion>> = Lazy::new(|| {
21    vec![
22        RegexConversion {
23            modern: r"\d",
24            posix: "[[:digit:]]",
25            description: "Digits 0-9",
26        },
27        RegexConversion {
28            modern: r"\D",
29            posix: "[^[:digit:]]",
30            description: "Non-digits",
31        },
32        RegexConversion {
33            modern: r"\s",
34            posix: "[[:space:]]",
35            description: "Whitespace characters",
36        },
37        RegexConversion {
38            modern: r"\S",
39            posix: "[^[:space:]]",
40            description: "Non-whitespace characters",
41        },
42        // Note: \w is approximately [[:alnum:]_] but not exact due to Unicode
43        RegexConversion {
44            modern: r"\w",
45            posix: "[[:alnum:]_]",
46            description: "Word characters (letters, digits, underscore)",
47        },
48        RegexConversion {
49            modern: r"\W",
50            posix: "[^[:alnum:]_]",
51            description: "Non-word characters",
52        },
53    ]
54});
55
56/// Additional POSIX classes that don't have direct modern equivalents
57pub static POSIX_ONLY_CONVERSIONS: Lazy<Vec<(&'static str, &'static str)>> = Lazy::new(|| {
58    vec![
59        ("[[:alpha:]]", "a-zA-Z"),
60        ("[[:alnum:]]", "a-zA-Z0-9"),
61        ("[[:xdigit:]]", "0-9a-fA-F"),
62        ("[[:lower:]]", "a-z"),
63        ("[[:upper:]]", "A-Z"),
64        ("[[:blank:]]", " \\t"),
65        ("[[:punct:]]", "!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~"),
66        ("[[:print:]]", "\\x20-\\x7E"),
67        ("[[:graph:]]", "!-~"),
68        ("[[:cntrl:]]", "\\x00-\\x1F\\x7F"),
69    ]
70});
71
72/// Convert modern regex syntax (from grex) to POSIX format (for logcheck)
73///
74/// Converts shorthand like `\d`, `\w`, `\s` to POSIX classes like `[[:digit:]]`
75///
76/// # Examples
77/// ```
78/// use logcheck_fluent_bit_filter::regex_conversion::modern_to_posix;
79///
80/// assert_eq!(modern_to_posix(r"test\d\d\d"), "test[[:digit:]][[:digit:]][[:digit:]]");
81/// assert_eq!(modern_to_posix(r"\w+"), "[[:alnum:]_]+");
82/// ```
83pub fn modern_to_posix(pattern: &str) -> String {
84    let mut result = pattern.to_string();
85
86    // Apply conversions in order (longer patterns first to avoid partial replacements)
87    for conversion in REGEX_CONVERSIONS.iter() {
88        result = result.replace(conversion.modern, conversion.posix);
89    }
90
91    result
92}
93
94/// Convert POSIX format (from logcheck) to modern regex syntax
95///
96/// Converts POSIX classes like `[[:digit:]]` to shorthand like `\d`
97///
98/// # Examples
99/// ```
100/// use logcheck_fluent_bit_filter::regex_conversion::posix_to_modern;
101///
102/// assert_eq!(posix_to_modern("test[[:digit:]][[:digit:]][[:digit:]]"), r"test\d\d\d");
103/// assert_eq!(posix_to_modern("[[:alnum:]_]+"), r"\w+");
104/// ```
105pub fn posix_to_modern(pattern: &str) -> String {
106    let mut result = pattern.to_string();
107
108    // Apply conversions in reverse order
109    for conversion in REGEX_CONVERSIONS.iter() {
110        result = result.replace(conversion.posix, conversion.modern);
111    }
112
113    // Also convert POSIX-only classes to their regex equivalents
114    for (posix_class, rust_equiv) in POSIX_ONLY_CONVERSIONS.iter() {
115        result = result.replace(posix_class, rust_equiv);
116    }
117
118    result
119}
120
121/// Convert POSIX format to Rust regex syntax (for internal processing)
122///
123/// This is similar to `posix_to_modern` but also handles POSIX-only classes
124///
125/// # Examples
126/// ```
127/// use logcheck_fluent_bit_filter::regex_conversion::posix_to_rust;
128///
129/// assert_eq!(posix_to_rust("[[:digit:]]"), r"\d");
130/// assert_eq!(posix_to_rust("[[:alpha:]]"), "a-zA-Z");
131/// ```
132pub fn posix_to_rust(pattern: &str) -> String {
133    let mut result = pattern.to_string();
134
135    // First apply direct conversions
136    for conversion in REGEX_CONVERSIONS.iter() {
137        result = result.replace(conversion.posix, conversion.modern);
138    }
139
140    // Then handle POSIX-only classes
141    for (posix_class, rust_equiv) in POSIX_ONLY_CONVERSIONS.iter() {
142        result = result.replace(posix_class, rust_equiv);
143    }
144
145    // Escape unescaped curly braces that aren't part of quantifiers
146    result = result.replace(" { ", " \\{ ");
147    result = result.replace(" } ", " \\} ");
148
149    result
150}
151
/// Remove anchors from a regex pattern
///
/// grex always adds `^` and `$` anchors, but logcheck patterns may not need them.
///
/// A leading `^` is dropped. A trailing `$` is dropped only when it is a real
/// end anchor, i.e. when it is preceded by an even number of backslashes
/// (including none). A trailing `\$` is an escaped, literal dollar sign and is
/// kept, because removing it would leave a dangling backslash and change what
/// the pattern matches.
///
/// Returns the pattern unchanged (as an owned `String`) when it has no anchors.
pub fn remove_anchors(pattern: &str) -> String {
    let without_start = pattern.strip_prefix('^').unwrap_or(pattern);

    let without_end = match without_start.strip_suffix('$') {
        // An odd run of backslashes before the `$` means the last one escapes it.
        Some(body)
            if body
                .bytes()
                .rev()
                .take_while(|&byte| byte == b'\\')
                .count()
                % 2
                == 0 =>
        {
            body
        }
        _ => without_start,
    };

    without_end.to_string()
}
170
/// Add anchors to a regex pattern if not present
///
/// A `^` is prepended unless the pattern already starts with one, and a `$` is
/// appended unless the pattern already ends with a real end anchor. A trailing
/// `$` counts as an anchor only when it is preceded by an even number of
/// backslashes (including none). A trailing `\$` is an escaped, literal dollar
/// sign, so such a pattern still receives its own `$`.
pub fn ensure_anchors(pattern: &str) -> String {
    let has_start_anchor = pattern.starts_with('^');

    let has_end_anchor = match pattern.strip_suffix('$') {
        // An odd run of backslashes before the `$` means the last one escapes it.
        Some(body) => {
            body.bytes()
                .rev()
                .take_while(|&byte| byte == b'\\')
                .count()
                % 2
                == 0
        }
        None => false,
    };

    // At most two extra characters are added, so allocate once.
    let mut result = String::with_capacity(pattern.len() + 2);
    if !has_start_anchor {
        result.push('^');
    }
    result.push_str(pattern);
    if !has_end_anchor {
        result.push('$');
    }

    result
}
185
#[cfg(test)]
mod tests {
    use super::*;

    /// Every shorthand maps to its POSIX class, alone and in sequence.
    #[test]
    fn test_modern_to_posix() {
        let cases = [
            (r"\d", "[[:digit:]]"),
            (r"\d\d\d", "[[:digit:]][[:digit:]][[:digit:]]"),
            (r"\w+", "[[:alnum:]_]+"),
            (r"\s", "[[:space:]]"),
            (r"\D", "[^[:digit:]]"),
            (r"\W", "[^[:alnum:]_]"),
            (r"\S", "[^[:space:]]"),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(modern_to_posix(input), expected);
        }
    }

    /// POSIX classes with a shorthand equivalent collapse back to it.
    #[test]
    fn test_posix_to_modern() {
        let cases = [
            ("[[:digit:]]", r"\d"),
            ("[[:digit:]][[:digit:]][[:digit:]]", r"\d\d\d"),
            ("[[:alnum:]_]+", r"\w+"),
            ("[[:space:]]", r"\s"),
            ("[^[:digit:]]", r"\D"),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(posix_to_modern(input), expected);
        }
    }

    /// Shorthand classes and POSIX-only classes are both rewritten.
    #[test]
    fn test_posix_to_rust() {
        let cases = [
            ("[[:digit:]]", r"\d"),
            ("[[:alpha:]]", "a-zA-Z"),
            ("[[:alnum:]]", "a-zA-Z0-9"),
            ("[[:space:]]", r"\s"),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(posix_to_rust(input), expected);
        }
    }

    /// Converting to POSIX and back yields the starting pattern.
    #[test]
    fn test_roundtrip() {
        let original = r"\d+\s\w+";

        let as_posix = modern_to_posix(original);
        assert_eq!(as_posix, "[[:digit:]]+[[:space:]][[:alnum:]_]+");

        let restored = posix_to_modern(&as_posix);
        assert_eq!(restored, original);
    }

    /// Anchors are stripped wherever present; bare patterns are untouched.
    #[test]
    fn test_remove_anchors() {
        for &input in ["^test$", "^test", "test$", "test"].iter() {
            assert_eq!(remove_anchors(input), "test");
        }
    }

    /// Missing anchors are added; existing ones are not duplicated.
    #[test]
    fn test_ensure_anchors() {
        for &input in ["test", "^test", "test$", "^test$"].iter() {
            assert_eq!(ensure_anchors(input), "^test$");
        }
    }

    /// A realistic grex pattern keeps its escaped parentheses and anchors.
    #[test]
    fn test_complex_pattern() {
        let from_grex = r"^pam_unix\(sudo:session\): session \w+ for user \w+$";
        let expected =
            "^pam_unix\\(sudo:session\\): session [[:alnum:]_]+ for user [[:alnum:]_]+$";

        assert_eq!(modern_to_posix(from_grex), expected);
    }
}