revert: go back to the regex implementation, but switch to regex-lite

Aaalibaba42 · Aaalibaba42 · commit 94970f767305 · 2026-03-12T18:18:39.000+01:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/datadog-remote-config/Cargo.toml b/datadog-remote-config/Cargo.toml
@@ -49,6 +49,7 @@ serde_with = "3"
 
 # Test feature
 hyper-util = { workspace = true, features = ["service"], optional = true }
+regex-lite = "0.1.9"
 
 [dev-dependencies]
 futures = "0.3"
diff --git a/datadog-remote-config/src/config/agent_task.rs b/datadog-remote-config/src/config/agent_task.rs
@@ -6,18 +6,9 @@ use serde::Deserialize;
 #[cfg(feature = "test")]
 use serde::Serialize;
 
+use regex_lite::Regex;
 use serde::de::{self, Deserializer};
 
-fn is_valid_suffixed_case_id(s: &str) -> bool {
-    let Some(rest) = s
-        .strip_suffix("-with-debug")
-        .or_else(|| s.strip_suffix("-with-content"))
-    else {
-        return false;
-    };
-    !rest.is_empty() && rest.bytes().all(|b| b.is_ascii_digit())
-}
-
 fn deserialize_case_id<'de, D>(deserializer: D) -> Result<String, D::Error>
 where
     D: Deserializer<'de>,
@@ -32,7 +23,9 @@ where
     if s.chars().all(|c| c.is_ascii_digit()) {
         return Ok(s);
     }
-    if is_valid_suffixed_case_id(&s) {
+    let re = Regex::new(r"^\d+-(with-debug|with-content)$")
+        .map_err(|_| de::Error::custom("Invalid case_id format"))?;
+    if re.is_match(&s) {
         return Ok(s);
     }
     Err(de::Error::custom(
diff --git a/libdd-common/Cargo.toml b/libdd-common/Cargo.toml
@@ -47,6 +47,7 @@ static_assertions = "1.1.0"
 libc = "0.2"
 const_format = "0.2.34"
 nix = { version = "0.29", features = ["process"] }
+regex-lite = "0.1.9"
 [target.'cfg(windows)'.dependencies.windows-sys]
 version = "0.52"
 features = [
diff --git a/libdd-common/src/azure_app_services.rs b/libdd-common/src/azure_app_services.rs
@@ -1,6 +1,7 @@
 // Copyright 2021-Present Datadog, Inc. https://www.datadoghq.com/
 // SPDX-License-Identifier: Apache-2.0
 
+use regex_lite::Regex;
 use std::env;
 use std::sync::LazyLock;
 
@@ -103,23 +104,13 @@ impl AzureMetadata {
     }
 
     fn extract_resource_group(s: Option<String>) -> Option<String> {
-        // /.+\+(.+)-.+webspace(-Linux)?/
-        let text = s.as_ref()?;
-        let (before_plus, after_plus) = text.rsplit_once('+')?;
-        if before_plus.is_empty() {
-            return None;
-        }
-        let webspace_pos = after_plus.rfind("webspace")?;
-        let before_webspace = &after_plus[..webspace_pos];
-        let dash_pos = before_webspace.rfind('-')?;
-        if dash_pos + 1 >= before_webspace.len() {
-            return None;
-        }
-        let resource_group = &before_webspace[..dash_pos];
-        if resource_group.is_empty() {
-            return None;
-        }
-        Some(resource_group.to_string())
+        // Compiled once per process; the pattern is a constant, so `unwrap` cannot fail.
+        #[allow(clippy::unwrap_used)]
+        static RE: LazyLock<Regex> =
+            LazyLock::new(|| Regex::new(r".+\+(.+)-.+webspace(-Linux)?").unwrap());
+
+        let caps = RE.captures(s.as_deref()?)?;
+        caps.get(1).map(|m| m.as_str().to_string())
     }
 
     /*
diff --git a/libdd-common/src/entity_id/unix/container_id.rs b/libdd-common/src/entity_id/unix/container_id.rs
@@ -3,85 +3,37 @@
 
 //! This module provides functions to parse the container id from the cgroup file
 use super::CgroupFileParsingError;
+use regex_lite::Regex;
 use std::fs::File;
 use std::io::{BufRead, BufReader};
 use std::path::Path;
+use std::sync::LazyLock;
+
+const UUID_SOURCE: &str =
+    r"[0-9a-f]{8}[-_][0-9a-f]{4}[-_][0-9a-f]{4}[-_][0-9a-f]{4}[-_][0-9a-f]{12}";
+const CONTAINER_SOURCE: &str = r"[0-9a-f]{64}";
+const TASK_SOURCE: &str = r"[0-9a-f]{32}-\d+";
+
+/// Matches one cgroup line (`hierarchy-id:controllers:path`) and captures the path.
+#[allow(clippy::unwrap_used)]
+pub(crate) static LINE_REGEX: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new(r"^\d+:[^:]*:(.+)$").unwrap());
+
+/// Matches a container id (UUID, 64 hex chars, or task id) at the end of a cgroup path,
+/// optionally followed by a literal `.scope` suffix and trailing spaces.
+pub(crate) static CONTAINER_REGEX: LazyLock<Regex> = LazyLock::new(|| {
+    #[allow(clippy::unwrap_used)]
+    Regex::new(&format!(r"({UUID_SOURCE}|{CONTAINER_SOURCE}|{TASK_SOURCE})(?:\.scope)? *$"))
+        .unwrap()
+});
 
-fn is_lowercase_hex(b: u8) -> bool {
-    matches!(b, b'0'..=b'9' | b'a'..=b'f')
-}
-
-/// Try to match `[0-9a-f]{64}` at the end of `s`.
-fn try_match_hex64(s: &str) -> Option<&str> {
-    if s.len() < 64 {
-        return None;
-    }
-
-    let candidate = &s[s.len() - 64..];
-    candidate.bytes().all(is_lowercase_hex).then_some(candidate)
-}
-
-/// Try to match a UUID `[0-9a-f]{8}[-_][0-9a-f]{4}[-_]..[-_][0-9a-f]{12}` (36 chars) at the end.
-fn try_match_uuid(s: &str) -> Option<&str> {
-    if s.len() < 36 {
-        return None;
-    }
-
-    let candidate = &s[s.len() - 36..];
-    const TEMPLATE: &[u8; 36] = b"hhhhhhhh-hhhh-hhhh-hhhh-hhhhhhhhhhhh";
-    candidate
-        .as_bytes()
-        .iter()
-        .zip(TEMPLATE)
-        .all(|(&c, &t)| match t {
-            b'h' => is_lowercase_hex(c),
-            b'-' => matches!(c, b'-' | b'_'),
-            _ => false,
-        })
-        .then_some(candidate)
-}
-
-/// Try to match `[0-9a-f]{32}-\d+` at the end of `s`.
-fn try_match_task_id(s: &str) -> Option<&str> {
-    let (prefix, digits) = s.rsplit_once('-')?;
-    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) || prefix.len() < 32 {
-        return None;
-    }
-
-    let hex_start = prefix.len() - 32;
-    prefix[hex_start..]
-        .bytes()
-        .all(is_lowercase_hex)
-        .then_some(&s[hex_start..])
-}
-
-/// Extract a container ID from a cgroup path, matching the pattern
-/// `(UUID|HEX64|TASK_ID)(?:.scope)? *$`
-pub(super) fn extract_container_id_from_path(path: &str) -> Option<&str> {
-    let path = {
-        let trimmed = path.trim_end();
-        trimmed.strip_suffix(".scope").unwrap_or(trimmed)
-    };
-
-    try_match_hex64(path)
-        .or_else(|| try_match_uuid(path))
-        .or_else(|| try_match_task_id(path))
-}
-
-/// Parse a cgroup line (`^\d+:[^:]*:(.+)$`) and extract a container ID from the path component.
 fn parse_line(line: &str) -> Option<&str> {
-    let mut parts = line.splitn(3, ':');
-    let hierarchy_id = parts.next()?;
-    let _controllers = parts.next()?;
-    let path = parts.next()?;
-
-    if hierarchy_id.is_empty()
-        || !hierarchy_id.bytes().all(|b| b.is_ascii_digit())
-        || path.is_empty()
-    {
-        return None;
-    }
-    extract_container_id_from_path(path)
+    // Capture group 1 is mandatory in both patterns, so `get(1)` is `Some` whenever
+    // `captures` is; propagating with `?` avoids an `unwrap` and a lint suppression.
+    let path = LINE_REGEX.captures(line)?.get(1)?.as_str();
+    let container_id = CONTAINER_REGEX.captures(path)?.get(1)?.as_str();
+
+    Some(container_id)
 }
 
 /// Extract container id contained in the cgroup file located at `cgroup_path`
diff --git a/libdd-common/src/entity_id/unix/mod.rs b/libdd-common/src/entity_id/unix/mod.rs
@@ -104,27 +104,15 @@ pub static ENTITY_ID: LazyLock<Option<&'static str>> = LazyLock::new(|| {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use regex_lite::Regex;
 
-    enum EntityIdKind {
-        Inode,
-        ContainerId,
-    }
-
-    fn matches_entity_id_kind(entity_id: &str, kind: &EntityIdKind) -> bool {
-        match kind {
-            EntityIdKind::Inode => {
-                entity_id.starts_with("in-")
-                    && entity_id[3..].bytes().all(|b| b.is_ascii_digit())
-                    && entity_id.len() > 3
-            }
-            EntityIdKind::ContainerId => {
-                entity_id.starts_with("ci-")
-                    && container_id::extract_container_id_from_path(&entity_id[3..]).is_some()
-            }
-        }
-    }
+    static IN_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^in-\d+$").unwrap());
+    static CI_REGEX: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(&format!(r"ci-{}", container_id::CONTAINER_REGEX.as_str())).unwrap()
+    });
 
-    fn test_entity_id(filename: &str, expected_kind: Option<EntityIdKind>) {
+    /// The following test can only be run in isolation because of caching behaviour
+    fn test_entity_id(filename: &str, expected_result: Option<&Regex>) {
         let test_root_dir = Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/tests"));
 
         let entity_id = compute_entity_id(
@@ -133,11 +121,12 @@ mod tests {
             test_root_dir.join("cgroup").as_path(),
         );
 
-        if let Some(kind) = expected_kind {
-            let id = entity_id.as_deref().unwrap();
+        if let Some(regex) = expected_result {
             assert!(
-                matches_entity_id_kind(id, &kind),
-                "testing get_entity_id with file {filename}: {id} did not match expected format",
+                regex.is_match(entity_id.as_deref().unwrap()),
+                "testing get_entity_id with file {}: {} is not matching the expected regex",
+                filename,
+                entity_id.as_deref().unwrap_or("None")
             );
         } else {
             assert_eq!(
@@ -150,19 +139,19 @@ mod tests {
     #[cfg_attr(miri, ignore)]
     #[test]
     fn test_entity_id_for_v2() {
-        test_entity_id("cgroup.v2", Some(EntityIdKind::Inode))
+        test_entity_id("cgroup.v2", Some(&*IN_REGEX))
     }
 
     #[cfg_attr(miri, ignore)]
     #[test]
     fn test_entity_id_for_v1() {
-        test_entity_id("cgroup.linux", Some(EntityIdKind::Inode))
+        test_entity_id("cgroup.linux", Some(&*IN_REGEX))
     }
 
     #[cfg_attr(miri, ignore)]
     #[test]
     fn test_entity_id_for_container_id() {
-        test_entity_id("cgroup.docker", Some(EntityIdKind::ContainerId))
+        test_entity_id("cgroup.docker", Some(&*CI_REGEX))
     }
 
     #[cfg_attr(miri, ignore)]
diff --git a/libdd-trace-obfuscation/Cargo.toml b/libdd-trace-obfuscation/Cargo.toml
@@ -20,6 +20,7 @@ log = "0.4"
 libdd-trace-protobuf = { version = "1.1.0", path = "../libdd-trace-protobuf" }
 libdd-trace-utils = { version = "2.0.0", path = "../libdd-trace-utils" }
 libdd-common = { version = "2.0.0", path = "../libdd-common" }
+regex-lite = "0.1.9"
 
 [dev-dependencies]
 duplicate = "0.4.1"
diff --git a/libdd-trace-obfuscation/src/ip_address.rs b/libdd-trace-obfuscation/src/ip_address.rs
@@ -1,7 +1,8 @@
 // Copyright 2024-Present Datadog, Inc. https://www.datadoghq.com/
 // SPDX-License-Identifier: Apache-2.0
 
-use std::{borrow::Cow, collections::HashSet, net::Ipv6Addr};
+use regex_lite::Regex;
+use std::{borrow::Cow, collections::HashSet, net::Ipv6Addr, sync::LazyLock};
 
 const ALLOWED_IP_ADDRESSES: [&str; 5] = [
     // localhost
@@ -14,21 +15,11 @@ const ALLOWED_IP_ADDRESSES: [&str; 5] = [
     "169.254.170.2",
 ];
 
-const PROTOCOL_PREFIXES: &[&str] = &["dnspoll", "ftp", "file", "http", "https"];
-
-fn find_protocol_prefix(s: &str) -> Option<usize> {
-    for &proto in PROTOCOL_PREFIXES {
-        if let Some(rest) = s.strip_prefix(proto) {
-            if rest.starts_with(":///") {
-                return Some(proto.len() + 4);
-            }
-            if rest.starts_with("://") {
-                return Some(proto.len() + 3);
-            }
-        }
-    }
-    None
-}
+const PREFIX_REGEX_LITERAL: &str = r"^((?:dnspoll|ftp|file|http|https):/{2,3})";
+static PREFIX_REGEX: LazyLock<Regex> = LazyLock::new(|| {
+    #[allow(clippy::unwrap_used)]
+    Regex::new(PREFIX_REGEX_LITERAL).unwrap()
+});
 
 /// Quantizes a comma separated list of hosts.
 ///
@@ -96,10 +87,11 @@ fn quantize_ip(s: &str) -> Option<String> {
 
 /// Split the ip prefix, can be either a provider specific prefix or a protocol
 fn split_prefix(s: &str) -> (&str, &str) {
+    #[allow(clippy::unwrap_used)]
     if let Some(tail) = s.strip_prefix("ip-") {
         ("ip-", tail)
-    } else if let Some(end) = find_protocol_prefix(s) {
-        s.split_at(end)
+    } else if let Some(protocol) = PREFIX_REGEX.find(s) {
+        s.split_at(protocol.end())
     } else {
         ("", s)
     }
@@ -112,8 +104,12 @@ fn parse_ip(s: &str) -> Option<(&str, &str)> {
         match ch {
             '0'..='9' => continue,
             '.' | '-' | '_' => return parse_ip_v4(s, ch),
-            ':' | 'A'..='F' | 'a'..='f' if s.parse::<Ipv6Addr>().is_ok() => {
-                return Some((s, ""));
+            ':' | 'A'..='F' | 'a'..='f' => {
+                if s.parse::<Ipv6Addr>().is_ok() {
+                    return Some((s, ""));
+                } else {
+                    return None;
+                }
             }
             '[' => {
                 // Parse IPv6 in [host]:port format