diff --git a/README.md b/README.md index 32dd9fb..5f42a76 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,9 @@ # sed Rust reimplementation of the [sed utility](https://pubs.opengroup.org/onlinepubs/9799919799/utilities/sed.html) -with some [GNU sed](https://www.gnu.org/software/sed/manual/sed.html) -and [FreeBSD sed](https://man.freebsd.org/cgi/man.cgi?sed(1)) extensions. +with some [GNU sed](https://www.gnu.org/software/sed/manual/sed.html), +[FreeBSD sed](https://man.freebsd.org/cgi/man.cgi?sed(1)), +and other extensions. ## Installation @@ -23,6 +24,17 @@ cd sed cargo build --release cargo run --release ``` +## Extensions +### GNU +* Command-line arguments can be specified in long (`--`) form. +* Spaces can precede a regular expression modifier. + +### BSD and GNU +* The second address in a range can be specified as a relative address with +N. + +### Other +* Unicode characters can be specified in regular expression pattern, replacement + and transliteration sequences using `\uXXXX` or `\UXXXXXXXX` sequences. 
## License diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index a70f681..53761d8 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -18,7 +18,7 @@ use std::path::PathBuf; // For file descriptors and equivalent // Compilation and processing options provided mostly through the // command-line interface -#[derive(Debug)] +#[derive(Debug, Default)] pub struct CliOptions { // Command-line flags with corresponding names pub all_output_files: bool, @@ -107,6 +107,21 @@ pub struct Command { pub next: Option>, // Pointer to next command } +impl Default for Command { + fn default() -> Self { + Command { + code: '_', + addr1: None, + addr2: None, + non_select: false, + start_line: Some(0), + text: None, + data: CommandData::None, + next: None, + } + } +} + #[derive(Debug)] pub enum CommandData { None, diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 5d01629..c45a1ad 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -8,12 +8,20 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use crate::command::{CliOptions, Command, CommandData, ScriptValue}; +use crate::command::{Address, AddressType, AddressValue, CliOptions, Command, ScriptValue}; +use crate::delimited_parser::{compilation_error, parse_regex}; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; use once_cell::sync::Lazy; +use regex::Regex; +use std::cell::RefCell; use std::collections::HashMap; -use uucore::error::{UResult, USimpleError}; +use uucore::error::UResult; + +thread_local! { + /// The previously saved RE. It is reused when specifying an empty one. 
+ static SAVED_REGEX: RefCell> = const { RefCell::new(None) }; +} // A global, immutable map of command properties, initialized on first access static CMD_MAP: Lazy> = Lazy::new(build_command_map); @@ -202,9 +210,9 @@ pub fn compile( scripts: Vec, cli_options: &mut CliOptions, ) -> UResult>> { - let mut line_provider = ScriptLineProvider::new(scripts); + let mut make_providers = ScriptLineProvider::new(scripts); - let result = compile_thread(&mut line_provider, cli_options)?; + let result = compile_thread(&mut make_providers, cli_options)?; // TODO: fix-up labels, check used labels, setup append & match structures Ok(result) } @@ -237,18 +245,8 @@ fn compile_thread( continue 'next_char; } - let mut cmd = Box::new(Command { - next: None, - addr1: None, - addr2: None, - start_line: Some(0), - text: None, - data: CommandData::None, - code: '_', - non_select: false, - }); - - let n_addr = compile_addresses(&mut line, &mut cmd); + let mut cmd = Box::new(Command::default()); + let n_addr = compile_address_range(lines, &mut line, &mut cmd)?; let mut cmd_spec = get_cmd_spec(lines, &line, n_addr)?; if cmd_spec.args == CommandArgs::NonSelect { @@ -274,11 +272,150 @@ fn compile_thread( } } -// Compile a command's addresses into cmd. -// Return the number of addresses encountered. -fn compile_addresses(_line: &mut ScriptCharProvider, _cmd: &mut Command) -> usize { - // TODO: implement address parsing - 0 +/// Return true if c is a valid character for specifying a context address +fn is_address_char(c: char) -> bool { + matches!(c, '0'..='9' | '/' | '\\' | '$') +} + +/// Compile a command's optional address range into cmd. +/// Return the number of addresses encountered. 
+fn compile_address_range( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + cmd: &mut Command, +) -> UResult { + let mut n_addr = 0; + + line.eat_spaces(); + if !line.eol() && is_address_char(line.current()) { + if let Ok(addr1) = compile_address(lines, line) { + cmd.addr1 = Some(addr1); + n_addr += 1; + } + } + + line.eat_spaces(); + if n_addr == 1 && !line.eol() && line.current() == ',' { + line.advance(); + line.eat_spaces(); + if !line.eol() { + if let Ok(addr2) = compile_address(lines, line) { + cmd.addr2 = Some(addr2); + n_addr += 1; + } + } + } + + Ok(n_addr) +} + +/// Compile and return a single range address specification. +fn compile_address(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult
{ + let mut icase = false; + + if line.eol() { + return compilation_error(lines, line, "expected context address"); + } + + match line.current() { + '\\' | '/' => { + // Regular expression + if line.current() == '\\' { + // The next character is an arbitrary delimiter + line.advance(); + } + let re = parse_regex(lines, line)?; + // Skip over delimiter + line.advance(); + + line.eat_spaces(); + if !line.eol() && line.current() == 'I' { + icase = true; + line.advance(); + } + + Ok(Address { + atype: AddressType::Re, + value: AddressValue::Regex(compile_regex(lines, line, &re, icase)?), + }) + } + '$' => { + line.advance(); + Ok(Address { + atype: AddressType::Last, + value: AddressValue::LineNumber(0), + }) + } + '+' => { + line.advance(); + let number = parse_number(lines, line)?; + Ok(Address { + atype: AddressType::RelLine, + value: AddressValue::LineNumber(number), + }) + } + c if c.is_ascii_digit() => { + let number = parse_number(lines, line)?; + Ok(Address { + atype: AddressType::Line, + value: AddressValue::LineNumber(number), + }) + } + _ => panic!("invalid context address"), + } +} + +/// Parse and return the decimal number at the current line position. +/// Advance the line to first non-digit or EOL. +fn parse_number(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { + let mut num_str = String::new(); + + while !line.eol() && line.current().is_ascii_digit() { + num_str.push(line.current()); + line.advance(); + } + + num_str + .parse::() + .map_err(|_| format!("invalid number '{}'", num_str)) + .map_err(|msg| compilation_error::(lines, line, msg).unwrap_err()) +} + +/// Compile the provided regular expression string into a corresponding engine. 
+fn compile_regex( + lines: &ScriptLineProvider, + line: &ScriptCharProvider, + pattern: &str, + icase: bool, +) -> UResult { + if pattern.is_empty() { + SAVED_REGEX.with(|cell| { + if let Some(existing) = &*cell.borrow() { + Ok(existing.clone()) + } else { + compilation_error(lines, line, "no previously compiled regex available") + } + }) + } else { + let full_pattern = if icase { + if pattern.is_empty() { + return compilation_error(lines, line, "cannot specify a modifier on an empty RE"); + } + format!("(?i){}", pattern) + } else { + pattern.to_string() + }; + + let compiled = Regex::new(&full_pattern).map_err(|e| { + compilation_error::(lines, line, format!("invalid regex '{}': {}", pattern, e)) + .unwrap_err() + })?; + + SAVED_REGEX.with(|cell| { + *cell.borrow_mut() = Some(compiled.clone()); + }); + Ok(compiled) + } } // Compile the specified command @@ -301,7 +438,7 @@ fn compile_command( return Ok(ContinueAction::NextChar); } if !line.eol() { - return compile_error( + return compilation_error( lines, line, format!("extra characters at the end of the {} command", cmd.code), @@ -336,24 +473,6 @@ fn compile_command( Ok(ContinueAction::NextLine) } -// Fail with msg as a compile error at the current location -fn compile_error( - lines: &ScriptLineProvider, - line: &ScriptCharProvider, - msg: impl ToString, -) -> UResult { - Err(USimpleError::new( - 1, - format!( - "{}:{}:{}: error: {}", - lines.get_input_name(), - lines.get_line_number(), - line.get_pos(), - msg.to_string() - ), - )) -} - // Return the specification for the command letter at the current line position // checking for diverse errors. 
fn get_cmd_spec( @@ -362,19 +481,19 @@ fn get_cmd_spec( n_addr: usize, ) -> UResult<&'static CommandSpec> { if line.eol() { - return compile_error(lines, line, "command expected"); + return compilation_error(lines, line, "command expected"); } let ch = line.current(); let opt_cmd_spec = lookup_command(ch); if opt_cmd_spec.is_none() { - return compile_error(lines, line, format!("invalid command code {}", ch)); + return compilation_error(lines, line, format!("invalid command code {}", ch)); } let cmd_spec = opt_cmd_spec.unwrap(); if n_addr > cmd_spec.n_addr { - return compile_error( + return compilation_error( lines, line, format!( @@ -396,6 +515,12 @@ fn lookup_command(cmd: char) -> Option<&'static CommandSpec> { mod tests { use super::*; + fn make_providers(input: &str) -> (ScriptLineProvider, ScriptCharProvider) { + let lines = ScriptLineProvider::new(vec![]); // Empty for tests + let line = ScriptCharProvider::new(input); + (lines, line) + } + // lookup_command #[test] fn test_lookup_empty_command() { @@ -492,9 +617,9 @@ mod tests { ScriptCharProvider::new(s) } - // compile_error + // compilation_error #[test] - fn test_compile_error_message_format() { + fn test_compilation_error_message_format() { let lines = ScriptLineProvider::with_active_state("test.sed", 42); let mut line = char_provider_from("whatever"); line.advance(); // move to position 1 @@ -503,7 +628,7 @@ mod tests { line.advance(); // now at position 4 let msg = "unexpected token"; - let result: UResult<()> = compile_error(&lines, &line, msg); + let result: UResult<()> = compilation_error(&lines, &line, msg); assert!(result.is_err()); @@ -514,13 +639,13 @@ mod tests { } #[test] - fn test_compile_error_with_format_message() { + fn test_compilation_error_with_format_message() { let lines = ScriptLineProvider::with_active_state("input.txt", 3); let line = char_provider_from("x"); // We're at position 0 let result: UResult<()> = - compile_error(&lines, &line, format!("invalid command '{}'", 'x')); + 
compilation_error(&lines, &line, format!("invalid command '{}'", 'x')); assert!(result.is_err()); @@ -576,4 +701,390 @@ mod tests { let spec = result.unwrap(); assert_eq!(spec.code, 'a'); } + + // parse_number + #[test] + fn test_parse_number_basic() { + let (lines, mut chars) = make_providers("123abc"); + assert_eq!(parse_number(&lines, &mut chars).unwrap(), 123); + assert_eq!(chars.current(), 'a'); // Should stop at first non-digit + } + + #[test] + fn test_parse_number_invalid() { + let (lines, mut chars) = make_providers("abc"); + assert!(parse_number(&lines, &mut chars).is_err()); + } + + // compile_re + fn dummy_providers() -> (ScriptLineProvider, ScriptCharProvider) { + make_providers("dummy input") + } + + #[test] + fn test_compile_re_basic() { + let (lines, chars) = dummy_providers(); + let regex = compile_regex(&lines, &chars, "abc", false).unwrap(); + assert!(regex.is_match("abc")); + assert!(!regex.is_match("ABC")); + } + + #[test] + fn test_compile_re_case_insensitive() { + let (lines, chars) = dummy_providers(); + let regex = compile_regex(&lines, &chars, "abc", true).unwrap(); + assert!(regex.is_match("abc")); + assert!(regex.is_match("ABC")); + assert!(regex.is_match("AbC")); + } + + #[test] + fn test_compile_re_saved_and_reuse() { + // Save a regex + let (lines1, chars1) = dummy_providers(); + let _ = compile_regex(&lines1, &chars1, "abc", false).unwrap(); + + // Now try to reuse it + let (lines2, chars2) = dummy_providers(); + let reused = compile_regex(&lines2, &chars2, "", false).unwrap(); + + assert!(reused.is_match("abc")); + } + + #[test] + fn test_compile_re_empty_and_not_saved() { + // Clear saved regex + SAVED_REGEX.with(|cell| { + *cell.borrow_mut() = None; + }); + + let (lines, chars) = dummy_providers(); + let result = compile_regex(&lines, &chars, "", false); + assert!(result.is_err()); // Should fail because nothing was saved + } + + #[test] + fn test_compile_re_invalid() { + let (lines, chars) = dummy_providers(); + let result = 
compile_regex(&lines, &chars, "a[d", false); + assert!(result.is_err()); // Should fail due to open bracketed expression + } + + // compile_address + #[test] + fn test_compile_addr_line_number() { + let (lines, mut chars) = make_providers("42"); + let addr = compile_address(&lines, &mut chars).unwrap(); + assert!(matches!(addr.atype, AddressType::Line)); + if let AddressValue::LineNumber(n) = addr.value { + assert_eq!(n, 42); + } else { + panic!("expected LineNumber address value"); + } + } + + #[test] + fn test_compile_addr_relative_line() { + let (lines, mut chars) = make_providers("+7"); + let addr = compile_address(&lines, &mut chars).unwrap(); + assert!(matches!(addr.atype, AddressType::RelLine)); + if let AddressValue::LineNumber(n) = addr.value { + assert_eq!(n, 7); + } else { + panic!("expected LineNumber address value"); + } + } + + #[test] + fn test_compile_addr_last_line() { + let (lines, mut chars) = make_providers("$"); + let addr = compile_address(&lines, &mut chars).unwrap(); + assert!(matches!(addr.atype, AddressType::Last)); + } + + #[test] + fn test_compile_addr_regex() { + let (lines, mut chars) = make_providers("/hello/"); + let addr = compile_address(&lines, &mut chars).unwrap(); + assert!(matches!(addr.atype, AddressType::Re)); + if let AddressValue::Regex(re) = addr.value { + assert!(re.is_match("hello")); + } else { + panic!("expected Regex address value"); + } + } + + #[test] + fn test_compile_addr_regex_other_delimiter() { + let (lines, mut chars) = make_providers("\\#hello#"); + let addr = compile_address(&lines, &mut chars).unwrap(); + assert!(matches!(addr.atype, AddressType::Re)); + if let AddressValue::Regex(re) = addr.value { + assert!(re.is_match("hello")); + } else { + panic!("expected Regex address value"); + } + } + + #[test] + fn test_compile_addr_regex_with_modifier() { + let (lines, mut chars) = make_providers("/hello/I"); + let addr = compile_address(&lines, &mut chars).unwrap(); + assert!(matches!(addr.atype, 
AddressType::Re)); + if let AddressValue::Regex(re) = addr.value { + assert!(re.is_match("HELLO")); // case-insensitive + } else { + panic!("expected Regex address value"); + } + } + + #[test] + fn test_compile_addr_empty_regex_saved() { + // First save a regex + let (lines1, mut chars1) = make_providers("/saved/"); + let _ = compile_address(&lines1, &mut chars1).unwrap(); + + // Then reuse it with empty regex + let (lines2, mut chars2) = make_providers("//"); + let addr = compile_address(&lines2, &mut chars2).unwrap(); + assert!(matches!(addr.atype, AddressType::Re)); + if let AddressValue::Regex(re) = addr.value { + assert!(re.is_match("saved")); + } else { + panic!("expected Regex address value"); + } + } + + // compile_address_range + #[test] + fn test_compile_single_line_address() { + let (lines, mut chars) = make_providers("42"); + let mut cmd = Command::default(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 1); + assert!(matches!( + cmd.addr1.as_ref().unwrap().atype, + AddressType::Line + )); + } + + #[test] + fn test_compile_relative_address_range() { + let (lines, mut chars) = make_providers("2,+3"); + let mut cmd = Command::default(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 2); + + assert!(matches!( + cmd.addr1.as_ref().unwrap().atype, + AddressType::Line + )); + let v1 = match &cmd.addr1.as_ref().unwrap().value { + AddressValue::LineNumber(n) => *n, + _ => panic!(), + }; + assert_eq!(v1, 2); + + assert!(matches!( + cmd.addr2.as_ref().unwrap().atype, + AddressType::RelLine + )); + let v2 = match &cmd.addr2.as_ref().unwrap().value { + AddressValue::LineNumber(n) => *n, + _ => panic!(), + }; + assert_eq!(v2, 3); + } + + #[test] + fn test_compile_last_address() { + let (lines, mut chars) = make_providers("$"); + let mut cmd = Command::default(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 
1); + assert!(matches!( + cmd.addr1.as_ref().unwrap().atype, + AddressType::Last + )); + } + + #[test] + fn test_compile_absolute_address_range() { + let (lines, mut chars) = make_providers("5,10"); + let mut cmd = Command::default(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 2); + assert!(matches!( + cmd.addr1.as_ref().unwrap().atype, + AddressType::Line + )); + assert!(matches!( + cmd.addr2.as_ref().unwrap().atype, + AddressType::Line + )); + } + + #[test] + fn test_compile_regex_address() { + let (lines, mut chars) = make_providers("/foo/"); + let mut cmd = Command::default(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 1); + assert!(matches!(cmd.addr1.as_ref().unwrap().atype, AddressType::Re)); + if let AddressValue::Regex(re) = &cmd.addr1.as_ref().unwrap().value { + assert!(re.is_match("foo")); + assert!(!re.is_match("bar")); + } else { + panic!("expected a regex address"); + } + } + + #[test] + fn test_compile_regex_address_range_other_delimiter() { + let (lines, mut chars) = make_providers("\\#foo# , \\|bar|"); + let mut cmd = Command::default(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 2); + + assert!(matches!(cmd.addr1.as_ref().unwrap().atype, AddressType::Re)); + if let AddressValue::Regex(re) = &cmd.addr1.as_ref().unwrap().value { + assert!(re.is_match("foo")); + assert!(!re.is_match("bar")); + } else { + panic!("expected a regex address"); + } + + assert!(matches!(cmd.addr2.as_ref().unwrap().atype, AddressType::Re)); + if let AddressValue::Regex(re) = &cmd.addr2.as_ref().unwrap().value { + assert!(re.is_match("bar")); + assert!(!re.is_match("foo")); + } else { + panic!("expected a regex address"); + } + } + + #[test] + fn test_compile_regex_with_modifier() { + let (lines, mut chars) = make_providers("/foo/I"); + let mut cmd = Command::default(); + let n_addr = 
compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 1); + assert!(matches!(cmd.addr1.as_ref().unwrap().atype, AddressType::Re)); + if let AddressValue::Regex(re) = &cmd.addr1.as_ref().unwrap().value { + assert!(re.is_match("FOO")); + assert!(re.is_match("foo")); + } else { + panic!("expected a regex address with case-insensitive match"); + } + } + + #[test] + fn test_compile_re_reuse_saved() { + // First save a regex + let (lines1, mut chars1) = make_providers("/abc/"); + let mut cmd1 = Command::default(); + compile_address_range(&lines1, &mut chars1, &mut cmd1).unwrap(); + + // Now reuse it + let (lines2, mut chars2) = make_providers("//"); + let mut cmd2 = Command::default(); + let n_addr = compile_address_range(&lines2, &mut chars2, &mut cmd2).unwrap(); + + assert_eq!(n_addr, 1); + assert!(matches!( + cmd2.addr1.as_ref().unwrap().atype, + AddressType::Re + )); + if let AddressValue::Regex(re) = &cmd2.addr1.as_ref().unwrap().value { + assert!(re.is_match("abc")); + } + } + + // compile_thread + fn make_provider(lines: &[&str]) -> ScriptLineProvider { + let input = lines + .iter() + .map(|s| ScriptValue::StringVal(s.to_string())) + .collect(); + ScriptLineProvider::new(input) + } + + fn make_cli_options() -> CliOptions { + CliOptions::default() + } + + #[test] + fn test_compile_thread_empty_input() { + let mut provider = make_provider(&[]); + let mut opts = make_cli_options(); + + let result = compile_thread(&mut provider, &mut opts).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_compile_thread_comment_only() { + let mut provider = make_provider(&["# comment", " ", ";;"]); + let mut opts = make_cli_options(); + + let result = compile_thread(&mut provider, &mut opts).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_compile_thread_single_command() { + let mut provider = make_provider(&["42q"]); + let mut opts = make_cli_options(); + + let result = compile_thread(&mut provider, &mut opts).unwrap(); + 
let cmd = result.unwrap(); + + assert_eq!(cmd.code, 'q'); + + let addr = cmd.addr1.as_ref().expect("addr1 should be set"); + assert!(matches!(addr.atype, AddressType::Line)); + + let value = match &addr.value { + AddressValue::LineNumber(n) => *n, + _ => panic!(), + }; + assert_eq!(value, 42); + + assert!(cmd.next.is_none()); + } + + #[test] + fn test_compile_thread_multiple_lines() { + let mut provider = make_provider(&["1q", "2d"]); + let mut opts = make_cli_options(); + + let result = compile_thread(&mut provider, &mut opts).unwrap(); + let first = result.unwrap(); + + assert_eq!(first.code, 'q'); + let second = first.next.unwrap(); + assert_eq!(second.code, 'd'); + assert!(second.next.is_none()); + } + + #[test] + fn test_compile_thread_single_line_multiple_commands() { + let mut provider = make_provider(&["1q;2d"]); + let mut opts = make_cli_options(); + + let result = compile_thread(&mut provider, &mut opts).unwrap(); + let first = result.unwrap(); + + assert_eq!(first.code, 'q'); + let second = first.next.unwrap(); + assert_eq!(second.code, 'd'); + assert!(second.next.is_none()); + } } diff --git a/src/uu/sed/src/delimited_parser.rs b/src/uu/sed/src/delimited_parser.rs new file mode 100644 index 0000000..ff62df2 --- /dev/null +++ b/src/uu/sed/src/delimited_parser.rs @@ -0,0 +1,918 @@ +// Parse delimited character sequences +// +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis +// +// This file is part of the uutils sed package. +// It is licensed under the MIT License. +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. 
+ +use crate::script_char_provider::ScriptCharProvider; +use crate::script_line_provider::ScriptLineProvider; +use std::char; +use uucore::error::{UResult, USimpleError}; + +// Fail with msg as a compile error at the current location +pub fn compilation_error( + lines: &ScriptLineProvider, + line: &ScriptCharProvider, + msg: impl ToString, +) -> UResult { + Err(USimpleError::new( + 1, + format!( + "{}:{}:{}: error: {}", + lines.get_input_name(), + lines.get_line_number(), + line.get_pos(), + msg.to_string() + ), + )) +} + +/// Return true if c is a valid octal digit +fn is_ascii_octal_digit(c: char) -> bool { + matches!(c, '0'..='7') +} + +/// Parse a numeric character escape and return the corresponding char. +/// Advance line to the first character not part of the escape. +/// ndigits is the number of allowed digits and radix is the value's +/// radix (e.g. 8, 10, 16 for octal, decimal, and hex escapes). +/// For values up to 3 ndigits is the maximum number of allowed digits, +/// for values above 3 ndigits is the exact number of allowed digits. +/// Return `None` if no valid character has been specified. +fn parse_numeric_escape( + line: &mut ScriptCharProvider, + is_allowed_char: fn(char) -> bool, + ndigits: usize, + radix: u32, +) -> Option { + let mut valid_chars = Vec::new(); + + for _ in 0..ndigits { + if !line.eol() && is_allowed_char(line.current()) { + valid_chars.push(line.current()); + line.advance(); + } else { + break; + } + } + + if valid_chars.is_empty() { + return None; + } + + if ndigits > 3 && valid_chars.len() != ndigits { + line.retreat(valid_chars.len()); + return None; + } + + let char_string: String = valid_chars.into_iter().collect(); + match u32::from_str_radix(&char_string, radix) + .ok() + .and_then(char::from_u32) + { + Some(decoded) => Some(decoded), + None => panic!("Unable to decode numeric character escape."), + } +} + +/// Transforms the specified character into the corresponding ASCII +/// control character as follows. 
+/// - Convert lowercase letters to uppercase +/// - XOR the ASCII value with 0x40 (inverts bit 6) +/// +/// Return `None` if the result is not a valid Unicode scalar. +fn create_control_char(x: char) -> Option<char> { + if !x.is_ascii() { + return None; + } + + let mut c = x; + if c.is_ascii_lowercase() { + c = c.to_ascii_uppercase(); + } + + let transformed = (c as u8) ^ 0x40; + char::from_u32(transformed as u32) +} + +/// Parse a character escape valid in all contexts (RE pattern, substitution, +/// transliteration) and return the corresponding char. +/// At entry line.current() must have advanced after the `\\`. +/// Advance line to the first character not part of the escape. +/// Return `None` if an invalid escape has been specified. +fn parse_char_escape(line: &mut ScriptCharProvider) -> Option<char> { + match line.current() { + 'a' => { + line.advance(); + Some('\x07') + } + 'f' => { + line.advance(); + Some('\x0c') + } + 'n' => { + line.advance(); + Some('\n') + } + 'r' => { + line.advance(); + Some('\r') + } + 't' => { + line.advance(); + Some('\t') + } + 'v' => { + line.advance(); + Some('\x0b') + } + + 'c' => { + // Control character escape: \cC + line.advance(); // move past 'c' + match create_control_char(line.current()) { + Some(decoded) => { + line.advance(); + Some(decoded) + } + None => Some('c'), + } + } + + 'd' => { + // Decimal escape: \dnnn + line.advance(); // move past 'd' + match parse_numeric_escape(line, |c| c.is_ascii_digit(), 3, 10) { + Some(decoded) => Some(decoded), + None => Some('d'), + } + } + + 'o' => { + // Octal escape: \onnn + line.advance(); // move past 'o' + match parse_numeric_escape(line, is_ascii_octal_digit, 3, 8) { + Some(decoded) => Some(decoded), + None => Some('o'), + } + } + + 'u' => { + // Short Unicode escape \uXXXX (exactly four hex digits) + line.advance(); // move past 'u' + match parse_numeric_escape(line, |c| c.is_ascii_hexdigit(), 4, 16) { + Some(decoded) => Some(decoded), + None => Some('u'), + } + } + + 'U' => { + // 
Long Unicode escape \UXXXXXXXX (exactly eight hex digits) + line.advance(); // move past 'U' + match parse_numeric_escape(line, |c| c.is_ascii_hexdigit(), 8, 16) { + Some(decoded) => Some(decoded), + None => Some('U'), + } + } + + 'x' => { + // Hexadecimal escape: \xnn + line.advance(); // move past 'x' + match parse_numeric_escape(line, |c| c.is_ascii_hexdigit(), 2, 16) { + Some(decoded) => Some(decoded), + None => Some('x'), + } + } + _ => None, + } +} + +/// Parse a POSIX RE character class returning it as a string. +/// This functionality is needed to avoid terminating delimited +/// sequences when a delimiter appears within a character class. +/// While at it, handle escaped characters for the sake of consistency. +fn parse_character_class( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, +) -> UResult<String> { + let mut result = String::new(); + + if line.eol() || line.current() != '[' { + panic!("Invalid character class."); + } + + line.advance(); + result.push('['); + + // Optional negation + if !line.eol() && line.current() == '^' { + result.push('^'); + line.advance(); + } + + // Optional leading ']' inside the class + if !line.eol() && line.current() == ']' { + result.push(']'); + line.advance(); + } + + while !line.eol() { + let ch = line.current(); + + if ch == ']' { + result.push(']'); + line.advance(); + return Ok(result); + } + + if ch == '[' { + line.advance(); + if !line.eol() { + let marker = line.current(); + // POSIX character class, collating symbol, or equivalence + if marker == ':' || marker == '.' 
|| marker == '=' { + line.advance(); + + result.push('['); + result.push(marker); + + let mut inner = String::new(); + let mut terminated = false; + + while !line.eol() { + let c = line.current(); + if c == marker { + line.advance(); + if !line.eol() && line.current() == ']' { + line.advance(); + result.push_str(&inner); + result.push(marker); + result.push(']'); + terminated = true; + break; + } else { + // False alarm, just part of the inner name + inner.push(marker); + } + } else { + inner.push(c); + line.advance(); + } + } + + if !terminated { + return compilation_error( + lines, + line, + "Unterminated POSIX character class, equivalence or collating symbol", + ); + } + + continue; + } else { + // Not a POSIX construct — treat as literal + result.push('['); + result.push(marker); + line.advance(); + continue; + } + } else { + result.push('['); + continue; + } + } + + if ch == '\\' { + // Handle escape sequence + line.advance(); + if line.eol() { + break; + } + match parse_char_escape(line) { + Some(decoded) => result.push(decoded), + None => { + result.push('\\'); + result.push(line.current()); + line.advance(); + } + } + } else { + result.push(ch); + line.advance(); + } + } + + compilation_error(lines, line, "Unterminated bracket expression") +} + +/// Scan and return the opening delimiter of a delimited string +/// Advances the line past the opening delimiter +fn scan_delimiter(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { + // Sanity check + if line.eol() { + return compilation_error(lines, line, "unexpected end of line".to_string()); + } + + let delimiter = line.current(); + if delimiter == '\\' { + return compilation_error(lines, line, "\\ cannot be used as a string delimiter"); + } + line.advance(); // skip the opening delimiter + Ok(delimiter) +} + +/// Parse the regular expression delimited by the current line +/// character and return it as a string. +/// On return the line is on the closing delimiter. 
+pub fn parse_regex(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { + let delimiter = scan_delimiter(lines, line)?; + let mut result = String::new(); + + while !line.eol() { + match line.current() { + '[' if delimiter != '[' => { + let cc = parse_character_class(lines, line)?; + result.push_str(&cc); + continue; + } + '\\' => { + line.advance(); + if line.eol() { + return compilation_error(lines, line, "unterminated regular expression"); + } + if line.current() == delimiter { + // Push escaped delimiter + result.push(line.current()); + line.advance(); + continue; + } + match parse_char_escape(line) { + Some(decoded) => result.push(decoded), + None => { + // Pass through \ to RE engine for further treatment + result.push('\\'); + result.push(line.current()); + line.advance(); + } + } + continue; + } + c if c == delimiter => return Ok(result), + c => result.push(c), + } + line.advance(); + } + compilation_error(lines, line, "unterminated regular expression") +} + +/// Parse the transliteration string delimited by the current line +/// character and return it as a string. +/// On return the line is on the closing delimiter. 
+pub fn parse_transliteration( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, +) -> UResult<String> { + let delimiter = scan_delimiter(lines, line)?; + let mut result = String::new(); + + while !line.eol() { + match line.current() { + '\\' => { + line.advance(); + if line.eol() { + return compilation_error(lines, line, "unterminated transliteration string"); + } + if line.current() == delimiter || line.current() == '\\' { + // Push only the escaped character + result.push(line.current()); + line.advance(); + continue; + } + match parse_char_escape(line) { + Some(decoded) => result.push(decoded), + None => { + // Pass through \ to tr for literal use + result.push('\\'); + result.push(line.current()); + line.advance(); + } + } + continue; + } + c if c == delimiter => return Ok(result), + c => result.push(c), + } + line.advance(); + } + compilation_error(lines, line, "unterminated transliteration string") +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_providers(input: &str) -> (ScriptLineProvider, ScriptCharProvider) { + let lines = ScriptLineProvider::new(vec![]); // Empty for tests + let line = ScriptCharProvider::new(input); + (lines, line) + } + + // parse_numeric_escape + #[test] + fn test_compile_octal_escape() { + let mut provider = ScriptCharProvider::new("141rest"); + let c = parse_numeric_escape(&mut provider, is_ascii_octal_digit, 3, 8); + assert_eq!(c, Some('a')); + assert_eq!(provider.current(), 'r'); // "141" was consumed + } + + #[test] + fn test_compile_octal_escape_eol() { + let mut provider = ScriptCharProvider::new("141"); + let c = parse_numeric_escape(&mut provider, is_ascii_octal_digit, 3, 8); + assert_eq!(c, Some('a')); + assert!(provider.eol()); // "141" was consumed + } + + #[test] + fn test_compile_decimal_escape() { + let mut provider = ScriptCharProvider::new("0659"); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_digit(), 3, 10); + assert_eq!(c, Some('A')); + assert_eq!(provider.current(), '9'); // "065" 
was consumed + } + + #[test] + fn test_compile_decimal_invalid() { + let mut provider = ScriptCharProvider::new("QR"); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_digit(), 3, 10); + assert_eq!(c, None); + assert_eq!(provider.current(), 'Q'); + } + + #[test] + fn test_compile_hex_escape() { + let mut provider = ScriptCharProvider::new("3cZ"); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 2, 16); + assert_eq!(c, Some('<')); + assert_eq!(provider.current(), 'Z'); // "41" was consumed + } + + #[test] + fn test_compile_hex_escape_truncated() { + let mut provider = ScriptCharProvider::new("4G"); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 2, 16); + assert_eq!(c, Some('\u{4}')); // Only '4' is valid hex + assert_eq!(provider.current(), 'G'); // "41" was consumed + } + + #[test] + fn test_compile_unicode_escape_short() { + // U+2665 = '♄' + let mut provider = ScriptCharProvider::new("26650"); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 4, 16); + assert_eq!(c, Some('♄')); + assert_eq!(provider.current(), '0'); // "2665" was consumed + } + + #[test] + fn test_compile_unicode_escape_short_invalid() { + let mut provider = ScriptCharProvider::new("123Q"); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 4, 16); + assert_eq!(c, None); + assert_eq!(provider.current(), '1'); + } + + #[test] + fn test_compile_unicode_escape_long_invalid() { + // U+2665 = '♄' + let mut provider = ScriptCharProvider::new("1234567Q"); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 8, 16); + assert_eq!(c, None); + assert_eq!(provider.current(), '1'); + } + + #[test] + fn test_compile_unicode_escape_long() { + // U+1F600 = 😀 + let mut provider = ScriptCharProvider::new("0001F6009"); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 8, 16); + assert_eq!(c, Some('😀')); + assert_eq!(provider.current(), '9'); // "0001F600" was consumed 
+ } + + #[test] + fn test_no_valid_digits() { + let mut provider = ScriptCharProvider::new("xyz"); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_digit(), 3, 10); + assert_eq!(c, None); + assert_eq!(provider.current(), 'x'); // No advancement + } + + // create_control_char + #[test] + fn test_lowercase_letter() { + assert_eq!(create_control_char('z'), Some('\u{1a}')); // 0x5A ^ 0x40 = 0x1A + assert_eq!(create_control_char('a'), Some('\u{01}')); // 0x41 ^ 0x40 = 0x01 + } + + #[test] + fn test_uppercase_letter() { + assert_eq!(create_control_char('Z'), Some('\u{1a}')); + assert_eq!(create_control_char('A'), Some('\u{01}')); + } + + #[test] + fn test_symbol_characters() { + assert_eq!(create_control_char('{'), Some(';')); // 0x7B ^ 0x40 = 0x3B + assert_eq!(create_control_char(';'), Some('{')); // 0x3B ^ 0x40 = 0x7B + } + + #[test] + fn test_non_ascii_char() { + // This will not match any transformation and may panic if it overflows + // But the current function only handles ASCII-safe chars + assert_eq!(create_control_char('Ă©'), None); // outside ASCII + } + + #[test] + fn test_edge_ascii_values() { + assert_eq!(create_control_char('@'), Some('\0')); // 0x40 ^ 0x40 = 0x00 + assert_eq!(create_control_char('\x7F'), Some('\x3F')); // 0x7F ^ 0x40 = 0x3F + } + + // parse_char_escape + fn escape_result_with_current(input: &str) -> (Option, Option) { + let mut provider = ScriptCharProvider::new(input); + let result = parse_char_escape(&mut provider); + let current = if provider.eol() { + None + } else { + Some(provider.current()) + }; + (result, current) + } + + #[test] + fn test_standard_escapes_eol() { + assert_eq!(escape_result_with_current("a"), (Some('\x07'), None)); + assert_eq!(escape_result_with_current("f"), (Some('\x0c'), None)); + assert_eq!(escape_result_with_current("n"), (Some('\n'), None)); + assert_eq!(escape_result_with_current("r"), (Some('\r'), None)); + assert_eq!(escape_result_with_current("t"), (Some('\t'), None)); + 
assert_eq!(escape_result_with_current("v"), (Some('\x0b'), None)); + } + + #[test] + fn test_standard_escapes_more() { + assert_eq!(escape_result_with_current("a."), (Some('\x07'), Some('.'))); + assert_eq!(escape_result_with_current("f."), (Some('\x0c'), Some('.'))); + assert_eq!(escape_result_with_current("n."), (Some('\n'), Some('.'))); + assert_eq!(escape_result_with_current("r."), (Some('\r'), Some('.'))); + assert_eq!(escape_result_with_current("t."), (Some('\t'), Some('.'))); + assert_eq!(escape_result_with_current("v."), (Some('\x0b'), Some('.'))); + } + + #[test] + fn test_escape_invalid() { + assert_eq!(escape_result_with_current("zx"), (None, Some('z'))); + } + + #[test] + fn test_control_escape_valid() { + assert_eq!(escape_result_with_current("cZ"), (Some('\x1A'), None)); + } + + #[test] + fn test_control_escape_invalid() { + assert_eq!(escape_result_with_current("cĂ©"), (Some('c'), Some('Ă©'))); + } + + #[test] + fn test_decimal_escape_valid() { + assert_eq!(escape_result_with_current("d065r"), (Some('A'), Some('r'))); + } + + #[test] + fn test_octal_escape_valid() { + assert_eq!(escape_result_with_current("o141x"), (Some('a'), Some('x'))); + } + + #[test] + fn test_hex_escape_valid() { + assert_eq!(escape_result_with_current("x41;"), (Some('A'), Some(';'))); + } + + #[test] + fn test_short_unicode_escape_valid() { + assert_eq!(escape_result_with_current("u2665;"), (Some('♄'), Some(';'))); + } + + #[test] + fn test_long_unicode_escape_valid() { + assert_eq!( + escape_result_with_current("U0001F600;"), + (Some('😀'), Some(';')) + ); + } + + #[test] + fn test_decimal_escape_fallback() { + assert_eq!(escape_result_with_current("d;."), (Some('d'), Some(';'))); + } + + #[test] + fn test_octal_escape_fallback() { + assert_eq!(escape_result_with_current("o9x"), (Some('o'), Some('9'))); + } + + #[test] + fn test_hex_escape_fallback() { + assert_eq!(escape_result_with_current("xyz"), (Some('x'), Some('y'))); + } + + #[test] + fn test_unknown_escape() { + 
assert_eq!(escape_result_with_current("q"), (None, Some('q'))); + } + + // parse_character_class + fn char_provider_from(input: &str) -> ScriptCharProvider { + ScriptCharProvider::new(input) + } + + fn test_lines() -> ScriptLineProvider { + ScriptLineProvider::with_active_state("test.sed", 3) + } + + #[test] + fn test_basic_character_class() { + let mut line = char_provider_from("[qr]"); + let lines = test_lines(); + let result = parse_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[qr]"); + } + + #[test] + fn test_negated_class() { + let mut line = char_provider_from("[^abc]"); + let lines = test_lines(); + let result = parse_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[^abc]"); + } + + #[test] + fn test_leading_close_bracket() { + let mut line = char_provider_from("[]abc]"); + let lines = test_lines(); + let result = parse_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[]abc]"); + } + + #[test] + fn test_leading_negated_close_bracket() { + let mut line = char_provider_from("[^]abc]"); + let lines = test_lines(); + let result = parse_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[^]abc]"); + } + + #[test] + fn test_escaped_character_begin() { + let mut line = char_provider_from("[\\nabc]"); + let lines = test_lines(); + let result = parse_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[\nabc]"); + } + + #[test] + fn test_escaped_character_middle() { + let mut line = char_provider_from("[a\\nbc]"); + let lines = test_lines(); + let result = parse_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[a\nbc]"); + } + + #[test] + fn test_escaped_character_end() { + let mut line = char_provider_from("[abc\\n]"); + let lines = test_lines(); + let result = parse_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[abc\n]"); + } + + #[test] + fn test_escaped_delimiter() { + let mut line = char_provider_from("[a\\]bc]"); + let lines = 
test_lines(); + let result = parse_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[a\\]bc]"); + } + + #[test] + fn test_posix_class() { + let mut line = char_provider_from("[[:digit:]]"); + let lines = test_lines(); + let result = parse_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[[:digit:]]"); + } + + #[test] + fn test_equivalence_class() { + let mut line = char_provider_from("[[=a=]]"); + let lines = test_lines(); + let result = parse_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[[=a=]]"); + } + + #[test] + fn test_collating_symbol() { + let mut line = char_provider_from("[[.ch.]]"); + let lines = test_lines(); + let result = parse_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[[.ch.]]"); + } + + #[test] + fn test_unterminated_class_error() { + let mut line = char_provider_from("[abc"); // missing closing ] + let lines = test_lines(); + let err = parse_character_class(&lines, &mut line); + assert!(err.is_err()); + } + + #[test] + fn test_unterminated_posix_class_error() { + let mut line = char_provider_from("[[:digit:]"); + let lines = test_lines(); + let err = parse_character_class(&lines, &mut line); + assert!(err.is_err()); + } + + #[test] + fn test_unterminated_escape_error() { + let mut line = char_provider_from("[abc\\"); // missing closing ] + let lines = test_lines(); + let err = parse_character_class(&lines, &mut line); + assert!(err.is_err()); + } + + #[test] + fn test_malformed_posix_like_pattern_treated_as_literal() { + let mut line = char_provider_from("[[x]yz]"); + let lines = test_lines(); + let result = parse_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[[x]"); + } + + // parse_regex + #[test] + fn test_simple_regex() { + let (lines, mut line) = make_providers("/abc/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "abc"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn 
test_regex_with_escaped_delimiter() { + let (lines, mut line) = make_providers("/ab\\/c/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "ab/c"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_regex_with_escape_sequence() { + let (lines, mut line) = make_providers("/ab\\n/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "ab\n"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn errors_on_unterminated_regex() { + let (lines, mut line) = make_providers("/unterminated"); + let err = parse_regex(&lines, &mut line).unwrap_err(); + assert!(err.to_string().contains("unterminated regular expression")); + } + + #[test] + fn errors_on_esc_at_re_eol() { + let (lines, mut line) = make_providers("/foo\\"); + let err = parse_regex(&lines, &mut line).unwrap_err(); + assert!(err.to_string().contains("unterminated regular expression")); + } + + #[test] + fn errors_on_backslash_delimiter() { + let (lines, mut line) = make_providers("\\bad"); + let err = parse_regex(&lines, &mut line).unwrap_err(); + assert!(err + .to_string() + .contains("\\ cannot be used as a string delimiter")); + } + + #[test] + fn test_regex_with_character_class() { + let (lines, mut line) = make_providers("/[a-z]/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "[a-z]"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_regex_with_bracket_delimiter() { + let (lines, mut line) = make_providers("[abc["); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "abc"); + assert_eq!(line.current(), '['); + } + + #[test] + fn test_bracket_regex_with_bracket_delimiter() { + let (lines, mut line) = make_providers("[a\\[0-9]bc["); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "a[0-9]bc"); + assert_eq!(line.current(), '['); + } + + #[test] + fn test_regex_with_escaped_bracket_in_character_class() { + let (lines, mut line) = 
make_providers("/[a\\]z]/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "[a\\]z]"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_regex_with_delimiter_inside_character_class() { + let (lines, mut line) = make_providers("/[a/c]/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "[a/c]"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_regex_with_escaped_paren_and_backslash() { + let (lines, mut line) = make_providers("/\\(\\\\/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "\\(\\\\"); + assert_eq!(line.current(), '/'); + } + + // parse_transliteration + #[test] + fn test_simple_transliteration() { + let (lines, mut line) = make_providers("/abc/"); + let parsed = parse_transliteration(&lines, &mut line).unwrap(); + assert_eq!(parsed, "abc"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_transliteration_with_escaped_delimiter() { + let (lines, mut line) = make_providers("/ab\\/c/"); + let parsed = parse_transliteration(&lines, &mut line).unwrap(); + assert_eq!(parsed, "ab/c"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_transliteration_with_escaped_backslash() { + let (lines, mut line) = make_providers("/ab\\\\c/"); + let parsed = parse_transliteration(&lines, &mut line).unwrap(); + assert_eq!(parsed, "ab\\c"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_transliteration_with_escape_sequence() { + let (lines, mut line) = make_providers("/ab\\n/"); + let parsed = parse_transliteration(&lines, &mut line).unwrap(); + assert_eq!(parsed, "ab\n"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn errors_on_unterminated_transliteration() { + let (lines, mut line) = make_providers("/unterminated"); + let err = parse_transliteration(&lines, &mut line).unwrap_err(); + assert!(err + .to_string() + .contains("unterminated transliteration string")); + } + + #[test] + fn errors_on_esc_at_tr_eol() { + let 
(lines, mut line) = make_providers("/foo\\"); + let err = parse_transliteration(&lines, &mut line).unwrap_err(); + assert!(err + .to_string() + .contains("unterminated transliteration string")); + } +} diff --git a/src/uu/sed/src/script_char_provider.rs b/src/uu/sed/src/script_char_provider.rs index 6f40469..bb537f3 100644 --- a/src/uu/sed/src/script_char_provider.rs +++ b/src/uu/sed/src/script_char_provider.rs @@ -28,6 +28,15 @@ impl ScriptCharProvider { } } + /// Retreats current position by specified number or to beginning. + pub fn retreat(&mut self, n: usize) { + if n > self.pos { + self.pos = 0; + } else { + self.pos -= n; + } + } + /// Returns the current character. Panics if out of bounds. pub fn current(&self) -> char { self.line[self.pos] @@ -98,4 +107,34 @@ mod tests { provider.eat_spaces(); assert_eq!(provider.current(), 'a'); } + + #[test] + fn test_retreat_normal() { + let mut chars = ScriptCharProvider::new("abcdef"); + chars.pos = 4; // simulate position at 'e' + chars.retreat(2); + + assert_eq!(chars.get_pos(), 2); + assert_eq!(chars.current(), 'c'); + } + + #[test] + fn test_retreat_to_start() { + let mut chars = ScriptCharProvider::new("abcdef"); + chars.pos = 3; // simulate position at 'd' + chars.retreat(5); // retreat more than current pos + + assert_eq!(chars.get_pos(), 0); + assert_eq!(chars.current(), 'a'); + } + + #[test] + fn test_retreat_zero() { + let mut chars = ScriptCharProvider::new("abcdef"); + chars.pos = 2; // at 'c' + chars.retreat(0); // retreat by 0 + + assert_eq!(chars.get_pos(), 2); + assert_eq!(chars.current(), 'c'); + } } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 2a3dae8..1f73629 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -10,6 +10,7 @@ pub mod command; pub mod compiler; +pub mod delimited_parser; pub mod processor; pub mod script_char_provider; pub mod script_line_provider;