Skip to content
This repository was archived by the owner on Jun 2, 2025. It is now read-only.

Fix identifier (un)escaping #47

Merged
merged 7 commits into from
May 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
411 changes: 399 additions & 12 deletions tests/WP_SQLite_Driver_Tests.php

Large diffs are not rendered by default.

120 changes: 60 additions & 60 deletions tests/WP_SQLite_Driver_Translation_Tests.php

Large diffs are not rendered by default.

23 changes: 18 additions & 5 deletions wp-includes/mysql/class-wp-mysql-lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -2130,7 +2130,7 @@ class WP_MySQL_Lexer {
*
* @var int
*/
private $sql_modes;
private $sql_modes = 0;

/**
* How many bytes from the original SQL payload have been read and tokenized.
Expand Down Expand Up @@ -2181,16 +2181,28 @@ class WP_MySQL_Lexer {
/**
* @param string $sql The SQL payload to tokenize.
* @param int $mysql_version The version of the MySQL server that the SQL payload is intended for.
* @param int $sql_modes The SQL modes that should be considered active during tokenization.
* @param string[] $sql_modes The SQL modes that should be considered active during tokenization.
*/
public function __construct(
string $sql,
int $mysql_version = 80038,
int $sql_modes = 0
array $sql_modes = array()
) {
$this->sql = $sql;
$this->mysql_version = $mysql_version;
$this->sql_modes = $sql_modes;

foreach ( $sql_modes as $sql_mode ) {
$sql_mode = strtoupper( $sql_mode );
if ( 'HIGH_NOT_PRECEDENCE' === $sql_mode ) {
$this->sql_modes |= self::SQL_MODE_HIGH_NOT_PRECEDENCE;
} elseif ( 'PIPES_AS_CONCAT' === $sql_mode ) {
$this->sql_modes |= self::SQL_MODE_PIPES_AS_CONCAT;
} elseif ( 'IGNORE_SPACE' === $sql_mode ) {
$this->sql_modes |= self::SQL_MODE_IGNORE_SPACE;
} elseif ( 'NO_BACKSLASH_ESCAPES' === $sql_mode ) {
$this->sql_modes |= self::SQL_MODE_NO_BACKSLASH_ESCAPES;
}
}
}

/**
Expand Down Expand Up @@ -2251,7 +2263,8 @@ public function get_token(): ?WP_MySQL_Token {
$this->token_type,
$this->token_starts_at,
$this->bytes_already_read - $this->token_starts_at,
$this->sql
$this->sql,
$this->is_sql_mode_active( self::SQL_MODE_NO_BACKSLASH_ESCAPES )
);
}

Expand Down
144 changes: 144 additions & 0 deletions wp-includes/mysql/class-wp-mysql-token.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,33 @@
* and consumed by WP_MySQL_Parser during the parsing process.
*/
class WP_MySQL_Token extends WP_Parser_Token {
/**
* Whether the NO_BACKSLASH_ESCAPES SQL mode is enabled.
*
* @var bool
*/
private $sql_mode_no_backslash_escapes_enabled;

/**
* Constructor.
*
* Forwards the token type and its position in the input to the parent
* WP_Parser_Token constructor, and stores the NO_BACKSLASH_ESCAPES flag
* on the token instance. The flag is read by get_value() to decide whether
* a backslash inside a quoted token is an escape character or a literal.
*
* @param int $id Token type.
* @param int $start Byte offset in the input where the token begins.
* @param int $length Byte length of the token in the input.
* @param string $input Input bytes from which the token was parsed.
* @param bool $sql_mode_no_backslash_escapes_enabled Whether the NO_BACKSLASH_ESCAPES SQL mode is enabled.
*/
public function __construct(
int $id,
int $start,
int $length,
string $input,
bool $sql_mode_no_backslash_escapes_enabled
) {
parent::__construct( $id, $start, $length, $input );
// Keep the SQL mode on the token itself so that get_value() needs no arguments.
$this->sql_mode_no_backslash_escapes_enabled = $sql_mode_no_backslash_escapes_enabled;
}

/**
* Get the name of the token.
*
Expand All @@ -24,6 +51,123 @@ public function get_name(): string {
return $name;
}

/**
 * Get the real unquoted value of the token.
 *
 * For quoted tokens (single-quoted text, double-quoted text, and
 * backtick-quoted identifiers), the bounding quotes are removed and the
 * escape sequences inside the quotes are resolved. For any other token
 * type, the raw token bytes are returned unchanged.
 *
 * Whether a backslash acts as an escape character depends on the
 * NO_BACKSLASH_ESCAPES SQL mode. The token itself knows nothing about SQL
 * modes, so the tokenizer passes the mode as a boolean flag to the token
 * constructor. This keeps the method argument-less, and the value is only
 * computed when it is requested.
 *
 * @return string The token value.
 */
public function get_value(): string {
	$value = $this->get_bytes();
	if (
		WP_MySQL_Lexer::SINGLE_QUOTED_TEXT === $this->id
		|| WP_MySQL_Lexer::DOUBLE_QUOTED_TEXT === $this->id
		|| WP_MySQL_Lexer::BACK_TICK_QUOTED_ID === $this->id
	) {
		// Remove bounding quotes.
		$quote = $value[0];
		$value = substr( $value, 1, -1 );

		/*
		 * When the NO_BACKSLASH_ESCAPES SQL mode is enabled, we only need to
		 * handle escaped bounding quotes, as the other characters preserve
		 * their literal values.
		 */
		if ( $this->sql_mode_no_backslash_escapes_enabled ) {
			return str_replace( $quote . $quote, $quote, $value );
		}

		/*
		 * NOTE(review): MySQL itself appears to treat a backslash inside a
		 * backtick-quoted identifier as a literal character (only a doubled
		 * backtick is an escape). Confirm whether BACK_TICK_QUOTED_ID should
		 * skip the backslash handling below.
		 */

		/**
		 * Unescape MySQL escape sequences.
		 *
		 * MySQL string literals use backslash as an escape character, and
		 * the string bounding quotes can also be escaped by being doubled.
		 *
		 * The escaping is done according to the following rules:
		 *
		 *   1. Some special character escape sequences are recognized.
		 *      For example, "\n" is a newline character, "\0" is ASCII NULL.
		 *   2. A specific treatment is applied to "\%" and "\_" sequences.
		 *      This is due to their special meaning for pattern matching.
		 *   3. Other backslash-prefixed characters resolve to their literal
		 *      values. For example, "\x" represents "x", "\\" represents "\".
		 *
		 * Despite looking similar, these rules are different from the C-style
		 * string escaping, so we cannot use "strip(c)slashes()" in this case.
		 *
		 * See: https://dev.mysql.com/doc/refman/8.4/en/string-literals.html
		 */
		$backslash    = chr( 92 );
		$replacements = array(
			/*
			 * MySQL special character escape sequences.
			 */
			( $backslash . '0' ) => chr( 0 ),  // An ASCII NULL character (\0).
			( $backslash . "'" ) => chr( 39 ), // A single quote character (').
			( $backslash . '"' ) => chr( 34 ), // A double quote character (").
			( $backslash . 'b' ) => chr( 8 ),  // A backspace character.
			( $backslash . 'n' ) => chr( 10 ), // A newline (linefeed) character (\n).
			( $backslash . 'r' ) => chr( 13 ), // A carriage return character (\r).
			( $backslash . 't' ) => chr( 9 ),  // A tab character (\t).
			( $backslash . 'Z' ) => chr( 26 ), // An ASCII 26 (Control+Z) character.

			/*
			 * Normalize escaping of "%" and "_" characters.
			 *
			 * MySQL has unusual handling for "\%" and "\_" in all string literals.
			 * While other sequences follow the C-style escaping ("\?" is "?", etc.),
			 * "\%" resolves to "\%" and "\_" resolves to "\_" (unlike in C strings).
			 *
			 * This means that "\%" behaves like "\\%", and "\_" behaves like "\\_".
			 * To preserve this behavior, we need to add a second backslash here.
			 *
			 * From https://dev.mysql.com/doc/refman/8.4/en/string-literals.html:
			 *   > The \% and \_ sequences are used to search for literal instances
			 *   > of % and _ in pattern-matching contexts where they would otherwise
			 *   > be interpreted as wildcard characters. If you use \% or \_ outside
			 *   > of pattern-matching contexts, they evaluate to the strings \% and
			 *   > \_, not to % and _.
			 */
			( $backslash . '%' ) => $backslash . $backslash . '%',
			( $backslash . '_' ) => $backslash . $backslash . '_',

			/*
			 * Preserve a double backslash as-is, so that the trailing backslash
			 * is not consumed as the beginning of an escape sequence like "\n".
			 *
			 * Resolving "\\" to "\" will be handled in the next step, where all
			 * other backslash-prefixed characters resolve to their literal values.
			 */
			( $backslash . $backslash )
				=> $backslash . $backslash,

			/*
			 * The bounding quotes can also be escaped by being doubled.
			 */
			( $quote . $quote ) => $quote,
		);

		/*
		 * Apply the replacements.
		 *
		 * It is important to use "strtr()" and not "str_replace()", because
		 * "str_replace()" applies replacements one after another, modifying
		 * intermediate changes rather than just the original string:
		 *
		 *   - str_replace( [ 'a', 'b' ], [ 'b', 'c' ], 'ab' ); // 'cc' (bad)
		 *   - strtr( 'ab', [ 'a' => 'b', 'b' => 'c' ] );       // 'bc' (good)
		 */
		$value = strtr( $value, $replacements );

		/*
		 * A backslash with any other character represents the character itself.
		 * That is, \x evaluates to x, \\ evaluates to \, and \🙂 evaluates to 🙂.
		 *
		 * The pattern is matched byte-wise and with the "s" (DOTALL) modifier:
		 *
		 *   - Without "s", "." does not match a newline, so a backslash that is
		 *     followed by a literal line break would be left in the value.
		 *   - With "u", "preg_replace()" returns null when the value is not
		 *     valid UTF-8 (e.g. binary data in a string literal), which would
		 *     violate the "string" return type of this method.
		 *
		 * Byte-wise matching is safe for UTF-8 input, because the backslash
		 * byte (0x5C) never occurs inside a multibyte sequence. Removing the
		 * backslash therefore keeps the following character intact.
		 */
		$preg_quoted_backslash = preg_quote( $backslash, '/' );
		$value                 = preg_replace( "/$preg_quoted_backslash(.)/s", '$1', $value );
	}
	return $value;
}

/**
* Get the token representation as a string.
*
Expand Down
13 changes: 11 additions & 2 deletions wp-includes/parser/class-wp-parser-token.php
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,20 @@ public function __construct(
}

/**
* Get the token value as raw bytes from the input.
* Get the raw bytes of the token from the input.
*
* @return string The token bytes.
*/
public function get_bytes(): string {
	// Cut the token's byte range out of the full input.
	$offset = $this->start;
	$size   = $this->length;

	return substr( $this->input, $offset, $size );
}

/**
* Get the real unquoted value of the token.
*
* @return string The token value.
*/
public function get_value(): string {
return substr( $this->input, $this->start, $this->length );
return $this->get_bytes();
}
}
Loading