Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MONGOCRYPT-755 Implement StrEncode #928

Open
wants to merge 25 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ set (MONGOCRYPT_SOURCES
src/mc-range-encoding.c
src/mc-rangeopts.c
src/mc-reader.c
src/mc-str-encode-string-sets.c
src/mc-text-search-str-encode.c
src/mc-tokens.c
src/mc-writer.c
src/mongocrypt-binary.c
Expand Down Expand Up @@ -474,6 +476,7 @@ set (TEST_MONGOCRYPT_SOURCES
test/test-mc-range-mincover.c
test/test-mc-rangeopts.c
test/test-mc-reader.c
test/test-mc-text-search-str-encode.c
test/test-mc-tokens.c
test/test-mc-range-encoding.c
test/test-mc-writer.c
Expand Down
55 changes: 55 additions & 0 deletions src/mc-fle2-encryption-placeholder-private.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,61 @@ bool mc_FLE2RangeInsertSpec_parse(mc_FLE2RangeInsertSpec_t *out,
bool use_range_v2,
mongocrypt_status_t *status);

// Note: For the substring/suffix/prefix insert specs, all lengths are counted in Unicode code points of the UTF-8
// encoded string, not in bytes.
typedef struct {
// mlen is the max string length that can be indexed.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// mlen is the max string length that can be indexed.
// mlen is the max string length (in characters, not bytes) that can be indexed.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a clarifying comment above.

uint32_t mlen;
// lb is the lower bound on the length of substrings to be indexed.
uint32_t lb;
// ub is the upper bound on the length of substrings to be indexed.
uint32_t ub;
} mc_FLE2SubstringInsertSpec_t;

// Spec for suffix indexing of a text-search insert. Lengths are counted in UTF-8 codepoints, not bytes.
// NOTE(review): whether lb/ub are inclusive bounds is not stated in this header -- confirm against the parser.
typedef struct {
// lb is the lower bound on the length of suffixes to be indexed.
uint32_t lb;
// ub is the upper bound on the length of suffixes to be indexed.
uint32_t ub;
} mc_FLE2SuffixInsertSpec_t;

// Spec for prefix indexing of a text-search insert. Lengths are counted in UTF-8 codepoints, not bytes.
// NOTE(review): whether lb/ub are inclusive bounds is not stated in this header -- confirm against the parser.
typedef struct {
// lb is the lower bound on the length of prefixes to be indexed.
uint32_t lb;
// ub is the upper bound on the length of prefixes to be indexed.
uint32_t ub;
} mc_FLE2PrefixInsertSpec_t;

// Text-search insert spec: the value to encrypt, up to three optional indexing specs (substring, suffix, prefix),
// and the folding flags. Note the unit mismatch: len is in bytes, while the lengths inside the nested specs are in
// UTF-8 codepoints.
// NOTE(review): ownership of v and whether it is NUL-terminated are not shown in this header -- confirm.
typedef struct {
// v is the value to encrypt.
const char *v;
// len is the byte length of v.
uint32_t len;

// substr is the spec for substring indexing.
// NOTE(review): set presumably marks whether value was provided and is meaningful -- confirm against the parser.
struct {
mc_FLE2SubstringInsertSpec_t value;
bool set;
} substr;

// suffix is the spec for suffix indexing. (set: presumably as for substr -- confirm.)
struct {
mc_FLE2SuffixInsertSpec_t value;
bool set;
} suffix;

// prefix is the spec for prefix indexing. (set: presumably as for substr -- confirm.)
struct {
mc_FLE2PrefixInsertSpec_t value;
bool set;
} prefix;

// casef indicates if case folding is enabled.
bool casef;
// diacf indicates if diacritic folding is enabled.
bool diacf;
} mc_FLE2TextSearchInsertSpec_t;

/** FLE2EncryptionPlaceholder implements Encryption BinData (subtype 6)
* sub-subtype 0, the intent-to-encrypt mapping. Contains a value to encrypt and
* a description of how it should be encrypted.
Expand Down
95 changes: 95 additions & 0 deletions src/mc-str-encode-string-sets-private.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/*
* Copyright 2024-present MongoDB, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MONGOCRYPT_STR_ENCODE_STRING_SETS_PRIVATE_H
#define MONGOCRYPT_STR_ENCODE_STRING_SETS_PRIVATE_H

#include "mongocrypt-buffer-private.h"
#include "mongocrypt.h"

// Represents a valid Unicode string with the bad character 0xFF appended to the end. This is our base string which
// we build substring trees on. Stores all the valid code points in the string, plus one code point for 0xFF.
// (The byte 0xFF never occurs in well-formed UTF-8, which is presumably why it was chosen as the marker.)
// Exposed for testing.
typedef struct {
// NOTE(review): presumably holds the copied input bytes followed by the 0xFF byte -- confirm against the .c file.
_mongocrypt_buffer_t buf;
// NOTE(review): presumably the byte offset within buf of each code point -- confirm against the .c file.
uint32_t *codepoint_offsets;
// NOTE(review): presumably the number of code points, counting the trailing 0xFF as one -- confirm.
uint32_t codepoint_len;
} mc_utf8_string_with_bad_char_t;

// Create a mc_utf8_string_with_bad_char_t by copying the contents of buf and adding the bad character.
// NOTE(review): len is presumably the byte length of buf, and the result is presumably released with
// mc_utf8_string_with_bad_char_destroy -- confirm against the implementation.
mc_utf8_string_with_bad_char_t *mc_utf8_string_with_bad_char_from_buffer(const char *buf, uint32_t len);

void mc_utf8_string_with_bad_char_destroy(mc_utf8_string_with_bad_char_t *utf8);

// Set of affixes of a shared base string. Does not do any duplicate prevention.
typedef struct _mc_affix_set_t mc_affix_set_t;

// Initialize affix set from base string and number of entries (this must be known a priori, i.e. before any
// insertions are made).
mc_affix_set_t *mc_affix_set_new(const mc_utf8_string_with_bad_char_t *base_string, uint32_t n_indices);

void mc_affix_set_destroy(mc_affix_set_t *set);

// Insert affix into set. base_start/end_idx are codepoint indices. base_end_idx is exclusive. Returns true if
// inserted, false otherwise.
bool mc_affix_set_insert(mc_affix_set_t *set, uint32_t base_start_idx, uint32_t base_end_idx);

// Insert the base string count times into the set. Treated as a special case, since this is the only affix that
// will appear multiple times. Returns true if inserted, false otherwise.
bool mc_affix_set_insert_base_string(mc_affix_set_t *set, uint32_t count);

// Iterator on affix set. Set up with mc_affix_set_iter_init and advanced with mc_affix_set_iter_next.
typedef struct {
// The set being iterated over.
mc_affix_set_t *set;
// NOTE(review): presumably the index of the next affix to return -- confirm against the .c file.
uint32_t cur_idx;
} mc_affix_set_iter_t;

// Point the iterator to the first affix of the given set.
void mc_affix_set_iter_init(mc_affix_set_iter_t *it, mc_affix_set_t *set);

// Get the next affix, its length in bytes, and its count. Returns false if the set does not have a next element, true
// otherwise.
bool mc_affix_set_iter_next(mc_affix_set_iter_t *it, const char **str, uint32_t *byte_len, uint32_t *count);

// Set of substrings of a shared base string. Prevents duplicates.
typedef struct _mc_substring_set_t mc_substring_set_t;

// Create a substring set over the given base string.
// NOTE(review): base_string is taken as a const pointer; presumably it must outlive the set and the set is released
// with mc_substring_set_destroy -- confirm against the implementation.
mc_substring_set_t *mc_substring_set_new(const mc_utf8_string_with_bad_char_t *base_string);

void mc_substring_set_destroy(mc_substring_set_t *set);

// Insert the base string count times into the set. Treated as a special case, since this is the only substring that
// will appear multiple times. Always inserts successfully.
// NOTE(review): the function name says "fake string" while this comment says "base string" -- confirm which string is
// being counted and align the wording.
void mc_substring_set_increment_fake_string(mc_substring_set_t *set, uint32_t count);

// Insert substring into set. base_start/end_idx are codepoint indices. base_end_idx is exclusive. Returns true if
// inserted, false otherwise.
bool mc_substring_set_insert(mc_substring_set_t *set, uint32_t base_start_idx, uint32_t base_end_idx);

// Iterator on substring set. Set up with mc_substring_set_iter_init and advanced with mc_substring_set_iter_next.
typedef struct {
// The set being iterated over.
mc_substring_set_t *set;
// NOTE(review): opaque pointer, presumably to the set's current internal node -- confirm against the .c file.
void *cur_node;
// NOTE(review): presumably a position counter for the iteration -- confirm against the .c file.
uint32_t cur_idx;
} mc_substring_set_iter_t;

// Point the iterator to the first substring of the given set.
void mc_substring_set_iter_init(mc_substring_set_iter_t *it, mc_substring_set_t *set);

// Get the next substring, its length in bytes, and its count. Returns false if the set does not have a next element,
// true otherwise.
bool mc_substring_set_iter_next(mc_substring_set_iter_t *it, const char **str, uint32_t *byte_len, uint32_t *count);

#endif
Loading
Loading