23 changes: 12 additions & 11 deletions tensorflow_text/core/kernels/spanning_tree_iterator.cc
@@ -27,14 +27,15 @@ bool SpanningTreeIterator::HasCycle(const SourceList &sources) {
visiting_.assign(sources.size(), false);

// Search upwards from each node to find cycles.
for (uint32 initial_node = 0; initial_node < sources.size(); ++initial_node) {
for (uint32_t initial_node = 0; initial_node < sources.size();
++initial_node) {
// Search upwards to try to find a cycle.
uint32 current_node = initial_node;
uint32_t current_node = initial_node;
while (true) {
if (searched_[current_node]) break; // already searched
if (visiting_[current_node]) return true; // revisiting implies cycle
visiting_[current_node] = true; // mark as being currently visited
const uint32 source_node = sources[current_node];
const uint32_t source_node = sources[current_node];
if (source_node == current_node) break; // self-loops are roots
current_node = source_node; // advance upwards
}
@@ -45,7 +46,7 @@ bool SpanningTreeIterator::HasCycle(const SourceList &sources) {
if (searched_[current_node]) break; // already searched
searched_[current_node] = true;
visiting_[current_node] = false;
const uint32 source_node = sources[current_node];
const uint32_t source_node = sources[current_node];
if (source_node == current_node) break; // self-loops are roots
current_node = source_node; // advance upwards
}
@@ -54,18 +55,18 @@ bool SpanningTreeIterator::HasCycle(const SourceList &sources) {
return false;
}

uint32 SpanningTreeIterator::NumRoots(const SourceList &sources) {
uint32 num_roots = 0;
for (uint32 node = 0; node < sources.size(); ++node) {
uint32_t SpanningTreeIterator::NumRoots(const SourceList& sources) {
uint32_t num_roots = 0;
for (uint32_t node = 0; node < sources.size(); ++node) {
num_roots += (node == sources[node]);
}
return num_roots;
}

bool SpanningTreeIterator::NextSourceList(SourceList *sources) {
const uint32 num_nodes = sources->size();
for (uint32 i = 0; i < num_nodes; ++i) {
const uint32 new_source = ++(*sources)[i];
const uint32_t num_nodes = sources->size();
for (uint32_t i = 0; i < num_nodes; ++i) {
const uint32_t new_source = ++(*sources)[i];
if (new_source < num_nodes) return true; // absorbed in this digit
(*sources)[i] = 0; // overflowed this digit, carry to next digit
}
@@ -76,7 +77,7 @@ bool SpanningTreeIterator::NextTree(SourceList *sources) {
// Iterate source lists, skipping non-trees.
while (NextSourceList(sources)) {
// Check the number of roots.
const uint32 num_roots = NumRoots(*sources);
const uint32_t num_roots = NumRoots(*sources);
if (forest_) {
if (num_roots == 0) continue;
} else {
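Note for context (the hunks above only swap `uint32` for the fixed-width `uint32_t`; behavior is unchanged): `NextSourceList` treats the source list as a little-endian counter in base `num_nodes`, so repeated calls enumerate every possible source assignment, and `NextTree` filters out the assignments that are not trees. A self-contained restatement of that increment step, mirroring the code in the hunk:

```cpp
#include <cstdint>
#include <vector>

// The source list is a little-endian, base-`num_nodes` counter with one
// "digit" per node; digit i names the source (parent) of node i, and a
// self-loop (sources[i] == i) marks a root.
bool NextSourceList(std::vector<uint32_t>* sources) {
  const uint32_t num_nodes = sources->size();
  for (uint32_t i = 0; i < num_nodes; ++i) {
    if (++(*sources)[i] < num_nodes) return true;  // this digit absorbed the increment
    (*sources)[i] = 0;  // overflow: reset this digit and carry into the next
  }
  return false;  // the counter wrapped: every source list has been visited
}
```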
6 changes: 3 additions & 3 deletions tensorflow_text/core/kernels/spanning_tree_iterator.h
@@ -31,7 +31,7 @@ class SpanningTreeIterator {
public:
// An array that provides the source of the inbound arc for each node. Roots
// are represented as self-loops.
using SourceList = std::vector<uint32>;
using SourceList = std::vector<uint32_t>;

// Creates a spanning tree iterator. If |forest| is true, then this iterates
// over forests instead of trees (i.e., multiple roots are allowed).
@@ -41,7 +41,7 @@
// true) of a complete digraph containing |num_nodes| nodes. Each tree is
// passed to the |functor| as a SourceList.
template <class Functor>
void ForEachTree(uint32 num_nodes, Functor functor) {
void ForEachTree(uint32_t num_nodes, Functor functor) {
// Conveniently, the all-zero vector represents a valid tree.
SourceList sources(num_nodes, 0);
do {
@@ -54,7 +54,7 @@
bool HasCycle(const SourceList &sources);

// Returns the number of roots in the |sources|.
static uint32 NumRoots(const SourceList &sources);
static uint32_t NumRoots(const SourceList& sources);

// Advances |sources| to the next source list, or returns false if there are
// no more source lists.
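A hypothetical usage sketch of the migrated `ForEachTree` template. It assumes the iterator lives in namespace `tensorflow::text` and that the constructor (collapsed out of this diff) takes a single `forest` flag; by Cayley's formula a complete digraph on n labeled nodes has n^(n-1) spanning trees with one self-loop root, so the count below should be 3^2 = 9:

```cpp
#include <cstdint>
#include <iostream>

#include "tensorflow_text/core/kernels/spanning_tree_iterator.h"

int main() {
  // Assumed constructor: SpanningTreeIterator(bool forest).
  tensorflow::text::SpanningTreeIterator iterator(/*forest=*/false);
  uint32_t num_trees = 0;
  iterator.ForEachTree(
      3,
      [&num_trees](const tensorflow::text::SpanningTreeIterator::SourceList&) {
        ++num_trees;
      });
  std::cout << "spanning trees on 3 nodes: " << num_trees << "\n";  // expect 9
  return 0;
}
```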
42 changes: 21 additions & 21 deletions tensorflow_text/core/kernels/split_merge_tokenize_kernel.cc
@@ -65,31 +65,31 @@ bool IsBreakChar(absl::string_view text) {
return u_isUWhiteSpace(c);
}

Status TokenizeByLabel(const absl::string_view& text,
const Tensor& labels_tensor,
bool force_split_at_break_character,
std::vector<std::string>* tokens,
std::vector<int>* begin_offset,
std::vector<int>* end_offset, int* num_tokens) {
absl::Status TokenizeByLabel(const absl::string_view& text,
const Tensor& labels_tensor,
bool force_split_at_break_character,
std::vector<std::string>* tokens,
std::vector<int>* begin_offset,
std::vector<int>* end_offset, int* num_tokens) {
std::vector<absl::string_view> chars;
if (!GetUTF8Chars(text, &chars)) {
return Status(static_cast<::absl::StatusCode>(
absl::StatusCode::kInvalidArgument),
absl::StrCat("Input string is not utf8 valid: ", text));
return absl::Status(
static_cast<::absl::StatusCode>(absl::StatusCode::kInvalidArgument),
absl::StrCat("Input string is not utf8 valid: ", text));
}

if (chars.size() > labels_tensor.dim_size(0)) {
return Status(static_cast<::absl::StatusCode>(
absl::StatusCode::kInvalidArgument),
absl::StrCat("Number of labels ", labels_tensor.dim_size(0),
" is insufficient for text ", text));
return absl::Status(
static_cast<::absl::StatusCode>(absl::StatusCode::kInvalidArgument),
absl::StrCat("Number of labels ", labels_tensor.dim_size(0),
" is insufficient for text ", text));
}

const int split_label = 0;
bool last_character_is_break_character = false;
int start = 0;
bool has_new_token_generated_for_text = false;
const auto& labels = labels_tensor.unaligned_flat<int32>();
const auto& labels = labels_tensor.unaligned_flat<int32_t>();
for (int i = 0; i < chars.size(); ++i) {
const bool is_break_character = IsBreakChar(chars[i]);
if (!is_break_character) {
@@ -138,14 +138,14 @@ class SplitMergeTokenizeWithOffsetsOp : public OpKernel {
" elements, got ",
row_splits->dim_size(0)));

std::vector<string> tokens;
std::vector<std::string> tokens;
std::vector<int> begin_offset;
std::vector<int> end_offset;
std::vector<int> output_row_splits(1, 0);

// Iterate through all the values and tokenize them.
const auto& values_vec = input_values->flat<tstring>();
const auto& row_splits_vec = row_splits->flat<int32>();
const auto& row_splits_vec = row_splits->flat<int32_t>();
for (int i = 0; i < values_vec.size(); ++i) {
// Tokenize into tokens and record the offset locations.
int num_tokens = 0;
@@ -160,10 +160,10 @@ class SplitMergeTokenizeWithOffsetsOp : public OpKernel {
output_row_splits.push_back(num_tokens + output_row_splits.back());
}

std::vector<int64> output_tokens_shape;
std::vector<int64_t> output_tokens_shape;
output_tokens_shape.push_back(tokens.size());

std::vector<int64> output_row_splits_shape;
std::vector<int64_t> output_row_splits_shape;
output_row_splits_shape.push_back(output_row_splits.size());

Tensor* output_values;
@@ -177,19 +177,19 @@ class SplitMergeTokenizeWithOffsetsOp : public OpKernel {
ctx->allocate_output("output_row_splits",
TensorShape(output_row_splits_shape),
&output_row_splits_tensor));
auto output_row_splits_vec = output_row_splits_tensor->vec<int64>();
auto output_row_splits_vec = output_row_splits_tensor->vec<int64_t>();

Tensor* start_values;
OP_REQUIRES_OK(ctx, ctx->allocate_output("start_values",
TensorShape(output_tokens_shape),
&start_values));
auto start_values_vec = start_values->vec<int64>();
auto start_values_vec = start_values->vec<int64_t>();

Tensor* limit_values;
OP_REQUIRES_OK(ctx, ctx->allocate_output("limit_values",
TensorShape(output_tokens_shape),
&limit_values));
auto limit_values_vec = limit_values->vec<int64>();
auto limit_values_vec = limit_values->vec<int64_t>();

for (int i = 0; i < tokens.size(); ++i) {
output_values_vec(i) = tokens[i];
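The hunks in this file are likewise type-only, but for orientation: `TokenizeByLabel` consumes one label per UTF-8 character, where label 0 (the `split_label` constant above) starts a new token and any other label merges the character into the current token. A simplified, ASCII-only sketch of that rule (the real kernel walks UTF-8 characters, records offsets, and handles break characters):

```cpp
#include <cstddef>
#include <string>
#include <vector>

// Label 0 starts a new token; any other label extends the current one.
std::vector<std::string> DecodeSplitMerge(const std::string& text,
                                          const std::vector<int>& labels) {
  constexpr int kSplitLabel = 0;
  std::vector<std::string> tokens;
  for (size_t i = 0; i < text.size() && i < labels.size(); ++i) {
    if (tokens.empty() || labels[i] == kSplitLabel) {
      tokens.emplace_back(1, text[i]);   // split: begin a new token
    } else {
      tokens.back().push_back(text[i]);  // merge: append to the current token
    }
  }
  return tokens;
}

// Example: DecodeSplitMerge("hello", {0, 1, 1, 0, 1}) returns {"hel", "lo"}.
```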
4 changes: 2 additions & 2 deletions tensorflow_text/core/kernels/text_kernels_test_util.cc
@@ -23,8 +23,8 @@ namespace text_kernels_test_util {

bool TensorEqMatcher::MatchAndExplain(
Tensor actual, ::testing::MatchResultListener* listener) const {
string expect_values = expect_.SummarizeValue(expect_.NumElements());
string actual_values = actual.SummarizeValue(actual.NumElements());
std::string expect_values = expect_.SummarizeValue(expect_.NumElements());
std::string actual_values = actual.SummarizeValue(actual.NumElements());
if (expect_.dtype() != actual.dtype() || expect_.shape() != actual.shape() ||
expect_values != actual_values) {
*listener << "\n dtype=" << DataTypeString(actual.dtype());
6 changes: 3 additions & 3 deletions tensorflow_text/core/kernels/text_kernels_test_util.h
@@ -77,7 +77,7 @@ ::testing::Matcher<Tensor> TensorHasShapeAndValues(
// VectorEq<int64>({1, 2, 3, 4, 5, 6});
template <typename DTYPE>
::testing::Matcher<Tensor> VectorEq(const std::vector<DTYPE>& values) {
int64 nvals = values.size();
int64_t nvals = values.size();
Tensor expect = test::AsTensor<DTYPE>(values, {nvals});
// MakeMatcher takes ownership of the TensorEqMatcher.
return ::testing::MakeMatcher(new TensorEqMatcher(expect));
@@ -95,11 +95,11 @@ ::testing::Matcher<Tensor> VectorEq(const std::vector<DTYPE>& values) {
template <typename DTYPE>
::testing::Matcher<Tensor> MatrixEq(
const std::vector<std::vector<DTYPE>>& values) {
int64 nrows = values.size();
int64_t nrows = values.size();
CHECK_GT(nrows, 0) // Crash OK
<< "Invalid use of MatrixEq: to test empty matrices, use "
<< "TensorHasShapeAndValues<dtype>{{0, ndims}, {}} instead.";
int64 ncols = values[0].size();
int64_t ncols = values[0].size();
std::vector<DTYPE> flat;
for (const auto& row : values) {
CHECK_EQ(ncols, row.size()) // Crash OK
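A hypothetical test using the matchers after the `int64_t` migration; it assumes the helpers are reachable as `tensorflow::text_kernels_test_util` (the enclosing namespaces are collapsed out of this diff) and relies on the standard TensorFlow test utilities:

```cpp
#include <cstdint>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/core/framework/tensor_testutil.h"
#include "tensorflow_text/core/kernels/text_kernels_test_util.h"

TEST(TensorMatcherExample, Int64VectorAndMatrix) {
  // Assumed namespace; adjust to wherever text_kernels_test_util nests.
  using tensorflow::text_kernels_test_util::MatrixEq;
  using tensorflow::text_kernels_test_util::VectorEq;

  tensorflow::Tensor vec = tensorflow::test::AsTensor<int64_t>({1, 2, 3}, {3});
  EXPECT_THAT(vec, VectorEq<int64_t>({1, 2, 3}));

  tensorflow::Tensor mat =
      tensorflow::test::AsTensor<int64_t>({1, 2, 3, 4, 5, 6}, {2, 3});
  EXPECT_THAT(mat, MatrixEq<int64_t>({{1, 2, 3}, {4, 5, 6}}));
}
```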
30 changes: 15 additions & 15 deletions tensorflow_text/core/kernels/tokenizer_from_logits_kernel.cc
@@ -68,22 +68,22 @@ bool IsBreakChar(absl::string_view text) {
// allows us to retrieve the corresponding data from logits. I.e., the logits
// for the i-th character from text are logits(batch_index, i, 0) (for the
// "split" action) and logits(batch_index, i, 1) (for the "merge" action).
Status TokenizeByLogits(const absl::string_view& text,
const TTypes<const float, 3>::Tensor& logits,
int batch_index,
bool force_split_at_break_character,
std::vector<std::string>* tokens,
std::vector<int>* begin_offset,
std::vector<int>* end_offset, int* num_tokens) {
absl::Status TokenizeByLogits(const absl::string_view& text,
const TTypes<const float, 3>::Tensor& logits,
int batch_index,
bool force_split_at_break_character,
std::vector<std::string>* tokens,
std::vector<int>* begin_offset,
std::vector<int>* end_offset, int* num_tokens) {
std::vector<absl::string_view> chars;
if (!GetUTF8Chars(text, &chars)) {
return Status(
return absl::Status(
static_cast<absl::StatusCode>(absl::StatusCode::kInvalidArgument),
absl::StrCat("Input string is not utf8 valid: ", text));
}

if (chars.size() > logits.dimension(1)) {
return Status(
return absl::Status(
static_cast<absl::StatusCode>(absl::StatusCode::kInvalidArgument),
absl::StrCat("Number of logits, ", logits.dimension(1),
", is insufficient for text \"", text, "\""));
@@ -142,7 +142,7 @@ class TokenizerFromLogitsOp : public OpKernel {
const bool force_split_at_break_character_bool =
force_split_at_break_character->scalar<bool>()();

std::vector<string> tokens;
std::vector<std::string> tokens;
std::vector<int> begin_offset;
std::vector<int> end_offset;
std::vector<int> output_row_splits(1, 0);
@@ -175,10 +175,10 @@ class TokenizerFromLogitsOp : public OpKernel {
output_row_splits.push_back(num_tokens + output_row_splits.back());
}

std::vector<int64> output_tokens_shape;
std::vector<int64_t> output_tokens_shape;
output_tokens_shape.push_back(tokens.size());

std::vector<int64> output_row_splits_shape;
std::vector<int64_t> output_row_splits_shape;
output_row_splits_shape.push_back(output_row_splits.size());

Tensor* output_values;
@@ -192,19 +192,19 @@ class TokenizerFromLogitsOp : public OpKernel {
ctx->allocate_output("row_splits",
TensorShape(output_row_splits_shape),
&output_row_splits_tensor));
auto output_row_splits_vec = output_row_splits_tensor->vec<int64>();
auto output_row_splits_vec = output_row_splits_tensor->vec<int64_t>();

Tensor* start_values;
OP_REQUIRES_OK(ctx, ctx->allocate_output("start_values",
TensorShape(output_tokens_shape),
&start_values));
auto start_values_vec = start_values->vec<int64>();
auto start_values_vec = start_values->vec<int64_t>();

Tensor* limit_values;
OP_REQUIRES_OK(ctx, ctx->allocate_output("limit_values",
TensorShape(output_tokens_shape),
&limit_values));
auto limit_values_vec = limit_values->vec<int64>();
auto limit_values_vec = limit_values->vec<int64_t>();

for (int i = 0; i < tokens.size(); ++i) {
output_values_vec(i) = tokens[i];
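For orientation on the indexing the comment above describes: character i of batch element b carries two logits, logits(b, i, 0) for "split" and logits(b, i, 1) for "merge". A minimal sketch of the per-character decision under that reading (the actual decision loop is collapsed out of this diff, so treat this as an assumption):

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// logits[i] stands in for (logits(batch_index, i, 0), logits(batch_index, i, 1))
// of the 3-D tensor. Assumed rule: "split" wins when its logit is strictly larger.
std::vector<bool> SplitDecisions(
    const std::vector<std::pair<float, float>>& logits) {
  std::vector<bool> split(logits.size());
  for (size_t i = 0; i < logits.size(); ++i) {
    split[i] = logits[i].first > logits[i].second;
  }
  return split;
}
```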
27 changes: 14 additions & 13 deletions tensorflow_text/core/kernels/unicode_script_tokenize_kernel.cc
@@ -58,7 +58,7 @@ class UnicodeScriptTokenizeWithOffsetsOp : public OpKernel {
void Compute(OpKernelContext* context) override {
// Get inputs
const Tensor& input_values_tensor = context->input(0);
const auto input_values_flat = input_values_tensor.flat<int32>();
const auto input_values_flat = input_values_tensor.flat<int32_t>();
const Tensor& input_splits_tensor = context->input(1);
const auto input_splits_flat = input_splits_tensor.flat<SPLITS_TYPE>();

@@ -80,10 +80,10 @@ class UnicodeScriptTokenizeWithOffsetsOp : public OpKernel {
auto output_outer_splits_flat =
output_outer_splits_tensor->flat<SPLITS_TYPE>();

std::vector<int32> output_values;
std::vector<int32_t> output_values;
std::vector<SPLITS_TYPE> output_values_inner_splits;
std::vector<int64> output_offset_starts;
std::vector<int64> output_offset_limits;
std::vector<int64_t> output_offset_starts;
std::vector<int64_t> output_offset_limits;

// Loop over the codepoints (a split at a time) and create splits of tokens.
icu::ErrorCode status;
@@ -92,12 +92,13 @@ class UnicodeScriptTokenizeWithOffsetsOp : public OpKernel {
output_outer_splits_flat(splits_idx) = output_offset_starts.size();
UScriptCode prev_script = USCRIPT_INVALID_CODE;
bool token_has_start_set = false;
int32 curr_skipped_spaces = 0; // Used when computing the end of a token
int32_t curr_skipped_spaces =
0; // Used when computing the end of a token
const int curr_word_start_idx = input_splits_flat(splits_idx);
bool was_space = false;
for (int values_idx = curr_word_start_idx;
values_idx < input_splits_flat(splits_idx + 1); values_idx++) {
const int32 input_value = input_values_flat(values_idx);
const int32_t input_value = input_values_flat(values_idx);
const bool is_space = u_isUWhiteSpace(input_value);
UScriptCode script = uscript_getScript(input_value, status);
// Split these failures out as if they are a different code and ignore
@@ -166,11 +166,11 @@ class UnicodeScriptTokenizeWithOffsetsOp : public OpKernel {
do { \
} while (false)

DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values, int32);
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values, int32_t);
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values_inner_splits,
SPLITS_TYPE);
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_starts, int64);
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_limits, int64);
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_starts, int64_t);
DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_limits, int64_t);

#undef DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR
}
@@ -183,12 +184,12 @@ class UnicodeScriptTokenizeWithOffsetsOp : public OpKernel {

REGISTER_KERNEL_BUILDER(Name("UnicodeScriptTokenizeWithOffsets")
.Device(DEVICE_CPU)
.TypeConstraint<int32>("Tsplits"),
UnicodeScriptTokenizeWithOffsetsOp<int32>);
.TypeConstraint<int32_t>("Tsplits"),
UnicodeScriptTokenizeWithOffsetsOp<int32_t>);
REGISTER_KERNEL_BUILDER(Name("UnicodeScriptTokenizeWithOffsets")
.Device(DEVICE_CPU)
.TypeConstraint<int64>("Tsplits"),
UnicodeScriptTokenizeWithOffsetsOp<int64>);
.TypeConstraint<int64_t>("Tsplits"),
UnicodeScriptTokenizeWithOffsetsOp<int64_t>);

} // namespace text
} // namespace tensorflow
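The loop this file's hunks touch tokenizes by Unicode script: a boundary is emitted when the script of the current codepoint differs from the previous one, and whitespace is consumed between tokens. A simplified sketch of that rule using the plain C ICU API (the real kernel also tracks offsets, ragged splits, and skipped spaces):

```cpp
#include <cstdint>
#include <vector>

#include <unicode/uchar.h>
#include <unicode/uscript.h>

// Whitespace ends the current token; a script change starts a new one.
std::vector<std::vector<int32_t>> SplitByScript(
    const std::vector<int32_t>& codepoints) {
  std::vector<std::vector<int32_t>> tokens;
  UScriptCode prev_script = USCRIPT_INVALID_CODE;
  for (const int32_t cp : codepoints) {
    if (u_isUWhiteSpace(cp)) {
      prev_script = USCRIPT_INVALID_CODE;  // force a fresh token afterwards
      continue;
    }
    UErrorCode status = U_ZERO_ERROR;
    const UScriptCode script = uscript_getScript(cp, &status);
    if (tokens.empty() || script != prev_script) tokens.emplace_back();
    tokens.back().push_back(cp);
    prev_script = script;
  }
  return tokens;
}
```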
@@ -48,15 +48,15 @@ class UnicodeScriptTokenizeWithOffsetsKernelTest

TEST_F(UnicodeScriptTokenizeWithOffsetsKernelTest, Test) {
MakeOp();
AddInputFromArray<int32>(TensorShape({6}), {111, 112, 32, 116, 117, 118});
AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
AddInputFromArray<int32_t>(TensorShape({6}), {111, 112, 32, 116, 117, 118});
AddInputFromArray<int64_t>(TensorShape({3}), {0, 4, 6});
TF_ASSERT_OK(RunOpKernel());

std::vector<int32> expected_values({111, 112, 116, 117, 118});
std::vector<int64> expected_values_inner_splits({0, 2, 3, 5});
std::vector<int64> expected_offset_starts({0, 3, 0});
std::vector<int64> expected_offset_limits({2, 4, 2});
std::vector<int64> output_outer_splits({0, 2, 3});
std::vector<int32_t> expected_values({111, 112, 116, 117, 118});
std::vector<int64_t> expected_values_inner_splits({0, 2, 3, 5});
std::vector<int64_t> expected_offset_starts({0, 3, 0});
std::vector<int64_t> expected_offset_limits({2, 4, 2});
std::vector<int64_t> output_outer_splits({0, 2, 3});
EXPECT_THAT(*GetOutput(0), VectorEq(expected_values));
EXPECT_THAT(*GetOutput(1), VectorEq(expected_values_inner_splits));
EXPECT_THAT(*GetOutput(2), VectorEq(expected_offset_starts));
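Decoded, the test input is the codepoint sequence for "op tuv" split into rows [0, 4) and [4, 6): row one tokenizes to "op" (start 0, limit 2) and "t" (start 3, limit 4), row two to "uv" (start 0, limit 2). That matches the expected values, the inner splits {0, 2, 3, 5}, and the outer splits {0, 2, 3} above.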