Skip to content

Commit

Permalink
#8: full unicode support
Browse files Browse the repository at this point in the history
  • Loading branch information
hohwille committed Dec 25, 2024
1 parent 2e87308 commit 0044317
Show file tree
Hide file tree
Showing 15 changed files with 266 additions and 254 deletions.
87 changes: 43 additions & 44 deletions core/src/main/java/io/github/mmm/base/filter/CharFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,86 +3,84 @@
package io.github.mmm.base.filter;

/**
* Interface for a filter of characters that decides if a given character is {@link #accept(char) accepted}. <br>
* Interface for a filter of characters that decides if a given character is {@link #accept(int) accepted}. <br>
* Unlike {@link java.util.function.Predicate} it avoids boxing and unboxing between {@code char} and {@link Character}
* for performance reasons. It is especially used by {@code io.github.mmm.scanner.CharSequenceScanner}.
*
* @see #accept(char)
* @see #accept(int)
* @since 1.0.0
*/
@FunctionalInterface
public interface CharFilter {

/** A filter that only {@link #accept(char) accepts} the lower case Latin ASCII letters 'a'-'z'. */
/** A filter that only {@link #accept(int) accepts} the lower case Latin ASCII letters 'a'-'z'. */
CharFilter LATIN_LOWER_CASE_LETTER = new RangeCharFilter('a', 'z');

/** A filter that only {@link #accept(char) accepts} the upper case Latin ASCII letters 'A'-'Z'. */
/** A filter that only {@link #accept(int) accepts} the upper case Latin ASCII letters 'A'-'Z'. */
CharFilter LATIN_UPPER_CASE_LETTER = new RangeCharFilter('A', 'Z');

/** A filter that only {@link #accept(char) accepts} the Latin ASCII letters 'a'-'z' and 'A'-'Z'. */
/** A filter that only {@link #accept(int) accepts} the Latin ASCII letters 'a'-'z' and 'A'-'Z'. */
CharFilter LATIN_LETTER = LATIN_LOWER_CASE_LETTER.compose(LATIN_UPPER_CASE_LETTER);

/** A filter that only {@link #accept(char) accepts} the Latin digits '0'-'9'. */
/** A filter that only {@link #accept(int) accepts} the Latin digits '0'-'9'. */
CharFilter LATIN_DIGIT = new RangeCharFilter('0', '9');

/** A filter that only {@link #accept(char) accepts} the Latin digits '0'-'9' or ASCII letters 'a'-'z' and 'A'-'Z'. */
/** A filter that only {@link #accept(int) accepts} the Latin digits '0'-'9' or ASCII letters 'a'-'z' and 'A'-'Z'. */
CharFilter LATIN_LETTER_OR_DIGIT = LATIN_LETTER.compose(LATIN_DIGIT);

/**
* A filter that only {@link #accept(char) accepts} characters valid for a technical identifier (e.g. literal or
* A filter that only {@link #accept(int) accepts} characters valid for a technical identifier (e.g. literal or
* variable-name). This means accepted characters are Latin digits, ASCII letters, '.', '_' or '-'.
*/
CharFilter IDENTIFIER = LATIN_LETTER_OR_DIGIT.compose(new ListCharFilter("._-"));

/**
* A filter that only {@link #accept(char) accepts} characters valid for a technical segment (e.g. convenient name of
* A filter that only {@link #accept(int) accepts} characters valid for a technical segment (e.g. convenient name of
* variable, method, field, class, etc.). This means accepted characters are Latin digits, ASCII letters, '_' or '$'.
*/
CharFilter SEGMENT = LATIN_LETTER_OR_DIGIT.compose(new ListCharFilter("_$"));

/** A filter that {@link #accept(char) accepts} only {@link Character#isWhitespace(char) whitespaces}. */
/** A filter that {@link #accept(int) accepts} only {@link Character#isWhitespace(int) whitespaces}. */
CharFilter WHITESPACE = of(c -> Character.isWhitespace(c), "whitespace");

/** A filter that {@link #accept(char) accepts} any character. */
/** A filter that {@link #accept(int) accepts} any character. */
CharFilter ANY = of(c -> true, "**");

/** A filter that only {@link #accept(char) accepts} the file separator characters '/' and '\\'. */
/** A filter that only {@link #accept(int) accepts} the file separator characters '/' and '\\'. */
CharFilter FILE_SEPARATOR = new ListCharFilter("/\\");

/** {@link CharFilter} that {@link #accept(char) accepts} only carriage return ('\r') and line feed ('\n'). */
/** {@link CharFilter} that {@link #accept(int) accepts} only carriage return ('\r') and line feed ('\n'). */
CharFilter NEWLINE = new ListCharFilter("\r\n");

/** {@link CharFilter} that {@link #accept(char) accepts} only {@link #NEWLINE newlines} and space (' '). */
/** {@link CharFilter} that {@link #accept(int) accepts} only {@link #NEWLINE newlines} and space (' '). */
CharFilter NEWLINE_OR_SPACE = new ListCharFilter("\r \n");

/**
* {@link CharFilter} that {@link #accept(char) accepts} only {@link #NEWLINE_OR_SPACE newlines, space} and tab
* ('\t').
* {@link CharFilter} that {@link #accept(int) accepts} only {@link #NEWLINE_OR_SPACE newlines, space} and tab ('\t').
*/
CharFilter NEWLINE_OR_SPACE_OR_TAB = new ListCharFilter("\r \n\t");

/** {@link CharFilter} that {@link #accept(char) accepts} only the octal digits '0'-'7'. */
/** {@link CharFilter} that {@link #accept(int) accepts} only the octal digits '0'-'7'. */
CharFilter OCTAL_DIGIT = new RangeCharFilter('0', '7');

/** {@link CharFilter} that {@link #accept(char) accepts} only the hex digits '0'-'9', 'a'-'f', or 'A'-'F'. */
CharFilter HEX_DIGIT = LATIN_DIGIT.compose(new RangeCharFilter('a', 'f'))
.compose(new RangeCharFilter('A', 'F'));
/** {@link CharFilter} that {@link #accept(int) accepts} only the hex digits '0'-'9', 'a'-'f', or 'A'-'F'. */
CharFilter HEX_DIGIT = LATIN_DIGIT.compose(new RangeCharFilter('a', 'f')).compose(new RangeCharFilter('A', 'F'));

/** Fallback for {@link #getDescription() description} if not available. */
String NO_DESCRIPTION = "?";

/**
* @param c is the character to check.
* @return {@code true} if the given character {@code c} is acceptable, {@code false} if it should be filtered.
* @param codePoint is the {@link String#codePointAt(int) code-point} to check.
* @return {@code true} if the given {@code codePoint} is acceptable, {@code false} if it should be filtered.
*/
boolean accept(char c);
boolean accept(int codePoint);

/**
* Removes all characters that are not {@link #accept(char) accepted} by this filter. Intended only for simple cases
* or testing. For real parsing consider using {@code CharStreamScanner}.
* Removes all characters that are not {@link #accept(int) accepted} by this filter. Intended only for simple cases or
* testing. For real parsing consider using {@code CharStreamScanner}.
*
* @param string the {@link String} to filter.
* @return the given {@link String} with all characters removed that are not {@link #accept(char) accepted} by this
* @return the given {@link String} with all characters removed that are not {@link #accept(int) accepted} by this
* filter.
*/
default String filter(String string) {
Expand All @@ -93,16 +91,16 @@ default String filter(String string) {
StringBuilder sb = new StringBuilder(string.length());
int len = string.length();
for (int i = 0; i < len; i++) {
char c = string.charAt(i);
if (accept(c)) {
sb.append(c);
int cp = string.codePointAt(i);
if (accept(cp)) {
sb.appendCodePoint(cp);
}
}
return sb.toString();
}

/**
* @return the negation of this {@link CharFilter} that returns !{@link #accept(char) accept(c)}.
* @return the negation of this {@link CharFilter} that returns !{@link #accept(int) accept(codePoint)}.
*/
default CharFilter negate() {

Expand All @@ -111,8 +109,8 @@ default CharFilter negate() {

/**
* @param filter the {@link CharFilter} to compose with.
* @return a {@link ComposedCharFilter} that {@link #accept(char) accepts} a character that is accepted by
* {@code this} OR the given {@link CharFilter}.
* @return a {@link ComposedCharFilter} that {@link #accept(int) accepts} a character that is accepted by {@code this}
* OR the given {@link CharFilter}.
*/
default CharFilter compose(CharFilter filter) {

Expand All @@ -123,20 +121,21 @@ default CharFilter compose(CharFilter filter) {
* Appends the given code-point to the given {@link StringBuilder}. In addition to
* {@link StringBuilder#appendCodePoint(int)} this method will escape some special characters to make them visible.
*
* @param c the character to {@link StringBuilder#append(char) append}.
* @param codePoint the {@link String#codePointAt(int) code-point} to {@link StringBuilder#appendCodePoint(int)
* append}.
* @param sb the {@link StringBuilder} where to append.
*/
static void append(char c, StringBuilder sb) {
static void append(int codePoint, StringBuilder sb) {

if (c == '\n') {
if (codePoint == '\n') {
sb.append("\\n");
} else if (c == '\r') {
} else if (codePoint == '\r') {
sb.append("\\r");
} else if (c == '\t') {
} else if (codePoint == '\t') {
sb.append("\\t");
} else if (isInvisible(c)) {
} else if (isInvisible(codePoint)) {
sb.append("\\u");
String hex = Integer.toString(c, 16);
String hex = Integer.toString(codePoint, 16);
int len = hex.length();
int zeros = 4 - len;
if (zeros < 0) {
Expand All @@ -148,17 +147,17 @@ static void append(char c, StringBuilder sb) {
}
sb.append(hex);
} else {
sb.append(c);
sb.appendCodePoint(codePoint);
}
}

private static boolean isInvisible(char c) {
private static boolean isInvisible(int codePoint) {

if (c < 20) { // technically space (20) is also invisible
if (codePoint < 20) { // technically space (20) is also invisible
return true;
} else if ((c >= 0x07F) && (c <= 0x0A0)) {
} else if ((codePoint >= 0x07F) && (codePoint <= 0x0A0)) {
return true;
} else if ((c == 0x0AD)) {
} else if ((codePoint == 0x0AD)) {
return true;
}
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
import java.util.Objects;

/**
* Implementation of {@link CharFilter} that combines multiple given {@link CharFilter}s with a logical OR so that if
* any of the given {@link CharFilter}s {@link #accept(char) matches} then this {@link ComposedCharFilter} will match.
* Implementation of {@link CharFilter} that combines multiple given {@link CharFilter}s with a logical OR so this
* {@link ComposedCharFilter} will {@link #accept(int) accept} a {@link String#codePointAt(int) code-point} if any of
* the given {@link CharFilter}s {@link #accept(int) accepts}.
*/
public class ComposedCharFilter extends AbstractCharFilter {

Expand All @@ -25,10 +26,10 @@ public ComposedCharFilter(CharFilter... filters) {
}

@Override
public boolean accept(char c) {
public boolean accept(int codePoint) {

for (CharFilter filter : this.filters) {
if (filter.accept(c)) {
if (filter.accept(codePoint)) {
return true;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ class DescriptiveCharFilter extends AbstractCharFilter {
}

@Override
public boolean accept(char c) {
public boolean accept(int codePoint) {

return this.filter.accept(c);
return this.filter.accept(codePoint);
}

}
Loading

0 comments on commit 0044317

Please sign in to comment.