Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 132 additions & 60 deletions src/main/java/com/github/packageurl/PackageURL.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
Expand All @@ -34,6 +35,7 @@
import java.util.TreeMap;
import java.util.function.IntPredicate;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.jspecify.annotations.Nullable;

/**
Expand All @@ -53,9 +55,10 @@
* @since 1.0.0
*/
public final class PackageURL implements Serializable {

private static final long serialVersionUID = 3243226021636427586L;

private static final char PERCENT_CHAR = '%';

/**
* Constructs a new PackageURL object by parsing the specified string.
*
Expand Down Expand Up @@ -494,35 +497,14 @@ private String canonicalize(boolean coordinatesOnly) {
return purl.toString();
}

/**
* Encodes the input in conformance with RFC 3986.
*
* @param input the String to encode
* @return an encoded String
*/
private String percentEncode(final String input) {
if (input.isEmpty()) {
return input;
}

StringBuilder builder = new StringBuilder();
for (byte b : input.getBytes(StandardCharsets.UTF_8)) {
if (isUnreserved(b)) {
builder.append((char) b);
}
else {
// Substitution: A '%' followed by the hexadecimal representation of the ASCII value of the replaced character
builder.append('%');
builder.append(Integer.toHexString(b).toUpperCase());
}
}
return builder.toString();
}

private static boolean isUnreserved(int c) {
return (isValidCharForKey(c) || c == '~');
}

private static boolean shouldEncode(int c) {
return !isUnreserved(c);
}

private static boolean isAlpha(int c) {
return (isLowerCase(c) || isUpperCase(c));
}
Expand Down Expand Up @@ -578,43 +560,134 @@ private static String toLowerCase(String s) {
return new String(chars);
}

/**
* Optionally decodes a String, if it's encoded. If String is not encoded,
* method will return the original input value.
*
* @param input the value String to decode
* @return a decoded String
*/
private String percentDecode(final String input) {
final String decoded = uriDecode(input);
if (!decoded.equals(input)) {
return decoded;
private static int indexOfPercentChar(final byte[] bytes, final int start) {
return IntStream.range(start, bytes.length).filter(i -> isPercent(bytes[i])).findFirst().orElse(-1);
}

private static int indexOfUnsafeChar(final byte[] bytes, final int start) {
return IntStream.range(start, bytes.length).filter(i -> shouldEncode(bytes[i])).findFirst().orElse(-1);
}

private static byte percentDecode(final byte[] bytes, final int start) {
if (start + 2 >= bytes.length) {
throw new ValidationException("Incomplete percent encoding at offset " + start + " with value '" + new String(bytes, start, bytes.length - start, StandardCharsets.UTF_8) + "'");
}

int pos1 = start + 1;
byte b1 = bytes[pos1];
int c1 = Character.digit(b1, 16);

if (c1 == -1) {
throw new ValidationException("Invalid percent encoding char 1 at offset " + pos1 + " with value '" + ((char) b1) + "'");
}
return input;

int pos2 = pos1 + 1;
byte b2 = bytes[pos2];
int c2 = Character.digit(bytes[pos2], 16);

if (c2 == -1) {
throw new ValidationException("Invalid percent encoding char 2 at offset " + pos2 + " with value '" + ((char) b2) + "'");
}

return ((byte) ((c1 << 4) + c2));
}

/**
* Decodes a percent-encoded string.
*
* @param source string to decode, not {@code null}
* @return A decoded string
* @throws NullPointerException if {@code source} is {@code null}
*/
public static String uriDecode(String source) {
int length = source.length();
StringBuilder builder = new StringBuilder();
for (int i = 0; i < length; i++) {
if (source.charAt(i) == '%') {
String str = source.substring(i + 1, i + 3);
char c = (char) Integer.parseInt(str, 16);
builder.append(c);
i += 2;
public static String percentDecode(final String source) {
if (source.isEmpty()) {
return source;
}

byte[] bytes = source.getBytes(StandardCharsets.UTF_8);

int off = 0;
int idx = indexOfPercentChar(bytes, off);

if (idx == -1) {
return source;
}

ByteBuffer buffer = ByteBuffer.wrap(bytes);

while (true) {
int len = idx - off;

if (len > 0) {
buffer.put(bytes, off, len);
off += len;
}
else {
builder.append(source.charAt(i));

buffer.put(percentDecode(bytes, off));
off += 3;
idx = indexOfPercentChar(bytes, off);

if (idx == -1) {
int rem = bytes.length - off;

if (rem > 0) {
buffer.put(bytes, off, rem);
}

break;
}
}
return builder.toString();

return new String(buffer.array(), 0, buffer.position(), StandardCharsets.UTF_8);
}

@Deprecated
public String uriDecode(final String source) {
return source != null ? percentDecode(source) : null;
}

private static boolean isPercent(int c) {
return (c == PERCENT_CHAR);
}

private static byte[] percentEncode(byte b) {
byte b1 = (byte) Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16));
byte b2 = (byte) Character.toUpperCase(Character.forDigit(b & 0xF, 16));
return new byte[] {(byte) PERCENT_CHAR, b1, b2};
}

public static String percentEncode(final String source) {
if (source.isEmpty()) {
return source;
}

byte[] bytes = source.getBytes(StandardCharsets.UTF_8);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
byte[] bytes = source.getBytes(StandardCharsets.UTF_8);
byte[] bytes = source.getBytes(StandardCharsets.US_ASCII);

The encoded URI should only contain ASCII characters.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This results in incorrect behavior.

This method always replaces malformed-input and unmappable-character sequences with this charset's default replacement byte array. The CharsetEncoder class should be used when more control over the encoding process is required.

Meaning the proposed version would convert non-ASCII characters into ?. For example: pkg:generic/a?b=☃ becomes pkg:generic/a?b=?. If you want to reject non-ASCII characters you need to use a different conversion.

The spec says that PURLs are ASCII strings, but it seems like all parsers except maennchen/purl accept Unicode inputs, and it's probably better to continue accepting Unicode inputs:

althonos/packageurl.rs: {"type": "generic", "name": "a", "qualifiers": {"b": "\u2603"}}
anchore/packageurl-go: {"type": "generic", "name": "a", "qualifiers": {"b": "\u2603"}}
giterlizzi/perl-URI-PackageURL: {"type": "generic", "name": "a", "qualifiers": {"b": "\u00e2\u0098\u0083"}}
maennchen/purl: {"error": "cannot parse due to reason invalid_uri: \":\""}
package-url/packageurl-dotnet: {"type": "generic", "name": "a", "qualifiers": {"b": "\u2603"}}
package-url/packageurl-go: {"type": "generic", "name": "a", "qualifiers": {"b": "\u2603"}}
package-url/packageurl-java: {"type": "generic", "name": "a", "qualifiers": {"b": "\u2603"}}
package-url/packageurl-js: {"type": "generic", "name": "a", "qualifiers": {"b": "\u2603"}}
package-url/packageurl-php: {"type": "generic", "name": "a", "qualifiers": {"b": "\u2603"}}
package-url/packageurl-python: {"type": "generic", "name": "a", "qualifiers": {"b": "\u2603"}}
package-url/packageurl-ruby: {"type": "generic", "name": "a", "qualifiers": {"b": "\u2603"}}
package-url/packageurl-swift: {"type": "generic", "name": "a", "qualifiers": {"b": "\u2603"}}
phylum-dev/purl: {"type": "generic", "name": "a", "qualifiers": {"b": "\u2603"}}
sonatype/package-url-java: {"type": "generic", "name": "a", "qualifiers": {"b": "\u2603"}}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, the Java URI implementation also seems to tolerate non-blank Unicode characters outside of ASCII (cf. URI Javadoc and it defines a new other category of allowed characters:

The Unicode characters that are not in the US-ASCII character set, are not control characters (according to the Character.isISOControl method), and are not space characters (according to the Character.isSpaceChar method)  (Deviation from RFC 2396, which is limited to US-ASCII)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, use UTF_8 everywhere, then?


int off = 0;
int idx = indexOfUnsafeChar(bytes, off);

if (idx == -1) {
return source;
}

ByteBuffer buffer = ByteBuffer.allocate(bytes.length * 3);

while (true) {
int len = idx - off;

if (len > 0) {
buffer.put(bytes, off, len);
off += len;
}

buffer.put(percentEncode(bytes[off++]));
idx = indexOfUnsafeChar(bytes, off);

if (idx == -1) {
int rem = bytes.length - off;

if (rem > 0) {
buffer.put(bytes, off, rem);
}

break;
}
}

return new String(buffer.array(), 0, buffer.position(), StandardCharsets.UTF_8);
}

/**
Expand Down Expand Up @@ -758,12 +831,12 @@ private void verifyTypeConstraints(String type, @Nullable String namespace, @Nul
private String[] parsePath(final String path, final boolean isSubpath) {
return Arrays.stream(path.split("/"))
.filter(segment -> !segment.isEmpty() && !(isSubpath && (".".equals(segment) || "..".equals(segment))))
.map(this::percentDecode)
.map(PackageURL::percentDecode)
.toArray(String[]::new);
}

private String encodePath(final String path) {
return Arrays.stream(path.split("/")).map(this::percentEncode).collect(Collectors.joining("/"));
return Arrays.stream(path.split("/")).map(PackageURL::percentEncode).collect(Collectors.joining("/"));
}

/**
Expand Down Expand Up @@ -894,5 +967,4 @@ private StandardTypes() {

}
}

}
25 changes: 25 additions & 0 deletions src/test/java/com/github/packageurl/PackageURLTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,31 @@ static void resetLocale() {
Locale.setDefault(defaultLocale);
}

@Test
void validPercentEncoding() throws MalformedPackageURLException {
PackageURL purl = new PackageURL("maven", "com.google.summit", "summit-ast", "2.2.0\n", null, null);
assertEquals("pkg:maven/com.google.summit/[email protected]%0A", purl.toString());
PackageURL purl2 = new PackageURL("pkg:nuget/%D0%9Cicros%D0%BEft.%D0%95ntit%D1%83Fram%D0%B5work%D0%A1%D0%BEr%D0%B5");
assertEquals("Мicrosоft.ЕntitуFramеworkСоrе", purl2.getName());
assertEquals("pkg:nuget/%D0%9Cicros%D0%BEft.%D0%95ntit%D1%83Fram%D0%B5work%D0%A1%D0%BEr%D0%B5", purl2.toString());
}

@SuppressWarnings("deprecation")
@Test
void invalidPercentEncoding() throws MalformedPackageURLException {
assertThrows(MalformedPackageURLException.class, () -> new PackageURL("pkg:maven/com.google.summit/[email protected]%"));
assertThrows(MalformedPackageURLException.class, () -> new PackageURL("pkg:maven/com.google.summit/[email protected]%0"));
PackageURL purl = new PackageURL("pkg:maven/com.google.summit/[email protected]");
Throwable t1 = assertThrows(ValidationException.class, () -> purl.uriDecode("%"));
assertEquals("Incomplete percent encoding at offset 0 with value '%'", t1.getMessage());
Throwable t2 = assertThrows(ValidationException.class, () -> purl.uriDecode("a%0"));
assertEquals("Incomplete percent encoding at offset 1 with value '%0'", t2.getMessage());
Throwable t3 = assertThrows(ValidationException.class, () -> purl.uriDecode("aaaa%%0A"));
assertEquals("Invalid percent encoding char 1 at offset 5 with value '%'", t3.getMessage());
Throwable t4 = assertThrows(ValidationException.class, () -> purl.uriDecode("%0G"));
assertEquals("Invalid percent encoding char 2 at offset 2 with value 'G'", t4.getMessage());
}

@Test
void constructorParsing() throws Exception {
for (int i = 0; i < json.length(); i++) {
Expand Down