Skip to content

Commit

Permalink
fix: rare decoding error on UTF-8 documents
Browse files Browse the repository at this point in the history
On rare occasions, decoding UTF-8 documents caused a fatal error RSC-016
(`Invalid byte 2 of 4-byte UTF-8 sequence.`).

This was likely due to a bug in the Xerces XML parser decoding component,
see https://issues.apache.org/jira/browse/XERCESJ-1668

As a workaround, we now read documents using the Java built-in UTF-8
decoder instead of Xerces's own decoder, by creating the SAX parsers'
InputSource from an InputStreamReader instead of the raw InputStream.

Fixes #1548
  • Loading branch information
rdeltour committed Dec 23, 2024
1 parent 270ee85 commit 90e87b2
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 7 deletions.
20 changes: 20 additions & 0 deletions src/main/java/com/adobe/epubcheck/xml/XMLEncodingSniffer.java
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,28 @@ public static String sniffEncoding(InputStream in)
return encoding.toUpperCase(Locale.ROOT);
}

/**
 * Checks if the parameter input stream has a UTF-8 byte order mark.
 * <p>
 * The bytes are inspected between a <code>mark</code> and a
 * <code>reset</code> call, so that the stream is repositioned at its initial
 * position when this method returns normally. The stream must therefore
 * support mark/reset.
 * </p>
 *
 * @param in
 *          an input stream supporting mark/reset
 * @return <code>true</code> if and only if the input stream starts with a
 *         UTF-8 BOM
 * @throws IOException
 *           if the stream does not support mark/reset, or if an I/O error
 *           occurs while reading or resetting the stream
 */
public static boolean hasUTF8BOM(InputStream in)
  throws IOException
{
  // Fail before consuming anything: without mark/reset support the bytes
  // read below could not be pushed back.
  if (!in.markSupported())
  {
    throw new IOException("mark/reset not supported");
  }

  byte[] buffer = new byte[3];
  in.mark(buffer.length);

  // A single read() call is allowed to return fewer bytes than requested
  // even when more are pending, so keep reading until the buffer is full
  // or the end of the stream is reached.
  int len = 0;
  while (len < buffer.length)
  {
    int count = in.read(buffer, len, buffer.length - len);
    if (count <= 0)
    {
      // End of stream (or a stream that makes no progress): stop here.
      break;
    }
    len += count;
  }
  in.reset();

  return (len == buffer.length && matchesMagic(UTF8_MAGIC, buffer));
}

/**
 * Private constructor, preventing instantiation from outside this class.
 */
private XMLEncodingSniffer()
{
// Not instantiable.
}

}
30 changes: 23 additions & 7 deletions src/main/java/com/adobe/epubcheck/xml/XMLParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
Expand Down Expand Up @@ -136,8 +138,8 @@ public void process()
return;
}

// Check encoding
// If the result is null, the XML parser must parse it as UTF-8
// Create the InputSource based on the encoding
final InputSource source;
String encoding = XMLEncodingSniffer.sniffEncoding(buffered);
if (encoding != null && !encoding.equals("UTF-8"))
{
Expand All @@ -158,13 +160,27 @@ public void process()
{
report.message(MessageId.RSC_028, EPUBLocation.of(context), encoding);
}

// We do not set the source encoding name, but instead let the SAXParser
// apply its own encoding-sniffing logic, as it can report useful errors
// (for instance a mismatch between a BOM and the XML declaration)
source = new InputSource(buffered);

}
else
{
// Decode the UTF-8 stream with java.io instead of letting Xerces
// do it, to work around Xerces issue #1668
// (see https://issues.apache.org/jira/browse/XERCESJ-1668),
// skipping any UTF-8 BOM first (disallowed by that constructor)
if (XMLEncodingSniffer.hasUTF8BOM(buffered))
{
buffered.skip(3);
}
source = new InputSource(new InputStreamReader(buffered, StandardCharsets.UTF_8));
}

// Build the input source
// We do not set the source encoding name, but instead let the SAXParser
// apply its own encoding-sniffing logic, as it can report useful errors
// (for instance a mismatch between a BOM and the XML declaration)
InputSource source = new InputSource(buffered);
// Set the source's system ID
source.setSystemId(url.toString());

// Set the error handler
Expand Down

0 comments on commit 90e87b2

Please sign in to comment.