Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: rare decoding error on UTF-8 documents #1579

Merged
merged 1 commit into from
Dec 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions src/main/java/com/adobe/epubcheck/xml/XMLEncodingSniffer.java
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,28 @@ public static String sniffEncoding(InputStream in)
return encoding.toUpperCase(Locale.ROOT);
}

/**
 * Checks whether the given input stream starts with a UTF-8 byte order mark.
 * <p>
 * The stream is marked before up to three bytes are read, and reset
 * afterwards, so that a successful call leaves the stream position unchanged.
 * The stream is therefore expected to support <code>mark</code>/<code>reset</code>
 * (NOTE(review): callers are presumed to pass a buffered stream — confirm).
 * </p>
 *
 * @param in
 *          an input stream, positioned at the start of the document
 * @return <code>true</code> if and only if the three bytes at the current
 *         position match <code>UTF8_MAGIC</code>; <code>false</code> if they
 *         do not, or if fewer than three bytes are available
 * @throws IOException
 *           if reading from or resetting the stream fails
 */
public static boolean hasUTF8BOM(InputStream in)
  throws IOException
{
  byte[] buffer = new byte[3];
  in.mark(buffer.length);
  // A single read() call is allowed to return fewer bytes than requested even
  // when more are available, so keep reading until the buffer is full or the
  // end of the stream is reached.
  int len = 0;
  while (len < buffer.length)
  {
    int count = in.read(buffer, len, buffer.length - len);
    if (count < 0)
    {
      break;
    }
    len += count;
  }
  in.reset();
  return (len == buffer.length && matchesMagic(UTF8_MAGIC, buffer));
}

// Private constructor, so the class cannot be instantiated from outside.
private XMLEncodingSniffer()
{
// Not instantiable.
}

}
30 changes: 23 additions & 7 deletions src/main/java/com/adobe/epubcheck/xml/XMLParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
Expand Down Expand Up @@ -136,8 +138,8 @@ public void process()
return;
}

// Check encoding
// If the result is null, the XML parser must parse it as UTF-8
// Create the InputSource based on the encoding
final InputSource source;
String encoding = XMLEncodingSniffer.sniffEncoding(buffered);
if (encoding != null && !encoding.equals("UTF-8"))
{
Expand All @@ -158,13 +160,27 @@ public void process()
{
report.message(MessageId.RSC_028, EPUBLocation.of(context), encoding);
}

// We do not set the source encoding name, but instead let the SAXParser
// apply its own encoding-sniffing logic, as it can report useful errors
// (for instance a mismatch between a BOM and the XML declaration)
source = new InputSource(buffered);

}
else
{
// Decode the UTF-8 stream with java.io instead of letting Xerces
// do it, to work around Xerces issue #1668
// (see https://issues.apache.org/jira/browse/XERCESJ-1668),
// skipping any UTF-8 BOM first (disallowed by that constructor)
if (XMLEncodingSniffer.hasUTF8BOM(buffered))
{
buffered.skip(3);
}
source = new InputSource(new InputStreamReader(buffered, StandardCharsets.UTF_8));
}

// Build the input source
// We do not set the source encoding name, but instead let the SAXParser
// apply its own encoding-sniffing logic, as it can report useful errors
// (for instance a mismatch between a BOM and the XML declaration)
InputSource source = new InputSource(buffered);
// Set the source's system ID
source.setSystemId(url.toString());

// Set the error handler
Expand Down
Loading