17
17
*/
18
18
public class CsvEncoder
19
19
{
20
-
21
- /*
22
- * default set of escaped characters.
23
- */
20
+ // Default set of escaped characters (none)
24
21
private static final int [] sOutputEscapes = new int [0 ];
25
22
26
23
final protected static char [] HEX_CHARS = CharTypes .copyHexChars ();
27
24
28
- /* As an optimization we try coalescing short writes into
25
+ /**
26
+ * As an optimization we try coalescing short writes into
29
27
* buffer; but pass longer directly.
30
28
*/
31
29
final protected static int SHORT_WRITE = 32 ;
32
30
33
- /* Also: only do check for optional quotes for short
31
+ /**
32
+ * Also: only do check for optional quotes for short
34
33
* values; longer ones will always be quoted.
35
34
*/
36
35
final protected static int MAX_QUOTE_CHECK = 24 ;
@@ -221,9 +220,9 @@ public CsvEncoder(IOContext ctxt, int csvFeatures, Writer out, CsvSchema schema)
221
220
_cfgMaxQuoteCheckChars = MAX_QUOTE_CHECK ;
222
221
223
222
_cfgQuoteCharEscapeChar = _getQuoteCharEscapeChar (
224
- _cfgEscapeQuoteCharWithEscapeChar ,
225
- _cfgQuoteCharacter ,
226
- _cfgEscapeCharacter
223
+ _cfgEscapeQuoteCharWithEscapeChar ,
224
+ _cfgQuoteCharacter ,
225
+ _cfgEscapeCharacter
227
226
);
228
227
229
228
_cfgControlCharEscapeChar = _cfgEscapeCharacter > 0 ? (char ) _cfgEscapeCharacter : '\\' ;
@@ -256,31 +255,31 @@ public CsvEncoder(CsvEncoder base, CsvSchema newSchema)
256
255
_cfgMinSafeChar = _calcSafeChar ();
257
256
_columnCount = newSchema .size ();
258
257
_cfgQuoteCharEscapeChar = _getQuoteCharEscapeChar (
259
- base ._cfgEscapeQuoteCharWithEscapeChar ,
260
- newSchema .getQuoteChar (),
261
- newSchema .getEscapeChar ()
258
+ base ._cfgEscapeQuoteCharWithEscapeChar ,
259
+ newSchema .getQuoteChar (),
260
+ newSchema .getEscapeChar ()
262
261
);
263
262
_cfgControlCharEscapeChar = _cfgEscapeCharacter > 0 ? (char ) _cfgEscapeCharacter : '\\' ;
264
263
}
265
264
266
265
private final char _getQuoteCharEscapeChar (
267
- final boolean escapeQuoteCharWithEscapeChar ,
268
- final int quoteCharacter ,
269
- final int escapeCharacter ) {
270
-
271
- final char quoteEscapeChar ;
272
-
273
- if (_cfgEscapeQuoteCharWithEscapeChar && _cfgEscapeCharacter > 0 ) {
274
- quoteEscapeChar = (char ) _cfgEscapeCharacter ;
275
- }
276
- else if (_cfgQuoteCharacter > 0 ) {
277
- quoteEscapeChar = (char ) _cfgQuoteCharacter ;
278
- }
279
- else {
280
- quoteEscapeChar = '\\' ;
281
- }
282
-
283
- return quoteEscapeChar ;
266
+ final boolean escapeQuoteCharWithEscapeChar ,
267
+ final int quoteCharacter ,
268
+ final int escapeCharacter )
269
+ {
270
+ final char quoteEscapeChar ;
271
+
272
+ if (_cfgEscapeQuoteCharWithEscapeChar && _cfgEscapeCharacter > 0 ) {
273
+ quoteEscapeChar = (char ) _cfgEscapeCharacter ;
274
+ }
275
+ else if (_cfgQuoteCharacter > 0 ) {
276
+ quoteEscapeChar = (char ) _cfgQuoteCharacter ;
277
+ }
278
+ else {
279
+ quoteEscapeChar = '\\' ;
280
+ }
281
+
282
+ return quoteEscapeChar ;
284
283
}
285
284
286
285
private final int _calcSafeChar ()
@@ -289,6 +288,8 @@ private final int _calcSafeChar()
289
288
int min = Math .max (_cfgColumnSeparator , _cfgQuoteCharacter );
290
289
// 06-Nov-2015, tatu: We will NOT apply escape character, because it usually
291
290
// has higher ascii value (with backslash); better handle separately.
291
+ // 23-Sep-2020, tatu: Should not actually need to consider anything but the
292
+ // first character when checking... but leaving rest for now
292
293
for (int i = 0 ; i < _cfgLineSeparatorLength ; ++i ) {
293
294
min = Math .max (min , _cfgLineSeparator [i ]);
294
295
}
@@ -312,7 +313,7 @@ public CsvEncoder overrideFormatFeatures(int feat) {
312
313
return this ;
313
314
}
314
315
315
- public CsvEncoder setOutputEscapes (int [] esc ) {
316
+ public CsvEncoder setOutputEscapes (int [] esc ) {
316
317
_outputEscapes = (esc != null ) ? esc : sOutputEscapes ;
317
318
return this ;
318
319
}
@@ -934,6 +935,7 @@ private final void _writeLongQuotedAndEscaped(String text, char esc) throws IOEx
934
935
final int len = text .length ();
935
936
// NOTE: caller should guarantee quote char is valid (not -1) at this point:
936
937
final char q = (char ) _cfgQuoteCharacter ;
938
+ // 23-Sep-2020, tatu: Why was this defined but not used? Commented out in 2.11.3
937
939
// final char quoteEscape = _cfgEscapeQuoteCharWithEscapeChar ? esc : q;
938
940
for (int i = 0 ; i < len ; ++i ) {
939
941
if (_outputTail >= _outputEnd ) {
@@ -1063,7 +1065,7 @@ protected final boolean _needsQuotingLoose(String value, int esc)
1063
1065
}
1064
1066
return false ;
1065
1067
}
1066
-
1068
+
1067
1069
/**
1068
1070
* @since 2.4
1069
1071
*/
@@ -1073,12 +1075,16 @@ protected boolean _needsQuotingStrict(String value)
1073
1075
1074
1076
final int [] escCodes = _outputEscapes ;
1075
1077
final int escLen = escCodes .length ;
1078
+ // 23-Sep-2020, tatu: [dataformats-text#217] Must also ensure line separator
1079
+ // leads to quoting
1080
+ final int lfFirst = (_cfgLineSeparatorLength == 0 ) ? 0 : _cfgLineSeparator [0 ];
1076
1081
1077
1082
for (int i = 0 , len = value .length (); i < len ; ++i ) {
1078
1083
int c = value .charAt (i );
1079
1084
if (c < minSafe ) {
1080
1085
if (c == _cfgColumnSeparator || c == _cfgQuoteCharacter
1081
1086
|| (c < escLen && escCodes [c ] != 0 )
1087
+ || (c == lfFirst )
1082
1088
// 31-Dec-2014, tatu: Comment lines start with # so quote if starts with #
1083
1089
|| (c == '#' && i == 0 )) {
1084
1090
return true ;
@@ -1094,15 +1100,18 @@ protected boolean _needsQuotingStrict(String value)
1094
1100
protected boolean _needsQuotingStrict (String value , int esc )
1095
1101
{
1096
1102
final int minSafe = _cfgMinSafeChar ;
1097
-
1098
1103
final int [] escCodes = _outputEscapes ;
1099
1104
final int escLen = escCodes .length ;
1105
+ // 23-Sep-2020, tatu: [dataformats-text#217] Must also ensure line separator
1106
+ // leads to quoting
1107
+ final int lfFirst = (_cfgLineSeparatorLength == 0 ) ? 0 : _cfgLineSeparator [0 ];
1100
1108
1101
1109
for (int i = 0 , len = value .length (); i < len ; ++i ) {
1102
1110
int c = value .charAt (i );
1103
1111
if (c < minSafe ) {
1104
1112
if (c == _cfgColumnSeparator || c == _cfgQuoteCharacter
1105
1113
|| (c < escLen && escCodes [c ] != 0 )
1114
+ || (c == lfFirst )
1106
1115
// 31-Dec-2014, tatu: Comment lines start with # so quote if starts with #
1107
1116
|| (c == '#' && i == 0 )) {
1108
1117
return true ;
0 commit comments