88 "encoding/xml"
99 "fmt"
1010 "io"
11- "strings"
1211 "unicode/utf8"
1312
1413 "miniflux.app/v2/internal/reader/encoding"
@@ -23,34 +22,36 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
2322 io .Copy (buffer , data )
2423
2524 enc := getEncoding (buffer .Bytes ())
26- if enc == "" || strings .EqualFold (enc , "utf-8" ) {
27- // filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
25+ if enc == nil || bytes .EqualFold (enc , [] byte ( "utf-8" ) ) {
26+ // filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
2827 filteredBytes := filterValidXMLChars (buffer .Bytes ())
2928 decoder = xml .NewDecoder (bytes .NewReader (filteredBytes ))
3029 } else {
31- // filter invalid chars later within decoder.CharsetReader
3230 data .Seek (0 , io .SeekStart )
31+ // invalid characters will be filtered later via decoder.CharsetReader
3332 decoder = xml .NewDecoder (data )
3433 }
3534
35+ decoder .CharsetReader = charsetReaderFilterInvalidUtf8
3636 decoder .Entity = xml .HTMLEntity
3737 decoder .Strict = false
38- decoder .CharsetReader = func (charset string , input io.Reader ) (io.Reader , error ) {
39- utf8Reader , err := encoding .CharsetReader (charset , input )
40- if err != nil {
41- return nil , err
42- }
43- rawData , err := io .ReadAll (utf8Reader )
44- if err != nil {
45- return nil , fmt .Errorf ("encoding: unable to read data: %w" , err )
46- }
47- filteredBytes := filterValidXMLChars (rawData )
48- return bytes .NewReader (filteredBytes ), nil
49- }
5038
5139 return decoder
5240}
5341
42+ func charsetReaderFilterInvalidUtf8 (charset string , input io.Reader ) (io.Reader , error ) {
43+ utf8Reader , err := encoding .CharsetReader (charset , input )
44+ if err != nil {
45+ return nil , err
46+ }
47+ rawData , err := io .ReadAll (utf8Reader )
48+ if err != nil {
49+ return nil , fmt .Errorf ("encoding: unable to read data: %w" , err )
50+ }
51+ filteredBytes := filterValidXMLChars (rawData )
52+ return bytes .NewReader (filteredBytes ), nil
53+ }
54+
5455// filterValidXMLChars filters invalid XML characters in place.
5556// This function is inspired from bytes.Map
5657func filterValidXMLChars (s []byte ) []byte {
@@ -89,23 +90,23 @@ func filterValidXMLChar(r rune) rune {
8990}
9091
// getEncoding returns the value of the encoding pseudo-attribute found in the
// XML declaration (`<?xml ... ?>`) of b, or nil when there is no declaration,
// the declaration carries no encoding, or the value is not properly quoted.
//
// The parsing is adapted from encoding/xml's procInst, working on []byte
// instead of string. It is intentionally simple rather than exact, but the
// search is limited to the declaration itself so that an `encoding="..."`
// sequence appearing later in the document body is not mistaken for the
// document encoding.
//
// The returned slice aliases b; it is not a copy.
func getEncoding(b []byte) []byte {
	const (
		declOpen  = "<?xml"
		declClose = "?>"
		attr      = "encoding="
	)

	// Isolate the text between "<?xml" and the first following "?>".
	start := bytes.Index(b, []byte(declOpen))
	if start == -1 {
		return nil
	}
	decl := b[start+len(declOpen):]
	end := bytes.Index(decl, []byte(declClose))
	if end == -1 {
		return nil
	}
	decl = decl[:end]

	idx := bytes.Index(decl, []byte(attr))
	if idx == -1 {
		return nil
	}
	v := decl[idx+len(attr):]
	if len(v) == 0 {
		return nil
	}

	// The value must be wrapped in matching single or double quotes.
	quote := v[0]
	if quote != '\'' && quote != '"' {
		return nil
	}
	idx = bytes.IndexByte(v[1:], quote)
	if idx == -1 {
		return nil
	}
	return v[1 : idx+1]
}
0 commit comments