Skip to content

Commit ecd945a

Browse files
bassosimoneMurphy-OrangeMud
authored andcommitted
feat(model): introduce ArchivalBinaryData (ooni#1321)
There are cases where we know we have binary data in input and we want the output to be binary data using the dictionary encoding like `{"format":"base64","data":"..."}`. Such cases are, for example, DNS messages bodies. There is no need for us to pass through the MaybeArchivalBinaryData in such cases. Additionally, this makes MaybeArchivalBinaryData a bit more complex than it probably needs to be. What's more, ArchivalBinaryData is for when you do not require to scrub the data. I want to introduce new data types that automatically perform scrubbing when they're used. But this puts even more pressure on MaybeArchivalBinaryData, and hence this commit, to start differentiating between: 1. always binary data vs maybe binary data 2. no scrubbing required vs scrubbing required The rationale for doing this set of changes has been explained in ooni#1319. The reference issue is ooni/probe#2531. For now, this commit just adds the new type and tests for it without using the type, which we'll do later once we have added better marshal/unmarshal testing for the interested types.
1 parent 65d1a18 commit ecd945a

File tree

2 files changed

+330
-7
lines changed

2 files changed

+330
-7
lines changed

internal/model/archival.go

Lines changed: 74 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,5 @@
11
package model
22

3-
import (
4-
"encoding/base64"
5-
"encoding/json"
6-
"errors"
7-
"unicode/utf8"
8-
)
9-
103
//
114
// Archival format for individual measurement results
125
// such as TCP connect, TLS handshake, DNS lookup.
@@ -17,6 +10,15 @@ import (
1710
// See https://github.com/ooni/spec/tree/master/data-formats.
1811
//
1912

13+
import (
14+
"bytes"
15+
"encoding/base64"
16+
"encoding/json"
17+
"errors"
18+
"fmt"
19+
"unicode/utf8"
20+
)
21+
2022
//
2123
// Data format extension specification
2224
//
@@ -59,6 +61,66 @@ var (
5961
// Base types
6062
//
6163

64+
// ArchivalBinaryData is a wrapper for bytes that serializes the enclosed
65+
// data using the specific ooni/spec data format for binary data.
66+
//
67+
// See https://github.com/ooni/spec/blob/master/data-formats/df-001-httpt.md#maybebinarydata.
68+
type ArchivalBinaryData struct {
69+
Value []byte
70+
}
71+
72+
// archivalBinaryDataRepr is the wire representation of binary data according to
73+
// https://github.com/ooni/spec/blob/master/data-formats/df-001-httpt.md#maybebinarydata.
74+
type archivalBinaryDataRepr struct {
75+
Data []byte `json:"data"`
76+
Format string `json:"format"`
77+
}
78+
79+
var (
80+
_ json.Marshaler = ArchivalBinaryData{}
81+
_ json.Unmarshaler = &ArchivalBinaryData{}
82+
)
83+
84+
// MarshalJSON implements json.Marshaler.
85+
func (value ArchivalBinaryData) MarshalJSON() ([]byte, error) {
86+
// special case: we need to marshal the empty data as the null value
87+
if len(value.Value) <= 0 {
88+
return json.Marshal(nil)
89+
}
90+
91+
// construct and serialize the OONI representation
92+
repr := &archivalBinaryDataRepr{Format: "base64", Data: value.Value}
93+
return json.Marshal(repr)
94+
}
95+
96+
// ErrInvalidBinaryDataFormat is the format returned when marshaling and
97+
// unmarshaling binary data and the value of "format" is unknown.
98+
var ErrInvalidBinaryDataFormat = errors.New("model: invalid binary data format")
99+
100+
// UnmarshalJSON implements json.Unmarshaler.
101+
func (value *ArchivalBinaryData) UnmarshalJSON(raw []byte) error {
102+
// handle the case where input is a literal null
103+
if bytes.Equal(raw, []byte("null")) {
104+
value.Value = nil
105+
return nil
106+
}
107+
108+
// attempt to unmarshal into the archival representation
109+
var repr archivalBinaryDataRepr
110+
if err := json.Unmarshal(raw, &repr); err != nil {
111+
return err
112+
}
113+
114+
// make sure the data format is "base64"
115+
if repr.Format != "base64" {
116+
return fmt.Errorf("%w: '%s'", ErrInvalidBinaryDataFormat, repr.Format)
117+
}
118+
119+
// we're good because Go uses base64 for []byte automatically
120+
value.Value = repr.Data
121+
return nil
122+
}
123+
62124
// ArchivalMaybeBinaryData is a possibly binary string. We use this helper class
63125
// to define a custom JSON encoder that allows us to choose the proper
64126
// representation depending on whether the Value field is valid UTF-8 or not.
@@ -72,9 +134,12 @@ type ArchivalMaybeBinaryData struct {
72134
// says that UTF-8 content is represented as string and non-UTF-8 content is
73135
// instead represented using `{"format":"base64","data":"..."}`.
74136
func (hb ArchivalMaybeBinaryData) MarshalJSON() ([]byte, error) {
137+
// if we can serialize as UTF-8 string, do that
75138
if utf8.ValidString(hb.Value) {
76139
return json.Marshal(hb.Value)
77140
}
141+
142+
// otherwise fallback to the ooni/spec representation for binary data
78143
er := make(map[string]string)
79144
er["format"] = "base64"
80145
er["data"] = base64.StdEncoding.EncodeToString([]byte(hb.Value))
@@ -242,6 +307,8 @@ type ArchivalHTTPResponse struct {
242307
// ArchivalHTTPBody is an HTTP body. As an implementation note, this type must
243308
// be an alias for the MaybeBinaryValue type, otherwise the specific serialisation
244309
// mechanism implemented by MaybeBinaryValue is not working.
310+
//
311+
// QUIRK: it's quite fragile we must use an alias here--more robust solution?
245312
type ArchivalHTTPBody = ArchivalMaybeBinaryData
246313

247314
// ArchivalHTTPHeader is a single HTTP header.

internal/model/archival_test.go

Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
package model_test
22

33
import (
4+
"encoding/json"
5+
"errors"
46
"testing"
57

68
"github.com/google/go-cmp/cmp"
9+
"github.com/google/go-cmp/cmp/cmpopts"
710
"github.com/ooni/probe-cli/v3/internal/model"
811
"github.com/ooni/probe-cli/v3/internal/testingx"
912
)
@@ -37,6 +40,259 @@ var archivalBinaryInput = []uint8{
3740
// we use this value below to test we can handle binary data
3841
var archivalEncodedBinaryInput = []byte(`{"data":"V+V5+6a7DbzOvaeguqR4eBJZ7mg5pAeYxT68Vcv+NDx+G1qzIp3BLW7KW/EQJUceROItYAjqsArMBUig9Xg48Ns/nZ8lb4kAlpOvQ6xNyawT2yK+en3ZJKJSadiJwdFXqgQrotixGfbVETm7gM+G+V+djKv1xXQkOqLUQE7XEB8=","format":"base64"}`)
3942

43+
func TestArchivalBinaryData(t *testing.T) {
44+
// This test verifies that we correctly serialize binary data to JSON by
45+
// producing null | {"format":"base64","data":"<base64>"}
46+
t.Run("MarshalJSON", func(t *testing.T) {
47+
// testcase is a test case defined by this function
48+
type testcase struct {
49+
// name is the name of the test case
50+
name string
51+
52+
// input is the binary input
53+
input model.ArchivalBinaryData
54+
55+
// expectErr is the error we expect to see or nil
56+
expectErr error
57+
58+
// expectData is the data we expect to see
59+
expectData []byte
60+
}
61+
62+
cases := []testcase{{
63+
name: "with nil .Value",
64+
input: model.ArchivalBinaryData{Value: nil},
65+
expectErr: nil,
66+
expectData: []byte("null"),
67+
}, {
68+
name: "with zero length .Value",
69+
input: model.ArchivalBinaryData{Value: []byte{}},
70+
expectErr: nil,
71+
expectData: []byte("null"),
72+
}, {
73+
name: "with .Value being a simple binary string",
74+
input: model.ArchivalBinaryData{Value: []byte("Elliot")},
75+
expectErr: nil,
76+
expectData: []byte(`{"data":"RWxsaW90","format":"base64"}`),
77+
}, {
78+
name: "with .Value being a long binary string",
79+
input: model.ArchivalBinaryData{Value: archivalBinaryInput},
80+
expectErr: nil,
81+
expectData: archivalEncodedBinaryInput,
82+
}}
83+
84+
for _, tc := range cases {
85+
t.Run(tc.name, func(t *testing.T) {
86+
// serialize to JSON
87+
output, err := json.Marshal(tc.input)
88+
89+
t.Log("got this error", err)
90+
t.Log("got this binary data", output)
91+
t.Logf("converted to string: %s", string(output))
92+
93+
// handle errors
94+
switch {
95+
case err == nil && tc.expectErr != nil:
96+
t.Fatal("expected", tc.expectErr, "got", err)
97+
98+
case err != nil && tc.expectErr == nil:
99+
t.Fatal("expected", tc.expectErr, "got", err)
100+
101+
case err != nil && tc.expectErr != nil:
102+
if err.Error() != tc.expectErr.Error() {
103+
t.Fatal("expected", tc.expectErr, "got", err)
104+
}
105+
106+
case err == nil && tc.expectErr == nil:
107+
// all good--fallthrough
108+
}
109+
110+
if diff := cmp.Diff(tc.expectData, output); diff != "" {
111+
t.Fatal(diff)
112+
}
113+
})
114+
}
115+
})
116+
117+
// This test verifies that we correctly parse binary data to JSON by
118+
// reading from null | {"format":"base64","data":"<base64>"}
119+
t.Run("UnmarshalJSON", func(t *testing.T) {
120+
// testcase is a test case defined by this function
121+
type testcase struct {
122+
// name is the name of the test case
123+
name string
124+
125+
// input is the binary input
126+
input []byte
127+
128+
// expectErr is the error we expect to see or nil
129+
expectErr error
130+
131+
// expectData is the data we expect to see
132+
expectData model.ArchivalBinaryData
133+
}
134+
135+
cases := []testcase{{
136+
name: "with nil input array",
137+
input: nil,
138+
expectErr: errors.New("unexpected end of JSON input"),
139+
expectData: model.ArchivalBinaryData{Value: nil},
140+
}, {
141+
name: "with zero-length input array",
142+
input: []byte{},
143+
expectErr: errors.New("unexpected end of JSON input"),
144+
expectData: model.ArchivalBinaryData{Value: nil},
145+
}, {
146+
name: "with binary input that is not a complete JSON",
147+
input: []byte("{"),
148+
expectErr: errors.New("unexpected end of JSON input"),
149+
expectData: model.ArchivalBinaryData{Value: nil},
150+
}, {
151+
name: "with ~random binary data as input",
152+
input: archivalBinaryInput,
153+
expectErr: errors.New("invalid character 'W' looking for beginning of value"),
154+
expectData: model.ArchivalBinaryData{},
155+
}, {
156+
name: "with valid JSON of the wrong type (array)",
157+
input: []byte("[]"),
158+
expectErr: errors.New("json: cannot unmarshal array into Go value of type model.archivalBinaryDataRepr"),
159+
expectData: model.ArchivalBinaryData{},
160+
}, {
161+
name: "with valid JSON of the wrong type (number)",
162+
input: []byte("1.17"),
163+
expectErr: errors.New("json: cannot unmarshal number into Go value of type model.archivalBinaryDataRepr"),
164+
expectData: model.ArchivalBinaryData{},
165+
}, {
166+
name: "with input being the liternal null",
167+
input: []byte(`null`),
168+
expectErr: nil,
169+
expectData: model.ArchivalBinaryData{Value: nil},
170+
}, {
171+
name: "with empty JSON object",
172+
input: []byte("{}"),
173+
expectErr: errors.New("model: invalid binary data format: ''"),
174+
expectData: model.ArchivalBinaryData{},
175+
}, {
176+
name: "with correct data model but invalid format",
177+
input: []byte(`{"data":"","format":"antani"}`),
178+
expectErr: errors.New("model: invalid binary data format: 'antani'"),
179+
expectData: model.ArchivalBinaryData{},
180+
}, {
181+
name: "with correct data model and format but invalid base64 string",
182+
input: []byte(`{"data":"x","format":"base64"}`),
183+
expectErr: errors.New("illegal base64 data at input byte 0"),
184+
expectData: model.ArchivalBinaryData{},
185+
}, {
186+
name: "with correct data model and format but empty base64 string",
187+
input: []byte(`{"data":"","format":"base64"}`),
188+
expectErr: nil,
189+
expectData: model.ArchivalBinaryData{Value: []byte{}},
190+
}, {
191+
name: "with the encoding of a simple binary string",
192+
input: []byte(`{"data":"RWxsaW90","format":"base64"}`),
193+
expectErr: nil,
194+
expectData: model.ArchivalBinaryData{Value: []byte("Elliot")},
195+
}, {
196+
name: "with the encoding of a complex binary string",
197+
input: archivalEncodedBinaryInput,
198+
expectErr: nil,
199+
expectData: model.ArchivalBinaryData{Value: archivalBinaryInput},
200+
}}
201+
202+
for _, tc := range cases {
203+
t.Run(tc.name, func(t *testing.T) {
204+
// unmarshal the raw input into an ArchivalBinaryData type
205+
var abd model.ArchivalBinaryData
206+
err := json.Unmarshal(tc.input, &abd)
207+
208+
t.Log("got this error", err)
209+
t.Log("got this .Value filed", abd.Value)
210+
t.Logf("converted to string: %s", string(abd.Value))
211+
212+
// handle errors
213+
switch {
214+
case err == nil && tc.expectErr != nil:
215+
t.Fatal("expected", tc.expectErr, "got", err)
216+
217+
case err != nil && tc.expectErr == nil:
218+
t.Fatal("expected", tc.expectErr, "got", err)
219+
220+
case err != nil && tc.expectErr != nil:
221+
if err.Error() != tc.expectErr.Error() {
222+
t.Fatal("expected", tc.expectErr, "got", err)
223+
}
224+
225+
case err == nil && tc.expectErr == nil:
226+
// all good--fallthrough
227+
}
228+
229+
if diff := cmp.Diff(tc.expectData, abd); diff != "" {
230+
t.Fatal(diff)
231+
}
232+
})
233+
}
234+
})
235+
236+
// This test verifies that we correctly round trip through JSON
237+
t.Run("MarshalJSON then UnmarshalJSON", func(t *testing.T) {
238+
// testcase is a test case defined by this function
239+
type testcase struct {
240+
// name is the name of the test case
241+
name string
242+
243+
// input is the binary input
244+
input model.ArchivalBinaryData
245+
}
246+
247+
cases := []testcase{{
248+
name: "with nil .Value",
249+
input: model.ArchivalBinaryData{Value: nil},
250+
}, {
251+
name: "with zero length .Value",
252+
input: model.ArchivalBinaryData{Value: []byte{}},
253+
}, {
254+
name: "with .Value being a simple binary string",
255+
input: model.ArchivalBinaryData{Value: []byte("Elliot")},
256+
}, {
257+
name: "with .Value being a long binary string",
258+
input: model.ArchivalBinaryData{Value: archivalBinaryInput},
259+
}}
260+
261+
for _, tc := range cases {
262+
t.Run(tc.name, func(t *testing.T) {
263+
// serialize to JSON
264+
output, err := json.Marshal(tc.input)
265+
266+
t.Log("got this error", err)
267+
t.Log("got this binary data", output)
268+
t.Logf("converted to string: %s", string(output))
269+
270+
if err != nil {
271+
t.Fatal(err)
272+
}
273+
274+
// parse from JSON
275+
var abc model.ArchivalBinaryData
276+
if err := json.Unmarshal(output, &abc); err != nil {
277+
t.Fatal(err)
278+
}
279+
280+
// make sure we round tripped
281+
//
282+
// Note: the round trip is not perfect because the zero length value,
283+
// which originally is []byte{}, unmarshals to a nil value.
284+
//
285+
// Because the two are ~equivalent in Go most intents and purposes
286+
// and the wire representation does not change, this is OK(TM)
287+
diffOptions := []cmp.Option{cmpopts.EquateEmpty()}
288+
if diff := cmp.Diff(tc.input, abc, diffOptions...); diff != "" {
289+
t.Fatal(diff)
290+
}
291+
})
292+
}
293+
})
294+
}
295+
40296
func TestMaybeBinaryValue(t *testing.T) {
41297
t.Run("MarshalJSON", func(t *testing.T) {
42298
tests := []struct {

0 commit comments

Comments
 (0)