Skip to content

Commit 7a9a509

Browse files
ODeux2 and WerWolv authored
patterns: Add Python Pickle Pattern (#446)
* Add pickle pattern file * Add test file * Update README.md --------- Co-authored-by: Nik <[email protected]>
1 parent 0e67ee1 commit 7a9a509

File tree

3 files changed

+358
-0
lines changed

3 files changed

+358
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ Everything will immediately show up in ImHex's Content Store and gets bundled wi
141141
| PP | | [`patterns/selinuxpp.hexpat`](patterns/selinuxpp.pat) | SE Linux package |
142142
| PFS0 | | [`patterns/pfs0.hexpat`](patterns/pfs0.hexpat) | Nintendo Switch PFS0 archive (NSP files) |
143143
| PF | | [`patterns/pf.hexpat`](patterns/pf.hexpat) | Microsoft uncompressed prefetch files (.pf) |
144+
| Pickle | | [`patterns/pickle.hexpat`](patterns/pickle.hexpat) | Python Pickle Protocol |
144145
| PIF | `image/pif` | [`patterns/pif.hexpat`](patterns/pif.hexpat) | PIF Image Format |
145146
| PKM | | [`patterns/pkm.hexpat`](patterns/pkm.hexpat) | PKM texture format |
146147
| PNG | `image/png` | [`patterns/png.hexpat`](patterns/png.hexpat) | PNG image files |

patterns/pickle.hexpat

Lines changed: 357 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,357 @@
1+
2+
/*
3+
References:
4+
Pickle Source Code:
5+
https://github.com/python/cpython/blob/main/Lib/pickle.py
6+
Pickle Protocol Version Breakdown:
7+
https://docs.python.org/3.13/library/pickle.html#data-stream-format
8+
Pickle OpCode Breakdown:
9+
https://github.com/python/cpython/blob/main/Lib/pickletools.py
10+
*/
11+
12+
#pragma author ODeux
13+
#pragma description Python Binary Object Serialization Protocol
14+
15+
#pragma endian little
16+
17+
import std.mem;
18+
import std.string;
19+
20+
#pragma array_limit 524288
21+
22+
/*
    Abort evaluation with a "TODO" error that is prefixed with the current
    cursor offset ($) printed as 8 hex digits.
    message: text appended to the error; it is concatenated into the format string.
*/
fn todo(auto message){
    str layout = "@0x{:08X} TODO: " + message;
    std::error(std::format(layout, $));
};
25+
26+
/*
    Formatter/transform for counted UTF-8 text fields.
    s: the raw char array; it is rendered unchanged through std::format.
*/
fn utf8_fmt(auto s){
    str rendered = std::format("{}", s);
    return rendered;
};
/* Attribute list that applies utf8_fmt as both display format and value transform. */
#define UTF8_FMT format("utf8_fmt"), transform("utf8_fmt")
30+
31+
/*
    Formatter/transform for newline-terminated text arguments.
    s: the raw line including its final byte (the '\n' consumed by readline()).
    Returns the line with that final byte removed.
*/
fn utf8_rl_fmt(auto s){
    auto keep = std::string::length(s) - 1;
    return std::format("{}", std::string::substr(s, 0, keep));
};
/* Attribute list that applies utf8_rl_fmt as both display format and value transform. */
#define UTF8_RL_FMT format("utf8_rl_fmt"), transform("utf8_rl_fmt")
36+
37+
/*
    Formatter/transform for the INT opcode argument (newline-terminated text).
    "01\n" and "00\n" are the spellings of True and False (b'I01\n'[1:] and
    b'I00\n'[1:]); anything else is parsed as an integer with base
    auto-detection (parse_int base 0) after dropping the final byte.
*/
fn int_rl_fmt(auto s){
    if(s == "01\n"){
        return true;
    }else if(s == "00\n"){
        return false;
    }
    auto keep = std::string::length(s) - 1;
    str digits = std::string::substr(s, 0, keep);
    return std::string::parse_int(digits, 0);
};
/* Attribute list that applies int_rl_fmt as both display format and value transform. */
#define INT_RL_FMT format("int_rl_fmt"), transform("int_rl_fmt")
44+
45+
/*
    Formatter/transform for the FLOAT opcode argument (newline-terminated text).
    Drops the final byte of the line and parses the rest as a floating point number.
*/
fn float_rl_fmt(auto s){
    auto keep = std::string::length(s) - 1;
    str digits = std::string::substr(s, 0, keep);
    return std::string::parse_float(digits);
};
/* Attribute list that applies float_rl_fmt as both display format and value transform. */
#define FLOAT_RL_FMT format("float_rl_fmt"), transform("float_rl_fmt")
50+
51+
/*
    Formatter/transform for the LONG opcode argument (newline-terminated text).
    Mirrors pickle.py: val = readline()[:-1]; a trailing "L" suffix, if present,
    is removed; the remainder is parsed as an integer with base auto-detection.
*/
fn long_rl_fmt(auto s){
    str digits = std::string::substr(s, 0, std::string::length(s) - 1);
    auto last = std::string::length(digits) - 1;
    /* The emptiness test stays first in the same expression, exactly as before. */
    if(digits != "" && std::string::at(digits, last) == "L")
        digits = std::string::substr(digits, 0, last);
    return std::string::parse_int(digits, 0);
};
/* Attribute list that applies long_rl_fmt as both display format and value transform. */
#define LONG_RL_FMT format("long_rl_fmt"), transform("long_rl_fmt")
58+
59+
/*
    Formatter/transform for newline-terminated ASCII arguments.
    Returns the line without its final byte (the terminator consumed by readline()).
*/
fn ascii_rl_fmt(auto s){
    auto keep = std::string::length(s) - 1;
    return std::string::substr(s, 0, keep);
};
/* Attribute list that applies ascii_rl_fmt as both display format and value transform. */
#define ASCII_RL_FMT format("ascii_rl_fmt"), transform("ascii_rl_fmt")
63+
64+
/*
    Formatter/transform for newline-terminated integer arguments (GET/PUT memo index).
    Drops the final byte of the line and parses the rest as an integer with
    base auto-detection (parse_int base 0).
*/
fn integer_rl_fmt(auto s){
    auto keep = std::string::length(s) - 1;
    str digits = std::string::substr(s, 0, keep);
    return std::string::parse_int(digits, 0);
};
/* Attribute list that applies integer_rl_fmt as both display format and value transform. */
#define INTEGER_RL_FMT format("integer_rl_fmt"), transform("integer_rl_fmt")
69+
70+
/*
    Formatter/transform for the STRING opcode argument (newline-terminated text).
    s: the raw line including its final byte (the '\n' consumed by readline()).
    The argument must be wrapped in a matching pair of quotes ('...' or "...");
    the returned value is the text between the quotes.
    Raises an error when the argument is not quoted.
    Note: escape sequences are not decoded (pickle.py additionally runs
    codecs.escape_decode on the stripped data).
*/
fn string_rl_fmt(auto s){
    /* Drop the line terminator. */
    str new_s = std::string::substr(s, 0, std::string::length(s) - 1);
    auto length = std::string::length(new_s);
    if(length >= 2 && new_s[0] == new_s[length - 1] && (new_s[0] == '\'' || new_s[0] == '"'))
        /* substr takes (pos, count): skipping both quotes leaves length - 2 characters. */
        new_s = std::string::substr(new_s, 1, length - 2);
    else std::error("the STRING opcode argument must be quoted");
    return new_s;
};
/* Attribute list that applies string_rl_fmt as both display format and value transform. */
#define STRING_RL_FMT format("string_rl_fmt"), transform("string_rl_fmt")
79+
80+
/*
    Every pickle opcode, stored as a single byte.
    Names, values and descriptions follow the opcode table in CPython's Lib/pickle.py.
*/
enum OpcodesEnum: u8{
    /* ---- Opcodes listed before the Protocol 2 group ---- */
    MARK             = '(',  /* push the special markobject on the stack */
    STOP             = '.',  /* every pickle ends with STOP */
    POP              = '0',  /* discard the topmost stack item */
    POP_MARK         = '1',  /* discard the stack top through the topmost markobject */
    DUP              = '2',  /* duplicate the top stack item */
    FLOAT            = 'F',  /* push float object; decimal string argument */
    INT              = 'I',  /* push integer or bool; decimal string argument */
    BININT           = 'J',  /* push four-byte signed int */
    BININT1          = 'K',  /* push 1-byte unsigned int */
    LONG             = 'L',  /* push long; decimal string argument */
    BININT2          = 'M',  /* push 2-byte unsigned int */
    NONE             = 'N',  /* push None */
    PERSID           = 'P',  /* push persistent object; id is taken from string arg */
    BINPERSID        = 'Q',  /* push persistent object; id is taken from stack */
    REDUCE           = 'R',  /* apply callable to argtuple, both on stack */
    STRING           = 'S',  /* push string; NL-terminated string argument */
    BINSTRING        = 'T',  /* push string; counted binary string argument */
    SHORT_BINSTRING  = 'U',  /* push string; counted binary string argument < 256 bytes */
    UNICODE          = 'V',  /* push Unicode string; raw-unicode-escaped argument */
    BINUNICODE       = 'X',  /* push Unicode string; counted UTF-8 string argument */
    APPEND           = 'a',  /* append stack top to the list below it */
    BUILD            = 'b',  /* call __setstate__ or __dict__.update() */
    GLOBAL           = 'c',  /* push self.find_class(modname, name); 2 string args */
    DICT             = 'd',  /* build a dict from stack items */
    EMPTY_DICT       = '}',  /* push empty dict */
    APPENDS          = 'e',  /* extend list on stack by topmost stack slice */
    GET              = 'g',  /* push item from memo on stack; index is string arg */
    BINGET           = 'h',  /* push item from memo on stack; index is 1-byte arg */
    INST             = 'i',  /* build & push class instance */
    LONG_BINGET      = 'j',  /* push item from memo on stack; index is 4-byte arg */
    LIST             = 'l',  /* build list from topmost stack items */
    EMPTY_LIST       = ']',  /* push empty list */
    OBJ              = 'o',  /* build & push class instance */
    PUT              = 'p',  /* store stack top in memo; index is string arg */
    BINPUT           = 'q',  /* store stack top in memo; index is 1-byte arg */
    LONG_BINPUT      = 'r',  /* store stack top in memo; index is 4-byte arg */
    SETITEM          = 's',  /* add key+value pair to dict */
    TUPLE            = 't',  /* build tuple from topmost stack items */
    EMPTY_TUPLE      = ')',  /* push empty tuple */
    SETITEMS         = 'u',  /* modify dict by adding topmost key+value pairs */
    BINFLOAT         = 'G',  /* push float; arg is 8-byte float encoding */

    /* ---- Protocol 2 ---- */
    PROTO            = 0x80, /* identify pickle protocol */
    NEWOBJ           = 0x81, /* build object by applying cls.__new__ to argtuple */
    EXT1             = 0x82, /* push object from extension registry; 1-byte index */
    EXT2             = 0x83, /* push object from extension registry; 2-byte index */
    EXT4             = 0x84, /* push object from extension registry; 4-byte index */
    TUPLE1           = 0x85, /* build 1-tuple from stack top */
    TUPLE2           = 0x86, /* build 2-tuple from two topmost stack items */
    TUPLE3           = 0x87, /* build 3-tuple from three topmost stack items */
    NEWTRUE          = 0x88, /* push True */
    NEWFALSE         = 0x89, /* push False */
    LONG1            = 0x8A, /* push long from < 256 bytes */
    LONG4            = 0x8B, /* push really big long */

    /* ---- Protocol 3 (Python 3.x) ---- */
    BINBYTES         = 'B',  /* push bytes; counted binary string argument */
    SHORT_BINBYTES   = 'C',  /* push bytes; counted binary string argument < 256 bytes */

    /* ---- Protocol 4 ---- */
    SHORT_BINUNICODE = 0x8C, /* push short string; UTF-8 length < 256 bytes */
    BINUNICODE8      = 0x8D, /* push very long string */
    BINBYTES8        = 0x8E, /* push very long bytes string */
    EMPTY_SET        = 0x8F, /* push empty set on the stack */
    ADDITEMS         = 0x90, /* modify set by adding topmost stack items */
    FROZENSET        = 0x91, /* build frozenset from topmost stack items */
    NEWOBJ_EX        = 0x92, /* like NEWOBJ but works with keyword-only arguments */
    STACK_GLOBAL     = 0x93, /* same as GLOBAL but using names on the stack */
    MEMOIZE          = 0x94, /* store top of the stack in memo */
    FRAME            = 0x95, /* indicate the beginning of a new frame */

    /* ---- Protocol 5 ---- */
    BYTEARRAY8       = 0x96, /* push bytearray */
    NEXT_BUFFER      = 0x97, /* push next out-of-band buffer */
    READONLY_BUFFER  = 0x98  /* make top of stack readonly */
};
154+
155+
/*
    Measure the newline-terminated line that starts at the current cursor ($).
    Returns the number of bytes up to and including the '\n', which is the array
    size the text-based opcodes use for their argument.
    Raises an error when the data ends before a '\n' is found, instead of
    scanning past the end of the input on a truncated or malformed pickle.
*/
fn readline(){
    auto limit = std::mem::base_address() + std::mem::size();
    auto i = 0;
    bool found = false;
    while(!found){
        /* Bounds check comes first so no byte beyond the data is ever read. */
        if($ + i >= limit)
            std::error(std::format("@0x{:08X} line argument is missing its newline terminator", $));
        if(std::mem::read_unsigned($ + i, 1) == '\n') found = true;
        i += 1;
    }
    /* i was advanced past the terminator, so it already counts the '\n'. */
    return i;
};
160+
161+
/*
    One pickle instruction: a 1-byte opcode followed by its inline argument, if any.
    Opcodes with an empty body take their operands from the unpickler stack and
    carry no inline data. Newline-terminated text arguments are sized with
    readline() and decoded through the *_RL_FMT attribute macros.
    STOP ends the enclosing opcode array via `break`; an unknown opcode raises an error.
    Multi-byte integers use the file-wide little-endian pragma; BINFLOAT is
    explicitly big-endian.
*/
struct Opcodes{
    OpcodesEnum opcode;
    match(opcode){
        (OpcodesEnum::MARK): {}
        (OpcodesEnum::STOP): break;
        (OpcodesEnum::POP): {}
        (OpcodesEnum::POP_MARK): {}
        (OpcodesEnum::DUP): {}
        (OpcodesEnum::FLOAT): {
            char Float[readline()] [[FLOAT_RL_FMT]]; /* float(readline()[:-1]) */
        }
        (OpcodesEnum::INT): {
            /* == TRUE(b'I01\n')[1:], == FALSE(b'I00\n')[1:], int(readline(), 0) */
            char Int[readline()] [[INT_RL_FMT]];
        }
        (OpcodesEnum::BININT): {
            s32 Int;
        }
        (OpcodesEnum::BININT1): {
            u8 Int; /* 1-byte unsigned int: values 128..255 must not show as negative */
        }
        (OpcodesEnum::LONG): {
            /* val = readline()[:-1], val = val and val[-1] == b"L"[0] ? val[:-1]: val */
            char Long[readline()] [[LONG_RL_FMT]]; /* int(val, 0) */
        }
        (OpcodesEnum::BININT2): {
            u16 Int;
        }
        (OpcodesEnum::NONE): {}
        (OpcodesEnum::PERSID): {
            char id[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
        }
        (OpcodesEnum::BINPERSID): {}
        (OpcodesEnum::REDUCE): {}
        /*
            def _decode_string(self, value):
                # Used to allow strings from Python 2 to be decoded either as bytes or Unicode strings.
                # This should be used only with the STRING, BINSTRING and SHORT_BINSTRING opcodes.
                if self.encoding == "bytes":
                    return value
                else:
                    return value.decode(self.encoding, self.errors)
        */
        (OpcodesEnum::STRING): {
            /* data must be in quotes ("..." or '...'), dataStripped = stripQuote(readline()[:-1]) */
            /* _decode_string(codecs.escape_decode(dataStripped)[0]) */
            char data[readline()] [[STRING_RL_FMT]];
        }
        (OpcodesEnum::BINSTRING): {
            s32 length;
            char data[length]; /* _decode_string(data) */
        }
        (OpcodesEnum::SHORT_BINSTRING): {
            u8 length;
            char data[length]; /* _decode_string(data) */
        }
        (OpcodesEnum::UNICODE): {
            /*
                "raw-unicode-escape":
                    Latin-1 encoding with \uXXXX and \UXXXXXXXX for other code points.
                    Existing backslashes are not escaped in any way.
            */
            char data[readline()] [[UTF8_RL_FMT]]; /* str(readline()[:-1], "raw-unicode-escape") */
        }
        (OpcodesEnum::BINUNICODE): {
            u32 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::APPEND): {}
        (OpcodesEnum::BUILD): {}
        (OpcodesEnum::GLOBAL): {
            char module[readline()] [[UTF8_RL_FMT]]; /* readline()[:-1].decode("utf-8") */
            char name[readline()] [[UTF8_RL_FMT]]; /* readline()[:-1].decode("utf-8") */
        }
        (OpcodesEnum::DICT): {}
        (OpcodesEnum::EMPTY_DICT): {}
        (OpcodesEnum::APPENDS): {}
        (OpcodesEnum::GET): {
            char index[readline()] [[INTEGER_RL_FMT]]; /* int(readline()[:-1]) */
        }
        (OpcodesEnum::BINGET): {
            u8 index;
        }
        (OpcodesEnum::INST): {
            char module[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
            char name[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
        }
        (OpcodesEnum::LONG_BINGET): {
            u32 index;
        }
        (OpcodesEnum::LIST): {}
        (OpcodesEnum::EMPTY_LIST): {}
        (OpcodesEnum::OBJ): {}
        (OpcodesEnum::PUT): {
            char index[readline()] [[INTEGER_RL_FMT]]; /* int(readline()[:-1]) */
        }
        (OpcodesEnum::BINPUT): {
            u8 index; /* unsigned 1-byte memo index, same width and sign as BINGET */
        }
        (OpcodesEnum::LONG_BINPUT): {
            u32 index;
        }
        (OpcodesEnum::SETITEM): {}
        (OpcodesEnum::TUPLE): {}
        (OpcodesEnum::EMPTY_TUPLE): {}
        (OpcodesEnum::SETITEMS): {}
        (OpcodesEnum::BINFLOAT): {
            be double Double;
        }
        /* ---- Protocol 2 ---- */
        (OpcodesEnum::PROTO): {
            u8 version;
        }
        (OpcodesEnum::NEWOBJ): {}
        (OpcodesEnum::EXT1): {
            u8 code;
        }
        (OpcodesEnum::EXT2): {
            u16 code;
        }
        (OpcodesEnum::EXT4): {
            s32 code;
        }
        (OpcodesEnum::TUPLE1): {}
        (OpcodesEnum::TUPLE2): {}
        (OpcodesEnum::TUPLE3): {}
        (OpcodesEnum::NEWTRUE): {}
        (OpcodesEnum::NEWFALSE): {}
        /*
            def decode_long(data):
                r"""Decode a long from a two's complement little-endian binary string.
                >>> decode_long(b"") => 0
                >>> decode_long(b"\xff\x00") => 255
                >>> decode_long(b"\xff\x7f") => 32767
                >>> decode_long(b"\x00\xff") => -256
                >>> decode_long(b"\x00\x80") => -32768
                >>> decode_long(b"\x80") => -128
                >>> decode_long(b"\x7f") => 127
                """
                return int.from_bytes(data, byteorder="little", signed=True)
        */
        (OpcodesEnum::LONG1): {
            u8 length;
            u8 data[length]; /* decode_long(data) */
        }
        (OpcodesEnum::LONG4): {
            s32 length;
            u8 data[length]; /* decode_long(data) */
        }
        /* ---- Protocol 3 (Python 3.x) ---- */
        (OpcodesEnum::BINBYTES): {
            u32 length;
            u8 bytes[length];
        }
        (OpcodesEnum::SHORT_BINBYTES): {
            u8 length;
            u8 bytes[length];
        }
        /* ---- Protocol 4 ---- */
        (OpcodesEnum::SHORT_BINUNICODE): {
            u8 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::BINUNICODE8): {
            u64 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::BINBYTES8): {
            u64 length;
            u8 bytes[length];
        }
        (OpcodesEnum::EMPTY_SET): {}
        (OpcodesEnum::ADDITEMS): {}
        (OpcodesEnum::FROZENSET): {}
        (OpcodesEnum::NEWOBJ_EX): {}
        (OpcodesEnum::STACK_GLOBAL): {}
        (OpcodesEnum::MEMOIZE): {}
        (OpcodesEnum::FRAME): {
            u64 length;
            /* Parse nested opcodes until the cursor reaches the end of the frame payload. */
            Opcodes opcodes[while($ < addressof(length) + sizeof(length) + length)];
        }
        /* ---- Protocol 5 ---- */
        (OpcodesEnum::BYTEARRAY8): {
            u64 length;
            u8 array[length];
        }
        (OpcodesEnum::NEXT_BUFFER): {}
        (OpcodesEnum::READONLY_BUFFER): {}
        (_): std::error(std::format("Unrecognized {}", opcode));
    }
};
352+
353+
/*
    Top-level layout: a flat sequence of opcodes read until the end of the data
    is reached (or until an opcode breaks out of the array).
*/
struct Pickle{
    Opcodes opcodes[while(!std::mem::eof())];
};

/* The pickle stream starts at the very first byte of the data. */
Pickle pickle @ 0x00;
541 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)