|
| 1 | + |
| 2 | +/* |
| 3 | + References: |
| 4 | + Pickle Source Code: |
| 5 | + https://github.com/python/cpython/blob/main/Lib/pickle.py |
| 6 | + Pickle Protocol Version Breakdown: |
| 7 | + https://docs.python.org/3.13/library/pickle.html#data-stream-format |
| 8 | + Pickle OpCode Breakdown: |
| 9 | + https://github.com/python/cpython/blob/main/Lib/pickletools.py |
| 10 | +*/ |
| 11 | + |
| 12 | +#pragma author ODeux |
| 13 | +#pragma description Python Binary Object Serialization Protocol |
| 14 | + |
| 15 | +#pragma endian little |
| 16 | + |
| 17 | +import std.mem; |
| 18 | +import std.string; |
| 19 | + |
| 20 | +#pragma array_limit 524288 |
| 21 | + |
/*
    Raise an error marking an unimplemented spot, prefixed with the current
    cursor offset ($) printed as 8 hex digits.
    message: text shown after the "TODO: " prefix
*/
fn todo(auto message){
    /* message is a format argument rather than part of the format string,
       so any braces it contains are printed verbatim */
    std::error(std::format("@0x{:08X} TODO: {}", $, message));
};
| 25 | + |
/*
    Renders s through a plain "{}" format and returns the resulting text.
    UTF8_FMT installs this function as both the format and the transform hook.
*/
fn utf8_fmt(auto s){
    str rendered = std::format("{}", s);
    return rendered;
};
#define UTF8_FMT format("utf8_fmt"), transform("utf8_fmt")
| 30 | + |
/*
    Hook for readline()-sized text arguments: drops the final character of s
    (the line terminator included by readline()) and renders the rest via "{}".
    UTF8_RL_FMT installs this function as both the format and the transform hook.
*/
fn utf8_rl_fmt(auto s){
    auto kept = std::string::length(s) - 1;
    return std::format("{}", std::string::substr(s, 0, kept));
};
#define UTF8_RL_FMT format("utf8_rl_fmt"), transform("utf8_rl_fmt")
| 36 | + |
/*
    Hook for the INT opcode argument (the text after the 'I' opcode byte).
    "01\n" and "00\n" are the protocol-0 spellings of True and False
    (TRUE == b'I01\n', FALSE == b'I00\n'); every other line has its final
    character removed and is parsed as an integer with base 0.
    INT_RL_FMT installs this function as both the format and the transform hook.
*/
fn int_rl_fmt(auto s){
    if(s == "01\n"){
        return true;
    }
    else if(s == "00\n"){
        return false;
    }
    str digits = std::string::substr(s, 0, std::string::length(s) - 1);
    return std::string::parse_int(digits, 0);
};
#define INT_RL_FMT format("int_rl_fmt"), transform("int_rl_fmt")
| 44 | + |
/*
    Hook for the FLOAT opcode argument: removes the final character of s
    (the line terminator included by readline()) and parses the rest as a float.
    FLOAT_RL_FMT installs this function as both the format and the transform hook.
*/
fn float_rl_fmt(auto s){
    auto kept = std::string::length(s) - 1;
    return std::string::parse_float(std::string::substr(s, 0, kept));
};
#define FLOAT_RL_FMT format("float_rl_fmt"), transform("float_rl_fmt")
| 50 | + |
/*
    Hook for the LONG opcode argument. Mirrors pickle.py:
        val = readline()[:-1]
        if val and val[-1] == b'L'[0]: val = val[:-1]
        int(val, 0)
    i.e. the final character (line terminator) is removed, then an optional
    trailing 'L' suffix, and the remainder is parsed as an integer with base 0.
    LONG_RL_FMT installs this function as both the format and the transform hook.
*/
fn long_rl_fmt(auto s){
    str new_s = std::string::substr(s, 0, std::string::length(s) - 1);
    auto length = std::string::length(new_s);
    /* nested so the last character is only read when there is one */
    if(length > 0){
        /* compared as characters, the same way string_rl_fmt tests its quotes */
        if(std::string::at(new_s, length - 1) == 'L')
            new_s = std::string::substr(new_s, 0, length - 1);
    }
    return std::string::parse_int(new_s, 0);
};
#define LONG_RL_FMT format("long_rl_fmt"), transform("long_rl_fmt")
| 58 | + |
/*
    Hook for readline()-sized ASCII arguments: returns s without its final
    character (the line terminator included by readline()).
    ASCII_RL_FMT installs this function as both the format and the transform hook.
*/
fn ascii_rl_fmt(auto s){
    auto kept = std::string::length(s) - 1;
    str line = std::string::substr(s, 0, kept);
    return line;
};
#define ASCII_RL_FMT format("ascii_rl_fmt"), transform("ascii_rl_fmt")
| 63 | + |
/*
    Hook for the memo index of the GET and PUT opcodes, which pickle.py reads
    as int(readline()[:-1]): the final character (line terminator) is removed
    and the rest is parsed as a decimal integer.
    INTEGER_RL_FMT installs this function as both the format and the transform hook.
*/
fn integer_rl_fmt(auto s){
    str new_s = std::string::substr(s, 0, std::string::length(s) - 1);
    /* base 10, not 0: int() without a base argument never auto-detects a prefix */
    return std::string::parse_int(new_s, 10);
};
#define INTEGER_RL_FMT format("integer_rl_fmt"), transform("integer_rl_fmt")
| 69 | + |
/*
    Hook for the STRING opcode argument: removes the final character (line
    terminator) and then the surrounding pair of matching quotes ('...' or "...").
    Raises an error when the argument is not quoted.
    Escape sequences inside the quotes are left as they are (pickle.py would
    additionally run codecs.escape_decode on the result).
    STRING_RL_FMT installs this function as both the format and the transform hook.
*/
fn string_rl_fmt(auto s){
    str new_s = std::string::substr(s, 0, std::string::length(s) - 1);
    auto length = std::string::length(new_s);
    bool quoted = false;
    /* nested so the characters are only read when both quotes can exist */
    if(length >= 2){
        auto first = std::string::at(new_s, 0);
        if(first == std::string::at(new_s, length - 1)){
            if(first == '\'' || first == '"') quoted = true;
        }
    }
    if(!quoted) std::error("the STRING opcode argument must be quoted");
    /* substr takes (position, count): skip the opening quote and keep
       everything up to, but not including, the closing quote */
    return std::string::substr(new_s, 1, length - 2);
};
#define STRING_RL_FMT format("string_rl_fmt"), transform("string_rl_fmt")
| 79 | + |
/*
    Every pickle opcode, one byte each, grouped by the protocol version that
    introduced it. Printable opcodes are written as character literals, the
    protocol >= 2 opcodes as hexadecimal byte values.
*/
enum OpcodesEnum: u8{
    /* ---- Protocols 0 and 1 ---- */
    MARK             = '(',  /* push the special mark object on the stack */
    STOP             = '.',  /* last opcode of every pickle */
    POP              = '0',  /* discard the topmost stack item */
    POP_MARK         = '1',  /* discard the stack top through the topmost mark object */
    DUP              = '2',  /* duplicate the top stack item */
    FLOAT            = 'F',  /* push float; argument is a decimal string */
    INT              = 'I',  /* push integer or bool; argument is a decimal string */
    BININT           = 'J',  /* push four-byte signed int */
    BININT1          = 'K',  /* push 1-byte unsigned int */
    LONG             = 'L',  /* push long; argument is a decimal string */
    BININT2          = 'M',  /* push 2-byte unsigned int */
    NONE             = 'N',  /* push None */
    PERSID           = 'P',  /* push persistent object; id is taken from the string argument */
    BINPERSID        = 'Q',  /* push persistent object; id is taken from the stack */
    REDUCE           = 'R',  /* apply callable to argtuple, both on the stack */
    STRING           = 'S',  /* push string; newline-terminated string argument */
    BINSTRING        = 'T',  /* push string; counted binary string argument */
    SHORT_BINSTRING  = 'U',  /* push string; counted binary string argument < 256 bytes */
    UNICODE          = 'V',  /* push Unicode string; raw-unicode-escaped argument */
    BINUNICODE       = 'X',  /* push Unicode string; counted UTF-8 string argument */
    APPEND           = 'a',  /* append the stack top to the list below it */
    BUILD            = 'b',  /* call __setstate__ or __dict__.update() */
    GLOBAL           = 'c',  /* push self.find_class(modname, name); two string arguments */
    DICT             = 'd',  /* build a dict from stack items */
    EMPTY_DICT       = '}',  /* push an empty dict */
    APPENDS          = 'e',  /* extend the list on the stack by the topmost stack slice */
    GET              = 'g',  /* push item from memo on the stack; index is a string argument */
    BINGET           = 'h',  /* push item from memo on the stack; index is a 1-byte argument */
    INST             = 'i',  /* build and push a class instance */
    LONG_BINGET      = 'j',  /* push item from memo on the stack; index is a 4-byte argument */
    LIST             = 'l',  /* build a list from the topmost stack items */
    EMPTY_LIST       = ']',  /* push an empty list */
    OBJ              = 'o',  /* build and push a class instance */
    PUT              = 'p',  /* store the stack top in memo; index is a string argument */
    BINPUT           = 'q',  /* store the stack top in memo; index is a 1-byte argument */
    LONG_BINPUT      = 'r',  /* store the stack top in memo; index is a 4-byte argument */
    SETITEM          = 's',  /* add a key+value pair to a dict */
    TUPLE            = 't',  /* build a tuple from the topmost stack items */
    EMPTY_TUPLE      = ')',  /* push an empty tuple */
    SETITEMS         = 'u',  /* modify a dict by adding the topmost key+value pairs */
    BINFLOAT         = 'G',  /* push float; argument is an 8-byte float encoding */
    /* ---- Protocol 2 ---- */
    PROTO            = 0x80, /* identify the pickle protocol */
    NEWOBJ           = 0x81, /* build an object by applying cls.__new__ to argtuple */
    EXT1             = 0x82, /* push object from the extension registry; 1-byte index */
    EXT2             = 0x83, /* push object from the extension registry; 2-byte index */
    EXT4             = 0x84, /* push object from the extension registry; 4-byte index */
    TUPLE1           = 0x85, /* build a 1-tuple from the stack top */
    TUPLE2           = 0x86, /* build a 2-tuple from the two topmost stack items */
    TUPLE3           = 0x87, /* build a 3-tuple from the three topmost stack items */
    NEWTRUE          = 0x88, /* push True */
    NEWFALSE         = 0x89, /* push False */
    LONG1            = 0x8A, /* push long from < 256 bytes */
    LONG4            = 0x8B, /* push really big long */
    /* ---- Protocol 3 (Python 3.x) ---- */
    BINBYTES         = 'B',  /* push bytes; counted binary string argument */
    SHORT_BINBYTES   = 'C',  /* push bytes; counted binary string argument < 256 bytes */
    /* ---- Protocol 4 ---- */
    SHORT_BINUNICODE = 0x8C, /* push short string; UTF-8 length < 256 bytes */
    BINUNICODE8      = 0x8D, /* push very long string */
    BINBYTES8        = 0x8E, /* push very long bytes string */
    EMPTY_SET        = 0x8F, /* push an empty set on the stack */
    ADDITEMS         = 0x90, /* modify a set by adding the topmost stack items */
    FROZENSET        = 0x91, /* build a frozenset from the topmost stack items */
    NEWOBJ_EX        = 0x92, /* like NEWOBJ but works with keyword-only arguments */
    STACK_GLOBAL     = 0x93, /* same as GLOBAL but using names on the stack */
    MEMOIZE          = 0x94, /* store the top of the stack in memo */
    FRAME            = 0x95, /* indicate the beginning of a new frame */
    /* ---- Protocol 5 ---- */
    BYTEARRAY8       = 0x96, /* push bytearray */
    NEXT_BUFFER      = 0x97, /* push the next out-of-band buffer */
    READONLY_BUFFER  = 0x98  /* make the top of the stack readonly */
};
| 154 | + |
/*
    Measures the newline-terminated line that starts at the current cursor ($).
    Returns the number of bytes up to and including the first '\n'; the cursor
    itself is not moved.
    Raises an error when the data ends before a '\n' is found, instead of
    scanning past the end of the data.
*/
fn readline(){
    u128 end = std::mem::base_address() + std::mem::size();
    u128 cursor = $;
    while(cursor < end){
        if(std::mem::read_unsigned(cursor, 1) == '\n')
            return cursor - $ + 1;
        cursor += 1;
    }
    std::error(std::format("@0x{:08X} unterminated line: reached the end of the data without a newline", $));
};
| 160 | + |
/*
    One pickle instruction: the opcode byte followed by the inline argument
    that opcode carries, if any. Opcodes with an empty body take no inline
    argument. Newline-terminated arguments are sized with readline(), so the
    field covers the terminator; the *_RL_FMT hooks strip it again.
    An unknown opcode byte raises an error.
*/
struct Opcodes{
    OpcodesEnum opcode;
    match(opcode){
        (OpcodesEnum::MARK): {}
        (OpcodesEnum::STOP): break; /* every pickle ends with STOP: leave the enclosing opcode array */
        (OpcodesEnum::POP): {}
        (OpcodesEnum::POP_MARK): {}
        (OpcodesEnum::DUP): {}
        (OpcodesEnum::FLOAT): {
            char Float[readline()] [[FLOAT_RL_FMT]]; /* float(readline()[:-1]) */
        }
        (OpcodesEnum::INT): {
            /* == TRUE(b'I01\n')[1:], == FALSE(b'I00\n')[1:], int(readline(), 0) */
            char Int[readline()] [[INT_RL_FMT]];
        }
        (OpcodesEnum::BININT): {
            s32 Int; /* four-byte signed int */
        }
        (OpcodesEnum::BININT1): {
            u8 Int; /* 1-byte unsigned int */
        }
        (OpcodesEnum::LONG): {
            /* val = readline()[:-1], val = val and val[-1] == b"L"[0] ? val[:-1]: val */
            char Long[readline()] [[LONG_RL_FMT]]; /* int(val, 0) */
        }
        (OpcodesEnum::BININT2): {
            u16 Int; /* 2-byte unsigned int */
        }
        (OpcodesEnum::NONE): {}
        (OpcodesEnum::PERSID): {
            char id[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
        }
        (OpcodesEnum::BINPERSID): {}
        (OpcodesEnum::REDUCE): {}
        /*
            def _decode_string(self, value):
                # Used to allow strings from Python 2 to be decoded either as bytes or Unicode strings.
                # This should be used only with the STRING, BINSTRING and SHORT_BINSTRING opcodes.
                if self.encoding == "bytes":
                    return value
                else:
                    return value.decode(self.encoding, self.errors)
        */
        (OpcodesEnum::STRING): {
            /* data must be in quotes ("..." or '...'), dataStripped = stripQuote(readline()[:-1]) */
            /* _decode_string(codecs.escape_decode(dataStripped)[0]) */
            char data[readline()] [[STRING_RL_FMT]];
        }
        (OpcodesEnum::BINSTRING): {
            s32 length;
            char data[length]; /* _decode_string(data) */
        }
        (OpcodesEnum::SHORT_BINSTRING): {
            u8 length;
            char data[length]; /* _decode_string(data) */
        }
        (OpcodesEnum::UNICODE): {
            /*
                "raw-unicode-escape":
                    Latin-1 encoding with \uXXXX and \UXXXXXXXX for other code points.
                    Existing backslashes are not escaped in any way.
            */
            char data[readline()] [[UTF8_RL_FMT]]; /* str(readline()[:-1], "raw-unicode-escape") */
        }
        (OpcodesEnum::BINUNICODE): {
            u32 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::APPEND): {}
        (OpcodesEnum::BUILD): {}
        (OpcodesEnum::GLOBAL): {
            char module[readline()] [[UTF8_RL_FMT]]; /* readline()[:-1].decode("utf-8") */
            char name[readline()] [[UTF8_RL_FMT]]; /* readline()[:-1].decode("utf-8") */
        }
        (OpcodesEnum::DICT): {}
        (OpcodesEnum::EMPTY_DICT): {}
        (OpcodesEnum::APPENDS): {}
        (OpcodesEnum::GET): {
            char index[readline()] [[INTEGER_RL_FMT]]; /* int(readline()[:-1]) */
        }
        (OpcodesEnum::BINGET): {
            u8 index; /* 1-byte memo index */
        }
        (OpcodesEnum::INST): {
            char module[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
            char name[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
        }
        (OpcodesEnum::LONG_BINGET): {
            u32 index; /* 4-byte memo index */
        }
        (OpcodesEnum::LIST): {}
        (OpcodesEnum::EMPTY_LIST): {}
        (OpcodesEnum::OBJ): {}
        (OpcodesEnum::PUT): {
            char index[readline()] [[INTEGER_RL_FMT]]; /* int(readline()[:-1]) */
        }
        (OpcodesEnum::BINPUT): {
            u8 index; /* 1-byte memo index, unsigned like BINGET */
        }
        (OpcodesEnum::LONG_BINPUT): {
            u32 index; /* 4-byte memo index */
        }
        (OpcodesEnum::SETITEM): {}
        (OpcodesEnum::TUPLE): {}
        (OpcodesEnum::EMPTY_TUPLE): {}
        (OpcodesEnum::SETITEMS): {}
        (OpcodesEnum::BINFLOAT): {
            be double Double; /* the only big-endian field; everything else follows #pragma endian little */
        }
        /* ---- Protocol 2 ---- */
        (OpcodesEnum::PROTO): {
            u8 version;
        }
        (OpcodesEnum::NEWOBJ): {}
        (OpcodesEnum::EXT1): {
            u8 code;
        }
        (OpcodesEnum::EXT2): {
            u16 code;
        }
        (OpcodesEnum::EXT4): {
            s32 code;
        }
        (OpcodesEnum::TUPLE1): {}
        (OpcodesEnum::TUPLE2): {}
        (OpcodesEnum::TUPLE3): {}
        (OpcodesEnum::NEWTRUE): {}
        (OpcodesEnum::NEWFALSE): {}
        /*
            def decode_long(data):
                r"""Decode a long from a two's complement little-endian binary string.
                >>> decode_long(b"") => 0
                >>> decode_long(b"\xff\x00") => 255
                >>> decode_long(b"\xff\x7f") => 32767
                >>> decode_long(b"\x00\xff") => -256
                >>> decode_long(b"\x00\x80") => -32768
                >>> decode_long(b"\x80") => -128
                >>> decode_long(b"\x7f") => 127
                """
                return int.from_bytes(data, byteorder="little", signed=True)
        */
        (OpcodesEnum::LONG1): {
            u8 length;
            u8 data[length]; /* decode_long(data) */
        }
        (OpcodesEnum::LONG4): {
            s32 length;
            u8 data[length]; /* decode_long(data) */
        }
        /* ---- Protocol 3 (Python 3.x) ---- */
        (OpcodesEnum::BINBYTES): {
            u32 length;
            u8 bytes[length];
        }
        (OpcodesEnum::SHORT_BINBYTES): {
            u8 length;
            u8 bytes[length];
        }
        /* ---- Protocol 4 ---- */
        (OpcodesEnum::SHORT_BINUNICODE): {
            u8 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::BINUNICODE8): {
            u64 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::BINBYTES8): {
            u64 length;
            u8 bytes[length];
        }
        (OpcodesEnum::EMPTY_SET): {}
        (OpcodesEnum::ADDITEMS): {}
        (OpcodesEnum::FROZENSET): {}
        (OpcodesEnum::NEWOBJ_EX): {}
        (OpcodesEnum::STACK_GLOBAL): {}
        (OpcodesEnum::MEMOIZE): {}
        (OpcodesEnum::FRAME): {
            u64 length;
            /* the frame body is the `length` bytes right after the length field;
               parse it as nested opcodes until the cursor reaches the end of that range */
            Opcodes opcodes[while($ < addressof(length) + sizeof(length) + length)];
        }
        /* ---- Protocol 5 ---- */
        (OpcodesEnum::BYTEARRAY8): {
            u64 length;
            u8 array[length];
        }
        (OpcodesEnum::NEXT_BUFFER): {}
        (OpcodesEnum::READONLY_BUFFER): {}
        (_): std::error(std::format("Unrecognized {}", opcode));
    }
};
| 352 | + |
/*
    A whole pickle stream: opcode records parsed one after another for as long
    as std::mem::eof() is false (a record may also end the array via break).
*/
struct Pickle{
    Opcodes opcodes[while(!std::mem::eof())];
};

/* Place the pattern at offset 0 of the data. */
Pickle pickle @ 0x00;
0 commit comments