// binexport2.proto (forked from google/binexport)
// Copyright 2011-2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file describes a compact representation for disassembled binaries. It is
// loosely based on the PostgreSQL database schema used by BinNavi's
// postgresql_tables.sql (https://git.io/vzlYw).
// It is the output format for the BinExport IDA plugin and the BinDetego
// disassembler and consumed by the BinDiff comparison engine.
// The representation is generic to accommodate various source architectures.
// In particular 32 and 64 bit versions of x86, ARM, PowerPC and MIPS have been
// tested.
//
// Multiple levels of deduping have been applied to make the format more compact
// and avoid redundant data duplication. Some of this is due to hard-earned
// experience trying to cope with intentionally obfuscated malicious binaries.
// Note in particular that the same instruction may occur in multiple basic
// blocks and the same basic block in multiple functions (instruction and basic
// block sharing). Implemented naively, malware can use this to cause
// combinatorial explosion in memory usage, DOSing the analyst. This format
// should store every unique expression, mnemonic, operand, instruction and
// basic block only once instead of duplicating the information for every
// instance of it.
//
// This format does _not_ try to be 100% backwards compatible with the old
// version. In particular, we do not store IDA's comment types, making lossless
// porting of IDA comments impossible. We do, however, store comments and
// expression substitutions, so porting the actual data is possible, just not
// the exact IDA type.
//
// While it would be more natural to use addresses when defining call graph and
// flow graph edges and other such references, it is more efficient to employ
// one more level of indirection and use indices into the basic block or
// function arrays instead. This is because addresses will usually use most of
// the available 64 bit space while indices will be much smaller and compress
// much better (less randomly distributed).
//
// We omit all fields that are set to their default value anyways. Note that
// this has two side effects:
// - changing the defaults in this proto file will, in effect, change what's
// read from disk
// - the generated code has_* methods are somewhat less useful
// WARNING: We omit the defaults manually in the code writing the data. Do not
// change the defaults here without changing the code!
//
// TODO(cblichmann): Link flow graphs to call graph nodes. The connection is
// there via the address, but tricky to extract.
syntax = "proto2";
option java_package = "com.google.security.zynamics";
option java_outer_classname = "BinExport";
message BinExport2 {
// Metadata about the exported binary and about this export file itself.
message Meta {
// Field number 5 is reserved so that it cannot be reused; per the trailing
// note it was padding in files written before BinDiff 4.3.
reserved 5; // Pre-BinDiff 4.3 padding
// Input binary filename including file extension but excluding file path.
// Example: "insider_gcc.exe"
optional string executable_name = 1;
// Application-defined executable id. Often the SHA256 hash of the input
// binary.
optional string executable_id = 2;
// Input architecture name, e.g. x86-32.
optional string architecture_name = 3;
// When did this file get created? Unix time. This may be used for some
// primitive versioning in case the file format ever changes.
// NOTE(review): the resolution (seconds vs. milliseconds) is not stated
// here; confirm against the code that writes this field.
optional int64 timestamp = 4;
}
// The call graph of the binary: functions are the vertices and calls are the
// edges.
message CallGraph {
// A single function in the call graph.
message Vertex {
// Classification of a function.
enum Type {
// Regular function with full disassembly.
NORMAL = 0;
// This function is a well known library function.
LIBRARY = 1;
// Imported from a dynamic link library (e.g. dll).
IMPORTED = 2;
// A thunk function, forwarding its work via an unconditional jump.
THUNK = 3;
// An invalid function (a function that contained invalid code or was
// considered invalid by some heuristics).
INVALID = 4;
}
// The function's entry point address. Messages need to be sorted, see
// comment below on `vertex`.
optional uint64 address = 1;
// Classification of this function. Reads as NORMAL when the field is absent.
optional Type type = 2 [default = NORMAL];
// If the function has a user defined, real name it will be given here.
// main() is a proper name, sub_BAADF00D is not (auto generated dummy
// name).
optional string mangled_name = 3;
// Demangled name if the function is a mangled C++ function and we could
// demangle it.
optional string demangled_name = 4;
// If this is a library function, what is its index in library arrays.
// NOTE(review): presumably an index into the top-level `library` repeated
// field - confirm.
optional int32 library_index = 5;
// If module name, such as class name for DEX files, is present - index in
// module table.
// NOTE(review): presumably an index into the top-level `module` repeated
// field - confirm.
optional int32 module_index = 6;
}
// A call from the source function to the target function.
message Edge {
// Source and target index into the `vertex` repeated field.
optional int32 source_vertex_index = 1;
optional int32 target_vertex_index = 2;
}
// vertices == functions in the call graph.
// Important: Most downstream tooling (notably BinDiff) needs these to be
// sorted by `Vertex::address` (ascending). For C++, the
// `BinExport2Writer` class enforces this invariant.
repeated Vertex vertex = 1;
// edges == calls in the call graph.
repeated Edge edge = 2;
}
// A single node of an operand's expression tree. An operand consists of 1 or
// more expressions, linked together as a tree via `parent_index`.
message Expression {
// Kind of expression node. No value 0 is defined; the `type` field below
// names IMMEDIATE_INT as its explicit default.
enum Type {
SYMBOL = 1;
IMMEDIATE_INT = 2;
IMMEDIATE_FLOAT = 3;
OPERATOR = 4;
REGISTER = 5;
SIZE_PREFIX = 6;
DEREFERENCE = 7;
}
// IMMEDIATE_INT is by far the most common type and thus we can save some
// space by omitting it as the default.
optional Type type = 1 [default = IMMEDIATE_INT];
// Symbol for this expression. Interpretation depends on type. Examples
// include: "eax", "[", "+"
optional string symbol = 2;
// If the expression can be interpreted as an integer value (IMMEDIATE_INT)
// the value is given here.
optional uint64 immediate = 3;
// The parent expression. Example expression tree for the second operand of:
//   mov eax, b4 [ebx + 12]
//   "b4" --- "[" --- "+" --- "ebx"
//                       \ "12"
// Here "+" is the parent of both "ebx" and "12".
// NOTE(review): presumably an index into the top-level `expression` repeated
// field - confirm.
optional int32 parent_index = 4;
// True if the expression has an entry in the relocation table.
optional bool is_relocation = 5;
}
// One operand of an instruction, described as a list of expression indices.
// An instruction may have 0 or more operands.
message Operand {
// Contains all expressions constituting this operand. All expressions
// should be linked into a single tree, i.e. there should only be one
// expression in this list with parent_index == NULL and all others should
// descend from that. Rendering order for expressions on the same tree level
// (siblings) is implicitly given by the order they are referenced in this
// repeated field.
// Implicit: expression sequence
// NOTE(review): "parent_index == NULL" presumably means the field is unset,
// and the indices presumably refer to the top-level `expression` repeated
// field - confirm.
repeated int32 expression_index = 1;
}
// An instruction mnemonic. An instruction has exactly 1 mnemonic.
message Mnemonic {
// Literal representation of the mnemonic, e.g.: "mov".
optional string name = 1;
}
// A single disassembled instruction: its (optional) address, mnemonic,
// operands, raw bytes, call targets and attached comments.
message Instruction {
// This will only be filled for instructions that do not just flow from the
// immediately preceding instruction. Regular instructions will have to
// calculate their own address by adding raw_bytes.size() to the previous
// instruction's address.
optional uint64 address = 1;
// If this is a call instruction and call targets could be determined
// they'll be given here. Note that we may or may not have a flow graph for
// the target and thus cannot use an index into the flow graph table here.
// We could potentially use call graph nodes, but linking instructions to
// the call graph directly does not seem a good choice.
repeated uint64 call_target = 2;
// Index into the mnemonic array of strings. Used for de-duping the data.
// The default value is used for the most common mnemonic in the executable.
optional int32 mnemonic_index = 3 [default = 0];
// Indices into the operand tree. On X86 this can be 0, 1 or 2 elements
// long, 3 elements with VEX/EVEX.
// Implicit: operand sequence
repeated int32 operand_index = 4;
// The unmodified input bytes corresponding to this instruction.
optional bytes raw_bytes = 5;
// Implicit: comment sequence
// NOTE(review): presumably indices into the top-level `comment` repeated
// field - confirm.
repeated int32 comment_index = 6;
}
// A basic block, stored as one or more ranges of instruction indices.
message BasicBlock {
// This is a space optimization. The instructions for an individual basic
// block will usually be in a continuous index range. Thus it is more
// efficient to store the range instead of individual indices. However, this
// does not hold true for all basic blocks, so we need to be able to store
// multiple index ranges per block.
message IndexRange {
// These work like begin and end iterators, i.e. the sequence is
// [begin_index, end_index). If the sequence only contains a single
// element end_index will be omitted.
optional int32 begin_index = 1;
optional int32 end_index = 2;
}
// The index ranges making up this block.
// Implicit: instruction sequence
repeated IndexRange instruction_index = 1;
}
// A flow graph: a set of basic blocks, the edges connecting them and an entry
// block.
message FlowGraph {
// A control flow edge between two basic blocks.
message Edge {
// Kind of control flow edge. No value 0 is defined; the `type` field below
// names UNCONDITIONAL as its explicit default.
enum Type {
CONDITION_TRUE = 1;
CONDITION_FALSE = 2;
UNCONDITIONAL = 3;
SWITCH = 4;
}
// Source instruction will always be the last instruction of the source
// basic block, target instruction the first instruction of the target
// basic block.
optional int32 source_basic_block_index = 1;
optional int32 target_basic_block_index = 2;
// Kind of this edge. Reads as UNCONDITIONAL when the field is absent.
optional Type type = 3 [default = UNCONDITIONAL];
// Indicates whether this is a loop edge as determined by Lengauer-Tarjan.
optional bool is_back_edge = 4 [default = false];
}
// Basic blocks are sorted by address.
// NOTE(review): presumably indices into the top-level `basic_block` repeated
// field - confirm.
repeated int32 basic_block_index = 1;
// The flow graph's entry point address is the first instruction of the
// entry_basic_block.
// Note that this is field number 3 although it is declared before field
// number 2.
optional int32 entry_basic_block_index = 3;
// Control flow edges between the basic blocks of this flow graph.
repeated Edge edge = 2;
}
// Generic reference class used for address comments (deprecated), string
// references and expression substitutions. It allows referencing from an
// (instruction, operand, expression subtree) tuple to a de-duped string in
// the string table.
message Reference {
// Index into the global instruction table.
optional int32 instruction_index = 1;
// Index into the operand array local to an instruction. Reads as 0 when the
// field is absent.
optional int32 instruction_operand_index = 2 [default = 0];
// Index into the expression array local to an operand. Reads as 0 when the
// field is absent.
optional int32 operand_expression_index = 3 [default = 0];
// Index into the global string table.
optional int32 string_table_index = 4;
}
// A reference from an instruction to an address.
message DataReference {
// Index into the global instruction table.
optional int32 instruction_index = 1;
// Address being referred to.
optional uint64 address = 2;
}
// A comment attached to an instruction, optionally narrowed down to one of
// its operands and expressions. The comment text lives in the string table.
message Comment {
// Kind of comment.
enum Type {
// A regular instruction comment. Typically displayed next to the
// instruction disassembly.
DEFAULT = 0;
// A comment line that is typically displayed before (above) the
// instruction it refers to.
ANTERIOR = 1;
// Like ANTERIOR, but typically displayed after (below).
POSTERIOR = 2;
// Similar to an ANTERIOR comment, but applies to the beginning of an
// identified function. Programs displaying the proto may choose to render
// these differently (e.g. above an inferred function signature).
FUNCTION = 3;
// Named constants, bitfields and similar.
ENUM = 4;
// Named locations, usually the target of a jump.
LOCATION = 5;
// Data cross references.
GLOBAL_REFERENCE = 6;
// Local/stack variables.
LOCAL_REFERENCE = 7;
}
// Index into the global instruction table. This is here to enable
// comment processing without having to iterate over all instructions.
// There is an N:M mapping of instructions to comments.
optional int32 instruction_index = 1;
// Index into the operand array local to an instruction.
optional int32 instruction_operand_index = 2 [default = 0];
// Index into the expression array local to an operand, like in Reference.
// This expression comment (if present) annotates the expression with
// a markup string, and is better rendered instead of its raw value.
optional int32 operand_expression_index = 3 [default = 0];
// Index into the global string table.
optional int32 string_table_index = 4;
// Comment is propagated to all locations that reference the original
// location.
optional bool repeatable = 5;
// Kind of this comment. Reads as DEFAULT when the field is absent.
optional Type type = 6 [default = DEFAULT];
}
// A section of the input binary: start address, size and access flags.
message Section {
// Section start address.
optional uint64 address = 1;
// Section size.
// NOTE(review): presumably in bytes - confirm.
optional uint64 size = 2;
// Read flag of the section, true when the section is readable.
optional bool flag_r = 3;
// Write flag of the section, true when the section is writable.
optional bool flag_w = 4;
// Execute flag of the section, true when the section is executable.
optional bool flag_x = 5;
}
// A library, identified by name, together with how it is linked and where it
// was loaded.
message Library {
// If this library is statically linked.
optional bool is_static = 1;
// Address where this library was loaded, 0 if unknown.
optional uint64 load_address = 2 [default = 0];
// Name of the library (format is platform-dependent).
optional string name = 3;
}
// A named module, e.g. a Java class.
message Module {
// Name, such as Java class name. Platform-dependent.
optional string name = 1;
}
// Metadata about the exported binary and this file.
optional Meta meta_information = 1;
// Table of expressions.
// NOTE(review): presumably the target of Operand.expression_index and
// Expression.parent_index - confirm.
repeated Expression expression = 2;
// Table of operands.
// NOTE(review): presumably the target of Instruction.operand_index - confirm.
repeated Operand operand = 3;
// Table of mnemonics, indexed by Instruction.mnemonic_index.
repeated Mnemonic mnemonic = 4;
// Global instruction table.
repeated Instruction instruction = 5;
// Table of basic blocks.
// NOTE(review): presumably the target of FlowGraph.basic_block_index -
// confirm.
repeated BasicBlock basic_block = 6;
// Flow graphs.
repeated FlowGraph flow_graph = 7;
// The call graph of the binary.
optional CallGraph call_graph = 8;
// Global string table of de-duped strings, indexed by the
// `string_table_index` fields of Reference and Comment.
repeated string string_table = 9;
// No longer written. This is here so that BinDiff can work with older
// BinExport files.
repeated Reference address_comment = 10 [deprecated = true];
// Rich comment index used for BinDiff's comment porting.
// Note that this is field number 17 although it is declared between fields
// 10 and 11.
repeated Comment comment = 17;
// String references, see Reference.
repeated Reference string_reference = 11;
// Expression substitutions, see Reference.
repeated Reference expression_substitution = 12;
// Sections of the input binary.
repeated Section section = 13;
// Table of libraries.
// NOTE(review): presumably the target of CallGraph.Vertex.library_index -
// confirm.
repeated Library library = 14;
// Data references, see DataReference.
repeated DataReference data_reference = 15;
// Table of modules.
// NOTE(review): presumably the target of CallGraph.Vertex.module_index -
// confirm.
repeated Module module = 16;
// Allow for future extensions. Field numbers from 100000000 up to the maximum
// are set aside for them.
extensions 100000000 to max;
}