// Copyright 2011-2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "third_party/zynamics/binexport/flow_graph.h"
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <ostream>
#include <stack>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
#include "third_party/absl/container/flat_hash_set.h"
#include "third_party/absl/log/check.h"
#include "third_party/absl/log/log.h"
#include "third_party/absl/strings/str_cat.h"
#include "third_party/zynamics/binexport/call_graph.h"
#include "third_party/zynamics/binexport/function.h"
#include "third_party/zynamics/binexport/util/format.h"
using ::security::binexport::FormatAddress;
FlowGraph::~FlowGraph() {
for (auto function : functions_) {
delete function.second;
}
}
void FlowGraph::Render(std::ostream* stream,
const CallGraph& call_graph) const {
for (const auto& function : functions_) {
function.second->Render(stream, call_graph, *this);
*stream << std::endl;
}
}
void FlowGraph::AddEdge(const FlowGraphEdge& edge) { edges_.push_back(edge); }
const Function* FlowGraph::GetFunction(Address address) const {
Functions::const_iterator i = functions_.find(address);
return i != functions_.end() ? i->second : nullptr;
}
Function* FlowGraph::GetFunction(Address address) {
Functions::iterator i = functions_.find(address);
return i != functions_.end() ? i->second : nullptr;
}
void FlowGraph::MarkOrphanInstructions(Instructions* instructions) const {
// First: Mark all instructions as invalid.
for (auto& instruction : *instructions) {
instruction.SetFlag(FLAG_INVALID, true);
}
// Second: Mark instructions that are still being referenced as valid.
for (auto& function : functions_) {
for (auto* basic_block : function.second->GetBasicBlocks()) {
for (auto& instruction : *basic_block) {
if (instruction.GetMnemonic().empty()) {
LOG(WARNING) << absl::StrCat(
FormatAddress(instruction.GetAddress()),
" is reachable from function ",
FormatAddress(function.second->GetEntryPoint()), " basic block ",
FormatAddress(basic_block->GetEntryPoint()), " but invalid!");
continue;
}
instruction.SetFlag(FLAG_INVALID, false);
}
}
}
}
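// Interns the substitution string in string_cache_ so that duplicate
// substitutions share a single copy; the map stores a pointer to the
// interned string.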
void FlowGraph::AddExpressionSubstitution(Address address, uint8_t operator_num,
int expression_id,
const std::string& substitution) {
substitutions_[std::make_tuple(address, operator_num, expression_id)] =
&*string_cache_.insert(substitution).first;
}
std::vector<Address> FlowGraph::FindBasicBlockBreaks(
detego::Instructions* instructions, CallGraph* call_graph,
NoReturnHeuristic noreturn_heuristic) {
// Find non-returning calls. We simply assume that any call followed by an
// invalid instruction or by multi-byte nop padding is non-returning. For the
// rationale behind the nop heuristic, see b/24084521.
// TODO(soerenme) Mark the target function as non-returning and use that
// information for all future calls to the function, even if those calls are
// not followed by an invalid or nop instruction. This is a bit dangerous, as
// it introduces an order dependency on the disassembly - do we already know
// the function is non-returning by the time we find a call to it or not?
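// An illustrative sketch of the pattern this heuristic matches (addresses
// and the target name are made up):
//   0x1000: call _abort   ; FLAG_CALL
//   0x1005: nop           ; FLAG_NOP
//   0x1006: nop           ; more than one nop byte in total, so the call is
//   0x1007: next_func:    ; treated as non-returning; next_func becomes a
//                         ; new function if its in-degree is 1.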
for (auto call_instruction = instructions->begin();
call_instruction != instructions->end(); ++call_instruction) {
if (!call_instruction->HasFlag(FLAG_CALL)) {
continue;
}
int num_nop_bytes = 0;
auto last_nop = instructions->end();
for (auto instruction = GetNextInstruction(instructions, call_instruction);
instruction != instructions->end();
instruction = GetNextInstruction(instructions, instruction)) {
if (instruction->HasFlag(FLAG_INVALID)) {
call_instruction->SetFlag(FLAG_FLOW, false);
break;
}
if (!instruction->HasFlag(FLAG_NOP)) {
break;
}
num_nop_bytes += instruction->GetSize();
last_nop = instruction;
}
if (noreturn_heuristic == NoReturnHeuristic::kNopsAfterCall &&
num_nop_bytes > 1) {
// A single byte nop may or may not indicate a non-returning call.
// http://reverseengineering.stackexchange.com/questions/8030/purpose-of-nop-immediately-after-call-instruction
call_instruction->SetFlag(FLAG_FLOW, false);
// If the last nop flows into an instruction with an in-degree of 1, that
// flow is the only incoming code flow and a function should be created
// there.
auto potential_entry_point = GetNextInstruction(instructions, last_nop);
if (potential_entry_point->GetInDegree() == 1) {
call_graph->AddFunction(potential_entry_point->GetAddress());
}
}
}
// Addresses where a basic block break should occur.
std::vector<Address> basic_block_breaks;
// Find resynchronization points. Typically, there should only ever be one
// incoming code flow per instruction, except when we have overlapping
// instructions. A resynchronization point is where two such sequences of
// overlapping instructions align again and merge into a single instruction
// stream. We only track a 32-byte window to keep the map small; x86
// instructions cannot be larger than that, and presumably neither can those
// of any other instruction set.
// Since we do this globally, before any functions exist, we may end up with
// basic block breaks that are not warranted by any function in the final
// disassembly. This is unfortunate, but not dangerous: the disassembly is
// still correct, just with spurious edges.
// Note: we cannot use [flat, node]_hash_map here, since we repeatedly
// iterate over a map from which (possibly) a lot of elements have been
// erased. Iteration is O(num_elements) for unordered_map, but
// O(num_deleted_elements + num_elements) for those maps.
// TODO(jannewger): performance of the algorithm below is non-ideal and
// should be improved.
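// A sketch of the overlap case (byte values chosen for illustration):
//   0x1000: B8 90 90 90 90  mov eax, 0x90909090  ; stream A
//   0x1001:    90 90 90 90  nop / nop / nop / nop ; stream B, decoded inside
//                                                 ; A's immediate operand
//   0x1005:                 ; both streams continue at 0x1005, which thus
//                           ; sees two incoming code flows -> block break.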
std::unordered_map<Address, int> incoming_code_flows;
for (const auto& instruction : *instructions) {
const Address address = instruction.GetAddress();
const Address flow_address = instruction.GetNextInstruction();
for (auto it = incoming_code_flows.begin(), end = incoming_code_flows.end();
it != end;) {
if (it->second > 1) {
// More than one incoming code flow to this instruction -> the preceding
// instructions must have been overlapping.
basic_block_breaks.emplace_back(it->first);
} else if (it->first + 32 > address) {
// We are still within a 32-byte trailing window of the current
// instruction, so we keep the entry in the map as it could potentially
// still overlap.
break;
}
it = incoming_code_flows.erase(it);
}
if (instruction.IsFlow()) {
++incoming_code_flows[flow_address];
}
}
// If there was overlapping code in the final bytes of the executable, the
// loop above will have missed it, so we deal with those trailing bytes here.
for (const auto incoming_code_flow : incoming_code_flows) {
if (incoming_code_flow.second > 1) {
basic_block_breaks.emplace_back(incoming_code_flow.first);
}
}
// Find branch targets.
for (const auto& edge : edges_) {
basic_block_breaks.emplace_back(edge.target);
}
// Find call targets (function entry points).
for (Address entry_point : call_graph->GetFunctions()) {
basic_block_breaks.emplace_back(entry_point);
}
// Sort the basic block breaks and remove duplicates.
std::sort(basic_block_breaks.begin(), basic_block_breaks.end());
basic_block_breaks.erase(
std::unique(basic_block_breaks.begin(), basic_block_breaks.end()),
basic_block_breaks.end());
return basic_block_breaks;
}
void FlowGraph::CreateBasicBlocks(Instructions* instructions,
CallGraph* call_graph,
NoReturnHeuristic noreturn_heuristic) {
// Sort edges by source address.
std::sort(edges_.begin(), edges_.end());
call_graph->SortEdges();
// If we split basic blocks, we create new synthetic (unconditional) edges;
// they are collected here.
Edges new_edges;
const auto basic_block_breaks =
FindBasicBlockBreaks(instructions, call_graph, noreturn_heuristic);
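// Note: FindBasicBlockBreaks() returns the breaks sorted and deduplicated,
// which is what makes the std::binary_search lookups below valid.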
// Reset the instruction visited flag. Every instruction should belong to
// exactly one basic block; we use the flag to enforce this constraint.
for (auto& instruction : *instructions) {
instruction.SetFlag(FLAG_VISITED, false);
}
absl::flat_hash_set<Address> invalid_functions;
// Start with every known function entry point address and follow flow from
// there. Create new functions and add basic blocks and edges to them.
for (Address entry_point : call_graph->GetFunctions()) {
if (GetInstruction(instructions, entry_point) == instructions->end()) {
continue; // Skip invalid/imported functions.
}
std::stack<Address> address_stack;
address_stack.push(entry_point);
const int initial_edge_number = new_edges.size();
int current_bb_number = 0;
int number_of_instructions = 0;
// Keep track of basic blocks already added to this function.
absl::flat_hash_set<Address> function_basic_blocks;
while (!address_stack.empty()) {
Address address = address_stack.top();
address_stack.pop();
Instructions::iterator instruction =
GetInstruction(instructions, address);
if (instruction == instructions->end()) {
continue;
}
CHECK(instruction->GetAddress() == address);
if (!function_basic_blocks.insert(address).second) {
// We've already dealt with this basic block.
continue;
}
BasicBlock* basic_block = BasicBlock::Find(address);
if (basic_block == nullptr) {
// We need to create a new basic block.
BasicBlockInstructions basic_block_instructions;
int bb_instr_cnt = 0;
bool bb_skip = false;
do {
if (++bb_instr_cnt > kMaxFunctionInstructions) {
bb_skip = true;
break;
}
CHECK(!instruction->HasFlag(FLAG_VISITED));
instruction->SetFlag(FLAG_VISITED, true);
basic_block_instructions.AddInstruction(instruction);
instruction = GetNextInstruction(instructions, instruction);
} while (instruction != instructions->end() &&
!std::binary_search(basic_block_breaks.begin(),
basic_block_breaks.end(),
instruction->GetAddress()));
basic_block = BasicBlock::Create(&basic_block_instructions);
number_of_instructions += bb_instr_cnt;
if (bb_skip) {
LOG(WARNING) << absl::StrCat("Skipping enormous basic block: ",
FormatAddress(address));
continue;
}
} else {
number_of_instructions += basic_block->GetInstructionCount();
}
CHECK(basic_block != nullptr);
++current_bb_number;
// Erase all new_edges for the current entry point (to save some memory)
// and add the entry point to the list of invalid functions, which is
// processed later (after all entry points). This saves memory and CPU,
// because it skips generating results that would otherwise be discarded
// by FlowGraph::FinalizeFunctions().
if ((current_bb_number > kMaxFunctionEarlyBasicBlocks) ||
(number_of_instructions > kMaxFunctionInstructions)) {
if (initial_edge_number < new_edges.size()) {
new_edges.erase(new_edges.begin() + initial_edge_number,
new_edges.end());
}
invalid_functions.insert(entry_point);
break;
}
// Three possibilities:
// - the basic block ends in a non-code flow instruction
// - the basic block ends in a branch instruction
// - the basic block ends before a basic_block_breaks instruction
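// (In the first case the instruction has no outgoing edges and does not
// flow on, so neither branch below pushes a successor and this path ends.)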
CHECK(basic_block != nullptr);
address = basic_block->GetLastAddress();
instruction = GetInstruction(instructions, address);
CHECK(instruction != instructions->end());
auto edge =
std::lower_bound(edges_.begin(), edges_.end(), address,
[](const FlowGraphEdge& edge, Address address) {
return edge.source < address;
});
if (edge != edges_.end() && edge->source == address) {
// A branch instruction.
for (; edge != edges_.end() && edge->source == address; ++edge) {
address_stack.push(edge->target);
}
} else if (instruction->IsFlow()) {
// We must have hit a basic_block_breaks instruction. Add a synthetic
// unconditional branch edge to the next instruction.
const Address next_address = instruction->GetNextInstruction();
address_stack.push(next_address);
new_edges.emplace_back(address, next_address,
FlowGraphEdge::TYPE_UNCONDITIONAL);
}
}
}
if (!invalid_functions.empty()) {
LOG(INFO) << "Early erase of " << invalid_functions.size()
<< " invalid functions.";
// Drop the entry basic blocks of "invalid" functions. We keep their entry
// points in call_graph->functions_ so that FlowGraph::FinalizeFunctions()
// can later add "dummy" function entries for call targets that might be
// wrong, but which we cannot really judge at this point. Such invalid call
// targets will be marked as imported functions in the output BinExport.
for (const Address& addr : invalid_functions) {
BasicBlock::blocks().erase(addr);
}
// Remove all edges from/to the invalid functions. This step might not be
// strictly necessary, but it reduces the amount of data processed later.
edges_.erase(
std::remove_if(edges_.begin(), edges_.end(),
[&invalid_functions](const FlowGraphEdge& edge) {
return (invalid_functions.find(edge.source) !=
invalid_functions.end()) ||
(invalid_functions.find(edge.target) !=
invalid_functions.end());
}),
edges_.end());
}
// Add the new synthetic edges.
edges_.insert(edges_.end(), new_edges.begin(), new_edges.end());
std::sort(edges_.begin(), edges_.end());
}
void FlowGraph::MergeBasicBlocks(const CallGraph& call_graph) {
// At this point we have created basic blocks but have not yet added edges
// to the functions. We merge basic blocks before finally creating the
// functions.
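// A sketch of a merge (addresses made up): given basic blocks
//   A: 0x1000..0x1008, ending in an unconditional edge to 0x2000
//   B: 0x2000..0x2010, where that edge is 0x2000's only incoming code flow
// and 0x2000 is not a function entry point, B is appended to A and the
// connecting edge is deleted.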
auto delete_edge = [this, &call_graph](const FlowGraphEdge& edge) {
if (edge.type != FlowGraphEdge::TYPE_UNCONDITIONAL) {
// Only unconditional edges can potentially lead to a merge.
return false;
}
BasicBlock* target_basic_block = BasicBlock::Find(edge.target);
if (!target_basic_block) {
DLOG(INFO) << absl::StrCat("No target basic block for edge ",
FormatAddress(edge.source), " -> ",
FormatAddress(edge.target));
return true;
}
if (target_basic_block->begin()->GetInDegree() != 1) {
// We have more than a single in edge => do not delete.
return false;
}
if (call_graph.GetFunctions().count(target_basic_block->GetEntryPoint())) {
// Target basic block is a function entry point => do not delete.
return false;
}
BasicBlock* source_basic_block = BasicBlock::FindContaining(edge.source);
if (!source_basic_block) {
LOG(INFO) << absl::StrCat("No source basic block for edge ",
FormatAddress(edge.source), " -> ",
FormatAddress(edge.target));
return true;
}
CHECK(source_basic_block->GetLastAddress() == edge.source);
if (source_basic_block == target_basic_block) {
// This is a self loop edge => do not delete.
return false;
}
auto edges = std::equal_range(
edges_.begin(), edges_.end(), edge,
[](const FlowGraphEdge& one, const FlowGraphEdge& two) {
// Consider edges to be equivalent based on their source address only.
return one.source < two.source;
});
if (std::distance(edges.first, edges.second) > 1) {
// There is more than one edge originating at the source address. Do not
// merge. While this will typically not happen for unconditional edges, it
// does happen for synthetic try/catch block edges, first introduced for
// the DEX disassembler in cl/100567062.
CHECK(edge.source == edges.first->source);
return false;
}
source_basic_block->AppendBlock(*target_basic_block);
BasicBlock::blocks().erase(target_basic_block->GetEntryPoint());
return true;
};
// Delete all edges that connect merged basic blocks.
edges_.erase(std::remove_if(edges_.begin(), edges_.end(), delete_edge),
edges_.end());
}
void FlowGraph::FinalizeFunctions(CallGraph* call_graph) {
// We now have a global "soup" of basic blocks and edges. Next we need to
// follow flow from every function entry point and collect a list of basic
// blocks and edges per function. While doing so we also link the new function
// into the call graph.
for (Address entry_point : call_graph->GetFunctions()) {
std::stack<Address> address_stack;
address_stack.push(entry_point);
auto function = std::make_unique<Function>(entry_point);
size_t num_instructions = 0;
// Keep track of basic blocks and edges already added to this function.
absl::flat_hash_set<Address> function_basic_blocks;
// TODO(cblichmann): Encountering the same basic block multiple times during
// traversal is expected and ok. Encountering the same edge is not. This is
// just inefficient - why did we add it twice in the first place? Only the
// ARM disassembler produces redundant edges at the moment.
absl::flat_hash_set<FlowGraphEdge, FlowGraphEdgeHash> done_edges;
while (!address_stack.empty()) {
Address address = address_stack.top();
address_stack.pop();
if (!function_basic_blocks.insert(address).second) {
continue; // Already added to the function.
}
if (function_basic_blocks.size() >= kMaxFunctionBasicBlocks ||
function->GetEdges().size() >= kMaxFunctionEdges ||
num_instructions >= kMaxFunctionInstructions) {
break;
}
BasicBlock* basic_block = BasicBlock::Find(address);
if (!basic_block) {
// A function without a body (imported/invalid).
continue;
}
function->AddBasicBlock(basic_block);
num_instructions += basic_block->GetInstructionCount();
const auto source_address = basic_block->GetLastAddress();
auto edge =
std::lower_bound(edges_.begin(), edges_.end(), source_address,
[](const FlowGraphEdge& edge, Address address) {
return edge.source < address;
});
uint64_t dropped_edges = 0;
for (; edge != edges_.end() && edge->source == source_address; ++edge) {
if (!BasicBlock::Find(edge->target)) {
DLOG(INFO) << absl::StrCat("Dropping edge ",
FormatAddress(edge->source), " -> ",
FormatAddress(edge->target),
" because the target address is invalid.");
++dropped_edges;
continue;
}
if (done_edges.insert(*edge).second) {
function->AddEdge(*edge);
address_stack.push(edge->target);
}
}
LOG_IF(INFO, dropped_edges > 0) << absl::StrCat(
"Dropped ", dropped_edges,
" edges because the target address is invalid (current basic block ",
FormatAddress(address), ").");
}
if (function_basic_blocks.size() >= kMaxFunctionBasicBlocks ||
function->GetEdges().size() >= kMaxFunctionEdges ||
num_instructions >= kMaxFunctionInstructions) {
DLOG(INFO) << absl::StrCat(
"Discarding excessively large function ", FormatAddress(entry_point),
": ", function_basic_blocks.size(), " basic blocks, ",
function->GetEdges().size(), " edges, ", num_instructions,
" instructions (Limit is ", kMaxFunctionBasicBlocks, ", ",
kMaxFunctionEdges, ", ", kMaxFunctionInstructions, ")");
function->Clear();
// https://github.com/google/binexport/issues/90
function->SetType(Function::TYPE_INVALID);
}
for (const auto* basic_block : function->GetBasicBlocks()) {
// Iterate over the basic block and add edges to the call graph for every
// call instruction. The call graph already knows the source and target
// addresses, but is not yet linked to functions.
const auto& call_edges = call_graph->GetEdges();
for (const auto& instruction : *basic_block) {
if (instruction.HasFlag(FLAG_CALL)) {
auto edge = std::lower_bound(
call_edges.begin(), call_edges.end(), instruction.GetAddress(),
[](const EdgeInfo& edge, Address address) {
return edge.source_ < address;
});
for (; edge != call_edges.end() &&
edge->source_ == instruction.GetAddress();
++edge) {
call_graph->ScheduleEdgeAdd(function.get(), edge->source_,
edge->target_);
}
}
}
}
function->SortGraph();
functions_.insert(std::make_pair(entry_point, function.release()));
}
// We no longer need the global edge list (functions store their own copy).
Edges().swap(edges_);
call_graph->CommitEdges(); // Clear temporary edges.
}
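// Function reconstruction proceeds in three phases: split the instruction
// stream into basic blocks, merge blocks whose single incoming flow is an
// unconditional edge, then follow flow from each entry point to collect
// per-function basic blocks and edges.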
void FlowGraph::ReconstructFunctions(Instructions* instructions,
CallGraph* call_graph,
NoReturnHeuristic noreturn_heuristic) {
CreateBasicBlocks(instructions, call_graph, noreturn_heuristic);
MergeBasicBlocks(*call_graph);
FinalizeFunctions(call_graph);
}
void FlowGraph::PruneFlowGraphEdges() {
// Stupid post-processing step, but IDA sometimes produces broken edges
// (pointing nowhere) if it incorrectly disassembled a data section.
for (auto kv : functions_) {
kv.second->FixEdges();
}
}