-
Notifications
You must be signed in to change notification settings - Fork 2
/
fstore_op_latency.rb
executable file
·133 lines (125 loc) · 5.81 KB
/
fstore_op_latency.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env ruby
# This script is designed to parse OSD logs with debug_osd=10 and
# debug_filestore = 10 (or 15 if you want to more precisely measure the time
# inside of pwritev()) and calculate the latency of various parts of the
# process.
#
# It has two main purposes, the first is to count and calculate the latency of
# every operation, and secondly is to time specific parts of it with an
# original focus on the time spent inside the pwritev() call to the kernel,
# i.e. when we submit the I/O to the page cache.
#
# The reason I wanted to catch this is that if a write is made that is not
# wholly formed of entire 4K pages, then the kernel may need to fetch the rest
# of that 4K from the disk as it cannot buffer the write in memory of only part
# of a page. In this case the pwritev() call which is normally a buffered write
# then BLOCKS waiting for the rest of the page to come back from disk. This
# holds up your filestore_op_thread of which by default there are only 2,
# possibly for 100s of milliseconds in a HDD environment.
#
# In that particular environment the fix for this problem, because they had
# a high percentage of what I call 'Partial' writes (that is writes that are
# not 4K aligned and also sized in multiples of 4K) due to having primarily
# Windows clients which generally align writes to 512b and not 4k unlike Linux
# clients which almost exclusively send 4k I/Os because that is the granularity
# of the page cache (at least where PAGE_SIZE is 4k anyway.. usually the case..
# for now..)
#
# The fix for that particular environment was to set filestore_fadvise=false..
# the problem was that Ceph helpfully tells the kernel to throw away the page
# cache for objects on secondary OSDs because it doesn't expect to read from it
# again. But doesn't account for needing the rest of the page to complete
# a partial write in the future. This is problematic on it's own but
# particularly problematic when you have a fast NVMe journal but slower HDD
# OSD.
# Author: Trent Lloyd
# https://github.com/lathiat/ceph-tools
# License: GPL v3
require 'pp'
require 'date'
partial_ops = 0
slow_ops = 0
slow_partial_ops = 0
slow_ops_time = 0
total_ops = 0
thread = {}
partial_sizes = {}
threshold = 0.2
ARGF.each_line do |l|
a = l.split(" ")
begin
cur_t = DateTime.parse(a[0..1].join(" ")).to_time
rescue ArgumentError
next
end
thread_id = a[2]
next if cur_t.nil?
if l =~ / _do_op .* start$/
thread[thread_id] = {start: {time: cur_t, line: l}}
end
if l =~ / 10 filestore(.*) write .*/
next unless thread[thread_id]
write_data = a[7].split("~")
wr_offset = write_data[0].to_i
wr_size = write_data[1].to_i
start_partial = (wr_offset % 4096)
end_partial = ((wr_offset + wr_size) % 4096)
start_block = wr_offset / 4096
end_block = (wr_offset + wr_size) / 4096
num_blocks = end_block - start_block + 1
if num_blocks == 1
if (start_partial > 0) && (end_partial > 0)
end_partial = 0
end
end
if start_partial > 0
partial_sizes[start_partial] ||= 0
partial_sizes[start_partial] += 1
end
if end_partial > 0
partial_sizes[end_partial] ||= 0
partial_sizes[end_partial] += 1
end
thread[thread_id][:write] = {time: cur_t, line: l, start_partial: start_partial, end_partial: end_partial}
end
if l =~ / 15 filestore(.*) write .*/
next unless thread[thread_id]
thread[thread_id][:write_start] = {time: cur_t, line: l}
end
if l=~ /_do_op .*, finisher/
next unless thread[thread_id]
next unless thread[thread_id][:write]
next unless thread[thread_id][:start]
total_ops += 1
thread[thread_id][:finish] = {time: cur_t, line: l}
delta_w = thread[thread_id][:write][:time] - thread[thread_id][:start][:time]
delta_f = thread[thread_id][:finish][:time] - thread[thread_id][:write][:time]
if thread[thread_id][:write_start]
delta_ws = thread[thread_id][:write][:time] - thread[thread_id][:write_start][:time]
else
delta_ws = 0.000000
end
partial = false
if thread[thread_id][:write][:start_partial] > 0 || thread[thread_id][:write][:end_partial] > 0
partial = true
partial_ops += 1
#printf("0%1s%1s%1s%1s %.6f %.6f %.6f %s", delta_w > threshold ? "W" : " ", delta_ws > threshold ? "S" : " ", delta_f > threshold ? "F" : "", partial ? "P" : " ", delta_w, delta_ws, delta_f, thread[thread_id][:write][:line])
end
if delta_w > threshold || delta_f > threshold || delta_ws > threshold
slow_ops += 1
slow_ops_time += delta_w
slow_ops_time += delta_f
slow_partial_ops += 1 if partial
printf("1%1s%1s%1s%1s %.6f %.6f %.6f %s", delta_w > threshold ? "W" : " ", delta_ws > threshold ? "S" : " ", delta_f > threshold ? "F" : "", partial ? "P" : " ", delta_w, delta_ws, delta_f, thread[thread_id][:start][:line])
printf("2%1s%1s%1s%1s %.6f %.6f %.6f %s", delta_w > threshold ? "W" : " ", delta_ws > threshold ? "S" : " ", delta_f > threshold ? "F" : "", partial ? "P" : " ", delta_w, delta_ws, delta_f, thread[thread_id][:write][:line])
printf("3%1s%1s%1s%1s %.6f %.6f %.6f %s", delta_w > threshold ? "W" : " ", delta_ws > threshold ? "S" : " ", delta_f > threshold ? "F" : "", partial ? "P" : " ", delta_w, delta_ws, delta_f, thread[thread_id][:finish][:line])
printf("\n")
end
end
end
printf("%8s: %d\n", "Slow Ops", slow_ops)
printf("%8s: %d\n", "Partial (non-4K) Ops", partial_ops)
printf("%8s: %d\n", "Slow & Partial Ops", slow_partial_ops)
printf("%8s: %d\n", "Total Ops", total_ops)
printf("%8s: %d\n", "Slow Ops Time", slow_ops_time)
pp partial_sizes