-
Notifications
You must be signed in to change notification settings - Fork 0
/
bencode.jq
650 lines (606 loc) · 25.9 KB
/
bencode.jq
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
def _log($string):
. as $save |
$string |
debug |
$save
;
# Outputs the input character length as the length of its UTF-8-encoded version in bytes
def _bytecharlength:
utf8bytelength
;
# Outputs the input character length in UTF-8 code units
def _charlength:
1
;
# Outputs the input character length in UTF-16 code units
def _u16charlength:
if utf8bytelength > 3 then 2 else 1 end
;
# Bencode to JSON decoder
# The argument is a function returning the "length" of a character
def _bdecode(length_function):
def integer_process($ichar):
# $ichar: current character of the Bencode-d input being processed
if $ichar == "e" then (
# End of the integer string representation: converts it to an actual integer
( .[2][2] | tonumber ) as $value |
# Adds the leaf value to the current path
.[0][-1][1] |= $value |
# Deletes the transient work structure
del( .[2] )
) else (
# Builds the string representation of the integer value
.[2][2] |= . + $ichar
) end
;
def string_process($ichar;length_function):
# $ichar: current character of the Bencode-d input being processed
# length_function: the filter matching the character length algorithm used for the Bencode-d string
#
# .[2] -> [
# "standalone" or "key" or "value",
# "string",
# "length" or "value",
# "", Length of the string, filled as a string while .[2][2] is "length", a number afterwards
# "" Actual string contents
# ]
def zero_length_check:
# Function checking for strings fully processed and performing actions at that point
if .[2][3] == 0 then (
# String length at 0: empty string and/or end of the string
if .[2][0] == "key" then (
# Adds the key to the current path
.[0][-1][0] += [.[2][4]] |
# Switches to evaluating the value of the KV-pair
.[2] = [ "value" ]
) else (
# KV-pair value or standalone string
# Writing the string as leaf value of the current path
.[0][-1][1] = .[2][4] |
# Deletes the transient work structure: end of string processing
del( .[2] )
) end
) else (
# Not the end of the string: proceeds from here
.
) end
;
if .[2][2] == "length" and (.[2][3]|type) == "string" then (
# Building the string length from the prefix
if $ichar == ":" then (
# End of prefix length: converts it to a number for later use
.[2][3] |= tonumber |
zero_length_check
) else (
# Filling the prefix length
.[2][3] |= . + $ichar
) end
) elif .[2][2] == "length" and (.[2][3]|type) == "number" then (
# Prefix length evaluation ended: first character of the actual contents
# Value mode
.[2][2] |= "value" |
# Decreases the string length by the current character's length according
# to length_function
.[2][3] |= . - ( $ichar | length_function ) |
# Appends $ichar to the output string
.[2][4] |= . + $ichar |
zero_length_check
) elif .[2][2] == "value" and .[2][3] != 0 then (
# Decreases the string length by the current character's length according
# to length_function
.[2][3] |= . - ( $ichar | length_function ) |
# Appends $ichar to the output string
.[2][4] |= . + $ichar |
zero_length_check
) else (
error("This shouldn't be possible!")
) end
;
# Wraps within a list the input
"l\(.)e" |
# Reduces character by character the input Bencode-d string
reduce split("")[] as $item (
# Initial state
[
# Array accumulating stream elements for output
[
[
null
]
],
# [ previous_stack_of_structures, current_stack_of_structures ]
[ [], [] ]
# Transient array (missing here) for building keys/values/standalones
];
#
# Start of the iterative code
#
# Convenient aliases
.[1][1] as $stack |
.[1][1] as $prev_stack |
.[1][0] as $old_stack |
( $stack | length ) as $stack_length |
( $prev_stack | length ) as $prev_stack_length |
( $old_stack | length ) as $old_stack_length |
.[0][-1] as $prev |
.[0][-1][0] as $prev_path |
.[0][-1][1] as $prev_value |
( $prev_path | length ) as $prev_path_length |
if length == 2 then (
# TL Length == 2 -> No value to process
if (.[0][0][0] == null) then (
# .[0][0][0] == null -> First run
if $item == "l" then (
# Start of list
[
[
[
[]
]
],
[ [], ["array"] ]
]
) elif $item == "d" then (
# Start of dictionary
[
[
[
[]
]
],
[ [], ["dictionary"] ]
]
) elif $item == "i" then (
# Start of integer value
[
[
[
[],0
]
],
[ [], [] ],
[
"standalone",
"integer",
""
]
]
) else (
# Start of string value
[
[
[
[],""
]
],
[ [], [] ],
[
"standalone",
"string",
"length",
$item,
""
]
]
) end
) else (
# Inbetween values -> Second run onwards
if $item == "e" then (
# End of dictionary or list
$stack[-2:] as $last_two |
# Removes the last element (array/dictionary) we are closing from the stack
del(.[1][1][-1]) |
.[1][1] as $stack |
( $stack | length ) as $stack_length |
if $old_stack_length == $stack_length and $old_stack_length < $prev_stack_length and $prev_value == null then (
# If a list or dictionary closed as empty, adds it as value for the previous path
if $last_two[-1] == "array" then (
.[0][-1][1] = []
) elif $last_two[-1] == "dictionary" then (
.[0][-1][1] = {}
) else (
error("This shouldn't be possible!")
) end
) else (
.
) end |
# Adds intermediate paths to continue
# E.g. if the last element was [[0,"a"],2] and the dictionary ended, add [[0,"a"]]
.[0] += [
# Calculates the number of intermediate paths needed
[
range(
if ( $prev_value == null ) then (
# The previous iteration was the end of another list/dictionary or the start of one
$prev_path_length - 1
) else (
# The previous element had a standalone leaf value
$prev_path_length
) end;
if (
(
$last_two |
IN( ["dictionary","array"], ["dictionary","dictionary"] )
) and $stack_length > 1
) then (
# If the last element was, for instance, [[0,2,"a","b",0],2]
# the stack would contain ["array","array","dictionary","dictionary","array"]
# Closing the trailing array would also mean removing the dictionary key from the intermediate paths, hence
# [[0,2,"a","b",0]],[[0,2,"a","b"]] added at the end of this step
$stack_length - 2
) else (
# Not "closing" a key-value pair
$stack_length - 1
) end;
# Negative step
-1
)
] |
# Creates the list of intermediate paths in decreasing length
# by slicing the last element multiple times with decreasing
# ranges provided by the "range" function above.
# If no intermediate paths needs to be added "range" outputs
# an empty list.
map(
[ $prev_path[0:.] ]
)[]
]
) else (
# Start of dictionary/list/KV-pair/value
if $stack[-1] == "array" then (
if $prev_path_length < $stack_length then (
# First element of a new array
.[0][-1][0] += [0]
) elif $prev_path_length == $stack_length then (
# Subsequent elements
.[0] += [[
$prev_path[ 0:($stack_length - 1) ] +
[ $prev_path[ -1 ] + 1 ]
]]
) else (
error("This shouldn't be possible!")
) end
) elif $stack[-1] == "dictionary" then (
if $prev_path_length == $stack_length then (
# "Closes" the KV-pair and possibly the dictionary if it's the last KV-pair
.[0] += [[
$prev_path[ 0:($stack_length - 1) ]
]]
) elif $prev_path_length < $stack_length then (
# A new KV-pair opened: nothing to do
.
) else (
error("This shouldn't be possible!")
) end
) else (
error("This shouldn't be possible!")
) end |
if $item == "l" then (
# Starting a list/array: adds that to the stack
.[1][-1] += ["array"]
) elif $item == "d" then (
# Starting a dictionary: adds that to the stack
.[1][-1] += ["dictionary"] |
# Starting a dictionary: goes into key processing of the KV-pair
.[2] = [
"key"
]
) elif $item == "i" then (
# Start of an integer standalone value
. +=
[[
"standalone",
"integer",
""
]]
) else (
if $stack[-1] == "dictionary" then (
# Starts processing the prefix length of the key of a KV-pair
. +=
[[
"key",
"string",
"length",
$item,
""
]]
) else (
# Starts processing the prefix length of a standalone string value
. +=
[[
"standalone",
"string",
"length",
$item,
""
]]
) end
) end
) end
) end
) elif length == 3 then (
# Processing of values after the first character
if .[2][0] == "standalone" and .[2][1] == "string" then (
# Standalone string
string_process($item;length_function)
) elif .[2][0] == "standalone" and .[2][1] == "integer" then (
# Standalone integer
integer_process($item)
) elif .[2][0] == "key" then (
# KV-pair: key
if (.[2]|length == 1) then (
# Starting processing the first character of the key
if $item == "e" then (
# Empty dictionary: writing it as leaf value of the current path
del(.[1][1][-1]) |
.[0][-1][1] = {} |
# Deletes the transient work structure
del(.[2])
) else (
# Transient ds with the first character of the prefix key length
.[2] = [ "key", "string","length",$item,"" ]
) end
) else (
# Processing from the second character of the prefix length of the key
# to the end of the actual key
string_process($item;length_function)
) end
) elif .[2][0] == "value" then (
# KV-pair: value
if (.[2]|length == 1) then (
# Processing the first character of the value part of a KV-pair
if $item == "i" then (
# Integer value: transient ds for the processing of the value
.[2] = [ "value", "integer","" ]
) elif $item == "l" then (
# Nested array/list
.[1][1] += ["array"] |
# Deleting the transient ds to leave the value processing phase
del(.[2])
) elif $item == "d" then (
# Nested dictionary
.[1][1] += ["dictionary"] |
# Deleting the transient ds to leave the value processing phase
del(.[2])
) else (
# String value: transient ds with the first character of the prefix value length
.[2] = [ "value", "string","length",$item,"" ]
) end
) else (
# Processing from the second character of integer/string values onwards
if .[2][0] == "value" and .[2][1] == "integer" then (
# Integer value: processing
integer_process($item)
) elif .[2][0] == "value" and .[2][1] == "string" then (
# String value: processing
string_process($item;length_function)
) else (
error("This shouldn't be possible!")
) end
) end
) else (
error("This shouldn't be possible!")
) end
) else (
error("This shouldn't be possible!")
) end |
if .[1][1] | length > 0 then (
# Saves the "previous" stack for the next iteration
.[1][0] = $prev_stack
) else (
# Unless we've finished processing the input
.
) end
) |
# Converts the stream representation into a JSON DS, removing the initial list wrapping
fromstream(.[0][0:-1][])[]
;
# JSON to Bencode encoder
# The argument is a function returning the "length" of a character
def _bencode(length_function):
# Function expressing the length of the leading
# slice of an array/list common to both the input
# and $b. Which also coincides with the index of
# the first element which differs between the two
# lists.
def common_length($b):
. as $a |
($a|length) as $len_a |
($b|length) as $len_b |
([$len_a,$len_b] | min ) as $min |
[ range(0;$min;1) ] |
map($a[.] == $b[.]) |
index(false) // $min
;
# Function computing the length of its string argument
# following the string length conventions of "length_function"
def string_length(length_function):
reduce split("")[] as $char (
# Initial state
0;
. + ( $char | length_function )
) |
tostring
;
# Wraps within a list the input
[.] |
# Generates a streaming form of the wrapped input, wrapped into a list
[tostream] |
# Processes the streaming form list elements through "reduce", with each
# subsequent element aliased to $item
reduce .[] as $item (
# Initial reduce data structure
[
# String accumulating the Bencode-d form of the input
"",
# Data from previous iteration
[
[]
],
# Data stack from previous iteration
[]
# Working container object (missing at the start here)
];
# Aliases for the previous iteration elements
.[1] as $previous |
.[1][0] as $prev_path |
( .[1] | length ) as $prev_length |
( .[1][0] | length ) as $prev_path_length |
.[2] as $previous_stack |
( .[2] | length ) as $prev_stack_length |
# Processes the new stream element
.[1] |= $item |
# Builds the stack from the new stream element
# (e.g. from [[0,2,"a",0,"b",0],2] build
# ["array","array","dictionary","array","dictionary","array"] )
.[2] |= (
$item[0] |
map(
if type == "number" then (
"array"
) else (
"dictionary"
) end
)
) |
# Aliases for the current iteration elements
.[2] as $current_stack |
( .[2] | length ) as $curr_stack_length |
.[1] as $current |
.[1][0] as $curr_path |
( .[1] | length ) as $curr_length |
( .[1][0] | length ) as $curr_path_length |
# Find the index of the first different element between
# the paths of the previous and the current stream elements
( $curr_path | common_length($prev_path)) as $divergence_index |
if $prev_path_length > 0 and $divergence_index > 0 and $curr_length == 1 then (
# Add a closing mark (i.e. "e") for each list/dictionary still open once
# we've reached the base level of the input data
.[0] += (
(
"e" * ( $previous_stack[$divergence_index:] | length )
) // ""
)
) else (
if $prev_path_length > 1 then (
if $divergence_index + 1 < $prev_path_length then (
# One or more data structures have been closed since the last iteration: adds
# the appropriate number of markers (i.e. "e") to reflect that
#
# Let's take the "bencode" function input to be [ [[[1]]], {"a":{"b":{"c":2}}} ] .
# With the initial list wrapping, the stream element before the start
# of the dictionary will be
# [[0,0,0]]
# followed by
# [[0,1,"a","b","c"],2]
# The index at which the paths diverge is 1, but that's the index of the array key.
# The same would go for a dictionary.
# Hence we count the number of elements following the aforementioned index as the
# number of end markers (i.e. "e") we have to add.
.[0] += (
(
"e" * ( $previous_stack[$divergence_index+1:] | length )
) // ""
)
) else (
.
) end
) else (
.
) end |
# Copy the trailing portion of the current path which differs from
# the one of the previous iteration for further processing
.[3] |= $item[0][$divergence_index:] |
# Adds any start of dictionary/list marker (i.e. "d" or "l") needed
.[0] += (
reduce .[3][] as $key (
[
# Bencode-d string accumulator
"",
# Index of the key being processed
0
];
if ( $key | type == "number" and $key == 0 ) then (
# The key is a number and that number is 0, hence we are at the start of a list/array
.[0] += "l"
) elif ( $key | type == "number" ) then (
# The key is a number different from 0, we are within a list/array
.
) elif ( $key | type == "string" ) then (
# The key is a string, hence there's a dictionary
.[0] += (
(
# Adds a start of dictionary marker unless the key index is 0
# (which means that a new key-value pair is being added to a
# previously opened dictionary)
if .[1] == 0
then ""
else "d"
end
) + (
# Adds the dictionary key in Bencode-d form
$key |
# Computes the length of the key according to our definition of string length
string_length( length_function )
) + ":" + $key
)
) else (
# We should never reach this fork
error("This should not happen!")
) end |
# Increase the key index counter
.[1] += 1
) |
# Extracts the Bencode-d string
.[0]
) |
if $item | length == 2 then (
# We are looking at a stream leaf element (i.e. we have an actual value)
.[0] += (
# Translates values in Bencode-d forms, with a few implementation choices
# where data types have no direct correspondance
if $item[1] | type == "number" then (
"i" + ( $item[1] | tostring ) + "e"
) elif $item[1] | type == "string" then (
(
$item[1] |
# Computes the length of the string value according to our definition of string length
string_length( length_function )
) + ":" + $item[1]
) elif $item[1] | type == "boolean" then (
# We choose to represent boolean values as "1" or "0" integer values
if $item[1] == true
then "i1e"
else "i0e"
end
) elif $item[1] | type == "null" then (
# We choose to represent null values as empty strings
"0:"
) elif $item[1] | type == "array" then (
"le"
) elif $item[1] | type == "object" then (
"de"
) else (
# We should never reach this fork
error("This should not happen!")
) end
)
) else (
.
) end
) end
)|
# Removes the leading "l" coming from the list wrapper at the start
.[0][1:]
;
# Implementation of Bencode where the string length is expressed as a count of UTF-16 code units.
# So every character within the Basic Multilingual Plane requires 1 code unit
# to encode, while every character in Supplementary planes requires 2 code units.
def u16strbdecode: _bdecode(_u16charlength);
def u16strbencode: _bencode(_u16charlength);
# Implementation of Bencode where the string length is expressed as a count of UTF-8 code units.
def strbdecode: _bdecode(_charlength);
def strbencode: _bencode(_charlength);
# Implementation of Bencode (the most common one) where the string length is expressed as a byte count.
def bdecode: _bdecode(_bytecharlength);
def bencode: _bencode(_bytecharlength);