Fix unflatten with field names like . .x or x..y (#1735)

* Fix unflatten with field name like `.` `.x` or `x..y` * docs & test data
johnkerl · Dec 23, 2024 · cc1cd95 · cc1cd95
1 parent 8088850
commit cc1cd95
Show file tree

Hide file tree

Showing 12 changed files with 164 additions and 43 deletions.
diff --git a/docs/src/data/flatten-dots.csv b/docs/src/data/flatten-dots.csv
@@ -0,0 +1,2 @@
+a,b.,.c,.,d..e,f.g
+1,2,3,4,5,6
diff --git a/docs/src/flatten-unflatten.md b/docs/src/flatten-unflatten.md
@@ -348,6 +348,59 @@ a.1,a.3,a.5
 ]
 </pre>
 
+## Non-inferencing cases
+
+An additional heuristic is that if a field name starts with a `.`, ends with
+a `.`, or has two or more consecutive `.` characters, no attempt is made
+to unflatten it on conversion from non-JSON to JSON.
+
+<pre class="pre-highlight-in-pair">
+<b>cat data/flatten-dots.csv</b>
+</pre>
+<pre class="pre-non-highlight-in-pair">
+a,b.,.c,.,d..e,f.g
+1,2,3,4,5,6
+</pre>
+
+<pre class="pre-highlight-in-pair">
+<b>mlr --icsv --oxtab cat data/flatten-dots.csv</b>
+</pre>
+<pre class="pre-non-highlight-in-pair">
+a    1
+b.   2
+.c   3
+.    4
+d..e 5
+f.g  6
+</pre>
+
+<pre class="pre-highlight-in-pair">
+<b>mlr --icsv --ojson cat data/flatten-dots.csv</b>
+</pre>
+<pre class="pre-non-highlight-in-pair">
+[
+{
+  "a": 1,
+  "b.": 2,
+  ".c": 3,
+  ".": 4,
+  "d..e": 5,
+  "f": {
+    "g": 6
+  }
+}
+]
+</pre>
+
+## Non-inferencing cases
+
+An additional heuristic is that if a field name starts with a `.`, ends with
+a `.`, or has two or more consecutive `.` characters, no attempt is made
+to unflatten it on conversion from non-JSON to JSON.
+
+## Manual control
+
+
 ## Manual control
 
 To see what our options are for manually controlling flattening and

diff --git a/docs/src/flatten-unflatten.md.in b/docs/src/flatten-unflatten.md.in
@@ -156,6 +156,33 @@ GENMD-RUN-COMMAND
 mlr --c2j cat data/non-consecutive.csv
 GENMD-EOF
 
+## Non-inferencing cases
+
+An additional heuristic is that if a field name starts with a `.`, ends with
+a `.`, or has two or more consecutive `.` characters, no attempt is made
+to unflatten it on conversion from non-JSON to JSON.
+
+GENMD-RUN-COMMAND
+cat data/flatten-dots.csv
+GENMD-EOF
+
+GENMD-RUN-COMMAND
+mlr --icsv --oxtab cat data/flatten-dots.csv
+GENMD-EOF
+
+GENMD-RUN-COMMAND
+mlr --icsv --ojson cat data/flatten-dots.csv
+GENMD-EOF
+
+## Non-inferencing cases
+
+An additional heuristic is that if a field name starts with a `.`, ends with
+a `.`, or has two or more consecutive `.` characters, no attempt is made
+to unflatten it on conversion from non-JSON to JSON.
+
+## Manual control
+
+
 ## Manual control
 
 To see what our options are for manually controlling flattening and

diff --git a/docs/src/manpage.md b/docs/src/manpage.md
@@ -424,7 +424,7 @@ This is simply a copy of what you should see on running `man mlr` at a command p
 1mFLATTEN-UNFLATTEN FLAGS0m
        These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
 
-       See the Flatten/unflatten doc page for more information.
+       See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
 
        --flatsep or --jflatsep {string}
                                 Separator for flattening multi-level JSON keys, e.g.
@@ -435,10 +435,10 @@ This is simply a copy of what you should see on running `man mlr` at a command p
                                 then this flattens to `y.1=7,y.2=8,y.3=9, and
                                 similarly for maps. With `--no-auto-flatten`, instead
                                 we get `$y=[1, 2, 3]`.
-       --no-auto-unflatten      When input non-JSON and output is JSON, suppress the
-                                default auto-unflatten behavior. Default: if the
+       --no-auto-unflatten      When input is non-JSON and output is JSON, suppress
+                                the default auto-unflatten behavior. Default: if the
                                 input has `y.1=7,y.2=8,y.3=9` then this unflattens to
-                                `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
+                                `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9`. With
                                 `--no-auto-flatten`, instead we get
                                 `${y.1}=7,${y.2}=8,${y.3}=9`.
 
@@ -3737,5 +3737,5 @@ This is simply a copy of what you should see on running `man mlr` at a command p
        MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
        https://miller.readthedocs.io
 
-                                  2024-11-23                         4mMILLER24m(1)
+                                  2024-12-23                         4mMILLER24m(1)
 </pre>
diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt
@@ -403,7 +403,7 @@
 1mFLATTEN-UNFLATTEN FLAGS0m
        These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
 
-       See the Flatten/unflatten doc page for more information.
+       See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
 
        --flatsep or --jflatsep {string}
                                 Separator for flattening multi-level JSON keys, e.g.
@@ -414,10 +414,10 @@
                                 then this flattens to `y.1=7,y.2=8,y.3=9, and
                                 similarly for maps. With `--no-auto-flatten`, instead
                                 we get `$y=[1, 2, 3]`.
-       --no-auto-unflatten      When input non-JSON and output is JSON, suppress the
-                                default auto-unflatten behavior. Default: if the
+       --no-auto-unflatten      When input is non-JSON and output is JSON, suppress
+                                the default auto-unflatten behavior. Default: if the
                                 input has `y.1=7,y.2=8,y.3=9` then this unflattens to
-                                `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
+                                `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9`. With
                                 `--no-auto-flatten`, instead we get
                                 `${y.1}=7,${y.2}=8,${y.3}=9`.
 
@@ -3716,4 +3716,4 @@
        MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
        https://miller.readthedocs.io
 
-                                  2024-11-23                         4mMILLER24m(1)
+                                  2024-12-23                         4mMILLER24m(1)
diff --git a/docs/src/reference-main-flag-list.md b/docs/src/reference-main-flag-list.md
@@ -195,14 +195,14 @@ are overridden in all cases by setting output format to `format2`.
 
 These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
 
-See the Flatten/unflatten doc page for more information.
+See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
 
 
 **Flags:**
 
 * `--flatsep or --jflatsep {string}`: Separator for flattening multi-level JSON keys, e.g. `{"a":{"b":3}}` becomes `a:b => 3` for non-JSON formats. Defaults to `.`.
-* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.
-* `--no-auto-unflatten`: When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`.  flattens to `y.1=7,y.2=8,y.3=9. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.
+* `--no-auto-flatten`: When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.
+* `--no-auto-unflatten`: When input is non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`.  With `--no-auto-unflatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.
 
 ## Format-conversion keystroke-saver flags
 

diff --git a/man/manpage.txt b/man/manpage.txt
@@ -403,7 +403,7 @@
 1mFLATTEN-UNFLATTEN FLAGS0m
        These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
 
-       See the Flatten/unflatten doc page for more information.
+       See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
 
        --flatsep or --jflatsep {string}
                                 Separator for flattening multi-level JSON keys, e.g.
@@ -414,10 +414,10 @@
                                 then this flattens to `y.1=7,y.2=8,y.3=9, and
                                 similarly for maps. With `--no-auto-flatten`, instead
                                 we get `$y=[1, 2, 3]`.
-       --no-auto-unflatten      When input non-JSON and output is JSON, suppress the
-                                default auto-unflatten behavior. Default: if the
+       --no-auto-unflatten      When input is non-JSON and output is JSON, suppress
+                                the default auto-unflatten behavior. Default: if the
                                 input has `y.1=7,y.2=8,y.3=9` then this unflattens to
-                                `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
+                                `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9`. With
                                 `--no-auto-flatten`, instead we get
                                 `${y.1}=7,${y.2}=8,${y.3}=9`.
 
@@ -3716,4 +3716,4 @@
        MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
        https://miller.readthedocs.io
 
-                                  2024-11-23                         4mMILLER24m(1)
+                                  2024-12-23                         4mMILLER24m(1)
diff --git a/man/mlr.1 b/man/mlr.1
@@ -2,12 +2,12 @@
 .\"     Title: mlr
 .\"    Author: [see the "AUTHOR" section]
 .\" Generator: ./mkman.rb
-.\"      Date: 2024-11-23
+.\"      Date: 2024-12-23
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "MILLER" "1" "2024-11-23" "\ \&" "\ \&"
+.TH "MILLER" "1" "2024-12-23" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Portability definitions
 .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -492,7 +492,7 @@ are overridden in all cases by setting output format to `format2`.
 .nf
 These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).
 
-See the Flatten/unflatten doc page for more information.
+See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.
 
 --flatsep or --jflatsep {string}
                          Separator for flattening multi-level JSON keys, e.g.
@@ -503,10 +503,10 @@ See the Flatten/unflatten doc page for more information.
                          then this flattens to `y.1=7,y.2=8,y.3=9, and
                          similarly for maps. With `--no-auto-flatten`, instead
                          we get `$y=[1, 2, 3]`.
---no-auto-unflatten      When input non-JSON and output is JSON, suppress the
-                         default auto-unflatten behavior. Default: if the
+--no-auto-unflatten      When input is non-JSON and output is JSON, suppress
+                         the default auto-unflatten behavior. Default: if the
                          input has `y.1=7,y.2=8,y.3=9` then this unflattens to
-                         `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9. With
+                         `$y=[7,8,9]`. flattens to `y.1=7,y.2=8,y.3=9`. With
                          `--no-auto-flatten`, instead we get
                          `${y.1}=7,${y.2}=8,${y.3}=9`.
 .fi

diff --git a/pkg/cli/option_parse.go b/pkg/cli/option_parse.go
@@ -2877,7 +2877,7 @@ var OutputColorizationFlagSection = FlagSection{
 func FlattenUnflattenPrintInfo() {
 	fmt.Println("These flags control how Miller converts record values which are maps or arrays, when input is JSON and output is non-JSON (flattening) or input is non-JSON and output is JSON (unflattening).")
 	fmt.Println()
-	fmt.Println("See the Flatten/unflatten doc page for more information.")
+	fmt.Println("See the flatten/unflatten doc page https://miller.readthedocs.io/en/latest/flatten-unflatten for more information.")
 }
 
 func init() { FlattenUnflattenFlagSection.Sort() }
@@ -2901,7 +2901,7 @@ var FlattenUnflattenFlagSection = FlagSection{
 
 		{
 			name: "--no-auto-flatten",
-			help: "When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.",
+			help: "When output is non-JSON, suppress the default auto-flatten behavior. Default: if `$y = [7,8,9]` then this flattens to `y.1=7,y.2=8,y.3=9`, and similarly for maps. With `--no-auto-flatten`, instead we get `$y=[1, 2, 3]`.",
 			parser: func(args []string, argc int, pargi *int, options *TOptions) {
 				options.WriterOptions.AutoFlatten = false
 				*pargi += 1
@@ -2910,7 +2910,7 @@ var FlattenUnflattenFlagSection = FlagSection{
 
 		{
 			name: "--no-auto-unflatten",
-			help: "When input non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`.  flattens to `y.1=7,y.2=8,y.3=9. With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.",
+			help: "When input is non-JSON and output is JSON, suppress the default auto-unflatten behavior. Default: if the input has `y.1=7,y.2=8,y.3=9` then this unflattens to `$y=[7,8,9]`.  With `--no-auto-flatten`, instead we get `${y.1}=7,${y.2}=8,${y.3}=9`.",
 			parser: func(args []string, argc int, pargi *int, options *TOptions) {
 				options.WriterOptions.AutoUnflatten = false
 				*pargi += 1

diff --git a/pkg/mlrval/mlrmap_flatten_unflatten.go b/pkg/mlrval/mlrmap_flatten_unflatten.go
@@ -106,7 +106,18 @@ func (mlrmap *Mlrmap) isFlattenable() bool {
 // For mlr unflatten without -f. This undoes Unflatten.  This is for conversion
 // from non-JSON to JSON.  If there are fields x.a, x.b, x.c, etc. they're put
 // into a single field x with map-valued value keyed by "a", "b", "c".
-
+//
+// There is a heuristic here though. Miller is (wildly) multi-format and needs
+// to accommodate all manner of data. In the JSON world, "." is the default
+// delimiter for nested data, and we're here to handle that. But in the R world,
+// "." is just like "_" in other languages: witness "data.frame" rather than
+// "data_frame". If the "." was intended as punctuation, in, say, a field named
+// "a.b" with value 3, then unflatten-to-JSON will make `{"a": {"b": 3}}`.  This
+// is just our default behavior; users can use --no-auto-unflatten. Weirder
+// are field names like ".", ".x", "x.", "x..y", etc. The heuristic here
+// is that when we split on "." and any of the pieces around/between the dots
+// are empty string, we don't try to unflatten that field.
+//
 // Special case: if the resulting string keys are string representations of 1,
 // 2, 3, etc -- without gaps -- then the map is converted to an array.
 //
@@ -134,22 +145,38 @@ func (mlrmap *Mlrmap) CopyUnflattened(
 
 	// We'll come through this loop once for x.a, another for x.b, etc.
 	for pe := mlrmap.Head; pe != nil; pe = pe.Next {
-		// Is the field name something dot something?
-		if strings.Contains(pe.Key, separator) {
-			arrayOfIndices := SplitAXHelper(pe.Key, separator)
-			arrayval := arrayOfIndices.intf.([]*Mlrval)
-			lib.InternalCodingErrorIf(len(arrayval) < 1)
-			// If the input field name was "x.a" then remember the "x".
-			baseIndex := arrayval[0].String()
-			affectedBaseIndices[baseIndex] = true
-			// Use PutIndexed to assign $x["a"] = 7, or $x["b"] = 8, etc.
-			other.PutIndexed(
-				CopyMlrvalArray(arrayval),
-				unflattenTerminal(pe.Value).Copy(),
-			)
-		} else {
+		// If there are no dots in the field name, treat it as a terminal.
+		if !strings.Contains(pe.Key, separator) {
+			other.PutReference(pe.Key, unflattenTerminal(pe.Value))
+			continue
+		}
+
+		arrayOfIndices := SplitAXHelper(pe.Key, separator)
+		arrayval := arrayOfIndices.intf.([]*Mlrval)
+		lib.InternalCodingErrorIf(len(arrayval) < 1)
+
+		// Check for "" in any of the split pieces; treat the field as terminal if so.
+		legitDots := true
+		for i, _ := range arrayval {
+			piece := arrayval[i].String()
+			if piece == "" {
+				legitDots = false
+				break
+			}
+		}
+		if !legitDots {
 			other.PutReference(pe.Key, unflattenTerminal(pe.Value))
+			continue
 		}
+
+		// If the input field name was "x.a" then remember the "x".
+		baseIndex := arrayval[0].String()
+		affectedBaseIndices[baseIndex] = true
+		// Use PutIndexed to assign $x["a"] = 7, or $x["b"] = 8, etc.
+		other.PutIndexed(
+			CopyMlrvalArray(arrayval),
+			unflattenTerminal(pe.Value).Copy(),
+		)
 	}
 
 	// Go through all the field names which were turned into maps -- e.g.  "x"

diff --git a/test/cases/verb-flatten-unflatten/0011/expout b/test/cases/verb-flatten-unflatten/0011/expout
@@ -24,6 +24,13 @@
   "wrapper": {
     "empty3": {},
     "emtpy4": []
-  }
+  },
+  "x": {
+    "y": 1
+  },
+  "@": 2,
+  "x@": 3,
+  "@y": 4,
+  "x@@y": 5
 }
 ]
diff --git a/test/input/unflatten-input-2.xtab b/test/input/unflatten-input-2.xtab
@@ -13,3 +13,8 @@ empty1                      {}
 empty2                      []
 wrapper@empty3              {}
 wrapper@emtpy4              []
+x@y                         1
+@                           2
+x@                          3
+@y                          4
+x@@y                        5