From 7d41da931abd5a991af868b78ab9cdfa8a4c1b08 Mon Sep 17 00:00:00 2001 From: David Raznick Date: Thu, 25 Jan 2024 21:48:11 +0000 Subject: [PATCH] fix truncate in middle of unicode char --- src/lib.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 4f90e30..124b6e9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2587,7 +2587,11 @@ impl FlatFiles { if cell.len() > 32767 { log::warn!("WARNING: Cell larger than 32767 chararcters which is too large for XLSX format. The cell will be truncated, so some data will be missing."); - cell.truncate(32767) + let mut index: usize = 32767; + while !cell.is_char_boundary(index) { + index -= 1; + } + cell.truncate(index) } if metadata.describers[order].guess_type().0 == "number" { @@ -4377,6 +4381,15 @@ mod tests { ) } + // #[test] + // fn test_is_char_boundry() { + // test_output( + // "fixtures/is_char_boundry.txt", + // vec![], + // json!({"ndjson": true, "xlsx": true}), + // ) + // } + #[test] fn test_s3_input() { if std::env::var("AWS_DEFAULT_REGION").is_ok() {