From 36704c7e24a30dea5dddb18593306f3f3f053e38 Mon Sep 17 00:00:00 2001
From: blob42 <contact@blob42.xyz>
Date: Tue, 18 Jul 2023 02:40:15 +0200
Subject: [PATCH] wip: column split using regex

---
 Cargo.lock   | 45 ++++++++++++++++++++++++++++++++++++++++++++
 Cargo.toml   |  1 +
 README.md    | 11 ++---------
 src/input.rs | 53 +++++++++++++++++++++++++++++++++++-----------------
 src/main.rs  |  2 +-
 5 files changed, 85 insertions(+), 27 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index beea855..55406e6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,15 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "aho-corasick"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "atty"
 version = "0.2.14"
@@ -61,6 +70,7 @@ name = "colmap"
 version = "0.1.0"
 dependencies = [
  "clap",
+ "regex",
 ]
 
 [[package]]
@@ -84,6 +94,12 @@ version = "0.2.137"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fc7fcc620a3bff7cdd7a365be3376c97191aeaccc2a603e600951e452615bf89"
 
+[[package]]
+name = "memchr"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+
 [[package]]
 name = "once_cell"
 version = "1.16.0"
@@ -138,6 +154,35 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "regex"
+version = "1.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"
+
 [[package]]
 name = "strsim"
 version = "0.10.0"
diff --git a/Cargo.toml b/Cargo.toml
index 4926026..36f5784 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,3 +7,4 @@ edition = "2021"
 
 [dependencies]
 clap = { version = "4.0.26", features = ["derive"] }
+regex = "1.9.1"
diff --git a/README.md b/README.md
index bc0195a..32350f9 100644
--- a/README.md
+++ b/README.md
@@ -53,14 +53,6 @@ foo_cmd | colmap -t 'basename {}' 'awk { print $2 }'
 
 
 
-
-
-
-
-
-
-
-
 ---
 [I am using Github under protest](protest.md)
 
@@ -68,4 +60,5 @@ foo_cmd | colmap -t 'basename {}' 'awk { print $2 }'
 TODO:
 ----
 
-[ ] use non-dashed approach to cli (rwxrob/bonzai)
+[ ] use golden tests
+[ ] use a non-dashed CLI approach, like rwxrob/bonzai
diff --git a/src/input.rs b/src/input.rs
index 57be3b7..abc787a 100644
--- a/src/input.rs
+++ b/src/input.rs
@@ -2,51 +2,56 @@
 #![allow(dead_code)]
 
 use std::fmt;
+use regex::Regex;
+
+const DEFAULT_SEP: &str = r"[\t]+";
 
 type Column = Vec<String>;
 type Columns = Vec<Column>;
 
 /// split input text into columns based on separator character
 /// returns a type representing a variable length array of strings (columns) ?
+///
 /// TODO:
 ///
-///   accept &str and String
 ///   error handling
-pub fn split_columns(text: &str, sep: char) -> Result<Columns, fmt::Error>  {
+///   accept &str and String
+///
+pub fn split_columns(text: &str, sep: &str) -> Result<Columns, fmt::Error>  {
     // read the first line stripping empty lines
     let lines: Vec<&str> = text.trim().lines().collect();
     eprintln!("lines: {:?}", lines);
 
+    let re = Regex::new(sep).unwrap();
+
     // count number of columns 
     let n_col = match lines.first() {
-        Some(line) => line.split(sep).count(),
+        Some(line) => re.split(line).count(),
         None => return Err(std::fmt::Error)
     };
-    // eprintln!("first line: {:?}", lines.first().unwrap());
+
     // eprintln!("# columns: {n_col}");
 
     let mut columns = vec![Column::new(); n_col];
-    for (_l_idx, line) in lines.iter().enumerate() {
 
-        let new_n_col = line.split(sep).count();
+    for (i, line) in lines.iter().enumerate() {
+        eprintln!("checking line {}", i);
+
+        let new_n_col = re.split(line).count();
 
-        // HACK: I should handle repeating separators with a glob or regex library
-        // TIP: usek 
         if new_n_col != n_col {
-            return Err(std::fmt::Error)
+            return Err(fmt::Error)
         }
         eprintln!("number of columns: {}", columns.len());
-        for (c_idx, col) in line.split(sep).enumerate() {
+
+        for (c_idx, col) in re.split(line).enumerate() {
             columns[c_idx].push(col.to_string())
         }
     }
 
     eprintln!("{:?}", columns);
 
-    // let n_col = *lines.first().unwrap();
-
-    // detect number of columns
-    Ok(Columns::new())
+    Ok(columns)
 }
 
 #[test]
@@ -55,11 +60,25 @@ fn test_split_columns(){
 file1.txt		title1
 file2.pdf		title2
 file3			title3
+file with space	title 4
         "###;
-    let columns = split_columns(coltext1, '\t');
+    let columns = split_columns(coltext1, DEFAULT_SEP);
 
     // should have two columns
-    assert_eq!(2, columns.unwrap().len());
+    assert_eq!(2, columns.clone().unwrap().len());
+
+    assert_eq!(vec!["file1.txt",
+                   "file2.pdf",
+                    "file3",
+                   "file with space"
+            ], columns.unwrap()[0]);
+}
 
-    // println!("columns:\n{:?}", columns);
+// #[test]
+fn test_re_split() {
+    let text = "this is		two tabs";
+    let re = Regex::new(r"[\t]+").unwrap();
+    let fields: Vec<&str> = re.split(text).collect();
+    eprintln!("{:?}", fields);
+    assert!(false);
 }
diff --git a/src/main.rs b/src/main.rs
index c918eba..33deffe 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -6,7 +6,7 @@
  * . dynamically generate field parameters ?
  */
 
-use clap::{Parser};
+use clap::Parser;
 use std::process;
 
 mod input;