wip: column split using regex

10 months ago · 36704c7e24
parent 3fa4559117
commit 36704c7e24
5 changed files with 85 additions and 27 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2,6 +2,15 @@
 # It is not intended for manual editing.
 version = 3

+[[package]]
+name = "aho-corasick"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "atty"
 version = "0.2.14"
@ -61,6 +70,7 @@ name = "colmap"
 version = "0.1.0"
 dependencies = [
 "clap",
+ "regex",
 ]

 [[package]]
@ -84,6 +94,12 @@ version = "0.2.137"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fc7fcc620a3bff7cdd7a365be3376c97191aeaccc2a603e600951e452615bf89"

+[[package]]
+name = "memchr"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+
 [[package]]
 name = "once_cell"
 version = "1.16.0"
@ -138,6 +154,35 @@ dependencies = [
 "proc-macro2",
 ]

+[[package]]
+name = "regex"
+version = "1.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"
+
 [[package]]
 name = "strsim"
 version = "0.10.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -7,3 +7,4 @@ edition = "2021"

 [dependencies]
 clap = { version = "4.0.26", features = ["derive"] }
+regex = "1.9.1"
--- a/README.md
+++ b/README.md
@ -53,14 +53,6 @@ foo_cmd | colmap -t 'basename {}' 'awk { print $2 }'



-
-
-
-
-
-
-
-
 ---
 [I am using Github under protest](protest.md)

@ -68,4 +60,5 @@ foo_cmd | colmap -t 'basename {}' 'awk { print $2 }'
 TODO:
 ----

-[ ] use non-dashed approach to cli (rwxrob/bonzai)
+- [ ] use golden tests
+- [ ] use a non-dashed CLI, like rwxrob/bonzai
--- a/src/input.rs
+++ b/src/input.rs
@ -2,51 +2,56 @@
 #![allow(dead_code)]

 use std::fmt;
+use regex::Regex;
+
+const DEFAULT_SEP: &str = r"[\t]+";

 type Column = Vec<String>;
 type Columns = Vec<Column>;

 /// split input text into columns based on separator character
 /// returns a type representing a variable length array of strings (columns) ?
+///
 /// TODO:
 ///
-///   accept &str and String
 ///   error handling
-pub fn split_columns(text: &str, sep: char) -> Result<Columns, fmt::Error>  {
+///   accept &str and String
+///
+pub fn split_columns(text: &str, sep: &str) -> Result<Columns, fmt::Error>  {
    // read the first line stripping empty lines
    let lines: Vec<&str> = text.trim().lines().collect();
    eprintln!("lines: {:?}", lines);

+    let re = Regex::new(sep).unwrap();
+
    // count number of columns 
    let n_col = match lines.first() {
-        Some(line) => line.split(sep).count(),
+        Some(line) => re.split(line).count(),
        None => return Err(std::fmt::Error)
    };
-    // eprintln!("first line: {:?}", lines.first().unwrap());
+
    // eprintln!("# columns: {n_col}");

    let mut columns = vec![Column::new(); n_col];
-    for (_l_idx, line) in lines.iter().enumerate() {

-        let new_n_col = line.split(sep).count();
+    for (i, line) in lines.iter().enumerate() {
+        eprintln!("checking line {}", i);
+
+        let new_n_col = re.split(line).count();

-        // HACK: I should handle repeating separators with a glob or regex library
-        // TIP: usek 
        if new_n_col != n_col {
-            return Err(std::fmt::Error)
+            return Err(fmt::Error)
        }
        eprintln!("number of columns: {}", columns.len());
-        for (c_idx, col) in line.split(sep).enumerate() {
+
+        for (c_idx, col) in re.split(line).enumerate() {
            columns[c_idx].push(col.to_string())
        }
    }

    eprintln!("{:?}", columns);

-    // let n_col = *lines.first().unwrap();
-
-    // detect number of columns
-    Ok(Columns::new())
+    Ok(columns)
 }

 #[test]
@ -55,11 +60,25 @@ fn test_split_columns(){
 file1.txt		title1
 file2.pdf		title2
 file3			title3
+file with space	title 4
        "###;
-    let columns = split_columns(coltext1, '\t');
+    let columns = split_columns(coltext1, DEFAULT_SEP);

    // should have two columns
-    assert_eq!(2, columns.unwrap().len());
+    assert_eq!(2, columns.clone().unwrap().len());
+
+    assert_eq!(vec!["file1.txt",
+                   "file2.pdf",
+                    "file3",
+                   "file with space"
+            ], columns.unwrap()[0]);
+}

-    // println!("columns:\n{:?}", columns);
+// #[test]
+fn test_re_split() {
+    let text = "this is		two tabs";
+    let re = Regex::new(r"[\t]+").unwrap();
+    let fields: Vec<&str> = re.split(text).collect();
+    eprintln!("{:?}", fields);
+    assert!(false);
 }
--- a/src/main.rs
+++ b/src/main.rs
@ -6,7 +6,7 @@
 * . dynamically generate field parameters ?
 */

-use clap::{Parser};
+use clap::Parser;
 use std::process;

 mod input;