From 36704c7e24a30dea5dddb18593306f3f3f053e38 Mon Sep 17 00:00:00 2001 From: blob42 Date: Tue, 18 Jul 2023 02:40:15 +0200 Subject: [PATCH] wip: column split using regex --- Cargo.lock | 45 ++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + README.md | 11 ++--------- src/input.rs | 53 +++++++++++++++++++++++++++++++++++----------------- src/main.rs | 2 +- 5 files changed, 85 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index beea855..55406e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" +dependencies = [ + "memchr", +] + [[package]] name = "atty" version = "0.2.14" @@ -61,6 +70,7 @@ name = "colmap" version = "0.1.0" dependencies = [ "clap", + "regex", ] [[package]] @@ -84,6 +94,12 @@ version = "0.2.137" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc7fcc620a3bff7cdd7a365be3376c97191aeaccc2a603e600951e452615bf89" +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + [[package]] name = "once_cell" version = "1.16.0" @@ -138,6 +154,35 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "regex" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" + [[package]] name = "strsim" version = "0.10.0" diff --git a/Cargo.toml b/Cargo.toml index 4926026..36f5784 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,3 +7,4 @@ edition = "2021" [dependencies] clap = { version = "4.0.26", features = ["derive"] } +regex = "1.9.1" diff --git a/README.md b/README.md index bc0195a..32350f9 100644 --- a/README.md +++ b/README.md @@ -53,14 +53,6 @@ foo_cmd | colmap -t 'basename {}' 'awk { print $2 }' - - - - - - - - --- [I am using Github under protest](protest.md) @@ -68,4 +60,5 @@ foo_cmd | colmap -t 'basename {}' 'awk { print $2 }' TODO: ---- -[ ] use non-dashed approach to cli (rwxrob/bonzai) +[ ] use golden tests +[ ] use non-dashed cli like (rwxrob/bonzai) diff --git a/src/input.rs b/src/input.rs index 57be3b7..abc787a 100644 --- a/src/input.rs +++ b/src/input.rs @@ -2,51 +2,56 @@ #![allow(dead_code)] use std::fmt; +use regex::Regex; + +const DEFAULT_SEP: &str = r"[\t]+"; type Column = Vec; type Columns = Vec; /// split input text into columns based on separator character /// returns a type representing a variable length array of strings (columns) ? +/// /// TODO: /// -///  accept &str and String ///  error handling -pub fn split_columns(text: &str, sep: char) -> Result { +///  accept &str and String +/// +pub fn split_columns(text: &str, sep: &str) -> Result { // read the first line stripping empty lines let lines: Vec<&str> = text.trim().lines().collect(); eprintln!("lines: {:?}", lines); + let re = Regex::new(sep).unwrap(); + // count number of columns let n_col = match lines.first() { - Some(line) => line.split(sep).count(), + Some(line) => re.split(line).count(), None => return Err(std::fmt::Error) }; - // eprintln!("first line: {:?}", lines.first().unwrap()); + // eprintln!("# columns: {n_col}"); let mut columns = vec![Column::new(); n_col]; - for (_l_idx, line) in lines.iter().enumerate() { - let new_n_col = line.split(sep).count(); + for (i, line) in lines.iter().enumerate() { + eprintln!("checking line {}", i); + + let new_n_col = re.split(line).count(); - // HACK: I should handle repeating separators with a glob or regex library - // TIP: usek if new_n_col != n_col { - return Err(std::fmt::Error) + return Err(fmt::Error) } eprintln!("number of columns: {}", columns.len()); - for (c_idx, col) in line.split(sep).enumerate() { + + for (c_idx, col) in re.split(line).enumerate() { columns[c_idx].push(col.to_string()) } } eprintln!("{:?}", columns); - // let n_col = *lines.first().unwrap(); - - // detect number of columns - Ok(Columns::new()) + Ok(columns) } #[test] @@ -55,11 +60,25 @@ fn test_split_columns(){ file1.txt title1 file2.pdf title2 file3 title3 +file with space title 4 "###; - let columns = split_columns(coltext1, '\t'); + let columns = split_columns(coltext1, DEFAULT_SEP); // should have two columns - assert_eq!(2, columns.unwrap().len()); + assert_eq!(2, columns.clone().unwrap().len()); + + assert_eq!(vec!["file1.txt", + "file2.pdf", + "file3", + "file with space" + ], columns.unwrap()[0]); +} - // println!("columns:\n{:?}", columns); +// #[test] +fn test_re_split() { + let text = "this is two tabs"; + let re = Regex::new(r"[\t]+").unwrap(); + let fields: Vec<&str> = re.split(text).collect(); + eprintln!("{:?}", fields); + assert!(false); } diff --git a/src/main.rs b/src/main.rs index c918eba..33deffe 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,7 +6,7 @@ * . dynamically generate field parameters ? */ -use clap::{Parser}; +use clap::Parser; use std::process; mod input;