wip: column split using regex

main
blob42 10 months ago
parent 3fa4559117
commit 36704c7e24

45
Cargo.lock generated

@ -2,6 +2,15 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41"
dependencies = [
"memchr",
]
[[package]]
name = "atty"
version = "0.2.14"
@ -61,6 +70,7 @@ name = "colmap"
version = "0.1.0"
dependencies = [
"clap",
"regex",
]
[[package]]
@ -84,6 +94,12 @@ version = "0.2.137"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc7fcc620a3bff7cdd7a365be3376c97191aeaccc2a603e600951e452615bf89"
[[package]]
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "once_cell"
version = "1.16.0"
@ -138,6 +154,35 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"
[[package]]
name = "strsim"
version = "0.10.0"

@ -7,3 +7,4 @@ edition = "2021"
[dependencies]
clap = { version = "4.0.26", features = ["derive"] }
regex = "1.9.1"

@ -53,14 +53,6 @@ foo_cmd | colmap -t 'basename {}' 'awk { print $2 }'
---
[I am using Github under protest](protest.md)
@ -68,4 +60,5 @@ foo_cmd | colmap -t 'basename {}' 'awk { print $2 }'
TODO:
----
[ ] use non-dashed approach to cli (rwxrob/bonzai)
[ ] use golden tests
[ ] use a non-dashed CLI, like rwxrob/bonzai

@ -2,51 +2,56 @@
#![allow(dead_code)]
use std::fmt;
use regex::Regex;
const DEFAULT_SEP: &str = r"[\t]+";
type Column = Vec<String>;
type Columns = Vec<Column>;
/// Split input text into columns based on a separator.
///
/// Returns a `Columns` value: a vector of columns, where each column is a
/// vector of strings.
///
/// Returns `Err(fmt::Error)` when the trimmed input contains no lines, or when
/// a line splits into a different number of fields than the first line.
///
/// TODO:
///
///  accept &str and String
///  error handling
pub fn split_columns(text: &str, sep: char) -> Result<Columns, fmt::Error> {
///  accept &str and String
///
pub fn split_columns(text: &str, sep: &str) -> Result<Columns, fmt::Error> {
// trim leading/trailing whitespace from the whole input, then collect every line
let lines: Vec<&str> = text.trim().lines().collect();
eprintln!("lines: {:?}", lines);
// compile the separator as a regular expression; the unwrap panics if `sep`
// is not a valid pattern
let re = Regex::new(sep).unwrap();
// the first line determines the expected number of columns
let n_col = match lines.first() {
Some(line) => line.split(sep).count(),
Some(line) => re.split(line).count(),
// no lines at all: nothing to split
None => return Err(std::fmt::Error)
};
// eprintln!("first line: {:?}", lines.first().unwrap());
// eprintln!("# columns: {n_col}");
// start with one empty column per field found in the first line
let mut columns = vec![Column::new(); n_col];
for (_l_idx, line) in lines.iter().enumerate() {
let new_n_col = line.split(sep).count();
for (i, line) in lines.iter().enumerate() {
eprintln!("checking line {}", i);
let new_n_col = re.split(line).count();
// HACK: I should handle repeating separators with a glob or regex library
// TIP: usek
// every line must yield the same number of fields as the first line
if new_n_col != n_col {
return Err(std::fmt::Error)
return Err(fmt::Error)
}
eprintln!("number of columns: {}", columns.len());
for (c_idx, col) in line.split(sep).enumerate() {
for (c_idx, col) in re.split(line).enumerate() {
// append the field to the column at the same position
columns[c_idx].push(col.to_string())
}
}
eprintln!("{:?}", columns);
// let n_col = *lines.first().unwrap();
// detect number of columns
Ok(Columns::new())
Ok(columns)
}
#[test]
@ -55,11 +60,25 @@ fn test_split_columns(){
file1.txt title1
file2.pdf title2
file3 title3
file with space title 4
"###;
let columns = split_columns(coltext1, '\t');
let columns = split_columns(coltext1, DEFAULT_SEP);
// should have two columns
assert_eq!(2, columns.unwrap().len());
assert_eq!(2, columns.clone().unwrap().len());
assert_eq!(vec!["file1.txt",
"file2.pdf",
"file3",
"file with space"
], columns.unwrap()[0]);
}
// println!("columns:\n{:?}", columns);
// #[test]
/// Scratch check for regex-based splitting: splits a sample string on runs of
/// tab characters and prints the resulting fields to stderr.
///
/// The final assertion always fails — presumably so that, when run as a test,
/// the harness shows the captured stderr output (TODO confirm intent).
fn test_re_split() {
    let tab_runs = Regex::new(r"[\t]+").unwrap();
    let sample = "this is two tabs";
    let parts = tab_runs.split(sample).collect::<Vec<&str>>();
    eprintln!("{:?}", parts);
    assert!(false);
}

@ -6,7 +6,7 @@
* . dynamically generate field parameters ?
*/
use clap::{Parser};
use clap::Parser;
use std::process;
mod input;

Loading…
Cancel
Save