From ae96038fbf5884d5f4e52d0b0488dedb6e5f4050 Mon Sep 17 00:00:00 2001 From: Manos Pitsidianakis Date: Thu, 11 Apr 2024 21:15:47 +0300 Subject: [PATCH] Make unicode-segmentation a hard dependency meli/melib are UTF-8 software, so we should have proper Unicode support. A compile-time env var, `UNICODE_REGENERATE_TABLES`, is added to force network access and rebuild the cached Unicode tables. Signed-off-by: Manos Pitsidianakis --- BUILD.md | 5 ++- fuzz/Cargo.toml | 5 +-- meli/Cargo.toml | 2 +- melib/Cargo.toml | 5 +-- melib/README.md | 18 --------- melib/build.rs | 7 ++-- melib/src/email/compose/mime.rs | 59 ----------------------------- melib/src/lib.rs | 1 - melib/src/text/grapheme_clusters.rs | 4 +- melib/src/text/line_break.rs | 3 +- melib/src/text/mod.rs | 18 +-------- melib/src/thread.rs | 34 ----------------- tools/Cargo.toml | 2 +- 13 files changed, 16 insertions(+), 147 deletions(-) diff --git a/BUILD.md b/BUILD.md index ed45cf83..dfaabe23 100644 --- a/BUILD.md +++ b/BUILD.md @@ -3,7 +3,7 @@ For a quick start, build and install locally: ```sh - PREFIX=~/.local make install +PREFIX=~/.local make install ``` Available subcommands for `make` are listed with `make help`. @@ -34,6 +34,9 @@ Some functionality is held behind "feature gates", or compile-time flags. The fo Since it's actual use in the code is very limited, it is not recommended to use this (off by default). - `static` and `*-static` bundle C libraries in dependencies so that you don't need them installed in your system (on by default). +Though not a feature, the presence of the environment variable `UNICODE_REGENERATE_TABLES` when compiling the `melib` crate will force the regeneration of the Unicode tables. +Otherwise, the tables bundled with the source code are used; there's no real reason to regenerate them unless you intend to modify the code or update to a new Unicode version. 
+ ## Build Debian package (*deb*) Building with Debian's packaged cargo might require the installation of these two packages: `librust-openssl-sys-dev librust-libdbus-sys-dev` diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 459b6b70..6eb4324b 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -14,10 +14,7 @@ path = "fuzz_targets/envelope_parse.rs" [dependencies] libfuzzer-sys = "0.3" - -[dependencies.melib] -path = "../melib" -features = ["unicode-algorithms"] +melib = { path = "../melib" } # Prevent this from interfering with workspaces [workspace] diff --git a/meli/Cargo.toml b/meli/Cargo.toml index 84b57f48..ea3d866b 100644 --- a/meli/Cargo.toml +++ b/meli/Cargo.toml @@ -31,7 +31,7 @@ indexmap = { version = "^1.6", features = ["serde-1"] } libc = { version = "0.2.125", default-features = false, features = ["extra_traits"] } libz-sys = { version = "1.1", features = ["static"], optional = true } linkify = { version = "^0.8", default-features = false } -melib = { path = "../melib", version = "0.8.5-rc.3", features = ["unicode-algorithms"] } +melib = { path = "../melib", version = "0.8.5-rc.3", features = [] } nix = { version = "0.27", default-features = false, features = ["signal", "poll", "term", "ioctl", "process"] } notify = { version = "4.0.1", default-features = false } # >:c num_cpus = "1.12.0" diff --git a/melib/Cargo.toml b/melib/Cargo.toml index d5884bdc..b3ad509c 100644 --- a/melib/Cargo.toml +++ b/melib/Cargo.toml @@ -50,7 +50,7 @@ serde_path_to_error = { version = "0.1" } smallvec = { version = "^1.5.0", features = ["serde"] } smol = "1.0.0" socket2 = { version = "0.5", features = [] } -unicode-segmentation = { version = "1.2.1", default-features = false, optional = true } +unicode-segmentation = { version = "1.2.1", default-features = false } url = { version = "2.4", optional = true } uuid = { version = "^1", features = ["serde", "v4", "v5"] } xdg = "2.1.0" @@ -77,9 +77,6 @@ sqlite3 = ["rusqlite"] sqlite3-static = ["sqlite3", 
"rusqlite/bundled-full"] tls = ["native-tls"] tls-static = ["tls", "native-tls/vendored"] -text-processing = [] -unicode-algorithms = ["text-processing", "unicode-segmentation"] -unicode-algorithms-cached = ["text-processing", "unicode-segmentation"] vcard = [] [build-dependencies] diff --git a/melib/README.md b/melib/README.md index 36fe3b7b..b3cbe221 100644 --- a/melib/README.md +++ b/melib/README.md @@ -22,24 +22,6 @@ Library for handling mail. |------------------------------|-------------------------------------|--------------------------| | `sqlite` | `rusqlite` | Used in IMAP cache. | |------------------------------|-------------------------------------|--------------------------| -| `unicode-algorithms` | `unicode-segmentation` | Linebreaking algo etc | -| | | For a fresh clean build, | -| | | Network access is | -| | | required to fetch data | -| | | from Unicode's website. | -|------------------------------|-------------------------------------|--------------------------| -| `unicode-algorithms-cached` | `unicode-segmentation` | Linebreaking algo etc | -| | | but it uses a cached | -| | | version of Unicode data | -| | | which might be stale. | -| | | | -| | | Use this feature instead | -| | | of the previous one for | -| | | building without network | -| | | access. 
| -|------------------------------|-------------------------------------|--------------------------| -| `unicode-algorithms` | `unicode-segmentation` | | -|------------------------------|-------------------------------------|--------------------------| | `vcard` | | vcard parsing | |------------------------------|-------------------------------------|--------------------------| | `gpgme` | | GPG use with libgpgme | diff --git a/melib/build.rs b/melib/build.rs index b9a0dbd0..348ad207 100644 --- a/melib/build.rs +++ b/melib/build.rs @@ -21,15 +21,14 @@ #![allow(clippy::needless_range_loop)] -#[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))] include!("src/text/types.rs"); fn main() -> Result<(), std::io::Error> { - #[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))] { const MOD_PATH: &str = "src/text/tables.rs"; + println!("cargo:rerun-if-env-changed=UNICODE_REGENERATE_TABLES"); println!("cargo:rerun-if-changed=build.rs"); - println!("cargo:rerun-if-changed={}", MOD_PATH); + println!("cargo:rerun-if-changed={MOD_PATH}"); /* Line break tables */ use std::{ fs::File, @@ -54,7 +53,7 @@ fn main() -> Result<(), std::io::Error> { ); return Ok(()); } - if cfg!(feature = "unicode-algorithms-cached") { + if std::env::var("UNICODE_REGENERATE_TABLES").is_err() { const CACHED_MODULE: &[u8] = include_bytes!(concat!("./src/text/tables.rs.gz")); let mut gz = GzDecoder::new(CACHED_MODULE); diff --git a/melib/src/email/compose/mime.rs b/melib/src/email/compose/mime.rs index 061e57e7..6b11acb3 100644 --- a/melib/src/email/compose/mime.rs +++ b/melib/src/email/compose/mime.rs @@ -20,14 +20,12 @@ */ use super::*; -#[cfg(feature = "text-processing")] use crate::text::grapheme_clusters::TextProcessing; pub fn encode_header(value: &str) -> String { let mut ret = String::with_capacity(value.len()); let mut is_current_window_ascii = true; let mut current_window_start = 0; - #[cfg(feature = "text-processing")] { let graphemes = 
value.graphemes_indices(); for (idx, g) in graphemes { @@ -81,63 +79,6 @@ pub fn encode_header(value: &str) -> String { } } } - #[cfg(not(feature = "text-processing"))] - { - /* [ref:VERIFY] [ref:TODO]: test this. If it works as fine as the one above, there's no need to - * keep the above implementation. */ - for (i, g) in value.char_indices() { - match (g.is_ascii(), is_current_window_ascii) { - (true, true) => { - ret.push(g); - } - (true, false) => { - /* If !g.is_whitespace() - * - * Whitespaces inside encoded tokens must be greedily taken, - * instead of splitting each non-ascii word into separate encoded tokens. */ - if !g.is_whitespace() && value.is_char_boundary(i) { - ret.push_str(&format!( - "=?UTF-8?B?{}?=", - BASE64_MIME - .encode(value[current_window_start..i].as_bytes()) - .trim() - )); - if i != value.len() - 1 { - ret.push(' '); - } - is_current_window_ascii = true; - current_window_start = i; - ret.push(g); - } - } - (false, true) => { - current_window_start = i; - is_current_window_ascii = false; - } - /* RFC2047 recommends: - * 'While there is no limit to the length of a multiple-line header field, each - * line of a header field that contains one or more - * 'encoded-word's is limited to 76 characters.' - * This is a rough compliance. 
- */ - (false, false) - if value.is_char_boundary(i) && value[current_window_start..i].len() > 76 => - { - ret.push_str(&format!( - "=?UTF-8?B?{}?=", - BASE64_MIME - .encode(value[current_window_start..i].as_bytes()) - .trim() - )); - if i != value.len() - 1 { - ret.push(' '); - } - current_window_start = i; - } - (false, false) => {} - } - } - } /* If the last part of the header value is encoded, it won't be pushed inside * the previous for block */ if !is_current_window_ascii { diff --git a/melib/src/lib.rs b/melib/src/lib.rs index 25541343..d27731ed 100644 --- a/melib/src/lib.rs +++ b/melib/src/lib.rs @@ -132,7 +132,6 @@ pub mod dbg { } } -#[cfg(feature = "text-processing")] pub mod text; pub use utils::{ diff --git a/melib/src/text/grapheme_clusters.rs b/melib/src/text/grapheme_clusters.rs index 670b25e5..324d7fc1 100644 --- a/melib/src/text/grapheme_clusters.rs +++ b/melib/src/text/grapheme_clusters.rs @@ -29,12 +29,12 @@ */ +use unicode_segmentation::UnicodeSegmentation; + use super::{ types::Reflow, wcwidth::{wcwidth, CodePointsIter}, }; -extern crate unicode_segmentation; -use self::unicode_segmentation::UnicodeSegmentation; pub trait TextProcessing: UnicodeSegmentation + CodePointsIter { fn split_graphemes(&self) -> Vec<&str> { diff --git a/melib/src/text/line_break.rs b/melib/src/text/line_break.rs index e4e7e269..5a20864d 100644 --- a/melib/src/text/line_break.rs +++ b/melib/src/text/line_break.rs @@ -19,12 +19,11 @@ * along with meli. If not, see . */ -extern crate unicode_segmentation; use std::{cmp::Ordering, collections::VecDeque, iter::Peekable, str::FromStr}; +use unicode_segmentation::UnicodeSegmentation; use LineBreakClass::*; -use self::unicode_segmentation::UnicodeSegmentation; use super::{ grapheme_clusters::TextProcessing, tables::LINE_BREAK_RULES, diff --git a/melib/src/text/mod.rs b/melib/src/text/mod.rs index 4348aff2..1cc1a188 100644 --- a/melib/src/text/mod.rs +++ b/melib/src/text/mod.rs @@ -19,6 +19,8 @@ * along with meli. If not, see . 
*/ +use unicode_segmentation::UnicodeSegmentation; + pub mod grapheme_clusters; pub mod line_break; pub mod search; @@ -43,8 +45,6 @@ impl Truncate for &str { return; } - extern crate unicode_segmentation; - use unicode_segmentation::UnicodeSegmentation; if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true) .take(new_len) .last() @@ -58,8 +58,6 @@ impl Truncate for &str { return self; } - extern crate unicode_segmentation; - use unicode_segmentation::UnicodeSegmentation; if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true) .take(new_len) .last() @@ -75,8 +73,6 @@ impl Truncate for &str { return ""; } - extern crate unicode_segmentation; - use unicode_segmentation::UnicodeSegmentation; if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) { &self[first..] } else { @@ -90,8 +86,6 @@ impl Truncate for &str { return; } - extern crate unicode_segmentation; - use unicode_segmentation::UnicodeSegmentation; if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) { *self = &self[first..]; } @@ -104,8 +98,6 @@ impl Truncate for String { return; } - extern crate unicode_segmentation; - use unicode_segmentation::UnicodeSegmentation; if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true) .take(new_len) .last() @@ -119,8 +111,6 @@ impl Truncate for String { return self; } - extern crate unicode_segmentation; - use unicode_segmentation::UnicodeSegmentation; if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true) .take(new_len) .last() @@ -136,8 +126,6 @@ impl Truncate for String { return ""; } - extern crate unicode_segmentation; - use unicode_segmentation::UnicodeSegmentation; if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len) { @@ -153,8 +141,6 @@ impl Truncate for String { return; } - extern crate unicode_segmentation; - use 
unicode_segmentation::UnicodeSegmentation; if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len) { diff --git a/melib/src/thread.rs b/melib/src/thread.rs index e917e263..ebd0b2fb 100644 --- a/melib/src/thread.rs +++ b/melib/src/thread.rs @@ -52,7 +52,6 @@ pub use iterators::*; use smallvec::SmallVec; use uuid::Uuid; -#[cfg(feature = "text-processing")] use crate::text::grapheme_clusters::*; type Envelopes = Arc>>; @@ -1223,16 +1222,11 @@ impl Threads { } let ma = &envelopes[&a.unwrap()]; let mb = &envelopes[&b.unwrap()]; - #[cfg(feature = "text-processing")] { ma.subject() .split_graphemes() .cmp(&mb.subject().split_graphemes()) } - #[cfg(not(feature = "text-processing"))] - { - ma.subject().cmp(&mb.subject()) - } } (SortField::Subject, SortOrder::Asc) => { let a = &self.thread_nodes[&self.thread_ref(*a).root()].message(); @@ -1252,18 +1246,12 @@ impl Threads { } let ma = &envelopes[&a.unwrap()]; let mb = &envelopes[&b.unwrap()]; - #[cfg(feature = "text-processing")] { mb.subject() .as_ref() .split_graphemes() .cmp(&ma.subject().split_graphemes()) } - - #[cfg(not(feature = "text-processing"))] - { - mb.subject().as_ref().cmp(&ma.subject()) - } } }); } @@ -1303,16 +1291,11 @@ impl Threads { } let ma = &envelopes[&a.unwrap()]; let mb = &envelopes[&b.unwrap()]; - #[cfg(feature = "text-processing")] { ma.subject() .split_graphemes() .cmp(&mb.subject().split_graphemes()) } - #[cfg(not(feature = "text-processing"))] - { - ma.subject().cmp(&mb.subject()) - } } (SortField::Subject, SortOrder::Asc) => { let a = &self.thread_nodes[a].message(); @@ -1332,18 +1315,12 @@ impl Threads { } let ma = &envelopes[&a.unwrap()]; let mb = &envelopes[&b.unwrap()]; - #[cfg(feature = "text-processing")] { mb.subject() .as_ref() .split_graphemes() .cmp(&ma.subject().split_graphemes()) } - - #[cfg(not(feature = "text-processing"))] - { - mb.subject().as_ref().cmp(&ma.subject()) - } } }); } @@ -1379,16 +1356,11 @@ impl Threads { } let ma = 
&envelopes[&a.unwrap()]; let mb = &envelopes[&b.unwrap()]; - #[cfg(feature = "text-processing")] { ma.subject() .split_graphemes() .cmp(&mb.subject().split_graphemes()) } - #[cfg(not(feature = "text-processing"))] - { - ma.subject().cmp(&mb.subject()) - } } (SortField::Subject, SortOrder::Asc) => { let a = &self.thread_nodes[a].message(); @@ -1408,18 +1380,12 @@ impl Threads { } let ma = &envelopes[&a.unwrap()]; let mb = &envelopes[&b.unwrap()]; - #[cfg(feature = "text-processing")] { mb.subject() .as_ref() .split_graphemes() .cmp(&ma.subject().split_graphemes()) } - - #[cfg(not(feature = "text-processing"))] - { - mb.subject().as_ref().cmp(&ma.subject()) - } } }); } diff --git a/tools/Cargo.toml b/tools/Cargo.toml index e5621c9a..5cdf3db9 100644 --- a/tools/Cargo.toml +++ b/tools/Cargo.toml @@ -40,7 +40,7 @@ required-features = ["melib/imap"] [dependencies] crossbeam = { version = "^0.8" } meli = { path = "../meli", version = "0.8" } -melib = { path = "../melib", version = "0.8", features = ["debug-tracing", "unicode-algorithms"] } +melib = { path = "../melib", version = "0.8", features = ["debug-tracing" ] } nix = { version = "^0.24", default-features = false } signal-hook = { version = "^0.3", default-features = false, features = ["iterator"] } signal-hook-registry = { version = "1.2.0", default-features = false }