From ae96038fbf5884d5f4e52d0b0488dedb6e5f4050 Mon Sep 17 00:00:00 2001
From: Manos Pitsidianakis <manos@pitsidianak.is>
Date: Thu, 11 Apr 2024 21:15:47 +0300
Subject: [PATCH] Make unicode-segmentation a hard dependency

meli/melib are UTF-8 software, so we should have proper Unicode support.

A compile-time env var, `UNICODE_REGENERATE_TABLES`, is added to force
network access and rebuild the cached Unicode tables.

Signed-off-by: Manos Pitsidianakis <manos@pitsidianak.is>
---
 BUILD.md                            |  5 ++-
 fuzz/Cargo.toml                     |  5 +--
 meli/Cargo.toml                     |  2 +-
 melib/Cargo.toml                    |  5 +--
 melib/README.md                     | 18 ---------
 melib/build.rs                      |  7 ++--
 melib/src/email/compose/mime.rs     | 59 -----------------------------
 melib/src/lib.rs                    |  1 -
 melib/src/text/grapheme_clusters.rs |  4 +-
 melib/src/text/line_break.rs        |  3 +-
 melib/src/text/mod.rs               | 18 +--------
 melib/src/thread.rs                 | 34 -----------------
 tools/Cargo.toml                    |  2 +-
 13 files changed, 16 insertions(+), 147 deletions(-)

diff --git a/BUILD.md b/BUILD.md
index ed45cf83..dfaabe23 100644
--- a/BUILD.md
+++ b/BUILD.md
@@ -3,7 +3,7 @@
 For a quick start, build and install locally:
 
 ```sh
- PREFIX=~/.local make install
+PREFIX=~/.local make install
 ```
 
 Available subcommands for `make` are listed with `make help`.
@@ -34,6 +34,9 @@ Some functionality is held behind "feature gates", or compile-time flags. The fo
   Since it's actual use in the code is very limited, it is not recommended to use this (off by default).
 - `static` and `*-static` bundle C libraries in dependencies so that you don't need them installed in your system (on by default).
 
+Though not a feature, the presence of the environment variable `UNICODE_REGENERATE_TABLES` at compile time of the `melib` crate forces the regeneration of the Unicode tables, which requires network access.
+Otherwise, the tables included with the source code are used, and there's no real reason to regenerate them unless you intend to modify the code or update to a new Unicode version.
+
 ## Build Debian package (*deb*)
 
 Building with Debian's packaged cargo might require the installation of these two packages: `librust-openssl-sys-dev librust-libdbus-sys-dev`
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index 459b6b70..6eb4324b 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -14,10 +14,7 @@ path = "fuzz_targets/envelope_parse.rs"
 
 [dependencies]
 libfuzzer-sys = "0.3"
-
-[dependencies.melib]
-path = "../melib"
-features = ["unicode-algorithms"]
+melib = { path = "../melib" }
 
 # Prevent this from interfering with workspaces
 [workspace]
diff --git a/meli/Cargo.toml b/meli/Cargo.toml
index 84b57f48..ea3d866b 100644
--- a/meli/Cargo.toml
+++ b/meli/Cargo.toml
@@ -31,7 +31,7 @@ indexmap = { version = "^1.6", features = ["serde-1"] }
 libc = { version = "0.2.125", default-features = false, features = ["extra_traits"] }
 libz-sys = { version = "1.1", features = ["static"], optional = true }
 linkify = { version = "^0.8", default-features = false }
-melib = { path = "../melib", version = "0.8.5-rc.3", features = ["unicode-algorithms"] }
+melib = { path = "../melib", version = "0.8.5-rc.3", features = [] }
 nix = { version = "0.27", default-features = false, features = ["signal", "poll", "term", "ioctl", "process"] }
 notify = { version = "4.0.1", default-features = false } # >:c
 num_cpus = "1.12.0"
diff --git a/melib/Cargo.toml b/melib/Cargo.toml
index d5884bdc..b3ad509c 100644
--- a/melib/Cargo.toml
+++ b/melib/Cargo.toml
@@ -50,7 +50,7 @@ serde_path_to_error = { version = "0.1" }
 smallvec = { version = "^1.5.0", features = ["serde"] }
 smol = "1.0.0"
 socket2 = { version = "0.5", features = [] }
-unicode-segmentation = { version = "1.2.1", default-features = false, optional = true }
+unicode-segmentation = { version = "1.2.1", default-features = false }
 url = { version = "2.4", optional = true }
 uuid = { version = "^1", features = ["serde", "v4", "v5"] }
 xdg = "2.1.0"
@@ -77,9 +77,6 @@ sqlite3 = ["rusqlite"]
 sqlite3-static = ["sqlite3", "rusqlite/bundled-full"]
 tls = ["native-tls"]
 tls-static = ["tls", "native-tls/vendored"]
-text-processing = []
-unicode-algorithms = ["text-processing", "unicode-segmentation"]
-unicode-algorithms-cached = ["text-processing", "unicode-segmentation"]
 vcard = []
 
 [build-dependencies]
diff --git a/melib/README.md b/melib/README.md
index 36fe3b7b..b3cbe221 100644
--- a/melib/README.md
+++ b/melib/README.md
@@ -22,24 +22,6 @@ Library for handling mail.
 |------------------------------|-------------------------------------|--------------------------|
 | `sqlite`                     | `rusqlite`                          | Used in IMAP cache.      |
 |------------------------------|-------------------------------------|--------------------------|
-| `unicode-algorithms`         | `unicode-segmentation`              | Linebreaking algo etc    |
-|                              |                                     | For a fresh clean build, |
-|                              |                                     | Network access is        |
-|                              |                                     | required to fetch data   |
-|                              |                                     | from Unicode's website.  |
-|------------------------------|-------------------------------------|--------------------------|
-| `unicode-algorithms-cached`  | `unicode-segmentation`              | Linebreaking algo etc    |
-|                              |                                     | but it uses a cached     |
-|                              |                                     | version of Unicode data  |
-|                              |                                     | which might be stale.    |
-|                              |                                     |                          |
-|                              |                                     | Use this feature instead |
-|                              |                                     | of the previous one for  |
-|                              |                                     | building without network |
-|                              |                                     | access.                  |
-|------------------------------|-------------------------------------|--------------------------|
-| `unicode-algorithms`         | `unicode-segmentation`              |                          |
-|------------------------------|-------------------------------------|--------------------------|
 | `vcard`                      |                                     | vcard parsing            |
 |------------------------------|-------------------------------------|--------------------------|
 | `gpgme`                      |                                     | GPG use with libgpgme    |
diff --git a/melib/build.rs b/melib/build.rs
index b9a0dbd0..348ad207 100644
--- a/melib/build.rs
+++ b/melib/build.rs
@@ -21,15 +21,14 @@
 
 #![allow(clippy::needless_range_loop)]
 
-#[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))]
 include!("src/text/types.rs");
 
 fn main() -> Result<(), std::io::Error> {
-    #[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))]
     {
         const MOD_PATH: &str = "src/text/tables.rs";
+        println!("cargo:rerun-if-env-changed=UNICODE_REGENERATE_TABLES");
         println!("cargo:rerun-if-changed=build.rs");
-        println!("cargo:rerun-if-changed={}", MOD_PATH);
+        println!("cargo:rerun-if-changed={MOD_PATH}");
         /* Line break tables */
         use std::{
             fs::File,
@@ -54,7 +53,7 @@ fn main() -> Result<(), std::io::Error> {
             );
             return Ok(());
         }
-        if cfg!(feature = "unicode-algorithms-cached") {
+        if std::env::var("UNICODE_REGENERATE_TABLES").is_err() {
             const CACHED_MODULE: &[u8] = include_bytes!(concat!("./src/text/tables.rs.gz"));
 
             let mut gz = GzDecoder::new(CACHED_MODULE);
diff --git a/melib/src/email/compose/mime.rs b/melib/src/email/compose/mime.rs
index 061e57e7..6b11acb3 100644
--- a/melib/src/email/compose/mime.rs
+++ b/melib/src/email/compose/mime.rs
@@ -20,14 +20,12 @@
  */
 
 use super::*;
-#[cfg(feature = "text-processing")]
 use crate::text::grapheme_clusters::TextProcessing;
 
 pub fn encode_header(value: &str) -> String {
     let mut ret = String::with_capacity(value.len());
     let mut is_current_window_ascii = true;
     let mut current_window_start = 0;
-    #[cfg(feature = "text-processing")]
     {
         let graphemes = value.graphemes_indices();
         for (idx, g) in graphemes {
@@ -81,63 +79,6 @@ pub fn encode_header(value: &str) -> String {
             }
         }
     }
-    #[cfg(not(feature = "text-processing"))]
-    {
-        /* [ref:VERIFY] [ref:TODO]: test this. If it works as fine as the one above, there's no need to
-         * keep the above implementation. */
-        for (i, g) in value.char_indices() {
-            match (g.is_ascii(), is_current_window_ascii) {
-                (true, true) => {
-                    ret.push(g);
-                }
-                (true, false) => {
-                    /* If !g.is_whitespace()
-                     *
-                     * Whitespaces inside encoded tokens must be greedily taken,
-                     * instead of splitting each non-ascii word into separate encoded tokens. */
-                    if !g.is_whitespace() && value.is_char_boundary(i) {
-                        ret.push_str(&format!(
-                            "=?UTF-8?B?{}?=",
-                            BASE64_MIME
-                                .encode(value[current_window_start..i].as_bytes())
-                                .trim()
-                        ));
-                        if i != value.len() - 1 {
-                            ret.push(' ');
-                        }
-                        is_current_window_ascii = true;
-                        current_window_start = i;
-                        ret.push(g);
-                    }
-                }
-                (false, true) => {
-                    current_window_start = i;
-                    is_current_window_ascii = false;
-                }
-                /* RFC2047 recommends:
-                 * 'While there is no limit to the length of a multiple-line header field, each
-                 * line of a header field that contains one or more
-                 * 'encoded-word's is limited to 76 characters.'
-                 * This is a rough compliance.
-                 */
-                (false, false)
-                    if value.is_char_boundary(i) && value[current_window_start..i].len() > 76 =>
-                {
-                    ret.push_str(&format!(
-                        "=?UTF-8?B?{}?=",
-                        BASE64_MIME
-                            .encode(value[current_window_start..i].as_bytes())
-                            .trim()
-                    ));
-                    if i != value.len() - 1 {
-                        ret.push(' ');
-                    }
-                    current_window_start = i;
-                }
-                (false, false) => {}
-            }
-        }
-    }
     /* If the last part of the header value is encoded, it won't be pushed inside
      * the previous for block */
     if !is_current_window_ascii {
diff --git a/melib/src/lib.rs b/melib/src/lib.rs
index 25541343..d27731ed 100644
--- a/melib/src/lib.rs
+++ b/melib/src/lib.rs
@@ -132,7 +132,6 @@ pub mod dbg {
     }
 }
 
-#[cfg(feature = "text-processing")]
 pub mod text;
 
 pub use utils::{
diff --git a/melib/src/text/grapheme_clusters.rs b/melib/src/text/grapheme_clusters.rs
index 670b25e5..324d7fc1 100644
--- a/melib/src/text/grapheme_clusters.rs
+++ b/melib/src/text/grapheme_clusters.rs
@@ -29,12 +29,12 @@
 
 */
 
+use unicode_segmentation::UnicodeSegmentation;
+
 use super::{
     types::Reflow,
     wcwidth::{wcwidth, CodePointsIter},
 };
-extern crate unicode_segmentation;
-use self::unicode_segmentation::UnicodeSegmentation;
 
 pub trait TextProcessing: UnicodeSegmentation + CodePointsIter {
     fn split_graphemes(&self) -> Vec<&str> {
diff --git a/melib/src/text/line_break.rs b/melib/src/text/line_break.rs
index e4e7e269..5a20864d 100644
--- a/melib/src/text/line_break.rs
+++ b/melib/src/text/line_break.rs
@@ -19,12 +19,11 @@
  * along with meli. If not, see <http://www.gnu.org/licenses/>.
  */
 
-extern crate unicode_segmentation;
 use std::{cmp::Ordering, collections::VecDeque, iter::Peekable, str::FromStr};
 
+use unicode_segmentation::UnicodeSegmentation;
 use LineBreakClass::*;
 
-use self::unicode_segmentation::UnicodeSegmentation;
 use super::{
     grapheme_clusters::TextProcessing,
     tables::LINE_BREAK_RULES,
diff --git a/melib/src/text/mod.rs b/melib/src/text/mod.rs
index 4348aff2..1cc1a188 100644
--- a/melib/src/text/mod.rs
+++ b/melib/src/text/mod.rs
@@ -19,6 +19,8 @@
  * along with meli. If not, see <http://www.gnu.org/licenses/>.
  */
 
+use unicode_segmentation::UnicodeSegmentation;
+
 pub mod grapheme_clusters;
 pub mod line_break;
 pub mod search;
@@ -43,8 +45,6 @@ impl Truncate for &str {
             return;
         }
 
-        extern crate unicode_segmentation;
-        use unicode_segmentation::UnicodeSegmentation;
         if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true)
             .take(new_len)
             .last()
@@ -58,8 +58,6 @@ impl Truncate for &str {
             return self;
         }
 
-        extern crate unicode_segmentation;
-        use unicode_segmentation::UnicodeSegmentation;
         if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true)
             .take(new_len)
             .last()
@@ -75,8 +73,6 @@ impl Truncate for &str {
             return "";
         }
 
-        extern crate unicode_segmentation;
-        use unicode_segmentation::UnicodeSegmentation;
         if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) {
             &self[first..]
         } else {
@@ -90,8 +86,6 @@ impl Truncate for &str {
             return;
         }
 
-        extern crate unicode_segmentation;
-        use unicode_segmentation::UnicodeSegmentation;
         if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) {
             *self = &self[first..];
         }
@@ -104,8 +98,6 @@ impl Truncate for String {
             return;
         }
 
-        extern crate unicode_segmentation;
-        use unicode_segmentation::UnicodeSegmentation;
         if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true)
             .take(new_len)
             .last()
@@ -119,8 +111,6 @@ impl Truncate for String {
             return self;
         }
 
-        extern crate unicode_segmentation;
-        use unicode_segmentation::UnicodeSegmentation;
         if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true)
             .take(new_len)
             .last()
@@ -136,8 +126,6 @@ impl Truncate for String {
             return "";
         }
 
-        extern crate unicode_segmentation;
-        use unicode_segmentation::UnicodeSegmentation;
         if let Some((first, _)) =
             UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len)
         {
@@ -153,8 +141,6 @@ impl Truncate for String {
             return;
         }
 
-        extern crate unicode_segmentation;
-        use unicode_segmentation::UnicodeSegmentation;
         if let Some((first, _)) =
             UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len)
         {
diff --git a/melib/src/thread.rs b/melib/src/thread.rs
index e917e263..ebd0b2fb 100644
--- a/melib/src/thread.rs
+++ b/melib/src/thread.rs
@@ -52,7 +52,6 @@ pub use iterators::*;
 use smallvec::SmallVec;
 use uuid::Uuid;
 
-#[cfg(feature = "text-processing")]
 use crate::text::grapheme_clusters::*;
 
 type Envelopes = Arc<RwLock<HashMap<EnvelopeHash, Envelope>>>;
@@ -1223,16 +1222,11 @@ impl Threads {
                 }
                 let ma = &envelopes[&a.unwrap()];
                 let mb = &envelopes[&b.unwrap()];
-                #[cfg(feature = "text-processing")]
                 {
                     ma.subject()
                         .split_graphemes()
                         .cmp(&mb.subject().split_graphemes())
                 }
-                #[cfg(not(feature = "text-processing"))]
-                {
-                    ma.subject().cmp(&mb.subject())
-                }
             }
             (SortField::Subject, SortOrder::Asc) => {
                 let a = &self.thread_nodes[&self.thread_ref(*a).root()].message();
@@ -1252,18 +1246,12 @@ impl Threads {
                 }
                 let ma = &envelopes[&a.unwrap()];
                 let mb = &envelopes[&b.unwrap()];
-                #[cfg(feature = "text-processing")]
                 {
                     mb.subject()
                         .as_ref()
                         .split_graphemes()
                         .cmp(&ma.subject().split_graphemes())
                 }
-
-                #[cfg(not(feature = "text-processing"))]
-                {
-                    mb.subject().as_ref().cmp(&ma.subject())
-                }
             }
         });
     }
@@ -1303,16 +1291,11 @@ impl Threads {
                 }
                 let ma = &envelopes[&a.unwrap()];
                 let mb = &envelopes[&b.unwrap()];
-                #[cfg(feature = "text-processing")]
                 {
                     ma.subject()
                         .split_graphemes()
                         .cmp(&mb.subject().split_graphemes())
                 }
-                #[cfg(not(feature = "text-processing"))]
-                {
-                    ma.subject().cmp(&mb.subject())
-                }
             }
             (SortField::Subject, SortOrder::Asc) => {
                 let a = &self.thread_nodes[a].message();
@@ -1332,18 +1315,12 @@ impl Threads {
                 }
                 let ma = &envelopes[&a.unwrap()];
                 let mb = &envelopes[&b.unwrap()];
-                #[cfg(feature = "text-processing")]
                 {
                     mb.subject()
                         .as_ref()
                         .split_graphemes()
                         .cmp(&ma.subject().split_graphemes())
                 }
-
-                #[cfg(not(feature = "text-processing"))]
-                {
-                    mb.subject().as_ref().cmp(&ma.subject())
-                }
             }
         });
     }
@@ -1379,16 +1356,11 @@ impl Threads {
                 }
                 let ma = &envelopes[&a.unwrap()];
                 let mb = &envelopes[&b.unwrap()];
-                #[cfg(feature = "text-processing")]
                 {
                     ma.subject()
                         .split_graphemes()
                         .cmp(&mb.subject().split_graphemes())
                 }
-                #[cfg(not(feature = "text-processing"))]
-                {
-                    ma.subject().cmp(&mb.subject())
-                }
             }
             (SortField::Subject, SortOrder::Asc) => {
                 let a = &self.thread_nodes[a].message();
@@ -1408,18 +1380,12 @@ impl Threads {
                 }
                 let ma = &envelopes[&a.unwrap()];
                 let mb = &envelopes[&b.unwrap()];
-                #[cfg(feature = "text-processing")]
                 {
                     mb.subject()
                         .as_ref()
                         .split_graphemes()
                         .cmp(&ma.subject().split_graphemes())
                 }
-
-                #[cfg(not(feature = "text-processing"))]
-                {
-                    mb.subject().as_ref().cmp(&ma.subject())
-                }
             }
         });
     }
diff --git a/tools/Cargo.toml b/tools/Cargo.toml
index e5621c9a..5cdf3db9 100644
--- a/tools/Cargo.toml
+++ b/tools/Cargo.toml
@@ -40,7 +40,7 @@ required-features = ["melib/imap"]
 [dependencies]
 crossbeam = { version = "^0.8" }
 meli = { path = "../meli", version = "0.8" }
-melib = { path = "../melib", version = "0.8", features = ["debug-tracing", "unicode-algorithms"] }
+melib = { path = "../melib", version = "0.8", features = ["debug-tracing" ] }
 nix = { version = "^0.24", default-features = false }
 signal-hook = { version = "^0.3", default-features = false, features = ["iterator"] }
 signal-hook-registry = { version = "1.2.0", default-features = false }