Make unicode-segmentation a hard dependency

meli/melib are UTF-8 software, so we should have proper Unicode support. A compile-time env var, `UNICODE_REGENERATE_TABLES`, is added to force network access and rebuild the cached Unicode tables.

Signed-off-by: Manos Pitsidianakis <manos@pitsidianak.is>
1 month ago · ae96038fbf
parent 07072e2e3f
commit ae96038fbf
13 changed files with 16 additions and 147 deletions
--- a/BUILD.md
+++ b/BUILD.md
@ -3,7 +3,7 @@
 For a quick start, build and install locally:
 ```sh
- PREFIX=~/.local make install
+PREFIX=~/.local make install
 ```
 Available subcommands for `make` are listed with `make help`.
@ -34,6 +34,9 @@ Some functionality is held behind "feature gates", or compile-time flags. The fo
  Since its actual use in the code is very limited, it is not recommended to use this (off by default).
 - `static` and `*-static` bundle C libraries in dependencies so that you don't need them installed in your system (on by default).
 Though not a feature, the presence of the environment variable `UNICODE_REGENERATE_TABLES` at compile time of the `melib` crate will force the regeneration of the Unicode tables.
 Otherwise the tables are included with the source code, and there's no real reason to regenerate them unless you intend to modify the code or update to a new Unicode version.
 ## Build Debian package (*deb*)
 Building with Debian's packaged cargo might require the installation of these two packages: `librust-openssl-sys-dev librust-libdbus-sys-dev`
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@ -14,10 +14,7 @@ path = "fuzz_targets/envelope_parse.rs"
 [dependencies]
 libfuzzer-sys = "0.3"
-
+melib = { path = "../melib" }
 [dependencies.melib]
 path = "../melib"
 features = ["unicode-algorithms"]
 # Prevent this from interfering with workspaces
 [workspace]
--- a/meli/Cargo.toml
+++ b/meli/Cargo.toml
@ -31,7 +31,7 @@ indexmap = { version = "^1.6", features = ["serde-1"] }
 libc = { version = "0.2.125", default-features = false, features = ["extra_traits"] }
 libz-sys = { version = "1.1", features = ["static"], optional = true }
 linkify = { version = "^0.8", default-features = false }
-melib = { path = "../melib", version = "0.8.5-rc.3", features = ["unicode-algorithms"] }
+melib = { path = "../melib", version = "0.8.5-rc.3", features = [] }
 nix = { version = "0.27", default-features = false, features = ["signal", "poll", "term", "ioctl", "process"] }
 notify = { version = "4.0.1", default-features = false } # >:c
 num_cpus = "1.12.0"
--- a/melib/Cargo.toml
+++ b/melib/Cargo.toml
@ -50,7 +50,7 @@ serde_path_to_error = { version = "0.1" }
 smallvec = { version = "^1.5.0", features = ["serde"] }
 smol = "1.0.0"
 socket2 = { version = "0.5", features = [] }
-unicode-segmentation = { version = "1.2.1", default-features = false, optional = true }
+unicode-segmentation = { version = "1.2.1", default-features = false }
 url = { version = "2.4", optional = true }
 uuid = { version = "^1", features = ["serde", "v4", "v5"] }
 xdg = "2.1.0"
@ -77,9 +77,6 @@ sqlite3 = ["rusqlite"]
 sqlite3-static = ["sqlite3", "rusqlite/bundled-full"]
 tls = ["native-tls"]
 tls-static = ["tls", "native-tls/vendored"]
 text-processing = []
 unicode-algorithms = ["text-processing", "unicode-segmentation"]
 unicode-algorithms-cached = ["text-processing", "unicode-segmentation"]
 vcard = []
 [build-dependencies]
--- a/melib/README.md
+++ b/melib/README.md
@ -22,24 +22,6 @@ Library for handling mail.
 |------------------------------|-------------------------------------|--------------------------|
 | `sqlite`                     | `rusqlite`                          | Used in IMAP cache.      |
 |------------------------------|-------------------------------------|--------------------------|
 | `unicode-algorithms`         | `unicode-segmentation`              | Linebreaking algo etc    |
 |                              |                                     | For a fresh clean build, |
 |                              |                                     | Network access is        |
 |                              |                                     | required to fetch data   |
 |                              |                                     | from Unicode's website.  |
 |------------------------------|-------------------------------------|--------------------------|
 | `unicode-algorithms-cached`  | `unicode-segmentation`              | Linebreaking algo etc    |
 |                              |                                     | but it uses a cached     |
 |                              |                                     | version of Unicode data  |
 |                              |                                     | which might be stale.    |
 |                              |                                     |                          |
 |                              |                                     | Use this feature instead |
 |                              |                                     | of the previous one for  |
 |                              |                                     | building without network |
 |                              |                                     | access.                  |
 |------------------------------|-------------------------------------|--------------------------|
 | `unicode-algorithms`         | `unicode-segmentation`              |                          |
 |------------------------------|-------------------------------------|--------------------------|
 | `vcard`                      |                                     | vcard parsing            |
 |------------------------------|-------------------------------------|--------------------------|
 | `gpgme`                      |                                     | GPG use with libgpgme    |
--- a/melib/build.rs
+++ b/melib/build.rs
@ -21,15 +21,14 @@
 #![allow(clippy::needless_range_loop)]
 #[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))]
 include!("src/text/types.rs");
 fn main() -> Result<(), std::io::Error> {
    #[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))]
    {
        const MOD_PATH: &str = "src/text/tables.rs";
        println!("cargo:rerun-if-env-changed=UNICODE_REGENERATE_TABLES");
        println!("cargo:rerun-if-changed=build.rs");
-        println!("cargo:rerun-if-changed={}", MOD_PATH);
+        println!("cargo:rerun-if-changed={MOD_PATH}");
        /* Line break tables */
        use std::{
            fs::File,
@ -54,7 +53,7 @@ fn main() -> Result<(), std::io::Error> {
            );
            return Ok(());
        }
-        if cfg!(feature = "unicode-algorithms-cached") {
+        if std::env::var("UNICODE_REGENERATE_TABLES").is_err() {
            const CACHED_MODULE: &[u8] = include_bytes!(concat!("./src/text/tables.rs.gz"));
            let mut gz = GzDecoder::new(CACHED_MODULE);
--- a/melib/src/email/compose/mime.rs
+++ b/melib/src/email/compose/mime.rs
@ -20,14 +20,12 @@
 */
 use super::*;
 #[cfg(feature = "text-processing")]
 use crate::text::grapheme_clusters::TextProcessing;
 pub fn encode_header(value: &str) -> String {
    let mut ret = String::with_capacity(value.len());
    let mut is_current_window_ascii = true;
    let mut current_window_start = 0;
    #[cfg(feature = "text-processing")]
    {
        let graphemes = value.graphemes_indices();
        for (idx, g) in graphemes {
@ -81,63 +79,6 @@ pub fn encode_header(value: &str) -> String {
            }
        }
    }
    #[cfg(not(feature = "text-processing"))]
    {
        /* [ref:VERIFY] [ref:TODO]: test this. If it works as fine as the one above, there's no need to
         * keep the above implementation. */
        for (i, g) in value.char_indices() {
            match (g.is_ascii(), is_current_window_ascii) {
                (true, true) => {
                    ret.push(g);
                }
                (true, false) => {
                    /* If !g.is_whitespace()
                     *
                     * Whitespaces inside encoded tokens must be greedily taken,
                     * instead of splitting each non-ascii word into separate encoded tokens. */
                    if !g.is_whitespace() && value.is_char_boundary(i) {
                        ret.push_str(&format!(
                            "=?UTF-8?B?{}?=",
                            BASE64_MIME
                                .encode(value[current_window_start..i].as_bytes())
                                .trim()
                        ));
                        if i != value.len() - 1 {
                            ret.push(' ');
                        }
                        is_current_window_ascii = true;
                        current_window_start = i;
                        ret.push(g);
                    }
                }
                (false, true) => {
                    current_window_start = i;
                    is_current_window_ascii = false;
                }
                /* RFC2047 recommends:
                 * 'While there is no limit to the length of a multiple-line header field, each
                 * line of a header field that contains one or more
                 * 'encoded-word's is limited to 76 characters.'
                 * This is a rough compliance.
                 */
                (false, false)
                    if value.is_char_boundary(i) && value[current_window_start..i].len() > 76 =>
                {
                    ret.push_str(&format!(
                        "=?UTF-8?B?{}?=",
                        BASE64_MIME
                            .encode(value[current_window_start..i].as_bytes())
                            .trim()
                    ));
                    if i != value.len() - 1 {
                        ret.push(' ');
                    }
                    current_window_start = i;
                }
                (false, false) => {}
            }
        }
    }
    /* If the last part of the header value is encoded, it won't be pushed inside
     * the previous for block */
    if !is_current_window_ascii {
--- a/melib/src/lib.rs
+++ b/melib/src/lib.rs
@ -132,7 +132,6 @@ pub mod dbg {
    }
 }
 #[cfg(feature = "text-processing")]
 pub mod text;
 pub use utils::{
--- a/melib/src/text/grapheme_clusters.rs
+++ b/melib/src/text/grapheme_clusters.rs
@ -29,12 +29,12 @@
 */
 use unicode_segmentation::UnicodeSegmentation;
 use super::{
    types::Reflow,
    wcwidth::{wcwidth, CodePointsIter},
 };
 extern crate unicode_segmentation;
 use self::unicode_segmentation::UnicodeSegmentation;
 pub trait TextProcessing: UnicodeSegmentation + CodePointsIter {
    fn split_graphemes(&self) -> Vec<&str> {
--- a/melib/src/text/line_break.rs
+++ b/melib/src/text/line_break.rs
@ -19,12 +19,11 @@
 * along with meli. If not, see <http://www.gnu.org/licenses/>.
 */
 extern crate unicode_segmentation;
 use std::{cmp::Ordering, collections::VecDeque, iter::Peekable, str::FromStr};
 use unicode_segmentation::UnicodeSegmentation;
 use LineBreakClass::*;
 use self::unicode_segmentation::UnicodeSegmentation;
 use super::{
    grapheme_clusters::TextProcessing,
    tables::LINE_BREAK_RULES,
--- a/melib/src/text/mod.rs
+++ b/melib/src/text/mod.rs
@ -19,6 +19,8 @@
 * along with meli. If not, see <http://www.gnu.org/licenses/>.
 */
 use unicode_segmentation::UnicodeSegmentation;
 pub mod grapheme_clusters;
 pub mod line_break;
 pub mod search;
@ -43,8 +45,6 @@ impl Truncate for &str {
            return;
        }
        extern crate unicode_segmentation;
        use unicode_segmentation::UnicodeSegmentation;
        if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true)
            .take(new_len)
            .last()
@ -58,8 +58,6 @@ impl Truncate for &str {
            return self;
        }
        extern crate unicode_segmentation;
        use unicode_segmentation::UnicodeSegmentation;
        if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true)
            .take(new_len)
            .last()
@ -75,8 +73,6 @@ impl Truncate for &str {
            return "";
        }
        extern crate unicode_segmentation;
        use unicode_segmentation::UnicodeSegmentation;
        if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) {
            &self[first..]
        } else {
@ -90,8 +86,6 @@ impl Truncate for &str {
            return;
        }
        extern crate unicode_segmentation;
        use unicode_segmentation::UnicodeSegmentation;
        if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) {
            *self = &self[first..];
        }
@ -104,8 +98,6 @@ impl Truncate for String {
            return;
        }
        extern crate unicode_segmentation;
        use unicode_segmentation::UnicodeSegmentation;
        if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true)
            .take(new_len)
            .last()
@ -119,8 +111,6 @@ impl Truncate for String {
            return self;
        }
        extern crate unicode_segmentation;
        use unicode_segmentation::UnicodeSegmentation;
        if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true)
            .take(new_len)
            .last()
@ -136,8 +126,6 @@ impl Truncate for String {
            return "";
        }
        extern crate unicode_segmentation;
        use unicode_segmentation::UnicodeSegmentation;
        if let Some((first, _)) =
            UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len)
        {
@ -153,8 +141,6 @@ impl Truncate for String {
            return;
        }
        extern crate unicode_segmentation;
        use unicode_segmentation::UnicodeSegmentation;
        if let Some((first, _)) =
            UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len)
        {
--- a/melib/src/thread.rs
+++ b/melib/src/thread.rs
@ -52,7 +52,6 @@ pub use iterators::*;
 use smallvec::SmallVec;
 use uuid::Uuid;
 #[cfg(feature = "text-processing")]
 use crate::text::grapheme_clusters::*;
 type Envelopes = Arc<RwLock<HashMap<EnvelopeHash, Envelope>>>;
@ -1223,16 +1222,11 @@ impl Threads {
                }
                let ma = &envelopes[&a.unwrap()];
                let mb = &envelopes[&b.unwrap()];
                #[cfg(feature = "text-processing")]
                {
                    ma.subject()
                        .split_graphemes()
                        .cmp(&mb.subject().split_graphemes())
                }
                #[cfg(not(feature = "text-processing"))]
                {
                    ma.subject().cmp(&mb.subject())
                }
            }
            (SortField::Subject, SortOrder::Asc) => {
                let a = &self.thread_nodes[&self.thread_ref(*a).root()].message();
@ -1252,18 +1246,12 @@ impl Threads {
                }
                let ma = &envelopes[&a.unwrap()];
                let mb = &envelopes[&b.unwrap()];
                #[cfg(feature = "text-processing")]
                {
                    mb.subject()
                        .as_ref()
                        .split_graphemes()
                        .cmp(&ma.subject().split_graphemes())
                }
                #[cfg(not(feature = "text-processing"))]
                {
                    mb.subject().as_ref().cmp(&ma.subject())
                }
            }
        });
    }
@ -1303,16 +1291,11 @@ impl Threads {
                }
                let ma = &envelopes[&a.unwrap()];
                let mb = &envelopes[&b.unwrap()];
                #[cfg(feature = "text-processing")]
                {
                    ma.subject()
                        .split_graphemes()
                        .cmp(&mb.subject().split_graphemes())
                }
                #[cfg(not(feature = "text-processing"))]
                {
                    ma.subject().cmp(&mb.subject())
                }
            }
            (SortField::Subject, SortOrder::Asc) => {
                let a = &self.thread_nodes[a].message();
@ -1332,18 +1315,12 @@ impl Threads {
                }
                let ma = &envelopes[&a.unwrap()];
                let mb = &envelopes[&b.unwrap()];
                #[cfg(feature = "text-processing")]
                {
                    mb.subject()
                        .as_ref()
                        .split_graphemes()
                        .cmp(&ma.subject().split_graphemes())
                }
                #[cfg(not(feature = "text-processing"))]
                {
                    mb.subject().as_ref().cmp(&ma.subject())
                }
            }
        });
    }
@ -1379,16 +1356,11 @@ impl Threads {
                }
                let ma = &envelopes[&a.unwrap()];
                let mb = &envelopes[&b.unwrap()];
                #[cfg(feature = "text-processing")]
                {
                    ma.subject()
                        .split_graphemes()
                        .cmp(&mb.subject().split_graphemes())
                }
                #[cfg(not(feature = "text-processing"))]
                {
                    ma.subject().cmp(&mb.subject())
                }
            }
            (SortField::Subject, SortOrder::Asc) => {
                let a = &self.thread_nodes[a].message();
@ -1408,18 +1380,12 @@ impl Threads {
                }
                let ma = &envelopes[&a.unwrap()];
                let mb = &envelopes[&b.unwrap()];
                #[cfg(feature = "text-processing")]
                {
                    mb.subject()
                        .as_ref()
                        .split_graphemes()
                        .cmp(&ma.subject().split_graphemes())
                }
                #[cfg(not(feature = "text-processing"))]
                {
                    mb.subject().as_ref().cmp(&ma.subject())
                }
            }
        });
    }
--- a/tools/Cargo.toml
+++ b/tools/Cargo.toml
@ -40,7 +40,7 @@ required-features = ["melib/imap"]
 [dependencies]
 crossbeam = { version = "^0.8" }
 meli = { path = "../meli", version = "0.8" }
-melib = { path = "../melib", version = "0.8", features = ["debug-tracing", "unicode-algorithms"] }
+melib = { path = "../melib", version = "0.8", features = ["debug-tracing" ] }
 nix = { version = "^0.24", default-features = false }
 signal-hook = { version = "^0.3", default-features = false, features = ["iterator"] }
 signal-hook-registry = { version = "1.2.0", default-features = false }