Make unicode-segmentation a hard dependency

meli/melib are UTF8 software, so we should have proper Unicode support.

A compile-time environment variable, `UNICODE_REGENERATE_TABLES`, is added to
force network access and regenerate the cached Unicode tables.

Signed-off-by: Manos Pitsidianakis <manos@pitsidianak.is>
pull/377/head
Manos Pitsidianakis 1 month ago
parent 07072e2e3f
commit ae96038fbf
No known key found for this signature in database
GPG Key ID: 7729C7707F7E09D0

@ -3,7 +3,7 @@
For a quick start, build and install locally: For a quick start, build and install locally:
```sh ```sh
PREFIX=~/.local make install PREFIX=~/.local make install
``` ```
Available subcommands for `make` are listed with `make help`. Available subcommands for `make` are listed with `make help`.
@ -34,6 +34,9 @@ Some functionality is held behind "feature gates", or compile-time flags. The fo
Since its actual use in the code is very limited, it is not recommended to use this (off by default). Since its actual use in the code is very limited, it is not recommended to use this (off by default).
- `static` and `*-static` bundle C libraries in dependencies so that you don't need them installed in your system (on by default). - `static` and `*-static` bundle C libraries in dependencies so that you don't need them installed in your system (on by default).
Though not a feature, the presence of the environment variable `UNICODE_REGENERATE_TABLES` when compiling the `melib` crate will force the regeneration of the Unicode tables, which requires network access.
Otherwise the tables are included with the source code, and there's no real reason to regenerate them unless you intend to modify the code or update to a new Unicode version.
## Build Debian package (*deb*) ## Build Debian package (*deb*)
Building with Debian's packaged cargo might require the installation of these two packages: `librust-openssl-sys-dev librust-libdbus-sys-dev` Building with Debian's packaged cargo might require the installation of these two packages: `librust-openssl-sys-dev librust-libdbus-sys-dev`

@ -14,10 +14,7 @@ path = "fuzz_targets/envelope_parse.rs"
[dependencies] [dependencies]
libfuzzer-sys = "0.3" libfuzzer-sys = "0.3"
melib = { path = "../melib" }
[dependencies.melib]
path = "../melib"
features = ["unicode-algorithms"]
# Prevent this from interfering with workspaces # Prevent this from interfering with workspaces
[workspace] [workspace]

@ -31,7 +31,7 @@ indexmap = { version = "^1.6", features = ["serde-1"] }
libc = { version = "0.2.125", default-features = false, features = ["extra_traits"] } libc = { version = "0.2.125", default-features = false, features = ["extra_traits"] }
libz-sys = { version = "1.1", features = ["static"], optional = true } libz-sys = { version = "1.1", features = ["static"], optional = true }
linkify = { version = "^0.8", default-features = false } linkify = { version = "^0.8", default-features = false }
melib = { path = "../melib", version = "0.8.5-rc.3", features = ["unicode-algorithms"] } melib = { path = "../melib", version = "0.8.5-rc.3", features = [] }
nix = { version = "0.27", default-features = false, features = ["signal", "poll", "term", "ioctl", "process"] } nix = { version = "0.27", default-features = false, features = ["signal", "poll", "term", "ioctl", "process"] }
notify = { version = "4.0.1", default-features = false } # >:c notify = { version = "4.0.1", default-features = false } # >:c
num_cpus = "1.12.0" num_cpus = "1.12.0"

@ -50,7 +50,7 @@ serde_path_to_error = { version = "0.1" }
smallvec = { version = "^1.5.0", features = ["serde"] } smallvec = { version = "^1.5.0", features = ["serde"] }
smol = "1.0.0" smol = "1.0.0"
socket2 = { version = "0.5", features = [] } socket2 = { version = "0.5", features = [] }
unicode-segmentation = { version = "1.2.1", default-features = false, optional = true } unicode-segmentation = { version = "1.2.1", default-features = false }
url = { version = "2.4", optional = true } url = { version = "2.4", optional = true }
uuid = { version = "^1", features = ["serde", "v4", "v5"] } uuid = { version = "^1", features = ["serde", "v4", "v5"] }
xdg = "2.1.0" xdg = "2.1.0"
@ -77,9 +77,6 @@ sqlite3 = ["rusqlite"]
sqlite3-static = ["sqlite3", "rusqlite/bundled-full"] sqlite3-static = ["sqlite3", "rusqlite/bundled-full"]
tls = ["native-tls"] tls = ["native-tls"]
tls-static = ["tls", "native-tls/vendored"] tls-static = ["tls", "native-tls/vendored"]
text-processing = []
unicode-algorithms = ["text-processing", "unicode-segmentation"]
unicode-algorithms-cached = ["text-processing", "unicode-segmentation"]
vcard = [] vcard = []
[build-dependencies] [build-dependencies]

@ -22,24 +22,6 @@ Library for handling mail.
|------------------------------|-------------------------------------|--------------------------| |------------------------------|-------------------------------------|--------------------------|
| `sqlite` | `rusqlite` | Used in IMAP cache. | | `sqlite` | `rusqlite` | Used in IMAP cache. |
|------------------------------|-------------------------------------|--------------------------| |------------------------------|-------------------------------------|--------------------------|
| `unicode-algorithms` | `unicode-segmentation` | Linebreaking algo etc |
| | | For a fresh clean build, |
| | | Network access is |
| | | required to fetch data |
| | | from Unicode's website. |
|------------------------------|-------------------------------------|--------------------------|
| `unicode-algorithms-cached` | `unicode-segmentation` | Linebreaking algo etc |
| | | but it uses a cached |
| | | version of Unicode data |
| | | which might be stale. |
| | | |
| | | Use this feature instead |
| | | of the previous one for |
| | | building without network |
| | | access. |
|------------------------------|-------------------------------------|--------------------------|
| `unicode-algorithms` | `unicode-segmentation` | |
|------------------------------|-------------------------------------|--------------------------|
| `vcard` | | vcard parsing | | `vcard` | | vcard parsing |
|------------------------------|-------------------------------------|--------------------------| |------------------------------|-------------------------------------|--------------------------|
| `gpgme` | | GPG use with libgpgme | | `gpgme` | | GPG use with libgpgme |

@ -21,15 +21,14 @@
#![allow(clippy::needless_range_loop)] #![allow(clippy::needless_range_loop)]
#[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))]
include!("src/text/types.rs"); include!("src/text/types.rs");
fn main() -> Result<(), std::io::Error> { fn main() -> Result<(), std::io::Error> {
#[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))]
{ {
const MOD_PATH: &str = "src/text/tables.rs"; const MOD_PATH: &str = "src/text/tables.rs";
println!("cargo:rerun-if-env-changed=UNICODE_REGENERATE_TABLES");
println!("cargo:rerun-if-changed=build.rs"); println!("cargo:rerun-if-changed=build.rs");
println!("cargo:rerun-if-changed={}", MOD_PATH); println!("cargo:rerun-if-changed={MOD_PATH}");
/* Line break tables */ /* Line break tables */
use std::{ use std::{
fs::File, fs::File,
@ -54,7 +53,7 @@ fn main() -> Result<(), std::io::Error> {
); );
return Ok(()); return Ok(());
} }
if cfg!(feature = "unicode-algorithms-cached") { if std::env::var("UNICODE_REGENERATE_TABLES").is_err() {
const CACHED_MODULE: &[u8] = include_bytes!(concat!("./src/text/tables.rs.gz")); const CACHED_MODULE: &[u8] = include_bytes!(concat!("./src/text/tables.rs.gz"));
let mut gz = GzDecoder::new(CACHED_MODULE); let mut gz = GzDecoder::new(CACHED_MODULE);

@ -20,14 +20,12 @@
*/ */
use super::*; use super::*;
#[cfg(feature = "text-processing")]
use crate::text::grapheme_clusters::TextProcessing; use crate::text::grapheme_clusters::TextProcessing;
pub fn encode_header(value: &str) -> String { pub fn encode_header(value: &str) -> String {
let mut ret = String::with_capacity(value.len()); let mut ret = String::with_capacity(value.len());
let mut is_current_window_ascii = true; let mut is_current_window_ascii = true;
let mut current_window_start = 0; let mut current_window_start = 0;
#[cfg(feature = "text-processing")]
{ {
let graphemes = value.graphemes_indices(); let graphemes = value.graphemes_indices();
for (idx, g) in graphemes { for (idx, g) in graphemes {
@ -81,63 +79,6 @@ pub fn encode_header(value: &str) -> String {
} }
} }
} }
#[cfg(not(feature = "text-processing"))]
{
/* [ref:VERIFY] [ref:TODO]: test this. If it works as fine as the one above, there's no need to
* keep the above implementation. */
for (i, g) in value.char_indices() {
match (g.is_ascii(), is_current_window_ascii) {
(true, true) => {
ret.push(g);
}
(true, false) => {
/* If !g.is_whitespace()
*
* Whitespaces inside encoded tokens must be greedily taken,
* instead of splitting each non-ascii word into separate encoded tokens. */
if !g.is_whitespace() && value.is_char_boundary(i) {
ret.push_str(&format!(
"=?UTF-8?B?{}?=",
BASE64_MIME
.encode(value[current_window_start..i].as_bytes())
.trim()
));
if i != value.len() - 1 {
ret.push(' ');
}
is_current_window_ascii = true;
current_window_start = i;
ret.push(g);
}
}
(false, true) => {
current_window_start = i;
is_current_window_ascii = false;
}
/* RFC2047 recommends:
* 'While there is no limit to the length of a multiple-line header field, each
* line of a header field that contains one or more
* 'encoded-word's is limited to 76 characters.'
* This is a rough compliance.
*/
(false, false)
if value.is_char_boundary(i) && value[current_window_start..i].len() > 76 =>
{
ret.push_str(&format!(
"=?UTF-8?B?{}?=",
BASE64_MIME
.encode(value[current_window_start..i].as_bytes())
.trim()
));
if i != value.len() - 1 {
ret.push(' ');
}
current_window_start = i;
}
(false, false) => {}
}
}
}
/* If the last part of the header value is encoded, it won't be pushed inside /* If the last part of the header value is encoded, it won't be pushed inside
* the previous for block */ * the previous for block */
if !is_current_window_ascii { if !is_current_window_ascii {

@ -132,7 +132,6 @@ pub mod dbg {
} }
} }
#[cfg(feature = "text-processing")]
pub mod text; pub mod text;
pub use utils::{ pub use utils::{

@ -29,12 +29,12 @@
*/ */
use unicode_segmentation::UnicodeSegmentation;
use super::{ use super::{
types::Reflow, types::Reflow,
wcwidth::{wcwidth, CodePointsIter}, wcwidth::{wcwidth, CodePointsIter},
}; };
extern crate unicode_segmentation;
use self::unicode_segmentation::UnicodeSegmentation;
pub trait TextProcessing: UnicodeSegmentation + CodePointsIter { pub trait TextProcessing: UnicodeSegmentation + CodePointsIter {
fn split_graphemes(&self) -> Vec<&str> { fn split_graphemes(&self) -> Vec<&str> {

@ -19,12 +19,11 @@
* along with meli. If not, see <http://www.gnu.org/licenses/>. * along with meli. If not, see <http://www.gnu.org/licenses/>.
*/ */
extern crate unicode_segmentation;
use std::{cmp::Ordering, collections::VecDeque, iter::Peekable, str::FromStr}; use std::{cmp::Ordering, collections::VecDeque, iter::Peekable, str::FromStr};
use unicode_segmentation::UnicodeSegmentation;
use LineBreakClass::*; use LineBreakClass::*;
use self::unicode_segmentation::UnicodeSegmentation;
use super::{ use super::{
grapheme_clusters::TextProcessing, grapheme_clusters::TextProcessing,
tables::LINE_BREAK_RULES, tables::LINE_BREAK_RULES,

@ -19,6 +19,8 @@
* along with meli. If not, see <http://www.gnu.org/licenses/>. * along with meli. If not, see <http://www.gnu.org/licenses/>.
*/ */
use unicode_segmentation::UnicodeSegmentation;
pub mod grapheme_clusters; pub mod grapheme_clusters;
pub mod line_break; pub mod line_break;
pub mod search; pub mod search;
@ -43,8 +45,6 @@ impl Truncate for &str {
return; return;
} }
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true) if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true)
.take(new_len) .take(new_len)
.last() .last()
@ -58,8 +58,6 @@ impl Truncate for &str {
return self; return self;
} }
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true) if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true)
.take(new_len) .take(new_len)
.last() .last()
@ -75,8 +73,6 @@ impl Truncate for &str {
return ""; return "";
} }
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) { if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) {
&self[first..] &self[first..]
} else { } else {
@ -90,8 +86,6 @@ impl Truncate for &str {
return; return;
} }
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) { if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) {
*self = &self[first..]; *self = &self[first..];
} }
@ -104,8 +98,6 @@ impl Truncate for String {
return; return;
} }
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true) if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true)
.take(new_len) .take(new_len)
.last() .last()
@ -119,8 +111,6 @@ impl Truncate for String {
return self; return self;
} }
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true) if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true)
.take(new_len) .take(new_len)
.last() .last()
@ -136,8 +126,6 @@ impl Truncate for String {
return ""; return "";
} }
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((first, _)) = if let Some((first, _)) =
UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len) UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len)
{ {
@ -153,8 +141,6 @@ impl Truncate for String {
return; return;
} }
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
if let Some((first, _)) = if let Some((first, _)) =
UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len) UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len)
{ {

@ -52,7 +52,6 @@ pub use iterators::*;
use smallvec::SmallVec; use smallvec::SmallVec;
use uuid::Uuid; use uuid::Uuid;
#[cfg(feature = "text-processing")]
use crate::text::grapheme_clusters::*; use crate::text::grapheme_clusters::*;
type Envelopes = Arc<RwLock<HashMap<EnvelopeHash, Envelope>>>; type Envelopes = Arc<RwLock<HashMap<EnvelopeHash, Envelope>>>;
@ -1223,16 +1222,11 @@ impl Threads {
} }
let ma = &envelopes[&a.unwrap()]; let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()]; let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{ {
ma.subject() ma.subject()
.split_graphemes() .split_graphemes()
.cmp(&mb.subject().split_graphemes()) .cmp(&mb.subject().split_graphemes())
} }
#[cfg(not(feature = "text-processing"))]
{
ma.subject().cmp(&mb.subject())
}
} }
(SortField::Subject, SortOrder::Asc) => { (SortField::Subject, SortOrder::Asc) => {
let a = &self.thread_nodes[&self.thread_ref(*a).root()].message(); let a = &self.thread_nodes[&self.thread_ref(*a).root()].message();
@ -1252,18 +1246,12 @@ impl Threads {
} }
let ma = &envelopes[&a.unwrap()]; let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()]; let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{ {
mb.subject() mb.subject()
.as_ref() .as_ref()
.split_graphemes() .split_graphemes()
.cmp(&ma.subject().split_graphemes()) .cmp(&ma.subject().split_graphemes())
} }
#[cfg(not(feature = "text-processing"))]
{
mb.subject().as_ref().cmp(&ma.subject())
}
} }
}); });
} }
@ -1303,16 +1291,11 @@ impl Threads {
} }
let ma = &envelopes[&a.unwrap()]; let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()]; let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{ {
ma.subject() ma.subject()
.split_graphemes() .split_graphemes()
.cmp(&mb.subject().split_graphemes()) .cmp(&mb.subject().split_graphemes())
} }
#[cfg(not(feature = "text-processing"))]
{
ma.subject().cmp(&mb.subject())
}
} }
(SortField::Subject, SortOrder::Asc) => { (SortField::Subject, SortOrder::Asc) => {
let a = &self.thread_nodes[a].message(); let a = &self.thread_nodes[a].message();
@ -1332,18 +1315,12 @@ impl Threads {
} }
let ma = &envelopes[&a.unwrap()]; let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()]; let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{ {
mb.subject() mb.subject()
.as_ref() .as_ref()
.split_graphemes() .split_graphemes()
.cmp(&ma.subject().split_graphemes()) .cmp(&ma.subject().split_graphemes())
} }
#[cfg(not(feature = "text-processing"))]
{
mb.subject().as_ref().cmp(&ma.subject())
}
} }
}); });
} }
@ -1379,16 +1356,11 @@ impl Threads {
} }
let ma = &envelopes[&a.unwrap()]; let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()]; let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{ {
ma.subject() ma.subject()
.split_graphemes() .split_graphemes()
.cmp(&mb.subject().split_graphemes()) .cmp(&mb.subject().split_graphemes())
} }
#[cfg(not(feature = "text-processing"))]
{
ma.subject().cmp(&mb.subject())
}
} }
(SortField::Subject, SortOrder::Asc) => { (SortField::Subject, SortOrder::Asc) => {
let a = &self.thread_nodes[a].message(); let a = &self.thread_nodes[a].message();
@ -1408,18 +1380,12 @@ impl Threads {
} }
let ma = &envelopes[&a.unwrap()]; let ma = &envelopes[&a.unwrap()];
let mb = &envelopes[&b.unwrap()]; let mb = &envelopes[&b.unwrap()];
#[cfg(feature = "text-processing")]
{ {
mb.subject() mb.subject()
.as_ref() .as_ref()
.split_graphemes() .split_graphemes()
.cmp(&ma.subject().split_graphemes()) .cmp(&ma.subject().split_graphemes())
} }
#[cfg(not(feature = "text-processing"))]
{
mb.subject().as_ref().cmp(&ma.subject())
}
} }
}); });
} }

@ -40,7 +40,7 @@ required-features = ["melib/imap"]
[dependencies] [dependencies]
crossbeam = { version = "^0.8" } crossbeam = { version = "^0.8" }
meli = { path = "../meli", version = "0.8" } meli = { path = "../meli", version = "0.8" }
melib = { path = "../melib", version = "0.8", features = ["debug-tracing", "unicode-algorithms"] } melib = { path = "../melib", version = "0.8", features = ["debug-tracing" ] }
nix = { version = "^0.24", default-features = false } nix = { version = "^0.24", default-features = false }
signal-hook = { version = "^0.3", default-features = false, features = ["iterator"] } signal-hook = { version = "^0.3", default-features = false, features = ["iterator"] }
signal-hook-registry = { version = "1.2.0", default-features = false } signal-hook-registry = { version = "1.2.0", default-features = false }

Loading…
Cancel
Save