mirror of https://git.meli.delivery/meli/meli
text_processing: implement Unicode line breaking algorithm
Not conforming to the Unicode standard yet.
parent
d84ceca88e
commit
5b679be782
@ -0,0 +1,162 @@
|
||||
extern crate melib;
|
||||
use melib::Result;
|
||||
use melib::StackVec;
|
||||
|
||||
extern crate text_processing;
|
||||
use text_processing::line_break::*;
|
||||
|
||||
/// Penalty for putting words `i..j` on a single line of at most `width` columns.
///
/// * `i`, `j` - word indices; callers must pass `i < j`.
/// * `width` - maximum line length.
/// * `minima` - best total cost found so far for a break at each word index.
/// * `offsets` - prefix sums of word lengths (`offsets[k]` is the total length
///   of the first `k` words).
///
/// The line length is the summed word lengths plus one extra column for each
/// of the `j - i - 1` gaps between consecutive words. Lines that do not fit
/// get a flat, steep penalty proportional to the overflow; lines that fit cost
/// the square of the unused space on top of `minima[i]`.
///
/// All arithmetic on the result saturates at `usize::MAX`, because `minima`
/// entries may still hold a near-`usize::MAX` "not computed yet" sentinel and
/// must not overflow when slack is added to them.
///
/// # Panics
///
/// Panics if `i` or `j` is out of bounds for `minima`/`offsets`.
fn cost(i: usize, j: usize, width: usize, minima: &[usize], offsets: &[usize]) -> usize {
    let w = offsets[j] - offsets[i] + j - i - 1;
    if w > width {
        // Overfull line: ignore `minima[i]` and charge heavily per extra column.
        return 65536usize.saturating_mul(w - width);
    }
    let slack = width - w;
    minima[i].saturating_add(slack.saturating_mul(slack))
}
|
||||
|
||||
fn smawk(
|
||||
rows: &mut StackVec<usize>,
|
||||
columns: &mut StackVec<usize>,
|
||||
minima: &mut Vec<usize>,
|
||||
breaks: &mut Vec<usize>,
|
||||
width: usize,
|
||||
offsets: &Vec<usize>,
|
||||
) {
|
||||
let mut stack = StackVec::new();
|
||||
let mut i = 0;
|
||||
while i < rows.len() {
|
||||
if stack.len() > 0 {
|
||||
let c = columns[stack.len() - 1];
|
||||
if cost(*stack.iter().last().unwrap(), c, width, minima, offsets)
|
||||
< cost(rows[i], c, width, minima, offsets)
|
||||
{
|
||||
if stack.len() < columns.len() {
|
||||
stack.push(rows[i]);
|
||||
}
|
||||
i += 1;
|
||||
} else {
|
||||
stack.pop();
|
||||
}
|
||||
} else {
|
||||
stack.push(rows[i]);
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
let rows = &mut stack;
|
||||
if columns.len() > 1 {
|
||||
let mut odd_columns = columns.iter().skip(1).step_by(2).cloned().collect();
|
||||
smawk(rows, &mut odd_columns, minima, breaks, width, offsets);
|
||||
for (i, o) in odd_columns.into_iter().enumerate() {
|
||||
columns.set(2 * i + 1, o);
|
||||
}
|
||||
}
|
||||
let mut i = 0;
|
||||
let mut j = 0;
|
||||
while j < columns.len() {
|
||||
let end = if j + 1 < columns.len() {
|
||||
breaks[columns[j + 1]]
|
||||
} else {
|
||||
*rows.iter().last().unwrap()
|
||||
};
|
||||
let c = cost(rows[i], columns[j], width, minima, offsets);
|
||||
if c < minima[columns[j]] {
|
||||
minima[columns[j]] = c;
|
||||
breaks[columns[j]] = rows[i];
|
||||
}
|
||||
if rows[i] < end {
|
||||
i += 1;
|
||||
} else {
|
||||
j += 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn linear(text: &str, width: usize) -> Vec<String> {
|
||||
let mut words = Vec::new();
|
||||
let breaks = LineBreakCandidateIter::new(text).collect::<Vec<(usize, LineBreakCandidate)>>();
|
||||
{
|
||||
let mut prev = 0;
|
||||
for b in breaks {
|
||||
if &text[prev..b.0] != "\n" {
|
||||
words.push(text[prev..b.0].trim_end_matches("\n"));
|
||||
if text[prev..b.0].ends_with("\n") {
|
||||
words.push(" ");
|
||||
}
|
||||
}
|
||||
prev = b.0;
|
||||
}
|
||||
if &text[prev..] != "\n" {
|
||||
words.push(text[prev..].trim_end_matches("\n"));
|
||||
}
|
||||
}
|
||||
let count = words.len();
|
||||
let mut minima = vec![std::usize::MAX - 1; count + 1];
|
||||
minima[0] = 0;
|
||||
let mut offsets = Vec::with_capacity(words.len());
|
||||
offsets.push(0);
|
||||
for w in words.iter() {
|
||||
offsets.push(offsets.iter().last().unwrap() + w.len());
|
||||
}
|
||||
|
||||
let mut breaks = vec![0; count + 1];
|
||||
|
||||
let mut n = count + 1;
|
||||
let mut i = 1;
|
||||
let mut offset = 0;
|
||||
loop {
|
||||
let r = std::cmp::min(n, 2 * i);
|
||||
let edge = i + offset;
|
||||
smawk(
|
||||
&mut (offset..edge).collect(),
|
||||
&mut (edge..(r + offset)).collect(),
|
||||
&mut minima,
|
||||
&mut breaks,
|
||||
width,
|
||||
&offsets,
|
||||
);
|
||||
let x = minima[r - 1 + offset];
|
||||
let mut for_was_broken = false;
|
||||
for j in i..(r - 1) {
|
||||
let y = cost(j + offset, r - 1 + offset, width, &minima, &offsets);
|
||||
if y <= x {
|
||||
n -= j;
|
||||
i = 1;
|
||||
offset += j;
|
||||
for_was_broken = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !for_was_broken || i >= (r - 1) {
|
||||
if r == n {
|
||||
break;
|
||||
}
|
||||
i *= 2;
|
||||
}
|
||||
}
|
||||
let mut lines = Vec::new();
|
||||
let mut j = count;
|
||||
while j > 0 {
|
||||
let mut line = String::new();
|
||||
for i in breaks[j]..j {
|
||||
line.push_str(words[i]);
|
||||
}
|
||||
lines.push(line);
|
||||
j = breaks[j];
|
||||
}
|
||||
lines.reverse();
|
||||
lines
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let text = std::fs::read_to_string(std::env::args().nth(1).unwrap())?;
|
||||
let paragraphs = text.split("\n\n").collect::<Vec<&str>>();
|
||||
for (i, p) in paragraphs.iter().enumerate() {
|
||||
for l in linear(&p, 72) {
|
||||
println!("{}", l.trim());
|
||||
}
|
||||
if i + 1 < paragraphs.len() {
|
||||
println!("");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
@ -0,0 +1,73 @@
|
||||
/// Address of the Unicode Character Database `LineBreak.txt` file that `main`
/// downloads with `curl` to generate `src/tables.rs`.
const LINE_BREAK_TABLE_URL: &str = "http://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt";
|
||||
use std::fs::File;
|
||||
use std::io::prelude::*;
|
||||
use std::io::BufReader;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
|
||||
include!("src/types.rs");
|
||||
|
||||
fn main() -> Result<(), std::io::Error> {
|
||||
let mod_path = PathBuf::from("src/tables.rs");
|
||||
if mod_path.exists() {
|
||||
eprintln!(
|
||||
"{} already exists, delete it if you want to replace it.",
|
||||
mod_path.display()
|
||||
);
|
||||
std::process::exit(0);
|
||||
}
|
||||
let mut tmpdir_path = PathBuf::from(
|
||||
std::str::from_utf8(&Command::new("mktemp").arg("-d").output()?.stdout)
|
||||
.unwrap()
|
||||
.trim(),
|
||||
);
|
||||
tmpdir_path.push("LineBreak.txt");
|
||||
Command::new("curl")
|
||||
.args(&["-o", tmpdir_path.to_str().unwrap(), LINE_BREAK_TABLE_URL])
|
||||
.output()?;
|
||||
|
||||
let file = File::open(&tmpdir_path)?;
|
||||
let buf_reader = BufReader::new(file);
|
||||
|
||||
let mut line_break_table: Vec<(u32, u32, LineBreakClass)> = Vec::with_capacity(3800);
|
||||
for line in buf_reader.lines() {
|
||||
let line = line.unwrap();
|
||||
if line.starts_with('#') || line.starts_with(' ') || line.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let tokens: &str = line.split_whitespace().next().unwrap();
|
||||
|
||||
let semicolon_idx: usize = tokens.chars().position(|c| c == ';').unwrap();
|
||||
/* LineBreak.txt list is ascii encoded so we can assume each char takes one byte: */
|
||||
let chars_str: &str = &tokens[..semicolon_idx];
|
||||
|
||||
let mut codepoint_iter = chars_str.split("..");
|
||||
|
||||
let first_codepoint: u32 =
|
||||
u32::from_str_radix(std::dbg!(codepoint_iter.next().unwrap()), 16).unwrap();
|
||||
|
||||
let sec_codepoint: u32 = codepoint_iter
|
||||
.next()
|
||||
.map(|v| u32::from_str_radix(std::dbg!(v), 16).unwrap())
|
||||
.unwrap_or(first_codepoint);
|
||||
let class = &tokens[semicolon_idx + 1..semicolon_idx + 1 + 2];
|
||||
line_break_table.push((first_codepoint, sec_codepoint, LineBreakClass::from(class)));
|
||||
}
|
||||
|
||||
let mut file = File::create(&mod_path)?;
|
||||
file.write_all(b"use crate::types::LineBreakClass::*;\n")
|
||||
.unwrap();
|
||||
file.write_all(b"use crate::types::LineBreakClass;\n\n")
|
||||
.unwrap();
|
||||
file.write_all(b"const line_break_rules: &'static [(u32, u32, LineBreakClass)] = &[\n")
|
||||
.unwrap();
|
||||
for l in &line_break_table {
|
||||
file.write_all(format!(" (0x{:X}, 0x{:X}, {:?}),\n", l.0, l.1, l.2).as_bytes())
|
||||
.unwrap();
|
||||
}
|
||||
file.write_all(b"];").unwrap();
|
||||
std::fs::remove_file(&tmpdir_path).unwrap();
|
||||
tmpdir_path.pop();
|
||||
std::fs::remove_dir(&tmpdir_path).unwrap();
|
||||
Ok(())
|
||||
}
|
@ -1,4 +1,8 @@
|
||||
pub mod grapheme_clusters;
|
||||
pub mod line_break;
|
||||
mod tables;
|
||||
mod types;
|
||||
pub mod wcwidth;
|
||||
pub use grapheme_clusters::*;
|
||||
pub use line_break::*;
|
||||
pub use wcwidth::*;
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,102 @@
|
||||
/// Line breaking class of a code point, using the two- and three-letter
/// abbreviations found in the Unicode `LineBreak.txt` data file (UAX #14).
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum LineBreakClass {
    /// Mandatory break.
    BK,
    /// Combining mark.
    CM,
    /// Carriage return.
    CR,
    /// Non-breaking ("glue").
    GL,
    /// Line feed.
    LF,
    /// Next line.
    NL,
    /// Space.
    SP,
    /// Word joiner.
    WJ,
    /// Zero width space.
    ZW,
    /// Zero width joiner.
    ZWJ,
    /// Ambiguous (alphabetic or ideographic).
    AI,
    /// Alphabetic.
    AL,
    /// Break opportunity before and after.
    B2,
    /// Break after.
    BA,
    /// Break before.
    BB,
    /// Contingent break opportunity.
    CB,
    /// Conditional Japanese starter.
    CJ,
    /// Close punctuation.
    CL,
    /// Close parenthesis.
    CP,
    /// Emoji base.
    EB,
    /// Emoji modifier.
    EM,
    /// Exclamation / interrogation.
    EX,
    /// Hangul LV syllable.
    H2,
    /// Hangul LVT syllable.
    H3,
    /// Hebrew letter.
    HL,
    /// Hyphen.
    HY,
    /// Ideographic.
    ID,
    /// Inseparable.
    IN,
    /// Infix numeric separator.
    IS,
    /// Hangul L jamo.
    JL,
    /// Hangul T jamo.
    JT,
    /// Hangul V jamo.
    JV,
    /// Nonstarter.
    NS,
    /// Numeric.
    NU,
    /// Open punctuation.
    OP,
    /// Postfix numeric.
    PO,
    /// Prefix numeric.
    PR,
    /// Quotation.
    QU,
    /// Regional indicator.
    RI,
    /// Complex context dependent (South East Asian).
    SA,
    /// Surrogate.
    SG,
    /// Symbols allowing break after.
    SY,
    /// Unknown.
    XX,
}
|
||||
|
||||
use LineBreakClass::*;
|
||||
|
||||
impl From<&str> for LineBreakClass {
|
||||
fn from(val: &str) -> Self {
|
||||
match val {
|
||||
stringify!(BK) => BK,
|
||||
stringify!(CM) => CM,
|
||||
stringify!(CR) => CR,
|
||||
stringify!(GL) => GL,
|
||||
stringify!(LF) => LF,
|
||||
stringify!(NL) => NL,
|
||||
stringify!(SP) => SP,
|
||||
stringify!(WJ) => WJ,
|
||||
stringify!(ZW) => ZW,
|
||||
stringify!(ZWJ) => ZWJ,
|
||||
stringify!(AI) => AI,
|
||||
stringify!(AL) => AL,
|
||||
stringify!(B2) => B2,
|
||||
stringify!(BA) => BA,
|
||||
stringify!(BB) => BB,
|
||||
stringify!(CB) => CB,
|
||||
stringify!(CJ) => CJ,
|
||||
stringify!(CL) => CL,
|
||||
stringify!(CP) => CP,
|
||||
stringify!(EB) => EB,
|
||||
|
||||
stringify!(EM) => EM,
|
||||
stringify!(EX) => EX,
|
||||
stringify!(H2) => H2,
|
||||
stringify!(H3) => H3,
|
||||
stringify!(HL) => HL,
|
||||
stringify!(HY) => HY,
|
||||
stringify!(ID) => ID,
|
||||
stringify!(IN) => IN,
|
||||
stringify!(IS) => IS,
|
||||
stringify!(JL) => JL,
|
||||
|
||||
stringify!(JT) => JT,
|
||||
stringify!(JV) => JV,
|
||||
stringify!(NS) => NS,
|
||||
stringify!(NU) => NU,
|
||||
stringify!(OP) => OP,
|
||||
stringify!(PO) => PO,
|
||||
stringify!(PR) => PR,
|
||||
stringify!(QU) => QU,
|
||||
stringify!(RI) => RI,
|
||||
stringify!(SA) => SA,
|
||||
|
||||
stringify!(SG) => SG,
|
||||
stringify!(SY) => SY,
|
||||
stringify!(XX) => XX,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue