text_processing: implement Unicode line breaking algorithm

Does not yet conform to the Unicode standard.
5 years ago · 5b679be782
parent d84ceca88e
commit 5b679be782
8 changed files with 4438 additions and 0 deletions
--- a/testing/Cargo.toml
+++ b/testing/Cargo.toml
@ -8,7 +8,11 @@ edition = "2018"
 [[bin]]
 name = "emailparse"
 path = "src/email_parse.rs"
+[[bin]]
+name = "linebreak"
+path = "src/linebreak.rs"


 [dependencies]
 melib = { path = "../melib", version = "*" }
+text_processing = { path = "../text_processing", version = "*" }
--- a/testing/src/linebreak.rs
+++ b/testing/src/linebreak.rs
@ -0,0 +1,162 @@
+extern crate melib;
+use melib::Result;
+use melib::StackVec;
+
+extern crate text_processing;
+use text_processing::line_break::*;
+
+/// Cost of putting the words `i..j` on a single line of at most `width` units.
+///
+/// The line length is `offsets[j] - offsets[i]` plus `j - i - 1`, i.e. one extra unit for every
+/// gap between consecutive words. Callers must pass `i < j`, otherwise the unsigned
+/// subtraction underflows.
+///
+/// * A line longer than `width` costs `65536` per unit of overflow; `minima` is not consulted.
+/// * Otherwise the cost is `minima[i]` plus the square of the unused space.
+///
+/// All arithmetic on the result saturates at `usize::MAX`: `minima` entries may still hold a
+/// huge "not computed yet" sentinel, and adding to it must not overflow.
+fn cost(i: usize, j: usize, width: usize, minima: &[usize], offsets: &[usize]) -> usize {
+    let w = offsets[j] - offsets[i] + j - i - 1;
+    if w > width {
+        return 65536usize.saturating_mul(w - width);
+    }
+    let slack = width - w;
+    minima[i].saturating_add(slack.saturating_mul(slack))
+}
+
+/// Updates `minima` and `breaks` for the column indices in `columns`, considering the row
+/// indices in `rows` as candidates.
+///
+/// NOTE(review): the name suggests this is the SMAWK row-minima search driven by `cost`;
+/// confirm against the reference algorithm.
+///
+/// * `rows` - candidate row indices; only read here (a reduced copy is built on a local stack).
+/// * `columns` - column indices to fill in; its odd positions are written back after the
+///   recursive call.
+/// * `minima` - best cost found so far per index; lowered when a cheaper row is found.
+/// * `breaks` - for each index, the row that produced its current entry in `minima`.
+/// * `width`, `offsets` - forwarded unchanged to `cost`.
+fn smawk(
+    rows: &mut StackVec<usize>,
+    columns: &mut StackVec<usize>,
+    minima: &mut Vec<usize>,
+    breaks: &mut Vec<usize>,
+    width: usize,
+    offsets: &Vec<usize>,
+) {
+    // Step 1: build a reduced list of candidate rows on `stack`.
+    let mut stack = StackVec::new();
+    let mut i = 0;
+    while i < rows.len() {
+        if stack.len() > 0 {
+            // Compare the row on top of the stack with rows[i] at the column whose position
+            // matches the current stack height.
+            let c = columns[stack.len() - 1];
+            if cost(*stack.iter().last().unwrap(), c, width, minima, offsets)
+                < cost(rows[i], c, width, minima, offsets)
+            {
+                // The top stays; rows[i] is kept too, but never more rows than columns.
+                if stack.len() < columns.len() {
+                    stack.push(rows[i]);
+                }
+                i += 1;
+            } else {
+                // The top is not strictly cheaper: discard it and retry rows[i] (i unchanged).
+                stack.pop();
+            }
+        } else {
+            stack.push(rows[i]);
+            i += 1;
+        }
+    }
+    // From here on `rows` refers to the reduced list.
+    let rows = &mut stack;
+    // Step 2: recurse on the columns at odd positions (1, 3, 5, ..).
+    if columns.len() > 1 {
+        let mut odd_columns = columns.iter().skip(1).step_by(2).cloned().collect();
+        smawk(rows, &mut odd_columns, minima, breaks, width, offsets);
+        for (i, o) in odd_columns.into_iter().enumerate() {
+            columns.set(2 * i + 1, o);
+        }
+    }
+    // Step 3: fill in the columns at even positions by scanning the reduced rows.
+    let mut i = 0;
+    let mut j = 0;
+    while j < columns.len() {
+        // Last row worth trying for this column: the row recorded for the next (odd) column,
+        // or the last reduced row when there is no next column.
+        let end = if j + 1 < columns.len() {
+            breaks[columns[j + 1]]
+        } else {
+            *rows.iter().last().unwrap()
+        };
+        let c = cost(rows[i], columns[j], width, minima, offsets);
+        if c < minima[columns[j]] {
+            minima[columns[j]] = c;
+            breaks[columns[j]] = rows[i];
+        }
+        if rows[i] < end {
+            i += 1;
+        } else {
+            // Skip the odd position, which the recursive call already handled.
+            j += 2;
+        }
+    }
+}
+
+/// Splits `text` into lines of at most `width` units and returns them in order.
+///
+/// The text is cut into "words" at every offset reported by `LineBreakCandidateIter`; lengths
+/// are measured in bytes (`str::len`). Line boundaries are then chosen by repeatedly calling
+/// `smawk` over growing index ranges and read back from the resulting `breaks` table.
+fn linear(text: &str, width: usize) -> Vec<String> {
+    let mut words = Vec::new();
+    let breaks = LineBreakCandidateIter::new(text).collect::<Vec<(usize, LineBreakCandidate)>>();
+    {
+        // Cut `text` at each candidate offset. A segment that is exactly "\n" is dropped; any
+        // other segment loses its trailing newlines and, if it had one, is followed by a " " word.
+        let mut prev = 0;
+        for b in breaks {
+            if &text[prev..b.0] != "\n" {
+                words.push(text[prev..b.0].trim_end_matches("\n"));
+                if text[prev..b.0].ends_with("\n") {
+                    words.push(" ");
+                }
+            }
+            prev = b.0;
+        }
+        // Whatever follows the last candidate offset.
+        if &text[prev..] != "\n" {
+            words.push(text[prev..].trim_end_matches("\n"));
+        }
+    }
+    let count = words.len();
+    // minima[k]: best cost found so far for index k; everything but index 0 starts at a huge
+    // sentinel value.
+    let mut minima = vec![std::usize::MAX - 1; count + 1];
+    minima[0] = 0;
+    // offsets[k]: total byte length of the first k words.
+    let mut offsets = Vec::with_capacity(words.len());
+    offsets.push(0);
+    for w in words.iter() {
+        offsets.push(offsets.iter().last().unwrap() + w.len());
+    }
+
+    // breaks[k]: index of the word that starts the line ending just before word k.
+    let mut breaks = vec![0; count + 1];
+
+    // `i` is the current block size (doubled on each pass), `offset` the start of the range
+    // still being processed and `n` the number of indices left from `offset` on.
+    let mut n = count + 1;
+    let mut i = 1;
+    let mut offset = 0;
+    loop {
+        let r = std::cmp::min(n, 2 * i);
+        let edge = i + offset;
+        // Rows are offset..edge, columns are edge..(r + offset).
+        smawk(
+            &mut (offset..edge).collect(),
+            &mut (edge..(r + offset)).collect(),
+            &mut minima,
+            &mut breaks,
+            width,
+            &offsets,
+        );
+        let x = minima[r - 1 + offset];
+        let mut for_was_broken = false;
+        for j in i..(r - 1) {
+            let y = cost(j + offset, r - 1 + offset, width, &minima, &offsets);
+            if y <= x {
+                // A row inside the column range does at least as well for the last column:
+                // restart from that row with the smallest block size.
+                n -= j;
+                i = 1;
+                offset += j;
+                for_was_broken = true;
+                break;
+            }
+        }
+
+        if !for_was_broken || i >= (r - 1) {
+            // Every remaining index has been covered.
+            if r == n {
+                break;
+            }
+            i *= 2;
+        }
+    }
+    // Walk `breaks` backwards from the last word, joining the words of each line, then restore
+    // the original order.
+    let mut lines = Vec::new();
+    let mut j = count;
+    while j > 0 {
+        let mut line = String::new();
+        for i in breaks[j]..j {
+            line.push_str(words[i]);
+        }
+        lines.push(line);
+        j = breaks[j];
+    }
+    lines.reverse();
+    lines
+}
+
+/// Re-wraps the file named by the first command-line argument to 72 columns.
+///
+/// The file is split into paragraphs at blank lines (`"\n\n"`); each paragraph is wrapped with
+/// `linear`, every resulting line is printed trimmed, and paragraphs are separated by one empty
+/// line. Prints a usage message and exits with status 1 when no file name is given; read
+/// errors are returned to the caller.
+fn main() -> Result<()> {
+    let path = match std::env::args().nth(1) {
+        Some(path) => path,
+        None => {
+            eprintln!("Usage: linebreak <file>");
+            std::process::exit(1);
+        }
+    };
+    let text = std::fs::read_to_string(path)?;
+    let paragraphs = text.split("\n\n").collect::<Vec<&str>>();
+    for (i, p) in paragraphs.iter().enumerate() {
+        for l in linear(p, 72) {
+            println!("{}", l.trim());
+        }
+        // Blank line between paragraphs, but not after the last one.
+        if i + 1 < paragraphs.len() {
+            println!();
+        }
+    }
+    Ok(())
+}
--- a/text_processing/Cargo.toml
+++ b/text_processing/Cargo.toml
@ -4,6 +4,7 @@ version = "0.0.1"  #:version
 authors = ["Manos Pitsidianakis <el13635@mail.ntua.gr>"]
 workspace = ".."
 edition = "2018"
+build = "build.rs"

 [dependencies]
 unicode-segmentation = "1.2.1"
--- a/text_processing/build.rs
+++ b/text_processing/build.rs
@ -0,0 +1,73 @@
+const LINE_BREAK_TABLE_URL: &str = "http://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt";
+use std::fs::File;
+use std::io::prelude::*;
+use std::io::BufReader;
+use std::path::PathBuf;
+use std::process::Command;
+
+include!("src/types.rs");
+
+/// Generates `src/tables.rs` from the Unicode `LineBreak.txt` data file.
+///
+/// Does nothing when `src/tables.rs` already exists. Otherwise downloads
+/// `LINE_BREAK_TABLE_URL` with `curl` into a directory created by `mktemp -d`, parses every
+/// data line into a `(first code point, last code point, class)` entry and writes the entries
+/// as the `LINE_BREAK_RULES` table that `src/line_break.rs` imports. The temporary file and
+/// directory are removed afterwards.
+///
+/// # Errors
+///
+/// Returns an error when an external command fails, when the downloaded file cannot be read
+/// or parsed, or when the table cannot be written.
+fn main() -> Result<(), std::io::Error> {
+    use std::io::{Error, ErrorKind};
+
+    /// Parses a hexadecimal code point, reporting malformed input as `InvalidData`.
+    fn parse_hex(s: &str) -> Result<u32, Error> {
+        u32::from_str_radix(s, 16).map_err(|err| {
+            Error::new(
+                ErrorKind::InvalidData,
+                format!("invalid code point {:?}: {}", s, err),
+            )
+        })
+    }
+
+    let mod_path = PathBuf::from("src/tables.rs");
+    if mod_path.exists() {
+        eprintln!(
+            "{} already exists, delete it if you want to replace it.",
+            mod_path.display()
+        );
+        return Ok(());
+    }
+    let mktemp = Command::new("mktemp").arg("-d").output()?;
+    if !mktemp.status.success() {
+        return Err(Error::new(ErrorKind::Other, "`mktemp -d` failed"));
+    }
+    let mut tmpdir_path = PathBuf::from(
+        std::str::from_utf8(&mktemp.stdout)
+            .map_err(|err| Error::new(ErrorKind::InvalidData, err))?
+            .trim(),
+    );
+    tmpdir_path.push("LineBreak.txt");
+    /* --fail makes curl exit with an error on HTTP failures instead of saving the error page. */
+    let curl = Command::new("curl")
+        .arg("--fail")
+        .arg("-o")
+        .arg(&tmpdir_path)
+        .arg(LINE_BREAK_TABLE_URL)
+        .output()?;
+    if !curl.status.success() {
+        return Err(Error::new(
+            ErrorKind::Other,
+            format!(
+                "could not download {}: {}",
+                LINE_BREAK_TABLE_URL,
+                String::from_utf8_lossy(&curl.stderr).trim()
+            ),
+        ));
+    }
+
+    let file = File::open(&tmpdir_path)?;
+    let buf_reader = BufReader::new(file);
+
+    let mut line_break_table: Vec<(u32, u32, LineBreakClass)> = Vec::with_capacity(3800);
+    for line in buf_reader.lines() {
+        let line = line?;
+        if line.starts_with('#') || line.starts_with(' ') || line.is_empty() {
+            continue;
+        }
+        /* A data line is `<first>[..<last>];<class>` followed by an optional `# comment`.
+         * Whitespace around the fields is tolerated. */
+        let data = line.split('#').next().unwrap_or("");
+        let mut fields = data.splitn(2, ';');
+        let chars_str = fields.next().unwrap_or("").trim();
+        let class = fields.next().map(str::trim).ok_or_else(|| {
+            Error::new(
+                ErrorKind::InvalidData,
+                format!("missing ';' in line {:?}", line),
+            )
+        })?;
+
+        let mut codepoint_iter = chars_str.split("..");
+        let first_codepoint = parse_hex(codepoint_iter.next().unwrap_or(""))?;
+        /* A single code point is stored as a range of length one. */
+        let sec_codepoint = match codepoint_iter.next() {
+            Some(v) => parse_hex(v)?,
+            None => first_codepoint,
+        };
+        /* The whole class name is used: names are not all two characters long (e.g. `ZWJ`).
+         * NOTE(review): assumes `LineBreakClass::from` in src/types.rs accepts every class name
+         * present in the data file -- confirm. */
+        line_break_table.push((first_codepoint, sec_codepoint, LineBreakClass::from(class)));
+    }
+
+    let mut file = File::create(&mod_path)?;
+    file.write_all(b"use crate::types::LineBreakClass::*;\n")?;
+    file.write_all(b"use crate::types::LineBreakClass;\n\n")?;
+    /* Name and visibility must match `use crate::tables::LINE_BREAK_RULES` in line_break.rs. */
+    file.write_all(b"pub const LINE_BREAK_RULES: &'static [(u32, u32, LineBreakClass)] = &[\n")?;
+    for l in &line_break_table {
+        file.write_all(format!("    (0x{:X}, 0x{:X}, {:?}),\n", l.0, l.1, l.2).as_bytes())?;
+    }
+    file.write_all(b"];\n")?;
+    std::fs::remove_file(&tmpdir_path)?;
+    tmpdir_path.pop();
+    std::fs::remove_dir(&tmpdir_path)?;
+    Ok(())
+}
--- a/text_processing/src/lib.rs
+++ b/text_processing/src/lib.rs
@ -1,4 +1,8 @@
 pub mod grapheme_clusters;
+pub mod line_break;
+mod tables;
+mod types;
 pub mod wcwidth;
 pub use grapheme_clusters::*;
+pub use line_break::*;
 pub use wcwidth::*;
--- a/text_processing/src/line_break.rs
+++ b/text_processing/src/line_break.rs
@ -0,0 +1,703 @@
+extern crate unicode_segmentation;
+use self::unicode_segmentation::UnicodeSegmentation;
+use crate::tables::LINE_BREAK_RULES;
+use crate::types::LineBreakClass;
+use core::cmp::Ordering;
+use core::iter::Peekable;
+use core::str::FromStr;
+use LineBreakClass::*;
+
+/// Kind of line break opportunity reported by `LineBreakCandidateIter`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum LineBreakCandidate {
+    /// The line must be broken at this position (hard line breaks, end of text).
+    MandatoryBreak,
+    /// The line may be broken at this position.
+    BreakAllowed,
+    // NoBreak, Not used.
+}
+
+use LineBreakCandidate::*;
+
+/// Iterator over the line break candidates of a string slice; yields
+/// `(usize, LineBreakCandidate)` pairs.
+pub struct LineBreakCandidateIter<'a> {
+    /// The text being scanned.
+    text: &'a str,
+    /// Peekable iterator over the `(byte index, grapheme)` pairs of `text`.
+    iter: Peekable<unicode_segmentation::GraphemeIndices<'a>>,
+    /// Byte offset into `text` that has been processed so far; starts at 0.
+    pos: usize,
+    /* Needed for rule LB30a */
+    /// Reset to 0 whenever the current grapheme's class is not `RI`.
+    reg_ind_streak: u32,
+}
+
+impl<'a> LineBreakCandidateIter<'a> {
+    /// Creates an iterator over the line break candidates of `text`.
+    ///
+    /// Scanning starts at byte offset 0 and walks the extended grapheme clusters of `text`.
+    pub fn new(text: &'a str) -> Self {
+        let iter = text.grapheme_indices(true).peekable();
+        Self {
+            text,
+            iter,
+            pos: 0,
+            reg_ind_streak: 0,
+        }
+    }
+}
+
+/// Evaluates to the first code point of `$grapheme` as a `Result<char, ParseCharError>`.
+///
+/// `$grapheme` is any expression that derefs to `str`; it is evaluated exactly once. An empty
+/// string yields `Err` instead of panicking.
+///
+/// Side effects: none
+macro_rules! get_base_character {
+    ($grapheme:expr) => {{
+        let grapheme: &str = &$grapheme;
+        /* Byte length of the first code point, or 0 when the string is empty. */
+        let base_len = grapheme.chars().next().map_or(0, char::len_utf8);
+        char::from_str(&grapheme[..base_len])
+    }};
+}
+
+/// Evaluates to the line break class of the first code point of `$grapheme`.
+///
+/// The code point is obtained with `get_base_character!` and looked up with
+/// `search_table(.., LINE_BREAK_RULES)`; when no code point can be extracted the result is `XX`.
+///
+/// Side effects: none
+macro_rules! get_class {
+    ($grapheme:expr) => {{
+        get_base_character!($grapheme)
+            .map(|c| search_table(c as u32, LINE_BREAK_RULES))
+            .unwrap_or(XX)
+    }};
+}
+
+/// Helpers for looking at grapheme classes while iterating.
+///
+/// * `next_grapheme_class!(iter, grapheme)` advances `iter`. If it yields a grapheme, that
+///   grapheme is stored in `grapheme` and the macro evaluates to `Some(class)`; otherwise it
+///   evaluates to `None` and `grapheme` is left untouched.
+///   Side effects: updates `$graph_iter` and potentially `$grapheme`.
+/// * `next_grapheme_class!((peeked is CLASS))` and `next_grapheme_class!((peeked is A, B, ..))`
+///   evaluate to `true` when `peeked` (an `Option` holding an `(index, grapheme)` pair) is
+///   `Some` and the class of its grapheme equals the given class / one of the given classes.
+///   Side effects: none.
+macro_rules! next_grapheme_class {
+    ($graph_iter:ident, $grapheme:ident) => ({
+        match $graph_iter.next() {
+            Some((_, g)) => {
+                $grapheme = g;
+                Some(get_class!(g))
+            }
+            None => None,
+        }
+    });
+    (($next_char:ident is $class:expr)) => ({
+        $next_char.map_or(false, |(_, g)| get_class!(g) == $class)
+    });
+    (($next_char:ident is $($class:ident),+)) => ({
+        $next_char.map_or(false, |(_, g)| {
+            /* Look the class up once and compare it against every alternative. */
+            let class = get_class!(g);
+            $(class == $class)||+
+        })
+    });
+}
+
+/// Returns positions where breaks can happen
+/// Examples:
+/// ```
+/// use text_processing::{self, LineBreakCandidate::{self, *}};
+/// use text_processing::line_break::LineBreakCandidateIter;
+///
+/// assert!(LineBreakCandidateIter::new("").collect::<Vec<(usize, LineBreakCandidate)>>().is_empty());
+/// assert_eq!(&[(7, BreakAllowed), (12, MandatoryBreak)],
+///            LineBreakCandidateIter::new("Sample Text.").collect::<Vec<(usize, LineBreakCandidate)>>().as_slice());
+/// assert_eq!(&[(3, MandatoryBreak), (7, MandatoryBreak), (10, BreakAllowed), (17, MandatoryBreak)],
+///            LineBreakCandidateIter::new("Sa\nmp\r\nle T(e)xt.").collect::<Vec<(usize, LineBreakCandidate)>>().as_slice());
+/// ```
+impl<'a> Iterator for LineBreakCandidateIter<'a> {
+    type Item = (usize, LineBreakCandidate);
+    fn next(&mut self) -> Option<Self::Item> {
+        // After end of text, there are no breaks.
+        if self.pos >= self.text.len() {
+            return None;
+        }
+        // LB3 Always break at the end of text
+        if self.pos + 1 == self.text.len() {
+            self.pos += 1;
+            return Some((self.pos, MandatoryBreak));
+        }
+
+        let (idx, mut grapheme) = self.iter.next().unwrap();
+        let LineBreakCandidateIter {
+            ref mut iter,
+            ref text,
+            ref mut reg_ind_streak,
+            ref mut pos,
+        } = self;
+        let iter = iter.by_ref();
+
+        debug_assert_eq!(idx, *pos);
+
+        // LB2 Never break at the start of text
+        if idx == 0 {
+            *pos += grapheme.len();
+            return self.next();
+        }
+
+        let class = get_class!(grapheme);
+
+        if class != RI {
+            *reg_ind_streak = 0;
+        }
+
+        /* LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ,
+         * SA, SG, and XX into other line breaking classes depending on criteria outside the scope
+         * of this algorithm.
+         *
+         * In the absence of such criteria all characters with a specific combination of original
+         * class and General_Category property value are resolved as follows:
+         * Resolved Original     General_Category
+         * AL       AI, SG, XX   Any
+         * CM       SA           Only Mn or Mc
+         * AL       SA           Any except Mn and Mc
+         * NS       CJ           Any
+         */
+
+        // TODO: LB1
+
+        /* Check if next character class allows breaks before it */
+        let next_char: Option<&(usize, &str)> = iter.peek();
+
+        match class {
+            BK => {
+                // LB4 Always Break after hard line breaks.
+                *pos += grapheme.len();
+                return Some((*pos, MandatoryBreak));
+            }
+            // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks
+            CR if next_grapheme_class!((next_char is LF)) => {
+                *pos += grapheme.len();
+                assert!(Some(LF) == next_grapheme_class!(iter, grapheme));
+                *pos += grapheme.len();
+                return Some((*pos, MandatoryBreak));
+            }
+            CR | LF | NL => {
+                *pos += grapheme.len();
+                return Some((*pos, MandatoryBreak));
+            }
+            _ => {}
+        }
+        if let Some((_, next_grapheme)) = next_char {
+            let next_class = get_class!(next_grapheme);
+            match next_class {
+                /* LB6 Do not break before hard line breaks.  × ( BK | CR | LF | NL ) */
+                BK | CR | LF | NL => {
+                    *pos += grapheme.len();
+                    return self.next();
+                }
+                /* LB7 Do not break before spaces or zero width
+                 * space. × SP × ZW */
+                SP | ZW => {
+                    *pos += grapheme.len();
+                    return self.next();
+                }
+                _ => {}
+            }
+        }
+        match class {
+            ZW => {
+                // LB8 Break before any character following a zero-width space, even if one or more
+                // spaces intervene
+                // ZW SP* ÷
+                *pos += grapheme.len();
+                while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                    *pos += grapheme.len();
+                }
+                return Some((*pos, MandatoryBreak));
+            }
+            ZWJ => {
+                // LB8a Do not break after a zero width joiner.
+                *pos += grapheme.len();
+                return self.next();
+            }
+
+            CM => {
+                // LB9 Do not break a combining character sequence; treat it as if it has the line
+                // breaking class of the base character in all of the following rules. Treat ZWJ as
+                // if it were CM.
+                // Treat X (CM | ZWJ)* as if it were X.
+                // where X is any line break class except BK, CR, LF, NL, SP, or ZW.
+
+                /* Unreachable since we break lines based on graphemes, not characters */
+                unreachable!();
+            }
+            WJ => {
+                /*: LB11 Do not break before or after Word joiner and related characters.*/
+                *pos += grapheme.len();
+                /* Get next grapheme */
+                if next_grapheme_class!(iter, grapheme).is_some() {
+                    *pos += grapheme.len();
+                }
+                return self.next();
+            }
+            GL => {
+                /*LB12 Non-breaking characters: LB12 Do not break after NBSP and related characters.*/
+                *pos += grapheme.len();
+                return self.next();
+            }
+            _ => {}
+        }
+        if let Some((next_idx, next_grapheme)) = next_char {
+            let next_class = get_class!(next_grapheme);
+            match next_class {
+                GL if ![SP, BA, HY].contains(&class) => {
+                    /* LB12a Do not break before NBSP and related characters, except after spaces and
+                     * hyphens.  [^SP BA HY] × GL
+                     * Also LB12 Do not break after NBSP and related characters */
+                    *pos += grapheme.len();
+                    return self.next();
+                }
+                /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
+                CL | CP | EX | IS | SY => {
+                    *pos = *next_idx;
+                    return self.next();
+                }
+                _ => {}
+            }
+        }
+
+        match class {
+            /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
+            SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => {
+                *pos += grapheme.len();
+                while ![CL, CP, EX, IS, SY].contains(&next_grapheme_class!(iter, grapheme).unwrap())
+                {
+                    *pos += grapheme.len();
+                }
+                *pos += grapheme.len();
+                return self.next();
+            }
+            OP => {
+                /* LB14 Do not break after ‘[’, even after spaces.
+                 * OP SP* ×
+                 */
+                while let Some((idx, grapheme)) = self.iter.next() {
+                    *pos = idx + grapheme.len();
+                    if !(get_class!(grapheme) == SP) {
+                        break;
+                    }
+                }
+                return self.next();
+            }
+            QU if get_class!(text[idx..].trim_start()) == OP => {
+                /* LB15 Do not break within ‘”[’, even with intervening spaces.
+                 * QU SP* × OP */
+                *pos += grapheme.len();
+                while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                    *pos += grapheme.len();
+                }
+                *pos = idx;
+                return self.next();
+            }
+            QU => {
+                /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
+                *pos += grapheme.len();
+                if let Some((_, g)) = self.iter.next() {
+                    *pos += g.len();
+                }
+                return self.next();
+            }
+            LineBreakClass::CL | LineBreakClass::CP
+                if get_class!(text[idx..].trim_start()) == NS =>
+            {
+                /* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
+                 * intervening spaces.
+                 * (CL | CP) SP* × NS */
+                *pos += grapheme.len();
+                while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                    *pos += grapheme.len();
+                }
+                return self.next();
+            }
+            B2 if get_class!(text[idx..].trim_start()) == B2 => {
+                *pos += grapheme.len();
+                while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                    *pos += grapheme.len();
+                }
+                return self.next();
+            }
+            SP => {
+                /* LB18 Break after spaces.  SP ÷ */
+                // Space 0x20 is 1 byte long.
+                *pos += 1;
+                return Some((*pos, BreakAllowed));
+            }
+            _ => {}
+        }
+        if let Some((next_idx, next_grapheme)) = next_char {
+            let next_class = get_class!(next_grapheme);
+            match next_class {
+                QU if class != SP => {
+                    /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
+                    *pos = *next_idx + next_grapheme.len();
+                    self.iter.next();
+                    return self.next();
+                }
+                _ => {}
+            }
+        }
+        match class {
+            CB => {
+                /* LB20 Break before and after unresolved CB. */
+                *pos += grapheme.len();
+                return Some((*pos - 1, BreakAllowed));
+            }
+            /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
+             * kana, and other non-starters, or after acute accents.  × BA,  × HY, × NS,  BB × */
+            BB => {
+                *pos += grapheme.len();
+                return self.next();
+            }
+            _ => {}
+        }
+
+        if let Some((_, next_grapheme)) = next_char {
+            let next_class = get_class!(next_grapheme);
+            match next_class {
+                BA | HY | NS => {
+                    /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
+                     * kana, and other non-starters, or after acute accents.  × BA,  × HY, × NS,  BB × */
+                    *pos += grapheme.len();
+                    return self.next();
+                }
+                _ => {}
+            }
+        }
+        match class {
+            HL if next_grapheme_class!((next_char is HY, BA)) => {
+                /* LB21a Don’t break after Hebrew + Hyphen.  HL (HY | BA) × */
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* LB21b Don’t break between Solidus and Hebrew letters.  SY × HL */
+            SY if next_grapheme_class!((next_char is HL)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                /* bypass next_char */
+                self.iter.next().unwrap();
+                if let Some((idx, next_grapheme)) = self.iter.next() {
+                    *pos = idx + next_grapheme.len();
+                }
+                return self.next();
+            }
+            /*  LB22 Do not break between two ellipses, or between letters, numbers or excla-
+             *  mations and ellipsis.
+             *  Examples: ‘9...’, ‘a...’, ‘H...’
+             *  (AL | HL) × IN */
+            AL | HL if next_grapheme_class!((next_char is IN)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /*  EX × IN */
+            EX if next_grapheme_class!((next_char is IN)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            EX => {
+                // LB13
+                *pos += grapheme.len();
+                return self.next();
+            }
+            /*  (ID | EB | EM) × IN */
+            ID | EB | EM if next_grapheme_class!((next_char is IN)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /*  IN × IN */
+            IN if next_grapheme_class!((next_char is IN)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /*  NU × IN */
+            NU if next_grapheme_class!((next_char is IN)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* LB23 Do not break between digits and letters.
+             * (AL | HL) × NU */
+            AL | HL if next_grapheme_class!((next_char is NU)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* NU × (AL | HL) */
+            NU if next_grapheme_class!((next_char is AL, HL)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* LB23a Do not break between numeric prefixes and ideographs, or between ideographs
+             * and numeric postfixes.
+             * PR × (ID | EB | EM) */
+            PR if next_grapheme_class!((next_char is ID, EB, EM)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* (ID | EB | EM) × PO */
+            ID | EB | EM if next_grapheme_class!((next_char is PO)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* LB24 Do not break between numeric prefix/postfix and letters, or between
+            letters and prefix/postfix.
+            (PR | PO) × (AL | HL)*/
+            PR | PO if next_grapheme_class!((next_char is AL, HL)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /*(AL | HL) × (PR | PO) */
+            AL | HL if next_grapheme_class!((next_char is PR, PO)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* LB25 Do not break between the following pairs of classes relevant to numbers:
+             * CL × PO */
+            CL if next_grapheme_class!((next_char is PO)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* CP × PO */
+            CP if next_grapheme_class!((next_char is PO)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* CL × PR */
+            CL if next_grapheme_class!((next_char is PR)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* CP × PR */
+            CP if next_grapheme_class!((next_char is PR)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* NU × PO */
+            NU if next_grapheme_class!((next_char is PO)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* NU × PR */
+            NU if next_grapheme_class!((next_char is PR)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* PO × OP */
+            PO if next_grapheme_class!((next_char is OP)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* PO × NU */
+            PO if next_grapheme_class!((next_char is NU)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* PR × OP */
+            PR if next_grapheme_class!((next_char is OP)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* PR × NU */
+            PR if next_grapheme_class!((next_char is NU)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* HY × NU */
+            HY if next_grapheme_class!((next_char is NU)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* IS × NU */
+            IS if next_grapheme_class!((next_char is NU)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* NU × NU */
+            NU if next_grapheme_class!((next_char is NU)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* SY × NU */
+            SY if next_grapheme_class!((next_char is NU)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* LB26 Do not break a Korean syllable.
+             * JL × (JL | JV | H2 | H3) */
+            JL if next_grapheme_class!((next_char is JL, JV, H2, H3)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* (JV | H2) × (JV | JT) */
+            JV | H2 if next_grapheme_class!((next_char is JV, JT)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* (JT | H3) × JT */
+            JT | H3 if next_grapheme_class!((next_char is JT)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* LB27 Treat a Korean Syllable Block the same as ID.
+             * (JL | JV | JT | H2 | H3) × IN */
+            JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is IN)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* (JL | JV | JT | H2 | H3) × PO */
+            JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is PO)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* PR × (JL | JV | JT | H2 | H3) */
+            PR if next_grapheme_class!((next_char is JL, JV, JT, H2, H3)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* LB28 Do not break between alphabetics (“at”).
+            (AL | HL) × (AL | HL) */
+            AL | HL if next_grapheme_class!((next_char is AL, HL)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
+            IS × (AL | HL) */
+            IS if next_grapheme_class!((next_char is AL, HL)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* LB30 Do not break between letters, numbers, or ordinary symbols and opening
+            or closing parentheses.
+            (AL | HL | NU) × OP */
+            AL | HL | NU if next_grapheme_class!((next_char is OP)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /* CP × (AL | HL | NU) */
+            CP if next_grapheme_class!((next_char is AL, HL , NU)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            /*LB30b Do not break between an emoji base and an emoji modifier.
+             * EB × EM */
+            EB if next_grapheme_class!((next_char is EM)) => {
+                let (idx, next_grapheme) = next_char.unwrap();
+                *pos = idx + next_grapheme.len();
+                self.iter.next();
+                return self.next();
+            }
+            RI => {
+                /* LB30a Break between two regional indicator symbols if and only if there are an
+                 * even number of regional indicators preceding the position of the break.
+                 * sot (RI RI)* RI × RI
+                 * [^RI] (RI RI)* RI × RI */
+                *reg_ind_streak += 1;
+                *pos += grapheme.len();
+                if *reg_ind_streak % 2 == 1 {
+                    return Some((*pos - grapheme.len(), BreakAllowed));
+                }
+                self.iter.next();
+                return self.next();
+            }
+            _ => {
+                *pos += grapheme.len();
+                return Some((*pos - grapheme.len(), BreakAllowed));
+            }
+        }
+    }
+}
+
+/// Looks up the line break class of code point `c` in the range table `t`.
+///
+/// Every entry of `t` is an inclusive `(first, last, class)` code point range.
+/// The lookup is a binary search, so it presumes that `t` is sorted by code
+/// point and that its ranges do not overlap. A code point that is not covered
+/// by any range yields `XX`.
+fn search_table(c: u32, t: &'static [(u32, u32, LineBreakClass)]) -> LineBreakClass {
+    let located = t.binary_search_by(|&(first, last, _)| {
+        if last < c {
+            // The range ends before `c`: the match, if any, is further right.
+            Ordering::Less
+        } else if c < first {
+            // The range starts after `c`: the match, if any, is further left.
+            Ordering::Greater
+        } else {
+            Ordering::Equal
+        }
+    });
+    located.map(|idx| t[idx].2).unwrap_or(XX)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Smoke test: collects every break candidate for a short sample text and
+    /// prints the text segment by segment. Nothing is asserted; the test only
+    /// fails if the iterator panics or yields an offset that cannot be used to
+    /// slice the sample.
+    #[test]
+    fn test_line_breaks() {
+        let text = "Fell past it.\n\n‘Well!’ thought Alice to herself.";
+        let candidates: Vec<(usize, LineBreakCandidate)> =
+            LineBreakCandidateIter::new(text).collect();
+        // Start offset of the segment that has not been printed yet.
+        let mut start = 0;
+        for (offset, _) in candidates {
+            println!("{:?}", &text[start..offset]);
+            start = offset;
+        }
+        // Whatever follows the last candidate.
+        println!("{:?}", &text[start..]);
+    }
+}
--- a/text_processing/src/tables.rs
+++ b/text_processing/src/tables.rs
--- a/text_processing/src/types.rs
+++ b/text_processing/src/types.rs
@ -0,0 +1,102 @@
+/// Line break class of a code point.
+///
+/// The variant names are the two/three letter class abbreviations used by the
+/// Unicode Line Breaking Algorithm (UAX #14). Variant order is kept stable so
+/// that discriminant values do not change.
+///
+/// `Eq` and `Hash` are derived in addition to `PartialEq` so that a class can
+/// be used as a key in hash-based collections.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub enum LineBreakClass {
+    /// Mandatory Break
+    BK,
+    /// Combining Mark
+    CM,
+    /// Carriage Return
+    CR,
+    /// Non-breaking ("Glue")
+    GL,
+    /// Line Feed
+    LF,
+    /// Next Line
+    NL,
+    /// Space
+    SP,
+    /// Word Joiner
+    WJ,
+    /// Zero Width Space
+    ZW,
+    /// Zero Width Joiner
+    ZWJ,
+    /// Ambiguous (Alphabetic or Ideographic)
+    AI,
+    /// Alphabetic
+    AL,
+    /// Break Opportunity Before and After
+    B2,
+    /// Break After
+    BA,
+    /// Break Before
+    BB,
+    /// Contingent Break Opportunity
+    CB,
+    /// Conditional Japanese Starter
+    CJ,
+    /// Close Punctuation
+    CL,
+    /// Close Parenthesis
+    CP,
+    /// Emoji Base
+    EB,
+    /// Emoji Modifier
+    EM,
+    /// Exclamation/Interrogation
+    EX,
+    /// Hangul LV Syllable
+    H2,
+    /// Hangul LVT Syllable
+    H3,
+    /// Hebrew Letter
+    HL,
+    /// Hyphen
+    HY,
+    /// Ideographic
+    ID,
+    /// Inseparable
+    IN,
+    /// Infix Numeric Separator
+    IS,
+    /// Hangul L Jamo
+    JL,
+    /// Hangul T Jamo
+    JT,
+    /// Hangul V Jamo
+    JV,
+    /// Nonstarter
+    NS,
+    /// Numeric
+    NU,
+    /// Open Punctuation
+    OP,
+    /// Postfix Numeric
+    PO,
+    /// Prefix Numeric
+    PR,
+    /// Quotation
+    QU,
+    /// Regional Indicator
+    RI,
+    /// Complex Context Dependent (South East Asian)
+    SA,
+    /// Surrogate
+    SG,
+    /// Symbols Allowing Break After
+    SY,
+    /// Unknown
+    XX,
+}
+
+use LineBreakClass::*;
+
+/// Converts the textual abbreviation of a line break class (e.g. `"AL"`) into
+/// the matching [`LineBreakClass`] variant. The match is exact and
+/// case-sensitive.
+///
+/// # Panics
+///
+/// Panics if `val` is not one of the known class abbreviations. The panic
+/// message includes the offending value so the bad input can be identified.
+impl From<&str> for LineBreakClass {
+    fn from(val: &str) -> Self {
+        match val {
+            "BK" => BK,
+            "CM" => CM,
+            "CR" => CR,
+            "GL" => GL,
+            "LF" => LF,
+            "NL" => NL,
+            "SP" => SP,
+            "WJ" => WJ,
+            "ZW" => ZW,
+            "ZWJ" => ZWJ,
+            "AI" => AI,
+            "AL" => AL,
+            "B2" => B2,
+            "BA" => BA,
+            "BB" => BB,
+            "CB" => CB,
+            "CJ" => CJ,
+            "CL" => CL,
+            "CP" => CP,
+            "EB" => EB,
+            "EM" => EM,
+            "EX" => EX,
+            "H2" => H2,
+            "H3" => H3,
+            "HL" => HL,
+            "HY" => HY,
+            "ID" => ID,
+            "IN" => IN,
+            "IS" => IS,
+            "JL" => JL,
+            "JT" => JT,
+            "JV" => JV,
+            "NS" => NS,
+            "NU" => NU,
+            "OP" => OP,
+            "PO" => PO,
+            "PR" => PR,
+            "QU" => QU,
+            "RI" => RI,
+            "SA" => SA,
+            "SG" => SG,
+            "SY" => SY,
+            "XX" => XX,
+            // Arbitrary strings can reach this arm, so it is not truly
+            // unreachable; report which value was rejected.
+            other => panic!("unknown line break class: {:?}", other),
+        }
+    }
+}