Merge remote-tracking branch 'upstream/master' into mbox-extractor
commit
2259730c67
@ -0,0 +1,27 @@
|
|||||||
|
---
|
||||||
|
name: Bug report
|
||||||
|
about: Create a report to help us improve
|
||||||
|
title: ''
|
||||||
|
labels: bug
|
||||||
|
assignees: ''
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Describe the bug**
|
||||||
|
|
||||||
|
|
||||||
|
**To Reproduce**
|
||||||
|
|
||||||
|
Attach example file:
|
||||||
|
|
||||||
|
Run command:
|
||||||
|
|
||||||
|
**Output**
|
||||||
|
|
||||||
|
**Screenshots**
|
||||||
|
If applicable, add screenshots to help explain your problem.
|
||||||
|
|
||||||
|
**Operating System and Version**
|
||||||
|
|
||||||
|
|
||||||
|
**Output of `rga --version`**
|
@ -0,0 +1,20 @@
|
|||||||
|
---
|
||||||
|
name: Feature request
|
||||||
|
about: Suggest an idea for this project
|
||||||
|
title: ''
|
||||||
|
labels: ''
|
||||||
|
assignees: ''
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Is your feature request related to a problem? Please describe.**
|
||||||
|
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
|
||||||
|
|
||||||
|
**Describe the solution you'd like**
|
||||||
|
A clear and concise description of what you want to happen.
|
||||||
|
|
||||||
|
**Describe alternatives you've considered**
|
||||||
|
A clear and concise description of any alternative solutions or features you've considered.
|
||||||
|
|
||||||
|
**Additional context**
|
||||||
|
Add any other context or screenshots about the feature request here.
|
@ -1,75 +1,25 @@
|
|||||||
# Based on https://github.com/actions-rs/meta/blob/master/recipes/quickstart.md
|
# Based on https://github.com/actions-rs/meta/blob/master/recipes/quickstart.md
|
||||||
#
|
#
|
||||||
# While our "example" application has the platform-specific code,
|
# While our "example" application has platform-specific code,
|
||||||
# for simplicity we are compiling and testing everything on the Ubuntu environment only.
|
# for simplicity we are compiling and testing everything in a nix-on-Linux environment only.
|
||||||
# For multi-OS testing see the `cross.yml` workflow.
|
|
||||||
|
|
||||||
on: [push, pull_request]
|
on: [push, pull_request]
|
||||||
|
|
||||||
name: ci
|
name: ci
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
check:
|
nix-flake-check:
|
||||||
name: Check
|
name: nix flake check
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout sources
|
- name: Checkout sources
|
||||||
uses: actions/checkout@v2
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
- name: Install stable toolchain
|
- name: Install nix
|
||||||
uses: actions-rs/toolchain@v1
|
uses: cachix/install-nix-action@v21
|
||||||
with:
|
|
||||||
profile: minimal
|
|
||||||
toolchain: stable
|
|
||||||
override: true
|
|
||||||
|
|
||||||
- name: Run cargo check
|
- name: Ensure the build succeeds
|
||||||
uses: actions-rs/cargo@v1
|
run: nix build
|
||||||
with:
|
|
||||||
command: check
|
|
||||||
|
|
||||||
test:
|
- name: Run `nix flake check` to run formatters, linters, and tests
|
||||||
name: Test Suite
|
run: nix flake check --print-build-logs
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- name: Checkout sources
|
|
||||||
uses: actions/checkout@v2
|
|
||||||
|
|
||||||
- name: Install stable toolchain
|
|
||||||
uses: actions-rs/toolchain@v1
|
|
||||||
with:
|
|
||||||
profile: minimal
|
|
||||||
toolchain: stable
|
|
||||||
override: true
|
|
||||||
|
|
||||||
- name: Run cargo test
|
|
||||||
uses: actions-rs/cargo@v1
|
|
||||||
with:
|
|
||||||
command: test
|
|
||||||
|
|
||||||
lints:
|
|
||||||
name: Lints
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- name: Checkout sources
|
|
||||||
uses: actions/checkout@v2
|
|
||||||
|
|
||||||
- name: Install stable toolchain
|
|
||||||
uses: actions-rs/toolchain@v1
|
|
||||||
with:
|
|
||||||
profile: minimal
|
|
||||||
toolchain: stable
|
|
||||||
override: true
|
|
||||||
components: rustfmt, clippy
|
|
||||||
|
|
||||||
- name: Run cargo fmt
|
|
||||||
uses: actions-rs/cargo@v1
|
|
||||||
with:
|
|
||||||
command: fmt
|
|
||||||
args: --all -- --check
|
|
||||||
|
|
||||||
- name: Run cargo clippy
|
|
||||||
uses: actions-rs/cargo@v1
|
|
||||||
with:
|
|
||||||
command: clippy
|
|
||||||
args: -- -D warnings
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
After Width: | Height: | Size: 1.9 MiB |
@ -1,135 +1,188 @@
|
|||||||
use crate::{config::CacheConfig, print_bytes, print_dur};
|
use crate::{adapters::FileAdapter, preproc::ActiveAdapters};
|
||||||
use anyhow::{format_err, Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use log::*;
|
use path_clean::PathClean;
|
||||||
use rkv::backend::{BackendEnvironmentBuilder, LmdbEnvironment};
|
use rusqlite::{named_params, OptionalExtension};
|
||||||
use std::{fmt::Display, path::Path, time::Instant};
|
use std::{path::Path, time::UNIX_EPOCH};
|
||||||
|
use tokio_rusqlite::Connection;
|
||||||
|
|
||||||
pub trait PreprocCache: Send + Sync {
|
#[derive(Clone)]
|
||||||
/*/// gets cache at specified key.
|
pub struct CacheKey {
|
||||||
/// if cache hit, return the resulting data
|
adapter: String,
|
||||||
/// else, run the given lambda, and store its result in the cache if present
|
adapter_version: i32,
|
||||||
fn get_or_run<'a>(
|
active_adapters: String,
|
||||||
&mut self,
|
file_path: String,
|
||||||
db_name: &str,
|
file_mtime_unix_ms: i64,
|
||||||
key: &[u8],
|
|
||||||
debug_name: &str,
|
|
||||||
runner: Box<dyn FnOnce() -> Result<Option<Vec<u8>>> + 'a>,
|
|
||||||
) -> Result<Option<Vec<u8>>>;*/
|
|
||||||
|
|
||||||
fn get(&self, db_name: &str, key: &[u8]) -> Result<Option<Vec<u8>>>;
|
|
||||||
fn set(&mut self, db_name: &str, key: &[u8], value: &[u8]) -> Result<()>;
|
|
||||||
}
|
}
|
||||||
|
impl CacheKey {
|
||||||
/// opens a LMDB cache
|
pub fn new(
|
||||||
fn open_cache_db(
|
filepath_hint: &Path,
|
||||||
path: &Path,
|
adapter: &dyn FileAdapter,
|
||||||
) -> Result<std::sync::Arc<std::sync::RwLock<rkv::Rkv<LmdbEnvironment>>>> {
|
active_adapters: &ActiveAdapters,
|
||||||
std::fs::create_dir_all(path)?;
|
) -> Result<CacheKey> {
|
||||||
// use rkv::backend::LmdbEnvironmentFlags;
|
let meta = std::fs::metadata(filepath_hint)
|
||||||
|
.with_context(|| format!("reading metadata for {}", filepath_hint.to_string_lossy()))?;
|
||||||
rkv::Manager::<LmdbEnvironment>::singleton()
|
let modified = meta.modified().expect("weird OS that can't into mtime");
|
||||||
.write()
|
let file_mtime_unix_ms = modified.duration_since(UNIX_EPOCH)?.as_millis() as i64;
|
||||||
.map_err(|_| format_err!("could not write cache db manager"))?
|
let active_adapters = if adapter.metadata().recurses {
|
||||||
.get_or_create(path, |p| {
|
serde_json::to_string(
|
||||||
let mut builder = rkv::Rkv::environment_builder::<rkv::backend::Lmdb>();
|
&active_adapters
|
||||||
builder
|
.iter()
|
||||||
.set_flags(rkv::EnvironmentFlags::NO_SYNC)
|
.map(|a| format!("{}.v{}", a.metadata().name, a.metadata().version))
|
||||||
.set_flags(rkv::EnvironmentFlags::WRITE_MAP) // not durable cuz it's a cache
|
.collect::<Vec<_>>(),
|
||||||
// i'm not sure why NO_TLS is needed. otherwise LMDB transactions (open readers) will keep piling up until it fails with
|
)?
|
||||||
// LmdbError(ReadersFull). Those "open readers" stay even after the corresponding processes exit.
|
} else {
|
||||||
// hope setting this doesn't break integrity
|
"null".to_string()
|
||||||
.set_flags(rkv::EnvironmentFlags::NO_TLS)
|
};
|
||||||
// sometimes, this seems to cause the data.mdb file to appear as 2GB in size (with holes), but sometimes not?
|
Ok(CacheKey {
|
||||||
.set_map_size(2 * 1024 * 1024 * 1024)
|
adapter: adapter.metadata().name.clone(),
|
||||||
.set_max_dbs(100)
|
adapter_version: adapter.metadata().version,
|
||||||
.set_max_readers(128);
|
file_path: filepath_hint.clean().to_string_lossy().to_string(),
|
||||||
rkv::Rkv::from_builder(p, builder)
|
file_mtime_unix_ms,
|
||||||
|
active_adapters,
|
||||||
})
|
})
|
||||||
.map_err(|e| format_err!("could not get/create cache db: {}", e))
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct LmdbCache {
|
#[async_trait::async_trait]
|
||||||
db_arc: std::sync::Arc<std::sync::RwLock<rkv::Rkv<LmdbEnvironment>>>,
|
pub trait PreprocCache {
|
||||||
|
async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>>;
|
||||||
|
async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()>;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl LmdbCache {
|
async fn connect_pragmas(db: &Connection) -> Result<()> {
|
||||||
pub fn open(config: &CacheConfig) -> Result<Option<LmdbCache>> {
|
// https://phiresky.github.io/blog/2020/sqlite-performance-tuning/
|
||||||
if config.disabled {
|
//let want_page_size = 32768;
|
||||||
return Ok(None);
|
//db.execute(&format!("pragma page_size = {};", want_page_size))
|
||||||
}
|
// .context("setup pragma 1")?;
|
||||||
let path = Path::new(&config.path.0);
|
db.call(|db| {
|
||||||
Ok(Some(LmdbCache {
|
db.execute_batch(
|
||||||
db_arc: open_cache_db(path)?,
|
"
|
||||||
}))
|
pragma journal_mode = WAL;
|
||||||
|
pragma foreign_keys = on;
|
||||||
|
pragma temp_store = memory;
|
||||||
|
pragma synchronous = off; -- integrity isn't very important here
|
||||||
|
pragma mmap_size = 30000000000;
|
||||||
|
|
||||||
|
create table if not exists preproc_cache (
|
||||||
|
adapter text not null,
|
||||||
|
adapter_version integer not null,
|
||||||
|
created_unix_ms integer not null default (unixepoch() * 1000),
|
||||||
|
active_adapters text not null, -- 'null' if adapter cannot recurse
|
||||||
|
file_path text not null,
|
||||||
|
file_mtime_unix_ms integer not null,
|
||||||
|
text_content_zstd blob not null
|
||||||
|
) strict;
|
||||||
|
|
||||||
|
create unique index if not exists preproc_cache_idx on preproc_cache (adapter, adapter_version, file_path, active_adapters);
|
||||||
|
",
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.await.context("connect_pragmas")?;
|
||||||
|
let jm: i64 = db
|
||||||
|
.call(|db| db.pragma_query_value(None, "application_id", |r| r.get(0)))
|
||||||
|
.await?;
|
||||||
|
if jm != 924716026 {
|
||||||
|
// (probably) newly created db
|
||||||
|
create_pragmas(db).await.context("create_pragmas")?;
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
async fn create_pragmas(db: &Connection) -> Result<()> {
|
||||||
struct RkvErrWrap(rkv::StoreError);
|
db.call(|db| {
|
||||||
impl Display for RkvErrWrap {
|
db.execute_batch(
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
"
|
||||||
self.0.fmt(f)
|
pragma application_id = 924716026;
|
||||||
}
|
pragma user_version = 2; -- todo: on upgrade clear db if version is unexpected
|
||||||
|
",
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
impl std::error::Error for RkvErrWrap {}
|
struct SqliteCache {
|
||||||
|
db: Connection,
|
||||||
|
}
|
||||||
|
impl SqliteCache {
|
||||||
|
async fn new(path: &Path) -> Result<SqliteCache> {
|
||||||
|
let db = Connection::open(path.join("cache.sqlite3")).await?;
|
||||||
|
connect_pragmas(&db).await?;
|
||||||
|
|
||||||
impl PreprocCache for LmdbCache {
|
Ok(SqliteCache { db })
|
||||||
fn get(&self, db_name: &str, key: &[u8]) -> Result<Option<Vec<u8>>> {
|
}
|
||||||
let start = Instant::now();
|
}
|
||||||
let db_env = self
|
|
||||||
.db_arc
|
|
||||||
.read()
|
|
||||||
.map_err(|_| anyhow::anyhow!("Could not open lock, some lock writer panicked"))?;
|
|
||||||
let db = db_env
|
|
||||||
.open_single(db_name, rkv::store::Options::create())
|
|
||||||
.map_err(RkvErrWrap)
|
|
||||||
.context("could not open cache db store")?;
|
|
||||||
|
|
||||||
let reader = db_env.read().expect("could not get reader");
|
#[async_trait::async_trait]
|
||||||
let cached = db
|
impl PreprocCache for SqliteCache {
|
||||||
.get(&reader, key)
|
async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>> {
|
||||||
.map_err(RkvErrWrap)
|
let key = (*key).clone(); // todo: without cloning
|
||||||
.context("could not read from db")?;
|
Ok(self
|
||||||
|
.db
|
||||||
|
.call(move |db| {
|
||||||
|
db.query_row(
|
||||||
|
"select text_content_zstd from preproc_cache where
|
||||||
|
adapter = :adapter
|
||||||
|
and adapter_version = :adapter_version
|
||||||
|
and active_adapters = :active_adapters
|
||||||
|
and file_path = :file_path
|
||||||
|
and file_mtime_unix_ms = :file_mtime_unix_ms
|
||||||
|
",
|
||||||
|
named_params! {
|
||||||
|
":adapter": &key.adapter,
|
||||||
|
":adapter_version": &key.adapter_version,
|
||||||
|
":active_adapters": &key.active_adapters,
|
||||||
|
":file_path": &key.file_path,
|
||||||
|
":file_mtime_unix_ms": &key.file_mtime_unix_ms
|
||||||
|
},
|
||||||
|
|r| r.get::<_, Vec<u8>>(0),
|
||||||
|
)
|
||||||
|
.optional()
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.context("reading from cache")?)
|
||||||
|
}
|
||||||
|
|
||||||
match cached {
|
async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()> {
|
||||||
Some(rkv::Value::Blob(cached)) => {
|
let key = (*key).clone(); // todo: without cloning
|
||||||
debug!(
|
Ok(self
|
||||||
"cache HIT, reading {} (compressed) from cache",
|
.db
|
||||||
print_bytes(cached.len() as f64)
|
.call(move |db| {
|
||||||
);
|
db.execute(
|
||||||
debug!("reading from cache took {}", print_dur(start));
|
"insert into preproc_cache (adapter, adapter_version, active_adapters, file_path, file_mtime_unix_ms, text_content_zstd) values
|
||||||
Ok(Some(Vec::from(cached)))
|
(:adapter, :adapter_version, :active_adapters, :file_path, :file_mtime_unix_ms, :text_content_zstd)
|
||||||
}
|
on conflict (adapter, adapter_version, active_adapters, file_path) do update set
|
||||||
Some(_) => Err(format_err!("Integrity: value not blob"))?,
|
file_mtime_unix_ms = :file_mtime_unix_ms,
|
||||||
None => Ok(None),
|
created_unix_ms = unixepoch() * 1000,
|
||||||
}
|
text_content_zstd = :text_content_zstd",
|
||||||
|
named_params! {
|
||||||
|
":adapter": &key.adapter,
|
||||||
|
":adapter_version": &key.adapter_version,
|
||||||
|
":active_adapters": &key.active_adapters,
|
||||||
|
":file_path": &key.file_path,
|
||||||
|
":file_mtime_unix_ms": &key.file_mtime_unix_ms,
|
||||||
|
":text_content_zstd": value
|
||||||
|
}
|
||||||
|
).map(|_| ())
|
||||||
|
})
|
||||||
|
.await?)
|
||||||
}
|
}
|
||||||
fn set(&mut self, db_name: &str, key: &[u8], got: &[u8]) -> Result<()> {
|
}
|
||||||
let start = Instant::now();
|
/// opens a default cache
|
||||||
debug!("writing {} to cache", print_bytes(got.len() as f64));
|
pub async fn open_cache_db(path: &Path) -> Result<impl PreprocCache> {
|
||||||
let db_env = self
|
std::fs::create_dir_all(path)?;
|
||||||
.db_arc
|
SqliteCache::new(path).await
|
||||||
.read()
|
}
|
||||||
.map_err(|_| anyhow::anyhow!("Could not open lock, some lock writer panicked"))?;
|
|
||||||
|
|
||||||
let db = db_env
|
#[cfg(test)]
|
||||||
.open_single(db_name, rkv::store::Options::create())
|
mod test {
|
||||||
.map_err(RkvErrWrap)
|
|
||||||
.context("could not open cache db store")?;
|
|
||||||
|
|
||||||
let mut writer = db_env
|
use crate::preproc_cache::*;
|
||||||
.write()
|
|
||||||
.map_err(RkvErrWrap)
|
|
||||||
.with_context(|| format_err!("could not open write handle to cache"))?;
|
|
||||||
|
|
||||||
db.put(&mut writer, key, &rkv::Value::Blob(got))
|
#[tokio::test]
|
||||||
.map_err(RkvErrWrap)
|
async fn test_read_write() -> anyhow::Result<()> {
|
||||||
.with_context(|| format_err!("could not write to cache"))?;
|
let path = tempfile::tempdir()?;
|
||||||
writer
|
let _db = open_cache_db(&path.path().join("foo.sqlite3")).await?;
|
||||||
.commit()
|
// db.set();
|
||||||
.map_err(RkvErrWrap)
|
|
||||||
.with_context(|| "could not write cache".to_string())?;
|
|
||||||
debug!("writing to cache took {}", print_dur(start));
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue