From a954dedb77e1c2b52bdec471aaaa74a7ac9e26a2 Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Thu, 4 Apr 2024 12:40:16 -0400 Subject: [PATCH] langchain[minor]: enhance `LocalFileStore` to allow directory/file permissions to be specified (#18857) **Description:** The `LocalFileStore` class can be used to create an on-disk `CacheBackedEmbeddings` cache. However, the default `umask` settings gives file/directory write permissions only to the original user. Once the cache directory is created by the first user, other users cannot write their own cache entries into the directory. To make the cache usable by multiple users, this pull request updates the `LocalFileStore` constructor to allow the permissions for newly created directories and files to be specified. The specified permissions override the default `umask` values. For example, when configured as follows: ```python file_store = LocalFileStore(temp_dir, chmod_dir=0o770, chmod_file=0o660) ``` then "user" and "group" (but not "other") have permissions to access the store, which means: * Anyone in our group could contribute embeddings to the cache. * If we implement cache cleanup/eviction in the future, anyone in our group could perform the cleanup. The default values for the `chmod_dir` and `chmod_file` parameters is `None`, which retains the original behavior of using the default `umask` settings. **Issue:** Implements enhancement #18075. **Testing:** I updated the `LocalFileStore` unit tests to test the permissions. --------- Signed-off-by: chrispy Co-authored-by: Eugene Yurtsev --- .../langchain/storage/file_system.py | 36 +++++++++++++++++-- .../unit_tests/storage/test_filesystem.py | 28 +++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/storage/file_system.py b/libs/langchain/langchain/storage/file_system.py index bd0aca4875..3437684758 100644 --- a/libs/langchain/langchain/storage/file_system.py +++ b/libs/langchain/langchain/storage/file_system.py @@ -36,14 +36,26 @@ class LocalFileStore(ByteStore): """ - def __init__(self, root_path: Union[str, Path]) -> None: + def __init__( + self, + root_path: Union[str, Path], + *, + chmod_file: Optional[int] = None, + chmod_dir: Optional[int] = None, + ) -> None: """Implement the BaseStore interface for the local file system. Args: root_path (Union[str, Path]): The root path of the file store. All keys are interpreted as paths relative to this root. + chmod_file: (optional, defaults to `None`) If specified, sets permissions + for newly created files, overriding the current `umask` if needed. + chmod_dir: (optional, defaults to `None`) If specified, sets permissions + for newly created dirs, overriding the current `umask` if needed. """ self.root_path = Path(root_path).absolute() + self.chmod_file = chmod_file + self.chmod_dir = chmod_dir def _get_full_path(self, key: str) -> Path: """Get the full path for a given key relative to the root path. @@ -66,6 +78,24 @@ class LocalFileStore(ByteStore): return Path(full_path) + def _mkdir_for_store(self, dir: Path) -> None: + """Makes a store directory path (including parents) with specified permissions + + This is needed because `Path.mkdir()` is restricted by the current `umask`, + whereas the explicit `os.chmod()` used here is not. + + Args: + dir: (Path) The store directory to make + + Returns: + None + """ + if not dir.exists(): + self._mkdir_for_store(dir.parent) + dir.mkdir(exist_ok=True) + if self.chmod_dir is not None: + os.chmod(dir, self.chmod_dir) + def mget(self, keys: Sequence[str]) -> List[Optional[bytes]]: """Get the values associated with the given keys. @@ -97,8 +127,10 @@ class LocalFileStore(ByteStore): """ for key, value in key_value_pairs: full_path = self._get_full_path(key) - full_path.parent.mkdir(parents=True, exist_ok=True) + self._mkdir_for_store(full_path.parent) full_path.write_bytes(value) + if self.chmod_file is not None: + os.chmod(full_path, self.chmod_file) def mdelete(self, keys: Sequence[str]) -> None: """Delete the given keys and their associated values. diff --git a/libs/langchain/tests/unit_tests/storage/test_filesystem.py b/libs/langchain/tests/unit_tests/storage/test_filesystem.py index 4213006bf2..26d5cccd68 100644 --- a/libs/langchain/tests/unit_tests/storage/test_filesystem.py +++ b/libs/langchain/tests/unit_tests/storage/test_filesystem.py @@ -29,6 +29,34 @@ def test_mset_and_mget(file_store: LocalFileStore) -> None: assert values == [b"value1", b"value2"] +@pytest.mark.parametrize( + "chmod_dir_s, chmod_file_s", [("777", "666"), ("770", "660"), ("700", "600")] +) +def test_mset_chmod(chmod_dir_s: str, chmod_file_s: str) -> None: + chmod_dir = int(chmod_dir_s, base=8) + chmod_file = int(chmod_file_s, base=8) + + # Create a temporary directory for testing + with tempfile.TemporaryDirectory() as temp_dir: + # Instantiate the LocalFileStore with a directory inside the temporary directory + # as the root path + temp_dir = os.path.join(temp_dir, "store_dir") + file_store = LocalFileStore( + temp_dir, chmod_dir=chmod_dir, chmod_file=chmod_file + ) + + # Set values for keys + key_value_pairs = [("key1", b"value1"), ("key2", b"value2")] + file_store.mset(key_value_pairs) + + # verify the permissions are set correctly + # (test only the standard user/group/other bits) + dir_path = str(file_store.root_path) + file_path = os.path.join(dir_path, "key1") + assert (os.stat(dir_path).st_mode & 0o777) == chmod_dir + assert (os.stat(file_path).st_mode & 0o777) == chmod_file + + def test_mdelete(file_store: LocalFileStore) -> None: # Set values for keys key_value_pairs = [("key1", b"value1"), ("key2", b"value2")]