From 531541fea4fd259897892c763773b84c49386e12 Mon Sep 17 00:00:00 2001 From: deadc0de6 Date: Wed, 19 Sep 2018 19:30:44 +0200 Subject: [PATCH] adding ability to re-index a storage --- catcli/catcli.py | 35 ++++++++++++++++++--- catcli/noder.py | 24 ++++++++++++-- catcli/walker.py | 81 +++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 118 insertions(+), 22 deletions(-) diff --git a/catcli/catcli.py b/catcli/catcli.py index 5ee8575..3ad6c24 100755 --- a/catcli/catcli.py +++ b/catcli/catcli.py @@ -37,6 +37,7 @@ USAGE = """ Usage: {1} index [--catalog=] [--meta=...] [-acfuV] + {1} update [--catalog=] [-acfuV] {1} ls [--catalog=] [-arVS] [] {1} find [--catalog=] [-abV] {1} rm [--catalog=] [-fV] @@ -51,7 +52,7 @@ Usage: Options: --catalog= Path to the catalog [default: {2}]. --meta= Additional attribute to store [default: ]. - -u --subsize Store size of folders [default: False]. + -u --subsize Store size of directories [default: False]. -a --archive Handle archive file [default: False]. -f --force Do not ask when updating the catalog [default: False]. -b --script Output script to manage found file(s) [default: False]. @@ -64,7 +65,7 @@ Options: """.format(BANNER, NAME, CATALOGPATH) -def cmd_index(args, noder, catalog, top): +def cmd_index(args, noder, catalog, top, debug=False): path = args[''] name = args[''] nohash = not args['--hash'] @@ -79,10 +80,10 @@ def cmd_index(args, noder, catalog, top): node = noder.get_storage_node(top, name) node.parent = None start = datetime.datetime.now() - walker = Walker(noder, nohash=nohash) + walker = Walker(noder, nohash=nohash, debug=debug) attr = noder.format_storage_attr(args['--meta']) root = noder.storage_node(name, path, parent=top, attr=attr) - _, cnt = walker.index(path, name, parent=root, parentpath=path) + _, cnt = walker.index(path, name, root) if subsize: noder.rec_size(root) stop = datetime.datetime.now() @@ -90,6 +91,28 @@ def cmd_index(args, noder, catalog, top): catalog.save(top) +def cmd_update(args, noder, catalog, top, debug=False): + path = args[''] + name = args[''] + nohash = not args['--hash'] + subsize = args['--subsize'] + if not os.path.exists(path): + Logger.err('\"{}\" does not exist'.format(path)) + return + root = noder.get_storage_node(top, name) + if not root: + Logger.err('storage named \"{}\" does not exist'.format(name)) + return + start = datetime.datetime.now() + walker = Walker(noder, nohash=nohash, debug=debug) + cnt = walker.reindex(path, root, top) + if subsize: + noder.rec_size(root) + stop = datetime.datetime.now() + Logger.info('updated {} file(s) in {}'.format(cnt, stop - start)) + catalog.save(top) + + def cmd_ls(args, noder, top): path = args[''] if not path: @@ -203,7 +226,9 @@ def main(): # parse command if args['index']: - cmd_index(args, noder, catalog, top) + cmd_index(args, noder, catalog, top, debug=args['--verbose']) + if args['update']: + cmd_update(args, noder, catalog, top, debug=args['--verbose']) elif args['find']: cmd_find(args, noder, top) elif args['tree']: diff --git a/catcli/noder.py b/catcli/noder.py index 8071902..c62bccc 100644 --- a/catcli/noder.py +++ b/catcli/noder.py @@ -55,16 +55,34 @@ class Noder: continue if n.name == name: return n + return None - def get_node(self, top, path): + def get_node(self, top, path, quiet=False): '''get the node by internal tree path''' r = anytree.resolver.Resolver('name') try: return r.get(top, path) except anytree.resolver.ChildResolverError: - Logger.err('No node at path \"{}\"'.format(path)) + if not quiet: + Logger.err('No node at path \"{}\"'.format(path)) return None + def get_node_if_newer(self, top, path): + '''return the node (if any) and if path is newer''' + treepath = path.lstrip(os.sep) + node = self.get_node(top, treepath, quiet=True) + if not node: + # node does not exist + return None, True + if not node.maccess: + # force re-indexing if no maccess + return node, True + maccess = node.maccess + cur_maccess = os.path.getmtime(path) + if float(cur_maccess) > maccess: + return node, True + return node, False + def get_meta_node(self, top): '''return the meta node if any''' try: @@ -76,7 +94,7 @@ class Noder: def rec_size(self, node): '''recursively traverse tree and store dir size''' if self.verbose: - Logger.info('getting folder size recursively') + Logger.info('getting directory size recursively') if node.type == self.TYPE_FILE: return node.size size = 0 diff --git a/catcli/walker.py b/catcli/walker.py index 56faa97..e908ec8 100644 --- a/catcli/walker.py +++ b/catcli/walker.py @@ -17,12 +17,44 @@ class Walker: MAXLINE = 80 - 15 - def __init__(self, noder, nohash=False): + def __init__(self, noder, nohash=False, debug=False): self.noder = noder self.noder.set_hashing(not nohash) + self.debug = debug - def index(self, path, name, parentpath=None, parent=None, isdir=False): - '''index a folder and store in tree''' + def index(self, path, name, parent): + return self._index(path, name, parent) + + def reindex(self, path, parent, top): + '''reindex a directory and store in tree''' + cnt = 0 + for (root, dirs, files) in os.walk(path): + for f in files: + sub = os.path.join(root, f) + if not self._need_reindex(top, sub): + self._debug('ignore {}'.format(sub)) + continue + self._debug('re-index {}'.format(sub)) + self._log(f) + self.noder.file_node(os.path.basename(f), sub, + parent, path) + cnt += 1 + for d in dirs: + base = os.path.basename(d) + sub = os.path.join(root, d) + if not self._need_reindex(top, sub): + self._debug('ignore {}'.format(sub)) + continue + self._debug('re-index {}'.format(sub)) + dummy = self.noder.dir_node(base, sub, parent, path) + cnt2 = self.reindex(sub, dummy, top) + cnt += cnt2 + break + self._log(None) + return cnt + + def _index(self, path, name, parent): + '''index a directory and store in tree''' if not parent: parent = noder.dir_node(name, path, parent) @@ -30,22 +62,43 @@ class Walker: for (root, dirs, files) in os.walk(path): for f in files: sub = os.path.join(root, f) - n = f - if len(n) > self.MAXLINE: - n = f[:self.MAXLINE] + '...' - Logger.progr('indexing: {:80}'.format(n)) + self._log(f) self.noder.file_node(os.path.basename(f), sub, - parent, parentpath) + parent, path) cnt += 1 for d in dirs: base = os.path.basename(d) sub = os.path.join(root, d) - dummy = self.noder.dir_node(base, sub, parent, parentpath) - _, cnt2 = self.index(sub, base, - parent=dummy, parentpath=parentpath) + dummy = self.noder.dir_node(base, sub, parent, path) + _, cnt2 = self._index(sub, base, dummy) cnt += cnt2 break - # clean line - Logger.progr('{:80}'.format(' ')) - + self._log(None) return parent, cnt + + def _need_reindex(self, top, path): + '''test if node needs re-indexing''' + cnode, newer = self.noder.get_node_if_newer(top, path) + if cnode and not newer: + # ignore this node + return False + if cnode and newer: + # remove this node and re-add + cnode.parent = None + return True + + def _debug(self, string): + if not self.debug: + return + Logger.info(string) + + def _log(self, string): + if self.debug: + return + if not string: + # clean + Logger.progr('{:80}'.format(' ')) + return + if len(string) > self.MAXLINE: + string = string[:self.MAXLINE] + '...' + Logger.progr('indexing: {:80}'.format(string))