Major optimisations: SQLite index + faster disk glob + new exiftool streaming

1. Move from a JSON index to a SQLite database.
  - This allows indexing to be interrupted and resumed
  - Updating the index consumes less RAM than loading and saving an entire JSON object
  - Loading the index consumes less RAM since rows can be streamed, extracting only the properties we need (instead of loading all EXIF data into memory, only to discard most of it later)
  - These changes make a big difference when processing 10,000+ photos (see the sketch after this list)

2. Switch from <glob> to a manual <readdir>
  - <glob> could consume hundreds of MB (or even GB) of RAM when asked to find several thousand files
  - A manual traversal with the <micromatch> library does the same job in a fraction of the time and memory

3. Exiftool optimisations
  - Run one exiftool process per CPU core, still in batch mode (the files to be read are divided into one bucket per core)
  - Stream the exiftool output instead of buffering it all in memory
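
To illustrate point 1, here is a minimal sketch of the resumable upsert pattern, using the better-sqlite3 API and the `files` schema introduced in this commit (the file name and values are illustrative):

```js
const Database = require('better-sqlite3')

const db = new Database('thumbsup.db')
db.exec('CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, timestamp INTEGER, metadata BLOB)')

// each file is upserted as soon as exiftool returns its metadata,
// so an interrupted indexing run can be restarted without losing progress
const upsert = db.prepare('INSERT OR REPLACE INTO files VALUES (?, ?, ?)')
upsert.run('holidays/IMG_5831.jpg', Date.now(), JSON.stringify({ /* exiftool output */ }))

// reading back streams one row at a time, instead of loading the whole index in memory
for (const row of db.prepare('SELECT path, timestamp FROM files').iterate()) {
  console.log(row.path, new Date(row.timestamp))
}
```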
Branch: exif-summary
Author: Romain, 7 years ago
Parent: bfdebe2c0f
Commit: 24b2f9bd7c

.gitignore

@@ -3,3 +3,4 @@ test-integration/output-actual
test-integration/output-expected/public
test-integration/output-expected/metadata.json
.DS_Store
thumbsup.db

Four binary image files added (not shown): 8.8 KiB, 8.4 KiB, 8.6 KiB and 9.5 KiB.

@@ -27,18 +27,23 @@
  },
  "dependencies": {
    "async": "^2.5.0",
    "better-sqlite3": "^4.0.3",
    "debug": "^3.1.0",
    "event-stream": "^3.3.4",
    "exiftool-json-db": "~1.0.0",
    "fs-extra": "^4.0.1",
    "fs-readdir-recursive": "^1.0.0",
    "JSONStream": "^1.3.1",
    "handlebars": "~4.0.5",
    "ini": "^1.3.4",
    "less": "^2.7.1",
    "lightgallery": "1.2.14",
    "listr": "^0.12.0",
    "listr-work-queue": "https://github.com/thumbsup/listr-work-queue.git",
    "lodash": "^4.16.6",
    "moment": "^2.16.0",
    "lodash": "^4.17.4",
    "micromatch": "^3.1.4",
    "moment": "^2.19.2",
    "readdir-enhanced": "^2.0.0",
    "thumbsup-downsize": "^1.0.0",
    "url-join": "^2.0.2",
    "video.js": "^6.2.8",

@@ -0,0 +1,68 @@
# exiftool-batch
> Process a batch of files through [exiftool](http://www.sno.phy.queensu.ca/~phil/exiftool/) and stream the results
## Usage
```js
const exiftool = require('./exiftool-batch/parallel')
const stream = exiftool.parse('./photos', [
  'IMG_000001.jpg',
  'IMG_000002.jpg',
  'IMG_000003.jpg'
])
stream.on('data', entry => console.log(`Processed ${entry.SourceFile}`))
stream.on('end', () => console.log('Finished'))
```
Each stream entry is an object in the following format:
```js
{
  SourceFile: 'NewYork/IMG_5364.jpg',
  File: {
    FileSize: '449 kB',
    MIMEType: 'image/jpeg',
    /* ... */
  },
  EXIF: {
    Orientation: 'Horizontal (normal)',
    DateTimeOriginal: '2017:01:07 13:59:56',
    /* ... */
  },
  Composite: {
    GPSLatitude: '+51.5285578',
    GPSLongitude: -0.2420248,
    /* ... */
  }
}
```
## Stream structure
Some notes on the structure:
- the `SourceFile` property is a path relative to the root folder given to `parse()`
- the format is identical to the raw JSON `exiftool` output
  * the names of the groups and tags are exactly as documented at http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/index.html
  * it doesn't try to parse date strings, and doesn't assume timezones when they are absent
  * it doesn't fix GPS format oddities, like `-10.000` (number) vs `"+10.000"` (string)
  * it doesn't merge similar fields together, like `EXIF:ImageDescription` and `IPTC:Caption-Abstract`
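
Since the output is deliberately left raw, a consumer that wants uniform types has to normalise values itself. As a minimal sketch (the `toNumber` helper is illustrative, not part of this module), the GPS oddity above could be smoothed over like this:

```js
// GPS values may arrive as numbers (-0.2420248) or signed strings ('+51.5285578')
const toNumber = value => (typeof value === 'string' ? parseFloat(value) : value)

stream.on('data', entry => {
  const composite = entry.Composite || {}
  const lat = toNumber(composite.GPSLatitude)
  const long = toNumber(composite.GPSLongitude)
  console.log(`${entry.SourceFile}: ${lat}, ${long}`)
})
```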
## Performance
- uses `exiftool` in batch mode, instead of spawning one instance per file
- runs one `exiftool` process per core to speed up parsing

The following stats were captured while processing 10,000 photos stored on an SSD drive:

| Metric | Value |
|--------|-------|
| Total time | 30 sec |
| Peak throughput | 300 photos / sec |

| Process | CPU (4 cores) | RAM |
|---------|---------------|-----|
| Node | 25% | 70 MB |
| Exiftool | 85% per instance | 20 MB per instance |
| Total | 365% | 150 MB |

@@ -0,0 +1,20 @@
const _ = require('lodash')
const os = require('os')
const es = require('event-stream')
const exiftool = require('./stream.js')

/*
  Fans out the list of files to multiple exiftool processes (= CPU count)
  Returns a single stream of javascript objects, parsed from the JSON response
*/
exports.parse = (rootFolder, filePaths) => {
  // create several buckets of work
  const workers = os.cpus().length
  const buckets = _.chunk(filePaths, Math.ceil(filePaths.length / workers))
  // create several <exiftool> streams that can work in parallel
  const streams = _.range(buckets.length).map(i => {
    return exiftool.parse(rootFolder, buckets[i])
  })
  // merge the object streams
  return es.merge(streams)
}

@@ -0,0 +1,48 @@
const childProcess = require('child_process')
const debug = require('debug')('exiftool-stream')
const es = require('event-stream')
const JSONStream = require('JSONStream')

/*
  Spawns a single <exiftool> process and sends it all the files to be parsed
  Returns a stream which emits JS objects as they get returned
*/
exports.parse = (rootFolder, filePaths) => {
  const args = [
    '-a', // include duplicate tags
    '-s', // print tag names instead of descriptions
    '-g', // include group names, as nested structures
    '-c', // specify the format for GPS lat/long
    '%+.6f', // lat/long = signed floats
    '-struct', // preserve XMP structure
    '-json', // JSON output
    '-@', // specify more arguments separately
    '-' // read arguments from standard in
  ]
  // create a new <exiftool> child process
  const child = childProcess.spawn('exiftool', args, {
    cwd: rootFolder,
    stdio: [ 'pipe', 'pipe', 'ignore' ]
  })
  child.on('error', (err) => {
    debug('Error: please verify that <exiftool> is installed on your system')
    debug(err.toString())
  })
  child.on('close', (code, signal) => {
    debug(`Exiftool exited with code ${code}`)
  })
  // write all the files to <stdin>
  // exiftool will only start processing after <stdin> is closed
  const allFiles = filePaths.join('\n')
  child.stdin.write(allFiles + '\n')
  child.stdin.end()
  // stream <stdout> into a JSON parser,
  // emitting every top-level object on the stream
  return es.pipeline(
    child.stdout,
    JSONStream.parse([true])
  )
}

@@ -0,0 +1,103 @@
# index
> Index a folder of photos and videos, and cache all their metadata for fast retrieval.
## Usage
```js
const Index = require('./index/index')

// index the contents of a folder
const index = new Index('thumbsup.db')
const emitter = index.update('/Volumes/photos')

// indexing stats
// this happens before any new files are parsed for metadata
emitter.on('stats', stats => {
  assert(stats, {
    total: 14710,
    unchanged: 14561,
    added: 135,
    modified: 14,
    deleted: 2
  })
})

// parsing progress, emitted for every new or modified file
emitter.on('progress', progress => {
  assert(progress, {
    path: 'holidays/IMG_5725.jpg',
    processed: 17,
    total: 149
  })
})

// every file in the index, emitted one by one after all parsing is done
// so you can process them as needed, without accumulating
// a large array of metadata in memory if you don't need it
emitter.on('file', file => {
  assert(file, {
    path: 'holidays/IMG_5831.jpg',
    timestamp: new Date(1510410524753),
    metadata: { /* parsed metadata */ }
  })
})

// finished
emitter.on('done', () => {
  console.log('Finished indexing')
})
```
## Metadata format
The `metadata` object contains all metadata embedded inside the image or video files, in the raw `exiftool` format.
See the [exiftool component](../exiftool) for more detail.
For example:
```js
{
  "SourceFile": "NewYork/IMG_5364.jpg",
  "File": {
    "FileSize": "449 kB",
    "MIMEType": "image/jpeg",
    /* ... */
  },
  "EXIF": {
    "Orientation": "Horizontal (normal)",
    "DateTimeOriginal": "2017:01:07 13:59:56",
    /* ... */
  },
  "Composite": {
    "GPSLatitude": "+51.5285578",
    "GPSLongitude": -0.2420248,
    /* ... */
  }
}
```
## Database size
The SQLite database grows proportionally with the number of files indexed.
However, it doesn't shrink automatically when files are deleted.
If needed, you can compact the database periodically using:
```js
const index = new Index('thumbsup.db')
index.vacuum()
```
## Technical design
The index's role is to:
- keep a local [SQLite](https://sqlite.org) database of known files and their metadata
- compare it with the current files on disk
- process any new/updated files so the database is up to date

The SQLite database includes a `files` table with the following columns:
- `path`: the path to every photo & video file, relative to the source folder
- `timestamp`: the file modification date (milliseconds since Epoch)
- `metadata`: the file's raw metadata, stored as a blob (JSON format)
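
For reference, a sketch of the corresponding SQL (this mirrors the `CREATE TABLE` statement used internally, with one comment per column):

```js
db.exec(`CREATE TABLE IF NOT EXISTS files (
  path TEXT PRIMARY KEY, -- relative path to the photo or video
  timestamp INTEGER,     -- file modification time, in ms since Epoch
  metadata BLOB          -- raw exiftool metadata, stored as JSON
)`)
```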

@@ -0,0 +1,33 @@
const _ = require('lodash')

/*
  Calculates the difference between the files on disk and those already indexed
  - databaseMap = hashmap of {path: timestamp}
  - diskMap = hashmap of {path: timestamp}
*/
exports.calculate = (databaseMap, diskMap) => {
  const delta = {
    unchanged: [],
    added: [],
    modified: [],
    deleted: []
  }
  _.each(databaseMap, (dbTime, dbPath) => {
    if (diskMap.hasOwnProperty(dbPath)) {
      // only consider a file modified if the timestamps differ by more than 1 second
      const modified = Math.abs(dbTime - diskMap[dbPath]) > 1000
      if (modified) {
        delta.modified.push(dbPath)
      } else {
        delta.unchanged.push(dbPath)
      }
    } else {
      delta.deleted.push(dbPath)
    }
  })
  _.each(diskMap, (diskTime, diskPath) => {
    if (!databaseMap.hasOwnProperty(diskPath)) {
      delta.added.push(diskPath)
    }
  })
  return delta
}

@@ -0,0 +1,27 @@
const micromatch = require('micromatch')
const readdir = require('readdir-enhanced')

const PHOTO_EXT = ['bmp', 'gif', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'webp']
const VIDEO_EXT = ['3gp', 'flv', 'm2ts', 'mkv', 'mp4', 'mov', 'mts', 'ogg', 'ogv', 'webm']
const MEDIA_GLOB = '**/*.{' + PHOTO_EXT.join(',') + ',' + VIDEO_EXT.join(',') + '}'

/*
  Returns a hashmap of {path: timestamp}
  for all the media files in the target folder
*/
exports.find = function (rootFolder, callback) {
  const entries = {}
  const stream = readdir.readdirStreamStat(rootFolder, {
    filter: entry => micromatch(entry.path, MEDIA_GLOB).length !== 0,
    deep: stats => stats.path.charAt(0) !== '.', // don't descend into hidden folders
    basePath: '',
    sep: '/'
  })
  stream.on('data', stats => { entries[stats.path] = stats.mtime.getTime() })
  stream.on('error', err => isEnoentCodeError(err) ? callback(null, {}) : callback(err))
  stream.on('end', () => callback(null, entries))
}

function isEnoentCodeError (err) {
  return err.code === 'ENOENT'
}

@@ -0,0 +1,104 @@
const _ = require('lodash')
const Database = require('better-sqlite3')
const delta = require('./delta')
const EventEmitter = require('events')
const exiftool = require('../exiftool/parallel')
const fs = require('fs-extra')
const globber = require('./glob')
const moment = require('moment')
const path = require('path')

const EXIF_DATE_FORMAT = 'YYYY:MM:DD HH:mm:ssZ'

class Index {
  constructor (indexPath) {
    // create the database if it doesn't exist
    fs.mkdirpSync(path.dirname(indexPath))
    this.db = new Database(indexPath, {})
    this.db.exec('CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, timestamp INTEGER, metadata BLOB)')
  }

  /*
    Indexes all the files in <mediaFolder>, and stores the results in the database
  */
  update (mediaFolder) {
    // will emit many different events
    const emitter = new EventEmitter()
    // prepared database statements
    const selectStatement = this.db.prepare('SELECT path, timestamp FROM files')
    const insertStatement = this.db.prepare('INSERT OR REPLACE INTO files VALUES (?, ?, ?)')
    const deleteStatement = this.db.prepare('DELETE FROM files WHERE path = ?')
    const countStatement = this.db.prepare('SELECT COUNT(*) AS count FROM files')
    const selectMetadata = this.db.prepare('SELECT * FROM files')
    // create a hashmap of all files in the database
    const databaseMap = {}
    for (const row of selectStatement.iterate()) {
      databaseMap[row.path] = row.timestamp
    }
    function finished () {
      // emit every file in the index
      for (const row of selectMetadata.iterate()) {
        emitter.emit('file', {
          path: row.path,
          timestamp: new Date(row.timestamp),
          metadata: JSON.parse(row.metadata)
        })
      }
      // emit the final count
      const result = countStatement.get()
      emitter.emit('done', {count: result.count})
    }
    // find all files on disk
    globber.find(mediaFolder, (err, diskMap) => {
      if (err) return console.error('error', err)
      // calculate the difference: which files have been added, modified, etc
      const deltaFiles = delta.calculate(databaseMap, diskMap)
      emitter.emit('stats', {
        unchanged: deltaFiles.unchanged.length,
        added: deltaFiles.added.length,
        modified: deltaFiles.modified.length,
        deleted: deltaFiles.deleted.length,
        total: Object.keys(diskMap).length
      })
      // remove deleted files from the DB
      _.each(deltaFiles.deleted, path => {
        deleteStatement.run(path)
      })
      // check if any files need parsing
      var processed = 0
      const toProcess = _.union(deltaFiles.added, deltaFiles.modified)
      if (toProcess.length === 0) {
        return finished()
      }
      // call <exiftool> on added and modified files
      // and write each entry to the database
      const stream = exiftool.parse(mediaFolder, toProcess)
      stream.on('data', entry => {
        const timestamp = moment(entry.File.FileModifyDate, EXIF_DATE_FORMAT).valueOf()
        insertStatement.run(entry.SourceFile, timestamp, JSON.stringify(entry))
        ++processed
        emitter.emit('progress', {path: entry.SourceFile, processed: processed, total: toProcess.length})
      }).on('end', finished)
    })
    return emitter
  }

  /*
    Runs a full vacuum to optimise the database,
    which can be useful if files are often deleted or modified
  */
  vacuum () {
    this.db.exec('VACUUM')
  }
}

module.exports = Index

@@ -1,5 +1,4 @@
const fs = require('fs-extra')
const path = require('path')
const Listr = require('listr')
const steps = require('./steps/index')
const summary = require('./steps/summary')
@@ -8,28 +7,19 @@ const website = require('./website/website')
exports.build = function (opts) {
  const tasks = new Listr([
    {
      title: 'Updating database',
      title: 'Indexing folder',
      task: (ctx, task) => {
        // returns an observable which will complete when the database is loaded
        fs.mkdirpSync(opts.output)
        const databaseFile = path.join(opts.output, 'metadata.json')
        return steps.database(opts.input, databaseFile, (err, res) => {
        return steps.index(opts, (err, files, album) => {
          if (!err) {
            ctx.database = res.database
            ctx.files = files
            ctx.album = album
          }
        })
      }
    },
    {
      title: 'Creating model',
      task: (ctx) => {
        const res = steps.model(ctx.database, opts)
        ctx.files = res.files
        ctx.album = res.album
      }
    },
    {
      title: 'Processing media',
      title: 'Resizing media',
      task: (ctx, task) => {
        return steps.process(ctx.files, opts, task)
      }
@@ -1,5 +1,6 @@
exports.database = require('./step-database').run
exports.model = require('./step-model').run
exports.index = require('./step-index').run
// exports.model = require('./step-model').run
exports.process = require('./step-process').run
exports.cleanup = require('./step-cleanup').run
// exports.website = require('./website')

@@ -0,0 +1,46 @@
/*
--------------------------------------------------------------------------------
Provides most metadata based on the output of <exiftool>
Caches the results in a SQLite database (<thumbsup.db>) for faster re-runs
--------------------------------------------------------------------------------
*/
const hierarchy = require('../input/hierarchy.js')
const Index = require('../components/index/index')
const mapper = require('../input/mapper')
const Metadata = require('../model/metadata')
const File = require('../model/file')
const Observable = require('zen-observable')
const path = require('path')
const Picasa = require('../input/picasa')

exports.run = function (opts, callback) {
  return new Observable(observer => {
    const picasaReader = new Picasa()
    const index = new Index(path.join(opts.output, 'thumbsup.db'))
    const emitter = index.update(opts.input)
    const files = []
    emitter.on('progress', stats => {
      observer.next(`Indexing ${stats.processed}/${stats.total}`)
    })
    emitter.on('file', file => {
      const picasa = picasaReader.file(file.metadata.SourceFile)
      const meta = new Metadata(file.metadata, picasa || {})
      const model = new File(file.metadata, meta, opts)
      files.push(model)
    })
    emitter.on('done', stats => {
      const albumMapper = mapper.create(opts)
      const album = hierarchy.createAlbums(files, albumMapper, opts)
      callback(null, files, album)
      observer.complete()
    })
  })
}

@@ -0,0 +1,27 @@
const exiftool = require('../../../src/components/exiftool/parallel')
const path = require('path')
const readdir = require('readdir-enhanced')
const should = require('should/as-function')

// Find all test photos
const folder = path.join(__dirname, '..', '..', '..', 'fixtures')
const files = readdir.sync(folder, {
  filter: stats => stats.isFile() && stats.path.charAt(0) !== '.',
  deep: true
})

describe('exiftool', () => {
  it('processes all files', (done) => {
    const processed = []
    const stream = exiftool.parse(folder, files)
    stream.on('data', entry => {
      processed.push(entry.SourceFile)
    })
    .on('end', () => {
      files.sort()
      processed.sort()
      should(processed).eql(files)
      done()
    })
  })
})

@@ -0,0 +1,114 @@
const delta = require('../../../src/components/index/delta')
const should = require('should/as-function')

describe('Index: delta', () => {
  it('no changes', () => {
    const database = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000
    }
    const disk = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000
    }
    const res = delta.calculate(database, disk)
    should(res).eql({
      unchanged: ['IMG_0001', 'IMG_0002'],
      added: [],
      modified: [],
      deleted: []
    })
  })
  it('no changes within a second', () => {
    const database = {
      'IMG_0001': 1410000001000,
      'IMG_0002': 1420000001000
    }
    const disk = {
      'IMG_0001': 1410000001500, // 500ms later
      'IMG_0002': 1420000000500 // 500ms earlier
    }
    const res = delta.calculate(database, disk)
    should(res).eql({
      unchanged: ['IMG_0001', 'IMG_0002'],
      added: [],
      modified: [],
      deleted: []
    })
  })
  it('new files', () => {
    const database = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000
    }
    const disk = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000,
      'IMG_0003': 1430000000000
    }
    const res = delta.calculate(database, disk)
    should(res).eql({
      unchanged: ['IMG_0001', 'IMG_0002'],
      added: ['IMG_0003'],
      modified: [],
      deleted: []
    })
  })
  it('deleted files', () => {
    const database = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000
    }
    const disk = {
      'IMG_0001': 1410000000000
    }
    const res = delta.calculate(database, disk)
    should(res).eql({
      unchanged: ['IMG_0001'],
      added: [],
      modified: [],
      deleted: ['IMG_0002']
    })
  })
  it('modified files', () => {
    const database = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000
    }
    const disk = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000002000
    }
    const res = delta.calculate(database, disk)
    should(res).eql({
      unchanged: ['IMG_0001'],
      added: [],
      modified: ['IMG_0002'],
      deleted: []
    })
  })
  it('all cases', () => {
    const database = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000,
      'IMG_0003': 1430000000000
    }
    const disk = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000002000,
      'IMG_0004': 1445000000000
    }
    const res = delta.calculate(database, disk)
    should(res).eql({
      unchanged: ['IMG_0001'],
      added: ['IMG_0004'],
      modified: ['IMG_0002'],
      deleted: ['IMG_0003']
    })
  })
})

@@ -0,0 +1,24 @@
const path = require('path')
const glob = require('../../../src/components/index/glob')
const should = require('should/as-function')

describe('Index: glob', function () {
  this.slow(1000)
  this.timeout(1000)
  it('finds all files', (done) => {
    const fixtures = path.join(__dirname, '..', '..', '..', 'fixtures')
    glob.find(fixtures, (err, map) => {
      if (err) {
        return done(err)
      }
      const keys = Object.keys(map).sort()
      should(keys).eql([
        'holidays/beach.jpg',
        'holidays/tower.jpg',
        'home/desk.jpg',
        'home/fruit.jpg'
      ])
      done()
    })
  })
})

@@ -0,0 +1,29 @@
const Index = require('../../../src/components/index/index')
const path = require('path')
const should = require('should/as-function')

describe('Index', function () {
  this.slow(1000)
  this.timeout(1000)
  it('indexes the fixtures', (done) => {
    const index = new Index('thumbsup.db')
    const fixtures = path.join(__dirname, '..', '..', '..', 'fixtures')
    const emitter = index.update(fixtures)
    const emitted = []
    emitter.on('file', meta => emitted.push(meta))
    emitter.on('done', () => {
      // check all files were indexed
      const paths = emitted.map(e => e.path).sort()
      should(paths).eql([
        'holidays/beach.jpg',
        'holidays/tower.jpg',
        'home/desk.jpg',
        'home/fruit.jpg'
      ])
      // check the path matches the SourceFile property
      const sourceFiles = emitted.map(e => e.metadata.SourceFile).sort()
      should(paths).eql(sourceFiles)
      done()
    })
  })
})