Major optimisations: SQLite index + faster disk glob + new exiftool streaming

1. Move from a JSON index to a SQLite database.
  - This allows indexing to be interrupted and resumed
  - Updating the index consumes less RAM than loading and saving an entire JSON object
  - Loading the index consumes less RAM since rows can be streamed, extracting only the properties we need (instead of loading all EXIF data into memory, only to discard most of it later)
  - These changes make a big difference when processing 10,000+ photos (see the sketch after this list)

2. Switch from <glob> to a manual <readdir>
  - <glob> could consume hundreds of MB (or even GB) of RAM when asked to find several thousand files
  - A manual traversal with the <micromatch> library does the same job in a fraction of the time and memory

3. Exiftool optimisations
  - Run one exiftool process per CPU core, still in batch mode (the files to be read are divided into one bucket per core)
  - Stream the exiftool output instead of buffering it all in memory
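
To illustrate point 1, here is a minimal sketch of the resumable upsert pattern, using the better-sqlite3 API and the `files` schema introduced in this commit (the file name and values are illustrative):

```js
const Database = require('better-sqlite3')

const db = new Database('thumbsup.db')
db.exec('CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, timestamp INTEGER, metadata BLOB)')

// each file is upserted as soon as exiftool returns its metadata,
// so an interrupted indexing run can be restarted without losing progress
const upsert = db.prepare('INSERT OR REPLACE INTO files VALUES (?, ?, ?)')
upsert.run('holidays/IMG_5831.jpg', Date.now(), JSON.stringify({ /* exiftool output */ }))

// reading back streams one row at a time, instead of loading the whole index in memory
for (const row of db.prepare('SELECT path, timestamp FROM files').iterate()) {
  console.log(row.path, new Date(row.timestamp))
}
```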
Branch: exif-summary
Author: Romain, 7 years ago
Parent: bfdebe2c0f
Commit: 24b2f9bd7c

.gitignore

@@ -3,3 +3,4 @@ test-integration/output-actual
test-integration/output-expected/public
test-integration/output-expected/metadata.json
.DS_Store
thumbsup.db

Four binary image files added (not shown): 8.8 KiB, 8.4 KiB, 8.6 KiB and 9.5 KiB.

@@ -27,18 +27,23 @@
  },
  "dependencies": {
    "async": "^2.5.0",
    "better-sqlite3": "^4.0.3",
    "debug": "^3.1.0",
    "event-stream": "^3.3.4",
    "exiftool-json-db": "~1.0.0",
    "fs-extra": "^4.0.1",
    "fs-readdir-recursive": "^1.0.0",
    "JSONStream": "^1.3.1",
    "handlebars": "~4.0.5",
    "ini": "^1.3.4",
    "less": "^2.7.1",
    "lightgallery": "1.2.14",
    "listr": "^0.12.0",
    "listr-work-queue": "https://github.com/thumbsup/listr-work-queue.git",
    "lodash": "^4.16.6",
    "moment": "^2.16.0",
    "lodash": "^4.17.4",
    "micromatch": "^3.1.4",
    "moment": "^2.19.2",
    "readdir-enhanced": "^2.0.0",
    "thumbsup-downsize": "^1.0.0",
    "url-join": "^2.0.2",
    "video.js": "^6.2.8",

@@ -0,0 +1,68 @@
# exiftool-batch
> Process a batch of files through [exiftool](http://www.sno.phy.queensu.ca/~phil/exiftool/) and stream the results
## Usage
```js
const exiftool = require('./exiftool-batch/parallel')
const stream = exiftool.parse('./photos', [
  'IMG_000001.jpg',
  'IMG_000002.jpg',
  'IMG_000003.jpg'
])
stream.on('data', entry => console.log(`Processed ${entry.SourceFile}`))
stream.on('end', () => console.log('Finished'))
```
Each stream entry is an object in the following format:
```js
{
  SourceFile: 'NewYork/IMG_5364.jpg',
  File: {
    FileSize: '449 kB',
    MIMEType: 'image/jpeg',
    /* ... */
  },
  EXIF: {
    Orientation: 'Horizontal (normal)',
    DateTimeOriginal: '2017:01:07 13:59:56',
    /* ... */
  },
  Composite: {
    GPSLatitude: '+51.5285578',
    GPSLongitude: -0.2420248,
    /* ... */
  }
}
```
## Stream structure
Some notes on the structure:
- the `SourceFile` property is a path relative to the root folder given to `parse()`
- the format is identical to the raw JSON `exiftool` output
  * the names of the groups and tags are exactly as documented at http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/index.html
  * it doesn't try to parse date strings, and doesn't assume timezones when they are absent
  * it doesn't fix GPS format oddities, like `-10.000` (number) vs `"+10.000"` (string)
  * it doesn't merge similar fields together, like `EXIF:ImageDescription` and `IPTC:Caption-Abstract`
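
Since the output is deliberately left raw, a consumer that wants uniform types has to normalise values itself. As a minimal sketch (the `toNumber` helper is illustrative, not part of this module), the GPS oddity above could be smoothed over like this:

```js
// GPS values may arrive as numbers (-0.2420248) or signed strings ('+51.5285578')
const toNumber = value => (typeof value === 'string' ? parseFloat(value) : value)

stream.on('data', entry => {
  const composite = entry.Composite || {}
  const lat = toNumber(composite.GPSLatitude)
  const long = toNumber(composite.GPSLongitude)
  console.log(`${entry.SourceFile}: ${lat}, ${long}`)
})
```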
## Performance
- uses `exiftool` in batch mode, instead of spawning one instance per file
- runs one `exiftool` process per core to speed up parsing

The following stats were captured while processing 10,000 photos stored on an SSD drive:

| Metric | Value |
|--------|-------|
| Total time | 30 sec |
| Peak throughput | 300 photos / sec |

| Process | CPU (4 cores) | RAM |
|---------|---------------|-----|
| Node | 25% | 70 MB |
| Exiftool | 85% per instance | 20 MB per instance |
| Total | 365% | 150 MB |

@@ -0,0 +1,20 @@
const _ = require('lodash')
const os = require('os')
const es = require('event-stream')
const exiftool = require('./stream.js')

/*
  Fans out the list of files to multiple exiftool processes (= CPU count)
  Returns a single stream of javascript objects, parsed from the JSON response
*/
exports.parse = (rootFolder, filePaths) => {
  // create several buckets of work
  const workers = os.cpus().length
  const buckets = _.chunk(filePaths, Math.ceil(filePaths.length / workers))
  // create several <exiftool> streams that can work in parallel
  const streams = _.range(buckets.length).map(i => {
    return exiftool.parse(rootFolder, buckets[i])
  })
  // merge the object streams
  return es.merge(streams)
}

@@ -0,0 +1,48 @@
const childProcess = require('child_process')
const debug = require('debug')('exiftool-stream')
const es = require('event-stream')
const JSONStream = require('JSONStream')

/*
  Spawns a single <exiftool> process and sends it all the files to be parsed
  Returns a stream which emits JS objects as they get returned
*/
exports.parse = (rootFolder, filePaths) => {
  const args = [
    '-a', // include duplicate tags
    '-s', // print tag names instead of descriptions
    '-g', // include group names, as nested structures
    '-c', // specify the format for GPS lat/long
    '%+.6f', // lat/long = signed floats
    '-struct', // preserve XMP structure
    '-json', // JSON output
    '-@', // specify more arguments separately
    '-' // read arguments from standard in
  ]
  // create a new <exiftool> child process
  const child = childProcess.spawn('exiftool', args, {
    cwd: rootFolder,
    stdio: [ 'pipe', 'pipe', 'ignore' ]
  })
  child.on('error', (err) => {
    debug('Error: please verify that <exiftool> is installed on your system')
    debug(err.toString())
  })
  child.on('close', (code, signal) => {
    debug(`Exiftool exited with code ${code}`)
  })
  // write all the files to <stdin>
  // exiftool will only start processing after <stdin> is closed
  const allFiles = filePaths.join('\n')
  child.stdin.write(allFiles + '\n')
  child.stdin.end()
  // stream <stdout> into a JSON parser,
  // emitting every top-level object on the stream
  return es.pipeline(
    child.stdout,
    JSONStream.parse([true])
  )
}

@@ -0,0 +1,103 @@
# index
> Index a folder of photos and videos, and cache all their metadata for fast retrieval.
## Usage
```js
const Index = require('./index/index')

// index the contents of a folder
const index = new Index('thumbsup.db')
const emitter = index.update('/Volumes/photos')

// indexing stats
// this happens before any new files are parsed for metadata
emitter.on('stats', stats => {
  assert(stats, {
    total: 14710,
    unchanged: 14561,
    added: 135,
    modified: 14,
    deleted: 2
  })
})

// parsing progress, emitted for every new or modified file
emitter.on('progress', progress => {
  assert(progress, {
    path: 'holidays/IMG_5725.jpg',
    processed: 17,
    total: 149
  })
})

// every file in the index, emitted one by one after all parsing is done
// so you can process them as needed, without accumulating
// a large array of metadata in memory if you don't need it
emitter.on('file', file => {
  assert(file, {
    path: 'holidays/IMG_5831.jpg',
    timestamp: new Date(1510410524753),
    metadata: { /* parsed metadata */ }
  })
})

// finished
emitter.on('done', () => {
  console.log('Finished indexing')
})
```
## Metadata format
The `metadata` object contains all metadata embedded inside the image or video files, in the raw `exiftool` format.
See the [exiftool component](../exiftool) for more detail.
For example:
```js
{
  "SourceFile": "NewYork/IMG_5364.jpg",
  "File": {
    "FileSize": "449 kB",
    "MIMEType": "image/jpeg",
    /* ... */
  },
  "EXIF": {
    "Orientation": "Horizontal (normal)",
    "DateTimeOriginal": "2017:01:07 13:59:56",
    /* ... */
  },
  "Composite": {
    "GPSLatitude": "+51.5285578",
    "GPSLongitude": -0.2420248,
    /* ... */
  }
}
```
## Database size
The SQLite database grows proportionally with the number of files indexed.
However, it doesn't shrink automatically when files are deleted.
If needed, you can compact the database periodically using:
```js
const index = new Index('thumbsup.db')
index.vacuum()
```
## Technical design
The index's role is to:
- keep a local [SQLite](https://sqlite.org) database of known files and their metadata
- compare it with the current files on disk
- process any new/updated files so the database is up to date

The SQLite database includes a `files` table with the following columns:
- `path`: the path to every photo & video file, relative to the source folder
- `timestamp`: the file modification date (milliseconds since Epoch)
- `metadata`: the file's raw metadata, stored as a blob (JSON format)
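
For reference, a sketch of the corresponding SQL (this mirrors the `CREATE TABLE` statement used internally, with one comment per column):

```js
db.exec(`CREATE TABLE IF NOT EXISTS files (
  path TEXT PRIMARY KEY, -- relative path to the photo or video
  timestamp INTEGER,     -- file modification time, in ms since Epoch
  metadata BLOB          -- raw exiftool metadata, stored as JSON
)`)
```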

@@ -0,0 +1,33 @@
const _ = require('lodash')

/*
  Calculates the difference between the files on disk and those already indexed
  - databaseMap = hashmap of {path: timestamp}
  - diskMap = hashmap of {path: timestamp}
*/
exports.calculate = (databaseMap, diskMap) => {
  const delta = {
    unchanged: [],
    added: [],
    modified: [],
    deleted: []
  }
  _.each(databaseMap, (dbTime, dbPath) => {
    if (diskMap.hasOwnProperty(dbPath)) {
      // only consider a file modified if the timestamps differ by more than 1 second
      const modified = Math.abs(dbTime - diskMap[dbPath]) > 1000
      if (modified) {
        delta.modified.push(dbPath)
      } else {
        delta.unchanged.push(dbPath)
      }
    } else {
      delta.deleted.push(dbPath)
    }
  })
  _.each(diskMap, (diskTime, diskPath) => {
    if (!databaseMap.hasOwnProperty(diskPath)) {
      delta.added.push(diskPath)
    }
  })
  return delta
}

@@ -0,0 +1,27 @@
const micromatch = require('micromatch')
const readdir = require('readdir-enhanced')

const PHOTO_EXT = ['bmp', 'gif', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'webp']
const VIDEO_EXT = ['3gp', 'flv', 'm2ts', 'mkv', 'mp4', 'mov', 'mts', 'ogg', 'ogv', 'webm']
const MEDIA_GLOB = '**/*.{' + PHOTO_EXT.join(',') + ',' + VIDEO_EXT.join(',') + '}'

/*
  Returns a hashmap of {path: timestamp}
  for all the media files in the target folder
*/
exports.find = function (rootFolder, callback) {
  const entries = {}
  const stream = readdir.readdirStreamStat(rootFolder, {
    filter: entry => micromatch(entry.path, MEDIA_GLOB).length !== 0,
    deep: stats => stats.path.charAt(0) !== '.', // don't descend into hidden folders
    basePath: '',
    sep: '/'
  })
  stream.on('data', stats => { entries[stats.path] = stats.mtime.getTime() })
  stream.on('error', err => isEnoentCodeError(err) ? callback(null, {}) : callback(err))
  stream.on('end', () => callback(null, entries))
}

function isEnoentCodeError (err) {
  return err.code === 'ENOENT'
}

@@ -0,0 +1,104 @@
const _ = require('lodash')
const Database = require('better-sqlite3')
const delta = require('./delta')
const EventEmitter = require('events')
const exiftool = require('../exiftool/parallel')
const fs = require('fs-extra')
const globber = require('./glob')
const moment = require('moment')
const path = require('path')

const EXIF_DATE_FORMAT = 'YYYY:MM:DD HH:mm:ssZ'

class Index {
  constructor (indexPath) {
    // create the database if it doesn't exist
    fs.mkdirpSync(path.dirname(indexPath))
    this.db = new Database(indexPath, {})
    this.db.exec('CREATE TABLE IF NOT EXISTS files (path TEXT PRIMARY KEY, timestamp INTEGER, metadata BLOB)')
  }

  /*
    Indexes all the files in <mediaFolder>, and stores the results in the database
  */
  update (mediaFolder) {
    // will emit many different events
    const emitter = new EventEmitter()
    // prepared database statements
    const selectStatement = this.db.prepare('SELECT path, timestamp FROM files')
    const insertStatement = this.db.prepare('INSERT OR REPLACE INTO files VALUES (?, ?, ?)')
    const deleteStatement = this.db.prepare('DELETE FROM files WHERE path = ?')
    const countStatement = this.db.prepare('SELECT COUNT(*) AS count FROM files')
    const selectMetadata = this.db.prepare('SELECT * FROM files')
    // create a hashmap of all files in the database
    const databaseMap = {}
    for (const row of selectStatement.iterate()) {
      databaseMap[row.path] = row.timestamp
    }
    function finished () {
      // emit every file in the index
      for (const row of selectMetadata.iterate()) {
        emitter.emit('file', {
          path: row.path,
          timestamp: new Date(row.timestamp),
          metadata: JSON.parse(row.metadata)
        })
      }
      // emit the final count
      const result = countStatement.get()
      emitter.emit('done', {count: result.count})
    }
    // find all files on disk
    globber.find(mediaFolder, (err, diskMap) => {
      if (err) return console.error('error', err)
      // calculate the difference: which files have been added, modified, etc
      const deltaFiles = delta.calculate(databaseMap, diskMap)
      emitter.emit('stats', {
        unchanged: deltaFiles.unchanged.length,
        added: deltaFiles.added.length,
        modified: deltaFiles.modified.length,
        deleted: deltaFiles.deleted.length,
        total: Object.keys(diskMap).length
      })
      // remove deleted files from the DB
      _.each(deltaFiles.deleted, path => {
        deleteStatement.run(path)
      })
      // check if any files need parsing
      var processed = 0
      const toProcess = _.union(deltaFiles.added, deltaFiles.modified)
      if (toProcess.length === 0) {
        return finished()
      }
      // call <exiftool> on added and modified files
      // and write each entry to the database
      const stream = exiftool.parse(mediaFolder, toProcess)
      stream.on('data', entry => {
        const timestamp = moment(entry.File.FileModifyDate, EXIF_DATE_FORMAT).valueOf()
        insertStatement.run(entry.SourceFile, timestamp, JSON.stringify(entry))
        ++processed
        emitter.emit('progress', {path: entry.SourceFile, processed: processed, total: toProcess.length})
      }).on('end', finished)
    })
    return emitter
  }

  /*
    Runs a full vacuum to optimise the database,
    which can be useful if files are often deleted or modified
  */
  vacuum () {
    this.db.exec('VACUUM')
  }
}

module.exports = Index

@@ -1,5 +1,4 @@
const fs = require('fs-extra')
const path = require('path')
const Listr = require('listr')
const steps = require('./steps/index')
const summary = require('./steps/summary')
@@ -8,28 +7,19 @@ const website = require('./website/website')
exports.build = function (opts) {
  const tasks = new Listr([
    {
      title: 'Updating database',
      title: 'Indexing folder',
      task: (ctx, task) => {
        // returns an observable which will complete when the database is loaded
        fs.mkdirpSync(opts.output)
        const databaseFile = path.join(opts.output, 'metadata.json')
        return steps.database(opts.input, databaseFile, (err, res) => {
        return steps.index(opts, (err, files, album) => {
          if (!err) {
            ctx.database = res.database
            ctx.files = files
            ctx.album = album
          }
        })
      }
    },
    {
      title: 'Creating model',
      task: (ctx) => {
        const res = steps.model(ctx.database, opts)
        ctx.files = res.files
        ctx.album = res.album
      }
    },
    {
      title: 'Processing media',
      title: 'Resizing media',
      task: (ctx, task) => {
        return steps.process(ctx.files, opts, task)
      }
@@ -1,5 +1,6 @@
exports.database = require('./step-database').run
exports.model = require('./step-model').run
exports.index = require('./step-index').run
// exports.model = require('./step-model').run
exports.process = require('./step-process').run
exports.cleanup = require('./step-cleanup').run
// exports.website = require('./website')

@@ -0,0 +1,46 @@
/*
--------------------------------------------------------------------------------
Provides most metadata based on the output of <exiftool>
Caches the results in a SQLite database (<thumbsup.db>) for faster re-runs
--------------------------------------------------------------------------------
*/
const hierarchy = require('../input/hierarchy.js')
const Index = require('../components/index/index')
const mapper = require('../input/mapper')
const Metadata = require('../model/metadata')
const File = require('../model/file')
const Observable = require('zen-observable')
const path = require('path')
const Picasa = require('../input/picasa')

exports.run = function (opts, callback) {
  return new Observable(observer => {
    const picasaReader = new Picasa()
    const index = new Index(path.join(opts.output, 'thumbsup.db'))
    const emitter = index.update(opts.input)
    const files = []
    emitter.on('progress', stats => {
      observer.next(`Indexing ${stats.processed}/${stats.total}`)
    })
    emitter.on('file', file => {
      const picasa = picasaReader.file(file.metadata.SourceFile)
      const meta = new Metadata(file.metadata, picasa || {})
      const model = new File(file.metadata, meta, opts)
      files.push(model)
    })
    emitter.on('done', stats => {
      const albumMapper = mapper.create(opts)
      const album = hierarchy.createAlbums(files, albumMapper, opts)
      callback(null, files, album)
      observer.complete()
    })
  })
}

@@ -0,0 +1,27 @@
const exiftool = require('../../../src/components/exiftool/parallel')
const path = require('path')
const readdir = require('readdir-enhanced')
const should = require('should/as-function')

// Find all test photos
const folder = path.join(__dirname, '..', '..', '..', 'fixtures')
const files = readdir.sync(folder, {
  filter: stats => stats.isFile() && stats.path.charAt(0) !== '.',
  deep: true
})

describe('exiftool', () => {
  it('processes all files', (done) => {
    const processed = []
    const stream = exiftool.parse(folder, files)
    stream.on('data', entry => {
      processed.push(entry.SourceFile)
    })
    .on('end', () => {
      files.sort()
      processed.sort()
      should(processed).eql(files)
      done()
    })
  })
})

@@ -0,0 +1,114 @@
const delta = require('../../../src/components/index/delta')
const should = require('should/as-function')

describe('Index: delta', () => {
  it('no changes', () => {
    const database = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000
    }
    const disk = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000
    }
    const res = delta.calculate(database, disk)
    should(res).eql({
      unchanged: ['IMG_0001', 'IMG_0002'],
      added: [],
      modified: [],
      deleted: []
    })
  })
  it('no changes within a second', () => {
    const database = {
      'IMG_0001': 1410000001000,
      'IMG_0002': 1420000001000
    }
    const disk = {
      'IMG_0001': 1410000001500, // 500ms later
      'IMG_0002': 1420000000500 // 500ms earlier
    }
    const res = delta.calculate(database, disk)
    should(res).eql({
      unchanged: ['IMG_0001', 'IMG_0002'],
      added: [],
      modified: [],
      deleted: []
    })
  })
  it('new files', () => {
    const database = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000
    }
    const disk = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000,
      'IMG_0003': 1430000000000
    }
    const res = delta.calculate(database, disk)
    should(res).eql({
      unchanged: ['IMG_0001', 'IMG_0002'],
      added: ['IMG_0003'],
      modified: [],
      deleted: []
    })
  })
  it('deleted files', () => {
    const database = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000
    }
    const disk = {
      'IMG_0001': 1410000000000
    }
    const res = delta.calculate(database, disk)
    should(res).eql({
      unchanged: ['IMG_0001'],
      added: [],
      modified: [],
      deleted: ['IMG_0002']
    })
  })
  it('modified files', () => {
    const database = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000
    }
    const disk = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000002000
    }
    const res = delta.calculate(database, disk)
    should(res).eql({
      unchanged: ['IMG_0001'],
      added: [],
      modified: ['IMG_0002'],
      deleted: []
    })
  })
  it('all cases', () => {
    const database = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000000000,
      'IMG_0003': 1430000000000
    }
    const disk = {
      'IMG_0001': 1410000000000,
      'IMG_0002': 1420000002000,
      'IMG_0004': 1445000000000
    }
    const res = delta.calculate(database, disk)
    should(res).eql({
      unchanged: ['IMG_0001'],
      added: ['IMG_0004'],
      modified: ['IMG_0002'],
      deleted: ['IMG_0003']
    })
  })
})

@@ -0,0 +1,24 @@
const path = require('path')
const glob = require('../../../src/components/index/glob')
const should = require('should/as-function')

describe('Index: glob', function () {
  this.slow(1000)
  this.timeout(1000)
  it('finds all files', (done) => {
    const fixtures = path.join(__dirname, '..', '..', '..', 'fixtures')
    glob.find(fixtures, (err, map) => {
      if (err) {
        return done(err)
      }
      const keys = Object.keys(map).sort()
      should(keys).eql([
        'holidays/beach.jpg',
        'holidays/tower.jpg',
        'home/desk.jpg',
        'home/fruit.jpg'
      ])
      done()
    })
  })
})

@@ -0,0 +1,29 @@
const Index = require('../../../src/components/index/index')
const path = require('path')
const should = require('should/as-function')

describe('Index', function () {
  this.slow(1000)
  this.timeout(1000)
  it('indexes the fixtures', (done) => {
    const index = new Index('thumbsup.db')
    const fixtures = path.join(__dirname, '..', '..', '..', 'fixtures')
    const emitter = index.update(fixtures)
    const emitted = []
    emitter.on('file', meta => emitted.push(meta))
    emitter.on('done', () => {
      // check all files were indexed
      const paths = emitted.map(e => e.path).sort()
      should(paths).eql([
        'holidays/beach.jpg',
        'holidays/tower.jpg',
        'home/desk.jpg',
        'home/fruit.jpg'
      ])
      // check the path matches the SourceFile property
      const sourceFiles = emitted.map(e => e.metadata.SourceFile).sort()
      should(paths).eql(sourceFiles)
      done()
    })
  })
})