wip recursive parse node tree

This commit is contained in:
Chakib Ben Ziane 2017-11-30 16:08:12 +01:00
parent e1d281de99
commit 2ff96e1509
6 changed files with 157 additions and 59 deletions

2
api.go
View File

@ -20,7 +20,7 @@ func getBookmarks(c *gin.Context) {
err = rows.Scan(&bookmark.URL, &bookmark.Metadata, &tags) err = rows.Scan(&bookmark.URL, &bookmark.Metadata, &tags)
logPanic(err) logPanic(err)
bookmark.Tags = strings.Split(tags, " ") bookmark.Tags = strings.Split(tags, TagJoinSep)
//log.Debugf("GET %s", tags) //log.Debugf("GET %s", tags)
//log.Debugf("%v", bookmark) //log.Debugf("%v", bookmark)

View File

@ -10,7 +10,7 @@ type Bookmark struct {
Metadata string `json:"metadata"` Metadata string `json:"metadata"`
Tags []string `json:"tags"` Tags []string `json:"tags"`
Desc string `json:"desc"` Desc string `json:"desc"`
Node Node Node Node `json:"-"`
//flags int //flags int
} }
@ -25,7 +25,7 @@ func (bk *Bookmark) add(db *DB) {
logError(err) logError(err)
defer stmt.Close() defer stmt.Close()
_, err = stmt.Exec(bk.URL, bk.Metadata, strings.Join(bk.Tags, " "), "", 0) _, err = stmt.Exec(bk.URL, bk.Metadata, strings.Join(bk.Tags, TagJoinSep), "", 0)
sqlErrorMsg(err, bk.URL) sqlErrorMsg(err, bk.URL)
err = tx.Commit() err = tx.Commit()

View File

@ -5,6 +5,7 @@ import (
"path" "path"
"github.com/fsnotify/fsnotify" "github.com/fsnotify/fsnotify"
"github.com/sp4ke/hashmap"
) )
type BrowserType uint8 type BrowserType uint8
@ -27,6 +28,7 @@ var Chrome = struct {
type IBrowser interface { type IBrowser interface {
IWatchable IWatchable
InitBuffer() // init buffer db, should be deferred to close after call InitBuffer() // init buffer db, should be deferred to close after call
InitIndex() // Creates in memory Index
RegisterHooks(...ParseHook) RegisterHooks(...ParseHook)
Load() // Loads bookmarks to db without watching Load() // Loads bookmarks to db without watching
//Parse(...ParseHook) // Main parsing method with different parsing hooks //Parse(...ParseHook) // Main parsing method with different parsing hooks
@ -42,6 +44,9 @@ type BaseBrowser struct {
baseDir string baseDir string
bkFile string bkFile string
bufferDB *DB bufferDB *DB
URLIndex *hashmap.RBTree
nodeTree *Node
cNode *Node //current node
stats *ParserStats stats *ParserStats
bType BrowserType bType BrowserType
name string name string
@ -79,6 +84,10 @@ func (bw *BaseBrowser) Close() {
logPanic(err) logPanic(err)
} }
func (b *BaseBrowser) InitIndex() {
b.URLIndex = NewIndex()
}
func (b *BaseBrowser) InitBuffer() { func (b *BaseBrowser) InitBuffer() {
bufferName := fmt.Sprintf("buffer_%s", b.name) bufferName := fmt.Sprintf("buffer_%s", b.name)
@ -98,8 +107,8 @@ func (b *BaseBrowser) RegisterHooks(hooks ...ParseHook) {
} }
// Runs browser-defined hooks on bookmark // Runs browser-defined hooks on bookmark
func (b *BaseBrowser) RunParseHooks(bk *Bookmark) { func (b *BaseBrowser) RunParseHooks(node *Node) {
for _, hook := range b.parseHooks { for _, hook := range b.parseHooks {
hook(bk) hook(node)
} }
} }

130
chrome.go
View File

@ -4,6 +4,7 @@ import (
"io/ioutil" "io/ioutil"
"path" "path"
"github.com/OneOfOne/xxhash"
"github.com/buger/jsonparser" "github.com/buger/jsonparser"
) )
@ -29,6 +30,8 @@ func NewChromeBrowser() IBrowser {
browser.baseDir = Chrome.BookmarkDir browser.baseDir = Chrome.BookmarkDir
browser.bkFile = Chrome.BookmarkFile browser.bkFile = Chrome.BookmarkFile
browser.stats = &ParserStats{} browser.stats = &ParserStats{}
browser.nodeTree = &Node{Name: "root", Parent: nil}
browser.cNode = browser.nodeTree
browser.SetupWatcher() browser.SetupWatcher()
@ -47,6 +50,8 @@ func (bw *ChromeBrowser) Watch() bool {
func (bw *ChromeBrowser) Load() { func (bw *ChromeBrowser) Load() {
bw.InitIndex()
// Check if cache is initialized // Check if cache is initialized
if cacheDB == nil || cacheDB.handle == nil { if cacheDB == nil || cacheDB.handle == nil {
log.Critical("cache is not yet initialized !") log.Critical("cache is not yet initialized !")
@ -84,21 +89,22 @@ func (bw *ChromeBrowser) Run() {
gJsonParseRecursive(nil, childVal, dataType, offset) gJsonParseRecursive(nil, childVal, dataType, offset)
} }
rootsNode := new(Node)
currentNode := rootsNode
gJsonParseRecursive = func(key []byte, node []byte, dataType jsonparser.ValueType, offset int) error { gJsonParseRecursive = func(key []byte, node []byte, dataType jsonparser.ValueType, offset int) error {
// Core of google chrome bookmark parsing // Core of google chrome bookmark parsing
// Any loading to local db is done here // Any loading to local db is done here
bw.stats.currentNodeCount++ bw.stats.currentNodeCount++
parentNode := currentNode //log.Debugf("moving current node %v as parent", currentNode.Name)
currentNode := new(Node) currentNode := new(Node)
currentNode.Parent = parentNode
var nodeType, children []byte currentNode.Parent = bw.cNode
bw.cNode.Children = append(bw.cNode.Children, currentNode)
bw.cNode = currentNode
var nodeType, nodeName, nodeURL, children []byte
var childrenType jsonparser.ValueType var childrenType jsonparser.ValueType
bookmark := &Bookmark{}
//log.Debugf("parent %v", parentNode)
// Paths to lookup in node payload // Paths to lookup in node payload
paths := [][]string{ paths := [][]string{
@ -112,49 +118,101 @@ func (bw *ChromeBrowser) Run() {
switch idx { switch idx {
case 0: case 0:
nodeType = value nodeType = value
currentNode.Type = _s(value) //currentNode.Type = _s(value)
case 1: // name or title case 1: // name or title
currentNode.Name = _s(value) //currentNode.Name = _s(value)
nodeName = value
case 2: case 2:
currentNode.URL = _s(value) //currentNode.URL = _s(value)
nodeURL = value
case 3: case 3:
children, childrenType = value, vt children, childrenType = value, vt
} }
}, paths...) }, paths...)
bookmark.Metadata = currentNode.Name log.Debugf("parsing node %s", nodeName)
bookmark.URL = currentNode.URL
// If node type is string ignore (needed for sync_transaction_version) // If node type is string ignore (needed for sync_transaction_version)
if dataType == jsonparser.String { if dataType == jsonparser.String {
return nil return nil
} }
// if node is url(leaf), handle the url
if _s(nodeType) == jsonNodeTypes.URL {
// Add bookmark to db here
//debugPrint("%s", url)
//debugPrint("%s", node)
// Find tags in title
//findTagsInTitle(name)
bw.stats.currentUrlCount++
// Run parsehoos before adding bookmark
bw.RunParseHooks(bookmark)
// Add bookmark
bookmark.add(bw.bufferDB)
}
parentNode.Children = append(parentNode.Children, currentNode)
// if node is a folder with children // if node is a folder with children
if childrenType == jsonparser.Array && len(children) > 2 { // if len(children) > len("[]") if childrenType == jsonparser.Array && len(children) > 2 { // if len(children) > len("[]")
jsonparser.ArrayEach(node, parseChildren, jsonNodePaths.Children) jsonparser.ArrayEach(node, parseChildren, jsonNodePaths.Children)
// Finished parsing all children
// Add them into current node ?
} }
currentNode.Type = _s(nodeType)
currentNode.Name = _s(nodeName)
// if node is url(leaf), handle the url
if _s(nodeType) == jsonNodeTypes.URL {
currentNode.URL = _s(nodeURL)
bw.stats.currentUrlCount++
// Check if url-node already in index
var nodeVal *Node
iVal, found := bw.URLIndex.Get(currentNode.URL)
nameHash := xxhash.ChecksumString64(currentNode.Name)
// If node url not in index, add it to index
if !found {
//log.Debugf("Not found")
// store hash(name)
currentNode.NameHash = nameHash
// The value in the index will be a
// pointer to currentNode
//log.Debugf("Inserting url %s to index", nodeURL)
bw.URLIndex.Insert(currentNode.URL, currentNode)
// If we find the node already in index
// we check if the hash(name) changed meaning
// the data changed
} else {
//log.Debugf("Found")
nodeVal = iVal.(*Node)
// hash(name) is different, we will update the
// index and parse the bookmark
if nodeVal.NameHash != nameHash {
// Update node in index
currentNode.NameHash = nameHash
if currentNode.NameHash != nodeVal.NameHash {
panic("currentNode.NameHash != indexValue.NameHash")
}
// Run parse hooks on node
bw.RunParseHooks(currentNode)
}
// Else we do nothing, the node will not
// change
}
// If parent is folder, add it as tag and add current node as child
// And add this link as child
if currentNode.Parent.Type == jsonNodeTypes.Folder {
log.Debug("Parent is folder, parsing as tag ...")
currentNode.Tags = append(currentNode.Tags, currentNode.Parent.Name)
}
}
//log.Debugf("Adding current node %v to parent %v", currentNode.Name, parentNode)
//parentNode.Children = append(parentNode.Children, currentNode)
//currentNode.Parent = parentNode
return nil return nil
} }
@ -162,12 +220,12 @@ func (bw *ChromeBrowser) Run() {
// Begin parsing // Begin parsing
rootsData, _, _, _ := jsonparser.Get(f, "roots") rootsData, _, _, _ := jsonparser.Get(f, "roots")
log.Debug("loading bookmarks to bufferdb") log.Debug("loading bookmarks to index")
// Load bookmarks to currentJobDB
jsonparser.ObjectEach(rootsData, gJsonParseRecursive) jsonparser.ObjectEach(rootsData, gJsonParseRecursive)
// Debug walk tree // Debug walk tree
//go WalkNode(rootsNode) go WalkNode(bw.nodeTree)
// Finished parsing // Finished parsing
log.Debugf("parsed %d bookmarks", bw.stats.currentUrlCount) log.Debugf("parsed %d bookmarks", bw.stats.currentUrlCount)
@ -178,7 +236,9 @@ func (bw *ChromeBrowser) Run() {
bw.stats.currentNodeCount = 0 bw.stats.currentNodeCount = 0
bw.stats.currentUrlCount = 0 bw.stats.currentUrlCount = 0
// Compare currentDb with memCacheDb for new bookmarks // Compare currentDb with index for new bookmarks
log.Debug("TODO: Compare cacheDB with index")
// If cacheDB is empty just copy bufferDB to cacheDB // If cacheDB is empty just copy bufferDB to cacheDB
// until local db is already populated and preloaded // until local db is already populated and preloaded

26
index.go Normal file
View File

@ -0,0 +1,26 @@
package main
import (
"github.com/OneOfOne/xxhash"
"github.com/sp4ke/hashmap"
)
// In memory index used for fast lookup of url-title(tags) pairs
// to quickly detect bookmark which changed when bookmarks are reloaded
// from browser on a watch event
// Input `in` must be of type string
// The index is a map of [urlhash]*Node
// xxHashFunc is the hash function backing the in-memory URL index.
// Index keys are url strings; it panics on any other dynamic type,
// since a non-string key is a programming error, not a runtime condition.
func xxHashFunc(in interface{}) uint64 {
	input, ok := in.(string)
	if !ok {
		// Fixed typo in the message ("exptected" -> "expected").
		log.Panicf("wrong data type to hash, expected string, given %T", in)
	}
	sum := xxhash.ChecksumString64(input)
	//log.Debugf("Calculating hash of %s as %d", input, sum)
	return sum
}
// NewIndex creates the in-memory URL index: a red-black tree whose
// keys are url strings hashed with xxHashFunc.
func NewIndex() *hashmap.RBTree {
	index := hashmap.New(xxHashFunc)
	return index
}

View File

@ -5,15 +5,22 @@ import (
) )
const ( const (
RE_TAGS = `\B#\w+` // First group is tag
// TODO: use named groups
// [named groups](https://github.com/StefanSchroeder/Golang-Regex-Tutorial/blob/master/01-chapter2.markdown)
ReTags = "\\B#(?P<tag>\\w+)"
TagJoinSep = "|"
) )
type NodeType uint8 type NodeType uint8
type Node struct { type Node struct {
Type string
Name string Name string
Type string
URL string URL string
Tags []string
NameHash uint64 // hash of the metadata
Parent *Node Parent *Node
Children []*Node Children []*Node
} }
@ -25,10 +32,12 @@ type ParserStats struct {
currentUrlCount int currentUrlCount int
} }
type ParseHook func(bk *Bookmark) type ParseHook func(node *Node)
// Debugging bookmark node tree
// TODO: Better usage of node trees
func WalkNode(node *Node) { func WalkNode(node *Node) {
log.Debugf("Node --> %s | %s", node.Name, node.Type) log.Debugf("Node --> %s | %s | children: %d | parent: %v", node.Name, node.Type, len(node.Children), node.Parent)
if len(node.Children) > 0 { if len(node.Children) > 0 {
for _, node := range node.Children { for _, node := range node.Children {
@ -37,28 +46,22 @@ func WalkNode(node *Node) {
} }
} }
func ParseTags(bk *Bookmark) { func ParseTags(node *Node) {
var regex = regexp.MustCompile(RE_TAGS) var regex = regexp.MustCompile(ReTags)
bk.Tags = regex.FindAllString(bk.Metadata, -1) matches := regex.FindAllStringSubmatch(node.Name, -1)
for _, m := range matches {
node.Tags = append(node.Tags, _s(m[1]))
}
//res := regex.FindAllStringSubmatch(bk.Metadata, -1)
if len(bk.Tags) > 0 { if len(node.Tags) > 0 {
log.Debugf("[Title] found following tags: %s", bk.Tags) log.Debugf("[Title] found following tags: %s", node.Tags)
} }
//bk.tags = regex.FindAllString(bk.url, -1)
//if len(tags) > 0 {
//log.Debugf("[URL] found following tags: %s", tags)
//}
} }
func _s(value interface{}) string { func _s(value interface{}) string {
return string(value.([]byte)) return string(value.([]byte))
} }
func findTagsInTitle(title []byte) {
var regex = regexp.MustCompile(RE_TAGS)
tags := regex.FindAll(title, -1)
debugPrint("%s ---> found following tags: %s", title, tags)
}