You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
zk/internal/adapter/markdown/markdown.go

375 lines
9.3 KiB
Go

package markdown
import (
"bufio"
"fmt"
"net/url"
"regexp"
"strings"
"github.com/zk-org/zk/internal/adapter/markdown/extensions"
"github.com/zk-org/zk/internal/core"
"github.com/zk-org/zk/internal/util"
"github.com/zk-org/zk/internal/util/opt"
strutil "github.com/zk-org/zk/internal/util/strings"
"github.com/zk-org/zk/internal/util/yaml"
"github.com/mvdan/xurls"
"github.com/yuin/goldmark"
meta "github.com/yuin/goldmark-meta"
"github.com/yuin/goldmark/ast"
"github.com/yuin/goldmark/extension"
"github.com/yuin/goldmark/parser"
"github.com/yuin/goldmark/text"
)
// Parser parses the content of Markdown notes.
type Parser struct {
md goldmark.Markdown
logger util.Logger
}
type ParserOpts struct {
// Indicates whether #hashtags are parsed.
HashtagEnabled bool
// Indicates whether Bear's multi-word tags are parsed. Hashtags must be enabled as well.
MultiWordTagEnabled bool
// Indicates whether :colon:tags: are parsed.
ColontagEnabled bool
}
// NewParser creates a new Markdown Parser.
func NewParser(options ParserOpts, logger util.Logger) *Parser {
return &Parser{
md: goldmark.New(
goldmark.WithExtensions(
meta.Meta,
extension.NewLinkify(
extension.WithLinkifyAllowedProtocols([][]byte{
[]byte("http:"),
[]byte("https:"),
}),
extension.WithLinkifyURLRegexp(
xurls.Strict,
),
),
extensions.WikiLinkExt,
&extensions.TagExt{
HashtagEnabled: options.HashtagEnabled,
MultiWordTagEnabled: options.MultiWordTagEnabled,
ColontagEnabled: options.ColontagEnabled,
},
),
),
logger: logger,
}
}
// ParseNoteContent implements core.NoteContentParser.
func (p *Parser) ParseNoteContent(content string) (*core.NoteContent, error) {
bytes := []byte(content)
context := parser.NewContext()
root := p.md.Parser().Parse(
text.NewReader(bytes),
parser.WithContext(context),
)
links, err := p.parseLinks(root, bytes)
if err != nil {
return nil, err
}
frontmatter, err := parseFrontmatter(context, bytes)
if err != nil {
return nil, err
}
title, bodyStart, err := parseTitle(frontmatter, root, bytes)
if err != nil {
return nil, err
}
body := parseBody(bodyStart, bytes)
tags, err := parseTags(frontmatter, root, bytes)
if err != nil {
return nil, err
}
return &core.NoteContent{
Title: title,
Body: body,
Lead: parseLead(body),
Links: links,
Tags: tags,
Metadata: frontmatter.values,
}, nil
}
// parseTitle extracts the note title with its node.
func parseTitle(frontmatter frontmatter, root ast.Node, source []byte) (title opt.String, bodyStart int, err error) {
if title = frontmatter.getString("title", "Title"); !title.IsNull() {
bodyStart = frontmatter.end
return
}
var titleNode *ast.Heading
err = ast.Walk(root, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
if heading, ok := n.(*ast.Heading); ok && entering &&
(titleNode == nil || heading.Level < titleNode.Level) {
titleNode = heading
if heading.Level == 1 {
return ast.WalkStop, nil
}
}
return ast.WalkContinue, nil
})
if err != nil {
return
}
if titleNode != nil {
title = opt.NewNotEmptyString(string(titleNode.Text(source)))
if lines := titleNode.Lines(); lines.Len() > 0 {
bodyStart = lines.At(lines.Len() - 1).Stop
}
}
return
}
// parseBody extracts the whole content after the title.
func parseBody(startIndex int, source []byte) opt.String {
return opt.NewNotEmptyString(
strings.TrimSpace(
string(source[startIndex:]),
),
)
}
// parseLead extracts the body content until the first blank line.
func parseLead(body opt.String) opt.String {
lead := ""
scanner := bufio.NewScanner(strings.NewReader(body.String()))
for scanner.Scan() {
if strings.TrimSpace(scanner.Text()) == "" {
break
}
lead += scanner.Text() + "\n"
}
return opt.NewNotEmptyString(strings.TrimSpace(lead))
}
// parseTags extracts tags as #hashtags, :colon:tags: or from the YAML frontmatter.
func parseTags(frontmatter frontmatter, root ast.Node, source []byte) ([]string, error) {
tags := make([]string, 0)
// Parse from YAML frontmatter, either:
// * a list of strings
// * a single space-separated string
findFMTags := func(key string) []string {
if tags, ok := frontmatter.getStrings(key); ok {
return tags
} else if tags := frontmatter.getString(key); !tags.IsNull() {
// Parse a space-separated string list
res := []string{}
for _, s := range strings.Fields(tags.Unwrap()) {
s = strings.TrimSpace(s)
if len(s) > 0 {
res = append(res, s)
}
}
return res
} else {
return []string{}
}
}
for _, key := range []string{"tag", "tags", "keyword", "keywords"} {
for _, t := range findFMTags(key) {
// Trims any # prefix to support hashtags embedded in YAML
// frontmatter, as in Simple Markdown Zettelkasten:
// http://evantravers.com/articles/2020/11/23/zettelkasten-updates/
tags = append(tags, strings.TrimPrefix(t, "#"))
}
}
// Parse #hashtags and :colon:tags:
err := ast.Walk(root, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
if tagsNode, ok := n.(*extensions.Tags); ok && entering {
for _, tag := range tagsNode.Tags {
tags = append(tags, tag)
}
}
return ast.WalkContinue, nil
})
return strutil.RemoveDuplicates(tags), err
}
// parseLinks extracts outbound links from the note.
func (p *Parser) parseLinks(root ast.Node, source []byte) ([]core.Link, error) {
links := make([]core.Link, 0)
err := ast.Walk(root, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
if entering {
switch link := n.(type) {
case *ast.Link:
href, err := url.PathUnescape(string(link.Destination))
p.logger.Err(err)
if href != "" {
snippet, snStart, snEnd := extractLines(n, source)
links = append(links, core.Link{
Title: string(link.Text(source)),
Href: href,
Type: core.LinkTypeMarkdown,
Rels: core.LinkRels(strings.Fields(string(link.Title))...),
IsExternal: strutil.IsURL(href),
Snippet: snippet,
SnippetStart: snStart,
SnippetEnd: snEnd,
})
}
case *ast.AutoLink:
if href := string(link.URL(source)); href != "" && link.AutoLinkType == ast.AutoLinkURL {
snippet, snStart, snEnd := extractLines(n, source)
links = append(links, core.Link{
Title: string(link.Label(source)),
Href: href,
Type: core.LinkTypeImplicit,
Rels: []core.LinkRelation{},
IsExternal: true,
Snippet: snippet,
SnippetStart: snStart,
SnippetEnd: snEnd,
})
}
case *extensions.WikiLink:
href := string(link.Destination)
if href != "" {
snippet, snStart, snEnd := extractLines(n, source)
links = append(links, core.Link{
Title: string(link.Text(source)),
Href: href,
Type: core.LinkTypeWikiLink,
Rels: core.LinkRels(strings.Fields(string(link.Title))...),
IsExternal: strutil.IsURL(href),
Snippet: snippet,
SnippetStart: snStart,
SnippetEnd: snEnd,
})
}
}
}
return ast.WalkContinue, nil
})
return links, err
}
func extractLines(n ast.Node, source []byte) (content string, start, end int) {
if n == nil {
return
}
switch n.Type() {
case ast.TypeInline:
return extractLines(n.Parent(), source)
case ast.TypeBlock:
segs := n.Lines()
if segs.Len() == 0 {
return
}
start = segs.At(0).Start
end = segs.At(segs.Len() - 1).Stop
content = string(source[start:end])
}
return
}
// frontmatter contains metadata parsed from a YAML frontmatter.
type frontmatter struct {
values map[string]interface{}
start int
end int
}
var frontmatterRegex = regexp.MustCompile(`(?ms)^\s*-+\s*$.*?^\s*-+\s*$`)
func parseFrontmatter(context parser.Context, source []byte) (frontmatter, error) {
var front frontmatter
front.values = map[string]interface{}{}
index := frontmatterRegex.FindIndex(source)
if index == nil {
return front, nil
}
front.start = index[0]
front.end = index[1]
values, err := meta.TryGet(context)
if err != nil {
return front, err
}
// The YAML parser parses nested maps as map[interface{}]interface{}
// instead of map[string]interface{}, which doesn't work with the JSON
// marshaller.
values = yaml.ConvertMapToJSONCompatible(values)
// Convert keys to lowercase, because we don't want to be case sensitive.
for k, v := range values {
front.values[strings.ToLower(k)] = v
}
return front, nil
}
// getString returns the first string value found for any of the given keys.
func (m frontmatter) getString(keys ...string) opt.String {
if m.values == nil {
return opt.NullString
}
for _, key := range keys {
key = strings.ToLower(key)
if val, ok := m.values[key]; ok {
if val, ok := val.(string); ok {
return opt.NewNotEmptyString(val)
}
}
}
return opt.NullString
}
// getStrings returns the first string list found for any of the given keys.
func (m frontmatter) getStrings(keys ...string) ([]string, bool) {
if m.values == nil {
return nil, false
}
for _, key := range keys {
key = strings.ToLower(key)
if val, ok := m.values[key]; ok {
if val, ok := val.([]interface{}); ok {
strs := []string{}
for _, v := range val {
s := strings.TrimSpace(fmt.Sprint(v))
if len(s) > 0 {
strs = append(strs, s)
}
}
return strs, true
}
}
}
return nil, false
}