refactor skills loader markdown metadata parsing (#1354)

This commit is contained in:
nayihz
2026-03-11 18:08:00 +08:00
committed by GitHub
parent 30584f04cb
commit 8a398988d7
4 changed files with 204 additions and 43 deletions
+1
View File
@@ -11,6 +11,7 @@ require (
github.com/ergochat/irc-go v0.5.0
github.com/gdamore/tcell/v2 v2.13.8
github.com/google/uuid v1.6.0
github.com/gomarkdown/markdown v0.0.0-20260217112301-37c66b85d6ab
github.com/gorilla/websocket v1.5.3
github.com/h2non/filetype v1.1.3
github.com/larksuite/oapi-sdk-go/v3 v3.5.3
+2
View File
@@ -79,6 +79,8 @@ github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvq
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
github.com/gomarkdown/markdown v0.0.0-20260217112301-37c66b85d6ab h1:VYNivV7P8IRHUam2swVUNkhIdp0LRRFKe4hXNnoZKTc=
github.com/gomarkdown/markdown v0.0.0-20260217112301-37c66b85d6ab/go.mod h1:JDGcbDT52eL4fju3sZ4TeHGsQwhG9nbDV21aMyhwPoA=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+126 -43
View File
@@ -10,14 +10,15 @@ import (
"regexp"
"strings"
"github.com/gomarkdown/markdown"
"github.com/gomarkdown/markdown/ast"
"github.com/gomarkdown/markdown/parser"
"gopkg.in/yaml.v3"
"github.com/sipeed/picoclaw/pkg/logger"
)
var (
namePattern = regexp.MustCompile(`^[a-zA-Z0-9]+(-[a-zA-Z0-9]+)*$`)
reFrontmatter = regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---`)
reStripFrontmatter = regexp.MustCompile(`(?s)^---(?:\r\n|\n|\r)(.*?)(?:\r\n|\n|\r)---(?:\r\n|\n|\r)*`)
)
var namePattern = regexp.MustCompile(`^[a-zA-Z0-9]+(-[a-zA-Z0-9]+)*$`)
const (
MaxNameLength = 64
@@ -226,11 +227,20 @@ func (sl *SkillsLoader) getSkillMetadata(skillPath string) *SkillMetadata {
return nil
}
frontmatter := sl.extractFrontmatter(string(content))
frontmatter, bodyContent := splitFrontmatter(string(content))
dirName := filepath.Base(filepath.Dir(skillPath))
title, bodyDescription := extractMarkdownMetadata(bodyContent)
metadata := &SkillMetadata{
Name: dirName,
Description: bodyDescription,
}
if title != "" && namePattern.MatchString(title) && len(title) <= MaxNameLength {
metadata.Name = title
}
if frontmatter == "" {
return &SkillMetadata{
Name: filepath.Base(filepath.Dir(skillPath)),
}
return metadata
}
// Try JSON first (for backward compatibility)
@@ -239,60 +249,133 @@ func (sl *SkillsLoader) getSkillMetadata(skillPath string) *SkillMetadata {
Description string `json:"description"`
}
if err := json.Unmarshal([]byte(frontmatter), &jsonMeta); err == nil {
return &SkillMetadata{
Name: jsonMeta.Name,
Description: jsonMeta.Description,
if jsonMeta.Name != "" {
metadata.Name = jsonMeta.Name
}
if jsonMeta.Description != "" {
metadata.Description = jsonMeta.Description
}
return metadata
}
// Fall back to simple YAML parsing
yamlMeta := sl.parseSimpleYAML(frontmatter)
return &SkillMetadata{
Name: yamlMeta["name"],
Description: yamlMeta["description"],
if name := yamlMeta["name"]; name != "" {
metadata.Name = name
}
if description := yamlMeta["description"]; description != "" {
metadata.Description = description
}
return metadata
}
// parseSimpleYAML parses simple key: value YAML format
// Example: name: github\n description: "..."
// Normalizes line endings to handle \n (Unix), \r\n (Windows), and \r (classic Mac)
// extractMarkdownMetadata parses content as Markdown and returns the text of
// the first level-1 heading (title) and the text of the first paragraph
// (description). A result is empty when the document has no such node with
// non-empty text.
//
// Line endings are normalized to "\n" before parsing: the gomarkdown parser
// only supports Unix newlines, and callers may hand over text that still
// contains "\r\n" or bare "\r" (for example a file without frontmatter that
// was read verbatim from disk).
func extractMarkdownMetadata(content string) (title, description string) {
	// []byte(content) is a fresh copy, so normalizing it never affects the
	// caller's string.
	src := parser.NormalizeNewlines([]byte(content))

	p := parser.NewWithExtensions(parser.CommonExtensions)
	doc := markdown.Parse(src, p)
	if doc == nil {
		return "", ""
	}

	ast.WalkFunc(doc, func(node ast.Node, entering bool) ast.WalkStatus {
		// Each container node is visited twice; only act on the way in.
		if !entering {
			return ast.GoToNext
		}

		switch n := node.(type) {
		case *ast.Heading:
			// Only a top-level (#) heading can supply the title.
			if title == "" && n.Level == 1 {
				title = nodeText(n)
			}
		case *ast.Paragraph:
			if description == "" {
				description = nodeText(n)
			}
		}

		// Stop walking as soon as both values have been found.
		if title != "" && description != "" {
			return ast.Terminate
		}
		return ast.GoToNext
	})

	return title, description
}
// nodeText flattens the subtree rooted at n into plain text. Literals of text
// and inline-code nodes are concatenated, soft/hard breaks and non-blocking
// spaces become a single space, and the result has every run of whitespace
// collapsed to one space with leading and trailing whitespace removed.
func nodeText(n ast.Node) string {
	var raw strings.Builder

	collect := func(cur ast.Node, entering bool) ast.WalkStatus {
		if entering {
			switch leaf := cur.(type) {
			case *ast.Text:
				raw.Write(leaf.Literal)
			case *ast.Code:
				raw.Write(leaf.Literal)
			case *ast.Softbreak, *ast.Hardbreak, *ast.NonBlockingSpace:
				raw.WriteByte(' ')
			}
		}
		return ast.GoToNext
	}
	ast.WalkFunc(n, collect)

	// Fields + Join squeezes whitespace runs and trims both ends in one go.
	words := strings.Fields(raw.String())
	return strings.Join(words, " ")
}
// parseSimpleYAML parses YAML frontmatter and extracts known metadata fields.
func (sl *SkillsLoader) parseSimpleYAML(content string) map[string]string {
result := make(map[string]string)
// Normalize line endings: convert \r\n and \r to \n
normalized := strings.ReplaceAll(content, "\r\n", "\n")
normalized = strings.ReplaceAll(normalized, "\r", "\n")
for line := range strings.SplitSeq(normalized, "\n") {
line = strings.TrimSpace(line)
if line == "" || strings.HasPrefix(line, "#") {
continue
}
parts := strings.SplitN(line, ":", 2)
if len(parts) == 2 {
key := strings.TrimSpace(parts[0])
value := strings.TrimSpace(parts[1])
// Remove quotes if present
value = strings.Trim(value, "\"'")
result[key] = value
}
var meta struct {
Name string `yaml:"name"`
Description string `yaml:"description"`
}
if err := yaml.Unmarshal([]byte(content), &meta); err != nil {
return result
}
if meta.Name != "" {
result["name"] = meta.Name
}
if meta.Description != "" {
result["description"] = meta.Description
}
return result
}
func (sl *SkillsLoader) extractFrontmatter(content string) string {
// Support \n (Unix), \r\n (Windows), and \r (classic Mac) line endings for frontmatter blocks
match := reFrontmatter.FindStringSubmatch(content)
if len(match) > 1 {
return match[1]
}
return ""
frontmatter, _ := splitFrontmatter(content)
return frontmatter
}
func (sl *SkillsLoader) stripFrontmatter(content string) string {
return reStripFrontmatter.ReplaceAllString(content, "")
_, body := splitFrontmatter(content)
return body
}
// splitFrontmatter separates a leading frontmatter block from the rest of a
// document.
//
// Newlines are normalized first. A frontmatter block is recognized only when
// the very first line is exactly "---" and some later line is exactly "---".
// In that case frontmatter is the text between the two delimiter lines
// (without a trailing newline) and body is everything after the closing
// delimiter with leading newlines removed. Otherwise frontmatter is empty and
// body is the original, un-normalized content.
func splitFrontmatter(content string) (frontmatter, body string) {
	const (
		fence   = "---"
		opening = fence + "\n"
	)

	text := string(parser.NormalizeNewlines([]byte(content)))
	if !strings.HasPrefix(text, opening) {
		return "", content
	}

	// rest holds every line after the opening fence; consumed counts the
	// bytes of rest that belong to lines already inspected.
	rest := text[len(opening):]
	consumed := 0
	for {
		line, tail, more := strings.Cut(rest[consumed:], "\n")
		if line == fence {
			// Lines before the fence are each "\n"-terminated; drop the
			// final terminator so the block has no trailing newline.
			frontmatter = strings.TrimSuffix(rest[:consumed], "\n")
			body = strings.TrimLeft(tail, "\n")
			return frontmatter, body
		}
		if !more {
			// Ran out of lines without meeting a closing fence.
			return "", content
		}
		consumed += len(line) + 1
	}
}
func escapeXML(s string) string {
+75
View File
@@ -342,3 +342,78 @@ func TestSkillRootsTrimsWhitespaceAndDedups(t *testing.T) {
builtin,
}, roots)
}
// TestGetSkillMetadata_UsesMarkdownParagraphWhenNoFrontmatter verifies that a
// SKILL.md without frontmatter gets its description from the first Markdown
// paragraph while the name remains the skill directory name.
func TestGetSkillMetadata_UsesMarkdownParagraphWhenNoFrontmatter(t *testing.T) {
	dir := filepath.Join(t.TempDir(), "workspace", "skills", "plain-skill")
	require.NoError(t, os.MkdirAll(dir, 0o755))

	skillFile := filepath.Join(dir, "SKILL.md")
	raw := []byte("# Plain Skill\n\nThis is parsed from markdown paragraph.\n")
	require.NoError(t, os.WriteFile(skillFile, raw, 0o644))

	loader := &SkillsLoader{}
	got := loader.getSkillMetadata(skillFile)

	require.NotNil(t, got)
	assert.Equal(t, "plain-skill", got.Name)
	assert.Equal(t, "This is parsed from markdown paragraph.", got.Description)
}
// TestGetSkillMetadata_FrontmatterOverridesMarkdown verifies that name and
// description given in YAML frontmatter win over values derived from the
// Markdown body.
func TestGetSkillMetadata_FrontmatterOverridesMarkdown(t *testing.T) {
	dir := filepath.Join(t.TempDir(), "workspace", "skills", "plain-skill")
	require.NoError(t, os.MkdirAll(dir, 0o755))

	skillFile := filepath.Join(dir, "SKILL.md")
	raw := []byte("---\nname: frontmatter-skill\ndescription: frontmatter description\n---\n\n# Plain Skill\n\nBody description.\n")
	require.NoError(t, os.WriteFile(skillFile, raw, 0o644))

	loader := &SkillsLoader{}
	got := loader.getSkillMetadata(skillFile)

	require.NotNil(t, got)
	assert.Equal(t, "frontmatter-skill", got.Name)
	assert.Equal(t, "frontmatter description", got.Description)
}
// TestGetSkillMetadata_YAMLMultilineDescription verifies that a YAML block
// scalar description, including a line that itself contains a colon, is
// returned as multi-line text.
func TestGetSkillMetadata_YAMLMultilineDescription(t *testing.T) {
	dir := filepath.Join(t.TempDir(), "workspace", "skills", "plain-skill")
	require.NoError(t, os.MkdirAll(dir, 0o755))

	skillFile := filepath.Join(dir, "SKILL.md")
	raw := []byte("---\nname: frontmatter-skill\ndescription: |\n line 1: with colon\n line 2\n---\n\n# Plain Skill\n\nBody description.\n")
	require.NoError(t, os.WriteFile(skillFile, raw, 0o644))

	loader := &SkillsLoader{}
	got := loader.getSkillMetadata(skillFile)

	require.NotNil(t, got)
	assert.Equal(t, "frontmatter-skill", got.Name)
	assert.Equal(t, "line 1: with colon\nline 2", got.Description)
}
// TestGetSkillMetadata_InvalidHeadingNameFallsBackToDirName verifies that a
// level-1 heading which is not usable as a skill name is ignored and the
// directory name is reported instead, while the body paragraph still becomes
// the description.
func TestGetSkillMetadata_InvalidHeadingNameFallsBackToDirName(t *testing.T) {
	dir := filepath.Join(t.TempDir(), "workspace", "skills", "valid-name")
	require.NoError(t, os.MkdirAll(dir, 0o755))

	skillFile := filepath.Join(dir, "SKILL.md")
	raw := []byte("# Invalid Heading Name\n\nBody description.\n")
	require.NoError(t, os.WriteFile(skillFile, raw, 0o644))

	loader := &SkillsLoader{}
	got := loader.getSkillMetadata(skillFile)

	require.NotNil(t, got)
	assert.Equal(t, "valid-name", got.Name)
	assert.Equal(t, "Body description.", got.Description)
}
// TestGetSkillMetadata_IgnoresHTMLCommentBlocks verifies that a leading HTML
// comment (here a licence banner whose lines start with '#') contributes
// neither the name nor the description; the description comes from the first
// real paragraph after the comment.
func TestGetSkillMetadata_IgnoresHTMLCommentBlocks(t *testing.T) {
	dir := filepath.Join(t.TempDir(), "workspace", "skills", "biomed-skill")
	require.NoError(t, os.MkdirAll(dir, 0o755))

	skillFile := filepath.Join(dir, "SKILL.md")
	raw := []byte("<!--\n# COPYRIGHT NOTICE\n# This file is part of the \"Universal Biomedical Skills\" project.\n# Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu>\n# All Rights Reserved.\n#\n# This code is proprietary and confidential.\n# Unauthorized copying of this file, via any medium is strictly prohibited.\n#\n# Provenance: Authenticated by MD BABU MIA\n\n-->\n\n# Biomed Skill\n\nSummarize biomedical papers.\n")
	require.NoError(t, os.WriteFile(skillFile, raw, 0o644))

	loader := &SkillsLoader{}
	got := loader.getSkillMetadata(skillFile)

	require.NotNil(t, got)
	assert.Equal(t, "biomed-skill", got.Name)
	assert.Equal(t, "Summarize biomedical papers.", got.Description)
}