diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9b89b69ae..def19c3e5 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -16,5 +16,5 @@ jobs:
with:
go-version-file: go.mod
- - name: Build
+ - name: Build core binaries
run: make build-all
diff --git a/.github/workflows/create_dmg.yml b/.github/workflows/create_dmg.yml
index e03357566..67fded40a 100644
--- a/.github/workflows/create_dmg.yml
+++ b/.github/workflows/create_dmg.yml
@@ -17,29 +17,38 @@ jobs:
with:
ref: main
- # 1. 安装指定版本的 Go (可选,但推荐)
+ # 1. Install Go from go.mod
- name: Setup Go
uses: actions/setup-go@v6
with:
go-version-file: go.mod
- # 2. 安装 pnpm
- - name: Install pnpm
- run: brew install pnpm
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10.33.0
+ run_install: false
- # 3. 运行你的 Makefile 编译二进制文件
+ - name: Setup Node.js
+ uses: actions/setup-node@v6
+ with:
+ node-version: 22
+ cache: pnpm
+ cache-dependency-path: web/frontend/pnpm-lock.yaml
+
+ # 3. Build the application bundle
- name: Build with Make
run: make build ARCH=${{ matrix.arch }} && make build-macos-app ARCH=${{ matrix.arch }}
- # 4. 签名
+ # 4. Apply ad-hoc signing
- name: Ad-hoc Sign
run: codesign --force --deep --sign - "build/PicoClaw Launcher.app"
- # 5. 安装打包工具
+ # 5. Install the DMG packaging tool
- name: Install create-dmg
run: brew install create-dmg
- # 6. 执行打包命令
+ # 6. Create the DMG
- name: Create DMG
run: |
mkdir -p dist
@@ -54,7 +63,7 @@ jobs:
"dist/picoclaw-${{ matrix.arch }}.dmg" \
"build/PicoClaw Launcher.app"
- # 7. 上传文件到 GitHub Artifacts (供你下载)
+ # 7. Upload the DMG as a GitHub artifact
- name: Upload DMG
uses: actions/upload-artifact@v7
with:
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index a5002fec5..0e619dd27 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -47,13 +47,18 @@ jobs:
with:
go-version-file: go.mod
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10.33.0
+ run_install: false
+
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: 22
-
- - name: Setup pnpm
- run: corepack enable && corepack prepare pnpm@latest --activate
+ cache: pnpm
+ cache-dependency-path: web/frontend/pnpm-lock.yaml
- name: Set up QEMU
uses: docker/setup-qemu-action@v4
@@ -75,6 +80,9 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
+ - name: Install zip
+ run: sudo apt-get install -y zip
+
- name: Create local tag for GoReleaser
run: git tag "${{ steps.version.outputs.version }}"
@@ -90,6 +98,7 @@ jobs:
DOCKERHUB_IMAGE_NAME: ${{ vars.DOCKERHUB_REPOSITORY }}
GOVERSION: ${{ steps.setup-go.outputs.go-version }}
GORELEASER_CURRENT_TAG: ${{ steps.version.outputs.version }}
+ INCLUDE_ANDROID_BUNDLE: "true"
NIGHTLY_BUILD: "true"
MACOS_SIGN_P12: ${{ secrets.MACOS_SIGN_P12 }}
MACOS_SIGN_PASSWORD: ${{ secrets.MACOS_SIGN_PASSWORD }}
@@ -123,7 +132,7 @@ jobs:
# Collect release artifacts from goreleaser dist/
ASSETS=()
- for f in dist/*.tar.gz dist/*.zip dist/*.deb dist/*.rpm dist/checksums.txt; do
+ for f in dist/*.tar.gz dist/*.zip dist/*.deb dist/*.rpm dist/checksums.txt build/picoclaw-android-universal.zip; do
[ -f "$f" ] && ASSETS+=("$f")
done
@@ -135,4 +144,3 @@ jobs:
--prerelease \
--latest=false \
"${ASSETS[@]}"
-
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 2d544d4f0..795fa5eba 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -41,10 +41,11 @@ jobs:
with:
go-version-file: go.mod
+ - name: Install govulncheck
+ run: go install golang.org/x/vuln/cmd/govulncheck@v1.1.4
+
- name: Run Govulncheck
- uses: golang/govulncheck-action@v1
- with:
- go-package: ./...
+ run: govulncheck -C . -format text ./...
test:
name: Tests
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 2ce341770..c887bf493 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -65,13 +65,18 @@ jobs:
with:
go-version-file: go.mod
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10.33.0
+ run_install: false
+
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: 22
-
- - name: Setup pnpm
- run: corepack enable && corepack prepare pnpm@latest --activate
+ cache: pnpm
+ cache-dependency-path: web/frontend/pnpm-lock.yaml
- name: Set up QEMU
uses: docker/setup-qemu-action@v4
@@ -93,6 +98,9 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
+ - name: Install zip
+ run: sudo apt-get install -y zip
+
- name: Run GoReleaser
uses: goreleaser/goreleaser-action@v7
with:
@@ -104,6 +112,7 @@ jobs:
GITHUB_REPOSITORY_OWNER: ${{ github.repository_owner }}
DOCKERHUB_IMAGE_NAME: ${{ vars.DOCKERHUB_REPOSITORY }}
GOVERSION: ${{ steps.setup-go.outputs.go-version }}
+ INCLUDE_ANDROID_BUNDLE: "true"
MACOS_SIGN_P12: ${{ secrets.MACOS_SIGN_P12 }}
MACOS_SIGN_PASSWORD: ${{ secrets.MACOS_SIGN_PASSWORD }}
MACOS_NOTARY_ISSUER_ID: ${{ secrets.MACOS_NOTARY_ISSUER_ID }}
diff --git a/.goreleaser.yaml b/.goreleaser.yaml
index 9c26de34f..d8c51b069 100644
--- a/.goreleaser.yaml
+++ b/.goreleaser.yaml
@@ -9,11 +9,10 @@ git:
before:
hooks:
- - go mod tidy
- go generate ./...
- - sh -c 'cd web/frontend && pnpm install && pnpm build:backend'
- - go install github.com/tc-hib/go-winres@latest
- - go-winres make --in web/backend/winres/winres.json --out web/backend/rsrc --product-version={{ .Version }} --file-version={{ .Version }}
+ - sh -c 'cd web/frontend && CI=true pnpm install --frozen-lockfile && pnpm build:backend'
+ - sh -c 'GOBIN="$(go env GOPATH)/bin"; mkdir -p "$GOBIN"; go install github.com/tc-hib/go-winres@v0.3.3 && "$GOBIN/go-winres" make --in web/backend/winres/winres.json --out web/backend/rsrc --product-version={{ .Version }} --file-version={{ .Version }}'
+ - sh -c 'if [ "${INCLUDE_ANDROID_BUNDLE:-}" = "true" ]; then make build-android-bundle; fi'
builds:
- id: picoclaw
@@ -27,7 +26,7 @@ builds:
- -X github.com/sipeed/picoclaw/pkg/config.Version={{ .Version }}
- -X github.com/sipeed/picoclaw/pkg/config.GitCommit={{ .ShortCommit }}
- -X github.com/sipeed/picoclaw/pkg/config.BuildTime={{ .Date }}
- - -X github.com/sipeed/picoclaw/pkg/config.GoVersion={{ .Env.GOVERSION }}
+ - -X github.com/sipeed/picoclaw/pkg/config.GoVersion={{ with index .Env "GOVERSION" }}{{ . }}{{ else }}unknown{{ end }}
goos:
- linux
- windows
@@ -67,6 +66,10 @@ builds:
- stdjson
ldflags:
- -s -w
+ - -X github.com/sipeed/picoclaw/pkg/config.Version={{ .Version }}
+ - -X github.com/sipeed/picoclaw/pkg/config.GitCommit={{ .ShortCommit }}
+ - -X github.com/sipeed/picoclaw/pkg/config.BuildTime={{ .Date }}
+ - -X github.com/sipeed/picoclaw/pkg/config.GoVersion={{ with index .Env "GOVERSION" }}{{ . }}{{ else }}unknown{{ end }}
goos:
- linux
- windows
@@ -106,6 +109,10 @@ builds:
- stdjson
ldflags:
- -s -w
+ - -X github.com/sipeed/picoclaw/pkg/config.Version={{ .Version }}
+ - -X github.com/sipeed/picoclaw/pkg/config.GitCommit={{ .ShortCommit }}
+ - -X github.com/sipeed/picoclaw/pkg/config.BuildTime={{ .Date }}
+ - -X github.com/sipeed/picoclaw/pkg/config.GoVersion={{ with index .Env "GOVERSION" }}{{ . }}{{ else }}unknown{{ end }}
goos:
- linux
- windows
@@ -245,6 +252,8 @@ changelog:
release:
disable: '{{ isEnvSet "NIGHTLY_BUILD" }}'
+ extra_files:
+ - glob: ./build/picoclaw-android-universal.zip
footer: >-
---
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ceff723d2..cbb6a6347 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -108,7 +108,7 @@ Use descriptive branch names, e.g. `fix/telegram-timeout`, `feat/ollama-provider
- Reference the related issue when relevant: `Fix session leak (#123)`.
- Keep commits focused. One logical change per commit is preferred.
- For minor cleanups or typo fixes, squash them into a single commit before opening a PR.
-- Refer to https://www.conventionalcommits.org/zh-hans/v1.0.0/
+- Refer to [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/)
### Keeping Up to Date
diff --git a/CONTRIBUTING.zh.md b/CONTRIBUTING.zh.md
index 196aecc65..ca6c66b3d 100644
--- a/CONTRIBUTING.zh.md
+++ b/CONTRIBUTING.zh.md
@@ -108,7 +108,7 @@ git checkout -b 你的功能分支名
- 有关联 Issue 时请引用:`Fix session leak (#123)`。
- 保持 commit 专注,每个 commit 只做一件事。
- 对于小的清理或拼写修正,提 PR 前请将其合并为一个 commit。
-- 按照 https://www.conventionalcommits.org/zh-hans/v1.0.0/ 规范来撰写
+- 按照 [Conventional Commits](https://www.conventionalcommits.org/zh-hans/v1.0.0/) 规范来撰写
### 保持与上游同步
diff --git a/Makefile b/Makefile
index f7ebc7411..afaa7c29a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: all build install uninstall clean help test
+.PHONY: all build install uninstall clean help test build-all
# Build variables
BINARY_NAME=picoclaw
@@ -205,11 +205,44 @@ build-linux-mipsle: generate
$(call PATCH_MIPS_FLAGS,$(BUILD_DIR)/$(BINARY_NAME)-linux-mipsle)
@echo "Build complete: $(BUILD_DIR)/$(BINARY_NAME)-linux-mipsle"
+## build-android-arm64: Build core for Android ARM64
+build-android-arm64: generate
+ @echo "Building for android/arm64..."
+ @mkdir -p $(BUILD_DIR)
+ GOOS=android GOARCH=arm64 $(GO) build -tags stdjson -ldflags "$(LDFLAGS)" -o $(BUILD_DIR)/$(BINARY_NAME)-android-arm64 ./$(CMD_DIR)
+ @echo "Build complete: $(BUILD_DIR)/$(BINARY_NAME)-android-arm64"
+
+## build-launcher-android-arm64: Build launcher for Android ARM64
+build-launcher-android-arm64:
+ @echo "Building picoclaw-launcher for android/arm64..."
+ @mkdir -p $(BUILD_DIR)
+ @$(MAKE) -C web build-android-arm64 \
+ OUTPUT_ANDROID_ARM64="$(CURDIR)/$(BUILD_DIR)/picoclaw-launcher-android-arm64" \
+ GO='$(GO)' \
+ LDFLAGS='$(LDFLAGS)'
+ @echo "Build complete: $(BUILD_DIR)/picoclaw-launcher-android-arm64"
+
+## build-android-bundle: Build core and launcher for Android (currently arm64 only) and package as universal zip
+build-android-bundle: generate
+	@echo "Building core for all Android architectures..."
+	@mkdir -p $(BUILD_DIR)
+	GOOS=android GOARCH=arm64 $(GO) build -tags stdjson -ldflags "$(LDFLAGS)" -o $(BUILD_DIR)/$(BINARY_NAME)-android-arm64 ./$(CMD_DIR)
+	@echo "Building launcher for Android arm64..."
+	@$(MAKE) build-launcher-android-arm64
+	@echo "Staging JNI libs..."
+	@rm -rf $(BUILD_DIR)/android-staging
+	@mkdir -p $(BUILD_DIR)/android-staging/arm64-v8a
+	@cp $(BUILD_DIR)/$(BINARY_NAME)-android-arm64 $(BUILD_DIR)/android-staging/arm64-v8a/libpicoclaw.so
+	@cp $(BUILD_DIR)/picoclaw-launcher-android-arm64 $(BUILD_DIR)/android-staging/arm64-v8a/libpicoclaw-web.so
+	@cd $(BUILD_DIR)/android-staging && rm -f ../picoclaw-android-universal.zip && zip -r ../picoclaw-android-universal.zip . # rm first: zip would otherwise update a leftover archive in place and keep stale entries
+	@rm -rf $(BUILD_DIR)/android-staging
+	@echo "All Android builds complete: $(BUILD_DIR)/picoclaw-android-universal.zip"
+
## build-pi-zero: Build for Raspberry Pi Zero 2 W (32-bit and 64-bit)
build-pi-zero: build-linux-arm build-linux-arm64
@echo "Pi Zero 2 W builds: $(BUILD_DIR)/$(BINARY_NAME)-linux-arm (32-bit), $(BUILD_DIR)/$(BINARY_NAME)-linux-arm64 (64-bit)"
-## build-all: Build picoclaw for all platforms
+## build-all: Build the picoclaw core binary for all Makefile-managed platforms
build-all: generate
@echo "Building for multiple platforms..."
@mkdir -p $(BUILD_DIR)
@@ -226,7 +259,7 @@ build-all: generate
GOOS=windows GOARCH=amd64 $(GO) build $(GOFLAGS) -ldflags "$(LDFLAGS)" -o $(BUILD_DIR)/$(BINARY_NAME)-windows-amd64.exe ./$(CMD_DIR)
GOOS=netbsd GOARCH=amd64 $(GO) build $(GOFLAGS) -ldflags "$(LDFLAGS)" -o $(BUILD_DIR)/$(BINARY_NAME)-netbsd-amd64 ./$(CMD_DIR)
GOOS=netbsd GOARCH=arm64 $(GO) build $(GOFLAGS) -ldflags "$(LDFLAGS)" -o $(BUILD_DIR)/$(BINARY_NAME)-netbsd-arm64 ./$(CMD_DIR)
- @echo "All builds complete"
+ @echo "Core builds complete"
## install: Install picoclaw to system and copy builtin skills
install: build
diff --git a/README.fr.md b/README.fr.md
index a26c89f14..8fa67fa02 100644
--- a/README.fr.md
+++ b/README.fr.md
@@ -18,7 +18,7 @@
-
-
-
-
diff --git a/README.id.md b/README.id.md
index d3c556dde..525d4dc72 100644
--- a/README.id.md
+++ b/README.id.md
@@ -18,7 +18,7 @@
-
diff --git a/README.it.md b/README.it.md
index 6fe6c5e17..c560976cf 100644
--- a/README.it.md
+++ b/README.it.md
@@ -18,7 +18,7 @@
+
+|
+
+ |
+
+
+ |
+
+
+
+
+
풀스택 엔지니어 모드 |
+로깅 및 계획 |
+웹 검색 및 학습 |
+
|---|---|---|
|
+
|
+
|
+
| 개발 · 배포 · 확장 | +스케줄링 · 자동화 · 기억 | +탐색 · 인사이트 · 트렌드 | +
+
+
+
+
+
+
+
+
![]() |
+ ![]() |
+ ![]() |
+ ![]() |
+
+
+런처 UI 없이 `picoclaw` 코어 바이너리만 있는 최소 환경에서는 명령줄과 JSON 설정 파일만으로도 모든 설정을 마칠 수 있습니다.
+
+**1. 초기화**
+
+```bash
+picoclaw onboard
+```
+
+그러면 `~/.picoclaw/config.json`과 워크스페이스 디렉터리가 생성됩니다.
+
+**2. 설정** (`~/.picoclaw/config.json`)
+
+```jsonc
+{
+ "agents": {
+ "defaults": {
+ "model_name": "gpt-5.4"
+ }
+ },
+ "model_list": [
+ {
+ "model_name": "gpt-5.4",
+ "model": "openai/gpt-5.4",
+ // api_key는 이제 .security.yml에서 로드됩니다.
+ }
+ ]
+}
+```
+
+> 사용 가능한 모든 옵션이 포함된 전체 설정 템플릿은 저장소의 `config/config.example.json`을 참고하세요.
+>
+> 참고: `config.example.json` 형식은 버전 0이며 민감 정보가 포함되어 있습니다. 실행 시 자동으로 버전 1+로 마이그레이션되며, 이후 `config.json`에는 비민감 정보만 저장되고 민감 정보는 `.security.yml`에 저장됩니다. 민감 정보를 직접 수정해야 한다면 `docs/security_configuration.md`를 참고하세요.
+
+**3. 채팅**
+
+```bash
+# 단발성 질문
+picoclaw agent -m "2+2는 얼마야?"
+
+# 대화형 모드
+picoclaw agent
+
+# 채팅 앱 연동용 게이트웨이 시작
+picoclaw gateway
+```
+
+
diff --git a/README.md b/README.md
index a48a53d47..1ab514a29 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
-**Getting started:**
+**Getting started:**
Open the WebUI, then: **1)** Configure a Provider (add your LLM API key) -> **2)** Configure a Channel (e.g., Telegram) -> **3)** Start the Gateway -> **4)** Chat!
@@ -293,7 +303,7 @@ picoclaw-launcher-tui
-**Getting started:**
+**Getting started:**
Use the TUI menus to: **1)** Configure a Provider -> **2)** Configure a Channel -> **3)** Start the Gateway -> **4)** Chat!
@@ -368,7 +378,7 @@ This creates `~/.picoclaw/config.json` and the workspace directory.
```
> See `config/config.example.json` in the repo for a complete configuration template with all available options.
->
+>
> Please note: config.example.json format is version 0, with sensitive codes in it, and will be auto migrated to version 1+, then, the config.json will only store insensitive data, the sensitive codes will be stored in .security.yml, if you need manually modify the codes, please see `docs/security_configuration.md` for more details.
@@ -513,7 +523,7 @@ picoclaw skills search "web scraping"
picoclaw skills install
-
-
-
-
-
diff --git a/assets/wechat.png b/assets/wechat.png
index 66ffa99e9..d538f40e6 100644
Binary files a/assets/wechat.png and b/assets/wechat.png differ
diff --git a/cmd/membench/eval.go b/cmd/membench/eval.go
index bddee76fd..729c9f97f 100644
--- a/cmd/membench/eval.go
+++ b/cmd/membench/eval.go
@@ -36,6 +36,7 @@ type AggMetrics struct {
OverallHitRate float64 `json:"overallHitRate"`
ByCategory map[int]*CatMetrics `json:"byCategory"`
TotalQuestions int `json:"totalQuestions"`
+ ValidF1Count int `json:"validF1Count"`
}
// CatMetrics holds metrics for a single category.
@@ -43,6 +44,7 @@ type CatMetrics struct {
F1 float64 `json:"f1"`
HitRate float64 `json:"hitRate"`
QuestionCount int `json:"questionCount"`
+ ValidF1Count int `json:"validF1Count"`
}
// EvalLegacy evaluates using legacy session store (raw history + budget truncation).
@@ -201,38 +203,64 @@ func EvalSeahorse(
// aggregateMetrics computes overall and per-category metrics.
func aggregateMetrics(qaResults []QAResult) AggMetrics {
- byCat := map[int]*CatMetrics{}
+ type catAccum struct {
+ f1Sum float64
+ f1Count int
+ hitRateSum float64
+ hitRateCount int
+ }
+ byCatAcc := map[int]*catAccum{}
totalF1 := 0.0
totalHitRate := 0.0
+ validF1Count := 0
for _, qr := range qaResults {
- totalF1 += qr.TokenF1
- totalHitRate += qr.HitRate
- cat, ok := byCat[qr.Category]
- if !ok {
- cat = &CatMetrics{}
- byCat[qr.Category] = cat
+ // Skip sentinel -1.0 scores (LLM API/parse failures) from F1 averaging.
+ if qr.TokenF1 >= 0 {
+ totalF1 += qr.TokenF1
+ validF1Count++
}
- cat.F1 += qr.TokenF1
- cat.HitRate += qr.HitRate
- cat.QuestionCount++
+ totalHitRate += qr.HitRate
+ acc, ok := byCatAcc[qr.Category]
+ if !ok {
+ acc = &catAccum{}
+ byCatAcc[qr.Category] = acc
+ }
+ if qr.TokenF1 >= 0 {
+ acc.f1Sum += qr.TokenF1
+ acc.f1Count++
+ }
+ acc.hitRateSum += qr.HitRate
+ acc.hitRateCount++
}
- n := len(qaResults)
- if n == 0 {
- n = 1
+ nHit := len(qaResults)
+ if nHit == 0 {
+ nHit = 1
}
- agg := AggMetrics{
- OverallF1: totalF1 / float64(n),
- OverallHitRate: totalHitRate / float64(n),
+ byCat := map[int]*CatMetrics{}
+ for cat, acc := range byCatAcc {
+ cm := &CatMetrics{
+ QuestionCount: acc.hitRateCount,
+ ValidF1Count: acc.f1Count,
+ }
+ if acc.f1Count > 0 {
+ cm.F1 = acc.f1Sum / float64(acc.f1Count)
+ }
+ if acc.hitRateCount > 0 {
+ cm.HitRate = acc.hitRateSum / float64(acc.hitRateCount)
+ }
+ byCat[cat] = cm
+ }
+ var overallF1 float64
+ if validF1Count > 0 {
+ overallF1 = totalF1 / float64(validF1Count)
+ }
+ return AggMetrics{
+ OverallF1: overallF1,
+ OverallHitRate: totalHitRate / float64(nHit),
ByCategory: byCat,
TotalQuestions: len(qaResults),
+ ValidF1Count: validF1Count,
}
- for _, cat := range agg.ByCategory {
- if cat.QuestionCount > 0 {
- cat.F1 /= float64(cat.QuestionCount)
- cat.HitRate /= float64(cat.QuestionCount)
- }
- }
- return agg
}
// SaveResults writes per-sample eval results to JSON files.
@@ -277,27 +305,43 @@ func SaveAggregated(results []EvalResult, outDir string) error {
func computeModeAgg(results []EvalResult) AggMetrics {
agg := AggMetrics{ByCategory: map[int]*CatMetrics{}}
for _, r := range results {
- agg.OverallF1 += r.Agg.OverallF1 * float64(r.Agg.TotalQuestions)
+ // Backward compat: old eval JSON (token mode) without ValidF1Count → use TotalQuestions.
+ // LLM modes may legitimately have ValidF1Count==0 (all failures).
+ vf1 := r.Agg.ValidF1Count
+ if vf1 == 0 && r.Agg.TotalQuestions > 0 && !strings.HasSuffix(r.Mode, "-llm") {
+ vf1 = r.Agg.TotalQuestions
+ }
+ agg.OverallF1 += r.Agg.OverallF1 * float64(vf1)
agg.OverallHitRate += r.Agg.OverallHitRate * float64(r.Agg.TotalQuestions)
agg.TotalQuestions += r.Agg.TotalQuestions
+ agg.ValidF1Count += vf1
for cat, cm := range r.Agg.ByCategory {
existing, ok := agg.ByCategory[cat]
if !ok {
existing = &CatMetrics{}
agg.ByCategory[cat] = existing
}
- existing.F1 += cm.F1 * float64(cm.QuestionCount)
+ cvf1 := cm.ValidF1Count
+ if cvf1 == 0 && cm.QuestionCount > 0 && !strings.HasSuffix(r.Mode, "-llm") {
+ cvf1 = cm.QuestionCount
+ }
+ existing.F1 += cm.F1 * float64(cvf1)
existing.HitRate += cm.HitRate * float64(cm.QuestionCount)
existing.QuestionCount += cm.QuestionCount
+ existing.ValidF1Count += cvf1
}
}
+ if agg.ValidF1Count > 0 {
+ agg.OverallF1 /= float64(agg.ValidF1Count)
+ }
if agg.TotalQuestions > 0 {
- agg.OverallF1 /= float64(agg.TotalQuestions)
agg.OverallHitRate /= float64(agg.TotalQuestions)
}
for _, cat := range agg.ByCategory {
+ if cat.ValidF1Count > 0 {
+ cat.F1 /= float64(cat.ValidF1Count)
+ }
if cat.QuestionCount > 0 {
- cat.F1 /= float64(cat.QuestionCount)
cat.HitRate /= float64(cat.QuestionCount)
}
}
@@ -359,7 +403,9 @@ func printSection(title string, results []EvalResult) {
// PrintComparison outputs a human-readable comparison table to stdout.
func PrintComparison(results []EvalResult, llmResults []EvalResult) {
- printSection("No LLM generation", results)
+ if len(results) > 0 {
+ printSection("No LLM generation", results)
+ }
if len(llmResults) > 0 {
printSection("With LLM", llmResults)
}
diff --git a/cmd/membench/eval_llm.go b/cmd/membench/eval_llm.go
new file mode 100644
index 000000000..ee401d134
--- /dev/null
+++ b/cmd/membench/eval_llm.go
@@ -0,0 +1,346 @@
+package main
+
+import (
+ "context"
+ "fmt"
+ "log"
+ "regexp"
+ "sort"
+ "strconv"
+ "strings"
+ "sync"
+
+ "github.com/sipeed/picoclaw/pkg/seahorse"
+)
+
+const answerSystemPrompt = `You are a helpful assistant. Given conversation context, answer the question concisely and accurately. If the answer is not in the context, say "I don't know". Answer in 1-3 sentences maximum.` // system prompt passed to Complete by generateAnswer
+
+const judgeSystemPrompt = `You are an impartial judge evaluating answer quality.
+Compare the candidate answer against the reference answer.
+Consider semantic equivalence — different wording expressing the same meaning should score high.
+
+Output ONLY a single integer score from 1 to 5:
+1 = completely wrong or irrelevant
+2 = partially related but mostly incorrect
+3 = partially correct, missing key details
+4 = mostly correct with minor omissions
+5 = fully correct, semantically equivalent
+
+Output ONLY the number, nothing else.` // system prompt passed to Complete by judgeAnswer, which parses the 1-5 reply
+
+// generateAnswer asks the LLM to answer question from contextText and returns client.Complete's reply and error unchanged.
+func generateAnswer(ctx context.Context, client *LLMClient, contextText, question string) (string, error) {
+ // Cap the context at 6000 runes (not bytes) so the cut never splits a multi-byte UTF-8 character.
+ contextRunes := []rune(contextText)
+ if len(contextRunes) > 6000 {
+ contextText = string(contextRunes[:6000]) + "\n... [truncated]" // visible marker so the cut is explicit in the prompt
+ }
+
+ userPrompt := fmt.Sprintf("## Conversation Context\n\n%s\n\n## Question\n\n%s", contextText, question)
+ return client.Complete(ctx, answerSystemPrompt, userPrompt)
+}
+
+// scoreRe matches a standalone digit 1-5 (word boundaries reject "15" or "0"); callers take the first match in the judge response.
+var scoreRe = regexp.MustCompile(`\b([1-5])\b`)
+
+// judgeAnswer asks the LLM to score the candidate answer vs the gold answer.
+// Returns a score from 0.0 to 1.0; returns -1.0 with the error if the request fails, or -1.0 with a nil error if no score can be parsed.
+func judgeAnswer(
+ ctx context.Context,
+ judgeClient *LLMClient,
+ question, goldAnswer, candidateAnswer string,
+) (float64, error) {
+ userPrompt := fmt.Sprintf(
+ "Question: %s\n\nReference Answer: %s\n\nCandidate Answer: %s\n\nScore:",
+ question, goldAnswer, candidateAnswer,
+ )
+
+ response, err := judgeClient.Complete(ctx, judgeSystemPrompt, userPrompt)
+ if err != nil {
+ return -1.0, err
+ }
+
+ response = strings.TrimSpace(response)
+ if m := scoreRe.FindStringSubmatch(response); len(m) == 2 {
+ score, _ := strconv.Atoi(m[1]) // error ignored: the capture group only ever holds one digit 1-5
+ return float64(score-1) / 4.0, nil // Normalize 1-5 to 0.0-1.0
+ }
+ log.Printf("WARNING: could not parse judge score from: %q, returning -1", response)
+ return -1.0, nil
+}
+
+// qaWork describes one QA evaluation unit.
+type qaWork struct {
+ sampleID string // sample identifier, used in log lines
+ qaIndex int // index of the question within its sample, used in warning logs
+ globalIndex int // progress counter printed as q=<globalIndex>/<totalQA>; also copied to qaResultOut.index
+ totalQA int // denominator of the progress counter
+ qa *LocomoQA // question, gold answer, category and evidence being evaluated
+ contextText string // context handed to generateAnswer and RecallHitRate
+ sample *LocomoSample // owning sample, passed to RecallHitRate
+}
+
+// qaResultOut collects one QA evaluation output.
+type qaResultOut struct {
+ index int // position in the flat QA list for ordering (set from qaWork.globalIndex)
+ result QAResult // metrics row; its TokenF1 field carries the judge score
+ answer string // raw LLM answer, "" when generation failed
+ score float64 // same value as result.TokenF1 (-1.0 when no score is available)
+}
+
+// evalQAWorker processes a single QA item: generate answer + judge score + recall hit rate. Failures are logged and surface as score -1.0; no error is returned.
+func evalQAWorker(
+ ctx context.Context,
+ w qaWork,
+ answerClient, judgeClient *LLMClient,
+ logPrefix string,
+) qaResultOut {
+ llmAnswer, err := generateAnswer(ctx, answerClient, w.contextText, w.qa.Question)
+ if err != nil {
+ log.Printf("WARN: LLM generation failed for sample %s Q%d: %v", w.sampleID, w.qaIndex, err)
+ llmAnswer = "" // discard any partial reply so the judge step below is skipped
+ }
+
+ score := -1.0 // sentinel: kept when there is no answer to judge; judgeAnswer also returns it on failure
+ if llmAnswer != "" {
+ score, err = judgeAnswer(ctx, judgeClient, w.qa.Question, w.qa.AnswerString(), llmAnswer)
+ if err != nil {
+ log.Printf("WARN: LLM judge failed for sample %s Q%d: %v", w.sampleID, w.qaIndex, err)
+ }
+ }
+
+ hitRate := RecallHitRate(w.qa.Evidence, w.sample, w.contextText) // depends on the context only, not on the LLM answer
+
+ log.Printf("[%s] sample=%s q=%d/%d score=%.2f answer=%q",
+ logPrefix, w.sampleID, w.globalIndex, w.totalQA, score, truncateStr(llmAnswer, 80))
+
+ return qaResultOut{
+ index: w.globalIndex,
+ result: QAResult{
+ Question: w.qa.Question,
+ Category: w.qa.Category,
+ GoldAnswer: w.qa.AnswerString(),
+ TokenF1: score, // judge score (0.0-1.0) or the -1.0 sentinel is stored in the TokenF1 field
+ HitRate: hitRate,
+ },
+ answer: llmAnswer,
+ score: score,
+ }
+}
+
+// EvalLegacyLLM evaluates legacy store using LLM generation + LLM-as-Judge. It returns one EvalResult per sample, with questions evaluated by up to concurrency workers at a time.
+func EvalLegacyLLM(
+ ctx context.Context,
+ samples []LocomoSample,
+ legacy *LegacyStore,
+ budgetTokens int,
+ answerClient, judgeClient *LLMClient,
+ concurrency int,
+) []EvalResult {
+ if concurrency < 1 { // zero or negative means sequential
+ concurrency = 1
+ }
+ totalQA := countTotalQA(samples)
+ results := make([]EvalResult, 0, len(samples))
+
+ for si := range samples {
+ sample := &samples[si]
+ history := legacy.GetHistory(sample.SampleID)
+
+ allContent := make([]string, 0, len(history))
+ for _, msg := range history {
+ allContent = append(allContent, msg.Content)
+ }
+
+ truncated, _ := BudgetTruncate(allContent, budgetTokens) // second return value is not needed here
+ contextText := StringListToContent(truncated) // built once; every question of this sample shares it
+
+ qaResults := make([]QAResult, len(sample.QA))
+
+ if concurrency <= 1 {
+ for qi := range sample.QA {
+ out := evalQAWorker(ctx, qaWork{
+ sampleID: sample.SampleID, qaIndex: qi,
+ globalIndex: si*len(sample.QA) + qi + 1, totalQA: totalQA, // NOTE(review): only a true running index if every earlier sample has the same QA count; used for logging
+ qa: &sample.QA[qi], contextText: contextText, sample: sample,
+ }, answerClient, judgeClient, "legacy-llm")
+ qaResults[qi] = out.result
+ }
+ } else {
+ sem := make(chan struct{}, concurrency)
+ var wg sync.WaitGroup
+ for qi := range sample.QA {
+ wg.Add(1)
+ go func() { // NOTE(review): closure captures qi; correct only with per-iteration loop variables (Go 1.22+) — confirm go.mod
+ defer wg.Done()
+ sem <- struct{}{} // acquire a slot; at most concurrency workers run at once
+ defer func() { <-sem }()
+ out := evalQAWorker(ctx, qaWork{
+ sampleID: sample.SampleID, qaIndex: qi,
+ globalIndex: si*len(sample.QA) + qi + 1, totalQA: totalQA, // NOTE(review): same formula caveat as the sequential branch
+ qa: &sample.QA[qi], contextText: contextText, sample: sample,
+ }, answerClient, judgeClient, "legacy-llm")
+ qaResults[qi] = out.result // safe: each goroutine writes distinct index
+ }()
+ }
+ wg.Wait() // finish the whole sample before aggregating
+ }
+
+ results = append(results, EvalResult{
+ Mode: "legacy-llm",
+ SampleID: sample.SampleID,
+ QAResults: qaResults,
+ Agg: aggregateMetrics(qaResults),
+ })
+ }
+ return results
+}
+
+// buildSeahorseContext retrieves context for a seahorse QA item: keyword search, expand the matched messages, then budget-truncate. Returns "" when nothing is retrieved.
+func buildSeahorseContext(
+ ctx context.Context,
+ ir *SeahorseIngestResult,
+ sample *LocomoSample,
+ qa *LocomoQA,
+ budgetTokens int,
+) string {
+ store := ir.Engine.GetRetrieval().Store()
+ retrieval := ir.Engine.GetRetrieval()
+ convID := ir.ConvMap[sample.SampleID] // zero value if the sample was never ingested; callers check ConvMap first
+
+ keywords := ExtractKeywords(qa.Question)
+ bestRank := map[int64]float64{} // lowest Rank seen per message ID across all keywords
+ for _, kw := range keywords {
+ searchResults, err := store.SearchMessages(ctx, seahorse.SearchInput{
+ Pattern: kw,
+ ConversationID: convID,
+ Limit: 20,
+ })
+ if err != nil {
+ continue // best effort: a failed keyword search is skipped without logging
+ }
+ for _, sr := range searchResults {
+ if sr.MessageID > 0 { // ignore hits without a positive message ID
+ if prev, ok := bestRank[sr.MessageID]; !ok || sr.Rank < prev {
+ bestRank[sr.MessageID] = sr.Rank
+ }
+ }
+ }
+ }
+
+ messageIDs := make([]int64, 0, len(bestRank))
+ for id := range bestRank {
+ messageIDs = append(messageIDs, id)
+ }
+ sort.Slice(messageIDs, func(i, j int) bool { // ascending Rank; NOTE(review): assumes lower Rank = better match — confirm against seahorse; ties land in unspecified order
+ return bestRank[messageIDs[i]] < bestRank[messageIDs[j]]
+ })
+
+ var contentParts []string
+ if len(messageIDs) > 0 {
+ expandResult, err := retrieval.ExpandMessages(ctx, messageIDs)
+ if err == nil { // an expansion error is swallowed; contentParts stays empty and "" is returned
+ for _, msg := range expandResult.Messages {
+ contentParts = append(contentParts, msg.Content)
+ }
+ }
+ }
+ if len(contentParts) == 0 {
+ return ""
+ }
+ truncated, _ := BudgetTruncate(contentParts, budgetTokens)
+ return StringListToContent(truncated)
+}
+
+// EvalSeahorseLLM evaluates seahorse retrieval using LLM generation + LLM-as-Judge. Context is retrieved per question; samples missing from ir.ConvMap are skipped.
+func EvalSeahorseLLM(
+ ctx context.Context,
+ samples []LocomoSample,
+ ir *SeahorseIngestResult,
+ budgetTokens int,
+ answerClient, judgeClient *LLMClient,
+ concurrency int,
+) []EvalResult {
+ if concurrency < 1 { // zero or negative means sequential
+ concurrency = 1
+ }
+ totalQA := countTotalQA(samples)
+ results := make([]EvalResult, 0, len(samples))
+
+ for si := range samples {
+ sample := &samples[si]
+ if _, ok := ir.ConvMap[sample.SampleID]; !ok {
+ log.Printf("WARN: no conversation ID for sample %s", sample.SampleID)
+ continue // the sample is left out of results entirely
+ }
+
+ qaResults := make([]QAResult, len(sample.QA))
+
+ evalOne := func(qi int) { // evaluates question qi and writes only qaResults[qi]
+ qa := &sample.QA[qi]
+ contextText := buildSeahorseContext(ctx, ir, sample, qa, budgetTokens)
+ if contextText == "" {
+ qaResults[qi] = QAResult{
+ Question: qa.Question,
+ Category: qa.Category,
+ GoldAnswer: qa.AnswerString(),
+ TokenF1: 0.0, // a real zero, not the -1.0 sentinel, so it still counts in F1 averages
+ HitRate: 0.0,
+ }
+ log.Printf("[seahorse-llm] sample=%s q=%d/%d score=0.00 answer=(no context)",
+ sample.SampleID, si*len(sample.QA)+qi+1, totalQA)
+ return
+ }
+ out := evalQAWorker(ctx, qaWork{
+ sampleID: sample.SampleID, qaIndex: qi,
+ globalIndex: si*len(sample.QA) + qi + 1, totalQA: totalQA, // NOTE(review): only a true running index if every earlier sample has the same QA count; used for logging
+ qa: qa, contextText: contextText, sample: sample,
+ }, answerClient, judgeClient, "seahorse-llm")
+ qaResults[qi] = out.result
+ }
+
+ if concurrency <= 1 {
+ for qi := range sample.QA {
+ evalOne(qi)
+ }
+ } else {
+ sem := make(chan struct{}, concurrency)
+ var wg sync.WaitGroup
+ for qi := range sample.QA {
+ wg.Add(1)
+ go func() { // NOTE(review): closure captures qi; correct only with per-iteration loop variables (Go 1.22+) — confirm go.mod
+ defer wg.Done()
+ sem <- struct{}{} // acquire a slot; at most concurrency workers run at once
+ defer func() { <-sem }()
+ evalOne(qi)
+ }()
+ }
+ wg.Wait() // finish the whole sample before aggregating
+ }
+
+ results = append(results, EvalResult{
+ Mode: "seahorse-llm",
+ SampleID: sample.SampleID,
+ QAResults: qaResults,
+ Agg: aggregateMetrics(qaResults),
+ })
+ }
+ return results
+}
+
+func countTotalQA(samples []LocomoSample) int { // countTotalQA returns the number of QA items summed over all samples
+ n := 0
+ for i := range samples {
+ n += len(samples[i].QA)
+ }
+ return n
+}
+
+func truncateStr(s string, maxLen int) string { // truncateStr flattens newlines to spaces and caps s at maxLen runes, appending "..." when cut
+ s = strings.ReplaceAll(s, "\n", " ")
+ runes := []rune(s) // count runes, not bytes, so multi-byte characters are never split
+ if len(runes) > maxLen {
+ return string(runes[:maxLen]) + "..."
+ }
+ return s
+}
diff --git a/cmd/membench/eval_test.go b/cmd/membench/eval_test.go
index d500a38ca..32dea07c9 100644
--- a/cmd/membench/eval_test.go
+++ b/cmd/membench/eval_test.go
@@ -102,3 +102,81 @@ func TestComputeModeAgg(t *testing.T) {
t.Errorf("TotalQuestions = %d, want 10", got.TotalQuestions)
}
}
+
+func TestAggregateMetricsSentinel(t *testing.T) { // a -1.0 TokenF1 must be excluded from F1 but still counted for hit rate
+ qa := []QAResult{
+ {Category: 1, TokenF1: 0.8, HitRate: 0.5},
+ {Category: 1, TokenF1: -1.0, HitRate: 0.3}, // sentinel row
+ {Category: 1, TokenF1: 0.4, HitRate: 0.7},
+ }
+ agg := aggregateMetrics(qa)
+
+ if agg.ValidF1Count != 2 {
+ t.Errorf("ValidF1Count = %d, want 2", agg.ValidF1Count)
+ }
+ if agg.TotalQuestions != 3 {
+ t.Errorf("TotalQuestions = %d, want 3", agg.TotalQuestions)
+ }
+ wantF1 := (0.8 + 0.4) / 2.0 // mean over the two valid scores only
+ if math.Abs(agg.OverallF1-wantF1) > 1e-9 {
+ t.Errorf("OverallF1 = %.6f, want %.6f", agg.OverallF1, wantF1)
+ }
+ wantHR := (0.5 + 0.3 + 0.7) / 3.0 // hit rate averages all three rows
+ if math.Abs(agg.OverallHitRate-wantHR) > 1e-9 {
+ t.Errorf("OverallHitRate = %.6f, want %.6f", agg.OverallHitRate, wantHR)
+ }
+}
+
+func TestAggregateMetricsAllSentinel(t *testing.T) { // with no valid score, F1 must stay 0 rather than divide by zero
+ qa := []QAResult{
+ {Category: 1, TokenF1: -1.0, HitRate: 0.5},
+ {Category: 1, TokenF1: -1.0, HitRate: 0.3},
+ }
+ agg := aggregateMetrics(qa)
+
+ if agg.ValidF1Count != 0 {
+ t.Errorf("ValidF1Count = %d, want 0", agg.ValidF1Count)
+ }
+ if agg.OverallF1 != 0 { // exact comparison is fine: the value is never computed, only left at its zero value
+ t.Errorf("OverallF1 = %.6f, want 0", agg.OverallF1)
+ }
+}
+
+func TestComputeModeAggSentinelWeighting(t *testing.T) { // cross-sample F1 must be weighted by ValidF1Count, not TotalQuestions
+ results := []EvalResult{
+ {
+ Mode: "test", // both samples have a non-zero ValidF1Count, so the TotalQuestions fallback is not exercised
+ SampleID: "s1",
+ QAResults: []QAResult{
+ {Category: 1, TokenF1: 0.8, HitRate: 0.5},
+ {Category: 1, TokenF1: -1.0, HitRate: 0.3}, // sentinel row
+ },
+ },
+ {
+ Mode: "test",
+ SampleID: "s2",
+ QAResults: []QAResult{
+ {Category: 1, TokenF1: 0.4, HitRate: 0.6},
+ {Category: 1, TokenF1: 0.6, HitRate: 0.8},
+ },
+ },
+ }
+ for i := range results {
+ results[i].Agg = aggregateMetrics(results[i].QAResults)
+ }
+
+ got := computeModeAgg(results)
+
+ // s1: ValidF1Count=1, F1=0.8; s2: ValidF1Count=2, F1=0.5
+ // Weighted: (0.8*1 + 0.5*2) / 3 = 1.8/3 = 0.6
+ wantF1 := 0.6
+ if math.Abs(got.OverallF1-wantF1) > 1e-9 {
+ t.Errorf("OverallF1 = %.6f, want %.6f", got.OverallF1, wantF1)
+ }
+ if got.ValidF1Count != 3 {
+ t.Errorf("ValidF1Count = %d, want 3", got.ValidF1Count)
+ }
+ if got.TotalQuestions != 4 {
+ t.Errorf("TotalQuestions = %d, want 4", got.TotalQuestions)
+ }
+}
diff --git a/cmd/membench/llm_client.go b/cmd/membench/llm_client.go
new file mode 100644
index 000000000..6c62424da
--- /dev/null
+++ b/cmd/membench/llm_client.go
@@ -0,0 +1,198 @@
+package main
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "log"
+ "net/http"
+ "strings"
+ "time"
+)
+
+// LLMClient wraps an OpenAI-compatible chat completion endpoint.
+type LLMClient struct {
+ BaseURL string // API base URL; Complete appends "/chat/completions" to it
+ Model string // model name sent in the request's "model" field
+ APIKey string // sent as a Bearer Authorization header when non-empty
+ NoThinking bool // disable thinking: /no_think system-prompt tag plus llama.cpp, Ollama and GLM request fields
+ MaxRetries int // max retry attempts for transient errors (0 = no retry)
+ Client *http.Client // HTTP client used to send every request attempt
+}
+
+// LLMClientOptions configures the LLM client.
+type LLMClientOptions struct {
+ BaseURL string // trailing slashes are trimmed by NewLLMClient
+ Model string
+ APIKey string
+ Timeout time.Duration // HTTP client timeout; 0 selects the 120s default
+ NoThinking bool
+ MaxRetries int // max retry attempts; 0 = no retry, negative selects the default of 3
+}
+
+// NewLLMClient builds a client for an OpenAI-compatible chat completion API.
+// A zero Timeout becomes 120s; a negative MaxRetries becomes 3 (0 stays 0).
+func NewLLMClient(opts LLMClientOptions) *LLMClient {
+	timeout, retries := opts.Timeout, opts.MaxRetries
+	if timeout == 0 {
+		timeout = 120 * time.Second
+	}
+	if retries < 0 {
+		retries = 3
+	}
+	httpClient := &http.Client{Timeout: timeout}
+	return &LLMClient{
+		BaseURL:    strings.TrimRight(opts.BaseURL, "/"),
+		Model:      opts.Model,
+		APIKey:     opts.APIKey,
+		NoThinking: opts.NoThinking,
+		MaxRetries: retries,
+		Client:     httpClient,
+	}
+}
+
+type chatRequest struct { // JSON body POSTed to the /chat/completions endpoint
+ Model string `json:"model"`
+ Messages []chatMessage `json:"messages"` // optional system message followed by the user message
+ Temperature float64 `json:"temperature"` // Complete sends 0.1
+ MaxTokens int `json:"max_tokens"` // Complete sends 512
+ ChatTemplateKwargs map[string]any `json:"chat_template_kwargs,omitempty"` // llama.cpp: enable_thinking=false when NoThinking is set
+ Think *bool `json:"think,omitempty"` // Ollama; pointer so an explicit false is not dropped by omitempty
+ Thinking map[string]any `json:"thinking,omitempty"` // GLM (Zhipu): {"type": "disabled"} when NoThinking is set
+}
+
+type chatMessage struct { // one entry of the request's "messages" array
+ Role string `json:"role"` // Complete uses "system" and "user"
+ Content string `json:"content"` // message text
+}
+
+type chatResponse struct { // subset of the completion response that is decoded
+ Choices []struct { // Complete errors when empty and reads only Choices[0]
+ Message struct {
+ Content string `json:"content"` // assistant reply text
+ ReasoningContent string `json:"reasoning_content,omitempty"` // presumably separate reasoning output from thinking models; TODO confirm usage
+ } `json:"message"`
+ } `json:"choices"`
+}
+
+// Complete sends a chat completion request and returns the assistant's reply.
+func (c *LLMClient) Complete(ctx context.Context, systemPrompt, userPrompt string) (string, error) {
+ sysContent := systemPrompt
+ if c.NoThinking && sysContent != "" {
+ // Prepend /no_think tag — works with Ollama /v1 endpoint and
+ // Qwen chat templates where the JSON think field is ignored.
+ sysContent = "/no_think\n" + sysContent
+ }
+ messages := []chatMessage{}
+ if sysContent != "" {
+ messages = append(messages, chatMessage{Role: "system", Content: sysContent})
+ }
+ messages = append(messages, chatMessage{Role: "user", Content: userPrompt})
+
+ body := chatRequest{
+ Model: c.Model,
+ Messages: messages,
+ Temperature: 0.1,
+ MaxTokens: 512,
+ }
+ if c.NoThinking {
+ // llama.cpp: chat_template_kwargs
+ body.ChatTemplateKwargs = map[string]any{
+ "enable_thinking": false,
+ }
+ // Ollama (0.9+): think field
+ thinkFalse := false
+ body.Think = &thinkFalse
+ // GLM (智谱): thinking field
+ body.Thinking = map[string]any{
+ "type": "disabled",
+ }
+ }
+
+ jsonBody, err := json.Marshal(body)
+ if err != nil {
+ return "", fmt.Errorf("marshal request: %w", err)
+ }
+
+ endpoint := strings.TrimRight(c.BaseURL, "/") + "/chat/completions"
+ req, err := http.NewRequestWithContext(ctx, "POST", endpoint, bytes.NewReader(jsonBody))
+ if err != nil {
+ return "", fmt.Errorf("create request: %w", err)
+ }
+ req.Header.Set("Content-Type", "application/json")
+ if c.APIKey != "" {
+ req.Header.Set("Authorization", "Bearer "+c.APIKey)
+ }
+
+ var respBody []byte
+ var lastErr error
+ for attempt := 0; attempt <= c.MaxRetries; attempt++ {
+ if attempt > 0 {
+ backoff := time.Duration(1<<(attempt-1)) * time.Second // 1s, 2s, 4s, ...
+ log.Printf("LLM retry %d/%d after %v: %v", attempt, c.MaxRetries, backoff, lastErr)
+ select {
+ case <-ctx.Done():
+ return "", ctx.Err()
+ case <-time.After(backoff):
+ }
+ // Rebuild request (body reader is consumed)
+ req, err = http.NewRequestWithContext(ctx, "POST", endpoint, bytes.NewReader(jsonBody))
+ if err != nil {
+ return "", fmt.Errorf("create request: %w", err)
+ }
+ req.Header.Set("Content-Type", "application/json")
+ if c.APIKey != "" {
+ req.Header.Set("Authorization", "Bearer "+c.APIKey)
+ }
+ }
+
+ var resp *http.Response
+ resp, lastErr = c.Client.Do(req)
+ if lastErr != nil {
+ continue // network/timeout error → retry
+ }
+
+ respBody, lastErr = io.ReadAll(resp.Body)
+ resp.Body.Close()
+ if lastErr != nil {
+ continue
+ }
+
+ if resp.StatusCode == 429 || resp.StatusCode >= 500 {
+ lastErr = fmt.Errorf("API error %d: %s", resp.StatusCode, string(respBody))
+ continue // rate limit or server error → retry
+ }
+ if resp.StatusCode != 200 {
+ return "", fmt.Errorf("API error %d: %s", resp.StatusCode, string(respBody))
+ }
+
+ lastErr = nil
+ break
+ }
+ if lastErr != nil {
+ return "", fmt.Errorf("after %d retries: %w", c.MaxRetries, lastErr)
+ }
+
+ var chatResp chatResponse
+ if err := json.Unmarshal(respBody, &chatResp); err != nil {
+ return "", fmt.Errorf("parse response: %w", err)
+ }
+ if len(chatResp.Choices) == 0 {
+ return "", fmt.Errorf("no choices in response")
+ }
+ content := strings.TrimSpace(chatResp.Choices[0].Message.Content)
+ // Strip any residual