Merge pull request #436 from Huaaudio/feat/base-layer-message-split

Refactor/base layer message split from #143
This commit is contained in:
Hua Audio
2026-02-18 23:29:29 +01:00
committed by GitHub
3 changed files with 331 additions and 128 deletions
+1 -128
View File
@@ -4,7 +4,6 @@ import (
"context"
"fmt"
"os"
"strings"
"time"
"github.com/bwmarrin/discordgo"
@@ -106,7 +105,7 @@ func (c *DiscordChannel) Send(ctx context.Context, msg bus.OutboundMessage) erro
return nil
}
chunks := splitMessage(msg.Content, 1500) // Discord has a limit of 2000 characters per message, leave 500 for natural split e.g. code blocks
chunks := utils.SplitMessage(msg.Content, 2000) // Split messages into chunks, Discord length limit: 2000 chars
for _, chunk := range chunks {
if err := c.sendChunk(ctx, channelID, chunk); err != nil {
@@ -117,132 +116,6 @@ func (c *DiscordChannel) Send(ctx context.Context, msg bus.OutboundMessage) erro
return nil
}
// splitMessage breaks content into chunks of roughly limit bytes.
// It prefers to cut at a newline (searched in the last 200 bytes before the
// limit), then at a space or tab (last 100 bytes), and only then at the limit
// itself. When the chosen cut would leave a ``` fence open, the chunk is
// allowed to grow by up to 500 bytes so the closing fence stays with it;
// if that is not enough, the cut is moved to before the opening fence.
// Every chunk after the first has surrounding whitespace trimmed.
func splitMessage(content string, limit int) []string {
	// Extra room granted to a chunk so that an open code block can be closed.
	const fenceSlack = 500

	// softBreak returns a newline position if one is found past index 0,
	// otherwise whatever the space search yields (which may be <= 0).
	softBreak := func(segment string) int {
		if pos := findLastNewline(segment, 200); pos > 0 {
			return pos
		}
		return findLastSpace(segment, 100)
	}

	var messages []string
	for content != "" {
		if len(content) <= limit {
			return append(messages, content)
		}

		cut := softBreak(content[:limit])
		if cut <= 0 {
			cut = limit
		}

		if openAt := findLastUnclosedCodeBlock(content[:cut]); openAt >= 0 {
			ceiling := limit + fenceSlack
			if len(content) <= ceiling {
				// Everything that is left fits once the slack is granted.
				cut = len(content)
			} else if closeAt := findNextClosingCodeBlock(content, cut); closeAt > 0 && closeAt <= ceiling {
				// The closing fence is reachable: keep the block in one piece.
				cut = closeAt
			} else {
				// The block cannot be completed here; cut ahead of its opening fence.
				cut = softBreak(content[:openAt])
				if cut <= 0 {
					cut = openAt
				}
			}
		}

		// A fence at the very start yields cut == 0; fall back to a hard cut.
		if cut <= 0 {
			cut = limit
		}

		messages = append(messages, content[:cut])
		content = strings.TrimSpace(content[cut:])
	}
	return messages
}
// findLastUnclosedCodeBlock reports where the code block that is still open
// at the end of text begins.
//
// Every ``` toggles between "outside" and "inside" a block. The index of the
// fence that most recently opened a block is remembered, so for text such as
// "```a``` x ```b" the result is the position of the third fence, not the
// first one.
//
// Returns the byte index of the opening ``` of the unclosed block, or -1 when
// every fence in text is paired.
func findLastUnclosedCodeBlock(text string) int {
	inCodeBlock := false
	lastOpenIdx := -1
	for i := 0; i+2 < len(text); i++ {
		if text[i] != '`' || text[i+1] != '`' || text[i+2] != '`' {
			continue
		}
		if !inCodeBlock {
			// Entering a block: this fence is the candidate "unclosed" opener.
			lastOpenIdx = i
		}
		inCodeBlock = !inCodeBlock
		i += 2 // skip the remaining two backticks of this fence
	}
	if inCodeBlock {
		return lastOpenIdx
	}
	return -1
}
// findNextClosingCodeBlock looks for the first ``` at or after startIdx.
// It returns the index just past that fence, or -1 when text contains no
// fence from startIdx onwards.
func findNextClosingCodeBlock(text string, startIdx int) int {
	// Last index at which a three-byte fence can still begin.
	lastStart := len(text) - 3
	for i := startIdx; i <= lastStart; i++ {
		if text[i] != '`' || text[i+1] != '`' || text[i+2] != '`' {
			continue
		}
		return i + 3
	}
	return -1
}
// findLastNewline scans the final searchWindow bytes of s backwards for '\n'.
// It returns the index of the last newline in that window, or -1 if the
// window holds none.
func findLastNewline(s string, searchWindow int) int {
	lowest := len(s) - searchWindow
	if lowest < 0 {
		lowest = 0
	}

	i := len(s) - 1
	for i >= lowest && s[i] != '\n' {
		i--
	}
	if i < lowest {
		return -1
	}
	return i
}
// findLastSpace scans the final searchWindow bytes of s backwards for a space
// or a tab. It returns the index of the last such byte in that window, or -1
// if the window holds none.
func findLastSpace(s string, searchWindow int) int {
	lowest := len(s) - searchWindow
	if lowest < 0 {
		lowest = 0
	}

	for i := len(s) - 1; i >= lowest; i-- {
		switch s[i] {
		case ' ', '\t':
			return i
		}
	}
	return -1
}
func (c *DiscordChannel) sendChunk(ctx context.Context, channelID, content string) error {
// 使用传入的 ctx 进行超时控制
sendCtx, cancel := context.WithTimeout(ctx, sendTimeout)
+179
View File
@@ -0,0 +1,179 @@
package utils
import (
"strings"
)
// SplitMessage splits content into chunks of at most maxLen bytes while trying
// to keep fenced (```) code blocks intact.
//
// Strategy, per chunk:
//   - A buffer (10% of maxLen, at least 50, at most maxLen/2) is reserved, and
//     a natural cut (newline, then space/tab) is searched below that reduced
//     limit. With no natural cut available the text is cut at the reduced
//     limit, backed off to a UTF-8 rune boundary.
//   - If the cut would leave a code block open and its closing fence ends
//     within maxLen, the chunk is extended to include the fence.
//   - Otherwise the block is split: the chunk is closed with "\n```" and the
//     next chunk is re-opened with the block's header line (e.g. "```go"), or
//     the cut is moved to before the block.
//
// Lengths are measured in bytes. Chunks after the first have surrounding
// whitespace trimmed. An empty content yields no chunks. A non-positive maxLen
// cannot be honoured, so content is returned unsplit as a single chunk.
func SplitMessage(content string, maxLen int) []string {
	if content == "" {
		return nil
	}
	if maxLen <= 0 {
		// No cut position can make progress; returning the text whole is
		// preferable to looping forever.
		return []string{content}
	}

	// Dynamic buffer: 10% of maxLen, but at least 50 chars if possible.
	codeBlockBuffer := maxLen / 10
	if codeBlockBuffer < 50 {
		codeBlockBuffer = 50
	}
	if codeBlockBuffer > maxLen/2 {
		codeBlockBuffer = maxLen / 2
	}

	// Effective split point: maxLen minus buffer, to leave room for code blocks.
	// It only depends on maxLen, so it is computed once.
	effectiveLimit := maxLen - codeBlockBuffer
	if effectiveLimit < maxLen/2 {
		effectiveLimit = maxLen / 2
	}

	var messages []string
	for len(content) > 0 {
		if len(content) <= maxLen {
			messages = append(messages, content)
			break
		}

		// Find a natural split point within the effective limit. Newlines and
		// spaces are single ASCII bytes, so they are always rune boundaries;
		// only the hard fallback needs adjusting.
		msgEnd := findLastNewline(content[:effectiveLimit], 200)
		if msgEnd <= 0 {
			msgEnd = findLastSpace(content[:effectiveLimit], 100)
		}
		if msgEnd <= 0 {
			msgEnd = runeSafeCut(content, effectiveLimit)
		}

		// Check whether the chunk would end inside an unclosed code block.
		if unclosedIdx := findLastUnclosedCodeBlock(content[:msgEnd]); unclosedIdx >= 0 {
			closingIdx := findNextClosingCodeBlock(content, msgEnd)
			if closingIdx > 0 && closingIdx <= maxLen {
				// The closing fence fits: extend the chunk to include it.
				msgEnd = closingIdx
			} else {
				// The block is too long for one chunk, or never closes.
				// Determine its header line (fence plus optional language tag).
				headerEnd := strings.Index(content[unclosedIdx:], "\n")
				if headerEnd == -1 {
					headerEnd = unclosedIdx + 3
				} else {
					headerEnd += unclosedIdx
				}
				header := strings.TrimSpace(content[unclosedIdx:headerEnd])

				// fenceCut > 0 means "split inside the block at this index".
				fenceCut := -1
				if msgEnd > headerEnd+20 {
					// Enough block content precedes the cut: split inside,
					// as close to maxLen as possible.
					innerLimit := maxLen - 5 // leave room for "\n```"
					fenceCut = innerLimit
					if betterEnd := findLastNewline(content[:innerLimit], 200); betterEnd > headerEnd {
						fenceCut = betterEnd
					}
				} else {
					// Otherwise prefer to end the chunk before the block starts.
					newEnd := findLastNewline(content[:unclosedIdx], 200)
					if newEnd <= 0 {
						newEnd = findLastSpace(content[:unclosedIdx], 100)
					}
					switch {
					case newEnd > 0:
						msgEnd = newEnd
					case unclosedIdx > 20:
						msgEnd = unclosedIdx
					default:
						// Nothing worthwhile precedes the block: split inside it.
						fenceCut = maxLen - 5
					}
				}

				if fenceCut > 0 {
					if chunk, rest, ok := splitInsideFence(content, fenceCut, headerEnd, header); ok {
						messages = append(messages, chunk)
						content = rest
						continue
					}
					// A fenced split would not shrink the remaining text
					// (tiny maxLen); fall through to a plain cut at msgEnd.
				}
			}
		}

		if msgEnd <= 0 {
			msgEnd = runeSafeCut(content, effectiveLimit)
		}

		messages = append(messages, content[:msgEnd])
		content = strings.TrimSpace(content[msgEnd:])
	}

	return messages
}

// splitInsideFence cuts content at cut, which lies inside an open code block.
// chunk is the text before the cut closed with "\n```"; rest is the text after
// the cut re-opened with header. When the block's opening line is not complete
// before the cut (headerEnd > cut), that line is ordinary content rather than
// a language tag, so a bare "```" is used instead of repeating it.
// ok is false when no variant makes rest shorter than content, in which case
// the caller must split some other way to guarantee termination.
func splitInsideFence(content string, cut, headerEnd int, header string) (chunk, rest string, ok bool) {
	const bareFence = "```"

	cut = runeSafeCut(content, cut)
	if cut <= 0 || cut >= len(content) {
		return "", "", false
	}
	if headerEnd > cut {
		header = bareFence
	}

	chunk = strings.TrimRight(content[:cut], " \t\n\r") + "\n" + bareFence
	rest = strings.TrimSpace(header + "\n" + content[cut:])
	if len(rest) >= len(content) && header != bareFence {
		// Re-injecting the header ate all the progress; drop the language tag.
		rest = strings.TrimSpace(bareFence + "\n" + content[cut:])
	}
	if len(rest) >= len(content) {
		return "", "", false
	}
	return chunk, rest, true
}

// runeSafeCut moves idx back to the start of the UTF-8 sequence it falls in,
// so that s[:idx] does not end with a truncated rune. A UTF-8 rune is at most
// four bytes, so at most three steps back are tried. idx is returned unchanged
// when it is out of range, when backing off would reach index 0 (the cut must
// stay positive to make progress), or when no rune start is found nearby
// (the bytes are not valid UTF-8 anyway).
func runeSafeCut(s string, idx int) int {
	if idx <= 0 || idx >= len(s) {
		return idx
	}
	cut := idx
	// 0b10xxxxxx marks a UTF-8 continuation byte.
	for cut > 0 && idx-cut < 3 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	if cut > 0 && s[cut]&0xC0 != 0x80 {
		return cut
	}
	return idx
}
// findLastUnclosedCodeBlock walks text fence by fence, treating every ``` as a
// toggle between "outside" and "inside" a code block.
// It returns the index of the fence that opened the block still open at the
// end of text, or -1 when all fences are paired.
func findLastUnclosedCodeBlock(text string) int {
	const fence = "```"

	// openAt holds the opening fence index while inside a block, -1 outside.
	openAt := -1
	for pos := 0; pos+len(fence) <= len(text); {
		if text[pos:pos+len(fence)] != fence {
			pos++
			continue
		}
		if openAt < 0 {
			openAt = pos // entering a block
		} else {
			openAt = -1 // leaving the current block
		}
		pos += len(fence)
	}
	return openAt
}
// findNextClosingCodeBlock searches text for the first ``` that begins at or
// after startIdx.
// It returns the index immediately after that fence, or -1 if there is none.
func findNextClosingCodeBlock(text string, startIdx int) int {
	const fence = "```"
	for pos := startIdx; pos+len(fence) <= len(text); pos++ {
		if text[pos:pos+len(fence)] == fence {
			return pos + len(fence)
		}
	}
	return -1
}
// findLastNewline returns the index of the last '\n' located in the trailing
// searchWindow bytes of s, or -1 when that window contains no newline.
func findLastNewline(s string, searchWindow int) int {
	// floor is the first index that still belongs to the search window.
	floor := len(s) - searchWindow
	if floor < 0 {
		floor = 0
	}
	for end := len(s); end > floor; end-- {
		if s[end-1] == '\n' {
			return end - 1
		}
	}
	return -1
}
// findLastSpace returns the index of the last space or tab located in the
// trailing searchWindow bytes of s, or -1 when that window contains neither.
func findLastSpace(s string, searchWindow int) int {
	// floor is the first index that still belongs to the search window.
	floor := len(s) - searchWindow
	if floor < 0 {
		floor = 0
	}
	for end := len(s); end > floor; end-- {
		if c := s[end-1]; c == ' ' || c == '\t' {
			return end - 1
		}
	}
	return -1
}
+151
View File
@@ -0,0 +1,151 @@
package utils
import (
"strings"
"testing"
)
// TestSplitMessage is a table-driven test of SplitMessage. Each case states
// the expected number of chunks and may supply checkContent for assertions on
// the chunks themselves; checkContent only runs when the count matches.
func TestSplitMessage(t *testing.T) {
	longText := strings.Repeat("a", 2500)
	longCode := "```go\n" + strings.Repeat("fmt.Println(\"hello\")\n", 100) + "```" // ~2100 chars
	unicodeText := strings.Repeat("\u4e16", 1000)                                   // 3000 bytes

	tests := []struct {
		name         string
		content      string
		maxLen       int
		expectChunks int                                   // Check number of chunks
		checkContent func(t *testing.T, chunks []string) // Custom validation
	}{
		{
			name:         "Empty message",
			content:      "",
			maxLen:       2000,
			expectChunks: 0,
		},
		{
			name:         "Short message fits in one chunk",
			content:      "Hello world",
			maxLen:       2000,
			expectChunks: 1,
		},
		{
			name:         "Simple split regular text",
			content:      longText,
			maxLen:       2000,
			expectChunks: 2,
			checkContent: func(t *testing.T, chunks []string) {
				if len(chunks[0]) > 2000 {
					t.Errorf("Chunk 0 too large: %d", len(chunks[0]))
				}
				if len(chunks[0])+len(chunks[1]) != len(longText) {
					t.Errorf("Total length mismatch. Got %d, want %d", len(chunks[0])+len(chunks[1]), len(longText))
				}
			},
		},
		{
			name: "Split at newline",
			// 1750 chars then newline, then more chars.
			// Dynamic buffer: 2000 / 10 = 200.
			// Effective limit: 2000 - 200 = 1800.
			// Split should happen at newline because it's at 1750 (< 1800).
			// Total length must > 2000 to trigger split. 1750 + 1 + 300 = 2051.
			content:      strings.Repeat("a", 1750) + "\n" + strings.Repeat("b", 300),
			maxLen:       2000,
			expectChunks: 2,
			checkContent: func(t *testing.T, chunks []string) {
				if len(chunks[0]) != 1750 {
					t.Errorf("Expected chunk 0 to be 1750 length (split at newline), got %d", len(chunks[0]))
				}
				if chunks[1] != strings.Repeat("b", 300) {
					t.Errorf("Chunk 1 content mismatch. Len: %d", len(chunks[1]))
				}
			},
		},
		{
			name:         "Long code block split",
			content:      "Prefix\n" + longCode,
			maxLen:       2000,
			expectChunks: 2,
			checkContent: func(t *testing.T, chunks []string) {
				// Check that first chunk ends with closing fence
				if !strings.HasSuffix(chunks[0], "\n```") {
					t.Error("First chunk should end with injected closing fence")
				}
				// Check that second chunk starts with the re-injected header
				if !strings.HasPrefix(chunks[1], "```go") {
					t.Error("Second chunk should start with injected code block header")
				}
			},
		},
		{
			name:         "Preserve Unicode characters",
			content:      unicodeText,
			maxLen:       2000,
			expectChunks: 2,
			checkContent: func(t *testing.T, chunks []string) {
				// SplitMessage slices by byte offset, so a cut can land in the
				// middle of a multi-byte rune. Verify that every chunk is
				// still valid UTF-8 and that no bytes were lost or duplicated.
				for i, chunk := range chunks {
					if len(chunk) > 2000 {
						t.Errorf("Chunk %d too large: %d", i, len(chunk))
					}
					// ToValidUTF8 returns its input unchanged only when the
					// input contains no invalid byte sequences.
					if strings.ToValidUTF8(chunk, "") != chunk {
						t.Errorf("Chunk %d contains a truncated UTF-8 sequence", i)
					}
				}
				// The content has no whitespace, so trimming removes nothing
				// and the chunks must concatenate back to the input.
				if strings.Join(chunks, "") != unicodeText {
					t.Error("Chunks should reassemble into the original text")
				}
			},
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			got := SplitMessage(tc.content, tc.maxLen)

			if tc.expectChunks == 0 {
				if len(got) != 0 {
					t.Errorf("Expected 0 chunks, got %d", len(got))
				}
				return
			}

			if len(got) != tc.expectChunks {
				t.Errorf("Expected %d chunks, got %d", tc.expectChunks, len(got))
				// Log sizes for debugging
				for i, c := range got {
					t.Logf("Chunk %d length: %d", i, len(c))
				}
				return // checkContent indexes chunks by position; skip it on a count mismatch
			}

			if tc.checkContent != nil {
				tc.checkContent(t, got)
			}
		})
	}
}
// TestSplitMessage_CodeBlockIntegrity checks the core requirement for code
// blocks: when a split has to happen inside a fenced block, the first chunk is
// closed with a fence and the second chunk re-opens the block with the
// original header, so syntax highlighting survives the split.
func TestSplitMessage_CodeBlockIntegrity(t *testing.T) {
	const (
		// 57 bytes in total, so it cannot fit in a single 40-byte chunk.
		content = "```go\npackage main\n\nfunc main() {\n\tprintln(\"Hello\")\n}\n```"
		maxLen  = 40
	)

	chunks := SplitMessage(content, maxLen)
	if got := len(chunks); got != 2 {
		t.Fatalf("Expected 2 chunks, got %d: %q", got, chunks)
	}
	head, tail := chunks[0], chunks[1]

	// The leading chunk has to be terminated by an injected "\n```".
	if !strings.HasSuffix(head, "\n```") {
		t.Errorf("First chunk should end with closing fence. Got: %q", head)
	}

	// The trailing chunk has to begin with the re-injected "```go" header.
	if !strings.HasPrefix(tail, "```go") {
		t.Errorf("Second chunk should start with code block header. Got: %q", tail)
	}

	// Injecting the closing fence must not push the chunk over the limit.
	if size := len(head); size > maxLen {
		t.Errorf("First chunk exceeded maxLen: length %d", size)
	}
}