mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
fix(telegram): improve HTML chunking and preserve word boundaries (#1651)
* fix(telegram): improve HTML chunking and preserve word boundaries
* fix(telegram): address Copilot feedback, filter empty chunks and add word-boundary regression test
* style(telegram): fix gofmt and gci lint errors in tests
* fix to feedback
This commit is contained in:
@@ -191,15 +191,44 @@ func (c *TelegramChannel) Send(ctx context.Context, msg bus.OutboundMessage) err
|
||||
htmlContent := markdownToTelegramHTML(chunk)
|
||||
|
||||
if len([]rune(htmlContent)) > 4096 {
|
||||
ratio := float64(len([]rune(chunk))) / float64(len([]rune(htmlContent)))
|
||||
runeChunk := []rune(chunk)
|
||||
ratio := float64(len(runeChunk)) / float64(len([]rune(htmlContent)))
|
||||
smallerLen := int(float64(4096) * ratio * 0.95) // 5% safety margin
|
||||
if smallerLen < 100 {
|
||||
smallerLen = 100
|
||||
|
||||
// Guarantee progress: if estimated length is >= chunk length, force it smaller
|
||||
if smallerLen >= len(runeChunk) {
|
||||
smallerLen = len(runeChunk) - 1
|
||||
}
|
||||
// Push sub-chunks back to the front of the queue for
|
||||
// re-validation instead of sending them blindly.
|
||||
|
||||
if smallerLen <= 0 {
|
||||
if err := c.sendHTMLChunk(ctx, chatID, threadID, htmlContent, chunk, replyToID); err != nil {
|
||||
return err
|
||||
}
|
||||
replyToID = ""
|
||||
continue
|
||||
}
|
||||
|
||||
// Use the estimated smaller length as a guide for SplitMessage.
|
||||
// SplitMessage will find natural break points (newlines/spaces) and respect code blocks.
|
||||
subChunks := channels.SplitMessage(chunk, smallerLen)
|
||||
queue = append(subChunks, queue...)
|
||||
|
||||
// Safety fallback: If SplitMessage failed to shorten the chunk, force a manual hard split.
|
||||
if len(subChunks) == 1 && subChunks[0] == chunk {
|
||||
part1 := string(runeChunk[:smallerLen])
|
||||
part2 := string(runeChunk[smallerLen:])
|
||||
subChunks = []string{part1, part2}
|
||||
}
|
||||
|
||||
// Filter out empty chunks to avoid sending empty messages to Telegram.
|
||||
nonEmpty := make([]string, 0, len(subChunks))
|
||||
for _, s := range subChunks {
|
||||
if s != "" {
|
||||
nonEmpty = append(nonEmpty, s)
|
||||
}
|
||||
}
|
||||
|
||||
// Push sub-chunks back to the front of the queue
|
||||
queue = append(nonEmpty, queue...)
|
||||
continue
|
||||
}
|
||||
|
||||
|
||||
@@ -47,7 +47,14 @@ type multipartCall struct {
|
||||
}
|
||||
|
||||
func (s *stubConstructor) JSONRequest(parameters any) (*ta.RequestData, error) {
|
||||
return &ta.RequestData{}, nil
|
||||
b, err := json.Marshal(parameters)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &ta.RequestData{
|
||||
ContentType: "application/json",
|
||||
BodyRaw: b,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *stubConstructor) MultipartRequest(
|
||||
@@ -367,6 +374,55 @@ func TestSend_MarkdownShortButHTMLLong_MultipleCalls(t *testing.T) {
|
||||
)
|
||||
}
|
||||
|
||||
func TestSend_HTMLOverflow_WordBoundary(t *testing.T) {
|
||||
caller := &stubCaller{
|
||||
callFn: func(ctx context.Context, url string, data *ta.RequestData) (*ta.Response, error) {
|
||||
return successResponse(t), nil
|
||||
},
|
||||
}
|
||||
ch := newTestChannel(t, caller)
|
||||
|
||||
// We want to force a split near index ~2600 while keeping markdown length <= 4000.
|
||||
// Prefix of 430 bold units (6 chars each) = 2580 chars.
|
||||
// Expansion per unit is +3 chars when converted to HTML, so 2580 + 430*3 = 3870.
|
||||
prefix := strings.Repeat("**a** ", 430)
|
||||
targetWord := "TARGETWORDTHATSTAYSTOGETHER"
|
||||
// Suffix of 230 bold units (6 chars each) = 1380 chars.
|
||||
// Total markdown length: 2580 (prefix) + 27 (target word) + 1380 (suffix) = 3987 <= 4000.
|
||||
// HTML expansion adds ~3 chars per bold unit: (430 + 230)*3 = 1980 extra chars,
|
||||
// so total HTML length comfortably exceeds 4096.
|
||||
suffix := strings.Repeat(" **b**", 230)
|
||||
content := prefix + targetWord + suffix
|
||||
|
||||
// Ensure the test content matches the intended boundary conditions.
|
||||
assert.LessOrEqual(t, len([]rune(content)), 4000, "markdown content must not exceed chunk size for this test")
|
||||
|
||||
err := ch.Send(context.Background(), bus.OutboundMessage{
|
||||
ChatID: "123456",
|
||||
Content: content,
|
||||
})
|
||||
|
||||
assert.NoError(t, err)
|
||||
|
||||
foundFullWord := false
|
||||
for i, call := range caller.calls {
|
||||
var params map[string]any
|
||||
err := json.Unmarshal(call.Data.BodyRaw, ¶ms)
|
||||
require.NoError(t, err)
|
||||
text, _ := params["text"].(string)
|
||||
|
||||
hasWord := strings.Contains(text, targetWord)
|
||||
t.Logf("Chunk %d length: %d, contains target word: %v", i, len(text), hasWord)
|
||||
|
||||
if hasWord {
|
||||
foundFullWord = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
assert.True(t, foundFullWord, "The target word should not be split between chunks")
|
||||
}
|
||||
|
||||
func TestSend_NotRunning(t *testing.T) {
|
||||
caller := &stubCaller{
|
||||
callFn: func(ctx context.Context, url string, data *ta.RequestData) (*ta.Response, error) {
|
||||
|
||||
Reference in New Issue
Block a user