fix(telegram): improve HTML chunking and preserve word boundaries (#1651)

* fix(telegram): improve HTML chunking and preserve word boundaries

* fix(telegram): address copilot feedback, filter empty chunks and add word-boundary regression test

* style(telegram): fix gofmt and gci lint errors in tests

* fix to feedback
This commit is contained in:
badgerbees
2026-03-18 15:44:30 +07:00
committed by GitHub
parent 363861c917
commit a1e8ee56f0
2 changed files with 92 additions and 7 deletions
+35 -6
View File
@@ -191,15 +191,44 @@ func (c *TelegramChannel) Send(ctx context.Context, msg bus.OutboundMessage) err
htmlContent := markdownToTelegramHTML(chunk)
if len([]rune(htmlContent)) > 4096 {
ratio := float64(len([]rune(chunk))) / float64(len([]rune(htmlContent)))
runeChunk := []rune(chunk)
ratio := float64(len(runeChunk)) / float64(len([]rune(htmlContent)))
smallerLen := int(float64(4096) * ratio * 0.95) // 5% safety margin
if smallerLen < 100 {
smallerLen = 100
// Guarantee progress: if estimated length is >= chunk length, force it smaller
if smallerLen >= len(runeChunk) {
smallerLen = len(runeChunk) - 1
}
// Push sub-chunks back to the front of the queue for
// re-validation instead of sending them blindly.
if smallerLen <= 0 {
if err := c.sendHTMLChunk(ctx, chatID, threadID, htmlContent, chunk, replyToID); err != nil {
return err
}
replyToID = ""
continue
}
// Use the estimated smaller length as a guide for SplitMessage.
// SplitMessage will find natural break points (newlines/spaces) and respect code blocks.
subChunks := channels.SplitMessage(chunk, smallerLen)
queue = append(subChunks, queue...)
// Safety fallback: If SplitMessage failed to shorten the chunk, force a manual hard split.
if len(subChunks) == 1 && subChunks[0] == chunk {
part1 := string(runeChunk[:smallerLen])
part2 := string(runeChunk[smallerLen:])
subChunks = []string{part1, part2}
}
// Filter out empty chunks to avoid sending empty messages to Telegram.
nonEmpty := make([]string, 0, len(subChunks))
for _, s := range subChunks {
if s != "" {
nonEmpty = append(nonEmpty, s)
}
}
// Push sub-chunks back to the front of the queue
queue = append(nonEmpty, queue...)
continue
}
+57 -1
View File
@@ -47,7 +47,14 @@ type multipartCall struct {
}
func (s *stubConstructor) JSONRequest(parameters any) (*ta.RequestData, error) {
return &ta.RequestData{}, nil
b, err := json.Marshal(parameters)
if err != nil {
return nil, err
}
return &ta.RequestData{
ContentType: "application/json",
BodyRaw: b,
}, nil
}
func (s *stubConstructor) MultipartRequest(
@@ -367,6 +374,55 @@ func TestSend_MarkdownShortButHTMLLong_MultipleCalls(t *testing.T) {
)
}
func TestSend_HTMLOverflow_WordBoundary(t *testing.T) {
caller := &stubCaller{
callFn: func(ctx context.Context, url string, data *ta.RequestData) (*ta.Response, error) {
return successResponse(t), nil
},
}
ch := newTestChannel(t, caller)
// We want to force a split near index ~2600 while keeping markdown length <= 4000.
// Prefix of 430 bold units (6 chars each) = 2580 chars.
// Expansion per unit is +3 chars when converted to HTML, so 2580 + 430*3 = 3870.
prefix := strings.Repeat("**a** ", 430)
targetWord := "TARGETWORDTHATSTAYSTOGETHER"
// Suffix of 230 bold units (6 chars each) = 1380 chars.
// Total markdown length: 2580 (prefix) + 27 (target word) + 1380 (suffix) = 3987 <= 4000.
// HTML expansion adds ~3 chars per bold unit: (430 + 230)*3 = 1980 extra chars,
// so total HTML length comfortably exceeds 4096.
suffix := strings.Repeat(" **b**", 230)
content := prefix + targetWord + suffix
// Ensure the test content matches the intended boundary conditions.
assert.LessOrEqual(t, len([]rune(content)), 4000, "markdown content must not exceed chunk size for this test")
err := ch.Send(context.Background(), bus.OutboundMessage{
ChatID: "123456",
Content: content,
})
assert.NoError(t, err)
foundFullWord := false
for i, call := range caller.calls {
var params map[string]any
err := json.Unmarshal(call.Data.BodyRaw, &params)
require.NoError(t, err)
text, _ := params["text"].(string)
hasWord := strings.Contains(text, targetWord)
t.Logf("Chunk %d length: %d, contains target word: %v", i, len(text), hasWord)
if hasWord {
foundFullWord = true
break
}
}
assert.True(t, foundFullWord, "The target word should not be split between chunks")
}
func TestSend_NotRunning(t *testing.T) {
caller := &stubCaller{
callFn: func(ctx context.Context, url string, data *ta.RequestData) (*ta.Response, error) {