fix(telegram): improve HTML chunking and preserve word boundaries (#1651)

* fix(telegram): improve HTML chunking and preserve word boundaries
* fix(telegram): address Copilot feedback, filter empty chunks, and add word-boundary regression test
* style(telegram): fix gofmt and gci lint errors in tests
* fix: address feedback
2026-06-12 18:08:54 +00:00 · 2026-03-18 15:44:30 +07:00
parent 363861c917
commit a1e8ee56f0
2 changed files with 92 additions and 7 deletions
@@ -191,15 +191,44 @@ func (c *TelegramChannel) Send(ctx context.Context, msg bus.OutboundMessage) err
 		htmlContent := markdownToTelegramHTML(chunk)

 		if len([]rune(htmlContent)) > 4096 {
-			ratio := float64(len([]rune(chunk))) / float64(len([]rune(htmlContent)))
+			runeChunk := []rune(chunk)
+			ratio := float64(len(runeChunk)) / float64(len([]rune(htmlContent)))
 			smallerLen := int(float64(4096) * ratio * 0.95) // 5% safety margin
-			if smallerLen < 100 {
-				smallerLen = 100
+
+			// Guarantee progress: if estimated length is >= chunk length, force it smaller
+			if smallerLen >= len(runeChunk) {
+				smallerLen = len(runeChunk) - 1
 			}
-			// Push sub-chunks back to the front of the queue for
-			// re-validation instead of sending them blindly.
+
+			if smallerLen <= 0 {
+				if err := c.sendHTMLChunk(ctx, chatID, threadID, htmlContent, chunk, replyToID); err != nil {
+					return err
+				}
+				replyToID = ""
+				continue
+			}
+
+			// Use the estimated smaller length as a guide for SplitMessage.
+			// SplitMessage will find natural break points (newlines/spaces) and respect code blocks.
 			subChunks := channels.SplitMessage(chunk, smallerLen)
-			queue = append(subChunks, queue...)
+
+			// Safety fallback: If SplitMessage failed to shorten the chunk, force a manual hard split.
+			if len(subChunks) == 1 && subChunks[0] == chunk {
+				part1 := string(runeChunk[:smallerLen])
+				part2 := string(runeChunk[smallerLen:])
+				subChunks = []string{part1, part2}
+			}
+
+			// Filter out empty chunks to avoid sending empty messages to Telegram.
+			nonEmpty := make([]string, 0, len(subChunks))
+			for _, s := range subChunks {
+				if s != "" {
+					nonEmpty = append(nonEmpty, s)
+				}
+			}
+
+			// Push sub-chunks back to the front of the queue
+			queue = append(nonEmpty, queue...)
 			continue
 		}

@@ -47,7 +47,14 @@ type multipartCall struct {
 }

 func (s *stubConstructor) JSONRequest(parameters any) (*ta.RequestData, error) {
-	return &ta.RequestData{}, nil
+	// Encode the parameters as the raw JSON body of the returned request.
+	payload, marshalErr := json.Marshal(parameters)
+	if marshalErr != nil {
+		return nil, marshalErr
+	}
+
+	req := &ta.RequestData{ContentType: "application/json", BodyRaw: payload}
+	return req, nil
 }

 func (s *stubConstructor) MultipartRequest(
@@ -367,6 +374,55 @@ func TestSend_MarkdownShortButHTMLLong_MultipleCalls(t *testing.T) {
 	)
 }

+// TestSend_HTMLOverflow_WordBoundary checks that when the HTML rendering of a
+// chunk overflows and the chunk is re-split, a long word is kept in one message.
+func TestSend_HTMLOverflow_WordBoundary(t *testing.T) {
+	caller := &stubCaller{
+		callFn: func(ctx context.Context, url string, data *ta.RequestData) (*ta.Response, error) {
+			return successResponse(t), nil
+		},
+	}
+	ch := newTestChannel(t, caller)
+
+	// We want to force a split near index ~2600 while keeping markdown length <= 4000.
+	// Prefix of 430 bold units (6 chars each) = 2580 chars.
+	// Expansion per unit is +3 chars when converted to HTML, so 2580 + 430*3 = 3870.
+	prefix := strings.Repeat("**a** ", 430)
+	targetWord := "TARGETWORDTHATSTAYSTOGETHER"
+	// Suffix of 230 bold units (6 chars each) = 1380 chars.
+	// Total markdown length: 2580 (prefix) + 27 (target word) + 1380 (suffix) = 3987 <= 4000.
+	// HTML expansion adds ~3 chars per bold unit: (430 + 230)*3 = 1980 extra chars,
+	// so total HTML length comfortably exceeds 4096.
+	suffix := strings.Repeat(" **b**", 230)
+	content := prefix + targetWord + suffix
+
+	// Abort early if the fixture no longer matches the intended boundary conditions.
+	require.LessOrEqual(t, len([]rune(content)), 4000, "markdown content must not exceed chunk size for this test")
+
+	err := ch.Send(context.Background(), bus.OutboundMessage{ChatID: "123456", Content: content})
+	require.NoError(t, err)
+
+	// Without a re-split the word is trivially intact, so insist on several sends.
+	require.Greater(t, len(caller.calls), 1, "HTML overflow should split the message into multiple sends")
+
+	foundFullWord := false
+	for i, call := range caller.calls {
+		var params map[string]any
+		require.NoError(t, json.Unmarshal(call.Data.BodyRaw, &params))
+		// A missing or non-string "text" leaves text empty and simply cannot match.
+		text, _ := params["text"].(string)
+
+		hasWord := strings.Contains(text, targetWord)
+		t.Logf("Chunk %d length: %d, contains target word: %v", i, len(text), hasWord)
+		if hasWord {
+			foundFullWord = true
+			break
+		}
+	}
+
+	assert.True(t, foundFullWord, "The target word should not be split between chunks")
+}
+
 func TestSend_NotRunning(t *testing.T) {
 	caller := &stubCaller{
 		callFn: func(ctx context.Context, url string, data *ta.RequestData) (*ta.Response, error) {