// SanitizeMessageContent returns input with every rune removed that
// unicode.IsGraphic rejects, except for newline, carriage return, and tab,
// which are always kept.
//
// This strips Unicode control characters (Cc), format characters (Cf) such as
// bidirectional overrides, zero-width characters and the byte-order mark, and
// private-use code points: characters that could confuse an LLM or cause
// display issues in the agent UI. Letters, marks, numbers, punctuation,
// symbols, and spaces are preserved.
//
// NOTE(review): U+200D (zero width joiner) and U+200C (zero width non-joiner)
// are format characters and are therefore removed as well, which splits emoji
// ZWJ sequences into their component emoji. Confirm this is acceptable for
// chat content.
func SanitizeMessageContent(input string) string {
	return strings.Map(func(r rune) rune {
		switch r {
		case '\n', '\r', '\t':
			// Layout whitespace is kept even though unicode.IsGraphic
			// classifies these as control characters.
			return r
		}
		if unicode.IsGraphic(r) {
			return r
		}
		// A negative result tells strings.Map to drop the rune.
		return -1
	}, input)
}
diff --git a/pkg/utils/string_test.go b/pkg/utils/string_test.go index a44ead228..fffa0cff3 100644 --- a/pkg/utils/string_test.go +++ b/pkg/utils/string_test.go @@ -104,3 +104,27 @@ func TestTruncate(t *testing.T) { }) } } + +func TestSanitizeMessageContent(t *testing.T) { + tests := []struct { + name string + input string + want string + }{ + {"empty", "", ""}, + {"plain text unchanged", "Hello world", "Hello world"}, + {"strip ZWSP", "Hello\u200bworld", "Helloworld"}, + {"strip RTL override", "Hi\u202eevil", "Hievil"}, + {"strip BOM", "\uFEFFcontent", "content"}, + {"strip multiple", "a\u200c\u202ab\u202cc", "abc"}, + {"unicode letters preserved", "café 日本語", "café 日本語"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := SanitizeMessageContent(tt.input) + if got != tt.want { + t.Errorf("SanitizeMessageContent(%q) = %q, want %q", tt.input, got, tt.want) + } + }) + } +}