mirror of
https://github.com/sipeed/picoclaw.git
synced 2026-06-12 18:08:54 +00:00
Sanitize WhatsApp messages and remove extra log messages.
This commit is contained in:
@@ -174,6 +174,9 @@ func (c *WhatsAppNativeChannel) handleIncoming(evt *events.Message) {
|
||||
if content == "" && evt.Message.ExtendedTextMessage != nil {
|
||||
content = evt.Message.ExtendedTextMessage.GetText()
|
||||
}
|
||||
content = utils.SanitizeMessageContent(content)
|
||||
|
||||
if content == "" { return } // ignore empty messages
|
||||
|
||||
var mediaPaths []string
|
||||
// Optional: resolve media to local paths if needed; for now we only forward text to the bus.
|
||||
|
||||
@@ -1,5 +1,31 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// SanitizeMessageContent returns input with every non-graphic rune removed,
// except for newline, carriage return, and tab, which are always kept.
//
// A rune counts as graphic when unicode.IsGraphic reports true for it: letters,
// marks, numbers, punctuation, symbols, and spaces. Everything else is dropped,
// which covers control characters (Cc) and format characters (Cf) such as
// zero-width characters, bidirectional overrides, and the byte order mark.
// Runes like these could confuse an LLM or cause display issues in the agent UI.
func SanitizeMessageContent(input string) string {
	keepOrDrop := func(r rune) rune {
		switch r {
		case '\n', '\r', '\t':
			// Layout whitespace is not graphic, but it is still meaningful text.
			return r
		}
		if !unicode.IsGraphic(r) {
			// A negative result tells strings.Map to omit the rune entirely.
			return -1
		}
		return r
	}
	return strings.Map(keepOrDrop, input)
}
|
||||
|
||||
// Truncate returns a truncated version of s with at most maxLen runes.
|
||||
// Handles multi-byte Unicode characters properly.
|
||||
// If the string is truncated, "..." is appended to indicate truncation.
|
||||
|
||||
@@ -104,3 +104,27 @@ func TestTruncate(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSanitizeMessageContent(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
want string
|
||||
}{
|
||||
{"empty", "", ""},
|
||||
{"plain text unchanged", "Hello world", "Hello world"},
|
||||
{"strip ZWSP", "Hello\u200bworld", "Helloworld"},
|
||||
{"strip RTL override", "Hi\u202eevil", "Hievil"},
|
||||
{"strip BOM", "\uFEFFcontent", "content"},
|
||||
{"strip multiple", "a\u200c\u202ab\u202cc", "abc"},
|
||||
{"unicode letters preserved", "café 日本語", "café 日本語"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := SanitizeMessageContent(tt.input)
|
||||
if got != tt.want {
|
||||
t.Errorf("SanitizeMessageContent(%q) = %q, want %q", tt.input, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user