package utils

import (
	"testing"

	"github.com/sipeed/picoclaw/pkg/logger"
)

// TestHtmlToMarkdown runs HtmlToMarkdown against a table of HTML snippets and
// compares the result with the exact Markdown expected for each one. Every
// case runs as its own subtest named after the case.
//
// A conversion error aborts the subtest instead of only being logged:
// otherwise cases whose expected output is the empty string could pass even
// though the conversion failed.
func TestHtmlToMarkdown(t *testing.T) {
	// Define our test cases.
	tests := []struct {
		name     string
		input    string
		expected string
	}{
		{
			name: "Removes scripts and styles",
			input: `
				<script>alert("hack");</script>
				<style>body { color: red; }</style>
				<p>Clean text</p>
			`,
			expected: "Clean text",
		},
		{
			name:     "Extracts links correctly",
			input:    `Visit my <a href="https://example.com">website</a> for info.`,
			expected: "Visit my [website](https://example.com) for info.",
		},
		{
			name: "Converts headers (H1, H2, H3)",
			input: `
				<h1>Main Title</h1>
				<h2>Subtitle</h2>
				<h3>Section</h3>
			`,
			expected: "# Main Title\n\n## Subtitle\n\n### Section",
		},
		{
			name:     "Handles bold and italics",
			input:    `Text <b>bold</b> and <strong>strong</strong>, then <i>italic</i> and <em>em</em>.`,
			expected: "Text **bold** and **strong**, then *italic* and *em*.",
		},
		{
			name: "Converts lists",
			input: `
				<ul>
					<li>First element</li>
					<li>Second element</li>
				</ul>
			`,
			expected: "- First element\n- Second element",
		},
		{
			name: "Handles paragraphs and line breaks (<br>)",
			input: `
				<p>First paragraph</p>
				<p>Second paragraph with<br>a line break.</p>
			`,
			expected: "First paragraph\n\nSecond paragraph with\na line break.",
		},
		{
			name:     "Decodes HTML entities",
			input:    `Math: 5 &gt; 3 &amp; 2 &lt; 4. A &quot;quote&quot;.`,
			expected: "Math: 5 > 3 & 2 < 4. A \"quote\".",
		},
		{
			name: "Cleans up residual HTML tags",
			input: `
				<div><span>Text inside div and span</span></div>
			`,
			expected: "Text inside div and span",
		},
		{
			name:     "Removes multiple spaces and excessive empty lines",
			input:    "This    text   has     too many    spaces.\n\n\n\n\nAnd too many newlines.",
			expected: "This text has too many spaces.\n\nAnd too many newlines.",
		},
		{
			name:  "Nested lists with indentation",
			input: "<ul><li>One<ul><li>Two</li></ul></li></ul>",
			// Expect the sub-element to have 4 spaces of indentation.
			expected: "- One\n    - Two",
		},
		{
			name:  "Image support",
			input: `<img src="image.jpg" alt="alternative text">`,
			// Correct Markdown syntax for images.
			expected: "![alternative text](image.jpg)",
		},
		{
			name:  "Image support without alt-text",
			input: `<img src="image.jpg">`,
			// If alt is missing, square brackets remain empty.
			expected: "![](image.jpg)",
		},
		{
			name: "XSS Bypass on Links (Obfuscated HTML entities)",
			// The Go HTML parser resolves entities, so this becomes "javascript:alert(1)".
			input: `<a href="&#106;&#97;&#118;&#97;script:alert(1)">Click here</a>`,
			// Our isSafeHref (if updated with net/url) should neutralize it to "#".
			expected: "[Click here](#)",
		},
		{
			name:  "Empty link or used as anchor",
			input: `<a></a>`,
			// With no text or href, it shouldn't print anything (not even empty brackets).
			expected: "",
		},
		{
			name:  "Link without href but with text (Textual anchor)",
			input: `<a name="top">Back to top</a>`,
			// Should extract only plain text, without generating a broken
			// Markdown link like [Back to top](#) or [Back to top]().
			expected: "Back to top",
		},
		{
			name:  "Badly spaced bold and italics (Edge Case)",
			input: `<b> Text </b>`,
			// In Markdown `** Text **` is often not formatted correctly.
			// The ideal is `**Text**`.
			expected: "**Text**",
		},
		{
			name: "Complex Test - Real Article",
			input: `
				<div>
					<h1>Article Title</h1>
					<p>This is an <strong>introductory text</strong> with a <a href="http://link.com">link</a>.</p>
					<h2>Subtitle</h2>
					<ul>
						<li>Point one</li>
						<li>Point two</li>
					</ul>
				</div>
			`,
			// Note: The indentation of the real HTML test will generate spaces
			// that regex will clean up.
			expected: "# Article Title\n\nThis is an **introductory text** with a [link](http://link.com).\n\n## Subtitle\n\n- Point one\n- Point two",
		},
		{
			name: "Ordered list (OL)",
			input: `
				<ol>
					<li>First</li>
					<li>Second</li>
					<li>Third</li>
				</ol>
			`,
			expected: "1. First\n2. Second\n3. Third",
		},
		{
			name:     "Ordered list nested in unordered list",
			input:    `<ul><li>Fruits<ol><li>Apples</li><li>Pears</li></ol></li><li>Vegetables</li></ul>`,
			expected: "- Fruits\n    1. Apples\n    2. Pears\n- Vegetables",
		},
		{
			name:     "Code block (pre/code)",
			input:    "<pre><code>func main() {\n    fmt.Println(\"hello\")\n}</code></pre>",
			expected: "```\nfunc main() {\n    fmt.Println(\"hello\")\n}\n```",
		},
		{
			name: "Inline code",
			input: `
				<p>Use the command <code>go test ./...</code> to run the tests.</p>
			`,
			expected: "Use the command `go test ./...` to run the tests.",
		},
		{
			name: "Simple blockquote",
			input: `
				<blockquote>An important quote.</blockquote>
			`,
			expected: "> An important quote.",
		},
		{
			name: "Multiline blockquote",
			input: `
				<blockquote>
					<p>First line of the quote.</p>
					<p>Second line of the quote.</p>
				</blockquote>
			`,
			expected: "> First line of the quote.\n>\n> Second line of the quote.",
		},
		{
			name:     "Strikethrough text (del/s)",
			input:    `This text is <del>deleted</del> and this is <s>crossed out</s>.`,
			expected: "This text is ~~deleted~~ and this is ~~crossed out~~.",
		},
		{
			name: "Horizontal separator (HR)",
			input: `
				<p>Above the line</p>
				<hr>
				<p>Below the line</p>
			`,
			expected: "Above the line\n\n---\n\nBelow the line",
		},
		{
			name:     "Bold nested in link",
			input:    `<a href="https://example.com"><strong>Linked bold text</strong></a>`,
			expected: "[**Linked bold text**](https://example.com)",
		},
		{
			name: "data-src Image (lazy loading)",
			input: `
				<img data-src="lazy.jpg" alt="Lazy image">
			`,
			expected: "![Lazy image](lazy.jpg)",
		},
		{
			name: "Image with javascript: src blocked",
			input: `
				<img src="javascript:alert(1)" alt="XSS">
			`,
			// src is not safe, so the image is not emitted.
			expected: "",
		},
		{
			name:     "Link with data: href blocked",
			input:    `<a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgxKTwvc2NyaXB0Pg==">Click</a>`,
			expected: "[Click](#)",
		},
		{
			name: "Deeply nested divs",
			input: `
				<div><div><div><div><p>Deeply nested text</p></div></div></div></div>
			`,
			expected: "Deeply nested text",
		},
		{
			name: "Non-consecutive headers (H1, H3, H5)",
			input: `
				<h1>Title</h1>
				<h3>Subsection</h3>
				<h5>Sub-subsection</h5>
			`,
			expected: "# Title\n\n### Subsection\n\n##### Sub-subsection",
		},
		{
			name: "Paragraph with mixed multiple emphasis",
			input: `
				<p><strong>Important:</strong> read the <strong><em>critical instructions</em></strong> <em>carefully</em>.</p>
			`,
			expected: "**Important:** read the ***critical instructions*** *carefully*.",
		},
		{
			name: "Article with nav and aside sections (noise to filter)",
			input: `
				<nav><a href="/">Home</a> <a href="/about">About</a></nav>
				<article>
					<h2>Article title</h2>
					<p>This is the body of the article.</p>
				</article>
				<aside>Related content</aside>
			`,
			expected: "## Article title\n\nThis is the body of the article.",
		},
		{
			name:     "Text with mixed special HTML entities",
			input:    `Copyright &copy; 2024 &mdash; All rights reserved &reg;`,
			expected: "Copyright © 2024 — All rights reserved ®",
		},
		{
			name:     "Mailto link",
			input:    `Write to us at <a href="mailto:info@example.com">info@example.com</a>`,
			expected: "Write to us at [info@example.com](mailto:info@example.com)",
		},
		{
			name: "Image inside a link (clickable figure)",
			input: `
				<a href="https://example.com"><img src="photo.jpg" alt="Photo"></a>
			`,
			// The image-link without text must not generate broken markup.
			expected: "[![Photo](photo.jpg)](https://example.com)",
		},
		{
			name:     "Empty content or only whitespace",
			input:    "\n   \n\t\n",
			expected: "",
		},
	}

	// Iterate over all test cases.
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := HtmlToMarkdown(tt.input)
			if err != nil {
				// Keep the structured log entry, then stop this subtest:
				// comparing the output after a failed conversion could let
				// cases that expect "" pass by accident.
				logger.ErrorCF("tool", "Failed to parse html to markdown: %s", map[string]any{"error": err.Error()})
				t.Fatalf("HtmlToMarkdown(%q) returned an unexpected error: %v", tt.input, err)
			}
			if got != tt.expected {
				t.Errorf("\nTest case failed: %s\nInput: %q\nGot: %q\nExpected: %q", tt.name, tt.input, got, tt.expected)
			}
		})
	}
}