pkg/bleach refactor

Instead of multiple maps for tags to keep, rename or remove, we use
a single map with flags.

Tag removal and renaming now take place in a single loop.
This commit is contained in:
Olivier Meunier
2025-12-17 09:13:47 +01:00
parent e8817333c1
commit fc16870ea5
3 changed files with 219 additions and 210 deletions

View File

@@ -10,22 +10,24 @@ import (
"strings"
"unicode"
"github.com/go-shiori/dom"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"github.com/go-shiori/dom"
)
// Policy holds the cleaning rules and provides methods to
// perform the DOM cleaning.
type Policy struct {
blockAttrs []*regexp.Regexp
elementMap map[string]string
elementMap map[string]tagRule
}
// New creates a new cleaning policy.
func New(blockAttrs []*regexp.Regexp, elementMap map[string]string) Policy {
func New(blockAttrs []*regexp.Regexp, elements map[string]tagRule) Policy {
return Policy{
blockAttrs: blockAttrs,
elementMap: elementMap,
elementMap: elements,
}
}
@@ -55,24 +57,35 @@ func (p Policy) Clean(top *html.Node) {
p.cleanAttributes(top)
}
// cleanTags discards unwanted tags from all nodes.
// cleanTags cleans up all the [html.Node] children.
// It applies, in one pass, a removal or renaming of elements.
func (p *Policy) cleanTags(top *html.Node) {
// Remove unwanted tags
dom.RemoveNodes(dom.QuerySelectorAll(top, "*"), func(node *html.Node) bool {
if e, ok := p.elementMap[dom.TagName(node)]; ok && e == "-" {
if node.Type != html.ElementNode {
return false
}
name := node.Data
rule, exists := p.elementMap[name]
if rule&tagRemove > 0 {
// Remove tag, done
return true
}
return false
})
// Rename tags
dom.ForEachNode(dom.QuerySelectorAll(top, "*"), func(node *html.Node, _ int) {
if e, ok := p.elementMap[dom.TagName(node)]; ok && e != "" && e != "-" {
node.Data = e
} else if !ok {
// unknown tags become div
node.Data = "div"
// Rename tag when it's unknown or has the [tagRename] flag.
if !exists || rule&tagRename > 0 {
if _, ok := blockTags[name]; ok || !exists {
// a block or unknown tag becomes a div
node.Data = "div"
node.DataAtom = atom.Div
} else {
// otherwise, a span
node.Data = "span"
node.DataAtom = atom.Span
}
}
return false
})
}
@@ -97,13 +110,18 @@ func (p *Policy) cleanAttributes(top *html.Node) {
// empty means: no child nodes, no attributes and no text content.
func (p Policy) RemoveEmptyNodes(top *html.Node) {
dom.RemoveNodes(dom.QuerySelectorAll(top, "*"), func(node *html.Node) bool {
if node.Type != html.ElementNode {
return false
}
name := node.Data
// Keep tags that are explicitly allowed to be empty, e.g. <hr>
if _, ok := keepEmptyTags[dom.TagName(node)]; ok {
if p.elementMap[name]&tagKeepEmpty > 0 {
return false
}
// Keep <a name> tags
if dom.TagName(node) == "a" && dom.GetAttribute(node, "name") != "" {
if name == "a" && dom.GetAttribute(node, "name") != "" {
return false
}

View File

@@ -48,6 +48,13 @@ func TestClean(t *testing.T) {
`<div><custom><script>alert("test")</script></custom></div>`,
`<body><div><div></div></div></body>`,
},
{
func(n *html.Node) {
bleach.DefaultPolicy.Clean(n)
},
`<p><font>test</font></div>`,
`<body><p><span>test</span></p></body>`,
},
{
func(n *html.Node) {
bleach.DefaultPolicy.Clean(n)

View File

@@ -6,31 +6,14 @@ package bleach
import "strings"
var keepEmptyTags = map[string]struct{}{
"area": {},
"base": {},
"br": {},
"col": {},
"command": {},
"embed": {},
"hr": {},
"img": {},
"input": {},
"keygen": {},
"link": {},
"menuitem": {},
"meta": {},
"mprescripts": {},
"mrow": {},
"mspace": {},
"mtd": {},
"param": {},
"source": {},
"th": {},
"td": {},
"track": {},
"wbr": {},
}
// tagRule is a set of bit flags describing how the cleaner treats an element.
// Flags are combined with | and tested with &.
type tagRule uint8

// Flags that make up a [tagRule]. They are declared with the tagRule type so
// that a flag cannot be mixed with an unrelated integer by accident.
const (
	// tagKeep marks a known element that is left as is. The cleaning code
	// only tests the other flags; this one states the default explicitly.
	tagKeep tagRule = 1 << iota
	// tagKeepEmpty marks an element that must not be discarded when it is
	// empty, e.g. <hr>.
	tagKeepEmpty
	// tagRemove marks an element that is removed from the document.
	tagRemove
	// tagRename marks an element that is replaced by a generic <div>
	// (block elements) or <span> (anything else).
	tagRename
)
var blockTags = map[string]struct{}{
"address": {},
@@ -70,177 +53,178 @@ var blockTags = map[string]struct{}{
}
// elementMap is the map of all known elements
// and what they can be transformed to.
// A value of "-" means the elements must be removed.
// and the [tagRule]s that apply to them.
// As per https://developer.mozilla.org/en-US/docs/Web/HTML/Element
var elementMap = map[string]string{
"a": "",
"abbr": "",
"acronym": "",
"address": "",
"applet": "-", // remove
"area": "",
"article": "",
"aside": "",
"audio": "-", // remove
"b": "",
"base": "-", // remove
"bdi": "",
"bdo": "",
"big": "",
"blockquote": "",
"body": "",
"br": "",
"button": "-", // remove
"canvas": "-", // remove
"caption": "",
"center": "",
"cite": "",
"code": "",
"col": "",
"colgroup": "",
"data": "",
"datalist": "",
"dd": "",
"del": "",
"details": "",
"dfn": "",
"dialog": "-", // remove
"dir": "",
"div": "",
"dl": "",
"dt": "",
"em": "",
"embed": "-", // remove
"fieldset": "div",
"figcaption": "",
"figure": "",
"font": "span",
"footer": "",
"form": "div",
"frame": "-", // remove
"frameset": "-", // remove
"h1": "",
"h2": "",
"h3": "",
"h4": "",
"h5": "",
"h6": "",
"head": "-", // remove
"header": "",
"hgroup": "",
"hr": "",
"html": "",
"i": "",
"iframe": "-", // remove
"image": "",
"img": "",
"input": "-", // remove
"ins": "",
"kbd": "",
"label": "",
"legend": "",
"li": "",
"link": "-", // remove
"main": "",
"map": "",
"mark": "",
"marquee": "",
"menu": "",
"menuitem": "",
"meta": "-", // remove
"meter": "",
"nav": "",
"nobr": "",
"noembed": "div",
"noframes": "div",
"noscript": "div",
"object": "-", // remove
"ol": "",
"optgroup": "",
"option": "",
"output": "",
"p": "",
"param": "-", // remove
"picture": "",
"plaintext": "",
"portal": "-", // remove
"pre": "",
"progress": "",
"q": "",
"rb": "",
"rp": "",
"rt": "",
"rtc": "",
"ruby": "",
"s": "",
"samp": "",
"script": "-", // remove
"search": "",
"section": "",
"select": "-", // remove
"slot": "-", // remove
"small": "",
"source": "-", // remove
"span": "",
"strike": "",
"strong": "",
"style": "-", // remove
"sub": "",
"summary": "",
"sup": "",
"table": "",
"tbody": "",
"td": "",
"template": "-", // remove
"textarea": "-", // remove
"tfoot": "",
"th": "",
"thead": "",
"time": "",
"title": "-", // remove
"tr": "",
"track": "-", // remove
"tt": "",
"u": "",
"ul": "",
"var": "",
"video": "-", // remove
"wbr": "",
"xmp": "",
var elementMap = map[string]tagRule{
"a": tagKeep,
"abbr": tagKeep,
"acronym": tagKeep,
"address": tagKeep,
"applet": tagRemove,
"area": tagKeep | tagKeepEmpty,
"article": tagKeep,
"aside": tagKeep,
"audio": tagRemove,
"b": tagKeep,
"base": tagRemove | tagKeepEmpty,
"bdi": tagKeep,
"bdo": tagKeep,
"big": tagKeep,
"blockquote": tagKeep,
"body": tagKeep,
"br": tagKeep | tagKeepEmpty,
"button": tagRemove,
"canvas": tagRemove,
"caption": tagKeep,
"center": tagKeep,
"cite": tagKeep,
"code": tagKeep,
"col": tagKeep | tagKeepEmpty,
"command": tagKeepEmpty,
"colgroup": tagKeep,
"data": tagKeep,
"datalist": tagKeep,
"dd": tagKeep,
"del": tagKeep,
"details": tagKeep,
"dfn": tagKeep,
"dialog": tagRemove,
"dir": tagKeep,
"div": tagKeep,
"dl": tagKeep,
"dt": tagKeep,
"em": tagKeep,
"embed": tagRemove | tagKeepEmpty,
"fieldset": tagRename,
"figcaption": tagKeep,
"figure": tagKeep,
"font": tagRename,
"footer": tagKeep,
"form": tagRename,
"frame": tagRemove,
"frameset": tagRemove,
"h1": tagKeep,
"h2": tagKeep,
"h3": tagKeep,
"h4": tagKeep,
"h5": tagKeep,
"h6": tagKeep,
"head": tagRemove,
"header": tagKeep,
"hgroup": tagKeep,
"hr": tagKeep | tagKeepEmpty,
"html": tagKeep,
"i": tagKeep,
"iframe": tagRemove,
"image": tagKeep,
"img": tagKeep | tagKeepEmpty,
"input": tagRemove | tagKeepEmpty,
"ins": tagKeep,
"kbd": tagKeep,
"keygen": tagKeepEmpty,
"label": tagKeep,
"legend": tagKeep,
"li": tagKeep,
"link": tagRemove | tagKeepEmpty,
"main": tagKeep,
"map": tagKeep,
"mark": tagKeep,
"marquee": tagKeep,
"menu": tagKeep,
"menuitem": tagKeep | tagKeepEmpty,
"meta": tagRemove | tagKeepEmpty,
"meter": tagKeep,
"nav": tagKeep,
"nobr": tagKeep,
"noembed": tagRename,
"noframes": tagRename,
"noscript": tagRename,
"object": tagRemove,
"ol": tagKeep,
"optgroup": tagKeep,
"option": tagKeep,
"output": tagKeep,
"p": tagKeep,
"param": tagRemove | tagKeepEmpty,
"picture": tagKeep,
"plaintext": tagKeep,
"portal": tagRemove,
"pre": tagKeep,
"progress": tagKeep,
"q": tagKeep,
"rb": tagKeep,
"rp": tagKeep,
"rt": tagKeep,
"rtc": tagKeep,
"ruby": tagKeep,
"s": tagKeep,
"samp": tagKeep,
"script": tagRemove,
"search": tagKeep,
"section": tagKeep,
"select": tagRemove,
"slot": tagRemove,
"small": tagKeep,
"source": tagRemove | tagKeepEmpty,
"span": tagKeep,
"strike": tagKeep,
"strong": tagKeep,
"style": tagRemove,
"sub": tagKeep,
"summary": tagKeep,
"sup": tagKeep,
"table": tagKeep,
"tbody": tagKeep,
"td": tagKeep | tagKeepEmpty,
"template": tagRemove,
"textarea": tagRemove,
"tfoot": tagKeep,
"th": tagKeep | tagKeepEmpty,
"thead": tagKeep,
"time": tagKeep,
"title": tagRemove,
"tr": tagKeep,
"track": tagRemove | tagKeepEmpty,
"tt": tagKeep,
"u": tagKeep,
"ul": tagKeep,
"var": tagKeep,
"video": tagRemove,
"wbr": tagKeep | tagKeepEmpty,
"xmp": tagKeep,
// MathML Core elements
// curl -fsSL https://www.w3.org/Math/RelaxNG/mathml4/mathml4-core.rnc | grep -oE 'element [a-z-]+' | cut -d' ' -f2 | grep -v '^none$' | sort
"annotation": "",
"annotation-xml": "",
"maction": "",
"math": "",
"merror": "",
"mfrac": "",
"mi": "",
"mmultiscripts": "",
"mn": "",
"mo": "",
"mover": "",
"mpadded": "",
"mphantom": "",
"mprescripts": "",
"mroot": "",
"mrow": "",
"ms": "",
"mspace": "",
"msqrt": "",
"mstyle": "",
"msub": "",
"msubsup": "",
"msup": "",
"mtable": "",
"mtd": "",
"mtext": "",
"mtr": "",
"munder": "",
"munderover": "",
"semantics": "",
"annotation": tagKeep,
"annotation-xml": tagKeep,
"maction": tagKeep,
"math": tagKeep,
"merror": tagKeep,
"mfrac": tagKeep,
"mi": tagKeep,
"mmultiscripts": tagKeep,
"mn": tagKeep,
"mo": tagKeep,
"mover": tagKeep,
"mpadded": tagKeep,
"mphantom": tagKeep,
"mprescripts": tagKeep | tagKeepEmpty,
"mroot": tagKeep,
"mrow": tagKeep | tagKeepEmpty,
"ms": tagKeep,
"mspace": tagKeep | tagKeepEmpty,
"msqrt": tagKeep,
"mstyle": tagKeep,
"msub": tagKeep,
"msubsup": tagKeep,
"msup": tagKeep,
"mtable": tagKeep,
"mtd": tagKeep | tagKeepEmpty,
"mtext": tagKeep,
"mtr": tagKeep,
"munder": tagKeep,
"munderover": tagKeep,
"semantics": tagKeep,
}
var excludedChars = [][2]int{