mirror of
https://codeberg.org/readeck/readeck.git
synced 2025-12-22 13:17:10 +00:00
pkg/bleach refactor
Instead of multiple maps for tags to keep, rename or remove, we use a single map with flags. Tag removal and renaming now take place in a single loop.
This commit is contained in:
@@ -10,22 +10,24 @@ import (
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"github.com/go-shiori/dom"
|
||||
"golang.org/x/net/html"
|
||||
"golang.org/x/net/html/atom"
|
||||
|
||||
"github.com/go-shiori/dom"
|
||||
)
|
||||
|
||||
// Policy holds the cleaning rules and provides methods to
|
||||
// perform the DOM cleaning.
|
||||
type Policy struct {
|
||||
blockAttrs []*regexp.Regexp
|
||||
elementMap map[string]string
|
||||
elementMap map[string]tagRule
|
||||
}
|
||||
|
||||
// New creates a new cleaning policy.
|
||||
func New(blockAttrs []*regexp.Regexp, elementMap map[string]string) Policy {
|
||||
func New(blockAttrs []*regexp.Regexp, elements map[string]tagRule) Policy {
|
||||
return Policy{
|
||||
blockAttrs: blockAttrs,
|
||||
elementMap: elementMap,
|
||||
elementMap: elements,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,24 +57,35 @@ func (p Policy) Clean(top *html.Node) {
|
||||
p.cleanAttributes(top)
|
||||
}
|
||||
|
||||
// cleanTags discards unwanted tags from all nodes.
|
||||
// cleanTags cleans up all the [html.Node] children.
|
||||
// It applies, in one pass, a removal or renaming of elements.
|
||||
func (p *Policy) cleanTags(top *html.Node) {
|
||||
// Remove unwanted tags
|
||||
dom.RemoveNodes(dom.QuerySelectorAll(top, "*"), func(node *html.Node) bool {
|
||||
if e, ok := p.elementMap[dom.TagName(node)]; ok && e == "-" {
|
||||
if node.Type != html.ElementNode {
|
||||
return false
|
||||
}
|
||||
name := node.Data
|
||||
|
||||
rule, exists := p.elementMap[name]
|
||||
if rule&tagRemove > 0 {
|
||||
// Remove tag, done
|
||||
return true
|
||||
}
|
||||
return false
|
||||
})
|
||||
|
||||
// Rename tags
|
||||
dom.ForEachNode(dom.QuerySelectorAll(top, "*"), func(node *html.Node, _ int) {
|
||||
if e, ok := p.elementMap[dom.TagName(node)]; ok && e != "" && e != "-" {
|
||||
node.Data = e
|
||||
} else if !ok {
|
||||
// unknown tags become div
|
||||
node.Data = "div"
|
||||
// Rename tag when it's unknown or has the [tagRename] flag.
|
||||
if !exists || rule&tagRename > 0 {
|
||||
if _, ok := blockTags[name]; ok || !exists {
|
||||
// a block or unknown tag becomes a div
|
||||
node.Data = "div"
|
||||
node.DataAtom = atom.Div
|
||||
} else {
|
||||
// otherwise, a span
|
||||
node.Data = "span"
|
||||
node.DataAtom = atom.Span
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
})
|
||||
}
|
||||
|
||||
@@ -97,13 +110,18 @@ func (p *Policy) cleanAttributes(top *html.Node) {
|
||||
// empty means: no child nodes, no attributes and no text content.
|
||||
func (p Policy) RemoveEmptyNodes(top *html.Node) {
|
||||
dom.RemoveNodes(dom.QuerySelectorAll(top, "*"), func(node *html.Node) bool {
|
||||
if node.Type != html.ElementNode {
|
||||
return false
|
||||
}
|
||||
name := node.Data
|
||||
|
||||
// Keep tags that are explicitly allowed to be empty, e.g. <hr>
|
||||
if _, ok := keepEmptyTags[dom.TagName(node)]; ok {
|
||||
if p.elementMap[name]&tagKeepEmpty > 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
// Keep <a name> tags
|
||||
if dom.TagName(node) == "a" && dom.GetAttribute(node, "name") != "" {
|
||||
if name == "a" && dom.GetAttribute(node, "name") != "" {
|
||||
return false
|
||||
}
|
||||
|
||||
|
||||
@@ -48,6 +48,13 @@ func TestClean(t *testing.T) {
|
||||
`<div><custom><script>alert("test")</script></custom></div>`,
|
||||
`<body><div><div></div></div></body>`,
|
||||
},
|
||||
{
|
||||
func(n *html.Node) {
|
||||
bleach.DefaultPolicy.Clean(n)
|
||||
},
|
||||
`<p><font>test</font></div>`,
|
||||
`<body><p><span>test</span></p></body>`,
|
||||
},
|
||||
{
|
||||
func(n *html.Node) {
|
||||
bleach.DefaultPolicy.Clean(n)
|
||||
|
||||
@@ -6,31 +6,14 @@ package bleach
|
||||
|
||||
import "strings"
|
||||
|
||||
var keepEmptyTags = map[string]struct{}{
|
||||
"area": {},
|
||||
"base": {},
|
||||
"br": {},
|
||||
"col": {},
|
||||
"command": {},
|
||||
"embed": {},
|
||||
"hr": {},
|
||||
"img": {},
|
||||
"input": {},
|
||||
"keygen": {},
|
||||
"link": {},
|
||||
"menuitem": {},
|
||||
"meta": {},
|
||||
"mprescripts": {},
|
||||
"mrow": {},
|
||||
"mspace": {},
|
||||
"mtd": {},
|
||||
"param": {},
|
||||
"source": {},
|
||||
"th": {},
|
||||
"td": {},
|
||||
"track": {},
|
||||
"wbr": {},
|
||||
}
|
||||
// tagRule is a set of bit flags describing how an element is handled
// during cleaning. Flags are combined with | in elementMap and tested
// with & by the policy methods.
type tagRule uint8
|
||||
|
||||
// Flags that can be combined into a [tagRule].
// NOTE(review): the constants are declared without an explicit type, so they
// are untyped integer constants that convert to tagRule on use — confirm this
// is intended rather than `tagKeep tagRule = 1 << iota`.
const (
|
||||
// tagKeep marks a known element. It is not tested directly in the visible
// code; an entry carrying only this flag is neither removed nor renamed.
tagKeep = 1 << iota
|
||||
// tagKeepEmpty exempts the element from RemoveEmptyNodes, e.g. <hr>.
tagKeepEmpty
|
||||
// tagRemove makes cleanTags remove the element.
tagRemove
|
||||
// tagRename makes cleanTags rename the element: to "div" when its name is
// in blockTags, to "span" otherwise.
tagRename
|
||||
)
|
||||
|
||||
var blockTags = map[string]struct{}{
|
||||
"address": {},
|
||||
@@ -70,177 +53,178 @@ var blockTags = map[string]struct{}{
|
||||
}
|
||||
|
||||
// elementMap is the map of all known elements
|
||||
// and what they can be transformed to.
|
||||
// A value of "-" means the elements must be removed.
|
||||
// and the [tagRule]s that apply to them.
|
||||
// As per https://developer.mozilla.org/en-US/docs/Web/HTML/Element
|
||||
var elementMap = map[string]string{
|
||||
"a": "",
|
||||
"abbr": "",
|
||||
"acronym": "",
|
||||
"address": "",
|
||||
"applet": "-", // remove
|
||||
"area": "",
|
||||
"article": "",
|
||||
"aside": "",
|
||||
"audio": "-", // remove
|
||||
"b": "",
|
||||
"base": "-", // remove
|
||||
"bdi": "",
|
||||
"bdo": "",
|
||||
"big": "",
|
||||
"blockquote": "",
|
||||
"body": "",
|
||||
"br": "",
|
||||
"button": "-", // remove
|
||||
"canvas": "-", // remove
|
||||
"caption": "",
|
||||
"center": "",
|
||||
"cite": "",
|
||||
"code": "",
|
||||
"col": "",
|
||||
"colgroup": "",
|
||||
"data": "",
|
||||
"datalist": "",
|
||||
"dd": "",
|
||||
"del": "",
|
||||
"details": "",
|
||||
"dfn": "",
|
||||
"dialog": "-", // remove
|
||||
"dir": "",
|
||||
"div": "",
|
||||
"dl": "",
|
||||
"dt": "",
|
||||
"em": "",
|
||||
"embed": "-", // remove
|
||||
"fieldset": "div",
|
||||
"figcaption": "",
|
||||
"figure": "",
|
||||
"font": "span",
|
||||
"footer": "",
|
||||
"form": "div",
|
||||
"frame": "-", // remove
|
||||
"frameset": "-", // remove
|
||||
"h1": "",
|
||||
"h2": "",
|
||||
"h3": "",
|
||||
"h4": "",
|
||||
"h5": "",
|
||||
"h6": "",
|
||||
"head": "-", // remove
|
||||
"header": "",
|
||||
"hgroup": "",
|
||||
"hr": "",
|
||||
"html": "",
|
||||
"i": "",
|
||||
"iframe": "-", // remove
|
||||
"image": "",
|
||||
"img": "",
|
||||
"input": "-", // remove
|
||||
"ins": "",
|
||||
"kbd": "",
|
||||
"label": "",
|
||||
"legend": "",
|
||||
"li": "",
|
||||
"link": "-", // remove
|
||||
"main": "",
|
||||
"map": "",
|
||||
"mark": "",
|
||||
"marquee": "",
|
||||
"menu": "",
|
||||
"menuitem": "",
|
||||
"meta": "-", // remove
|
||||
"meter": "",
|
||||
"nav": "",
|
||||
"nobr": "",
|
||||
"noembed": "div",
|
||||
"noframes": "div",
|
||||
"noscript": "div",
|
||||
"object": "-", // remove
|
||||
"ol": "",
|
||||
"optgroup": "",
|
||||
"option": "",
|
||||
"output": "",
|
||||
"p": "",
|
||||
"param": "-", // remove
|
||||
"picture": "",
|
||||
"plaintext": "",
|
||||
"portal": "-", // remove
|
||||
"pre": "",
|
||||
"progress": "",
|
||||
"q": "",
|
||||
"rb": "",
|
||||
"rp": "",
|
||||
"rt": "",
|
||||
"rtc": "",
|
||||
"ruby": "",
|
||||
"s": "",
|
||||
"samp": "",
|
||||
"script": "-", // remove
|
||||
"search": "",
|
||||
"section": "",
|
||||
"select": "-", // remove
|
||||
"slot": "-", // remove
|
||||
"small": "",
|
||||
"source": "-", // remove
|
||||
"span": "",
|
||||
"strike": "",
|
||||
"strong": "",
|
||||
"style": "-", // remove
|
||||
"sub": "",
|
||||
"summary": "",
|
||||
"sup": "",
|
||||
"table": "",
|
||||
"tbody": "",
|
||||
"td": "",
|
||||
"template": "-", // remove
|
||||
"textarea": "-", // remove
|
||||
"tfoot": "",
|
||||
"th": "",
|
||||
"thead": "",
|
||||
"time": "",
|
||||
"title": "-", // remove
|
||||
"tr": "",
|
||||
"track": "-", // remove
|
||||
"tt": "",
|
||||
"u": "",
|
||||
"ul": "",
|
||||
"var": "",
|
||||
"video": "-", // remove
|
||||
"wbr": "",
|
||||
"xmp": "",
|
||||
var elementMap = map[string]tagRule{
|
||||
"a": tagKeep,
|
||||
"abbr": tagKeep,
|
||||
"acronym": tagKeep,
|
||||
"address": tagKeep,
|
||||
"applet": tagRemove,
|
||||
"area": tagKeep | tagKeepEmpty,
|
||||
"article": tagKeep,
|
||||
"aside": tagKeep,
|
||||
"audio": tagRemove,
|
||||
"b": tagKeep,
|
||||
"base": tagRemove | tagKeepEmpty,
|
||||
"bdi": tagKeep,
|
||||
"bdo": tagKeep,
|
||||
"big": tagKeep,
|
||||
"blockquote": tagKeep,
|
||||
"body": tagKeep,
|
||||
"br": tagKeep | tagKeepEmpty,
|
||||
"button": tagRemove,
|
||||
"canvas": tagRemove,
|
||||
"caption": tagKeep,
|
||||
"center": tagKeep,
|
||||
"cite": tagKeep,
|
||||
"code": tagKeep,
|
||||
"col": tagKeep | tagKeepEmpty,
|
||||
"command": tagKeepEmpty,
|
||||
"colgroup": tagKeep,
|
||||
"data": tagKeep,
|
||||
"datalist": tagKeep,
|
||||
"dd": tagKeep,
|
||||
"del": tagKeep,
|
||||
"details": tagKeep,
|
||||
"dfn": tagKeep,
|
||||
"dialog": tagRemove,
|
||||
"dir": tagKeep,
|
||||
"div": tagKeep,
|
||||
"dl": tagKeep,
|
||||
"dt": tagKeep,
|
||||
"em": tagKeep,
|
||||
"embed": tagRemove | tagKeepEmpty,
|
||||
"fieldset": tagRename,
|
||||
"figcaption": tagKeep,
|
||||
"figure": tagKeep,
|
||||
"font": tagRename,
|
||||
"footer": tagKeep,
|
||||
"form": tagRename,
|
||||
"frame": tagRemove,
|
||||
"frameset": tagRemove,
|
||||
"h1": tagKeep,
|
||||
"h2": tagKeep,
|
||||
"h3": tagKeep,
|
||||
"h4": tagKeep,
|
||||
"h5": tagKeep,
|
||||
"h6": tagKeep,
|
||||
"head": tagRemove,
|
||||
"header": tagKeep,
|
||||
"hgroup": tagKeep,
|
||||
"hr": tagKeep | tagKeepEmpty,
|
||||
"html": tagKeep,
|
||||
"i": tagKeep,
|
||||
"iframe": tagRemove,
|
||||
"image": tagKeep,
|
||||
"img": tagKeep | tagKeepEmpty,
|
||||
"input": tagRemove | tagKeepEmpty,
|
||||
"ins": tagKeep,
|
||||
"kbd": tagKeep,
|
||||
"keygen": tagKeepEmpty,
|
||||
"label": tagKeep,
|
||||
"legend": tagKeep,
|
||||
"li": tagKeep,
|
||||
"link": tagRemove | tagKeepEmpty,
|
||||
"main": tagKeep,
|
||||
"map": tagKeep,
|
||||
"mark": tagKeep,
|
||||
"marquee": tagKeep,
|
||||
"menu": tagKeep,
|
||||
"menuitem": tagKeep | tagKeepEmpty,
|
||||
"meta": tagRemove | tagKeepEmpty,
|
||||
"meter": tagKeep,
|
||||
"nav": tagKeep,
|
||||
"nobr": tagKeep,
|
||||
"noembed": tagRename,
|
||||
"noframes": tagRename,
|
||||
"noscript": tagRename,
|
||||
"object": tagRemove,
|
||||
"ol": tagKeep,
|
||||
"optgroup": tagKeep,
|
||||
"option": tagKeep,
|
||||
"output": tagKeep,
|
||||
"p": tagKeep,
|
||||
"param": tagRemove | tagKeepEmpty,
|
||||
"picture": tagKeep,
|
||||
"plaintext": tagKeep,
|
||||
"portal": tagRemove,
|
||||
"pre": tagKeep,
|
||||
"progress": tagKeep,
|
||||
"q": tagKeep,
|
||||
"rb": tagKeep,
|
||||
"rp": tagKeep,
|
||||
"rt": tagKeep,
|
||||
"rtc": tagKeep,
|
||||
"ruby": tagKeep,
|
||||
"s": tagKeep,
|
||||
"samp": tagKeep,
|
||||
"script": tagRemove,
|
||||
"search": tagKeep,
|
||||
"section": tagKeep,
|
||||
"select": tagRemove,
|
||||
"slot": tagRemove,
|
||||
"small": tagKeep,
|
||||
"source": tagRemove | tagKeepEmpty,
|
||||
"span": tagKeep,
|
||||
"strike": tagKeep,
|
||||
"strong": tagKeep,
|
||||
"style": tagRemove,
|
||||
"sub": tagKeep,
|
||||
"summary": tagKeep,
|
||||
"sup": tagKeep,
|
||||
"table": tagKeep,
|
||||
"tbody": tagKeep,
|
||||
"td": tagKeep | tagKeepEmpty,
|
||||
"template": tagRemove,
|
||||
"textarea": tagRemove,
|
||||
"tfoot": tagKeep,
|
||||
"th": tagKeep | tagKeepEmpty,
|
||||
"thead": tagKeep,
|
||||
"time": tagKeep,
|
||||
"title": tagRemove,
|
||||
"tr": tagKeep,
|
||||
"track": tagRemove | tagKeepEmpty,
|
||||
"tt": tagKeep,
|
||||
"u": tagKeep,
|
||||
"ul": tagKeep,
|
||||
"var": tagKeep,
|
||||
"video": tagRemove,
|
||||
"wbr": tagKeep | tagKeepEmpty,
|
||||
"xmp": tagKeep,
|
||||
|
||||
// MathML Core elements
|
||||
// curl -fsSL https://www.w3.org/Math/RelaxNG/mathml4/mathml4-core.rnc | grep -oE 'element [a-z-]+' | cut -d' ' -f2 | grep -v '^none$' | sort
|
||||
"annotation": "",
|
||||
"annotation-xml": "",
|
||||
"maction": "",
|
||||
"math": "",
|
||||
"merror": "",
|
||||
"mfrac": "",
|
||||
"mi": "",
|
||||
"mmultiscripts": "",
|
||||
"mn": "",
|
||||
"mo": "",
|
||||
"mover": "",
|
||||
"mpadded": "",
|
||||
"mphantom": "",
|
||||
"mprescripts": "",
|
||||
"mroot": "",
|
||||
"mrow": "",
|
||||
"ms": "",
|
||||
"mspace": "",
|
||||
"msqrt": "",
|
||||
"mstyle": "",
|
||||
"msub": "",
|
||||
"msubsup": "",
|
||||
"msup": "",
|
||||
"mtable": "",
|
||||
"mtd": "",
|
||||
"mtext": "",
|
||||
"mtr": "",
|
||||
"munder": "",
|
||||
"munderover": "",
|
||||
"semantics": "",
|
||||
"annotation": tagKeep,
|
||||
"annotation-xml": tagKeep,
|
||||
"maction": tagKeep,
|
||||
"math": tagKeep,
|
||||
"merror": tagKeep,
|
||||
"mfrac": tagKeep,
|
||||
"mi": tagKeep,
|
||||
"mmultiscripts": tagKeep,
|
||||
"mn": tagKeep,
|
||||
"mo": tagKeep,
|
||||
"mover": tagKeep,
|
||||
"mpadded": tagKeep,
|
||||
"mphantom": tagKeep,
|
||||
"mprescripts": tagKeep | tagKeepEmpty,
|
||||
"mroot": tagKeep,
|
||||
"mrow": tagKeep | tagKeepEmpty,
|
||||
"ms": tagKeep,
|
||||
"mspace": tagKeep | tagKeepEmpty,
|
||||
"msqrt": tagKeep,
|
||||
"mstyle": tagKeep,
|
||||
"msub": tagKeep,
|
||||
"msubsup": tagKeep,
|
||||
"msup": tagKeep,
|
||||
"mtable": tagKeep,
|
||||
"mtd": tagKeep | tagKeepEmpty,
|
||||
"mtext": tagKeep,
|
||||
"mtr": tagKeep,
|
||||
"munder": tagKeep,
|
||||
"munderover": tagKeep,
|
||||
"semantics": tagKeep,
|
||||
}
|
||||
|
||||
var excludedChars = [][2]int{
|
||||
|
||||
Reference in New Issue
Block a user