Merge branch 'release'

This commit is contained in:
Olivier Meunier
2025-12-19 12:16:02 +01:00
7 changed files with 228 additions and 20 deletions

View File

@@ -0,0 +1,23 @@
{
"title_selectors": null,
"body_selectors": null,
"date_selectors": null,
"author_selectors": null,
"strip_selectors": null,
"strip_id_or_class": null,
"strip_image_src": null,
"native_ad_selectors": null,
"tidy": false,
"prune": false,
"autodetect_on_failure": true,
"single_page_link_selectors": null,
"next_page_link_selectors": null,
"replace_strings": null,
"http_headers": null,
"tests": [
{
"url": "https://www.as-web.jp/f1/1275289",
"contains": []
}
]
}

View File

@@ -0,0 +1,49 @@
{
"title_selectors": [
"substring-before(//meta[@property='og:title']/@content , ' | Business Insider Japan')"
],
"body_selectors": [
"//div[contains(concat(' ',normalize-space(@class),' '),' article_pArticle_Body ')]"
],
"date_selectors": null,
"author_selectors": null,
"strip_selectors": null,
"strip_id_or_class": [
"bws-avoid",
"displayhandler_sm__stLxT",
"displayhandler_lg__cvCnu"
],
"strip_image_src": null,
"native_ad_selectors": null,
"tidy": false,
"prune": false,
"autodetect_on_failure": true,
"single_page_link_selectors": null,
"next_page_link_selectors": [
"//a[contains(text(),\"次ページ\")]"
],
"replace_strings": null,
"http_headers": null,
"tests": [
{
"url": "https://www.businessinsider.jp/article/2511-mobile-4-carriers-strategy-analysis/",
"contains": []
},
{
"url": "https://www.businessinsider.jp/article/2511-canon-semiconductor-business/",
"contains": []
},
{
"url": "https://www.businessinsider.jp/article/2511-google-company-turnaround-moment-reasons-ai-race-gemini/",
"contains": []
},
{
"url": "https://www.businessinsider.jp/article/2511-google-deepmind-cracks-century-old-physics-mystery-ai-fluid-dynamics/",
"contains": []
},
{
"url": "https://www.businessinsider.jp/article/2511-ai-bubble-dot-com-crash-data-centers-infrastructure-jeremy-siegel/",
"contains": []
}
]
}

View File

@@ -0,0 +1,27 @@
{
"title_selectors": [
"substring-before(//meta[@property='og:title']/@content , ' | 毎日新聞')"
],
"body_selectors": null,
"date_selectors": null,
"author_selectors": null,
"strip_selectors": null,
"strip_id_or_class": [
"ad-articledetail-2"
],
"strip_image_src": null,
"native_ad_selectors": null,
"tidy": false,
"prune": false,
"autodetect_on_failure": true,
"single_page_link_selectors": null,
"next_page_link_selectors": null,
"replace_strings": null,
"http_headers": null,
"tests": [
{
"url": "https://mainichi.jp/articles/20251122/k00/00m/010/037000c",
"contains": []
}
]
}

View File

@@ -0,0 +1,29 @@
{
"title_selectors": null,
"body_selectors": [
"//div[@class=\"content-area\"]"
],
"date_selectors": null,
"author_selectors": null,
"strip_selectors": null,
"strip_id_or_class": [
"cmp-lst016",
"cmp-misc010",
"cmp-hdg005"
],
"strip_image_src": null,
"native_ad_selectors": null,
"tidy": false,
"prune": false,
"autodetect_on_failure": true,
"single_page_link_selectors": null,
"next_page_link_selectors": null,
"replace_strings": null,
"http_headers": null,
"tests": [
{
"url": "https://www.tokyo-np.co.jp/article/454831",
"contains": []
}
]
}

View File

@@ -13,6 +13,7 @@ import (
"log/slog"
"mime"
"net/http"
"net/netip"
"net/url"
"path"
"regexp"
@@ -81,19 +82,53 @@ func NewDrop(src *url.URL) *Drop {
// SetURL sets the Drop's URL and Domain properties in their unicode versions.
func (d *Drop) SetURL(src *url.URL) {
// First, copy url and ensure it's a unicode version
var uri *url.URL
domain := ""
if src != nil {
uri = new(url.URL)
*uri = *src
if src == nil {
d.URL = nil
d.Domain = ""
d.Site = ""
return
}
uri := new(url.URL)
*uri = *src
// Remove port when it's not needed
// Note: only numeric ports are valid in [url.URL].
port := uri.Port()
if uri.Scheme == "http" && port == "80" || uri.Scheme == "https" && port == "443" {
port = ""
// we want to keep the brackets on ipv6 here
uri.Host = uri.Host[:strings.LastIndexByte(uri.Host, ':')]
}
hostname := uri.Hostname()
if ip, err := netip.ParseAddr(hostname); err == nil {
// Hostname is an IP address. Shorten the address and use it as the domain.
s := ip.String()
if ip.Is6() {
uri.Host = "[" + s + "]"
} else {
uri.Host = s
}
if port != "" {
uri.Host += ":" + port
}
d.Domain = s
} else {
// Always encode the URL to unicode
if host, err := idna.ToUnicode(uri.Host); err == nil {
uri.Host = host
}
domain, _ = publicsuffix.EffectiveTLDPlusOne(uri.Hostname())
d.Domain, _ = publicsuffix.EffectiveTLDPlusOne(uri.Hostname())
}
if d.Domain == "" {
d.Domain = hostname
}
d.URL = uri
d.Domain = domain
}
// Load loads the remote URL and retrieve data.

View File

@@ -97,6 +97,26 @@ func TestDrop(t *testing.T) {
"http://example.net/test/test",
"example.net",
},
{
"http://example.net:8888/test/test",
"http://example.net:8888/test/test",
"example.net",
},
{
"http://example.net:80/test/test",
"http://example.net/test/test",
"example.net",
},
{
"http://example.net:80/test/test",
"http://example.net/test/test",
"example.net",
},
{
"https://example.net:443/test/test",
"https://example.net/test/test",
"example.net",
},
{
"http://belgië.icom.museum/€test",
"http://belgië.icom.museum/€test",
@@ -117,6 +137,31 @@ func TestDrop(t *testing.T) {
"http://example.co.jp",
"example.co.jp",
},
{
"http://127.0.0.1:5000",
"http://127.0.0.1:5000",
"127.0.0.1",
},
{
"http://[fd66:2244:0::0:1]:5000",
"http://[fd66:2244::1]:5000",
"fd66:2244::1",
},
{
"http://[::1]/",
"http://[::1]/",
"::1",
},
{
"http://[::1]:80/",
"http://[::1]/",
"::1",
},
{
"https://[fd66:8282::a]:443/",
"https://[fd66:8282::a]/",
"fd66:8282::a",
},
}
for _, x := range tests {

24
web/package-lock.json generated
View File

@@ -3087,9 +3087,9 @@
"license": "MIT"
},
"node_modules/baseline-browser-mapping": {
"version": "2.9.5",
"resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.5.tgz",
"integrity": "sha512-D5vIoztZOq1XM54LUdttJVc96ggEsIfju2JBvht06pSzpckp3C7HReun67Bghzrtdsq9XdMGbSSB3v3GhMNmAA==",
"version": "2.9.10",
"resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.10.tgz",
"integrity": "sha512-2VIKvDx8Z1a9rTB2eCkdPE5nSe28XnA+qivGnWHoB40hMMt/h1hSz0960Zqsn6ZyxWXUie0EBdElKv8may20AA==",
"license": "Apache-2.0",
"bin": {
"baseline-browser-mapping": "dist/cli.js"
@@ -3285,9 +3285,9 @@
}
},
"node_modules/caniuse-lite": {
"version": "1.0.30001760",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001760.tgz",
"integrity": "sha512-7AAMPcueWELt1p3mi13HR/LHH0TJLT11cnwDJEs3xA4+CK/PLKeO9Kl1oru24htkyUKtkGCvAx4ohB0Ttry8Dw==",
"version": "1.0.30001761",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001761.tgz",
"integrity": "sha512-JF9ptu1vP2coz98+5051jZ4PwQgd2ni8A+gYSN7EA7dPKIMf0pDlSUxhdmVOaV3/fYK5uWBkgSXJaRLr4+3A6g==",
"funding": [
{
"type": "opencollective",
@@ -12154,9 +12154,9 @@
"integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA=="
},
"baseline-browser-mapping": {
"version": "2.9.5",
"resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.5.tgz",
"integrity": "sha512-D5vIoztZOq1XM54LUdttJVc96ggEsIfju2JBvht06pSzpckp3C7HReun67Bghzrtdsq9XdMGbSSB3v3GhMNmAA=="
"version": "2.9.10",
"resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.10.tgz",
"integrity": "sha512-2VIKvDx8Z1a9rTB2eCkdPE5nSe28XnA+qivGnWHoB40hMMt/h1hSz0960Zqsn6ZyxWXUie0EBdElKv8may20AA=="
},
"binary-extensions": {
"version": "2.3.0",
@@ -12288,9 +12288,9 @@
}
},
"caniuse-lite": {
"version": "1.0.30001760",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001760.tgz",
"integrity": "sha512-7AAMPcueWELt1p3mi13HR/LHH0TJLT11cnwDJEs3xA4+CK/PLKeO9Kl1oru24htkyUKtkGCvAx4ohB0Ttry8Dw=="
"version": "1.0.30001761",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001761.tgz",
"integrity": "sha512-JF9ptu1vP2coz98+5051jZ4PwQgd2ni8A+gYSN7EA7dPKIMf0pDlSUxhdmVOaV3/fYK5uWBkgSXJaRLr4+3A6g=="
},
"chalk": {
"version": "4.1.2",