diff --git a/pkg/extract/contentscripts/assets/site-config/as-web.jp.json b/pkg/extract/contentscripts/assets/site-config/as-web.jp.json new file mode 100644 index 00000000..0dd48bd1 --- /dev/null +++ b/pkg/extract/contentscripts/assets/site-config/as-web.jp.json @@ -0,0 +1,23 @@ +{ + "title_selectors": null, + "body_selectors": null, + "date_selectors": null, + "author_selectors": null, + "strip_selectors": null, + "strip_id_or_class": null, + "strip_image_src": null, + "native_ad_selectors": null, + "tidy": false, + "prune": false, + "autodetect_on_failure": true, + "single_page_link_selectors": null, + "next_page_link_selectors": null, + "replace_strings": null, + "http_headers": null, + "tests": [ + { + "url": "https://www.as-web.jp/f1/1275289", + "contains": [] + } + ] +} diff --git a/pkg/extract/contentscripts/assets/site-config/businessinsider.jp.json b/pkg/extract/contentscripts/assets/site-config/businessinsider.jp.json new file mode 100644 index 00000000..004bfc6e --- /dev/null +++ b/pkg/extract/contentscripts/assets/site-config/businessinsider.jp.json @@ -0,0 +1,49 @@ +{ + "title_selectors": [ + "substring-before(//meta[@property='og:title']/@content , ' | Business Insider Japan')" + ], + "body_selectors": [ + "//div[contains(concat(' ',normalize-space(@class),' '),' article_pArticle_Body ')]" + ], + "date_selectors": null, + "author_selectors": null, + "strip_selectors": null, + "strip_id_or_class": [ + "bws-avoid", + "displayhandler_sm__stLxT", + "displayhandler_lg__cvCnu" + ], + "strip_image_src": null, + "native_ad_selectors": null, + "tidy": false, + "prune": false, + "autodetect_on_failure": true, + "single_page_link_selectors": null, + "next_page_link_selectors": [ + "//a[contains(text(),\"次ページ\")]" + ], + "replace_strings": null, + "http_headers": null, + "tests": [ + { + "url": "https://www.businessinsider.jp/article/2511-mobile-4-carriers-strategy-analysis/", + "contains": [] + }, + { + "url": "https://www.businessinsider.jp/article/2511-canon-semiconductor-business/", + "contains": [] + }, + { + "url": "https://www.businessinsider.jp/article/2511-google-company-turnaround-moment-reasons-ai-race-gemini/", + "contains": [] + }, + { + "url": "https://www.businessinsider.jp/article/2511-google-deepmind-cracks-century-old-physics-mystery-ai-fluid-dynamics/", + "contains": [] + }, + { + "url": "https://www.businessinsider.jp/article/2511-ai-bubble-dot-com-crash-data-centers-infrastructure-jeremy-siegel/", + "contains": [] + } + ] +} diff --git a/pkg/extract/contentscripts/assets/site-config/mainichi.jp.json b/pkg/extract/contentscripts/assets/site-config/mainichi.jp.json new file mode 100644 index 00000000..7cf40618 --- /dev/null +++ b/pkg/extract/contentscripts/assets/site-config/mainichi.jp.json @@ -0,0 +1,27 @@ +{ + "title_selectors": [ + "substring-before(//meta[@property='og:title']/@content , ' | 毎日新聞')" + ], + "body_selectors": null, + "date_selectors": null, + "author_selectors": null, + "strip_selectors": null, + "strip_id_or_class": [ + "ad-articledetail-2" + ], + "strip_image_src": null, + "native_ad_selectors": null, + "tidy": false, + "prune": false, + "autodetect_on_failure": true, + "single_page_link_selectors": null, + "next_page_link_selectors": null, + "replace_strings": null, + "http_headers": null, + "tests": [ + { + "url": "https://mainichi.jp/articles/20251122/k00/00m/010/037000c", + "contains": [] + } + ] +} diff --git a/pkg/extract/contentscripts/assets/site-config/tokyo-np.co.jp.json b/pkg/extract/contentscripts/assets/site-config/tokyo-np.co.jp.json new file mode 100644 index 00000000..c443e58b --- /dev/null +++ b/pkg/extract/contentscripts/assets/site-config/tokyo-np.co.jp.json @@ -0,0 +1,29 @@ +{ + "title_selectors": null, + "body_selectors": [ + "//div[@class=\"content-area\"]" + ], + "date_selectors": null, + "author_selectors": null, + "strip_selectors": null, + "strip_id_or_class": [ + "cmp-lst016", + "cmp-misc010", + "cmp-hdg005" + ], + "strip_image_src": null, + "native_ad_selectors": null, + "tidy": false, + "prune": false, + "autodetect_on_failure": true, + "single_page_link_selectors": null, + "next_page_link_selectors": null, + "replace_strings": null, + "http_headers": null, + "tests": [ + { + "url": "https://www.tokyo-np.co.jp/article/454831", + "contains": [] + } + ] +} diff --git a/pkg/extract/drop.go b/pkg/extract/drop.go index edf8de63..eaad3e17 100644 --- a/pkg/extract/drop.go +++ b/pkg/extract/drop.go @@ -13,6 +13,7 @@ import ( "log/slog" "mime" "net/http" + "net/netip" "net/url" "path" "regexp" @@ -81,19 +82,53 @@ func NewDrop(src *url.URL) *Drop { // SetURL sets the Drop's URL and Domain properties in their unicode versions. func (d *Drop) SetURL(src *url.URL) { - // First, copy url and ensure it's a unicode version - var uri *url.URL - domain := "" - if src != nil { - uri = new(url.URL) - *uri = *src + if src == nil { + d.URL = nil + d.Domain = "" + d.Site = "" + return + } + + uri := new(url.URL) + *uri = *src + + // Remove port when it's not needed + // Note: only numeric ports are valid in [url.URL]. + port := uri.Port() + if uri.Scheme == "http" && port == "80" || uri.Scheme == "https" && port == "443" { + port = "" + // we want to keep the brackets on ipv6 here + uri.Host = uri.Host[:strings.LastIndexByte(uri.Host, ':')] + } + + hostname := uri.Hostname() + + if ip, err := netip.ParseAddr(hostname); err == nil { + // Hostname is an IP address. Shorten the address and use it as the domain. + s := ip.String() + if ip.Is6() { + uri.Host = "[" + s + "]" + } else { + uri.Host = s + } + if port != "" { + uri.Host += ":" + port + } + + d.Domain = s + } else { + // Always encode the URL to unicode if host, err := idna.ToUnicode(uri.Host); err == nil { uri.Host = host } - domain, _ = publicsuffix.EffectiveTLDPlusOne(uri.Hostname()) + d.Domain, _ = publicsuffix.EffectiveTLDPlusOne(uri.Hostname()) } + + if d.Domain == "" { + d.Domain = hostname + } + d.URL = uri - d.Domain = domain } // Load loads the remote URL and retrieve data. diff --git a/pkg/extract/drop_test.go b/pkg/extract/drop_test.go index 5b950a0f..1507ba25 100644 --- a/pkg/extract/drop_test.go +++ b/pkg/extract/drop_test.go @@ -97,6 +97,26 @@ func TestDrop(t *testing.T) { "http://example.net/test/test", "example.net", }, + { + "http://example.net:8888/test/test", + "http://example.net:8888/test/test", + "example.net", + }, + { + "http://example.net:80/test/test", + "http://example.net/test/test", + "example.net", + }, + { + "http://example.net:80/test/test", + "http://example.net/test/test", + "example.net", + }, + { + "https://example.net:443/test/test", + "https://example.net/test/test", + "example.net", + }, { "http://belgië.icom.museum/€test", "http://belgië.icom.museum/€test", @@ -117,6 +137,31 @@ func TestDrop(t *testing.T) { "http://example.co.jp", "example.co.jp", }, + { + "http://127.0.0.1:5000", + "http://127.0.0.1:5000", + "127.0.0.1", + }, + { + "http://[fd66:2244:0::0:1]:5000", + "http://[fd66:2244::1]:5000", + "fd66:2244::1", + }, + { + "http://[::1]/", + "http://[::1]/", + "::1", + }, + { + "http://[::1]:80/", + "http://[::1]/", + "::1", + }, + { + "https://[fd66:8282::a]:443/", + "https://[fd66:8282::a]/", + "fd66:8282::a", + }, } for _, x := range tests { diff --git a/web/package-lock.json b/web/package-lock.json index 3b8538fc..c51f3fa9 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -3087,9 +3087,9 @@ "license": "MIT" }, "node_modules/baseline-browser-mapping": { - "version": "2.9.5", - "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.5.tgz", - "integrity": "sha512-D5vIoztZOq1XM54LUdttJVc96ggEsIfju2JBvht06pSzpckp3C7HReun67Bghzrtdsq9XdMGbSSB3v3GhMNmAA==", + "version": "2.9.10", + "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.10.tgz", + "integrity": "sha512-2VIKvDx8Z1a9rTB2eCkdPE5nSe28XnA+qivGnWHoB40hMMt/h1hSz0960Zqsn6ZyxWXUie0EBdElKv8may20AA==", "license": "Apache-2.0", "bin": { "baseline-browser-mapping": "dist/cli.js" @@ -3285,9 +3285,9 @@ } }, "node_modules/caniuse-lite": { - "version": "1.0.30001760", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001760.tgz", - "integrity": "sha512-7AAMPcueWELt1p3mi13HR/LHH0TJLT11cnwDJEs3xA4+CK/PLKeO9Kl1oru24htkyUKtkGCvAx4ohB0Ttry8Dw==", + "version": "1.0.30001761", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001761.tgz", + "integrity": "sha512-JF9ptu1vP2coz98+5051jZ4PwQgd2ni8A+gYSN7EA7dPKIMf0pDlSUxhdmVOaV3/fYK5uWBkgSXJaRLr4+3A6g==", "funding": [ { "type": "opencollective", @@ -12154,9 +12154,9 @@ "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==" }, "baseline-browser-mapping": { - "version": "2.9.5", - "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.5.tgz", - "integrity": "sha512-D5vIoztZOq1XM54LUdttJVc96ggEsIfju2JBvht06pSzpckp3C7HReun67Bghzrtdsq9XdMGbSSB3v3GhMNmAA==" + "version": "2.9.10", + "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.10.tgz", + "integrity": "sha512-2VIKvDx8Z1a9rTB2eCkdPE5nSe28XnA+qivGnWHoB40hMMt/h1hSz0960Zqsn6ZyxWXUie0EBdElKv8may20AA==" }, "binary-extensions": { "version": "2.3.0", @@ -12288,9 +12288,9 @@ } }, "caniuse-lite": { - "version": "1.0.30001760", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001760.tgz", - "integrity": "sha512-7AAMPcueWELt1p3mi13HR/LHH0TJLT11cnwDJEs3xA4+CK/PLKeO9Kl1oru24htkyUKtkGCvAx4ohB0Ttry8Dw==" + "version": "1.0.30001761", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001761.tgz", + "integrity": "sha512-JF9ptu1vP2coz98+5051jZ4PwQgd2ni8A+gYSN7EA7dPKIMf0pDlSUxhdmVOaV3/fYK5uWBkgSXJaRLr4+3A6g==" }, "chalk": { "version": "4.1.2",