Fixed arstechnica content-script

- the body selector is now div.post-content
- removed the now useless replaceStrings
- give priority to {Article,NewsArticle,WebPage}.description in JSON-LD,
  before *.description

See https://community.readeck.org/d/113-some-webpages-content-not-being-extracted
This commit is contained in:
Olivier Meunier
2025-11-17 21:19:22 +01:00
parent f3d7c29d0a
commit 746466a3fe
2 changed files with 5 additions and 5 deletions

View File

@@ -18,11 +18,11 @@ exports.isActive = function () {
*/
exports.setConfig = function (config) {
switch (true) {
case $.domain == "arstechnica.co.uk":
$.overrideConfig(config, "https://arstechnica.com/")
case $.domain == "arstechnica.com":
config.replaceStrings = [
['" data-src="', '"><img src="'],
['" data-responsive="', '" /><span data-responsive="'],
['<figure style="', '</span><figure data-style="'],
config.bodySelectors = [
"//div[contains(concat(' ',normalize-space(@class),' '),' post-content ')]",
]
break

View File

@@ -58,7 +58,7 @@ func ExtractMeta(m *extract.ProcessMessage, next extract.Processor) extract.Proc
if headline, ok := md.getProp("Article.name", "*.headline", "{Movie,VideObject}.name").(string); ok {
d.Title = headline
}
if description, ok := md.getProp("*.description").(string); ok {
if description, ok := md.getProp("{Article,NewsArticle,WebPage}.description", "*.description").(string); ok {
d.Description = description
}
if image, ok := md.getProp("*.{image,image.url,thumbnailUrl}").(string); ok {