[xenforo] extract 'author_slug' metadata (#8785)

This commit is contained in:
Mike Fährmann
2026-01-13 21:04:12 +01:00
parent 45ee639bd4
commit 812482e53e
3 changed files with 37 additions and 24 deletions

View File

@@ -245,7 +245,6 @@ class XenforoExtractor(BaseExtractor):
author = schema["author"]
stats = schema["interactionStatistic"]
url_t = schema.get("url") or schema.get("@id") or ""
url_a = author.get("url") or ""
thread = {
"id" : url_t[url_t.rfind(".")+1:-1],
@@ -254,13 +253,19 @@ class XenforoExtractor(BaseExtractor):
"date" : self.parse_datetime_iso(schema["datePublished"]),
"tags" : (schema["keywords"].split(", ")
if "keywords" in schema else ()),
"section" : schema["articleSection"],
"author" : author.get("name") or "",
"author_id" : (url_a[url_a.rfind(".")+1:-1] if url_a else
(author.get("name") or "")[15:]),
"author_url": url_a,
"section": schema["articleSection"],
"author" : author.get("name") or "",
}
if url_a := author.get("url"):
thread["author_url"] = url_a
thread["author_slug"], _, thread["author_id"] = \
url_a[url_a.rfind("/", 0, -1)+1:-1].rpartition(".")
else:
thread["author_url"] = ""
thread["author_slug"] = text.slugify(thread["author"][:15])
thread["author_id"] = thread["author"][15:]
if isinstance(stats, list):
thread["views"] = stats[0]["userInteractionCount"]
thread["posts"] = stats[1]["userInteractionCount"]
@@ -286,7 +291,8 @@ class XenforoExtractor(BaseExtractor):
}
url_a = post["author_url"]
post["author_id"] = url_a[url_a.rfind(".")+1:-1]
post["author_slug"], _, post["author_id"] = \
url_a[url_a.rfind("/", 0, -1)+1:-1].rpartition(".")
con = post["content"]
if (pos := con.find('<div class="bbWrapper')) >= 0:
@@ -307,7 +313,6 @@ class XenforoExtractor(BaseExtractor):
schema = self._extract_jsonld(page)
main = schema["mainEntity"]
author = main["author"]
url_a = author.get("url") or ""
stats = main["interactionStatistic"]
media = text.nameext_from_name(main["name"], {
@@ -320,19 +325,25 @@ class XenforoExtractor(BaseExtractor):
w["name"].partition(" ")[0]) or 0,
"height": (h := main.get("height")) and text.parse_int(
h["name"].partition(" ")[0]) or 0,
"author" : author.get("name") or "",
"author_id" : (url_a[url_a.rfind(".")+1:-1] if url_a else
(author.get("name") or "")[15:]),
"author_url": url_a,
"author": author.get("name") or "",
})
if url_a := author.get("url"):
media["author_url"] = url_a
media["author_slug"], _, media["author_id"] = \
url_a[url_a.rfind("/", 0, -1)+1:-1].rpartition(".")
else:
media["author_url"] = ""
media["author_slug"] = text.slugify(media["author"][:15])
media["author_id"] = media["author"][15:]
if ext := main.get("encodingFormat"):
media["extension"] = ext
if isinstance(stats, list):
media["likes"] = stats[1]["userInteractionCount"]
media["views"] = stats[0]["userInteractionCount"]
media["comments"] = stats[0]["userInteractionCount"]
media["likes"] = stats[1]["userInteractionCount"]
media["comments"] = stats[2]["userInteractionCount"]
return main["contentUrl"], media

View File

@@ -26,6 +26,7 @@ __tests__ = (
"author" : "djokica",
"author_id" : "3471965",
"author_url" : "/forum/members/djokica.3471965/",
"author_slug": "djokica",
"content" : """<div class="bbWrapper"><a href="https://imagetwist.com/bvolb8129fnm/v1.jpg" target="_blank" class="link link--external" rel="nofollow noopener"><img src="https://s10.imagetwist.com/th/73048/bvolb8129fnm.jpg" data-url="https://s10.imagetwist.com/th/73048/bvolb8129fnm.jpg" class="bbImage " style="" alt="" title="" /></a> <a href="https://imagetwist.com/9pddder15iow/v2.jpg" target="_blank" class="link link--external" rel="nofollow noopener"><img src="https://s10.imagetwist.com/th/73048/9pddder15iow.jpg" data-url="https://s10.imagetwist.com/th/73048/9pddder15iow.jpg" class="bbImage " style="" alt="" title="" /></a> <a href="https://imagetwist.com/zzonmk0gqqdv/v3.jpg" target="_blank" class="link link--external" rel="nofollow noopener"><img src="https://s10.imagetwist.com/th/73048/zzonmk0gqqdv.jpg" data-url="https://s10.imagetwist.com/th/73048/zzonmk0gqqdv.jpg" class="bbImage " style="" alt="" title="" /></a></div>""",
"count" : 3,
"date" : "dt:2025-10-31 21:26:42",

View File

@@ -22,25 +22,24 @@ __tests__ = (
"author" : "Zebrabobinn",
"author_id" : "171827",
"author_url": "https://simpcity.cr/members/zebrabobinn.171827/",
"author_slug": "zebrabobinn",
"count" : 1,
"date" : "dt:2023-03-08 12:59:10",
"id" : "1753131",
"content" : """\
<div class="bbWrapper"><a href="https://jpg6.su/img/coWRwo" target="_blank" class="link link--external" rel="noopener"><img src="https://simp6.jpg6.su/images/FqsNcNCaIAITBEL.md.jpg" data-url="https://simp6.jpg6.su/images/FqsNcNCaIAITBEL.md.jpg" class="bbImage " loading="lazy"
\t\talt="FqsNcNCaIAITBEL.md.jpg" title="FqsNcNCaIAITBEL.md.jpg" style="" width="" height="" /></a></div>\
""",
"content" : str,
},
"thread": {
"author" : "eula",
"author_id" : "54987",
"author_url": "https://simpcity.cr/members/eula.54987/",
"author_slug": "eula",
"date" : "dt:2022-03-11 17:15:59",
"id" : "10731",
"posts" : range(320, 500),
"section" : "Asians",
"title" : "Ririkana | RR_loveit",
"url" : "https://simpcity.cr/threads/ririkana-rr_loveit.10731/",
"views" : range(790_000, 900_000),
"views" : range(900_000, 2_000_000),
"tags" : [
"asian",
"big ass",
@@ -69,10 +68,10 @@ __tests__ = (
"#auth" : True,
"#results" : (
"https://jpg6.su/img/NNFssUg",
"https://saint2.cr/embed/nPy1kG3w55V",
"https://saint2.cr/embed/c0KhPjU4-F3",
"https://saint2.cr/embed/sZWnVZ_mQsV",
"https://saint2.cr/embed/MEBiLx6DETQ",
"https://turbovid.cr/embed/nPy1kG3w55V",
"https://turbovid.cr/embed/c0KhPjU4-F3",
"https://turbovid.cr/embed/sZWnVZ_mQsV",
"https://turbovid.cr/embed/MEBiLx6DETQ",
),
},
@@ -104,6 +103,7 @@ __tests__ = (
"author" : "Hexorium",
"author_id" : "3715883",
"author_url": "https://simpcity.cr/members/hexorium.3715883/",
"author_slug": "hexorium",
"count" : 6,
"date" : "dt:2024-12-15 21:37:05",
"id" : "12065490",
@@ -112,6 +112,7 @@ __tests__ = (
"author" : "Deleted member 166159",
"author_id" : "166159",
"author_url": "",
"author_slug": "deleted-member",
"date" : "dt:2022-04-05 14:48:14",
"id" : "36572",
"section" : "Premium Asians",
@@ -244,7 +245,7 @@ __tests__ = (
"#category": ("xenforo", "simpcity", "thread"),
"#class" : xenforo.XenforoThreadExtractor,
"#auth" : True,
"#pattern" : r"https://(jpg6\.su/img/\w+|bunkr\.\w+/[fiv]/\w+|pixeldrain.com/l/\w+|alua.com/tatakai)|saint2.cr/embed",
"#pattern" : r"https://(jpg6\.su/img/\w+|bunkr\.\w+/[fiv]/\w+|pixeldrain.com/l/\w+|alua.com/tatakai)|turbovid.cr/embed",
"#count" : range(100, 300),
"count" : int,