[bellazon] improve video URL regex (#8392)

match <source> elements that have other attributes before 'src="..."'
This commit is contained in:
Mike Fährmann
2025-10-12 08:35:32 +02:00
parent 65feed5b64
commit f89f55b215
2 changed files with 11 additions and 1 deletion

View File

@@ -27,7 +27,7 @@ class BellazonExtractor(Extractor):
native = (f"{self.root}/", f"{self.root[6:]}/")
extract_urls = text.re(
r'(?s)<('
r'(?:video .*?<source src|a [^>]*?href)="([^"]+).*?</a>'
r'(?:video .*?<source [^>]*?src|a [^>]*?href)="([^"]+).*?</a>'
r'|img [^>]*?src="([^"]+)"[^>]*>'
r')'
).findall

View File

@@ -204,6 +204,16 @@ __tests__ = (
"#results" : "https://www.bellazon.com/main/uploads/monthly_2018_04/30602369_1891291154222843_1650952189830496256_n.jpg.33e6ab78dd0e8723f790ad4f58f3761a.jpg",
},
{
"#url" : "https://www.bellazon.com/main/topic/70367-elyzaveta-kovalenko/page/5/#comment-5464973",
"#comment" : "(#8392)",
"#class" : bellazon.BellazonPostExtractor,
"#results" : (
"https://www.bellazon.com/main/uploads/monthly_2022_05/917305269_LizaKovalenko-Instagram2021_04_19.mp4.467d190a54e1bcabc50767a69706501d.mp4",
"https://www.bellazon.com/main/uploads/monthly_2022_05/2027180206_LizaKovalenko-Instagram2021_04_23.mp4.2eae87d7e9d6f1a993611fa1f73e8e7b.mp4",
),
},
{
"#url" : "https://www.bellazon.com/main/topic/57872-millie-brady/",
"#class" : bellazon.BellazonThreadExtractor,