diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py
index bc23ed3e..7d6c826b 100644
--- a/gallery_dl/extractor/4chan.py
+++ b/gallery_dl/extractor/4chan.py
@@ -18,51 +18,55 @@ info = {
"directory": ["{category}", "{board}-{thread-id}"],
"filename": "{timestamp}-{name}",
"pattern": [
- r"(?:https?://)?boards\.4chan\.org/([^/]+)/thread/(\d+)(?:/([^#]*))?",
+ r"(?:https?://)?boards\.4chan\.org/([^/]+)/thread/(\d+).*",
],
}
class FourChanExtractor(SequentialExtractor):
- url_fmt = "https://boards.4chan.org/{board}/res/{thread-id}.html"
+ url_fmt = "https://boards.4chan.org/{0}/res/{1}.html"
regex = (
- r'<a href="(//i.4cdn.org/[^/]+/(\d+)\.([^"]+))" '
- r'target="_blank">([^<]+)</a>'
+ r'<a (?:title="(?P<orig_name>[^"]+)" )?href="'
+ r'(?P<url>//i.4cdn.org/[^/]+/(?P<timestamp>\d+)\.(?P<extension>[^"]+))'
+ r'" target="_blank">(?P<name>[^<]+)</a> '
+ r'\((?P<size>[^,]+), (?P<width>\d+)x(?P<height>\d+)\)'
)
def __init__(self, match, config):
SequentialExtractor.__init__(self, config)
- self.metadata = self.get_job_metadata(match)
+ self.match = match
+ self.metadata = None
def items(self):
yield Message.Version, 1
- yield Message.Directory, self.metadata
- url = self.url_fmt.format(**self.metadata)
+ url = self.url_fmt.format(*self.match.groups())
text = self.request(url).text
+ self.metadata = self.get_job_metadata(text)
+
+ yield Message.Directory, self.metadata
for match in re.finditer(self.regex, text):
yield Message.Url, self.get_file_url(match), self.get_file_metadata(match)
- @staticmethod
- def get_job_metadata(match):
+ def get_job_metadata(self, text):
"""Collect metadata for extractor-job"""
- board, thread_id, title = match.groups()
+ board, thread_id = self.match.groups()
+ title, _ = self.extract(text, '"description" content="', ' - &quot;/')
return {
"category": info["category"],
"board": board,
"thread-id": thread_id,
- "title": title,
+ "title": unquote(title),
}
def get_file_metadata(self, match):
"""Collect metadata for a downloadable file"""
data = self.metadata
- data["timestamp"] = match.group(2)
- data["extension"] = match.group(3)
- data["name"] = unquote(match.group(4))
+ data.update(match.groupdict(default=""))
+ data["name"] = unquote(data["orig_name"] or data["name"])
return data
@staticmethod
def get_file_url(match):
"""Extract download-url from 'match'"""
- return "https:" + match.group(1)
+ return "https:" + match.group("url")