[shopify] use alternate regex for products on collection pages

when the first one doesn't yield any results
This commit is contained in:
Mike Fährmann
2020-08-15 18:24:14 +02:00
parent 7619152988
commit d06ad148c7

View File

@@ -74,21 +74,33 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
def products(self): def products(self):
params = text.parse_query(self.params) params = text.parse_query(self.params)
params["page"] = text.parse_int(params.get("page"), 1) params["page"] = text.parse_int(params.get("page"), 1)
search_re = re.compile(r"/collections/[\w-]+/products/[\w-]+") fetch = True
last = None
while True: for pattern in (
page = self.request(self.item_url, params=params).text r"/collections/[\w-]+/products/[\w-]+",
urls = search_re.findall(page) r"href=[\"'](/products/[\w-]+)",
last = None ):
search_re = re.compile(pattern)
if not urls: while True:
return if fetch:
for path in urls: page = self.request(self.item_url, params=params).text
if last == path: urls = search_re.findall(page)
continue
last = path if len(urls) < 3:
yield self.root + path if last:
params["page"] += 1 return
fetch = False
break
fetch = True
for path in urls:
if last == path:
continue
last = path
yield self.root + path
params["page"] += 1
class ShopifyProductExtractor(ShopifyExtractor): class ShopifyProductExtractor(ShopifyExtractor):
@@ -121,7 +133,6 @@ EXTRACTORS = {
("https://www.fashionnova.com/collections/mini-dresses/?page=1"), ("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
("https://www.fashionnova.com/collections/mini-dresses#1"), ("https://www.fashionnova.com/collections/mini-dresses#1"),
), ),
}, },
} }