# endoflife-date-release-data/src/cos.py
import urllib.request
from bs4 import BeautifulSoup
import re
import json
from datetime import datetime
# Matches a COS version identifier (e.g. "cos-109-17800-0-45") at the start of a heading.
REGEX = r"^(cos-\d+-\d+-\d+-\d+)"
def fetch_all_milestones():
    """Return the list of COS LTS milestone numbers scraped from the release-notes index."""
    url = "https://cloud.google.com/container-optimized-os/docs/release-notes/"
    soup = None
    # The Google Docs site is flaky (frequent SSL errors); retry up to 10 times.
    attempts = 0
    while attempts < 10:
        attempts += 1
        try:
            with urllib.request.urlopen(url, data=None, timeout=5) as response:
                soup = BeautifulSoup(response, features="html5lib")
            break
        except Exception as e:
            print("Retrying Request, got error: " + str(e))
    if soup is None:
        raise Exception("Failed to fetch COS milestones")
    # LTS milestones appear in table cells whose text looks like "COS 109 LTS";
    # the milestone number is the second whitespace-separated token.
    lts_cells = soup.find_all('td', text=re.compile(r'COS \d+ LTS'))
    return [cell.text.split(' ')[1] for cell in lts_cells]
def fetch_milestone(channel):
    """Fetch and parse the release-notes page for one COS milestone.

    :param channel: milestone number (e.g. "109"), interpolated into the page URL.
    :return: BeautifulSoup of the milestone's release-notes page.
    :raises Exception: if the page cannot be fetched after all retries.
    """
    url = "https://cloud.google.com/container-optimized-os/docs/release-notes/m{}".format(channel)
    # Google Docs website often returns SSL errors, retry the request in case of failures.
    for _ in range(5):
        try:
            with urllib.request.urlopen(url, data=None, timeout=5) as response:
                return BeautifulSoup(response, features="html5lib")
        except Exception as e:
            # Log the actual error (previously discarded) so transient failures can be
            # distinguished from persistent ones — matches fetch_all_milestones' logging.
            print("Retrying Request, got error: " + str(e))
    raise Exception("Failed to fetch COS milestone {}".format(channel))
def parse_soup_for_versions(soup):
    """Takes soup, and returns a dictionary of versions and their release dates.

    Each release heading (h2/h3) carries the version string in its
    ``data-text`` attribute. The release date normally sits in the first
    data cell of the table that follows the heading; older page layouts
    instead show it as "Date: <date>" in an <i> tag, and as a last resort
    the date is taken from the enclosing h2 heading's text.
    """
    date_format = '%b %d, %Y'

    def parse_date(d):
        # Normalize the raw text so it matches date_format:
        # strip an optional leading "Date: " prefix, and trim a spelled-out
        # month name (e.g. "October") down to its 3-letter abbreviation.
        d = re.sub(r'(?:Date\: )?(\w{3})(?:\w{1,4})? (\d{1,2}), (\d{4})', r'\1 \2, \3', d)
        return datetime.strptime(d, date_format).strftime('%Y-%m-%d')

    versions = {}
    for article in soup.find_all('article', class_='devsite-article'):
        for heading in article.find_all(['h2', 'h3']):
            version = heading.get('data-text')
            m = re.match(REGEX, version)
            if not m:
                continue
            version = m.group(1)
            try:
                # The first row is the header, so we pick the first td in the second row.
                d = heading.find_next('tr').find_next('tr').find_next('td').text
            except AttributeError:
                # No such table: in some older releases the date is mentioned
                # as "Date: <date>" in italic text instead.
                d = heading.find_next('i').text
            try:
                date = parse_date(d)
            except ValueError:
                # The text wasn't a date; fall back to the enclosing h2 heading.
                d = heading.find_previous('h2').get('data-text')
                date = parse_date(d)
            versions[version] = date
    return versions
def get_all_versions():
    """Collect release dates for every COS LTS milestone into a single dict."""
    collected = {}
    for milestone in fetch_all_milestones():
        page = fetch_milestone(milestone)
        # ::group:: / ::endgroup:: wrap each milestone's output in a
        # collapsible GitHub Actions log section.
        print("::group::COS - {}".format(milestone))
        collected.update(parse_soup_for_versions(page))
        print("::endgroup::")
    return collected
if __name__ == '__main__':
    # Scrape every milestone and persist the result as pretty-printed JSON.
    all_versions = get_all_versions()
    with open('releases/cos.json', "w") as output:
        output.write(json.dumps(all_versions, indent=2))