Simplify date parsing (#195)

Create common functions parse_date, parse_month_year_date and parse_datetime.

These functions try multiple formats in turn, and come with default format lists that cover most of the date formats encountered so far.

Notable change: year-month dates are now set to the end of month (impacted couchbase-server and ibm-aix).
This commit is contained in:
Marc Wrobel
2023-11-26 21:01:35 +01:00
committed by GitHub
parent 1e65a048b0
commit 0d17306872
24 changed files with 133 additions and 168 deletions

View File

@@ -1,22 +1,22 @@
{
"7.2.3": "2023-11-15",
"7.1.6": "2023-11-15",
"7.2.2": "2023-09-15",
"7.2.1": "2023-09-15",
"7.1.5": "2023-08-15",
"7.2.3": "2023-11-30",
"7.1.6": "2023-11-30",
"7.2.2": "2023-09-30",
"7.2.1": "2023-09-30",
"7.1.5": "2023-08-31",
"7.2.0": "2023-06-01",
"7.1.4": "2023-03-15",
"7.0.5": "2022-12-15",
"7.1.3": "2022-11-15",
"7.1.2": "2022-10-15",
"7.1.1": "2022-07-15",
"7.0.4": "2022-06-15",
"7.1.0": "2022-05-15",
"6.6.5": "2022-01-15",
"7.0.3": "2021-12-15",
"7.0.2": "2021-10-15",
"7.0.1": "2021-09-15",
"7.0.0": "2021-07-15",
"7.1.4": "2023-03-31",
"7.0.5": "2022-12-31",
"7.1.3": "2022-11-30",
"7.1.2": "2022-10-31",
"7.1.1": "2022-07-31",
"7.0.4": "2022-06-30",
"7.1.0": "2022-05-31",
"6.6.5": "2022-01-31",
"7.0.3": "2021-12-31",
"7.0.2": "2021-10-31",
"7.0.1": "2021-09-30",
"7.0.0": "2021-07-31",
"6.6.0": "2020-08-12",
"6.0.1": "2019-02-15",
"6.0.0": "2018-10-31"

View File

@@ -1,12 +1,12 @@
{
"7.3.2": "2023-11-01",
"7.3.1": "2022-12-01",
"7.3.0": "2021-12-01",
"7.2.5": "2020-11-01",
"7.2.4": "2019-11-01",
"7.2.3": "2018-09-01",
"7.2.2": "2017-10-01",
"7.1.5": "2017-10-01",
"7.2.1": "2016-11-01",
"7.2.0": "2015-12-01"
"7.3.2": "2023-11-30",
"7.3.1": "2022-12-31",
"7.3.0": "2021-12-31",
"7.2.5": "2020-11-30",
"7.2.4": "2019-11-30",
"7.2.3": "2018-09-30",
"7.2.2": "2017-10-31",
"7.1.5": "2017-10-31",
"7.2.1": "2016-11-30",
"7.2.0": "2015-12-31"
}

View File

@@ -1,7 +1,7 @@
import re
from xml.dom.minidom import parseString
from common import dates
from common import endoflife
from datetime import datetime
"""Fetch versions with their dates from the RSS feed of
https://docs.aws.amazon.com/neptune/latest/userguide/engine-releases.html.
@@ -22,7 +22,7 @@ for item in rss.getElementsByTagName("item"):
matches = re.match(REGEX, title)
if matches:
version = matches['version']
date = datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S %Z").strftime("%Y-%m-%d")
date = dates.parse_datetime(pubDate).strftime("%Y-%m-%d")
versions[version] = date
print(f"{version}: {date}")

View File

@@ -1,6 +1,6 @@
import re
from datetime import datetime
from pathlib import Path
from common import dates
from common import endoflife
from common.git import Git
@@ -18,13 +18,8 @@ REPO_URL = "https://github.com/apache/httpd.git"
def parse(date: str) -> str:
date = date.replace("Feburary", "February")
for format in ["%B %d, %Y", "%B %d, %Y", "%b %d, %Y", "%b. %d, %Y"]:
try:
return datetime.strptime(date, format).strftime("%Y-%m-%d")
except ValueError:
pass
raise ValueError(f"Unknown date format for '{date}'")
date = date.replace(". ", " ")
return dates.parse_date(date).strftime("%Y-%m-%d")
def fetch_versions_from_file(release_notes_file: Path, versions: dict):

View File

@@ -1,6 +1,6 @@
import datetime
import re
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
URLS = [
@@ -46,10 +46,9 @@ CONFIG = {
}
def parse_date(s):
d, m, y = s.strip().split(" ")
m = m[0:3].lower() # reduce months to 3 letters, such as "Sept" to "Sep", so it can be parsed
return datetime.datetime.strptime(f"{d} {m} {y}", "%d %b %Y")
def parse_date(date_str):
date_str = date_str.replace("Sept", "Sep")
return dates.parse_date(date_str)
print("::group::apple")

View File

@@ -1,22 +1,14 @@
from common import dates
from common import endoflife
from datetime import datetime
from requests_html import HTMLSession
URL = "https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life"
PRODUCT = "artifactory"
def parse_date(text):
text = text.replace("Sept", "Sep").replace("_", "-")
date_formats = ['%d-%b-%Y', '%d-%B-%Y']
for date_format in date_formats:
try:
return datetime.strptime(text, date_format).strftime("%Y-%m-%d")
except ValueError:
pass
raise ValueError("Cannot parse '" + text + "' with formats " + str(date_formats))
def parse_date(date_str):
date_str = date_str.replace("Sept", "Sep").replace("_", "-")
return dates.parse_date(date_str).strftime("%Y-%m-%d")
def fetch_releases():

View File

@@ -1,8 +1,8 @@
import re
import sys
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
from datetime import datetime, timezone
from liquid import Template
"""Fetch versions with their dates from a cgit repository, such as
@@ -22,15 +22,6 @@ DEFAULT_VERSION_REGEX = (
)
# Parse date with format 2023-05-01 08:32:34 +0900 and convert to UTC
def parse_date(d):
return (
datetime.strptime(d, "%Y-%m-%d %H:%M:%S %z")
.astimezone(timezone.utc)
.strftime("%Y-%m-%d")
)
def make_bs_request(url):
response = endoflife.fetch_url(url + '/refs/tags')
return BeautifulSoup(response, features="html5lib")
@@ -54,7 +45,7 @@ def fetch_releases(url, regex, template):
if matches:
match_data = matches.groupdict()
version_string = l_template.render(**match_data)
date = parse_date(datetime_text)
date = dates.parse_datetime(datetime_text).strftime("%Y-%m-%d")
print(f"{version_string} : {date}")
releases[version_string] = date

51
src/common/dates.py Normal file
View File

@@ -0,0 +1,51 @@
from datetime import datetime, timezone
import calendar
def parse_date(text, formats=frozenset((
    "%B %d, %Y",  # January 1, 2020
    "%b %d, %Y",  # Jan 1, 2020
    "%B %d %Y",  # January 1 2020
    "%b %d %Y",  # Jan 1 2020
    "%d %B %Y",  # 1 January 2020
    "%d %b %Y",  # 1 Jan 2020
    "%d-%b-%Y",  # 1-Jan-2020
    "%d-%B-%Y",  # 1-January-2020
))) -> datetime:
    """Parse a text representing a date, trying each of the given formats.

    The work is delegated to parse_datetime with UTC conversion disabled, so
    the returned datetime is exactly what the matching format produced.

    :param text: the text to parse
    :param formats: the strptime formats to try
    :return: the parsed datetime
    :raises ValueError: (from parse_datetime) if no format matches the text
    """
    parsed = parse_datetime(text, formats, to_utc=False)
    return parsed
def parse_month_year_date(text, formats=frozenset((
    "%B %Y",  # January 2020
    "%b %Y",  # Jan 2020
))) -> datetime:
    """Parse a text representing a month and a year, trying each of the given
    formats, and move the result to the last day of that month.

    :param text: the text to parse
    :param formats: the strptime formats to try
    :return: the parsed datetime, with its day set to the last day of its month
    :raises ValueError: (from parse_datetime) if no format matches the text
    """
    parsed = parse_datetime(text, formats, to_utc=False)
    # monthrange returns (weekday of the first day, number of days in the month).
    days_in_month = calendar.monthrange(parsed.year, parsed.month)[1]
    return parsed.replace(day=days_in_month)
def parse_datetime(text, formats=frozenset([
    "%Y-%m-%d %H:%M:%S",  # 2023-05-01 08:32:34
    "%Y-%m-%dT%H:%M:%S",  # 2023-05-01T08:32:34
    "%Y-%m-%d %H:%M:%S %z",  # 2023-05-01 08:32:34 +0900
    "%a, %d %b %Y %H:%M:%S %Z",  # Wed, 01 Jan 2020 00:00:00 GMT
    "%Y-%m-%dT%H:%M:%S%z",  # 2023-05-01T08:32:34+0900
]), to_utc=True) -> datetime:
    """Parse a given text representing a datetime using a list of formats,
    optionally converting it to UTC.

    The text is stripped, then each format is tried until one matches.

    :param text: the text to parse
    :param formats: the strptime formats to try
    :param to_utc: when true, the result is returned as an aware UTC datetime;
        a value parsed without any UTC offset is interpreted as already being
        in UTC. When false, the result is returned exactly as parsed.
    :return: the parsed datetime
    :raises ValueError: if the text does not match any of the formats
    """
    text = text.strip()
    for fmt in formats:
        try:
            date = datetime.strptime(text, fmt)
        except ValueError:
            continue  # this format does not match, try the next one

        if not to_utc:
            return date

        if date.tzinfo is None:
            # astimezone() would interpret a naive datetime as system local time,
            # making the result depend on the timezone of the machine running the
            # script. Note that %Z (e.g. "GMT") does not produce an aware datetime.
            return date.replace(tzinfo=timezone.utc)

        return date.astimezone(timezone.utc)

    raise ValueError(f"'{text}' could not be parsed as a date with any of the formats: {str(formats)}")

View File

@@ -1,6 +1,6 @@
from requests_html import HTMLSession
from common import dates
from common import endoflife
from datetime import datetime
"""Fetch Confluence versions with their dates from the Atlassian Website.
@@ -11,13 +11,7 @@ because the page needs JavaScript to render correctly.
PRODUCT = 'confluence'
URL = 'https://www.atlassian.com/software/confluence/download-archives'
def parse_date(text):
return datetime.strptime(text, "%d-%b-%Y").strftime("%Y-%m-%d")
print(f"::group::{PRODUCT}")
session = HTMLSession()
r = session.get(URL)
r.html.render(sleep=1, scrolldown=3)
@@ -25,7 +19,8 @@ r.html.render(sleep=1, scrolldown=3)
versions = {}
for version_block in r.html.find('.versions-list'):
version = version_block.find('a.product-versions', first=True).attrs['data-version']
date = parse_date(version_block.find('.release-date', first=True).text)
date_text = version_block.find('.release-date', first=True).text
date = dates.parse_date(date_text).strftime('%Y-%m-%d')
print(f"{version}: {date}")
versions[version] = date

View File

@@ -1,10 +1,9 @@
import re
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
from datetime import datetime
URL = "https://cloud.google.com/container-optimized-os/docs/release-notes/"
DATE_FORMAT = '%b %d, %Y'
REGEX = r"^(cos-\d+-\d+-\d+-\d+)"
@@ -20,11 +19,10 @@ def fetch_milestones(milestones):
return endoflife.fetch_urls(urls)
def parse_date(d):
# If the date begins with a >3 letter month name, trim it to just 3 letters
# Strip out the Date: section from the start
d = re.sub(r'(?:Date\: )?(\w{3})(?:\w{1,})? (\d{1,2}), (\d{4})', r'\1 \2, \3', d)
return datetime.strptime(d, DATE_FORMAT).strftime('%Y-%m-%d')
def parse_date(date_str):
date_str = date_str.strip().replace('Date: ', '')
date_str = re.sub(r'Sep[a-zA-Z]+', 'Sep', date_str)
return dates.parse_date(date_str).strftime('%Y-%m-%d')
def find_versions(text):

View File

@@ -1,7 +1,7 @@
import re
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
from datetime import datetime
"""Fetch versions with their dates from docs.couchbase.com.
@@ -46,7 +46,7 @@ for response in endoflife.fetch_urls(minor_version_urls):
m = re.match(REGEX, versionAndDate)
if m:
version = f"{m['version']}.0" if len(m['version'].split('.')) == 2 else m['version']
date = datetime.strptime(m['date'], "%B %Y").strftime("%Y-%m-15")
date = dates.parse_month_year_date(m['date']).strftime("%Y-%m-%d")
versions[version] = date
print(f"{version}: {date}")

View File

@@ -1,8 +1,8 @@
import re
import urllib.parse
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
from datetime import datetime
"""Fetch Firefox versions with their dates from https://www.mozilla.org/"""
@@ -12,13 +12,7 @@ PRODUCT = "firefox"
def format_date(text: str) -> str:
text = text.replace(')', '')
formats = ["%b %d, %Y", "%B %d, %Y"]
for f in formats:
try:
return datetime.strptime(text, f).strftime("%Y-%m-%d")
except ValueError:
pass
return ""
return dates.parse_date(text).strftime("%Y-%m-%d")
print(f"::group::{PRODUCT}")

View File

@@ -1,7 +1,7 @@
import re
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
from datetime import datetime
# https://regex101.com/r/zPxBqT/1
REGEX = r"\d.\d+\.\d+-gke\.\d+"
@@ -22,7 +22,7 @@ def parse_soup_for_versions(soup):
# h2 contains the date, which we parse
for h2 in section.find_all('h2'):
date = h2.get('data-text')
date = datetime.strptime(date, '%B %d, %Y').strftime('%Y-%m-%d')
date = dates.parse_date(date).strftime("%Y-%m-%d")
# The div next to the h2 contains the notes about changes made
# on that date
next_div = h2.find_next('div')

View File

@@ -1,14 +1,11 @@
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
from datetime import datetime
URL = "https://www.graalvm.org/release-calendar/"
# https://regex101.com/r/877ibq/1
regex = r"RHEL (?P<major>\d)(\. ?(?P<minor>\d+))?(( Update (?P<minor2>\d))| GA)?"
def parse_date(text):
return datetime.strptime(text, "%B %d, %Y").strftime("%Y-%m-%d")
def split_versions(text):
# GraalVM for JDK versions has to be prefixed as their release cycle collide
# with older GraalVM release cycles. Example: GraalVM for JDK 20 and 20.0.
@@ -21,7 +18,7 @@ soup = BeautifulSoup(response, features="html5lib")
versions = {}
for tr in soup.findAll("table")[1].find("tbody").findAll("tr"):
td_list = tr.findAll("td")
date = parse_date(td_list[0].get_text())
date = dates.parse_date(td_list[0].get_text()).strftime("%Y-%m-%d")
for version in split_versions(td_list[2].get_text()):
versions[version] = date

View File

@@ -1,17 +1,10 @@
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
from datetime import datetime
PRODUCT = "ibm-aix"
URL = "https://www.ibm.com/support/pages/aix-support-lifecycle-information"
# Convert date from e.g. "November 2022" format to "2022-11-01"
def convert_date(date_str):
return datetime.strptime(date_str, "%B %Y").strftime("%Y-%m-%d")
def fetch_releases():
response = endoflife.fetch_url(URL)
soup = BeautifulSoup(response, features="html5lib")
@@ -23,7 +16,7 @@ def fetch_releases():
for row in release_table.find_all("tr")[1:]:
cells = row.find_all("td")
version = cells[0].text.strip("AIX ").replace(' TL', '.')
date = convert_date(cells[1].text)
date = dates.parse_month_year_date(cells[1].text).strftime("%Y-%m-%d")
print(f"{version} : {date}")
releases[version] = date

View File

@@ -1,6 +1,6 @@
from requests_html import HTMLSession
from common import dates
from common import endoflife
from datetime import datetime
"""Fetch Jira versions with their dates from the Atlassian Website.
@@ -11,13 +11,7 @@ because the page needs JavaScript to render correctly.
PRODUCT = 'jira'
URL = 'https://www.atlassian.com/software/jira/update'
def parse_date(text):
return datetime.strptime(text, "%d-%b-%Y").strftime("%Y-%m-%d")
print(f"::group::{PRODUCT}")
session = HTMLSession()
r = session.get(URL)
r.html.render(sleep=1, scrolldown=3)
@@ -25,7 +19,8 @@ r.html.render(sleep=1, scrolldown=3)
versions = {}
for version_block in r.html.find('.versions-list'):
version = version_block.find('a.product-versions', first=True).attrs['data-version']
date = parse_date(version_block.find('.release-date', first=True).text)
date_text = version_block.find('.release-date', first=True).text
date = dates.parse_date(date_text).strftime('%Y-%m-%d')
print(f"{version}: {date}")
versions[version] = date

View File

@@ -1,8 +1,7 @@
import re
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
from datetime import datetime, timezone
from xml.dom.minidom import parseString
"""Fetch Looker versions with their dates from the Google Cloud release notes RSS feed.
@@ -14,16 +13,13 @@ ANNOUNCEMENT_PATTERN = re.compile(r"includes\s+the\s+following\s+changes", re.IG
VERSION_PATTERN = re.compile(r"Looker\s+(?P<version>\d+\.\d+)", re.IGNORECASE)
def parse_date(date_str):
return datetime.fromisoformat(date_str).astimezone(timezone.utc).strftime("%Y-%m-%d")
print(f"::group::{PRODUCT}")
versions = {}
response = endoflife.fetch_url(URL)
rss = parseString(response)
for item in rss.getElementsByTagName("entry"):
date = parse_date(item.getElementsByTagName("updated")[0].firstChild.nodeValue)
date = dates.parse_datetime(item.getElementsByTagName("updated")[0].firstChild.nodeValue).strftime("%Y-%m-%d")
content = item.getElementsByTagName("content")[0].firstChild.nodeValue
soup = BeautifulSoup(content, features="html5lib")

View File

@@ -1,26 +1,16 @@
import json
from common import dates
from common import endoflife
from datetime import datetime
PHP_MAJOR_VERSIONS = [4, 5, 7, 8]
# Date format is 03 Nov 2022
# With some versions using 03 November 2022 instead
# we return it as YYYY-MM-DD
def parse_date(date_str):
try:
return datetime.strptime(date_str, "%d %b %Y").strftime("%Y-%m-%d")
except ValueError:
return datetime.strptime(date_str, "%d %B %Y").strftime("%Y-%m-%d")
def fetch_versions(major_version):
url = f"https://www.php.net/releases/index.php?json&max=-1&version={major_version}"
response = endoflife.fetch_url(url)
data = json.loads(response)
for v in data:
data[v] = parse_date(data[v]["date"])
data[v] = dates.parse_date(data[v]["date"]).strftime("%Y-%m-%d")
print(f"{v}: {data[v]}")
return data

View File

@@ -1,6 +1,6 @@
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
from datetime import datetime
URL = "https://docs.plesk.com/release-notes/obsidian/change-log"
PRODUCT = "plesk"
@@ -27,7 +27,7 @@ def fetch_releases():
version = version.replace(' Update ', '.').replace('Plesk Obsidian ', '')
if ' ' in version:
continue
date = datetime.strptime(release.p.text.strip(), '%d %B %Y').strftime("%Y-%m-%d")
date = dates.parse_date(release.p.text).strftime("%Y-%m-%d")
result[version] = date
print(f"{version}: {date}")

View File

@@ -1,8 +1,8 @@
import json
import re
import sys
from common import dates
from common import endoflife
from datetime import datetime
METHOD = "pypi"
DEFAULT_TAG_TEMPLATE = ( # Same as used in Ruby (update.rb)
@@ -27,7 +27,7 @@ def fetch_releases(pypi_id, regex):
if re.match(r, version):
matches = True
if matches and R:
d = datetime.fromisoformat(R[0]["upload_time"]).strftime("%Y-%m-%d")
d = dates.parse_datetime(R[0]["upload_time"], to_utc=False).strftime("%Y-%m-%d")
releases[version] = d
print(f"{version}: {d}")

View File

@@ -1,7 +1,7 @@
import re
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
from datetime import datetime
VERSION_REGEX = r"(?P<v>\d+(?:\.\d+)*)" # https://regex101.com/r/BY1vwV/1
DBS = {
@@ -9,14 +9,6 @@ DBS = {
"postgresql": "https://docs.aws.amazon.com/AmazonRDS/latest/PostgreSQLReleaseNotes/postgresql-release-calendar.html",
}
def parse_date(d):
try:
return datetime.strptime(d, "%d %B %Y").strftime("%Y-%m-%d")
except ValueError:
return None
for db, url in DBS.items():
print(f"::group::{db}")
versions = {}
@@ -33,7 +25,7 @@ for db, url in DBS.items():
if len(columns) > 3:
m = re.search(VERSION_REGEX, columns[0].text.strip(), flags=re.IGNORECASE)
if m:
date = parse_date(columns[2].text.strip())
date = dates.parse_date(columns[2].text).strftime("%Y-%m-%d")
if date:
version = m.group("v")
print(f"{version} : {date}")

View File

@@ -1,16 +1,15 @@
import re
from common import dates
from common import endoflife
from datetime import datetime
URL = "https://raw.githubusercontent.com/rocky-linux/wiki.rockylinux.org/development/docs/include/releng/version_table.md"
REGEX = r"^(\d+\.\d+)$"
def parse_date(date_str):
date_str = date_str.replace(',', '').strip()
try:
return datetime.strptime(date_str, "%B %d %Y").strftime("%Y-%m-%d")
except ValueError:
return datetime.strptime(date_str, "%b %d %Y").strftime("%Y-%m-%d")
return dates.parse_date(date_str).strftime("%Y-%m-%d")
def parse_markdown_table(table_text):
lines = table_text.strip().split('\n')
@@ -26,6 +25,7 @@ def parse_markdown_table(table_text):
return versions
print("::group::rockylinux")
response = endoflife.fetch_url(URL)
versions = parse_markdown_table(response)

View File

@@ -1,19 +1,10 @@
import re
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
from datetime import datetime
PRODUCT = "sles"
URL = "https://www.suse.com/lifecycle"
DATE_FORMAT = "%d %b %Y"
# Convert date from e.g. "16 Jul 2018" to "2018-07-16"
def convert_date(date_str):
# If the date begins with a >3 letter month name, trim it to just 3 letters
# Strip out the Date: section from the start
d = re.sub(r'(\d{1,2}) (\w{3})(?:\w{1,4})? (\d{4})', r'\1 \2 \3', date_str)
return datetime.strptime(d, DATE_FORMAT).strftime('%Y-%m-%d')
def strip_version(version_str):
@@ -47,7 +38,7 @@ def fetch_releases():
version = strip_version(cells[0].text)
try:
release_date = convert_date(cells[1].text)
release_date = dates.parse_date(cells[1].text).strftime("%Y-%m-%d")
versions[version] = release_date
print(f"{version}: {release_date}")
except ValueError as e:

View File

@@ -1,7 +1,7 @@
import re
from bs4 import BeautifulSoup
from common import dates
from common import endoflife
from datetime import datetime
PRODUCT = "splunk"
URL = "https://docs.splunk.com/Documentation/Splunk"
@@ -9,10 +9,6 @@ RELNOTES_URL_TEMPLATE = "https://docs.splunk.com/Documentation/Splunk/{version}/
PATTERN = r"Splunk Enterprise (?P<version>\d+\.\d+(?:\.\d+)*) was (?:first )?released on (?P<date>\w+\s\d\d?,\s\d{4})\."
def convert_date(date: str) -> str:
return datetime.strptime(date, "%B %d, %Y").strftime("%Y-%m-%d")
def get_latest_minor_versions(versions):
versions_split = [version.split('.') for version in versions]
@@ -55,7 +51,7 @@ latest_minor_versions_urls = [RELNOTES_URL_TEMPLATE.format(version=v) for v in l
for response in endoflife.fetch_urls(latest_minor_versions_urls):
for (version, date_str) in re.findall(PATTERN, response.text, re.MULTILINE):
version = f"{version}.0" if len(version.split(".")) == 2 else version # convert x.y to x.y.0
date = convert_date(date_str)
date = dates.parse_date(date_str).strftime("%Y-%m-%d")
versions[version] = date
print(f"{version}: {date}")