Source code for simple503

#!/usr/bin/env python3
#
#  __init__.py
"""
:pep:`503` Python package repository generator.
"""
#
#  Copyright © 2021 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the "Software"), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in all
#  copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
#  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
#  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
#  OR OTHER DEALINGS IN THE SOFTWARE.
#

# stdlib
import posixpath
import re
import shutil
from collections import defaultdict
from html import escape
from operator import attrgetter
from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, NamedTuple, Optional, Union

# 3rd party
from airium import Airium  # type: ignore
from apeye.url import URL
from dist_meta import distributions, metadata
from domdf_python_tools.paths import PathPlus
from domdf_python_tools.typing import PathLike
from natsort import natsorted
from shippinglabel import normalize
from shippinglabel.checksum import get_sha256_hash
from typing_extensions import Literal

if TYPE_CHECKING:
	# stdlib
	from hashlib import _Hash
else:
	try:
		# 3rd party
		from _hashlib import HASH as _Hash
	except ImportError:  # pragma: no cover
		try:
			# 3rd party
			from _hashlib import Hash as _Hash
		except ImportError:
			pass

__author__: str = "Dominic Davis-Foster"
__copyright__: str = "2021 Dominic Davis-Foster"
__license__: str = "MIT License"
__version__: str = "0.4.0"
__email__: str = "dominic@davis-foster.co.uk"

__all__ = [
		"WheelFile",
		"generate_index",
		"generate_project_page",
		"make_simple",
		]


def make_simple(
		origin: PathLike,
		target: Optional[PathLike] = None,
		base_url: Union[str, URL] = '/',
		*,
		sort: bool = False,
		copy: bool = False,
		extract_metadata: bool = True,
		) -> Dict[str, List["WheelFile"]]:
	"""
	Generate a simple repository of Python wheels.

	:param origin: A directory containing wheels. The wheels may be arranged in subdirectories.
	:param target: The directory to create the repository in.
		The directory structure of ``origin`` will be recreated there.
		Defaults to ``origin``.
	:no-default target:
	:param base_url: The base URL of the simple repository.
	:param sort: Sort the wheel files into per-project base directories.
	:param copy: Copy files from the source to the destination, rather than moving them.
	:param extract_metadata: Extract and serve ``METADATA`` files per :pep:`658`.

	:returns: A mapping of (unnormalized) project names to a list of wheels for that project.

	.. versionchanged:: 0.2.0

		Now ignores wheels in the following directories:
		``.git``, ``.hg``, ``.tox``, ``.tox4``, ``.nox``, ``venv``, ``.venv``.

	.. versionchanged:: 0.3.0

		* Renamed the ``move`` option to ``sort`` to better reflect its behaviour.
		* Files are moved to the destination by default, unless the ``copy`` option is :py:obj:`True`.

	.. versionchanged:: 0.4.0

		Added the ``extract_metadata`` option.
	"""

	if target is None:
		target = origin

	origin = PathPlus(origin).abspath()
	target = PathPlus(target).abspath()
	target.maybe_make(parents=True)

	projects: Dict[str, List[WheelFile]] = defaultdict(list)
	move_operation: Callable = shutil.copyfile if copy else shutil.move  # type: ignore[assignment]

	unwanted_dirs = (".git", ".hg", "venv", ".venv", ".tox", ".tox4", ".nox")

	for wheel_file in origin.iterchildren(exclude_dirs=unwanted_dirs, match="**/*.whl"):
		target_file = target / wheel_file.relative_to(origin)

		with distributions.WheelDistribution.from_path(wheel_file) as wd:
			if not wd.has_file("METADATA"):  # pragma: no cover
				raise FileNotFoundError(f"METADATA file not found in {wheel_file}")

			metadata_string = wd.read_file("METADATA")
			wheel_metadata = metadata.loads(metadata_string)

		if sort:
			# Move to the appropriate directory
			project_dir = target / normalize(wheel_metadata["Name"])
			project_dir.maybe_make()

			if wheel_file.relative_to(origin).parts[0] != project_dir.parts[-1]:
				destination = project_dir / wheel_file.name
				destination.parent.maybe_make(parents=True)
				move_operation(wheel_file, destination)
				target_file = destination

		else:
			if not target_file.is_file() or not wheel_file.samefile(target_file):
				# note: will not overwrite files with the same name, even if the source file changed
				target_file.parent.maybe_make(parents=True)
				move_operation(wheel_file, target_file)

		if extract_metadata:
			metadata_filename = target_file.with_suffix(f"{target_file.suffix}.metadata")
			metadata_filename.write_text(metadata_string)
			metadata_hash = get_sha256_hash(metadata_filename)
		else:
			metadata_hash = None

		projects[wheel_metadata["Name"]].append(
				WheelFile(
						filename=target_file.relative_to(target).as_posix(),
						wheel_hash=get_sha256_hash(target_file),
						requires_python=wheel_metadata.get("Requires-Python"),
						metadata_hash=metadata_hash,
						)
				)

	index_content = str(generate_index(projects.keys(), base_url=base_url))
	_update_file(target / "index.html", index_content)

	for project_name, project_files in projects.items():
		project_dir = target / normalize(project_name)
		project_dir.maybe_make()

		project_index = generate_project_page(
				project_name,
				natsorted(project_files, key=attrgetter("filename"), reverse=True),
				base_url,
				)

		_update_file(project_dir / "index.html", str(project_index))

	return projects


def generate_index(projects: Iterable[str], base_url: Union[str, URL] = '/') -> Airium:
	"""
	Generate the simple repository index page, containing a list of all projects.

	:param projects: The list of projects to generate links for.
	:param base_url: The base URL of the Python package repository.
		For example, with PyPI's URL, a URL of ``/foo/`` would be ``https://pypi.org/simple/foo/``.
	"""

	base_url = URL(base_url)

	index = Airium()
	index("<!DOCTYPE html>")

	with index.html(lang="en"):
		with index.head():
			get_meta_tags(index)

			with index.title():
				index("Simple Package Repository")

		with index.body():
			for project_name in natsorted(projects, key=str.lower):
				normalized_name = normalize(project_name)

				with index.a(href=f"{base_url / normalized_name}/"):
					index(project_name)

				index.br()

	return index
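

# Hedged usage sketch (not part of the upstream module): rendering the index
# page for a couple of placeholder project names.
def _example_generate_index() -> None:  # pragma: no cover
	index = generate_index(["domdf-python-tools", "Sphinx"], base_url="/simple/")

	# ``str()`` on the Airium document gives the generated HTML.
	print(str(index))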


class WheelFile(NamedTuple):
	"""
	Represents a wheel file in the repository.
	"""

	#: The name of the wheel file.
	filename: str

	wheel_hash: "_Hash"
	"""
	The hash of the wheel file.

	Repositories SHOULD choose a hash function from one of the ones guaranteed to be available
	via the hashlib module in the Python standard library
	(currently ``md5``, ``sha1``, ``sha224``, ``sha256``, ``sha384``, ``sha512``).
	The current recommendation is to use ``sha256``.
	"""

	requires_python: Optional[str] = None
	"""
	The ``Requires-Python`` attribute from the wheel's ``METADATA`` file.

	:py:obj:`None` if undefined.
	"""

	metadata_hash: Union["_Hash", Literal[True], None] = None
	"""
	The hash of the wheel's ``METADATA`` file.

	:py:obj:`None` if the metadata file is not exposed.
	May be :py:obj:`True` if no hash is available.
	"""

	def as_anchor(self, page: Airium, base_url: Union[str, URL] = '/') -> None:
		"""
		Generate an anchor tag in a :class:`airium.Airium` document for this file.

		:param page:
		:param base_url: The base URL of the Python package repository.
		"""

		base_url = URL(base_url)

		href = f"{base_url / self.filename}#{self.wheel_hash.name.lower()}={self.wheel_hash.hexdigest()}"
		kwargs = {"href": href}

		if self.requires_python is not None:
			kwargs["data-requires-python"] = escape(self.requires_python)

		if self.metadata_hash is True:
			kwargs["data-dist-info-metadata"] = "true"
		elif self.metadata_hash is not None:
			hash_string = f"{self.metadata_hash.name.lower()}={self.metadata_hash.hexdigest()}"
			kwargs["data-dist-info-metadata"] = hash_string

		with page.a(**kwargs):
			page(posixpath.basename(self.filename))
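

# Hedged usage sketch (not part of the upstream module): constructing a
# ``WheelFile`` by hand and rendering its anchor tag.  The filename, hash input
# and ``requires_python`` value are invented for illustration; ``make_simple``
# normally builds these records itself.
def _example_wheel_file_anchor() -> None:  # pragma: no cover
	# stdlib
	import hashlib

	wheel = WheelFile(
			filename="example/example-0.1.0-py3-none-any.whl",
			wheel_hash=hashlib.sha256(b"wheel contents"),
			requires_python=">=3.6",
			)

	page = Airium()
	wheel.as_anchor(page, base_url="/simple/")

	# The anchor carries the sha256 fragment and a data-requires-python attribute.
	print(str(page))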


def generate_project_page(name: str, files: Iterable[WheelFile], base_url: Union[str, URL] = '/') -> Airium:
	"""
	Generate the repository page for a project.

	:param name: The project name, e.g. ``domdf-python-tools``.
	:param files: An iterable of files for the project, which will be linked to from the index page.
	:param base_url: The base URL of the Python package repository.
		For example, with PyPI's URL, a URL of ``/foo/`` would be ``https://pypi.org/simple/foo/``.
	"""

	name = normalize(name)
	base_url = URL(base_url)

	page = Airium()
	page("<!DOCTYPE html>")

	with page.html(lang="en"):
		with page.head():
			get_meta_tags(page)

			with page.title():
				page(f"Links for {name}")

		with page.body():
			with page.h1():
				# Not part of the spec, but allowed
				page(f"Links for {name}")

			for wheel_file in files:
				wheel_file.as_anchor(page, base_url)
				page.br()

	return page
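

# Hedged usage sketch (not part of the upstream module): rendering a project
# page for a single invented wheel file.
def _example_generate_project_page() -> None:  # pragma: no cover
	# stdlib
	import hashlib

	files = [
			WheelFile(
					filename="example/example-0.1.0-py3-none-any.whl",
					wheel_hash=hashlib.sha256(b"wheel contents"),
					),
			]

	page = generate_project_page("Example", files, base_url="/simple/")
	print(str(page))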


def get_meta_tags(page: Airium) -> None:
	# Not part of the spec, but allowed
	page.meta(name="generator", content=f"simple503 version {__version__}")
	page.meta(name="pypi:repository-version", content="1.0")
	page.meta(charset="UTF-8")


def cleanup(directory: PathLike) -> None:
	"""
	Cleanup files generated by ``simple503`` in the directory.

	This entails removing:

	* all ``index.html`` files
	* all ``.whl.metadata`` files
	* all empty directories.

	:param directory:
	"""

	directory = PathPlus(directory).abspath()

	for filename in directory.rglob("**/*"):
		if not filename.is_file():
			continue

		if filename.match("**/index.html"):
			filename.unlink()
		elif filename.match("**/*.whl.metadata"):
			filename.unlink()

	for filename in directory.rglob("**/*"):
		if not filename.is_dir():
			continue

		if next(filename.iterdir(), None) is None:
			filename.rmdir()


_minify_re = re.compile(r"\n\s*")


def _update_file(filename: PathPlus, new_content: str) -> bool:
	"""
	Write ``new_content`` to ``filename``, but only if the content has changed.

	Requires the ``incremental`` extra (``BeautifulSoup`` and ``html5lib``), otherwise always writes.

	.. versionadded:: 0.2.0  (private)

	:param filename:
	:param new_content:

	:returns: Whether the file was updated on disk.
	"""

	try:
		# 3rd party
		from bs4 import BeautifulSoup as soup  # type: ignore
	except ImportError:  # pragma: no cover
		soup = None

	if not filename.exists() or soup is None:
		filename.write_clean(new_content)
		return True

	current_soup = soup(_minify_re.sub('', filename.read_text().strip()), "html.parser").body
	new_soup = soup(_minify_re.sub('', new_content.strip()), "html.parser").body

	if current_soup != new_soup:
		filename.write_clean(new_content)
		return True

	return False
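

# Hedged usage sketch (not part of the upstream module): refreshing one
# generated page and then removing everything ``simple503`` produced.  The
# paths below are placeholders for illustration.
def _example_update_and_cleanup() -> None:  # pragma: no cover
	# Only rewrites the file when the rendered HTML actually differs
	# (or unconditionally if the "incremental" extra is not installed).
	changed = _update_file(PathPlus("repo/example/index.html"), "<!DOCTYPE html><html></html>")
	print("updated:", changed)

	# Remove all index.html files, *.whl.metadata files and empty directories.
	cleanup("repo")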