osxphotos/osxphotos/gitignorefile.py
Rhet Turnbull 8be71243d5
Refactor ruff (#1155)
* Refactored to make ruff happy

* Refactored tests, fixed bug in Mojave path_raw
2023-08-12 19:24:46 -07:00

432 lines
13 KiB
Python

"""A spec-compliant `.gitignore` parser for Python.
Vendored from: https://github.com/excitoon/gitignorefile, with an added parse_pattern_list() function
that applies .gitignore rules to a list of patterns that do not come from an actual .gitignore file.
The original code was licensed under the MIT license, Copyright (c) 2022 Vladimir Chebotarev
"""
from __future__ import annotations
import os
import re
from typing import Callable
# Names of the ignore files looked up by default. `Cache` joins each name onto
# every ancestor directory of the path being checked.
DEFAULT_IGNORE_NAMES = [".gitignore", ".git/info/exclude"]
def parse_pattern_list(
    patterns: list[str], base_path: str | None = None
) -> Callable[[str], bool]:
    """Parse a list of patterns and return a callable to match against a path.

    Each pattern is interpreted like one line of a `.gitignore` file; entries
    that yield no rule (blank strings, comments, invalid patterns) are skipped.

    Args:
        patterns (list[str]): List of patterns to match against.
        base_path (str, optional): Base path for applying ignore rules.
            Defaults to the current working directory at the time of the call.

    Returns:
        Callable[[str], bool]: Callable which returns `True` if specified path is ignored.
        You can also pass `is_dir: bool` optional parameter if you know whether the specified path is a directory.
    """
    if base_path is None:
        # There is no ignore file whose directory could act as the base, and
        # `_IgnoreRules` cannot compute relative paths against `None` (the
        # matcher would raise `TypeError` on first use). Fall back to the
        # directory that relative paths are resolved against anyway.
        base_path = os.getcwd()

    rules = [
        rule
        for pattern in patterns
        if (rule := _rule_from_pattern(pattern.rstrip("\r\n")))
    ]
    return _IgnoreRules(rules, base_path).match
def parse(path, base_path=None):
    """Builds a matcher from the rules of a single `.gitignore` file.

    Args:
        path (str): Path to `.gitignore` file.
        base_path (str): Base path for applying ignore rules. When omitted, the
            directory containing `path` is used.

    Returns:
        Callable[[str], bool]: Callable which returns `True` if specified path is ignored.
        You can also pass `is_dir: bool` optional parameter if you know whether the specified path is a directory.
    """
    if base_path is None:
        containing_dir = os.path.dirname(path)
        if not containing_dir:
            # A bare file name has no directory part; resolve it first.
            containing_dir = os.path.dirname(os.path.abspath(path))
        base_path = containing_dir

    with open(path) as handle:
        candidates = (_rule_from_pattern(raw.rstrip("\r\n")) for raw in handle)
        # Lines that do not describe a rule come back as `None` and are dropped.
        rules = [candidate for candidate in candidates if candidate]

    return _IgnoreRules(rules, base_path).match
def ignore(ignore_names=DEFAULT_IGNORE_NAMES):
    """Creates an ignore callback usable with `shutil.copytree()`.

    The callback consults every ignore file found in the directory tree above
    each candidate entry.

    Args:
        ignore_names (list[str], optional): List of names of ignore files.

    Returns:
        Callable[[str, list[str]], set[str]]: Callable taking a directory and the
        names inside it, returning the set of names that are ignored.
    """
    is_ignored = Cache(ignore_names=ignore_names)

    def _ignored_names(root, names):
        # One shared `Cache` serves every directory visited by the caller.
        return {name for name in names if is_ignored(os.path.join(root, name))}

    return _ignored_names
def ignored(path, is_dir=None, ignore_names=DEFAULT_IGNORE_NAMES):
    """Tells whether a path is ignored by any ignore file in the directory tree.

    A fresh `Cache` is built on every call, so nothing is remembered between
    invocations.

    Args:
        path (str): Path to check against ignore rules.
        is_dir (bool, optional): Set if you know whether the specified path is a directory.
        ignore_names (list[str], optional): List of names of ignore files.

    Returns:
        bool: `True` if the path is ignored.
    """
    one_shot_cache = Cache(ignore_names=ignore_names)
    return one_shot_cache(path, is_dir=is_dir)
class Cache:
    """Caches information about different `.gitignore` files in the directory tree.

    Allows to reduce number of queries to filesystem to minimum.
    """

    def __init__(self, ignore_names=DEFAULT_IGNORE_NAMES):
        """Constructs `Cache` objects.

        Args:
            ignore_names (list[str], optional): List of names of ignore files.
        """
        self.__ignore_names = ignore_names
        # Maps a directory's path components (tuple) to the list of matchers
        # that apply to paths inside that directory.
        self.__gitignores = {}

    def __call__(self, path, is_dir=None):
        """Checks whether the specified path is ignored.

        Args:
            path (str): Path to check against ignore rules.
            is_dir (bool, optional): Set if you know whether the specified path is a directory.

        Returns:
            bool: `True` if any matcher collected for the path's ancestor
            directories reports the path as ignored.
        """
        path = _Path(path)
        # Ignore-bearing directories discovered during this call, mapped to
        # `(parsed matchers, directories without ignore files seen before them)`.
        add_to_children = {}
        # Directories without ignore files seen since the last ignore-bearing one.
        plain_paths = []
        # Walk from the nearest ancestor upwards; stop at the first directory
        # that is already cached.
        for parent in path.parents():
            if parent.parts in self.__gitignores:
                break
            ignore_paths = []
            for ignore_name in self.__ignore_names:
                ignore_path = parent.join(ignore_name)
                if ignore_path.isfile():
                    ignore_paths.append(str(ignore_path))
            if ignore_paths:
                matches = [
                    parse(ignore_path, base_path=parent) for ignore_path in ignore_paths
                ]
                add_to_children[parent] = (matches, plain_paths)
                plain_paths = []
            else:
                plain_paths.append(parent)
        else:
            # No cached ancestor was met: anchor the chain on an empty path
            # that carries no matchers.
            parent = _Path(tuple())  # Null path.
            self.__gitignores[parent.parts] = []
        # Remaining plain directories sit directly below the cached (or null)
        # ancestor and share its list object.
        for plain_path in plain_paths:
            # assert plain_path.parts not in self.__gitignores
            self.__gitignores[plain_path.parts] = self.__gitignores[parent.parts]
        # Process ignore-bearing directories in reverse discovery order, i.e.
        # from the one farthest from `path` to the one nearest to it.
        for parent, (_, parent_plain_paths) in reversed(list(add_to_children.items())):
            # assert parent.parts not in self.__gitignores
            # Start from a copy of the list cached for the directory one level up.
            self.__gitignores[parent.parts] = self.__gitignores[
                parent.parts[:-1]
            ].copy()
            # Append the matchers of every directory discovered in this call,
            # from the farthest down to and including the current one.
            # The comparison below is by identity (`_Path` defines no `__eq__`).
            for parent_to_add, (gitignores_to_add, _) in reversed(
                list(add_to_children.items())
            ):
                self.__gitignores[parent.parts].extend(gitignores_to_add)
                if parent_to_add == parent:
                    break
            # NOTE(review): presumably reversed so matchers from deeper
            # directories are tried first; confirm the intent upstream.
            self.__gitignores[parent.parts].reverse()
            # Plain directories recorded with this one share its list object.
            for plain_path in parent_plain_paths:
                # assert plain_path.parts not in self.__gitignores
                self.__gitignores[plain_path.parts] = self.__gitignores[parent.parts]
        # This parent comes either from first or second loop.
        return any((m(path, is_dir=is_dir) for m in self.__gitignores[parent.parts]))
class _Path:
def __init__(self, path):
if isinstance(path, (str, bytes, os.PathLike)):
abs_path = os.path.abspath(path)
self.__parts = tuple(_path_split(abs_path))
self.__joined = abs_path
self.__is_dir = None
else:
self.__parts = path
self.__joined = None
self.__is_dir = None
@property
def parts(self):
return self.__parts
def join(self, name):
return _Path(self.__parts + (name,))
def relpath(self, base_path):
if self.__parts[: len(base_path.__parts)] == base_path.__parts:
return "/".join(self.__parts[len(base_path.__parts) :])
else:
return None
def parents(self):
for i in range(len(self.__parts) - 1, 0, -1):
yield _Path(self.__parts[:i])
def isfile(self):
return os.path.isfile(str(self))
def isdir(self):
if self.__is_dir is not None:
return self.__is_dir
self.__is_dir = os.path.isdir(str(self))
return self.__is_dir
def __str__(self):
if self.__joined is None:
self.__joined = (
os.sep.join(self.__parts) if self.__parts != ("",) else os.sep
)
return self.__joined
def _rule_from_pattern(pattern):
# Takes a `.gitignore` match pattern, such as "*.py[cod]" or "**/*.bak",
# and returns an `_IgnoreRule` suitable for matching against files and
# directories. Patterns which do not match files, such as comments
# and blank lines, will return `None`.
# Early returns follow
# Discard comments and separators
if not pattern.lstrip() or pattern.lstrip().startswith("#"):
return
# Discard anything with more than two consecutive asterisks
if "***" in pattern:
return
# Strip leading bang before examining double asterisks
if pattern.startswith("!"):
negation = True
pattern = pattern[1:]
else:
negation = False
# Discard anything with invalid double-asterisks -- they can appear
# at the start or the end, or be surrounded by slashes
for m in re.finditer("\\*\\*", pattern):
start_index = m.start()
if (
start_index != 0
and start_index != len(pattern) - 2
and (pattern[start_index - 1] != "/" or pattern[start_index + 2] != "/")
):
return
# Special-casing '/', which doesn't match any files or directories
if pattern.rstrip() == "/":
return
directory_only = pattern.endswith("/")
# A slash is a sign that we're tied to the `base_path` of our rule
# set.
anchored = "/" in pattern[:-1]
if pattern.startswith("/"):
pattern = pattern[1:]
if pattern.startswith("**"):
pattern = pattern[2:]
anchored = False
if pattern.startswith("/"):
pattern = pattern[1:]
if pattern.endswith("/"):
pattern = pattern[:-1]
# patterns with leading hashes are escaped with a backslash in front, unescape it
if pattern.startswith("\\#"):
pattern = pattern[1:]
# trailing spaces are ignored unless they are escaped with a backslash
i = len(pattern) - 1
striptrailingspaces = True
while i > 1 and pattern[i] == " ":
if pattern[i - 1] == "\\":
pattern = pattern[: i - 1] + pattern[i:]
i -= 1
striptrailingspaces = False
else:
if striptrailingspaces:
pattern = pattern[:i]
i -= 1
regexp = _fnmatch_pathname_to_regexp(pattern, anchored, directory_only)
return _IgnoreRule(regexp, negation, directory_only)
class _IgnoreRules:
    """Ordered set of ignore rules applied relative to one base path."""

    def __init__(self, rules, base_path):
        self.__rules = rules
        # Without negated rules the first hit settles the answer.
        self.__can_return_immediately = all(not rule.negation for rule in rules)
        if isinstance(base_path, _Path):
            self.__base_path = base_path
        else:
            self.__base_path = _Path(base_path)

    def match(self, path, is_dir=None):
        """Tell whether `path` is ignored by these rules.

        Paths outside the base path are never ignored. When `is_dir` is not
        given it is determined from the filesystem.
        """
        candidate = path if isinstance(path, _Path) else _Path(path)
        rel_path = candidate.relpath(self.__base_path)
        if rel_path is None:
            return False

        if is_dir is None:
            is_dir = candidate.isdir()  # TODO Pass callable here.

        if self.__can_return_immediately:
            for rule in self.__rules:
                if rule.match(rel_path, is_dir):
                    return True
            return False

        # With negations present every rule is consulted; the last matching
        # rule decides the outcome.
        matched = False
        for rule in self.__rules:
            if rule.match(rel_path, is_dir):
                matched = not rule.negation
        return matched
class _IgnoreRule:
def __init__(self, regexp, negation, directory_only):
self.__regexp = re.compile(regexp)
self.__negation = negation
self.__directory_only = directory_only
self.__match = self.__regexp.match
@property
def regexp(self):
return self.__regexp
@property
def negation(self):
return self.__negation
def match(self, rel_path, is_dir):
m = self.__match(rel_path)
# If we need a directory, check there is something after slash and if there is not, target must be a directory.
# If there is something after slash then it's a directory irrelevant to type of target.
# `self.directory_only` implies we have group number 1.
# N.B. Question mark inside a group without a name can shift indices. :(
return m and (not self.__directory_only or m.group(1) is not None or is_dir)
if os.altsep is not None:
_all_seps_expr = f"[{re.escape(os.sep)}{re.escape(os.altsep)}]"
_path_split = lambda path: re.split(_all_seps_expr, path) # noqa: E731
else:
_path_split = lambda path: path.split(os.sep) # noqa: E731
def _fnmatch_pathname_to_regexp(pattern, anchored, directory_only):
# Implements `fnmatch` style-behavior, as though with `FNM_PATHNAME` flagged;
# the path separator will not match shell-style `*` and `.` wildcards.
# Frustratingly, python's fnmatch doesn't provide the FNM_PATHNAME
# option that `.gitignore`'s behavior depends on.
if not pattern:
if directory_only:
return "[^/]+(/.+)?$" # Empty name means no path fragment.
else:
return ".*"
i, n = 0, len(pattern)
res = ["(?:^|.+/)" if not anchored else ""]
while i < n:
c = pattern[i]
i += 1
if c == "*":
if i < n and pattern[i] == "*":
i += 1
if i < n and pattern[i] == "/":
i += 1
res.append("(.+/)?") # `/**/` matches `/`.
else:
res.append(".*")
else:
res.append("[^/]*")
elif c == "?":
res.append("[^/]")
elif c == "[":
j = i
if j < n and pattern[j] == "!":
j += 1
if j < n and pattern[j] == "]":
j += 1
while j < n and pattern[j] != "]":
j += 1
if j >= n:
res.append("\\[")
else:
stuff = pattern[i:j].replace("\\", "\\\\")
i = j + 1
if stuff[0] == "!":
stuff = f"^{stuff[1:]}"
elif stuff[0] == "^":
stuff = f"\\{stuff}"
res.append(f"[{stuff}]")
else:
res.append(re.escape(c))
if (
directory_only
): # In this case we are interested if there is something after slash.
res.append("(/.+)?$")
else:
res.append("(?:/.+)?$")
return "".join(res)