432 lines
13 KiB
Python
432 lines
13 KiB
Python
"""A spec-compliant `.gitignore` parser for Python.
|
|
|
|
Versioned from: https://github.com/excitoon/gitignorefile to add parse_pattern_list() function
|
|
to apply .gitignore rules to a list of patterns that aren't actually a .gitignore file.
|
|
|
|
The original code was licensed under the MIT license, Copyright (c) 2022 Vladimir Chebotarev
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import re
|
|
from typing import Callable
|
|
|
|
DEFAULT_IGNORE_NAMES = [".gitignore", ".git/info/exclude"]
|
|
|
|
|
|
def parse_pattern_list(
|
|
patterns: list[str], base_path: str = None
|
|
) -> Callable[[str], bool]:
|
|
"""Parse a list of patterns and return a callable to match against a path.
|
|
|
|
Args:
|
|
patterns (list[str]): List of patterns to match against.
|
|
base_path (str): Base path for applying ignore rules.
|
|
|
|
Returns:
|
|
Callable[[str], bool]: Callable which returns `True` if specified path is ignored.
|
|
You can also pass `is_dir: bool` optional parameter if you know whether the specified path is a directory.
|
|
"""
|
|
rules = []
|
|
for pattern in patterns:
|
|
pattern = pattern.rstrip("\r\n")
|
|
if rule := _rule_from_pattern(pattern):
|
|
rules.append(rule)
|
|
|
|
return _IgnoreRules(rules, base_path).match
|
|
|
|
|
|
def parse(path, base_path=None):
|
|
"""Parses single `.gitignore` file.
|
|
|
|
Args:
|
|
path (str): Path to `.gitignore` file.
|
|
base_path (str): Base path for applying ignore rules.
|
|
|
|
Returns:
|
|
Callable[[str], bool]: Callable which returns `True` if specified path is ignored.
|
|
You can also pass `is_dir: bool` optional parameter if you know whether the specified path is a directory.
|
|
"""
|
|
|
|
if base_path is None:
|
|
base_path = os.path.dirname(path) or os.path.dirname(os.path.abspath(path))
|
|
|
|
rules = []
|
|
with open(path) as ignore_file:
|
|
for line in ignore_file:
|
|
line = line.rstrip("\r\n")
|
|
if rule := _rule_from_pattern(line):
|
|
rules.append(rule)
|
|
|
|
return _IgnoreRules(rules, base_path).match
|
|
|
|
|
|
def ignore(ignore_names=DEFAULT_IGNORE_NAMES):
|
|
"""Returns `shutil.copytree()`-compatible ignore function for skipping ignored files.
|
|
|
|
It will check if file is ignored by any `.gitignore` in the directory tree.
|
|
|
|
Args:
|
|
ignore_names (list[str], optional): List of names of ignore files.
|
|
|
|
Returns:
|
|
Callable[[str, list[str]], list[str]]: Callable compatible with `shutil.copytree()`.
|
|
"""
|
|
|
|
matches = Cache(ignore_names=ignore_names)
|
|
return lambda root, names: {
|
|
name for name in names if matches(os.path.join(root, name))
|
|
}
|
|
|
|
|
|
def ignored(path, is_dir=None, ignore_names=DEFAULT_IGNORE_NAMES):
|
|
"""Checks if file is ignored by any `.gitignore` in the directory tree.
|
|
|
|
Args:
|
|
path (str): Path to check against ignore rules.
|
|
is_dir (bool, optional): Set if you know whether the specified path is a directory.
|
|
ignore_names (list[str], optional): List of names of ignore files.
|
|
|
|
Returns:
|
|
bool: `True` if the path is ignored.
|
|
"""
|
|
|
|
return Cache(ignore_names=ignore_names)(path, is_dir=is_dir)
|
|
|
|
|
|
class Cache:
|
|
"""Caches information about different `.gitignore` files in the directory tree.
|
|
|
|
Allows to reduce number of queries to filesystem to mininum.
|
|
"""
|
|
|
|
def __init__(self, ignore_names=DEFAULT_IGNORE_NAMES):
|
|
"""Constructs `Cache` objects.
|
|
|
|
Args:
|
|
ignore_names (list[str], optional): List of names of ignore files.
|
|
"""
|
|
|
|
self.__ignore_names = ignore_names
|
|
self.__gitignores = {}
|
|
|
|
def __call__(self, path, is_dir=None):
|
|
"""Checks whether the specified path is ignored.
|
|
|
|
Args:
|
|
path (str): Path to check against ignore rules.
|
|
is_dir (bool, optional): Set if you know whether the specified path is a directory.
|
|
"""
|
|
|
|
path = _Path(path)
|
|
add_to_children = {}
|
|
plain_paths = []
|
|
for parent in path.parents():
|
|
if parent.parts in self.__gitignores:
|
|
break
|
|
|
|
ignore_paths = []
|
|
for ignore_name in self.__ignore_names:
|
|
ignore_path = parent.join(ignore_name)
|
|
if ignore_path.isfile():
|
|
ignore_paths.append(str(ignore_path))
|
|
|
|
if ignore_paths:
|
|
matches = [
|
|
parse(ignore_path, base_path=parent) for ignore_path in ignore_paths
|
|
]
|
|
add_to_children[parent] = (matches, plain_paths)
|
|
plain_paths = []
|
|
|
|
else:
|
|
plain_paths.append(parent)
|
|
|
|
else:
|
|
parent = _Path(tuple()) # Null path.
|
|
self.__gitignores[parent.parts] = []
|
|
|
|
for plain_path in plain_paths:
|
|
# assert plain_path.parts not in self.__gitignores
|
|
self.__gitignores[plain_path.parts] = self.__gitignores[parent.parts]
|
|
|
|
for parent, (_, parent_plain_paths) in reversed(list(add_to_children.items())):
|
|
# assert parent.parts not in self.__gitignores
|
|
self.__gitignores[parent.parts] = self.__gitignores[
|
|
parent.parts[:-1]
|
|
].copy()
|
|
for parent_to_add, (gitignores_to_add, _) in reversed(
|
|
list(add_to_children.items())
|
|
):
|
|
self.__gitignores[parent.parts].extend(gitignores_to_add)
|
|
if parent_to_add == parent:
|
|
break
|
|
|
|
self.__gitignores[parent.parts].reverse()
|
|
|
|
for plain_path in parent_plain_paths:
|
|
# assert plain_path.parts not in self.__gitignores
|
|
self.__gitignores[plain_path.parts] = self.__gitignores[parent.parts]
|
|
|
|
# This parent comes either from first or second loop.
|
|
return any((m(path, is_dir=is_dir) for m in self.__gitignores[parent.parts]))
|
|
|
|
|
|
class _Path:
|
|
def __init__(self, path):
|
|
if isinstance(path, (str, bytes, os.PathLike)):
|
|
abs_path = os.path.abspath(path)
|
|
self.__parts = tuple(_path_split(abs_path))
|
|
self.__joined = abs_path
|
|
self.__is_dir = None
|
|
|
|
else:
|
|
self.__parts = path
|
|
self.__joined = None
|
|
self.__is_dir = None
|
|
|
|
@property
|
|
def parts(self):
|
|
return self.__parts
|
|
|
|
def join(self, name):
|
|
return _Path(self.__parts + (name,))
|
|
|
|
def relpath(self, base_path):
|
|
if self.__parts[: len(base_path.__parts)] == base_path.__parts:
|
|
return "/".join(self.__parts[len(base_path.__parts) :])
|
|
|
|
else:
|
|
return None
|
|
|
|
def parents(self):
|
|
for i in range(len(self.__parts) - 1, 0, -1):
|
|
yield _Path(self.__parts[:i])
|
|
|
|
def isfile(self):
|
|
return os.path.isfile(str(self))
|
|
|
|
def isdir(self):
|
|
if self.__is_dir is not None:
|
|
return self.__is_dir
|
|
self.__is_dir = os.path.isdir(str(self))
|
|
return self.__is_dir
|
|
|
|
def __str__(self):
|
|
if self.__joined is None:
|
|
self.__joined = (
|
|
os.sep.join(self.__parts) if self.__parts != ("",) else os.sep
|
|
)
|
|
return self.__joined
|
|
|
|
|
|
def _rule_from_pattern(pattern):
|
|
# Takes a `.gitignore` match pattern, such as "*.py[cod]" or "**/*.bak",
|
|
# and returns an `_IgnoreRule` suitable for matching against files and
|
|
# directories. Patterns which do not match files, such as comments
|
|
# and blank lines, will return `None`.
|
|
|
|
# Early returns follow
|
|
# Discard comments and separators
|
|
if not pattern.lstrip() or pattern.lstrip().startswith("#"):
|
|
return
|
|
|
|
# Discard anything with more than two consecutive asterisks
|
|
if "***" in pattern:
|
|
return
|
|
|
|
# Strip leading bang before examining double asterisks
|
|
if pattern.startswith("!"):
|
|
negation = True
|
|
pattern = pattern[1:]
|
|
else:
|
|
negation = False
|
|
|
|
# Discard anything with invalid double-asterisks -- they can appear
|
|
# at the start or the end, or be surrounded by slashes
|
|
for m in re.finditer("\\*\\*", pattern):
|
|
start_index = m.start()
|
|
if (
|
|
start_index != 0
|
|
and start_index != len(pattern) - 2
|
|
and (pattern[start_index - 1] != "/" or pattern[start_index + 2] != "/")
|
|
):
|
|
return
|
|
|
|
# Special-casing '/', which doesn't match any files or directories
|
|
if pattern.rstrip() == "/":
|
|
return
|
|
|
|
directory_only = pattern.endswith("/")
|
|
|
|
# A slash is a sign that we're tied to the `base_path` of our rule
|
|
# set.
|
|
anchored = "/" in pattern[:-1]
|
|
|
|
if pattern.startswith("/"):
|
|
pattern = pattern[1:]
|
|
if pattern.startswith("**"):
|
|
pattern = pattern[2:]
|
|
anchored = False
|
|
if pattern.startswith("/"):
|
|
pattern = pattern[1:]
|
|
if pattern.endswith("/"):
|
|
pattern = pattern[:-1]
|
|
|
|
# patterns with leading hashes are escaped with a backslash in front, unescape it
|
|
if pattern.startswith("\\#"):
|
|
pattern = pattern[1:]
|
|
|
|
# trailing spaces are ignored unless they are escaped with a backslash
|
|
i = len(pattern) - 1
|
|
striptrailingspaces = True
|
|
while i > 1 and pattern[i] == " ":
|
|
if pattern[i - 1] == "\\":
|
|
pattern = pattern[: i - 1] + pattern[i:]
|
|
i -= 1
|
|
striptrailingspaces = False
|
|
else:
|
|
if striptrailingspaces:
|
|
pattern = pattern[:i]
|
|
i -= 1
|
|
|
|
regexp = _fnmatch_pathname_to_regexp(pattern, anchored, directory_only)
|
|
return _IgnoreRule(regexp, negation, directory_only)
|
|
|
|
|
|
class _IgnoreRules:
|
|
def __init__(self, rules, base_path):
|
|
self.__rules = rules
|
|
self.__can_return_immediately = not any((r.negation for r in rules))
|
|
self.__base_path = (
|
|
_Path(base_path) if not isinstance(base_path, _Path) else base_path
|
|
)
|
|
|
|
def match(self, path, is_dir=None):
|
|
if not isinstance(path, _Path):
|
|
path = _Path(path)
|
|
|
|
rel_path = path.relpath(self.__base_path)
|
|
|
|
if rel_path is not None:
|
|
if is_dir is None:
|
|
is_dir = path.isdir() # TODO Pass callable here.
|
|
|
|
if self.__can_return_immediately:
|
|
return any((r.match(rel_path, is_dir) for r in self.__rules))
|
|
|
|
else:
|
|
matched = False
|
|
for rule in self.__rules:
|
|
if rule.match(rel_path, is_dir):
|
|
matched = not rule.negation
|
|
|
|
else:
|
|
return matched
|
|
|
|
else:
|
|
return False
|
|
|
|
|
|
class _IgnoreRule:
|
|
def __init__(self, regexp, negation, directory_only):
|
|
self.__regexp = re.compile(regexp)
|
|
self.__negation = negation
|
|
self.__directory_only = directory_only
|
|
self.__match = self.__regexp.match
|
|
|
|
@property
|
|
def regexp(self):
|
|
return self.__regexp
|
|
|
|
@property
|
|
def negation(self):
|
|
return self.__negation
|
|
|
|
def match(self, rel_path, is_dir):
|
|
m = self.__match(rel_path)
|
|
|
|
# If we need a directory, check there is something after slash and if there is not, target must be a directory.
|
|
# If there is something after slash then it's a directory irrelevant to type of target.
|
|
# `self.directory_only` implies we have group number 1.
|
|
# N.B. Question mark inside a group without a name can shift indices. :(
|
|
return m and (not self.__directory_only or m.group(1) is not None or is_dir)
|
|
|
|
|
|
if os.altsep is not None:
|
|
_all_seps_expr = f"[{re.escape(os.sep)}{re.escape(os.altsep)}]"
|
|
_path_split = lambda path: re.split(_all_seps_expr, path) # noqa: E731
|
|
|
|
else:
|
|
_path_split = lambda path: path.split(os.sep) # noqa: E731
|
|
|
|
|
|
def _fnmatch_pathname_to_regexp(pattern, anchored, directory_only):
|
|
# Implements `fnmatch` style-behavior, as though with `FNM_PATHNAME` flagged;
|
|
# the path separator will not match shell-style `*` and `.` wildcards.
|
|
|
|
# Frustratingly, python's fnmatch doesn't provide the FNM_PATHNAME
|
|
# option that `.gitignore`'s behavior depends on.
|
|
|
|
if not pattern:
|
|
if directory_only:
|
|
return "[^/]+(/.+)?$" # Empty name means no path fragment.
|
|
|
|
else:
|
|
return ".*"
|
|
|
|
i, n = 0, len(pattern)
|
|
|
|
res = ["(?:^|.+/)" if not anchored else ""]
|
|
while i < n:
|
|
c = pattern[i]
|
|
i += 1
|
|
if c == "*":
|
|
if i < n and pattern[i] == "*":
|
|
i += 1
|
|
if i < n and pattern[i] == "/":
|
|
i += 1
|
|
res.append("(.+/)?") # `/**/` matches `/`.
|
|
|
|
else:
|
|
res.append(".*")
|
|
|
|
else:
|
|
res.append("[^/]*")
|
|
|
|
elif c == "?":
|
|
res.append("[^/]")
|
|
|
|
elif c == "[":
|
|
j = i
|
|
if j < n and pattern[j] == "!":
|
|
j += 1
|
|
if j < n and pattern[j] == "]":
|
|
j += 1
|
|
while j < n and pattern[j] != "]":
|
|
j += 1
|
|
|
|
if j >= n:
|
|
res.append("\\[")
|
|
else:
|
|
stuff = pattern[i:j].replace("\\", "\\\\")
|
|
i = j + 1
|
|
if stuff[0] == "!":
|
|
stuff = f"^{stuff[1:]}"
|
|
elif stuff[0] == "^":
|
|
stuff = f"\\{stuff}"
|
|
res.append(f"[{stuff}]")
|
|
|
|
else:
|
|
res.append(re.escape(c))
|
|
|
|
if (
|
|
directory_only
|
|
): # In this case we are interested if there is something after slash.
|
|
res.append("(/.+)?$")
|
|
|
|
else:
|
|
res.append("(?:/.+)?$")
|
|
|
|
return "".join(res)
|