1311 lines
47 KiB
Python
1311 lines
47 KiB
Python
|
"""CSS selector parser."""
|
||
|
import re
|
||
|
from functools import lru_cache
|
||
|
from . import util
|
||
|
from . import css_match as cm
|
||
|
from . import css_types as ct
|
||
|
from .util import SelectorSyntaxError
|
||
|
import warnings
|
||
|
from typing import Optional, Dict, Match, Tuple, Type, Any, List, Union, Iterator, cast
|
||
|
|
||
|
UNICODE_REPLACEMENT_CHAR = 0xFFFD
|
||
|
|
||
|
# Simple pseudo classes that take no parameters
|
||
|
PSEUDO_SIMPLE = {
|
||
|
":any-link",
|
||
|
":empty",
|
||
|
":first-child",
|
||
|
":first-of-type",
|
||
|
":in-range",
|
||
|
":out-of-range",
|
||
|
":last-child",
|
||
|
":last-of-type",
|
||
|
":link",
|
||
|
":only-child",
|
||
|
":only-of-type",
|
||
|
":root",
|
||
|
':checked',
|
||
|
':default',
|
||
|
':disabled',
|
||
|
':enabled',
|
||
|
':indeterminate',
|
||
|
':optional',
|
||
|
':placeholder-shown',
|
||
|
':read-only',
|
||
|
':read-write',
|
||
|
':required',
|
||
|
':scope',
|
||
|
':defined'
|
||
|
}
|
||
|
|
||
|
# Supported, simple pseudo classes that match nothing in the Soup Sieve environment
|
||
|
PSEUDO_SIMPLE_NO_MATCH = {
|
||
|
':active',
|
||
|
':current',
|
||
|
':focus',
|
||
|
':focus-visible',
|
||
|
':focus-within',
|
||
|
':future',
|
||
|
':host',
|
||
|
':hover',
|
||
|
':local-link',
|
||
|
':past',
|
||
|
':paused',
|
||
|
':playing',
|
||
|
':target',
|
||
|
':target-within',
|
||
|
':user-invalid',
|
||
|
':visited'
|
||
|
}
|
||
|
|
||
|
# Complex pseudo classes that take selector lists
|
||
|
PSEUDO_COMPLEX = {
|
||
|
':contains',
|
||
|
':-soup-contains',
|
||
|
':-soup-contains-own',
|
||
|
':has',
|
||
|
':is',
|
||
|
':matches',
|
||
|
':not',
|
||
|
':where'
|
||
|
}
|
||
|
|
||
|
PSEUDO_COMPLEX_NO_MATCH = {
|
||
|
':current',
|
||
|
':host',
|
||
|
':host-context'
|
||
|
}
|
||
|
|
||
|
# Complex pseudo classes that take very specific parameters and are handled special
|
||
|
PSEUDO_SPECIAL = {
|
||
|
':dir',
|
||
|
':lang',
|
||
|
':nth-child',
|
||
|
':nth-last-child',
|
||
|
':nth-last-of-type',
|
||
|
':nth-of-type'
|
||
|
}
|
||
|
|
||
|
PSEUDO_SUPPORTED = PSEUDO_SIMPLE | PSEUDO_SIMPLE_NO_MATCH | PSEUDO_COMPLEX | PSEUDO_COMPLEX_NO_MATCH | PSEUDO_SPECIAL
|
||
|
|
||
|
# Sub-patterns parts
|
||
|
# Whitespace
|
||
|
NEWLINE = r'(?:\r\n|(?!\r\n)[\n\f\r])'
|
||
|
WS = r'(?:[ \t]|{})'.format(NEWLINE)
|
||
|
# Comments
|
||
|
COMMENTS = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)'
|
||
|
# Whitespace with comments included
|
||
|
WSC = r'(?:{ws}|{comments})'.format(ws=WS, comments=COMMENTS)
|
||
|
# CSS escapes
|
||
|
CSS_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$))'.format(ws=WS)
|
||
|
CSS_STRING_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$|{nl}))'.format(ws=WS, nl=NEWLINE)
|
||
|
# CSS Identifier
|
||
|
IDENTIFIER = r'''
|
||
|
(?:(?:-?(?:[^\x00-\x2f\x30-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})+|--)
|
||
|
(?:[^\x00-\x2c\x2e\x2f\x3A-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})*)
|
||
|
'''.format(esc=CSS_ESCAPES)
|
||
|
# `nth` content
|
||
|
NTH = r'(?:[-+])?(?:[0-9]+n?|n)(?:(?<=n){ws}*(?:[-+]){ws}*(?:[0-9]+))?'.format(ws=WSC)
|
||
|
# Value: quoted string or identifier
|
||
|
VALUE = r'''
|
||
|
(?:"(?:\\(?:.|{nl})|[^\\"\r\n\f]+)*?"|'(?:\\(?:.|{nl})|[^\\'\r\n\f]+)*?'|{ident}+)
|
||
|
'''.format(nl=NEWLINE, ident=IDENTIFIER)
|
||
|
# Attribute value comparison. `!=` is handled special as it is non-standard.
|
||
|
ATTR = r'''
|
||
|
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
|
||
|
'''.format(ws=WSC, value=VALUE)
|
||
|
|
||
|
# Selector patterns
|
||
|
# IDs (`#id`)
|
||
|
PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER)
|
||
|
# Classes (`.class`)
|
||
|
PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER)
|
||
|
# Prefix:Tag (`prefix|tag`)
|
||
|
PAT_TAG = r'(?P<tag_ns>(?:{ident}|\*)?\|)?(?P<tag_name>{ident}|\*)'.format(ident=IDENTIFIER)
|
||
|
# Attributes (`[attr]`, `[attr=value]`, etc.)
|
||
|
PAT_ATTR = r'''
|
||
|
\[{ws}*(?P<attr_ns>(?:{ident}|\*)?\|)?(?P<attr_name>{ident}){attr}
|
||
|
'''.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
|
||
|
# Pseudo class (`:pseudo-class`, `:pseudo-class(`)
|
||
|
PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER)
|
||
|
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
|
||
|
PAT_PSEUDO_CLASS_SPECIAL = r'(?P<name>:{ident})(?P<open>\({ws}*)'.format(ws=WSC, ident=IDENTIFIER)
|
||
|
# Custom pseudo class (`:--custom-pseudo`)
|
||
|
PAT_PSEUDO_CLASS_CUSTOM = r'(?P<name>:(?=--){ident})'.format(ident=IDENTIFIER)
|
||
|
# Closing pseudo group (`)`)
|
||
|
PAT_PSEUDO_CLOSE = r'{ws}*\)'.format(ws=WSC)
|
||
|
# Pseudo element (`::pseudo-element`)
|
||
|
PAT_PSEUDO_ELEMENT = r':{}'.format(PAT_PSEUDO_CLASS)
|
||
|
# At rule (`@page`, etc.) (not supported)
|
||
|
PAT_AT_RULE = r'@P{ident}'.format(ident=IDENTIFIER)
|
||
|
# Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`, `:first-child`, etc.)
|
||
|
PAT_PSEUDO_NTH_CHILD = r'''
|
||
|
(?P<pseudo_nth_child>{name}
|
||
|
(?P<nth_child>{nth}|even|odd))(?:{wsc}*\)|(?P<of>{comments}*{ws}{wsc}*of{comments}*{ws}{wsc}*))
|
||
|
'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, wsc=WSC, comments=COMMENTS, ws=WS, nth=NTH)
|
||
|
# Pseudo class `nth-of-type` (`:nth-of-type(an+b)`, `:first-of-type`, etc.)
|
||
|
PAT_PSEUDO_NTH_TYPE = r'''
|
||
|
(?P<pseudo_nth_type>{name}
|
||
|
(?P<nth_type>{nth}|even|odd)){ws}*\)
|
||
|
'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, nth=NTH)
|
||
|
# Pseudo class language (`:lang("*-de", en)`)
|
||
|
PAT_PSEUDO_LANG = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(
|
||
|
name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE
|
||
|
)
|
||
|
# Pseudo class direction (`:dir(ltr)`)
|
||
|
PAT_PSEUDO_DIR = r'{name}(?P<dir>ltr|rtl){ws}*\)'.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC)
|
||
|
# Combining characters (`>`, `~`, ` `, `+`, `,`)
|
||
|
PAT_COMBINE = r'{wsc}*?(?P<relation>[,+>~]|{ws}(?![,+>~])){wsc}*'.format(ws=WS, wsc=WSC)
|
||
|
# Extra: Contains (`:contains(text)`)
|
||
|
PAT_PSEUDO_CONTAINS = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(
|
||
|
name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE
|
||
|
)
|
||
|
|
||
|
# Regular expressions
|
||
|
# CSS escape pattern
|
||
|
RE_CSS_ESC = re.compile(r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$))'.format(ws=WSC), re.I)
|
||
|
RE_CSS_STR_ESC = re.compile(
|
||
|
r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$)|(\\{nl}))'.format(ws=WS, nl=NEWLINE), re.I
|
||
|
)
|
||
|
# Pattern to break up `nth` specifiers
|
||
|
RE_NTH = re.compile(
|
||
|
r'(?P<s1>[-+])?(?P<a>[0-9]+n?|n)(?:(?<=n){ws}*(?P<s2>[-+]){ws}*(?P<b>[0-9]+))?'.format(ws=WSC),
|
||
|
re.I
|
||
|
)
|
||
|
# Pattern to iterate multiple values.
|
||
|
RE_VALUES = re.compile(r'(?:(?P<value>{value})|(?P<split>{ws}*,{ws}*))'.format(ws=WSC, value=VALUE), re.X)
|
||
|
# Whitespace checks
|
||
|
RE_WS = re.compile(WS)
|
||
|
RE_WS_BEGIN = re.compile('^{}*'.format(WSC))
|
||
|
RE_WS_END = re.compile('{}*$'.format(WSC))
|
||
|
RE_CUSTOM = re.compile(r'^{}$'.format(PAT_PSEUDO_CLASS_CUSTOM), re.X)
|
||
|
|
||
|
# Constants
|
||
|
# List split token
|
||
|
COMMA_COMBINATOR = ','
|
||
|
# Relation token for descendant
|
||
|
WS_COMBINATOR = " "
|
||
|
|
||
|
# Parse flags
|
||
|
FLG_PSEUDO = 0x01
|
||
|
FLG_NOT = 0x02
|
||
|
FLG_RELATIVE = 0x04
|
||
|
FLG_DEFAULT = 0x08
|
||
|
FLG_HTML = 0x10
|
||
|
FLG_INDETERMINATE = 0x20
|
||
|
FLG_OPEN = 0x40
|
||
|
FLG_IN_RANGE = 0x80
|
||
|
FLG_OUT_OF_RANGE = 0x100
|
||
|
FLG_PLACEHOLDER_SHOWN = 0x200
|
||
|
FLG_FORGIVE = 0x400
|
||
|
|
||
|
# Maximum cached patterns to store
|
||
|
_MAXCACHE = 500
|
||
|
|
||
|
|
||
|
@lru_cache(maxsize=_MAXCACHE)
|
||
|
def _cached_css_compile(
|
||
|
pattern: str,
|
||
|
namespaces: Optional[ct.Namespaces],
|
||
|
custom: Optional[ct.CustomSelectors],
|
||
|
flags: int
|
||
|
) -> cm.SoupSieve:
|
||
|
"""Cached CSS compile."""
|
||
|
|
||
|
custom_selectors = process_custom(custom)
|
||
|
return cm.SoupSieve(
|
||
|
pattern,
|
||
|
CSSParser(
|
||
|
pattern,
|
||
|
custom=custom_selectors,
|
||
|
flags=flags
|
||
|
).process_selectors(),
|
||
|
namespaces,
|
||
|
custom,
|
||
|
flags
|
||
|
)
|
||
|
|
||
|
|
||
|
def _purge_cache() -> None:
|
||
|
"""Purge the cache."""
|
||
|
|
||
|
_cached_css_compile.cache_clear()
|
||
|
|
||
|
|
||
|
def process_custom(custom: Optional[ct.CustomSelectors]) -> Dict[str, Union[str, ct.SelectorList]]:
|
||
|
"""Process custom."""
|
||
|
|
||
|
custom_selectors = {}
|
||
|
if custom is not None:
|
||
|
for key, value in custom.items():
|
||
|
name = util.lower(key)
|
||
|
if RE_CUSTOM.match(name) is None:
|
||
|
raise SelectorSyntaxError("The name '{}' is not a valid custom pseudo-class name".format(name))
|
||
|
if name in custom_selectors:
|
||
|
raise KeyError("The custom selector '{}' has already been registered".format(name))
|
||
|
custom_selectors[css_unescape(name)] = value
|
||
|
return custom_selectors
|
||
|
|
||
|
|
||
|
def css_unescape(content: str, string: bool = False) -> str:
|
||
|
"""
|
||
|
Unescape CSS value.
|
||
|
|
||
|
Strings allow for spanning the value on multiple strings by escaping a new line.
|
||
|
"""
|
||
|
|
||
|
def replace(m: Match[str]) -> str:
|
||
|
"""Replace with the appropriate substitute."""
|
||
|
|
||
|
if m.group(1):
|
||
|
codepoint = int(m.group(1)[1:], 16)
|
||
|
if codepoint == 0:
|
||
|
codepoint = UNICODE_REPLACEMENT_CHAR
|
||
|
value = chr(codepoint)
|
||
|
elif m.group(2):
|
||
|
value = m.group(2)[1:]
|
||
|
elif m.group(3):
|
||
|
value = '\ufffd'
|
||
|
else:
|
||
|
value = ''
|
||
|
|
||
|
return value
|
||
|
|
||
|
return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content)
|
||
|
|
||
|
|
||
|
def escape(ident: str) -> str:
|
||
|
"""Escape identifier."""
|
||
|
|
||
|
string = []
|
||
|
length = len(ident)
|
||
|
start_dash = length > 0 and ident[0] == '-'
|
||
|
if length == 1 and start_dash:
|
||
|
# Need to escape identifier that is a single `-` with no other characters
|
||
|
string.append('\\{}'.format(ident))
|
||
|
else:
|
||
|
for index, c in enumerate(ident):
|
||
|
codepoint = ord(c)
|
||
|
if codepoint == 0x00:
|
||
|
string.append('\ufffd')
|
||
|
elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F:
|
||
|
string.append('\\{:x} '.format(codepoint))
|
||
|
elif (index == 0 or (start_dash and index == 1)) and (0x30 <= codepoint <= 0x39):
|
||
|
string.append('\\{:x} '.format(codepoint))
|
||
|
elif (
|
||
|
codepoint in (0x2D, 0x5F) or codepoint >= 0x80 or (0x30 <= codepoint <= 0x39) or
|
||
|
(0x30 <= codepoint <= 0x39) or (0x41 <= codepoint <= 0x5A) or (0x61 <= codepoint <= 0x7A)
|
||
|
):
|
||
|
string.append(c)
|
||
|
else:
|
||
|
string.append('\\{}'.format(c))
|
||
|
return ''.join(string)
|
||
|
|
||
|
|
||
|
class SelectorPattern:
|
||
|
"""Selector pattern."""
|
||
|
|
||
|
def __init__(self, name: str, pattern: str) -> None:
|
||
|
"""Initialize."""
|
||
|
|
||
|
self.name = name
|
||
|
self.re_pattern = re.compile(pattern, re.I | re.X | re.U)
|
||
|
|
||
|
def get_name(self) -> str:
|
||
|
"""Get name."""
|
||
|
|
||
|
return self.name
|
||
|
|
||
|
def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
|
||
|
"""Match the selector."""
|
||
|
|
||
|
return self.re_pattern.match(selector, index)
|
||
|
|
||
|
|
||
|
class SpecialPseudoPattern(SelectorPattern):
|
||
|
"""Selector pattern."""
|
||
|
|
||
|
def __init__(self, patterns: Tuple[Tuple[str, Tuple[str, ...], str, Type[SelectorPattern]], ...]) -> None:
|
||
|
"""Initialize."""
|
||
|
|
||
|
self.patterns = {}
|
||
|
for p in patterns:
|
||
|
name = p[0]
|
||
|
pattern = p[3](name, p[2])
|
||
|
for pseudo in p[1]:
|
||
|
self.patterns[pseudo] = pattern
|
||
|
|
||
|
self.matched_name = None # type: Optional[SelectorPattern]
|
||
|
self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)
|
||
|
|
||
|
def get_name(self) -> str:
|
||
|
"""Get name."""
|
||
|
|
||
|
return '' if self.matched_name is None else self.matched_name.get_name()
|
||
|
|
||
|
def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
|
||
|
"""Match the selector."""
|
||
|
|
||
|
pseudo = None
|
||
|
m = self.re_pseudo_name.match(selector, index)
|
||
|
if m:
|
||
|
name = util.lower(css_unescape(m.group('name')))
|
||
|
pattern = self.patterns.get(name)
|
||
|
if pattern:
|
||
|
pseudo = pattern.match(selector, index, flags)
|
||
|
if pseudo:
|
||
|
self.matched_name = pattern
|
||
|
|
||
|
return pseudo
|
||
|
|
||
|
|
||
|
class _Selector:
|
||
|
"""
|
||
|
Intermediate selector class.
|
||
|
|
||
|
This stores selector data for a compound selector as we are acquiring them.
|
||
|
Once we are done collecting the data for a compound selector, we freeze
|
||
|
the data in an object that can be pickled and hashed.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, **kwargs: Any) -> None:
|
||
|
"""Initialize."""
|
||
|
|
||
|
self.tag = kwargs.get('tag', None) # type: Optional[ct.SelectorTag]
|
||
|
self.ids = kwargs.get('ids', []) # type: List[str]
|
||
|
self.classes = kwargs.get('classes', []) # type: List[str]
|
||
|
self.attributes = kwargs.get('attributes', []) # type: List[ct.SelectorAttribute]
|
||
|
self.nth = kwargs.get('nth', []) # type: List[ct.SelectorNth]
|
||
|
self.selectors = kwargs.get('selectors', []) # type: List[ct.SelectorList]
|
||
|
self.relations = kwargs.get('relations', []) # type: List[_Selector]
|
||
|
self.rel_type = kwargs.get('rel_type', None) # type: Optional[str]
|
||
|
self.contains = kwargs.get('contains', []) # type: List[ct.SelectorContains]
|
||
|
self.lang = kwargs.get('lang', []) # type: List[ct.SelectorLang]
|
||
|
self.flags = kwargs.get('flags', 0) # type: int
|
||
|
self.no_match = kwargs.get('no_match', False) # type: bool
|
||
|
|
||
|
def _freeze_relations(self, relations: List['_Selector']) -> ct.SelectorList:
|
||
|
"""Freeze relation."""
|
||
|
|
||
|
if relations:
|
||
|
sel = relations[0]
|
||
|
sel.relations.extend(relations[1:])
|
||
|
return ct.SelectorList([sel.freeze()])
|
||
|
else:
|
||
|
return ct.SelectorList()
|
||
|
|
||
|
def freeze(self) -> Union[ct.Selector, ct.SelectorNull]:
|
||
|
"""Freeze self."""
|
||
|
|
||
|
if self.no_match:
|
||
|
return ct.SelectorNull()
|
||
|
else:
|
||
|
return ct.Selector(
|
||
|
self.tag,
|
||
|
tuple(self.ids),
|
||
|
tuple(self.classes),
|
||
|
tuple(self.attributes),
|
||
|
tuple(self.nth),
|
||
|
tuple(self.selectors),
|
||
|
self._freeze_relations(self.relations),
|
||
|
self.rel_type,
|
||
|
tuple(self.contains),
|
||
|
tuple(self.lang),
|
||
|
self.flags
|
||
|
)
|
||
|
|
||
|
def __str__(self) -> str: # pragma: no cover
|
||
|
"""String representation."""
|
||
|
|
||
|
return (
|
||
|
'_Selector(tag={!r}, ids={!r}, classes={!r}, attributes={!r}, nth={!r}, selectors={!r}, '
|
||
|
'relations={!r}, rel_type={!r}, contains={!r}, lang={!r}, flags={!r}, no_match={!r})'
|
||
|
).format(
|
||
|
self.tag, self.ids, self.classes, self.attributes, self.nth, self.selectors,
|
||
|
self.relations, self.rel_type, self.contains, self.lang, self.flags, self.no_match
|
||
|
)
|
||
|
|
||
|
__repr__ = __str__
|
||
|
|
||
|
|
||
|
class CSSParser:
|
||
|
"""Parse CSS selectors."""
|
||
|
|
||
|
css_tokens = (
|
||
|
SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
|
||
|
SpecialPseudoPattern(
|
||
|
(
|
||
|
(
|
||
|
"pseudo_contains",
|
||
|
(':contains', ':-soup-contains', ':-soup-contains-own'),
|
||
|
PAT_PSEUDO_CONTAINS,
|
||
|
SelectorPattern
|
||
|
),
|
||
|
("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
|
||
|
("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
|
||
|
("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
|
||
|
("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
|
||
|
)
|
||
|
),
|
||
|
SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
|
||
|
SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS),
|
||
|
SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT),
|
||
|
SelectorPattern("at_rule", PAT_AT_RULE),
|
||
|
SelectorPattern("id", PAT_ID),
|
||
|
SelectorPattern("class", PAT_CLASS),
|
||
|
SelectorPattern("tag", PAT_TAG),
|
||
|
SelectorPattern("attribute", PAT_ATTR),
|
||
|
SelectorPattern("combine", PAT_COMBINE)
|
||
|
)
|
||
|
|
||
|
def __init__(
|
||
|
self,
|
||
|
selector: str,
|
||
|
custom: Optional[Dict[str, Union[str, ct.SelectorList]]] = None,
|
||
|
flags: int = 0
|
||
|
) -> None:
|
||
|
"""Initialize."""
|
||
|
|
||
|
self.pattern = selector.replace('\x00', '\ufffd')
|
||
|
self.flags = flags
|
||
|
self.debug = self.flags & util.DEBUG
|
||
|
self.custom = {} if custom is None else custom
|
||
|
|
||
|
def parse_attribute_selector(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
|
||
|
"""Create attribute selector from the returned regex match."""
|
||
|
|
||
|
inverse = False
|
||
|
op = m.group('cmp')
|
||
|
case = util.lower(m.group('case')) if m.group('case') else None
|
||
|
ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else ''
|
||
|
attr = css_unescape(m.group('attr_name'))
|
||
|
is_type = False
|
||
|
pattern2 = None
|
||
|
value = ''
|
||
|
|
||
|
if case:
|
||
|
flags = (re.I if case == 'i' else 0) | re.DOTALL
|
||
|
elif util.lower(attr) == 'type':
|
||
|
flags = re.I | re.DOTALL
|
||
|
is_type = True
|
||
|
else:
|
||
|
flags = re.DOTALL
|
||
|
|
||
|
if op:
|
||
|
if m.group('value').startswith(('"', "'")):
|
||
|
value = css_unescape(m.group('value')[1:-1], True)
|
||
|
else:
|
||
|
value = css_unescape(m.group('value'))
|
||
|
|
||
|
if not op:
|
||
|
# Attribute name
|
||
|
pattern = None
|
||
|
elif op.startswith('^'):
|
||
|
# Value start with
|
||
|
pattern = re.compile(r'^%s.*' % re.escape(value), flags)
|
||
|
elif op.startswith('$'):
|
||
|
# Value ends with
|
||
|
pattern = re.compile(r'.*?%s$' % re.escape(value), flags)
|
||
|
elif op.startswith('*'):
|
||
|
# Value contains
|
||
|
pattern = re.compile(r'.*?%s.*' % re.escape(value), flags)
|
||
|
elif op.startswith('~'):
|
||
|
# Value contains word within space separated list
|
||
|
# `~=` should match nothing if it is empty or contains whitespace,
|
||
|
# so if either of these cases is present, use `[^\s\S]` which cannot be matched.
|
||
|
value = r'[^\s\S]' if not value or RE_WS.search(value) else re.escape(value)
|
||
|
pattern = re.compile(r'.*?(?:(?<=^)|(?<=[ \t\r\n\f]))%s(?=(?:[ \t\r\n\f]|$)).*' % value, flags)
|
||
|
elif op.startswith('|'):
|
||
|
# Value starts with word in dash separated list
|
||
|
pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
|
||
|
else:
|
||
|
# Value matches
|
||
|
pattern = re.compile(r'^%s$' % re.escape(value), flags)
|
||
|
if op.startswith('!'):
|
||
|
# Equivalent to `:not([attr=value])`
|
||
|
inverse = True
|
||
|
if is_type and pattern:
|
||
|
pattern2 = re.compile(pattern.pattern)
|
||
|
|
||
|
# Append the attribute selector
|
||
|
sel_attr = ct.SelectorAttribute(attr, ns, pattern, pattern2)
|
||
|
if inverse:
|
||
|
# If we are using `!=`, we need to nest the pattern under a `:not()`.
|
||
|
sub_sel = _Selector()
|
||
|
sub_sel.attributes.append(sel_attr)
|
||
|
not_list = ct.SelectorList([sub_sel.freeze()], True, False)
|
||
|
sel.selectors.append(not_list)
|
||
|
else:
|
||
|
sel.attributes.append(sel_attr)
|
||
|
|
||
|
has_selector = True
|
||
|
return has_selector
|
||
|
|
||
|
def parse_tag_pattern(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
|
||
|
"""Parse tag pattern from regex match."""
|
||
|
|
||
|
prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None
|
||
|
tag = css_unescape(m.group('tag_name'))
|
||
|
sel.tag = ct.SelectorTag(tag, prefix)
|
||
|
has_selector = True
|
||
|
return has_selector
|
||
|
|
||
|
def parse_pseudo_class_custom(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
|
||
|
"""
|
||
|
Parse custom pseudo class alias.
|
||
|
|
||
|
Compile custom selectors as we need them. When compiling a custom selector,
|
||
|
set it to `None` in the dictionary so we can avoid an infinite loop.
|
||
|
"""
|
||
|
|
||
|
pseudo = util.lower(css_unescape(m.group('name')))
|
||
|
selector = self.custom.get(pseudo)
|
||
|
if selector is None:
|
||
|
raise SelectorSyntaxError(
|
||
|
"Undefined custom selector '{}' found at position {}".format(pseudo, m.end(0)),
|
||
|
self.pattern,
|
||
|
m.end(0)
|
||
|
)
|
||
|
|
||
|
if not isinstance(selector, ct.SelectorList):
|
||
|
del self.custom[pseudo]
|
||
|
selector = CSSParser(
|
||
|
selector, custom=self.custom, flags=self.flags
|
||
|
).process_selectors(flags=FLG_PSEUDO)
|
||
|
self.custom[pseudo] = selector
|
||
|
|
||
|
sel.selectors.append(selector)
|
||
|
has_selector = True
|
||
|
return has_selector
|
||
|
|
||
|
def parse_pseudo_class(
|
||
|
self,
|
||
|
sel: _Selector,
|
||
|
m: Match[str],
|
||
|
has_selector: bool,
|
||
|
iselector: Iterator[Tuple[str, Match[str]]],
|
||
|
is_html: bool
|
||
|
) -> Tuple[bool, bool]:
|
||
|
"""Parse pseudo class."""
|
||
|
|
||
|
complex_pseudo = False
|
||
|
pseudo = util.lower(css_unescape(m.group('name')))
|
||
|
if m.group('open'):
|
||
|
complex_pseudo = True
|
||
|
if complex_pseudo and pseudo in PSEUDO_COMPLEX:
|
||
|
has_selector = self.parse_pseudo_open(sel, pseudo, has_selector, iselector, m.end(0))
|
||
|
elif not complex_pseudo and pseudo in PSEUDO_SIMPLE:
|
||
|
if pseudo == ':root':
|
||
|
sel.flags |= ct.SEL_ROOT
|
||
|
elif pseudo == ':defined':
|
||
|
sel.flags |= ct.SEL_DEFINED
|
||
|
is_html = True
|
||
|
elif pseudo == ':scope':
|
||
|
sel.flags |= ct.SEL_SCOPE
|
||
|
elif pseudo == ':empty':
|
||
|
sel.flags |= ct.SEL_EMPTY
|
||
|
elif pseudo in (':link', ':any-link'):
|
||
|
sel.selectors.append(CSS_LINK)
|
||
|
elif pseudo == ':checked':
|
||
|
sel.selectors.append(CSS_CHECKED)
|
||
|
elif pseudo == ':default':
|
||
|
sel.selectors.append(CSS_DEFAULT)
|
||
|
elif pseudo == ':indeterminate':
|
||
|
sel.selectors.append(CSS_INDETERMINATE)
|
||
|
elif pseudo == ":disabled":
|
||
|
sel.selectors.append(CSS_DISABLED)
|
||
|
elif pseudo == ":enabled":
|
||
|
sel.selectors.append(CSS_ENABLED)
|
||
|
elif pseudo == ":required":
|
||
|
sel.selectors.append(CSS_REQUIRED)
|
||
|
elif pseudo == ":optional":
|
||
|
sel.selectors.append(CSS_OPTIONAL)
|
||
|
elif pseudo == ":read-only":
|
||
|
sel.selectors.append(CSS_READ_ONLY)
|
||
|
elif pseudo == ":read-write":
|
||
|
sel.selectors.append(CSS_READ_WRITE)
|
||
|
elif pseudo == ":in-range":
|
||
|
sel.selectors.append(CSS_IN_RANGE)
|
||
|
elif pseudo == ":out-of-range":
|
||
|
sel.selectors.append(CSS_OUT_OF_RANGE)
|
||
|
elif pseudo == ":placeholder-shown":
|
||
|
sel.selectors.append(CSS_PLACEHOLDER_SHOWN)
|
||
|
elif pseudo == ':first-child':
|
||
|
sel.nth.append(ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()))
|
||
|
elif pseudo == ':last-child':
|
||
|
sel.nth.append(ct.SelectorNth(1, False, 0, False, True, ct.SelectorList()))
|
||
|
elif pseudo == ':first-of-type':
|
||
|
sel.nth.append(ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()))
|
||
|
elif pseudo == ':last-of-type':
|
||
|
sel.nth.append(ct.SelectorNth(1, False, 0, True, True, ct.SelectorList()))
|
||
|
elif pseudo == ':only-child':
|
||
|
sel.nth.extend(
|
||
|
[
|
||
|
ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()),
|
||
|
ct.SelectorNth(1, False, 0, False, True, ct.SelectorList())
|
||
|
]
|
||
|
)
|
||
|
elif pseudo == ':only-of-type':
|
||
|
sel.nth.extend(
|
||
|
[
|
||
|
ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()),
|
||
|
ct.SelectorNth(1, False, 0, True, True, ct.SelectorList())
|
||
|
]
|
||
|
)
|
||
|
has_selector = True
|
||
|
elif complex_pseudo and pseudo in PSEUDO_COMPLEX_NO_MATCH:
|
||
|
self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
|
||
|
sel.no_match = True
|
||
|
has_selector = True
|
||
|
elif not complex_pseudo and pseudo in PSEUDO_SIMPLE_NO_MATCH:
|
||
|
sel.no_match = True
|
||
|
has_selector = True
|
||
|
elif pseudo in PSEUDO_SUPPORTED:
|
||
|
raise SelectorSyntaxError(
|
||
|
"Invalid syntax for pseudo class '{}'".format(pseudo),
|
||
|
self.pattern,
|
||
|
m.start(0)
|
||
|
)
|
||
|
else:
|
||
|
raise NotImplementedError(
|
||
|
"'{}' pseudo-class is not implemented at this time".format(pseudo)
|
||
|
)
|
||
|
|
||
|
return has_selector, is_html
|
||
|
|
||
|
def parse_pseudo_nth(
|
||
|
self,
|
||
|
sel: _Selector,
|
||
|
m: Match[str],
|
||
|
has_selector: bool,
|
||
|
iselector: Iterator[Tuple[str, Match[str]]]
|
||
|
) -> bool:
|
||
|
"""Parse `nth` pseudo."""
|
||
|
|
||
|
mdict = m.groupdict()
|
||
|
if mdict.get('pseudo_nth_child'):
|
||
|
postfix = '_child'
|
||
|
else:
|
||
|
postfix = '_type'
|
||
|
mdict['name'] = util.lower(css_unescape(mdict['name']))
|
||
|
content = util.lower(mdict.get('nth' + postfix))
|
||
|
if content == 'even':
|
||
|
# 2n
|
||
|
s1 = 2
|
||
|
s2 = 0
|
||
|
var = True
|
||
|
elif content == 'odd':
|
||
|
# 2n+1
|
||
|
s1 = 2
|
||
|
s2 = 1
|
||
|
var = True
|
||
|
else:
|
||
|
nth_parts = cast(Match[str], RE_NTH.match(content))
|
||
|
_s1 = '-' if nth_parts.group('s1') and nth_parts.group('s1') == '-' else ''
|
||
|
a = nth_parts.group('a')
|
||
|
var = a.endswith('n')
|
||
|
if a.startswith('n'):
|
||
|
_s1 += '1'
|
||
|
elif var:
|
||
|
_s1 += a[:-1]
|
||
|
else:
|
||
|
_s1 += a
|
||
|
_s2 = '-' if nth_parts.group('s2') and nth_parts.group('s2') == '-' else ''
|
||
|
if nth_parts.group('b'):
|
||
|
_s2 += nth_parts.group('b')
|
||
|
else:
|
||
|
_s2 = '0'
|
||
|
s1 = int(_s1, 10)
|
||
|
s2 = int(_s2, 10)
|
||
|
|
||
|
pseudo_sel = mdict['name']
|
||
|
if postfix == '_child':
|
||
|
if m.group('of'):
|
||
|
# Parse the rest of `of S`.
|
||
|
nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
|
||
|
else:
|
||
|
# Use default `*|*` for `of S`.
|
||
|
nth_sel = CSS_NTH_OF_S_DEFAULT
|
||
|
if pseudo_sel == ':nth-child':
|
||
|
sel.nth.append(ct.SelectorNth(s1, var, s2, False, False, nth_sel))
|
||
|
elif pseudo_sel == ':nth-last-child':
|
||
|
sel.nth.append(ct.SelectorNth(s1, var, s2, False, True, nth_sel))
|
||
|
else:
|
||
|
if pseudo_sel == ':nth-of-type':
|
||
|
sel.nth.append(ct.SelectorNth(s1, var, s2, True, False, ct.SelectorList()))
|
||
|
elif pseudo_sel == ':nth-last-of-type':
|
||
|
sel.nth.append(ct.SelectorNth(s1, var, s2, True, True, ct.SelectorList()))
|
||
|
has_selector = True
|
||
|
return has_selector
|
||
|
|
||
|
def parse_pseudo_open(
|
||
|
self,
|
||
|
sel: _Selector,
|
||
|
name: str,
|
||
|
has_selector: bool,
|
||
|
iselector: Iterator[Tuple[str, Match[str]]],
|
||
|
index: int
|
||
|
) -> bool:
|
||
|
"""Parse pseudo with opening bracket."""
|
||
|
|
||
|
flags = FLG_PSEUDO | FLG_OPEN
|
||
|
if name == ':not':
|
||
|
flags |= FLG_NOT
|
||
|
elif name == ':has':
|
||
|
flags |= FLG_RELATIVE | FLG_FORGIVE
|
||
|
elif name in (':where', ':is'):
|
||
|
flags |= FLG_FORGIVE
|
||
|
|
||
|
sel.selectors.append(self.parse_selectors(iselector, index, flags))
|
||
|
has_selector = True
|
||
|
|
||
|
return has_selector
|
||
|
|
||
|
def parse_has_combinator(
|
||
|
self,
|
||
|
sel: _Selector,
|
||
|
m: Match[str],
|
||
|
has_selector: bool,
|
||
|
selectors: List[_Selector],
|
||
|
rel_type: str,
|
||
|
index: int
|
||
|
) -> Tuple[bool, _Selector, str]:
|
||
|
"""Parse combinator tokens."""
|
||
|
|
||
|
combinator = m.group('relation').strip()
|
||
|
if not combinator:
|
||
|
combinator = WS_COMBINATOR
|
||
|
if combinator == COMMA_COMBINATOR:
|
||
|
if not has_selector:
|
||
|
# If we've not captured any selector parts, the comma is either at the beginning of the pattern
|
||
|
# or following another comma, both of which are unexpected. But shouldn't fail the pseudo-class.
|
||
|
sel.no_match = True
|
||
|
|
||
|
sel.rel_type = rel_type
|
||
|
selectors[-1].relations.append(sel)
|
||
|
rel_type = ":" + WS_COMBINATOR
|
||
|
selectors.append(_Selector())
|
||
|
else:
|
||
|
if has_selector:
|
||
|
# End the current selector and associate the leading combinator with this selector.
|
||
|
sel.rel_type = rel_type
|
||
|
selectors[-1].relations.append(sel)
|
||
|
elif rel_type[1:] != WS_COMBINATOR:
|
||
|
# It's impossible to have two whitespace combinators after each other as the patterns
|
||
|
# will gobble up trailing whitespace. It is also impossible to have a whitespace
|
||
|
# combinator after any other kind for the same reason. But we could have
|
||
|
# multiple non-whitespace combinators. So if the current combinator is not a whitespace,
|
||
|
# then we've hit the multiple combinator case, so we should fail.
|
||
|
raise SelectorSyntaxError(
|
||
|
'The multiple combinators at position {}'.format(index),
|
||
|
self.pattern,
|
||
|
index
|
||
|
)
|
||
|
|
||
|
# Set the leading combinator for the next selector.
|
||
|
rel_type = ':' + combinator
|
||
|
|
||
|
sel = _Selector()
|
||
|
has_selector = False
|
||
|
return has_selector, sel, rel_type
|
||
|
|
||
|
def parse_combinator(
|
||
|
self,
|
||
|
sel: _Selector,
|
||
|
m: Match[str],
|
||
|
has_selector: bool,
|
||
|
selectors: List[_Selector],
|
||
|
relations: List[_Selector],
|
||
|
is_pseudo: bool,
|
||
|
is_forgive: bool,
|
||
|
index: int
|
||
|
) -> Tuple[bool, _Selector]:
|
||
|
"""Parse combinator tokens."""
|
||
|
|
||
|
combinator = m.group('relation').strip()
|
||
|
if not combinator:
|
||
|
combinator = WS_COMBINATOR
|
||
|
if not has_selector:
|
||
|
if not is_forgive or combinator != COMMA_COMBINATOR:
|
||
|
raise SelectorSyntaxError(
|
||
|
"The combinator '{}' at position {}, must have a selector before it".format(combinator, index),
|
||
|
self.pattern,
|
||
|
index
|
||
|
)
|
||
|
|
||
|
# If we are in a forgiving pseudo class, just make the selector a "no match"
|
||
|
if combinator == COMMA_COMBINATOR:
|
||
|
sel.no_match = True
|
||
|
del relations[:]
|
||
|
selectors.append(sel)
|
||
|
else:
|
||
|
if combinator == COMMA_COMBINATOR:
|
||
|
if not sel.tag and not is_pseudo:
|
||
|
# Implied `*`
|
||
|
sel.tag = ct.SelectorTag('*', None)
|
||
|
sel.relations.extend(relations)
|
||
|
selectors.append(sel)
|
||
|
del relations[:]
|
||
|
else:
|
||
|
sel.relations.extend(relations)
|
||
|
sel.rel_type = combinator
|
||
|
del relations[:]
|
||
|
relations.append(sel)
|
||
|
|
||
|
sel = _Selector()
|
||
|
has_selector = False
|
||
|
|
||
|
return has_selector, sel
|
||
|
|
||
|
def parse_class_id(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
|
||
|
"""Parse HTML classes and ids."""
|
||
|
|
||
|
selector = m.group(0)
|
||
|
if selector.startswith('.'):
|
||
|
sel.classes.append(css_unescape(selector[1:]))
|
||
|
else:
|
||
|
sel.ids.append(css_unescape(selector[1:]))
|
||
|
has_selector = True
|
||
|
return has_selector
|
||
|
|
||
|
def parse_pseudo_contains(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
|
||
|
"""Parse contains."""
|
||
|
|
||
|
pseudo = util.lower(css_unescape(m.group('name')))
|
||
|
if pseudo == ":contains":
|
||
|
warnings.warn(
|
||
|
"The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.",
|
||
|
FutureWarning
|
||
|
)
|
||
|
contains_own = pseudo == ":-soup-contains-own"
|
||
|
values = css_unescape(m.group('values'))
|
||
|
patterns = []
|
||
|
for token in RE_VALUES.finditer(values):
|
||
|
if token.group('split'):
|
||
|
continue
|
||
|
value = token.group('value')
|
||
|
if value.startswith(("'", '"')):
|
||
|
value = css_unescape(value[1:-1], True)
|
||
|
else:
|
||
|
value = css_unescape(value)
|
||
|
patterns.append(value)
|
||
|
sel.contains.append(ct.SelectorContains(patterns, contains_own))
|
||
|
has_selector = True
|
||
|
return has_selector
|
||
|
|
||
|
def parse_pseudo_lang(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
|
||
|
"""Parse pseudo language."""
|
||
|
|
||
|
values = m.group('values')
|
||
|
patterns = []
|
||
|
for token in RE_VALUES.finditer(values):
|
||
|
if token.group('split'):
|
||
|
continue
|
||
|
value = token.group('value')
|
||
|
if value.startswith(('"', "'")):
|
||
|
value = css_unescape(value[1:-1], True)
|
||
|
else:
|
||
|
value = css_unescape(value)
|
||
|
|
||
|
patterns.append(value)
|
||
|
|
||
|
sel.lang.append(ct.SelectorLang(patterns))
|
||
|
has_selector = True
|
||
|
|
||
|
return has_selector
|
||
|
|
||
|
def parse_pseudo_dir(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
|
||
|
"""Parse pseudo direction."""
|
||
|
|
||
|
value = ct.SEL_DIR_LTR if util.lower(m.group('dir')) == 'ltr' else ct.SEL_DIR_RTL
|
||
|
sel.flags |= value
|
||
|
has_selector = True
|
||
|
return has_selector
|
||
|
|
||
|
def parse_selectors(
|
||
|
self,
|
||
|
iselector: Iterator[Tuple[str, Match[str]]],
|
||
|
index: int = 0,
|
||
|
flags: int = 0
|
||
|
) -> ct.SelectorList:
|
||
|
"""Parse selectors."""
|
||
|
|
||
|
# Initialize important variables
|
||
|
sel = _Selector()
|
||
|
selectors = []
|
||
|
has_selector = False
|
||
|
closed = False
|
||
|
relations = [] # type: List[_Selector]
|
||
|
rel_type = ":" + WS_COMBINATOR
|
||
|
|
||
|
# Setup various flags
|
||
|
is_open = bool(flags & FLG_OPEN)
|
||
|
is_pseudo = bool(flags & FLG_PSEUDO)
|
||
|
is_relative = bool(flags & FLG_RELATIVE)
|
||
|
is_not = bool(flags & FLG_NOT)
|
||
|
is_html = bool(flags & FLG_HTML)
|
||
|
is_default = bool(flags & FLG_DEFAULT)
|
||
|
is_indeterminate = bool(flags & FLG_INDETERMINATE)
|
||
|
is_in_range = bool(flags & FLG_IN_RANGE)
|
||
|
is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
|
||
|
is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
|
||
|
is_forgive = bool(flags & FLG_FORGIVE)
|
||
|
|
||
|
# Print out useful debug stuff
|
||
|
if self.debug: # pragma: no cover
|
||
|
if is_pseudo:
|
||
|
print(' is_pseudo: True')
|
||
|
if is_open:
|
||
|
print(' is_open: True')
|
||
|
if is_relative:
|
||
|
print(' is_relative: True')
|
||
|
if is_not:
|
||
|
print(' is_not: True')
|
||
|
if is_html:
|
||
|
print(' is_html: True')
|
||
|
if is_default:
|
||
|
print(' is_default: True')
|
||
|
if is_indeterminate:
|
||
|
print(' is_indeterminate: True')
|
||
|
if is_in_range:
|
||
|
print(' is_in_range: True')
|
||
|
if is_out_of_range:
|
||
|
print(' is_out_of_range: True')
|
||
|
if is_placeholder_shown:
|
||
|
print(' is_placeholder_shown: True')
|
||
|
if is_forgive:
|
||
|
print(' is_forgive: True')
|
||
|
|
||
|
# The algorithm for relative selectors require an initial selector in the selector list
|
||
|
if is_relative:
|
||
|
selectors.append(_Selector())
|
||
|
|
||
|
try:
|
||
|
while True:
|
||
|
key, m = next(iselector)
|
||
|
|
||
|
# Handle parts
|
||
|
if key == "at_rule":
|
||
|
raise NotImplementedError("At-rules found at position {}".format(m.start(0)))
|
||
|
elif key == 'pseudo_class_custom':
|
||
|
has_selector = self.parse_pseudo_class_custom(sel, m, has_selector)
|
||
|
elif key == 'pseudo_class':
|
||
|
has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
|
||
|
elif key == 'pseudo_element':
|
||
|
raise NotImplementedError("Pseudo-element found at position {}".format(m.start(0)))
|
||
|
elif key == 'pseudo_contains':
|
||
|
has_selector = self.parse_pseudo_contains(sel, m, has_selector)
|
||
|
elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
|
||
|
has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector)
|
||
|
elif key == 'pseudo_lang':
|
||
|
has_selector = self.parse_pseudo_lang(sel, m, has_selector)
|
||
|
elif key == 'pseudo_dir':
|
||
|
has_selector = self.parse_pseudo_dir(sel, m, has_selector)
|
||
|
# Currently only supports HTML
|
||
|
is_html = True
|
||
|
elif key == 'pseudo_close':
|
||
|
if not has_selector:
|
||
|
if not is_forgive:
|
||
|
raise SelectorSyntaxError(
|
||
|
"Expected a selector at position {}".format(m.start(0)),
|
||
|
self.pattern,
|
||
|
m.start(0)
|
||
|
)
|
||
|
sel.no_match = True
|
||
|
if is_open:
|
||
|
closed = True
|
||
|
break
|
||
|
else:
|
||
|
raise SelectorSyntaxError(
|
||
|
"Unmatched pseudo-class close at position {}".format(m.start(0)),
|
||
|
self.pattern,
|
||
|
m.start(0)
|
||
|
)
|
||
|
elif key == 'combine':
|
||
|
if is_relative:
|
||
|
has_selector, sel, rel_type = self.parse_has_combinator(
|
||
|
sel, m, has_selector, selectors, rel_type, index
|
||
|
)
|
||
|
else:
|
||
|
has_selector, sel = self.parse_combinator(
|
||
|
sel, m, has_selector, selectors, relations, is_pseudo, is_forgive, index
|
||
|
)
|
||
|
elif key == 'attribute':
|
||
|
has_selector = self.parse_attribute_selector(sel, m, has_selector)
|
||
|
elif key == 'tag':
|
||
|
if has_selector:
|
||
|
raise SelectorSyntaxError(
|
||
|
"Tag name found at position {} instead of at the start".format(m.start(0)),
|
||
|
self.pattern,
|
||
|
m.start(0)
|
||
|
)
|
||
|
has_selector = self.parse_tag_pattern(sel, m, has_selector)
|
||
|
elif key in ('class', 'id'):
|
||
|
has_selector = self.parse_class_id(sel, m, has_selector)
|
||
|
|
||
|
index = m.end(0)
|
||
|
except StopIteration:
|
||
|
pass
|
||
|
|
||
|
# Handle selectors that are not closed
|
||
|
if is_open and not closed:
|
||
|
raise SelectorSyntaxError(
|
||
|
"Unclosed pseudo-class at position {}".format(index),
|
||
|
self.pattern,
|
||
|
index
|
||
|
)
|
||
|
|
||
|
# Cleanup completed selector piece
|
||
|
if has_selector:
|
||
|
if not sel.tag and not is_pseudo:
|
||
|
# Implied `*`
|
||
|
sel.tag = ct.SelectorTag('*', None)
|
||
|
if is_relative:
|
||
|
sel.rel_type = rel_type
|
||
|
selectors[-1].relations.append(sel)
|
||
|
else:
|
||
|
sel.relations.extend(relations)
|
||
|
del relations[:]
|
||
|
selectors.append(sel)
|
||
|
|
||
|
# Forgive empty slots in pseudo-classes that have lists (and are forgiving)
|
||
|
elif is_forgive:
|
||
|
if is_relative:
|
||
|
# Handle relative selectors pseudo-classes with empty slots like `:has()`
|
||
|
if selectors and selectors[-1].rel_type is None and rel_type == ': ':
|
||
|
sel.rel_type = rel_type
|
||
|
sel.no_match = True
|
||
|
selectors[-1].relations.append(sel)
|
||
|
has_selector = True
|
||
|
else:
|
||
|
# Handle normal pseudo-classes with empty slots
|
||
|
if not selectors or not relations:
|
||
|
# Others like `:is()` etc.
|
||
|
sel.no_match = True
|
||
|
del relations[:]
|
||
|
selectors.append(sel)
|
||
|
has_selector = True
|
||
|
|
||
|
if not has_selector:
|
||
|
# We will always need to finish a selector when `:has()` is used as it leads with combining.
|
||
|
# May apply to others as well.
|
||
|
raise SelectorSyntaxError(
|
||
|
'Expected a selector at position {}'.format(index),
|
||
|
self.pattern,
|
||
|
index
|
||
|
)
|
||
|
|
||
|
# Some patterns require additional logic, such as default. We try to make these the
|
||
|
# last pattern, and append the appropriate flag to that selector which communicates
|
||
|
# to the matcher what additional logic is required.
|
||
|
if is_default:
|
||
|
selectors[-1].flags = ct.SEL_DEFAULT
|
||
|
if is_indeterminate:
|
||
|
selectors[-1].flags = ct.SEL_INDETERMINATE
|
||
|
if is_in_range:
|
||
|
selectors[-1].flags = ct.SEL_IN_RANGE
|
||
|
if is_out_of_range:
|
||
|
selectors[-1].flags = ct.SEL_OUT_OF_RANGE
|
||
|
if is_placeholder_shown:
|
||
|
selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN
|
||
|
|
||
|
# Return selector list
|
||
|
return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
|
||
|
|
||
|
def selector_iter(self, pattern: str) -> Iterator[Tuple[str, Match[str]]]:
|
||
|
"""Iterate selector tokens."""
|
||
|
|
||
|
# Ignore whitespace and comments at start and end of pattern
|
||
|
m = RE_WS_BEGIN.search(pattern)
|
||
|
index = m.end(0) if m else 0
|
||
|
m = RE_WS_END.search(pattern)
|
||
|
end = (m.start(0) - 1) if m else (len(pattern) - 1)
|
||
|
|
||
|
if self.debug: # pragma: no cover
|
||
|
print('## PARSING: {!r}'.format(pattern))
|
||
|
while index <= end:
|
||
|
m = None
|
||
|
for v in self.css_tokens:
|
||
|
m = v.match(pattern, index, self.flags)
|
||
|
if m:
|
||
|
name = v.get_name()
|
||
|
if self.debug: # pragma: no cover
|
||
|
print("TOKEN: '{}' --> {!r} at position {}".format(name, m.group(0), m.start(0)))
|
||
|
index = m.end(0)
|
||
|
yield name, m
|
||
|
break
|
||
|
if m is None:
|
||
|
c = pattern[index]
|
||
|
# If the character represents the start of one of the known selector types,
|
||
|
# throw an exception mentioning that the known selector type is in error;
|
||
|
# otherwise, report the invalid character.
|
||
|
if c == '[':
|
||
|
msg = "Malformed attribute selector at position {}".format(index)
|
||
|
elif c == '.':
|
||
|
msg = "Malformed class selector at position {}".format(index)
|
||
|
elif c == '#':
|
||
|
msg = "Malformed id selector at position {}".format(index)
|
||
|
elif c == ':':
|
||
|
msg = "Malformed pseudo-class selector at position {}".format(index)
|
||
|
else:
|
||
|
msg = "Invalid character {!r} position {}".format(c, index)
|
||
|
raise SelectorSyntaxError(msg, self.pattern, index)
|
||
|
if self.debug: # pragma: no cover
|
||
|
print('## END PARSING')
|
||
|
|
||
|
def process_selectors(self, index: int = 0, flags: int = 0) -> ct.SelectorList:
|
||
|
"""Process selectors."""
|
||
|
|
||
|
return self.parse_selectors(self.selector_iter(self.pattern), index, flags)
|
||
|
|
||
|
|
||
|
# Precompile CSS selector lists for pseudo-classes (additional logic may be required beyond the pattern)
|
||
|
# A few patterns are order dependent as they use patterns previous compiled.
|
||
|
|
||
|
# CSS pattern for `:link` and `:any-link`
|
||
|
CSS_LINK = CSSParser(
|
||
|
'html|*:is(a, area)[href]'
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||
|
# CSS pattern for `:checked`
|
||
|
CSS_CHECKED = CSSParser(
|
||
|
'''
|
||
|
html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected]
|
||
|
'''
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||
|
# CSS pattern for `:default` (must compile CSS_CHECKED first)
|
||
|
CSS_DEFAULT = CSSParser(
|
||
|
'''
|
||
|
:checked,
|
||
|
|
||
|
/*
|
||
|
This pattern must be at the end.
|
||
|
Special logic is applied to the last selector.
|
||
|
*/
|
||
|
html|form html|*:is(button, input)[type="submit"]
|
||
|
'''
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_DEFAULT)
|
||
|
# CSS pattern for `:indeterminate`
|
||
|
CSS_INDETERMINATE = CSSParser(
|
||
|
'''
|
||
|
html|input[type="checkbox"][indeterminate],
|
||
|
html|input[type="radio"]:is(:not([name]), [name=""]):not([checked]),
|
||
|
html|progress:not([value]),
|
||
|
|
||
|
/*
|
||
|
This pattern must be at the end.
|
||
|
Special logic is applied to the last selector.
|
||
|
*/
|
||
|
html|input[type="radio"][name]:not([name='']):not([checked])
|
||
|
'''
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
|
||
|
# CSS pattern for `:disabled`
|
||
|
CSS_DISABLED = CSSParser(
|
||
|
'''
|
||
|
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
|
||
|
html|optgroup[disabled] > html|option,
|
||
|
html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset),
|
||
|
html|fieldset[disabled] >
|
||
|
html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset)
|
||
|
'''
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||
|
# CSS pattern for `:enabled`
|
||
|
CSS_ENABLED = CSSParser(
|
||
|
'''
|
||
|
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
|
||
|
'''
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||
|
# CSS pattern for `:required`
|
||
|
CSS_REQUIRED = CSSParser(
|
||
|
'html|*:is(input, textarea, select)[required]'
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||
|
# CSS pattern for `:optional`
|
||
|
CSS_OPTIONAL = CSSParser(
|
||
|
'html|*:is(input, textarea, select):not([required])'
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||
|
# CSS pattern for `:placeholder-shown`
|
||
|
CSS_PLACEHOLDER_SHOWN = CSSParser(
|
||
|
'''
|
||
|
html|input:is(
|
||
|
:not([type]),
|
||
|
[type=""],
|
||
|
[type=text],
|
||
|
[type=search],
|
||
|
[type=url],
|
||
|
[type=tel],
|
||
|
[type=email],
|
||
|
[type=password],
|
||
|
[type=number]
|
||
|
)[placeholder]:not([placeholder='']):is(:not([value]), [value=""]),
|
||
|
html|textarea[placeholder]:not([placeholder=''])
|
||
|
'''
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN)
|
||
|
# CSS pattern default for `:nth-child` "of S" feature
|
||
|
CSS_NTH_OF_S_DEFAULT = CSSParser(
|
||
|
'*|*'
|
||
|
).process_selectors(flags=FLG_PSEUDO)
|
||
|
# CSS pattern for `:read-write` (CSS_DISABLED must be compiled first)
|
||
|
CSS_READ_WRITE = CSSParser(
|
||
|
'''
|
||
|
html|*:is(
|
||
|
textarea,
|
||
|
input:is(
|
||
|
:not([type]),
|
||
|
[type=""],
|
||
|
[type=text],
|
||
|
[type=search],
|
||
|
[type=url],
|
||
|
[type=tel],
|
||
|
[type=email],
|
||
|
[type=number],
|
||
|
[type=password],
|
||
|
[type=date],
|
||
|
[type=datetime-local],
|
||
|
[type=month],
|
||
|
[type=time],
|
||
|
[type=week]
|
||
|
)
|
||
|
):not([readonly], :disabled),
|
||
|
html|*:is([contenteditable=""], [contenteditable="true" i])
|
||
|
'''
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||
|
# CSS pattern for `:read-only`
|
||
|
CSS_READ_ONLY = CSSParser(
|
||
|
'''
|
||
|
html|*:not(:read-write)
|
||
|
'''
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||
|
# CSS pattern for `:in-range`
|
||
|
CSS_IN_RANGE = CSSParser(
|
||
|
'''
|
||
|
html|input:is(
|
||
|
[type="date"],
|
||
|
[type="month"],
|
||
|
[type="week"],
|
||
|
[type="time"],
|
||
|
[type="datetime-local"],
|
||
|
[type="number"],
|
||
|
[type="range"]
|
||
|
):is(
|
||
|
[min],
|
||
|
[max]
|
||
|
)
|
||
|
'''
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_IN_RANGE | FLG_HTML)
|
||
|
# CSS pattern for `:out-of-range`
|
||
|
CSS_OUT_OF_RANGE = CSSParser(
|
||
|
'''
|
||
|
html|input:is(
|
||
|
[type="date"],
|
||
|
[type="month"],
|
||
|
[type="week"],
|
||
|
[type="time"],
|
||
|
[type="datetime-local"],
|
||
|
[type="number"],
|
||
|
[type="range"]
|
||
|
):is(
|
||
|
[min],
|
||
|
[max]
|
||
|
)
|
||
|
'''
|
||
|
).process_selectors(flags=FLG_PSEUDO | FLG_OUT_OF_RANGE | FLG_HTML)
|