back to Reference (Gold) summary
Reference (Gold): parsel
Pytest summary for tests
| status | count |
| --- | --- |
| passed | 206 |
| skipped | 2 |
| total | 208 |
| collected | 208 |
Failed pytests: (none — 206 passed + 2 skipped = 208 collected)
Patch diff
diff --git a/parsel/csstranslator.py b/parsel/csstranslator.py
index 2bf8cc7..80bfc7c 100644
--- a/parsel/csstranslator.py
+++ b/parsel/csstranslator.py
@@ -1,36 +1,76 @@
from functools import lru_cache
from typing import TYPE_CHECKING, Any, Optional, Protocol
+
from cssselect import GenericTranslator as OriginalGenericTranslator
from cssselect import HTMLTranslator as OriginalHTMLTranslator
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
from cssselect.xpath import ExpressionError
from cssselect.xpath import XPathExpr as OriginalXPathExpr
+
if TYPE_CHECKING:
+ # typing.Self requires Python 3.11
from typing_extensions import Self
class XPathExpr(OriginalXPathExpr):
+
textnode: bool = False
attribute: Optional[str] = None
- def __str__(self) ->str:
+ @classmethod
+ def from_xpath(
+ cls,
+ xpath: OriginalXPathExpr,
+ textnode: bool = False,
+ attribute: Optional[str] = None,
+ ) -> "Self":
+ x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
+ x.textnode = textnode
+ x.attribute = attribute
+ return x
+
+ def __str__(self) -> str:
path = super().__str__()
if self.textnode:
- if path == '*':
- path = 'text()'
- elif path.endswith('::*/*'):
- path = path[:-3] + 'text()'
+ if path == "*":
+ path = "text()"
+ elif path.endswith("::*/*"):
+ path = path[:-3] + "text()"
else:
- path += '/text()'
+ path += "/text()"
+
if self.attribute is not None:
- if path.endswith('::*/*'):
+ if path.endswith("::*/*"):
path = path[:-2]
- path += f'/@{self.attribute}'
- return path
+ path += f"/@{self.attribute}"
+ return path
+ def join(
+ self: "Self",
+ combiner: str,
+ other: OriginalXPathExpr,
+ *args: Any,
+ **kwargs: Any,
+ ) -> "Self":
+ if not isinstance(other, XPathExpr):
+ raise ValueError(
+ f"Expressions of type {__name__}.XPathExpr can ony join expressions"
+ f" of the same type (or its descendants), got {type(other)}"
+ )
+ super().join(combiner, other, *args, **kwargs)
+ self.textnode = other.textnode
+ self.attribute = other.attribute
+ return self
+
+
+# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
class TranslatorProtocol(Protocol):
- pass
+ def xpath_element(self, selector: Element) -> OriginalXPathExpr:
+ pass
+
+ def css_to_xpath(self, css: str, prefix: str = ...) -> str:
+ pass
class TranslatorMixin:
@@ -39,35 +79,67 @@ class TranslatorMixin:
Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
"""
- def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element:
- PseudoElement) ->OriginalXPathExpr:
+ def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
+ # https://github.com/python/mypy/issues/12344
+ xpath = super().xpath_element(selector) # type: ignore[safe-super]
+ return XPathExpr.from_xpath(xpath)
+
+ def xpath_pseudo_element(
+ self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
+ ) -> OriginalXPathExpr:
"""
Dispatch method that transforms XPath to support pseudo-element
"""
- pass
-
- def xpath_attr_functional_pseudo_element(self, xpath: OriginalXPathExpr,
- function: FunctionalPseudoElement) ->XPathExpr:
+ if isinstance(pseudo_element, FunctionalPseudoElement):
+ method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
+ method = getattr(self, method_name, None)
+ if not method:
+ raise ExpressionError(
+ f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
+ )
+ xpath = method(xpath, pseudo_element)
+ else:
+ method_name = (
+ f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
+ )
+ method = getattr(self, method_name, None)
+ if not method:
+ raise ExpressionError(
+ f"The pseudo-element ::{pseudo_element} is unknown"
+ )
+ xpath = method(xpath)
+ return xpath
+
+ def xpath_attr_functional_pseudo_element(
+ self, xpath: OriginalXPathExpr, function: FunctionalPseudoElement
+ ) -> XPathExpr:
"""Support selecting attribute values using ::attr() pseudo-element"""
- pass
+ if function.argument_types() not in (["STRING"], ["IDENT"]):
+ raise ExpressionError(
+ f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
+ )
+ return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)
- def xpath_text_simple_pseudo_element(self, xpath: OriginalXPathExpr
- ) ->XPathExpr:
+ def xpath_text_simple_pseudo_element(self, xpath: OriginalXPathExpr) -> XPathExpr:
"""Support selecting text nodes using ::text pseudo-element"""
- pass
+ return XPathExpr.from_xpath(xpath, textnode=True)
class GenericTranslator(TranslatorMixin, OriginalGenericTranslator):
- pass
+ @lru_cache(maxsize=256)
+ def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
+ return super().css_to_xpath(css, prefix)
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
- pass
+ @lru_cache(maxsize=256)
+ def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
+ return super().css_to_xpath(css, prefix)
_translator = HTMLTranslator()
-def css2xpath(query: str) ->str:
- """Return translated XPath version of a given CSS query"""
- pass
+def css2xpath(query: str) -> str:
+ "Return translated XPath version of a given CSS query"
+ return _translator.css_to_xpath(query)
diff --git a/parsel/selector.py b/parsel/selector.py
index 6aa73da..2027599 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -1,21 +1,44 @@
"""XPath and JMESPath selectors based on the lxml and jmespath Python
packages."""
+
import json
import typing
import warnings
from io import BytesIO
-from typing import Any, Dict, List, Literal, Mapping, Optional, Pattern, SupportsIndex, Tuple, Type, TypedDict, TypeVar, Union
+from typing import (
+ Any,
+ Dict,
+ List,
+ Literal,
+ Mapping,
+ Optional,
+ Pattern,
+ SupportsIndex,
+ Tuple,
+ Type,
+ TypedDict,
+ TypeVar,
+ Union,
+)
from warnings import warn
+
import jmespath
from lxml import etree, html
from packaging.version import Version
+
from .csstranslator import GenericTranslator, HTMLTranslator
from .utils import extract_regex, flatten, iflatten, shorten
-_SelectorType = TypeVar('_SelectorType', bound='Selector')
-_ParserType = Union[etree.XMLParser, etree.HTMLParser]
-_TostringMethodType = Literal['html', 'xml']
+
+_SelectorType = TypeVar("_SelectorType", bound="Selector")
+_ParserType = Union[etree.XMLParser, etree.HTMLParser] # type: ignore[type-arg]
+# simplified _OutputMethodArg from types-lxml
+_TostringMethodType = Literal[
+ "html",
+ "xml",
+]
+
lxml_version = Version(etree.__version__)
-lxml_huge_tree_version = Version('4.2')
+lxml_huge_tree_version = Version("4.2")
LXML_SUPPORTS_HUGE_TREE = lxml_version >= lxml_huge_tree_version
@@ -31,30 +54,65 @@ class CannotDropElementWithoutParent(CannotRemoveElementWithoutParent):
pass
-class SafeXMLParser(etree.XMLParser):
-
- def __init__(self, *args: Any, **kwargs: Any) ->None:
- kwargs.setdefault('resolve_entities', False)
+class SafeXMLParser(etree.XMLParser): # type: ignore[type-arg]
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
+ kwargs.setdefault("resolve_entities", False)
super().__init__(*args, **kwargs)
class CTGroupValue(TypedDict):
- _parser: Union[Type[etree.XMLParser], Type[html.HTMLParser]]
+ _parser: Union[Type[etree.XMLParser], Type[html.HTMLParser]] # type: ignore[type-arg]
_csstranslator: Union[GenericTranslator, HTMLTranslator]
_tostring_method: str
-_ctgroup: Dict[str, CTGroupValue] = {'html': {'_parser': html.HTMLParser,
- '_csstranslator': HTMLTranslator(), '_tostring_method': 'html'}, 'xml':
- {'_parser': SafeXMLParser, '_csstranslator': GenericTranslator(),
- '_tostring_method': 'xml'}}
-
-
-def create_root_node(text: str, parser_cls: Type[_ParserType], base_url:
- Optional[str]=None, huge_tree: bool=LXML_SUPPORTS_HUGE_TREE, body:
- bytes=b'', encoding: str='utf8') ->etree._Element:
+_ctgroup: Dict[str, CTGroupValue] = {
+ "html": {
+ "_parser": html.HTMLParser,
+ "_csstranslator": HTMLTranslator(),
+ "_tostring_method": "html",
+ },
+ "xml": {
+ "_parser": SafeXMLParser,
+ "_csstranslator": GenericTranslator(),
+ "_tostring_method": "xml",
+ },
+}
+
+
+def _xml_or_html(type: Optional[str]) -> str:
+ return "xml" if type == "xml" else "html"
+
+
+def create_root_node(
+ text: str,
+ parser_cls: Type[_ParserType],
+ base_url: Optional[str] = None,
+ huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
+ body: bytes = b"",
+ encoding: str = "utf8",
+) -> etree._Element:
"""Create root node for text using given parser class."""
- pass
+ if not text:
+ body = body.replace(b"\x00", b"").strip()
+ else:
+ body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
+
+ if huge_tree and LXML_SUPPORTS_HUGE_TREE:
+ parser = parser_cls(recover=True, encoding=encoding, huge_tree=True)
+ root = etree.fromstring(body, parser=parser, base_url=base_url)
+ else:
+ parser = parser_cls(recover=True, encoding=encoding)
+ root = etree.fromstring(body, parser=parser, base_url=base_url)
+ for error in parser.error_log:
+ if "use XML_PARSE_HUGE option" in error.message:
+ warnings.warn(
+ f"Input data is too big. Upgrade to lxml "
+ f"{lxml_huge_tree_version} or later for huge_tree support."
+ )
+ if root is None:
+ root = etree.fromstring(b"<html/>", parser=parser, base_url=base_url)
+ return root
class SelectorList(List[_SelectorType]):
@@ -64,27 +122,26 @@ class SelectorList(List[_SelectorType]):
"""
@typing.overload
- def __getitem__(self, pos: 'SupportsIndex') ->_SelectorType:
+ def __getitem__(self, pos: "SupportsIndex") -> _SelectorType:
pass
@typing.overload
- def __getitem__(self, pos: slice) ->'SelectorList[_SelectorType]':
+ def __getitem__(self, pos: slice) -> "SelectorList[_SelectorType]":
pass
- def __getitem__(self, pos: Union['SupportsIndex', slice]) ->Union[
- _SelectorType, 'SelectorList[_SelectorType]']:
+ def __getitem__(
+ self, pos: Union["SupportsIndex", slice]
+ ) -> Union[_SelectorType, "SelectorList[_SelectorType]"]:
o = super().__getitem__(pos)
if isinstance(pos, slice):
- return self.__class__(typing.cast('SelectorList[_SelectorType]', o)
- )
+ return self.__class__(typing.cast("SelectorList[_SelectorType]", o))
else:
return typing.cast(_SelectorType, o)
- def __getstate__(self) ->None:
+ def __getstate__(self) -> None:
raise TypeError("can't pickle SelectorList objects")
- def jmespath(self, query: str, **kwargs: Any
- ) ->'SelectorList[_SelectorType]':
+ def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]":
"""
Call the ``.jmespath()`` method for each element in this list and return
their results flattened as another :class:`SelectorList`.
@@ -96,10 +153,14 @@ class SelectorList(List[_SelectorType]):
selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
"""
- pass
+ return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self]))
- def xpath(self, xpath: str, namespaces: Optional[Mapping[str, str]]=
- None, **kwargs: Any) ->'SelectorList[_SelectorType]':
+ def xpath(
+ self,
+ xpath: str,
+ namespaces: Optional[Mapping[str, str]] = None,
+ **kwargs: Any,
+ ) -> "SelectorList[_SelectorType]":
"""
Call the ``.xpath()`` method for each element in this list and return
their results flattened as another :class:`SelectorList`.
@@ -116,19 +177,22 @@ class SelectorList(List[_SelectorType]):
selector.xpath('//a[href=$url]', url="http://www.example.com")
"""
- pass
+ return self.__class__(
+ flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self])
+ )
- def css(self, query: str) ->'SelectorList[_SelectorType]':
+ def css(self, query: str) -> "SelectorList[_SelectorType]":
"""
Call the ``.css()`` method for each element in this list and return
their results flattened as another :class:`SelectorList`.
``query`` is the same argument as the one in :meth:`Selector.css`
"""
- pass
+ return self.__class__(flatten([x.css(query) for x in self]))
- def re(self, regex: Union[str, Pattern[str]], replace_entities: bool=True
- ) ->List[str]:
+ def re(
+ self, regex: Union[str, Pattern[str]], replace_entities: bool = True
+ ) -> List[str]:
"""
Call the ``.re()`` method for each element in this list and return
their results flattened, as a list of strings.
@@ -138,10 +202,32 @@ class SelectorList(List[_SelectorType]):
Passing ``replace_entities`` as ``False`` switches off these
replacements.
"""
+ return flatten([x.re(regex, replace_entities=replace_entities) for x in self])
+
+ @typing.overload
+ def re_first(
+ self,
+ regex: Union[str, Pattern[str]],
+ default: None = None,
+ replace_entities: bool = True,
+ ) -> Optional[str]:
+ pass
+
+ @typing.overload
+ def re_first(
+ self,
+ regex: Union[str, Pattern[str]],
+ default: str,
+ replace_entities: bool = True,
+ ) -> str:
pass
- def re_first(self, regex: Union[str, Pattern[str]], default: Optional[
- str]=None, replace_entities: bool=True) ->Optional[str]:
+ def re_first(
+ self,
+ regex: Union[str, Pattern[str]],
+ default: Optional[str] = None,
+ replace_entities: bool = True,
+ ) -> Optional[str]:
"""
Call the ``.re()`` method for the first element in this list and
return the result in an string. If the list is empty or the
@@ -153,47 +239,156 @@ class SelectorList(List[_SelectorType]):
Passing ``replace_entities`` as ``False`` switches off these
replacements.
"""
- pass
+ for el in iflatten(
+ x.re(regex, replace_entities=replace_entities) for x in self
+ ):
+ return typing.cast(str, el)
+ return default
- def getall(self) ->List[str]:
+ def getall(self) -> List[str]:
"""
Call the ``.get()`` method for each element is this list and return
their results flattened, as a list of strings.
"""
- pass
+ return [x.get() for x in self]
+
extract = getall
- def get(self, default: Optional[str]=None) ->Any:
+ @typing.overload
+ def get(self, default: None = None) -> Optional[str]:
+ pass
+
+ @typing.overload
+ def get(self, default: str) -> str:
+ pass
+
+ def get(self, default: Optional[str] = None) -> Any:
"""
Return the result of ``.get()`` for the first element in this list.
If the list is empty, return the default value.
"""
- pass
+ for x in self:
+ return x.get()
+ return default
+
extract_first = get
@property
- def attrib(self) ->Mapping[str, str]:
+ def attrib(self) -> Mapping[str, str]:
"""Return the attributes dictionary for the first element.
If the list is empty, return an empty dict.
"""
- pass
+ for x in self:
+ return x.attrib
+ return {}
- def remove(self) ->None:
+ def remove(self) -> None: # type: ignore[override]
"""
Remove matched nodes from the parent for each element in this list.
"""
- pass
+ warn(
+ "Method parsel.selector.SelectorList.remove is deprecated, please use parsel.selector.SelectorList.drop method instead",
+ category=DeprecationWarning,
+ stacklevel=2,
+ )
+ for x in self:
+ x.remove()
- def drop(self) ->None:
+ def drop(self) -> None:
"""
Drop matched nodes from the parent for each element in this list.
"""
- pass
+ for x in self:
+ x.drop()
_NOT_SET = object()
+def _get_root_from_text(text: str, *, type: str, **lxml_kwargs: Any) -> etree._Element:
+ return create_root_node(text, _ctgroup[type]["_parser"], **lxml_kwargs)
+
+
+def _get_root_and_type_from_bytes(
+ body: bytes,
+ encoding: str,
+ *,
+ input_type: Optional[str],
+ **lxml_kwargs: Any,
+) -> Tuple[Any, str]:
+ if input_type == "text":
+ return body.decode(encoding), input_type
+ if encoding == "utf8":
+ try:
+ data = json.load(BytesIO(body))
+ except ValueError:
+ data = _NOT_SET
+ if data is not _NOT_SET:
+ return data, "json"
+ if input_type == "json":
+ return None, "json"
+ assert input_type in ("html", "xml", None) # nosec
+ type = _xml_or_html(input_type)
+ root = create_root_node(
+ text="",
+ body=body,
+ encoding=encoding,
+ parser_cls=_ctgroup[type]["_parser"],
+ **lxml_kwargs,
+ )
+ return root, type
+
+
+def _get_root_and_type_from_text(
+ text: str, *, input_type: Optional[str], **lxml_kwargs: Any
+) -> Tuple[Any, str]:
+ if input_type == "text":
+ return text, input_type
+ try:
+ data = json.loads(text)
+ except ValueError:
+ data = _NOT_SET
+ if data is not _NOT_SET:
+ return data, "json"
+ if input_type == "json":
+ return None, "json"
+ assert input_type in ("html", "xml", None) # nosec
+ type = _xml_or_html(input_type)
+ root = _get_root_from_text(text, type=type, **lxml_kwargs)
+ return root, type
+
+
+def _get_root_type(root: Any, *, input_type: Optional[str]) -> str:
+ if isinstance(root, etree._Element): # pylint: disable=protected-access
+ if input_type in {"json", "text"}:
+ raise ValueError(
+ f"Selector got an lxml.etree._Element object as root, "
+ f"and {input_type!r} as type."
+ )
+ return _xml_or_html(input_type)
+ elif isinstance(root, (dict, list)) or _is_valid_json(root):
+ return "json"
+ return input_type or "json"
+
+
+def _is_valid_json(text: str) -> bool:
+ try:
+ json.loads(text)
+ except (TypeError, ValueError):
+ return False
+ else:
+ return True
+
+
+def _load_json_or_none(text: str) -> Any:
+ if isinstance(text, (str, bytes, bytearray)):
+ try:
+ return json.loads(text)
+ except ValueError:
+ return None
+ return None
+
+
class Selector:
"""Wrapper for input data in HTML, JSON, or XML format, that allows
selecting parts of it using selection expressions.
@@ -220,68 +415,125 @@ class Selector:
See `this lxml FAQ entry <https://lxml.de/FAQ.html#is-lxml-vulnerable-to-xml-bombs>`_
for more information.
"""
- __slots__ = ['namespaces', 'type', '_expr', '_huge_tree', 'root',
- '_text', 'body', '__weakref__']
- _default_namespaces = {'re': 'http://exslt.org/regular-expressions',
- 'set': 'http://exslt.org/sets'}
- _lxml_smart_strings = False
- selectorlist_cls = SelectorList['Selector']
- def __init__(self, text: Optional[str]=None, type: Optional[str]=None,
- body: bytes=b'', encoding: str='utf8', namespaces: Optional[Mapping
- [str, str]]=None, root: Optional[Any]=_NOT_SET, base_url: Optional[
- str]=None, _expr: Optional[str]=None, huge_tree: bool=
- LXML_SUPPORTS_HUGE_TREE) ->None:
+ __slots__ = [
+ "namespaces",
+ "type",
+ "_expr",
+ "_huge_tree",
+ "root",
+ "_text",
+ "body",
+ "__weakref__",
+ ]
+
+ _default_namespaces = {
+ "re": "http://exslt.org/regular-expressions",
+ # supported in libxslt:
+ # set:difference
+ # set:has-same-node
+ # set:intersection
+ # set:leading
+ # set:trailing
+ "set": "http://exslt.org/sets",
+ }
+ _lxml_smart_strings = False
+ selectorlist_cls = SelectorList["Selector"]
+
+ def __init__(
+ self,
+ text: Optional[str] = None,
+ type: Optional[str] = None,
+ body: bytes = b"",
+ encoding: str = "utf8",
+ namespaces: Optional[Mapping[str, str]] = None,
+ root: Optional[Any] = _NOT_SET,
+ base_url: Optional[str] = None,
+ _expr: Optional[str] = None,
+ huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
+ ) -> None:
self.root: Any
- if type not in ('html', 'json', 'text', 'xml', None):
- raise ValueError(f'Invalid type: {type}')
+ if type not in ("html", "json", "text", "xml", None):
+ raise ValueError(f"Invalid type: {type}")
+
if text is None and not body and root is _NOT_SET:
- raise ValueError('Selector needs text, body, or root arguments')
+ raise ValueError("Selector needs text, body, or root arguments")
+
if text is not None and not isinstance(text, str):
- msg = f'text argument should be of type str, got {text.__class__}'
+ msg = f"text argument should be of type str, got {text.__class__}"
raise TypeError(msg)
+
if text is not None:
if root is not _NOT_SET:
warnings.warn(
- 'Selector got both text and root, root is being ignored.',
- stacklevel=2)
+ "Selector got both text and root, root is being ignored.",
+ stacklevel=2,
+ )
if not isinstance(text, str):
- msg = (
- f'text argument should be of type str, got {text.__class__}'
- )
+ msg = f"text argument should be of type str, got {text.__class__}"
raise TypeError(msg)
- root, type = _get_root_and_type_from_text(text, input_type=type,
- base_url=base_url, huge_tree=huge_tree)
+
+ root, type = _get_root_and_type_from_text(
+ text,
+ input_type=type,
+ base_url=base_url,
+ huge_tree=huge_tree,
+ )
self.root = root
self.type = type
elif body:
if not isinstance(body, bytes):
- msg = (
- f'body argument should be of type bytes, got {body.__class__}'
- )
+ msg = f"body argument should be of type bytes, got {body.__class__}"
raise TypeError(msg)
- root, type = _get_root_and_type_from_bytes(body=body, encoding=
- encoding, input_type=type, base_url=base_url, huge_tree=
- huge_tree)
+ root, type = _get_root_and_type_from_bytes(
+ body=body,
+ encoding=encoding,
+ input_type=type,
+ base_url=base_url,
+ huge_tree=huge_tree,
+ )
self.root = root
self.type = type
elif root is _NOT_SET:
- raise ValueError('Selector needs text, body, or root arguments')
+ raise ValueError("Selector needs text, body, or root arguments")
else:
self.root = root
self.type = _get_root_type(root, input_type=type)
+
self.namespaces = dict(self._default_namespaces)
if namespaces is not None:
self.namespaces.update(namespaces)
+
self._expr = _expr
self._huge_tree = huge_tree
self._text = text
- def __getstate__(self) ->Any:
+ def __getstate__(self) -> Any:
raise TypeError("can't pickle Selector objects")
- def jmespath(self: _SelectorType, query: str, **kwargs: Any
- ) ->SelectorList[_SelectorType]:
+ def _get_root(
+ self,
+ text: str = "",
+ base_url: Optional[str] = None,
+ huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
+ type: Optional[str] = None,
+ body: bytes = b"",
+ encoding: str = "utf8",
+ ) -> etree._Element:
+ return create_root_node(
+ text,
+ body=body,
+ encoding=encoding,
+ parser_cls=_ctgroup[type or self.type]["_parser"],
+ base_url=base_url,
+ huge_tree=huge_tree,
+ )
+
+ def jmespath(
+ self: _SelectorType,
+ query: str,
+ **kwargs: Any,
+ ) -> SelectorList[_SelectorType]:
"""
Find objects matching the JMESPath ``query`` and return the result as a
:class:`SelectorList` instance with all elements flattened. List
@@ -295,10 +547,37 @@ class Selector:
selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
"""
- pass
-
- def xpath(self: _SelectorType, query: str, namespaces: Optional[Mapping
- [str, str]]=None, **kwargs: Any) ->SelectorList[_SelectorType]:
+ if self.type == "json":
+ if isinstance(self.root, str):
+ # Selector received a JSON string as root.
+ data = _load_json_or_none(self.root)
+ else:
+ data = self.root
+ else:
+ assert self.type in {"html", "xml"} # nosec
+ data = _load_json_or_none(self.root.text)
+
+ result = jmespath.search(query, data, **kwargs)
+ if result is None:
+ result = []
+ elif not isinstance(result, list):
+ result = [result]
+
+ def make_selector(x: Any) -> _SelectorType: # closure function
+ if isinstance(x, str):
+ return self.__class__(text=x, _expr=query, type="text")
+ else:
+ return self.__class__(root=x, _expr=query)
+
+ result = [make_selector(x) for x in result]
+ return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
+
+ def xpath(
+ self: _SelectorType,
+ query: str,
+ namespaces: Optional[Mapping[str, str]] = None,
+ **kwargs: Any,
+ ) -> SelectorList[_SelectorType]:
"""
Find nodes matching the xpath ``query`` and return the result as a
:class:`SelectorList` instance with all elements flattened. List
@@ -316,9 +595,51 @@ class Selector:
selector.xpath('//a[href=$url]', url="http://www.example.com")
"""
- pass
+ if self.type not in ("html", "xml", "text"):
+ raise ValueError(f"Cannot use xpath on a Selector of type {self.type!r}")
+ if self.type in ("html", "xml"):
+ try:
+ xpathev = self.root.xpath
+ except AttributeError:
+ return typing.cast(
+ SelectorList[_SelectorType], self.selectorlist_cls([])
+ )
+ else:
+ try:
+ xpathev = self._get_root(self._text or "", type="html").xpath
+ except AttributeError:
+ return typing.cast(
+ SelectorList[_SelectorType], self.selectorlist_cls([])
+ )
- def css(self: _SelectorType, query: str) ->SelectorList[_SelectorType]:
+ nsp = dict(self.namespaces)
+ if namespaces is not None:
+ nsp.update(namespaces)
+ try:
+ result = xpathev(
+ query,
+ namespaces=nsp,
+ smart_strings=self._lxml_smart_strings,
+ **kwargs,
+ )
+ except etree.XPathError as exc:
+ raise ValueError(f"XPath error: {exc} in {query}")
+
+ if type(result) is not list:
+ result = [result]
+
+ result = [
+ self.__class__(
+ root=x,
+ _expr=query,
+ namespaces=self.namespaces,
+ type=_xml_or_html(self.type),
+ )
+ for x in result
+ ]
+ return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
+
+ def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
"""
Apply the given CSS selector and return a :class:`SelectorList` instance.
@@ -329,10 +650,17 @@ class Selector:
.. _cssselect: https://pypi.python.org/pypi/cssselect/
"""
- pass
+ if self.type not in ("html", "xml", "text"):
+ raise ValueError(f"Cannot use css on a Selector of type {self.type!r}")
+ return self.xpath(self._css2xpath(query))
- def re(self, regex: Union[str, Pattern[str]], replace_entities: bool=True
- ) ->List[str]:
+ def _css2xpath(self, query: str) -> str:
+ type = _xml_or_html(self.type)
+ return _ctgroup[type]["_csstranslator"].css_to_xpath(query)
+
+ def re(
+ self, regex: Union[str, Pattern[str]], replace_entities: bool = True
+ ) -> List[str]:
"""
Apply the given regex and return a list of strings with the
matches.
@@ -345,10 +673,33 @@ class Selector:
Passing ``replace_entities`` as ``False`` switches off these
replacements.
"""
+ data = self.get()
+ return extract_regex(regex, data, replace_entities=replace_entities)
+
+ @typing.overload
+ def re_first(
+ self,
+ regex: Union[str, Pattern[str]],
+ default: None = None,
+ replace_entities: bool = True,
+ ) -> Optional[str]:
pass
- def re_first(self, regex: Union[str, Pattern[str]], default: Optional[
- str]=None, replace_entities: bool=True) ->Optional[str]:
+ @typing.overload
+ def re_first(
+ self,
+ regex: Union[str, Pattern[str]],
+ default: str,
+ replace_entities: bool = True,
+ ) -> str:
+ pass
+
+ def re_first(
+ self,
+ regex: Union[str, Pattern[str]],
+ default: Optional[str] = None,
+ replace_entities: bool = True,
+ ) -> Optional[str]:
"""
Apply the given regex and return the first string which matches. If
there is no match, return the default value (``None`` if the argument
@@ -359,68 +710,145 @@ class Selector:
Passing ``replace_entities`` as ``False`` switches off these
replacements.
"""
- pass
+ return next(
+ iflatten(self.re(regex, replace_entities=replace_entities)),
+ default,
+ )
- def get(self) ->Any:
+ def get(self) -> Any:
"""
Serialize and return the matched nodes.
For HTML and XML, the result is always a string, and percent-encoded
content is unquoted.
"""
- pass
+ if self.type in ("text", "json"):
+ return self.root
+ try:
+ return typing.cast(
+ str,
+ etree.tostring(
+ self.root,
+ method=_ctgroup[self.type]["_tostring_method"],
+ encoding="unicode",
+ with_tail=False,
+ ),
+ )
+ except (AttributeError, TypeError):
+ if self.root is True:
+ return "1"
+ elif self.root is False:
+ return "0"
+ else:
+ return str(self.root)
+
extract = get
- def getall(self) ->List[str]:
+ def getall(self) -> List[str]:
"""
Serialize and return the matched node in a 1-element list of strings.
"""
- pass
+ return [self.get()]
- def register_namespace(self, prefix: str, uri: str) ->None:
+ def register_namespace(self, prefix: str, uri: str) -> None:
"""
Register the given namespace to be used in this :class:`Selector`.
Without registering namespaces you can't select or extract data from
non-standard namespaces. See :ref:`selector-examples-xml`.
"""
- pass
+ self.namespaces[prefix] = uri
- def remove_namespaces(self) ->None:
+ def remove_namespaces(self) -> None:
"""
Remove all namespaces, allowing to traverse the document using
namespace-less xpaths. See :ref:`removing-namespaces`.
"""
- pass
+ for el in self.root.iter("*"):
+ if el.tag.startswith("{"):
+ el.tag = el.tag.split("}", 1)[1]
+ # loop on element attributes also
+ for an in el.attrib:
+ if an.startswith("{"):
+ el.attrib[an.split("}", 1)[1]] = el.attrib.pop(an)
+ # remove namespace declarations
+ etree.cleanup_namespaces(self.root)
- def remove(self) ->None:
+ def remove(self) -> None:
"""
Remove matched nodes from the parent element.
"""
- pass
-
- def drop(self) ->None:
+ warn(
+ "Method parsel.selector.Selector.remove is deprecated, please use parsel.selector.Selector.drop method instead",
+ category=DeprecationWarning,
+ stacklevel=2,
+ )
+ try:
+ parent = self.root.getparent()
+ except AttributeError:
+ # 'str' object has no attribute 'getparent'
+ raise CannotRemoveElementWithoutRoot(
+ "The node you're trying to remove has no root, "
+ "are you trying to remove a pseudo-element? "
+ "Try to use 'li' as a selector instead of 'li::text' or "
+ "'//li' instead of '//li/text()', for example."
+ )
+
+ try:
+ parent.remove(self.root)
+ except AttributeError:
+ # 'NoneType' object has no attribute 'remove'
+ raise CannotRemoveElementWithoutParent(
+ "The node you're trying to remove has no parent, "
+ "are you trying to remove a root element?"
+ )
+
+ def drop(self) -> None:
"""
Drop matched nodes from the parent element.
"""
- pass
+ try:
+ parent = self.root.getparent()
+ except AttributeError:
+ # 'str' object has no attribute 'getparent'
+ raise CannotRemoveElementWithoutRoot(
+ "The node you're trying to drop has no root, "
+ "are you trying to drop a pseudo-element? "
+ "Try to use 'li' as a selector instead of 'li::text' or "
+ "'//li' instead of '//li/text()', for example."
+ )
+
+ try:
+ if self.type == "xml":
+ if parent is None:
+ raise ValueError("This node has no parent")
+ parent.remove(self.root)
+ else:
+ typing.cast(html.HtmlElement, self.root).drop_tree()
+ except (AttributeError, AssertionError):
+ # 'NoneType' object has no attribute 'drop'
+ raise CannotDropElementWithoutParent(
+ "The node you're trying to remove has no parent, "
+ "are you trying to remove a root element?"
+ )
@property
- def attrib(self) ->Dict[str, str]:
+ def attrib(self) -> Dict[str, str]:
"""Return the attributes dictionary for underlying element."""
- pass
+ return dict(self.root.attrib)
- def __bool__(self) ->bool:
+ def __bool__(self) -> bool:
"""
Return ``True`` if there is any real content selected or ``False``
otherwise. In other words, the boolean value of a :class:`Selector` is
given by the contents it selects.
"""
return bool(self.get())
+
__nonzero__ = __bool__
- def __str__(self) ->str:
+ def __str__(self) -> str:
return str(self.get())
- def __repr__(self) ->str:
+ def __repr__(self) -> str:
data = repr(shorten(str(self.get()), width=40))
- return f'<{type(self).__name__} query={self._expr!r} data={data}>'
+ return f"<{type(self).__name__} query={self._expr!r} data={data}>"
diff --git a/parsel/utils.py b/parsel/utils.py
index 361f99c..ec77d74 100644
--- a/parsel/utils.py
+++ b/parsel/utils.py
@@ -1,9 +1,10 @@
import re
from typing import Any, Iterable, Iterator, List, Match, Pattern, Union, cast
+
from w3lib.html import replace_entities as w3lib_replace_entities
-def flatten(x: Iterable[Any]) ->List[Any]:
+def flatten(x: Iterable[Any]) -> List[Any]:
"""flatten(sequence) -> list
Returns a single, flat list which contains all elements retrieved
from the sequence and all recursively contained sub-sequences
@@ -18,16 +19,20 @@ def flatten(x: Iterable[Any]) ->List[Any]:
>>> flatten(["foo", ["baz", 42], "bar"])
['foo', 'baz', 42, 'bar']
"""
- pass
+ return list(iflatten(x))
-def iflatten(x: Iterable[Any]) ->Iterator[Any]:
+def iflatten(x: Iterable[Any]) -> Iterator[Any]:
"""iflatten(sequence) -> Iterator
Similar to ``.flatten()``, but returns iterator instead"""
- pass
+ for el in x:
+ if _is_listlike(el):
+ yield from flatten(el)
+ else:
+ yield el
-def _is_listlike(x: Any) ->bool:
+def _is_listlike(x: Any) -> bool:
"""
>>> _is_listlike("foo")
False
@@ -48,19 +53,44 @@ def _is_listlike(x: Any) ->bool:
>>> _is_listlike(range(5))
True
"""
- pass
+ return hasattr(x, "__iter__") and not isinstance(x, (str, bytes))
-def extract_regex(regex: Union[str, Pattern[str]], text: str,
- replace_entities: bool=True) ->List[str]:
+def extract_regex(
+ regex: Union[str, Pattern[str]], text: str, replace_entities: bool = True
+) -> List[str]:
"""Extract a list of strings from the given text/encoding using the following policies:
* if the regex contains a named group called "extract" that will be returned
* if the regex contains multiple numbered groups, all those will be returned (flattened)
* if the regex doesn't contain any group the entire regex matching is returned
"""
- pass
+ if isinstance(regex, str):
+ regex = re.compile(regex, re.UNICODE)
+
+ if "extract" in regex.groupindex:
+ # named group
+ try:
+ extracted = cast(Match[str], regex.search(text)).group("extract")
+ except AttributeError:
+ strings = []
+ else:
+ strings = [extracted] if extracted is not None else []
+ else:
+ # full regex or numbered groups
+ strings = regex.findall(text)
+
+ strings = flatten(strings)
+ if not replace_entities:
+ return strings
+ return [w3lib_replace_entities(s, keep=["lt", "amp"]) for s in strings]
-def shorten(text: str, width: int, suffix: str='...') ->str:
+def shorten(text: str, width: int, suffix: str = "...") -> str:
"""Truncate the given text to fit in the given width."""
- pass
+ if len(text) <= width:
+ return text
+ if width > len(suffix):
+ return text[: width - len(suffix)] + suffix
+ if width >= 0:
+ return suffix[len(suffix) - width :]
+ raise ValueError("width must be equal or greater than 0")
diff --git a/parsel/xpathfuncs.py b/parsel/xpathfuncs.py
index 55d8f4a..7633d10 100644
--- a/parsel/xpathfuncs.py
+++ b/parsel/xpathfuncs.py
@@ -1,12 +1,14 @@
import re
from typing import Any, Callable, Optional
+
from lxml import etree
from w3lib.html import HTML5_WHITESPACE
-regex = f'[{HTML5_WHITESPACE}]+'
+
+regex = f"[{HTML5_WHITESPACE}]+"
replace_html5_whitespaces = re.compile(regex).sub
-def set_xpathfunc(fname: str, func: Optional[Callable]) ->None:
+def set_xpathfunc(fname: str, func: Optional[Callable]) -> None: # type: ignore[type-arg]
"""Register a custom extension function to use in XPath expressions.
The function ``func`` registered under ``fname`` identifier will be called
@@ -20,13 +22,37 @@ def set_xpathfunc(fname: str, func: Optional[Callable]) ->None:
.. _`in lxml documentation`: https://lxml.de/extensions.html#xpath-extension-functions
"""
- pass
+ ns_fns = etree.FunctionNamespace(None)
+ if func is not None:
+ ns_fns[fname] = func
+ else:
+ del ns_fns[fname]
+
+
+def setup() -> None:
+ set_xpathfunc("has-class", has_class)
-def has_class(context: Any, *classes: str) ->bool:
+def has_class(context: Any, *classes: str) -> bool:
"""has-class function.
Return True if all ``classes`` are present in element's class attr.
"""
- pass
+ if not context.eval_context.get("args_checked"):
+ if not classes:
+ raise ValueError("XPath error: has-class must have at least 1 argument")
+ for c in classes:
+ if not isinstance(c, str):
+ raise ValueError("XPath error: has-class arguments must be strings")
+ context.eval_context["args_checked"] = True
+
+ node_cls = context.context_node.get("class")
+ if node_cls is None:
+ return False
+ node_cls = " " + node_cls + " "
+ node_cls = replace_html5_whitespaces(" ", node_cls)
+ for cls in classes:
+ if " " + cls + " " not in node_cls:
+ return False
+ return True