Skip to content

back to Reference (Gold) summary

Reference (Gold): parsel

Pytest summary for the test suite

status count
passed 206
skipped 2
total 208
collected 208

Failed pytests: none (206 passed, 2 skipped, 0 failed)

Patch diff

diff --git a/parsel/csstranslator.py b/parsel/csstranslator.py
index 2bf8cc7..80bfc7c 100644
--- a/parsel/csstranslator.py
+++ b/parsel/csstranslator.py
@@ -1,36 +1,76 @@
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Optional, Protocol
+
 from cssselect import GenericTranslator as OriginalGenericTranslator
 from cssselect import HTMLTranslator as OriginalHTMLTranslator
 from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
 from cssselect.xpath import ExpressionError
 from cssselect.xpath import XPathExpr as OriginalXPathExpr
+
 if TYPE_CHECKING:
+    # typing.Self requires Python 3.11
     from typing_extensions import Self


 class XPathExpr(OriginalXPathExpr):
+
     textnode: bool = False
     attribute: Optional[str] = None

-    def __str__(self) ->str:
+    @classmethod
+    def from_xpath(
+        cls,
+        xpath: OriginalXPathExpr,
+        textnode: bool = False,
+        attribute: Optional[str] = None,
+    ) -> "Self":
+        x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
+        x.textnode = textnode
+        x.attribute = attribute
+        return x
+
+    def __str__(self) -> str:
         path = super().__str__()
         if self.textnode:
-            if path == '*':
-                path = 'text()'
-            elif path.endswith('::*/*'):
-                path = path[:-3] + 'text()'
+            if path == "*":
+                path = "text()"
+            elif path.endswith("::*/*"):
+                path = path[:-3] + "text()"
             else:
-                path += '/text()'
+                path += "/text()"
+
         if self.attribute is not None:
-            if path.endswith('::*/*'):
+            if path.endswith("::*/*"):
                 path = path[:-2]
-            path += f'/@{self.attribute}'
-        return path
+            path += f"/@{self.attribute}"

+        return path

+    def join(
+        self: "Self",
+        combiner: str,
+        other: OriginalXPathExpr,
+        *args: Any,
+        **kwargs: Any,
+    ) -> "Self":
+        if not isinstance(other, XPathExpr):
+            raise ValueError(
+                f"Expressions of type {__name__}.XPathExpr can ony join expressions"
+                f" of the same type (or its descendants), got {type(other)}"
+            )
+        super().join(combiner, other, *args, **kwargs)
+        self.textnode = other.textnode
+        self.attribute = other.attribute
+        return self
+
+
+# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
 class TranslatorProtocol(Protocol):
-    pass
+    def xpath_element(self, selector: Element) -> OriginalXPathExpr:
+        pass
+
+    def css_to_xpath(self, css: str, prefix: str = ...) -> str:
+        pass


 class TranslatorMixin:
@@ -39,35 +79,67 @@ class TranslatorMixin:
     Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
     """

-    def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element:
-        PseudoElement) ->OriginalXPathExpr:
+    def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
+        # https://github.com/python/mypy/issues/12344
+        xpath = super().xpath_element(selector)  # type: ignore[safe-super]
+        return XPathExpr.from_xpath(xpath)
+
+    def xpath_pseudo_element(
+        self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
+    ) -> OriginalXPathExpr:
         """
         Dispatch method that transforms XPath to support pseudo-element
         """
-        pass
-
-    def xpath_attr_functional_pseudo_element(self, xpath: OriginalXPathExpr,
-        function: FunctionalPseudoElement) ->XPathExpr:
+        if isinstance(pseudo_element, FunctionalPseudoElement):
+            method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
+            method = getattr(self, method_name, None)
+            if not method:
+                raise ExpressionError(
+                    f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
+                )
+            xpath = method(xpath, pseudo_element)
+        else:
+            method_name = (
+                f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
+            )
+            method = getattr(self, method_name, None)
+            if not method:
+                raise ExpressionError(
+                    f"The pseudo-element ::{pseudo_element} is unknown"
+                )
+            xpath = method(xpath)
+        return xpath
+
+    def xpath_attr_functional_pseudo_element(
+        self, xpath: OriginalXPathExpr, function: FunctionalPseudoElement
+    ) -> XPathExpr:
         """Support selecting attribute values using ::attr() pseudo-element"""
-        pass
+        if function.argument_types() not in (["STRING"], ["IDENT"]):
+            raise ExpressionError(
+                f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
+            )
+        return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)

-    def xpath_text_simple_pseudo_element(self, xpath: OriginalXPathExpr
-        ) ->XPathExpr:
+    def xpath_text_simple_pseudo_element(self, xpath: OriginalXPathExpr) -> XPathExpr:
         """Support selecting text nodes using ::text pseudo-element"""
-        pass
+        return XPathExpr.from_xpath(xpath, textnode=True)


 class GenericTranslator(TranslatorMixin, OriginalGenericTranslator):
-    pass
+    @lru_cache(maxsize=256)
+    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
+        return super().css_to_xpath(css, prefix)


 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    pass
+    @lru_cache(maxsize=256)
+    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
+        return super().css_to_xpath(css, prefix)


 _translator = HTMLTranslator()


-def css2xpath(query: str) ->str:
-    """Return translated XPath version of a given CSS query"""
-    pass
+def css2xpath(query: str) -> str:
+    "Return translated XPath version of a given CSS query"
+    return _translator.css_to_xpath(query)
diff --git a/parsel/selector.py b/parsel/selector.py
index 6aa73da..2027599 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -1,21 +1,44 @@
 """XPath and JMESPath selectors based on the lxml and jmespath Python
 packages."""
+
 import json
 import typing
 import warnings
 from io import BytesIO
-from typing import Any, Dict, List, Literal, Mapping, Optional, Pattern, SupportsIndex, Tuple, Type, TypedDict, TypeVar, Union
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Pattern,
+    SupportsIndex,
+    Tuple,
+    Type,
+    TypedDict,
+    TypeVar,
+    Union,
+)
 from warnings import warn
+
 import jmespath
 from lxml import etree, html
 from packaging.version import Version
+
 from .csstranslator import GenericTranslator, HTMLTranslator
 from .utils import extract_regex, flatten, iflatten, shorten
-_SelectorType = TypeVar('_SelectorType', bound='Selector')
-_ParserType = Union[etree.XMLParser, etree.HTMLParser]
-_TostringMethodType = Literal['html', 'xml']
+
+_SelectorType = TypeVar("_SelectorType", bound="Selector")
+_ParserType = Union[etree.XMLParser, etree.HTMLParser]  # type: ignore[type-arg]
+# simplified _OutputMethodArg from types-lxml
+_TostringMethodType = Literal[
+    "html",
+    "xml",
+]
+
 lxml_version = Version(etree.__version__)
-lxml_huge_tree_version = Version('4.2')
+lxml_huge_tree_version = Version("4.2")
 LXML_SUPPORTS_HUGE_TREE = lxml_version >= lxml_huge_tree_version


@@ -31,30 +54,65 @@ class CannotDropElementWithoutParent(CannotRemoveElementWithoutParent):
     pass


-class SafeXMLParser(etree.XMLParser):
-
-    def __init__(self, *args: Any, **kwargs: Any) ->None:
-        kwargs.setdefault('resolve_entities', False)
+class SafeXMLParser(etree.XMLParser):  # type: ignore[type-arg]
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        kwargs.setdefault("resolve_entities", False)
         super().__init__(*args, **kwargs)


 class CTGroupValue(TypedDict):
-    _parser: Union[Type[etree.XMLParser], Type[html.HTMLParser]]
+    _parser: Union[Type[etree.XMLParser], Type[html.HTMLParser]]  # type: ignore[type-arg]
     _csstranslator: Union[GenericTranslator, HTMLTranslator]
     _tostring_method: str


-_ctgroup: Dict[str, CTGroupValue] = {'html': {'_parser': html.HTMLParser,
-    '_csstranslator': HTMLTranslator(), '_tostring_method': 'html'}, 'xml':
-    {'_parser': SafeXMLParser, '_csstranslator': GenericTranslator(),
-    '_tostring_method': 'xml'}}
-
-
-def create_root_node(text: str, parser_cls: Type[_ParserType], base_url:
-    Optional[str]=None, huge_tree: bool=LXML_SUPPORTS_HUGE_TREE, body:
-    bytes=b'', encoding: str='utf8') ->etree._Element:
+_ctgroup: Dict[str, CTGroupValue] = {
+    "html": {
+        "_parser": html.HTMLParser,
+        "_csstranslator": HTMLTranslator(),
+        "_tostring_method": "html",
+    },
+    "xml": {
+        "_parser": SafeXMLParser,
+        "_csstranslator": GenericTranslator(),
+        "_tostring_method": "xml",
+    },
+}
+
+
+def _xml_or_html(type: Optional[str]) -> str:
+    return "xml" if type == "xml" else "html"
+
+
+def create_root_node(
+    text: str,
+    parser_cls: Type[_ParserType],
+    base_url: Optional[str] = None,
+    huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
+    body: bytes = b"",
+    encoding: str = "utf8",
+) -> etree._Element:
     """Create root node for text using given parser class."""
-    pass
+    if not text:
+        body = body.replace(b"\x00", b"").strip()
+    else:
+        body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
+
+    if huge_tree and LXML_SUPPORTS_HUGE_TREE:
+        parser = parser_cls(recover=True, encoding=encoding, huge_tree=True)
+        root = etree.fromstring(body, parser=parser, base_url=base_url)
+    else:
+        parser = parser_cls(recover=True, encoding=encoding)
+        root = etree.fromstring(body, parser=parser, base_url=base_url)
+        for error in parser.error_log:
+            if "use XML_PARSE_HUGE option" in error.message:
+                warnings.warn(
+                    f"Input data is too big. Upgrade to lxml "
+                    f"{lxml_huge_tree_version} or later for huge_tree support."
+                )
+    if root is None:
+        root = etree.fromstring(b"<html/>", parser=parser, base_url=base_url)
+    return root


 class SelectorList(List[_SelectorType]):
@@ -64,27 +122,26 @@ class SelectorList(List[_SelectorType]):
     """

     @typing.overload
-    def __getitem__(self, pos: 'SupportsIndex') ->_SelectorType:
+    def __getitem__(self, pos: "SupportsIndex") -> _SelectorType:
         pass

     @typing.overload
-    def __getitem__(self, pos: slice) ->'SelectorList[_SelectorType]':
+    def __getitem__(self, pos: slice) -> "SelectorList[_SelectorType]":
         pass

-    def __getitem__(self, pos: Union['SupportsIndex', slice]) ->Union[
-        _SelectorType, 'SelectorList[_SelectorType]']:
+    def __getitem__(
+        self, pos: Union["SupportsIndex", slice]
+    ) -> Union[_SelectorType, "SelectorList[_SelectorType]"]:
         o = super().__getitem__(pos)
         if isinstance(pos, slice):
-            return self.__class__(typing.cast('SelectorList[_SelectorType]', o)
-                )
+            return self.__class__(typing.cast("SelectorList[_SelectorType]", o))
         else:
             return typing.cast(_SelectorType, o)

-    def __getstate__(self) ->None:
+    def __getstate__(self) -> None:
         raise TypeError("can't pickle SelectorList objects")

-    def jmespath(self, query: str, **kwargs: Any
-        ) ->'SelectorList[_SelectorType]':
+    def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[_SelectorType]":
         """
         Call the ``.jmespath()`` method for each element in this list and return
         their results flattened as another :class:`SelectorList`.
@@ -96,10 +153,14 @@ class SelectorList(List[_SelectorType]):

             selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
         """
-        pass
+        return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self]))

-    def xpath(self, xpath: str, namespaces: Optional[Mapping[str, str]]=
-        None, **kwargs: Any) ->'SelectorList[_SelectorType]':
+    def xpath(
+        self,
+        xpath: str,
+        namespaces: Optional[Mapping[str, str]] = None,
+        **kwargs: Any,
+    ) -> "SelectorList[_SelectorType]":
         """
         Call the ``.xpath()`` method for each element in this list and return
         their results flattened as another :class:`SelectorList`.
@@ -116,19 +177,22 @@ class SelectorList(List[_SelectorType]):

             selector.xpath('//a[href=$url]', url="http://www.example.com")
         """
-        pass
+        return self.__class__(
+            flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self])
+        )

-    def css(self, query: str) ->'SelectorList[_SelectorType]':
+    def css(self, query: str) -> "SelectorList[_SelectorType]":
         """
         Call the ``.css()`` method for each element in this list and return
         their results flattened as another :class:`SelectorList`.

         ``query`` is the same argument as the one in :meth:`Selector.css`
         """
-        pass
+        return self.__class__(flatten([x.css(query) for x in self]))

-    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool=True
-        ) ->List[str]:
+    def re(
+        self, regex: Union[str, Pattern[str]], replace_entities: bool = True
+    ) -> List[str]:
         """
         Call the ``.re()`` method for each element in this list and return
         their results flattened, as a list of strings.
@@ -138,10 +202,32 @@ class SelectorList(List[_SelectorType]):
         Passing ``replace_entities`` as ``False`` switches off these
         replacements.
         """
+        return flatten([x.re(regex, replace_entities=replace_entities) for x in self])
+
+    @typing.overload
+    def re_first(
+        self,
+        regex: Union[str, Pattern[str]],
+        default: None = None,
+        replace_entities: bool = True,
+    ) -> Optional[str]:
+        pass
+
+    @typing.overload
+    def re_first(
+        self,
+        regex: Union[str, Pattern[str]],
+        default: str,
+        replace_entities: bool = True,
+    ) -> str:
         pass

-    def re_first(self, regex: Union[str, Pattern[str]], default: Optional[
-        str]=None, replace_entities: bool=True) ->Optional[str]:
+    def re_first(
+        self,
+        regex: Union[str, Pattern[str]],
+        default: Optional[str] = None,
+        replace_entities: bool = True,
+    ) -> Optional[str]:
         """
         Call the ``.re()`` method for the first element in this list and
         return the result in an string. If the list is empty or the
@@ -153,47 +239,156 @@ class SelectorList(List[_SelectorType]):
         Passing ``replace_entities`` as ``False`` switches off these
         replacements.
         """
-        pass
+        for el in iflatten(
+            x.re(regex, replace_entities=replace_entities) for x in self
+        ):
+            return typing.cast(str, el)
+        return default

-    def getall(self) ->List[str]:
+    def getall(self) -> List[str]:
         """
         Call the ``.get()`` method for each element is this list and return
         their results flattened, as a list of strings.
         """
-        pass
+        return [x.get() for x in self]
+
     extract = getall

-    def get(self, default: Optional[str]=None) ->Any:
+    @typing.overload
+    def get(self, default: None = None) -> Optional[str]:
+        pass
+
+    @typing.overload
+    def get(self, default: str) -> str:
+        pass
+
+    def get(self, default: Optional[str] = None) -> Any:
         """
         Return the result of ``.get()`` for the first element in this list.
         If the list is empty, return the default value.
         """
-        pass
+        for x in self:
+            return x.get()
+        return default
+
     extract_first = get

     @property
-    def attrib(self) ->Mapping[str, str]:
+    def attrib(self) -> Mapping[str, str]:
         """Return the attributes dictionary for the first element.
         If the list is empty, return an empty dict.
         """
-        pass
+        for x in self:
+            return x.attrib
+        return {}

-    def remove(self) ->None:
+    def remove(self) -> None:  # type: ignore[override]
         """
         Remove matched nodes from the parent for each element in this list.
         """
-        pass
+        warn(
+            "Method parsel.selector.SelectorList.remove is deprecated, please use parsel.selector.SelectorList.drop method instead",
+            category=DeprecationWarning,
+            stacklevel=2,
+        )
+        for x in self:
+            x.remove()

-    def drop(self) ->None:
+    def drop(self) -> None:
         """
         Drop matched nodes from the parent for each element in this list.
         """
-        pass
+        for x in self:
+            x.drop()


 _NOT_SET = object()


+def _get_root_from_text(text: str, *, type: str, **lxml_kwargs: Any) -> etree._Element:
+    return create_root_node(text, _ctgroup[type]["_parser"], **lxml_kwargs)
+
+
+def _get_root_and_type_from_bytes(
+    body: bytes,
+    encoding: str,
+    *,
+    input_type: Optional[str],
+    **lxml_kwargs: Any,
+) -> Tuple[Any, str]:
+    if input_type == "text":
+        return body.decode(encoding), input_type
+    if encoding == "utf8":
+        try:
+            data = json.load(BytesIO(body))
+        except ValueError:
+            data = _NOT_SET
+        if data is not _NOT_SET:
+            return data, "json"
+    if input_type == "json":
+        return None, "json"
+    assert input_type in ("html", "xml", None)  # nosec
+    type = _xml_or_html(input_type)
+    root = create_root_node(
+        text="",
+        body=body,
+        encoding=encoding,
+        parser_cls=_ctgroup[type]["_parser"],
+        **lxml_kwargs,
+    )
+    return root, type
+
+
+def _get_root_and_type_from_text(
+    text: str, *, input_type: Optional[str], **lxml_kwargs: Any
+) -> Tuple[Any, str]:
+    if input_type == "text":
+        return text, input_type
+    try:
+        data = json.loads(text)
+    except ValueError:
+        data = _NOT_SET
+    if data is not _NOT_SET:
+        return data, "json"
+    if input_type == "json":
+        return None, "json"
+    assert input_type in ("html", "xml", None)  # nosec
+    type = _xml_or_html(input_type)
+    root = _get_root_from_text(text, type=type, **lxml_kwargs)
+    return root, type
+
+
+def _get_root_type(root: Any, *, input_type: Optional[str]) -> str:
+    if isinstance(root, etree._Element):  # pylint: disable=protected-access
+        if input_type in {"json", "text"}:
+            raise ValueError(
+                f"Selector got an lxml.etree._Element object as root, "
+                f"and {input_type!r} as type."
+            )
+        return _xml_or_html(input_type)
+    elif isinstance(root, (dict, list)) or _is_valid_json(root):
+        return "json"
+    return input_type or "json"
+
+
+def _is_valid_json(text: str) -> bool:
+    try:
+        json.loads(text)
+    except (TypeError, ValueError):
+        return False
+    else:
+        return True
+
+
+def _load_json_or_none(text: str) -> Any:
+    if isinstance(text, (str, bytes, bytearray)):
+        try:
+            return json.loads(text)
+        except ValueError:
+            return None
+    return None
+
+
 class Selector:
     """Wrapper for input data in HTML, JSON, or XML format, that allows
     selecting parts of it using selection expressions.
@@ -220,68 +415,125 @@ class Selector:
     See `this lxml FAQ entry <https://lxml.de/FAQ.html#is-lxml-vulnerable-to-xml-bombs>`_
     for more information.
     """
-    __slots__ = ['namespaces', 'type', '_expr', '_huge_tree', 'root',
-        '_text', 'body', '__weakref__']
-    _default_namespaces = {'re': 'http://exslt.org/regular-expressions',
-        'set': 'http://exslt.org/sets'}
-    _lxml_smart_strings = False
-    selectorlist_cls = SelectorList['Selector']

-    def __init__(self, text: Optional[str]=None, type: Optional[str]=None,
-        body: bytes=b'', encoding: str='utf8', namespaces: Optional[Mapping
-        [str, str]]=None, root: Optional[Any]=_NOT_SET, base_url: Optional[
-        str]=None, _expr: Optional[str]=None, huge_tree: bool=
-        LXML_SUPPORTS_HUGE_TREE) ->None:
+    __slots__ = [
+        "namespaces",
+        "type",
+        "_expr",
+        "_huge_tree",
+        "root",
+        "_text",
+        "body",
+        "__weakref__",
+    ]
+
+    _default_namespaces = {
+        "re": "http://exslt.org/regular-expressions",
+        # supported in libxslt:
+        # set:difference
+        # set:has-same-node
+        # set:intersection
+        # set:leading
+        # set:trailing
+        "set": "http://exslt.org/sets",
+    }
+    _lxml_smart_strings = False
+    selectorlist_cls = SelectorList["Selector"]
+
+    def __init__(
+        self,
+        text: Optional[str] = None,
+        type: Optional[str] = None,
+        body: bytes = b"",
+        encoding: str = "utf8",
+        namespaces: Optional[Mapping[str, str]] = None,
+        root: Optional[Any] = _NOT_SET,
+        base_url: Optional[str] = None,
+        _expr: Optional[str] = None,
+        huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
+    ) -> None:
         self.root: Any
-        if type not in ('html', 'json', 'text', 'xml', None):
-            raise ValueError(f'Invalid type: {type}')
+        if type not in ("html", "json", "text", "xml", None):
+            raise ValueError(f"Invalid type: {type}")
+
         if text is None and not body and root is _NOT_SET:
-            raise ValueError('Selector needs text, body, or root arguments')
+            raise ValueError("Selector needs text, body, or root arguments")
+
         if text is not None and not isinstance(text, str):
-            msg = f'text argument should be of type str, got {text.__class__}'
+            msg = f"text argument should be of type str, got {text.__class__}"
             raise TypeError(msg)
+
         if text is not None:
             if root is not _NOT_SET:
                 warnings.warn(
-                    'Selector got both text and root, root is being ignored.',
-                    stacklevel=2)
+                    "Selector got both text and root, root is being ignored.",
+                    stacklevel=2,
+                )
             if not isinstance(text, str):
-                msg = (
-                    f'text argument should be of type str, got {text.__class__}'
-                    )
+                msg = f"text argument should be of type str, got {text.__class__}"
                 raise TypeError(msg)
-            root, type = _get_root_and_type_from_text(text, input_type=type,
-                base_url=base_url, huge_tree=huge_tree)
+
+            root, type = _get_root_and_type_from_text(
+                text,
+                input_type=type,
+                base_url=base_url,
+                huge_tree=huge_tree,
+            )
             self.root = root
             self.type = type
         elif body:
             if not isinstance(body, bytes):
-                msg = (
-                    f'body argument should be of type bytes, got {body.__class__}'
-                    )
+                msg = f"body argument should be of type bytes, got {body.__class__}"
                 raise TypeError(msg)
-            root, type = _get_root_and_type_from_bytes(body=body, encoding=
-                encoding, input_type=type, base_url=base_url, huge_tree=
-                huge_tree)
+            root, type = _get_root_and_type_from_bytes(
+                body=body,
+                encoding=encoding,
+                input_type=type,
+                base_url=base_url,
+                huge_tree=huge_tree,
+            )
             self.root = root
             self.type = type
         elif root is _NOT_SET:
-            raise ValueError('Selector needs text, body, or root arguments')
+            raise ValueError("Selector needs text, body, or root arguments")
         else:
             self.root = root
             self.type = _get_root_type(root, input_type=type)
+
         self.namespaces = dict(self._default_namespaces)
         if namespaces is not None:
             self.namespaces.update(namespaces)
+
         self._expr = _expr
         self._huge_tree = huge_tree
         self._text = text

-    def __getstate__(self) ->Any:
+    def __getstate__(self) -> Any:
         raise TypeError("can't pickle Selector objects")

-    def jmespath(self: _SelectorType, query: str, **kwargs: Any
-        ) ->SelectorList[_SelectorType]:
+    def _get_root(
+        self,
+        text: str = "",
+        base_url: Optional[str] = None,
+        huge_tree: bool = LXML_SUPPORTS_HUGE_TREE,
+        type: Optional[str] = None,
+        body: bytes = b"",
+        encoding: str = "utf8",
+    ) -> etree._Element:
+        return create_root_node(
+            text,
+            body=body,
+            encoding=encoding,
+            parser_cls=_ctgroup[type or self.type]["_parser"],
+            base_url=base_url,
+            huge_tree=huge_tree,
+        )
+
+    def jmespath(
+        self: _SelectorType,
+        query: str,
+        **kwargs: Any,
+    ) -> SelectorList[_SelectorType]:
         """
         Find objects matching the JMESPath ``query`` and return the result as a
         :class:`SelectorList` instance with all elements flattened. List
@@ -295,10 +547,37 @@ class Selector:

             selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
         """
-        pass
-
-    def xpath(self: _SelectorType, query: str, namespaces: Optional[Mapping
-        [str, str]]=None, **kwargs: Any) ->SelectorList[_SelectorType]:
+        if self.type == "json":
+            if isinstance(self.root, str):
+                # Selector received a JSON string as root.
+                data = _load_json_or_none(self.root)
+            else:
+                data = self.root
+        else:
+            assert self.type in {"html", "xml"}  # nosec
+            data = _load_json_or_none(self.root.text)
+
+        result = jmespath.search(query, data, **kwargs)
+        if result is None:
+            result = []
+        elif not isinstance(result, list):
+            result = [result]
+
+        def make_selector(x: Any) -> _SelectorType:  # closure function
+            if isinstance(x, str):
+                return self.__class__(text=x, _expr=query, type="text")
+            else:
+                return self.__class__(root=x, _expr=query)
+
+        result = [make_selector(x) for x in result]
+        return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
+
+    def xpath(
+        self: _SelectorType,
+        query: str,
+        namespaces: Optional[Mapping[str, str]] = None,
+        **kwargs: Any,
+    ) -> SelectorList[_SelectorType]:
         """
         Find nodes matching the xpath ``query`` and return the result as a
         :class:`SelectorList` instance with all elements flattened. List
@@ -316,9 +595,51 @@ class Selector:

             selector.xpath('//a[href=$url]', url="http://www.example.com")
         """
-        pass
+        if self.type not in ("html", "xml", "text"):
+            raise ValueError(f"Cannot use xpath on a Selector of type {self.type!r}")
+        if self.type in ("html", "xml"):
+            try:
+                xpathev = self.root.xpath
+            except AttributeError:
+                return typing.cast(
+                    SelectorList[_SelectorType], self.selectorlist_cls([])
+                )
+        else:
+            try:
+                xpathev = self._get_root(self._text or "", type="html").xpath
+            except AttributeError:
+                return typing.cast(
+                    SelectorList[_SelectorType], self.selectorlist_cls([])
+                )

-    def css(self: _SelectorType, query: str) ->SelectorList[_SelectorType]:
+        nsp = dict(self.namespaces)
+        if namespaces is not None:
+            nsp.update(namespaces)
+        try:
+            result = xpathev(
+                query,
+                namespaces=nsp,
+                smart_strings=self._lxml_smart_strings,
+                **kwargs,
+            )
+        except etree.XPathError as exc:
+            raise ValueError(f"XPath error: {exc} in {query}")
+
+        if type(result) is not list:
+            result = [result]
+
+        result = [
+            self.__class__(
+                root=x,
+                _expr=query,
+                namespaces=self.namespaces,
+                type=_xml_or_html(self.type),
+            )
+            for x in result
+        ]
+        return typing.cast(SelectorList[_SelectorType], self.selectorlist_cls(result))
+
+    def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]:
         """
         Apply the given CSS selector and return a :class:`SelectorList` instance.

@@ -329,10 +650,17 @@ class Selector:

         .. _cssselect: https://pypi.python.org/pypi/cssselect/
         """
-        pass
+        if self.type not in ("html", "xml", "text"):
+            raise ValueError(f"Cannot use css on a Selector of type {self.type!r}")
+        return self.xpath(self._css2xpath(query))

-    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool=True
-        ) ->List[str]:
+    def _css2xpath(self, query: str) -> str:
+        type = _xml_or_html(self.type)
+        return _ctgroup[type]["_csstranslator"].css_to_xpath(query)
+
+    def re(
+        self, regex: Union[str, Pattern[str]], replace_entities: bool = True
+    ) -> List[str]:
         """
         Apply the given regex and return a list of strings with the
         matches.
@@ -345,10 +673,33 @@ class Selector:
         Passing ``replace_entities`` as ``False`` switches off these
         replacements.
         """
+        data = self.get()
+        return extract_regex(regex, data, replace_entities=replace_entities)
+
+    @typing.overload
+    def re_first(
+        self,
+        regex: Union[str, Pattern[str]],
+        default: None = None,
+        replace_entities: bool = True,
+    ) -> Optional[str]:
         pass

-    def re_first(self, regex: Union[str, Pattern[str]], default: Optional[
-        str]=None, replace_entities: bool=True) ->Optional[str]:
+    @typing.overload
+    def re_first(
+        self,
+        regex: Union[str, Pattern[str]],
+        default: str,
+        replace_entities: bool = True,
+    ) -> str:
+        pass
+
+    def re_first(
+        self,
+        regex: Union[str, Pattern[str]],
+        default: Optional[str] = None,
+        replace_entities: bool = True,
+    ) -> Optional[str]:
         """
         Apply the given regex and return the first string which matches. If
         there is no match, return the default value (``None`` if the argument
@@ -359,68 +710,145 @@ class Selector:
         Passing ``replace_entities`` as ``False`` switches off these
         replacements.
         """
-        pass
+        return next(
+            iflatten(self.re(regex, replace_entities=replace_entities)),
+            default,
+        )

-    def get(self) ->Any:
+    def get(self) -> Any:
         """
         Serialize and return the matched nodes.

         For HTML and XML, the result is always a string, and percent-encoded
         content is unquoted.
         """
-        pass
+        if self.type in ("text", "json"):
+            return self.root
+        try:
+            return typing.cast(
+                str,
+                etree.tostring(
+                    self.root,
+                    method=_ctgroup[self.type]["_tostring_method"],
+                    encoding="unicode",
+                    with_tail=False,
+                ),
+            )
+        except (AttributeError, TypeError):
+            if self.root is True:
+                return "1"
+            elif self.root is False:
+                return "0"
+            else:
+                return str(self.root)
+
     extract = get

-    def getall(self) ->List[str]:
+    def getall(self) -> List[str]:
         """
         Serialize and return the matched node in a 1-element list of strings.
         """
-        pass
+        return [self.get()]

-    def register_namespace(self, prefix: str, uri: str) ->None:
+    def register_namespace(self, prefix: str, uri: str) -> None:
         """
         Register the given namespace to be used in this :class:`Selector`.
         Without registering namespaces you can't select or extract data from
         non-standard namespaces. See :ref:`selector-examples-xml`.
         """
-        pass
+        self.namespaces[prefix] = uri

-    def remove_namespaces(self) ->None:
+    def remove_namespaces(self) -> None:
         """
         Remove all namespaces, allowing to traverse the document using
         namespace-less xpaths. See :ref:`removing-namespaces`.
         """
-        pass
+        for el in self.root.iter("*"):
+            if el.tag.startswith("{"):
+                el.tag = el.tag.split("}", 1)[1]
+            # loop on element attributes also
+            for an in el.attrib:
+                if an.startswith("{"):
+                    el.attrib[an.split("}", 1)[1]] = el.attrib.pop(an)
+        # remove namespace declarations
+        etree.cleanup_namespaces(self.root)

-    def remove(self) ->None:
+    def remove(self) -> None:
         """
         Remove matched nodes from the parent element.
         """
-        pass
-
-    def drop(self) ->None:
+        warn(
+            "Method parsel.selector.Selector.remove is deprecated, please use parsel.selector.Selector.drop method instead",
+            category=DeprecationWarning,
+            stacklevel=2,
+        )
+        try:
+            parent = self.root.getparent()
+        except AttributeError:
+            # 'str' object has no attribute 'getparent'
+            raise CannotRemoveElementWithoutRoot(
+                "The node you're trying to remove has no root, "
+                "are you trying to remove a pseudo-element? "
+                "Try to use 'li' as a selector instead of 'li::text' or "
+                "'//li' instead of '//li/text()', for example."
+            )
+
+        try:
+            parent.remove(self.root)
+        except AttributeError:
+            # 'NoneType' object has no attribute 'remove'
+            raise CannotRemoveElementWithoutParent(
+                "The node you're trying to remove has no parent, "
+                "are you trying to remove a root element?"
+            )
+
+    def drop(self) -> None:
         """
         Drop matched nodes from the parent element.
         """
-        pass
+        try:
+            parent = self.root.getparent()
+        except AttributeError:
+            # 'str' object has no attribute 'getparent'
+            raise CannotRemoveElementWithoutRoot(
+                "The node you're trying to drop has no root, "
+                "are you trying to drop a pseudo-element? "
+                "Try to use 'li' as a selector instead of 'li::text' or "
+                "'//li' instead of '//li/text()', for example."
+            )
+
+        try:
+            if self.type == "xml":
+                if parent is None:
+                    raise ValueError("This node has no parent")
+                parent.remove(self.root)
+            else:
+                typing.cast(html.HtmlElement, self.root).drop_tree()
+        except (AttributeError, AssertionError):
+            # 'NoneType' object has no attribute 'drop'
+            raise CannotDropElementWithoutParent(
+                "The node you're trying to remove has no parent, "
+                "are you trying to remove a root element?"
+            )

     @property
-    def attrib(self) ->Dict[str, str]:
+    def attrib(self) -> Dict[str, str]:
         """Return the attributes dictionary for underlying element."""
-        pass
+        return dict(self.root.attrib)

-    def __bool__(self) ->bool:
+    def __bool__(self) -> bool:
         """
         Return ``True`` if there is any real content selected or ``False``
         otherwise.  In other words, the boolean value of a :class:`Selector` is
         given by the contents it selects.
         """
         return bool(self.get())
+
     __nonzero__ = __bool__

-    def __str__(self) ->str:
+    def __str__(self) -> str:
         return str(self.get())

-    def __repr__(self) ->str:
+    def __repr__(self) -> str:
         data = repr(shorten(str(self.get()), width=40))
-        return f'<{type(self).__name__} query={self._expr!r} data={data}>'
+        return f"<{type(self).__name__} query={self._expr!r} data={data}>"
diff --git a/parsel/utils.py b/parsel/utils.py
index 361f99c..ec77d74 100644
--- a/parsel/utils.py
+++ b/parsel/utils.py
@@ -1,9 +1,10 @@
 import re
 from typing import Any, Iterable, Iterator, List, Match, Pattern, Union, cast
+
 from w3lib.html import replace_entities as w3lib_replace_entities


-def flatten(x: Iterable[Any]) ->List[Any]:
+def flatten(x: Iterable[Any]) -> List[Any]:
     """flatten(sequence) -> list
     Returns a single, flat list which contains all elements retrieved
     from the sequence and all recursively contained sub-sequences
@@ -18,16 +19,20 @@ def flatten(x: Iterable[Any]) ->List[Any]:
     >>> flatten(["foo", ["baz", 42], "bar"])
     ['foo', 'baz', 42, 'bar']
     """
-    pass
+    return list(iflatten(x))


-def iflatten(x: Iterable[Any]) ->Iterator[Any]:
+def iflatten(x: Iterable[Any]) -> Iterator[Any]:
     """iflatten(sequence) -> Iterator
     Similar to ``.flatten()``, but returns iterator instead"""
-    pass
+    for el in x:
+        if _is_listlike(el):
+            yield from flatten(el)
+        else:
+            yield el


-def _is_listlike(x: Any) ->bool:
+def _is_listlike(x: Any) -> bool:
     """
     >>> _is_listlike("foo")
     False
@@ -48,19 +53,44 @@ def _is_listlike(x: Any) ->bool:
     >>> _is_listlike(range(5))
     True
     """
-    pass
+    return hasattr(x, "__iter__") and not isinstance(x, (str, bytes))


-def extract_regex(regex: Union[str, Pattern[str]], text: str,
-    replace_entities: bool=True) ->List[str]:
+def extract_regex(
+    regex: Union[str, Pattern[str]], text: str, replace_entities: bool = True
+) -> List[str]:
     """Extract a list of strings from the given text/encoding using the following policies:
     * if the regex contains a named group called "extract" that will be returned
     * if the regex contains multiple numbered groups, all those will be returned (flattened)
     * if the regex doesn't contain any group the entire regex matching is returned
     """
-    pass
+    if isinstance(regex, str):
+        regex = re.compile(regex, re.UNICODE)
+
+    if "extract" in regex.groupindex:
+        # named group
+        try:
+            extracted = cast(Match[str], regex.search(text)).group("extract")
+        except AttributeError:
+            strings = []
+        else:
+            strings = [extracted] if extracted is not None else []
+    else:
+        # full regex or numbered groups
+        strings = regex.findall(text)
+
+    strings = flatten(strings)
+    if not replace_entities:
+        return strings
+    return [w3lib_replace_entities(s, keep=["lt", "amp"]) for s in strings]


-def shorten(text: str, width: int, suffix: str='...') ->str:
+def shorten(text: str, width: int, suffix: str = "...") -> str:
     """Truncate the given text to fit in the given width."""
-    pass
+    if len(text) <= width:
+        return text
+    if width > len(suffix):
+        return text[: width - len(suffix)] + suffix
+    if width >= 0:
+        return suffix[len(suffix) - width :]
+    raise ValueError("width must be equal or greater than 0")
diff --git a/parsel/xpathfuncs.py b/parsel/xpathfuncs.py
index 55d8f4a..7633d10 100644
--- a/parsel/xpathfuncs.py
+++ b/parsel/xpathfuncs.py
@@ -1,12 +1,14 @@
 import re
 from typing import Any, Callable, Optional
+
 from lxml import etree
 from w3lib.html import HTML5_WHITESPACE
-regex = f'[{HTML5_WHITESPACE}]+'
+
+regex = f"[{HTML5_WHITESPACE}]+"
 replace_html5_whitespaces = re.compile(regex).sub


-def set_xpathfunc(fname: str, func: Optional[Callable]) ->None:
+def set_xpathfunc(fname: str, func: Optional[Callable]) -> None:  # type: ignore[type-arg]
     """Register a custom extension function to use in XPath expressions.

     The function ``func`` registered under ``fname`` identifier will be called
@@ -20,13 +22,37 @@ def set_xpathfunc(fname: str, func: Optional[Callable]) ->None:
     .. _`in lxml documentation`: https://lxml.de/extensions.html#xpath-extension-functions

     """
-    pass
+    ns_fns = etree.FunctionNamespace(None)
+    if func is not None:
+        ns_fns[fname] = func
+    else:
+        del ns_fns[fname]
+
+
+def setup() -> None:
+    set_xpathfunc("has-class", has_class)


-def has_class(context: Any, *classes: str) ->bool:
+def has_class(context: Any, *classes: str) -> bool:
     """has-class function.

     Return True if all ``classes`` are present in element's class attr.

     """
-    pass
+    if not context.eval_context.get("args_checked"):
+        if not classes:
+            raise ValueError("XPath error: has-class must have at least 1 argument")
+        for c in classes:
+            if not isinstance(c, str):
+                raise ValueError("XPath error: has-class arguments must be strings")
+        context.eval_context["args_checked"] = True
+
+    node_cls = context.context_node.get("class")
+    if node_cls is None:
+        return False
+    node_cls = " " + node_cls + " "
+    node_cls = replace_html5_whitespaces(" ", node_cls)
+    for cls in classes:
+        if " " + cls + " " not in node_cls:
+            return False
+    return True