back to Claude Sonnet 3.5 - Fill-in summary
Claude Sonnet 3.5 - Fill-in: parsel
Pytest Summary for test tests
status | count |
---|---|
failed | 5 |
passed | 10 |
total | 15 |
collected | 15 |
Failed pytests:
test_utils.py::test_shorten[-1-ValueError]
test_utils.py::test_shorten[-1-ValueError]
width = -1, expected = <class 'ValueError'> @mark.parametrize( "width,expected", ( (-1, ValueError), (0, ""), (1, "."), (2, ".."), (3, "..."), (4, "f..."), (5, "fo..."), (6, "foobar"), (7, "foobar"), ), ) def test_shorten(width: int, expected: Union[str, Type[Exception]]) -> None: if isinstance(expected, str): assert shorten("foobar", width) == expected else: > with raises(expected): E Failed: DID NOT RAISE <class 'ValueError'> tests/test_utils.py:26: Failed
test_utils.py::test_shorten[0-]
test_utils.py::test_shorten[0-]
width = 0, expected = '' @mark.parametrize( "width,expected", ( (-1, ValueError), (0, ""), (1, "."), (2, ".."), (3, "..."), (4, "f..."), (5, "fo..."), (6, "foobar"), (7, "foobar"), ), ) def test_shorten(width: int, expected: Union[str, Type[Exception]]) -> None: if isinstance(expected, str): > assert shorten("foobar", width) == expected E AssertionError tests/test_utils.py:24: AssertionError
test_utils.py::test_shorten[1-.]
test_utils.py::test_shorten[1-.]
width = 1, expected = '.' @mark.parametrize( "width,expected", ( (-1, ValueError), (0, ""), (1, "."), (2, ".."), (3, "..."), (4, "f..."), (5, "fo..."), (6, "foobar"), (7, "foobar"), ), ) def test_shorten(width: int, expected: Union[str, Type[Exception]]) -> None: if isinstance(expected, str): > assert shorten("foobar", width) == expected E AssertionError tests/test_utils.py:24: AssertionError
test_utils.py::test_shorten[2-..]
test_utils.py::test_shorten[2-..]
width = 2, expected = '..' @mark.parametrize( "width,expected", ( (-1, ValueError), (0, ""), (1, "."), (2, ".."), (3, "..."), (4, "f..."), (5, "fo..."), (6, "foobar"), (7, "foobar"), ), ) def test_shorten(width: int, expected: Union[str, Type[Exception]]) -> None: if isinstance(expected, str): > assert shorten("foobar", width) == expected E AssertionError tests/test_utils.py:24: AssertionError
test_utils.py::test_extract_regex[^.*$-&quot;sometext&quot; &amp; &quot;moretext&quot;-True-expected4]
test_utils.py::test_extract_regex[^.*$-&quot;sometext&quot; &amp; &quot;moretext&quot;-True-expected4]
regex = '^.*$', text = '&quot;sometext&quot; &amp; &quot;moretext&quot;' replace_entities = True, expected = ['"sometext" &amp; "moretext"'] @mark.parametrize( "regex, text, replace_entities, expected", ( [ r"(?P<month>\w+)\s*(?P<day>\d+)\s*\,?\s*(?P<year>\d+)", "October 25, 2019", True, ["October", "25", "2019"], ], [ r"(?P<month>\w+)\s*(?P<day>\d+)\s*\,?\s*(?P<year>\d+)", "October 25 2019", True, ["October", "25", "2019"], ], [ r"(?P<extract>\w+)\s*(?P<day>\d+)\s*\,?\s*(?P<year>\d+)", "October 25 2019", True, ["October"], ], [ r"\w+\s*\d+\s*\,?\s*\d+", "October 25 2019", True, ["October 25 2019"], ], [ r"^.*$", "&quot;sometext&quot; &amp; &quot;moretext&quot;", True, ['"sometext" &amp; "moretext"'], ], [ r"^.*$", "&quot;sometext&quot; &amp; &quot;moretext&quot;", False, ["&quot;sometext&quot; &amp; &quot;moretext&quot;"], ], ), ) def test_extract_regex( regex: Union[str, Pattern[str]], text: str, replace_entities: bool, expected: List[str], ) -> None: > assert extract_regex(regex, text, replace_entities) == expected E AssertionError tests/test_utils.py:77: AssertionError
Patch diff
diff --git a/parsel/csstranslator.py b/parsel/csstranslator.py
index 2bf8cc7..dbf9a2e 100644
--- a/parsel/csstranslator.py
+++ b/parsel/csstranslator.py
@@ -44,17 +44,30 @@ class TranslatorMixin:
"""
Dispatch method that transforms XPath to support pseudo-element
"""
- pass
+ if isinstance(pseudo_element, FunctionalPseudoElement):
+ if pseudo_element.name == 'attr':
+ return self.xpath_attr_functional_pseudo_element(xpath, pseudo_element)
+ elif isinstance(pseudo_element, PseudoElement):
+ if pseudo_element.name == 'text':
+ return self.xpath_text_simple_pseudo_element(xpath)
+ return xpath
def xpath_attr_functional_pseudo_element(self, xpath: OriginalXPathExpr,
function: FunctionalPseudoElement) ->XPathExpr:
"""Support selecting attribute values using ::attr() pseudo-element"""
- pass
+ if not function.arguments:
+ raise ExpressionError("The ::attr() pseudo-element requires an argument.")
+ attribute = function.arguments[0]
+ xpath = XPathExpr(xpath.path, xpath.element)
+ xpath.attribute = attribute
+ return xpath
def xpath_text_simple_pseudo_element(self, xpath: OriginalXPathExpr
) ->XPathExpr:
"""Support selecting text nodes using ::text pseudo-element"""
- pass
+ xpath = XPathExpr(xpath.path, xpath.element)
+ xpath.textnode = True
+ return xpath
class GenericTranslator(TranslatorMixin, OriginalGenericTranslator):
@@ -68,6 +81,7 @@ class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
_translator = HTMLTranslator()
+@lru_cache(maxsize=256)
def css2xpath(query: str) ->str:
"""Return translated XPath version of a given CSS query"""
- pass
+ return _translator.css_to_xpath(query)
diff --git a/parsel/selector.py b/parsel/selector.py
index 6aa73da..b1288f3 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -54,7 +54,15 @@ def create_root_node(text: str, parser_cls: Type[_ParserType], base_url:
Optional[str]=None, huge_tree: bool=LXML_SUPPORTS_HUGE_TREE, body:
bytes=b'', encoding: str='utf8') ->etree._Element:
"""Create root node for text using given parser class."""
- pass
+ parser = parser_cls(recover=True, encoding=encoding)
+ if LXML_SUPPORTS_HUGE_TREE:
+ parser.set_option(etree.XML_PARSE_HUGE, huge_tree)
+ if body:
+ text = body.decode(encoding)
+ root = etree.fromstring(text.encode('utf-8'), parser=parser, base_url=base_url)
+ if root is None:
+ root = etree.fromstring('<html/>', parser=parser, base_url=base_url)
+ return root
class SelectorList(List[_SelectorType]):
@@ -96,7 +104,7 @@ class SelectorList(List[_SelectorType]):
selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
"""
- pass
+ return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self]))
def xpath(self, xpath: str, namespaces: Optional[Mapping[str, str]]=
None, **kwargs: Any) ->'SelectorList[_SelectorType]':
@@ -116,7 +124,7 @@ class SelectorList(List[_SelectorType]):
selector.xpath('//a[href=$url]', url="http://www.example.com")
"""
- pass
+ return self.__class__(flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]))
def css(self, query: str) ->'SelectorList[_SelectorType]':
"""
@@ -125,7 +133,7 @@ class SelectorList(List[_SelectorType]):
``query`` is the same argument as the one in :meth:`Selector.css`
"""
- pass
+ return self.__class__(flatten([x.css(query) for x in self]))
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool=True
) ->List[str]:
@@ -138,7 +146,7 @@ class SelectorList(List[_SelectorType]):
Passing ``replace_entities`` as ``False`` switches off these
replacements.
"""
- pass
+ return flatten([x.re(regex, replace_entities=replace_entities) for x in self])
def re_first(self, regex: Union[str, Pattern[str]], default: Optional[
str]=None, replace_entities: bool=True) ->Optional[str]:
@@ -153,14 +161,18 @@ class SelectorList(List[_SelectorType]):
Passing ``replace_entities`` as ``False`` switches off these
replacements.
"""
- pass
+ for el in iflatten(self):
+ m = el.re(regex, replace_entities=replace_entities)
+ if m:
+ return m[0]
+ return default
def getall(self) ->List[str]:
"""
Call the ``.get()`` method for each element is this list and return
their results flattened, as a list of strings.
"""
- pass
+ return [x.get() for x in self]
extract = getall
def get(self, default: Optional[str]=None) ->Any:
@@ -168,7 +180,9 @@ class SelectorList(List[_SelectorType]):
Return the result of ``.get()`` for the first element in this list.
If the list is empty, return the default value.
"""
- pass
+ for x in self:
+ return x.get()
+ return default
extract_first = get
@property
@@ -176,19 +190,23 @@ class SelectorList(List[_SelectorType]):
"""Return the attributes dictionary for the first element.
If the list is empty, return an empty dict.
"""
- pass
+ for x in self:
+ return x.attrib
+ return {}
def remove(self) ->None:
"""
Remove matched nodes from the parent for each element in this list.
"""
- pass
+ for x in self:
+ x.remove()
def drop(self) ->None:
"""
Drop matched nodes from the parent for each element in this list.
"""
- pass
+ for x in self:
+ x.drop()
_NOT_SET = object()
@@ -295,7 +313,11 @@ class Selector:
selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict))
"""
- pass
+ result = jmespath.search(query, self.get(), **kwargs)
+ if isinstance(result, list):
+ return self.selectorlist_cls(Selector(root=r, _expr=query) for r in result)
+ else:
+ return self.selectorlist_cls([Selector(root=result, _expr=query)])
def xpath(self: _SelectorType, query: str, namespaces: Optional[Mapping
[str, str]]=None, **kwargs: Any) ->SelectorList[_SelectorType]:
@@ -316,7 +338,25 @@ class Selector:
selector.xpath('//a[href=$url]', url="http://www.example.com")
"""
- pass
+ try:
+ xpathev = self.root.xpath
+ except AttributeError:
+ return self.selectorlist_cls([])
+
+ nsp = dict(self.namespaces)
+ if namespaces is not None:
+ nsp.update(namespaces)
+ try:
+ result = xpathev(query, namespaces=nsp, smart_strings=self._lxml_smart_strings, **kwargs)
+ except etree.XPathError as exc:
+ msg = f"XPath error: {exc} in {query}"
+ raise ValueError(msg)
+
+ if type(result) is not list:
+ result = [result]
+
+ return self.selectorlist_cls(self.__class__(root=x, _expr=query)
+ for x in result)
def css(self: _SelectorType, query: str) ->SelectorList[_SelectorType]:
"""
@@ -329,7 +369,7 @@ class Selector:
.. _cssselect: https://pypi.python.org/pypi/cssselect/
"""
- pass
+ return self.xpath(_ctgroup[self.type]['_csstranslator'].css_to_xpath(query))
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool=True
) ->List[str]:
@@ -345,7 +385,7 @@ class Selector:
Passing ``replace_entities`` as ``False`` switches off these
replacements.
"""
- pass
+ return extract_regex(regex, self.get(), replace_entities=replace_entities)
def re_first(self, regex: Union[str, Pattern[str]], default: Optional[
str]=None, replace_entities: bool=True) ->Optional[str]:
@@ -359,7 +399,7 @@ class Selector:
Passing ``replace_entities`` as ``False`` switches off these
replacements.
"""
- pass
+ return next(iflatten(self.re(regex, replace_entities)), default)
def get(self) ->Any:
"""
@@ -368,14 +408,25 @@ class Selector:
For HTML and XML, the result is always a string, and percent-encoded
content is unquoted.
"""
- pass
+ try:
+ return etree.tostring(self.root,
+ method=_ctgroup[self.type]['_tostring_method'],
+ encoding='unicode',
+ with_tail=False)
+ except (AttributeError, TypeError):
+ if self.root is True:
+ return '1'
+ elif self.root is False:
+ return '0'
+ else:
+ return str(self.root)
extract = get
def getall(self) ->List[str]:
"""
Serialize and return the matched node in a 1-element list of strings.
"""
- pass
+ return [self.get()]
def register_namespace(self, prefix: str, uri: str) ->None:
"""
@@ -383,26 +434,37 @@ class Selector:
Without registering namespaces you can't select or extract data from
non-standard namespaces. See :ref:`selector-examples-xml`.
"""
- pass
+ self.namespaces[prefix] = uri
def remove_namespaces(self) ->None:
"""
Remove all namespaces, allowing to traverse the document using
namespace-less xpaths. See :ref:`removing-namespaces`.
"""
- pass
+ for el in self.root.iter('*'):
+ if el.tag.startswith('{'):
+ el.tag = el.tag.split('}', 1)[1]
+ for an in list(el.attrib.keys()):
+ if an.startswith('{'):
+ el.attrib[an.split('}', 1)[1]] = el.attrib.pop(an)
def remove(self) ->None:
"""
Remove matched nodes from the parent element.
"""
- pass
+ parent = self.root.getparent()
+ if parent is None:
+ raise CannotRemoveElementWithoutRoot("Element has no parent")
+ parent.remove(self.root)
def drop(self) ->None:
"""
Drop matched nodes from the parent element.
"""
- pass
+ parent = self.root.getparent()
+ if parent is None:
+ raise CannotDropElementWithoutParent("Element has no parent")
+ parent.drop(self.root)
@property
def attrib(self) ->Dict[str, str]:
diff --git a/parsel/utils.py b/parsel/utils.py
index 361f99c..36c006f 100644
--- a/parsel/utils.py
+++ b/parsel/utils.py
@@ -18,13 +18,23 @@ def flatten(x: Iterable[Any]) ->List[Any]:
>>> flatten(["foo", ["baz", 42], "bar"])
['foo', 'baz', 42, 'bar']
"""
- pass
+ result = []
+ for el in x:
+ if _is_listlike(el):
+ result.extend(flatten(el))
+ else:
+ result.append(el)
+ return result
def iflatten(x: Iterable[Any]) ->Iterator[Any]:
"""iflatten(sequence) -> Iterator
Similar to ``.flatten()``, but returns iterator instead"""
- pass
+ for el in x:
+ if _is_listlike(el):
+ yield from iflatten(el)
+ else:
+ yield el
def _is_listlike(x: Any) ->bool:
@@ -48,7 +58,10 @@ def _is_listlike(x: Any) ->bool:
>>> _is_listlike(range(5))
True
"""
- pass
+ return (
+ hasattr(x, '__iter__') and
+ not isinstance(x, (str, bytes))
+ )
def extract_regex(regex: Union[str, Pattern[str]], text: str,
@@ -58,9 +71,25 @@ def extract_regex(regex: Union[str, Pattern[str]], text: str,
* if the regex contains multiple numbered groups, all those will be returned (flattened)
* if the regex doesn't contain any group the entire regex matching is returned
"""
- pass
+ if isinstance(regex, str):
+ regex = re.compile(regex)
+
+ if replace_entities:
+ text = w3lib_replace_entities(text)
+
+ if 'extract' in regex.groupindex:
+ extracted = [match.group('extract') for match in regex.finditer(text)]
+ elif regex.groups > 0:
+ extracted = [cast(Match[str], match).groups() for match in regex.finditer(text)]
+ extracted = flatten(extracted)
+ else:
+ extracted = regex.findall(text)
+
+ return [str(s) for s in extracted]
def shorten(text: str, width: int, suffix: str='...') ->str:
"""Truncate the given text to fit in the given width."""
- pass
+ if len(text) <= width:
+ return text
+ return text[:width - len(suffix)] + suffix
diff --git a/parsel/xpathfuncs.py b/parsel/xpathfuncs.py
index 55d8f4a..0de3274 100644
--- a/parsel/xpathfuncs.py
+++ b/parsel/xpathfuncs.py
@@ -20,7 +20,12 @@ def set_xpathfunc(fname: str, func: Optional[Callable]) ->None:
.. _`in lxml documentation`: https://lxml.de/extensions.html#xpath-extension-functions
"""
- pass
+ ns = etree.FunctionNamespace(None)
+ if func is None:
+ if fname in ns:
+ del ns[fname]
+ else:
+ ns[fname] = func
def has_class(context: Any, *classes: str) ->bool:
@@ -29,4 +34,7 @@ def has_class(context: Any, *classes: str) ->bool:
Return True if all ``classes`` are present in element's class attr.
"""
- pass
+ if not context.context_node.get('class'):
+ return False
+ node_classes = set(replace_html5_whitespaces(' ', context.context_node.get('class')).split())
+ return all(cls in node_classes for cls in classes)