back to SWE-Agent summary
SWE-Agent: chardet
Pytest Summary for test .
| status | count |
|---|---|
| failed | 375 |
| passed | 1 |
| xfailed | 7 |
| total | 383 |
| collected | 383 |
Failed pytests:
_ude_4.txt-iso-8859-1]
_ude_4.txt-iso-8859-1]
file_name = 'tests/iso-8859-1/_ude_4.txt', encoding = 'iso-8859-1'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
> all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
test.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
byte_str = bytearray(b"Il padre. Ecco, sissignore! Ma un fatto \xe8 come un sacco: vuoto, non si regge. Perch\xe9 si regga, bisog...ntre quella poverina\ncredeva di sacrificarsi per me e per quei due, cucendo anche di notte la roba di Madama Pace!\n")
ignore_threshold = True
def detect_all(byte_str, ignore_threshold=False):
"""
Detect all the possible encodings of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param ignore_threshold: Include encodings that are below
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
detector.close()
> if detector.input_state == InputState.HIGH_BYTE:
E AttributeError: 'UniversalDetector' object has no attribute 'input_state'. Did you mean: '_input_state'?
chardet/__init__.py:65: AttributeError
_ude_6.txt-iso-8859-1]
_ude_6.txt-iso-8859-1]
file_name = 'tests/iso-8859-1/_ude_6.txt', encoding = 'iso-8859-1'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
> all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
test.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
byte_str = bytearray(b'Viel\xe4 ehdit perehty\xe4 Sibeliuksen el\xe4m\xe4\xe4n Ateneumissa, aina 22. maaliskuuta asti.\nMoniaisti...in ja paahtimoihin.\nTapahtumassa kilpaillaan lis\xe4ksi Cup Tasting, Brewers Cup ja Vuoden Barista titteleist\xe4.\n')
ignore_threshold = True
def detect_all(byte_str, ignore_threshold=False):
"""
Detect all the possible encodings of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param ignore_threshold: Include encodings that are below
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
detector.close()
> if detector.input_state == InputState.HIGH_BYTE:
E AttributeError: 'UniversalDetector' object has no attribute 'input_state'. Did you mean: '_input_state'?
chardet/__init__.py:65: AttributeError
_ude_2.txt-iso-8859-1]
_ude_2.txt-iso-8859-1]
file_name = 'tests/iso-8859-1/_ude_2.txt', encoding = 'iso-8859-1'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
> all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
test.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
byte_str = bytearray(b'Le type de visa requis pour entrer en France d\xe9pend \xe0 la fois de la dur\xe9e et des motifs du s\xe9j...urn\xe9e, sportifs disputant un championnat, salari\xe9 d\xe9tach\xe9 dans le cadre d\'une\nprestation de service).\n')
ignore_threshold = True
def detect_all(byte_str, ignore_threshold=False):
"""
Detect all the possible encodings of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param ignore_threshold: Include encodings that are below
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
detector.close()
> if detector.input_state == InputState.HIGH_BYTE:
E AttributeError: 'UniversalDetector' object has no attribute 'input_state'. Did you mean: '_input_state'?
chardet/__init__.py:65: AttributeError
_ude_5.txt-iso-8859-1]
_ude_5.txt-iso-8859-1]
file_name = 'tests/iso-8859-1/_ude_5.txt', encoding = 'iso-8859-1'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
> all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
test.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
byte_str = bytearray(b'Agentes de la Guardia Civil de la Comandancia de Madrid, integrantes del Equipo Mujer Menor, han detenido ...clases a Alumnos de Primaria,\nera tutor de ni\xf1os de 11 a\xf1os, pero daba clases a otros menores de 13 a\xf1os.\n')
ignore_threshold = True
def detect_all(byte_str, ignore_threshold=False):
"""
Detect all the possible encodings of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param ignore_threshold: Include encodings that are below
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
detector.close()
> if detector.input_state == InputState.HIGH_BYTE:
E AttributeError: 'UniversalDetector' object has no attribute 'input_state'. Did you mean: '_input_state'?
chardet/__init__.py:65: AttributeError
_ude_3.txt-iso-8859-1]
_ude_3.txt-iso-8859-1]
file_name = 'tests/iso-8859-1/_ude_3.txt', encoding = 'iso-8859-1'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
> all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
test.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
byte_str = bytearray(b"La commedia non ha atti n\xe9 scene. La rappresentazione sar\xe0 interrotta una prima volta, senza che il ... mettersi in ginocchio e\ninchiodarli. Alle martellate accorrer\xe0 dalla porta dei camerini il Direttore di scena.\n")
ignore_threshold = True
def detect_all(byte_str, ignore_threshold=False):
"""
Detect all the possible encodings of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param ignore_threshold: Include encodings that are below
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
detector.close()
> if detector.input_state == InputState.HIGH_BYTE:
E AttributeError: 'UniversalDetector' object has no attribute 'input_state'. Did you mean: '_input_state'?
chardet/__init__.py:65: AttributeError
_ude_1.txt-iso-8859-1]
_ude_1.txt-iso-8859-1]
file_name = 'tests/iso-8859-1/_ude_1.txt', encoding = 'iso-8859-1'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
> all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
test.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
byte_str = bytearray(b"Nas paginas que em seguida se leem acha-se t\xe3o bem determinada, com tanta eloquencia e t\xe3o profunda ...stra\xeddo que se affasta da sala do festim, e cuja voz se perde pouco a pouco no silencio da distancia e da noute.\n")
ignore_threshold = True
def detect_all(byte_str, ignore_threshold=False):
"""
Detect all the possible encodings of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param ignore_threshold: Include encodings that are below
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
detector.close()
> if detector.input_state == InputState.HIGH_BYTE:
E AttributeError: 'UniversalDetector' object has no attribute 'input_state'. Did you mean: '_input_state'?
chardet/__init__.py:65: AttributeError
nobom-utf32le.txt-utf-32le]
nobom-utf32le.txt-utf-32le]
file_name = 'tests/UTF-32LE/nobom-utf32le.txt', encoding = 'utf-32le'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
> all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
test.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
byte_str = bytearray(b'D\x00\x00\x00a\x00\x00\x00t\x00\x00\x00e\x00\x00\x00T\x00\x00\x00i\x00\x00\x00m\x00\x00\x00e\x00\x00\x00,\...x00\x00\x00.\x00\x00\x000\x00\x00\x008\x00\x00\x003\x00\x00\x005\x00\x00\x003\x00\x00\x00\r\x00\x00\x00\n\x00\x00\x00')
ignore_threshold = True
def detect_all(byte_str, ignore_threshold=False):
"""
Detect all the possible encodings of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param ignore_threshold: Include encodings that are below
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
detector.close()
> if detector.input_state == InputState.HIGH_BYTE:
E AttributeError: 'UniversalDetector' object has no attribute 'input_state'. Did you mean: '_input_state'?
chardet/__init__.py:65: AttributeError
plane1-utf-32le.html-utf-32le]
plane1-utf-32le.html-utf-32le]
file_name = 'tests/UTF-32LE/plane1-utf-32le.html', encoding = 'utf-32le'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
> all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
test.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
byte_str = bytearray(b'<\x00\x00\x00!\x00\x00\x00D\x00\x00\x00O\x00\x00\x00C\x00\x00\x00T\x00\x00\x00Y\x00\x00\x00P\x00\x00\x00E\...x00\x00\x00/\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\r\x00\x00\x00\n\x00\x00\x00')
ignore_threshold = True
def detect_all(byte_str, ignore_threshold=False):
"""
Detect all the possible encodings of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param ignore_threshold: Include encodings that are below
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
detector.close()
> if detector.input_state == InputState.HIGH_BYTE:
E AttributeError: 'UniversalDetector' object has no attribute 'input_state'. Did you mean: '_input_state'?
chardet/__init__.py:65: AttributeError
bom-utf-16-le.srt-utf-16]
bom-utf-16-le.srt-utf-16]
file_name = 'tests/UTF-16/bom-utf-16-le.srt', encoding = 'utf-16'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
> all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
test.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
byte_str = bytearray(b"\xff\xfe1\x00\n\x000\x000\x00:\x000\x000\x00:\x000\x006\x00,\x005\x000\x000\x00 \x00-\x00-\x00>\x00 \x000\... \x00g\x00l\x00o\x00b\x00a\x00l\x00 \x00a\x00w\x00a\x00r\x00e\x00n\x00e\x00s\x00s\x00 \x00d\x00a\x00y\x00\n\x00\n\x00")
ignore_threshold = True
def detect_all(byte_str, ignore_threshold=False):
"""
Detect all the possible encodings of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param ignore_threshold: Include encodings that are below
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
detector.close()
> if detector.input_state == InputState.HIGH_BYTE:
E AttributeError: 'UniversalDetector' object has no attribute 'input_state'. Did you mean: '_input_state'?
chardet/__init__.py:65: AttributeError
bom-utf-16-be.srt-utf-16]
bom-utf-16-be.srt-utf-16]
file_name = 'tests/UTF-16/bom-utf-16-be.srt', encoding = 'utf-16'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
> all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
test.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
byte_str = bytearray(b"\xfe\xff\x001\x00\n\x000\x000\x00:\x000\x000\x00:\x000\x006\x00,\x005\x000\x000\x00 \x00-\x00-\x00>\x00 \x...\x00 \x00g\x00l\x00o\x00b\x00a\x00l\x00 \x00a\x00w\x00a\x00r\x00e\x00n\x00e\x00s\x00s\x00 \x00d\x00a\x00y\x00\n\x00\n")
ignore_threshold = True
def detect_all(byte_str, ignore_threshold=False):
"""
Detect all the possible encodings of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param ignore_threshold: Include encodings that are below
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError(
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
detector.close()
> if detector.input_state == InputState.HIGH_BYTE:
E AttributeError: 'UniversalDetector' object has no attribute 'input_state'. Did you mean: '_input_state'?
chardet/__init__.py:65: AttributeError
janulalife.blogspot.com.xml-iso-8859-5]
janulalife.blogspot.com.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/janulalife.blogspot.com.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
> all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
test.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
byte_str = bytearray(b'\n\n