back to OpenHands summary
OpenHands: chardet
Pytest Summary for test .
status |
count |
failed |
367 |
xfailed |
7 |
passed |
9 |
total |
383 |
collected |
383 |
Failed pytests:
plane1-utf-32be.html-utf-32be]
plane1-utf-32be.html-utf-32be]
file_name = 'tests/UTF-32BE/plane1-utf-32be.html', encoding = 'utf-32be'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
nobom-utf32be.txt-utf-32be]
nobom-utf32be.txt-utf-32be]
file_name = 'tests/UTF-32BE/nobom-utf32be.txt', encoding = 'utf-32be'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
else:
diff = ""
encoding_match = True
all_encodings = [result]
> assert encoding_match, (
f"Expected {encoding}, but got {result} for {file_name}. First 20 "
f"lines of character differences: \n{diff}\n"
f"All encodings: {pformat(all_encodings)}"
)
E AssertionError: Expected utf-32be, but got {'encoding': 'ascii', 'confidence': 1.0, 'language': ''} for tests/UTF-32BE/nobom-utf32be.txt. First 20 lines of character differences:
E - DateTime,Bid,Ask 07/19/2015 21:00:15.469,1.083,1.08332 07/19/2015 21:00:16.949,1.08311,1.08332
E - 07/19/2015 21:00:16.955,1.08311,1.08338 07/19/2015 21:00:17.120,1.08313,1.08338 07/19/2015
E - 21:00:17.166,1.08313,1.0834 07/19/2015 21:00:17.205,1.08313,1.08339 07/19/2015
E - 21:00:17.218,1.08312,1.08339 07/19/2015 21:00:17.469,1.08316,1.08339 07/19/2015
E - 21:00:17.476,1.08316,1.08347 07/19/2015 21:00:17.505,1.08316,1.08344 07/19/2015
E - 21:00:17.584,1.08316,1.08348 07/19/2015 21:00:18.905,1.08316,1.08351 07/19/2015
E - 21:00:19.005,1.08336,1.08351 07/19/2015 21:00:19.011,1.08336,1.08403 07/19/2015
E - 21:00:19.019,1.08334,1.08403 07/19/2015 21:00:19.025,1.08334,1.08406 07/19/2015
E - 21:00:20.310,1.08307,1.08353 07/19/2015 21:00:20.317,1.08278,1.08353 07/19/2015
E - 21:00:20.319,1.08307,1.08353
E + D a t e T i m e , B i d , A s k
E + 0 7 / 1 9 / 2 0 1 5 2 1 : 0 0 : 1 5 . 4 6 9 , 1
E + . 0 8 3 , 1 . 0 8 3 3 2 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 6 . 9 4 9 , 1 . 0 8 3 1 1 , 1 . 0 8
E + 3 3 2 0 7 / 1 9 / 2 0 1 5 2 1 : 0 0 : 1 6 .
E + 9 5 5 , 1 . 0 8 3 1 1 , 1 . 0 8 3 3 8
E + 0 7 / 1 9 / 2 0 1 5 2 1 : 0 0 : 1 7 . 1 2 0 , 1
E + . 0 8 3 1 3 , 1 . 0 8 3 3 8
E + 0 7 / 1 9 / 2 0 1 5 2 1 : 0 0 : 1 7 . 1 6 6 , 1
E + . 0 8 3 1 3 , 1 . 0 8 3 4
E
E All encodings: [{'confidence': 1.0, 'encoding': 'ascii', 'language': ''}]
E assert False
test.py:110: AssertionError
_ude_2.txt-iso-8859-9]
_ude_2.txt-iso-8859-9]
file_name = 'tests/iso-8859-9-turkish/_ude_2.txt', encoding = 'iso-8859-9'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
wikitop_tr_ISO-8859-9.txt-iso-8859-9]
wikitop_tr_ISO-8859-9.txt-iso-8859-9]
file_name = 'tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'
encoding = 'iso-8859-9'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
divxplanet.com.xml-iso-8859-9]
divxplanet.com.xml-iso-8859-9]
file_name = 'tests/iso-8859-9-turkish/divxplanet.com.xml'
encoding = 'iso-8859-9'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
subtitle.srt-iso-8859-9]
subtitle.srt-iso-8859-9]
file_name = 'tests/iso-8859-9-turkish/subtitle.srt', encoding = 'iso-8859-9'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-iso-8859-9]
_ude_1.txt-iso-8859-9]
file_name = 'tests/iso-8859-9-turkish/_ude_1.txt', encoding = 'iso-8859-9'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.9.xml-ibm866]
file_name = 'tests/IBM866/forum.template-toolkit.ru.9.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-ibm866]
_ude_1.txt-ibm866]
file_name = 'tests/IBM866/_ude_1.txt', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
money.rin.ru.xml-ibm866]
money.rin.ru.xml-ibm866]
file_name = 'tests/IBM866/money.rin.ru.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
music.peeps.ru.xml-ibm866]
music.peeps.ru.xml-ibm866]
file_name = 'tests/IBM866/music.peeps.ru.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.6.xml-ibm866]
file_name = 'tests/IBM866/forum.template-toolkit.ru.6.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.4.xml-ibm866]
file_name = 'tests/IBM866/forum.template-toolkit.ru.4.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
intertat.ru.xml-ibm866]
intertat.ru.xml-ibm866]
file_name = 'tests/IBM866/intertat.ru.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
janulalife.blogspot.com.xml-ibm866]
janulalife.blogspot.com.xml-ibm866]
file_name = 'tests/IBM866/janulalife.blogspot.com.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.1.xml-ibm866]
file_name = 'tests/IBM866/forum.template-toolkit.ru.1.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.mlmaster.com.xml-ibm866]
blog.mlmaster.com.xml-ibm866]
file_name = 'tests/IBM866/blog.mlmaster.com.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
newsru.com.xml-ibm866]
newsru.com.xml-ibm866]
file_name = 'tests/IBM866/newsru.com.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
greek.ru.xml-ibm866]
greek.ru.xml-ibm866]
file_name = 'tests/IBM866/greek.ru.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
kapranoff.ru.xml-ibm866]
kapranoff.ru.xml-ibm866]
file_name = 'tests/IBM866/kapranoff.ru.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aif.ru.health.xml-ibm866]
aif.ru.health.xml-ibm866]
file_name = 'tests/IBM866/aif.ru.health.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.8.xml-ibm866]
file_name = 'tests/IBM866/forum.template-toolkit.ru.8.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aug32.hole.ru.xml-ibm866]
aug32.hole.ru.xml-ibm866]
file_name = 'tests/IBM866/aug32.hole.ru.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aviaport.ru.xml-ibm866]
aviaport.ru.xml-ibm866]
file_name = 'tests/IBM866/aviaport.ru.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
susu.ac.ru.xml-ibm866]
susu.ac.ru.xml-ibm866]
file_name = 'tests/IBM866/susu.ac.ru.xml', encoding = 'ibm866'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
hardsoft.at.webry.info.xml-cp932]
hardsoft.at.webry.info.xml-cp932]
file_name = 'tests/CP932/hardsoft.at.webry.info.xml', encoding = 'cp932'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
y-moto.com.xml-cp932]
y-moto.com.xml-cp932]
file_name = 'tests/CP932/y-moto.com.xml', encoding = 'cp932'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
www2.chuo-u.ac.jp-suishin.xml-cp932]
www2.chuo-u.ac.jp-suishin.xml-cp932]
file_name = 'tests/CP932/www2.chuo-u.ac.jp-suishin.xml', encoding = 'cp932'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
balatonblog.typepad.com.xml-utf-8]
balatonblog.typepad.com.xml-utf-8]
file_name = 'tests/utf-8/balatonblog.typepad.com.xml', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_2.txt-utf-8]
_ude_2.txt-utf-8]
file_name = 'tests/utf-8/_ude_2.txt', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_greek.txt-utf-8]
_ude_greek.txt-utf-8]
file_name = 'tests/utf-8/_ude_greek.txt', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_5.txt-utf-8]
_ude_5.txt-utf-8]
file_name = 'tests/utf-8/_ude_5.txt', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_he1.txt-utf-8]
_ude_he1.txt-utf-8]
file_name = 'tests/utf-8/_ude_he1.txt', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_mozilla_bug426271_text-utf-8.html-utf-8]
_mozilla_bug426271_text-utf-8.html-utf-8]
file_name = 'tests/utf-8/_mozilla_bug426271_text-utf-8.html', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_he3.txt-utf-8]
_ude_he3.txt-utf-8]
file_name = 'tests/utf-8/_ude_he3.txt', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_chromium_UTF-8_with_no_encoding_specified.html-utf-8]
_chromium_UTF-8_with_no_encoding_specified.html-utf-8]
file_name = 'tests/utf-8/_chromium_UTF-8_with_no_encoding_specified.html'
encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_mozilla_bug306272_text.html-utf-8]
_mozilla_bug306272_text.html-utf-8]
file_name = 'tests/utf-8/_mozilla_bug306272_text.html', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
weblabor.hu.2.xml-utf-8]
weblabor.hu.2.xml-utf-8]
file_name = 'tests/utf-8/weblabor.hu.2.xml', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-utf-8]
_ude_1.txt-utf-8]
file_name = 'tests/utf-8/_ude_1.txt', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
boobooo.blogspot.com.xml-utf-8]
boobooo.blogspot.com.xml-utf-8]
file_name = 'tests/utf-8/boobooo.blogspot.com.xml', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
weblabor.hu.xml-utf-8]
weblabor.hu.xml-utf-8]
file_name = 'tests/utf-8/weblabor.hu.xml', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
anitabee.blogspot.com.xml-utf-8]
anitabee.blogspot.com.xml-utf-8]
file_name = 'tests/utf-8/anitabee.blogspot.com.xml', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
pihgy.hu.xml-utf-8]
pihgy.hu.xml-utf-8]
file_name = 'tests/utf-8/pihgy.hu.xml', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_3.txt-utf-8]
_ude_3.txt-utf-8]
file_name = 'tests/utf-8/_ude_3.txt', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
linuxbox.hu.xml-utf-8]
linuxbox.hu.xml-utf-8]
file_name = 'tests/utf-8/linuxbox.hu.xml', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_russian.txt-utf-8]
_ude_russian.txt-utf-8]
file_name = 'tests/utf-8/_ude_russian.txt', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_he2.txt-utf-8]
_ude_he2.txt-utf-8]
file_name = 'tests/utf-8/_ude_he2.txt', encoding = 'utf-8'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_iso1.txt-iso-2022-kr]
_ude_iso1.txt-iso-2022-kr]
file_name = 'tests/iso-2022-kr/_ude_iso1.txt', encoding = 'iso-2022-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:137: in feed
self._esc_charset_prober = EscCharSetProber()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self =
lang_filter = None
def __init__(self, lang_filter=None):
super().__init__(lang_filter=lang_filter)
self.coding_sm = []
> if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
E TypeError: unsupported operand type(s) for &: 'NoneType' and 'int'
chardet/escprober.py:16: TypeError
_ude_iso2.txt-iso-2022-kr]
_ude_iso2.txt-iso-2022-kr]
file_name = 'tests/iso-2022-kr/_ude_iso2.txt', encoding = 'iso-2022-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:137: in feed
self._esc_charset_prober = EscCharSetProber()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self =
lang_filter = None
def __init__(self, lang_filter=None):
super().__init__(lang_filter=lang_filter)
self.coding_sm = []
> if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
E TypeError: unsupported operand type(s) for &: 'NoneType' and 'int'
chardet/escprober.py:16: TypeError
contents-factory.com.xml-euc-jp]
contents-factory.com.xml-euc-jp]
file_name = 'tests/EUC-JP/contents-factory.com.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
artifact-jp.com.xml-euc-jp]
artifact-jp.com.xml-euc-jp]
file_name = 'tests/EUC-JP/artifact-jp.com.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
siesta.co.jp.aozora.xml-euc-jp]
siesta.co.jp.aozora.xml-euc-jp]
file_name = 'tests/EUC-JP/siesta.co.jp.aozora.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
tls.org.xml-euc-jp]
tls.org.xml-euc-jp]
file_name = 'tests/EUC-JP/tls.org.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_mozilla_bug431054_text.html-euc-jp]
_mozilla_bug431054_text.html-euc-jp]
file_name = 'tests/EUC-JP/_mozilla_bug431054_text.html', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
azoz.org.xml-euc-jp]
azoz.org.xml-euc-jp]
file_name = 'tests/EUC-JP/azoz.org.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
atom.ycf.nanet.co.jp.xml-euc-jp]
atom.ycf.nanet.co.jp.xml-euc-jp]
file_name = 'tests/EUC-JP/atom.ycf.nanet.co.jp.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bphrs.net.xml-euc-jp]
bphrs.net.xml-euc-jp]
file_name = 'tests/EUC-JP/bphrs.net.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
ch.kitaguni.tv.xml-euc-jp]
ch.kitaguni.tv.xml-euc-jp]
file_name = 'tests/EUC-JP/ch.kitaguni.tv.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
rdf.ycf.nanet.co.jp.xml-euc-jp]
rdf.ycf.nanet.co.jp.xml-euc-jp]
file_name = 'tests/EUC-JP/rdf.ycf.nanet.co.jp.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
manana.moo.jp.xml-euc-jp]
manana.moo.jp.xml-euc-jp]
file_name = 'tests/EUC-JP/manana.moo.jp.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
furusatonoeki.cutegirl.jp.xml-euc-jp]
furusatonoeki.cutegirl.jp.xml-euc-jp]
file_name = 'tests/EUC-JP/furusatonoeki.cutegirl.jp.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
overcube.com.xml-euc-jp]
overcube.com.xml-euc-jp]
file_name = 'tests/EUC-JP/overcube.com.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
pinkupa.com.xml-euc-jp]
pinkupa.com.xml-euc-jp]
file_name = 'tests/EUC-JP/pinkupa.com.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
mimizun.com.xml-euc-jp]
mimizun.com.xml-euc-jp]
file_name = 'tests/EUC-JP/mimizun.com.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
club.h14m.org.xml-euc-jp]
club.h14m.org.xml-euc-jp]
file_name = 'tests/EUC-JP/club.h14m.org.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aristrist.s57.xrea.com.xml-euc-jp]
aristrist.s57.xrea.com.xml-euc-jp]
file_name = 'tests/EUC-JP/aristrist.s57.xrea.com.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
azito.under.jp.xml-euc-jp]
azito.under.jp.xml-euc-jp]
file_name = 'tests/EUC-JP/azito.under.jp.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_mozilla_bug620106_text.html-euc-jp]
_mozilla_bug620106_text.html-euc-jp]
file_name = 'tests/EUC-JP/_mozilla_bug620106_text.html', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
yukiboh.moo.jp.xml-euc-jp]
yukiboh.moo.jp.xml-euc-jp]
file_name = 'tests/EUC-JP/yukiboh.moo.jp.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.kabu-navi.com.atom.xml-euc-jp]
blog.kabu-navi.com.atom.xml-euc-jp]
file_name = 'tests/EUC-JP/blog.kabu-navi.com.atom.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
misuzilla.org.xml-euc-jp]
misuzilla.org.xml-euc-jp]
file_name = 'tests/EUC-JP/misuzilla.org.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
overcube.com.atom.xml-euc-jp]
overcube.com.atom.xml-euc-jp]
file_name = 'tests/EUC-JP/overcube.com.atom.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
arclamp.jp.xml-euc-jp]
arclamp.jp.xml-euc-jp]
file_name = 'tests/EUC-JP/arclamp.jp.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aivy.co.jp.xml-euc-jp]
aivy.co.jp.xml-euc-jp]
file_name = 'tests/EUC-JP/aivy.co.jp.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-euc-jp]
_ude_1.txt-euc-jp]
file_name = 'tests/EUC-JP/_ude_1.txt', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_mozilla_bug426271_text-euc-jp.html-euc-jp]
_mozilla_bug426271_text-euc-jp.html-euc-jp]
file_name = 'tests/EUC-JP/_mozilla_bug426271_text-euc-jp.html'
encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
akaname.main.jp.xml-euc-jp]
akaname.main.jp.xml-euc-jp]
file_name = 'tests/EUC-JP/akaname.main.jp.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.kabu-navi.com.xml-euc-jp]
blog.kabu-navi.com.xml-euc-jp]
file_name = 'tests/EUC-JP/blog.kabu-navi.com.xml', encoding = 'euc-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
informator.org.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/informator.org.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bpm.cult.bg.9.xml-windows-1251]
bpm.cult.bg.9.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/bpm.cult.bg.9.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
debian.gabrovo.com.news.xml-windows-1251]
debian.gabrovo.com.news.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/debian.gabrovo.com.news.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
linux-bg.org.xml-windows-1251]
linux-bg.org.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/linux-bg.org.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
debian.gabrovo.com.xml-windows-1251]
debian.gabrovo.com.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/debian.gabrovo.com.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
ecloga.cult.bg.xml-windows-1251]
ecloga.cult.bg.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/ecloga.cult.bg.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bpm.cult.bg.4.xml-windows-1251]
bpm.cult.bg.4.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/bpm.cult.bg.4.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
doncho.net.comments.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/doncho.net.comments.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
doncho.net.xml-windows-1251]
doncho.net.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/doncho.net.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bpm.cult.bg.medusa.4.xml-windows-1251]
bpm.cult.bg.medusa.4.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/bpm.cult.bg.medusa.4.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
ide.li.xml-windows-1251]
ide.li.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/ide.li.xml', encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bpm.cult.bg.3.xml-windows-1251]
bpm.cult.bg.3.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/bpm.cult.bg.3.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bbc.co.uk.popshow.xml-windows-1251]
bbc.co.uk.popshow.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/bbc.co.uk.popshow.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bpm.cult.bg.xml-windows-1251]
bpm.cult.bg.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/bpm.cult.bg.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
rinennor.org.xml-windows-1251]
rinennor.org.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/rinennor.org.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bpm.cult.bg.2.xml-windows-1251]
bpm.cult.bg.2.xml-windows-1251]
file_name = 'tests/windows-1251-bulgarian/bpm.cult.bg.2.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
nobom-utf32le.txt-utf-32le]
nobom-utf32le.txt-utf-32le]
file_name = 'tests/UTF-32LE/nobom-utf32le.txt', encoding = 'utf-32le'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
else:
diff = ""
encoding_match = True
all_encodings = [result]
> assert encoding_match, (
f"Expected {encoding}, but got {result} for {file_name}. First 20 "
f"lines of character differences: \n{diff}\n"
f"All encodings: {pformat(all_encodings)}"
)
E AssertionError: Expected utf-32le, but got {'encoding': 'ascii', 'confidence': 1.0, 'language': ''} for tests/UTF-32LE/nobom-utf32le.txt. First 20 lines of character differences:
E - DateTime,Bid,Ask 07/19/2015 21:00:15.469,1.083,1.08332 07/19/2015 21:00:16.949,1.08311,1.08332
E - 07/19/2015 21:00:16.955,1.08311,1.08338 07/19/2015 21:00:17.120,1.08313,1.08338 07/19/2015
E - 21:00:17.166,1.08313,1.0834 07/19/2015 21:00:17.205,1.08313,1.08339 07/19/2015
E - 21:00:17.218,1.08312,1.08339 07/19/2015 21:00:17.469,1.08316,1.08339 07/19/2015
E - 21:00:17.476,1.08316,1.08347 07/19/2015 21:00:17.505,1.08316,1.08344 07/19/2015
E - 21:00:17.584,1.08316,1.08348 07/19/2015 21:00:18.905,1.08316,1.08351 07/19/2015
E - 21:00:19.005,1.08336,1.08351 07/19/2015 21:00:19.011,1.08336,1.08403 07/19/2015
E - 21:00:19.019,1.08334,1.08403 07/19/2015 21:00:19.025,1.08334,1.08406 07/19/2015
E - 21:00:20.310,1.08307,1.08353 07/19/2015 21:00:20.317,1.08278,1.08353 07/19/2015
E - 21:00:20.319,1.08307,1.08353
E + D a t e T i m e , B i d , A s k
E + 0 7 / 1 9 / 2 0 1 5 2 1 : 0 0 : 1 5 . 4 6 9 , 1
E + . 0 8 3 , 1 . 0 8 3 3 2 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 6 . 9 4 9 , 1 . 0 8 3 1 1 , 1 . 0 8
E + 3 3 2 0 7 / 1 9 / 2 0 1 5 2 1 : 0 0 : 1 6 .
E + 9 5 5 , 1 . 0 8 3 1 1 , 1 . 0 8 3 3 8
E + 0 7 / 1 9 / 2 0 1 5 2 1 : 0 0 : 1 7 . 1 2 0 , 1
E + . 0 8 3 1 3 , 1 . 0 8 3 3 8
E + 0 7 / 1 9 / 2 0 1 5 2 1 : 0 0 : 1 7 . 1 6 6 , 1
E + . 0 8 3 1 3 , 1 . 0 8 3 4
E
E All encodings: [{'confidence': 1.0, 'encoding': 'ascii', 'language': ''}]
E assert False
test.py:110: AssertionError
plane1-utf-32le.html-utf-32le]
plane1-utf-32le.html-utf-32le]
file_name = 'tests/UTF-32LE/plane1-utf-32le.html', encoding = 'utf-32le'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
pcplus.co.il.xml-windows-1255]
pcplus.co.il.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/pcplus.co.il.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
sharks.co.il.xml-windows-1255]
sharks.co.il.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/sharks.co.il.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
whatsup.org.il.xml-windows-1255]
whatsup.org.il.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/whatsup.org.il.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
info.org.il.xml-windows-1255]
info.org.il.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/info.org.il.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
notes.co.il.50.xml-windows-1255]
notes.co.il.50.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/notes.co.il.50.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
notes.co.il.8.xml-windows-1255]
notes.co.il.8.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/notes.co.il.8.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
law.co.il.xml-windows-1255]
law.co.il.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/law.co.il.xml', encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
neviim.net.xml-windows-1255]
neviim.net.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/neviim.net.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
notes.co.il.6.xml-windows-1255]
notes.co.il.6.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/notes.co.il.6.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
maakav.org.xml-windows-1255]
maakav.org.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/maakav.org.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
exego.net.2.xml-windows-1255]
exego.net.2.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/exego.net.2.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_chromium_ISO-8859-8_with_no_encoding_specified.html-windows-1255]
_chromium_ISO-8859-8_with_no_encoding_specified.html-windows-1255]
file_name = 'tests/windows-1255-hebrew/_chromium_ISO-8859-8_with_no_encoding_specified.html'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
hagada.org.il.xml-windows-1255]
hagada.org.il.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/hagada.org.il.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
hevra.org.il.xml-windows-1255]
hevra.org.il.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/hevra.org.il.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_he2.txt-windows-1255]
_ude_he2.txt-windows-1255]
file_name = 'tests/windows-1255-hebrew/_ude_he2.txt', encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
carshops.co.il.xml-windows-1255]
carshops.co.il.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/carshops.co.il.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
notes.co.il.7.xml-windows-1255]
notes.co.il.7.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/notes.co.il.7.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
infomed.co.il.xml-windows-1255]
infomed.co.il.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/infomed.co.il.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_he1.txt-windows-1255]
_ude_he1.txt-windows-1255]
file_name = 'tests/windows-1255-hebrew/_ude_he1.txt', encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
hydepark.hevre.co.il.7957.xml-windows-1255]
hydepark.hevre.co.il.7957.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/hydepark.hevre.co.il.7957.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_he3.txt-windows-1255]
_ude_he3.txt-windows-1255]
file_name = 'tests/windows-1255-hebrew/_ude_he3.txt', encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
halemo.net.edoar.xml-windows-1255]
halemo.net.edoar.xml-windows-1255]
file_name = 'tests/windows-1255-hebrew/halemo.net.edoar.xml'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_chromium_windows-1255_with_no_encoding_specified.html-windows-1255]
_chromium_windows-1255_with_no_encoding_specified.html-windows-1255]
file_name = 'tests/windows-1255-hebrew/_chromium_windows-1255_with_no_encoding_specified.html'
encoding = 'windows-1255'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_euc-tw1.txt-euc-tw]
_ude_euc-tw1.txt-euc-tw]
file_name = 'tests/EUC-TW/_ude_euc-tw1.txt', encoding = 'euc-tw'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
intertat.ru.xml-windows-1251]
intertat.ru.xml-windows-1251]
file_name = 'tests/windows-1251-russian/intertat.ru.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
money.rin.ru.xml-windows-1251]
money.rin.ru.xml-windows-1251]
file_name = 'tests/windows-1251-russian/money.rin.ru.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aif.ru.health.xml-windows-1251]
aif.ru.health.xml-windows-1251]
file_name = 'tests/windows-1251-russian/aif.ru.health.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aviaport.ru.xml-windows-1251]
aviaport.ru.xml-windows-1251]
file_name = 'tests/windows-1251-russian/aviaport.ru.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
newsru.com.xml-windows-1251]
newsru.com.xml-windows-1251]
file_name = 'tests/windows-1251-russian/newsru.com.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
music.peeps.ru.xml-windows-1251]
music.peeps.ru.xml-windows-1251]
file_name = 'tests/windows-1251-russian/music.peeps.ru.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
janulalife.blogspot.com.xml-windows-1251]
janulalife.blogspot.com.xml-windows-1251]
file_name = 'tests/windows-1251-russian/janulalife.blogspot.com.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_chromium_windows-1251_with_no_encoding_specified.html-windows-1251]
_chromium_windows-1251_with_no_encoding_specified.html-windows-1251]
file_name = 'tests/windows-1251-russian/_chromium_windows-1251_with_no_encoding_specified.html'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
greek.ru.xml-windows-1251]
greek.ru.xml-windows-1251]
file_name = 'tests/windows-1251-russian/greek.ru.xml', encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
anthropology.ru.xml-windows-1251]
anthropology.ru.xml-windows-1251]
file_name = 'tests/windows-1251-russian/anthropology.ru.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
kapranoff.ru.xml-windows-1251]
kapranoff.ru.xml-windows-1251]
file_name = 'tests/windows-1251-russian/kapranoff.ru.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.4.xml-windows-1251]
file_name = 'tests/windows-1251-russian/forum.template-toolkit.ru.4.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.6.xml-windows-1251]
file_name = 'tests/windows-1251-russian/forum.template-toolkit.ru.6.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-windows-1251]
_ude_1.txt-windows-1251]
file_name = 'tests/windows-1251-russian/_ude_1.txt', encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.9.xml-windows-1251]
file_name = 'tests/windows-1251-russian/forum.template-toolkit.ru.9.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.1.xml-windows-1251]
file_name = 'tests/windows-1251-russian/forum.template-toolkit.ru.1.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.8.xml-windows-1251]
file_name = 'tests/windows-1251-russian/forum.template-toolkit.ru.8.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aug32.hole.ru.xml-windows-1251]
aug32.hole.ru.xml-windows-1251]
file_name = 'tests/windows-1251-russian/aug32.hole.ru.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.mlmaster.com.xml-windows-1251]
blog.mlmaster.com.xml-windows-1251]
file_name = 'tests/windows-1251-russian/blog.mlmaster.com.xml'
encoding = 'windows-1251'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-windows-1252]
_ude_1.txt-windows-1252]
file_name = 'tests/windows-1252/_ude_1.txt', encoding = 'windows-1252'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
github_bug_9.txt-windows-1252]
github_bug_9.txt-windows-1252]
file_name = 'tests/windows-1252/github_bug_9.txt', encoding = 'windows-1252'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_mozilla_bug421271_text.html-windows-1252]
_mozilla_bug421271_text.html-windows-1252]
file_name = 'tests/windows-1252/_mozilla_bug421271_text.html'
encoding = 'windows-1252'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_2.txt-windows-1252]
_ude_2.txt-windows-1252]
file_name = 'tests/windows-1252/_ude_2.txt', encoding = 'windows-1252'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_4.txt-iso-8859-1]
_ude_4.txt-iso-8859-1]
file_name = 'tests/iso-8859-1/_ude_4.txt', encoding = 'iso-8859-1'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_2.txt-iso-8859-1]
_ude_2.txt-iso-8859-1]
file_name = 'tests/iso-8859-1/_ude_2.txt', encoding = 'iso-8859-1'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_3.txt-iso-8859-1]
_ude_3.txt-iso-8859-1]
file_name = 'tests/iso-8859-1/_ude_3.txt', encoding = 'iso-8859-1'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_6.txt-iso-8859-1]
_ude_6.txt-iso-8859-1]
file_name = 'tests/iso-8859-1/_ude_6.txt', encoding = 'iso-8859-1'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_5.txt-iso-8859-1]
_ude_5.txt-iso-8859-1]
file_name = 'tests/iso-8859-1/_ude_5.txt', encoding = 'iso-8859-1'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-iso-8859-1]
_ude_1.txt-iso-8859-1]
file_name = 'tests/iso-8859-1/_ude_1.txt', encoding = 'iso-8859-1'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
greek.ru.xml-koi8-r]
greek.ru.xml-koi8-r]
file_name = 'tests/KOI8-R/greek.ru.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.mlmaster.com.xml-koi8-r]
blog.mlmaster.com.xml-koi8-r]
file_name = 'tests/KOI8-R/blog.mlmaster.com.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aviaport.ru.xml-koi8-r]
aviaport.ru.xml-koi8-r]
file_name = 'tests/KOI8-R/aviaport.ru.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-koi8-r]
_ude_1.txt-koi8-r]
file_name = 'tests/KOI8-R/_ude_1.txt', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.1.xml-koi8-r]
file_name = 'tests/KOI8-R/forum.template-toolkit.ru.1.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aug32.hole.ru.xml-koi8-r]
aug32.hole.ru.xml-koi8-r]
file_name = 'tests/KOI8-R/aug32.hole.ru.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aif.ru.health.xml-koi8-r]
aif.ru.health.xml-koi8-r]
file_name = 'tests/KOI8-R/aif.ru.health.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_chromium_KOI8-R_with_no_encoding_specified.html-koi8-r]
_chromium_KOI8-R_with_no_encoding_specified.html-koi8-r]
file_name = 'tests/KOI8-R/_chromium_KOI8-R_with_no_encoding_specified.html'
encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
janulalife.blogspot.com.xml-koi8-r]
janulalife.blogspot.com.xml-koi8-r]
file_name = 'tests/KOI8-R/janulalife.blogspot.com.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.9.xml-koi8-r]
file_name = 'tests/KOI8-R/forum.template-toolkit.ru.9.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
newsru.com.xml-koi8-r]
newsru.com.xml-koi8-r]
file_name = 'tests/KOI8-R/newsru.com.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
koi.kinder.ru.xml-koi8-r]
koi.kinder.ru.xml-koi8-r]
file_name = 'tests/KOI8-R/koi.kinder.ru.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.4.xml-koi8-r]
file_name = 'tests/KOI8-R/forum.template-toolkit.ru.4.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
kapranoff.ru.xml-koi8-r]
kapranoff.ru.xml-koi8-r]
file_name = 'tests/KOI8-R/kapranoff.ru.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.6.xml-koi8-r]
file_name = 'tests/KOI8-R/forum.template-toolkit.ru.6.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
music.peeps.ru.xml-koi8-r]
music.peeps.ru.xml-koi8-r]
file_name = 'tests/KOI8-R/music.peeps.ru.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.8.xml-koi8-r]
file_name = 'tests/KOI8-R/forum.template-toolkit.ru.8.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
susu.ac.ru.xml-koi8-r]
susu.ac.ru.xml-koi8-r]
file_name = 'tests/KOI8-R/susu.ac.ru.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
intertat.ru.xml-koi8-r]
intertat.ru.xml-koi8-r]
file_name = 'tests/KOI8-R/intertat.ru.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
money.rin.ru.xml-koi8-r]
money.rin.ru.xml-koi8-r]
file_name = 'tests/KOI8-R/money.rin.ru.xml', encoding = 'koi8-r'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
naftemporiki.gr.mrt.xml-iso-8859-7]
naftemporiki.gr.mrt.xml-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/naftemporiki.gr.mrt.xml'
encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
naftemporiki.gr.wld.xml-iso-8859-7]
naftemporiki.gr.wld.xml-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/naftemporiki.gr.wld.xml'
encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
disabled.gr.xml-iso-8859-7]
disabled.gr.xml-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/disabled.gr.xml', encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
hotstation.gr.xml-iso-8859-7]
hotstation.gr.xml-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/hotstation.gr.xml', encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
naftemporiki.gr.bus.xml-iso-8859-7]
naftemporiki.gr.bus.xml-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/naftemporiki.gr.bus.xml'
encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
naftemporiki.gr.cmm.xml-iso-8859-7]
naftemporiki.gr.cmm.xml-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/naftemporiki.gr.cmm.xml'
encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
naftemporiki.gr.spo.xml-iso-8859-7]
naftemporiki.gr.spo.xml-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/naftemporiki.gr.spo.xml'
encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_3.txt-iso-8859-7]
_ude_3.txt-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/_ude_3.txt', encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_2.txt-iso-8859-7]
_ude_2.txt-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/_ude_2.txt', encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
naftemporiki.gr.mrk.xml-iso-8859-7]
naftemporiki.gr.mrk.xml-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/naftemporiki.gr.mrk.xml'
encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
naftemporiki.gr.fin.xml-iso-8859-7]
naftemporiki.gr.fin.xml-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/naftemporiki.gr.fin.xml'
encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_chromium_ISO-8859-7_with_no_encoding_specified.html-iso-8859-7]
_chromium_ISO-8859-7_with_no_encoding_specified.html-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/_chromium_ISO-8859-7_with_no_encoding_specified.html'
encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-iso-8859-7]
_ude_1.txt-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/_ude_1.txt', encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_greek.txt-iso-8859-7]
_ude_greek.txt-iso-8859-7]
file_name = 'tests/iso-8859-7-greek/_ude_greek.txt', encoding = 'iso-8859-7'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
andore.com.money.xml-shift_jis]
andore.com.money.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/andore.com.money.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
10e.org.xml-shift_jis]
10e.org.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/10e.org.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
setsuzei119.jp.xml-shift_jis]
setsuzei119.jp.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/setsuzei119.jp.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
nextbeaut.com.xml-shift_jis]
nextbeaut.com.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/nextbeaut.com.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bloglelife.com.xml-shift_jis]
bloglelife.com.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/bloglelife.com.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
accessories-brand.com.xml-shift_jis]
accessories-brand.com.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/accessories-brand.com.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_2.txt-shift_jis]
_ude_2.txt-shift_jis]
file_name = 'tests/SHIFT_JIS/_ude_2.txt', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_4.txt-shift_jis]
_ude_4.txt-shift_jis]
file_name = 'tests/SHIFT_JIS/_ude_4.txt', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
perth-on.net.xml-shift_jis]
perth-on.net.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/perth-on.net.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
ooganemochi.com.xml-shift_jis]
ooganemochi.com.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/ooganemochi.com.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
sakusaka-silk.net.xml-shift_jis]
sakusaka-silk.net.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/sakusaka-silk.net.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
amefoot.net.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/amefoot.net.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_3.txt-shift_jis]
_ude_3.txt-shift_jis]
file_name = 'tests/SHIFT_JIS/_ude_3.txt', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
brag.zaka.to.xml-shift_jis]
brag.zaka.to.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/brag.zaka.to.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-shift_jis]
_ude_1.txt-shift_jis]
file_name = 'tests/SHIFT_JIS/_ude_1.txt', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
grebeweb.net.xml-shift_jis]
grebeweb.net.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/grebeweb.net.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
andore.com.xml-shift_jis]
andore.com.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/andore.com.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
tamuyou.haun.org.xml-shift_jis]
tamuyou.haun.org.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/tamuyou.haun.org.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
andore.com.inami.xml-shift_jis]
andore.com.inami.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/andore.com.inami.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.paseri.ne.jp.xml-shift_jis]
blog.paseri.ne.jp.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/blog.paseri.ne.jp.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
1affliate.com.xml-shift_jis]
1affliate.com.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/1affliate.com.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
moon-light.ne.jp.xml-shift_jis]
moon-light.ne.jp.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/moon-light.ne.jp.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_chromium_Shift-JIS_with_no_encoding_specified.html-shift_jis]
_chromium_Shift-JIS_with_no_encoding_specified.html-shift_jis]
file_name = 'tests/SHIFT_JIS/_chromium_Shift-JIS_with_no_encoding_specified.html'
encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
do.beginnersrack.com.xml-shift_jis]
do.beginnersrack.com.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/do.beginnersrack.com.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.inkase.net.xml-shift_jis]
blog.inkase.net.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/blog.inkase.net.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
milliontimes.jp.xml-shift_jis]
milliontimes.jp.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/milliontimes.jp.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
celeb.lalalu.com.xml-shift_jis]
celeb.lalalu.com.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/celeb.lalalu.com.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
yasuhisa.com.xml-shift_jis]
yasuhisa.com.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/yasuhisa.com.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
clickablewords.com.xml-shift_jis]
clickablewords.com.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/clickablewords.com.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
dogsinn.jp.xml-shift_jis]
dogsinn.jp.xml-shift_jis]
file_name = 'tests/SHIFT_JIS/dogsinn.jp.xml', encoding = 'shift_jis'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
iyagi-readme.txt-johab]
iyagi-readme.txt-johab]
file_name = 'tests/Johab/iyagi-readme.txt', encoding = 'johab'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
hlpro-readme.txt-johab]
hlpro-readme.txt-johab]
file_name = 'tests/Johab/hlpro-readme.txt', encoding = 'johab'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
mdir-doc.txt-johab]
mdir-doc.txt-johab]
file_name = 'tests/Johab/mdir-doc.txt', encoding = 'johab'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bpm.cult.bg.9.xml-iso-8859-5]
bpm.cult.bg.9.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/bpm.cult.bg.9.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bbc.co.uk.popshow.xml-iso-8859-5]
bbc.co.uk.popshow.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/bbc.co.uk.popshow.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bpm.cult.bg.4.xml-iso-8859-5]
bpm.cult.bg.4.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/bpm.cult.bg.4.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
ide.li.xml-iso-8859-5]
ide.li.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/ide.li.xml', encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bpm.cult.bg.medusa.4.xml-iso-8859-5]
bpm.cult.bg.medusa.4.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/bpm.cult.bg.medusa.4.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
debian.gabrovo.com.xml-iso-8859-5]
debian.gabrovo.com.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/debian.gabrovo.com.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
ecloga.cult.bg.xml-iso-8859-5]
ecloga.cult.bg.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/ecloga.cult.bg.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bpm.cult.bg.xml-iso-8859-5]
bpm.cult.bg.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/bpm.cult.bg.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
linux-bg.org.xml-iso-8859-5]
linux-bg.org.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/linux-bg.org.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
debian.gabrovo.com.news.xml-iso-8859-5]
debian.gabrovo.com.news.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/debian.gabrovo.com.news.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bpm.cult.bg.2.xml-iso-8859-5]
bpm.cult.bg.2.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/bpm.cult.bg.2.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aero-bg.com.xml-iso-8859-5]
aero-bg.com.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/aero-bg.com.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
doncho.net.comments.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-bulgarian/doncho.net.comments.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
ricanet.com.xml-cp949]
ricanet.com.xml-cp949]
file_name = 'tests/CP949/ricanet.com.xml', encoding = 'cp949'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
catshadow.blogspot.com.xml-big5]
catshadow.blogspot.com.xml-big5]
file_name = 'tests/Big5/catshadow.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
letterlego.blogspot.com.xml-big5]
letterlego.blogspot.com.xml-big5]
file_name = 'tests/Big5/letterlego.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
0804.blogspot.com.xml-big5]
0804.blogspot.com.xml-big5]
file_name = 'tests/Big5/0804.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
oui-design.com.xml-big5]
oui-design.com.xml-big5]
file_name = 'tests/Big5/oui-design.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
ytc.blogspot.com.xml-big5]
ytc.blogspot.com.xml-big5]
file_name = 'tests/Big5/ytc.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
kafkatseng.blogspot.com.xml-big5]
kafkatseng.blogspot.com.xml-big5]
file_name = 'tests/Big5/kafkatseng.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-big5]
_ude_1.txt-big5]
file_name = 'tests/Big5/_ude_1.txt', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
ebao.us.xml-big5]
ebao.us.xml-big5]
file_name = 'tests/Big5/ebao.us.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
fudesign.blogspot.com.xml-big5]
fudesign.blogspot.com.xml-big5]
file_name = 'tests/Big5/fudesign.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
leavesth.blogspot.com.xml-big5]
leavesth.blogspot.com.xml-big5]
file_name = 'tests/Big5/leavesth.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
linyijen.blogspot.com.xml-big5]
linyijen.blogspot.com.xml-big5]
file_name = 'tests/Big5/linyijen.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
sanwenji.blogspot.com.xml-big5]
sanwenji.blogspot.com.xml-big5]
file_name = 'tests/Big5/sanwenji.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
sinica.edu.tw.xml-big5]
sinica.edu.tw.xml-big5]
file_name = 'tests/Big5/sinica.edu.tw.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
coolloud.org.tw.xml-big5]
coolloud.org.tw.xml-big5]
file_name = 'tests/Big5/coolloud.org.tw.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
sylvia1976.blogspot.com.xml-big5]
sylvia1976.blogspot.com.xml-big5]
file_name = 'tests/Big5/sylvia1976.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
unoriginalblog.com.xml-big5]
unoriginalblog.com.xml-big5]
file_name = 'tests/Big5/unoriginalblog.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
myblog.pchome.com.tw.xml-big5]
myblog.pchome.com.tw.xml-big5]
file_name = 'tests/Big5/myblog.pchome.com.tw.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
tlkkuo.blogspot.com.xml-big5]
tlkkuo.blogspot.com.xml-big5]
file_name = 'tests/Big5/tlkkuo.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
digitalwall.com.xml-big5]
digitalwall.com.xml-big5]
file_name = 'tests/Big5/digitalwall.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_chromium_Big5_with_no_encoding_specified.html-big5]
_chromium_Big5_with_no_encoding_specified.html-big5]
file_name = 'tests/Big5/_chromium_Big5_with_no_encoding_specified.html'
encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.worren.net.xml-big5]
blog.worren.net.xml-big5]
file_name = 'tests/Big5/blog.worren.net.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
carbonxiv.blogspot.com.xml-big5]
carbonxiv.blogspot.com.xml-big5]
file_name = 'tests/Big5/carbonxiv.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
ke207.blogspot.com.xml-big5]
ke207.blogspot.com.xml-big5]
file_name = 'tests/Big5/ke207.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
upsaid.com.xml-big5]
upsaid.com.xml-big5]
file_name = 'tests/Big5/upsaid.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
willythecop.blogspot.com.xml-big5]
willythecop.blogspot.com.xml-big5]
file_name = 'tests/Big5/willythecop.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
marilynwu.blogspot.com.xml-big5]
marilynwu.blogspot.com.xml-big5]
file_name = 'tests/Big5/marilynwu.blogspot.com.xml', encoding = 'big5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
plane1-utf-16le.html-utf-16le]
plane1-utf-16le.html-utf-16le]
file_name = 'tests/UTF-16LE/plane1-utf-16le.html', encoding = 'utf-16le'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
nobom-utf16le.txt-utf-16le]
nobom-utf16le.txt-utf-16le]
file_name = 'tests/UTF-16LE/nobom-utf16le.txt', encoding = 'utf-16le'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
else:
diff = ""
encoding_match = True
all_encodings = [result]
> assert encoding_match, (
f"Expected {encoding}, but got {result} for {file_name}. First 20 "
f"lines of character differences: \n{diff}\n"
f"All encodings: {pformat(all_encodings)}"
)
E AssertionError: Expected utf-16le, but got {'encoding': 'ascii', 'confidence': 1.0, 'language': ''} for tests/UTF-16LE/nobom-utf16le.txt. First 20 lines of character differences:
E - DateTime,Bid,Ask 07/19/2015 21:00:15.469,1.083,1.08332 07/19/2015 21:00:16.949,1.08311,1.08332
E - 07/19/2015 21:00:16.955,1.08311,1.08338 07/19/2015 21:00:17.120,1.08313,1.08338 07/19/2015
E - 21:00:17.166,1.08313,1.0834 07/19/2015 21:00:17.205,1.08313,1.08339 07/19/2015
E - 21:00:17.218,1.08312,1.08339 07/19/2015 21:00:17.469,1.08316,1.08339 07/19/2015
E - 21:00:17.476,1.08316,1.08347 07/19/2015 21:00:17.505,1.08316,1.08344 07/19/2015
E - 21:00:17.584,1.08316,1.08348 07/19/2015 21:00:18.905,1.08316,1.08351 07/19/2015
E - 21:00:19.005,1.08336,1.08351 07/19/2015 21:00:19.011,1.08336,1.08403 07/19/2015
E - 21:00:19.019,1.08334,1.08403 07/19/2015 21:00:19.025,1.08334,1.08406 07/19/2015
E - 21:00:20.310,1.08307,1.08353 07/19/2015 21:00:20.317,1.08278,1.08353 07/19/2015
E - 21:00:20.319,1.08307,1.08353
E + D a t e T i m e , B i d , A s k 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 5 . 4 6 9 , 1 . 0 8 3 , 1 . 0 8 3 3 2 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 6 . 9 4 9 , 1 . 0 8 3 1 1 , 1 . 0 8 3 3 2 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 6 . 9 5 5 , 1 . 0 8 3 1 1 , 1 . 0 8 3 3 8 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 7 . 1 2 0 , 1 . 0 8 3 1 3 , 1 . 0 8 3 3 8 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 7 . 1 6 6 , 1 . 0 8 3 1 3 , 1 . 0 8 3 4 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 7 . 2 0 5 , 1 . 0 8 3 1 3 , 1 . 0 8 3 3 9 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 7 . 2 1 8 , 1 . 0 8 3 1 2 , 1 . 0 8 3 3 9 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 7 . 4 6 9 , 1 . 0 8 3 1 6 , 1 . 0 8 3 3 9 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 7 . 4 7 6 , 1 . 0 8 3 1 6 , 1 . 0 8 3 4 7 0 7 / 1 9 / 2 0 1 5
E
E All encodings: [{'confidence': 1.0, 'encoding': 'ascii', 'language': ''}]
E assert False
test.py:110: AssertionError
intertat.ru.xml-maccyrillic]
intertat.ru.xml-maccyrillic]
file_name = 'tests/MacCyrillic/intertat.ru.xml', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
music.peeps.ru.xml-maccyrillic]
music.peeps.ru.xml-maccyrillic]
file_name = 'tests/MacCyrillic/music.peeps.ru.xml', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.4.xml-maccyrillic]
file_name = 'tests/MacCyrillic/forum.template-toolkit.ru.4.xml'
encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aviaport.ru.xml-maccyrillic]
aviaport.ru.xml-maccyrillic]
file_name = 'tests/MacCyrillic/aviaport.ru.xml', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aif.ru.health.xml-maccyrillic]
aif.ru.health.xml-maccyrillic]
file_name = 'tests/MacCyrillic/aif.ru.health.xml', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
susu.ac.ru.xml-maccyrillic]
susu.ac.ru.xml-maccyrillic]
file_name = 'tests/MacCyrillic/susu.ac.ru.xml', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
kapranoff.ru.xml-maccyrillic]
kapranoff.ru.xml-maccyrillic]
file_name = 'tests/MacCyrillic/kapranoff.ru.xml', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.mlmaster.com.xml-maccyrillic]
blog.mlmaster.com.xml-maccyrillic]
file_name = 'tests/MacCyrillic/blog.mlmaster.com.xml', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.8.xml-maccyrillic]
file_name = 'tests/MacCyrillic/forum.template-toolkit.ru.8.xml'
encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.9.xml-maccyrillic]
file_name = 'tests/MacCyrillic/forum.template-toolkit.ru.9.xml'
encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
newsru.com.xml-maccyrillic]
newsru.com.xml-maccyrillic]
file_name = 'tests/MacCyrillic/newsru.com.xml', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aug32.hole.ru.xml-maccyrillic]
aug32.hole.ru.xml-maccyrillic]
file_name = 'tests/MacCyrillic/aug32.hole.ru.xml', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
greek.ru.xml-maccyrillic]
greek.ru.xml-maccyrillic]
file_name = 'tests/MacCyrillic/greek.ru.xml', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.6.xml-maccyrillic]
file_name = 'tests/MacCyrillic/forum.template-toolkit.ru.6.xml'
encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
money.rin.ru.xml-maccyrillic]
money.rin.ru.xml-maccyrillic]
file_name = 'tests/MacCyrillic/money.rin.ru.xml', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-maccyrillic]
_ude_1.txt-maccyrillic]
file_name = 'tests/MacCyrillic/_ude_1.txt', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
koi.kinder.ru.xml-maccyrillic]
koi.kinder.ru.xml-maccyrillic]
file_name = 'tests/MacCyrillic/koi.kinder.ru.xml', encoding = 'maccyrillic'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aviaport.ru.xml-iso-8859-5]
aviaport.ru.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/aviaport.ru.xml', encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
kapranoff.ru.xml-iso-8859-5]
kapranoff.ru.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/kapranoff.ru.xml', encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
janulalife.blogspot.com.xml-iso-8859-5]
janulalife.blogspot.com.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/janulalife.blogspot.com.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
newsru.com.xml-iso-8859-5]
newsru.com.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/newsru.com.xml', encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aug32.hole.ru.xml-iso-8859-5]
aug32.hole.ru.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/aug32.hole.ru.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
greek.ru.xml-iso-8859-5]
greek.ru.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/greek.ru.xml', encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.9.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/forum.template-toolkit.ru.9.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.1.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/forum.template-toolkit.ru.1.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.4.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/forum.template-toolkit.ru.4.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.8.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/forum.template-toolkit.ru.8.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.6.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/forum.template-toolkit.ru.6.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aif.ru.health.xml-iso-8859-5]
aif.ru.health.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/aif.ru.health.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_chromium_ISO-8859-5_with_no_encoding_specified.html-iso-8859-5]
_chromium_ISO-8859-5_with_no_encoding_specified.html-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/_chromium_ISO-8859-5_with_no_encoding_specified.html'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
money.rin.ru.xml-iso-8859-5]
money.rin.ru.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/money.rin.ru.xml', encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
susu.ac.ru.xml-iso-8859-5]
susu.ac.ru.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/susu.ac.ru.xml', encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
music.peeps.ru.xml-iso-8859-5]
music.peeps.ru.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/music.peeps.ru.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
intertat.ru.xml-iso-8859-5]
intertat.ru.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/intertat.ru.xml', encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.mlmaster.com.xml-iso-8859-5]
blog.mlmaster.com.xml-iso-8859-5]
file_name = 'tests/iso-8859-5-russian/blog.mlmaster.com.xml'
encoding = 'iso-8859-5'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_mozilla_bug171813_text.html-gb2312]
_mozilla_bug171813_text.html-gb2312]
file_name = 'tests/GB2312/_mozilla_bug171813_text.html', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
xy15400.blogcn.com.xml-gb2312]
xy15400.blogcn.com.xml-gb2312]
file_name = 'tests/GB2312/xy15400.blogcn.com.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
acnnewswire.net.xml-gb2312]
acnnewswire.net.xml-gb2312]
file_name = 'tests/GB2312/acnnewswire.net.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
2.blog.westca.com.xml-gb2312]
2.blog.westca.com.xml-gb2312]
file_name = 'tests/GB2312/2.blog.westca.com.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
eighthday.blogspot.com.xml-gb2312]
eighthday.blogspot.com.xml-gb2312]
file_name = 'tests/GB2312/eighthday.blogspot.com.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_chromium_gb18030_with_no_encoding_specified.html.xml-gb2312]
_chromium_gb18030_with_no_encoding_specified.html.xml-gb2312]
file_name = 'tests/GB2312/_chromium_gb18030_with_no_encoding_specified.html.xml'
encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
cindychen.com.xml-gb2312]
cindychen.com.xml-gb2312]
file_name = 'tests/GB2312/cindychen.com.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
chen56.blogcn.com.xml-gb2312]
chen56.blogcn.com.xml-gb2312]
file_name = 'tests/GB2312/chen56.blogcn.com.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
lily.blogsome.com.xml-gb2312]
lily.blogsome.com.xml-gb2312]
file_name = 'tests/GB2312/lily.blogsome.com.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
w3cn.org.xml-gb2312]
w3cn.org.xml-gb2312]
file_name = 'tests/GB2312/w3cn.org.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
jjgod.3322.org.xml-gb2312]
jjgod.3322.org.xml-gb2312]
file_name = 'tests/GB2312/jjgod.3322.org.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
coverer.com.xml-gb2312]
coverer.com.xml-gb2312]
file_name = 'tests/GB2312/coverer.com.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
14.blog.westca.com.xml-gb2312]
14.blog.westca.com.xml-gb2312]
file_name = 'tests/GB2312/14.blog.westca.com.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
bbs.blogsome.com.xml-gb2312]
bbs.blogsome.com.xml-gb2312]
file_name = 'tests/GB2312/bbs.blogsome.com.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
softsea.net.xml-gb2312]
softsea.net.xml-gb2312]
file_name = 'tests/GB2312/softsea.net.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
luciferwang.blogcn.com.xml-gb2312]
luciferwang.blogcn.com.xml-gb2312]
file_name = 'tests/GB2312/luciferwang.blogcn.com.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
pda.blogsome.com.xml-gb2312]
pda.blogsome.com.xml-gb2312]
file_name = 'tests/GB2312/pda.blogsome.com.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
cappuccinos.3322.org.xml-gb2312]
cappuccinos.3322.org.xml-gb2312]
file_name = 'tests/GB2312/cappuccinos.3322.org.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
cnblog.org.xml-gb2312]
cnblog.org.xml-gb2312]
file_name = 'tests/GB2312/cnblog.org.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
godthink.blogsome.com.xml-gb2312]
godthink.blogsome.com.xml-gb2312]
file_name = 'tests/GB2312/godthink.blogsome.com.xml', encoding = 'gb2312'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
plane1-utf-16be.html-utf-16be]
plane1-utf-16be.html-utf-16be]
file_name = 'tests/UTF-16BE/plane1-utf-16be.html', encoding = 'utf-16be'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
nobom-utf16be.txt-utf-16be]
nobom-utf16be.txt-utf-16be]
file_name = 'tests/UTF-16BE/nobom-utf16be.txt', encoding = 'utf-16be'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
result = chardet.detect(input_bytes)
try:
expected_unicode = input_bytes.decode(encoding)
except LookupError:
expected_unicode = ""
try:
detected_unicode = input_bytes.decode(result["encoding"])
except (LookupError, UnicodeDecodeError, TypeError):
detected_unicode = ""
if result:
encoding_match = (result["encoding"] or "").lower() == encoding
else:
encoding_match = False
# Only care about mismatches that would actually result in different
# behavior when decoding
if not encoding_match and expected_unicode != detected_unicode:
wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
diff = "".join(
list(
ndiff(
wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
)
)[:20]
)
all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
else:
diff = ""
encoding_match = True
all_encodings = [result]
> assert encoding_match, (
f"Expected {encoding}, but got {result} for {file_name}. First 20 "
f"lines of character differences: \n{diff}\n"
f"All encodings: {pformat(all_encodings)}"
)
E AssertionError: Expected utf-16be, but got {'encoding': 'ascii', 'confidence': 1.0, 'language': ''} for tests/UTF-16BE/nobom-utf16be.txt. First 20 lines of character differences:
E - DateTime,Bid,Ask 07/19/2015 21:00:15.469,1.083,1.08332 07/19/2015 21:00:16.949,1.08311,1.08332
E - 07/19/2015 21:00:16.955,1.08311,1.08338 07/19/2015 21:00:17.120,1.08313,1.08338 07/19/2015
E - 21:00:17.166,1.08313,1.0834 07/19/2015 21:00:17.205,1.08313,1.08339 07/19/2015
E - 21:00:17.218,1.08312,1.08339 07/19/2015 21:00:17.469,1.08316,1.08339 07/19/2015
E - 21:00:17.476,1.08316,1.08347 07/19/2015 21:00:17.505,1.08316,1.08344 07/19/2015
E - 21:00:17.584,1.08316,1.08348 07/19/2015 21:00:18.905,1.08316,1.08351 07/19/2015
E - 21:00:19.005,1.08336,1.08351 07/19/2015 21:00:19.011,1.08336,1.08403 07/19/2015
E - 21:00:19.019,1.08334,1.08403 07/19/2015 21:00:19.025,1.08334,1.08406 07/19/2015
E - 21:00:20.310,1.08307,1.08353 07/19/2015 21:00:20.317,1.08278,1.08353 07/19/2015
E - 21:00:20.319,1.08307,1.08353
E + D a t e T i m e , B i d , A s k 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 5 . 4 6 9 , 1 . 0 8 3 , 1 . 0 8 3 3 2 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 6 . 9 4 9 , 1 . 0 8 3 1 1 , 1 . 0 8 3 3 2 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 6 . 9 5 5 , 1 . 0 8 3 1 1 , 1 . 0 8 3 3 8 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 7 . 1 2 0 , 1 . 0 8 3 1 3 , 1 . 0 8 3 3 8 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 7 . 1 6 6 , 1 . 0 8 3 1 3 , 1 . 0 8 3 4 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 7 . 2 0 5 , 1 . 0 8 3 1 3 , 1 . 0 8 3 3 9 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 7 . 2 1 8 , 1 . 0 8 3 1 2 , 1 . 0 8 3 3 9 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 7 . 4 6 9 , 1 . 0 8 3 1 6 , 1 . 0 8 3 3 9 0 7 / 1 9 / 2 0 1 5
E + 2 1 : 0 0 : 1 7 . 4 7 6 , 1 . 0 8 3 1 6 , 1 . 0 8 3 4 7 0 7 / 1 9 / 2 0 1 5
E
E All encodings: [{'confidence': 1.0, 'encoding': 'ascii', 'language': ''}]
E assert False
test.py:110: AssertionError
_ude_1.txt-iso-2022-jp]
_ude_1.txt-iso-2022-jp]
file_name = 'tests/iso-2022-jp/_ude_1.txt', encoding = 'iso-2022-jp'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:137: in feed
self._esc_charset_prober = EscCharSetProber()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self =
lang_filter = None
def __init__(self, lang_filter=None):
super().__init__(lang_filter=lang_filter)
self.coding_sm = []
> if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
E TypeError: unsupported operand type(s) for &: 'NoneType' and 'int'
chardet/escprober.py:16: TypeError
pharmacy.kku.ac.th.centerlab.xml-tis-620]
pharmacy.kku.ac.th.centerlab.xml-tis-620]
file_name = 'tests/TIS-620/pharmacy.kku.ac.th.centerlab.xml'
encoding = 'tis-620'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
trickspot.boxchart.com.xml-tis-620]
trickspot.boxchart.com.xml-tis-620]
file_name = 'tests/TIS-620/trickspot.boxchart.com.xml', encoding = 'tis-620'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
pharmacy.kku.ac.th.analyse1.xml-tis-620]
pharmacy.kku.ac.th.analyse1.xml-tis-620]
file_name = 'tests/TIS-620/pharmacy.kku.ac.th.analyse1.xml'
encoding = 'tis-620'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_mozilla_bug488426_text.html-tis-620]
_mozilla_bug488426_text.html-tis-620]
file_name = 'tests/TIS-620/_mozilla_bug488426_text.html', encoding = 'tis-620'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
opentle.org.xml-tis-620]
opentle.org.xml-tis-620]
file_name = 'tests/TIS-620/opentle.org.xml', encoding = 'tis-620'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
pharmacy.kku.ac.th.healthinfo-ne.xml-tis-620]
pharmacy.kku.ac.th.healthinfo-ne.xml-tis-620]
file_name = 'tests/TIS-620/pharmacy.kku.ac.th.healthinfo-ne.xml'
encoding = 'tis-620'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
money.rin.ru.xml-ibm855]
money.rin.ru.xml-ibm855]
file_name = 'tests/IBM855/money.rin.ru.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.4.xml-ibm855]
file_name = 'tests/IBM855/forum.template-toolkit.ru.4.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.mlmaster.com.xml-ibm855]
blog.mlmaster.com.xml-ibm855]
file_name = 'tests/IBM855/blog.mlmaster.com.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aug32.hole.ru.xml-ibm855]
aug32.hole.ru.xml-ibm855]
file_name = 'tests/IBM855/aug32.hole.ru.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
intertat.ru.xml-ibm855]
intertat.ru.xml-ibm855]
file_name = 'tests/IBM855/intertat.ru.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aviaport.ru.xml-ibm855]
aviaport.ru.xml-ibm855]
file_name = 'tests/IBM855/aviaport.ru.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
music.peeps.ru.xml-ibm855]
music.peeps.ru.xml-ibm855]
file_name = 'tests/IBM855/music.peeps.ru.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
greek.ru.xml-ibm855]
greek.ru.xml-ibm855]
file_name = 'tests/IBM855/greek.ru.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_1.txt-ibm855]
_ude_1.txt-ibm855]
file_name = 'tests/IBM855/_ude_1.txt', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
susu.ac.ru.xml-ibm855]
susu.ac.ru.xml-ibm855]
file_name = 'tests/IBM855/susu.ac.ru.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.8.xml-ibm855]
file_name = 'tests/IBM855/forum.template-toolkit.ru.8.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.9.xml-ibm855]
file_name = 'tests/IBM855/forum.template-toolkit.ru.9.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.6.xml-ibm855]
file_name = 'tests/IBM855/forum.template-toolkit.ru.6.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
forum.template-toolkit.ru.1.xml-ibm855]
file_name = 'tests/IBM855/forum.template-toolkit.ru.1.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
kapranoff.ru.xml-ibm855]
kapranoff.ru.xml-ibm855]
file_name = 'tests/IBM855/kapranoff.ru.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
janulalife.blogspot.com.xml-ibm855]
janulalife.blogspot.com.xml-ibm855]
file_name = 'tests/IBM855/janulalife.blogspot.com.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
aif.ru.health.xml-ibm855]
aif.ru.health.xml-ibm855]
file_name = 'tests/IBM855/aif.ru.health.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
newsru.com.xml-ibm855]
newsru.com.xml-ibm855]
file_name = 'tests/IBM855/newsru.com.xml', encoding = 'ibm855'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
arts.egloos.com.xml-euc-kr]
arts.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/arts.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
scarletkh2.egloos.com.xml-euc-kr]
scarletkh2.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/scarletkh2.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_euc2.txt-euc-kr]
_ude_euc2.txt-euc-kr]
file_name = 'tests/EUC-KR/_ude_euc2.txt', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_mozilla_bug9357_text.html-euc-kr]
_mozilla_bug9357_text.html-euc-kr]
file_name = 'tests/EUC-KR/_mozilla_bug9357_text.html', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
jely.pe.kr.xml-euc-kr]
jely.pe.kr.xml-euc-kr]
file_name = 'tests/EUC-KR/jely.pe.kr.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_ude_euc1.txt-euc-kr]
_ude_euc1.txt-euc-kr]
file_name = 'tests/EUC-KR/_ude_euc1.txt', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
willis.egloos.com.xml-euc-kr]
willis.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/willis.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.bd-lab.com.xml-euc-kr]
blog.bd-lab.com.xml-euc-kr]
file_name = 'tests/EUC-KR/blog.bd-lab.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.empas.com.xml-euc-kr]
blog.empas.com.xml-euc-kr]
file_name = 'tests/EUC-KR/blog.empas.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
_chromium_windows-949_with_no_encoding_specified.html-euc-kr]
_chromium_windows-949_with_no_encoding_specified.html-euc-kr]
file_name = 'tests/EUC-KR/_chromium_windows-949_with_no_encoding_specified.html'
encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
chisato.info.xml-euc-kr]
chisato.info.xml-euc-kr]
file_name = 'tests/EUC-KR/chisato.info.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
tori02.egloos.com.xml-euc-kr]
tori02.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/tori02.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
acnnewswire.net.xml-euc-kr]
acnnewswire.net.xml-euc-kr]
file_name = 'tests/EUC-KR/acnnewswire.net.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
epitaph.egloos.com.xml-euc-kr]
epitaph.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/epitaph.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
blog.rss.naver.com.xml-euc-kr]
file_name = 'tests/EUC-KR/blog.rss.naver.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
xenix.egloos.com.xml-euc-kr]
xenix.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/xenix.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
kina.egloos.com.xml-euc-kr]
kina.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/kina.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
birder.egloos.com.xml-euc-kr]
birder.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/birder.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
siwoo.org.xml-euc-kr]
siwoo.org.xml-euc-kr]
file_name = 'tests/EUC-KR/siwoo.org.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
jowchung.oolim.net.xml-euc-kr]
jowchung.oolim.net.xml-euc-kr]
file_name = 'tests/EUC-KR/jowchung.oolim.net.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
yunho.egloos.com.xml-euc-kr]
yunho.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/yunho.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
zangsalang.egloos.com.xml-euc-kr]
zangsalang.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/zangsalang.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
ittrend.egloos.com.xml-euc-kr]
ittrend.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/ittrend.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
poliplus.egloos.com.xml-euc-kr]
poliplus.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/poliplus.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
console.linuxstudy.pe.kr.xml-euc-kr]
console.linuxstudy.pe.kr.xml-euc-kr]
file_name = 'tests/EUC-KR/console.linuxstudy.pe.kr.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
oroll.egloos.com.xml-euc-kr]
oroll.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/oroll.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
calmguy.egloos.com.xml-euc-kr]
calmguy.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/calmguy.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
critique.or.kr.xml-euc-kr]
critique.or.kr.xml-euc-kr]
file_name = 'tests/EUC-KR/critique.or.kr.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
jely.egloos.com.xml-euc-kr]
jely.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/jely.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
sparcs.kaist.ac.kr.xml-euc-kr]
sparcs.kaist.ac.kr.xml-euc-kr]
file_name = 'tests/EUC-KR/sparcs.kaist.ac.kr.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
alogblog.com.xml-euc-kr]
alogblog.com.xml-euc-kr]
file_name = 'tests/EUC-KR/alogblog.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
lennon81.egloos.com.xml-euc-kr]
lennon81.egloos.com.xml-euc-kr]
file_name = 'tests/EUC-KR/lennon81.egloos.com.xml', encoding = 'euc-kr'
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
with open(file_name, "rb") as f:
input_bytes = f.read()
> result = chardet.detect(input_bytes)
test.py:80:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
chardet/__init__.py:39: in detect
detector.feed(byte_str)
chardet/universaldetector.py:154: in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
chardet/utf1632prober.py:113: in feed
if not self.validate_utf16_characters(pair_be):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = , pair = []
def validate_utf16_characters(self, pair):
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
> value = (pair[0] << 8) | pair[1]
E IndexError: list index out of range
chardet/utf1632prober.py:56: IndexError
test.py::test_never_fails_to_detect_if_there_is_a_valid_encoding
test.py::test_never_fails_to_detect_if_there_is_a_valid_encoding
+ Exception Group Traceback (most recent call last):
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/runner.py", line 341, in from_call
| result: TResult | None = func()
| ^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/runner.py", line 242, in
| lambda: runtest_hook(item=item, **kwds), when=when, reraise=reraise
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_hooks.py", line 513, in __call__
| return self._hookexec(self.name, self._hookimpls.copy(), kwargs, firstresult)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_manager.py", line 120, in _hookexec
| return self._inner_hookexec(hook_name, methods, kwargs, firstresult)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 182, in _multicall
| return outcome.get_result()
| ^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_result.py", line 100, in get_result
| raise exc.with_traceback(exc.__traceback__)
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 167, in _multicall
| teardown.throw(outcome._exception)
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/threadexception.py", line 92, in pytest_runtest_call
| yield from thread_exception_runtest_hook()
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/threadexception.py", line 68, in thread_exception_runtest_hook
| yield
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 167, in _multicall
| teardown.throw(outcome._exception)
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/unraisableexception.py", line 95, in pytest_runtest_call
| yield from unraisable_exception_runtest_hook()
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/unraisableexception.py", line 70, in unraisable_exception_runtest_hook
| yield
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 167, in _multicall
| teardown.throw(outcome._exception)
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/logging.py", line 846, in pytest_runtest_call
| yield from self._runtest_for(item, "call")
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/logging.py", line 829, in _runtest_for
| yield
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 167, in _multicall
| teardown.throw(outcome._exception)
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/capture.py", line 880, in pytest_runtest_call
| return (yield)
| ^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 167, in _multicall
| teardown.throw(outcome._exception)
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/skipping.py", line 257, in pytest_runtest_call
| return (yield)
| ^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 103, in _multicall
| res = hook_impl.function(*args)
| ^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/runner.py", line 174, in pytest_runtest_call
| item.runtest()
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/python.py", line 1627, in runtest
| self.ihook.pytest_pyfunc_call(pyfuncitem=self)
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_hooks.py", line 513, in __call__
| return self._hookexec(self.name, self._hookimpls.copy(), kwargs, firstresult)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_manager.py", line 120, in _hookexec
| return self._inner_hookexec(hook_name, methods, kwargs, firstresult)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 139, in _multicall
| raise exception.with_traceback(exception.__traceback__)
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 103, in _multicall
| res = hook_impl.function(*args)
| ^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/python.py", line 159, in pytest_pyfunc_call
| result = testfunction(**testargs)
| ^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/test.py", line 123, in test_never_fails_to_detect_if_there_is_a_valid_encoding
| @given(
|
| File "/testbed/.venv/lib/python3.12/site-packages/hypothesis/core.py", line 1722, in wrapped_test
| raise the_error_hypothesis_found
| ExceptionGroup: Hypothesis found 2 distinct failures. (2 sub-exceptions)
+-+---------------- 1 ----------------
| Traceback (most recent call last):
| File "/testbed/test.py", line 144, in test_never_fails_to_detect_if_there_is_a_valid_encoding
| detected = chardet.detect(data)["encoding"]
| ^^^^^^^^^^^^^^^^^^^^
| File "/testbed/chardet/__init__.py", line 39, in detect
| detector.feed(byte_str)
| File "/testbed/chardet/universaldetector.py", line 154, in feed
| if prober.feed(byte_str) == ProbingState.FOUND_IT:
| ^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/chardet/utf1632prober.py", line 113, in feed
| if not self.validate_utf16_characters(pair_be):
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/chardet/utf1632prober.py", line 56, in validate_utf16_characters
| value = (pair[0] << 8) | pair[1]
| ~~~~^^^
| IndexError: list index out of range
| Falsifying example: test_never_fails_to_detect_if_there_is_a_valid_encoding(
| txt='𐀀',
| enc='utf-8',
| rnd=HypothesisRandom(generated data),
| )
| Explanation:
| These lines were always and only run by failing examples:
| /testbed/chardet/charsetprober.py:13
| /testbed/chardet/codingstatemachine.py:33
| /testbed/chardet/universaldetector.py:129
+---------------- 2 ----------------
| Traceback (most recent call last):
| File "/testbed/test.py", line 144, in test_never_fails_to_detect_if_there_is_a_valid_encoding
| detected = chardet.detect(data)["encoding"]
| ^^^^^^^^^^^^^^^^^^^^
| File "/testbed/chardet/__init__.py", line 39, in detect
| detector.feed(byte_str)
| File "/testbed/chardet/universaldetector.py", line 154, in feed
| if prober.feed(byte_str) == ProbingState.FOUND_IT:
| ^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/chardet/charsetgroupprober.py", line 35, in feed
| state = prober.feed(byte_str)
| ^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/chardet/sbcharsetprober.py", line 52, in feed
| byte_str = self.filter_with_english_letters(byte_str)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| AttributeError: 'SingleByteCharSetProber' object has no attribute 'filter_with_english_letters'
| Falsifying example: test_never_fails_to_detect_if_there_is_a_valid_encoding(
| txt='\x80',
| enc='utf-8',
| rnd=HypothesisRandom(generated data),
| )
| Explanation:
| These lines were always and only run by failing examples:
| /testbed/chardet/charsetprober.py:13
| /testbed/chardet/codingstatemachine.py:33
| /testbed/chardet/universaldetector.py:129
+------------------------------------
test.py::test_detect_all_and_detect_one_should_agree
test.py::test_detect_all_and_detect_one_should_agree
+ Exception Group Traceback (most recent call last):
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/runner.py", line 341, in from_call
| result: TResult | None = func()
| ^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/runner.py", line 242, in
| lambda: runtest_hook(item=item, **kwds), when=when, reraise=reraise
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_hooks.py", line 513, in __call__
| return self._hookexec(self.name, self._hookimpls.copy(), kwargs, firstresult)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_manager.py", line 120, in _hookexec
| return self._inner_hookexec(hook_name, methods, kwargs, firstresult)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 182, in _multicall
| return outcome.get_result()
| ^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_result.py", line 100, in get_result
| raise exc.with_traceback(exc.__traceback__)
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 167, in _multicall
| teardown.throw(outcome._exception)
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/threadexception.py", line 92, in pytest_runtest_call
| yield from thread_exception_runtest_hook()
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/threadexception.py", line 68, in thread_exception_runtest_hook
| yield
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 167, in _multicall
| teardown.throw(outcome._exception)
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/unraisableexception.py", line 95, in pytest_runtest_call
| yield from unraisable_exception_runtest_hook()
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/unraisableexception.py", line 70, in unraisable_exception_runtest_hook
| yield
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 167, in _multicall
| teardown.throw(outcome._exception)
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/logging.py", line 846, in pytest_runtest_call
| yield from self._runtest_for(item, "call")
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/logging.py", line 829, in _runtest_for
| yield
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 167, in _multicall
| teardown.throw(outcome._exception)
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/capture.py", line 880, in pytest_runtest_call
| return (yield)
| ^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 167, in _multicall
| teardown.throw(outcome._exception)
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/skipping.py", line 257, in pytest_runtest_call
| return (yield)
| ^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 103, in _multicall
| res = hook_impl.function(*args)
| ^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/runner.py", line 174, in pytest_runtest_call
| item.runtest()
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/python.py", line 1627, in runtest
| self.ihook.pytest_pyfunc_call(pyfuncitem=self)
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_hooks.py", line 513, in __call__
| return self._hookexec(self.name, self._hookimpls.copy(), kwargs, firstresult)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_manager.py", line 120, in _hookexec
| return self._inner_hookexec(hook_name, methods, kwargs, firstresult)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 139, in _multicall
| raise exception.with_traceback(exception.__traceback__)
| File "/testbed/.venv/lib/python3.12/site-packages/pluggy/_callers.py", line 103, in _multicall
| res = hook_impl.function(*args)
| ^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/.venv/lib/python3.12/site-packages/_pytest/python.py", line 159, in pytest_pyfunc_call
| result = testfunction(**testargs)
| ^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/test.py", line 160, in test_detect_all_and_detect_one_should_agree
| st.text(min_size=1),
| ^^^
| File "/testbed/.venv/lib/python3.12/site-packages/hypothesis/core.py", line 1722, in wrapped_test
| raise the_error_hypothesis_found
| ExceptionGroup: Hypothesis found 2 distinct failures. (2 sub-exceptions)
+-+---------------- 1 ----------------
| Traceback (most recent call last):
| File "/testbed/test.py", line 181, in test_detect_all_and_detect_one_should_agree
| result = chardet.detect(data)
| ^^^^^^^^^^^^^^^^^^^^
| File "/testbed/chardet/__init__.py", line 39, in detect
| detector.feed(byte_str)
| File "/testbed/chardet/universaldetector.py", line 154, in feed
| if prober.feed(byte_str) == ProbingState.FOUND_IT:
| ^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/chardet/utf1632prober.py", line 113, in feed
| if not self.validate_utf16_characters(pair_be):
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/chardet/utf1632prober.py", line 56, in validate_utf16_characters
| value = (pair[0] << 8) | pair[1]
| ~~~~^^^
| IndexError: list index out of range
|
| During handling of the above exception, another exception occurred:
|
| Traceback (most recent call last):
| File "/testbed/test.py", line 185, in test_detect_all_and_detect_one_should_agree
| raise RuntimeError(f"{result} != {results}") from exc
| ^^^^^^
| UnboundLocalError: cannot access local variable 'result' where it is not associated with a value
| Falsifying example: test_detect_all_and_detect_one_should_agree(
| txt='𐀀',
| enc='utf-8',
| _=HypothesisRandom(generated data),
| )
| Explanation:
| These lines were always and only run by failing examples:
| /testbed/chardet/charsetprober.py:13
| /testbed/chardet/codingstatemachine.py:33
| /testbed/chardet/universaldetector.py:129
+---------------- 2 ----------------
| Traceback (most recent call last):
| File "/testbed/test.py", line 181, in test_detect_all_and_detect_one_should_agree
| result = chardet.detect(data)
| ^^^^^^^^^^^^^^^^^^^^
| File "/testbed/chardet/__init__.py", line 39, in detect
| detector.feed(byte_str)
| File "/testbed/chardet/universaldetector.py", line 154, in feed
| if prober.feed(byte_str) == ProbingState.FOUND_IT:
| ^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/chardet/charsetgroupprober.py", line 35, in feed
| state = prober.feed(byte_str)
| ^^^^^^^^^^^^^^^^^^^^^
| File "/testbed/chardet/sbcharsetprober.py", line 52, in feed
| byte_str = self.filter_with_english_letters(byte_str)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| AttributeError: 'SingleByteCharSetProber' object has no attribute 'filter_with_english_letters'
|
| During handling of the above exception, another exception occurred:
|
| Traceback (most recent call last):
| File "/testbed/test.py", line 185, in test_detect_all_and_detect_one_should_agree
| raise RuntimeError(f"{result} != {results}") from exc
| ^^^^^^
| UnboundLocalError: cannot access local variable 'result' where it is not associated with a value
| Falsifying example: test_detect_all_and_detect_one_should_agree(
| txt='\x80',
| enc='utf-8',
| _=HypothesisRandom(generated data),
| )
| Explanation:
| These lines were always and only run by failing examples:
| /testbed/chardet/charsetprober.py:13
| /testbed/chardet/codingstatemachine.py:33
| /testbed/chardet/universaldetector.py:129
+------------------------------------
Patch diff
diff --git a/chardet/big5prober.py b/chardet/big5prober.py
index 51ab8fb..7c27716 100644
--- a/chardet/big5prober.py
+++ b/chardet/big5prober.py
@@ -1,5 +1,6 @@
from .chardistribution import Big5DistributionAnalysis
from .codingstatemachine import CodingStateMachine
+from .enums import ProbingState
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import BIG5_SM_MODEL
@@ -9,4 +10,16 @@ class Big5Prober(MultiByteCharSetProber):
super().__init__()
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
self.distribution_analyzer = Big5DistributionAnalysis()
- self.reset()
\ No newline at end of file
+ self.reset()
+
+ def reset(self):
+ super().reset()
+ self._state = ProbingState.DETECTING
+
+ @property
+ def charset_name(self):
+ return "Big5"
+
+ @property
+ def language(self):
+ return "Traditional Chinese"
\ No newline at end of file
diff --git a/chardet/chardistribution.py b/chardet/chardistribution.py
index 1b51bcd..9043d80 100644
--- a/chardet/chardistribution.py
+++ b/chardet/chardistribution.py
@@ -22,15 +22,39 @@ class CharDistributionAnalysis:
def reset(self):
"""reset analyser, clear any state"""
- pass
+ self._done = False
+ self._total_chars = 0
+ self._freq_chars = 0
def feed(self, char, char_len):
"""feed a character with known length"""
- pass
+ if char_len == 2:
+ # we only care about 2-bytes character in our distribution analysis
+ order = -1
+ if char[0] in self._char_to_freq_order:
+ order = self._char_to_freq_order[char[0]]
+ if order != -1 and order < self._table_size:
+ self._total_chars += 1
+ if order < 512:
+ self._freq_chars += 1
+
+ def got_enough_data(self):
+ # It is not necessary to receive all data to draw conclusion.
+ # For charset probers, certain amount of data is enough
+ return self._total_chars > self.ENOUGH_DATA_THRESHOLD
def get_confidence(self):
"""return confidence based on existing data"""
- pass
+ if self._total_chars <= 0 or self._freq_chars <= self.MINIMUM_DATA_THRESHOLD:
+ return self.SURE_NO
+
+ if self._total_chars != self._freq_chars:
+ r = self._freq_chars / ((self._total_chars - self._freq_chars) * self.typical_distribution_ratio)
+ if r < self.SURE_YES:
+ return r
+
+ # normalize confidence, (we don't want to be 100% sure)
+ return self.SURE_YES
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
diff --git a/chardet/charsetgroupprober.py b/chardet/charsetgroupprober.py
index db44415..f89bbbc 100644
--- a/chardet/charsetgroupprober.py
+++ b/chardet/charsetgroupprober.py
@@ -7,4 +7,57 @@ class CharSetGroupProber(CharSetProber):
super().__init__(lang_filter=lang_filter)
self._active_num = 0
self.probers = []
- self._best_guess_prober = None
\ No newline at end of file
+ self._best_guess_prober = None
+
+ def reset(self):
+ super().reset()
+ self._active_num = 0
+ for prober in self.probers:
+ if prober:
+ prober.reset()
+ prober.active = True
+ self._active_num += 1
+ self._best_guess_prober = None
+
+ def get_charset_name(self):
+ if not self._best_guess_prober:
+ self.get_confidence()
+ if not self._best_guess_prober:
+ return None
+ return self._best_guess_prober.get_charset_name()
+
+ def feed(self, byte_str):
+ for prober in self.probers:
+ if not prober:
+ continue
+ if not prober.active:
+ continue
+ state = prober.feed(byte_str)
+ if not state:
+ continue
+ if state == ProbingState.FOUND_IT:
+ self._best_guess_prober = prober
+ return self.state
+ elif state == ProbingState.NOT_ME:
+ prober.active = False
+ self._active_num -= 1
+ if self._active_num <= 0:
+ self._state = ProbingState.NOT_ME
+ return self.state
+ return self.state
+
+ def get_confidence(self):
+ st = 0.0
+ if not self._best_guess_prober:
+ for prober in self.probers:
+ if not prober:
+ continue
+ if not prober.active:
+ continue
+ cf = prober.get_confidence()
+ if cf > st:
+ st = cf
+ self._best_guess_prober = prober
+ if not self._best_guess_prober:
+ return 0.0
+ return st
\ No newline at end of file
diff --git a/chardet/charsetprober.py b/chardet/charsetprober.py
index 7f492d7..4b6f6ea 100644
--- a/chardet/charsetprober.py
+++ b/chardet/charsetprober.py
@@ -10,6 +10,7 @@ class CharSetProber:
self._state = None
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
+ self.active = True
@staticmethod
def filter_international_words(buf):
@@ -24,7 +25,33 @@ class CharSetProber:
are replaced by a single space ascii character.
This filter applies to all scripts which do not use English characters.
"""
- pass
+ filtered = bytearray()
+ in_word = False
+ prev_marker = True
+ for byte in buf:
+ # Get the byte value as an integer
+ byte_int = byte if isinstance(byte, int) else ord(byte)
+
+ # Check if it's an alphabet character
+ is_alpha = (byte_int >= 65 and byte_int <= 90) or (byte_int >= 97 and byte_int <= 122)
+ # Check if it's an international character
+ is_international = byte_int >= 0x80 and byte_int <= 0xFF
+
+ if is_alpha or is_international:
+ if prev_marker and not in_word:
+ in_word = True
+ if in_word:
+ filtered.append(byte_int)
+ else: # it's a marker
+ if in_word:
+ in_word = False
+ if not prev_marker:
+ filtered.append(32) # ASCII space
+ prev_marker = True
+ continue
+ prev_marker = False
+
+ return bytes(filtered)
@staticmethod
def remove_xml_tags(buf):
@@ -35,4 +62,58 @@ class CharSetProber:
characters and extended ASCII characters, but is currently only used by
``Latin1Prober``.
"""
- pass
\ No newline at end of file
+ filtered = bytearray()
+ in_tag = False
+ for byte in buf:
+ byte_int = byte if isinstance(byte, int) else ord(byte)
+
+ if byte_int == ord('<'):
+ in_tag = True
+ continue
+ elif byte_int == ord('>'):
+ in_tag = False
+ continue
+
+ if not in_tag:
+ filtered.append(byte_int)
+
+ return bytes(filtered)
+
+ def reset(self):
+ """
+ Reset the prober state to its initial value.
+ """
+ self._state = ProbingState.DETECTING
+
+ def feed(self, buf):
+ """
+ Feed a chunk of bytes to the prober and update its state.
+ """
+ raise NotImplementedError
+
+ def get_confidence(self):
+ """
+ Return confidence level of the prober.
+ """
+ raise NotImplementedError
+
+ @property
+ def charset_name(self):
+ """
+ Return the charset name detected by the prober.
+ """
+ raise NotImplementedError
+
+ @property
+ def state(self):
+ """
+ Return the state of the prober.
+ """
+ return self._state
+
+ @property
+ def language(self):
+ """
+ Return the language detected by the prober.
+ """
+ raise NotImplementedError
\ No newline at end of file
diff --git a/chardet/codingstatemachine.py b/chardet/codingstatemachine.py
index 14d1fa4..4aa404e 100644
--- a/chardet/codingstatemachine.py
+++ b/chardet/codingstatemachine.py
@@ -30,4 +30,42 @@ class CodingStateMachine:
self._curr_char_len = 0
self._curr_state = None
self.logger = logging.getLogger(__name__)
- self.reset()
\ No newline at end of file
+ self.reset()
+
+ def reset(self):
+ """
+ Reset the state machine to its initial state.
+ """
+ self._curr_state = MachineState.START
+ self._curr_byte_pos = 0
+ self._curr_char_len = 0
+
+ def next_state(self, c):
+ """
+ Process one byte at a time and return the new state.
+ """
+ # for each byte we get its class
+ byte_class = self._model['class_table'][c]
+ if byte_class == 'eError': # we represent error class as None
+ self._curr_state = MachineState.ERROR
+ return self._curr_state
+
+ # for each byte class we get a state transition table
+ if self._curr_state == MachineState.START:
+ self._curr_byte_pos = 0
+ self._curr_char_len = self._model['char_len_table'][byte_class]
+
+ # from byte's class and state_table, we get its next state
+ curr_state = self._curr_state * self._model['class_factor'] + byte_class
+ self._curr_state = self._model['state_table'][curr_state]
+
+ # we increment the byte position counter
+ self._curr_byte_pos += 1
+
+ return self._curr_state
+
+ def get_current_charlen(self):
+ """
+ Return the length of the current character being detected.
+ """
+ return self._curr_char_len
\ No newline at end of file
diff --git a/chardet/cp949prober.py b/chardet/cp949prober.py
index 1b272ad..0c4f482 100644
--- a/chardet/cp949prober.py
+++ b/chardet/cp949prober.py
@@ -1,5 +1,6 @@
from .chardistribution import EUCKRDistributionAnalysis
from .codingstatemachine import CodingStateMachine
+from .enums import ProbingState
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import CP949_SM_MODEL
@@ -9,4 +10,16 @@ class CP949Prober(MultiByteCharSetProber):
super().__init__()
self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
self.distribution_analyzer = EUCKRDistributionAnalysis()
- self.reset()
\ No newline at end of file
+ self.reset()
+
+ def reset(self):
+ super().reset()
+ self._state = ProbingState.DETECTING
+
+ @property
+ def charset_name(self):
+ return "CP949"
+
+ @property
+ def language(self):
+ return "Korean"
\ No newline at end of file
diff --git a/chardet/enums.py b/chardet/enums.py
index 0b0e575..1858080 100644
--- a/chardet/enums.py
+++ b/chardet/enums.py
@@ -54,7 +54,7 @@ class SequenceLikelihood:
@classmethod
def get_num_categories(cls):
""":returns: The number of likelihood categories in the enum."""
- pass
+ return 4 # NEGATIVE through POSITIVE
class CharacterCategory:
"""
diff --git a/chardet/eucjpprober.py b/chardet/eucjpprober.py
index cdbbf2f..7306a5f 100644
--- a/chardet/eucjpprober.py
+++ b/chardet/eucjpprober.py
@@ -12,4 +12,50 @@ class EUCJPProber(MultiByteCharSetProber):
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
self.distribution_analyzer = EUCJPDistributionAnalysis()
self.context_analyzer = EUCJPContextAnalysis()
- self.reset()
\ No newline at end of file
+ self.reset()
+
+ def reset(self):
+ super().reset()
+ self.context_analyzer.reset()
+ self._state = ProbingState.DETECTING
+
+ @property
+ def charset_name(self):
+ return self.context_analyzer.charset_name
+
+ @property
+ def language(self):
+ return "Japanese"
+
+ def feed(self, byte_str):
+ for i in range(len(byte_str)):
+ coding_state = self.coding_sm.next_state(byte_str[i])
+ if coding_state == MachineState.ERROR:
+ self._state = ProbingState.NOT_ME
+ break
+ elif coding_state == MachineState.ITS_ME:
+ self._state = ProbingState.FOUND_IT
+ break
+ elif coding_state == MachineState.START:
+ char_len = self.coding_sm.get_current_charlen()
+ if i == 0:
+ self._last_char[1] = byte_str[0]
+ self.context_analyzer.feed(self._last_char, char_len)
+ self.distribution_analyzer.feed(self._last_char, char_len)
+ else:
+ self.context_analyzer.feed(byte_str[i-1:i+1], char_len)
+ self.distribution_analyzer.feed(byte_str[i-1:i+1], char_len)
+
+ self._last_char[0] = byte_str[-1]
+
+ if self.state == ProbingState.DETECTING:
+ if self.context_analyzer.got_enough_data() and (
+ self.get_confidence() > self.SHORTCUT_THRESHOLD):
+ self._state = ProbingState.FOUND_IT
+
+ return self.state
+
+ def get_confidence(self):
+ context_conf = self.context_analyzer.get_confidence()
+ distrib_conf = self.distribution_analyzer.get_confidence()
+ return max(context_conf, distrib_conf)
\ No newline at end of file
diff --git a/chardet/euckrprober.py b/chardet/euckrprober.py
index 067b9e8..646aa7a 100644
--- a/chardet/euckrprober.py
+++ b/chardet/euckrprober.py
@@ -1,5 +1,6 @@
from .chardistribution import EUCKRDistributionAnalysis
from .codingstatemachine import CodingStateMachine
+from .enums import ProbingState
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import EUCKR_SM_MODEL
@@ -9,4 +10,16 @@ class EUCKRProber(MultiByteCharSetProber):
super().__init__()
self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
self.distribution_analyzer = EUCKRDistributionAnalysis()
- self.reset()
\ No newline at end of file
+ self.reset()
+
+ def reset(self):
+ super().reset()
+ self._state = ProbingState.DETECTING
+
+ @property
+ def charset_name(self):
+ return "EUC-KR"
+
+ @property
+ def language(self):
+ return "Korean"
\ No newline at end of file
diff --git a/chardet/euctwprober.py b/chardet/euctwprober.py
index 8fa06d8..7417fd1 100644
--- a/chardet/euctwprober.py
+++ b/chardet/euctwprober.py
@@ -1,5 +1,6 @@
from .chardistribution import EUCTWDistributionAnalysis
from .codingstatemachine import CodingStateMachine
+from .enums import ProbingState
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import EUCTW_SM_MODEL
@@ -9,4 +10,16 @@ class EUCTWProber(MultiByteCharSetProber):
super().__init__()
self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
self.distribution_analyzer = EUCTWDistributionAnalysis()
- self.reset()
\ No newline at end of file
+ self.reset()
+
+ def reset(self):
+ super().reset()
+ self._state = ProbingState.DETECTING
+
+ @property
+ def charset_name(self):
+ return "EUC-TW"
+
+ @property
+ def language(self):
+ return "Traditional Chinese"
\ No newline at end of file
diff --git a/chardet/gb2312prober.py b/chardet/gb2312prober.py
index a897d30..69373da 100644
--- a/chardet/gb2312prober.py
+++ b/chardet/gb2312prober.py
@@ -1,5 +1,6 @@
from .chardistribution import GB2312DistributionAnalysis
from .codingstatemachine import CodingStateMachine
+from .enums import ProbingState
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import GB2312_SM_MODEL
@@ -9,4 +10,16 @@ class GB2312Prober(MultiByteCharSetProber):
super().__init__()
self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
self.distribution_analyzer = GB2312DistributionAnalysis()
- self.reset()
\ No newline at end of file
+ self.reset()
+
+ def reset(self):
+ super().reset()
+ self._state = ProbingState.DETECTING
+
+ @property
+ def charset_name(self):
+ return "GB2312"
+
+ @property
+ def language(self):
+ return "Chinese"
\ No newline at end of file
diff --git a/chardet/hebrewprober.py b/chardet/hebrewprober.py
index b6262c5..57c7bd8 100644
--- a/chardet/hebrewprober.py
+++ b/chardet/hebrewprober.py
@@ -25,4 +25,107 @@ class HebrewProber(CharSetProber):
self._before_prev = None
self._logical_prober = None
self._visual_prober = None
- self.reset()
\ No newline at end of file
+ self.reset()
+
+ def reset(self):
+ self._final_char_logical_score = 0
+ self._final_char_visual_score = 0
+ self._prev = ' '
+ self._before_prev = ' '
+ self._logical_prober = None
+ self._visual_prober = None
+
+ def set_model_probers(self, logical_prober, visual_prober):
+ self._logical_prober = logical_prober
+ self._visual_prober = visual_prober
+
+ def is_final(self, c):
+ return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
+ self.FINAL_PE, self.FINAL_TSADI]
+
+ def is_non_final(self, c):
+ return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN,
+ self.NORMAL_PE, self.NORMAL_TSADI]
+
+ def feed(self, byte_str):
+ if self._state == ProbingState.NOT_ME:
+ return self._state
+
+ for c in byte_str:
+ if c >= 128:
+ # If we got a non-ascii character, check if it's a final or non-final letter
+ if self.is_final(c):
+ # If the previous character was a non-final letter, this is logical
+ if self._prev == ' ':
+ self._final_char_logical_score += 0
+ self._final_char_visual_score += 0
+ elif self.is_non_final(self._prev):
+ self._final_char_logical_score += 1
+ self._final_char_visual_score -= 1
+ else:
+ self._final_char_logical_score += 0
+ self._final_char_visual_score += 0
+ elif self.is_non_final(c):
+ # If the previous character was a final letter, this is visual
+ if self._prev == ' ':
+ self._final_char_logical_score += 0
+ self._final_char_visual_score += 0
+ elif self.is_final(self._prev):
+ self._final_char_logical_score -= 1
+ self._final_char_visual_score += 1
+ else:
+ self._final_char_logical_score += 0
+ self._final_char_visual_score += 0
+
+ self._before_prev = self._prev
+ self._prev = c
+
+ return self._state
+
+ def get_charset_name(self):
+ # If we have both probers and one is significantly more confident,
+ # use its charset name
+ finalsub = abs(self._final_char_logical_score - self._final_char_visual_score)
+ if finalsub >= self.MIN_FINAL_CHAR_DISTANCE:
+ if self._final_char_logical_score > self._final_char_visual_score:
+ return self.LOGICAL_HEBREW_NAME
+ return self.VISUAL_HEBREW_NAME
+
+ # If we don't have a clear winner, use the one with higher confidence
+ if self._logical_prober and self._visual_prober:
+ logical_conf = self._logical_prober.get_confidence()
+ visual_conf = self._visual_prober.get_confidence()
+ diff = abs(logical_conf - visual_conf)
+ if diff >= self.MIN_MODEL_DISTANCE:
+ if logical_conf > visual_conf:
+ return self.LOGICAL_HEBREW_NAME
+ return self.VISUAL_HEBREW_NAME
+
+ # Still no clear winner, return logical Hebrew by default
+ return self.LOGICAL_HEBREW_NAME
+
+ def get_state(self):
+ # Assume we're good unless both model probers say otherwise
+ if (self._logical_prober and self._visual_prober and
+ self._logical_prober.get_state() == ProbingState.NOT_ME and
+ self._visual_prober.get_state() == ProbingState.NOT_ME):
+ return ProbingState.NOT_ME
+ return ProbingState.DETECTING
+
+ def get_confidence(self):
+ # If we have a clear winner from final letters analysis, use that
+ finalsub = abs(self._final_char_logical_score - self._final_char_visual_score)
+ if finalsub >= self.MIN_FINAL_CHAR_DISTANCE:
+ return 0.95
+
+ # If we have both probers and one is significantly more confident,
+ # use its confidence
+ if self._logical_prober and self._visual_prober:
+ logical_conf = self._logical_prober.get_confidence()
+ visual_conf = self._visual_prober.get_confidence()
+ diff = abs(logical_conf - visual_conf)
+ if diff >= self.MIN_MODEL_DISTANCE:
+ return max(logical_conf, visual_conf)
+
+ # No clear winner, return a moderate confidence
+ return 0.5
\ No newline at end of file
diff --git a/chardet/johabprober.py b/chardet/johabprober.py
index caeafe1..1696da3 100644
--- a/chardet/johabprober.py
+++ b/chardet/johabprober.py
@@ -1,5 +1,6 @@
from .chardistribution import JOHABDistributionAnalysis
from .codingstatemachine import CodingStateMachine
+from .enums import ProbingState
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import JOHAB_SM_MODEL
@@ -9,4 +10,16 @@ class JOHABProber(MultiByteCharSetProber):
super().__init__()
self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL)
self.distribution_analyzer = JOHABDistributionAnalysis()
- self.reset()
\ No newline at end of file
+ self.reset()
+
+ def reset(self):
+ super().reset()
+ self._state = ProbingState.DETECTING
+
+ @property
+ def charset_name(self):
+ return "JOHAB"
+
+ @property
+ def language(self):
+ return "Korean"
\ No newline at end of file
diff --git a/chardet/jpcntx.py b/chardet/jpcntx.py
index 0652b56..7933219 100644
--- a/chardet/jpcntx.py
+++ b/chardet/jpcntx.py
@@ -15,11 +15,101 @@ class JapaneseContextAnalysis:
self._done = None
self.reset()
+ def reset(self):
+ """Reset the context analysis."""
+ self._total_rel = 0 # Total relative order
+ self._rel_sample = [0] * self.NUM_OF_CATEGORY # Category counters
+ self._need_to_skip_char_num = 0 # Number of characters to skip
+ self._last_char_order = self.DONT_KNOW # Last character's relative order
+ self._done = False # Done analyzing
+
+ def get_order(self, byte_str):
+ """Get the order of the byte string."""
+ return -1
+
+ def get_confidence(self):
+ """Return confidence based on existing data."""
+ if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
+ return 0.99
+ elif self._total_rel > 0:
+ return 0.75
+ return 0.0
+
+ def got_enough_data(self):
+ """Return true if we've received enough data."""
+ return self._done
+
+ def feed(self, byte_str, num_bytes):
+ """Feed a character with its byte length."""
+ if self._done:
+ return
+
+ # We only care about 2-bytes characters in our analysis
+ if num_bytes != 2:
+ return
+
+ # Skip half the input of less than 512 bytes
+ if self._total_rel < 512:
+ self._need_to_skip_char_num += 1
+ if self._need_to_skip_char_num % 2:
+ return
+
+ order = self.get_order(byte_str)
+ if order != self.DONT_KNOW:
+ self._total_rel += 1
+ if self._last_char_order != self.DONT_KNOW:
+ if self._total_rel > self.MAX_REL_THRESHOLD:
+ self._done = True
+ return
+ if order < self.NUM_OF_CATEGORY:
+ self._rel_sample[order] += 1
+ self._last_char_order = order
+
class SJISContextAnalysis(JapaneseContextAnalysis):
def __init__(self):
super().__init__()
self._charset_name = 'SHIFT_JIS'
+ def get_order(self, byte_str):
+ if not byte_str:
+ return -1
+ # find out current char's byte length
+ first_char = byte_str[0]
+ if (0x81 <= first_char <= 0x9F or 0xE0 <= first_char <= 0xFC):
+ char_len = 2
+ if len(byte_str) < char_len:
+ return -1
+ order = jp2_char_context[first_char - 0x81]
+ else:
+ char_len = 1
+ if first_char < 0x80:
+ return -1
+ order = jp2_char_context[first_char - 0xA1]
+ return order
+
class EUCJPContextAnalysis(JapaneseContextAnalysis):
- pass
\ No newline at end of file
+ def __init__(self):
+ super().__init__()
+ self._charset_name = 'EUC-JP'
+
+ def get_order(self, byte_str):
+ if not byte_str:
+ return -1
+ # find out current char's byte length
+ first_char = byte_str[0]
+ if first_char == 0x8E or first_char == 0x8F:
+ char_len = 2
+ if len(byte_str) < char_len:
+ return -1
+ if first_char == 0x8F:
+ char_len = 3
+ if len(byte_str) < char_len:
+ return -1
+ return -1
+ else:
+ char_len = 1
+ if first_char < 0xA1:
+ return -1
+ order = jp2_char_context[first_char - 0xA1]
+ return order
\ No newline at end of file
diff --git a/chardet/mbcharsetprober.py b/chardet/mbcharsetprober.py
index 28d62fe..9a98f22 100644
--- a/chardet/mbcharsetprober.py
+++ b/chardet/mbcharsetprober.py
@@ -10,4 +10,42 @@ class MultiByteCharSetProber(CharSetProber):
super().__init__(lang_filter=lang_filter)
self.distribution_analyzer = None
self.coding_sm = None
- self._last_char = [0, 0]
\ No newline at end of file
+ self._last_char = [0, 0]
+
+ def reset(self):
+ super().reset()
+ if self.coding_sm:
+ self.coding_sm.reset()
+ if self.distribution_analyzer:
+ self.distribution_analyzer.reset()
+ self._last_char = [0, 0]
+ self._state = ProbingState.DETECTING
+
+ def feed(self, byte_str):
+ for i in range(len(byte_str)):
+ coding_state = self.coding_sm.next_state(byte_str[i])
+ if coding_state == MachineState.ERROR:
+ self._state = ProbingState.NOT_ME
+ break
+ elif coding_state == MachineState.ITS_ME:
+ self._state = ProbingState.FOUND_IT
+ break
+ elif coding_state == MachineState.START:
+ char_len = self.coding_sm.get_current_charlen()
+ if i == 0:
+ self._last_char[1] = byte_str[0]
+ self.distribution_analyzer.feed(self._last_char, char_len)
+ else:
+ self.distribution_analyzer.feed(byte_str[i-1:i+1], char_len)
+
+ self._last_char[0] = byte_str[-1]
+
+ if self.state == ProbingState.DETECTING:
+ if self.distribution_analyzer.got_enough_data() and (
+ self.get_confidence() > self.SHORTCUT_THRESHOLD):
+ self._state = ProbingState.FOUND_IT
+
+ return self.state
+
+ def get_confidence(self):
+ return self.distribution_analyzer.get_confidence()
\ No newline at end of file
diff --git a/chardet/sbcharsetprober.py b/chardet/sbcharsetprober.py
index 003f325..5068a27 100644
--- a/chardet/sbcharsetprober.py
+++ b/chardet/sbcharsetprober.py
@@ -20,4 +20,81 @@ class SingleByteCharSetProber(CharSetProber):
self._total_char = None
self._control_char = None
self._freq_char = None
- self.reset()
\ No newline at end of file
+ self.reset()
+
+ def reset(self):
+ super().reset()
+ self._last_order = 255
+ self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
+ self._total_seqs = 0
+ self._total_char = 0
+ self._control_char = 0
+ self._freq_char = 0
+
+ def get_charset_name(self):
+ if self._name_prober:
+ return self._name_prober.get_charset_name()
+ return self._model.charset_name
+
+ @property
+ def charset_name(self):
+ return self._model.charset_name
+
+ @property
+ def language(self):
+ return self._model.language
+
+ def feed(self, byte_str):
+ if not self._model.keep_ascii_letters:
+ byte_str = self.filter_international_words(byte_str)
+ if not byte_str:
+ return self.state
+ byte_str = self.filter_with_english_letters(byte_str)
+ if not byte_str:
+ return self.state
+
+ char_len = len(byte_str)
+ if char_len > 0:
+ if not self._model.char_to_order_map or not self._model.language_model:
+ self._state = ProbingState.NOT_ME
+ return self.state
+
+ for i, c in enumerate(byte_str):
+ order = self._model.char_to_order_map.get(c, CharacterCategory.UNDEFINED)
+ if order < CharacterCategory.CONTROL:
+ self._control_char += 1
+ elif order == CharacterCategory.SAME_CLASS_WORD:
+ self._freq_char += 1
+
+ if order < len(self._model.language_model):
+ if i > 0:
+ last_order = self._last_order
+ if last_order < len(self._model.language_model):
+ self._total_seqs += 1
+ if not self._reversed:
+ lm_cat = self._model.language_model[last_order][order]
+ self._seq_counters[lm_cat] += 1
+ else:
+ lm_cat = self._model.language_model[order][last_order]
+ self._seq_counters[lm_cat] += 1
+ self._last_order = order
+
+ charset_name = self.charset_name
+ if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
+ cf = self.get_confidence()
+ if cf > self.POSITIVE_SHORTCUT_THRESHOLD:
+ self._state = ProbingState.FOUND_IT
+ elif cf < self.NEGATIVE_SHORTCUT_THRESHOLD:
+ self._state = ProbingState.NOT_ME
+
+ return self.state
+
+ def get_confidence(self):
+ r = 0.01
+ if self._total_seqs > 0:
+ r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) / self._total_seqs
+ / self._model.typical_positive_ratio)
+ r = r * (self._total_seqs / self.SAMPLE_SIZE)
+ if r >= 1.0:
+ r = 0.99
+ return r
\ No newline at end of file
diff --git a/chardet/sjisprober.py b/chardet/sjisprober.py
index fe26d49..3e8f3a4 100644
--- a/chardet/sjisprober.py
+++ b/chardet/sjisprober.py
@@ -12,4 +12,50 @@ class SJISProber(MultiByteCharSetProber):
self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
self.distribution_analyzer = SJISDistributionAnalysis()
self.context_analyzer = SJISContextAnalysis()
- self.reset()
\ No newline at end of file
+ self.reset()
+
+ def reset(self):
+ super().reset()
+ self.context_analyzer.reset()
+ self._state = ProbingState.DETECTING
+
+ @property
+ def charset_name(self):
+ return self.context_analyzer.charset_name
+
+ @property
+ def language(self):
+ return "Japanese"
+
+ def feed(self, byte_str):
+ for i in range(len(byte_str)):
+ coding_state = self.coding_sm.next_state(byte_str[i])
+ if coding_state == MachineState.ERROR:
+ self._state = ProbingState.NOT_ME
+ break
+ elif coding_state == MachineState.ITS_ME:
+ self._state = ProbingState.FOUND_IT
+ break
+ elif coding_state == MachineState.START:
+ char_len = self.coding_sm.get_current_charlen()
+ if i == 0:
+ self._last_char[1] = byte_str[0]
+ self.context_analyzer.feed(self._last_char, char_len)
+ self.distribution_analyzer.feed(self._last_char, char_len)
+ else:
+ self.context_analyzer.feed(byte_str[i-1:i+1], char_len)
+ self.distribution_analyzer.feed(byte_str[i-1:i+1], char_len)
+
+ self._last_char[0] = byte_str[-1]
+
+ if self.state == ProbingState.DETECTING:
+ if self.context_analyzer.got_enough_data() and (
+ self.get_confidence() > self.SHORTCUT_THRESHOLD):
+ self._state = ProbingState.FOUND_IT
+
+ return self.state
+
+ def get_confidence(self):
+ context_conf = self.context_analyzer.get_confidence()
+ distrib_conf = self.distribution_analyzer.get_confidence()
+ return max(context_conf, distrib_conf)
\ No newline at end of file
diff --git a/chardet/universaldetector.py b/chardet/universaldetector.py
index a0351fc..18d7631 100644
--- a/chardet/universaldetector.py
+++ b/chardet/universaldetector.py
@@ -54,13 +54,31 @@ class UniversalDetector:
self._has_win_bytes = None
self.reset()
+ @property
+ def input_state(self):
+ return self._input_state
+
def reset(self):
"""
Reset the UniversalDetector and all of its probers back to their
initial states. This is called by ``__init__``, so you only need to
call this directly in between analyses of different documents.
"""
- pass
+ self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
+ self.done = False
+ self._got_data = False
+ self._has_win_bytes = False
+ self._input_state = InputState.PURE_ASCII
+ self._last_char = None
+ if self._esc_charset_prober:
+ self._esc_charset_prober.reset()
+ if self._utf1632_prober:
+ self._utf1632_prober.reset()
+ for prober in self._charset_probers:
+ prober.reset()
+ self._esc_charset_prober = None
+ self._utf1632_prober = None
+ self._charset_probers = []
def feed(self, byte_str):
"""
@@ -76,7 +94,74 @@ class UniversalDetector:
You should always call ``close`` when you're done feeding in your
document if ``done`` is not already ``True``.
"""
- pass
+ if self.done:
+ return
+
+ if not len(byte_str):
+ return
+
+ if not self._got_data:
+ self._got_data = True
+ if byte_str.startswith(codecs.BOM_UTF8):
+ self.result = {'encoding': 'UTF-8-SIG', 'confidence': 1.0, 'language': ''}
+ self.done = True
+ return
+ if byte_str.startswith(codecs.BOM_UTF32_LE):
+ self.result = {'encoding': 'UTF-32', 'confidence': 1.0, 'language': ''}
+ self.done = True
+ return
+ if byte_str.startswith(codecs.BOM_UTF32_BE):
+ self.result = {'encoding': 'UTF-32', 'confidence': 1.0, 'language': ''}
+ self.done = True
+ return
+ if byte_str.startswith(codecs.BOM_UTF16_LE):
+ self.result = {'encoding': 'UTF-16', 'confidence': 1.0, 'language': ''}
+ self.done = True
+ return
+ if byte_str.startswith(codecs.BOM_UTF16_BE):
+ self.result = {'encoding': 'UTF-16', 'confidence': 1.0, 'language': ''}
+ self.done = True
+ return
+
+ # If none of the above BOMs matched and we see a high byte
+ if self._input_state == InputState.PURE_ASCII:
+ if self.HIGH_BYTE_DETECTOR.search(byte_str):
+ self._input_state = InputState.HIGH_BYTE
+ elif self.ESC_DETECTOR.search(byte_str):
+ self._input_state = InputState.ESC_ASCII
+
+ self._last_char = byte_str[-1:]
+
+ if self._input_state == InputState.ESC_ASCII:
+ if not self._esc_charset_prober:
+ self._esc_charset_prober = EscCharSetProber()
+ if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
+ self.result = {'encoding': self._esc_charset_prober.charset_name,
+ 'confidence': self._esc_charset_prober.get_confidence(),
+ 'language': self._esc_charset_prober.language}
+ self.done = True
+ elif self._input_state == InputState.HIGH_BYTE:
+ if not self._utf1632_prober:
+ self._utf1632_prober = UTF1632Prober()
+ if not self._charset_probers:
+ self._charset_probers = [MBCSGroupProber(self.lang_filter),
+ SBCSGroupProber(),
+ Latin1Prober()]
+ if self.WIN_BYTE_DETECTOR.search(byte_str):
+ self._has_win_bytes = True
+
+ for prober in [self._utf1632_prober] + self._charset_probers:
+ if prober.feed(byte_str) == ProbingState.FOUND_IT:
+ charset_name = prober.charset_name
+ if charset_name.startswith('UTF-16'):
+ charset_name = 'UTF-16'
+ elif charset_name.startswith('UTF-32'):
+ charset_name = 'UTF-32'
+ self.result = {'encoding': charset_name,
+ 'confidence': prober.get_confidence(),
+ 'language': prober.language}
+ self.done = True
+ break
def close(self):
"""
@@ -86,4 +171,56 @@ class UniversalDetector:
:returns: The ``result`` attribute, a ``dict`` with the keys
`encoding`, `confidence`, and `language`.
"""
- pass
\ No newline at end of file
+ if self.done:
+ return self.result
+
+ if not self._got_data:
+ self.logger.debug('no data received!')
+ return self.result
+
+ if self._input_state == InputState.PURE_ASCII:
+ self.result = {'encoding': 'ascii',
+ 'confidence': 1.0,
+ 'language': ''}
+ return self.result
+
+ if self._input_state == InputState.HIGH_BYTE:
+ probers = [self._utf1632_prober] if self._utf1632_prober else []
+ probers.extend(self._charset_probers)
+ max_prober = None
+ max_confidence = 0.0
+ for prober in probers:
+ if not prober:
+ continue
+ prober.close()
+ confidence = prober.get_confidence()
+ if confidence > max_confidence:
+ max_confidence = confidence
+ max_prober = prober
+
+ if max_prober and max_confidence > self.MINIMUM_THRESHOLD:
+ charset_name = max_prober.charset_name
+ lower_charset_name = charset_name.lower()
+ confidence = max_prober.get_confidence()
+ # Use Windows encoding name instead of ISO
+ if lower_charset_name in self.ISO_WIN_MAP and self._has_win_bytes:
+ charset_name = self.ISO_WIN_MAP[lower_charset_name]
+ confidence = confidence * 0.9 # Penalize for using Windows charset
+ # Normalize UTF-16/32 names
+ if lower_charset_name.startswith('utf-16'):
+ charset_name = 'UTF-16'
+ elif lower_charset_name.startswith('utf-32'):
+ charset_name = 'UTF-32'
+ self.result = {'encoding': charset_name,
+ 'confidence': confidence,
+ 'language': max_prober.language}
+
+ if self._input_state == InputState.ESC_ASCII and self._esc_charset_prober:
+ self._esc_charset_prober.close()
+ confidence = self._esc_charset_prober.get_confidence()
+ if confidence > self.MINIMUM_THRESHOLD:
+ self.result = {'encoding': self._esc_charset_prober.charset_name,
+ 'confidence': confidence,
+ 'language': self._esc_charset_prober.language}
+
+ return self.result
\ No newline at end of file
diff --git a/chardet/utf1632prober.py b/chardet/utf1632prober.py
index be3cac6..8716059 100644
--- a/chardet/utf1632prober.py
+++ b/chardet/utf1632prober.py
@@ -36,7 +36,12 @@ class UTF1632Prober(CharSetProber):
https://en.wikipedia.org/wiki/UTF-32
"""
- pass
+ value = (quad[0] << 24) | (quad[1] << 16) | (quad[2] << 8) | quad[3]
+ if value > 0x0010FFFF:
+ return False
+ if 0xD800 <= value <= 0xDFFF:
+ return False
+ return True
def validate_utf16_characters(self, pair):
"""
@@ -48,4 +53,121 @@ class UTF1632Prober(CharSetProber):
https://en.wikipedia.org/wiki/UTF-16
"""
- pass
\ No newline at end of file
+ value = (pair[0] << 8) | pair[1]
+ if 0xD800 <= value <= 0xDBFF:
+ return True # First half of surrogate pair
+ if 0xDC00 <= value <= 0xDFFF:
+ return True # Second half of surrogate pair
+ if value >= 0xD800 and value <= 0xDFFF:
+ return False # Invalid surrogate value
+ return True
+
+ def reset(self):
+ """
+ Reset the prober to its initial state.
+ """
+ super().reset()
+ self.position = 0
+ self.zeros_at_mod = [0] * 4
+ self.nonzeros_at_mod = [0] * 4
+ self._state = ProbingState.DETECTING
+ self.quad = [0, 0, 0, 0]
+ self.invalid_utf16be = False
+ self.invalid_utf16le = False
+ self.invalid_utf32be = False
+ self.invalid_utf32le = False
+ self.first_half_surrogate_pair_detected_16be = False
+ self.first_half_surrogate_pair_detected_16le = False
+ self._charset_name = None
+
+ def feed(self, byte_str):
+ """
+ Feed a chunk of bytes to the prober and update its state.
+ """
+ if self._state == ProbingState.NOT_ME:
+ return self._state
+
+ for byte in byte_str:
+ self.quad[self.position % 4] = byte
+ if byte == 0:
+ self.zeros_at_mod[self.position % 4] += 1
+ else:
+ self.nonzeros_at_mod[self.position % 4] += 1
+
+ if self.position % 4 == 3: # We have a complete quad
+ # Check UTF-32BE
+ if not self.invalid_utf32be:
+ if not self.validate_utf32_characters(self.quad):
+ self.invalid_utf32be = True
+
+ # Check UTF-32LE
+ quad_le = self.quad[::-1] # Reverse the quad for LE
+ if not self.invalid_utf32le:
+ if not self.validate_utf32_characters(quad_le):
+ self.invalid_utf32le = True
+
+ if self.position % 2 == 1: # We have a complete pair
+ # Check UTF-16BE
+ if not self.invalid_utf16be:
+ pair_be = self.quad[(self.position - 1) % 4:(self.position + 1) % 4]
+ if not self.validate_utf16_characters(pair_be):
+ self.invalid_utf16be = True
+
+ # Check UTF-16LE
+ if not self.invalid_utf16le:
+ pair_le = self.quad[(self.position - 1) % 4:(self.position + 1) % 4][::-1]
+ if not self.validate_utf16_characters(pair_le):
+ self.invalid_utf16le = True
+
+ self.position += 1
+
+ # Early detection if we have enough data
+ if self.position >= self.MIN_CHARS_FOR_DETECTION:
+ # Check UTF-32BE pattern
+ if (self.zeros_at_mod[0] > 0 and self.zeros_at_mod[1] > 0 and
+ self.zeros_at_mod[2] > 0 and not self.invalid_utf32be):
+ ratio = min(self.zeros_at_mod[0:3]) / (self.position / 4)
+ if ratio > self.EXPECTED_RATIO:
+ self._charset_name = "UTF-32BE"
+ self._state = ProbingState.FOUND_IT
+ return self._state
+
+ # Check UTF-32LE pattern
+ if (self.zeros_at_mod[1] > 0 and self.zeros_at_mod[2] > 0 and
+ self.zeros_at_mod[3] > 0 and not self.invalid_utf32le):
+ ratio = min(self.zeros_at_mod[1:4]) / (self.position / 4)
+ if ratio > self.EXPECTED_RATIO:
+ self._charset_name = "UTF-32LE"
+ self._state = ProbingState.FOUND_IT
+ return self._state
+
+ # Check UTF-16BE pattern
+ if self.zeros_at_mod[0] > 0 and not self.invalid_utf16be:
+ ratio = self.zeros_at_mod[0] / (self.position / 2)
+ if ratio > self.EXPECTED_RATIO:
+ self._charset_name = "UTF-16BE"
+ self._state = ProbingState.FOUND_IT
+ return self._state
+
+ # Check UTF-16LE pattern
+ if self.zeros_at_mod[1] > 0 and not self.invalid_utf16le:
+ ratio = self.zeros_at_mod[1] / (self.position / 2)
+ if ratio > self.EXPECTED_RATIO:
+ self._charset_name = "UTF-16LE"
+ self._state = ProbingState.FOUND_IT
+ return self._state
+
+ return self._state
+
+ @property
+ def charset_name(self):
+ return self._charset_name
+
+ @property
+ def language(self):
+ return ""
+
+ def get_confidence(self):
+ if self._state == ProbingState.FOUND_IT:
+ return 0.99
+ return 0.0
\ No newline at end of file
diff --git a/chardet/utf8prober.py b/chardet/utf8prober.py
index fb6f22f..41e3bc3 100644
--- a/chardet/utf8prober.py
+++ b/chardet/utf8prober.py
@@ -10,4 +10,45 @@ class UTF8Prober(CharSetProber):
super().__init__()
self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
self._num_mb_chars = None
- self.reset()
\ No newline at end of file
+ self.reset()
+
+ def reset(self):
+ super().reset()
+ self.coding_sm.reset()
+ self._num_mb_chars = 0
+
+ @property
+ def charset_name(self):
+ return "utf-8"
+
+ @property
+ def language(self):
+ return ""
+
+ def feed(self, byte_str):
+ for c in byte_str:
+ coding_state = self.coding_sm.next_state(c)
+ if coding_state == MachineState.ERROR:
+ self._state = ProbingState.NOT_ME
+ break
+ elif coding_state == MachineState.ITS_ME:
+ self._state = ProbingState.FOUND_IT
+ break
+ elif coding_state == MachineState.START:
+ char_len = self.coding_sm.get_current_charlen()
+ if char_len >= 2:
+ self._num_mb_chars += 1
+
+ if self.state == ProbingState.DETECTING:
+ if self.get_confidence() > self.SHORTCUT_THRESHOLD:
+ self._state = ProbingState.FOUND_IT
+
+ return self.state
+
+ def get_confidence(self):
+ unlike = 0.99
+ if self._num_mb_chars < 6:
+ for i in range(0, self._num_mb_chars):
+ unlike = unlike * self.ONE_CHAR_PROB
+ return 1.0 - unlike
+ return unlike
\ No newline at end of file