| from __future__ import absolute_import, division, unicode_literals |
| |
| import os |
| import unittest |
| |
| try: |
| unittest.TestCase.assertEqual |
| except AttributeError: |
| unittest.TestCase.assertEqual = unittest.TestCase.assertEquals |
| |
| from .support import get_data_files, TestData, test_dir, errorMessage |
| from html5lib import HTMLParser, inputstream |
| |
| |
| class Html5EncodingTestCase(unittest.TestCase): |
| def test_codec_name_a(self): |
| self.assertEqual(inputstream.codecName("utf-8"), "utf-8") |
| |
| def test_codec_name_b(self): |
| self.assertEqual(inputstream.codecName("utf8"), "utf-8") |
| |
| def test_codec_name_c(self): |
| self.assertEqual(inputstream.codecName(" utf8 "), "utf-8") |
| |
| def test_codec_name_d(self): |
| self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252") |
| |
| |
| def runParserEncodingTest(data, encoding): |
| p = HTMLParser() |
| assert p.documentEncoding is None |
| p.parse(data, useChardet=False) |
| encoding = encoding.lower().decode("ascii") |
| |
| assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding) |
| |
| |
| def runPreScanEncodingTest(data, encoding): |
| stream = inputstream.HTMLBinaryInputStream(data, chardet=False) |
| encoding = encoding.lower().decode("ascii") |
| |
| # Very crude way to ignore irrelevant tests |
| if len(data) > stream.numBytesMeta: |
| return |
| |
| assert encoding == stream.charEncoding[0], errorMessage(data, encoding, stream.charEncoding[0]) |
| |
| |
| def test_encoding(): |
| for filename in get_data_files("encoding"): |
| tests = TestData(filename, b"data", encoding=None) |
| for idx, test in enumerate(tests): |
| yield (runParserEncodingTest, test[b'data'], test[b'encoding']) |
| yield (runPreScanEncodingTest, test[b'data'], test[b'encoding']) |
| |
| try: |
| try: |
| import charade # flake8: noqa |
| except ImportError: |
| import chardet # flake8: noqa |
| except ImportError: |
| print("charade/chardet not found, skipping chardet tests") |
| else: |
| def test_chardet(): |
| with open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb") as fp: |
| encoding = inputstream.HTMLInputStream(fp.read()).charEncoding |
| assert encoding[0].lower() == "big5" |