third_party/web_platform_tests/tools/html5lib/html5lib/inputstream.py - cobalt - Git at Google

 from __future__ import absolute_import, division, unicode_literals
 from six import text_type
 from six.moves import http_client

 import codecs
 import re

 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
 from .constants import encodings, ReparseException
 from . import utils

 from io import StringIO

 try:
     from io import BytesIO
 except ImportError:
     BytesIO = StringIO

 try:
     from io import BufferedIOBase
 except ImportError:
     class BufferedIOBase(object):
         pass

 # Byte-string counterparts of the unicode character-class constants; the
 # encoding pre-parser operates on raw bytes rather than decoded text.
 spaceCharactersBytes = frozenset(ch.encode("ascii") for ch in spaceCharacters)
 asciiLettersBytes = frozenset(ch.encode("ascii") for ch in asciiLetters)
 asciiUppercaseBytes = frozenset(ch.encode("ascii") for ch in asciiUppercase)
 spacesAngleBrackets = frozenset([b">", b"<"]) | spaceCharactersBytes

 # Matches every single character that the input streams report as an
 # "invalid-codepoint" error.
 invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")

 # The code points of the class above that lie outside the BMP: the last two
 # code points (xFFFE and xFFFF) of each of the planes 1 through 16.
 non_bmp_invalid_codepoints = set(
     (plane << 16) | low
     for plane in range(0x01, 0x11)
     for low in (0xFFFE, 0xFFFF))

 # ASCII whitespace and punctuation; codecName() strips these from an
 # encoding label before looking it up.
 ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")

 # Cache for charsUntil(): maps (characters, opposite) to a compiled regexp
 charsUntilRegEx = {}


 class BufferedStream(object):
     """Buffering for streams that do not have buffering of their own

     Wraps an object that only offers ``read(n)`` and remembers every chunk
     read from it, so that callers can ``seek()`` back to any offset that has
     already been read and read the same data again.

     The buffer is implemented as a list of chunks on the assumption that
     joining many strings will be slow since it is O(n**2)
     """

     def __init__(self, stream):
         self.stream = stream
         self.buffer = []
         # The chunk number stays -1 until the first chunk has been read.
         self.position = [-1, 0]  # chunk number, offset

     def tell(self):
         """Return the current offset, counted from the first byte read."""
         chunkIndex, chunkOffset = self.position
         # While nothing has been read, chunkIndex is -1 and the buffer is
         # empty, so the slice is empty and the result is 0.
         return sum(len(chunk) for chunk in self.buffer[:chunkIndex]) + chunkOffset

     def seek(self, pos):
         """Move to offset ``pos``, which must lie within the buffered data."""
         assert pos <= self._bufferedBytes()
         if pos == 0 and not self.buffer:
             # Nothing has been read yet, so we are already at offset 0.
             # Without this guard the loop below would index an empty list.
             self.position = [-1, 0]
             return
         offset = pos
         i = 0
         while len(self.buffer[i]) < offset:
             offset -= len(self.buffer[i])
             i += 1
         self.position = [i, offset]

     def read(self, bytes):
         """Return up to ``bytes`` bytes, replaying buffered data first."""
         if not self.buffer:
             return self._readStream(bytes)
         # _readFromBuffer() itself falls through to the underlying stream
         # once the buffered data is exhausted, so no separate
         # "positioned at the very end" case is needed here.
         return self._readFromBuffer(bytes)

     def _bufferedBytes(self):
         """Return the total number of bytes held in the buffer."""
         return sum(len(item) for item in self.buffer)

     def _readStream(self, bytes):
         """Read a new chunk from the wrapped stream and append it."""
         data = self.stream.read(bytes)
         self.buffer.append(data)
         # The new chunk becomes the current one, fully consumed.
         self.position[0] += 1
         self.position[1] = len(data)
         return data

     def _readFromBuffer(self, bytes):
         """Serve a read from buffered chunks, then from the stream if short."""
         remainingBytes = bytes
         rv = []
         bufferIndex, bufferOffset = self.position
         while bufferIndex < len(self.buffer) and remainingBytes != 0:
             assert remainingBytes > 0
             bufferedData = self.buffer[bufferIndex]
             available = len(bufferedData) - bufferOffset

             if remainingBytes <= available:
                 # The request ends inside this chunk.
                 bytesToRead = remainingBytes
                 self.position = [bufferIndex, bufferOffset + bytesToRead]
             else:
                 # Take the rest of this chunk and carry on with the next.
                 bytesToRead = available
                 self.position = [bufferIndex, len(bufferedData)]
                 bufferIndex += 1
             rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
             remainingBytes -= bytesToRead

             # Every chunk after the first is read from its beginning.
             bufferOffset = 0

         if remainingBytes:
             rv.append(self._readStream(remainingBytes))

         return b"".join(rv)


 def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
     """Build the input stream class that matches the type of ``source``.

     Unicode text (a string, or a file-like object whose ``read`` returns
     text) yields an HTMLUnicodeInputStream; anything else yields an
     HTMLBinaryInputStream, which receives ``encoding``, ``parseMeta`` and
     ``chardet``.  Passing ``encoding`` together with unicode text raises
     TypeError.
     """
     if isinstance(source, http_client.HTTPResponse):
         # Work around Python bug #20007: read(0) closes the connection.
         # http://bugs.python.org/issue20007
         sourceIsText = False
     elif hasattr(source, "read"):
         # A zero-length read reveals the stream's type without consuming it.
         sourceIsText = isinstance(source.read(0), text_type)
     else:
         sourceIsText = isinstance(source, text_type)

     if not sourceIsText:
         return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)

     if encoding is not None:
         raise TypeError("Cannot explicitly set an encoding with a unicode string")
     return HTMLUnicodeInputStream(source)


 class HTMLUnicodeInputStream(object):
     """Provides a unicode stream of characters to the HTMLTokenizer.

     The source must already be unicode text.  This class reads it in
     chunks, replaces lone surrogates with U+FFFD, normalises CR and CR LF
     to LF, records "invalid-codepoint" errors and provides column and line
     tracking.

     """

     _defaultChunkSize = 10240

     def __init__(self, source):
         """Initialises the HTMLUnicodeInputStream.

         source can be either a file-like object (anything with a ``read``
         method) or a unicode string.

         """

         # Craziness
         # When an astral character is a single code unit (a "UCS4" build)
         # one regexp pass suffices; otherwise (a "UCS2" build) it is a
         # surrogate pair, and both the error reporting and the
         # lone-surrogate regexp have to account for valid pairs.
         if len("\U0010FFFF") == 1:
             self.reportCharacterErrors = self.characterErrorsUCS4
             self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
         else:
             self.reportCharacterErrors = self.characterErrorsUCS2
             self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")

         # List of where new lines occur
         self.newLines = [0]

         self.charEncoding = ("utf-8", "certain")
         self.dataStream = self.openStream(source)

         self.reset()

     def reset(self):
         """Clear the current chunk, the position bookkeeping and the errors.

         The underlying data stream is not touched.
         """
         self.chunk = ""
         self.chunkSize = 0
         self.chunkOffset = 0
         self.errors = []

         # number of (complete) lines in previous chunks
         self.prevNumLines = 0
         # number of columns in the last line of the previous chunk
         self.prevNumCols = 0

         # Deal with CR LF and surrogates split over chunk boundaries
         self._bufferedCharacter = None

     def openStream(self, source):
         """Produces a file object from source.

         source can be either a file-like object, which is returned as is,
         or a unicode string, which is wrapped in a StringIO.

         """
         # Already a file object
         if hasattr(source, 'read'):
             return source
         return StringIO(source)

     def _position(self, offset):
         """Return the 0-based (line, column) of ``offset`` in the current
         chunk, counting the lines and columns of earlier chunks as well."""
         chunk = self.chunk
         nLines = chunk.count('\n', 0, offset)
         positionLine = self.prevNumLines + nLines
         lastLinePos = chunk.rfind('\n', 0, offset)
         if lastLinePos == -1:
             # Still on the line the previous chunk ended on.
             positionColumn = self.prevNumCols + offset
         else:
             positionColumn = offset - (lastLinePos + 1)
         return (positionLine, positionColumn)

     def position(self):
         """Returns (line, col) of the current position in the stream."""
         line, col = self._position(self.chunkOffset)
         return (line + 1, col)

     def char(self):
         """ Read one character from the stream or queue if available. Return
             EOF when EOF is reached.
         """
         # Read a new chunk from the input stream if necessary
         if self.chunkOffset >= self.chunkSize:
             if not self.readChunk():
                 return EOF

         chunkOffset = self.chunkOffset
         char = self.chunk[chunkOffset]
         self.chunkOffset = chunkOffset + 1

         return char

     def readChunk(self, chunkSize=None):
         """Load the next chunk of normalised text into ``self.chunk``.

         Returns False when the stream is exhausted, True otherwise.
         """
         if chunkSize is None:
             chunkSize = self._defaultChunkSize

         # Fold the chunk we are leaving into the running line/column totals.
         self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

         self.chunk = ""
         self.chunkSize = 0
         self.chunkOffset = 0

         data = self.dataStream.read(chunkSize)

         # Deal with CR LF and surrogates broken across chunks
         if self._bufferedCharacter:
             data = self._bufferedCharacter + data
             self._bufferedCharacter = None
         elif not data:
             # We have no more data, bye-bye stream
             return False

         if len(data) > 1:
             # A trailing CR or high surrogate may be completed by the first
             # character of the next chunk, so hold it back until then.
             lastv = ord(data[-1])
             if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                 self._bufferedCharacter = data[-1]
                 data = data[:-1]

         self.reportCharacterErrors(data)

         # Replace invalid characters
         # Note U+0000 is dealt with in the tokenizer
         data = self.replaceCharactersRegexp.sub("\ufffd", data)

         data = data.replace("\r\n", "\n")
         data = data.replace("\r", "\n")

         self.chunk = data
         self.chunkSize = len(data)

         return True

     def characterErrorsUCS4(self, data):
         """Record one "invalid-codepoint" error per invalid character."""
         invalidCount = len(invalid_unicode_re.findall(data))
         self.errors.extend(["invalid-codepoint"] * invalidCount)

     def characterErrorsUCS2(self, data):
         """Record "invalid-codepoint" errors when astral characters are
         stored as surrogate pairs.

         A valid surrogate pair is decoded and only reported when it encodes
         one of the non-BMP invalid code points; everything else the regexp
         matches is reported directly.
         """
         # Someone picked the wrong compile option
         # You lose
         skip = False
         for match in invalid_unicode_re.finditer(data):
             if skip:
                 # This match is the low half of the pair handled on the
                 # previous iteration.  Clear the flag so that later matches
                 # are examined again; leaving it set silently dropped every
                 # subsequent error in the chunk.
                 skip = False
                 continue
             pos = match.start()
             # Pretty sure there should be endianness issues here
             if utils.isSurrogatePair(data[pos:pos + 2]):
                 # We have a surrogate pair!
                 char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
                 if char_val in non_bmp_invalid_codepoints:
                     self.errors.append("invalid-codepoint")
                 skip = True
             else:
                 # A lone surrogate or any other invalid character.
                 self.errors.append("invalid-codepoint")

     def charsUntil(self, characters, opposite=False):
         """ Returns a string of characters from the stream up to but not
         including any character in 'characters' or EOF. 'characters' must be
         a container that supports the 'in' method and iteration over its
         characters.

         With opposite=True the sense is inverted: the returned run consists
         only of characters that are in 'characters'.  'characters' is also
         used as part of the regexp cache key, so it must be hashable.
         """

         # Use a cache of regexps to find the required characters
         try:
             chars = charsUntilRegEx[(characters, opposite)]
         except KeyError:
             if __debug__:
                 for c in characters:
                     assert(ord(c) < 128)
             regex = "".join(["\\x%02x" % ord(c) for c in characters])
             if not opposite:
                 regex = "^%s" % regex
             chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

         rv = []

         while True:
             # Find the longest matching prefix
             m = chars.match(self.chunk, self.chunkOffset)
             if m is None:
                 # If nothing matched, and it wasn't because we ran out of chunk,
                 # then stop
                 if self.chunkOffset != self.chunkSize:
                     break
             else:
                 end = m.end()
                 # If not the whole chunk matched, return everything
                 # up to the part that didn't match
                 if end != self.chunkSize:
                     rv.append(self.chunk[self.chunkOffset:end])
                     self.chunkOffset = end
                     break
             # If the whole remainder of the chunk matched,
             # use it all and read the next chunk
             rv.append(self.chunk[self.chunkOffset:])
             if not self.readChunk():
                 # Reached EOF
                 break

         return "".join(rv)

     def unget(self, char):
         """Push ``char`` back so that the next read returns it again.

         Passing None is a no-op.
         """
         # Only one character is allowed to be ungotten at once - it must
         # be consumed again before any further call to unget
         if char is not None:
             if self.chunkOffset == 0:
                 # unget is called quite rarely, so it's a good idea to do
                 # more work here if it saves a bit of work in the frequently
                 # called char and charsUntil.
                 # So, just prepend the ungotten character onto the current
                 # chunk:
                 self.chunk = char + self.chunk
                 self.chunkSize += 1
             else:
                 self.chunkOffset -= 1
                 assert self.chunk[self.chunkOffset] == char


 class HTMLBinaryInputStream(HTMLUnicodeInputStream):
     """Provides a unicode stream of characters to the HTMLTokenizer.

     This class takes care of character encoding and removing or replacing
     incorrect byte-sequences and also provides column and line tracking.

     """

     def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         """Initialises the HTMLInputStream.

         HTMLInputStream(source, [encoding]) -> Normalized stream from source
         for use by html5lib.

         source can be either a file-object or a byte string.

         The optional encoding parameter must be a string that indicates
         the encoding.  If specified (and recognised by codecName), that
         encoding will be used, regardless of any BOM or later declaration
         (such as in a meta element)

         parseMeta - Look for a <meta> element containing encoding information

         chardet - Fall back to the charade/chardet detector, if one can be
         imported, when neither a BOM nor a <meta> element gives an encoding

         """
         # Raw byte stream; openStream() wraps it in a BufferedStream when
         # it cannot seek.
         self.rawStream = self.openStream(source)

         HTMLUnicodeInputStream.__init__(self, self.rawStream)

         self.charEncoding = (codecName(encoding), "certain")

         # Encoding Information
         # Number of bytes to use when looking for a meta element with
         # encoding information
         self.numBytesMeta = 512
         # Number of bytes to use when using detecting encoding using chardet
         self.numBytesChardet = 100
         # Encoding to use if no other information can be found
         self.defaultEncoding = "windows-1252"

         # Detect encoding iff no explicit "transport level" encoding is supplied
         if (self.charEncoding[0] is None):
             self.charEncoding = self.detectEncoding(parseMeta, chardet)

         # Rebuild the decoding reader now that the final encoding is known
         self.reset()

     def reset(self):
         """Create a fresh decoder for the current encoding over the raw
         stream, then reset the chunk and position state."""
         self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                  'replace')
         HTMLUnicodeInputStream.reset(self)

     def openStream(self, source):
         """Produces a file object from source.

         source can be either a file object or a byte string.  The result is
         wrapped in a BufferedStream if it does not support seek/tell.

         """
         # Already a file object
         if hasattr(source, 'read'):
             stream = source
         else:
             stream = BytesIO(source)

         try:
             stream.seek(stream.tell())
         except Exception:
             # Not seekable: buffer it ourselves so that encoding detection
             # can rewind.  Catching Exception rather than using a bare
             # except lets KeyboardInterrupt/SystemExit propagate.
             stream = BufferedStream(stream)

         return stream

     def detectEncoding(self, parseMeta=True, chardet=True):
         """Return an (encoding, confidence) pair for the raw stream.

         Tries, in order: a BOM, a <meta> declaration (if parseMeta), the
         charade/chardet detector (if chardet and importable), and finally
         self.defaultEncoding.  confidence is "certain" only for a BOM.
         """
         # First look for a BOM
         # This will also read past the BOM if present
         encoding = self.detectBOM()
         confidence = "certain"
         # If there is no BOM need to look for meta elements with encoding
         # information
         if encoding is None and parseMeta:
             encoding = self.detectEncodingMeta()
             confidence = "tentative"
         # Guess with chardet, if available
         if encoding is None and chardet:
             confidence = "tentative"
             try:
                 try:
                     from charade.universaldetector import UniversalDetector
                 except ImportError:
                     from chardet.universaldetector import UniversalDetector
                 detector = UniversalDetector()
                 while not detector.done:
                     chunk = self.rawStream.read(self.numBytesChardet)
                     assert isinstance(chunk, bytes)
                     if not chunk:
                         break
                     detector.feed(chunk)
                 detector.close()
                 encoding = detector.result['encoding']
                 self.rawStream.seek(0)
             except ImportError:
                 pass
         # If all else fails use the default encoding
         if encoding is None:
             confidence = "tentative"
             encoding = self.defaultEncoding

         # Substitute for equivalent encodings:
         encodingSub = {"iso-8859-1": "windows-1252"}

         if encoding.lower() in encodingSub:
             encoding = encodingSub[encoding.lower()]

         return encoding, confidence

     def changeEncoding(self, newEncoding):
         """Switch to ``newEncoding`` after a late declaration was found.

         Must only be called while the current encoding is not "certain".
         An unrecognised encoding is ignored; the current encoding is simply
         promoted to "certain" when it is confirmed.  Otherwise the raw
         stream is rewound, the decoder is rebuilt for the new encoding and
         ReparseException is raised so the caller can start over.
         """
         assert self.charEncoding[1] != "certain"
         newEncoding = codecName(newEncoding)
         if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
             newEncoding = "utf-8"
         if newEncoding is None:
             return
         elif newEncoding == self.charEncoding[0]:
             self.charEncoding = (self.charEncoding[0], "certain")
         else:
             oldEncoding = self.charEncoding[0]
             self.rawStream.seek(0)
             # Store the new encoding *before* reset(): reset() builds the
             # decoder from self.charEncoding, so the reverse order would
             # re-create it with the old codec.
             self.charEncoding = (newEncoding, "certain")
             self.reset()
             raise ReparseException("Encoding changed from %s to %s" % (oldEncoding, newEncoding))

     def detectBOM(self):
         """Attempts to detect at BOM at the start of the stream. If
         an encoding can be determined from the BOM return the name of the
         encoding otherwise return None"""
         bomDict = {
             codecs.BOM_UTF8: 'utf-8',
             codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
             codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
         }

         # Read in 4 bytes, enough for the longest BOM
         string = self.rawStream.read(4)
         assert isinstance(string, bytes)

         # Try detecting the BOM using bytes from the string
         encoding = bomDict.get(string[:3])         # UTF-8
         seek = 3
         if not encoding:
             # Need to detect UTF-32 before UTF-16
             encoding = bomDict.get(string)         # UTF-32
             seek = 4
             if not encoding:
                 encoding = bomDict.get(string[:2])  # UTF-16
                 seek = 2

         # Set the read position past the BOM if one was found, otherwise
         # set it to the start of the stream
         self.rawStream.seek(seek if encoding else 0)

         return encoding

     def detectEncodingMeta(self):
         """Report the encoding declared by the meta element
         """
         buffer = self.rawStream.read(self.numBytesMeta)
         assert isinstance(buffer, bytes)
         parser = EncodingParser(buffer)
         self.rawStream.seek(0)
         encoding = parser.getEncoding()

         # A UTF-16 label found by scanning single bytes is treated as UTF-8.
         if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
             encoding = "utf-8"

         return encoding


 class EncodingBytes(bytes):
     """Lower-cased byte string with a movable cursor and scanning helpers.

     Reading the cursor through the ``position`` property raises
     StopIteration once it has moved to or past the end of the string."""
     def __new__(cls, value):
         assert isinstance(value, bytes)
         lowered = value.lower()
         return bytes.__new__(cls, lowered)

     def __init__(self, value):
         # -1 means "before the first byte"
         self._position = -1

     def __iter__(self):
         return self

     def __next__(self):
         self._position += 1
         current = self._position
         if current >= len(self):
             raise StopIteration
         if current < 0:
             raise TypeError
         return self[current:current + 1]

     def next(self):
         # Py2 compat
         return self.__next__()

     def previous(self):
         """Step the cursor back one byte and return the byte now under it."""
         current = self._position
         if current >= len(self):
             raise StopIteration
         if current < 0:
             raise TypeError
         current -= 1
         self._position = current
         return self[current:current + 1]

     def setPosition(self, position):
         # The check is made on the cursor as it is before the assignment.
         if self._position >= len(self):
             raise StopIteration
         self._position = position

     def getPosition(self):
         if self._position >= len(self):
             raise StopIteration
         if self._position < 0:
             return None
         return self._position

     position = property(getPosition, setPosition)

     def getCurrentByte(self):
         start = self.position
         return self[start:start + 1]

     currentByte = property(getCurrentByte)

     def skip(self, chars=spaceCharactersBytes):
         """Advance past bytes in ``chars``; return the first byte that is
         not in ``chars``, or None when the end is reached."""
         index = self.position            # property access does the checking
         size = len(self)
         while index < size:
             byte = self[index:index + 1]
             if byte not in chars:
                 self._position = index
                 return byte
             index += 1
         self._position = index
         return None

     def skipUntil(self, chars):
         """Advance to the first byte that is in ``chars`` and return it, or
         return None when the end is reached."""
         index = self.position
         size = len(self)
         while index < size:
             byte = self[index:index + 1]
             if byte in chars:
                 self._position = index
                 return byte
             index += 1
         self._position = index
         return None

     def matchBytes(self, bytes):
         """Test whether ``bytes`` occurs at the cursor.  On a match move
         the cursor just past it and return True; otherwise return False and
         leave the cursor where it was."""
         start = self.position
         matched = self[start:start + len(bytes)].startswith(bytes)
         if matched:
             self.position += len(bytes)
         return matched

     def jumpTo(self, bytes):
         """Find the next occurrence of ``bytes`` from the cursor onwards and
         move the cursor onto the last byte of that occurrence, returning
         True.  Raises StopIteration when there is no occurrence."""
         found = self[self.position:].find(bytes)
         if found == -1:
             raise StopIteration
         # XXX: This is ugly, but I can't see a nicer way to fix this.
         if self._position == -1:
             self._position = 0
         self._position += (found + len(bytes) - 1)
         return True


 class EncodingParser(object):
     """Mini parser for detecting character encoding from meta elements

     Works on an EncodingBytes copy of the data, whose cursor is moved by
     every handler; the handlers therefore depend on being called in order.
     """

     def __init__(self, data):
         """data - the bytes to work on for encoding detection"""
         # EncodingBytes lower-cases the data and tracks a cursor position.
         self.data = EncodingBytes(data)
         self.encoding = None

     def getEncoding(self):
         """Scan the data and return the encoding found, or None.

         The return value is self.encoding, which only handleMeta sets.
         """
         # Order matters: the first prefix that matches wins, so the longer,
         # more specific prefixes are listed before the bare b"<".
         methodDispatch = (
             (b"<!--", self.handleComment),
             (b"<meta", self.handleMeta),
             (b"</", self.handlePossibleEndTag),
             (b"<!", self.handleOther),
             (b"<?", self.handleOther),
             (b"<", self.handlePossibleStartTag))
         # Iterating advances the cursor of self.data by one byte per pass;
         # the yielded byte itself is not used.
         for byte in self.data:
             keepParsing = True
             for key, method in methodDispatch:
                 # matchBytes moves the cursor past the prefix on a match
                 if self.data.matchBytes(key):
                     try:
                         keepParsing = method()
                         break
                     except StopIteration:
                         # The handler ran off the end of the data
                         keepParsing = False
                         break
             if not keepParsing:
                 break

         return self.encoding

     def handleComment(self):
         """Skip over comments"""
         # jumpTo returns True, or raises StopIteration if b"-->" is absent
         return self.data.jumpTo(b"-->")

     def handleMeta(self):
         """Examine the attributes of a meta element for an encoding.

         Returns False (stop parsing) once self.encoding has been set, and
         True (keep parsing) otherwise.
         """
         if self.data.currentByte not in spaceCharactersBytes:
             # if we have <meta not followed by a space so just keep going
             return True
         # We have a valid meta element we want to search for attributes
         hasPragma = False
         pendingEncoding = None
         while True:
             # Try to find the next attribute after the current position
             attr = self.getAttribute()
             if attr is None:
                 return True
             else:
                 if attr[0] == b"http-equiv":
                     hasPragma = attr[1] == b"content-type"
                     # A content attribute seen earlier only counts once the
                     # http-equiv="content-type" pragma has been seen too.
                     if hasPragma and pendingEncoding is not None:
                         self.encoding = pendingEncoding
                         return False
                 elif attr[0] == b"charset":
                     tentativeEncoding = attr[1]
                     codec = codecName(tentativeEncoding)
                     if codec is not None:
                         self.encoding = codec
                         return False
                 elif attr[0] == b"content":
                     contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                     tentativeEncoding = contentParser.parse()
                     if tentativeEncoding is not None:
                         codec = codecName(tentativeEncoding)
                         if codec is not None:
                             if hasPragma:
                                 self.encoding = codec
                                 return False
                             else:
                                 # Remember it until the pragma turns up
                                 pendingEncoding = codec

     def handlePossibleStartTag(self):
         """Handle data following b"<"; always returns True."""
         return self.handlePossibleTag(False)

     def handlePossibleEndTag(self):
         """Handle data following b"</"; always returns True."""
         # Advance the cursor one byte before examining the tag name
         next(self.data)
         return self.handlePossibleTag(True)

     def handlePossibleTag(self, endTag):
         """Skip over a tag and its attributes; always returns True.

         endTag - True when called for b"</", False when called for b"<"
         """
         data = self.data
         if data.currentByte not in asciiLettersBytes:
             # If the next byte is not an ascii letter either ignore this
             # fragment (possible start tag case) or treat it according to
             # handleOther
             if endTag:
                 data.previous()
                 self.handleOther()
             return True

         # Move to the end of the tag name
         c = data.skipUntil(spacesAngleBrackets)
         if c == b"<":
             # return to the first step in the overall "two step" algorithm
             # reprocessing the < byte
             data.previous()
         else:
             # Read all attributes
             attr = self.getAttribute()
             while attr is not None:
                 attr = self.getAttribute()
         return True

     def handleOther(self):
         """Skip to the next b">"."""
         # jumpTo returns True, or raises StopIteration if b">" is absent
         return self.data.jumpTo(b">")

     def getAttribute(self):
         """Return a name,value pair for the next attribute in the stream,
         if one is found, or None

         Upper-case ASCII bytes are lower-cased as they are collected.
         StopIteration from next(data) propagates when the data runs out.
         """
         # NOTE(review): the "Step N" numbers presumably refer to the HTML
         # specification's attribute pre-scan algorithm; confirm.
         data = self.data
         # Step 1 (skip chars)
         c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
         assert c is None or len(c) == 1
         # Step 2
         if c in (b">", None):
             return None
         # Step 3
         attrName = []
         attrValue = []
         # Step 4 attribute name
         while True:
             if c == b"=" and attrName:
                 break
             elif c in spaceCharactersBytes:
                 # Step 6!
                 c = data.skip()
                 break
             elif c in (b"/", b">"):
                 # Attribute with a name but no value
                 return b"".join(attrName), b""
             elif c in asciiUppercaseBytes:
                 attrName.append(c.lower())
             elif c is None:
                 return None
             else:
                 attrName.append(c)
             # Step 5
             c = next(data)
         # Step 7
         if c != b"=":
             # No value follows; step back so this byte is seen again
             data.previous()
             return b"".join(attrName), b""
         # Step 8
         next(data)
         # Step 9
         c = data.skip()
         # Step 10
         if c in (b"'", b'"'):
             # 10.1
             quoteChar = c
             while True:
                 # 10.2
                 c = next(data)
                 # 10.3
                 if c == quoteChar:
                     # Move past the closing quote before returning
                     next(data)
                     return b"".join(attrName), b"".join(attrValue)
                 # 10.4
                 elif c in asciiUppercaseBytes:
                     attrValue.append(c.lower())
                 # 10.5
                 else:
                     attrValue.append(c)
         elif c == b">":
             return b"".join(attrName), b""
         elif c in asciiUppercaseBytes:
             attrValue.append(c.lower())
         elif c is None:
             return None
         else:
             attrValue.append(c)
         # Step 11
         # Unquoted value: collect until whitespace, b"<" or b">"
         while True:
             c = next(data)
             if c in spacesAngleBrackets:
                 return b"".join(attrName), b"".join(attrValue)
             elif c in asciiUppercaseBytes:
                 attrValue.append(c.lower())
             elif c is None:
                 return None
             else:
                 attrValue.append(c)


 class ContentAttrParser(object):
     """Pulls the charset value out of a meta element's content attribute."""

     def __init__(self, data):
         assert isinstance(data, bytes)
         self.data = data

     def parse(self):
         """Return the bytes following ``charset=`` in the data, or None.

         The value may be wrapped in matching single or double quotes;
         an unquoted value runs up to the next space character or to the
         end of the data.  None is returned when there is no ``charset``,
         no ``=`` after it, or no closing quote.
         """
         data = self.data
         try:
             # Locate "charset"; jumpTo raises StopIteration if it is absent
             data.jumpTo(b"charset")
             data.position += 1
             data.skip()
             if data.currentByte != b"=":
                 # If there is no = sign keep looking for attrs
                 return None
             data.position += 1
             data.skip()

             quoteMark = data.currentByte
             if quoteMark in (b'"', b"'"):
                 # Look for an encoding between matching quote marks
                 data.position += 1
                 valueStart = data.position
                 if not data.jumpTo(quoteMark):
                     return None
                 return data[valueStart:data.position]

             # Unquoted value
             valueStart = data.position
             try:
                 data.skipUntil(spaceCharactersBytes)
                 return data[valueStart:data.position]
             except StopIteration:
                 # Return the whole remaining value
                 return data[valueStart:]
         except StopIteration:
             return None


 def codecName(encoding):
     """Map an encoding label to the matching python codec name.

     ``encoding`` may be text or bytes.  Bytes that are not pure ASCII, an
     empty or None label, and labels missing from the ``encodings`` table
     all give None.  ASCII whitespace and punctuation are stripped and the
     label is lower-cased before the lookup."""
     if isinstance(encoding, bytes):
         try:
             encoding = encoding.decode("ascii")
         except UnicodeDecodeError:
             return None
     if not encoding:
         return None
     canonicalName = ascii_punctuation_re.sub("", encoding).lower()
     return encodings.get(canonicalName, None)