blob: 05eab57ad4e9ce31abbb276c9adfc925c805fede [file] [log] [blame]
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package com.google.protobuf;
import java.io.UnsupportedEncodingException;
/**
* The classes contained within are used internally by the Protocol Buffer
* library and generated message implementations. They are public only because
* those generated messages do not reside in the {@code protobuf} package.
* Others should not use this class directly.
*
* @author kenton@google.com (Kenton Varda)
*/
public class Internal {
/**
* Helper called by generated code to construct default values for string
* fields.
* <p>
* The protocol compiler does not actually contain a UTF-8 decoder -- it
* just pushes UTF-8-encoded text around without touching it. The one place
* where this presents a problem is when generating Java string literals.
* Unicode characters in the string literal would normally need to be encoded
* using a Unicode escape sequence, which would require decoding them.
* To get around this, protoc instead embeds the UTF-8 bytes into the
* generated code and leaves it to the runtime library to decode them.
* <p>
* It gets worse, though. If protoc just generated a byte array, like:
* new byte[] {0x12, 0x34, 0x56, 0x78}
* Java actually generates *code* which allocates an array and then fills
* in each value. This is much less efficient than just embedding the bytes
* directly into the bytecode. To get around this, we need another
* work-around. String literals are embedded directly, so protoc actually
* generates a string literal corresponding to the bytes. The easiest way
* to do this is to use the ISO-8859-1 character set, which corresponds to
* the first 256 characters of the Unicode range. Protoc can then use
* good old CEscape to generate the string.
* <p>
* So we have a string literal which represents a set of bytes which
* represents another string. This function -- stringDefaultValue --
* converts from the generated string to the string we actually want. The
* generated code calls this automatically.
*/
public static String stringDefaultValue(String bytes) {
try {
return new String(bytes.getBytes("ISO-8859-1"), "UTF-8");
} catch (UnsupportedEncodingException e) {
// This should never happen since all JVMs are required to implement
// both of the above character sets.
throw new IllegalStateException(
"Java VM does not support a standard character set.", e);
}
}
/**
* Helper called by generated code to construct default values for bytes
* fields.
* <p>
* This is a lot like {@link #stringDefaultValue}, but for bytes fields.
* In this case we only need the second of the two hacks -- allowing us to
* embed raw bytes as a string literal with ISO-8859-1 encoding.
*/
public static ByteString bytesDefaultValue(String bytes) {
try {
return ByteString.copyFrom(bytes.getBytes("ISO-8859-1"));
} catch (UnsupportedEncodingException e) {
// This should never happen since all JVMs are required to implement
// ISO-8859-1.
throw new IllegalStateException(
"Java VM does not support a standard character set.", e);
}
}
/**
* Helper called by generated code to determine if a byte array is a valid
* UTF-8 encoded string such that the original bytes can be converted to
* a String object and then back to a byte array round tripping the bytes
* without loss.
* <p>
* This is inspired by UTF_8.java in sun.nio.cs.
*
* @param byteString the string to check
* @return whether the byte array is round trippable
*/
public static boolean isValidUtf8(ByteString byteString) {
int index = 0;
int size = byteString.size();
// To avoid the masking, we could change this to use bytes;
// Then X > 0xC2 gets turned into X < -0xC2; X < 0x80
// gets turned into X >= 0, etc.
while (index < size) {
int byte1 = byteString.byteAt(index++) & 0xFF;
if (byte1 < 0x80) {
// fast loop for single bytes
continue;
// we know from this point on that we have 2-4 byte forms
} else if (byte1 < 0xC2 || byte1 > 0xF4) {
// catch illegal first bytes: < C2 or > F4
return false;
}
if (index >= size) {
// fail if we run out of bytes
return false;
}
int byte2 = byteString.byteAt(index++) & 0xFF;
if (byte2 < 0x80 || byte2 > 0xBF) {
// general trail-byte test
return false;
}
if (byte1 <= 0xDF) {
// two-byte form; general trail-byte test is sufficient
continue;
}
// we know from this point on that we have 3 or 4 byte forms
if (index >= size) {
// fail if we run out of bytes
return false;
}
int byte3 = byteString.byteAt(index++) & 0xFF;
if (byte3 < 0x80 || byte3 > 0xBF) {
// general trail-byte test
return false;
}
if (byte1 <= 0xEF) {
// three-byte form. Vastly more frequent than four-byte forms
// The following has an extra test, but not worth restructuring
if (byte1 == 0xE0 && byte2 < 0xA0 ||
byte1 == 0xED && byte2 > 0x9F) {
// check special cases of byte2
return false;
}
} else {
// four-byte form
if (index >= size) {
// fail if we run out of bytes
return false;
}
int byte4 = byteString.byteAt(index++) & 0xFF;
if (byte4 < 0x80 || byte4 > 0xBF) {
// general trail-byte test
return false;
}
// The following has an extra test, but not worth restructuring
if (byte1 == 0xF0 && byte2 < 0x90 ||
byte1 == 0xF4 && byte2 > 0x8F) {
// check special cases of byte2
return false;
}
}
}
return true;
}
/**
* Interface for an enum value or value descriptor, to be used in FieldSet.
* The lite library stores enum values directly in FieldSets but the full
* library stores EnumValueDescriptors in order to better support reflection.
*/
public interface EnumLite {
int getNumber();
}
/**
* Interface for an object which maps integers to {@link EnumLite}s.
* {@link Descriptors.EnumDescriptor} implements this interface by mapping
* numbers to {@link Descriptors.EnumValueDescriptor}s. Additionally,
* every generated enum type has a static method internalGetValueMap() which
* returns an implementation of this type that maps numbers to enum values.
*/
public interface EnumLiteMap<T extends EnumLite> {
T findValueByNumber(int number);
}
}