blob: c97b10a91892182123603e26ec99594e6ddf3414 [file] [log] [blame]
/*
* Copyright 2007 Google Inc. All rights reserved.
* Copyright 2012 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef URLParse_h
#define URLParse_h
#include "URLComponent.h"
#include "URLSegments.h"
#include <wtf/unicode/Unicode.h>
#if USE(WTFURL)
namespace WTF {
namespace URLParser {
// Initialization functions ---------------------------------------------------
//
// These functions parse the given URL, filling in all of the structure's
// components. These functions cannot fail; they will always do their best
// at interpreting the input given.
//
// The string length of the URL MUST be specified. We do not check for NUL
// characters at any point in the process, and will actually handle embedded
// NULs.
//
// IMPORTANT: These functions do NOT hang on to the given pointer or copy it
// in any way. See the comment above the struct.
//
// The 8-bit versions require UTF-8 encoding.
// ParseStandardURL is for when the scheme is known to be one that has an
// authority (host), like "http". This function will not handle unusual
// schemes like "about:" and "javascript:", nor will it do the right thing
// for "file:" URLs.
//
// |url| is the input and |urlLength| its length (which must always be
// supplied); the parsed components are written to |*parsed|.
void ParseStandardURL(const char* url, int urlLength, URLSegments* parsed);
void ParseStandardURL(const UChar* url, int urlLength, URLSegments* parsed);
// ParsePathURL is for URLs whose scheme is known not to have an authority
// (host) section and that are not file URLs either. The scheme is parsed,
// and everything after the scheme is considered to be the path. This is used
// for things like "about:" and "javascript:".
//
// |url| is the input and |urlLength| its length; the parsed components are
// written to |*parsed|.
void ParsePathURL(const char* url, int urlLength, URLSegments* parsed);
void ParsePathURL(const UChar* url, int urlLength, URLSegments* parsed);
// ParseFileURL is for file URLs. There are some special rules for
// interpreting these (they are not spelled out in this header; see the
// implementation).
//
// |url| is the input and |urlLength| its length; the parsed components are
// written to |*parsed|.
void ParseFileURL(const char* url, int urlLength, URLSegments* parsed);
void ParseFileURL(const UChar* url, int urlLength, URLSegments* parsed);
// ParseFileSystemURL is for filesystem URLs, which are structured
// differently than other URLs.
// NOTE(review): presumably this means "filesystem:" scheme URLs; the exact
// structure is not described in this header -- confirm against the
// implementation.
//
// |url| is the input and |urlLength| its length; the parsed components are
// written to |*parsed|.
void ParseFileSystemURL(const char* url, int urlLength, URLSegments* parsed);
void ParseFileSystemURL(const UChar* url, int urlLength, URLSegments* parsed);
// ParseMailtoURL is for mailto: URLs. They are made up of a scheme, a path,
// and a query.
//
// |url| is the input and |urlLength| its length; the parsed components are
// written to |*parsed|.
void ParseMailtoURL(const char* url, int urlLength, URLSegments* parsed);
void ParseMailtoURL(const UChar* url, int urlLength, URLSegments* parsed);
// Helper functions -----------------------------------------------------------
// Locates the scheme according to the URL parser's rules. This function is
// designed so the caller can find the scheme and then call the correct
// Parse*URL function according to their known scheme types.
//
// It does not perform any validation on the scheme.
//
// This function will return true if the scheme is found and will put the
// scheme's range into |*scheme|. False means no scheme could be found. Note
// that a URL beginning with a colon has a scheme, but it is empty, so this
// function will return true but |*scheme| will be (0,0).
//
// The scheme is found by skipping spaces and control characters at the
// beginning, and taking everything from there to the first colon to be the
// scheme. The character at scheme.end() will be the colon (we may enhance
// this to handle full-width colons or something, so don't count on the
// actual character value). The character at scheme.end()+1 will be the
// beginning of the rest of the URL, be it the authority or the path (or the
// end of the string).
//
// The 8-bit version requires UTF-8 encoding.
bool ExtractScheme(const char* url, int urlLength, URLComponent* scheme);
bool ExtractScheme(const UChar* url, int urlLength, URLComponent* scheme);
// Returns true if the given character is one that terminates the authority
// segment of a URL.
bool IsAuthorityTerminator(UChar);
// Does a best-effort parse of the authority section of the input |spec|,
// which lies in the range |auth|. The ranges found are written to
// |*username|, |*password|, |*hostname| and |*portNumber|. If a particular
// component is not found, it will be set to invalid.
void ParseAuthority(const char* spec, const URLComponent& auth,
URLComponent* username, URLComponent* password, URLComponent* hostname, URLComponent* portNumber);
void ParseAuthority(const UChar* spec, const URLComponent& auth,
URLComponent* username, URLComponent* password, URLComponent* hostname, URLComponent* portNumber);
// Computes the integer port value from the given port component. The port
// component should have been identified by one of the parsing functions
// above for the given input |url|.
//
// The return value will be a non-negative integer between 0 and 64K, or one
// of the two special values below.
// NOTE(review): presumably PORT_UNSPECIFIED means no port was given and
// PORT_INVALID means the port text could not be interpreted as a valid port
// -- confirm against the implementation.
enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 };
int ParsePort(const char* url, const URLComponent& port);
int ParsePort(const UChar* url, const URLComponent& port);
// Extracts the range of the file name in the given |url| and stores it in
// |*fileName|. The path must already have been computed by the parse
// function, and the matching URL and extracted |path| are provided to this
// function. The file name is defined as being everything from the last
// slash/backslash of the path to the end of the path.
//
// The file name will be empty if the path is empty or there is nothing
// following the last slash.
//
// The 8-bit version requires UTF-8 encoding.
void ExtractFileName(const char* url, const URLComponent& path, URLComponent* fileName);
void ExtractFileName(const UChar* url, const URLComponent& path, URLComponent* fileName);
// Extracts the first key/value pair from the range defined by |*query| and
// updates |*query| to start at the end of the extracted pair. This is
// designed for use in a loop: you can keep calling it with the same query
// object and it will iterate over all items in the query.
//
// Some key/value pairs may have the key, the value, or both be empty (for
// example, the query string "?&"). These will be returned. Note that an
// empty last parameter, as in "foo.com?" or "foo.com?a&", will not be
// returned; this case is the same as "done."
//
// The initial query component should not include the '?' (this is the
// default for parsed URLs).
//
// Returns true when a key/value pair was extracted into |*key| and
// |*value|. If no key/value pair is found, |*key| and |*value| will be
// unchanged and the function will return false.
bool ExtractQueryKeyValue(const char* url, URLComponent* query, URLComponent* key, URLComponent* value);
bool ExtractQueryKeyValue(const UChar* url, URLComponent* query, URLComponent* key, URLComponent* value);
} // namespace URLParser
} // namespace WTF
#endif // USE(WTFURL)
#endif // URLParse_h