94 lines
3.1 KiB
C++
94 lines
3.1 KiB
C++
/*
|
|
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
// Some utility routines relating to unicode.
|
|
|
|
#pragma once
|
|
|
|
#include <cstdint>
|
|
#include <stdexcept>
|
|
#include <string>
|
|
|
|
#include <folly/lang/Exception.h>
|
|
|
|
namespace folly {
|
|
|
|
class FOLLY_EXPORT unicode_error : public std::runtime_error {
|
|
public:
|
|
using std::runtime_error::runtime_error;
|
|
};
|
|
|
|
// Unicode code points are split into 17 planes.
|
|
//
|
|
// The Basic Multilingual Plane covers code points in [0-0xFFFF] but reserves
|
|
// two invalid ranges:
|
|
// - High surrogates: [0xD800-0xDBFF].
|
|
// - Low surrogates: [0xDC00-0xDFFF].
|
|
//
|
|
// UTF-16 code units are 2 bytes wide and are represented here with char16_t.
|
|
// Unicode code points are represented in UTF-16 across either 1-2 code units:
|
|
// - Valid BMP code points [0x0000-0xD7FF] + [0xE000-0xFFFF] are encoded
|
|
// directly as 1 code unit.
|
|
// - Code points larger than BMP (>0xFFFF) are encoded as 2 code units, with
|
|
// values respectively in the high surrogates and low surrogates ranges.
|
|
//
|
|
// JSON text permits the inclusion of Unicode escape sequences within quoted
|
|
// strings:
|
|
// - Valid BMP code points are encoded as \xXXXX, where XXXX are the base-16
|
|
// digits of the code point.
|
|
// - Code points larger than BMP are encoded as \uHHHH\uLLLL, where HHHH and
|
|
// LLLL are respectively the base-16 digits of the high and low surrogates of
|
|
// the UTF-16 encoding of the code point.
|
|
|
|
inline bool utf16_code_unit_is_bmp(char16_t const c) {
|
|
return c < 0xd800 || c >= 0xe000;
|
|
}
|
|
inline bool utf16_code_unit_is_high_surrogate(char16_t const c) {
|
|
return c >= 0xd800 && c < 0xdc00;
|
|
}
|
|
inline bool utf16_code_unit_is_low_surrogate(char16_t const c) {
|
|
return c >= 0xdc00 && c < 0xe000;
|
|
}
|
|
inline char32_t unicode_code_point_from_utf16_surrogate_pair(
|
|
char16_t const high, char16_t const low) {
|
|
if (!utf16_code_unit_is_high_surrogate(high)) {
|
|
throw_exception<unicode_error>("invalid high surrogate");
|
|
}
|
|
if (!utf16_code_unit_is_low_surrogate(low)) {
|
|
throw_exception<unicode_error>("invalid low surrogate");
|
|
}
|
|
return 0x10000 + ((char32_t(high) & 0x3ff) << 10) + (char32_t(low) & 0x3ff);
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
|
|
/*
|
|
* Encode a single unicode code point into a UTF-8 byte sequence.
|
|
*
|
|
* Return value is undefined if `cp' is an invalid code point.
|
|
*/
|
|
std::string codePointToUtf8(char32_t cp);
|
|
|
|
/*
|
|
* Decode a single unicode code point from UTF-8 byte sequence.
|
|
*/
|
|
char32_t utf8ToCodePoint(
|
|
const unsigned char*& p, const unsigned char* const e, bool skipOnError);
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
|
|
} // namespace folly
|