1
0
mirror of https://git.dev.opencascade.org/repos/occt.git synced 2025-04-04 18:06:22 +03:00
occt/src/NCollection/NCollection_UtfIterator.lxx
dpasukhi a5a7b3185b Coding - Apply .clang-format formatting #286
Update empty method guards to new style with regex (see PR).
Used clang-format 18.1.8.
New actions to validate code formatting are added.
Update .clang-format with disabling of include sorting.
  It is a temporary change; includes will be sorted later.
Apply formatting for /src and /tools folder.
The files with .hxx,.cxx,.lxx,.h,.pxx,.hpp,*.cpp extensions.
2025-01-26 00:43:57 +00:00

343 lines
12 KiB
Plaintext
Executable File

// Created on: 2013-01-28
// Created by: Kirill GAVRILOV
// Copyright (c) 2013-2014 OPEN CASCADE SAS
//
// This file is part of Open CASCADE Technology software library.
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License version 2.1 as published
// by the Free Software Foundation, with special exception defined in the file
// OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
// distribution for complete text of the license and disclaimer of any warranty.
//
// Alternatively, this file may be used under the terms of Open CASCADE
// commercial license or contractual agreement.
// Portions of code are copyrighted by Unicode, Inc.
//
// Copyright (c) 2001-2004 Unicode, Inc.
//
// Disclaimer
//
// This source code is provided as is by Unicode, Inc. No claims are
// made as to fitness for any particular purpose. No warranties of any
// kind are expressed or implied. The recipient agrees to determine
// applicability of information provided. If this file has been
// purchased on magnetic or optical media from Unicode, Inc., the
// sole remedy for any claim will be exchange of defective media
// within 90 days of receipt.
//
// Limitations on Rights to Redistribute This Code
//
// Unicode, Inc. hereby grants the right to freely use the information
// supplied in this file in the creation of products supporting the
// Unicode Standard, and to make copies of this file in any form
// for internal or external distribution as long as this notice
// remains attached.
//! Lookup table indexed by the value of the first byte of a UTF-8 sequence.
//! Each entry is the number of bytes that follow that first byte, i.e. the
//! total length of the sequence minus one; readUTF8() uses it to decide how
//! many bytes to consume.
//! Bytes 0x80 - 0xBF map to 0, so such a byte found in the leading position
//! is consumed as a one-byte sequence.
//! Entries 4 and 5 (first bytes 0xF8 - 0xFF) correspond to the 5- and 6-byte
//! forms that readUTF8() itself marks as illegal UTF-8.
template <typename Type>
const unsigned char NCollection_UtfIterator<Type>::UTF8_BYTES_MINUS_ONE[256] = {
// 0x00 - 0xBF (six rows of 32 entries): no trailing bytes
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// 0xC0 - 0xDF: one trailing byte
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// 0xE0 - 0xEF: two, 0xF0 - 0xF7: three, 0xF8 - 0xFB: four, 0xFC - 0xFF: five trailing bytes
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5};
//! Correction values subtracted from the raw sum accumulated by readUTF8().
//! The table is indexed by the number of trailing bytes (0..5).
//! readUTF8() adds the bytes of a sequence without masking them and shifts the
//! sum left by 6 bits between bytes, so the marker bits of the first byte and
//! the 0x80 marker of every trailing byte end up in the sum. Each entry is that
//! surplus in 32-bit unsigned arithmetic, for example
//!   0x00003080 == (0xC0 << 6) + 0x80
//!   0x000E2080 == (0xE0 << 12) + (0x80 << 6) + 0x80
template <typename Type>
const Standard_Utf32Char NCollection_UtfIterator<Type>::offsetsFromUTF8[6] = {
  0x00000000UL, // no trailing bytes
  0x00003080UL, // 1 trailing byte
  0x000E2080UL, // 2 trailing bytes
  0x03C82080UL, // 3 trailing bytes
  0xFA082080UL, // 4 trailing bytes
  0x82082080UL  // 5 trailing bytes
};
//! Marker bits OR-ed into the first byte of a UTF-8 sequence while encoding.
//! The table is indexed by the total number of bytes in the sequence;
//! GetUtf8() in this file reads entries 1 through 4 only.
template <typename Type>
const unsigned char NCollection_UtfIterator<Type>::UTF8_FIRST_BYTE_MARK[7] = {
  0x00, // [0]
  0x00, // [1] single byte, no marker bits
  0xC0, // [2] 110xxxxx
  0xE0, // [3] 1110xxxx
  0xF0, // [4] 11110xxx
  0xF8, // [5] 111110xx
  0xFC  // [6] 1111110x
};
// =======================================================================
// function : readUTF8
// purpose  : Decode the UTF-8 sequence starting at myPosNext into
//            myCharUtf32 and leave myPosNext at the first byte after it.
//            The sequence length is taken from the first byte alone and
//            the trailing bytes are not checked, so the method is not
//            protected against invalid UTF-8.
// =======================================================================
template <typename Type>
inline void NCollection_UtfIterator<Type>::readUTF8()
{
  // bytes are read through a Standard_Utf8UChar pointer (unsigned arithmetic)
  Standard_Utf8UChar* aByteIter   = (Standard_Utf8UChar*)myPosNext;
  const unsigned char aNbTrailing = UTF8_BYTES_MINUS_ONE[*aByteIter];

  // Add every byte except the last one, shifting the sum left by 6 bits after
  // each of them; counts of 4 and 5 belong to the illegal 5- and 6-byte forms.
  // NOTE(review): aNbTrailing + 1 bytes are always consumed without looking
  // for a terminator, so a truncated sequence is read past its end.
  myCharUtf32 = 0;
  for (unsigned char aLeft = aNbTrailing; aLeft != 0; --aLeft)
  {
    myCharUtf32 += *aByteIter++;
    myCharUtf32 <<= 6;
  }
  myCharUtf32 += *aByteIter++;

  // the bytes were added unmasked - remove the accumulated marker bits
  myCharUtf32 -= offsetsFromUTF8[aNbTrailing];
  myPosNext = (Type*)aByteIter;
}
// magic numbers

//! Applied by GetUtf8() to a trailing byte after UTF8_BYTE_MARK has been set:
//! clears bit 6 and keeps bit 7 together with the six payload bits.
template <typename Type>
const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF8_BYTE_MASK = 0xBF;

//! High bit set by GetUtf8() on every trailing byte of a UTF-8 sequence.
template <typename Type>
const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF8_BYTE_MARK = 0x80;

//! First value of the high (leading) surrogate range.
template <typename Type>
const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_START = 0xD800;

//! Last value of the high (leading) surrogate range.
template <typename Type>
const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_END = 0xDBFF;

//! First value of the low (trailing) surrogate range.
template <typename Type>
const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_START = 0xDC00;

//! Last value of the low (trailing) surrogate range.
template <typename Type>
const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_END = 0xDFFF;

//! Number of bits the high surrogate payload is shifted by within a pair.
template <typename Type>
const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_SHIFT = 10;

//! Value added when a surrogate pair is combined in readUTF16() and
//! subtracted before a code point is split into a pair in GetUtf16().
template <typename Type>
const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_BASE = 0x00010000UL;

//! Selects the ten bits stored in the low surrogate by GetUtf16().
template <typename Type>
const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_MASK = 0x000003FFUL;

//! Largest code point written as a single UTF-16 code unit.
template <typename Type>
const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF32_MAX_BMP = 0x0000FFFFUL;

//! Largest code point accepted by the Advance*() and Get*() methods.
template <typename Type>
const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF32_MAX_LEGAL = 0x0010FFFFUL;
// =======================================================================
// function : readUTF16
// purpose  : Decode the UTF-16 code unit(s) starting at myPosNext into
//            myCharUtf32 and advance myPosNext past what was consumed.
//            A high surrogate followed by a low surrogate is combined
//            into one code point; any other unit is stored unchanged.
// =======================================================================
template <typename Type>
inline void NCollection_UtfIterator<Type>::readUTF16()
{
  const Standard_Utf32Char aLead = *myPosNext++;
  const bool               isHighSurrogate =
    aLead >= UTF16_SURROGATE_HIGH_START && aLead <= UTF16_SURROGATE_HIGH_END;
  if (!isHighSurrogate)
  {
    // ordinary code unit (or a stray low surrogate) - taken as is
    myCharUtf32 = aLead;
    return;
  }

  // peek at the next unit without consuming it yet
  const Standard_Utf32Char aTrail = *myPosNext;
  const bool               isLowSurrogate =
    aTrail >= UTF16_SURROGATE_LOW_START && aTrail <= UTF16_SURROGATE_LOW_END;
  if (!isLowSurrogate)
  {
    // unpaired high surrogate - stored unchanged, the next unit stays unread
    myCharUtf32 = aLead;
    return;
  }

  // complete the surrogate pair
  ++myPosNext;
  myCharUtf32 = ((aLead - UTF16_SURROGATE_HIGH_START) << UTF16_SURROGATE_HIGH_SHIFT)
                + (aTrail - UTF16_SURROGATE_LOW_START) + UTF16_SURROGATE_LOW_BASE;
}
// =======================================================================
// function : AdvanceBytesUtf8
// purpose  : Return the number of bytes (1..4) that GetUtf8() writes for
//            the current character, or 0 when the character is a
//            surrogate value or lies above UTF32_MAX_LEGAL.
// =======================================================================
template <typename Type>
inline Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf8() const
{
  // UTF-16 surrogate values are illegal in UTF-32
  const bool isSurrogate =
    myCharUtf32 >= UTF16_SURROGATE_HIGH_START && myCharUtf32 <= UTF16_SURROGATE_LOW_END;
  if (isSurrogate || myCharUtf32 > UTF32_MAX_LEGAL)
  {
    return 0;
  }

  if (myCharUtf32 < Standard_Utf32Char(0x80))
  {
    return 1;
  }
  if (myCharUtf32 < Standard_Utf32Char(0x800))
  {
    return 2;
  }
  return myCharUtf32 < Standard_Utf32Char(0x10000) ? 3 : 4;
}
// =======================================================================
// function : GetUtf8
// purpose  : Overload for Standard_Utf8Char buffers; forwards to the
//            Standard_Utf8UChar overload and returns its result, i.e. the
//            position right after the written bytes.
// =======================================================================
template <typename Type>
inline Standard_Utf8Char* NCollection_UtfIterator<Type>::GetUtf8(Standard_Utf8Char* theBuffer) const
{
  // unsigned arithmetic used
  Standard_Utf8UChar* const aBufferStart = reinterpret_cast<Standard_Utf8UChar*>(theBuffer);
  Standard_Utf8UChar* const aBufferEnd   = GetUtf8(aBufferStart);
  return reinterpret_cast<Standard_Utf8Char*>(aBufferEnd);
}
// =======================================================================
// function : GetUtf8
// purpose  : Write the current character to theBuffer as UTF-8 and return
//            the position right after the written bytes. Nothing is
//            written and theBuffer is returned unchanged for a surrogate
//            value or a value above UTF32_MAX_LEGAL. The caller has to
//            provide room for up to 4 bytes.
// =======================================================================
template <typename Type>
inline Standard_Utf8UChar* NCollection_UtfIterator<Type>::GetUtf8(
  Standard_Utf8UChar* theBuffer) const
{
  // 0 is returned for values that cannot be encoded
  const Standard_Integer aNbBytes = AdvanceBytesUtf8();
  if (aNbBytes == 0)
  {
    return theBuffer;
  }

  // fill trailing bytes from the last one backwards, 6 payload bits per byte
  Standard_Utf32Char aRest = myCharUtf32;
  for (Standard_Integer aByteIter = aNbBytes - 1; aByteIter > 0; --aByteIter)
  {
    theBuffer[aByteIter] = Standard_Utf8UChar((aRest | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
    aRest >>= 6;
  }

  // remaining bits go to the first byte together with the length marker
  theBuffer[0] = Standard_Utf8UChar(aRest | UTF8_FIRST_BYTE_MARK[aNbBytes]);
  return theBuffer + aNbBytes;
}
// =======================================================================
// function : AdvanceBytesUtf16
// purpose  : Return the size in bytes of the current character encoded as
//            UTF-16: AdvanceCodeUnitsUtf16() multiplied by the size of
//            one Standard_Utf16Char.
// =======================================================================
template <typename Type>
inline Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf16() const
{
  const Standard_Integer aNbUnits = AdvanceCodeUnitsUtf16();
  return aNbUnits * Standard_Integer(sizeof(Standard_Utf16Char));
}
// =======================================================================
// function : AdvanceCodeUnitsUtf16
// purpose  : Return the number of UTF-16 code units GetUtf16() writes for
//            the current character: 1 for values up to UTF32_MAX_BMP,
//            2 for values up to UTF32_MAX_LEGAL, and 0 for a surrogate
//            value or a value above UTF32_MAX_LEGAL.
// =======================================================================
template <typename Type>
inline Standard_Integer NCollection_UtfIterator<Type>::AdvanceCodeUnitsUtf16() const
{
  if (myCharUtf32 > UTF32_MAX_LEGAL)
  {
    // illegal
    return 0;
  }
  if (myCharUtf32 > UTF32_MAX_BMP)
  {
    // above 0xFFFF and up to 0x10FFFF - surrogate pair
    return 2;
  }

  // target is a character <= 0xFFFF;
  // UTF-16 surrogate values are illegal in UTF-32
  const bool isSurrogate =
    myCharUtf32 >= UTF16_SURROGATE_HIGH_START && myCharUtf32 <= UTF16_SURROGATE_LOW_END;
  return isSurrogate ? 0 : 1;
}
// =======================================================================
// function : GetUtf16
// purpose  : Write the current character to theBuffer as UTF-16 and return
//            the position right after the written code units. Nothing is
//            written and theBuffer is returned unchanged for a surrogate
//            value or a value above UTF32_MAX_LEGAL. The caller has to
//            provide room for up to 2 code units.
// =======================================================================
template <typename Type>
inline Standard_Utf16Char* NCollection_UtfIterator<Type>::GetUtf16(
  Standard_Utf16Char* theBuffer) const
{
  if (myCharUtf32 > UTF32_MAX_LEGAL)
  {
    // illegal
    return theBuffer;
  }

  if (myCharUtf32 > UTF32_MAX_BMP)
  {
    // surrogate pair: upper bits go to the high unit, lower ten to the low unit
    const Standard_Utf32Char anOffset = myCharUtf32 - UTF16_SURROGATE_LOW_BASE;
    theBuffer[0] =
      Standard_Utf16Char((anOffset >> UTF16_SURROGATE_HIGH_SHIFT) + UTF16_SURROGATE_HIGH_START);
    theBuffer[1] =
      Standard_Utf16Char((anOffset & UTF16_SURROGATE_LOW_MASK) + UTF16_SURROGATE_LOW_START);
    return theBuffer + 2;
  }

  // target is a character <= 0xFFFF;
  // UTF-16 surrogate values are illegal in UTF-32
  const bool isSurrogate =
    myCharUtf32 >= UTF16_SURROGATE_HIGH_START && myCharUtf32 <= UTF16_SURROGATE_LOW_END;
  if (isSurrogate)
  {
    return theBuffer;
  }

  theBuffer[0] = Standard_Utf16Char(myCharUtf32);
  return theBuffer + 1;
}
// =======================================================================
// function : GetUtf32
// purpose  : Store the current character in theBuffer without any check
//            and return the position right after it.
// =======================================================================
template <typename Type>
inline Standard_Utf32Char* NCollection_UtfIterator<Type>::GetUtf32(
  Standard_Utf32Char* theBuffer) const
{
  theBuffer[0] = myCharUtf32;
  return theBuffer + 1;
}