// Created on: 2013-01-28 // Created by: Kirill GAVRILOV // Copyright (c) 2013-2014 OPEN CASCADE SAS // // This file is part of Open CASCADE Technology software library. // // This library is free software; you can redistribute it and/or modify it under // the terms of the GNU Lesser General Public License version 2.1 as published // by the Free Software Foundation, with special exception defined in the file // OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT // distribution for complete text of the license and disclaimer of any warranty. // // Alternatively, this file may be used under the terms of Open CASCADE // commercial license or contractual agreement. // Portions of code are copyrighted by Unicode, Inc. // // Copyright (c) 2001-2004 Unicode, Inc. // // Disclaimer // // This source code is provided as is by Unicode, Inc. No claims are // made as to fitness for any particular purpose. No warranties of any // kind are expressed or implied. The recipient agrees to determine // applicability of information provided. If this file has been // purchased on magnetic or optical media from Unicode, Inc., the // sole remedy for any claim will be exchange of defective media // within 90 days of receipt. // // Limitations on Rights to Redistribute This Code // // Unicode, Inc. hereby grants the right to freely use the information // supplied in this file in the creation of products supporting the // Unicode Standard, and to make copies of this file in any form // for internal or external distribution as long as this notice // remains attached. //! The first character in a UTF-8 sequence indicates how many bytes //! to read (among other things). template const unsigned char NCollection_UtfIterator::UTF8_BYTES_MINUS_ONE[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5}; //! Magic values subtracted from a buffer value during UTF-8 conversion. //! This table contains as many values as there might be trailing bytes //! in a UTF-8 sequence. template const Standard_Utf32Char NCollection_UtfIterator::offsetsFromUTF8[6] = {0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL, 0x82082080UL}; //! The first character in a UTF-8 sequence indicates how many bytes to read. template const unsigned char NCollection_UtfIterator::UTF8_FIRST_BYTE_MARK[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC}; // ======================================================================= // function : readUTF8 // purpose : Get a UTF-8 character; leave the tracking pointer at the start of the next character. // Not protected against invalid UTF-8. // ======================================================================= template inline void NCollection_UtfIterator::readUTF8() { // unsigned arithmetic used Standard_Utf8UChar* aPos = (Standard_Utf8UChar*)myPosNext; const unsigned char aBytesToRead = UTF8_BYTES_MINUS_ONE[*aPos]; myCharUtf32 = 0; switch (aBytesToRead) { case 5: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8 Standard_FALLTHROUGH case 4: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8 Standard_FALLTHROUGH case 3: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; Standard_FALLTHROUGH case 2: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; Standard_FALLTHROUGH case 1: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; Standard_FALLTHROUGH case 0: myCharUtf32 += *aPos++; } myCharUtf32 -= offsetsFromUTF8[aBytesToRead]; myPosNext = (Type*)aPos; } // magic numbers template const Standard_Utf32Char NCollection_UtfIterator::UTF8_BYTE_MASK = 0xBF; template const Standard_Utf32Char NCollection_UtfIterator::UTF8_BYTE_MARK = 0x80; template const Standard_Utf32Char NCollection_UtfIterator::UTF16_SURROGATE_HIGH_START = 0xD800; template const Standard_Utf32Char NCollection_UtfIterator::UTF16_SURROGATE_HIGH_END = 0xDBFF; template const Standard_Utf32Char NCollection_UtfIterator::UTF16_SURROGATE_LOW_START = 0xDC00; template const Standard_Utf32Char NCollection_UtfIterator::UTF16_SURROGATE_LOW_END = 0xDFFF; template const Standard_Utf32Char NCollection_UtfIterator::UTF16_SURROGATE_HIGH_SHIFT = 10; template const Standard_Utf32Char NCollection_UtfIterator::UTF16_SURROGATE_LOW_BASE = 0x0010000UL; template const Standard_Utf32Char NCollection_UtfIterator::UTF16_SURROGATE_LOW_MASK = 0x3FFUL; template const Standard_Utf32Char NCollection_UtfIterator::UTF32_MAX_BMP = 0x0000FFFFUL; template const Standard_Utf32Char NCollection_UtfIterator::UTF32_MAX_LEGAL = 0x0010FFFFUL; // ======================================================================= // function : readUTF16 // purpose : // ======================================================================= template inline void NCollection_UtfIterator::readUTF16() { Standard_Utf32Char aChar = *myPosNext++; // if we have the first half of the surrogate pair if (aChar >= UTF16_SURROGATE_HIGH_START && aChar <= UTF16_SURROGATE_HIGH_END) { const Standard_Utf32Char aChar2 = *myPosNext; // complete the surrogate pair if (aChar2 >= UTF16_SURROGATE_LOW_START && aChar2 <= UTF16_SURROGATE_LOW_END) { aChar = ((aChar - UTF16_SURROGATE_HIGH_START) << UTF16_SURROGATE_HIGH_SHIFT) + (aChar2 - UTF16_SURROGATE_LOW_START) + UTF16_SURROGATE_LOW_BASE; ++myPosNext; } } myCharUtf32 = aChar; } // ======================================================================= // function : AdvanceBytesUtf8 // purpose : // ======================================================================= template inline Standard_Integer NCollection_UtfIterator::AdvanceBytesUtf8() const { if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START && myCharUtf32 <= UTF16_SURROGATE_LOW_END) { // UTF-16 surrogate values are illegal in UTF-32 return 0; } else if (myCharUtf32 < Standard_Utf32Char(0x80)) { return 1; } else if (myCharUtf32 < Standard_Utf32Char(0x800)) { return 2; } else if (myCharUtf32 < Standard_Utf32Char(0x10000)) { return 3; } else if (myCharUtf32 <= UTF32_MAX_LEGAL) { return 4; } else { // illegal return 0; } } // ======================================================================= // function : GetUtf8 // purpose : // ======================================================================= template inline Standard_Utf8Char* NCollection_UtfIterator::GetUtf8(Standard_Utf8Char* theBuffer) const { // unsigned arithmetic used return (Standard_Utf8Char*)GetUtf8((Standard_Utf8UChar*)theBuffer); } // ======================================================================= // function : GetUtf8 // purpose : // ======================================================================= template inline Standard_Utf8UChar* NCollection_UtfIterator::GetUtf8( Standard_Utf8UChar* theBuffer) const { Standard_Utf32Char aChar = myCharUtf32; if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START && myCharUtf32 <= UTF16_SURROGATE_LOW_END) { // UTF-16 surrogate values are illegal in UTF-32 return theBuffer; } else if (myCharUtf32 < Standard_Utf32Char(0x80)) { *theBuffer++ = Standard_Utf8UChar(aChar | UTF8_FIRST_BYTE_MARK[1]); return theBuffer; } else if (myCharUtf32 < Standard_Utf32Char(0x800)) { *++theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; *--theBuffer = Standard_Utf8UChar(aChar | UTF8_FIRST_BYTE_MARK[2]); return theBuffer + 2; } else if (myCharUtf32 < Standard_Utf32Char(0x10000)) { theBuffer += 3; *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; *--theBuffer = Standard_Utf8UChar(aChar | UTF8_FIRST_BYTE_MARK[3]); return theBuffer + 3; } else if (myCharUtf32 <= UTF32_MAX_LEGAL) { theBuffer += 4; *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; *--theBuffer = Standard_Utf8UChar(aChar | UTF8_FIRST_BYTE_MARK[4]); return theBuffer + 4; } else { // illegal return theBuffer; } } // ======================================================================= // function : AdvanceBytesUtf16 // purpose : // ======================================================================= template inline Standard_Integer NCollection_UtfIterator::AdvanceBytesUtf16() const { return AdvanceCodeUnitsUtf16() * sizeof(Standard_Utf16Char); } // ======================================================================= // function : AdvanceCodeUnitsUtf16 // purpose : // ======================================================================= template inline Standard_Integer NCollection_UtfIterator::AdvanceCodeUnitsUtf16() const { if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF { // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START && myCharUtf32 <= UTF16_SURROGATE_LOW_END) { return 0; } else { return 1; } } else if (myCharUtf32 > UTF32_MAX_LEGAL) { // illegal return 0; } else { // target is a character in range 0xFFFF - 0x10FFFF // surrogate pair return 2; } } // ======================================================================= // function : GetUtf16 // purpose : // ======================================================================= template inline Standard_Utf16Char* NCollection_UtfIterator::GetUtf16( Standard_Utf16Char* theBuffer) const { if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF { // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START && myCharUtf32 <= UTF16_SURROGATE_LOW_END) { return theBuffer; } else { *theBuffer++ = Standard_Utf16Char(myCharUtf32); return theBuffer; } } else if (myCharUtf32 > UTF32_MAX_LEGAL) { // illegal return theBuffer; } else { // surrogate pair Standard_Utf32Char aChar = myCharUtf32 - UTF16_SURROGATE_LOW_BASE; *theBuffer++ = Standard_Utf16Char((aChar >> UTF16_SURROGATE_HIGH_SHIFT) + UTF16_SURROGATE_HIGH_START); *theBuffer++ = Standard_Utf16Char((aChar & UTF16_SURROGATE_LOW_MASK) + UTF16_SURROGATE_LOW_START); return theBuffer; } } // ======================================================================= // function : GetUtf32 // purpose : // ======================================================================= template inline Standard_Utf32Char* NCollection_UtfIterator::GetUtf32( Standard_Utf32Char* theBuffer) const { *theBuffer++ = myCharUtf32; return theBuffer; }