occt/src/NCollection/NCollection_UtfIterator.lxx

// Created on: 2013-01-28
// Created by: Kirill GAVRILOV
// Copyright (c) 2013-2014 OPEN CASCADE SAS
//
// This file is part of Open CASCADE Technology software library.
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License version 2.1 as published
// by the Free Software Foundation, with special exception defined in the file
// OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
// distribution for complete text of the license and disclaimer of any warranty.
//
// Alternatively, this file may be used under the terms of Open CASCADE
// commercial license or contractual agreement.

// Portions of code are copyrighted by Unicode, Inc.
//
// Copyright (c) 2001-2004 Unicode, Inc.
//
// Disclaimer
//
// This source code is provided as is by Unicode, Inc. No claims are
// made as to fitness for any particular purpose. No warranties of any
// kind are expressed or implied. The recipient agrees to determine
// applicability of information provided. If this file has been
// purchased on magnetic or optical media from Unicode, Inc., the
// sole remedy for any claim will be exchange of defective media
// within 90 days of receipt.
//
// Limitations on Rights to Redistribute This Code
//
// Unicode, Inc. hereby grants the right to freely use the information
// supplied in this file in the creation of products supporting the
// Unicode Standard, and to make copies of this file in any form
// for internal or external distribution as long as this notice
// remains attached.

//! Lookup table indexed by the leading byte of a UTF-8 sequence.
//! The stored value is the number of bytes that follow the leading byte
//! (i.e. the sequence length minus one).
//! Bytes 0x00-0xBF map to 0, so a stray continuation byte (0x80-0xBF) is treated
//! as a one-byte sequence. Values 4 and 5 (leading bytes 0xF8-0xFF) correspond
//! to 5- and 6-byte forms, which are not legal UTF-8.
template<typename Type>
const unsigned char NCollection_UtfIterator<Type>::UTF8_BYTES_MINUS_ONE[256] =
{
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x00 - 0x0F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x10 - 0x1F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x20 - 0x2F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x30 - 0x3F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x40 - 0x4F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x50 - 0x5F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x60 - 0x6F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x70 - 0x7F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x80 - 0x8F (continuation bytes)
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x90 - 0x9F (continuation bytes)
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xA0 - 0xAF (continuation bytes)
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xB0 - 0xBF (continuation bytes)
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0xC0 - 0xCF (2-byte lead)
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0xD0 - 0xDF (2-byte lead)
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, // 0xE0 - 0xEF (3-byte lead)
  3,3,3,3,3,3,3,3, 4,4,4,4,5,5,5,5  // 0xF0 - 0xFF (4-byte lead; 5-/6-byte leads are illegal)
};

//! Correction values subtracted from the raw accumulator built by readUTF8().
//! readUTF8() adds up the bytes of a sequence (shifting by 6 bits between bytes)
//! without removing their marker bits - the length prefix of the leading byte and
//! the 0x80 tag of every trailing byte. Entry N holds the sum of those marker bits
//! for a sequence with N trailing bytes, truncated to 32 bits.
template<typename Type>
const unsigned long NCollection_UtfIterator<Type>::offsetsFromUTF8[6] =
{
  0x00000000UL, // 0 trailing bytes: nothing to remove
  0x00003080UL, // 1 trailing byte : (0xC0 << 6) + 0x80
  0x000E2080UL, // 2 trailing bytes: (0xE0 << 12) + (0x80 << 6) + 0x80
  0x03C82080UL, // 3 trailing bytes: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80
  0xFA082080UL, // 4 trailing bytes: 5-byte form, illegal UTF-8
  0x82082080UL  // 5 trailing bytes: 6-byte form, illegal UTF-8
};

//! Marker bits OR-ed into the leading byte when writing a UTF-8 sequence,
//! indexed by the total number of bytes in that sequence.
template<typename Type>
const unsigned char NCollection_UtfIterator<Type>::UTF8_FIRST_BYTE_MARK[7] =
{
  0x00, // [0] not referenced by the encoders in this file
  0x00, // [1] 0xxxxxxx - single byte, no marker
  0xC0, // [2] 110xxxxx
  0xE0, // [3] 1110xxxx
  0xF0, // [4] 11110xxx
  0xF8, // [5] 111110xx
  0xFC  // [6] 1111110x
};

// =======================================================================
// function : readUTF8
// purpose  : Decode the UTF-8 sequence starting at myPosNext into myCharUtf32
//            and move myPosNext to the first byte after that sequence.
//            The input is NOT validated: the sequence length is taken from
//            the leading byte alone and trailing bytes are consumed as-is.
// =======================================================================
template<typename Type>
inline void NCollection_UtfIterator<Type>::readUTF8()
{
  // the input is read as unsigned bytes, whatever the signedness of Type
  Standard_Utf8UChar* aCursor = (Standard_Utf8UChar* )myPosNext;
  const unsigned char aNbTrailing = UTF8_BYTES_MINUS_ONE[*aCursor];

  // add up every byte except the last one, making room for 6 more bits after each;
  // counts of 4 and 5 belong to 5-/6-byte sequences (illegal UTF-8) and are still consumed
  myCharUtf32 = 0;
  for (unsigned char aByteIter = 0; aByteIter < aNbTrailing; ++aByteIter)
  {
    myCharUtf32 += *aCursor++;
    myCharUtf32 <<= 6;
  }
  myCharUtf32 += *aCursor++;

  // the marker bits of all bytes were summed as well - remove them in one step
  myCharUtf32 -= offsetsFromUTF8[aNbTrailing];
  myPosNext = (Type* )aCursor;
}

// Scalar constants shared by the UTF-8 / UTF-16 encoders and decoders.

// UTF-8 trailing byte: (value | UTF8_BYTE_MARK) & UTF8_BYTE_MASK gives a byte of the form 10xxxxxx
template<typename Type>
const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MASK = 0xBF;
template<typename Type>
const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MARK = 0x80;

// UTF-16 surrogate ranges: high (leading) unit and low (trailing) unit
template<typename Type>
const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_START = 0xD800;
template<typename Type>
const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_END   = 0xDBFF;
template<typename Type>
const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_START  = 0xDC00;
template<typename Type>
const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_END    = 0xDFFF;

// UTF-16 surrogate pair arithmetic: the high unit carries the upper 10 bits,
// the low unit the lower 10 bits of (code point - UTF16_SURROGATE_LOW_BASE)
template<typename Type>
const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_SHIFT = 10;
template<typename Type>
const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_BASE   = 0x0010000UL;
template<typename Type>
const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_MASK   = 0x3FFUL;

// UTF-32 limits: last code point of the Basic Multilingual Plane and last legal code point
template<typename Type>
const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_BMP   = 0x0000FFFFUL;
template<typename Type>
const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_LEGAL = 0x0010FFFFUL;

// =======================================================================
// function : readUTF16
// purpose  : Decode the UTF-16 character at myPosNext into myCharUtf32 and
//            advance myPosNext by one code unit, or by two when a valid
//            surrogate pair has been consumed.
//            A high surrogate not followed by a low surrogate is stored as-is.
// =======================================================================
template<typename Type> inline
void NCollection_UtfIterator<Type>::readUTF16()
{
  const Standard_Utf32Char aLead = *myPosNext++;

  // anything but a high surrogate is a complete character on its own
  if (aLead < UTF16_SURROGATE_HIGH_START
   || aLead > UTF16_SURROGATE_HIGH_END)
  {
    myCharUtf32 = aLead;
    return;
  }

  // peek at the next unit; it is consumed only if it completes the pair
  const Standard_Utf32Char aTrail = *myPosNext;
  if (aTrail < UTF16_SURROGATE_LOW_START
   || aTrail > UTF16_SURROGATE_LOW_END)
  {
    myCharUtf32 = aLead;
    return;
  }

  // combine both halves: 10 bits from each unit on top of the 0x10000 base
  const Standard_Utf32Char aCombined = ((aLead - UTF16_SURROGATE_HIGH_START) << UTF16_SURROGATE_HIGH_SHIFT)
                                     + (aTrail - UTF16_SURROGATE_LOW_START) + UTF16_SURROGATE_LOW_BASE;
  ++myPosNext;
  myCharUtf32 = aCombined;
}

// =======================================================================
// function : AdvanceBytesUtf8
// purpose  : Return the number of bytes (1..4) needed to store the current
//            character in UTF-8, or 0 when it cannot be encoded
//            (surrogate value or value above UTF32_MAX_LEGAL).
// =======================================================================
template<typename Type> inline
Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf8() const
{
  // UTF-16 surrogate values are illegal in UTF-32
  const bool isSurrogate = myCharUtf32 >= UTF16_SURROGATE_HIGH_START
                        && myCharUtf32 <= UTF16_SURROGATE_LOW_END;
  if (isSurrogate)
  {
    return 0;
  }

  if (myCharUtf32 < Standard_Utf32Char(0x80))
  {
    return 1;
  }
  if (myCharUtf32 < Standard_Utf32Char(0x800))
  {
    return 2;
  }
  if (myCharUtf32 < Standard_Utf32Char(0x10000))
  {
    return 3;
  }

  // anything above the last legal code point is illegal
  return myCharUtf32 <= UTF32_MAX_LEGAL ? 4 : 0;
}

// =======================================================================
// function : GetUtf8
// purpose  : Signed-char convenience overload; forwards to the unsigned
//            overload and returns the position just after the written bytes.
// =======================================================================
template<typename Type> inline
Standard_Utf8Char* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8Char* theBuffer) const
{
  // the encoder works with unsigned arithmetic, so view the buffer as unsigned bytes
  Standard_Utf8UChar* const anUnsignedBuffer = (Standard_Utf8UChar* )theBuffer;
  Standard_Utf8UChar* const aNextPos = GetUtf8 (anUnsignedBuffer);
  return (Standard_Utf8Char* )aNextPos;
}

// =======================================================================
// function : GetUtf8
// purpose  : Write the current character into theBuffer as UTF-8 and return
//            the position just after the last written byte.
//            Nothing is written (theBuffer is returned unchanged) for
//            surrogate values and for values above UTF32_MAX_LEGAL.
// =======================================================================
template<typename Type> inline
Standard_Utf8UChar* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8UChar* theBuffer) const
{
  // determine the length of the encoded sequence
  Standard_Integer aNbBytes = 0;
  if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
   && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
  {
    // UTF-16 surrogate values are illegal in UTF-32
    return theBuffer;
  }
  else if (myCharUtf32 < Standard_Utf32Char(0x80))
  {
    aNbBytes = 1;
  }
  else if (myCharUtf32 < Standard_Utf32Char(0x800))
  {
    aNbBytes = 2;
  }
  else if (myCharUtf32 < Standard_Utf32Char(0x10000))
  {
    aNbBytes = 3;
  }
  else if (myCharUtf32 <= UTF32_MAX_LEGAL)
  {
    aNbBytes = 4;
  }
  else
  {
    // illegal
    return theBuffer;
  }

  // fill trailing bytes from the last one backwards, 6 payload bits per byte (10xxxxxx)
  Standard_Utf32Char aBits = myCharUtf32;
  for (Standard_Integer aByteIter = aNbBytes - 1; aByteIter > 0; --aByteIter)
  {
    theBuffer[aByteIter] = Standard_Utf8UChar((aBits | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
    aBits >>= 6;
  }

  // the remaining bits go into the leading byte together with the length marker
  theBuffer[0] = Standard_Utf8UChar (aBits | UTF8_FIRST_BYTE_MARK[aNbBytes]);
  return theBuffer + aNbBytes;
}

// =======================================================================
// function : AdvanceBytesUtf16
// purpose  : Return the number of bytes needed to store the current
//            character in UTF-16 (code unit count times the code unit size);
//            0 when the character cannot be encoded.
// =======================================================================
template<typename Type> inline
Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf16() const
{
  const Standard_Integer aNbCodeUnits = AdvanceCodeUnitsUtf16();
  return aNbCodeUnits * Standard_Integer(sizeof(Standard_Utf16Char));
}

// =======================================================================
// function : AdvanceCodeUnitsUtf16
// purpose  : Return the number of UTF-16 code units needed to store the
//            current character: 1 for the BMP, 2 for a surrogate pair,
//            0 when the character cannot be encoded
//            (surrogate value or value above UTF32_MAX_LEGAL).
// =======================================================================
template<typename Type> inline
Standard_Integer NCollection_UtfIterator<Type>::AdvanceCodeUnitsUtf16() const
{
  if (myCharUtf32 > UTF32_MAX_LEGAL)
  {
    // illegal
    return 0;
  }
  if (myCharUtf32 > UTF32_MAX_BMP)
  {
    // character in range 0x10000 - 0x10FFFF is stored as a surrogate pair
    return 2;
  }

  // character <= 0xFFFF; UTF-16 surrogate values are illegal in UTF-32
  const bool isSurrogate = myCharUtf32 >= UTF16_SURROGATE_HIGH_START
                        && myCharUtf32 <= UTF16_SURROGATE_LOW_END;
  return isSurrogate ? 0 : 1;
}

// =======================================================================
// function : GetUtf16
// purpose  : Write the current character into theBuffer as UTF-16 and return
//            the position just after the last written code unit.
//            Nothing is written (theBuffer is returned unchanged) for
//            surrogate values and for values above UTF32_MAX_LEGAL.
// =======================================================================
template<typename Type> inline
Standard_Utf16Char* NCollection_UtfIterator<Type>::GetUtf16 (Standard_Utf16Char* theBuffer) const
{
  if (myCharUtf32 > UTF32_MAX_LEGAL)
  {
    // illegal
    return theBuffer;
  }

  if (myCharUtf32 > UTF32_MAX_BMP)
  {
    // surrogate pair: upper 10 bits go to the high unit, lower 10 bits to the low unit
    const Standard_Utf32Char anOffset = myCharUtf32 - UTF16_SURROGATE_LOW_BASE;
    theBuffer[0] = Standard_Utf16Char((anOffset >> UTF16_SURROGATE_HIGH_SHIFT) + UTF16_SURROGATE_HIGH_START);
    theBuffer[1] = Standard_Utf16Char((anOffset &  UTF16_SURROGATE_LOW_MASK)   + UTF16_SURROGATE_LOW_START);
    return theBuffer + 2;
  }

  // character <= 0xFFFF; UTF-16 surrogate values are illegal in UTF-32
  const bool isSurrogate = myCharUtf32 >= UTF16_SURROGATE_HIGH_START
                        && myCharUtf32 <= UTF16_SURROGATE_LOW_END;
  if (isSurrogate)
  {
    return theBuffer;
  }

  *theBuffer = Standard_Utf16Char(myCharUtf32);
  return theBuffer + 1;
}

// =======================================================================
// function : GetUtf32
// purpose  : Store the current character into theBuffer without any check
//            and return the position just after it.
// =======================================================================
template<typename Type> inline
Standard_Utf32Char* NCollection_UtfIterator<Type>::GetUtf32 (Standard_Utf32Char* theBuffer) const
{
  *theBuffer = myCharUtf32;
  return theBuffer + 1;
}