1
0
mirror of https://git.dev.opencascade.org/repos/occt.git synced 2025-04-04 18:06:22 +03:00

0031340: LDOM fails to read XML file starting with BOM

Added support for a BOM (byte order mark) at the start of an XML stream or file; LDOMParser now also reports which BOM was found.
This commit is contained in:
mpv 2020-09-02 13:53:07 +03:00 committed by bugmaster
parent b3df3dcd82
commit 8f34d47e21
7 changed files with 169 additions and 6 deletions

View File

@ -106,6 +106,18 @@ const TCollection_AsciiString& LDOMParser::GetError
return myError;
}
//=======================================================================
//function : GetBOM
//purpose  : Returns the byte order mark (BOM) reported by the XML reader,
//           or BOM_UNDEFINED when no reader is available
//=======================================================================
LDOM_OSStream::BOMType LDOMParser::GetBOM() const
{
  // Without a reader there is nothing to query, so report "no BOM"
  return myReader ? myReader->GetBOM() : LDOM_OSStream::BOM_UNDEFINED;
}
//=======================================================================
//function : parse
//purpose :

View File

@ -64,6 +64,9 @@ class LDOMParser
GetError (TCollection_AsciiString& aData) const;
// Return text describing a parsing error, or Empty if no error occurred
// Returns the byte order mark (BOM) found at the start of the parsed stream
Standard_EXPORT LDOM_OSStream::BOMType GetBOM() const;
protected:
// ---------- PROTECTED METHODS ----------

View File

@ -112,6 +112,23 @@ public:
private:
LDOM_SBuffer myBuffer;
public:
// Byte order mark (BOM) that may be found at the start of a stream.
// The byte sequences listed below are the signatures checked by
// LDOM_XmlReader::ReadRecord when it inspects the first bytes of the input.
enum BOMType {
BOM_UNDEFINED, // no BOM recognized
BOM_UTF8, // EF BB BF
BOM_UTF16BE, // FE FF
BOM_UTF16LE, // FF FE (not followed by 00 00)
BOM_UTF32BE, // 00 00 FE FF
BOM_UTF32LE, // FF FE 00 00
BOM_UTF7, // 2B 2F 76 followed by one of 2B, 2F, 38, 39
BOM_UTF1, // F7 64 4C
BOM_UTFEBCDIC, // DD 73 66 73
BOM_SCSU, // 0E FE FF
BOM_BOCU1, // FB EE 28
BOM_GB18030 // 84 31 95 33
};
};
#endif

View File

@ -73,7 +73,8 @@ LDOM_XmlReader::LDOM_XmlReader (
myLastChild(NULL),
myPtr (&myBuffer[0]),
myEndPtr (&myBuffer[0]),
myTagPerStep (theTagPerStep)
myTagPerStep (theTagPerStep),
myBOM (LDOM_OSStream::BOM_UNDEFINED)
{
}
@ -92,6 +93,7 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
LDOMBasicString anAttrName, anAttrValue;
char anAttDelimiter = '\0';
Standard_Boolean aHasRead = Standard_False;
Standard_Boolean isFileStart = !myEOF && theIStream.tellg() == std::iostream::pos_type(0);
for(;;) {
// Check if the current file buffer is exhausted
@ -153,6 +155,98 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
myBuffer[aBytesRest + aNBytes] = '\0';
}
}
if (isFileStart)
{
  // The BOM can only appear at the very beginning of the stream,
  // so this check is performed once and never repeated.
  isFileStart = Standard_False;
  // Check for a BOM (byte order mark) block: the first byte selects the
  // candidate signature, the following bytes confirm it. On a match the
  // detected type is stored in myBOM and myPtr is advanced past the BOM.
  // Bytes >= 0x80 are compared through Standard_Utf8UChar so that the
  // comparison does not depend on the signedness of plain char.
  Standard_Utf8UChar aFirstChar = Standard_Utf8UChar(myPtr[0]);
  switch(aFirstChar) {
  case 0xEF: // UTF-8: EF BB BF
    if (Standard_Utf8UChar(myPtr[1]) == 0xBB && Standard_Utf8UChar(myPtr[2]) == 0xBF)
    {
      myBOM = LDOM_OSStream::BOM_UTF8;
      myPtr += 3;
    }
    break;
  case 0xFE: // UTF-16 (BE): FE FF
    if (Standard_Utf8UChar(myPtr[1]) == 0xFF)
    {
      myBOM = LDOM_OSStream::BOM_UTF16BE;
      myPtr += 2;
    }
    break;
  case 0xFF: // UTF-32 (LE): FF FE 00 00, otherwise UTF-16 (LE): FF FE
    if (Standard_Utf8UChar(myPtr[1]) == 0xFE)
    {
      if (myPtr[2] == 0 && myPtr[3] == 0)
      {
        myBOM = LDOM_OSStream::BOM_UTF32LE;
        myPtr += 4;
      }
      else
      {
        myBOM = LDOM_OSStream::BOM_UTF16LE;
        myPtr += 2;
      }
    }
    break;
  case 0x00: // UTF-32 (BE): 00 00 FE FF
    if (myPtr[1] == 0 && Standard_Utf8UChar(myPtr[2]) == 0xFE && Standard_Utf8UChar(myPtr[3]) == 0xFF)
    {
      myBOM = LDOM_OSStream::BOM_UTF32BE;
      myPtr += 4;
    }
    break;
  case 0x2B: // UTF-7: "+/v" followed by one of '+', '/', '8', '9'
    if (myPtr[1] == 47 && myPtr[2] == 118 &&
        (myPtr[3] == 43 || myPtr[3] == 47 || myPtr[3] == 56 || myPtr[3] == 57))
    {
      myBOM = LDOM_OSStream::BOM_UTF7;
      // The "+/v8-" form carries a trailing '-' (45) that belongs to the
      // BOM and must be skipped as well. The fifth byte (index 4) is the
      // one to test: the previous check compared myPtr[3] with two
      // different values and therefore could never succeed.
      if (myPtr[3] == 56 && myPtr[4] == 45)
        myPtr += 5;
      else
        myPtr += 4;
    }
    break;
  case 0xF7: // UTF-1: F7 64 4C
    if (myPtr[1] == 100 && myPtr[2] == 76)
    {
      myBOM = LDOM_OSStream::BOM_UTF1;
      myPtr += 3;
    }
    break;
  case 0xDD: // UTF-EBCDIC: DD 73 66 73
    if (myPtr[1] == 115 && myPtr[2] == 102 && myPtr[3] == 115)
    {
      myBOM = LDOM_OSStream::BOM_UTFEBCDIC;
      myPtr += 4;
    }
    break;
  case 0x0E: // SCSU: 0E FE FF
    if (Standard_Utf8UChar(myPtr[1]) == 0xFE && Standard_Utf8UChar(myPtr[2]) == 0xFF)
    {
      myBOM = LDOM_OSStream::BOM_SCSU;
      myPtr += 3;
    }
    break;
  case 0xFB: // BOCU-1: FB EE 28
    if (Standard_Utf8UChar(myPtr[1]) == 0xEE && myPtr[2] == 40)
    {
      myBOM = LDOM_OSStream::BOM_BOCU1;
      myPtr += 3;
    }
    break;
  case 0x84: // GB-18030: 84 31 95 33
    if (myPtr[1] == 49 && Standard_Utf8UChar(myPtr[2]) == 0x95 && myPtr[3] == 51)
    {
      myBOM = LDOM_OSStream::BOM_GB18030;
      myPtr += 4;
    }
    break;
  }
  // A BOM was consumed: restart the loop iteration so that processing
  // resumes from the advanced myPtr position.
  if (myBOM != LDOM_OSStream::BOM_UNDEFINED)
    continue;
}
// Check the character data
switch (aState) {

View File

@ -23,9 +23,9 @@
#define XML_BUFFER_SIZE 20480
#include <LDOM_BasicElement.hxx>
#include <LDOM_OSStream.hxx>
class TCollection_AsciiString;
class LDOM_OSStream;
// Class LDOM_XmlReader
//
@ -66,6 +66,9 @@ class LDOM_XmlReader
const char * theEnd);
// try convert string theStart to LDOM_AsciiInteger, return False on success
// Returns the byte order mark (BOM) recognized at the start of the stream;
// the value stays BOM_UNDEFINED when no BOM has been recognized
LDOM_OSStream::BOMType GetBOM() const
{
  return myBOM;
}
private:
// ---------- PRIVATE (PROHIBITED) METHODS ----------
LDOM_XmlReader (const LDOM_XmlReader& theOther);
@ -86,6 +89,7 @@ class LDOM_XmlReader
const char * myEndPtr;
char myBuffer [XML_BUFFER_SIZE+4];
Standard_Boolean myTagPerStep;
LDOM_OSStream::BOMType myBOM;
};
#endif

View File

@ -674,10 +674,6 @@ static Standard_Integer OCC983 (Draw_Interpretor& di, Standard_Integer argc, co
di << " AttributeValue = " << itemValue.ToCString() << "\n";
}
// LDOM_Element element;
// for ( element = (const LDOM_Element&) root.getFirstChild();
// !element.isNull();
// element = (const LDOM_Element&) element.getNextSibling() ) {
LDOM_Element element;
LDOM_Node node;
for ( node = root.getFirstChild(), element = (const LDOM_Element&) node;
@ -694,6 +690,25 @@ static Standard_Integer OCC983 (Draw_Interpretor& di, Standard_Integer argc, co
di << " AttributeValue = " << itemValue2.ToCString() << "\n";
}
}
if (aParser.GetBOM() != LDOM_OSStream::BOM_UNDEFINED)
{
  // Map the BOM reported by the parser to a printable encoding name;
  // any value not listed below is reported as "unexpected".
  const char* aBOMName = "unexpected";
  switch (aParser.GetBOM()) {
    case LDOM_OSStream::BOM_UTF8:
      aBOMName = "UTF-8";
      break;
    case LDOM_OSStream::BOM_UTF16BE:
      aBOMName = "UTF-16 (BE)";
      break;
    case LDOM_OSStream::BOM_UTF16LE:
      aBOMName = "UTF-16 (LE)";
      break;
    case LDOM_OSStream::BOM_UTF32BE:
      aBOMName = "UTF-32 (BE)";
      break;
    case LDOM_OSStream::BOM_UTF32LE:
      aBOMName = "UTF-32 (LE)";
      break;
    case LDOM_OSStream::BOM_UTF7:
      aBOMName = "UTF-7";
      break;
    case LDOM_OSStream::BOM_UTF1:
      aBOMName = "UTF-1";
      break;
    case LDOM_OSStream::BOM_UTFEBCDIC:
      aBOMName = "UTF-EBCDIC";
      break;
    case LDOM_OSStream::BOM_SCSU:
      aBOMName = "SCSU";
      break;
    case LDOM_OSStream::BOM_BOCU1:
      aBOMName = "BOCU-1";
      break;
    case LDOM_OSStream::BOM_GB18030:
      aBOMName = "GB-18030";
      break;
    default:
      break;
  }
  // Emit the report as a single line of interpreter output
  di << "BOM is " << aBOMName << "\n";
}
return 0;
}

View File

@ -0,0 +1,18 @@
puts "================"
puts "0031340: LDOM fails to read XML file starting with BOM"
puts "================"
puts ""

# Load the QAcommands plugin before invoking OCC983
pload QAcommands

# Parse the sample XML file and keep the command output for inspection
set list [OCC983 [locate_data_file bug31340.xml]]

# The test passes only if the document was parsed AND the output
# contains the word "UTF-8" (i.e. the UTF-8 BOM was reported).
if { ![regexp "Document parsed" $list] } {
  puts "Error : document not parsed"
} elseif { [lsearch -exact ${list} "UTF-8"] == -1 } {
  puts "Error : BOM was not found in $list"
} else {
  puts "OK"
}