From 8f34d47e211037d7fa6df8c4f060a161d9d84037 Mon Sep 17 00:00:00 2001 From: mpv Date: Wed, 2 Sep 2020 13:53:07 +0300 Subject: [PATCH] 0031340: LDOM fails to read XML file starting with BOM Added support of BOM (byte order mark) characters at the start of an XML stream or file and provided information about the found BOM in LDOMParser. --- src/LDOM/LDOMParser.cxx | 12 +++++ src/LDOM/LDOMParser.hxx | 3 ++ src/LDOM/LDOM_OSStream.hxx | 17 +++++++ src/LDOM/LDOM_XmlReader.cxx | 96 +++++++++++++++++++++++++++++++++++- src/LDOM/LDOM_XmlReader.hxx | 6 ++- src/QABugs/QABugs_14.cxx | 23 +++++++-- tests/bugs/fclasses/bug31340 | 18 +++++++ 7 files changed, 169 insertions(+), 6 deletions(-) create mode 100644 tests/bugs/fclasses/bug31340 diff --git a/src/LDOM/LDOMParser.cxx b/src/LDOM/LDOMParser.cxx index 82730ed576..341d902067 100644 --- a/src/LDOM/LDOMParser.cxx +++ b/src/LDOM/LDOMParser.cxx @@ -106,6 +106,18 @@ const TCollection_AsciiString& LDOMParser::GetError return myError; } +//======================================================================= +//function : GetBOM +//purpose : Returns the byte order mark defined at the start of a stream +//======================================================================= + +LDOM_OSStream::BOMType LDOMParser::GetBOM() const +{ + if (myReader) + return myReader->GetBOM(); + return LDOM_OSStream::BOM_UNDEFINED; +} + //======================================================================= //function : parse //purpose : diff --git a/src/LDOM/LDOMParser.hxx b/src/LDOM/LDOMParser.hxx index 85919de2b5..d064e5d019 100644 --- a/src/LDOM/LDOMParser.hxx +++ b/src/LDOM/LDOMParser.hxx @@ -64,6 +64,9 @@ class LDOMParser GetError (TCollection_AsciiString& aData) const; // Return text describing a parsing error, or Empty if no error occurred + // Returns the byte order mark defined at the start of a stream + Standard_EXPORT LDOM_OSStream::BOMType GetBOM() const; + protected: // ---------- PROTECTED METHODS ---------- diff --git 
a/src/LDOM/LDOM_OSStream.hxx b/src/LDOM/LDOM_OSStream.hxx index e3f88a5a2d..511c683ff0 100644 --- a/src/LDOM/LDOM_OSStream.hxx +++ b/src/LDOM/LDOM_OSStream.hxx @@ -112,6 +112,23 @@ public: private: LDOM_SBuffer myBuffer; + +public: + // Type of the byte order mark (BOM) found at the start of a stream; BOM_UNDEFINED is the value kept while no BOM has been detected + enum BOMType { + BOM_UNDEFINED, + BOM_UTF8, + BOM_UTF16BE, + BOM_UTF16LE, + BOM_UTF32BE, + BOM_UTF32LE, + BOM_UTF7, + BOM_UTF1, + BOM_UTFEBCDIC, + BOM_SCSU, + BOM_BOCU1, + BOM_GB18030 + }; }; #endif diff --git a/src/LDOM/LDOM_XmlReader.cxx b/src/LDOM/LDOM_XmlReader.cxx index 66a3cc1ca6..5b12bce602 100644 --- a/src/LDOM/LDOM_XmlReader.cxx +++ b/src/LDOM/LDOM_XmlReader.cxx @@ -73,7 +73,8 @@ LDOM_XmlReader::LDOM_XmlReader ( myLastChild(NULL), myPtr (&myBuffer[0]), myEndPtr (&myBuffer[0]), - myTagPerStep (theTagPerStep) + myTagPerStep (theTagPerStep), + myBOM (LDOM_OSStream::BOM_UNDEFINED) { } @@ -92,6 +93,7 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr LDOMBasicString anAttrName, anAttrValue; char anAttDelimiter = '\0'; Standard_Boolean aHasRead = Standard_False; + Standard_Boolean isFileStart = !myEOF && theIStream.tellg() == std::iostream::pos_type(0); for(;;) { // Check if the current file buffer is exhausted @@ -153,6 +155,98 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr myBuffer[aBytesRest + aNBytes] = '\0'; } } + if (isFileStart) + { + isFileStart = Standard_False; + // BOM check, done at most once per call and only if the stream position was 0 on entry: on a match the BOM type is stored in myBOM and myPtr is moved past the BOM bytes + Standard_Utf8UChar aFirstChar = Standard_Utf8UChar(myPtr[0]); + switch(aFirstChar) { + case 0xEF: + if (Standard_Utf8UChar(myPtr[1]) == 0xBB && Standard_Utf8UChar(myPtr[2]) == 0xBF) + { + myBOM = LDOM_OSStream::BOM_UTF8; + myPtr += 3; + } + break; + case 0xFE: + if (Standard_Utf8UChar(myPtr[1]) == 0xFF) + { + myBOM = LDOM_OSStream::BOM_UTF16BE; + myPtr += 2; + } + break; + case 0xFF: + if (Standard_Utf8UChar(myPtr[1]) == 0xFE) + { + if (myPtr[2] == 0 && myPtr[3] == 0) + { + myBOM = LDOM_OSStream::BOM_UTF32LE; + myPtr 
+= 4; + } + else + { + myBOM = LDOM_OSStream::BOM_UTF16LE; + myPtr += 2; + } + } + break; + case 0x00: + if (myPtr[1] == 0 && Standard_Utf8UChar(myPtr[2]) == 0xFE && Standard_Utf8UChar(myPtr[3]) == 0xFF) + { + myBOM = LDOM_OSStream::BOM_UTF32BE; + myPtr += 4; + } + break; + case 0x2B: + if (myPtr[1] == 47 && myPtr[2] == 118 && + (myPtr[3] == 43 || myPtr[3] == 47 || myPtr[3] == 56 || myPtr[3] == 57)) + { + myBOM = LDOM_OSStream::BOM_UTF7; + if (myPtr[3] == 56 && myPtr[4] == 45) // five-byte form 2B 2F 76 38 2D: the fifth byte must be tested, not the fourth one again + myPtr += 5; + else + myPtr += 4; + } + break; + case 0xF7: + if (myPtr[1] == 100 && myPtr[2] == 76) + { + myBOM = LDOM_OSStream::BOM_UTF1; + myPtr += 3; + } + break; + case 0xDD: + if (myPtr[1] == 115 && myPtr[2] == 102 && myPtr[3] == 115) + { + myBOM = LDOM_OSStream::BOM_UTFEBCDIC; + myPtr += 4; + } + break; + case 0x0E: + if (Standard_Utf8UChar(myPtr[1]) == 0xFE && Standard_Utf8UChar(myPtr[2]) == 0xFF) + { + myBOM = LDOM_OSStream::BOM_SCSU; + myPtr += 3; + } + break; + case 0xFB: + if (Standard_Utf8UChar(myPtr[1]) == 0xEE && myPtr[2] == 40) + { + myBOM = LDOM_OSStream::BOM_BOCU1; + myPtr += 3; + } + break; + case 0x84: + if (myPtr[1] == 49 && Standard_Utf8UChar(myPtr[2]) == 0x95 && myPtr[3] == 51) + { + myBOM = LDOM_OSStream::BOM_GB18030; + myPtr += 4; + } + break; + } + if (myBOM != LDOM_OSStream::BOM_UNDEFINED) + continue; + } // Check the character data switch (aState) { diff --git a/src/LDOM/LDOM_XmlReader.hxx b/src/LDOM/LDOM_XmlReader.hxx index 16b923e3d1..cc8965ffd0 100644 --- a/src/LDOM/LDOM_XmlReader.hxx +++ b/src/LDOM/LDOM_XmlReader.hxx @@ -23,9 +23,9 @@ #define XML_BUFFER_SIZE 20480 #include +#include class TCollection_AsciiString; -class LDOM_OSStream; // Class LDOM_XmlReader // @@ -66,6 +66,9 @@ class LDOM_XmlReader const char * theEnd); // try convert string theStart to LDOM_AsciiInteger, return False on success + // Returns the byte order mark defined at the start of a stream + LDOM_OSStream::BOMType GetBOM() const { return myBOM; } + private: // ---------- PRIVATE 
(PROHIBITED) METHODS ---------- LDOM_XmlReader (const LDOM_XmlReader& theOther); @@ -86,6 +89,7 @@ class LDOM_XmlReader const char * myEndPtr; char myBuffer [XML_BUFFER_SIZE+4]; Standard_Boolean myTagPerStep; + LDOM_OSStream::BOMType myBOM; }; #endif diff --git a/src/QABugs/QABugs_14.cxx b/src/QABugs/QABugs_14.cxx index c49cb76940..6d40fbc5de 100644 --- a/src/QABugs/QABugs_14.cxx +++ b/src/QABugs/QABugs_14.cxx @@ -674,10 +674,6 @@ static Standard_Integer OCC983 (Draw_Interpretor& di, Standard_Integer argc, co di << " AttributeValue = " << itemValue.ToCString() << "\n"; } -// LDOM_Element element; -// for ( element = (const LDOM_Element&) root.getFirstChild(); -// !element.isNull(); -// element = (const LDOM_Element&) element.getNextSibling() ) { LDOM_Element element; LDOM_Node node; for ( node = root.getFirstChild(), element = (const LDOM_Element&) node; @@ -694,6 +690,25 @@ static Standard_Integer OCC983 (Draw_Interpretor& di, Standard_Integer argc, co di << " AttributeValue = " << itemValue2.ToCString() << "\n"; } } + if (aParser.GetBOM() != LDOM_OSStream::BOM_UNDEFINED) // print the BOM type only when the parser reports one + { + di << "BOM is "; + switch (aParser.GetBOM()) { + case LDOM_OSStream::BOM_UTF8: di << "UTF-8"; break; + case LDOM_OSStream::BOM_UTF16BE: di << "UTF-16 (BE)"; break; + case LDOM_OSStream::BOM_UTF16LE: di << "UTF-16 (LE)"; break; + case LDOM_OSStream::BOM_UTF32BE: di << "UTF-32 (BE)"; break; + case LDOM_OSStream::BOM_UTF32LE: di << "UTF-32 (LE)"; break; + case LDOM_OSStream::BOM_UTF7: di << "UTF-7"; break; + case LDOM_OSStream::BOM_UTF1: di << "UTF-1"; break; + case LDOM_OSStream::BOM_UTFEBCDIC: di << "UTF-EBCDIC"; break; + case LDOM_OSStream::BOM_SCSU: di << "SCSU"; break; + case LDOM_OSStream::BOM_BOCU1: di << "BOCU-1"; break; + case LDOM_OSStream::BOM_GB18030: di << "GB-18030"; break; + default: di << "unexpected"; // any BOMType value not listed above + } + di << "\n"; + } return 0; } diff --git a/tests/bugs/fclasses/bug31340 b/tests/bugs/fclasses/bug31340 new file mode 100644 index 0000000000..7f0e0dcc19 --- /dev/null +++ 
b/tests/bugs/fclasses/bug31340 @@ -0,0 +1,18 @@ +puts "================" +puts "0031340: LDOM fails to read XML file starting with BOM" +puts "================" +puts "" + +pload QAcommands + +set list [OCC983 [locate_data_file bug31340.xml]] ;# output of OCC983 run on the test file, inspected below + +if { [regexp "Document parsed" $list] == 1 } { + if {[lsearch -exact ${list} "UTF-8"] != -1 } { + puts "OK" ;# document parsed and the word UTF-8 is present in the output + } else { + puts "Error : BOM was not found in $list" + } +} else { + puts "Error : document not parsed" +}