diff --git a/src/LDOM/LDOMParser.cxx b/src/LDOM/LDOMParser.cxx
index 82730ed576..341d902067 100644
--- a/src/LDOM/LDOMParser.cxx
+++ b/src/LDOM/LDOMParser.cxx
@@ -106,6 +106,18 @@ const TCollection_AsciiString& LDOMParser::GetError
   return myError;
 }
 
+//=======================================================================
+//function : GetBOM
+//purpose  : Returns the byte order mark defined at the start of a stream
+//=======================================================================
+
+LDOM_OSStream::BOMType LDOMParser::GetBOM() const
+{
+  if (myReader) // no reader yet means nothing was parsed: report BOM_UNDEFINED
+    return myReader->GetBOM();
+  return LDOM_OSStream::BOM_UNDEFINED;
+}
+
 //=======================================================================
 //function : parse
 //purpose  :
diff --git a/src/LDOM/LDOMParser.hxx b/src/LDOM/LDOMParser.hxx
index 85919de2b5..d064e5d019 100644
--- a/src/LDOM/LDOMParser.hxx
+++ b/src/LDOM/LDOMParser.hxx
@@ -64,6 +64,9 @@ class LDOMParser
                 GetError (TCollection_AsciiString& aData) const;
   // Return text describing a parsing error, or Empty if no error occurred
 
+  // Returns the byte order mark defined at the start of a stream
+  Standard_EXPORT LDOM_OSStream::BOMType GetBOM() const;
+
  protected:
   // ---------- PROTECTED METHODS ----------
 
diff --git a/src/LDOM/LDOM_OSStream.hxx b/src/LDOM/LDOM_OSStream.hxx
index e3f88a5a2d..511c683ff0 100644
--- a/src/LDOM/LDOM_OSStream.hxx
+++ b/src/LDOM/LDOM_OSStream.hxx
@@ -112,6 +112,23 @@ public:
 
 private:
   LDOM_SBuffer myBuffer;
+
+public:
+  // byte order mark defined at the start of a stream
+  enum BOMType {
+    BOM_UNDEFINED, // no BOM recognized
+    BOM_UTF8,
+    BOM_UTF16BE,
+    BOM_UTF16LE,
+    BOM_UTF32BE,
+    BOM_UTF32LE,
+    BOM_UTF7,
+    BOM_UTF1,
+    BOM_UTFEBCDIC,
+    BOM_SCSU,
+    BOM_BOCU1,
+    BOM_GB18030
+  };
 };
 
 #endif
diff --git a/src/LDOM/LDOM_XmlReader.cxx b/src/LDOM/LDOM_XmlReader.cxx
index 66a3cc1ca6..5b12bce602 100644
--- a/src/LDOM/LDOM_XmlReader.cxx
+++ b/src/LDOM/LDOM_XmlReader.cxx
@@ -73,7 +73,8 @@ LDOM_XmlReader::LDOM_XmlReader (
   myLastChild(NULL),
   myPtr (&myBuffer[0]),
   myEndPtr (&myBuffer[0]),
-  myTagPerStep (theTagPerStep)
+  myTagPerStep (theTagPerStep),
+  myBOM (LDOM_OSStream::BOM_UNDEFINED)
 {
 }
 
@@ -92,6 +93,7 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
   LDOMBasicString anAttrName, anAttrValue;
   char anAttDelimiter = '\0';
   Standard_Boolean aHasRead = Standard_False;
+  Standard_Boolean isFileStart = !myEOF && theIStream.tellg() == std::iostream::pos_type(0);
 
   for(;;) {
     // Check if the current file buffer is exhausted
@@ -153,6 +155,98 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
           myBuffer[aBytesRest + aNBytes] = '\0';
         }
       }
+    if (isFileStart)
+    {
+      isFileStart = Standard_False; // the BOM probe runs once per ReadRecord call
+      // check for BOM block: on a match record its type and advance myPtr past it
+      Standard_Utf8UChar aFirstChar = Standard_Utf8UChar(myPtr[0]);
+      switch(aFirstChar) {
+      case 0xEF: // UTF-8: EF BB BF
+        if (Standard_Utf8UChar(myPtr[1]) == 0xBB && Standard_Utf8UChar(myPtr[2]) == 0xBF)
+        {
+          myBOM = LDOM_OSStream::BOM_UTF8;
+          myPtr += 3;
+        }
+        break;
+      case 0xFE: // UTF-16 (BE): FE FF
+        if (Standard_Utf8UChar(myPtr[1]) == 0xFF)
+        {
+          myBOM = LDOM_OSStream::BOM_UTF16BE;
+          myPtr += 2;
+        }
+        break;
+      case 0xFF: // UTF-32 (LE): FF FE 00 00, otherwise UTF-16 (LE): FF FE
+        if (Standard_Utf8UChar(myPtr[1]) == 0xFE)
+        {
+          if (myPtr[2] == 0 && myPtr[3] == 0)
+          {
+            myBOM = LDOM_OSStream::BOM_UTF32LE;
+            myPtr += 4;
+          }
+          else
+          {
+            myBOM = LDOM_OSStream::BOM_UTF16LE;
+            myPtr += 2;
+          }
+        }
+        break;
+      case 0x00: // UTF-32 (BE): 00 00 FE FF
+        if (myPtr[1] == 0 && Standard_Utf8UChar(myPtr[2]) == 0xFE && Standard_Utf8UChar(myPtr[3]) == 0xFF)
+        {
+          myBOM = LDOM_OSStream::BOM_UTF32BE;
+          myPtr += 4;
+        }
+        break;
+      case 0x2B: // UTF-7: 2B 2F 76 followed by one of 2B, 2F, 38, 39
+        if (myPtr[1] == 47 && myPtr[2] == 118 &&
+            (myPtr[3] == 43 || myPtr[3] == 47 || myPtr[3] == 56 || myPtr[3] == 57))
+        {
+          myBOM = LDOM_OSStream::BOM_UTF7;
+          if (myPtr[3] == 56 && myPtr[4] == 45) // 5-byte form 2B 2F 76 38 2D: also skip the trailing '-'
+            myPtr += 5;
+          else
+            myPtr += 4;
+        }
+        break;
+      case 0xF7: // UTF-1: F7 64 4C
+        if (myPtr[1] == 100 && myPtr[2] == 76)
+        {
+          myBOM = LDOM_OSStream::BOM_UTF1;
+          myPtr += 3;
+        }
+        break;
+      case 0xDD: // UTF-EBCDIC: DD 73 66 73
+        if (myPtr[1] == 115 && myPtr[2] == 102 && myPtr[3] == 115)
+        {
+          myBOM = LDOM_OSStream::BOM_UTFEBCDIC;
+          myPtr += 4;
+        }
+        break;
+      case 0x0E: // SCSU: 0E FE FF
+        if (Standard_Utf8UChar(myPtr[1]) == 0xFE && Standard_Utf8UChar(myPtr[2]) == 0xFF)
+        {
+          myBOM = LDOM_OSStream::BOM_SCSU;
+          myPtr += 3;
+        }
+        break;
+      case 0xFB: // BOCU-1: FB EE 28
+        if (Standard_Utf8UChar(myPtr[1]) == 0xEE && myPtr[2] == 40)
+        {
+          myBOM = LDOM_OSStream::BOM_BOCU1;
+          myPtr += 3;
+        }
+        break;
+      case 0x84: // GB-18030: 84 31 95 33
+        if (myPtr[1] == 49 && Standard_Utf8UChar(myPtr[2]) == 0x95 && myPtr[3] == 51)
+        {
+          myBOM = LDOM_OSStream::BOM_GB18030;
+          myPtr += 4;
+        }
+        break;
+      }
+      if (myBOM != LDOM_OSStream::BOM_UNDEFINED)
+        continue; // restart the loop so the buffer checks see the advanced myPtr
+    }
 
     // Check the character data
     switch (aState) {
diff --git a/src/LDOM/LDOM_XmlReader.hxx b/src/LDOM/LDOM_XmlReader.hxx
index 16b923e3d1..cc8965ffd0 100644
--- a/src/LDOM/LDOM_XmlReader.hxx
+++ b/src/LDOM/LDOM_XmlReader.hxx
@@ -23,9 +23,9 @@
 #define XML_BUFFER_SIZE 20480
 
 #include <LDOM_BasicElement.hxx>
+#include <LDOM_OSStream.hxx>
 
 class TCollection_AsciiString;
-class LDOM_OSStream;
 
 // Class LDOM_XmlReader
 //
@@ -66,6 +66,9 @@ class LDOM_XmlReader
                                    const char * theEnd);
   // try convert string theStart to LDOM_AsciiInteger, return False on success
 
+  // Returns the byte order mark defined at the start of a stream
+  LDOM_OSStream::BOMType GetBOM() const { return myBOM; }
+
  private:
   // ---------- PRIVATE (PROHIBITED) METHODS ----------
   LDOM_XmlReader (const LDOM_XmlReader& theOther);
@@ -86,6 +89,7 @@ class LDOM_XmlReader
   const char *                  myEndPtr;
   char                          myBuffer [XML_BUFFER_SIZE+4];
   Standard_Boolean              myTagPerStep;
+  LDOM_OSStream::BOMType        myBOM; // BOM found by ReadRecord, BOM_UNDEFINED if none
 };
 
 #endif
diff --git a/src/QABugs/QABugs_14.cxx b/src/QABugs/QABugs_14.cxx
index c49cb76940..6d40fbc5de 100644
--- a/src/QABugs/QABugs_14.cxx
+++ b/src/QABugs/QABugs_14.cxx
@@ -674,10 +674,6 @@ static Standard_Integer OCC983 (Draw_Interpretor& di, Standard_Integer argc, co
     di << " AttributeValue = " << itemValue.ToCString() << "\n";
   }
 
-//    LDOM_Element element;
-//    for ( element = (const LDOM_Element&) root.getFirstChild();
-//          !element.isNull();
-//          element = (const LDOM_Element&) element.getNextSibling() ) {
   LDOM_Element element;
   LDOM_Node node;
   for ( node = root.getFirstChild(), element = (const LDOM_Element&) node;
@@ -694,6 +690,25 @@ static Standard_Integer OCC983 (Draw_Interpretor& di, Standard_Integer argc, co
       di << " AttributeValue = " << itemValue2.ToCString() << "\n";
     }
   }
+  if (aParser.GetBOM() != LDOM_OSStream::BOM_UNDEFINED)
+  {
+    di << "BOM is "; // the bug31340 test script searches this output for "UTF-8"
+    switch (aParser.GetBOM()) {
+    case LDOM_OSStream::BOM_UTF8: di << "UTF-8"; break;
+    case LDOM_OSStream::BOM_UTF16BE: di << "UTF-16 (BE)"; break;
+    case LDOM_OSStream::BOM_UTF16LE: di << "UTF-16 (LE)"; break;
+    case LDOM_OSStream::BOM_UTF32BE: di << "UTF-32 (BE)"; break;
+    case LDOM_OSStream::BOM_UTF32LE: di << "UTF-32 (LE)"; break;
+    case LDOM_OSStream::BOM_UTF7: di << "UTF-7"; break;
+    case LDOM_OSStream::BOM_UTF1: di << "UTF-1"; break;
+    case LDOM_OSStream::BOM_UTFEBCDIC: di << "UTF-EBCDIC"; break;
+    case LDOM_OSStream::BOM_SCSU: di << "SCSU"; break;
+    case LDOM_OSStream::BOM_BOCU1: di << "BOCU-1"; break;
+    case LDOM_OSStream::BOM_GB18030: di << "GB-18030"; break;
+    default: di << "unexpected";
+    }
+    di << "\n";
+  }
 
   return 0;
 }
diff --git a/tests/bugs/fclasses/bug31340 b/tests/bugs/fclasses/bug31340
new file mode 100644
index 0000000000..7f0e0dcc19
--- /dev/null
+++ b/tests/bugs/fclasses/bug31340
@@ -0,0 +1,18 @@
+puts "================"
+puts "0031340: LDOM fails to read XML file starting with BOM"
+puts "================"
+puts ""
+
+pload QAcommands
+
+set list [OCC983 [locate_data_file bug31340.xml]]
+
+if { [regexp "Document parsed" $list] == 1 } {
+  if {[lsearch -exact ${list} "UTF-8"] != -1 } {
+    puts "OK"
+  } else {
+    puts "Error : BOM was not found in $list"
+  }
+} else {
+  puts "Error : document not parsed"
+}