1
0
mirror of https://git.dev.opencascade.org/repos/occt.git synced 2025-08-04 13:13:25 +03:00

0028454: Data Exchange, STEP reader - names with special characters cannot be read

- Add support of the control directives ( "\X2\" "\X4" "\X\" "\P*\" "\S\");
- Make param "read.stepcaf.codepage" base for conversion inside StepData instead of CAF;
- Rename "read.stepcaf.codepage" to "read.step.codepage".
- Add ISO 8859-1 - 9 code pages for conversion
- Add Resource_FormatType_NoConversion format type, that indicates non-conversion behavior
- Update old test cases that contain control directives
This commit is contained in:
dpasukhi
2020-10-09 13:57:30 +03:00
committed by bugmaster
parent 380748c340
commit 1b9cb073b9
22 changed files with 949 additions and 89 deletions

View File

@@ -28,12 +28,14 @@
#include <StepData_StepModel.hxx>
#include <StepData_StepWriter.hxx>
#include <TCollection_HAsciiString.hxx>
#include <Interface_Static.hxx>
#include <stdio.h>
IMPLEMENT_STANDARD_RTTIEXT(StepData_StepModel,Interface_InterfaceModel)
// Entete de fichier : liste d entites
StepData_StepModel::StepData_StepModel () { }
StepData_StepModel::StepData_StepModel () :mySourceCodePage((Resource_FormatType)Interface_Static::IVal("read.step.codepage"))
{}
Handle(Standard_Transient) StepData_StepModel::Entity

View File

@@ -20,6 +20,7 @@
#include <Interface_EntityList.hxx>
#include <TColStd_HArray1OfInteger.hxx>
#include <Interface_InterfaceModel.hxx>
#include <Resource_FormatType.hxx>
class Standard_NoSuchObject;
class Standard_Transient;
@@ -97,7 +98,12 @@ public:
//! same form as for PrintLabel
Standard_EXPORT Handle(TCollection_HAsciiString) StringLabel (const Handle(Standard_Transient)& ent) const Standard_OVERRIDE;
//! Return the encoding of STEP file for converting names into UNICODE.
//! Initialized from "read.step.codepage" variable by constructor, which is Resource_UTF8 by default.
Resource_FormatType SourceCodePage() const { return mySourceCodePage; }
//! Return the encoding of STEP file for converting names into UNICODE.
void SetSourceCodePage (Resource_FormatType theCode) { mySourceCodePage = theCode; }
DEFINE_STANDARD_RTTIEXT(StepData_StepModel,Interface_InterfaceModel)
@@ -112,6 +118,7 @@ private:
Interface_EntityList theheader;
Handle(TColStd_HArray1OfInteger) theidnums;
Resource_FormatType mySourceCodePage;
};

View File

@@ -18,6 +18,7 @@
#include <Interface_FileParameter.hxx>
#include <Interface_HArray1OfHAsciiString.hxx>
#include <Interface_Macros.hxx>
#include <Interface_Static.hxx>
#include <Interface_ParamList.hxx>
#include <Message.hxx>
#include <Message_Messenger.hxx>
@@ -37,6 +38,8 @@
#include <StepData_StepModel.hxx>
#include <StepData_StepReaderData.hxx>
#include <TCollection_AsciiString.hxx>
#include <TCollection_ExtendedString.hxx>
#include <NCollection_UtfIterator.hxx>
#include <TCollection_HAsciiString.hxx>
#include <TColStd_Array1OfInteger.hxx>
#include <TColStd_HArray1OfInteger.hxx>
@@ -46,6 +49,7 @@
#include <TColStd_IndexedMapOfInteger.hxx>
#include <TColStd_SequenceOfInteger.hxx>
#include <StepData_UndefinedEntity.hxx>
#include <Resource_Unicode.hxx>
#include <stdio.h>
IMPLEMENT_STANDARD_RTTIEXT(StepData_StepReaderData, Interface_FileReaderData)
@@ -61,7 +65,6 @@ static char txtmes[200]; // plus commode que redeclarer partout
static Standard_Boolean initstr = Standard_False;
#define Maxlst 64
//static TCollection_AsciiString subl[Maxlst]; // Maxlst : minimum 10
@@ -69,37 +72,232 @@ static Standard_Integer acceptvoid = 0;
// ---------- Fonctions Utilitaires ----------
//! Convert unsigned character to hexadecimal system,
//! if character hasn't representation in this system, returns 0.
static Standard_Integer convertCharacterTo16bit(const unsigned char theCharacter)
{
switch (theCharacter)
{
case '0': return 0;
case '1': return 1;
case '2': return 2;
case '3': return 3;
case '4': return 4;
case '5': return 5;
case '6': return 6;
case '7': return 7;
case '8': return 8;
case '9': return 9;
case 'A': return 10;
case 'B': return 11;
case 'C': return 12;
case 'D': return 13;
case 'E': return 14;
case 'F': return 15;
default : return 0;
}
}
//=======================================================================
//function : CleanText
//function : cleanText
//purpose :
//=======================================================================
static void CleanText(const Handle(TCollection_HAsciiString)& val)
void StepData_StepReaderData::cleanText(const Handle(TCollection_HAsciiString)& theVal) const
{
Standard_Integer n = val->Length(); // avant reduction
val->Remove(n);
val->Remove(1);
// Ne pas oublier de traiter les caracteres speciaux
Standard_Integer n = theVal->Length(); // string size before reduction
theVal->Remove(n);
theVal->Remove(1);
// Don't forget to treat the special characters
for (Standard_Integer i = n - 2; i > 0; i--) {
char uncar = val->Value(i);
if (uncar == '\n')
{ val->Remove(i); if (i < n-2) uncar = val->Value(i); }
if (uncar == '\'' && i < n - 2) {
if (val->Value(i + 1) == '\'') { val->Remove(i + 1); continue; }
char aChar = theVal->Value(i);
if (aChar == '\n')
{ theVal->Remove(i); if (i < n-2) aChar = theVal->Value(i); }
if (aChar == '\'' && i < n - 2) {
if (theVal->Value(i + 1) == '\'') { theVal->Remove(i + 1); continue; }
}
if (uncar == '\\' && i < n - 2) {
if (val->Value(i + 1) == '\\') { val->Remove(i + 1); continue; }
}
else if (uncar == '\\' && i < n - 3) {
if (val->Value(i + 2) == '\\') {
if (val->Value(i + 1) == 'N')
{ val->SetValue(i,'\n'); val->Remove(i+1,2); continue; }
if (val->Value(i + 1) == 'T')
{ val->SetValue(i,'\t'); val->Remove(i+1,2); continue; }
}
if (aChar == '\\' && i < n - 3) {
if (theVal->Value(i + 2) == '\\') {
if (theVal->Value(i + 1) == 'N')
{ theVal->SetValue(i,'\n'); theVal->Remove(i+1,2); continue; }
if (theVal->Value(i + 1) == 'T')
{ theVal->SetValue(i,'\t'); theVal->Remove(i+1,2); continue; }
}
}
}
// pass through without conversion the control directives
if (mySourceCodePage == Resource_FormatType_NoConversion)
return;
Standard_Integer aFirstCharInd = 1; // begin index of substring to conversion before the control directives
Standard_Integer aLastCharInd = 1; // end index of substring to conversion before the control directives
TCollection_ExtendedString aTempExtString; // string for characters within control directives
TCollection_ExtendedString anOutputExtString; // string for conversion in UTF-8
Resource_FormatType aLocalFormatType = Resource_FormatType_iso8859_1; // a code page for a "\S\" control directive
for (Standard_Integer i = 1; i <= theVal->Length(); ++i)
{
unsigned char aChar = theVal->Value(i);
if (aChar != '\\' || (theVal->Length() - i) < 3) // does not contain the control directive
{
continue;
}
Standard_Integer aLocalLastCharInd = i - 1;
Standard_Boolean isConverted = Standard_False;
// Encoding ISO 8859 characters within a string;
// ("\P{N}\") control directive;
// indicates code page for ("\S\") control directive;
// {N}: "A", "B", "C", "D", "E", "F", "G", "H", "I";
// "A" identifies ISO 8859-1; "B" identifies ISO 8859-2, etc.
if (theVal->Value(i + 1) == 'P' && theVal->Length() - i > 3 && theVal->Value(i + 3) == '\\')
{
Standard_Character aPageId = UpperCase (theVal->Value(i + 2));
if (aPageId >= 'A' && aPageId <= 'I')
{
aLocalFormatType = (Resource_FormatType)(Resource_FormatType_iso8859_1 + (aPageId - 'A'));
}
else
{
thecheck->AddWarning("String control directive \\P*\\ with an unsupported symbol in place of *");
}
isConverted = Standard_True;
i += 3;
}
// Encoding ISO 8859 characters within a string;
// ("\S\") control directive;
// converts followed a LATIN CODEPOINT character.
else if (theVal->Value(i + 1) == 'S' && theVal->Length() - i > 2 && theVal->Value(i + 2) == '\\')
{
Standard_Character aResChar = theVal->Value(i + 3) | 0x80;
const char aStrForCovert[2] = { aResChar, '\0' };
Resource_Unicode::ConvertFormatToUnicode(aLocalFormatType, aStrForCovert, aTempExtString);
isConverted = Standard_True;
i += 3;
}
// Encoding U+0000 to U+00FF in a string
// ("\X\") control directive;
// converts followed two hexadecimal character.
else if (theVal->Value(i + 1) == 'X' && theVal->Length() - i > 3 && theVal->Value(i + 2) == '\\')
{
Standard_Character aResChar = (char)convertCharacterTo16bit(theVal->Value(i + 3));
aResChar = (aResChar << 4) | (char)convertCharacterTo16bit(theVal->Value(i + 4));
const char aStrForCovert[2] = { aResChar, '\0' };
aTempExtString = TCollection_ExtendedString(aStrForCovert, Standard_False); // pass through without conversion
isConverted = Standard_True;
i += 4;
}
// Encoding ISO 10646 characters within a string
// ("\X{N}\") control directive;
// {N}: "0", "2", "4";
// "\X2\" or "\X4\" converts followed a hexadecimal character sequence;
// "\X0\" indicate the end of the "\X2\" or "\X4\".
else if (theVal->Value(i + 1) == 'X' && theVal->Length() - i > 2 && theVal->Value(i + 3) == '\\')
{
Standard_Integer aFirstInd = i + 3;
Standard_Integer aLastInd = i;
Standard_Boolean isClosed = Standard_False;
for (; i <= theVal->Length() && !isClosed; ++i) // find the end of the "\X2\" or "\X4\" by an external "i"
{
if (theVal->Length() - i > 2 && theVal->Value(i) == '\\' && theVal->Value(i + 1) == 'X' && theVal->Value(i + 2) == '0' && theVal->Value(i + 3) == '\\')
{
aLastInd = i - 1;
i = i + 2;
isClosed = Standard_True;
}
}
if (!isClosed) // "\X0\" not exists
{
aLastInd = theVal->Length();
}
TCollection_AsciiString aBitString;
aBitString = TCollection_AsciiString(theVal->ToCString() + aFirstInd, aLastInd - aFirstInd);
aBitString.UpperCase(); // make valid for conversion into 16-bit
// "\X2\" control directive;
// followed by multiples of four or three hexadecimal characters.
// Encoding in UTF-16
if (theVal->Value(aFirstInd - 1) == '2' && theVal->Length() - aFirstInd > 3)
{
Standard_Integer anIterStep = (aBitString.Length() % 4 == 0) ? 4 : 3;
if (aBitString.Length() % anIterStep)
{
aTempExtString.AssignCat('?');
thecheck->AddWarning("String control directive \\X2\\ is followed by number of digits not multiple of 4");
}
else
{
Standard_Integer aStrLen = aBitString.Length() / anIterStep;
Standard_Utf16Char aUtfCharacter = '\0';
for (Standard_Integer aCharInd = 1; aCharInd <= aStrLen * anIterStep; ++aCharInd)
{
aUtfCharacter |= convertCharacterTo16bit(aBitString.Value(aCharInd));
if (aCharInd % anIterStep == 0)
{
aTempExtString.AssignCat(aUtfCharacter);
aUtfCharacter = '\0';
}
aUtfCharacter = aUtfCharacter << 4;
}
}
}
// "\X4\" control directive;
// followed by multiples of eight hexadecimal characters.
// Encoding in UTF-32
else if (theVal->Value(aFirstInd - 1) == '4' && theVal->Length() - aFirstInd > 7)
{
if (aBitString.Length() % 8)
{
aTempExtString.AssignCat('?');
thecheck->AddWarning("String control directive \\X4\\ is followed by number of digits not multiple of 8");
}
else
{
Standard_Integer aStrLen = aBitString.Length() / 8;
Standard_Utf32Char aUtfCharacter[2] = {'\0', '\0'};
for (Standard_Integer aCharInd = 1; aCharInd <= aStrLen * 8; ++aCharInd)
{
aUtfCharacter[0] |= convertCharacterTo16bit(aBitString.Value(aCharInd));
if (aCharInd % 8 == 0)
{
NCollection_Utf32Iter aUtfIter(aUtfCharacter);
Standard_Utf16Char aStringBuffer[3];
Standard_Utf16Char* aUtfPntr = aUtfIter.GetUtf16(aStringBuffer);
*aUtfPntr++ = '\0';
TCollection_ExtendedString aUtfString(aStringBuffer);
aTempExtString.AssignCat(aUtfString);
aUtfCharacter[0] = '\0';
}
aUtfCharacter[0] = aUtfCharacter[0] << 4;
}
}
}
isConverted = Standard_True;
}
if (isConverted) // find the control directive
{
TCollection_ExtendedString anExtString;
if (aFirstCharInd <= aLocalLastCharInd)
{
Resource_Unicode::ConvertFormatToUnicode(mySourceCodePage, theVal->SubString(aFirstCharInd, aLocalLastCharInd)->ToCString(), anExtString);
}
anOutputExtString.AssignCat(anExtString);
anOutputExtString.AssignCat(aTempExtString);
aFirstCharInd = i + 1;
aLastCharInd = aFirstCharInd;
aTempExtString.Clear();
}
}
if (aLastCharInd <= theVal->Length())
{
Resource_Unicode::ConvertFormatToUnicode(mySourceCodePage, theVal->ToCString() + aLastCharInd - 1, aTempExtString);
anOutputExtString.AssignCat(aTempExtString);
}
theVal->Clear();
TCollection_AsciiString aTmpString(anOutputExtString, 0);
theVal->AssignCat(aTmpString.ToCString());
}
// ------------- METHODES -------------
//=======================================================================
@@ -109,9 +307,9 @@ static void CleanText(const Handle(TCollection_HAsciiString)& val)
StepData_StepReaderData::StepData_StepReaderData
(const Standard_Integer nbheader, const Standard_Integer nbtotal,
const Standard_Integer nbpar)
const Standard_Integer nbpar, const Resource_FormatType theSourceCodePage)
: Interface_FileReaderData(nbtotal, nbpar), theidents(1, nbtotal),
thetypes(1, nbtotal) //, themults (1,nbtotal)
thetypes(1, nbtotal), mySourceCodePage(theSourceCodePage) //, themults (1,nbtotal)
{
// char textnum[10];
thenbscop = 0; thenbents = 0; thelastn = 0; thenbhead = nbheader;
@@ -564,7 +762,9 @@ Standard_Integer StepData_StepReaderData::ReadSub(const Standard_Integer numsub,
case 6: {
if (FT != Interface_ParamText) { kod = 0; break; }
Handle(TCollection_HAsciiString) txt = new TCollection_HAsciiString(str);
CleanText(txt); hst->SetValue(ip, txt); break;
cleanText(txt);
hst->SetValue(ip, txt);
break;
}
case 7: {
Handle(Standard_Transient) ent = BoundEntity(FP.EntityNumber());
@@ -636,7 +836,9 @@ Standard_Integer StepData_StepReaderData::ReadSub(const Standard_Integer numsub,
case Interface_ParamLogical: break;
case Interface_ParamText: {
Handle(TCollection_HAsciiString) txt = new TCollection_HAsciiString(str);
CleanText(txt); htr->SetValue(ip, txt); break;
cleanText(txt);
htr->SetValue(ip, txt);
break;
}
case Interface_ParamSub: {
Handle(Standard_Transient) sub;
@@ -714,7 +916,9 @@ Standard_Boolean StepData_StepReaderData::ReadField(const Standard_Integer num,
case Interface_ParamVoid: break;
case Interface_ParamText:
txt = new TCollection_HAsciiString(str);
CleanText(txt); fild.Set(txt); break;
cleanText(txt);
fild.Set(txt);
break;
case Interface_ParamEnum:
if (!strcmp(str, ".T.")) fild.SetLogical(StepData_LTrue);
else if (!strcmp(str, ".F.")) fild.SetLogical(StepData_LFalse);
@@ -841,7 +1045,7 @@ Standard_Boolean StepData_StepReaderData::ReadAny(const Standard_Integer num,
case Interface_ParamLogical: break;
case Interface_ParamText: {
Handle(TCollection_HAsciiString) txt = new TCollection_HAsciiString(str);
CleanText(txt);
cleanText(txt);
// PDN May 2000: for reading SOURCE_ITEM (external references)
if (!val.IsNull()) {
@@ -1242,7 +1446,7 @@ Standard_Boolean StepData_StepReaderData::ReadString(const Standard_Integer num,
CleanText (val);
}*/
val = new TCollection_HAsciiString(FP.CValue());
CleanText(val);
cleanText(val);
} else {
if (acceptvoid && FP.ParamType() == Interface_ParamVoid) warn = Standard_True;
errmess = new String("Parameter n0.%d (%s) not a quoted String");

View File

@@ -19,6 +19,7 @@
#include <Standard.hxx>
#include <Standard_Type.hxx>
#include <Resource_FormatType.hxx>
#include <TColStd_Array1OfInteger.hxx>
#include <Interface_IndexedMapOfAsciiString.hxx>
@@ -63,7 +64,7 @@ public:
//! creation time, because it contains arrays)
//! nbheader is nb of records for Header, nbtotal for Header+Data
//! and nbpar gives the total count of parameters
Standard_EXPORT StepData_StepReaderData(const Standard_Integer nbheader, const Standard_Integer nbtotal, const Standard_Integer nbpar);
Standard_EXPORT StepData_StepReaderData(const Standard_Integer nbheader, const Standard_Integer nbtotal, const Standard_Integer nbpar, const Resource_FormatType theSourceCodePage = Resource_FormatType_UTF8);
//! Fills the fields of a record
Standard_EXPORT void SetRecord (const Standard_Integer num, const Standard_CString ident, const Standard_CString type, const Standard_Integer nbpar);
@@ -349,6 +350,16 @@ private:
//! If found, returns its EntityNumber, else returns Zero.
Standard_EXPORT Standard_Integer FindEntityNumber (const Standard_Integer num, const Standard_Integer id) const;
//! Prepare string to use in OCCT exchange structure.
//! If code page is Resource_FormatType_NoConversion,
//! clean only special characters without conversion;
//! else convert a string to UTF8 using the code page
//! and handle the control directives.
Standard_EXPORT void cleanText(const Handle(TCollection_HAsciiString)& theVal) const;
private:
TColStd_Array1OfInteger theidents;
TColStd_Array1OfInteger thetypes;
Interface_IndexedMapOfAsciiString thenametypes;
@@ -358,6 +369,7 @@ private:
Standard_Integer thenbhead;
Standard_Integer thenbscop;
Handle(Interface_Check) thecheck;
Resource_FormatType mySourceCodePage;
};