1
0
mirror of https://git.dev.opencascade.org/repos/occt.git synced 2025-08-14 13:30:48 +03:00

0028454: Data Exchange, STEP reader - names with special characters cannot be read

- Add support for the control directives ("\X2\", "\X4\", "\X\", "\P*\", "\S\");
- Make the "read.stepcaf.codepage" parameter the basis for conversion inside StepData instead of CAF;
- Rename "read.stepcaf.codepage" to "read.step.codepage".
- Add ISO 8859-1 through ISO 8859-9 code pages for conversion
- Add the Resource_FormatType_NoConversion format type, which indicates that no conversion is performed
- Update old test cases that contain control directives
This commit is contained in:
dpasukhi
2020-10-09 13:57:30 +03:00
committed by bugmaster
parent 380748c340
commit 1b9cb073b9
22 changed files with 949 additions and 89 deletions

View File

@@ -1,5 +1,5 @@
Resource_ANSI.pxx
Resource_Big5.pxx
Resource_CodePages.pxx
Resource_ConvertUnicode.c
Resource_ConvertUnicode.hxx
Resource_DataMapOfAsciiStringAsciiString.hxx

View File

@@ -14,7 +14,7 @@
#include <Standard_TypeDef.hxx>
// Code pages ANSI -> UTF16
static const Standard_ExtCharacter THE_CODEPAGES_ANSI[9][128] =
static const Standard_ExtCharacter THE_CODEPAGES_ANSI[Resource_FormatType_iso8859_9 - Resource_FormatType_CP1250 + 1][128] =
{
{
// code page: cp1250
@@ -338,5 +338,329 @@ static const Standard_ExtCharacter THE_CODEPAGES_ANSI[9][128] =
0xf4, 0x1a1, 0xf6, 0xf7,
0xf8, 0xf9, 0xfa, 0xfb,
0xfc, 0x1b0, 0x20ab, 0xff
},
{
// code page: ISO 8859-1
0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b,
0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93,
0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b,
0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0xa1, 0xa2, 0xa3,
0xa4, 0xa5, 0xa6, 0xa7,
0xa8, 0xa9, 0xaa, 0xab,
0xac, 0xad, 0xae, 0xaf,
0xb0, 0xb1, 0xb2, 0xb3,
0xb4, 0xb5, 0xb6, 0xb7,
0xb8, 0xb9, 0xba, 0xbb,
0xbc, 0xbd, 0xbe, 0xbf,
0xc0, 0xc1, 0xc2, 0xc3,
0xc4, 0xc5, 0xc6, 0xc7,
0xc8, 0xc9, 0xca, 0xcb,
0xcc, 0xcd, 0xce, 0xcf,
0xd0, 0xd1, 0xd2, 0xd3,
0xd4, 0xd5, 0xd6, 0xd7,
0xd8, 0xd9, 0xda, 0xdb,
0xdc, 0xdd, 0xde, 0xdf,
0xe0, 0xe1, 0xe2, 0xe3,
0xe4, 0xe5, 0xe6, 0xe7,
0xe8, 0xe9, 0xea, 0xeb,
0xec, 0xed, 0xee, 0xef,
0xf0, 0xf1, 0xf2, 0xf3,
0xf4, 0xf5, 0xf6, 0xf7,
0xf8, 0xf9, 0xfa, 0xfb,
0xfc, 0xfd, 0xfe, 0xff
},
{
// code page: ISO 8859-2
0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b,
0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93,
0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b,
0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0x104, 0x2d8, 0x141,
0xa4, 0x13d, 0x15a, 0xa7,
0xa8, 0x160, 0x15e, 0x164,
0x179, 0xad, 0x17d, 0x17b,
0xb0, 0x105, 0x2db, 0x142,
0xb4, 0x13e, 0x15b, 0x2c7,
0xb8, 0x161, 0x15f, 0x165,
0x17a, 0x2dd, 0x17e, 0x17c,
0x154, 0xc1, 0xc2, 0x102,
0xc4, 0x139, 0x106, 0xc7,
0x10c, 0xc9, 0x118, 0xcb,
0x11a, 0xcd, 0xce, 0x10e,
0x110, 0x143, 0x147, 0xd3,
0xd4, 0x150, 0xd6, 0xd7,
0x158, 0x16e, 0xda, 0x170,
0xdc, 0xdd, 0x162, 0xdf,
0x155, 0xe1, 0xe2, 0x103,
0xe4, 0x13a, 0x107, 0xe7,
0x10d, 0xe9, 0x119, 0xeb,
0x11b, 0xed, 0xee, 0x10f,
0x111, 0x144, 0x148, 0xf3,
0xf4, 0x151, 0xf6, 0xf7,
0x159, 0x16f, 0xfa, 0x171,
0xfc, 0xfd, 0x163, 0x2d9
},
{
// code page: ISO 8859-3
0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b,
0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93,
0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b,
0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0x126, 0x2d8, 0xa3,
0xa4, 0x0, 0x124, 0xa7,
0xa8, 0x130, 0x15e, 0x11e,
0x134, 0xad, 0x0, 0x17b,
0xb0, 0x127, 0xb2, 0xb3,
0xb4, 0xb5, 0x125, 0xb7,
0xb8, 0x131, 0x15f, 0x11f,
0x135, 0xbd, 0x0, 0x17c,
0xc0, 0xc1, 0xc2, 0x0,
0xc4, 0x10a, 0x108, 0xc7,
0xc8, 0xc9, 0xca, 0xcb,
0xcc, 0xcd, 0xce, 0xcf,
0x0, 0xd1, 0xd2, 0xd3,
0xd4, 0x120, 0xd6, 0xd7,
0x11c, 0xd9, 0xda, 0xdb,
0xdc, 0x16c, 0x15c, 0xdf,
0xe0, 0xe1, 0xe2, 0x0,
0xe4, 0x10b, 0x109, 0xe7,
0xe8, 0xe9, 0xea, 0xeb,
0xec, 0xed, 0xee, 0xef,
0x0, 0xf1, 0xf2, 0xf3,
0xf4, 0x121, 0xf6, 0xf7,
0x11d, 0xf9, 0xfa, 0xfb,
0xfc, 0x16d, 0x15d, 0x2d9
},
{
// code page: ISO 8859-4
0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b,
0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93,
0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b,
0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0x104, 0x138, 0x156,
0xa4, 0x128, 0x13b, 0xa7,
0xa8, 0x160, 0x112, 0x122,
0x166, 0xad, 0x17d, 0xaf,
0xb0, 0x105, 0x2db, 0x157,
0xb4, 0x129, 0x13c, 0x2c7,
0xb8, 0x161, 0x113, 0x123,
0x167, 0x14a, 0x17e, 0x14b,
0x100, 0xc1, 0xc2, 0xc3,
0xc4, 0xc5, 0xc6, 0x12e,
0x10c, 0xc9, 0x118, 0xcb,
0x116, 0xcd, 0xce, 0x12a,
0x110, 0x145, 0x14c, 0x136,
0xd4, 0xd5, 0xd6, 0xd7,
0xd8, 0x172, 0xda, 0xdb,
0xdc, 0x168, 0x16a, 0xdf,
0x101, 0xe1, 0xe2, 0xe3,
0xe4, 0xe5, 0xe6, 0x12f,
0x10d, 0xe9, 0x119, 0xeb,
0x117, 0xed, 0xee, 0x12b,
0x111, 0x146, 0x14d, 0x137,
0xf4, 0xf5, 0xf6, 0xf7,
0xf8, 0x173, 0xfa, 0xfb,
0xfc, 0x169, 0x16b, 0x2d9
},
{
// code page: ISO 8859-5
0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b,
0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93,
0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b,
0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0x401, 0x402, 0x403,
0x404, 0x405, 0x406, 0x407,
0x408, 0x409, 0x40a, 0x40b,
0x40c, 0xad, 0x40e, 0x40f,
0x410, 0x411, 0x412, 0x413,
0x414, 0x415, 0x416, 0x417,
0x418, 0x419, 0x41a, 0x41b,
0x41c, 0x41d, 0x41e, 0x41f,
0x420, 0x421, 0x422, 0x423,
0x424, 0x425, 0x426, 0x427,
0x428, 0x429, 0x42a, 0x42b,
0x42c, 0x42d, 0x42e, 0x42f,
0x430, 0x431, 0x432, 0x433,
0x434, 0x435, 0x436, 0x437,
0x438, 0x439, 0x43a, 0x43b,
0x43c, 0x43d, 0x43e, 0x43f,
0x440, 0x441, 0x442, 0x443,
0x444, 0x445, 0x446, 0x447,
0x448, 0x449, 0x44a, 0x44b,
0x44c, 0x44d, 0x44e, 0x44f,
0x2116, 0x451, 0x452, 0x453,
0x454, 0x455, 0x456, 0x457,
0x458, 0x459, 0x45a, 0x45b,
0x45c, 0xa7, 0x45e, 0x45f
},
{
// code page: ISO 8859-6
0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b,
0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93,
0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b,
0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0x0, 0x0, 0x0,
0xa4, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x60c, 0xad, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x61b,
0x0, 0x0, 0x0, 0x61f,
0x0, 0x621, 0x622, 0x623,
0x624, 0x625, 0x626, 0x627,
0x628, 0x629, 0x62a, 0x62b,
0x62c, 0x62d, 0x62e, 0x62f,
0x630, 0x631, 0x632, 0x633,
0x634, 0x635, 0x636, 0x637,
0x638, 0x639, 0x63a, 0x0,
0x0, 0x0, 0x0, 0x0,
0x640, 0x641, 0x642, 0x643,
0x644, 0x645, 0x646, 0x647,
0x648, 0x649, 0x64a, 0x64b,
0x64c, 0x64d, 0x64e, 0x64f,
0x650, 0x651, 0x652, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0
},
{
// code page: ISO 8859-7
0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b,
0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93,
0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b,
0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0x2018, 0x2019, 0xa3,
0x0, 0x0, 0xa6, 0xa7,
0xa8, 0xa9, 0x0, 0xab,
0xac, 0xad, 0x0, 0x2015,
0xb0, 0xb1, 0xb2, 0xb3,
0x384, 0x385, 0x386, 0xb7,
0x388, 0x389, 0x38a, 0xbb,
0x38c, 0xbd, 0x38e, 0x38f,
0x390, 0x391, 0x392, 0x393,
0x394, 0x395, 0x396, 0x397,
0x398, 0x399, 0x39a, 0x39b,
0x39c, 0x39d, 0x39e, 0x39f,
0x3a0, 0x3a1, 0x0, 0x3a3,
0x3a4, 0x3a5, 0x3a6, 0x3a7,
0x3a8, 0x3a9, 0x3aa, 0x3ab,
0x3ac, 0x3ad, 0x3ae, 0x3af,
0x3b0, 0x3b1, 0x3b2, 0x3b3,
0x3b4, 0x3b5, 0x3b6, 0x3b7,
0x3b8, 0x3b9, 0x3ba, 0x3bb,
0x3bc, 0x3bd, 0x3be, 0x3bf,
0x3c0, 0x3c1, 0x3c2, 0x3c3,
0x3c4, 0x3c5, 0x3c6, 0x3c7,
0x3c8, 0x3c9, 0x3ca, 0x3cb,
0x3cc, 0x3cd, 0x3ce, 0x0
},
{
// code page: ISO 8859-8
0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b,
0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93,
0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b,
0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0x0, 0xa2, 0xa3,
0xa4, 0xa5, 0xa6, 0xa7,
0xa8, 0xa9, 0xd7, 0xab,
0xac, 0xad, 0xae, 0x203e,
0xb0, 0xb1, 0xb2, 0xb3,
0xb4, 0xb5, 0xb6, 0xb7,
0xb8, 0xb9, 0xf7, 0xbb,
0xbc, 0xbd, 0xbe, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x2017,
0x5d0, 0x5d1, 0x5d2, 0x5d3,
0x5d4, 0x5d5, 0x5d6, 0x5d7,
0x5d8, 0x5d9, 0x5da, 0x5db,
0x5dc, 0x5dd, 0x5de, 0x5df,
0x5e0, 0x5e1, 0x5e2, 0x5e3,
0x5e4, 0x5e5, 0x5e6, 0x5e7,
0x5e8, 0x5e9, 0x5ea, 0x0,
0x0, 0x0, 0x0, 0x0
},
{
// code page: ISO 8859-9
0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b,
0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93,
0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b,
0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0xa1, 0xa2, 0xa3,
0xa4, 0xa5, 0xa6, 0xa7,
0xa8, 0xa9, 0xaa, 0xab,
0xac, 0xad, 0xae, 0xaf,
0xb0, 0xb1, 0xb2, 0xb3,
0xb4, 0xb5, 0xb6, 0xb7,
0xb8, 0xb9, 0xba, 0xbb,
0xbc, 0xbd, 0xbe, 0xbf,
0xc0, 0xc1, 0xc2, 0xc3,
0xc4, 0xc5, 0xc6, 0xc7,
0xc8, 0xc9, 0xca, 0xcb,
0xcc, 0xcd, 0xce, 0xcf,
0x11e, 0xd1, 0xd2, 0xd3,
0xd4, 0xd5, 0xd6, 0xd7,
0xd8, 0xd9, 0xda, 0xdb,
0xdc, 0x130, 0x15e, 0xdf,
0xe0, 0xe1, 0xe2, 0xe3,
0xe4, 0xe5, 0xe6, 0xe7,
0xe8, 0xe9, 0xea, 0xeb,
0xec, 0xed, 0xee, 0xef,
0x11f, 0xf1, 0xf2, 0xf3,
0xf4, 0xf5, 0xf6, 0xf7,
0xf8, 0xf9, 0xfa, 0xfb,
0xfc, 0x131, 0x15f, 0xff
}
};

View File

@@ -24,12 +24,12 @@ enum Resource_FormatType
{
Resource_FormatType_SJIS, //!< SJIS (Shift Japanese Industrial Standards) encoding
Resource_FormatType_EUC, //!< EUC (Extended Unix Code) multi-byte encoding primarily for Japanese, Korean, and simplified Chinese
Resource_FormatType_ANSI, //!< ANSI encoding (pass through without conversion)
Resource_FormatType_NoConversion, //!< format type indicating non-conversion behavior
Resource_FormatType_GB, //!< GB (Guobiao) encoding for Simplified Chinese
Resource_FormatType_UTF8, //!< multi-byte UTF-8 encoding
Resource_FormatType_SystemLocale, //!< active system-defined locale; this value is strongly NOT recommended to use
// non ASCII format types
// Windows-native ("ANSI") 8-bit code pages
Resource_FormatType_CP1250, //!< cp1250 (Central European) encoding
Resource_FormatType_CP1251, //!< cp1251 (Cyrillic) encoding
Resource_FormatType_CP1252, //!< cp1252 (Western European) encoding
@@ -40,7 +40,19 @@ enum Resource_FormatType
Resource_FormatType_CP1257, //!< cp1257 (Baltic) encoding
Resource_FormatType_CP1258, //!< cp1258 (Vietnamese) encoding
// ISO8859 8-bit code pages
Resource_FormatType_iso8859_1, //!< ISO 8859-1 (Western European) encoding
Resource_FormatType_iso8859_2, //!< ISO 8859-2 (Central European) encoding
Resource_FormatType_iso8859_3, //!< ISO 8859-3 (Turkish) encoding
Resource_FormatType_iso8859_4, //!< ISO 8859-4 (Northern European) encoding
Resource_FormatType_iso8859_5, //!< ISO 8859-5 (Cyrillic) encoding
Resource_FormatType_iso8859_6, //!< ISO 8859-6 (Arabic) encoding
Resource_FormatType_iso8859_7, //!< ISO 8859-7 (Greek) encoding
Resource_FormatType_iso8859_8, //!< ISO 8859-8 (Hebrew) encoding
Resource_FormatType_iso8859_9, //!< ISO 8859-9 (Turkish) encoding
// old aliases
Resource_FormatType_ANSI = Resource_FormatType_NoConversion,
Resource_SJIS = Resource_FormatType_SJIS,
Resource_EUC = Resource_FormatType_EUC,
Resource_ANSI = Resource_FormatType_ANSI,

View File

@@ -22,7 +22,7 @@
#include <TCollection_ExtendedString.hxx>
#include <NCollection_UtfString.hxx>
#include <Standard_NotImplemented.hxx>
#include "Resource_ANSI.pxx"
#include "Resource_CodePages.pxx"
#include "Resource_GBK.pxx"
#include "Resource_Big5.pxx"
@@ -625,20 +625,30 @@ void Resource_Unicode::ConvertFormatToUnicode (const Resource_FormatType theForm
case Resource_FormatType_CP1256:
case Resource_FormatType_CP1257:
case Resource_FormatType_CP1258:
case Resource_FormatType_iso8859_1:
case Resource_FormatType_iso8859_2:
case Resource_FormatType_iso8859_3:
case Resource_FormatType_iso8859_4:
case Resource_FormatType_iso8859_5:
case Resource_FormatType_iso8859_6:
case Resource_FormatType_iso8859_7:
case Resource_FormatType_iso8859_8:
case Resource_FormatType_iso8859_9:
{
const int aCodePageIndex = (int)theFormat - (int)Resource_FormatType_CP1250;
const Standard_ExtString aCodePage = THE_CODEPAGES_ANSI[aCodePageIndex];
theToStr.Clear();
for (const char* anInputPntr = theFromStr; *anInputPntr != '\0'; ++anInputPntr)
{
Standard_ExtCharacter aRes = (*anInputPntr & 0x80) != 0
? aCodePage[(0x7f & *anInputPntr)]
: *anInputPntr;
if (aRes == (Standard_ExtCharacter)0x0)
unsigned char anInputChar = (unsigned char)(*anInputPntr);
Standard_ExtCharacter aRes = (anInputChar & 0x80) != 0
? aCodePage[(0x7f & anInputChar)]
: anInputChar;
if (aRes == 0)
{
aRes = '?';
}
theToStr.Insert(theToStr.Length() + 1, aRes);
theToStr.AssignCat(aRes);
}
break;
}
@@ -689,8 +699,52 @@ Standard_Boolean Resource_Unicode::ConvertUnicodeToFormat(const Resource_FormatT
case Resource_FormatType_CP1256:
case Resource_FormatType_CP1257:
case Resource_FormatType_CP1258:
case Resource_FormatType_iso8859_1:
case Resource_FormatType_iso8859_2:
case Resource_FormatType_iso8859_3:
case Resource_FormatType_iso8859_4:
case Resource_FormatType_iso8859_5:
case Resource_FormatType_iso8859_6:
case Resource_FormatType_iso8859_7:
case Resource_FormatType_iso8859_8:
case Resource_FormatType_iso8859_9:
{
throw Standard_NotImplemented("Resource_Unicode::ConvertUnicodeToFormat - conversion from CP1250 - CP1258 to Unicode is not implemented");
if (theMaxSize < theFromStr.Length())
{
return Standard_False;
}
const int aCodePageIndex = (int)theFormat - (int)Resource_FormatType_CP1250;
const Standard_ExtString aCodePage = THE_CODEPAGES_ANSI[aCodePageIndex];
for (Standard_Integer aToCharInd = 0; aToCharInd < theMaxSize - 1; ++aToCharInd)
{
Standard_Boolean isFind = Standard_False;
Standard_ExtCharacter aFromChar = theFromStr.Value(aToCharInd + 1);
if (aFromChar == 0)
{
// zero value should be handled explicitly to avoid false conversion by
// selected code page that may have unused values (encoded as zero)
theToStr[aToCharInd] = '\0';
}
else
{
// find the character in the code page
for (unsigned char anIndCP = 0; aFromChar != 0 && anIndCP < 128; ++anIndCP)
{
if (aCodePage[anIndCP] == aFromChar)
{
theToStr[aToCharInd] = anIndCP | 0x80;
isFind = Standard_True;
}
}
// if character is not found, put '?'
if (!isFind)
{
theToStr[aToCharInd] = '?';
}
}
}
theToStr[theMaxSize - 1] = '\0';
return Standard_True;
}
case Resource_FormatType_UTF8:
{