From be314d2dc2846bdc08243892e66fa74aa47dcd78 Mon Sep 17 00:00:00 2001 From: Eduardo Casino Date: Thu, 5 May 2022 21:55:47 +0200 Subject: Add unicode and NLS support --- int21dos.h | 18 ++++++ sfmain.c | 131 ++++++++++++++++++++++++++++++++++++- sftsr.c | 108 ++++++++++++++++++++++++------- sftsr.h | 5 ++ unicode.h | 215 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 449 insertions(+), 28 deletions(-) create mode 100644 unicode.h diff --git a/int21dos.h b/int21dos.h index af22fea..790bd88 100644 --- a/int21dos.h +++ b/int21dos.h @@ -293,6 +293,24 @@ typedef _Packed struct dos_list_of_lists { uint8_t last_drive; } DOSLOL; +typedef _Packed struct dos_nls_table { + uint8_t table_id; + void __far *table_data; +} NLSTABLE; + +typedef _Packed struct file_char_table { + uint16_t size; // table size (not counting this word) + uint8_t unk1; // ??? (01h for MS-DOS 3.30-6.00) + uint8_t lowest; // lowest permissible character value for filename + uint8_t highest; // highest permissible character value for filename + uint8_t unk2; // ??? (00h for MS-DOS 3.30-6.00) + uint8_t first_x; // first excluded character in range \ all characters in this + uint8_t last_x; // last excluded character in range / range are illegal + uint8_t unk3; // ??? (02h for MS-DOS 3.30-6.00) + uint8_t n_illegal; // number of illegal (terminator) characters + uint8_t illegal[1]; // characters which terminate a filename: ."/\[]:|<>+=;, +} FCHAR; + static inline int drive_letter_to_index(char letter) { if (letter >= 'A' && letter <= 'Z') return letter - 'A'; diff --git a/sfmain.c b/sfmain.c index c0f292f..04b4d1b 100644 --- a/sfmain.c +++ b/sfmain.c @@ -17,17 +17,20 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ +#define __STDC_WANT_LIB_EXT1__ 1 #include #include #include #include #include +#include #include "version.h" #include "dlog.h" #include "vboxshfl.h" #include "dostsr.h" #include "sftsr.h" +#include "unicode.h" static char get_drive_letter(const char *path) { if (!path || path[0] == '\0') return '\0'; @@ -77,6 +80,7 @@ static int list_folders(LPTSRDATA data) continue; } + (void)utf8_to_local(data, str.buf, str.buf, NULL); printf(" %s on %c:\n", str.buf, drive_index_to_letter(i)); } @@ -95,6 +99,7 @@ static int list_folders(LPTSRDATA data) continue; } + (void)utf8_to_local(data, str.buf, str.buf, NULL); printf(" %s\n", str.buf); } @@ -172,7 +177,7 @@ static int unmount_shfl(LPTSRDATA data, int drive) return 0; } -static int mount(LPTSRDATA data, const char *folder, char drive_letter) +static int mount(LPTSRDATA data, char *folder, char drive_letter) { int drive = drive_letter_to_index(drive_letter); DOSLOL __far *lol = dos_get_list_of_lists(); @@ -209,6 +214,7 @@ static int mount(LPTSRDATA data, const char *folder, char drive_letter) // By setting the physical flag, we also let DOS know the drive is present cds->flags = DOS_CDS_FLAG_NETWORK | DOS_CDS_FLAG_PHYSICAL; + (void)utf8_to_local(data, folder, folder, NULL); printf("Shared folder '%s' mounted as drive %c:\n", folder, drive_letter); return EXIT_SUCCESS; @@ -335,6 +341,117 @@ static int rescan(LPTSRDATA data) return 0; } +static int get_nls(uint8_t __far * __far *file_upper_case, FCHAR __far * __far *file_char) +{ + union REGS r; + struct SREGS s; + static NLSTABLE nls_table; + + segread(&s); + + // Get FUCASE (File Uppercase Table) + // + r.w.ax = 0x6504; + r.x.bx = r.x.dx = 0xffff; + r.x.cx = 5; + + s.es = s.ds; + r.x.di = (uint16_t) &nls_table; + + intdosx(&r, &r, &s); + + if (r.x.cx != 5) + return 1; + + *file_upper_case = (uint8_t __far *)nls_table.table_data + 2; // Skip size word + + // Get FCHAR (File Terminator Table) + // + r.w.ax = 0x6505; + + intdosx(&r, &r, &s); + + if ( r.x.cx != 5 ) + return 
1; + + *file_char = (FCHAR __far *)nls_table.table_data; + + return 0; +} + +static void load_unicode_table(uint16_t far *unicode_table) +{ + union REGS r; + char filename[13]; + char fullpath[_MAX_PATH]; + char buffer[256]; + struct stat filestat; + FILE *f; + int i, ret; + + // get current Code Page + // + // AX = 6601h + // Return: CF set on error + // AX = error code (see #01680 at AH=59h/BX=0000h) + // CF clear if successful + // BX = active code page (see #01757) <--- + // DX = system code page (see #01757) + // + r.w.ax = 0x6601; + + intdos(&r, &r); + + if (r.x.cflag) { + // Can't get codepage. Use ASCII only + // + fputs("Warning: Active code page not found", stderr); + goto error; + } + + sprintf(filename, r.x.bx > 999 ? "c%duni.tbl" : "cp%duni.tbl", r.x.bx); + + _searchenv(filename, "PATH", fullpath); + if ( '\0' == fullpath[0] ) { + fprintf(stderr, "Warning: Can't find Unicode table: %s", filename); + goto error; + } + + f = fopen(fullpath, "rb"); + + if ( NULL == f ) { + fprintf(stderr, "Warning: Can't load Unicode table: %s", filename); + goto error; + } + + if ( EOF == fscanf_s(f, "Unicode (%s)", buffer, sizeof(buffer)) ) { + fprintf(stderr, "Warning: Invalid file format: %s", filename); + goto close; + } + + ret = fread(buffer, 1, 3, f); + + if ( ret != 3 || buffer[0] != '\r' || buffer[1] != '\n' || buffer[2] != 1 ) { + fprintf(stderr, "Warning: Invalid file format: %s", filename); + goto close; + } + + if ( 256 != (ret = fread( buffer, 1, 256, f )) ) { + fprintf(stderr, "Warning: Can't load Unicode table: %s", filename); + goto close; + } + + _fmemcpy(unicode_table, (char far *)buffer, 256); + + return; + +close: + fclose(f); +error: + fputs( ". 
Defaulting to cp437\n", stderr ); + +} + static int configure_driver(LPTSRDATA data) { unsigned i; @@ -395,6 +512,14 @@ static int configure_driver(LPTSRDATA data) return -1; } + err = get_nls(&data->file_upper_case, &data->file_char); + if (err) { + puts("Cannot get the NLS tables."); + return -1; + } + + load_unicode_table( &data->unicode_table); + printf("Connected to VirtualBox shared folder service\n"); return 0; @@ -594,13 +719,13 @@ int main(int argc, const char *argv[]) if (!data) return driver_not_found(); return list_folders(data); } else if (stricmp(argv[argi], "mount") == 0) { - const char *folder; + char *folder; char drive; if (!data) return driver_not_found(); argi++; if (argi >= argc) return arg_required("mount"); - folder = argv[argi]; + folder = (char *) argv[argi]; argi++; if (argi >= argc) return arg_required("mount"); drive = get_drive_letter(argv[argi]); diff --git a/sftsr.c b/sftsr.c index b48a42d..5819ce5 100644 --- a/sftsr.c +++ b/sftsr.c @@ -25,8 +25,36 @@ #include "unixtime.h" #include "vboxshfl.h" #include "sftsr.h" - -TSRDATA data; +#define __IN_SFTSR__ 1 +#include "unicode.h" + +TSRDATA data = { + // TSR installation data + NULL, /** Previous int2f ISR, storing it for uninstall. */ + NULL, /** Stored pointer for the DOS SDA. */ + + // TSR configuration + 0, /** Offset (in seconds/2) of the current timezone. */ + NULL, NULL, /** NLS support tables. */ + { /** Codepage to unicode lookup table. 
+ * Initialised to cp437 */ + 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7, + 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5, + 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, + 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192, + 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA, + 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB, + 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, + 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510, + 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, + 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567, + 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, + 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580, + 0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x03BC, 0x03C4, + 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229, + 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248, + 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0 } +}; /** Private buffer for VirtualBox filenames. */ static SHFLSTRING_WITH_BUF(shflstr, SHFL_MAX_LEN); @@ -213,22 +241,59 @@ static const char * get_basename(const char *path) } } -static void translate_filename_to_host(SHFLSTRING *str) +static bool illegal_char( unsigned char c ) +{ + int i= 0; + + for ( i= 0; i < data.file_char->n_illegal; ++i ) + { + if ( c == data.file_char->illegal[i] ) + { + return true; + } + } + if ( ( c < data.file_char->lowest || c > data.file_char->highest ) || + !( c < data.file_char->first_x || c > data.file_char->last_x ) ) + { + return true; + } + + return false; +} + +static unsigned char nls_toupper( unsigned char c ) { - // TODO This should map UTF-8 to local CP :( - (void) str; + if ( c > 0x60 && c < 0x7b ) + { + return c & 0xDF; + } + + return ( c < 0x80 ? 
c : data.file_upper_case[c - 0x80] ); } -static void translate_filename_from_host(SHFLSTRING *str) +static inline bool translate_filename_from_host(SHFLSTRING *str) { - // TODO This should map UTF-8 to local CP :( - // At least do a poor man's uppercase... unsigned i; + bool ret; + unsigned dots = 0; + + ret = utf8_to_local(&data, str->ach, str->ach, &str->u16Length); + for (i = 0; i < str->u16Length; i++) { - if (str->ach[i] >= 'a' && str->ach[i] <= 'z') { - str->ach[i] = 'A' + (str->ach[i] - 'a'); + if (str->ach[i] == '.') { + ++dots; + } + else { + if (illegal_char(str->ach[i])) { + ret = false; + } + else { + str->ach[i] = nls_toupper(str->ach[i]); + } } } + + return ret && (dots <= 1); } /** Tries to do some very simple heuristics to convert DOS-style wildcards @@ -270,16 +335,16 @@ static void fix_wildcards(SHFLSTRING *str) static void copy_drive_relative_filename(SHFLSTRING *str, const char __far *path) { // Assume X:.... path for now, i.e. drive_relative path starts at char 2 - shflstring_strcpy(str, path + 2); + str->u16Length = local_to_utf8( &data, str->ach, path + 2, str->u16Size ); } static void copy_drive_relative_dirname(SHFLSTRING *str, const char __far *path) { int last_sep = my_strrchr(path + 2, '\\'); if (last_sep >= 0) { - shflstring_strncpy(str, path + 2, last_sep == 0 ? 1 : last_sep); + str->u16Length = local_to_utf8_n( &data, str->ach, path + 2, str->u16Size, last_sep == 0 ? 
1 : last_sep ); } else { - shflstring_strcpy(str, path + 2); + str->u16Length = local_to_utf8( &data, str->ach, path + 2, str->u16Size ); } } @@ -473,7 +538,6 @@ static void handle_create_open_ex(union INTPACK __far *r) } copy_drive_relative_filename(&shflstr.shflstr, path); - translate_filename_to_host(&shflstr.shflstr); memset(&parms.create, 0, sizeof(SHFLCREATEPARMS)); if (action & OPENEX_REPLACE_IF_EXISTS) { @@ -801,7 +865,6 @@ static void handle_delete(union INTPACK __far *r) dprintf("handle_delete %Fs\n", path); copy_drive_relative_filename(&shflstr.shflstr, path); - translate_filename_to_host(&shflstr.shflstr); err = vbox_shfl_remove(&data.vb, data.hgcm_client_id, root, &shflstr.shflstr, SHFL_REMOVE_FILE); @@ -830,11 +893,9 @@ static void handle_rename(union INTPACK __far *r) } copy_drive_relative_filename(&shflstr.shflstr, src); - translate_filename_to_host(&shflstr.shflstr); // Reusing shfldirinfo buffer space here for our second filename copy_drive_relative_filename(&shfldirinfo.dirinfo.name, dst); - translate_filename_to_host(&shfldirinfo.dirinfo.name); err = vbox_shfl_rename(&data.vb, data.hgcm_client_id, root, &shflstr.shflstr, &shfldirinfo.dirinfo.name, @@ -857,7 +918,6 @@ static void handle_getattr(union INTPACK __far *r) dprintf("handle_getattr %Fs\n", path); copy_drive_relative_filename(&shflstr.shflstr, path); - translate_filename_to_host(&shflstr.shflstr); memset(&parms.create, 0, sizeof(SHFLCREATEPARMS)); parms.create.CreateFlags = SHFL_CF_LOOKUP; @@ -900,7 +960,6 @@ static vboxerr open_search_dir(unsigned openfile, SHFLROOT root, const char __fa dprintf("open_search_dir openfile=%u path=%Fs\n", openfile, path); copy_drive_relative_dirname(&shflstr.shflstr, path); - translate_filename_to_host(&shflstr.shflstr); memset(&parms.create, 0, sizeof(SHFLCREATEPARMS)); parms.create.CreateFlags = SHFL_CF_DIRECTORY @@ -945,7 +1004,7 @@ static vboxerr find_volume_label(SHFLROOT root) err = vbox_shfl_query_map_name(&data.vb, data.hgcm_client_id, root, 
&shflstr.shflstr); if (err) return err; - translate_filename_from_host(&shflstr.shflstr); + (void) translate_filename_from_host(&shflstr.shflstr); dprintf("label: %s\n", shflstr.buf); @@ -975,7 +1034,6 @@ static vboxerr find_next_from_vbox(unsigned openfile, const char __far *path) // this is what VirtualBox will use in future calls. if (path) { copy_drive_relative_filename(&shflstr.shflstr, path); - translate_filename_to_host(&shflstr.shflstr); fix_wildcards(&shflstr.shflstr); dprintf("fixed path=%s\n", shflstr.buf); @@ -1034,7 +1092,10 @@ static vboxerr find_next_from_vbox(unsigned openfile, const char __far *path) // TODO Use the short filename if available from a windows host // i.e. shfldirinfo.dirinfo.cucShortName - translate_filename_from_host(&shfldirinfo.dirinfo.name); + if (!translate_filename_from_host(&shfldirinfo.dirinfo.name)) { + dputs("hiding file with illegal character(s)"); + continue; + } if (!copy_to_8_3_filename(found_file->filename, &shfldirinfo.dirinfo.name)) { dputs("hiding file with long filename"); @@ -1197,7 +1258,6 @@ static void handle_chdir(union INTPACK __far *r) // Just have to check if the directory exists copy_drive_relative_filename(&shflstr.shflstr, path); - translate_filename_to_host(&shflstr.shflstr); memset(&parms.create, 0, sizeof(SHFLCREATEPARMS)); parms.create.CreateFlags = SHFL_CF_LOOKUP; @@ -1237,7 +1297,6 @@ static void handle_mkdir(union INTPACK __far *r) dprintf("handle_mkdir %Fs\n", path); copy_drive_relative_filename(&shflstr.shflstr, path); - translate_filename_to_host(&shflstr.shflstr); memset(&parms.create, 0, sizeof(SHFLCREATEPARMS)); parms.create.CreateFlags = SHFL_CF_DIRECTORY @@ -1278,7 +1337,6 @@ static void handle_rmdir(union INTPACK __far *r) dprintf("handle_rmdir %Fs\n", path); copy_drive_relative_filename(&shflstr.shflstr, path); - translate_filename_to_host(&shflstr.shflstr); err = vbox_shfl_remove(&data.vb, data.hgcm_client_id, root, &shflstr.shflstr, SHFL_REMOVE_DIR); diff --git a/sftsr.h b/sftsr.h 
index 9b254f7..f318bd4 100644 --- a/sftsr.h +++ b/sftsr.h @@ -67,6 +67,11 @@ typedef struct { /** Offset (in seconds/2) of the current timezone. * As per tradition, a negative offset means east of GMT; while positive means west. */ int32_t tz_offset; + /** NLS support tables. */ + uint8_t __far *file_upper_case; + FCHAR __far *file_char; + /** Codepage to unicode lookup table. */ + uint16_t unicode_table[128]; // Current status /** Array of all possible DOS drives. */ diff --git a/unicode.h b/unicode.h new file mode 100644 index 0000000..9e70895 --- /dev/null +++ b/unicode.h @@ -0,0 +1,215 @@ +/* + * VBSF - unix to DOS time conversion + * Copyright (C) 2022 Javier S. Pedro + * + * unicode.h: Unicode conversion routines + * Copyright (C) 2011-2022 Eduardo Casino + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General License for more details. + * + * You should have received a copy of the GNU General License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef UNICODE_H +#define UNICODE_H + +#include +#include "sftsr.h" + + +#ifdef __IN_SFTSR__ +#define TSRDATAPTR PTSRDATA +#else +#define TSRDATAPTR LPTSRDATA +#endif + +static inline uint8_t lookup_codepage( TSRDATAPTR data, uint16_t cp ) +{ + uint8_t i; + + for ( i = 0; i < 128 && data->unicode_table[i] != cp; ++i ); + + return ( i < 128 ? (uint8_t) i + 128 : '\0' ); +} + +// dst and src CAN'T BE THE SAME !!!! 
+// Returns resulting length or -1 if buffer overflow
+//
+// Converts at most 'count' characters of the code page string 'src' to UTF-8.
+//
+//   data    TSR data; only data->unicode_table is read (Unicode code points
+//           for the characters 0x80..0xFF of the active code page).
+//   dst     Output buffer. NUL terminated on return unless buflen is 0.
+//   src     NUL terminated source string in the active code page.
+//   buflen  Size of dst in bytes, INCLUDING the terminating NUL.
+//   count   Maximum number of source characters to convert.
+//
+// The return value is the number of bytes written, terminator not counted.
+// On overflow it is (uint16_t)-1, i.e. 0xFFFF: callers must check for it
+// before using the result as a string length.
+//
+static uint16_t local_to_utf8_n( TSRDATAPTR data, uint8_t *dst, const char far *src, uint16_t buflen, uint16_t count )
+{
+    uint16_t len = 0;   // Resulting length
+    uint16_t cp;        // Unicode Code Point
+    uint8_t c;          // Current source character, as an unsigned byte
+    uint8_t need;       // Bytes needed to encode cp
+
+    if ( buflen == 0 )
+    {
+        return (uint16_t) -1;   // Not even room for the terminator
+    }
+
+    for ( ; count && *src; ++src, --count )
+    {
+        // Go through an unsigned byte so that characters >= 0x80 never
+        // produce a negative table index where plain char is signed.
+        //
+        c = (uint8_t) *src;
+        cp = ( c < 0x80 ) ? c : data->unicode_table[c - 0x80];
+
+        // The encoded size depends on the code point, not on the source byte:
+        //   U+0000 to U+007F   0xxxxxxx
+        //   U+0080 to U+07FF   110yyyyy 10xxxxxx
+        //   U+0800 to U+FFFF   1110zzzz 10yyyyyy 10xxxxxx
+        //
+        if ( cp < 0x0080 )
+        {
+            need = 1;
+        }
+        else if ( cp < 0x0800 )
+        {
+            need = 2;
+        }
+        else
+        {
+            need = 3;
+        }
+
+        // One byte is always kept free for the terminator, so len never
+        // exceeds buflen - 1 and the subtraction cannot wrap.
+        //
+        if ( (uint16_t)( buflen - 1 - len ) < need )
+        {
+            *dst = '\0';
+            return (uint16_t) -1;
+        }
+
+        if ( need == 1 )
+        {
+            *dst++ = (uint8_t) cp;
+        }
+        else if ( need == 2 )
+        {
+            *dst++ = (uint8_t)( 0xC0 | ( cp >> 6 ) );
+            *dst++ = (uint8_t)( 0x80 | ( cp & 0x3F ) );
+        }
+        else
+        {
+            *dst++ = (uint8_t)( 0xE0 | ( cp >> 12 ) );
+            *dst++ = (uint8_t)( 0x80 | ( ( cp >> 6 ) & 0x3F ) );
+            *dst++ = (uint8_t)( 0x80 | ( cp & 0x3F ) );
+        }
+
+        len += need;
+    }
+
+    // Terminate string
+    //
+    *dst = '\0';
+
+    return len;
+}
+
+// Same as local_to_utf8_n() without a separate character limit: the source
+// is converted up to its terminating NUL or until buflen characters have
+// been read, whichever comes first.
+//
+static inline uint16_t local_to_utf8( TSRDATAPTR data, uint8_t *dst, const char far *src, uint16_t buflen )
+{
+    return local_to_utf8_n( data, dst, src, buflen, buflen );
+}
+
+// Converts the NUL terminated UTF-8 string 'src' to the active code page.
+//
+// Every UTF-8 sequence becomes exactly one output character, so the result
+// is never longer than the input and dst MAY be the same buffer as src
+// (callers convert file names in place).
+//
+//   data  TSR data; used by lookup_codepage() to map code points back to
+//         code page characters.
+//   dst   Output buffer, at least as large as the source string.
+//   src   NUL terminated UTF-8 string.
+//   len   If not NULL, receives the resulting length (terminator not counted).
+//
+// Returns true on success, false if any unsupported char is found. Code
+// points missing from the code page, 4-byte sequences and malformed or
+// truncated sequences are all replaced by '_'.
+//
+static bool utf8_to_local( TSRDATAPTR data, char *dst, char *src, uint16_t *len )
+{
+    bool ret = true;    // Return code
+    uint16_t cp;        // Unicode Code point
+    uint16_t l = 0;     // Resulting length
+    uint8_t lead;       // First byte of the current sequence
+    uint8_t need;       // Sequence length announced by the lead byte
+    uint8_t got;        // Bytes of the sequence that are really there
+    char out;           // Converted character
+
+    while ( *src )
+    {
+        lead = (uint8_t) *src;
+
+        if ( lead < 0x80 )
+        {
+            // UTF-8 bytes: 0xxxxxxx
+            // CP range:    U+0000 to U+007F (Direct ASCII translation)
+            //
+            need = 1;
+            cp = lead;
+        }
+        else if ( ( lead & 0xE0 ) == 0xC0 )
+        {
+            // UTF-8 bytes: 110yyyyy 10xxxxxx
+            // CP range:    U+0080 to U+07FF
+            //
+            need = 2;
+            cp = lead & 0x1F;
+        }
+        else if ( ( lead & 0xF0 ) == 0xE0 )
+        {
+            // UTF-8 bytes: 1110zzzz 10yyyyyy 10xxxxxx
+            // CP range:    U+0800 to U+FFFF
+            //
+            need = 3;
+            cp = lead & 0x0F;
+        }
+        else if ( ( lead & 0xF8 ) == 0xF0 )
+        {
+            // UTF-8 bytes: 11110www 10zzzzzz 10yyyyyy 10xxxxxx
+            // CP range:    U+010000 to U+10FFFF (currently unsupported,
+            //              the sequence is skipped as a whole)
+            //
+            need = 4;
+            cp = 0;
+        }
+        else
+        {
+            // Stray continuation byte or invalid lead byte
+            //
+            need = 0;
+            cp = 0;
+        }
+
+        // Collect the continuation bytes (10xxxxxx). The scan stops at the
+        // first byte that is not one, which includes the terminating NUL,
+        // so a truncated sequence can never carry src past the end of the
+        // string. cp is not used for 4-byte sequences.
+        //
+        for ( got = 1; got < need && ( (uint8_t) src[got] & 0xC0 ) == 0x80; ++got )
+        {
+            cp = ( cp << 6 ) | ( (uint8_t) src[got] & 0x3F );
+        }
+
+        if ( need == 1 )
+        {
+            out = (char) lead;
+        }
+        else if ( ( need == 2 || need == 3 ) && got == need )
+        {
+            out = (char) lookup_codepage( data, cp );
+            if ( out == '\0' )
+            {
+                out = '_';
+                ret = false;
+            }
+        }
+        else
+        {
+            out = '_';
+            ret = false;
+        }
+
+        // All reads of this sequence are done before the write, which keeps
+        // in-place conversion (dst == src) safe.
+        //
+        src += got;
+        *dst++ = out;
+        ++l;
+    }
+
+    // Terminate string
+    //
+    *dst = '\0';
+
+    if (len) *len = l;
+
+    return ret;
+}
+
+#endif
-- 
cgit v1.2.3