coreutils/coreutils-9.5-i18n-1.patch

5091 lines
159 KiB
Diff
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

Submitted by: Xi Ruoyao <xry111@xry111.site>
Date: 2024-03-29
Initial Package Version: 9.5
Upstream Status: Rejected
Origin: https://src.fedoraproject.org/rpms/coreutils/raw/43d181cd/f/coreutils-i18n.patch
Rebased from 9.3 to 9.4.
Then rebased and adjusted for 9.5: removed the
hunks for join and uniq because they now
support i18n out of the box, and added mbchar.h
which is removed in 9.5 but still needed by the
i18n patch.
Description: Fixes i18n issues with various Coreutils programs
From 59f066310969894f446c8df5adcf7a390adf0d94 Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Wed, 30 Aug 2023 12:34:31 +0800
Subject: [PATCH] i18n patch
---
bootstrap.conf | 1 +
configure.ac | 4 +
src/cut.c | 510 +++++++++++++++++++++--
src/expand-common.c | 114 ++++++
src/expand-common.h | 12 +
src/expand.c | 90 +++-
src/fold.c | 312 ++++++++++++--
src/local.mk | 4 +-
src/pr.c | 443 ++++++++++++++++++--
src/sort.c | 794 +++++++++++++++++++++++++++++++++---
src/unexpand.c | 102 ++++-
tests/Coreutils.pm | 3 +
tests/expand/mb.sh | 183 +++++++++
tests/i18n/sort.sh | 29 ++
tests/local.mk | 4 +
tests/misc/expand.pl | 42 ++
tests/misc/fold.pl | 50 ++-
tests/misc/unexpand.pl | 39 ++
tests/pr/pr-tests.pl | 49 +++
tests/sort/sort-mb-tests.sh | 45 ++
tests/sort/sort-merge.pl | 42 ++
tests/sort/sort.pl | 40 +-
tests/unexpand/mb.sh | 172 ++++++++
23 files changed, 2887 insertions(+), 197 deletions(-)
create mode 100755 tests/expand/mb.sh
create mode 100755 tests/i18n/sort.sh
create mode 100755 tests/sort/sort-mb-tests.sh
create mode 100755 tests/unexpand/mb.sh
diff --git a/bootstrap.conf b/bootstrap.conf
index 126e1e80a..8849d1ce5 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -163,6 +163,7 @@ gnulib_modules="
maintainer-makefile
malloc-gnu
manywarnings
+ mbfile
mbrlen
mbrtoc32
mbrtowc
diff --git a/configure.ac b/configure.ac
index 9cb6ee149..e494b8c60 100644
--- a/configure.ac
+++ b/configure.ac
@@ -504,6 +504,10 @@ fi
# I'm leaving it here for now. This whole thing needs to be modernized...
gl_WINSIZE_IN_PTEM
+gl_MBFILE
+AC_DEFINE([GNULIB_MBFILE], [1],
+ [Define to 1 if the gnulib module 'mbfile' is in use.])
+
gl_HEADER_TIOCGWINSZ_IN_TERMIOS_H
if test $gl_cv_sys_tiocgwinsz_needs_termios_h = no && \
diff --git a/src/cut.c b/src/cut.c
index 061e09c33..5a686db52 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -27,6 +27,11 @@
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
+
+/* Get mbstate_t, mbrtowc(). */
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
#include "system.h"
#include "assure.h"
@@ -35,6 +40,18 @@
#include "set-fields.h"
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
+ installation; work around this configuration error. */
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
+# undef MB_LEN_MAX
+# define MB_LEN_MAX 16
+#endif
+
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
+
/* The official name of this program (e.g., no 'g' prefix). */
#define PROGRAM_NAME "cut"
@@ -51,6 +68,52 @@
} \
while (0)
+/* Refill the buffer BUF to get a multibyte character. */
+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
+ do \
+ { \
+ if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
+ { \
+ memmove (BUF, BUFPOS, BUFLEN); \
+ BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
+ BUFPOS = BUF; \
+ } \
+ } \
+ while (0)
+
+/* Get wide character on BUFPOS. BUFPOS is not included after that.
+ If byte sequence is not valid as a character, CONVFAIL is true. Otherwise false. */
+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
+ do \
+ { \
+ mbstate_t state_bak; \
+ \
+ if (BUFLEN < 1) \
+ { \
+ WC = WEOF; \
+ break; \
+ } \
+ \
+ /* Get a wide character. */ \
+ CONVFAIL = false; \
+ state_bak = STATE; \
+ MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
+ \
+ switch (MBLENGTH) \
+ { \
+ case (size_t)-1: \
+ case (size_t)-2: \
+ CONVFAIL = true; \
+ STATE = state_bak; \
+ /* Fall througn. */ \
+ \
+ case 0: \
+ MBLENGTH = 1; \
+ break; \
+ } \
+ } \
+ while (0)
+
/* Pointer inside RP. When checking if a byte or field is selected
by a finite range, we check if it is between CURRENT_RP.LO
@@ -58,6 +121,9 @@
CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
static struct field_range_pair *current_rp;
+/* Length of the delimiter given as argument to -d. */
+size_t delimlen;
+
/* This buffer is used to support the semantics of the -s option
(or lack of same) when the specified field list includes (does
not include) the first field. In both of those cases, the entire
@@ -70,6 +136,29 @@ static char *field_1_buffer;
/* The number of bytes allocated for FIELD_1_BUFFER. */
static size_t field_1_bufsize;
+enum operating_mode
+ {
+ undefined_mode,
+
+ /* Output bytes that are at the given positions. */
+ byte_mode,
+
+ /* Output characters that are at the given positions. */
+ character_mode,
+
+ /* Output the given delimiter-separated fields. */
+ field_mode
+ };
+
+static enum operating_mode operating_mode;
+
+/* If nonzero, when in byte mode, don't split multibyte characters. */
+static int byte_mode_character_aware;
+
+/* If nonzero, the function for single byte locale is work
+ if this program runs on multibyte locale. */
+static int force_singlebyte_mode;
+
/* If true, do not output lines containing no delimiter characters.
Otherwise, all such lines are printed. This option is valid only
with field mode. */
@@ -81,10 +170,16 @@ static bool complement;
/* The delimiter character for field mode. */
static unsigned char delim;
+#if HAVE_WCHAR_H
+static wchar_t wcdelim;
+#endif
/* The delimiter for each line/record. */
static unsigned char line_delim = '\n';
+/* True if the --output-delimiter=STRING option was specified. */
+static bool output_delimiter_specified;
+
/* The length of output_delimiter_string. */
static size_t output_delimiter_length;
@@ -92,9 +187,6 @@ static size_t output_delimiter_length;
string consisting of the input delimiter. */
static char *output_delimiter_string;
-/* The output delimiter string contents, if the default. */
-static char output_delimiter_default[1];
-
/* True if we have ever read standard input. */
static bool have_read_stdin;
@@ -148,7 +240,7 @@ Print selected parts of lines from each FILE to standard output.\n\
-f, --fields=LIST select only these fields; also print any line\n\
that contains no delimiter character, unless\n\
the -s option is specified\n\
- -n (ignored)\n\
+ -n with -b: don't split multibyte characters\n\
"), stdout);
fputs (_("\
--complement complement the set of selected bytes, characters\n\
@@ -252,7 +344,7 @@ cut_bytes (FILE *stream)
next_item (&byte_idx);
if (print_kth (byte_idx))
{
- if (output_delimiter_string != output_delimiter_default)
+ if (output_delimiter_specified)
{
if (print_delimiter && is_range_start_index (byte_idx))
{
@@ -271,6 +363,82 @@ cut_bytes (FILE *stream)
}
}
+#if HAVE_MBRTOWC
+/* This function is in use for the following case.
+
+ 1. Read from the stream STREAM, printing to standard output any selected
+ characters.
+
+ 2. Read from stream STREAM, printing to standard output any selected bytes,
+ without splitting multibyte characters. */
+
+static void
+cut_characters_or_cut_bytes_no_split (FILE *stream)
+{
+ uintmax_t idx; /* number of bytes or characters in the line so far. */
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
+ char *bufpos; /* Next read position of BUF. */
+ size_t buflen; /* The length of the byte sequence in buf. */
+ wint_t wc; /* A gotten wide character. */
+ size_t mblength; /* The byte size of a multibyte character which shows
+ as same character as WC. */
+ mbstate_t state; /* State of the stream. */
+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
+ /* Whether to begin printing delimiters between ranges for the current line.
+ Set after we've begun printing data corresponding to the first range. */
+ bool print_delimiter = false;
+
+ idx = 0;
+ buflen = 0;
+ bufpos = buf;
+ memset (&state, '\0', sizeof(mbstate_t));
+
+ current_rp = frp;
+
+ while (1)
+ {
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
+
+ GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
+ (void) convfail; /* ignore unused */
+
+ if (wc == WEOF)
+ {
+ if (idx > 0)
+ putchar (line_delim);
+ break;
+ }
+ else if (wc == line_delim)
+ {
+ putchar (line_delim);
+ idx = 0;
+ print_delimiter = false;
+ current_rp = frp;
+ }
+ else
+ {
+ next_item (&idx);
+ if (print_kth (idx))
+ {
+ if (output_delimiter_specified)
+ {
+ if (print_delimiter && is_range_start_index (idx))
+ {
+ fwrite (output_delimiter_string, sizeof (char),
+ output_delimiter_length, stdout);
+ }
+ print_delimiter = true;
+ }
+ fwrite (bufpos, mblength, sizeof(char), stdout);
+ }
+ }
+
+ buflen -= mblength;
+ bufpos += mblength;
+ }
+}
+#endif
+
/* Read from stream STREAM, printing to standard output any selected fields. */
static void
@@ -433,11 +601,218 @@ cut_fields (FILE *stream)
}
}
-/* Process file FILE to standard output, using CUT_STREAM.
+#if HAVE_MBRTOWC
+static void
+cut_fields_mb (FILE *stream)
+{
+ int c;
+ uintmax_t field_idx;
+ int found_any_selected_field;
+ int buffer_first_field;
+ int empty_input;
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
+ char *bufpos; /* Next read position of BUF. */
+ size_t buflen; /* The length of the byte sequence in buf. */
+ wint_t wc = 0; /* A gotten wide character. */
+ size_t mblength; /* The byte size of a multibyte character which shows
+ as same character as WC. */
+ mbstate_t state; /* State of the stream. */
+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
+
+ current_rp = frp;
+
+ found_any_selected_field = 0;
+ field_idx = 1;
+ bufpos = buf;
+ buflen = 0;
+ memset (&state, '\0', sizeof(mbstate_t));
+
+ c = getc (stream);
+ empty_input = (c == EOF);
+ if (c != EOF)
+ {
+ ungetc (c, stream);
+ wc = 0;
+ }
+ else
+ wc = WEOF;
+
+ /* To support the semantics of the -s flag, we may have to buffer
+ all of the first field to determine whether it is `delimited.'
+ But that is unnecessary if all non-delimited lines must be printed
+ and the first field has been selected, or if non-delimited lines
+ must be suppressed and the first field has *not* been selected.
+ That is because a non-delimited line has exactly one field. */
+ buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
+
+ while (1)
+ {
+ if (field_idx == 1 && buffer_first_field)
+ {
+ int len = 0;
+
+ while (1)
+ {
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
+
+ GET_NEXT_WC_FROM_BUFFER
+ (wc, bufpos, buflen, mblength, state, convfail);
+
+ if (wc == WEOF)
+ break;
+
+ field_1_buffer = xrealloc (field_1_buffer, len + mblength);
+ memcpy (field_1_buffer + len, bufpos, mblength);
+ len += mblength;
+ buflen -= mblength;
+ bufpos += mblength;
+
+ if (!convfail && (wc == line_delim || wc == wcdelim))
+ break;
+ }
+
+ if (len <= 0 && wc == WEOF)
+ break;
+
+ /* If the first field extends to the end of line (it is not
+ delimited) and we are printing all non-delimited lines,
+ print this one. */
+ if (convfail || (!convfail && wc != wcdelim))
+ {
+ if (suppress_non_delimited)
+ {
+ /* Empty. */
+ }
+ else
+ {
+ fwrite (field_1_buffer, sizeof (char), len, stdout);
+ /* Make sure the output line is newline terminated. */
+ if (convfail || (!convfail && wc != line_delim))
+ putchar (line_delim);
+ }
+ continue;
+ }
+
+ if (print_kth (1))
+ {
+ /* Print the field, but not the trailing delimiter. */
+ fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
+ found_any_selected_field = 1;
+ }
+ next_item (&field_idx);
+ }
+
+ if (wc != WEOF)
+ {
+ if (print_kth (field_idx))
+ {
+ if (found_any_selected_field)
+ {
+ fwrite (output_delimiter_string, sizeof (char),
+ output_delimiter_length, stdout);
+ }
+ found_any_selected_field = 1;
+ }
+
+ while (1)
+ {
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
+
+ GET_NEXT_WC_FROM_BUFFER
+ (wc, bufpos, buflen, mblength, state, convfail);
+
+ if (wc == WEOF)
+ break;
+ else if (!convfail && (wc == wcdelim || wc == line_delim))
+ {
+ buflen -= mblength;
+ bufpos += mblength;
+ break;
+ }
+
+ if (print_kth (field_idx))
+ fwrite (bufpos, mblength, sizeof(char), stdout);
+
+ buflen -= mblength;
+ bufpos += mblength;
+ }
+ }
+
+ if ((!convfail || wc == line_delim) && buflen < 1)
+ wc = WEOF;
+
+ if (!convfail && wc == wcdelim)
+ next_item (&field_idx);
+ else if (wc == WEOF || (!convfail && wc == line_delim))
+ {
+ if (found_any_selected_field
+ || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
+ putchar (line_delim);
+ if (wc == WEOF)
+ break;
+ field_idx = 1;
+ current_rp = frp;
+ found_any_selected_field = 0;
+ }
+ }
+}
+#endif
+
+static void
+cut_stream (FILE *stream)
+{
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
+ {
+ switch (operating_mode)
+ {
+ case byte_mode:
+ if (byte_mode_character_aware)
+ cut_characters_or_cut_bytes_no_split (stream);
+ else
+ cut_bytes (stream);
+ break;
+
+ case character_mode:
+ cut_characters_or_cut_bytes_no_split (stream);
+ break;
+
+ case field_mode:
+ if (delimlen == 1)
+ {
+ /* Check if we have utf8 multibyte locale, so we can use this
+ optimization because of uniqueness of characters, which is
+ not true for e.g. SJIS */
+ char * loc = setlocale(LC_CTYPE, nullptr);
+ if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
+ strstr (loc, "UTF8") || strstr (loc, "utf8")))
+ {
+ cut_fields (stream);
+ break;
+ }
+ }
+ cut_fields_mb (stream);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+ else
+#endif
+ {
+ if (operating_mode == field_mode)
+ cut_fields (stream);
+ else
+ cut_bytes (stream);
+ }
+}
+
+/* Process file FILE to standard output.
Return true if successful. */
static bool
-cut_file (char const *file, void (*cut_stream) (FILE *))
+cut_file (char const *file)
{
FILE *stream;
@@ -482,8 +857,8 @@ main (int argc, char **argv)
int optc;
bool ok;
bool delim_specified = false;
- bool byte_mode = false;
- char *spec_list_string = nullptr;
+ char *spec_list_string IF_LINT ( = nullptr);
+ char mbdelim[MB_LEN_MAX + 1];
initialize_main (&argc, &argv);
set_program_name (argv[0]);
@@ -493,6 +868,8 @@ main (int argc, char **argv)
atexit (close_stdout);
+ operating_mode = undefined_mode;
+
/* By default, all non-delimited lines are printed. */
suppress_non_delimited = false;
@@ -505,35 +882,77 @@ main (int argc, char **argv)
switch (optc)
{
case 'b':
+ /* Build the byte list. */
+ if (operating_mode != undefined_mode)
+ FATAL_ERROR (_("only one type of list may be specified"));
+ operating_mode = byte_mode;
+ spec_list_string = optarg;
+ break;
+
case 'c':
- /* Build the byte list. */
- byte_mode = true;
- FALLTHROUGH;
+ /* Build the character list. */
+ if (operating_mode != undefined_mode)
+ FATAL_ERROR (_("only one type of list may be specified"));
+ operating_mode = character_mode;
+ spec_list_string = optarg;
+ break;
+
case 'f':
- /* Build the field list. */
- if (spec_list_string)
- FATAL_ERROR (_("only one list may be specified"));
+ /* Build the field list. */
+ if (operating_mode != undefined_mode)
+ FATAL_ERROR (_("only one type of list may be specified"));
+ operating_mode = field_mode;
spec_list_string = optarg;
break;
case 'd':
/* New delimiter. */
/* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
- if (optarg[0] != '\0' && optarg[1] != '\0')
- FATAL_ERROR (_("the delimiter must be a single character"));
- delim = optarg[0];
- delim_specified = true;
+ {
+#if HAVE_MBRTOWC
+ if(MB_CUR_MAX > 1)
+ {
+ mbstate_t state;
+
+ memset (&state, '\0', sizeof(mbstate_t));
+ delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
+
+ if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
+ ++force_singlebyte_mode;
+ else
+ {
+ delimlen = (delimlen < 1) ? 1 : delimlen;
+ if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
+ FATAL_ERROR (_("the delimiter must be a single character"));
+ memcpy (mbdelim, optarg, delimlen);
+ mbdelim[delimlen] = '\0';
+ if (delimlen == 1)
+ delim = *optarg;
+ }
+ }
+
+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
+#endif
+ {
+ if (optarg[0] != '\0' && optarg[1] != '\0')
+ FATAL_ERROR (_("the delimiter must be a single character"));
+ delim = (unsigned char) optarg[0];
+ }
+ delim_specified = true;
+ }
break;
case OUTPUT_DELIMITER_OPTION:
+ output_delimiter_specified = true;
/* Interpret --output-delimiter='' to mean
'use the NUL byte as the delimiter.' */
output_delimiter_length = (optarg[0] == '\0'
? 1 : strlen (optarg));
- output_delimiter_string = optarg;
+ output_delimiter_string = xstrdup (optarg);
break;
case 'n':
+ byte_mode_character_aware = 1;
break;
case 's':
@@ -555,40 +974,57 @@ main (int argc, char **argv)
}
}
- if (!spec_list_string)
+ if (operating_mode == undefined_mode)
FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
- if (byte_mode)
- {
- if (delim_specified)
- FATAL_ERROR (_("an input delimiter may be specified only\
+ if (delim_specified && operating_mode != field_mode)
+ FATAL_ERROR (_("an input delimiter may be specified only\
when operating on fields"));
- if (suppress_non_delimited)
- FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
+ if (suppress_non_delimited && operating_mode != field_mode)
+ FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
\tonly when operating on fields"));
- }
set_fields (spec_list_string,
- ((byte_mode ? SETFLD_ERRMSG_USE_POS : 0)
- | (complement ? SETFLD_COMPLEMENT : 0)));
+ ( (operating_mode == field_mode) ? 0 : SETFLD_ERRMSG_USE_POS)
+ | (complement ? SETFLD_COMPLEMENT : 0) );
if (!delim_specified)
- delim = '\t';
+ {
+ delim = '\t';
+#ifdef HAVE_MBRTOWC
+ wcdelim = L'\t';
+ mbdelim[0] = '\t';
+ mbdelim[1] = '\0';
+ delimlen = 1;
+#endif
+ }
if (output_delimiter_string == nullptr)
{
- output_delimiter_default[0] = delim;
- output_delimiter_string = output_delimiter_default;
- output_delimiter_length = 1;
+#ifdef HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
+ {
+ output_delimiter_string = xstrdup(mbdelim);
+ output_delimiter_length = delimlen;
+ }
+
+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
+#endif
+ {
+ static char dummy[2];
+ dummy[0] = delim;
+ dummy[1] = '\0';
+ output_delimiter_string = dummy;
+ output_delimiter_length = 1;
+ }
}
- void (*cut_stream) (FILE *) = byte_mode ? cut_bytes : cut_fields;
if (optind == argc)
- ok = cut_file ("-", cut_stream);
+ ok = cut_file ("-");
else
for (ok = true; optind < argc; optind++)
- ok &= cut_file (argv[optind], cut_stream);
+ ok &= cut_file (argv[optind]);
if (have_read_stdin && fclose (stdin) == EOF)
diff --git a/src/expand-common.c b/src/expand-common.c
index c95998dc6..c5e147e5f 100644
--- a/src/expand-common.c
+++ b/src/expand-common.c
@@ -19,6 +19,7 @@
#include <ctype.h>
#include <stdio.h>
#include <sys/types.h>
+#include <mbfile.h>
#include "system.h"
#include "fadvise.h"
#include "quote.h"
@@ -123,6 +124,119 @@ set_increment_size (uintmax_t tabval)
return ok;
}
+extern int
+set_utf_locale (void)
+{
+ /*try using some predefined locale */
+ const char* predef_locales[] = {"C.UTF8","en_US.UTF8","en_GB.UTF8"};
+
+ const int predef_locales_count=3;
+ for (int i=0;i<predef_locales_count;i++)
+ {
+ if (setlocale(LC_ALL,predef_locales[i])!=nullptr)
+ {
+ break;
+ }
+ else if (i==predef_locales_count-1)
+ {
+ return 1;
+ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
+ }
+ }
+ return 0;
+}
+
+extern bool
+check_utf_locale(void)
+{
+ char* locale = setlocale (LC_CTYPE , nullptr);
+ if (locale == nullptr)
+ {
+ return false;
+ }
+ else if (strcasestr(locale, "utf8") == nullptr && strcasestr(locale, "utf-8") == nullptr)
+ {
+ return false;
+ }
+ return true;
+}
+
+extern bool
+check_bom(FILE* fp, mb_file_t *mbf)
+{
+ int c;
+
+
+ c=fgetc(fp);
+
+ /*test BOM header of the first file */
+ mbf->bufcount=0;
+ if (c == 0xEF)
+ {
+ c=fgetc(fp);
+ }
+ else
+ {
+ if (c != EOF)
+ {
+ ungetc(c,fp);
+ }
+ return false;
+ }
+
+ if (c == 0xBB)
+ {
+ c=fgetc(fp);
+ }
+ else
+ {
+ if ( c!= EOF )
+ {
+ mbf->buf[0]=(unsigned char) 0xEF;
+ mbf->bufcount=1;
+ ungetc(c,fp);
+ return false;
+ }
+ else
+ {
+ ungetc(0xEF,fp);
+ return false;
+ }
+ }
+ if (c == 0xBF)
+ {
+ mbf->bufcount=0;
+ return true;
+ }
+ else
+ {
+ if (c != EOF)
+ {
+ mbf->buf[0]=(unsigned char) 0xEF;
+ mbf->buf[1]=(unsigned char) 0xBB;
+ mbf->bufcount=2;
+ ungetc(c,fp);
+ return false;
+ }
+ else
+ {
+ mbf->buf[0]=(unsigned char) 0xEF;
+ mbf->bufcount=1;
+ ungetc(0xBB,fp);
+ return false;
+ }
+ }
+ return false;
+}
+
+extern void
+print_bom(void)
+{
+ putc (0xEF, stdout);
+ putc (0xBB, stdout);
+ putc (0xBF, stdout);
+}
+
/* Add the comma or blank separated list of tab stops STOPS
to the list of tab stops. */
extern void
diff --git a/src/expand-common.h b/src/expand-common.h
index 1a57108e4..60256522c 100644
--- a/src/expand-common.h
+++ b/src/expand-common.h
@@ -25,6 +25,18 @@ extern size_t max_column_width;
/* The desired exit status. */
extern int exit_status;
+extern int
+set_utf_locale (void);
+
+extern bool
+check_utf_locale(void);
+
+extern bool
+check_bom(FILE* fp, mb_file_t *mbf);
+
+extern void
+print_bom(void);
+
/* Add tab stop TABVAL to the end of 'tab_list'. */
extern void
add_tab_stop (uintmax_t tabval);
diff --git a/src/expand.c b/src/expand.c
index a6176a974..6cd312e59 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -38,6 +38,9 @@
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
+
+#include <mbfile.h>
+
#include "system.h"
#include "expand-common.h"
@@ -96,19 +99,41 @@ expand (void)
{
/* Input stream. */
FILE *fp = next_file (nullptr);
+ mb_file_t mbf;
+ mbf_char_t c;
+ /* True if the starting locale is utf8. */
+ bool using_utf_locale;
+
+ /* True if the first file contains BOM header. */
+ bool found_bom;
+ using_utf_locale=check_utf_locale();
if (!fp)
return;
+ mbf_init (mbf, fp);
+ found_bom=check_bom(fp,&mbf);
- while (true)
+ if (using_utf_locale == false && found_bom == true)
+ {
+ /*try using some predefined locale */
+
+ if (set_utf_locale () != 0)
{
- /* Input character, or EOF. */
- int c;
+ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
+ }
+ }
+
+
+ if (found_bom == true)
+ {
+ print_bom();
+ }
+ while (true)
+ {
/* If true, perform translations. */
bool convert = true;
-
/* The following variables have valid values only when CONVERT
is true: */
@@ -118,17 +143,48 @@ expand (void)
/* Index in TAB_LIST of next tab stop to examine. */
size_t tab_index = 0;
-
/* Convert a line of text. */
do
{
- while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
- continue;
+ while (true) {
+ mbf_getc (c, mbf);
+ if ((mb_iseof (c)) && (fp = next_file (fp)))
+ {
+ mbf_init (mbf, fp);
+ if (fp!=nullptr)
+ {
+ if (check_bom(fp,&mbf)==true)
+ {
+ /*Not the first file - check BOM header*/
+ if (using_utf_locale==false && found_bom==false)
+ {
+ /*BOM header in subsequent file but not in the first one. */
+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
+ }
+ }
+ else
+ {
+ if(using_utf_locale==false && found_bom==true)
+ {
+ /*First file conatined BOM header - locale was switched to UTF
+ *all subsequent files should contain BOM. */
+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
+ }
+ }
+ }
+ continue;
+ }
+ else
+ {
+ break;
+ }
+ }
+
if (convert)
{
- if (c == '\t')
+ if (mb_iseq (c, '\t'))
{
/* Column the next input tab stop is on. */
uintmax_t next_tab_column;
@@ -147,32 +203,34 @@ expand (void)
if (putchar (' ') < 0)
write_error ();
- c = ' ';
+ mb_setascii (&c, ' ');
}
- else if (c == '\b')
+ else if (mb_iseq (c, '\b'))
{
/* Go back one column, and force recalculation of the
next tab stop. */
column -= !!column;
tab_index -= !!tab_index;
}
- else
+ /* A leading control character could make us trip over. */
+ else if (!mb_iscntrl (c))
{
- column++;
+ column += mb_width (c);
if (!column)
error (EXIT_FAILURE, 0, _("input line is too long"));
}
- convert &= convert_entire_line || !! isblank (c);
+ convert &= convert_entire_line || mb_isblank (c);
}
- if (c < 0)
+ if (mb_iseof (c))
return;
- if (putchar (c) < 0)
+ mb_putc (c, stdout);
+ if (ferror (stdout))
write_error ();
}
- while (c != '\n');
+ while (!mb_iseq (c, '\n'));
}
}
diff --git a/src/fold.c b/src/fold.c
index 941ad11c6..247440212 100644
--- a/src/fold.c
+++ b/src/fold.c
@@ -23,10 +23,32 @@
#include <getopt.h>
#include <sys/types.h>
+/* Get mbstate_t, mbrtowc(), wcwidth(). */
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
+
+/* Get iswprint(), iswblank(), wcwidth(). */
+#if HAVE_WCTYPE_H
+# include <wctype.h>
+#endif
+
#include "system.h"
#include "fadvise.h"
#include "xdectoint.h"
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
+ installation; work around this configuration error. */
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
+# undef MB_LEN_MAX
+# define MB_LEN_MAX 16
+#endif
+
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
+
#define TAB_WIDTH 8
/* The official name of this program (e.g., no 'g' prefix). */
@@ -34,20 +56,41 @@
#define AUTHORS proper_name ("David MacKenzie")
+#define FATAL_ERROR(Message) \
+ do \
+ { \
+ error (0, 0, (Message)); \
+ usage (2); \
+ } \
+ while (0)
+
+enum operating_mode
+{
+ /* Fold texts by columns that are at the given positions. */
+ column_mode,
+
+ /* Fold texts by bytes that are at the given positions. */
+ byte_mode,
+
+ /* Fold texts by characters that are at the given positions. */
+ character_mode,
+};
+
+/* The argument shows current mode. (Default: column_mode) */
+static enum operating_mode operating_mode;
+
/* If nonzero, try to break on whitespace. */
static bool break_spaces;
-/* If nonzero, count bytes, not column positions. */
-static bool count_bytes;
-
/* If nonzero, at least one of the files we read was standard input. */
static bool have_read_stdin;
-static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
+static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
static struct option const longopts[] =
{
{"bytes", no_argument, nullptr, 'b'},
+ {"characters", no_argument, nullptr, 'c'},
{"spaces", no_argument, nullptr, 's'},
{"width", required_argument, nullptr, 'w'},
{GETOPT_HELP_OPTION_DECL},
@@ -75,6 +118,7 @@ Wrap input lines in each FILE, writing to standard output.\n\
fputs (_("\
-b, --bytes count bytes rather than columns\n\
+ -c, --characters count characters rather than columns\n\
-s, --spaces break at spaces\n\
-w, --width=WIDTH use WIDTH columns instead of 80\n\
"), stdout);
@@ -92,7 +136,7 @@ Wrap input lines in each FILE, writing to standard output.\n\
static size_t
adjust_column (size_t column, char c)
{
- if (!count_bytes)
+ if (operating_mode != byte_mode)
{
if (c == '\b')
{
@@ -115,30 +159,14 @@ adjust_column (size_t column, char c)
to stdout, with maximum line length WIDTH.
Return true if successful. */
-static bool
-fold_file (char const *filename, size_t width)
+static void
+fold_text (FILE *istream, size_t width, int *saved_errno)
{
- FILE *istream;
int c;
size_t column = 0; /* Screen column where next char will go. */
size_t offset_out = 0; /* Index in 'line_out' for next char. */
static char *line_out = nullptr;
static size_t allocated_out = 0;
- int saved_errno;
-
- if (STREQ (filename, "-"))
- {
- istream = stdin;
- have_read_stdin = true;
- }
- else
- istream = fopen (filename, "r");
-
- if (istream == nullptr)
- {
- error (0, errno, "%s", quotef (filename));
- return false;
- }
fadvise (istream, FADVISE_SEQUENTIAL);
@@ -168,6 +196,15 @@ fold_file (char const *filename, size_t width)
bool found_blank = false;
size_t logical_end = offset_out;
+ /* If LINE_OUT has no wide character,
+ put a new wide character in LINE_OUT
+ if column is bigger than width. */
+ if (offset_out == 0)
+ {
+ line_out[offset_out++] = c;
+ continue;
+ }
+
/* Look for the last blank. */
while (logical_end)
{
@@ -214,13 +251,225 @@ fold_file (char const *filename, size_t width)
line_out[offset_out++] = c;
}
- saved_errno = errno;
+ *saved_errno = errno;
if (!ferror (istream))
- saved_errno = 0;
+ *saved_errno = 0;
if (offset_out)
fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
+}
+
+#if HAVE_MBRTOWC
+static void
+fold_multibyte_text (FILE *istream, size_t width, int *saved_errno)
+{
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
+ size_t buflen = 0; /* The length of the byte sequence in buf. */
+ char *bufpos = buf; /* Next read position of BUF. */
+ wint_t wc; /* A gotten wide character. */
+ size_t mblength; /* The byte size of a multibyte character which shows
+ as same character as WC. */
+ mbstate_t state, state_bak; /* State of the stream. */
+ int convfail = 0; /* 1, when conversion is failed. Otherwise 0. */
+
+ static char *line_out = nullptr;
+ size_t offset_out = 0; /* Index in `line_out' for next char. */
+ static size_t allocated_out = 0;
+
+ int increment;
+ size_t column = 0;
+
+ size_t last_blank_pos;
+ size_t last_blank_column;
+ int is_blank_seen;
+ int last_blank_increment = 0;
+ int is_bs_following_last_blank;
+ size_t bs_following_last_blank_num;
+ int is_cr_after_last_blank;
+
+#define CLEAR_FLAGS \
+ do \
+ { \
+ last_blank_pos = 0; \
+ last_blank_column = 0; \
+ is_blank_seen = 0; \
+ is_bs_following_last_blank = 0; \
+ bs_following_last_blank_num = 0; \
+ is_cr_after_last_blank = 0; \
+ } \
+ while (0)
+
+#define START_NEW_LINE \
+ do \
+ { \
+ putchar ('\n'); \
+ column = 0; \
+ offset_out = 0; \
+ CLEAR_FLAGS; \
+ } \
+ while (0)
+
+ CLEAR_FLAGS;
+ memset (&state, '\0', sizeof(mbstate_t));
+
+ for (;; bufpos += mblength, buflen -= mblength)
+ {
+ if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream))
+ {
+ memmove (buf, bufpos, buflen);
+ buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream);
+ bufpos = buf;
+ }
+
+ if (buflen < 1)
+ break;
+
+ /* Get a wide character. */
+ state_bak = state;
+ mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &state);
+
+ switch (mblength)
+ {
+ case (size_t)-1:
+ case (size_t)-2:
+ convfail++;
+ state = state_bak;
+ /* Fall through. */
+
+ case 0:
+ mblength = 1;
+ break;
+ }
+
+rescan:
+ if (operating_mode == byte_mode) /* byte mode */
+ increment = mblength;
+ else if (operating_mode == character_mode) /* character mode */
+ increment = 1;
+ else /* column mode */
+ {
+ if (convfail)
+ increment = 1;
+ else
+ {
+ switch (wc)
+ {
+ case L'\n':
+ fwrite (line_out, sizeof(char), offset_out, stdout);
+ START_NEW_LINE;
+ continue;
+
+ case L'\b':
+ increment = (column > 0) ? -1 : 0;
+ break;
+
+ case L'\r':
+ increment = -1 * column;
+ break;
+
+ case L'\t':
+ increment = 8 - column % 8;
+ break;
+
+ default:
+ increment = wcwidth (wc);
+ increment = (increment < 0) ? 0 : increment;
+ }
+ }
+ }
+
+ if (column + increment > width && break_spaces && last_blank_pos)
+ {
+ fwrite (line_out, sizeof(char), last_blank_pos, stdout);
+ putchar ('\n');
+
+ offset_out = offset_out - last_blank_pos;
+ column = column - last_blank_column + ((is_cr_after_last_blank)
+ ? last_blank_increment : bs_following_last_blank_num);
+ memmove (line_out, line_out + last_blank_pos, offset_out);
+ CLEAR_FLAGS;
+ goto rescan;
+ }
+
+ if (column + increment > width && column != 0)
+ {
+ fwrite (line_out, sizeof(char), offset_out, stdout);
+ START_NEW_LINE;
+ goto rescan;
+ }
+
+ if (allocated_out < offset_out + mblength)
+ {
+ line_out = X2REALLOC (line_out, &allocated_out);
+ }
+
+ memcpy (line_out + offset_out, bufpos, mblength);
+ offset_out += mblength;
+ column += increment;
+
+ if (is_blank_seen && !convfail && wc == L'\r')
+ is_cr_after_last_blank = 1;
+
+ if (is_bs_following_last_blank && !convfail && wc == L'\b')
+ ++bs_following_last_blank_num;
+ else
+ is_bs_following_last_blank = 0;
+
+ if (break_spaces && !convfail && iswblank (wc))
+ {
+ last_blank_pos = offset_out;
+ last_blank_column = column;
+ is_blank_seen = 1;
+ last_blank_increment = increment;
+ is_bs_following_last_blank = 1;
+ bs_following_last_blank_num = 0;
+ is_cr_after_last_blank = 0;
+ }
+ }
+
+ *saved_errno = errno;
+ if (!ferror (istream))
+ *saved_errno = 0;
+
+ if (offset_out)
+ fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
+
+}
+#endif
+
+/* Fold file FILENAME, or standard input if FILENAME is "-",
+ to stdout, with maximum line length WIDTH.
+ Return 0 if successful, 1 if an error occurs. */
+
+static bool
+fold_file (char const *filename, size_t width)
+{
+ FILE *istream;
+ int saved_errno;
+
+ if (STREQ (filename, "-"))
+ {
+ istream = stdin;
+ have_read_stdin = 1;
+ }
+ else
+ istream = fopen (filename, "r");
+
+ if (istream == nullptr)
+ {
+ error (0, errno, "%s", filename);
+ return 1;
+ }
+
+ /* Define how ISTREAM is being folded. */
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ fold_multibyte_text (istream, width, &saved_errno);
+ else
+#endif
+ fold_text (istream, width, &saved_errno);
+
if (STREQ (filename, "-"))
clearerr (istream);
else if (fclose (istream) != 0 && !saved_errno)
@@ -251,7 +500,8 @@ main (int argc, char **argv)
atexit (close_stdout);
- break_spaces = count_bytes = have_read_stdin = false;
+ operating_mode = column_mode;
+ break_spaces = have_read_stdin = false;
while ((optc = getopt_long (argc, argv, shortopts, longopts, nullptr)) != -1)
{
@@ -260,7 +510,15 @@ main (int argc, char **argv)
switch (optc)
{
case 'b': /* Count bytes rather than columns. */
- count_bytes = true;
+ if (operating_mode != column_mode)
+ FATAL_ERROR (_("only one way of folding may be specified"));
+ operating_mode = byte_mode;
+ break;
+
+ case 'c':
+ if (operating_mode != column_mode)
+ FATAL_ERROR (_("only one way of folding may be specified"));
+ operating_mode = character_mode;
break;
case 's': /* Break at word boundaries. */
diff --git a/src/local.mk b/src/local.mk
index 96ee941ca..fdc81b934 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -450,8 +450,8 @@ src_base32_CPPFLAGS = -DBASE_TYPE=32 $(AM_CPPFLAGS)
src_basenc_SOURCES = src/basenc.c
src_basenc_CPPFLAGS = -DBASE_TYPE=42 $(AM_CPPFLAGS)
-src_expand_SOURCES = src/expand.c src/expand-common.c
-src_unexpand_SOURCES = src/unexpand.c src/expand-common.c
+src_expand_SOURCES = src/expand.c src/expand-common.c lib/mbfile.c
+src_unexpand_SOURCES = src/unexpand.c src/expand-common.c lib/mbfile.c
src_wc_SOURCES = src/wc.c
if USE_AVX2_WC_LINECOUNT
diff --git a/src/pr.c b/src/pr.c
index 09c6fa8ac..7552b6298 100644
--- a/src/pr.c
+++ b/src/pr.c
@@ -312,6 +312,24 @@
#include <ctype.h>
#include <getopt.h>
#include <sys/types.h>
+
+/* Get MB_LEN_MAX. */
+#include <limits.h>
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
+ installation; work around this configuration error. */
+#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
+# define MB_LEN_MAX 16
+#endif
+
+/* Get MB_CUR_MAX. */
+#include <stdlib.h>
+
+/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
+/* Get mbstate_t, mbrtowc(), wcwidth(). */
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
+
#include "system.h"
#include "fadvise.h"
#include "hard-locale.h"
@@ -324,6 +342,18 @@
#include "xstrtol-error.h"
#include "xdectoint.h"
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
+
+#ifndef HAVE_DECL_WCWIDTH
+"this configure-time declaration test was not run"
+#endif
+#if !HAVE_DECL_WCWIDTH
+extern int wcwidth ();
+#endif
+
/* The official name of this program (e.g., no 'g' prefix). */
#define PROGRAM_NAME "pr"
@@ -416,7 +446,20 @@ struct COLUMN
typedef struct COLUMN COLUMN;
-static int char_to_clump (char c);
+/* Funtion pointers to switch functions for single byte locale or for
+ multibyte locale. If multibyte functions do not exist in your sysytem,
+ these pointers always point the function for single byte locale. */
+static void (*print_char) (char c);
+static int (*char_to_clump) (char c);
+
+/* Functions for single byte locale. */
+static void print_char_single (char c);
+static int char_to_clump_single (char c);
+
+/* Functions for multibyte locale. */
+static void print_char_multi (char c);
+static int char_to_clump_multi (char c);
+
static bool read_line (COLUMN *p);
static bool print_page (void);
static bool print_stored (COLUMN *p);
@@ -428,6 +471,7 @@ static void add_line_number (COLUMN *p);
static void getoptnum (char const *n_str, int min, int *num,
char const *errfmt);
static void getoptarg (char *arg, char switch_char, char *character,
+ int *character_length, int *character_width,
int *number);
static void print_files (int number_of_files, char **av);
static void init_parameters (int number_of_files);
@@ -441,7 +485,6 @@ static void store_char (char c);
static void pad_down (unsigned int lines);
static void read_rest_of_line (COLUMN *p);
static void skip_read (COLUMN *p, int column_number);
-static void print_char (char c);
static void cleanup (void);
static void print_sep_string (void);
static void separator_string (char const *optarg_S);
@@ -453,7 +496,7 @@ static COLUMN *column_vector;
we store the leftmost columns contiguously in buff.
To print a line from buff, get the index of the first character
from line_vector[i], and print up to line_vector[i + 1]. */
-static char *buff;
+static unsigned char *buff;
/* Index of the position in buff where the next character
will be stored. */
@@ -557,7 +600,7 @@ static int chars_per_column;
static bool untabify_input = false;
/* (-e) The input tab character. */
-static char input_tab_char = '\t';
+static char input_tab_char[MB_LEN_MAX] = "\t";
/* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ...
where the leftmost column is 1. */
@@ -567,7 +610,10 @@ static int chars_per_input_tab = 8;
static bool tabify_output = false;
/* (-i) The output tab character. */
-static char output_tab_char = '\t';
+static char output_tab_char[MB_LEN_MAX] = "\t";
+
+/* (-i) The byte length of output tab character. */
+static int output_tab_char_length = 1;
/* (-i) The width of the output tab. */
static int chars_per_output_tab = 8;
@@ -637,7 +683,13 @@ static int line_number;
static bool numbered_lines = false;
/* (-n) Character which follows each line number. */
-static char number_separator = '\t';
+static char number_separator[MB_LEN_MAX] = "\t";
+
+/* (-n) The byte length of the character which follows each line number. */
+static int number_separator_length = 1;
+
+/* (-n) The character width of the character which follows each line number. */
+static int number_separator_width = 0;
/* (-n) line counting starts with 1st line of input file (not with 1st
line of 1st page printed). */
@@ -690,6 +742,7 @@ static bool use_col_separator = false;
-a|COLUMN|-m is a 'space' and with the -J option a 'tab'. */
static char const *col_sep_string = "";
static int col_sep_length = 0;
+static int col_sep_width = 0;
static char *column_separator = (char *) " ";
static char *line_separator = (char *) "\t";
@@ -852,6 +905,13 @@ separator_string (char const *optarg_S)
integer_overflow ();
col_sep_length = len;
col_sep_string = optarg_S;
+
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ col_sep_width = mbswidth (col_sep_string, 0);
+ else
+#endif
+ col_sep_width = col_sep_length;
}
int
@@ -876,6 +936,21 @@ main (int argc, char **argv)
atexit (close_stdout);
+/* Define which functions are used, the ones for single byte locale or the ones
+ for multibyte locale. */
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ print_char = print_char_multi;
+ char_to_clump = char_to_clump_multi;
+ }
+ else
+#endif
+ {
+ print_char = print_char_single;
+ char_to_clump = char_to_clump_single;
+ }
+
n_files = 0;
file_names = (argc > 1
? xnmalloc (argc - 1, sizeof (char *))
@@ -952,8 +1027,12 @@ main (int argc, char **argv)
break;
case 'e':
if (optarg)
- getoptarg (optarg, 'e', &input_tab_char,
- &chars_per_input_tab);
+ {
+ int dummy_length, dummy_width;
+
+ getoptarg (optarg, 'e', input_tab_char, &dummy_length,
+ &dummy_width, &chars_per_input_tab);
+ }
/* Could check tab width > 0. */
untabify_input = true;
break;
@@ -966,8 +1045,12 @@ main (int argc, char **argv)
break;
case 'i':
if (optarg)
- getoptarg (optarg, 'i', &output_tab_char,
- &chars_per_output_tab);
+ {
+ int dummy_width;
+
+ getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length,
+ &dummy_width, &chars_per_output_tab);
+ }
/* Could check tab width > 0. */
tabify_output = true;
break;
@@ -985,8 +1068,8 @@ main (int argc, char **argv)
case 'n':
numbered_lines = true;
if (optarg)
- getoptarg (optarg, 'n', &number_separator,
- &chars_per_number);
+ getoptarg (optarg, 'n', number_separator, &number_separator_length,
+ &number_separator_width, &chars_per_number);
break;
case 'N':
skip_count = false;
@@ -1011,6 +1094,7 @@ main (int argc, char **argv)
/* Reset an additional input of -s, -S dominates -s */
col_sep_string = "";
col_sep_length = 0;
+ col_sep_width = 0;
use_col_separator = true;
if (optarg)
separator_string (optarg);
@@ -1165,7 +1249,8 @@ getoptnum (char const *n_str, int min, int *num, char const *err)
a number. */
static void
-getoptarg (char *arg, char switch_char, char *character, int *number)
+getoptarg (char *arg, char switch_char, char *character, int *character_length,
+ int *character_width, int *number)
{
if (!*arg)
{
@@ -1174,7 +1259,41 @@ getoptarg (char *arg, char switch_char, char *character, int *number)
}
if (!ISDIGIT (*arg))
- *character = *arg++;
+ {
+#ifdef HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1) /* for multibyte locale. */
+ {
+ wchar_t wc;
+ size_t mblength;
+ int width;
+ mbstate_t state = {'\0'};
+
+ mblength = mbrtowc (&wc, arg, strnlen(arg, MB_LEN_MAX), &state);
+
+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
+ {
+ *character_length = 1;
+ *character_width = 1;
+ }
+ else
+ {
+ *character_length = (mblength < 1) ? 1 : mblength;
+ width = wcwidth (wc);
+ *character_width = (width < 0) ? 0 : width;
+ }
+
+ strncpy (character, arg, *character_length);
+ arg += *character_length;
+ }
+ else /* for single byte locale. */
+#endif
+ {
+ *character = *arg++;
+ *character_length = 1;
+ *character_width = 1;
+ }
+ }
+
if (*arg)
{
long int tmp_long;
@@ -1203,6 +1322,11 @@ static void
init_parameters (int number_of_files)
{
int chars_used_by_number = 0;
+ int mb_len = 1;
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ mb_len = MB_LEN_MAX;
+#endif
lines_per_body = lines_per_page - lines_per_header - lines_per_footer;
if (lines_per_body <= 0)
@@ -1240,7 +1364,7 @@ init_parameters (int number_of_files)
else
col_sep_string = column_separator;
- col_sep_length = 1;
+ col_sep_length = col_sep_width = 1;
use_col_separator = true;
}
/* It's rather pointless to define a TAB separator with column
@@ -1272,11 +1396,11 @@ init_parameters (int number_of_files)
+ TAB_WIDTH (chars_per_input_tab, chars_per_number); */
/* Estimate chars_per_text without any margin and keep it constant. */
- if (number_separator == '\t')
+ if (number_separator[0] == '\t')
number_width = (chars_per_number
+ TAB_WIDTH (chars_per_default_tab, chars_per_number));
else
- number_width = chars_per_number + 1;
+ number_width = chars_per_number + number_separator_width;
/* The number is part of the column width unless we are
printing files in parallel. */
@@ -1285,7 +1409,7 @@ init_parameters (int number_of_files)
}
int sep_chars, useful_chars;
- if (ckd_mul (&sep_chars, columns - 1, col_sep_length))
+ if (ckd_mul (&sep_chars, columns - 1, col_sep_width))
sep_chars = INT_MAX;
if (ckd_sub (&useful_chars, chars_per_line - chars_used_by_number,
sep_chars))
@@ -1308,7 +1432,7 @@ init_parameters (int number_of_files)
We've to use 8 as the lower limit, if we use chars_per_default_tab = 8
to expand a tab which is not an input_tab-char. */
free (clump_buff);
- clump_buff = xmalloc (MAX (8, chars_per_input_tab));
+ clump_buff = xmalloc (mb_len * MAX (8, chars_per_input_tab));
}
/* Open the necessary files,
@@ -1414,7 +1538,7 @@ init_funcs (void)
/* Enlarge p->start_position of first column to use the same form of
padding_not_printed with all columns. */
- h = h + col_sep_length;
+ h = h + col_sep_width;
/* This loop takes care of all but the rightmost column. */
@@ -1448,7 +1572,7 @@ init_funcs (void)
}
else
{
- h = h_next + col_sep_length;
+ h = h_next + col_sep_width;
h_next = h + chars_per_column;
}
}
@@ -1745,9 +1869,9 @@ static void
align_column (COLUMN *p)
{
padding_not_printed = p->start_position;
- if (col_sep_length < padding_not_printed)
+ if (col_sep_width < padding_not_printed)
{
- pad_across_to (padding_not_printed - col_sep_length);
+ pad_across_to (padding_not_printed - col_sep_width);
padding_not_printed = ANYWHERE;
}
@@ -2021,13 +2145,13 @@ store_char (char c)
/* May be too generous. */
buff = X2REALLOC (buff, &buff_allocated);
}
- buff[buff_current++] = c;
+ buff[buff_current++] = (unsigned char) c;
}
static void
add_line_number (COLUMN *p)
{
- int i;
+ int i, j;
char *s;
int num_width;
@@ -2044,22 +2168,24 @@ add_line_number (COLUMN *p)
/* Tabification is assumed for multiple columns, also for n-separators,
but 'default n-separator = TAB' hasn't been given priority over
equal column_width also specified by POSIX. */
- if (number_separator == '\t')
+ if (number_separator[0] == '\t')
{
i = number_width - chars_per_number;
while (i-- > 0)
(p->char_func) (' ');
}
else
- (p->char_func) (number_separator);
+ for (j = 0; j < number_separator_length; j++)
+ (p->char_func) (number_separator[j]);
}
else
/* To comply with POSIX, we avoid any expansion of default TAB
separator with a single column output. No column_width requirement
has to be considered. */
{
- (p->char_func) (number_separator);
- if (number_separator == '\t')
+ for (j = 0; j < number_separator_length; j++)
+ (p->char_func) (number_separator[j]);
+ if (number_separator[0] == '\t')
output_position = POS_AFTER_TAB (chars_per_output_tab,
output_position);
}
@@ -2218,7 +2344,7 @@ print_white_space (void)
while (goal - h_old > 1
&& (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal)
{
- putchar (output_tab_char);
+ fwrite (output_tab_char, sizeof(char), output_tab_char_length, stdout);
h_old = h_new;
}
while (++h_old <= goal)
@@ -2238,6 +2364,7 @@ print_sep_string (void)
{
char const *s = col_sep_string;
int l = col_sep_length;
+ int not_space_flag;
if (separators_not_printed <= 0)
{
@@ -2249,6 +2376,7 @@ print_sep_string (void)
{
for (; separators_not_printed > 0; --separators_not_printed)
{
+ not_space_flag = 0;
while (l-- > 0)
{
/* 3 types of sep_strings: spaces only, spaces and chars,
@@ -2262,12 +2390,15 @@ print_sep_string (void)
}
else
{
+ not_space_flag = 1;
if (spaces_not_printed > 0)
print_white_space ();
putchar (*s++);
- ++output_position;
}
}
+ if (not_space_flag)
+ output_position += col_sep_width;
+
/* sep_string ends with some spaces */
if (spaces_not_printed > 0)
print_white_space ();
@@ -2295,7 +2426,7 @@ print_clump (COLUMN *p, int n, char *clump)
required number of tabs and spaces. */
static void
-print_char (char c)
+print_char_single (char c)
{
if (tabify_output)
{
@@ -2319,6 +2450,74 @@ print_char (char c)
putchar (c);
}
+#ifdef HAVE_MBRTOWC
+static void
+print_char_multi (char c)
+{
+ static size_t mbc_pos = 0;
+ static char mbc[MB_LEN_MAX] = {'\0'};
+ static mbstate_t state = {'\0'};
+ mbstate_t state_bak;
+ wchar_t wc;
+ size_t mblength;
+ int width;
+
+ if (tabify_output)
+ {
+ state_bak = state;
+ mbc[mbc_pos++] = c;
+ mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
+
+ while (mbc_pos > 0)
+ {
+ switch (mblength)
+ {
+ case (size_t)-2:
+ state = state_bak;
+ return;
+
+ case (size_t)-1:
+ state = state_bak;
+ ++output_position;
+ putchar (mbc[0]);
+ memmove (mbc, mbc + 1, MB_CUR_MAX - 1);
+ --mbc_pos;
+ break;
+
+ case 0:
+ mblength = 1;
+
+ default:
+ if (wc == L' ')
+ {
+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
+ --mbc_pos;
+ ++spaces_not_printed;
+ return;
+ }
+ else if (spaces_not_printed > 0)
+ print_white_space ();
+
+ /* Nonprintables are assumed to have width 0, except L'\b'. */
+ if ((width = wcwidth (wc)) < 1)
+ {
+ if (wc == L'\b')
+ --output_position;
+ }
+ else
+ output_position += width;
+
+ fwrite (mbc, sizeof(char), mblength, stdout);
+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
+ mbc_pos -= mblength;
+ }
+ }
+ return;
+ }
+ putchar (c);
+}
+#endif
+
/* Skip to page PAGE before printing.
PAGE may be larger than total number of pages. */
@@ -2495,9 +2694,9 @@ read_line (COLUMN *p)
align_empty_cols = false;
}
- if (col_sep_length < padding_not_printed)
+ if (col_sep_width < padding_not_printed)
{
- pad_across_to (padding_not_printed - col_sep_length);
+ pad_across_to (padding_not_printed - col_sep_width);
padding_not_printed = ANYWHERE;
}
@@ -2566,7 +2765,7 @@ print_stored (COLUMN *p)
COLUMN *q;
int line = p->current_line++;
- char *first = &buff[line_vector[line]];
+ unsigned char *first = &buff[line_vector[line]];
/* FIXME
UMR: Uninitialized memory read:
* This is occurring while in:
@@ -2578,7 +2777,7 @@ print_stored (COLUMN *p)
xmalloc [xmalloc.c:94]
init_store_cols [pr.c:1648]
*/
- char *last = &buff[line_vector[line + 1]];
+ unsigned char *last = &buff[line_vector[line + 1]];
pad_vertically = true;
@@ -2598,9 +2797,9 @@ print_stored (COLUMN *p)
}
}
- if (col_sep_length < padding_not_printed)
+ if (col_sep_width < padding_not_printed)
{
- pad_across_to (padding_not_printed - col_sep_length);
+ pad_across_to (padding_not_printed - col_sep_width);
padding_not_printed = ANYWHERE;
}
@@ -2613,8 +2812,8 @@ print_stored (COLUMN *p)
if (spaces_not_printed == 0)
{
output_position = p->start_position + end_vector[line];
- if (p->start_position - col_sep_length == chars_per_margin)
- output_position -= col_sep_length;
+ if (p->start_position - col_sep_width == chars_per_margin)
+ output_position -= col_sep_width;
}
return true;
@@ -2633,7 +2832,7 @@ print_stored (COLUMN *p)
number of characters is 1.) */
static int
-char_to_clump (char c)
+char_to_clump_single (char c)
{
unsigned char uc = c;
char *s = clump_buff;
@@ -2643,10 +2842,10 @@ char_to_clump (char c)
int chars;
int chars_per_c = 8;
- if (c == input_tab_char)
+ if (c == input_tab_char[0])
chars_per_c = chars_per_input_tab;
- if (c == input_tab_char || c == '\t')
+ if (c == input_tab_char[0] || c == '\t')
{
width = TAB_WIDTH (chars_per_c, input_position);
@@ -2727,6 +2926,164 @@ char_to_clump (char c)
return chars;
}
+#ifdef HAVE_MBRTOWC
+static int
+char_to_clump_multi (char c)
+{
+ static size_t mbc_pos = 0;
+ static char mbc[MB_LEN_MAX] = {'\0'};
+ static mbstate_t state = {'\0'};
+ mbstate_t state_bak;
+ wchar_t wc;
+ size_t mblength;
+ int wc_width;
+ register char *s = clump_buff;
+ register int i, j;
+ char esc_buff[4];
+ int width;
+ int chars;
+ int chars_per_c = 8;
+
+ state_bak = state;
+ mbc[mbc_pos++] = c;
+ mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
+
+ width = 0;
+ chars = 0;
+ while (mbc_pos > 0)
+ {
+ switch (mblength)
+ {
+ case (size_t)-2:
+ state = state_bak;
+ return 0;
+
+ case (size_t)-1:
+ state = state_bak;
+ mblength = 1;
+
+ if (use_esc_sequence || use_cntrl_prefix)
+ {
+ width = +4;
+ chars = +4;
+ *s++ = '\\';
+ sprintf (esc_buff, "%03o", (unsigned char) mbc[0]);
+ for (i = 0; i <= 2; ++i)
+ *s++ = (int) esc_buff[i];
+ }
+ else
+ {
+ width += 1;
+ chars += 1;
+ *s++ = mbc[0];
+ }
+ break;
+
+ case 0:
+ mblength = 1;
+ /* Fall through */
+
+ default:
+ if (memcmp (mbc, input_tab_char, mblength) == 0)
+ chars_per_c = chars_per_input_tab;
+
+ if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t')
+ {
+ int width_inc;
+
+ width_inc = TAB_WIDTH (chars_per_c, input_position);
+ width += width_inc;
+
+ if (untabify_input)
+ {
+ for (i = width_inc; i; --i)
+ *s++ = ' ';
+ chars += width_inc;
+ }
+ else
+ {
+ for (i = 0; i < mblength; i++)
+ *s++ = mbc[i];
+ chars += mblength;
+ }
+ }
+ else if ((wc_width = wcwidth (wc)) < 1)
+ {
+ if (use_esc_sequence)
+ {
+ for (i = 0; i < mblength; i++)
+ {
+ width += 4;
+ chars += 4;
+ *s++ = '\\';
+ sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
+ for (j = 0; j <= 2; ++j)
+ *s++ = (int) esc_buff[j];
+ }
+ }
+ else if (use_cntrl_prefix)
+ {
+ if (wc < 0200)
+ {
+ width += 2;
+ chars += 2;
+ *s++ = '^';
+ *s++ = wc ^ 0100;
+ }
+ else
+ {
+ for (i = 0; i < mblength; i++)
+ {
+ width += 4;
+ chars += 4;
+ *s++ = '\\';
+ sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
+ for (j = 0; j <= 2; ++j)
+ *s++ = (int) esc_buff[j];
+ }
+ }
+ }
+ else if (wc == L'\b')
+ {
+ width += -1;
+ chars += 1;
+ *s++ = c;
+ }
+ else
+ {
+ width += 0;
+ chars += mblength;
+ for (i = 0; i < mblength; i++)
+ *s++ = mbc[i];
+ }
+ }
+ else
+ {
+ width += wc_width;
+ chars += mblength;
+ for (i = 0; i < mblength; i++)
+ *s++ = mbc[i];
+ }
+ }
+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
+ mbc_pos -= mblength;
+ }
+
+ /* Too many backspaces must put us in position 0 -- never negative. */
+ if (width < 0 && input_position == 0)
+ {
+ chars = 0;
+ input_position = 0;
+ }
+ else if (width < 0 && input_position <= -width)
+ input_position = 0;
+ else
+ input_position += width;
+
+ return chars;
+}
+#endif
+
/* We've just printed some files and need to clean up things before
looking for more options and printing the next batch of files.
diff --git a/src/sort.c b/src/sort.c
index 2d8324ca4..96338fbdc 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -29,6 +29,16 @@
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
+
+#include <assert.h>
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
+/* Get isw* functions. */
+#if HAVE_WCTYPE_H
+# include <wctype.h>
+#endif
+
#include "system.h"
#include "argmatch.h"
#include "assure.h"
@@ -157,14 +167,39 @@ static int thousands_sep;
/* We currently ignore multi-byte grouping chars. */
static bool thousands_sep_ignored;
+/* True if -f is specified. */
+static bool folding;
+
/* Nonzero if the corresponding locales are hard. */
static bool hard_LC_COLLATE;
-#if HAVE_NL_LANGINFO
+#if HAVE_LANGINFO_CODESET
static bool hard_LC_TIME;
#endif
#define NONZERO(x) ((x) != 0)
+/* get a multibyte character's byte length. */
+#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \
+ do \
+ { \
+ wchar_t wc; \
+ mbstate_t state_bak; \
+ \
+ state_bak = STATE; \
+ mblength = mbrtowc (&wc, PTR, LIM - PTR, &STATE); \
+ \
+ switch (MBLENGTH) \
+ { \
+ case (size_t)-1: \
+ case (size_t)-2: \
+ STATE = state_bak; \
+ /* Fall through. */ \
+ case 0: \
+ MBLENGTH = 1; \
+ } \
+ } \
+ while (0)
+
/* The kind of blanks for '-b' to skip in various options. */
enum blanktype { bl_start, bl_end, bl_both };
@@ -341,13 +376,11 @@ static bool stable;
/* An int value outside char range. */
enum { NON_CHAR = CHAR_MAX + 1 };
-/* If TAB has this value, blanks separate fields. */
-enum { TAB_DEFAULT = CHAR_MAX + 1 };
-
-/* Tab character separating fields. If TAB_DEFAULT, then fields are
+/* Tab character separating fields. If tab_length is 0, then fields are
separated by the empty string between a non-blank character and a blank
character. */
-static int tab = TAB_DEFAULT;
+static char tab[MB_LEN_MAX + 1];
+static size_t tab_length = 0;
/* Flag to remove consecutive duplicate lines from the output.
Only the last of a sequence of equal lines will be output. */
@@ -804,6 +837,46 @@ reap_all (void)
reap (-1);
}
+/* Function pointers. */
+static void
+(*inittables) (void);
+static char *
+(*begfield) (const struct line*, const struct keyfield *);
+static char *
+(*limfield) (const struct line*, const struct keyfield *);
+static void
+(*skipblanks) (char **ptr, char *lim);
+static int
+(*getmonth) (char const *, size_t, char **);
+static int
+(*keycompare) (const struct line *, const struct line *);
+static int
+(*numcompare) (const char *, const char *);
+
+/* Test for white space multibyte character.
+ Set LENGTH the byte length of investigated multibyte character. */
+#if HAVE_MBRTOWC
+static int
+ismbblank (const char *str, size_t len, size_t *length)
+{
+ size_t mblength;
+ wchar_t wc;
+ mbstate_t state;
+
+ memset (&state, '\0', sizeof(mbstate_t));
+ mblength = mbrtowc (&wc, str, len, &state);
+
+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
+ {
+ *length = 1;
+ return 0;
+ }
+
+ *length = (mblength < 1) ? 1 : mblength;
+ return iswblank (wc) || wc == '\n';
+}
+#endif
+
/* Clean up any remaining temporary files. */
static void
@@ -1271,7 +1344,7 @@ zaptemp (char const *name)
free (node);
}
-#if HAVE_NL_LANGINFO
+#if HAVE_LANGINFO_CODESET
static int
struct_month_cmp (void const *m1, void const *m2)
@@ -1286,7 +1359,7 @@ struct_month_cmp (void const *m1, void const *m2)
/* Initialize the character class tables. */
static void
-inittables (void)
+inittables_uni (void)
{
size_t i;
@@ -1298,7 +1371,7 @@ inittables (void)
fold_toupper[i] = toupper (i);
}
-#if HAVE_NL_LANGINFO
+#if HAVE_LANGINFO_CODESET
/* If we're not in the "C" locale, read different names for months. */
if (hard_LC_TIME)
{
@@ -1380,6 +1453,84 @@ specify_nmerge (int oi, char c, char const *s)
xstrtol_fatal (e, oi, c, long_options, s);
}
+#if HAVE_MBRTOWC
+static void
+inittables_mb (void)
+{
+ int i, j, k, l;
+ char *name, *s, *lc_time, *lc_ctype;
+ size_t s_len, mblength;
+ char mbc[MB_LEN_MAX];
+ wchar_t wc, pwc;
+ mbstate_t state_mb, state_wc;
+
+ lc_time = setlocale (LC_TIME, "");
+ if (lc_time)
+ lc_time = xstrdup (lc_time);
+
+ lc_ctype = setlocale (LC_CTYPE, "");
+ if (lc_ctype)
+ lc_ctype = xstrdup (lc_ctype);
+
+ if (lc_time && lc_ctype)
+ /* temporarily set LC_CTYPE to match LC_TIME, so that we can convert
+ * the names of months to upper case */
+ setlocale (LC_CTYPE, lc_time);
+
+ for (i = 0; i < MONTHS_PER_YEAR; i++)
+ {
+ s = (char *) nl_langinfo (ABMON_1 + i);
+ s_len = strlen (s);
+ monthtab[i].name = name = (char *) xmalloc (s_len + 1);
+ monthtab[i].val = i + 1;
+
+ memset (&state_mb, '\0', sizeof (mbstate_t));
+ memset (&state_wc, '\0', sizeof (mbstate_t));
+
+ for (j = 0; j < s_len;)
+ {
+ if (!ismbblank (s + j, s_len - j, &mblength))
+ break;
+ j += mblength;
+ }
+
+ for (k = 0; j < s_len;)
+ {
+ mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb);
+ assert (mblength != (size_t)-1 && mblength != (size_t)-2);
+ if (mblength == 0)
+ break;
+
+ pwc = towupper (wc);
+ if (pwc == wc)
+ {
+ memcpy (mbc, s + j, mblength);
+ j += mblength;
+ }
+ else
+ {
+ j += mblength;
+ mblength = wcrtomb (mbc, pwc, &state_wc);
+ assert (mblength != (size_t)0 && mblength != (size_t)-1);
+ }
+
+ for (l = 0; l < mblength; l++)
+ name[k++] = mbc[l];
+ }
+ name[k] = '\0';
+ }
+ qsort ((void *) monthtab, MONTHS_PER_YEAR,
+ sizeof (struct month), struct_month_cmp);
+
+ if (lc_time && lc_ctype)
+ /* restore the original locales */
+ setlocale (LC_CTYPE, lc_ctype);
+
+ free (lc_ctype);
+ free (lc_time);
+}
+#endif
+
/* Specify the amount of main memory to use when sorting. */
static void
specify_sort_size (int oi, char c, char const *s)
@@ -1611,7 +1762,7 @@ buffer_linelim (struct buffer const *buf)
by KEY in LINE. */
static char *
-begfield (struct line const *line, struct keyfield const *key)
+begfield_uni (const struct line *line, const struct keyfield *key)
{
char *ptr = line->text, *lim = ptr + line->length - 1;
size_t sword = key->sword;
@@ -1620,10 +1771,10 @@ begfield (struct line const *line, struct keyfield const *key)
/* The leading field separator itself is included in a field when -t
is absent. */
- if (tab != TAB_DEFAULT)
+ if (tab_length)
while (ptr < lim && sword--)
{
- while (ptr < lim && *ptr != tab)
+ while (ptr < lim && *ptr != tab[0])
++ptr;
if (ptr < lim)
++ptr;
@@ -1649,12 +1800,71 @@ begfield (struct line const *line, struct keyfield const *key)
return ptr;
}
+#if HAVE_MBRTOWC
+static char *
+begfield_mb (const struct line *line, const struct keyfield *key)
+{
+ int i;
+ char *ptr = line->text, *lim = ptr + line->length - 1;
+ size_t sword = key->sword;
+ size_t schar = key->schar;
+ size_t mblength;
+ mbstate_t state;
+
+ memset (&state, '\0', sizeof(mbstate_t));
+
+ if (tab_length)
+ while (ptr < lim && sword--)
+ {
+ while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ if (ptr < lim)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ }
+ else
+ while (ptr < lim && sword--)
+ {
+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
+ if (ptr < lim)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
+ }
+
+ if (key->skipsblanks)
+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
+
+ for (i = 0; i < schar; i++)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+
+ if (ptr + mblength > lim)
+ break;
+ else
+ ptr += mblength;
+ }
+
+ return ptr;
+}
+#endif
+
/* Return the limit of (a pointer to the first character after) the field
in LINE specified by KEY. */
ATTRIBUTE_PURE
static char *
-limfield (struct line const *line, struct keyfield const *key)
+limfield_uni (struct line const *line, struct keyfield const *key)
{
char *ptr = line->text, *lim = ptr + line->length - 1;
size_t eword = key->eword, echar = key->echar;
@@ -1669,10 +1879,10 @@ limfield (struct line const *line, struct keyfield const *key)
'beginning' is the first character following the delimiting TAB.
Otherwise, leave PTR pointing at the first 'blank' character after
the preceding field. */
- if (tab != TAB_DEFAULT)
+ if (tab_length)
while (ptr < lim && eword--)
{
- while (ptr < lim && *ptr != tab)
+ while (ptr < lim && *ptr != tab[0])
++ptr;
if (ptr < lim && (eword || echar))
++ptr;
@@ -1718,10 +1928,10 @@ limfield (struct line const *line, struct keyfield const *key)
*/
/* Make LIM point to the end of (one byte past) the current field. */
- if (tab != TAB_DEFAULT)
+ if (tab_length)
{
char *newlim;
- newlim = memchr (ptr, tab, lim - ptr);
+ newlim = memchr (ptr, tab[0], lim - ptr);
if (newlim)
lim = newlim;
}
@@ -1752,6 +1962,130 @@ limfield (struct line const *line, struct keyfield const *key)
return ptr;
}
+#if HAVE_MBRTOWC
+static char * _GL_ATTRIBUTE_PURE
+limfield_mb (const struct line *line, const struct keyfield *key)
+{
+ char *ptr = line->text, *lim = ptr + line->length - 1;
+ size_t eword = key->eword, echar = key->echar;
+ int i;
+ size_t mblength;
+ mbstate_t state;
+
+ if (echar == 0)
+ eword++; /* skip all of end field. */
+
+ memset (&state, '\0', sizeof(mbstate_t));
+
+ if (tab_length)
+ while (ptr < lim && eword--)
+ {
+ while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ if (ptr < lim && (eword | echar))
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ }
+ else
+ while (ptr < lim && eword--)
+ {
+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
+ if (ptr < lim)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
+ }
+
+
+# ifdef POSIX_UNSPECIFIED
+ /* Make LIM point to the end of (one byte past) the current field. */
+ if (tab_length)
+ {
+ char *newlim, *p;
+
+ newlim = nullptr;
+ for (p = ptr; p < lim;)
+ {
+ if (memcmp (p, tab, tab_length) == 0)
+ {
+ newlim = p;
+ break;
+ }
+
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ p += mblength;
+ }
+ }
+ else
+ {
+ char *newlim;
+ newlim = ptr;
+
+ while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength))
+ newlim += mblength;
+ if (ptr < lim)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength))
+ newlim += mblength;
+ lim = newlim;
+ }
+# endif
+
+ if (echar != 0)
+ {
+ /* If we're skipping leading blanks, don't start counting characters
+ * until after skipping past any leading blanks. */
+ if (key->skipeblanks)
+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
+
+ memset (&state, '\0', sizeof(mbstate_t));
+
+ /* Advance PTR by ECHAR (if possible), but no further than LIM. */
+ for (i = 0; i < echar; i++)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+
+ if (ptr + mblength > lim)
+ break;
+ else
+ ptr += mblength;
+ }
+ }
+
+ return ptr;
+}
+#endif
+
+static void
+skipblanks_uni (char **ptr, char *lim)
+{
+ while (*ptr < lim && blanks[to_uchar (**ptr)])
+ ++(*ptr);
+}
+
+#if HAVE_MBRTOWC
+static void
+skipblanks_mb (char **ptr, char *lim)
+{
+ size_t mblength;
+ while (*ptr < lim && ismbblank (*ptr, lim - *ptr, &mblength))
+ (*ptr) += mblength;
+}
+#endif
+
/* Fill BUF reading from FP, moving buf->left bytes from the end
of buf->buf to the beginning first. If EOF is reached and the
file wasn't terminated by a newline, supply one. Set up BUF's line
@@ -1838,8 +2172,22 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file)
else
{
if (key->skipsblanks)
- while (blanks[to_uchar (*line_start)])
- line_start++;
+ {
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ size_t mblength;
+ while (line_start < line->keylim &&
+ ismbblank (line_start,
+ line->keylim - line_start,
+ &mblength))
+ line_start += mblength;
+ }
+ else
+#endif
+ while (blanks[to_uchar (*line_start)])
+ line_start++;
+ }
line->keybeg = line_start;
}
}
@@ -1977,12 +2325,10 @@ find_unit_order (char const *number)
ATTRIBUTE_PURE
static int
-human_numcompare (char const *a, char const *b)
+human_numcompare (char *a, char *b)
{
- while (blanks[to_uchar (*a)])
- a++;
- while (blanks[to_uchar (*b)])
- b++;
+ skipblanks(&a, a + strlen(a));
+ skipblanks(&b, b + strlen(b));
int diff = find_unit_order (a) - find_unit_order (b);
return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
@@ -1994,7 +2340,7 @@ human_numcompare (char const *a, char const *b)
ATTRIBUTE_PURE
static int
-numcompare (char const *a, char const *b)
+numcompare_uni (const char *a, const char *b)
{
while (blanks[to_uchar (*a)])
a++;
@@ -2004,6 +2350,25 @@ numcompare (char const *a, char const *b)
return strnumcmp (a, b, decimal_point, thousands_sep);
}
+#if HAVE_MBRTOWC
+static int
+numcompare_mb (const char *a, const char *b)
+{
+ size_t mblength, len;
+ len = strlen (a); /* okay for UTF-8 */
+ while (*a && ismbblank (a, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
+ {
+ a += mblength;
+ len -= mblength;
+ }
+ len = strlen (b); /* okay for UTF-8 */
+ while (*b && ismbblank (b, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
+ b += mblength;
+
+ return strnumcmp (a, b, decimal_point, thousands_sep);
+}
+#endif /* HAV_EMBRTOWC */
+
static int
nan_compare (long double a, long double b)
{
@@ -2045,7 +2410,7 @@ general_numcompare (char const *sa, char const *sb)
Return 0 if the name in S is not recognized. */
static int
-getmonth (char const *month, char **ea)
+getmonth_uni (char const *month, size_t len, char **ea)
{
size_t lo = 0;
size_t hi = MONTHS_PER_YEAR;
@@ -2372,15 +2737,14 @@ debug_key (struct line const *line, struct keyfield const *key)
char saved = *lim;
*lim = '\0';
- while (blanks[to_uchar (*beg)])
- beg++;
+ skipblanks (&beg, lim);
char *tighter_lim = beg;
if (lim < beg)
tighter_lim = lim;
else if (key->month)
- getmonth (beg, &tighter_lim);
+ getmonth (beg, lim-beg, &tighter_lim);
else if (key->general_numeric)
ignore_value (strtold (beg, &tighter_lim));
else if (key->numeric || key->human_numeric)
@@ -2526,7 +2890,7 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
/* Warn about significant leading blanks. */
bool implicit_skip = key_numeric (key) || key->month;
bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
- if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
+ if (!zero_width && !gkey_only && !tab_length && !line_offset
&& ((!key->skipsblanks && !implicit_skip)
|| (!key->skipsblanks && key->schar)
|| (!key->skipeblanks && key->echar)))
@@ -2574,9 +2938,9 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
bool number_locale_warned = false;
if (basic_numeric_field_span)
{
- if (tab == TAB_DEFAULT
- ? thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep)))
- : tab == thousands_sep)
+ if (tab_length
+ ? tab[0] == thousands_sep
+ : thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep))))
{
error (0, 0,
_("field separator %s is treated as a "
@@ -2587,9 +2951,9 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
}
if (basic_numeric_field_span || general_numeric_field_span)
{
- if (tab == TAB_DEFAULT
- ? thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point)))
- : tab == decimal_point)
+ if (tab_length
+ ? tab[0] == decimal_point
+ : thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point))))
{
error (0, 0,
_("field separator %s is treated as a "
@@ -2597,19 +2961,19 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
quote (((char []) {decimal_point, 0})));
number_locale_warned = true;
}
- else if (tab == '-')
+ else if (tab_length && tab[0] == '-')
{
error (0, 0,
_("field separator %s is treated as a "
"minus sign in numbers"),
- quote (((char []) {tab, 0})));
+ quote (((char []) {tab[0], 0})));
}
- else if (general_numeric_field_span && tab == '+')
+ else if (general_numeric_field_span && tab_length && tab[0] == '+')
{
error (0, 0,
_("field separator %s is treated as a "
"plus sign in numbers"),
- quote (((char []) {tab, 0})));
+ quote (((char []) {tab[0], 0})));
}
}
@@ -2620,7 +2984,7 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
{
error (0, 0,
_("%snumbers use %s as a decimal point in this locale"),
- tab == decimal_point ? "" : _("note "),
+ (tab_length && tab[0] == decimal_point) ? "" : _("note "),
quote (((char []) {decimal_point, 0})));
}
@@ -2662,11 +3026,87 @@ diff_reversed (int diff, bool reversed)
return reversed ? (diff < 0) - (diff > 0) : diff;
}
+#if HAVE_MBRTOWC
+static int
+getmonth_mb (const char *s, size_t len, char **ea)
+{
+ char *month;
+ register size_t i;
+ register int lo = 0, hi = MONTHS_PER_YEAR, result;
+ char *tmp;
+ size_t wclength, mblength;
+ const char *pp;
+ const wchar_t *wpp;
+ wchar_t *month_wcs;
+ mbstate_t state;
+
+ while (len > 0 && ismbblank (s, len, &mblength))
+ {
+ s += mblength;
+ len -= mblength;
+ }
+
+ if (len == 0)
+ return 0;
+
+ if (SIZE_MAX - len < 1)
+ xalloc_die ();
+
+ month = (char *) xnmalloc (len + 1, MB_CUR_MAX);
+
+ pp = tmp = (char *) xnmalloc (len + 1, MB_CUR_MAX);
+ memcpy (tmp, s, len);
+ tmp[len] = '\0';
+ wpp = month_wcs = (wchar_t *) xnmalloc (len + 1, sizeof (wchar_t));
+ memset (&state, '\0', sizeof (mbstate_t));
+
+ wclength = mbsrtowcs (month_wcs, &pp, len + 1, &state);
+ if (wclength == (size_t)-1 || pp != nullptr)
+ error (SORT_FAILURE, 0, _("Invalid multibyte input %s."), quote(s));
+
+ for (i = 0; i < wclength; i++)
+ {
+ month_wcs[i] = towupper(month_wcs[i]);
+ if (iswblank (month_wcs[i]))
+ {
+ month_wcs[i] = L'\0';
+ break;
+ }
+ }
+
+ mblength = wcsrtombs (month, &wpp, (len + 1) * MB_CUR_MAX, &state);
+ assert (mblength != (-1) && wpp == nullptr);
+
+ do
+ {
+ int ix = (lo + hi) / 2;
+
+ if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
+ hi = ix;
+ else
+ lo = ix;
+ }
+ while (hi - lo > 1);
+
+ result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
+ ? monthtab[lo].val : 0);
+
+ if (ea && result)
+ *ea = (char*) s + strlen (monthtab[lo].name);
+
+ free (month);
+ free (tmp);
+ free (month_wcs);
+
+ return result;
+}
+#endif
+
/* Compare two lines A and B trying every key in sequence until there
are no more keys or a difference is found. */
static int
-keycompare (struct line const *a, struct line const *b)
+keycompare_uni (const struct line *a, const struct line *b)
{
struct keyfield *key = keylist;
@@ -2747,7 +3187,7 @@ keycompare (struct line const *a, struct line const *b)
else if (key->human_numeric)
diff = human_numcompare (ta, tb);
else if (key->month)
- diff = getmonth (ta, nullptr) - getmonth (tb, nullptr);
+ diff = getmonth (ta, tlena, nullptr) - getmonth (tb, tlenb, nullptr);
else if (key->random)
diff = compare_random (ta, tlena, tb, tlenb);
else if (key->version)
@@ -2857,6 +3297,211 @@ keycompare (struct line const *a, struct line const *b)
return diff_reversed (diff, key->reverse);
}
+#if HAVE_MBRTOWC
+static int
+keycompare_mb (const struct line *a, const struct line *b)
+{
+ struct keyfield *key = keylist;
+
+ /* For the first iteration only, the key positions have been
+ precomputed for us. */
+ char *texta = a->keybeg;
+ char *textb = b->keybeg;
+ char *lima = a->keylim;
+ char *limb = b->keylim;
+
+ size_t mblength_a, mblength_b;
+ wchar_t wc_a, wc_b;
+ mbstate_t state_a, state_b;
+
+ int diff = 0;
+
+ memset (&state_a, '\0', sizeof(mbstate_t));
+ memset (&state_b, '\0', sizeof(mbstate_t));
+ /* Ignore keys with start after end. */
+ if (a->keybeg - a->keylim > 0)
+ return 0;
+
+
+ /* Ignore and/or translate chars before comparing. */
+# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \
+ do \
+ { \
+ wchar_t uwc; \
+ char mbc[MB_LEN_MAX]; \
+ mbstate_t state_wc; \
+ \
+ for (NEW_LEN = i = 0; i < LEN;) \
+ { \
+ mbstate_t state_bak; \
+ \
+ state_bak = STATE; \
+ MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \
+ \
+ if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \
+ || MBLENGTH == 0) \
+ { \
+ if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \
+ STATE = state_bak; \
+ if (!ignore) \
+ COPY[NEW_LEN++] = TEXT[i]; \
+ i++; \
+ continue; \
+ } \
+ \
+ if (ignore) \
+ { \
+ if ((ignore == nonprinting && !iswprint (WC)) \
+ || (ignore == nondictionary \
+ && !iswalnum (WC) && !iswblank (WC))) \
+ { \
+ i += MBLENGTH; \
+ continue; \
+ } \
+ } \
+ \
+ if (translate) \
+ { \
+ \
+ uwc = towupper(WC); \
+ if (WC == uwc) \
+ { \
+ memcpy (mbc, TEXT + i, MBLENGTH); \
+ i += MBLENGTH; \
+ } \
+ else \
+ { \
+ i += MBLENGTH; \
+ WC = uwc; \
+ memset (&state_wc, '\0', sizeof (mbstate_t)); \
+ \
+ MBLENGTH = wcrtomb (mbc, WC, &state_wc); \
+ assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \
+ } \
+ \
+ for (j = 0; j < MBLENGTH; j++) \
+ COPY[NEW_LEN++] = mbc[j]; \
+ } \
+ else \
+ for (j = 0; j < MBLENGTH; j++) \
+ COPY[NEW_LEN++] = TEXT[i++]; \
+ } \
+ COPY[NEW_LEN] = '\0'; \
+ } \
+ while (0)
+
+ /* Actually compare the fields. */
+
+ for (;;)
+ {
+ /* Find the lengths. */
+ size_t lena = lima <= texta ? 0 : lima - texta;
+ size_t lenb = limb <= textb ? 0 : limb - textb;
+
+ char enda IF_LINT (= 0);
+ char endb IF_LINT (= 0);
+
+ char const *translate = key->translate;
+ bool const *ignore = key->ignore;
+
+ if (ignore || translate)
+ {
+ if (SIZE_MAX - lenb - 2 < lena)
+ xalloc_die ();
+ char *copy_a = (char *) xnmalloc (lena + lenb + 2, MB_CUR_MAX);
+ char *copy_b = copy_a + lena * MB_CUR_MAX + 1;
+ size_t new_len_a, new_len_b;
+ size_t i, j;
+
+ IGNORE_CHARS (new_len_a, lena, texta, copy_a,
+ wc_a, mblength_a, state_a);
+ IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
+ wc_b, mblength_b, state_b);
+ texta = copy_a; textb = copy_b;
+ lena = new_len_a; lenb = new_len_b;
+ }
+ else
+ {
+ /* Use the keys in-place, temporarily null-terminated. */
+ enda = texta[lena]; texta[lena] = '\0';
+ endb = textb[lenb]; textb[lenb] = '\0';
+ }
+
+ if (key->random)
+ diff = compare_random (texta, lena, textb, lenb);
+ else if (key->numeric | key->general_numeric | key->human_numeric)
+ {
+ char savea = *lima, saveb = *limb;
+
+ *lima = *limb = '\0';
+ diff = (key->numeric ? numcompare (texta, textb)
+ : key->general_numeric ? general_numcompare (texta, textb)
+ : human_numcompare (texta, textb));
+ *lima = savea, *limb = saveb;
+ }
+ else if (key->version)
+ diff = filevercmp (texta, textb);
+ else if (key->month)
+ diff = getmonth (texta, lena, nullptr) - getmonth (textb, lenb, nullptr);
+ else if (lena == 0)
+ diff = - NONZERO (lenb);
+ else if (lenb == 0)
+ diff = 1;
+ else if (hard_LC_COLLATE && !folding)
+ {
+ diff = xmemcoll0 (texta, lena + 1, textb, lenb + 1);
+ }
+ else
+ {
+ diff = memcmp (texta, textb, MIN (lena, lenb));
+ if (diff == 0)
+ diff = lena < lenb ? -1 : lena != lenb;
+ }
+
+ if (ignore || translate)
+ free (texta);
+ else
+ {
+ texta[lena] = enda;
+ textb[lenb] = endb;
+ }
+
+ if (diff)
+ goto not_equal;
+
+ key = key->next;
+ if (! key)
+ break;
+
+ /* Find the beginning and limit of the next field. */
+ if (key->eword != -1)
+ lima = limfield (a, key), limb = limfield (b, key);
+ else
+ lima = a->text + a->length - 1, limb = b->text + b->length - 1;
+
+ if (key->sword != -1)
+ texta = begfield (a, key), textb = begfield (b, key);
+ else
+ {
+ texta = a->text, textb = b->text;
+ if (key->skipsblanks)
+ {
+ while (texta < lima && ismbblank (texta, lima - texta, &mblength_a))
+ texta += mblength_a;
+ while (textb < limb && ismbblank (textb, limb - textb, &mblength_b))
+ textb += mblength_b;
+ }
+ }
+ }
+
+not_equal:
+ if (key && key->reverse)
+ return -diff;
+ else
+ return diff;
+}
+#endif
+
/* Compare two lines A and B, returning negative, zero, or positive
depending on whether A compares less than, equal to, or greater than B. */
@@ -2884,7 +3529,7 @@ compare (struct line const *a, struct line const *b)
diff = - NONZERO (blen);
else if (blen == 0)
diff = 1;
- else if (hard_LC_COLLATE)
+ else if (hard_LC_COLLATE && !folding)
{
/* xmemcoll0 is a performance enhancement as
it will not unconditionally write '\0' after the
@@ -4272,6 +4917,7 @@ set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
break;
case 'f':
key->translate = fold_toupper;
+ folding = true;
break;
case 'g':
key->general_numeric = true;
@@ -4351,7 +4997,7 @@ main (int argc, char **argv)
initialize_exit_failure (SORT_FAILURE);
hard_LC_COLLATE = hard_locale (LC_COLLATE);
-#if HAVE_NL_LANGINFO
+#if HAVE_LANGINFO_CODESET
hard_LC_TIME = hard_locale (LC_TIME);
#endif
@@ -4374,6 +5020,29 @@ main (int argc, char **argv)
thousands_sep = NON_CHAR;
}
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ inittables = inittables_mb;
+ begfield = begfield_mb;
+ limfield = limfield_mb;
+ skipblanks = skipblanks_mb;
+ getmonth = getmonth_mb;
+ keycompare = keycompare_mb;
+ numcompare = numcompare_mb;
+ }
+ else
+#endif
+ {
+ inittables = inittables_uni;
+ begfield = begfield_uni;
+ limfield = limfield_uni;
+ skipblanks = skipblanks_uni;
+ getmonth = getmonth_uni;
+ keycompare = keycompare_uni;
+ numcompare = numcompare_uni;
+ }
+
have_read_stdin = false;
inittables ();
@@ -4644,13 +5313,34 @@ main (int argc, char **argv)
case 't':
{
- char newtab = optarg[0];
- if (! newtab)
+ char newtab[MB_LEN_MAX + 1];
+ size_t newtab_length = 1;
+ strncpy (newtab, optarg, MB_LEN_MAX);
+ if (! newtab[0])
error (SORT_FAILURE, 0, _("empty tab"));
- if (optarg[1])
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ wchar_t wc;
+ mbstate_t state;
+
+ memset (&state, '\0', sizeof (mbstate_t));
+ newtab_length = mbrtowc (&wc, newtab, strnlen (newtab,
+ MB_LEN_MAX),
+ &state);
+ switch (newtab_length)
+ {
+ case (size_t) -1:
+ case (size_t) -2:
+ case 0:
+ newtab_length = 1;
+ }
+ }
+#endif
+ if (newtab_length == 1 && optarg[1])
{
if (STREQ (optarg, "\\0"))
- newtab = '\0';
+ newtab[0] = '\0';
else
{
/* Provoke with 'sort -txx'. Complain about
@@ -4661,9 +5351,11 @@ main (int argc, char **argv)
quote (optarg));
}
}
- if (tab != TAB_DEFAULT && tab != newtab)
+ if (tab_length && (tab_length != newtab_length
+ || memcmp (tab, newtab, tab_length) != 0))
error (SORT_FAILURE, 0, _("incompatible tabs"));
- tab = newtab;
+ memcpy (tab, newtab, newtab_length);
+ tab_length = newtab_length;
}
break;
diff --git a/src/unexpand.c b/src/unexpand.c
index aca67dd73..ebad4848b 100644
--- a/src/unexpand.c
+++ b/src/unexpand.c
@@ -39,6 +39,9 @@
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
+
+#include <mbfile.h>
+
#include "system.h"
#include "expand-common.h"
@@ -105,24 +108,47 @@ unexpand (void)
{
/* Input stream. */
FILE *fp = next_file (nullptr);
+ mb_file_t mbf;
/* The array of pending blanks. In non-POSIX locales, blanks can
include characters other than spaces, so the blanks must be
stored, not merely counted. */
- char *pending_blank;
+ mbf_char_t *pending_blank;
+ /* True if the starting locale is utf8. */
+ bool using_utf_locale;
+
+ /* True if the first file contains BOM header. */
+ bool found_bom;
+ using_utf_locale=check_utf_locale();
if (!fp)
return;
+ mbf_init (mbf, fp);
+ found_bom=check_bom(fp,&mbf);
+
+ if (using_utf_locale == false && found_bom == true)
+ {
+ /*try using some predefined locale */
+ if (set_utf_locale () != 0)
+ {
+ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale"));
+ }
+ }
/* The worst case is a non-blank character, then one blank, then a
tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so
allocate MAX_COLUMN_WIDTH bytes to store the blanks. */
- pending_blank = xmalloc (max_column_width);
+ pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t));
+
+ if (found_bom == true)
+ {
+ print_bom();
+ }
while (true)
{
/* Input character, or EOF. */
- int c;
+ mbf_char_t c;
/* If true, perform translations. */
bool convert = true;
@@ -156,12 +182,44 @@ unexpand (void)
do
{
- while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
- continue;
+ while (true) {
+ mbf_getc (c, mbf);
+ if ((mb_iseof (c)) && (fp = next_file (fp)))
+ {
+ mbf_init (mbf, fp);
+ if (fp!=nullptr)
+ {
+ if (check_bom(fp,&mbf)==true)
+ {
+ /*Not the first file - check BOM header*/
+ if (using_utf_locale==false && found_bom==false)
+ {
+ /*BOM header in subsequent file but not in the first one. */
+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
+ }
+ }
+ else
+ {
+ if(using_utf_locale==false && found_bom==true)
+ {
+ /*First file conatined BOM header - locale was switched to UTF
+ *all subsequent files should contain BOM. */
+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header"));
+ }
+ }
+ }
+ continue;
+ }
+ else
+ {
+ break;
+ }
+ }
+
if (convert)
{
- bool blank = !! isblank (c);
+ bool blank = mb_isblank (c);
if (blank)
{
@@ -178,16 +236,16 @@ unexpand (void)
if (next_tab_column < column)
error (EXIT_FAILURE, 0, _("input line is too long"));
- if (c == '\t')
+ if (mb_iseq (c, '\t'))
{
column = next_tab_column;
if (pending)
- pending_blank[0] = '\t';
+ mb_setascii (&pending_blank[0], '\t');
}
else
{
- column++;
+ column += mb_width (c);
if (! (prev_blank && column == next_tab_column))
{
@@ -195,13 +253,14 @@ unexpand (void)
will be replaced by tabs. */
if (column == next_tab_column)
one_blank_before_tab_stop = true;
- pending_blank[pending++] = c;
+ mb_copy (&pending_blank[pending++], &c);
prev_blank = true;
continue;
}
/* Replace the pending blanks by a tab or two. */
- pending_blank[0] = c = '\t';
+ mb_setascii (&c, '\t');
+ mb_setascii (&pending_blank[0], '\t');
}
/* Discard pending blanks, unless it was a single
@@ -209,7 +268,7 @@ unexpand (void)
pending = one_blank_before_tab_stop;
}
}
- else if (c == '\b')
+ else if (mb_iseq (c, '\b'))
{
/* Go back one column, and force recalculation of the
next tab stop. */
@@ -219,16 +278,20 @@ unexpand (void)
}
else
{
- column++;
- if (!column)
+ const uintmax_t orig_column = column;
+ column += mb_width (c);
+ if (column < orig_column)
error (EXIT_FAILURE, 0, _("input line is too long"));
}
if (pending)
{
if (pending > 1 && one_blank_before_tab_stop)
- pending_blank[0] = '\t';
- if (fwrite (pending_blank, 1, pending, stdout) != pending)
+ mb_setascii (&pending_blank[0], '\t');
+
+ for (int n = 0; n < pending; ++n)
+ mb_putc (pending_blank[n], stdout);
+ if (ferror (stdout))
write_error ();
pending = 0;
one_blank_before_tab_stop = false;
@@ -238,16 +301,17 @@ unexpand (void)
convert &= convert_entire_line || blank;
}
- if (c < 0)
+ if (mb_iseof (c))
{
free (pending_blank);
return;
}
- if (putchar (c) < 0)
+ mb_putc (c, stdout);
+ if (ferror (stdout))
write_error ();
}
- while (c != '\n');
+ while (!mb_iseq (c, '\n'));
}
}
diff --git a/tests/Coreutils.pm b/tests/Coreutils.pm
index 18e7bea84..24a141b16 100644
--- a/tests/Coreutils.pm
+++ b/tests/Coreutils.pm
@@ -269,6 +269,9 @@ sub run_tests ($$$$$)
# Yes, this is an arbitrary limit. If it causes trouble,
# consider removing it.
my $max = 30;
+ # The downstream i18n multi-byte tests have a "-mb" suffix.
+ # Therefore add 3 to the maximum test name length.
+ $max += 3;
if ($max < length $test_name)
{
warn "$program_name: $test_name: test name is too long (> $max)\n";
diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh
new file mode 100755
index 000000000..dd6007c00
--- /dev/null
+++ b/tests/expand/mb.sh
@@ -0,0 +1,183 @@
+#!/bin/sh
+
+# Copyright (C) 2012-2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ expand
+
+export LC_ALL=en_US.UTF-8
+
+#input containing multibyte characters
+cat <<\EOF > in || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+EOF
+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
+
+cat <<\EOF > exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#multiple files as an input
+cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+expand ./in ./in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#test characters with display widths != 1
+env printf '12345678
+e\t|ascii(1)
+\u00E9\t|composed(1)
+e\u0301\t|decomposed(1)
+\u3000\t|ideo-space(2)
+\uFF0D\t|full-hypen(2)
+' > in || framework_failure_
+
+env printf '12345678
+e |ascii(1)
+\u00E9 |composed(1)
+e\u0301 |decomposed(1)
+\u3000 |ideo-space(2)
+\uFF0D |full-hypen(2)
+' > exp || framework_failure_
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#shouldn't fail with "input line too long"
+#when a line starts with a control character
+env printf '\n' > in || framework_failure_
+
+expand < in > out || fail=1
+compare in out > /dev/null 2>&1 || fail=1
+
+#non-Unicode characters interspersed between Unicode ones
+env printf '12345678
+\t\xFF|
+\xFF\t|
+\t\xFFä|
+ä\xFF\t|
+\tä\xFF|
+\xFF\tä|
+äbcdef\xFF\t|
+' > in || framework_failure_
+
+env printf '12345678
+ \xFF|
+\xFF |
+ \xFFä|
+ä\xFF |
+ ä\xFF|
+\xFF ä|
+äbcdef\xFF |
+' > exp || framework_failure_
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+
+
+#BOM header test 1
+printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+EOF
+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
+
+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+LANG=C expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+LC_ALL=C expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+
+printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+EOF
+env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_
+
+
+printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+expand in1 in1 > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+LANG=C expand in1 in1 > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+LC_ALL=C expand in1 in1 > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+exit $fail
diff --git a/tests/i18n/sort.sh b/tests/i18n/sort.sh
new file mode 100755
index 000000000..26c95de9a
--- /dev/null
+++ b/tests/i18n/sort.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+# Verify sort's multi-byte support.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ sort
+
+export LC_ALL=en_US.UTF-8
+locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
+ || skip_ "No UTF-8 locale available"
+
+# Enable heap consistency checkng on older systems
+export MALLOC_CHECK_=2
+
+
+# check buffer overflow issue due to
+# expanding multi-byte representation due to case conversion
+# https://bugzilla.suse.com/show_bug.cgi?id=928749
+cat <<EOF > exp
+.
+ɑ
+EOF
+cat <<EOF | sort -f > out || fail=1
+.
+ɑ
+EOF
+compare exp out || { fail=1; cat out; }
+
+
+Exit $fail
diff --git a/tests/local.mk b/tests/local.mk
index fdbf36946..6d3d4cfab 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -387,6 +387,8 @@ all_tests = \
tests/sort/sort-discrim.sh \
tests/sort/sort-files0-from.pl \
tests/sort/sort-float.sh \
+ tests/sort/sort-mb-tests.sh \
+ tests/i18n/sort.sh \
tests/sort/sort-h-thousands-sep.sh \
tests/sort/sort-merge.pl \
tests/sort/sort-merge-fdlimit.sh \
@@ -590,6 +592,7 @@ all_tests = \
tests/du/threshold.sh \
tests/du/trailing-slash.sh \
tests/du/two-args.sh \
+ tests/expand/mb.sh \
tests/id/gnu-zero-uids.sh \
tests/id/no-context.sh \
tests/id/context.sh \
@@ -746,6 +749,7 @@ all_tests = \
tests/touch/read-only.sh \
tests/touch/relative.sh \
tests/touch/trailing-slash.sh \
+ tests/unexpand/mb.sh \
$(all_root_tests)
# See tests/factor/create-test.sh.
diff --git a/tests/misc/expand.pl b/tests/misc/expand.pl
index 11f3fc4bf..d609a2cb0 100755
--- a/tests/misc/expand.pl
+++ b/tests/misc/expand.pl
@@ -27,6 +27,15 @@ my $prog = 'expand';
# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
+#comment out next line to disable multibyte tests
+my $mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
+my $prog = 'expand';
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
my @Tests =
(
['t1', '--tabs=3', {IN=>"a\tb"}, {OUT=>"a b"}],
@@ -168,6 +177,8 @@ my @Tests =
# Test errors
+ # FIXME: The following tests contain quoting specific to LC_MESSAGES
+ # So we force LC_MESSAGES=C to make them pass.
['e1', '--tabs="a"', {IN=>''}, {OUT=>''}, {EXIT=>1},
{ERR => "$prog: tab size contains invalid character(s): 'a'\n"}],
['e2', "-t $UINTMAX_OFLOW", {IN=>''}, {OUT=>''}, {EXIT=>1},
@@ -184,6 +195,37 @@ my @Tests =
{ERR => "$prog: '/' specifier not at start of number: '/'\n"}],
);
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether expand is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ push @new, ["$test_name-mb", @new_t, {ENV => "LANG=$mb_locale LC_MESSAGES=C"}];
+ }
+ push @Tests, @new;
+ }
+
+
+@Tests = triple_test \@Tests;
+
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
diff --git a/tests/misc/fold.pl b/tests/misc/fold.pl
index 00b43624f..7d51bea55 100755
--- a/tests/misc/fold.pl
+++ b/tests/misc/fold.pl
@@ -20,9 +20,18 @@ use strict;
(my $program_name = $0) =~ s|.*/||;
+my $prog = 'fold';
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
+# uncommented to enable multibyte paths
+my $mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
my @Tests =
(
['s1', '-w2 -s', {IN=>"a\t"}, {OUT=>"a\n\t"}],
@@ -31,9 +40,48 @@ my @Tests =
['s4', '-w4 -s', {IN=>"abc ef\n"}, {OUT=>"abc \nef\n"}],
);
+# Add _POSIX2_VERSION=199209 to the environment of each test
+# that uses an old-style option like +1.
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether fold is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
+@Tests = triple_test \@Tests;
+
+# Remember that triple_test creates from each test with exactly one "IN"
+# file two more tests (.p and .r suffix on name) corresponding to reading
+# input from a file and from a pipe. The pipe-reading test would fail
+# due to a race condition about 1 in 20 times.
+# Remove the IN_PIPE version of the "output-is-input" test above.
+# The others aren't susceptible because they have three inputs each.
+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
+
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
-my $prog = 'fold';
my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
exit $fail;
diff --git a/tests/misc/unexpand.pl b/tests/misc/unexpand.pl
index 76bcbd4d7..59eb8199b 100755
--- a/tests/misc/unexpand.pl
+++ b/tests/misc/unexpand.pl
@@ -27,6 +27,14 @@ my $limits = getlimits ();
my $prog = 'unexpand';
+# comment out next line to disable multibyte tests
+my $mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
my @Tests =
(
['a1', {IN=> ' 'x 1 ."y\n"}, {OUT=> ' 'x 1 ."y\n"}],
@@ -128,6 +136,37 @@ my @Tests =
['ts2', '-t5,8', {IN=>"x\t \t y\n"}, {OUT=>"x\t\t y\n"}],
);
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether unexpand is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ next if ($test_name =~ 'b-1');
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
+@Tests = triple_test \@Tests;
+
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
diff --git a/tests/pr/pr-tests.pl b/tests/pr/pr-tests.pl
index 6b34e0b69..34b4aeb06 100755
--- a/tests/pr/pr-tests.pl
+++ b/tests/pr/pr-tests.pl
@@ -24,6 +24,15 @@ use strict;
my $prog = 'pr';
my $normalize_strerror = "s/': .*/'/";
+my $mb_locale;
+#Uncomment the following line to enable multibyte tests
+$mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
my @tv = (
# -b option is no longer an official option. But it's still working to
@@ -515,8 +524,48 @@ push @Tests,
{IN=>"x\tx\tx\tx\tx\nx\tx\tx\tx\tx\n"},
{OUT=>"x\tx\tx\tx\tx\tx\tx\tx\tx\tx\n"} ];
+# Add _POSIX2_VERSION=199209 to the environment of each test
+# that uses an old-style option like +1.
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether pr is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ #temporarily skip some failing tests
+ next if ($test_name =~ "col-0" or $test_name =~ "col-inval" or $test_name =~ "asan1");
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
@Tests = triple_test \@Tests;
+# Remember that triple_test creates from each test with exactly one "IN"
+# file two more tests (.p and .r suffix on name) corresponding to reading
+# input from a file and from a pipe. The pipe-reading test would fail
+# due to a race condition about 1 in 20 times.
+# Remove the IN_PIPE version of the "output-is-input" test above.
+# The others aren't susceptible because they have three inputs each.
+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
+
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
diff --git a/tests/sort/sort-mb-tests.sh b/tests/sort/sort-mb-tests.sh
new file mode 100755
index 000000000..11836baa5
--- /dev/null
+++ b/tests/sort/sort-mb-tests.sh
@@ -0,0 +1,45 @@
+#!/bin/sh
+# Verify sort's multi-byte support.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ sort
+
+export LC_ALL=en_US.UTF-8
+locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
+ || skip_ "No UTF-8 locale available"
+
+
+cat <<EOF > exp
+Banana5
+Apple10
+Citrus20
+Cherry30
+EOF
+
+cat <<EOF | sort -t -k2 -n > out || fail=1
+Apple10
+Banana5
+Citrus20
+Cherry30
+EOF
+
+compare exp out || { fail=1; cat out; }
+
+
+cat <<EOF > exp
+Citrus205
+Cherry3010
+Apple1020
+Banana530
+EOF
+
+cat <<EOF | sort -t -k4 -n > out || fail=1
+Apple1020
+Banana530
+Citrus205
+Cherry3010
+EOF
+
+compare exp out || { fail=1; cat out; }
+
+Exit $fail
diff --git a/tests/sort/sort-merge.pl b/tests/sort/sort-merge.pl
index 89eed0c64..b855d738b 100755
--- a/tests/sort/sort-merge.pl
+++ b/tests/sort/sort-merge.pl
@@ -26,6 +26,15 @@ my $prog = 'sort';
# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
+my $mb_locale;
+# uncommented according to upstream commit enabling multibyte paths
+$mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
# three empty files and one that says 'foo'
my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}});
@@ -77,6 +86,39 @@ my @Tests =
{OUT=>$big_input}],
);
+# Add _POSIX2_VERSION=199209 to the environment of each test
+# that uses an old-style option like +1.
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether sort is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ next if ($test_name =~ "nmerge-.");
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
+@Tests = triple_test \@Tests;
+
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
diff --git a/tests/sort/sort.pl b/tests/sort/sort.pl
index d49f65f66..ebba92550 100755
--- a/tests/sort/sort.pl
+++ b/tests/sort/sort.pl
@@ -24,10 +24,15 @@ my $prog = 'sort';
# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
-my $mb_locale = $ENV{LOCALE_FR_UTF8};
+my $mb_locale;
+#Comment out next line to disable multibyte tests
+$mb_locale = $ENV{LOCALE_FR_UTF8};
! defined $mb_locale || $mb_locale eq 'none'
and $mb_locale = 'C';
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
# Since each test is run with a file name and with redirected stdin,
# the name in the diagnostic is either the file name or "-".
# Normalize each diagnostic to use '-'.
@@ -423,6 +428,38 @@ foreach my $t (@Tests)
}
}
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether sort is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ #disable several failing tests until investigation, disable all tests with envvars set
+ next if (grep {ref $_ eq 'HASH' && exists $_->{ENV}} (@new_t));
+ next if ($test_name =~ "18g" or $test_name =~ "sort-numeric" or $test_name =~ "08[ab]" or $test_name =~ "03[def]" or $test_name =~ "h4" or $test_name =~ "n1" or $test_name =~ "2[01]a");
+ next if ($test_name =~ "11[ab]"); # avoid FP: expected result differs to MB result due to collation rules.
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
@Tests = triple_test \@Tests;
# Remember that triple_test creates from each test with exactly one "IN"
@@ -432,6 +469,7 @@ foreach my $t (@Tests)
# Remove the IN_PIPE version of the "output-is-input" test above.
# The others aren't susceptible because they have three inputs each.
@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
+@Tests = grep {$_->[0] ne 'output-is-input-mb.p'} @Tests;
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh
new file mode 100755
index 000000000..8a82d745a
--- /dev/null
+++ b/tests/unexpand/mb.sh
@@ -0,0 +1,172 @@
+#!/bin/sh
+
+# Copyright (C) 2012-2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ unexpand
+
+export LC_ALL=en_US.UTF-8
+
+#input containing multibyte characters
+cat > in <<\EOF
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+cat > exp <<\EOF
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+
+#multiple files as an input
+cat >> exp <<\EOF
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+
+unexpand -a ./in ./in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#test characters with a display width larger than 1
+
+env printf '12345678
+e |ascii(1)
+\u00E9 |composed(1)
+e\u0301 |decomposed(1)
+\u3000 |ideo-space(2)
+\uFF0D |full-hypen(2)
+' > in || framework_failure_
+
+env printf '12345678
+e\t|ascii(1)
+\u00E9\t|composed(1)
+e\u0301\t|decomposed(1)
+\u3000\t|ideo-space(2)
+\uFF0D\t|full-hypen(2)
+' > exp || framework_failure_
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#test input where a blank of width > 1 is not being substituted
+in="$(LC_ALL=en_US.UTF-8 printf ' \u3000 ö ü ß')"
+exp='   ö ü ß'
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#non-Unicode characters interspersed between Unicode ones
+env printf '12345678
+ \xFF|
+\xFF |
+ \xFFä|
+ä\xFF |
+ ä\xFF|
+\xFF ä|
+äbcdef\xFF |
+' > in || framework_failure_
+
+env printf '12345678
+\t\xFF|
+\xFF\t|
+\t\xFFä|
+ä\xFF\t|
+\tä\xFF|
+\xFF\tä|
+äbcdef\xFF\t|
+' > exp || framework_failure_
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#BOM header test 1
+printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
+
+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+unexpand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+LANG=C unexpand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+LC_ALL=C unexpand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+
+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+
+unexpand in in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+LANG=C unexpand in in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+LC_ALL=C unexpand in in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
--
2.44.0
The following part is directly taken from coreutils-9.4-i18n-1.patch.
diff --git a/lib/mbfile.c b/lib/mbfile.c
new file mode 100644
index 0000000..b0a468e
--- /dev/null
+++ b/lib/mbfile.c
@@ -0,0 +1,3 @@
+#include <config.h>
+#define MBFILE_INLINE _GL_EXTERN_INLINE
+#include "mbfile.h"
diff --git a/lib/mbfile.h b/lib/mbfile.h
new file mode 100644
index 0000000..11f1b12
--- /dev/null
+++ b/lib/mbfile.h
@@ -0,0 +1,255 @@
+/* Multibyte character I/O: macros for multi-byte encodings.
+ Copyright (C) 2001, 2005, 2009-2015 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* Written by Mitsuru Chinen <mchinen@yamato.ibm.com>
+ and Bruno Haible <bruno@clisp.org>. */
+
+/* The macros in this file implement multi-byte character input from a
+ stream.
+
+ mb_file_t
+ is the type for multibyte character input stream, usable for variable
+ declarations.
+
+ mbf_char_t
+ is the type for multibyte character or EOF, usable for variable
+ declarations.
+
+ mbf_init (mbf, stream)
+ initializes the MB_FILE for reading from stream.
+
+ mbf_getc (mbc, mbf)
+ reads the next multibyte character from mbf and stores it in mbc.
+
+ mb_iseof (mbc)
+ returns true if mbc represents the EOF value.
+
+ Here are the function prototypes of the macros.
+
+ extern void mbf_init (mb_file_t mbf, FILE *stream);
+ extern void mbf_getc (mbf_char_t mbc, mb_file_t mbf);
+ extern bool mb_iseof (const mbf_char_t mbc);
+ */
+
+#ifndef _MBFILE_H
+#define _MBFILE_H 1
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
+ <wchar.h>.
+ BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
+ <wchar.h>. */
+#include <stdio.h>
+#include <time.h>
+#include <wchar.h>
+
+#include "mbchar.h"
+
+#ifndef _GL_INLINE_HEADER_BEGIN
+ #error "Please include config.h first."
+#endif
+_GL_INLINE_HEADER_BEGIN
+#ifndef MBFILE_INLINE
+# define MBFILE_INLINE _GL_INLINE
+#endif
+
+struct mbfile_multi {
+ FILE *fp;
+ bool eof_seen;
+ bool have_pushback;
+ mbstate_t state;
+ unsigned int bufcount;
+ char buf[MBCHAR_BUF_SIZE];
+ struct mbchar pushback;
+};
+
+MBFILE_INLINE void
+mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
+{
+ size_t bytes;
+
+ /* If EOF has already been seen, don't use getc. This matters if
+ mbf->fp is connected to an interactive tty. */
+ if (mbf->eof_seen)
+ goto eof;
+
+ /* Return character pushed back, if there is one. */
+ if (mbf->have_pushback)
+ {
+ mb_copy (mbc, &mbf->pushback);
+ mbf->have_pushback = false;
+ return;
+ }
+
+ /* Before using mbrtowc, we need at least one byte. */
+ if (mbf->bufcount == 0)
+ {
+ int c = getc (mbf->fp);
+ if (c == EOF)
+ {
+ mbf->eof_seen = true;
+ goto eof;
+ }
+ mbf->buf[0] = (unsigned char) c;
+ mbf->bufcount++;
+ }
+
+ /* Handle most ASCII characters quickly, without calling mbrtowc(). */
+ if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0]))
+ {
+ /* These characters are part of the basic character set. ISO C 99
+ guarantees that their wide character code is identical to their
+ char code. */
+ mbc->wc = mbc->buf[0] = mbf->buf[0];
+ mbc->wc_valid = true;
+ mbc->ptr = &mbc->buf[0];
+ mbc->bytes = 1;
+ mbf->bufcount = 0;
+ return;
+ }
+
+ /* Use mbrtowc on an increasing number of bytes. Read only as many bytes
+ from mbf->fp as needed. This is needed to give reasonable interactive
+ behaviour when mbf->fp is connected to an interactive tty. */
+ for (;;)
+ {
+ /* We don't know whether the 'mbrtowc' function updates the state when
+ it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or
+ not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour. We
+ don't have an autoconf test for this, yet.
+ The new behaviour would allow us to feed the bytes one by one into
+ mbrtowc. But the old behaviour forces us to feed all bytes since
+ the end of the last character into mbrtowc. Since we want to retry
+ with more bytes when mbrtowc returns -2, we must backup the state
+ before calling mbrtowc, because implementations with the new
+ behaviour will clobber it. */
+ mbstate_t backup_state = mbf->state;
+
+ bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
+
+ if (bytes == (size_t) -1)
+ {
+ /* An invalid multibyte sequence was encountered. */
+ /* Return a single byte. */
+ bytes = 1;
+ mbc->wc_valid = false;
+ break;
+ }
+ else if (bytes == (size_t) -2)
+ {
+ /* An incomplete multibyte character. */
+ mbf->state = backup_state;
+ if (mbf->bufcount == MBCHAR_BUF_SIZE)
+ {
+ /* An overlong incomplete multibyte sequence was encountered. */
+ /* Return a single byte. */
+ bytes = 1;
+ mbc->wc_valid = false;
+ break;
+ }
+ else
+ {
+ /* Read one more byte and retry mbrtowc. */
+ int c = getc (mbf->fp);
+ if (c == EOF)
+ {
+ /* An incomplete multibyte character at the end. */
+ mbf->eof_seen = true;
+ bytes = mbf->bufcount;
+ mbc->wc_valid = false;
+ break;
+ }
+ mbf->buf[mbf->bufcount] = (unsigned char) c;
+ mbf->bufcount++;
+ }
+ }
+ else
+ {
+ if (bytes == 0)
+ {
+ /* A null wide character was encountered. */
+ bytes = 1;
+ assert (mbf->buf[0] == '\0');
+ assert (mbc->wc == 0);
+ }
+ mbc->wc_valid = true;
+ break;
+ }
+ }
+
+ /* Return the multibyte sequence mbf->buf[0..bytes-1]. */
+ mbc->ptr = &mbc->buf[0];
+ memcpy (&mbc->buf[0], &mbf->buf[0], bytes);
+ mbc->bytes = bytes;
+
+ mbf->bufcount -= bytes;
+ if (mbf->bufcount > 0)
+ {
+ /* It's not worth calling memmove() for so few bytes. */
+ unsigned int count = mbf->bufcount;
+ char *p = &mbf->buf[0];
+
+ do
+ {
+ *p = *(p + bytes);
+ p++;
+ }
+ while (--count > 0);
+ }
+ return;
+
+eof:
+ /* An mbchar_t with bytes == 0 is used to indicate EOF. */
+ mbc->ptr = NULL;
+ mbc->bytes = 0;
+ mbc->wc_valid = false;
+ return;
+}
+
+MBFILE_INLINE void
+mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf)
+{
+ mb_copy (&mbf->pushback, mbc);
+ mbf->have_pushback = true;
+}
+
+typedef struct mbfile_multi mb_file_t;
+
+typedef mbchar_t mbf_char_t;
+
+#define mbf_init(mbf, stream) \
+ ((mbf).fp = (stream), \
+ (mbf).eof_seen = false, \
+ (mbf).have_pushback = false, \
+ memset (&(mbf).state, '\0', sizeof (mbstate_t)), \
+ (mbf).bufcount = 0)
+
+#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf))
+
+#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf))
+
+#define mb_iseof(mbc) ((mbc).bytes == 0)
+
+#ifndef _GL_INLINE_HEADER_BEGIN
+ #error "Please include config.h first."
+#endif
+_GL_INLINE_HEADER_BEGIN
+
+#endif /* _MBFILE_H */
diff --git a/m4/mbfile.m4 b/m4/mbfile.m4
new file mode 100644
index 0000000..8589902
--- /dev/null
+++ b/m4/mbfile.m4
@@ -0,0 +1,14 @@
+# mbfile.m4 serial 7
+dnl Copyright (C) 2005, 2008-2015 Free Software Foundation, Inc.
+dnl This file is free software; the Free Software Foundation
+dnl gives unlimited permission to copy and/or distribute it,
+dnl with or without modifications, as long as this notice is preserved.
+
+dnl autoconf tests required for use of mbfile.h
+dnl From Bruno Haible.
+
+AC_DEFUN([gl_MBFILE],
+[
+ AC_REQUIRE([AC_TYPE_MBSTATE_T])
+ :
+])
The following is taken from mbchar.h in coreutils-9.4.tar.xz.
Added two "static"s before MBCHAR_INLINE to avoid adding mbchar.c
(doing so is more nasty).
diff --git a/lib/mbchar.h b/lib/mbchar.h
new file mode 100644
--- /dev/null
+++ b/lib/mbchar.h
@@ -0,0 +1,373 @@
+/* Multibyte character data type.
+ Copyright (C) 2001, 2005-2007, 2009-2023 Free Software Foundation, Inc.
+
+ This file is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as
+ published by the Free Software Foundation; either version 2.1 of the
+ License, or (at your option) any later version.
+
+ This file is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+/* Written by Bruno Haible <bruno@clisp.org>. */
+
+/* A multibyte character is a short subsequence of a char* string,
+ representing a single 32-bit wide character.
+
+ We use multibyte characters instead of 32-bit wide characters because
+ of the following goals:
+ 1) correct multibyte handling, i.e. operate according to the LC_CTYPE
+ locale,
+ 2) ease of maintenance, i.e. the maintainer needs not know all details
+ of the ISO C 99 standard,
+ 3) don't fail grossly if the input is not in the encoding set by the
+ locale, because often different encodings are in use in the same
+ countries (ISO-8859-1/UTF-8, EUC-JP/Shift_JIS, ...),
+ 4) fast in the case of ASCII characters.
+
+ Multibyte characters are only accessed through the mb* macros.
+
+ mb_ptr (mbc)
+ return a pointer to the beginning of the multibyte sequence.
+
+ mb_len (mbc)
+ returns the number of bytes occupied by the multibyte sequence.
+ Always > 0.
+
+ mb_iseq (mbc, sc)
+ returns true if mbc is the standard ASCII character sc.
+
+ mb_isnul (mbc)
+ returns true if mbc is the nul character.
+
+ mb_cmp (mbc1, mbc2)
+ returns a positive, zero, or negative value depending on whether mbc1
+ sorts after, same or before mbc2.
+
+ mb_casecmp (mbc1, mbc2)
+ returns a positive, zero, or negative value depending on whether mbc1
+ sorts after, same or before mbc2, modulo upper/lowercase conversion.
+
+ mb_equal (mbc1, mbc2)
+ returns true if mbc1 and mbc2 are equal.
+
+ mb_caseequal (mbc1, mbc2)
+ returns true if mbc1 and mbc2 are equal modulo upper/lowercase conversion.
+
+ mb_isalnum (mbc)
+ returns true if mbc is alphanumeric.
+
+ mb_isalpha (mbc)
+ returns true if mbc is alphabetic.
+
+ mb_isascii(mbc)
+ returns true if mbc is plain ASCII.
+
+ mb_isblank (mbc)
+ returns true if mbc is a blank.
+
+ mb_iscntrl (mbc)
+ returns true if mbc is a control character.
+
+ mb_isdigit (mbc)
+ returns true if mbc is a decimal digit.
+
+ mb_isgraph (mbc)
+ returns true if mbc is a graphic character.
+
+ mb_islower (mbc)
+ returns true if mbc is lowercase.
+
+ mb_isprint (mbc)
+ returns true if mbc is a printable character.
+
+ mb_ispunct (mbc)
+ returns true if mbc is a punctuation character.
+
+ mb_isspace (mbc)
+ returns true if mbc is a space character.
+
+ mb_isupper (mbc)
+ returns true if mbc is uppercase.
+
+ mb_isxdigit (mbc)
+ returns true if mbc is a hexadecimal digit.
+
+ mb_width (mbc)
+ returns the number of columns on the output device occupied by mbc.
+ Always >= 0.
+
+ mb_putc (mbc, stream)
+ outputs mbc on stream, a byte oriented FILE stream opened for output.
+
+ mb_setascii (&mbc, sc)
+ assigns the standard ASCII character sc to mbc.
+ (Only available if the 'mbfile' module is in use.)
+
+ mb_copy (&destmbc, &srcmbc)
+ copies srcmbc to destmbc.
+
+ Here are the function prototypes of the macros.
+
+ extern const char * mb_ptr (const mbchar_t mbc);
+ extern size_t mb_len (const mbchar_t mbc);
+ extern bool mb_iseq (const mbchar_t mbc, char sc);
+ extern bool mb_isnul (const mbchar_t mbc);
+ extern int mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2);
+ extern int mb_casecmp (const mbchar_t mbc1, const mbchar_t mbc2);
+ extern bool mb_equal (const mbchar_t mbc1, const mbchar_t mbc2);
+ extern bool mb_caseequal (const mbchar_t mbc1, const mbchar_t mbc2);
+ extern bool mb_isalnum (const mbchar_t mbc);
+ extern bool mb_isalpha (const mbchar_t mbc);
+ extern bool mb_isascii (const mbchar_t mbc);
+ extern bool mb_isblank (const mbchar_t mbc);
+ extern bool mb_iscntrl (const mbchar_t mbc);
+ extern bool mb_isdigit (const mbchar_t mbc);
+ extern bool mb_isgraph (const mbchar_t mbc);
+ extern bool mb_islower (const mbchar_t mbc);
+ extern bool mb_isprint (const mbchar_t mbc);
+ extern bool mb_ispunct (const mbchar_t mbc);
+ extern bool mb_isspace (const mbchar_t mbc);
+ extern bool mb_isupper (const mbchar_t mbc);
+ extern bool mb_isxdigit (const mbchar_t mbc);
+ extern int mb_width (const mbchar_t mbc);
+ extern void mb_putc (const mbchar_t mbc, FILE *stream);
+ extern void mb_setascii (mbchar_t *new, char sc);
+ extern void mb_copy (mbchar_t *new, const mbchar_t *old);
+ */
+
+#ifndef _MBCHAR_H
+#define _MBCHAR_H 1
+
+/* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE. */
+#if !_GL_CONFIG_H_INCLUDED
+ #error "Please include config.h first."
+#endif
+
+#include <string.h>
+#include <uchar.h>
+
+_GL_INLINE_HEADER_BEGIN
+#ifndef MBCHAR_INLINE
+# define MBCHAR_INLINE _GL_INLINE
+#endif
+
+/* The longest multibyte characters, nowadays, are 4 bytes long.
+ Regardless of the values of MB_CUR_MAX and MB_LEN_MAX. */
+#define MBCHAR_BUF_SIZE 4
+
+struct mbchar
+{
+ const char *ptr; /* pointer to current character */
+ size_t bytes; /* number of bytes of current character, > 0 */
+ bool wc_valid; /* true if wc is a valid 32-bit wide character */
+ char32_t wc; /* if wc_valid: the current character */
+#if defined GNULIB_MBFILE
+ char buf[MBCHAR_BUF_SIZE]; /* room for the bytes, used for file input only */
+#endif
+};
+
+/* EOF (not a real character) is represented with bytes = 0 and
+ wc_valid = false. */
+
+typedef struct mbchar mbchar_t;
+
+/* Access the current character. */
+#define mb_ptr(mbc) ((mbc).ptr)
+#define mb_len(mbc) ((mbc).bytes)
+
+/* Comparison of characters. */
+#define mb_iseq(mbc, sc) ((mbc).wc_valid && (mbc).wc == (sc))
+#define mb_isnul(mbc) ((mbc).wc_valid && (mbc).wc == 0)
+#define mb_cmp(mbc1, mbc2) \
+ ((mbc1).wc_valid \
+ ? ((mbc2).wc_valid \
+ ? _GL_CMP ((mbc1).wc, (mbc2).wc) \
+ : -1) \
+ : ((mbc2).wc_valid \
+ ? 1 \
+ : (mbc1).bytes == (mbc2).bytes \
+ ? memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) \
+ : (mbc1).bytes < (mbc2).bytes \
+ ? (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) > 0 ? 1 : -1) \
+ : (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc2).bytes) >= 0 ? 1 : -1)))
+#define mb_casecmp(mbc1, mbc2) \
+ ((mbc1).wc_valid \
+ ? ((mbc2).wc_valid \
+ ? _GL_CMP (c32tolower ((mbc1).wc), c32tolower ((mbc2).wc)) \
+ : -1) \
+ : ((mbc2).wc_valid \
+ ? 1 \
+ : (mbc1).bytes == (mbc2).bytes \
+ ? memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) \
+ : (mbc1).bytes < (mbc2).bytes \
+ ? (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) > 0 ? 1 : -1) \
+ : (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc2).bytes) >= 0 ? 1 : -1)))
+#define mb_equal(mbc1, mbc2) \
+ ((mbc1).wc_valid && (mbc2).wc_valid \
+ ? (mbc1).wc == (mbc2).wc \
+ : (mbc1).bytes == (mbc2).bytes \
+ && memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0)
+#define mb_caseequal(mbc1, mbc2) \
+ ((mbc1).wc_valid && (mbc2).wc_valid \
+ ? c32tolower ((mbc1).wc) == c32tolower ((mbc2).wc) \
+ : (mbc1).bytes == (mbc2).bytes \
+ && memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0)
+
+/* <ctype.h>, <wctype.h> classification. */
+#define mb_isascii(mbc) \
+ ((mbc).wc_valid && (mbc).wc >= 0 && (mbc).wc <= 127)
+#define mb_isalnum(mbc) ((mbc).wc_valid && c32isalnum ((mbc).wc))
+#define mb_isalpha(mbc) ((mbc).wc_valid && c32isalpha ((mbc).wc))
+#define mb_isblank(mbc) ((mbc).wc_valid && c32isblank ((mbc).wc))
+#define mb_iscntrl(mbc) ((mbc).wc_valid && c32iscntrl ((mbc).wc))
+#define mb_isdigit(mbc) ((mbc).wc_valid && c32isdigit ((mbc).wc))
+#define mb_isgraph(mbc) ((mbc).wc_valid && c32isgraph ((mbc).wc))
+#define mb_islower(mbc) ((mbc).wc_valid && c32islower ((mbc).wc))
+#define mb_isprint(mbc) ((mbc).wc_valid && c32isprint ((mbc).wc))
+#define mb_ispunct(mbc) ((mbc).wc_valid && c32ispunct ((mbc).wc))
+#define mb_isspace(mbc) ((mbc).wc_valid && c32isspace ((mbc).wc))
+#define mb_isupper(mbc) ((mbc).wc_valid && c32isupper ((mbc).wc))
+#define mb_isxdigit(mbc) ((mbc).wc_valid && c32isxdigit ((mbc).wc))
+
+/* Extra <wchar.h> function. */
+
+/* Unprintable characters appear as a small box of width 1. */
+#define MB_UNPRINTABLE_WIDTH 1
+
+static MBCHAR_INLINE int
+mb_width_aux (char32_t wc)
+{
+ int w = c32width (wc);
+ /* For unprintable characters, arbitrarily return 0 for control characters
+ and MB_UNPRINTABLE_WIDTH otherwise. */
+ return (w >= 0 ? w : c32iscntrl (wc) ? 0 : MB_UNPRINTABLE_WIDTH);
+}
+
+#define mb_width(mbc) \
+ ((mbc).wc_valid ? mb_width_aux ((mbc).wc) : MB_UNPRINTABLE_WIDTH)
+
+/* Output. */
+#define mb_putc(mbc, stream) fwrite ((mbc).ptr, 1, (mbc).bytes, (stream))
+
+#if defined GNULIB_MBFILE
+/* Assignment. */
+# define mb_setascii(mbc, sc) \
+ ((mbc)->ptr = (mbc)->buf, (mbc)->bytes = 1, (mbc)->wc_valid = 1, \
+ (mbc)->wc = (mbc)->buf[0] = (sc))
+#endif
+
+/* Copying a character. */
+static MBCHAR_INLINE void
+mb_copy (mbchar_t *new_mbc, const mbchar_t *old_mbc)
+{
+#if defined GNULIB_MBFILE
+ if (old_mbc->ptr == &old_mbc->buf[0])
+ {
+ memcpy (&new_mbc->buf[0], &old_mbc->buf[0], old_mbc->bytes);
+ new_mbc->ptr = &new_mbc->buf[0];
+ }
+ else
+#endif
+ new_mbc->ptr = old_mbc->ptr;
+ new_mbc->bytes = old_mbc->bytes;
+ if ((new_mbc->wc_valid = old_mbc->wc_valid))
+ new_mbc->wc = old_mbc->wc;
+}
+
+
+/* is_basic(c) tests whether the single-byte character c is
+ - in the ISO C "basic character set" or is one of '@', '$', and '`'
+ which ISO C 23 § 5.2.1.1.(1) guarantees to be single-byte and in
+ practice are safe to treat as basic in the execution character set,
+ or
+ - in the POSIX "portable character set", which
+ <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap06.html>
+ equally guarantees to be single-byte.
+ This is a convenience function, and is in this file only to share code
+ between mbiter.h, mbuiter.h, and mbfile.h. */
+#if (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
+ && ('$' == 36) && ('%' == 37) && ('&' == 38) && ('\'' == 39) \
+ && ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) \
+ && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) \
+ && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) \
+ && ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) \
+ && ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) \
+ && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' == 63) \
+ && ('@' == 64) && ('A' == 65) && ('B' == 66) && ('C' == 67) \
+ && ('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) \
+ && ('H' == 72) && ('I' == 73) && ('J' == 74) && ('K' == 75) \
+ && ('L' == 76) && ('M' == 77) && ('N' == 78) && ('O' == 79) \
+ && ('P' == 80) && ('Q' == 81) && ('R' == 82) && ('S' == 83) \
+ && ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) \
+ && ('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) \
+ && ('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) \
+ && ('`' == 96) && ('a' == 97) && ('b' == 98) && ('c' == 99) \
+ && ('d' == 100) && ('e' == 101) && ('f' == 102) && ('g' == 103) \
+ && ('h' == 104) && ('i' == 105) && ('j' == 106) && ('k' == 107) \
+ && ('l' == 108) && ('m' == 109) && ('n' == 110) && ('o' == 111) \
+ && ('p' == 112) && ('q' == 113) && ('r' == 114) && ('s' == 115) \
+ && ('t' == 116) && ('u' == 117) && ('v' == 118) && ('w' == 119) \
+ && ('x' == 120) && ('y' == 121) && ('z' == 122) && ('{' == 123) \
+ && ('|' == 124) && ('}' == 125) && ('~' == 126)
+/* The character set is ISO-646, not EBCDIC. */
+# define IS_BASIC_ASCII 1
+
+/* All locale encodings (see localcharset.h) map the characters 0x00..0x7F
+ to U+0000..U+007F, like ASCII, except for
+ CP864 different mapping of '%'
+ SHIFT_JIS different mappings of 0x5C, 0x7E
+ JOHAB different mapping of 0x5C
+ However, these characters in the range 0x20..0x7E are in the ISO C
+ "basic character set" and in the POSIX "portable character set", which
+ ISO C and POSIX guarantee to be single-byte. Thus, locales with these
+ encodings are not POSIX compliant. And they are most likely not in use
+ any more (as of 2023). */
+# define is_basic(c) ((unsigned char) (c) < 0x80)
+
+#else
+
+MBCHAR_INLINE bool
+is_basic (char c)
+{
+ switch (c)
+ {
+ case '\0':
+ case '\007': case '\010':
+ case '\t': case '\n': case '\v': case '\f': case '\r':
+ case ' ': case '!': case '"': case '#': case '$': case '%':
+ case '&': case '\'': case '(': case ')': case '*':
+ case '+': case ',': case '-': case '.': case '/':
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ case ':': case ';': case '<': case '=': case '>':
+ case '?': case '@':
+ case 'A': case 'B': case 'C': case 'D': case 'E':
+ case 'F': case 'G': case 'H': case 'I': case 'J':
+ case 'K': case 'L': case 'M': case 'N': case 'O':
+ case 'P': case 'Q': case 'R': case 'S': case 'T':
+ case 'U': case 'V': case 'W': case 'X': case 'Y':
+ case 'Z':
+ case '[': case '\\': case ']': case '^': case '_': case '`':
+ case 'a': case 'b': case 'c': case 'd': case 'e':
+ case 'f': case 'g': case 'h': case 'i': case 'j':
+ case 'k': case 'l': case 'm': case 'n': case 'o':
+ case 'p': case 'q': case 'r': case 's': case 't':
+ case 'u': case 'v': case 'w': case 'x': case 'y':
+ case 'z': case '{': case '|': case '}': case '~':
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+#endif
+
+_GL_INLINE_HEADER_END
+
+#endif /* _MBCHAR_H */