You are not the first person to run into this problem with iconv. Someone has written a Perl script to solve it.
iconv doesn't handle large files well. From the glibc source code, in iconv/iconv_prog.c:
/* Since we have to deal with
arbitrary encodings we must read the whole text in a buffer and
process it in one step. */
However, for your particular case, it might be better to write your own UTF-8 validator. You could easily distill iconv -c -f utf8 -t utf8 down to a small C program, with a loop that calls iconv(3). Since UTF-8 is modeless and self-synchronizing, you can process it in chunks.
#include <errno.h>
#include <iconv.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#define BUFSIZE 4096
/* Write all LEN bytes of BUF to FD, retrying after short writes and EINTR.
   Returns 0 on success, or -1 with errno set by write(2) on failure. */
static int write_all(int fd, const char *buf, size_t len)
{
    while (len > 0) {
        ssize_t n = write(fd, buf, len);

        if (n < 0) {
            if (errno == EINTR)
                continue;
            return -1;
        }
        buf += n;
        len -= (size_t)n;
    }
    return 0;
}

/* Copy STDIN to STDOUT, omitting invalid UTF-8 sequences.

   Input is read in BUFSIZE chunks.  Each chunk is fed to iconv(3)
   repeatedly until it is fully consumed or only an incomplete multibyte
   sequence remains at its end; that tail is carried over and completed by
   the next read.  Every invalid byte is reported on stderr with its
   1-based position in the input stream and then skipped.

   Exit status: 0 on success, 1 on a read/write error, 2 on an iconv
   failure. */
int main(void)
{
    char ib[BUFSIZE], ob[BUFSIZE], *ibp, *obp;
    size_t iblen = 0, oblen;
    unsigned long long offset = 0;  /* stream offset of ib[0] */
    iconv_t cd;

    cd = iconv_open("UTF-8", "UTF-8");
    if (cd == (iconv_t)-1) {
        perror("iconv_open");
        return 2;
    }

    for (;;) {
        /* Append new data after any incomplete sequence carried over. */
        ssize_t bytes_read = read(STDIN_FILENO, ib + iblen,
                                  sizeof(ib) - iblen);

        if (bytes_read < 0) {
            if (errno == EINTR)
                continue;
            perror("read");
            return 1;
        }
        if (bytes_read == 0)
            break;  /* EOF */

        iblen += (size_t)bytes_read;
        ibp = ib;

        /* Drain the buffer.  A single iconv() call stops at the first bad
           byte, so keep calling it until nothing convertible is left;
           otherwise valid bytes after a bad one would be lost at EOF. */
        while (iblen > 0) {
            size_t rc;
            int err;

            obp = ob;
            oblen = sizeof(ob);
            rc = iconv(cd, &ibp, &iblen, &obp, &oblen);
            err = errno;  /* write() below may clobber errno */

            /* iconv() may have produced output even when it failed. */
            if (write_all(STDOUT_FILENO, ob, (size_t)(obp - ob)) != 0) {
                perror("write");
                return 1;
            }

            if (rc != (size_t)-1)
                break;  /* whole buffer converted */

            if (err == EILSEQ) {
                /* Invalid input multibyte sequence: ibp points at it. */
                fprintf(stderr, "Invalid multibyte sequence at byte %llu\n",
                        offset + (unsigned long long)(ibp - ib) + 1);
                ibp++;
                iblen--;  /* skip the bad byte and resume after it */
            } else if (err == EINVAL) {
                /* Incomplete input multibyte sequence: need more input. */
                break;
            } else if (err == E2BIG) {
                /* Output buffer was full and has been flushed; go on. */
                continue;
            } else {
                errno = err;
                perror("iconv");
                return 2;
            }
        }

        /* The iblen bytes starting at ibp are an incomplete UTF-8 sequence
           (or nothing).  Move them to the beginning of ib so the next read
           can complete them. */
        offset += (unsigned long long)(ibp - ib);
        memmove(ib, ibp, iblen);
    }

    if (iblen > 0) {
        /* Input ended in the middle of a multibyte sequence; drop it. */
        fprintf(stderr, "Incomplete multibyte sequence at byte %llu\n",
                offset + 1);
    }

    if (iconv_close(cd) != 0) {
        perror("iconv_close");
        return 2;
    }
    return 0;
}
Best Answer
To run a command on, or assign it to, a particular core, use taskset. Embed it in your startup script or use it from the command line like:
taskset -c 0,5 command_name
-c takes a list of one or more CPUs to run the command on; in this case, cores 0 and 5. You can also modify the core assignment of a running process by specifying its PID with taskset (using the -p option). You may also want to see: Assigning Processes to CPU Cores