I have a function which converts a UTF-16 string to UTF-8 using the iconv library (ignore the "ISO-8859-1" part; all my test files are either UTF-8 or UTF-16). Here is its code:
/*
 * Convert a text buffer to a newly allocated, NUL-terminated UTF-8 string.
 *
 * encoding    - points to the source encoding selector: 0 means ISO-8859-1,
 *               any other value means UTF-16.
 * source_str  - heap buffer holding the text; on success it is free()d here,
 *               so the caller must not use it afterwards.
 * source_size - number of bytes in source_str to convert. For UTF-16 this
 *               must cover whole 2-byte code units (plus an optional BOM);
 *               an odd trailing byte makes iconv() fail with EINVAL.
 *
 * For UTF-16 input a leading BOM is consumed and selects the byte order
 * (FF FE = little-endian, FE FF = big-endian); without a BOM little-endian
 * is assumed.
 *
 * Returns the converted string, owned by the caller (release with free()).
 * Any failure calls endwin(), prints a message with perror() and terminates
 * the process with exit(1); the function never returns NULL.
 */
char* to_utf8(const unsigned int *encoding, char *source_str, unsigned int source_size){
    const unsigned char *bytes = (const unsigned char *)source_str;
    const char *from_code = "UTF-16LE";
    size_t offset = 0;

    if (*encoding == 0) {
        // ISO-8859-1 has no BOM; 0xFF 0xFE there are ordinary characters.
        from_code = "ISO-8859-1";
    } else if (source_size >= 2) {
        // The size guard must cover both BOM comparisons.
        if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
            offset = 2;
        } else if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
            // A big-endian BOM means the payload is big-endian as well.
            from_code = "UTF-16BE";
            offset = 2;
        }
    }

    iconv_t cd = iconv_open("UTF-8", from_code);
    if (cd == (iconv_t)-1) {endwin(); perror("iconv_open:"); exit(1);}

    size_t in_left = (size_t)source_size - offset;
    // Worst-case growth: ISO-8859-1 -> UTF-8 doubles the size, UTF-16 ->
    // UTF-8 grows by 1.5x. One extra byte is reserved for the terminator.
    size_t out_cap = in_left * 2 + 1;
    size_t out_left = out_cap - 1;

    char *output = malloc(out_cap);
    if (output == NULL) {
        endwin();
        perror("malloc");
        iconv_close(cd);
        exit(1);
    }

    char *inbuf = source_str + offset;
    char *outbuf = output;
    size_t result = iconv(cd, &inbuf, &in_left, &outbuf, &out_left);
    if (result == (size_t)-1) {
        endwin();
        perror("iconv");
        free(output);
        iconv_close(cd);
        exit(1);
    }
    // iconv() advanced outbuf past the converted bytes; terminate there.
    *outbuf = '\0';

    // Do not leak memory: the converted copy replaces the source buffer.
    free(source_str);
    iconv_close(cd);
    return output;
}
Here is the part of the code where the track data is declared and the function to_utf8 is called:
//...
// Allocate one zero-initialized Track record.
Track *track_data = calloc(sizeof(*track_data), 1);
//...
// "TIT2" frame: copy its payload into track_data->title.
if (strcmp(tag_str, "TIT2") == 0){
// Zero-filled buffer of tag_size bytes for the title.
track_data->title = calloc(sizeof(char), tag_size);
// Only tag_size-1 bytes are copied, so the last byte of the buffer stays 0.
memcpy(track_data->title, &id3_metadata_str[offset], tag_size-1);
// Fix encoding if needed
// NOTE(review): presumably encoding 3 already means UTF-8 -- confirm against the ID3v2 spec.
// NOTE(review): to_utf8 receives tag_size, one byte more than was copied above;
// for UTF-16 text that can leave an odd trailing byte for iconv -- verify the length.
if (encoding != 3){track_data->title = to_utf8(&encoding, track_data->title, tag_size); };
Here is the Track struct declaration:
/*
 * Metadata record for a single track.
 *
 * Declared with the typedef keyword first: placing a storage-class specifier
 * after the type ("} typedef Track;") is an obsolescent form in C11.
 * Member order and types are unchanged, so the layout is identical.
 */
typedef struct Track {
    char *path;
    char *artist;
    char *album;
    char *title;      /* filled from the "TIT2" frame by the tag-parsing code */
    char *year;
    char *track;
    double duration;
    char *dur_str;    /* presumably the duration rendered as text -- confirm */
    int lyrics_size;  /* presumably the size of `lyrics` -- confirm units */
    char *lyrics;
    int shfl_num;     /* presumably the position in shuffle order -- confirm */
    long progress;
} Track;
The code above fails with the error "iconv: Invalid argument".
If I run the same code with a manually created string, everything works fine. Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iconv.h>
/*
 * Convert a text buffer to a newly allocated, NUL-terminated UTF-8 string.
 *
 * encoding    - points to the source encoding selector: 0 means ISO-8859-1,
 *               any other value means UTF-16.
 * source_str  - buffer holding the text; it is only read, never freed here.
 * source_size - number of bytes in source_str to convert. For UTF-16 this
 *               must cover whole 2-byte code units (plus an optional BOM);
 *               an odd trailing byte makes iconv() fail with EINVAL.
 *
 * For UTF-16 input a leading BOM is consumed and selects the byte order
 * (FF FE = little-endian, FE FF = big-endian); without a BOM little-endian
 * is assumed.
 *
 * Returns the converted string, owned by the caller (release with free()).
 * Any failure prints a message with perror() and terminates the process
 * with exit(1); the function never returns NULL.
 */
char* to_utf8(const unsigned int *encoding, char *source_str, unsigned int source_size){
    const unsigned char *bytes = (const unsigned char *)source_str;
    const char *from_code = "UTF-16LE";
    size_t offset = 0;

    if (*encoding == 0) {
        // ISO-8859-1 has no BOM; 0xFF 0xFE there are ordinary characters.
        from_code = "ISO-8859-1";
    } else if (source_size >= 2) {
        // The size guard must cover both BOM comparisons.
        if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
            offset = 2;
        } else if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
            // A big-endian BOM means the payload is big-endian as well.
            from_code = "UTF-16BE";
            offset = 2;
        }
    }

    iconv_t cd = iconv_open("UTF-8", from_code);
    if (cd == (iconv_t)-1) {perror("iconv_open:"); exit(1);}

    size_t in_left = (size_t)source_size - offset;
    // Worst-case growth: ISO-8859-1 -> UTF-8 doubles the size, UTF-16 ->
    // UTF-8 grows by 1.5x. One extra byte is reserved for the terminator.
    size_t out_cap = in_left * 2 + 1;
    size_t out_left = out_cap - 1;

    char *output = malloc(out_cap);
    if (output == NULL) {
        perror("malloc");
        iconv_close(cd);
        exit(1);
    }

    char *inbuf = source_str + offset;
    char *outbuf = output;
    size_t result = iconv(cd, &inbuf, &in_left, &outbuf, &out_left);
    if (result == (size_t)-1) {
        perror("iconv");
        free(output);
        iconv_close(cd);
        exit(1);
    }
    // iconv() advanced outbuf past the converted bytes; terminate there.
    *outbuf = '\0';

    iconv_close(cd);
    return output;
}
/*
 * Demo driver: feeds a hand-written UTF-16LE buffer (with BOM) to to_utf8
 * and prints the converted text.
 */
int main(void) {
    unsigned int encoding = 1; // Assume UTF-16 with BOM
    // Raw bytes are stored as unsigned char: 0xFF and 0xFE do not fit in a
    // signed char. The payload spells "Bad Day for My Enemies".
    unsigned char content_str[] = {0xFF, 0xFE, 0x42, 0x00, 0x61, 0x00, 0x64, 0x00, 0x20, 0x00, 0x44, 0x00, 0x61, 0x00, 0x79,
                                   0x00, 0x20, 0x00, 0x66, 0x00, 0x6F, 0x00, 0x72, 0x00, 0x20, 0x00, 0x4D, 0x00, 0x79, 0x00,
                                   0x20, 0x00, 0x45, 0x00, 0x6E, 0x00, 0x65, 0x00, 0x6D, 0x00, 0x69, 0x00, 0x65, 0x00, 0x73, 0x00 }; // Example UTF-16LE with BOM
    unsigned int tag_size = sizeof(content_str);
    char* utf8_str = to_utf8(&encoding, (char *)content_str, tag_size);
    if (utf8_str != NULL) {
        // NOTE(review): "%s" requires to_utf8 to return a NUL-terminated
        // string; the input above carries no terminator of its own.
        printf("Converted UTF-8 string: %s\n", utf8_str);
        free(utf8_str);
    } else {
        printf("Conversion failed.\n");
    }
    return 0;
}
The data in both cases is the same. Why do I get the "Invalid argument" error when passing track_data->title, but not when passing char content_str[]?
Thanks to @IanAbbott I found where the problem is (+ learned about errno).
The problem was the source_size value, from which in_str_size is set. The string held the content of the ID3v2 tag: the encoding byte, the 2 BOM bytes and, in this case, the title of the track. source_size is the size of all of this in bytes. I subtracted 2 bytes from source_size for the BOM but did not subtract 1 byte for the encoding byte. Once I subtracted 1 more from source_size, everything started to work.
So, "Invalid argument" (errno EINVAL) means that "An incomplete multibyte sequence is encountered in the input". I'll also add that "Argument list too long" (errno E2BIG) means that "Output buffer has no more room for the next converted character".
I love C. Pay attention, or you'll waste half a day on a stupid error. At least I've learned something new.