0x00 起源
在音频播放器的播放列表中展示音频的元数据(如title、artist、album等信息)是一个常见的功能。mini-player使用FFmpeg作为解码器,而FFmpeg自带读取音频元数据的功能,样例代码如下:
// Define the callback type used to report metadata entries: it receives the
// caller-supplied `user` pointer together with one entry's key and value strings.
typedef void (*metadata_callback)(void *user, const char *key, const char *value);
// Reports every entry of the dictionary at is->ic->metadata to `cb`, except
// entries whose key is "language".
// NOTE(review): this is an excerpt. `is`, `tag`, the use of `pdecoder` and
// `filename`, and the return value all live in the elided parts ("...").
int32_t metadata(struct decoder *pdecoder, const char *filename, metadata_callback cb, void *user)
{
const AVDictionary *m = NULL;
...
m = is->ic->metadata;
// Do nothing when there is no dictionary, or when its single entry is "language".
if (m && !(av_dict_count(m) == 1 && av_dict_get(m, "language", NULL, 0)))
{
// Walk the entries; the loop stops once av_dict_iterate yields a null entry.
while ((tag = av_dict_iterate(m, tag)))
{
// strcmp returns non-zero when the key differs from "language".
if (strcmp("language", tag->key))
{
// The callback is optional; key and value are passed through unmodified.
if (cb)
cb(user, tag->key, tag->value);
}
}
}
....
}
本想着借助FFmpeg的这个功能快速地实现音频元数据的读取和展示。如果音频文件的元数据不涉及中文(GBK或GB18030编码)倒是没什么问题,但实际上这不太可能。为了正确展示音频元数据,我先后尝试了taglib(https://taglib.org/)和自己写代码解析,效果都不是很好。自己写代码解析,用来了解细节可以,但重复造轮子不是我想要的,还是立足于FFmpeg现有的基础吧。
0x01 捋清思路
我的解决思路是先用utf8cpp(https://github.com/nemtrif/utfcpp)判断字符串编码是否为UTF-8:如果编码不是UTF-8,则假设字符串编码为GBK或GB18030,将其转换为UTF-8;如果编码是UTF-8,则需要解决两个问题:一个问题是GBK或GB18030编码的字符串被当作ISO-8859-1再次编码成了UTF-8(典型特征是字符串中含有成对出现的0xC2、0xC3,如:[0xC2, 0xA3, 0xC2, 0xA8, 0xC3, 0x81, 0xC3, 0xB5, 0xC2, 0xBB, 0xC2, 0xB6, 0xC2, 0xA3]),需要将其还原为被再次编码前的字符串;另一个问题是GBK或GB18030编码的字符(如:路远,字节序列为[0xC2, 0xB7, 0xD4, 0xB6])会被误判为UTF-8。解决了以上两个问题,基本可以解决中文显示乱码的问题。
0x02 编码转换函数
编码转换基于iconv,代码如下:
/**
 * Converts a byte buffer from one character encoding to another with iconv.
 *
 * @param inFmt   Name of the source encoding (e.g. "GB18030").
 * @param inStr   Input bytes; may be NULL only when inSize is 0.
 * @param inSize  Number of input bytes.
 * @param outFmt  Name of the target encoding (e.g. "UTF-8").
 * @param outStr  Destination buffer. It is NOT NUL-terminated by this function;
 *                callers that need a C string must zero the buffer beforehand
 *                and leave room for the terminator.
 * @param outSize Capacity of outStr in bytes.
 * @return Number of bytes written to outStr, or -1 on any failure (invalid
 *         arguments, unsupported encoding pair, invalid/incomplete input, or
 *         an output buffer that is too small).
 */
int convert(const char *inFmt, const char *inStr, size_t inSize, const char *outFmt, char *outStr, size_t outSize)
{
    // Reject arguments that would otherwise crash inside iconv_open()/iconv().
    if (!inFmt || !outFmt || !outStr || (!inStr && inSize > 0))
    {
        return -1;
    }

    iconv_t cd = iconv_open(outFmt, inFmt);
    if (cd == (iconv_t)-1)
    {
        perror("iconv_open");
        return -1;
    }

    // iconv() takes a non-const char**, but it only reads the input bytes.
    char *inptr = (char *)inStr;
    size_t inbytesleft = inSize;
    char *outptr = outStr;
    size_t outbytesleft = outSize;

    // The second call (NULL input) flushes any pending shift sequence so that
    // stateful target encodings end in their initial state.
    if (iconv(cd, &inptr, &inbytesleft, &outptr, &outbytesleft) == (size_t)-1 ||
        iconv(cd, NULL, NULL, &outptr, &outbytesleft) == (size_t)-1)
    {
        perror("iconv");
        iconv_close(cd);
        return -1;
    }

    iconv_close(cd);
    return (int)(outSize - outbytesleft);
}
0x03 解决0xC2、0xC3问题
直接上代码:
/**
 * Heuristically detects text whose bytes were treated as ISO-8859-1 and then
 * re-encoded as UTF-8: such text contains a 0xC2/0xC3 lead byte that is
 * followed, two bytes later, by another 0xC2/0xC3 lead byte.
 *
 * @param data Bytes to inspect; must not be NULL.
 * @param size Number of valid bytes in data. Buffers shorter than three bytes
 *             (including empty ones) can never match and yield 0.
 * @return 1 when the pattern is found, 0 otherwise.
 */
uint8_t is_encoded_by_iso8859_1(const uint8_t *data, size_t size)
{
    assert(data);

    // The pattern spans data[i] and data[i + 2], so stop while i + 2 is still
    // inside the buffer; this never reads beyond `size` bytes.
    for (size_t i = 0; i + 2 < size; ++i)
    {
        const uint8_t lead = data[i];
        const uint8_t next_lead = data[i + 2];
        if ((lead == 0xC2 || lead == 0xC3) && (next_lead == 0xC2 || next_lead == 0xC3))
        {
            return 1;
        }
    }
    return 0;
}
// Size in bytes of the stack buffer that receives converted metadata text.
#define DEFAULT_BUF_SIZE 2048
// Metadata callback that repairs values which are valid UTF-8 but look like
// text that was treated as ISO-8859-1 and re-encoded (see
// is_encoded_by_iso8859_1).
// NOTE(review): excerpt only; `user` and `key` are not used in the visible
// part, and what happens with `buf`/`size` is elided ("...").
void metadata_callback2(void *user, const char *key, const char *value)
{
size_t value_size = strlen(value);
std::string str = value;
// end_it == str.end() means no invalid UTF-8 sequence was found in the value.
auto end_it = utf8::find_invalid(str.begin(), str.end());
if (end_it == str.end())
{
if (is_encoded_by_iso8859_1((uint8_t *)value, value_size))
{
// Zero-initialised so the converted bytes are NUL-terminated as long as
// fewer than sizeof(buf) bytes are written.
char buf[DEFAULT_BUF_SIZE] = {0};
// Converting UTF-8 -> ISO-8859-1 undoes the extra encoding step.
// NOTE(review): convert() returns int and -1 on failure; stored in a
// size_t that becomes a huge value, so the elided code must check for it.
size_t size = convert("UTF-8", value, value_size, "ISO-8859-1", buf, sizeof(buf));
...
}
}
}
0x04 解决GBK或GB18030误判为UTF-8问题
因为GBK或GB18030编码规则与UTF-8编码规则部分重叠,会导致GBK或GB18030编码的字符串被误认为UTF-8,这里使用查表法判断,代码如下:
#include "gb18030_unicode.h"
// Tallies filled in by recheck_utf8 for one byte string.
typedef struct check_result
{
// Number of bytes <= 0x7F.
uint32_t ascii_count;
// Evidence for UTF-8: table hits on the `utf8` field for two-byte sequences,
// plus one per longer multi-byte lead byte that fits in the input.
uint32_t utf8_count;
// Evidence for GB18030: table hits on the `gb18030` field for two-byte sequences.
uint32_t gb18030_count;
} check_result;
/**
 * Re-examines a byte string and tallies evidence for it being UTF-8 versus
 * GB18030.
 *
 * Every byte position is inspected in turn (positions are not skipped after a
 * multi-byte lead byte):
 *  - bytes <= 0x7F increase ascii_count;
 *  - a byte in 0xC0..0xDF with at least one following byte forms a 16-bit
 *    value that is looked up in gb18030_unicode_table; each entry whose
 *    `utf8` field matches increases utf8_count and each entry whose
 *    `gb18030` field matches increases gb18030_count;
 *  - a byte in 0xE0..0xEF, 0xF0..0xF7, 0xF8..0xFB or 0xFC..0xFD increases
 *    utf8_count when 3, 4, 5 or 6 bytes respectively are still available.
 *
 * @param data    Bytes to inspect; must not be NULL.
 * @param in_size Number of bytes in data.
 * @param result  Receives the tallies; it is zeroed first. Must not be NULL.
 */
void recheck_utf8(const uint8_t *data, size_t in_size, check_result *result)
{
    assert(data);
    assert(result);

    const size_t entry_count = sizeof(gb18030_unicode_table) / sizeof(gb18030_unicode);
    memset(result, 0, sizeof(check_result));

    for (size_t pos = 0; pos < in_size; ++pos)
    {
        const uint8_t lead = data[pos];
        // Bytes available from this position to the end, including `lead`.
        const size_t remaining = in_size - pos;

        if (lead <= 0x7F)
        {
            ++result->ascii_count;
        }
        else if (lead >= 0xC0 && lead <= 0xDF)
        {
            if (remaining >= 2)
            {
                // Two-byte candidate: ambiguous between UTF-8 and GB18030,
                // so let the table decide which side(s) it supports.
                const uint16_t pair = lead << 8 | data[pos + 1];
                for (size_t k = 0; k < entry_count; ++k)
                {
                    if (gb18030_unicode_table[k].utf8 == pair)
                    {
                        ++result->utf8_count;
                    }
                    if (gb18030_unicode_table[k].gb18030 == pair)
                    {
                        ++result->gb18030_count;
                    }
                }
            }
        }
        else
        {
            // Sequence length implied by the lead byte; 0 means "not a
            // recognised multi-byte lead" and the byte is ignored.
            size_t seq_len = 0;
            if (lead >= 0xE0 && lead <= 0xEF)
            {
                seq_len = 3;
            }
            else if (lead >= 0xF0 && lead <= 0xF7)
            {
                seq_len = 4;
            }
            else if (lead >= 0xF8 && lead <= 0xFB)
            {
                seq_len = 5;
            }
            else if (lead >= 0xFC && lead <= 0xFD)
            {
                seq_len = 6;
            }

            if (seq_len != 0 && remaining >= seq_len)
            {
                ++result->utf8_count;
            }
        }
    }
}
// Size in bytes of the stack buffer that receives converted metadata text.
#define DEFAULT_BUF_SIZE 2048
// Metadata callback extended with a second check: a value that is valid UTF-8
// and does not show the ISO-8859-1 re-encoding pattern is re-examined with
// recheck_utf8, and treated as GB18030 when that evidence outweighs UTF-8.
// NOTE(review): excerpt only; `user` and `key` are not used in the visible
// part, and the handling of the converted text is elided ("...").
void metadata_callback2(void *user, const char *key, const char *value)
{
size_t value_size = strlen(value);
std::string str = value;
// end_it == str.end() means no invalid UTF-8 sequence was found in the value.
auto end_it = utf8::find_invalid(str.begin(), str.end());
if (end_it == str.end())
{
if (is_encoded_by_iso8859_1((uint8_t *)value, value_size))
{
...
}
else
{
check_result result;
recheck_utf8((const uint8_t *)value, value_size, &result);
// More GB18030 table hits than UTF-8 evidence: assume the bytes are
// really GB18030 that merely happen to be well-formed UTF-8.
if (result.gb18030_count > result.utf8_count)
{
// Zero-initialised so the converted bytes are NUL-terminated as long as
// fewer than sizeof(buf) bytes are written.
char buf[DEFAULT_BUF_SIZE] = {0};
// NOTE(review): convert() returns int and -1 on failure; stored in a
// size_t that becomes a huge value, so the elided code must check for it.
size_t size = convert("GB18030", value, value_size, "UTF-8", buf, sizeof(buf));
....
}
}
}
}
gb18030_unicode.h下载地址:GitHub - jackyxinli/mini-player
0x05 尾声
至此,经过以上操作,音频文件的元数据中文乱码问题基本可以解决了。