#include <stdio.h>
#include "postgres.h"
#include "fmgr.h"
#include "utils/builtins.h"
#include "mb/pg_wchar.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_locale.h"
#include "musicbrainz_unaccent_data.h"
PG_MODULE_MAGIC;
PG_FUNCTION_INFO_V1(musicbrainz_unaccent);
Datum musicbrainz_unaccent(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(musicbrainz_dunaccentdict_init);
Datum musicbrainz_dunaccentdict_init(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(musicbrainz_dunaccentdict_lexize);
Datum musicbrainz_dunaccentdict_lexize(PG_FUNCTION_ARGS);
#define TextPGetCString(t) DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(t)))
#define CStringGetTextP(c) DatumGetTextP(DirectFunctionCall1(textin, CStringGetDatum(c)))
static int
utf8_encode_char(char *str, unsigned long c)
{
if (c < 0x80) {
*str++ = (unsigned char) c;
return 1;
}
else if (c < 0x800) {
*str++ = 0xc0 | (c >> 6);
*str++ = 0x80 | (c & 0x3f);
return 2;
}
else if (c < 0x10000) {
*str++ = 0xe0 | (c >> 12);
*str++ = 0x80 | ((c >> 6) & 0x3f);
*str++ = 0x80 | (c & 0x3f);
return 3;
}
else {
*str++ = 0xf0 | (c >> 18);
*str++ = 0x80 | ((c >> 12) & 0x3f);
*str++ = 0x80 | ((c >> 6) & 0x3f);
*str++ = 0x80 | (c & 0x3f);
return 4;
}
}
static int
utf8_encode_char_len(unsigned long c)
{
if (c < 0x80) {
return 1;
}
else if (c < 0x800) {
return 2;
}
else if (c < 0x10000) {
return 3;
}
return 4;
}
static unsigned long
utf8_decode_char(const char *str, int len, int *c_len)
{
uint32 c1, c2, c3, c4;
unsigned long c;
if ((*str & 0x80) == 0) {
c = *str++;
*c_len = 1;
}
else if ((*str & 0xe0) == 0xc0 && len >= 2) {
c1 = *str++ & 0x1f;
c2 = *str++ & 0x3f;
c = (c1 << 6) | c2;
*c_len = 2;
}
else if ((*str & 0xf0) == 0xe0 && len >= 3) {
c1 = *str++ & 0x0f;
c2 = *str++ & 0x3f;
c3 = *str++ & 0x3f;
c = (c1 << 12) | (c2 << 6) | c3;
*c_len = 3;
}
else if ((*str & 0xf8) == 0xf0 && len >= 4) {
c1 = *str++ & 0x07;
c2 = *str++ & 0x3f;
c3 = *str++ & 0x3f;
c4 = *str++ & 0x3f;
c = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
*c_len = 4;
}
else {
c = *str++;
*c_len = 0;
}
return c;
}
static void
unac_lookup(unsigned long c, unsigned short **conv_data, int *conv_len)
{
int block, position, pos;
unsigned char *positions;
if (c > 0xFFFF) {
*conv_data = NULL;
*conv_len = 0;
return;
}
block = unaccent_indexes[c >> UNACCENT_BLOCK_SHIFT];
position = c & UNACCENT_BLOCK_MASK;
positions = unaccent_positions[block];
pos = positions[position];
*conv_data = &unaccent_data[block][pos];
*conv_len = positions[position + 1] - pos;
}
static int
utf8_unac_len(const char *input, int len)
{
pg_wchar c;
unsigned short *conv_data;
int cnt = 0, conv = 0, conv_len, c_len;
while (len > 0) {
c = utf8_decode_char(input, len, &c_len);
if (c_len <= 0)
break;
unac_lookup(c, &conv_data, &conv_len);
if (conv_len > 0) {
while (conv_len--) {
cnt += utf8_encode_char_len(*conv_data++);
}
conv = 1;
}
else {
cnt += c_len;
}
len -= c_len;
input += c_len;
}
return conv ? cnt : 0;
}
static void
utf8_unac(const char *input, int len, char *output)
{
pg_wchar c;
unsigned short *conv_data;
int c_len, conv_len;
while (len > 0) {
c = utf8_decode_char(input, len, &c_len);
if (c_len <= 0)
break;
unac_lookup(c, &conv_data, &conv_len);
if (conv_len > 0) {
while (conv_len--) {
output += utf8_encode_char(output, *conv_data++);
}
len -= c_len;
input += c_len;
}
else {
len -= c_len;
while (c_len--) {
*output++ = *input++;
}
}
}
}
static int
is_ascii(const char *input)
{
while (*input) {
if (*input & 0x80)
return 0;
input++;
}
return 1;
}
static char *
unaccent_string(char *input)
{
char *utf8_output, *utf8_input;
int utf8_output_len, utf8_input_len, input_len;
input_len = strlen(input);
if (is_ascii(input)) {
return input;
}
utf8_input = (char *)pg_do_encoding_conversion(
(unsigned char *)input, input_len,
GetDatabaseEncoding(), PG_UTF8);
utf8_input_len = strlen(utf8_input);
utf8_output_len = utf8_unac_len(utf8_input, utf8_input_len);
if (!utf8_output_len) {
if (utf8_input != input) {
pfree(utf8_input);
}
return input;
}
utf8_output = palloc(utf8_output_len + 1);
if (!utf8_output) {
if (utf8_input != input) {
pfree(utf8_input);
}
return input;
}
utf8_unac(utf8_input, utf8_input_len, utf8_output);
utf8_output[utf8_output_len] = '\0';
if (utf8_input != input) {
pfree(utf8_input);
}
return (char *)pg_do_encoding_conversion(
(unsigned char *)utf8_output, utf8_output_len,
PG_UTF8, GetDatabaseEncoding());
}
Datum
musicbrainz_unaccent(PG_FUNCTION_ARGS)
{
char *output, *input;
text *result;
input = TextPGetCString(PG_GETARG_DATUM(0));
output = unaccent_string(input);
result = CStringGetTextP(output);
if (input != output)
pfree(output);
PG_RETURN_TEXT_P(result);
}
Datum
musicbrainz_dunaccentdict_init(PG_FUNCTION_ARGS)
{
PG_RETURN_POINTER(NULL);
}
Datum
musicbrainz_dunaccentdict_lexize(PG_FUNCTION_ARGS)
{
char *input, *output;
int input_len;
TSLexeme *result;
input = (char *) PG_GETARG_POINTER(1);
input_len = PG_GETARG_INT32(2);
input = lowerstr_with_len(input, input_len);
output = unaccent_string(input);
result = palloc(sizeof(TSLexeme) * 2);
result[0].lexeme = output;
result[1].lexeme = NULL;
if (input != output)
pfree(input);
PG_RETURN_POINTER(result);
}