/* Read UTF-8 characters from stdin, convert them to Latin-1
   (ISO-8859-1), and write the converted characters to stdout.
   UTF-8 is defined by RFC 2044.
*/
/*
 * I don't know who the original author was.
 * I modified this program to combine the most common non-spacing
 * diacritical marks and do some other useful translations.
 *
 * Jim Rees, University of Michigan CITI, June 2003
 */
#include <stdlib.h>
#include <stdio.h>

/*
 * Extra translations: Unicode punctuation that has no Latin-1 code
 * point of its own, paired with the single byte written in its place.
 * The entry whose code point is 0 terminates the table; xlate() stops
 * scanning when it reaches it.
 */
struct {
    unsigned long u;	/* Unicode code point to look for */
    unsigned char c;	/* replacement byte */
} ucsxl[] = {
    { .u = 0x2010, .c = '-'  },	/* hyphen */
    { .u = 0x2011, .c = '-'  },	/* non-breaking hyphen */
    { .u = 0x2012, .c = '-'  },	/* figure dash */
    { .u = 0x2013, .c = '-'  },	/* en dash */
    { .u = 0x2014, .c = '-'  },	/* em dash */
    { .u = 0x2015, .c = '-'  },	/* horizontal bar */
    { .u = 0x2017, .c = '_'  },	/* double low line */
    { .u = 0x2018, .c = '\'' },	/* left single quotation mark */
    { .u = 0x2019, .c = '\'' },	/* right single quotation mark */
    { .u = 0x201c, .c = '"'  },	/* left double quotation mark */
    { .u = 0x201d, .c = '"'  },	/* right double quotation mark */
    { .u = 0x2022, .c = 0xb7 },	/* bullet -> Latin-1 middle dot */
    { .u = 0x2024, .c = 0xb7 },	/* one dot leader -> Latin-1 middle dot */
    { .u = 0x2026, .c = '~'  },	/* horizontal ellipsis */
    { .u = 0,      .c = '\0' },	/* end of table */
};

/*
 * Combining marks: a base letter (u1) followed by a non-spacing
 * diacritical mark (u2) is replaced by the single precomposed
 * Latin-1 character c.
 *
 * The composed characters are written as hex byte values rather than
 * as literal 8-bit characters, so the table survives any re-encoding
 * of this source file.
 *
 * The entry whose u2 is 0 terminates the table; combine() stops
 * scanning when it reaches it.
 *
 * Marks used: 0x0300 grave, 0x0301 acute, 0x0302 circumflex,
 * 0x0303 tilde, 0x0308 diaeresis, 0x030a ring above, 0x0327 cedilla,
 * 0x0338 long solidus overlay (slash).
 */
struct {
    unsigned long u1;	/* base character */
    unsigned long u2;	/* combining mark that follows it */
    unsigned char c;	/* precomposed Latin-1 result */
} ucscm[] = {
    {'A', 0x0300, 0xc0},	/* A grave */
    {'A', 0x0301, 0xc1},	/* A acute */
    {'A', 0x0302, 0xc2},	/* A circumflex */
    {'A', 0x0303, 0xc3},	/* A tilde */
    {'A', 0x0308, 0xc4},	/* A diaeresis */
    {'A', 0x030a, 0xc5},	/* A ring */
    {'C', 0x0327, 0xc7},	/* C cedilla */
    {'E', 0x0300, 0xc8},	/* E grave */
    {'E', 0x0301, 0xc9},	/* E acute */
    {'E', 0x0302, 0xca},	/* E circumflex */
    {'E', 0x0308, 0xcb},	/* E diaeresis */
    {'I', 0x0300, 0xcc},	/* I grave */
    {'I', 0x0301, 0xcd},	/* I acute */
    {'I', 0x0302, 0xce},	/* I circumflex */
    {'I', 0x0308, 0xcf},	/* I diaeresis */
    {'N', 0x0303, 0xd1},	/* N tilde */
    {'O', 0x0300, 0xd2},	/* O grave */
    {'O', 0x0301, 0xd3},	/* O acute */
    {'O', 0x0302, 0xd4},	/* O circumflex */
    {'O', 0x0303, 0xd5},	/* O tilde */
    {'O', 0x0308, 0xd6},	/* O diaeresis */
    {'O', 0x0338, 0xd8},	/* O slash */
    {'U', 0x0300, 0xd9},	/* U grave */
    {'U', 0x0301, 0xda},	/* U acute */
    {'U', 0x0302, 0xdb},	/* U circumflex */
    {'U', 0x0308, 0xdc},	/* U diaeresis */
    {'Y', 0x0301, 0xdd},	/* Y acute */
    {'a', 0x0300, 0xe0},	/* a grave */
    {'a', 0x0301, 0xe1},	/* a acute */
    {'a', 0x0302, 0xe2},	/* a circumflex */
    {'a', 0x0303, 0xe3},	/* a tilde */
    {'a', 0x0308, 0xe4},	/* a diaeresis */
    {'a', 0x030a, 0xe5},	/* a ring */
    {'c', 0x0327, 0xe7},	/* c cedilla */
    {'e', 0x0300, 0xe8},	/* e grave */
    {'e', 0x0301, 0xe9},	/* e acute */
    {'e', 0x0302, 0xea},	/* e circumflex */
    {'e', 0x0308, 0xeb},	/* e diaeresis */
    {'i', 0x0300, 0xec},	/* i grave */
    {'i', 0x0301, 0xed},	/* i acute */
    {'i', 0x0302, 0xee},	/* i circumflex */
    {'i', 0x0308, 0xef},	/* i diaeresis */
    {'n', 0x0303, 0xf1},	/* n tilde */
    {'o', 0x0300, 0xf2},	/* o grave */
    {'o', 0x0301, 0xf3},	/* o acute */
    {'o', 0x0302, 0xf4},	/* o circumflex */
    {'o', 0x0303, 0xf5},	/* o tilde */
    {'o', 0x0308, 0xf6},	/* o diaeresis */
    {'o', 0x0338, 0xf8},	/* o slash */
    {'u', 0x0300, 0xf9},	/* u grave */
    {'u', 0x0301, 0xfa},	/* u acute */
    {'u', 0x0302, 0xfb},	/* u circumflex */
    {'u', 0x0308, 0xfc},	/* u diaeresis */
    {'y', 0x0301, 0xfd},	/* y acute */
    {'y', 0x0308, 0xff},	/* y diaeresis */
    {'\0', 0, '\0'},		/* end of table */
};

/*
 * Total length in bytes of a UTF-8 sequence, indexed by the top six
 * bits of its first byte (byte >> 2).  A length of 0 marks a byte
 * that cannot start a sequence.
 */
unsigned char utf8len[] = {
    /* 0x00-0x7f  0xxxxxxx: one byte */
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xbf  10xxxxxx: continuation byte, erroneous as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0-0xdf  110xxxxx: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0-0xef: three, 0xf0-0xf7: four, 0xf8-0xfb: five, 0xfc-0xff: six */
    3, 3, 3, 3, 4, 4, 5, 6,
};

/*
 * Mask that keeps the payload bits of the first byte of a sequence,
 * indexed by the length taken from utf8len.
 */
unsigned char utf8mask[] = {
    0x3f,	/* 0: stray continuation byte, low six bits */
    0x7f,	/* 1: low seven bits */
    0x1f,	/* 2: low five bits */
    0x0f,	/* 3: low four bits */
    0x07,	/* 4: low three bits */
    0x03,	/* 5: low two bits */
    0x01,	/* 6: low bit */
};

/* Table-lookup helpers, defined after main().
   xlate() substitutes single code points using ucsxl;
   combine() merges a base character and a following combining mark
   using ucscm, returning 0 when the pair is not in the table. */
unsigned long xlate(unsigned long u);
unsigned long combine(unsigned long u1, unsigned long u2);

/*
 * Write one code point to stdout: as a single byte when it fits in
 * Latin-1 (<= 0xff), otherwise as the text "U+" followed by at least
 * four lower-case hex digits.
 */
static void
put_latin1(unsigned long u)
{
    if (u <= 0xff)
        putchar((int)u);
    else
        printf("U+%04lx", u);
}

/*
 * Filter stdin to stdout, decoding UTF-8 and writing Latin-1.
 *
 * Output runs one character behind the input: each decoded code point
 * is held in u1 until the next one has been read, so that a base
 * letter followed by a combining mark can be merged by combine()
 * before anything is written.  u1 == 0 doubles as "nothing pending",
 * so NUL code points in the input are never written.
 *
 * Always returns 0; argc and argv are not used.
 */
int
main(int argc, char** argv)
{
    int c, len;
    unsigned long u1 = 0, u2, u3;

    (void)argc;
    (void)argv;

    while ((c = getchar()) != EOF) {
        /* Sequence length and payload bits come from the first byte. */
        len = utf8len[(c >> 2) & 0x3F];
        u2 = c & utf8mask[len];
        if (len == 0) {
            /* erroneous: c is the middle of a character.  Swallow up
               to four further continuation bytes along with it. */
            len = 5;
        }
        /* Accumulate six payload bits from each continuation byte. */
        while (--len && (c = getchar()) != EOF) {
            if ((c & 0xc0) == 0x80) {
                u2 = (u2 << 6) | (c & 0x3f);
            } else {
                /* unexpected start of a new character: push it back
                   and keep the truncated value decoded so far */
                ungetc(c, stdin);
                break;
            }
        }
        u2 = xlate(u2);
        /* If the pending character and this one form a known pair,
           the composed character becomes the new pending one and
           the old pending character is not written. */
        if ((u3 = combine(u1, u2))) {
            u1 = 0;
            u2 = u3;
        }
        if (u1)
            put_latin1(u1);
        u1 = u2;
        /* EOF in the middle of a sequence: stop after keeping what
           was decoded. */
        if (c == EOF)
            break;
    }
    /* Flush the last pending character through the same path as all
       the others, so a trailing code point above 0xff is reported as
       U+xxxx instead of being truncated to its low byte. */
    if (u1)
        put_latin1(u1);
    return 0;
}

/*
 * Look u up in the ucsxl substitution table.
 * Returns the replacement byte of the first matching entry, or u
 * itself when the table has no entry for it.
 */
unsigned long
xlate(unsigned long u)
{
    int idx = 0;

    /* The table ends at the first entry whose code point is 0. */
    while (ucsxl[idx].u != 0) {
        if (u == ucsxl[idx].u) {
            return ucsxl[idx].c;
        }
        idx++;
    }
    return u;
}

/*
 * Look the pair (u1, u2) up in the ucscm table, where u1 is a base
 * character and u2 the combining mark that followed it.
 * Returns the composed character of the first matching entry, or 0
 * when the pair is not in the table.
 */
unsigned long
combine(unsigned long u1, unsigned long u2)
{
    int idx = 0;

    /* The table ends at the first entry whose mark is 0. */
    while (ucscm[idx].u2 != 0) {
        if (u1 == ucscm[idx].u1 && u2 == ucscm[idx].u2) {
            return ucscm[idx].c;
        }
        idx++;
    }
    return 0;
}
