/* Convert UTF-8 file to an ascii file using xml- or html-style decimal entities
 * where necessary. This can be a good idea if you don't trust your editor not
 * to mangle non-ascii strings. Or to make web pages more robust against
 * servers/clients messing up the encoding of the page.
 *
 * I also have a simpler, shorter, slower perl version: utf8_to_ascii.pl
 * and a python version (faster than perl version for recent python): utf8_to_ascii.py
 * 
 * Iain Murray 8th July 2005. This is simple code that just naively follows the
 * UTF-8 RFC. I'm putting it in the public domain. The usual disclaimers apply:
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * TODO --- things that could be improved
 * I don't check for EOF as much as I should.
 * This would happen automatically if I checked that all the non-leading bytes
 * of a character are "10xxxxxx".
 * Boiler-plate should be added for command-line options and specifying files
 *
 * */

#include <string.h>
#include <stdio.h>
#include <stdlib.h>

/* Integer type that holds one decoded UCS-4 character code (the number that
 * is printed inside the "&#...;" entity). */
typedef unsigned long ucs4;

/* Report malformed UTF-8 input on stderr and terminate the program.
 *
 * Takes no arguments; declared with a real (void) prototype so that a call
 * with stray arguments is diagnosed by the compiler.
 * Does not return: abort() ends the process abnormally. */
void fail(void)
{
	fprintf(stderr,"Illegal UTF-8 string. Quitting...\n");
	abort();
}

int main()
{
	FILE *in;
	FILE *out;
	int ch;
	char ch2;
	ucs4 num;
	
	// Useful masks
	const unsigned char
	b10000000=0x80, b11000000=0xC0, b11100000=0xE0, b11110000=0xF0,
	b11111000=0xF8, b11111100=0xFC, b11111110=0xFE,
	b00000001=0x01, b00000011=0x03, b00000111=0x07, b00001111=0x0F,
	b00011111=0x1F, b00111111=0x3F, b01111111=0x7F;
	
	// Could add code to specify files here:
	in = stdin;
	out = stdout;
	
/*
	From RFC 2279 (see http://www.ietf.org/rfc/rfc2279.txt for full details)
	
	UCS-4 range (hex.)           UTF-8 octet sequence (binary)
	0000 0000-0000 007F   0xxxxxxx
	0000 0080-0000 07FF   110xxxxx 10xxxxxx
	0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
	
	0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
	
	The if..else if...else if... stuff works out which UCS-4 (just an
	integer giving the Unicode code for the character) range we are in.
	There are comments on a line by themselves saying what type of UTF-8
	octet sequence we are dealing with. The comments on the ends of lines
	have capital 'X's to say which bits are being extracted. Sorry, you'll
	need a wide terminal to read this.
*/
	
	while (1) {
		ch = fgetc(in);
		if (ch==EOF)
			break;
		if ((((char)ch)&b10000000)==0) {
				// 0xxxxxxx    ->    0000 0000-0000 007F
				if (ch==0) {
					fprintf(stderr,"NULL byte. Terminating...\n");
					abort();
				}
				fputc(ch, out);
				continue;
			}
		else if ((((char)ch)&b11100000)==b11000000) {
			// 110xxxxx 10xxxxxx    ->    0000 0080-0000 07FF
			ch2 = fgetc(in);
			if ((ch2&b11000000)==b10000000) {
				num = (((((char)ch)&b00011111))<<6) +  //110XXXXX 10xxxxxx
					      (ch2&b00111111);         //1110xxxx 10XXXXXX
			} else fail();
		}
		else if ((((char)ch)&b11110000)==b11100000) {
			// 1110xxxx 10xxxxxx 10xxxxxx     ->    0000 0800-0000 FFFF
			ch2 = fgetc(in);
			if ((ch2&b11000000)==b10000000) {
				num = ((((char)ch)&b00001111)<<12)   +   //1110XXXX 10xxxxxx 10xxxxxx
				          ((ch2&b00111111)<<6) +         //1110xxxx 10XXXXXX 10xxxxxx
				          (((char)fgetc(in))&b00111111); //1110xxxx 10xxxxxx 10XXXXXX
			} else fail();
		}
		else if ((((char)ch)&b11111000)==b11110000) {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx     ->    0001 0000-001F FFFF
			ch2 = fgetc(in);
			if ((ch2&b11000000)==b10000000) {
				num = ((((char)ch)&b00000111)<<18) +            //11110XXX 10xxxxxx 10xxxxxx 10xxxxxx
				          ((ch2&b00111111)<<12) +               //1110xxxx 10XXXXXX 10xxxxxx 10xxxxxx
				          ((((char)fgetc(in))&b00111111)<<6) +  //1110xxxx 10xxxxxx 10XXXXXX 10xxxxxx
				          (((char)fgetc(in))&b00111111);        //1110xxxx 10xxxxxx 10xxxxxx 10XXXXXX
			} else fail();
		}
		else if ((((char)ch)&b11111100)==b11111000) {
			// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx     ->    0020 0000-03FF FFFF
			ch2 = fgetc(in);
			if ((ch2&b11000000)==b10000000) {
				num = ((((char)ch)&b00000011)<<24) +            //111110XX 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				          ((ch2&b00111111)<<18) +               //1110xxxx 10XXXXXX 10xxxxxx 10xxxxxx 10xxxxxx
				          ((((char)fgetc(in))&b00111111)<<12) + //1110xxxx 10xxxxxx 10XXXXXX 10xxxxxx 10xxxxxx
				          ((((char)fgetc(in))&b00111111)<<6) +  //1110xxxx 10xxxxxx 10xxxxxx 10XXXXXX 10xxxxxx
				          (((char)fgetc(in))&b00111111);        //1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx 10XXXXXX
			} else fail();
		}
		else if ((((char)ch)&b11111110)==b11111100) {
			// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx     ->    0400 0000-7FFF FFFF
			ch2 = fgetc(in);
			if ((ch2&b11000000)==b10000000) {
				num = ((((char)ch)&b00000001)<<30) +            //1111110X 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				          ((ch2&b00111111)<<24) +               //1110xxxx 10XXXXXX 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				          ((((char)fgetc(in))&b00111111)<<18) + //1110xxxx 10xxxxxx 10XXXXXX 10xxxxxx 10xxxxxx 10xxxxxx
				          ((((char)fgetc(in))&b00111111)<<12) + //1110xxxx 10xxxxxx 10xxxxxx 10XXXXXX 10xxxxxx 10xxxxxx
				          ((((char)fgetc(in))&b00111111)<<6) +  //1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx 10XXXXXX 10xxxxxx
				          (((char)fgetc(in))&b00111111);        //1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10XXXXXX
			} else fail();
		}
		else
			fail();
		
		fprintf(out,"&#%d;",num);
	}
	
	return 0;
}