/* Convert UTF-8 file to an ascii file using xml- or html-style decimal entities
 * where necessary. This can be a good idea if you don't trust your editor not
 * to mangle non-ascii strings. Or to make web pages more robust against
 * servers/clients messing up the encoding of the page.
 *
 * I also have a simpler, shorter, slower perl version: utf8_to_ascii.pl
 * and a python version (faster than perl version for recent python): utf8_to_ascii.py
 *
 * Iain Murray 8th July 2005. This is simple code that just naively follows the
 * UTF-8 RFC. I'm putting it in the public domain. The usual disclaimers apply:
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * TODO --- things that could be improved
 * Boiler-plate should be added for command-line options and specifying files
 * Overlong encodings and the 5- and 6-byte forms are still accepted as-is.
 *
 * (The earlier TODO about EOF handling is done: every non-leading byte of a
 * character is now checked to be "10xxxxxx", which also catches EOF in the
 * middle of a sequence.)
 */

/* NOTE(review): the source this was restored from listed three #include
 * lines whose header names had been lost. The two below are the ones this
 * code needs; confirm whether a third header was intended. */
#include <stdio.h>
#include <stdlib.h>

/* Wide enough for every UCS-4 value this program can decode (max 0x7FFFFFFF). */
typedef unsigned long ucs4;

/* Report malformed UTF-8 on stderr and abort the process. Does not return. */
void fail(void)
{
    fprintf(stderr, "Illegal UTF-8 string. Quitting...\n");
    abort();
}

/*
 * Read one continuation byte (10xxxxxx) from `in` and return its six payload
 * bits. Calls fail() if the stream ends or the byte has any other form.
 */
static ucs4 read_continuation(FILE *in)
{
    int byte = fgetc(in);

    /* Keep the byte as int so that EOF stays distinguishable from 0xFF. */
    if (byte == EOF || (byte & 0xC0) != 0x80) {
        fail();
    }
    return (ucs4)(byte & 0x3F);
}

/*
 * Copy stdin to stdout, passing 7-bit characters through unchanged and
 * replacing every multi-byte UTF-8 sequence with a decimal entity "&#N;".
 *
 * Aborts (via abort()) on a NUL byte or on malformed UTF-8. Returns
 * EXIT_SUCCESS when the whole input was converted, EXIT_FAILURE if reading
 * the input or writing the output failed.
 *
 * From RFC 2279 (see http://www.ietf.org/rfc/rfc2279.txt for full details)
 *
 *   UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 *   0000 0000-0000 007F   0xxxxxxx
 *   0000 0080-0000 07FF   110xxxxx 10xxxxxx
 *   0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *   0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
 *
 * The if / else-if chain works out from the lead byte which row of the table
 * applies: it keeps the lead byte's payload bits and records how many
 * continuation bytes follow. Each continuation byte then contributes six
 * more bits, most significant first.
 */
int main(void)
{
    /* Could add code to specify files here: */
    FILE *in = stdin;
    FILE *out = stdout;
    int ch;

    while ((ch = fgetc(in)) != EOF) {
        ucs4 num = 0;
        int trailing = 0;

        if ((ch & 0x80) == 0x00) {
            /* 0xxxxxxx -> 0000 0000-0000 007F: plain ASCII, copy through */
            if (ch == 0) {
                fprintf(stderr, "NULL byte. Terminating...\n");
                abort();
            }
            fputc(ch, out);
            continue;
        } else if ((ch & 0xE0) == 0xC0) {
            /* 110XXXXX + 1 continuation -> 0000 0080-0000 07FF */
            num = (ucs4)(ch & 0x1F);
            trailing = 1;
        } else if ((ch & 0xF0) == 0xE0) {
            /* 1110XXXX + 2 continuations -> 0000 0800-0000 FFFF */
            num = (ucs4)(ch & 0x0F);
            trailing = 2;
        } else if ((ch & 0xF8) == 0xF0) {
            /* 11110XXX + 3 continuations -> 0001 0000-001F FFFF */
            num = (ucs4)(ch & 0x07);
            trailing = 3;
        } else if ((ch & 0xFC) == 0xF8) {
            /* 111110XX + 4 continuations -> 0020 0000-03FF FFFF */
            num = (ucs4)(ch & 0x03);
            trailing = 4;
        } else if ((ch & 0xFE) == 0xFC) {
            /* 1111110X + 5 continuations -> 0400 0000-7FFF FFFF */
            num = (ucs4)(ch & 0x01);
            trailing = 5;
        } else {
            /* 10xxxxxx as a lead byte, or 0xFE / 0xFF: never valid here. */
            fail();
        }

        /* Read the continuation bytes one at a time so their order is fixed;
         * several fgetc() calls inside one expression have no defined order. */
        for (int i = 0; i < trailing; i++) {
            num = (num << 6) | read_continuation(in);
        }

        /* num is unsigned long, so the conversion must be %lu, not %d. */
        fprintf(out, "&#%lu;", num);
    }

    /* Distinguish a clean end of input from an I/O error on either stream. */
    if (ferror(in) || fflush(out) == EOF || ferror(out)) {
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}