#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>

/*
 * Color definitions used when printing file names.
 *
 * Two back ends exist:
 *  - Windows console (default when _WIN32 is defined): colors are attribute
 *    bit masks built from the FOREGROUND_* bits of <windows.h>.
 *  - ANSI escape sequences (default elsewhere, or forced by defining
 *    ANSI_COLORS beforehand): colors are string literals.
 */
#ifndef ANSI_COLORS
# ifdef _WIN32
// Windows console (cmd.exe) attribute masks.
// Expansions are parenthesized so they survive use inside larger expressions.
#  include <windows.h>
#  define FOREGROUND_YELLOW (FOREGROUND_RED   | FOREGROUND_GREEN)
#  define FOREGROUND_PINK   (FOREGROUND_RED   | FOREGROUND_BLUE)
#  define FOREGROUND_CYAN   (FOREGROUND_GREEN | FOREGROUND_BLUE)
#  define NOCOLOR           (FOREGROUND_YELLOW | FOREGROUND_BLUE)
# else
#  define ANSI_COLORS
# endif
#endif

#ifdef ANSI_COLORS
// Bold ANSI colors. "\033" (octal ESC) is used instead of "\e" because
// "\e" is a compiler extension and not a standard C escape sequence.
# define FOREGROUND_RED    "\033[1;31m"
# define FOREGROUND_GREEN  "\033[1;32m"
# define FOREGROUND_YELLOW "\033[1;33m"
# define FOREGROUND_PINK   "\033[1;35m"
# define FOREGROUND_CYAN   "\033[1;36m"
# define BACKGROUND_RED    "\033[1;41m"
// No trailing semicolon: the macro must be usable inside an expression.
# define NOCOLOR           "\033[0m"
#endif

/*
 * Locate the first nonzero entry of p within the half-open index
 * range [i, stop).
 *
 * Returns the index of that entry, or -1 when every entry in the range
 * is zero (which includes the case of an empty range).
 */
int rangefind(unsigned int p[], unsigned int i, unsigned int stop){
	unsigned int idx;
	for(idx = i; idx < stop; idx++){
		if(p[idx] != 0){
			return idx;
		}
	}
	return -1;
}

/*
 * Count the leading one-bits of an octet (0 through 8).
 *
 * Read as a UTF-8 lead octet, that count is the length announced for the
 * sequence: 0 means the octet is plain ASCII, 1 means it has the form
 * 10xxxxxx (a continuation octet), 2 and above is a multibyte lead.
 *
 * The parameter is an unsigned char on purpose, so octets >= 0x80 are
 * never seen as negative values.
 */
int utf8len(unsigned char first){
	int ones = 0;
	//walk from the most significant bit down to the first zero bit
	for(unsigned int bit = 0x80; bit != 0 && (first & bit) != 0; bit >>= 1){
		ones++;
	}
	return ones;
}

/*
 * Result pair returned by the checkers isutf8() and isbin().
 * The status lives in l, the accompanying detail in c.
 */
struct fail{
	int c; // detail: the offending octet value; isutf8() stores the number of missing octets here when l == 3
	int l; // status: 0 means the check passed; nonzero values are failure codes (1-3, 7, 8 from isutf8(); 4-6 from isbin())
};

/*
 * Dump the 256 octet counters to stdout in hexadecimal, as 16 rows of
 * 16 right-aligned columns, each at least four characters wide.
 */
void printcount(unsigned int octetcount[]){
	for(int row = 0; row < 16; row++){
		for(int col = 0; col < 16; col++){
			printf("%4x", octetcount[16*row + col]);
		}
		putchar('\n');
	}
}

struct fail isutf8(FILE *fp, unsigned int octetcount[]){
	struct fail ret;
	ret.l=0;
	while((ret.c=fgetc(fp)) != EOF){
		octetcount[ret.c]++;
		if(ret.l){ //inside multibyte sequence
			if((ret.c & 0xC0) != 0x80){ //the parenthesis is vital!
				ret.l = 2;
				goto conclude;
			}
		}else{
			ret.l = utf8len(ret.c);
			switch(ret.l){ //special cases:
				case 0: //member of ascii
					ret.l = 1;
					break;
				case 1: //lost inside multibyte sequence
#ifdef POSIXLY_CORRECT
				case 7: //illegal
				case 8: //illegal
#endif
					goto conclude;
			}
		}
		ret.l--;
	}
	if(ret.l){
		ret.c = ret.l;
		ret.l = 3;
	}
conclude:
	return ret;
}

/*
 * Read octets from fp, counting each one in octetcount, until an octet
 * with the high bit set (>= 0x80) is read or the stream ends.
 *
 * Returns the non-ASCII octet converted to char, or 'A' when end of file
 * was reached first. Where plain char is signed the returned octet is a
 * negative value, so callers should convert it through unsigned char
 * before using it as an index or octet value.
 */
char nonascii(FILE *fp, unsigned int octetcount[]){
	int octet;
	while((octet = fgetc(fp)) != EOF){
		octetcount[octet]++;
		if(octet >= 0x80){
			return octet;
		}
	}
	return 'A';
}

struct fail isbin(FILE *fp, unsigned int octetcount[]){
	//Investigate 8-bit textliness so far and in rest of file
	struct fail ret;
	do{
		ret.l = nonascii(fp, octetcount); //used as iterator here
		unsigned tempt = octetcount['\t']; octetcount['\t'] = 0;
		unsigned tempn = octetcount['\n']; octetcount['\n'] = 0;
		unsigned tempr = octetcount['\r']; octetcount['\r'] = 0;
		if(tempr && tempr != tempn){
			ret.l = 4; //failed CRLF consistency heuristic
		}
		if((ret.c=rangefind(octetcount, 0, 0x20)) != -1){
			ret.l = 5; //failed control character heuristic
		}
		octetcount['\t'] = tempt;
		octetcount['\n'] = tempn;
		octetcount['\r'] = tempr;
	}while((unsigned)(ret.l) > 0x7F);

	if(ret.l == 'A'){ //8-bit text
		ret.c = rangefind(octetcount, 0x80, 0xA0);
		if(ret.c != -1){
			ret.l = 6; //failed ISO-8859-* test
		}else{
			ret.l = 0;
		}
	}
	return ret;
}

/*
 * Guess the character set of the stream fp.
 *
 * The stream is tested in stages, each stage continuing where the
 * previous one stopped: pure ASCII, then UTF-8 framing (isutf8), then
 * the 8-bit text heuristics (isbin).
 *
 * verbosity > 2 prints an explanation for every rejected stage,
 * verbosity > 3 additionally dumps the octet counters.
 *
 * Returns one of 'A' (ASCII), 'U' (UTF-8), 'I' (ISO-8859-*),
 * 'W' (Windows codepage) or 'B' (binary); the letter is lowercased when
 * at least one carriage return was counted.
 */
char guesscharset(FILE *fp, int verbosity){
	unsigned octetcount[256] = {0};
	struct fail failcode;
	unsigned char octet;
	char ans = nonascii(fp, octetcount);
	if(ans == 'A') goto conclude; //ascii

	// Push the non-ASCII octet back so that isutf8() sees it again.
	// The value must go through unsigned char: where plain char is signed
	// it is negative, which would index octetcount[] out of bounds, and
	// octet 0xFF would equal EOF, which ungetc() refuses to push back.
	octet = (unsigned char)ans;
	ungetc(octet, fp);
	octetcount[octet]--;
	if(verbosity > 2){
		//ftell() returns long, hence %ld
		printf("Not ascii: line %u, offset %ld, octet 0x%x\n",
			octetcount['\n']+1, ftell(fp), (unsigned)octet);
	}

	ans = 'U';
	failcode = isutf8(fp, octetcount);
	if(failcode.l == 0) goto conclude; //utf-8
	if(verbosity > 2){
		printf("Not utf-8: line %u, offset %ld",
			octetcount['\n']+1, ftell(fp)-1);
		//for code 3, c holds a count rather than an octet
		if(failcode.l != 3) printf(", octet 0x%x", (unsigned)failcode.c);
		fputs(": ", stdout);
		switch(failcode.l){
		case 1:
			puts("that octet is only legal in multibyte sequences.");
			break;
		case 2:
			puts("illegal octet in multibyte sequence.");
			break;
		case 3:
			printf("end of file in multibyte sequence, "
			"missing %d octets.\n", failcode.c);
			break;
#ifdef POSIXLY_CORRECT
		case 7:
		case 8:
			printf("%d octets is too many in multibyte sequence.\n", failcode.l);
			break;
#endif
		}
	}

	failcode = isbin(fp, octetcount);
	switch(failcode.l){
	case 0:
		ans = 'I'; //iso-8859-*
		goto conclude;
	case 6:
		ans = 'W'; //windows codepage
		break;
	default:
		ans = 'B'; //binary
	}
	if(verbosity > 2){
		printf("Somewhere up until line %u, offset %ld",
			octetcount['\n']+1, ftell(fp)-1);
		//code 4 is about counts, there is no single offending octet
		if(failcode.l != 4) printf(", octet 0x%x", (unsigned)failcode.c);
		fputs(": ", stdout);
		switch(failcode.l){
		case 4:
			puts("CRLF inconsistency, assuming binary.");
			break;
		case 5:
			puts("control character, assuming binary.");
			break;
		case 6:
			puts("character not in iso-8859-*.");
			break;
		}
	}

conclude:
	if(verbosity > 3) printcount(octetcount);
	if(octetcount['\r']) ans |= 0x20; //lowercase
	return ans;
}

/*
 * Print the usage text to stdout.
 *
 * The number of bits in the returned bitfield is computed from
 * sizeof(int); the size_t result is cast to unsigned long so that it
 * matches the %lu conversion on every platform (size_t is wider than
 * unsigned long on 64-bit Windows).
 */
void help(void){
	printf("is_utf8 - Check if files are valid as UTF-8.\n"
	"    Usage:\n"
	"is_utf8 [options] [-] file1 ...\n"
	"    Return value:\n"
	"Returns 0 if and only if all files are valid as UTF-8.\n"
	"Specifically, the int returned is a bitfield of ones\n"
	"for each invalid UTF-8 file, starting with the least\n"
	"significant bit for the first file. The bit position\n"
	"wraps around if the number of files is greater than\n"
	"%lu, effectively ORing overlapping bits.\n"
	"    Options:\n"
	"-h          Show this help text\n"
	"--          Do not interpret further filenames as options\n"
	"-V HOWMUCH  Set verbosity:\n"
	"            0 = quiet\n"
	"            1 = guess charset, indicate it by listing file\n"
	"                names preceded with a letter and a space:\n"
	"                \"A \" = ASCII\n"
	"                \"U \" = UTF-8\n"
	"                \"I \" = ISO-8859-*\n"
	"                \"W \" = windows-* (Windows codepage)\n"
	"                \"B \" = binary\n"
	"                \"E \" = error\n"
	"                The letter is lowercased if carriage\n"
	"                return was encountered.\n"
	"            2 = like 1, also color filenames (default)\n"
	"            3 = like 2, also explain the decision\n"
	"            4 = like 3, also print the octet counts\n"
	, (unsigned long)(8*sizeof(int))
	);
}

/*
 * Entry point: parse options, then classify every remaining argument as
 * a file with guesscharset() and print the verdict according to the
 * verbosity level.
 *
 * A lone "-" among the options makes stdin the first input. Directories
 * are skipped silently. Files that cannot be examined are reported with
 * perror().
 *
 * Returns a bitfield with one bit per file argument (wrapping around to
 * the least significant bit after the most significant one); a bit is
 * set when the file is neither ASCII nor UTF-8, or could not be read.
 */
int main(int argc, char *argv[]){
	unsigned int verbosity=2;
	//argv[argc] is the terminating NULL; do not step past it when argc is 0
	char **arg = argv + (argc > 0 ? 1 : 0);
	//unsigned: shifting a set bit into or past the sign bit of an int is
	//undefined, whereas unsigned shifts drop it and give the 0 tested below
	unsigned int bitposmask = 1;
	unsigned int ret = 0;
	FILE *fp;
	while(*arg){
		if(**arg != '-'){
			break; //first non-option argument
		}
		char *opt = *arg + 1;
		while(*opt){
			//Uppercase opts take one arg:
			if((*opt) >= 'A' && (*opt) <= 'Z'){
				arg++;
				if(*arg == NULL) goto endopt;
			}
			switch(*opt){
			case '-':
				if(*(opt+1) == '\0'){
					arg++;
					goto endopt;
				}
				break;
			case 'h':
				help();
				break;
			case 'V':
				verbosity = strtoul(*arg, NULL, 10);
				break;
			default:
				fprintf(stderr, "Unknown option: \'%c\'\n", *opt);
				break;
			}
			opt++;
		}
		//no option letters were consumed: the argument is a lone "-"
		if(opt == *arg + 1){
			fp = stdin;
			goto readstdin;
		}
		arg++;
	}
endopt:
	while(*arg){
		struct stat info;
		if(stat(*arg, &info) != 0){
			goto complain;
		}
		if(S_ISDIR(info.st_mode)){
			goto kontinju;
		}
		fp = fopen(*arg, "rb");
		if(!fp){
complain:
			perror(*arg);
			//an unreadable file is not known to be valid UTF-8, so it must
			//not leave the exit status at 0
			ret |= bitposmask;
			goto kontinju;
		}
readstdin: ;
		char ans = guesscharset(fp, verbosity);
		if((ans | 0x20) != 'a' && (ans | 0x20) != 'u') ret |= bitposmask;
		if(verbosity){
#ifdef ANSI_COLORS
			char *color="", *nocolor="";
#else
			WORD color = 0;
			HANDLE handletur = GetStdHandle(STD_OUTPUT_HANDLE);
			printf("%c ", ans);
#endif
			if(verbosity > 1){
				switch(ans | 0x20){
					case 'b': color = FOREGROUND_RED;   break;
					case 'a': color = FOREGROUND_GREEN; break;
					case 'u': color = FOREGROUND_YELLOW;break;
					case 'i': color = FOREGROUND_PINK;  break;
					case 'w': color = FOREGROUND_CYAN;  break;
					case 'e': color = BACKGROUND_RED;   break;
				}
#ifdef ANSI_COLORS
				nocolor = NOCOLOR;
#else
				color |= FOREGROUND_INTENSITY;
				SetConsoleTextAttribute(handletur, color);
				puts(*arg);
				SetConsoleTextAttribute(handletur, NOCOLOR);
#endif
			}
#ifdef ANSI_COLORS
			printf("%c %s%s%s\n", ans, color, *arg, nocolor);
#else
			else puts(*arg);
#endif
		}
		fclose(fp);
kontinju:
		arg         += 1;
		bitposmask <<= 1;
		if(bitposmask == 0) bitposmask = 1; //wrap around to lsb
	}
	return (int)ret;
}

