#! /bin/sh

# generate CJK / Unicode mapping table from various source formats
#	{0xA140, 0x3000},
#	{0x8940, 0x2A3A9},

# ucm FILE
# Convert the mapping lines of a .ucm file into C table entries on stdout.
# A mapping line such as
#	<U0111>  \xA9\xA2 |0
# is emitted as
#	{0xA9A2, 0x0111},
# i.e. the byte sequence (with the \x prefixes removed) first, then the
# Unicode value; the rest of the line is discarded. All lines that do not
# start (after optional blanks) with "<U" are dropped.
# The output is preceded by a C comment naming FILE; a progress message
# goes to stderr.
ucm () {
	echo extracting mappings from ucm file >&2
	# printf keeps the file name verbatim regardless of how the shell's
	# echo treats backslashes
	printf '\t/* generated from %s */\n' "$1"
	# "$1" is quoted so that a path containing blanks stays one argument
	grep -E "^[ 	]*<U" "$1" |
	sed -e 's/\\x//g' -e "s/<U\([^>]*\)>[	 ]*\([^ |]*\).*/	{0x\2, 0x\1},/"
	}

# txt FILE
# Convert a libiconv-style TXT mapping file into C table entries on stdout.
# A line containing two 0x-prefixed fields separated by blanks or tabs,
# such as
#	0xA140	0x3000
# is emitted as
#	{0xA140, 0x3000},
# with anything following the second field discarded. Lines without such
# a pair are passed through unchanged.
# The output is preceded by a C comment naming FILE; a progress message
# goes to stderr.
txt () {
	echo extracting mappings from libiconv TXT file >&2
	# printf keeps the file name verbatim regardless of how the shell's
	# echo treats backslashes
	printf '\t/* generated from %s */\n' "$1"
	# "$1" is quoted so that a path containing blanks stays one argument
	sed -e "s/0x\([^ 	]*\)[ 	]*0x\([^ 	]*\).*/	{0x\1, 0x\2},/" "$1"
	}

# unihan TAG
# Extract the Unihan.txt entries for property TAG as C table entries.
# Unihan.txt is first requested from make (make's output is sent to
# stderr); if that fails, a message is printed and the shell exits with
# status 1.
# A data line of the form
#	U+<unicode><TAB>TAG<TAB><value>
# is emitted as
#	{0x<value>, 0x<unicode>},
# and every other line is suppressed. The output is preceded by a C
# comment naming the tag.
unihan () {
	make Unihan.txt >&2 || {
		echo Could not acquire Unicode data file Unihan.txt >&2
		exit 1
	}

	echo extracting mappings from Unihan data >&2
	tag=$1
	echo "	/* generated from Unihan.txt, filtering $tag */"
	# -n with the p flag prints only the lines that were rewritten
	sed -n -e "s/^U+\([^	]*\)	$tag	\([^	]*\)$/	{0x\2, 0x\1},/p" Unihan.txt
	}

# Make the helper filter script 'private' reachable through PATH by
# appending the directory this script was invoked from.
scriptdir=$(dirname "$0")
PATH=$PATH:$scriptdir

# A tag or file name argument is required. Check it here, before the
# pipeline below is started: an 'exit' inside a pipeline stage only ends
# that stage, so the remaining stages would still run and the script would
# finish with a success status.
if [ -z "$1" ]
then	echo tag missing >&2
	exit 1
fi

# Choose which leading hex digits identify a two-digit (single-byte) code
# that is to be dropped: only 0-7 for names containing "JIS", any hex
# digit otherwise.
case "$1" in
*JIS*)	filterbytes=0-7;;
*)	filterbytes=0-9A-F;;
esac
echo filtering single bytes "[$filterbytes]" >&2

# Convert the selected source format into C mapping table entries, then
#  - drop entries whose code is two characters long and starts with one of
#    the selected digits (single-byte mappings),
#  - pass the result through 'private' (intended to remove Unicode Private
#    Use mappings; that script is not part of this file),
#  - pad remaining two-digit codes to four digits,
#  - sort by the first field.
# NOTE: the 'exit 1' in unihan ends only its pipeline stage; the script's
# final status is that of the last stage (sort).
case "$1" in
*.ucm)	ucm "$1";;
*.TXT)	txt "$1";;
k*)	unihan "$1";;
big5)	"$scriptdir"/mkbig5map;;
*)	unihan "k$1";;
esac |
grep -v "{0x[$filterbytes].," | private - |
sed -e 's/0x\(..\),/0x00\1,/' | sort -k 1,1

exit

#############################################################################
# Below, character set mapping information found in the Unihan database 
# is listed. It seems to be of much less use than other table sources 
# (esp. libiconv) for the most widely-used character encodings and 
# their current extensions.

#############################################################################
#	kBigFive
#		The Big Five mapping for this character in hex; note that this does *not* cover
#			any of the Big Five extensions in common use, including the ETEN extensions.
#	kHKSCS
#		Mappings to the Big Five extended code points used for the Hong Kong
#			Supplementary Character Set

#	kGB0
#		The GB 2312-80 mapping for this character in ku/ten form

#	kJis0
#		The JIS X 0208-1990 mapping for this character in ku/ten form

#	kKSC0
#		The KS X 1001:1992 (KS C 5601-1989) mapping for this character in ku/ten form

#############################################################################
#	kGB1
#		The GB 12345-90 mapping for this character in ku/ten form
#	kPseudoGB1
#		A "GB 12345-90" code point assigned this character for the purposes
#			of including it within Unihan. Pseudo-GB1 codes were used to provide
#			official code points for characters not already in national standards,
#			such as characters used to write Cantonese, and so on.
#	kGB3
#		The GB 7589-87 mapping for this character in ku/ten form
#	kGB5
#		The GB 7590-87 mapping for this character in ku/ten form
#	kGB8
#		The GB 8565-89 mapping for this character in ku/ten form
#	kGB7
#		The "General Use Characters for Modern Chinese" mapping for this character
#	kIBMJapan
#		The IBM Japanese mapping for this character in hex
#	kJis1
#		The JIS X 0212-1990 mapping for this character in ku/ten form
#	kKPS0
#		The KP 9566-97 mapping for this character in hexadecimal form.
#	kKPS1
#		The KPS 10721-2000 mapping for this character in hexadecimal form.
#	kKSC1
#		The KS X 1002:1991 (KS C 5657-1991) mapping for this character in ku/ten form

#	KPS0, KPS1: disjoint

#############################################################################
#	kJIS0213
#		The JIS X 0213-2000 mapping for this character in min,ku,ten form
#	kCCCII
#		The CCCII mapping for this character in hex
#	kCNS1986
#		The CNS 11643-1986 mapping for this character in hex
#	kCNS1992
#		The CNS 11643-1992 mapping for this character in hex
#	kEACC
#		The EACC mapping for this character in hex

#############################################################################
