nexmon – Rev 1

Subversion Repositories:
Rev:
# This file is derived from
#
#    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
#
# which was created by Markus Kuhn <mkuhn@acm.org> - 2000-09-02
#
# Lines beginning with # and blank lines are ignored.
#
# Beyond that, this file consists of a series of test cases. Each test case consists of
# 2 or 3 lines:
#
#  1. A UTF-8 string
#  2. A status
#      VALID      : The string is a valid UTF-8 representation of valid Unicode
#      INCOMPLETE : The string has a partial character at the end
#      NOTUNICODE : The string is valid UTF-8, but the characters represented
#                   are not valid Unicode (e.g. surrogates, or values above 10ffff)
#      OVERLONG   : The string includes overlong sequences
#      MALFORMED  : The string is not valid UTF-8
#  3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
#     as a series of hex numbers.

# 1  Some correct UTF-8 text
κόσμε
VALID
03ba 1f79 03c3 03bc 03b5

# 2.1  First possible sequence of a certain length
#
# FIXME - handle NULLS?
#
# [ NULL BYTE ]
#VALID
#0000

€
VALID
0080

VALID
0800

𐀀
VALID
00010000


NOTUNICODE
00200000


NOTUNICODE
04000000


VALID
0000007f

߿
VALID
000007ff

￿
VALID
0000ffff


NOTUNICODE
001fffff


NOTUNICODE
03ffffff


NOTUNICODE
7fffffff

# 2.3  Other boundary conditions

VALID
d7ff

VALID
e000

VALID
fffd

􏿽
VALID
0010fffd

􏿿
VALID
0010ffff


NOTUNICODE
00110000

# 3.1  Unexpected continuation bytes

MALFORMED
¿
MALFORMED
€¿
MALFORMED
€¿€
MALFORMED
€¿€¿
MALFORMED
€¿€¿€
MALFORMED
€¿€¿€¿
MALFORMED
€¿€¿€¿€
MALFORMED
€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿
MALFORMED

# 3.2  Lonely start characters

À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß 
MALFORMED
à á â ã ä å æ ç è é ê ë ì í î ï 
MALFORMED
ð ñ ò ó ô õ ö ÷ 
MALFORMED
ø ù ú û 
MALFORMED
ü ý 
MALFORMED

# 3.3  Sequences with last continuation byte missing

À
INCOMPLETE
à€
INCOMPLETE
ð€€
INCOMPLETE
ø€€€
INCOMPLETE
ü€€€€
INCOMPLETE
ß
INCOMPLETE
ï¿
INCOMPLETE
÷¿¿
INCOMPLETE
û¿¿¿
INCOMPLETE
ý¿¿¿¿
INCOMPLETE

# 3.4  Concatenation of incomplete sequences

Àà€ð€€ø€€€ü€€€€ßï¿÷¿¿û¿¿¿ý¿¿¿¿
MALFORMED

# 3.5  Impossible bytes

þ
MALFORMED
ÿ
MALFORMED
þþÿÿ
MALFORMED

#  Examples of an overlong ASCII character

À¯
OVERLONG
à€¯
OVERLONG
ð€€¯
OVERLONG
ø€€€¯
OVERLONG
ü€€€€¯
OVERLONG

#  Maximum overlong sequences

Á¿
OVERLONG
àŸ¿
OVERLONG
ð¿¿
OVERLONG
ø‡¿¿¿
OVERLONG
üƒ¿¿¿¿
OVERLONG

# Overlong representation of the NUL character

À€
OVERLONG
à€€
OVERLONG
ð€€€
OVERLONG
ø€€€€
OVERLONG
ü€€€€€
OVERLONG

# Illegal code positions

# Single UTF-16 surrogates

í €
NOTUNICODE
d800

í­¿
NOTUNICODE
db7f

í®€
NOTUNICODE
db80

í¯¿
NOTUNICODE
dbff

í°€
NOTUNICODE
dc00

í¾€
NOTUNICODE
df80

í¿¿
NOTUNICODE
dfff

# Paired UTF-16 surrogates

𐀀
NOTUNICODE
d800 dc00

𐏿
NOTUNICODE
d800 dfff

í­¿í°€
NOTUNICODE
db7f dc00

í­¿í¿¿
NOTUNICODE
db7f dfff

󰀀
NOTUNICODE
db80 dc00

󰏿
NOTUNICODE
db80 dfff

􏰀
NOTUNICODE
dbff dc00

􏿿
NOTUNICODE
dbff dfff

################
#
# Some more tests, not from Markus Kuhn's file
#

# Mixed plane 0 and higher planes

A𐀀B􏿽C
VALID
41 00010000 42 10fffd 43