SphinxBase  0.6
utf8.c
1 /* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> */
2 /* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
3 
4 /* Slightly modified to use Sphinx types and remove explicit inline. */
5 
6 #include "sphinxbase/prim_type.h"
7 
8 #define UTF8_ACCEPT 0
9 #define UTF8_REJECT 1
10 
11 static const uint8 utf8d[] = {
12  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
13  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
14  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
15  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
16  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
17  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
18  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
19  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
20  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
21  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
22  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
23  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
24  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
25  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
26 };
27 
28 uint32
29 utf8_decode(uint32 *state, uint32 *codep, uint32 byte) {
30  uint32 type = utf8d[byte];
31 
32  *codep = (*state != UTF8_ACCEPT) ?
33  (byte & 0x3fu) | (*codep << 6) :
34  (0xff >> type) & (byte);
35 
36  *state = utf8d[256 + *state*16 + type];
37  return *state;
38 }
39 
40 /* CMU code starts here. */
41 /* ====================================================================
42  * Copyright (c) 2009 Carnegie Mellon University. All rights
43  * reserved.
44  *
45  * Redistribution and use in source and binary forms, with or without
46  * modification, are permitted provided that the following conditions
47  * are met:
48  *
49  * 1. Redistributions of source code must retain the above copyright
50  * notice, this list of conditions and the following disclaimer.
51  *
52  * 2. Redistributions in binary form must reproduce the above copyright
53  * notice, this list of conditions and the following disclaimer in
54  * the documentation and/or other materials provided with the
55  * distribution.
56  *
57  * This work was supported in part by funding from the Defense Advanced
58  * Research Projects Agency and the National Science Foundation of the
59  * United States of America, and the CMU Sphinx Speech Consortium.
60  *
61  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
62  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
63  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
64  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
65  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
66  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
67  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
68  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
69  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
70  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
71  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
72  *
73  * ====================================================================
74  *
75  */
Basic type definitions used in Sphinx.