Sequoyah 1.0
Convert between Cherokee and ASCII transliteration
sequoyah.l
Go to the documentation of this file.
1
19/*
20 LICENSE:
21
22 This program is free software: you can redistribute it and/or modify
23 it under the terms of the GNU General Public License as published by
24 the Free Software Foundation, either version 2 of the License, or
25 (at your option) any later version.
26
27 This program is distributed in the hope that it will be useful,
28 but WITHOUT ANY WARRANTY; without even the implied warranty of
29 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 GNU General Public License for more details.
31
32 You should have received a copy of the GNU General Public License
33 along with this program. If not, see <http://www.gnu.org/licenses/>.
34*/
35
%{
#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "config.h"
42
/* Tell the flex skeleton not to emit its input() and unput() helpers. */
#define YY_NO_INPUT
#define YY_NO_UNPUT

/*
   The rest of this section contains several definitions for compatibility
   with the Unibetacode package, also by Paul Hardy.  However, the two
   packages are not currently merged in any way.
*/

/*
   Language selector values from the Beta Code encoding.
*/
#define LANG_GREEK     0x0000
#define LANG_LATIN     0x1000
#define LANG_COPTIC    0x2000
#define LANG_ARABIC    0x4000  /* Defined by Thesaurus Linguae Graecae but not in their corpus */
#define LANG_HEBREW    0x8000

/* Selector for Cherokee, the default language of this program. */
#define LANG_CHEROKEE  0x10000

int bom_out   = 0;              /* =1 to begin UTF-8 output with a UTF-8 Byte Order Mark */
int lang_type = LANG_CHEROKEE;  /* for selecting additional languages */

int doubleq_style = 6;  /* style for double quotation marks (Greek double quotes) */
int singleq_style = 7;  /* style for single quotation marks (Greek single quotes) */

/*
   State for quotation type 0 through 9, inclusive; Beta
   Code only uses quotation types 1 through 8, inclusive.

      0 = open quote not active
      1 = open quote active, so next encounter will close this quote

   Every style starts out with no quote open.
*/
int quote_state[10] = {0};
75
/*
   Unicode code point of the opening quotation mark for each
   quotation style, indexed by style number 0 through 9.
*/
int quote_open[10] = {
   0x201C,  /* 0 */  /* LANG_LATIN:  U+201C LEFT DOUBLE QUOTATION MARK */
   0x201E,  /* 1 */  /* LANG_HEBREW: U+201E DOUBLE LOW-9 QUOTATION MARK */
   0x201E,  /* 2 */  /* U+201E DOUBLE LOW-9 QUOTATION MARK (not in TLG spec) */
   0x2018,  /* 3 */  /* LANG_LATIN:  U+2018 LEFT SINGLE QUOTATION MARK;
                        alternative - U+02BB MODIFIER LETTER TURNED COMMA */
   0x201A,  /* 4 */  /* LANG_HEBREW: U+201A SINGLE LOW-9 QUOTATION MARK */
   0x2018,  /* 5 */  /* U+2018 LEFT SINGLE QUOTATION MARK (not in TLG spec) */
   0x00AB,  /* 6 */  /* LANG_GREEK, LANG_COPTIC:
                        U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
   0x02BB,  /* 7 */  /* LANG_GREEK, LANG_COPTIC:
                        U+02BB MODIFIER LETTER TURNED COMMA;
                        alternative - U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
   0x201C,  /* 8 */  /* U+201C LEFT DOUBLE QUOTATION MARK (not implemented) */
   0x0022   /* 9 */  /* U+0022 QUOTATION MARK (not implemented) */
};
94
/*
   Unicode code point of the closing quotation mark for each
   quotation style, indexed by style number 0 through 9.
*/
int quote_close[10] = {
   0x201D,  /* 0 */  /* LANG_LATIN:  U+201D RIGHT DOUBLE QUOTATION MARK */
   0x201E,  /* 1 */  /* LANG_HEBREW: U+201E DOUBLE LOW-9 QUOTATION MARK */
   0x201C,  /* 2 */  /* U+201C LEFT DOUBLE QUOTATION MARK (not paired in TLG spec) */
   0x2019,  /* 3 */  /* LANG_LATIN:  U+2019 RIGHT SINGLE QUOTATION MARK;
                        alternative - U+02BC MODIFIER LETTER APOSTROPHE */
   0x201A,  /* 4 */  /* LANG_HEBREW: U+201A SINGLE LOW-9 QUOTATION MARK */
   0x201B,  /* 5 */  /* U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK (not paired in TLG spec) */
   0x00BB,  /* 6 */  /* LANG_GREEK, LANG_COPTIC:
                        U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
   0x02BC,  /* 7 */  /* LANG_GREEK, LANG_COPTIC:
                        U+02BC MODIFIER LETTER APOSTROPHE;
                        alternative - U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
   0x201E,  /* 8 */  /* U+201E DOUBLE LOW-9 QUOTATION MARK (not implemented) */
   0x0022   /* 9 */  /* U+0022 QUOTATION MARK (not implemented) */
};
113
114
/*
   Prototypes for the output helpers used by the scanner rules.
*/
void print_ascii   (char *);            /* print a string that's inside '{'...'}' pair */
void print_unicode (char *);            /* print a Unicode code point in the form "\uXXXX" */
void print_quote   (char *);            /* print open or close quotation mark styles */
void print_utf8    (uint32_t);          /* print a Unicode code point in UTF-8 */
void print_capital (char *);            /* print Greek, Hebrew, or Coptic capital letter */
void print_small   (char *);            /* print Greek, Hebrew, or Coptic small letter */
void print_pattern (char *, uint32_t);  /* print yytext in Latin mode or print a Unicode code point in UTF-8 */

/*
   Print a letter in one of four language modes; the four
   arguments are the code points for

        Latin, Greek, Coptic, and Hebrew, respectively.

   Currently this is only needed to handle 'S' and 's'
   because of Greek context-dependent middle and final sigma.
*/
void print_letter (uint32_t, uint32_t, uint32_t, uint32_t);
131
132
/*
   Lookup tables indexed by 7-bit ASCII code, one per target script.
   They start out all zero; "{0}" zero-fills the whole array (the
   former "{128*0}" was just the single value 0, not a repeat count).
*/
uint32_t ascii2greek_capital[128] = {0};
uint32_t ascii2greek_small[128]   = {0};
uint32_t ascii2coptic[128]        = {0};
uint32_t ascii2hebrew[128]        = {0};
137
/*
   Header information for the sequoyah program.

   LANG_CHEROKEE (defined above in this section) selects
   Cherokee output, which is the default.
*/
%}

%option noyywrap

/* Named patterns: a brace-delimited escape, a quote character, a digit. */
ESCAPE          \{[^\{\}]*\}
QUOTE           (\"|`|')
DECIMAL_DIGIT   [0-9]
145
%%
{ESCAPE}    {
               /*
                  Print escape-delimited string of ASCII and/or
                  special Unicode symbols of the form "\ux...x".

                  The closing '}' is overwritten with a terminator and
                  the opening '{' is skipped, so print_ascii receives
                  only the text between the braces.
               */
               yytext [strlen (yytext) - 1] = '\0';
               print_ascii (&yytext[1]);
            }
155
156 /* Convert transliterated ASCII to UTF-8 Cherokee. */
157A print_utf8 (0x13A0); /* CHEROKEE LETTER A */
158E print_utf8 (0x13A1); /* CHEROKEE LETTER E */
159I print_utf8 (0x13A2); /* CHEROKEE LETTER I */
160O print_utf8 (0x13A3); /* CHEROKEE LETTER O */
161U print_utf8 (0x13A4); /* CHEROKEE LETTER U */
162V print_utf8 (0x13A5); /* CHEROKEE LETTER V */
163GA print_utf8 (0x13A6); /* CHEROKEE LETTER GA */
164Ga print_utf8 (0x13A6); /* CHEROKEE LETTER GA */
165KA print_utf8 (0x13A7); /* CHEROKEE LETTER KA */
166Ka print_utf8 (0x13A7); /* CHEROKEE LETTER KA */
167GE print_utf8 (0x13A8); /* CHEROKEE LETTER GE */
168Ge print_utf8 (0x13A8); /* CHEROKEE LETTER GE */
169GI print_utf8 (0x13A9); /* CHEROKEE LETTER GI */
170Gi print_utf8 (0x13A9); /* CHEROKEE LETTER GI */
171GO print_utf8 (0x13AA); /* CHEROKEE LETTER GO */
172Go print_utf8 (0x13AA); /* CHEROKEE LETTER GO */
173GU print_utf8 (0x13AB); /* CHEROKEE LETTER GU */
174Gu print_utf8 (0x13AB); /* CHEROKEE LETTER GU */
175GV print_utf8 (0x13AC); /* CHEROKEE LETTER GV */
176Gv print_utf8 (0x13AC); /* CHEROKEE LETTER GV */
177HA print_utf8 (0x13AD); /* CHEROKEE LETTER HA */
178Ha print_utf8 (0x13AD); /* CHEROKEE LETTER HA */
179HE print_utf8 (0x13AE); /* CHEROKEE LETTER HE */
180He print_utf8 (0x13AE); /* CHEROKEE LETTER HE */
181HI print_utf8 (0x13AF); /* CHEROKEE LETTER HI */
182Hi print_utf8 (0x13AF); /* CHEROKEE LETTER HI */
183HO print_utf8 (0x13B0); /* CHEROKEE LETTER HO */
184Ho print_utf8 (0x13B0); /* CHEROKEE LETTER HO */
185HU print_utf8 (0x13B1); /* CHEROKEE LETTER HU */
186Hu print_utf8 (0x13B1); /* CHEROKEE LETTER HU */
187HV print_utf8 (0x13B2); /* CHEROKEE LETTER HV */
188Hv print_utf8 (0x13B2); /* CHEROKEE LETTER HV */
189LA print_utf8 (0x13B3); /* CHEROKEE LETTER LA */
190La print_utf8 (0x13B3); /* CHEROKEE LETTER LA */
191LE print_utf8 (0x13B4); /* CHEROKEE LETTER LE */
192Le print_utf8 (0x13B4); /* CHEROKEE LETTER LE */
193LI print_utf8 (0x13B5); /* CHEROKEE LETTER LI */
194Li print_utf8 (0x13B5); /* CHEROKEE LETTER LI */
195LO print_utf8 (0x13B6); /* CHEROKEE LETTER LO */
196Lo print_utf8 (0x13B6); /* CHEROKEE LETTER LO */
197LU print_utf8 (0x13B7); /* CHEROKEE LETTER LU */
198Lu print_utf8 (0x13B7); /* CHEROKEE LETTER LU */
199LV print_utf8 (0x13B8); /* CHEROKEE LETTER LV */
200Lv print_utf8 (0x13B8); /* CHEROKEE LETTER LV */
201MA print_utf8 (0x13B9); /* CHEROKEE LETTER MA */
202Ma print_utf8 (0x13B9); /* CHEROKEE LETTER MA */
203ME print_utf8 (0x13BA); /* CHEROKEE LETTER ME */
204Me print_utf8 (0x13BA); /* CHEROKEE LETTER ME */
205MI print_utf8 (0x13BB); /* CHEROKEE LETTER MI */
206Mi print_utf8 (0x13BB); /* CHEROKEE LETTER MI */
207MO print_utf8 (0x13BC); /* CHEROKEE LETTER MO */
208Mo print_utf8 (0x13BC); /* CHEROKEE LETTER MO */
209MU print_utf8 (0x13BD); /* CHEROKEE LETTER MU */
210Mu print_utf8 (0x13BD); /* CHEROKEE LETTER MU */
211NA print_utf8 (0x13BE); /* CHEROKEE LETTER NA */
212Na print_utf8 (0x13BE); /* CHEROKEE LETTER NA */
213HNA print_utf8 (0x13BF); /* CHEROKEE LETTER HNA */
214Hna print_utf8 (0x13BF); /* CHEROKEE LETTER HNA */
215NAH print_utf8 (0x13C0); /* CHEROKEE LETTER NAH */
216Nah print_utf8 (0x13C0); /* CHEROKEE LETTER NAH */
217NE print_utf8 (0x13C1); /* CHEROKEE LETTER NE */
218Ne print_utf8 (0x13C1); /* CHEROKEE LETTER NE */
219NI print_utf8 (0x13C2); /* CHEROKEE LETTER NI */
220Ni print_utf8 (0x13C2); /* CHEROKEE LETTER NI */
221NO print_utf8 (0x13C3); /* CHEROKEE LETTER NO */
222No print_utf8 (0x13C3); /* CHEROKEE LETTER NO */
223NU print_utf8 (0x13C4); /* CHEROKEE LETTER NU */
224Nu print_utf8 (0x13C4); /* CHEROKEE LETTER NU */
225NV print_utf8 (0x13C5); /* CHEROKEE LETTER NV */
226Nv print_utf8 (0x13C5); /* CHEROKEE LETTER NV */
227QUA print_utf8 (0x13C6); /* CHEROKEE LETTER QUA */
228Qua print_utf8 (0x13C6); /* CHEROKEE LETTER QUA */
229QUE print_utf8 (0x13C7); /* CHEROKEE LETTER QUE */
230Que print_utf8 (0x13C7); /* CHEROKEE LETTER QUE */
231QUI print_utf8 (0x13C8); /* CHEROKEE LETTER QUI */
232Qui print_utf8 (0x13C8); /* CHEROKEE LETTER QUI */
233QUO print_utf8 (0x13C9); /* CHEROKEE LETTER QUO */
234Quo print_utf8 (0x13C9); /* CHEROKEE LETTER QUO */
235QUU print_utf8 (0x13CA); /* CHEROKEE LETTER QUU */
236Quu print_utf8 (0x13CA); /* CHEROKEE LETTER QUU */
237QUV print_utf8 (0x13CB); /* CHEROKEE LETTER QUV */
238Quv print_utf8 (0x13CB); /* CHEROKEE LETTER QUV */
239SA print_utf8 (0x13CC); /* CHEROKEE LETTER SA */
240Sa print_utf8 (0x13CC); /* CHEROKEE LETTER SA */
241S print_utf8 (0x13CD); /* CHEROKEE LETTER S */
242SE print_utf8 (0x13CE); /* CHEROKEE LETTER SE */
243Se print_utf8 (0x13CE); /* CHEROKEE LETTER SE */
244SI print_utf8 (0x13CF); /* CHEROKEE LETTER SI */
245Si print_utf8 (0x13CF); /* CHEROKEE LETTER SI */
246SO print_utf8 (0x13D0); /* CHEROKEE LETTER SO */
247So print_utf8 (0x13D0); /* CHEROKEE LETTER SO */
248SU print_utf8 (0x13D1); /* CHEROKEE LETTER SU */
249Su print_utf8 (0x13D1); /* CHEROKEE LETTER SU */
250SV print_utf8 (0x13D2); /* CHEROKEE LETTER SV */
251Sv print_utf8 (0x13D2); /* CHEROKEE LETTER SV */
252DA print_utf8 (0x13D3); /* CHEROKEE LETTER DA */
253Da print_utf8 (0x13D3); /* CHEROKEE LETTER DA */
254TA print_utf8 (0x13D4); /* CHEROKEE LETTER TA */
255Ta print_utf8 (0x13D4); /* CHEROKEE LETTER TA */
256DE print_utf8 (0x13D5); /* CHEROKEE LETTER DE */
257De print_utf8 (0x13D5); /* CHEROKEE LETTER DE */
258TE print_utf8 (0x13D6); /* CHEROKEE LETTER TE */
259Te print_utf8 (0x13D6); /* CHEROKEE LETTER TE */
260DI print_utf8 (0x13D7); /* CHEROKEE LETTER DI */
261Di print_utf8 (0x13D7); /* CHEROKEE LETTER DI */
262TI print_utf8 (0x13D8); /* CHEROKEE LETTER TI */
263Ti print_utf8 (0x13D8); /* CHEROKEE LETTER TI */
264DO print_utf8 (0x13D9); /* CHEROKEE LETTER DO */
265Do print_utf8 (0x13D9); /* CHEROKEE LETTER DO */
266DU print_utf8 (0x13DA); /* CHEROKEE LETTER DU */
267Du print_utf8 (0x13DA); /* CHEROKEE LETTER DU */
268DV print_utf8 (0x13DB); /* CHEROKEE LETTER DV */
269Dv print_utf8 (0x13DB); /* CHEROKEE LETTER DV */
270DLA print_utf8 (0x13DC); /* CHEROKEE LETTER DLA */
271Dla print_utf8 (0x13DC); /* CHEROKEE LETTER DLA */
272TLA print_utf8 (0x13DD); /* CHEROKEE LETTER TLA */
273Tla print_utf8 (0x13DD); /* CHEROKEE LETTER TLA */
274TLE print_utf8 (0x13DE); /* CHEROKEE LETTER TLE */
275Tle print_utf8 (0x13DE); /* CHEROKEE LETTER TLE */
276TLI print_utf8 (0x13DF); /* CHEROKEE LETTER TLI */
277Tli print_utf8 (0x13DF); /* CHEROKEE LETTER TLI */
278TLO print_utf8 (0x13E0); /* CHEROKEE LETTER TLO */
279Tlo print_utf8 (0x13E0); /* CHEROKEE LETTER TLO */
280TLU print_utf8 (0x13E1); /* CHEROKEE LETTER TLU */
281Tlu print_utf8 (0x13E1); /* CHEROKEE LETTER TLU */
282TLV print_utf8 (0x13E2); /* CHEROKEE LETTER TLV */
283Tlv print_utf8 (0x13E2); /* CHEROKEE LETTER TLV */
284TSA print_utf8 (0x13E3); /* CHEROKEE LETTER TSA */
285Tsa print_utf8 (0x13E3); /* CHEROKEE LETTER TSA */
286TSE print_utf8 (0x13E4); /* CHEROKEE LETTER TSE */
287Tse print_utf8 (0x13E4); /* CHEROKEE LETTER TSE */
288TSI print_utf8 (0x13E5); /* CHEROKEE LETTER TSI */
289Tsi print_utf8 (0x13E5); /* CHEROKEE LETTER TSI */
290TSO print_utf8 (0x13E6); /* CHEROKEE LETTER TSO */
291Tso print_utf8 (0x13E6); /* CHEROKEE LETTER TSO */
292TSU print_utf8 (0x13E7); /* CHEROKEE LETTER TSU */
293Tsu print_utf8 (0x13E7); /* CHEROKEE LETTER TSU */
294TSV print_utf8 (0x13E8); /* CHEROKEE LETTER TSV */
295Tsv print_utf8 (0x13E8); /* CHEROKEE LETTER TSV */
296WA print_utf8 (0x13E9); /* CHEROKEE LETTER WA */
297Wa print_utf8 (0x13E9); /* CHEROKEE LETTER WA */
298WE print_utf8 (0x13EA); /* CHEROKEE LETTER WE */
299We print_utf8 (0x13EA); /* CHEROKEE LETTER WE */
300WI print_utf8 (0x13EB); /* CHEROKEE LETTER WI */
301Wi print_utf8 (0x13EB); /* CHEROKEE LETTER WI */
302WO print_utf8 (0x13EC); /* CHEROKEE LETTER WO */
303Wo print_utf8 (0x13EC); /* CHEROKEE LETTER WO */
304WU print_utf8 (0x13ED); /* CHEROKEE LETTER WU */
305Wu print_utf8 (0x13ED); /* CHEROKEE LETTER WU */
306WV print_utf8 (0x13EE); /* CHEROKEE LETTER WV */
307Wv print_utf8 (0x13EE); /* CHEROKEE LETTER WV */
308YA print_utf8 (0x13EF); /* CHEROKEE LETTER YA */
309Ya print_utf8 (0x13EF); /* CHEROKEE LETTER YA */
310YE print_utf8 (0x13F0); /* CHEROKEE LETTER YE */
311Ye print_utf8 (0x13F0); /* CHEROKEE LETTER YE */
312YI print_utf8 (0x13F1); /* CHEROKEE LETTER YI */
313Yi print_utf8 (0x13F1); /* CHEROKEE LETTER YI */
314YO print_utf8 (0x13F2); /* CHEROKEE LETTER YO */
315Yo print_utf8 (0x13F2); /* CHEROKEE LETTER YO */
316YU print_utf8 (0x13F3); /* CHEROKEE LETTER YU */
317Yu print_utf8 (0x13F3); /* CHEROKEE LETTER YU */
318YV print_utf8 (0x13F4); /* CHEROKEE LETTER YV */
319Yv print_utf8 (0x13F4); /* CHEROKEE LETTER YV */
320MV print_utf8 (0x13F5); /* CHEROKEE LETTER MV */
321Mv print_utf8 (0x13F5); /* CHEROKEE LETTER MV */
322ye print_utf8 (0x13F8); /* CHEROKEE SMALL LETTER YE */
323yi print_utf8 (0x13F9); /* CHEROKEE SMALL LETTER YI */
324yo print_utf8 (0x13FA); /* CHEROKEE SMALL LETTER YO */
325yu print_utf8 (0x13FB); /* CHEROKEE SMALL LETTER YU */
326yv print_utf8 (0x13FC); /* CHEROKEE SMALL LETTER YV */
327mv print_utf8 (0x13FD); /* CHEROKEE SMALL LETTER MV */
328a print_utf8 (0xAB70); /* CHEROKEE SMALL LETTER A */
329e print_utf8 (0xAB71); /* CHEROKEE SMALL LETTER E */
330i print_utf8 (0xAB72); /* CHEROKEE SMALL LETTER I */
331o print_utf8 (0xAB73); /* CHEROKEE SMALL LETTER O */
332u print_utf8 (0xAB74); /* CHEROKEE SMALL LETTER U */
333v print_utf8 (0xAB75); /* CHEROKEE SMALL LETTER V */
334ga print_utf8 (0xAB76); /* CHEROKEE SMALL LETTER GA */
335ka print_utf8 (0xAB77); /* CHEROKEE SMALL LETTER KA */
336ge print_utf8 (0xAB78); /* CHEROKEE SMALL LETTER GE */
337gi print_utf8 (0xAB79); /* CHEROKEE SMALL LETTER GI */
338go print_utf8 (0xAB7A); /* CHEROKEE SMALL LETTER GO */
339gu print_utf8 (0xAB7B); /* CHEROKEE SMALL LETTER GU */
340gv print_utf8 (0xAB7C); /* CHEROKEE SMALL LETTER GV */
341ha print_utf8 (0xAB7D); /* CHEROKEE SMALL LETTER HA */
342he print_utf8 (0xAB7E); /* CHEROKEE SMALL LETTER HE */
343hi print_utf8 (0xAB7F); /* CHEROKEE SMALL LETTER HI */
344ho print_utf8 (0xAB80); /* CHEROKEE SMALL LETTER HO */
345hu print_utf8 (0xAB81); /* CHEROKEE SMALL LETTER HU */
346hv print_utf8 (0xAB82); /* CHEROKEE SMALL LETTER HV */
347la print_utf8 (0xAB83); /* CHEROKEE SMALL LETTER LA */
348le print_utf8 (0xAB84); /* CHEROKEE SMALL LETTER LE */
349li print_utf8 (0xAB85); /* CHEROKEE SMALL LETTER LI */
350lo print_utf8 (0xAB86); /* CHEROKEE SMALL LETTER LO */
351lu print_utf8 (0xAB87); /* CHEROKEE SMALL LETTER LU */
352lv print_utf8 (0xAB88); /* CHEROKEE SMALL LETTER LV */
353ma print_utf8 (0xAB89); /* CHEROKEE SMALL LETTER MA */
354me print_utf8 (0xAB8A); /* CHEROKEE SMALL LETTER ME */
355mi print_utf8 (0xAB8B); /* CHEROKEE SMALL LETTER MI */
356mo print_utf8 (0xAB8C); /* CHEROKEE SMALL LETTER MO */
357mu print_utf8 (0xAB8D); /* CHEROKEE SMALL LETTER MU */
358na print_utf8 (0xAB8E); /* CHEROKEE SMALL LETTER NA */
359hna print_utf8 (0xAB8F); /* CHEROKEE SMALL LETTER HNA */
360nah print_utf8 (0xAB90); /* CHEROKEE SMALL LETTER NAH */
361ne print_utf8 (0xAB91); /* CHEROKEE SMALL LETTER NE */
362ni print_utf8 (0xAB92); /* CHEROKEE SMALL LETTER NI */
363no print_utf8 (0xAB93); /* CHEROKEE SMALL LETTER NO */
364nu print_utf8 (0xAB94); /* CHEROKEE SMALL LETTER NU */
365nv print_utf8 (0xAB95); /* CHEROKEE SMALL LETTER NV */
366qua print_utf8 (0xAB96); /* CHEROKEE SMALL LETTER QUA */
367que print_utf8 (0xAB97); /* CHEROKEE SMALL LETTER QUE */
368qui print_utf8 (0xAB98); /* CHEROKEE SMALL LETTER QUI */
369quo print_utf8 (0xAB99); /* CHEROKEE SMALL LETTER QUO */
370quu print_utf8 (0xAB9A); /* CHEROKEE SMALL LETTER QUU */
371quv print_utf8 (0xAB9B); /* CHEROKEE SMALL LETTER QUV */
372sa print_utf8 (0xAB9C); /* CHEROKEE SMALL LETTER SA */
373s print_utf8 (0xAB9D); /* CHEROKEE SMALL LETTER S */
374se print_utf8 (0xAB9E); /* CHEROKEE SMALL LETTER SE */
375si print_utf8 (0xAB9F); /* CHEROKEE SMALL LETTER SI */
376so print_utf8 (0xABA0); /* CHEROKEE SMALL LETTER SO */
377su print_utf8 (0xABA1); /* CHEROKEE SMALL LETTER SU */
378sv print_utf8 (0xABA2); /* CHEROKEE SMALL LETTER SV */
379da print_utf8 (0xABA3); /* CHEROKEE SMALL LETTER DA */
380ta print_utf8 (0xABA4); /* CHEROKEE SMALL LETTER TA */
381de print_utf8 (0xABA5); /* CHEROKEE SMALL LETTER DE */
382te print_utf8 (0xABA6); /* CHEROKEE SMALL LETTER TE */
383di print_utf8 (0xABA7); /* CHEROKEE SMALL LETTER DI */
384ti print_utf8 (0xABA8); /* CHEROKEE SMALL LETTER TI */
385do print_utf8 (0xABA9); /* CHEROKEE SMALL LETTER DO */
386du print_utf8 (0xABAA); /* CHEROKEE SMALL LETTER DU */
387dv print_utf8 (0xABAB); /* CHEROKEE SMALL LETTER DV */
388dla print_utf8 (0xABAC); /* CHEROKEE SMALL LETTER DLA */
389tla print_utf8 (0xABAD); /* CHEROKEE SMALL LETTER TLA */
390tle print_utf8 (0xABAE); /* CHEROKEE SMALL LETTER TLE */
391tli print_utf8 (0xABAF); /* CHEROKEE SMALL LETTER TLI */
392tlo print_utf8 (0xABB0); /* CHEROKEE SMALL LETTER TLO */
393tlu print_utf8 (0xABB1); /* CHEROKEE SMALL LETTER TLU */
394tlv print_utf8 (0xABB2); /* CHEROKEE SMALL LETTER TLV */
395tsa print_utf8 (0xABB3); /* CHEROKEE SMALL LETTER TSA */
396tse print_utf8 (0xABB4); /* CHEROKEE SMALL LETTER TSE */
397tsi print_utf8 (0xABB5); /* CHEROKEE SMALL LETTER TSI */
398tso print_utf8 (0xABB6); /* CHEROKEE SMALL LETTER TSO */
399tsu print_utf8 (0xABB7); /* CHEROKEE SMALL LETTER TSU */
400tsv print_utf8 (0xABB8); /* CHEROKEE SMALL LETTER TSV */
401wa print_utf8 (0xABB9); /* CHEROKEE SMALL LETTER WA */
402we print_utf8 (0xABBA); /* CHEROKEE SMALL LETTER WE */
403wi print_utf8 (0xABBB); /* CHEROKEE SMALL LETTER WI */
404wo print_utf8 (0xABBC); /* CHEROKEE SMALL LETTER WO */
405wu print_utf8 (0xABBD); /* CHEROKEE SMALL LETTER WU */
406wv print_utf8 (0xABBE); /* CHEROKEE SMALL LETTER WV */
407ya print_utf8 (0xABBF); /* CHEROKEE SMALL LETTER YA */
408
409 /* Convert UTF-8 Cherokee to transliterated ASCII. */
410\341\216\240 fprintf (yyout, "A"); /* U+13A0 CHEROKEE LETTER A */
411\341\216\241 fprintf (yyout, "E"); /* U+13A1 CHEROKEE LETTER E */
412\341\216\242 fprintf (yyout, "I"); /* U+13A2 CHEROKEE LETTER I */
413\341\216\243 fprintf (yyout, "O"); /* U+13A3 CHEROKEE LETTER O */
414\341\216\244 fprintf (yyout, "U"); /* U+13A4 CHEROKEE LETTER U */
415\341\216\245 fprintf (yyout, "V"); /* U+13A5 CHEROKEE LETTER V */
416\341\216\246 fprintf (yyout, "GA"); /* U+13A6 CHEROKEE LETTER GA */
417\341\216\247 fprintf (yyout, "KA"); /* U+13A7 CHEROKEE LETTER KA */
418\341\216\250 fprintf (yyout, "GE"); /* U+13A8 CHEROKEE LETTER GE */
419\341\216\251 fprintf (yyout, "GI"); /* U+13A9 CHEROKEE LETTER GI */
420\341\216\252 fprintf (yyout, "GO"); /* U+13AA CHEROKEE LETTER GO */
421\341\216\253 fprintf (yyout, "GU"); /* U+13AB CHEROKEE LETTER GU */
422\341\216\254 fprintf (yyout, "GV"); /* U+13AC CHEROKEE LETTER GV */
423\341\216\255 fprintf (yyout, "HA"); /* U+13AD CHEROKEE LETTER HA */
424\341\216\256 fprintf (yyout, "HE"); /* U+13AE CHEROKEE LETTER HE */
425\341\216\257 fprintf (yyout, "HI"); /* U+13AF CHEROKEE LETTER HI */
426\341\216\260 fprintf (yyout, "HO"); /* U+13B0 CHEROKEE LETTER HO */
427\341\216\261 fprintf (yyout, "HU"); /* U+13B1 CHEROKEE LETTER HU */
428\341\216\262 fprintf (yyout, "HV"); /* U+13B2 CHEROKEE LETTER HV */
429\341\216\263 fprintf (yyout, "LA"); /* U+13B3 CHEROKEE LETTER LA */
430\341\216\264 fprintf (yyout, "LE"); /* U+13B4 CHEROKEE LETTER LE */
431\341\216\265 fprintf (yyout, "LI"); /* U+13B5 CHEROKEE LETTER LI */
432\341\216\266 fprintf (yyout, "LO"); /* U+13B6 CHEROKEE LETTER LO */
433\341\216\267 fprintf (yyout, "LU"); /* U+13B7 CHEROKEE LETTER LU */
434\341\216\270 fprintf (yyout, "LV"); /* U+13B8 CHEROKEE LETTER LV */
435\341\216\271 fprintf (yyout, "MA"); /* U+13B9 CHEROKEE LETTER MA */
436\341\216\272 fprintf (yyout, "ME"); /* U+13BA CHEROKEE LETTER ME */
437\341\216\273 fprintf (yyout, "MI"); /* U+13BB CHEROKEE LETTER MI */
438\341\216\274 fprintf (yyout, "MO"); /* U+13BC CHEROKEE LETTER MO */
439\341\216\275 fprintf (yyout, "MU"); /* U+13BD CHEROKEE LETTER MU */
440\341\216\276 fprintf (yyout, "NA"); /* U+13BE CHEROKEE LETTER NA */
441\341\216\277 fprintf (yyout, "HNA"); /* U+13BF CHEROKEE LETTER HNA */
442\341\217\200 fprintf (yyout, "NAH"); /* U+13C0 CHEROKEE LETTER NAH */
443\341\217\201 fprintf (yyout, "NE"); /* U+13C1 CHEROKEE LETTER NE */
444\341\217\202 fprintf (yyout, "NI"); /* U+13C2 CHEROKEE LETTER NI */
445\341\217\203 fprintf (yyout, "NO"); /* U+13C3 CHEROKEE LETTER NO */
446\341\217\204 fprintf (yyout, "NU"); /* U+13C4 CHEROKEE LETTER NU */
447\341\217\205 fprintf (yyout, "NV"); /* U+13C5 CHEROKEE LETTER NV */
448\341\217\206 fprintf (yyout, "QUA"); /* U+13C6 CHEROKEE LETTER QUA */
449\341\217\207 fprintf (yyout, "QUE"); /* U+13C7 CHEROKEE LETTER QUE */
450\341\217\210 fprintf (yyout, "QUI"); /* U+13C8 CHEROKEE LETTER QUI */
451\341\217\211 fprintf (yyout, "QUO"); /* U+13C9 CHEROKEE LETTER QUO */
452\341\217\212 fprintf (yyout, "QUU"); /* U+13CA CHEROKEE LETTER QUU */
453\341\217\213 fprintf (yyout, "QUV"); /* U+13CB CHEROKEE LETTER QUV */
454\341\217\214 fprintf (yyout, "SA"); /* U+13CC CHEROKEE LETTER SA */
455\341\217\215 fprintf (yyout, "S"); /* U+13CD CHEROKEE LETTER S */
456\341\217\216 fprintf (yyout, "SE"); /* U+13CE CHEROKEE LETTER SE */
457\341\217\217 fprintf (yyout, "SI"); /* U+13CF CHEROKEE LETTER SI */
458\341\217\220 fprintf (yyout, "SO"); /* U+13D0 CHEROKEE LETTER SO */
459\341\217\221 fprintf (yyout, "SU"); /* U+13D1 CHEROKEE LETTER SU */
460\341\217\222 fprintf (yyout, "SV"); /* U+13D2 CHEROKEE LETTER SV */
461\341\217\223 fprintf (yyout, "DA"); /* U+13D3 CHEROKEE LETTER DA */
462\341\217\224 fprintf (yyout, "TA"); /* U+13D4 CHEROKEE LETTER TA */
463\341\217\225 fprintf (yyout, "DE"); /* U+13D5 CHEROKEE LETTER DE */
464\341\217\226 fprintf (yyout, "TE"); /* U+13D6 CHEROKEE LETTER TE */
465\341\217\227 fprintf (yyout, "DI"); /* U+13D7 CHEROKEE LETTER DI */
466\341\217\230 fprintf (yyout, "TI"); /* U+13D8 CHEROKEE LETTER TI */
467\341\217\231 fprintf (yyout, "DO"); /* U+13D9 CHEROKEE LETTER DO */
468\341\217\232 fprintf (yyout, "DU"); /* U+13DA CHEROKEE LETTER DU */
469\341\217\233 fprintf (yyout, "DV"); /* U+13DB CHEROKEE LETTER DV */
470\341\217\234 fprintf (yyout, "DLA"); /* U+13DC CHEROKEE LETTER DLA */
471\341\217\235 fprintf (yyout, "TLA"); /* U+13DD CHEROKEE LETTER TLA */
472\341\217\236 fprintf (yyout, "TLE"); /* U+13DE CHEROKEE LETTER TLE */
473\341\217\237 fprintf (yyout, "TLI"); /* U+13DF CHEROKEE LETTER TLI */
474\341\217\240 fprintf (yyout, "TLO"); /* U+13E0 CHEROKEE LETTER TLO */
475\341\217\241 fprintf (yyout, "TLU"); /* U+13E1 CHEROKEE LETTER TLU */
476\341\217\242 fprintf (yyout, "TLV"); /* U+13E2 CHEROKEE LETTER TLV */
477\341\217\243 fprintf (yyout, "TSA"); /* U+13E3 CHEROKEE LETTER TSA */
478\341\217\244 fprintf (yyout, "TSE"); /* U+13E4 CHEROKEE LETTER TSE */
479\341\217\245 fprintf (yyout, "TSI"); /* U+13E5 CHEROKEE LETTER TSI */
480\341\217\246 fprintf (yyout, "TSO"); /* U+13E6 CHEROKEE LETTER TSO */
481\341\217\247 fprintf (yyout, "TSU"); /* U+13E7 CHEROKEE LETTER TSU */
482\341\217\250 fprintf (yyout, "TSV"); /* U+13E8 CHEROKEE LETTER TSV */
483\341\217\251 fprintf (yyout, "WA"); /* U+13E9 CHEROKEE LETTER WA */
484\341\217\252 fprintf (yyout, "WE"); /* U+13EA CHEROKEE LETTER WE */
485\341\217\253 fprintf (yyout, "WI"); /* U+13EB CHEROKEE LETTER WI */
486\341\217\254 fprintf (yyout, "WO"); /* U+13EC CHEROKEE LETTER WO */
487\341\217\255 fprintf (yyout, "WU"); /* U+13ED CHEROKEE LETTER WU */
488\341\217\256 fprintf (yyout, "WV"); /* U+13EE CHEROKEE LETTER WV */
489\341\217\257 fprintf (yyout, "YA"); /* U+13EF CHEROKEE LETTER YA */
490\341\217\260 fprintf (yyout, "YE"); /* U+13F0 CHEROKEE LETTER YE */
491\341\217\261 fprintf (yyout, "YI"); /* U+13F1 CHEROKEE LETTER YI */
492\341\217\262 fprintf (yyout, "YO"); /* U+13F2 CHEROKEE LETTER YO */
493\341\217\263 fprintf (yyout, "YU"); /* U+13F3 CHEROKEE LETTER YU */
494\341\217\264 fprintf (yyout, "YV"); /* U+13F4 CHEROKEE LETTER YV */
495\341\217\265 fprintf (yyout, "MV"); /* U+13F5 CHEROKEE LETTER MV */
496\341\217\270 fprintf (yyout, "ye"); /* U+13F8 CHEROKEE SMALL LETTER YE */
497\341\217\271 fprintf (yyout, "yi"); /* U+13F9 CHEROKEE SMALL LETTER YI */
498\341\217\272 fprintf (yyout, "yo"); /* U+13FA CHEROKEE SMALL LETTER YO */
499\341\217\273 fprintf (yyout, "yu"); /* U+13FB CHEROKEE SMALL LETTER YU */
500\341\217\274 fprintf (yyout, "yv"); /* U+13FC CHEROKEE SMALL LETTER YV */
501\341\217\275 fprintf (yyout, "mv"); /* U+13FD CHEROKEE SMALL LETTER MV */
502\352\255\260 fprintf (yyout, "a"); /* U+AB70 CHEROKEE SMALL LETTER A */
503\352\255\261 fprintf (yyout, "e"); /* U+AB71 CHEROKEE SMALL LETTER E */
504\352\255\262 fprintf (yyout, "i"); /* U+AB72 CHEROKEE SMALL LETTER I */
505\352\255\263 fprintf (yyout, "o"); /* U+AB73 CHEROKEE SMALL LETTER O */
506\352\255\264 fprintf (yyout, "u"); /* U+AB74 CHEROKEE SMALL LETTER U */
507\352\255\265 fprintf (yyout, "v"); /* U+AB75 CHEROKEE SMALL LETTER V */
508\352\255\266 fprintf (yyout, "ga"); /* U+AB76 CHEROKEE SMALL LETTER GA */
509\352\255\267 fprintf (yyout, "ka"); /* U+AB77 CHEROKEE SMALL LETTER KA */
510\352\255\270 fprintf (yyout, "ge"); /* U+AB78 CHEROKEE SMALL LETTER GE */
511\352\255\271 fprintf (yyout, "gi"); /* U+AB79 CHEROKEE SMALL LETTER GI */
512\352\255\272 fprintf (yyout, "go"); /* U+AB7A CHEROKEE SMALL LETTER GO */
513\352\255\273 fprintf (yyout, "gu"); /* U+AB7B CHEROKEE SMALL LETTER GU */
514\352\255\274 fprintf (yyout, "gv"); /* U+AB7C CHEROKEE SMALL LETTER GV */
515\352\255\275 fprintf (yyout, "ha"); /* U+AB7D CHEROKEE SMALL LETTER HA */
516\352\255\276 fprintf (yyout, "he"); /* U+AB7E CHEROKEE SMALL LETTER HE */
517\352\255\277 fprintf (yyout, "hi"); /* U+AB7F CHEROKEE SMALL LETTER HI */
518\352\256\200 fprintf (yyout, "ho"); /* U+AB80 CHEROKEE SMALL LETTER HO */
519\352\256\201 fprintf (yyout, "hu"); /* U+AB81 CHEROKEE SMALL LETTER HU */
520\352\256\202 fprintf (yyout, "hv"); /* U+AB82 CHEROKEE SMALL LETTER HV */
521\352\256\203 fprintf (yyout, "la"); /* U+AB83 CHEROKEE SMALL LETTER LA */
522\352\256\204 fprintf (yyout, "le"); /* U+AB84 CHEROKEE SMALL LETTER LE */
523\352\256\205 fprintf (yyout, "li"); /* U+AB85 CHEROKEE SMALL LETTER LI */
524\352\256\206 fprintf (yyout, "lo"); /* U+AB86 CHEROKEE SMALL LETTER LO */
525\352\256\207 fprintf (yyout, "lu"); /* U+AB87 CHEROKEE SMALL LETTER LU */
526\352\256\210 fprintf (yyout, "lv"); /* U+AB88 CHEROKEE SMALL LETTER LV */
527\352\256\211 fprintf (yyout, "ma"); /* U+AB89 CHEROKEE SMALL LETTER MA */
528\352\256\212 fprintf (yyout, "me"); /* U+AB8A CHEROKEE SMALL LETTER ME */
529\352\256\213 fprintf (yyout, "mi"); /* U+AB8B CHEROKEE SMALL LETTER MI */
530\352\256\214 fprintf (yyout, "mo"); /* U+AB8C CHEROKEE SMALL LETTER MO */
531\352\256\215 fprintf (yyout, "mu"); /* U+AB8D CHEROKEE SMALL LETTER MU */
532\352\256\216 fprintf (yyout, "na"); /* U+AB8E CHEROKEE SMALL LETTER NA */
533\352\256\217 fprintf (yyout, "hna"); /* U+AB8F CHEROKEE SMALL LETTER HNA */
534\352\256\220 fprintf (yyout, "nah"); /* U+AB90 CHEROKEE SMALL LETTER NAH */
535\352\256\221 fprintf (yyout, "ne"); /* U+AB91 CHEROKEE SMALL LETTER NE */
536\352\256\222 fprintf (yyout, "ni"); /* U+AB92 CHEROKEE SMALL LETTER NI */
537\352\256\223 fprintf (yyout, "no"); /* U+AB93 CHEROKEE SMALL LETTER NO */
538\352\256\224 fprintf (yyout, "nu"); /* U+AB94 CHEROKEE SMALL LETTER NU */
539\352\256\225 fprintf (yyout, "nv"); /* U+AB95 CHEROKEE SMALL LETTER NV */
540\352\256\226 fprintf (yyout, "qua"); /* U+AB96 CHEROKEE SMALL LETTER QUA */
541\352\256\227 fprintf (yyout, "que"); /* U+AB97 CHEROKEE SMALL LETTER QUE */
542\352\256\230 fprintf (yyout, "qui"); /* U+AB98 CHEROKEE SMALL LETTER QUI */
543\352\256\231 fprintf (yyout, "quo"); /* U+AB99 CHEROKEE SMALL LETTER QUO */
544\352\256\232 fprintf (yyout, "quu"); /* U+AB9A CHEROKEE SMALL LETTER QUU */
545\352\256\233 fprintf (yyout, "quv"); /* U+AB9B CHEROKEE SMALL LETTER QUV */
546\352\256\234 fprintf (yyout, "sa"); /* U+AB9C CHEROKEE SMALL LETTER SA */
547\352\256\235 fprintf (yyout, "s"); /* U+AB9D CHEROKEE SMALL LETTER S */
548\352\256\236 fprintf (yyout, "se"); /* U+AB9E CHEROKEE SMALL LETTER SE */
549\352\256\237 fprintf (yyout, "si"); /* U+AB9F CHEROKEE SMALL LETTER SI */
550\352\256\240 fprintf (yyout, "so"); /* U+ABA0 CHEROKEE SMALL LETTER SO */
551\352\256\241 fprintf (yyout, "su"); /* U+ABA1 CHEROKEE SMALL LETTER SU */
552\352\256\242 fprintf (yyout, "sv"); /* U+ABA2 CHEROKEE SMALL LETTER SV */
553\352\256\243 fprintf (yyout, "da"); /* U+ABA3 CHEROKEE SMALL LETTER DA */
554\352\256\244 fprintf (yyout, "ta"); /* U+ABA4 CHEROKEE SMALL LETTER TA */
555\352\256\245 fprintf (yyout, "de"); /* U+ABA5 CHEROKEE SMALL LETTER DE */
556\352\256\246 fprintf (yyout, "te"); /* U+ABA6 CHEROKEE SMALL LETTER TE */
557\352\256\247 fprintf (yyout, "di"); /* U+ABA7 CHEROKEE SMALL LETTER DI */
558\352\256\250 fprintf (yyout, "ti"); /* U+ABA8 CHEROKEE SMALL LETTER TI */
559\352\256\251 fprintf (yyout, "do"); /* U+ABA9 CHEROKEE SMALL LETTER DO */
560\352\256\252 fprintf (yyout, "du"); /* U+ABAA CHEROKEE SMALL LETTER DU */
561\352\256\253 fprintf (yyout, "dv"); /* U+ABAB CHEROKEE SMALL LETTER DV */
562\352\256\254 fprintf (yyout, "dla"); /* U+ABAC CHEROKEE SMALL LETTER DLA */
563\352\256\255 fprintf (yyout, "tla"); /* U+ABAD CHEROKEE SMALL LETTER TLA */
564\352\256\256 fprintf (yyout, "tle"); /* U+ABAE CHEROKEE SMALL LETTER TLE */
565\352\256\257 fprintf (yyout, "tli"); /* U+ABAF CHEROKEE SMALL LETTER TLI */
566\352\256\260 fprintf (yyout, "tlo"); /* U+ABB0 CHEROKEE SMALL LETTER TLO */
567\352\256\261 fprintf (yyout, "tlu"); /* U+ABB1 CHEROKEE SMALL LETTER TLU */
568\352\256\262 fprintf (yyout, "tlv"); /* U+ABB2 CHEROKEE SMALL LETTER TLV */
569\352\256\263 fprintf (yyout, "tsa"); /* U+ABB3 CHEROKEE SMALL LETTER TSA */
570\352\256\264 fprintf (yyout, "tse"); /* U+ABB4 CHEROKEE SMALL LETTER TSE */
571\352\256\265 fprintf (yyout, "tsi"); /* U+ABB5 CHEROKEE SMALL LETTER TSI */
572\352\256\266 fprintf (yyout, "tso"); /* U+ABB6 CHEROKEE SMALL LETTER TSO */
573\352\256\267 fprintf (yyout, "tsu"); /* U+ABB7 CHEROKEE SMALL LETTER TSU */
574\352\256\270 fprintf (yyout, "tsv"); /* U+ABB8 CHEROKEE SMALL LETTER TSV */
575\352\256\271 fprintf (yyout, "wa"); /* U+ABB9 CHEROKEE SMALL LETTER WA */
576\352\256\272 fprintf (yyout, "we"); /* U+ABBA CHEROKEE SMALL LETTER WE */
577\352\256\273 fprintf (yyout, "wi"); /* U+ABBB CHEROKEE SMALL LETTER WI */
578\352\256\274 fprintf (yyout, "wo"); /* U+ABBC CHEROKEE SMALL LETTER WO */
579\352\256\275 fprintf (yyout, "wu"); /* U+ABBD CHEROKEE SMALL LETTER WU */
580\352\256\276 fprintf (yyout, "wv"); /* U+ABBE CHEROKEE SMALL LETTER WV */
581\352\256\277 fprintf (yyout, "ya"); /* U+ABBF CHEROKEE SMALL LETTER YA */
582%%
583
584int
585main (int argc, char *argv[])
586{
587 int i; /* loop variable */
588 int exit_status; /* program exit status */
589
590 void print_help (char *);
591
592 exit_status = EXIT_SUCCESS;
593 yyin = stdin;
594 yyout = stdout;
595
596 for (i = 1; i < argc; i++) {
597 /*
598 Parse options. If an invalid command line argument
599 was given, print a help menu and exit with error status.
600 */
601 if (argv[i][0] == '-') {
602 switch (argv[i][1]) {
603 /* Check for "--version" */
604 case '-': if (strncmp (argv[i], "--version", 9) == 0) {
605 printf ("sequoyah Version %s\n", VERSION);
606 printf ("Copyright (C) 2003, 2004 Paul Hardy\n");
607 exit (EXIT_SUCCESS);
608 }
609 /* Begin output with Byte Order Mark, U+FFFE */
610 case 'b': bom_out = 1;
611 break;
612 /*
613 input file format; file name follows
614 in next parameter, so increment i
615 */
616 case 'i': yyin = fopen (argv[++i], "r");
617 break;
618 /*
619 output file format; file name follows
620 in next parameter, so increment i
621 */
622 case 'o': yyout = fopen (argv[++i], "w");
623 break;
624 /* Check for "-v" */
625 case 'v': printf ("sequoyah Version %s\n", VERSION);
626 exit (EXIT_SUCCESS);
627 /* quote mark style for open & close quotes */
628 default: print_help (argv[0]);
629 exit_status = EXIT_FAILURE;
630 break;
631 }
632 }
633 else {
634 print_help (argv[0]);
635 exit_status = EXIT_FAILURE;
636 }
637 }
638
639 if (exit_status == EXIT_SUCCESS) {
640 if (bom_out != 0) {
641 print_utf8 (0xFFFE); /* Unicode Byte Order Mark */
642 }
643
644 yylex ();
645 }
646
647 exit (exit_status);
648}
649
650
/*
   Write the usage summary to stderr.

   progname   name to show in the syntax line; main() passes argv[0]
*/
void
print_help (char * progname)
{
   /* Lines without a conversion go out through fputs(). */
   fputs ("\nUnknown command line parameter.\n\n", stderr);

   fprintf (stderr,
            "Syntax: %s [-b] [-i input_file] [-o output_file]\n\n",
            progname);

   fputs (" -b: begin output with UTF-8 Byte Order Mark\n\n", stderr);
   fputs (" -i: specify input file name\n\n", stderr);
   fputs (" -o: specify output file name\n\n", stderr);
}
668
669
670/*
671 Print a pattern that was read as ASCII if in Latin mode.
672 Otherwise, print the UTF-8 code point.
673*/
674void
675print_pattern (char *intext, uint32_t codept)
676{
677
678 void print_ascii (char *);
679 void print_utf8 (uint32_t);
680
681 if (lang_type == LANG_LATIN)
682 print_ascii (intext);
683 else
684 print_utf8 (codept);
685
686 return;
687}
688
689
690/*
691 Print an ASCII sequence that appeared inside braces, '{'...'}'.
692*/
693void
694print_ascii (char *intext)
695{
696 int i, j, k; /* loop variables */
697 char unicode_string[7]; /* up to six hexadecimal digits, null-terminated */
698
699 for (i = 0; intext[i] != '\0'; i++) {
700 /*
701 Scan for a backslash, looking for an escape sequence.
702 At present, the only recognized escape sequence is "\u"
703 to represent a Unicode hexadecimal code point of the
704 form "\uX...X", where "X...X" is a string of one to six
705 hexadecimal digits that specify a valid Unicode code point.
706 */
707 for (j = i;
708 intext[j] != '\0' && intext[j] != '\\';
709 j++);
710
711 if (intext [j] == '\0') { /* this is probably the most frequent case */
712 fprintf (yyout, "%s", &intext [i]);
713 i = j-1; /* so the outer i loop will terminate */
714 }
715 /*
716 Found a backslash, so look for a following 'u'.
717 */
718 else if (intext [j+1] == 'u') {
719 /* print the string up to but not including the backslash */
720 intext[j] = '\0';
721 fprintf (yyout, "%s", &intext[i]);
722 i = j + 2; /* i points to first digit in Unicode code point */
723 unicode_string [0] = '\0'; /* start building the Unicode code point string */
724 /*
725 scan to end of hexadecimal digits, up to six digits
726 */
727 for (k = 0;
728 k < 6 && /* allow up to six hexadecimal digits */
729 (isdigit (intext [i]) ||
730 (intext [i] >= 'A' && intext [i] <= 'F') ||
731 (intext [i] >= 'a' && intext [i] <= 'f'));
732 k++) {
733
734 unicode_string [k] = intext[i];
735 unicode_string [k + 1] = '\0'; /* make sure string stays null-terminated */
736 i++;
737 }
738 print_unicode (unicode_string);
739 /* intext [i] points to the remainder of the input string */
740 i--; /* it will be incremented again next i loop iteration */
741 } /* intext [j+1] == 'u' */
742 /*
743 Otherwise, this was not a recognized '\' sequence,
744 so print string up to the backslash and keep going.
745 */
746 else {
747 intext [j] = '\0'; /* replace '\\' with null to print up to this location */
748 fprintf (yyout, "%s\\", &intext [i]);
749 i = j; /* keep scanning intext[i] until the end is reached */
750 }
751 }
752
753 return;
754}
755
756
/*
   Print a Unicode code point given in the form "X...X",
   where "X...X" is a string of hexadecimal digits (upper or
   lower case) that describe a valid Unicode code point.

   The digits are converted to a number and handed to print_utf8().
   An empty string prints nothing.  A string containing anything
   other than hexadecimal digits is reported on stderr and prints
   nothing.

   intext   null-terminated string of hexadecimal digits
*/
void
print_unicode (char *intext)
{
   int i;                /* loop variable */
   unsigned char ch;     /* current ASCII hexadecimal digit being converted */
   uint32_t this_digit;  /* numeric value of ch, 0 through 15 */
   uint32_t codept;      /* the Unicode code point to output */

   void print_utf8 (uint32_t);

   if (intext[0] == '\0')   /* no digits, so there is nothing to print */
      return;

   codept = 0;
   for (i = 0; intext[i] != '\0'; i++) {
      ch = (unsigned char) intext[i];

      if (ch >= '0' && ch <= '9')
         this_digit = ch - '0';
      else if (ch >= 'a' && ch <= 'f')
         this_digit = ch - 'a' + 10;
      else if (ch >= 'A' && ch <= 'F')
         this_digit = ch - 'A' + 10;
      else {
         fprintf (stderr,
                  "print_unicode() called with invalid hexadecimal string \"%s\"\n",
                  intext);
         return;
      }

      /* shift one hexadecimal digit to the left and append the new one */
      codept = (codept << 4) | this_digit;

      /*
         Already past the last Unicode code point; stop here so the
         value cannot wrap around.  print_utf8() reports the error.
      */
      if (codept > 0x10FFFF)
         break;
   } /* for i */

   print_utf8 (codept);

   return;
}
789
790
791/*
792 Print an open or close quote dependent on language mode.
793
794 intext character string starting with '"'
795 or "`" or "'".
796*/
797void
798print_quote (char *intext)
799{
800
801 void print_utf8 (uint32_t);
802
803 /* Double qoute, the most common case */
804 if (intext[0] == '"') {
805 if (quote_state[doubleq_style] == 0) { /* print opening quote */
806 print_utf8 (quote_open[doubleq_style]);
807 quote_state[doubleq_style] = 1; /* now entering a quote style */
808 }
809 else { /* print closing quote */
810 print_utf8 (quote_close[doubleq_style]);
811 quote_state[doubleq_style] = 0; /* now leaving a quote style */
812 }
813 }
814 else { /* open ("`") or close ("'") single quote */
815 if (intext[0] == '`') { /* open quote */
816 if (singleq_style == 0) /* Latin */
817 print_utf8 (0x02BB);
818 else if (singleq_style == 4) /* Hebrew */
819 print_utf8 (0x201A);
820 else /* Greek, Coptic, or Demotic */
821 print_utf8 (0x02BB);
822 }
823 else { /* close quote, "'" */
824 if (singleq_style == 0) /* Latin */
825 print_utf8 (0x02BC);
826 else if (singleq_style == 4) /* Hebrew */
827 print_utf8 (0x2018);
828 else /* Greek, Coptic, or Demotic */
829 print_utf8 (0x02BC);
830 }
831 }
832
833 return;
834}
835
836
837/*
838 Print Greek, Latin, Coptic, or Hebrew capital letter.
839
840 This is passed yytext, so the first character in the
841 input string is a '*'; skip over it for indexing.
842*/
843void
844print_capital (char *intext)
845{
846 int test_char; /* character to test */
847
848 test_char = intext[1] & 0x7F;
849
850 switch (lang_type) {
851 case LANG_GREEK:
852 /* First check for Greek varia (grave accent) on vowel */
853 if (intext[2] == '\\') { /* intext[2] should either be '\\' or '\0' */
854 test_char = tolower (test_char);
855 switch (test_char) {
856 case 'a':
857 print_utf8 (0x1FBA); /* GREEK CAPITAL LETTER ALPHA WITH VARIA */
858 break;
859 case 'e':
860 print_utf8 (0x1FC8); /* GREEK CAPITAL LETTER EPSILON WITH VARIA */
861 break;
862 case 'h':
863 print_utf8 (0x1FCA); /* GREEK CAPITAL LETTER ETA WITH VARIA */
864 break;
865 case 'i':
866 print_utf8 (0x1FDA); /* GREEK CAPITAL LETTER IOTA WITH VARIA */
867 break;
868 case 'o':
869 print_utf8 (0x1FEA); /* GREEK CAPITAL LETTER UPSILON WITH VARIA */
870 break;
871 case 'u':
872 print_utf8 (0x1FF8); /* GREEK CAPITAL LETTER OMICRON WITH VARIA */
873 break;
874 case 'w':
875 print_utf8 (0x1FFA); /* GREEK CAPITAL LETTER OMEGA WITH VARIA */
876 break;
877 default:
878 fprintf (yyout, "%s", intext); /* unexpected combination */
879 break;
880 }
881 }
882 else {
883 /*
884 ascii2greek_cap contains Unicode encodings for
885 capital Greek letters.
886 */
887 print_utf8 (ascii2greek_capital[test_char]);
888 }
889 break;
890 case LANG_COPTIC:
891 print_utf8 (ascii2coptic[test_char]);
892 /* Now check for Coptic jinkim (grave accent) on letter */
893 if (intext[2] == '\\')
894 print_utf8 (0x0300); /* COMBINING GRAVE ACCENT */
895 break;
896 case LANG_HEBREW: /* Hebrew Beta Code doesn't use '*'; we should not reach this point */
897 break;
898 case LANG_LATIN:
899 fprintf (yyout, "%s", intext);
900 break;
901 default:
902 break;
903 }
904
905 return;
906}
907
908
909/*
910 Print Greek, Latin, Coptic, or Hebrew small letter.
911*/
912void
913print_small (char *intext)
914{
915 int test_char; /* character to test */
916 int letter_form; /* =1 if letter is the final form, 2 if not; for Hebrew */
917
918 test_char = intext[0] & 0x7F;
919
920 switch (lang_type) {
921 case LANG_GREEK:
922 /* First check for varia (grave accent) on vowel */
923 if (intext[1] == '\\') { /* intext[1] should either be '\\' or '\0' */
924 test_char = tolower (test_char);
925 switch (test_char) {
926 case 'a':
927 print_utf8 (0x1F70); /* GREEK SMALL LETTER ALPHA WITH VARIA */
928 break;
929 case 'e':
930 print_utf8 (0x1F72); /* GREEK SMALL LETTER EPSILON WITH VARIA */
931 break;
932 case 'h':
933 print_utf8 (0x1F74); /* GREEK SMALL LETTER ETA WITH VARIA */
934 break;
935 case 'i':
936 print_utf8 (0x1F76); /* GREEK SMALL LETTER IOTA WITH VARIA */
937 break;
938 case 'o':
939 print_utf8 (0x1F78); /* GREEK SMALL LETTER OMICRON WITH VARIA */
940 break;
941 case 'u':
942 print_utf8 (0x1F7A); /* GREEK SMALL LETTER UPSILON WITH VARIA */
943 break;
944 case 'w':
945 print_utf8 (0x1F7C); /* GREEK SMALL LETTER OMEGA WITH VARIA */
946 break;
947 default:
948 fprintf (yyout, "%s", intext); /* unexpected combination */
949 break;
950 }
951 }
952 else {
953 print_utf8 (ascii2greek_small[test_char]);
954 }
955 break;
956 case LANG_COPTIC:
957 /*
958 Small Coptic letters are one code point above
959 the corresponding capital letter contained in
960 the ascii2coptic array, so add one for print_utf8.
961 */
962 print_utf8 (ascii2coptic[test_char] + 1);
963 if (intext[1] == '\\')
964 print_utf8 (0x0300); /* COMBINING GRAVE ACCENT */
965 break;
966 case LANG_HEBREW:
967 test_char = intext[0];
968 /*
969 If this is a letter that has middle and final forms,
970 look at next character for the digit '1' (final form)
971 or '2' (middle form).
972 */
973 if (test_char == 'k' || test_char == 'm' || test_char == 'n' ||
974 test_char == 'p' || test_char == 'T') {
975 letter_form = yytext[1];
976 if (letter_form == '2') {
977 switch (test_char) {
978 case 'k':
979 print_utf8 (0x5DA); /* HEBREW LETTER FINAL KAF */
980 break;
981 case 'm':
982 print_utf8 (0x5DD); /* HEBREW LETTER FINAL MEM */
983 break;
984 case 'n':
985 print_utf8 (0x5DF); /* HEBREW LETTER FINAL NUN */
986 break;
987 case 'p':
988 print_utf8 (0x5E3); /* HEBREW LETTER FINAL PE */
989 break;
990 case 'T':
991 print_utf8 (0x5E5); /* HEBREW LETTER FINAL TSADI */
992 break;
993 default:
994 fprintf (yyout, "%s", intext);
995 break;
996 }
997 }
998 else { /* a '2' was not the next character, so not final form */
999 /*
1000 Print the middle form of the letter, even if
1001 it was not given correctly with a '1' appended.
1002 */
1003 switch (test_char) {
1004 case 'k':
1005 print_utf8 (0x5DB); /* HEBREW LETTER KAF */
1006 break;
1007 case 'm':
1008 print_utf8 (0x5DE); /* HEBREW LETTER MEM */
1009 break;
1010 case 'n':
1011 print_utf8 (0x5E0); /* HEBREW LETTER NUN */
1012 break;
1013 case 'p':
1014 print_utf8 (0x5E4); /* HEBREW LETTER PE */
1015 break;
1016 case 'T':
1017 print_utf8 (0x5E6); /* HEBREW LETTER TSADI */
1018 break;
1019 default:
1020 fprintf (yyout, "%s", intext);
1021 break;
1022 }
1023 }
1024 }
1025 else { /* it's a Hebrew letter that only has one form */
1026 print_utf8 (ascii2hebrew[test_char]);
1027 }
1028 break;
1029 case LANG_LATIN:
1030 fprintf (yyout, "%s", intext);
1031 break;
1032 default:
1033 break;
1034 }
1035
1036 return;
1037}
1038
1039
1040/*
1041 Print one of four letter choices depending on whether the
1042 language mode is Latin, Greek, Coptic, or Hebrew, respectively.
1043*/
1044void
1045print_letter (uint32_t latin, uint32_t greek,
1046 uint32_t coptic, uint32_t hebrew)
1047{
1048
1049 switch (lang_type) {
1050 case LANG_LATIN:
1051 print_utf8 (latin);
1052 break;
1053 case LANG_GREEK:
1054 print_utf8 (greek);
1055 break;
1056 case LANG_COPTIC:
1057 print_utf8 (coptic);
1058 break;
1059 case LANG_HEBREW:
1060 print_utf8 (hebrew);
1061 break;
1062 default:
1063 print_utf8 (greek);
1064 break;
1065 }
1066
1067 return;
1068}
1069
1070
1071/*
1072 Convert a UTF-32 code point to a UTF-8 string.
1073*/
1074void
1075print_utf8 (uint32_t codept)
1076{
1077 int i; /* loop variable */
1078 int bin_length; /* number of binary digits, for forming UTF-8 */
1079 int byte_length; /* numberof bytes of UTF-8 */
1080 char utf8_bytes[4]; /* temporary array of UTF-8 output bytes */
1081
1082 int bin_digits (uint32_t);
1083
1084 byte_length = 0;
1085
1086 /*
1087 If within valid 0x2039Unicode range of U+0000..U+10FFFF, proceed
1088 */
1089 if (codept <= 0x10FFFF) {
1090 bin_length = bin_digits (codept);
1091 if (bin_length < 8) { /* U+0000..U+007F */
1092 byte_length = 1;
1093 utf8_bytes [0] = codept;
1094 }
1095 else if (bin_length < 12) { /* U+0080..U+07FF */
1096 byte_length = 2;
1097 utf8_bytes [0] = 0xC0 | ((codept >> 6) & 0x1F);
1098 utf8_bytes [1] = 0x80 | ( codept & 0x3F);
1099 }
1100 else if (bin_length < 17) { /* U+0800..U+FFFF */
1101 byte_length = 3;
1102 utf8_bytes [0] = 0xE0 | ((codept >> 12) & 0x0F);
1103 utf8_bytes [1] = 0x80 | ((codept >> 6) & 0x3F);
1104 utf8_bytes [2] = 0x80 | ( codept & 0x3F);
1105 }
1106 else if (bin_length < 22) { /* U+010000..U+10FFFF */
1107 byte_length = 4;
1108 utf8_bytes [0] = 0xF0 | ((codept >> 18) & 0x07);
1109 utf8_bytes [1] = 0x80 | ((codept >> 12) & 0x3F);
1110 utf8_bytes [2] = 0x80 | ((codept >> 6) & 0x3F);
1111 utf8_bytes [3] = 0x80 | ( codept & 0x3F);
1112 }
1113 else {
1114 fprintf (stderr,
1115 "Internal error forming UTF-8 in print_utf8() for U+%04X\n",
1116 codept);
1117 }
1118
1119 for (i = 0; i < byte_length; i++) fputc (utf8_bytes [i], yyout);
1120 }
1121 else {
1122 fprintf (stderr,
1123 "print_utf8() called with illegal Unicode code point U+%06X\n",
1124 codept);
1125 }
1126
1127 return;
1128}
1129
1130
/*
   Return the number of significant binary digits in an unsigned
   number: the position of its highest set bit, counting from 1.
   Zero has no significant digits, so the result for 0 is 0; the
   largest possible result is 32.

   itest   the number to measure
*/
int
bin_digits (uint32_t itest)
{
   int width;  /* bits counted so far */

   /* Shift the value right until nothing is left, counting shifts. */
   for (width = 0; itest != 0; itest >>= 1)
      width++;

   return width;
}
1149
#define LANG_HEBREW
Define for Hebrew output (unused).
Definition: sequoyah.l:58
#define LANG_GREEK
Define for Greek output (unused).
Definition: sequoyah.l:54
#define LANG_LATIN
Define for Latin output (unused).
Definition: sequoyah.l:55
#define LANG_COPTIC
Define for Coptic output (unused).
Definition: sequoyah.l:56