// Translator from English text to Shavian script by Dave Coffin
//
// At present (Oct 2020) the American dictionary can be found at
// http://www.speech.cs.cmu.edu/cgi-bin/cmudict
//
// and the British dictionary is inside
// http://festvox.org/packed/festival/2.4/festlex_OALD.tar.gz
//
// and both are ignored by default in favor of the dictionary format
// used by shaw.py, with the added proviso that this file must be
// correctly sorted.  For best results do:
//
// uconv -x Latin-ASCII input | sed -r "s/(’|’|’)/'/g" | shaw dave.dict
//
// (The three alternatives in the sed pattern all read as the same curly
// apostrophe here; presumably they were distinct spellings of it
// originally -- TODO confirm against an upstream copy.)
//
// Part-of-speech tags are ignored because NLTK does not support C.

#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#define APOS "\'"
#define SQR(x) ((x)*(x))
#if !defined(uchar)
#define uchar unsigned char
#endif

// The whole dictionary lives in prefix[] as NUL-separated "word pron"
// entries.  In pronunciations each Shavian letter is stored as one byte,
// the last byte of its UTF-8 encoding (0x90..0xbf).
//   prefix - start of the buffer; parse_prefix() walks the entries from
//            prefix+1 for as long as they begin with '^'
//   suffix - set by load_custom() to the NUL before the first entry that
//            does not begin with '^'; parse_suffix() walks from suffix+1
//            for as long as entries begin with '$'
//   dict   - NUL before the first ordinary entry (lower search bound)
//   dend   - end of the dictionary (upper search bound)
//   aflag  - affix state: bit 0 is set by parse_prefix() after it splits
//            off a prefix, bit 1 by parse_suffix() after it splits off a
//            suffix; parse_root() then rejects entries whose pronunciation
//            begins (bit 0) or ends (bit 1) with '.'
uchar prefix[2200000], *suffix, *dict=prefix, *dend, aflag;

int wordcmp (const char *one, const char *two, int ic, int len);

#if defined(US) || defined(UK)
// These bypass the dictionary entirely.
// Each entry is "word", a space, then the Shavian spelling as raw UTF-8
// (F0 90 91 xx, where xx = 0x90 + letter number).
const uchar *except[] = {
  "to \xf0\x90\x91\x91",                   // letter 1  (T)
  "for \xf0\x90\x91\x93",                  // letter 3  (F)
  "of \xf0\x90\x91\x9d",                   // letter 13 (V)
  "the \xf0\x90\x91\x9e",                  // letter 14 (DH)
  "and \xf0\x90\x91\xaf",                  // letter 31 (N)
  "a \xf0\x90\x91\xa9",                    // letter 25 (ado)
  "an \xf0\x90\x91\xa9\xf0\x90\x91\xaf",   // ado + N
  "no \xf0\x90\x91\xaf\xf0\x90\x91\xb4"    // N + letter 36 (oak)
};
// Shavian letters are numbered 0 (peep) to 47 (yew) in Unicode order
// In the dictionary, 0x90 is added to match the last byte of their UTF-8 encoding.
// Each row is { ligature, first letter, second letter }.
const uchar lig[][3] = {
//{ 40,24,30 }, // ash + roar = are   wrong because marry != marring
  { 40,26,30 }, // odd + roar = are
  { 40,29,30 }, // ah + roar = are
  { 40,35,30 }, // up + roar = are
  { 41,36,30 }, // oak + roar = or
  { 41,39,30 }, // awe + roar = or
  { 42,33,30 }, // age + roar = air
  { 42,42,30 }, // air + roar = air
//{ 43,23,30 }, // egg + roar = err   wrong because ferry != furry
  { 43,43,30 }, // err + roar = err
  { 44,25,30 }, // ado + roar = array
  { 44,44,30 }, // array+roar = array
  { 45,22,30 }, // if + roar = ear
  { 45,32,30 }, // eat + roar = ear
  { 45,46,30 }, // ian + roar = ear
  { 46,32,24 }, // eat + ash = ian
  { 46,32,25 }, // eat + ado = ian
  { 46,32,35 }, // eat + up = ian
  { 47,8,37 },  // yea + ooze = yew
};
#endif

#if defined(US)
// CMUdict assumes middle-American pronunciation, with its
// Mary-marry-merry and father-bother mergers.
//
// Reads "cmudict-0.7b" from the current directory into the dictionary
// buffer, converting each phoneme symbol to a Shavian letter byte and
// merging letter pairs listed in lig[].  Exits on a missing file or an
// unknown phoneme.
// NOTE(review): the inner read loops do not test for EOF, so the file is
// assumed to be well formed and newline-terminated.
void load_usa()
{
// The "ah" sound is absent in CMUdict, merged into "on".
  static const char cmu[45][4] = {
    "P","T","K","F","TH","S","SH","CH","Y","NG",
    "B","D","G","V","DH","Z","ZH","JH","W","HH",
    "L","M","IH","EH","AE","AH","AA","UH","AW","**",
    "R","N","IY","EY","AY","AH","OW","UW","OY","AO",
    "X","X","X","ER","ER" };
  FILE *fp;
  char c, sym[4], *word;
  int i;

  fp = fopen ("cmudict-0.7b", "r");
  if (!fp) { perror ("cmudict-0.7b"); exit(1); }
  dend = dict;
  while (!feof(fp)) {
    if (!isalpha(c = fgetc(fp))) {      // skip lines not starting with a letter
      while (!feof(fp) && fgetc(fp) != '\n');
      continue;
    }
    *dend++ = 0;
    word = dend;                        // just for diagnostics
    do {
      *dend++ = c;
      c = fgetc(fp);
    } while (c != '(' && c != ' ');
    if (c == '(') while (fgetc(fp) != ')');   // drop "(n)" variant markers
    *dend++ = ' ';
    while (1) {
      do c = fgetc(fp);
      while (c == ' ');
      if (c == '\n') break;
      for (i=0; i < 2 && isalpha(c); i++) {
        sym[i] = c;
        c = fgetc(fp);
      }
      // c now holds the character after the symbol (a stress digit, if any)
      if (c == '\n') ungetc (c, fp);
      sym[i] = 0;
      for (i=0; i < 45; i++)
        if (!strcmp (sym, cmu[i])) break;
      if (i == 45) {
        fprintf (stderr, "Unknown phoneme %s after %s\n", sym, word);
        exit(1);
      }
      if (i == 25 && c > '0') i = 35;   // change ado to up when stressed
      if (i == 32 && c < '1') i = 22;   // change eat to if when unstressed
      if (i == 43 && c < '1') i = 44;   // change err to array when unstressed
      *dend++ = 0x90 + i;
      for (i=0; i < sizeof lig/sizeof *lig; i++)
        if ( dend[-2] == lig[i][1]+0x90 && dend[-1] == lig[i][2]+0x90)
          (--dend)[-1] = lig[i][0]+0x90;
    }
  }
  fclose (fp);
  *dend = 0;
}

#elif defined(UK)
#define isvowel(x) (((x)|32) >= 'a' && 0x104111 >> (((x)|32)-'a') & 1)
#define shvowel(x) ((0xff7800 >> (x & 63)/2) & 1)
#define rhotic(x) ((x & 63) == 30 || ((x & 63) > 39 && (x & 63) < 46))

// Reads "oald-0.4.out" from the current directory into the dictionary
// buffer.  While a word is being assembled, bit 64 of each token marks a
// stressed syllable and bit 128 marks an "r" inserted from the spelling;
// both are stripped before the word is stored.  Exits on a missing file
// or an unknown phoneme.
// NOTE(review): several checks below index word[w-2] / word[w-3] without
// testing w first, so very short words read before the start of word[].
void load_uk()
{
  static const char oald[50][4] = {
    "p","t","k","f","th","s","sh","ch","y","ng",
    "b","d","g","v","dh","z","zh","jh","w","h",
    "l","m","i" ,"e" ,"a" ,"@" ,"o", "u" ,"au","aa",
    "r","n","ii","ei","ai","uh","ou","uu","oi","oo",
    "0","1","e@","@@","X" ,"X" ,"i@","u@", "" ,"po" };
  FILE *fp;
  uchar c, sym[4], *wptr, rmap[256], vc;
  uchar word[256], w, tok, syl, nsyl;
  int i;

  fp = fopen ("oald-0.4.out", "r");
  if (!fp) { perror ("oald-0.4.out"); exit(1); }
  dend = dict;
  while (!feof(fp)) {
    if (fgetc(fp) != '(' || fgetc(fp) != '\"') goto nextline;
    *dend++ = 0;
    wptr = dend;
    memset (rmap, vc=0, sizeof rmap);
    while ((c = fgetc(fp)) != '\"') {
      if (isvowel(dend[-1]) && !isvowel(c))
        rmap[vc++] = (c|32) == 'r';     // note which vowel clusters end with 'r'
      *dend++ = c;
      if (c == '_' || c == '`' || c == '-' || c == ' ') {
        dend = wptr-1;                  // cut all the French and Latin crap
        goto nextline;
      }
    }
    *dend++ = ' ';
    do c = fgetc(fp);
    while (c != '(');                   // skip the part-of-speech tag
    w = syl = nsyl = vc = 0;
    while (1) {
      for (i=0; i < 2; i++) {           // read a one- or two-byte symbol
        c = fgetc(fp);
        if (!isalnum(c) && c != '@') break;
        sym[i] = c;
      }
      sym[i] = 0;
      for (tok=0; tok < 50; tok++)      // convert it to a token
        if (!strcmp (sym, oald[tok])) break;
      if (c == '\n' || c == 0xff) {
        ungetc (c, fp);
        tok = 49;       // end-of-word token, usually generated by "pos" symbol
      }
      if (tok/2 == 20) {  // mark all phonemes in each syllable for stress
        nsyl++;
        while (syl < w)
          word[syl++] |= 64 * (tok - 20);   // "1" sets bit 64, "0" sets nothing
        continue;
      }
      if (tok == 47) {
        word[w++] = 27;
        tok = 25;                       // u@ = wool + ado
      }
      if (tok == 48) continue;
      word[w++] = tok;
      if (w > 1 && shvowel(word[w-2]) && !shvowel(tok) && ++vc)
        if (rmap[vc-1] && !rhotic(word[w-2]) && !rhotic(tok)) {
          // Shift the new token up one slot, then put the "r" where it was.
          // (Written as two statements: "word[w++] = word[w-1]" modifies
          // and reads w without sequencing, which is undefined.)
          word[w] = word[w-1];
          w++;
          word[w-2] = 128 | 30;   // insert missing "r" after a vowel cluster
        }
      if (tok == 49 && w--) break;      // 49 got written, must back over it
      if (tok == 50) {
        fprintf (stderr, "Unknown phoneme %s after %s\n", sym, wptr);
        exit(1);
      }
    }
    if (word[w-2] == 32 && word[w-1] & 128) w--;
    if (word[w-1] == 15 && word[w-2] & 128 && word[w-3] == 22) word[--w-1] = 15;
    if (nsyl == 1)                      // one-syllable words are always stressed
      for (i=0; i < w; i++) word[i] |= 64;
    for (i=0; i < w; i++) {
      c = word[i];
      if (c == 32 || c == 35) c -= 10;  // change unstressed eat and up to if and ado
      if (c == 43) c++;                 // change unstressed err to array
      word[i] = c & 63;
    }
    if (word[w-1] == 1 && word[w-2] == 31 && word[w-3] < 20) {
      word[w-2] = 25;
      word[w-1] = 31;                   // insert schwa before final "nt"
      word[w++] = 1;
    }
    if (word[w-1] / 20 == 1 && word[w-1] / 2 % 5 == 0 && word[w-2] < 20) {
      word[w] = word[w-1];              // insert schwa before final l/m/r/n
      w++;                              // (sequenced for the same reason as above)
      word[w-2] = 25;
    }
    for (c=0; c < w; c++) {
      *dend++ = 0x90 + word[c];
      for (i=0; i < sizeof lig/sizeof *lig; i++)
        if ( dend[-2] == lig[i][1]+0x90 && dend[-1] == lig[i][2]+0x90)
          (--dend)[-1] = lig[i][0]+0x90;
    }
    // Terminate the new entry before measuring it: a ligature merge or an
    // earlier discarded entry can leave stale bytes just past dend.
    *dend = 0;
    if (!strcasecmp (wptr, wptr-strlen(wptr)-1))  // delete duplicates
      dend = wptr-1;
nextline:
    while (!feof(fp) && fgetc(fp) != '\n');
  }
  fclose (fp);
  *dend = 0;
}

#else
// Loads a dictionary in the custom text format, one "word pronunciation"
// entry per line, from the named file.
//   - '\r' is ignored.
//   - A Shavian letter (UTF-8 F0 90 91 xx) is stored as the single byte xx.
//   - The sequence EF B8 80 (U+FE00) is skipped.
//   - A line with a second space, or with a character other than a letter
//     or one of ^ $ ' : . _ is reported on stderr and discarded.
//   - Letter 22 (if) followed by 25 (ado) or 44 (array) is merged into
//     46 (ian) or 45 (ear).
//   - A line whose word equals the previous line's word is appended to
//     that entry after a '@' separator.
// Sets the globals suffix, dict and dend.  Exits if the file cannot be opened.
void load_custom (char *file)
{
  FILE *fp;
  char *wptr, *prev="";
  int c, line=1, space;

  fp = fopen (file, "r");
  if (!fp) { perror (file); exit(1); }
  dend = dict;
  *dend++ = space = 0;
  wptr = dend;                          // start of the entry being read
  dict = suffix = 0;
  while ((c = fgetc(fp)) != EOF) {
    if (c == '\r') continue;
    if (c == '\n') {
      *dend++ = 0;
      prev = wptr;
      wptr = dend;
      goto nxt;
    }
    if (c == 0xf0 && fgetc(fp) == 0x90 && fgetc(fp) == 0x91)
      c = fgetc(fp);
    else if (c == 0xef && fgetc(fp) == 0xb8 && fgetc(fp) == 0x80)
      continue;
    else if (c == ' ') {
      if (space++) {
        fprintf (stderr, "%s: Extra space, line %d discarded.\n", file, line);
        goto del;
      }
    } else if (!isalpha(c) && !strchr("^$\':._",c)) {
      fprintf (stderr, "%s: Illegal character \'%c\', line %d discarded.\n", file, c, line);
del:  // Skip the rest of the line; also stop at EOF so that a bad final
      // line without a trailing newline cannot loop forever.
      while ((c = fgetc(fp)) != '\n' && c != EOF);
      dend = wptr;
nxt:  space = 0;
      line++;
      continue;
    }
    // dend[-1] == 0 means c is the first character of an entry
    if (!suffix && !dend[-1] && c != '^') suffix = dend-1;
    if (!dict && !dend[-1] && c != '^' && c != '$') dict = dend-1;
    if (dend[-1] == 0xa6 && c == 0xa9) dend[-1] = 0xbe;       // merge_ia
    else if (dend[-1] == 0xa6 && c == 0xbc) dend[-1] = 0xbd;
    else *dend++ = c;
    if (c == ' ' && !wordcmp (wptr, prev, 0, 0)) {
      dend = wptr-1;
      *dend++ = '@';                    // heteronym separator
      wptr = prev;
    }
  }
  fclose (fp);
  dend--;
}
#endif

// Compares the leading words of two strings.
//   one, two - strings to compare; '_' is treated as a space
//   ic       - 1 to OR 32 into every byte (folds ASCII letter case, and
//              also makes NUL compare as a space); 0 for an exact compare
//   len      - maximum number of bytes to compare, or 0 for no limit
// Returns 0 when the words match (both reach a space-valued byte together,
// or len bytes matched), -1 if one sorts first, 1 if two sorts first.
int wordcmp (const char *one, const char *two, int ic, int len)
{
  char a, b;

  if (len == 0) len = INT_MAX;
  ic *= 32;                             // ic = "ignore case"
  while (len--) {
    a = *one++;
    b = *two++;
    if (a == '_') a = ' ';              // ignore POS tags
    if (b == '_') b = ' ';
    a |= ic;
    b |= ic;
    if ((a | b) == 32) return 0;        // both hit space or null
    if (a < b) return -1;
    if (a > b) return 1;
  }
  return 0;
}

// Looks up a whole word in the dictionary by binary search between dict
// and dend.
//   in  - NUL-terminated word
//   out - receives the pronunciation, or an empty string if there is no
//         usable entry.  The first byte is '*' when the chosen dictionary
//         word starts with an upper-case letter, or ',' when the entry
//         carries a '_' tag and aflag is zero.
// Among case-insensitive matches an exact-case match is preferred,
// otherwise the first one is used.  The result is discarded when aflag
// forbids the entry (see the description of aflag above).
void parse_root (uchar *in, uchar *out)
{
  char *low, *high, *mid, *cp;
  int comp;

  out[0] = out[1] = 0;
#ifdef PURGE
  if (!aflag) return;
#endif
  low = dict;
  high = dend;
  do {
    mid = low + (high - low)/2;
    while (mid[-1]) mid--;              // back up to the start of an entry
    if (mid == low)
      while (*mid++);                   // no progress: step to the next entry
    if (mid >= high) return;
    comp = wordcmp (in, mid, 1, 0);
    if (comp < 0) high = mid;
    if (comp > 0) low = mid;
  } while (comp);
  while (1) {
    low = mid-1;                        // look for earlier matches
    while (low[-1]) low--;
    if (wordcmp (in, low, 1, 0)) break;
    mid = low;
  }
  for (cp=mid; !wordcmp (in, cp, 1, 0); cp++) {
    if (!wordcmp (in, cp, 0, 0)) goto exact;
    while (*++cp);
  }
  cp = mid;
exact:
  if (isupper(*cp)) out[0] = '*';
  for ( ; *cp != ' '; cp++)
    if (*cp == '_' && !aflag) out[0] = ',';
  strcat (out, cp+1);
  if (aflag & 1 && cp[1] == '.' || aflag & 2 && cp[strlen(cp)-1] == '.')
    out[0] = 0;
}

// Finds a pronunciation for a word, splitting off dictionary suffixes
// (entries starting with '$') when the whole word is not found.
//   in  - word; lower-cased in place (every byte is OR-ed with 32)
//   out - receives the best pronunciation found
//   adj - adjustment added to the matched length before it is squared
// Returns a score, 0 meaning nothing was found.  A direct dictionary hit
// scores SQR(len+adj); a root+suffix split scores the root's score plus
// the squared suffix length, less some penalties.
int parse_suffix (uchar *in, uchar *out, int adj)
{
  int split, len, pass, score=0, best=0, i;
  uchar *suff, *tail, *cp, word[256], try[256], sflag;
  static const char penal[][8] =
  { "bed","can","cat","cent","dance","ine","kin","one","pal","path","ster","tie","tied","ties","tying","wing","x" };

  parse_root (in, out);
  for (cp=in; *cp; cp++)
    *cp |= 32;
  len = cp-in;
  if (out[0]) return SQR(len+adj);
  // suffix is only set by load_custom(); in US/UK builds it stays NULL, so
  // start on an empty string there and let the loop fall straight through.
  for (suff = suffix ? suffix+1 : (uchar *)""; *suff++ == '$'; suff += strlen(suff)+1) {
    split = len - (strchr(suff,' ') - (char *)suff);
    if (split < 2 || wordcmp (in+split, suff, 1, 0)) continue;
    if (aflag & 2 && suff[strlen(suff)-1] == '.') continue;
    // Spelling-based vetoes on implausible splits
    if (in[split] == in[split-1])
      if (len-split == 1 || strchr("eos",in[split])) continue;
    if (in[split-1] == in[split-2]) {
      if (!strcmp (in+split, "ess") && strchr("ln",in[split-1])) continue;
    } else {
      if (!strcmp (in+split, "ry") && strchr("aeiouf",in[split-1])) continue;
      if (!strcmp (in+split, "ha") && strchr("cpst",in[split-1])) continue;
      if (!strcmp (in+split, "th") && strchr("e",in[split-1])) continue;
      if (!strcmp (in+split, "d") && strchr("adeiou",in[split-1])) continue;
      if (!strcmp (in+split, "w") && strchr("aeo",in[split-1])) continue;
      if (!strcmp (in+split, "t") && strchr("aeioust",in[split-1])) continue;
      if (!strcmp (in+split, "k") && strchr("aceino",in[split-1])) continue;
      if (!strcmp (in+split, "r") && strchr("aeiou",in[split-1])) continue;
      if (!strcmp (in+split, "m") && strchr("eis",in[split-1])) continue;
      if (!strcmp (in+split, "z") && strchr("i",in[split-1])) continue;
      if (!strcmp (in+split, "n") && strchr("eio",in[split-1])) continue;
    }
    if (!strcmp (in+split, "es") && !strchr("hiosuxz",in[split-1])) continue;
    while (*suff++ != ' ');             // suff now points at the suffix's pronunciation
    // pass 0 tries a respelled root (i->y, undoubled consonant, restored
    // final 'e'); pass 1 tries the root exactly as written
    for (pass=0; pass < 2; pass++) {
      strcpy (word, in);
      word[split] = word[split+1] = 0;
      if (pass) ;
      else if (in[split-1] == 'i' && !strchr("cfikmpsv",in[split]))
        word[split-1] = 'y';
      else if (strchr("aeiouy\'",in[split]) && !strchr("aeio",in[split-1])) {
        if (in[split] == 'u' && (in[split+1]=='b' || in[split+1]=='p')) continue;
        if (in[split-1] == in[split-2] && !strchr("hsw",in[split-1]))
          word[split-1] = 0;
        else if ((strchr("cdghlsuvz",in[split-1]) || in[split] == 'e' || strchr("aeiousy",in[split-2])) &&
                 (!strchr("cg",in[split-1]) || !strchr("aou",in[split])))
          word[split] = 'e';
        else continue;
      } else if (!strcmp(word+split-2, "dg"))
        word[split] = 'e';
      else continue;
      sflag = aflag;
      aflag &= ~2;
      if (in[split] != '\'' || word[split]) aflag |= 2;
      if ((score = parse_suffix (word, try, split-strlen(word)))) {
#ifdef SCORE
        printf("%s %s %d + %d", word, in+split, score, SQR(len-split+adj));
#endif
        score += SQR(len-split+adj);
        if (in[split] == '\'' && !pass) score /= 2;
        if (!strcmp (in+split, "call")) score = 1;
        for (i=0; i < sizeof penal/sizeof *penal; i++)
          if (!strcmp (in+split, penal[i])) score -= 9;
        if (score < 1) score = 1;
#ifdef SCORE
        printf(" = %d\n", score);
#endif
      }
      aflag = sflag;
      if (score <= best) continue;
      best = score;
      strcpy (out, try);
      i = strlen(out);
      if (in[split-1] == 'e' && !strchr("aegiou",in[split-2]) &&  // silent e speaks up
          strchr("aou",in[split]) && strchr("dlmnprstu",in[split+1]) &&
          strcmp(in+split,"arm") && strcmp(in+split,"out") &&
          strcmp(in+split,"und") && strcmp(in+split,"up")) {
        if (strchr("\xa6\xb0",out[i-1])) i--;
        out[i++] = 0xa6;
        out[i] = 0;
      }
      cp = suff;
      tail = word+strlen(word);
      if (out[i-1] == *cp && strchr("\xa4\xaf",*cp) && strlen(cp) < 3)
        out[i-1] = 0;                   // with -n, -l, -ly, one is enough
      if (out[i-1] == 0xa6 && strchr("\xa9\xbc",*cp))     // link ia and ear
        out[i-1] = *cp++ == 0xbc ? 0xbd : 0xbe;
      if (strchr("vw",tail[-1]) && out[i-1] == 0x93 && *cp > 0x97 && strcmp(in+split,"s"))
        out[i-1] += 10;                 // change f to v in Slavic surnames
      if (!strcmp(tail-2,"le") && !strcmp(cp,"\xa6") && !strcmp(out+i-2,"\xa9\xa4"))
        strcpy(out+i-2,"\xa4");         // drop the schwa when -le becomes -ly
      if (cp[1] != 0x99 && !strchr("\'\x9b\x9f",*cp) && !strcmp(out+i-2,"\xa9\xa5") && strchr("\x9e\x9f",out[i-3]))
        strcpy(out+i-2,"\xa5");         // drop the schwa in -sm and -thm words
      if (strchr("\xa9\xad\xbe",out[i-2]) && out[i-1] == 0xa4 && !strcmp(cp,"\xa6\x91\xa6")) {
        strcpy(out+i-2, strchr("\x96\x97\xa0\xa1",out[i-3]) || out[i-2] == 0xbe ? "\xa6":"");
        strcat(out,"\xa8\xa4");         // strange "-(i)ality" rule
      }
      strcat (out, cp);
    }
  }
  if (in[len-1] == in[len-2] && !strchr("aeiosu",in[len-2])) {
    aflag |= 2;
    strcpy (word, in);
    word[len-1] = 0;                    // drop final double consonant
    score = parse_suffix (word, try, 0);
    if (best < score) {
      best = score;
      strcpy (out, try);
    }
  }
  // Post-processing of the assembled pronunciation; cp points at its end
  cp = out + strlen(out);
  if (cp-out < 2) return best;
  if (cp[-1] == ':' && cp[-2] > 0x99) {
    for (i=-3; cp+i > out && cp[i] < 0x90; i--);  // find the previous Shavian letter
    if (cp[i] == cp[-2] || cp[i] == cp[-2]-10 ||
        cp[-2] == 0x9f && strchr("\x96\x97\xa0\xa1",cp[i])) {
      *++cp = 0;
      cp[-1] = ':';
      cp[-2] = cp[-3];
      cp[i=-3] = 0xa9;
    }
    if (cp[i] < 0x98) cp[-2] -= 10;
  }
  if (cp-out < 4) return best;
  if (!strcmp(cp-4,"\x92\xa9\xa4\xa6") && strchr("\xa6\xa9",cp[-5]))
    strcpy(cp-4,"\x92\xa4\xa6");        // "ically" is pronounced "icly"
  if (!strncmp(cp-4,"\x91\xb5",2) && strchr("\xa9\xb1\xba\xbc",cp[-2]))
    cp[-4] = 0x97;                      // tu becomes chu
  if (cp[-1] == 0xbe) cp++;             // ia palatization rules
  if (cp[-2] == 0xbe && strchr("\x91\x95\x9f",cp[-3]) && strchr("\x95\xa4\xaf",cp[-1])) {
    i = 0x96;
    if (cp[-3] == 0x91) {
      if (cp[-4] == 0x95) i = 0x97;     // stia --> scha
    }
    else if (strchr("\xaf",cp[-1]) && strchr("\xb0\xb1\xb4\xb5\xb7\xbb\xbf",cp[-4]))
      i = 0xa0;
    cp[-3] = i;
    cp[-2] = 0xa9;
  }
  return best;
}

// Finds a pronunciation for a word, splitting off dictionary prefixes
// (entries starting with '^') when that gives a better score than
// parse_suffix() alone.
//   in  - word; lower-cased in place by parse_suffix()
//   out - receives the best pronunciation found
//   ms  - minimum split: only prefixes longer than ms bytes are tried
// Returns the best score, 0 meaning nothing was found.  Sets aflag as a
// side effect while trying prefixes.
int parse_prefix (char *in, char *out, int ms)
{
  int split, len, score, best, i;
  char *pref;
  uchar try[256];

  best = parse_suffix (in, out, 0);
  len = strlen(in);
  if (best == len*len) return best;     // whole word found in the dictionary
  for (pref = prefix+1; *pref++ == '^'; pref += strlen(pref)+1) {
    split = strchr(pref,' ') - pref;
    if (split <= ms || split > len-2 || wordcmp (in, pref, 1, split)) continue;
    if ((in[split-1]|32) == 'u' && (in[split]|32) == 'n') continue;
    if (split == 1 && (in[0]|32) == 'z' && strchr("aeiouy",in[1]|32)) continue;
    while (*pref++ != ' ');             // pref now points at the prefix's pronunciation
    strcpy (try, pref);
    i = strlen(try);
    aflag = in[split-1] != '\'';
    // NOTE: the braces below exist only in SCORE builds; without SCORE the
    // "la" adjustment runs whether or not the remainder was found.
    if ((score = parse_prefix (in+split, try+i, 1)))
#ifdef SCORE
    { printf("%.*s %s %d + %d", split, in, in+split, SQR(split), score);
#endif
      score += SQR(split);
    if (split == 2 && !strncmp(in,"la",2)) score -= 4;
#ifdef SCORE
      printf(" = %d\n", score); }
#endif
    if (score <= best) continue;
    best = score;
    if (try[i-1] == try[i] && strchr("\xa6\xa7",try[i-2]) && strchr("\xa4\xa5\xae\xaf",try[i]) ||
        !strncmp (try, "\xa5\xa9\x92\x92", 4) || !strncmp (try, "\xa5\xa9\x92*\x92", 5))
      do try[i-1] = try[i];             // undupe ill-, imm-, irr-, inn-, and macc-
      while (try[i++]);
    strcpy (out, try);
  }
  return best;
}

// Writes a pronunciation to stdout as UTF-8.
//   cp  - pronunciation string as stored in the dictionary
//   cap - nonzero to request a leading namer dot (U+00B7)
// The dot is printed when cap is set or the string contains '*', unless
// the string contains ','.  The marker characters ":.,*" are not printed,
// an apostrophe is printed as APOS, bytes below 0x90 are printed as-is,
// and bytes from 0x90 up are expanded to F0 90 91 xx.
void shaw_print (uchar *cp, int cap)
{
  if ((cap || strchr(cp,'*')) && !strchr(cp,','))
    printf ("\xc2\xb7");
  do
    if (strchr(":.,*",*cp)) ;
    else if (*cp == '\'') printf("%s", APOS);
    else if (*cp < 0x90) putchar(*cp);
    else printf ("%c%c%c%c",0xf0,0x90,0x91,*cp);
  while (*++cp);
}

// Filters stdin to stdout, replacing each run of letters (with embedded
// apostrophes) by its Shavian spelling when one is found, and copying
// everything else through.  Text inside <...> tags is copied verbatim,
// and words between <script>/<style> tags and their closing tags are
// left untranslated.
int main (int argc, char **argv)
{
  FILE *fp = stdin;
  uchar c, cap, w=0, sent=0, script=0, *cp;
  // pword holds the previous word; it starts empty so that the first word
  // of the input is never compared against uninitialised memory.
  char word[256], copy[256], out[256], pword[256] = "";
  // Words checked against the word preceding "to", each paired with a
  // Shavian string (raw UTF-8) that is printed after '@' before "to".
  static const char befto[5][2][10] = {
    { "has",      "\xf0\x90\x91\x95" },                   // letter 5 (S)
    { "have",     "\xf0\x90\x91\x93" },                   // letter 3 (F)
    { "used",     "\xf0\x90\x91\x95\xf0\x90\x91\x91" },   // S + letter 1 (T)
    { "unused",   "\xf0\x90\x91\x95\xf0\x90\x91\x91" },
    { "supposed", "\xf0\x90\x91\x95\xf0\x90\x91\x91" } };
  int i;

#if defined(US)
  load_usa();
#elif defined(UK)
  load_uk();
#else
  if (argc < 2) {
    fprintf (stderr, "Usage: %s dictionary-file\n", argv[0]);
    exit (1);
  }
  load_custom (argv[1]);
#endif
#ifdef DUMP
  for (cp = dict; cp < dend; cp++) {    // print the whole dictionary and exit
    if (*cp == 0) {
      shaw_print (cp+1, 0);
      putchar ('\n');
    }
  }
  exit(0);
#endif
  while (!feof(fp)) {
    c = fgetc(fp);                      // EOF arrives here as 0xff
    if (isalpha(c) || (w && c == '\'')) {
      // cap: an upper-case initial on any word but the first of a sentence
      if (!w) cap = sent++ && isupper(c);
      word[w++] = c;
      continue;
    }
    if (strchr (".:?!", c)) sent = 0;
    if (w) {
      word[w] = 0;
      if (script) goto pass;
#if defined(US) || defined(UK)
      for (i=0; i < sizeof except/sizeof *except; i++)
        if (!wordcmp (word, except[i], 1, 0)) {
          printf ("%s", except[i]+w+1);
          goto end;
        }
      if (word[0] == 'I' && !isalpha(word[1])) cap = 0;
#endif
      if (!strcasecmp(word,"to"))
        for (i=0; i < 5; i++)
          if (!strcasecmp(pword,befto[i][0]))
            printf ("@%s ", befto[i][1]);
      aflag = 0;
      strcpy (copy, word);
      if (parse_prefix (copy, out, 0)) {
        if ((i = strlen(out)) > 2 && !strcmp(out+i-2,"\xa5\x9a")) out[i-1] = 0;
        if ((cp = strstr(out,"\xb2\x9f\xb1\x96\xa9\xaf"))) *cp = 0xa6;
#ifdef PURGE
        for (i=0; i < 2; i++) {
          printf ("\n%s ", word);
          shaw_print (out, 0);
          word[0] &= -33;
        }
#else
        shaw_print (out, cap);
#endif
      } else
pass:   printf ("%s", word);            // no translation: echo the word
      memcpy (pword, word, 256);
    }
end:w=0;
    if (c != 0xff) putchar(c);
    if (c == '<') {     // pass through HTML/XML tags and Javascript
      i=0;
      do {
        int t = fgetc(fp);
        if (t == EOF) break;            // unterminated tag: emit nothing more
        putchar (c = t);
        if (i < 256) word[i++] = c;
      } while (c != '>');
      if (!strncasecmp (word, "div", 3)) sent = 0;
      if (!strncasecmp (word, "script", 6)) script = 1;
      if (!strncasecmp (word,"/script", 7)) script = 0;
      if (!strncasecmp (word, "style", 5)) script = 1;
      if (!strncasecmp (word,"/style", 6)) script = 0;
    }
  }
  return 0;
}