babbler/tools/mapgen.c

/* tools/mapgen.c */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <float.h>

// Import physics directly to access PHONEME_DB and f6_t
// Assumes compilation from the main directory
#include "../src/articulator.h"

// External declaration of the database (located in articulator_db.c).
// This file walks the array until it reaches an entry whose `ipa` field is
// NULL, and reads the `ipa`, `target` and `weights` fields of each entry.
extern const phoneme_t PHONEME_DB[];

// --- ANCHOR DEFINITIONS ---
// Anchors serve as reference points for how a specific language writes specific sounds

// One anchor: a spelling in the target orthography paired with the IPA symbol
// whose PHONEME_DB vector stands for that spelling. The anchor tables below
// are arrays of these, each terminated by a {NULL, NULL} entry.
typedef struct {
  const char* spelling;  // text written into the generated table for phonemes nearest this anchor
  const char* ipa_ref;   // IPA key resolved through get_vector(); an unknown key aborts the tool
} anchor_t;

// Hungarian anchors as {spelling, IPA reference} pairs, terminated by
// {NULL, NULL}.
// Entry order matters: generate_table() only replaces its current best anchor
// on a strictly smaller distance, so when two entries share an IPA reference
// only the earlier spelling can be emitted.
// NOTE(review): by that rule "ó", "ő" and "ű" are never selected, because "o",
// "ö" and "ü" appear earlier with the same IPA reference. Confirm this is
// intended.
// NOTE(review): the spelling for "x" embeds the universal character name for
// U+035C between 'c' and 'h'; presumably a tie mark for the digraph. Confirm
// it is wanted in the emitted spelling.
const anchor_t ANCHORS_HUNGARIAN[] = {
    {"a", "ɒ"}, {"a", "ɑ"},  {"á", "a"}, {"e", "ɛ"},  {"é", "e"},        {"i", "i"},  {"i", "ɨ"},  {"o", "o"},
    {"ó", "o"}, {"u", "ʊ"},  {"ú", "u"}, {"ö", "ø"},  {"ő", "ø"},        {"ü", "y"},  {"ű", "y"},  {"p", "p"},
    {"b", "b"}, {"t", "t"},  {"d", "d"}, {"k", "k"},  {"g", "g"},        {"m", "m"},  {"n", "n"},  {"ny", "ɲ"},
    {"f", "f"}, {"v", "v"},  {"s", "ʃ"}, {"sz", "s"}, {"z", "z"},        {"zs", "ʒ"}, {"ty", "c"}, {"gy", "ɟ"},
    {"l", "l"}, {"ly", "ʎ"}, {"r", "r"}, {"h", "h"},  {"c\u035Ch", "x"}, {"j", "j"},  {NULL, NULL}};

// Polish anchors as {spelling, IPA reference} pairs, terminated by
// {NULL, NULL}.
// Every IPA reference appears once; the spelling "a" is shared by two
// references. Entry order still decides which anchor wins when a phoneme is
// equally distant from several anchors (generate_table() keeps the earliest).
const anchor_t ANCHORS_POLISH[] = {
    {"a", "a"},  {"a", "ɑ"}, {"e", "ɛ"}, {"i", "i"}, {"o", "o"},  {"u", "u"}, {"y", "ɨ"},  {"p", "p"},
    {"b", "b"},  {"t", "t"}, {"d", "d"}, {"k", "k"}, {"g", "g"},  {"m", "m"}, {"n", "n"},  {"ń", "ɲ"},
    {"f", "f"},  {"w", "v"}, {"s", "s"}, {"z", "z"}, {"sz", "ʂ"}, {"ż", "ʐ"}, {"ś", "ɕ"},  {"ź", "ʑ"},
    {"ch", "x"}, {"h", "h"}, {"l", "l"}, {"ł", "w"}, {"r", "r"},  {"j", "j"}, {NULL, NULL}};
// German anchors as {spelling, IPA reference} pairs, terminated by
// {NULL, NULL}.
// Entry order matters: generate_table() only replaces its current best anchor
// on a strictly smaller distance, so when two entries share an IPA reference
// only the earlier spelling can be emitted.
// NOTE(review): by that rule {"ä", "ɛ"} is shadowed by {"e", "ɛ"} and
// {"ts", "z"} is shadowed by {"s", "z"}. The latter looks like it was meant to
// describe the letter "z" (pronounced as an affricate) — confirm and correct
// once the database has a matching phoneme.
const anchor_t ANCHORS_GERMAN[] = {
    {"a", "a"},   {"a", "ɑ"}, {"e", "e"}, {"e", "ɛ"},  {"e", "ə"},  {"i", "i"},  {"i", "ɪ"},  {"o", "o"}, {"o", "ɔ"},
    {"u", "u"},   {"u", "ʊ"}, {"ä", "ɛ"}, {"ö", "ø"},  {"ü", "y"},  {"p", "p"},  {"b", "b"},  {"t", "t"}, {"d", "d"},
    {"k", "k"},   {"g", "g"}, {"m", "m"}, {"n", "n"},  {"ng", "ŋ"}, {"f", "f"},  {"w", "v"},  {"s", "z"}, {"ss", "s"},
    {"sch", "ʃ"}, {"j", "j"}, {"r", "r"}, {"ch", "x"}, {"ch", "ç"}, {"ts", "z"},
    // Without these two, /l/ and /h/ had no anchor of their own and were
    // spelled as whichever unrelated anchor happened to be nearest. Appended
    // last so existing entries keep priority on equal distances.
    {"l", "l"},   {"h", "h"}, {NULL, NULL}};

// Cyrillic anchors as {spelling, IPA reference} pairs, terminated by
// {NULL, NULL}.
// Spellings are Cyrillic letters; the IPA references are Latin/IPA characters
// even where the two glyphs look alike. Every IPA reference appears once.
// Entry order decides which anchor wins when a phoneme is equally distant from
// several anchors (generate_table() keeps the earliest).
const anchor_t ANCHORS_CYRILLIC[] = {
    {"а", "a"}, {"а", "ɑ"}, {"б", "b"}, {"в", "v"}, {"г", "g"}, {"д", "d"}, {"е", "ɛ"}, {"ж", "ʒ"},  {"з", "z"},
    {"и", "i"}, {"й", "j"}, {"к", "k"}, {"л", "l"}, {"м", "m"}, {"н", "n"}, {"о", "o"}, {"п", "p"},  {"р", "r"},
    {"с", "s"}, {"т", "t"}, {"у", "u"}, {"ф", "f"}, {"х", "x"}, {"ш", "ʃ"}, {"ы", "ɨ"}, {NULL, NULL}};

// Orcish anchors as {spelling, IPA reference} pairs, terminated by
// {NULL, NULL}.
// Many IPA references deliberately share one spelling here (for example "u",
// "o", "a", "kh", "gh"), so distinct sounds collapse onto the same letters.
// Every IPA reference appears once. Entry order decides which anchor wins when
// a phoneme is equally distant from several anchors (generate_table() keeps
// the earliest).
const anchor_t ANCHORS_ORCISH[] = {
    {"u", "y"},  {"u", "u"},  {"u", "ʊ"},  {"o", "ø"},  {"o", "o"}, {"o", "ɔ"},  {"a", "a"},  {"a", "ɑ"},  {"a", "ʌ"},
    {"e", "e"},  {"i", "i"},  {"b", "b"},  {"d", "d"},  {"g", "g"}, {"p", "p"},  {"t", "t"},  {"k", "k"},  {"kh", "x"},
    {"kh", "χ"}, {"gh", "ɣ"}, {"gh", "ʁ"}, {"q", "q"},  {"k", "ɢ"}, {"h", "h"},  {"hh", "ɦ"}, {"kh", "ħ"}, {"m", "m"},
    {"n", "n"},  {"ng", "ŋ"}, {"ny", "ɲ"}, {"l", "l"},  {"r", "r"}, {"rr", "ʀ"}, {"v", "v"},  {"w", "w"},  {"z", "z"},
    {"zg", "ʒ"}, {"sh", "ʃ"}, {"th", "θ"}, {"dh", "ð"}, {"y", "j"}, {NULL, NULL}};
// --- GENERATOR LOGIC ---

// Looks up the `target` vector that PHONEME_DB stores for an IPA symbol.
//
// ipa: NUL-terminated IPA string, compared byte-for-byte with strcmp.
// Returns a copy of the `target` field of the first matching entry.
// Does not return on a miss: the symbol is reported on stderr and the process
// exits with status 1, so a bad anchor reference aborts generation.
f6_t get_vector(const char* ipa) {
  int found = -1;

  // The database ends at the first entry whose `ipa` is NULL.
  for (int idx = 0; found < 0 && PHONEME_DB[idx].ipa != NULL; ++idx) {
    if (strcmp(PHONEME_DB[idx].ipa, ipa) == 0) {
      found = idx;
    }
  }

  if (found < 0) {
    fprintf(stderr, "ERROR: Anchor refers to unknown IPA: %s\n", ipa);
    exit(1);
  }

  return PHONEME_DB[found].target;
}

// Prints a `const ortho_rule_t <array_name>[]` table on stdout in which every
// PHONEME_DB symbol maps to itself, followed by a {NULL, NULL} row.
//
// array_name: identifier used for the generated C array.
void generate_ipa_identity_table(const char* array_name) {
  printf("const ortho_rule_t %s[] = {\n", array_name);

  // One row per database entry; the database ends at the first NULL `ipa`.
  int row = 0;
  while (PHONEME_DB[row].ipa != NULL) {
    const char* symbol = PHONEME_DB[row].ipa;
    printf("    {\"%s\", \"%s\"},\n", symbol, symbol);
    row++;
  }

  printf("    {NULL, NULL}\n};\n\n");
}

// Prints one `const ortho_rule_t <array_name>[]` table on stdout that maps
// every phoneme in PHONEME_DB to the spelling of its nearest anchor.
//
// array_name: identifier used for the generated C array.
// anchors:    anchor list terminated by an entry whose `spelling` is NULL.
//
// "Nearest" means the smallest f6_dist_sq(phoneme target, anchor vector,
// phoneme weights). When several anchors are equally near, the earliest one
// in `anchors` wins. The table ends with a {NULL, NULL} row.
//
// Exits the process with status 1 (message on stderr) when the anchor list is
// NULL or empty, when memory cannot be allocated, or, through get_vector(),
// when an anchor names an IPA symbol that PHONEME_DB does not contain.
void generate_table(const char* array_name, const anchor_t* anchors) {
  size_t anchor_count = 0;
  if (anchors != NULL) {
    while (anchors[anchor_count].spelling != NULL) {
      anchor_count++;
    }
  }

  // With no anchors there is no spelling to emit; previously this path passed
  // a NULL spelling to printf("%s").
  if (anchor_count == 0) {
    fprintf(stderr, "ERROR: No anchors supplied for table: %s\n", array_name);
    exit(1);
  }

  // Resolve every anchor to its database vector once, up front, so the inner
  // search below does no string lookups.
  f6_t* anchor_vecs = malloc(anchor_count * sizeof *anchor_vecs);
  if (anchor_vecs == NULL) {
    fprintf(stderr, "ERROR: Out of memory while building table: %s\n", array_name);
    exit(1);
  }
  for (size_t j = 0; j < anchor_count; j++) {
    anchor_vecs[j] = get_vector(anchors[j].ipa_ref);
  }

  printf("const ortho_rule_t %s[] = {\n", array_name);

  for (int i = 0; PHONEME_DB[i].ipa != NULL; i++) {
    f6_t source = PHONEME_DB[i].target;
    f6_t weights = PHONEME_DB[i].weights;

    float min_dist = FLT_MAX;
    size_t best = 0;

    // Strict '<' keeps the earliest anchor on ties, so anchor order is part
    // of the table's meaning.
    for (size_t j = 0; j < anchor_count; j++) {
      float d = f6_dist_sq(source, anchor_vecs[j], weights);
      if (d < min_dist) {
        min_dist = d;
        best = j;
      }
    }

    printf("    {\"%s\", \"%s\"},\n", PHONEME_DB[i].ipa, anchors[best].spelling);
  }

  printf("    {NULL, NULL}\n};\n\n");
  free(anchor_vecs);
}

// Writes the complete generated header to stdout: an include guard, the
// transcriber.h include, one nearest-anchor table per language, and the IPA
// identity table.
//
// Returns 0 on success and 1 if stdout reported a write error. The table
// generators may also exit with status 1 on their own.
int main(void) {
  printf("// Automatically generated by mapgen\n");

  printf("#ifndef GENERATED_ORTHO_H\n");
  printf("#define GENERATED_ORTHO_H\n\n");
  printf("#include \"transcriber.h\"\n\n");

  generate_table("ORTHO_POLISH", ANCHORS_POLISH);
  generate_table("ORTHO_HUNGARIAN", ANCHORS_HUNGARIAN);
  generate_table("ORTHO_GERMAN", ANCHORS_GERMAN);
  generate_table("ORTHO_CYRILLIC", ANCHORS_CYRILLIC);
  generate_table("ORTHO_ORCISH", ANCHORS_ORCISH);
  generate_ipa_identity_table("ORTHO_IPA");
  printf("#endif\n");

  // A failed or short write (e.g. full disk when stdout is redirected to a
  // file) must not be reported as success, or a truncated header could be
  // consumed downstream. ferror() catches failures from earlier printf calls.
  if (fflush(stdout) != 0 || ferror(stdout)) {
    fprintf(stderr, "ERROR: Failed to write generated output\n");
    return 1;
  }
  return 0;
}