#!/usr/bin/python

# Convert filename.tei into filename.dict and filename.index.
# You can sort filename.index afterwards to achieve faster serpento startup.
# Usage: tei2dict.py filename.tei
# This will create filename.dict and filename.index next to filename.tei
# (same directory and base name as the input file).

import sys, string, sgmllib, os

from utils import decb64

# Separator written between the ORTH (headword) values of one entry when
# there is more than one.
seporth = ", "

# Separator written between the TR (translation) values of one entry: the
# newline plus two spaces puts every translation on its own indented line.
septrans = "\n  "


# The input file is the first command-line argument.  The checks are explicit
# instead of ``assert`` so that they are still performed under ``python -O``
# and so that a wrong invocation prints a message rather than a traceback.
if len(sys.argv) < 2:
    sys.exit("usage: tei2dict.py filename.tei")
teifile = sys.argv[1]

basename, ext = os.path.splitext(teifile)
if ext != '.tei':
    sys.exit("tei2dict.py: input file name must end in .tei: %s" % teifile)

# Output files share the input's path and base name.  They stay open as
# module-level file objects because process_entry() writes to both of them.
dictfile = open(basename + ".dict", "w")
indexfile = open(basename + ".index", "w")

def process_entry(orths, trs):
    """Append one entry to the .dict file and index each of its headwords.

    orths -- list of headword strings (text of the entry's ORTH elements)
    trs   -- list of translation strings (text of the entry's TR elements)

    The entry is written to the module-level ``dictfile`` as one line holding
    the headwords joined by ``seporth``, followed by the translations joined
    by ``septrans`` and prefixed with two spaces.  For every headword a line
    ``headword<TAB>offset<TAB>length`` is written to the module-level
    ``indexfile``, where offset and length come from ``dictfile.tell()``
    before and after the write and are each passed through ``decb64``
    (imported from utils; presumably the index number encoding -- confirm
    there).
    """
    start = dictfile.tell()
    # str.join replaces string.join(), which is deprecated and no longer
    # exists in Python 3; the text written is the same.
    dictfile.write(seporth.join(orths) + "\n")
    dictfile.write("  " + septrans.join(trs) + "\n")
    length = dictfile.tell() - start
    # Every headword of the entry points at the same span of the .dict file.
    for orth in orths:
        indexfile.write("%s\t%s\t%s\n" % (orth, decb64(start), decb64(length)))


class Parser(sgmllib.SGMLParser):
    """SGML parser that turns TEI <entry> elements into dictionary entries.

    The text of every <orth> element is collected as a headword and the text
    of every <tr> element as a translation.  At each </entry> both lists are
    handed to process_entry().

    In all start_* handlers ``a`` is the attribute list supplied by sgmllib;
    it is not used.
    """

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.inorth = self.intr = 0
        # Defined up front so that an entry without <form> or <trans> cannot
        # hit a missing attribute in end_entry().
        self.header = []
        self.translations = []
        # Text pieces of the <orth> / <tr> element currently being read.
        self._orth_chunks = []
        self._tr_chunks = []

    def start_entry(self, a):
        # Reset per entry: an entry lacking <form> or <trans> must not
        # inherit the headwords or translations of the previous entry.
        self.header = []
        self.translations = []

    def end_entry(self):
        process_entry(self.header, self.translations)

    def start_form(self, a):
        # Each <form> starts a fresh headword list.
        self.header = []

    def end_form(self):
        pass

    def start_orth(self, a):
        self.inorth = 1
        self._orth_chunks = []

    def end_orth(self):
        self.inorth = 0
        # An element without any text contributes no headword.
        if self._orth_chunks:
            self.header.append("".join(self._orth_chunks))
            self._orth_chunks = []

    def start_tr(self, a):
        self.intr = 1
        self._tr_chunks = []

    def end_tr(self):
        self.intr = 0
        # An element without any text contributes no translation.
        if self._tr_chunks:
            self.translations.append("".join(self._tr_chunks))
            self._tr_chunks = []

    def start_trans(self, a):
        # Each <trans> starts a fresh translation list.
        self.translations = []

    def end_trans(self):
        pass

    def handle_data(self, d):
        """Collect character data that belongs to an <orth> or <tr> element.

        sgmllib may deliver the text of a single element in several calls
        (around an entity reference such as ``&amp;``, or when the text is
        split across feed() calls).  The pieces are therefore buffered here
        and joined when the element ends, so one element always yields
        exactly one headword or translation.  Text outside <orth> and <tr>
        is ignored.
        """
        if self.inorth:
            self._orth_chunks.append(d)
        elif self.intr:
            self._tr_chunks.append(d)

# Feed the TEI file to the parser line by line; every </entry> the parser
# sees writes one record to the .dict and .index files.
p = Parser()

try:
    f = open(teifile)
    try:
        for line in f:
            p.feed(line)
        # Make the parser process whatever it is still holding in its buffer.
        p.close()
    finally:
        f.close()
finally:
    # Close the output files even if reading or parsing failed, so that
    # everything written so far reaches the disk.
    dictfile.close()
    indexfile.close()

