""" Processes the original EDICT file to output simplified ZBEDICT format. For entries written in kanji, also inserts an entry linking the kana reading of the kanji to the kanji entry. usage: simplegen.py edict output.txt The edict file is expected to be in EUC-JP encoding. Output file is in UTF-8. Michael Penkov (misha.penkov@gmail.com) """ import codecs import re import string import sys # First full-width character FW_START = u'\uff01' # Last full-width character FW_END = u'\uffee' # Not used def full2half(line): """ Converts full-width latin characters & punctuation to their half-width counterparts. http://www.fileformat.info/info/unicode/char/ff01/index.htm http://en.wikipedia.org/wiki/Latin_characters_in_Unicode """ result = '' for c in line: if c >= FW_START and c <= FW_END: c = chr(ord('!') + ord(c) - ord(FW_START)) result = result + c return result class Entry: """ A class to represent an entry in BEDICT """ Keyword = None Senses = None def __init__(self, keyword, senses): self.Keyword = keyword self.Senses = senses class Sense: """ A class to represent a sense in BEDICT """ PartOfSpeech = None Subsenses = None Kana = None Links = None def __init__(self, kana, ptSpeech = None): self.PartOfSpeech = ptSpeech self.Subsenses = [] self.Kana = kana self.Links = None def readSenses(components, kana): """ Read senses from a list of components """ senses = [] currentSense = None for c in components: ptSpeechRegex = re.compile('\\([A-Za-z0-9,.-]*\\)') ptSpeechMatch = ptSpeechRegex.match(c) if (ptSpeechMatch is not None): # This is a new sense ptSpeech = ptSpeechMatch.group() # Get rid of the braces ptSpeech = ptSpeech[1:len(ptSpeech) - 1] # Check for ordinals ordinalRegex = re.compile('\\([0-9]+\\)') ordinalMatch = ordinalRegex.match(c, ptSpeechMatch.end()) if ordinalMatch is not None: # Drop the ordinals c = c[len(ordinalMatch.group()):].strip() else: # Drop the part of the speech c = c[len(ptSpeechMatch.group()):].strip() currentSense = Sense(kana, ptSpeech) senses.append(currentSense) # Add subsense to the current sense, if there is one if (currentSense is not None): currentSense.Subsenses.append(c.strip()) return senses def readEntry(components): # Process the entry keyword = components[0] kana = keyword start = string.find(keyword, '[') end = string.find(keyword, ']', start) if (start != -1 and end != -1): # Kana is different from keyword kana = keyword[start + 1:end].strip() keyword = keyword[0:start].strip() # A set of senses contained by this line return Entry(keyword, readSenses(components[1:], kana)) def writeHeader(fout, header): """ Processes the EDICT header line """ fout.write('id=Japanese-English\n') fout.write('maintainer=Michael Penkov (misha.penkov@gmail.com)\n') components = header.rsplit('/') desc = None count = 0 for c in components: c = c.strip() if (len(c) > 0): if (desc == None): desc = c fout.write('description=%s\n' % desc) else: fill = str(count).zfill(2) fout.write('comment%s=%s\n' % (fill, c)) count = count + 1 fout.write('\n') def writeEntry(fout, entry): fout.write(entry.Keyword + '\n') for s in entry.Senses: fout.write('{s}') # Pronunciation if s.Kana is not None: fout.write('{pr}%s{/pr}' % s.Kana) # Part of speech if s.PartOfSpeech is not None: fout.write('{ps}%s{/ps}' % s.PartOfSpeech) # Sub-sense for ss in s.Subsenses: ss = ss.strip() if len(ss) > 0: fout.write('{ss}%s{/ss}' % ss) # Links if s.Links is not None: for l in s.Links: l = l.strip() if (len(l) > 0): fout.write('{ss}{sa}%s{/sa}{/ss}' % l) fout.write('{/s}') fout.write('\n\n') def main(): if len(sys.argv) < 3: print 'usage: %s edictfile outputfile' sys.exit() fin = codecs.open(sys.argv[1], 'r', 'euc-jp') fout = codecs.open(sys.argv[2], 'w', 'utf-8') # A dictionary of keywords to entries. From original EDICT. keywordDict = {} try: while True: line = fin.readline() if line == '': # EOL break #components = full2half(line).rsplit('/') components = line.rsplit('/') for i in range(0, len(components)): components[i] = components[i].strip() if line.startswith(u'\uff1f\uff1f\uff1f\uff1f'): # Process header writeHeader(fout, line[4:]) continue entry = readEntry(components) if (entry.Keyword not in keywordDict): # New keyword keywordDict[entry.Keyword] = entry else: # Keyword already exists. Append senses for sense in entry.Senses: keywordDict[entry.Keyword].Senses.append(sense) kana = entry.Senses[0].Kana if (kana != entry.Keyword): if (kana not in keywordDict): # Create an entry to contain the kana keywordDict[kana] = Entry(kana, []) keywordDict[kana].Senses.append(Sense(kana)) # Map kana to full entry firstSense = keywordDict[kana].Senses[0] if firstSense.Links is None: firstSense.Links = [] firstSense.Links.append(entry.Keyword) # Write out the mappings for kw in keywordDict.keys(): writeEntry(fout, keywordDict[kw]) finally: fin.close() fout.close() if __name__ == "__main__": main()