#!/usr/bin/env python
#
#  u t f 8 . p y
#
"""utf8.py: filter stdin to stdout, expanding language tags into UTF-8.

For each of the tags ``unicode``, ``german`` and ``russian`` the text
between ``<tag>`` and ``</tag>`` is passed to :func:`encode` and the whole
tagged span is replaced by the result:

* ``<unicode>HEX</unicode>`` -- HEX is a hexadecimal code point.
* ``<german>...</german>`` / ``<russian>...</russian>`` -- the content is
  transliterated using the ``trans`` table of the module with that name.
"""
import importlib
import re
import sys

try:                        # Python 2
    _unichr = unichr
except NameError:           # Python 3: chr() already yields text characters
    _unichr = chr

# NOTE(review): the four literals below were damaged in the revision this was
# rebuilt from (markup inside them had been stripped).  The head pattern is a
# presumption; the other three are kept exactly as they appeared (empty).
# Restore all of them from the original script before relying on the header
# insertion or on any wrapping of converted text.
_HEAD_PATTERN = "<head>"    # presumed -- TODO confirm
_META_TAG = ''              # presumably a utf-8 <meta> tag -- TODO confirm
_WRAP_OPEN = ""             # emitted before each converted span
_WRAP_CLOSE = ""            # emitted after each converted span

_LANGUAGES = ('unicode', 'german', 'russian')


def main():
    """Read all of stdin, convert the tagged spans and write to stdout."""
    txt = sys.stdin.read()

    # Insert the header string at the head-tag match, or at the very start
    # of the text when there is no match (search() returns -1 -> clamp to 0).
    p = max(0, search(txt, _HEAD_PATTERN))
    txt = txt[:p] + _META_TAG + txt[p:]

    txt = _convert_tags(txt)

    # Written as-is; sys.stdout.write works whether or not ``print`` is a
    # statement in the running interpreter.
    sys.stdout.write(txt)


def _convert_tags(txt):
    """Replace every <lang>...</lang> span in txt with its encoded content."""
    for lang in _LANGUAGES:
        ll = len(lang)
        while True:
            p = search(txt, "<%s>" % lang)
            q = search(txt, "</%s>" % lang)
            # Stop when there is no opening tag, no closing tag, or the
            # first closing tag precedes the first opening tag.
            if p < 0 or q <= p:
                break
            src = txt[p + ll + 2:q]          # skip "<lang>"  (ll + 2 chars)
            rpl = encode(src, lang)
            if not isinstance(rpl, str):
                # Python 3 only: encode() returns bytes while txt is text.
                rpl = rpl.decode('utf-8')
            # q + ll + 3 skips "</lang>" (ll + 3 chars).
            txt = txt[:p] + _WRAP_OPEN + rpl + _WRAP_CLOSE + txt[q + ll + 3:]
    return txt


def encode(s, Trans=None):
    """Encode s to UTF-8.

    Trans selects the conversion:

    * falsy -- s is returned unchanged.
    * ``'unicode'`` -- s is a hexadecimal code point; the UTF-8 encoding of
      that single character is returned.  Raises ValueError when s is not
      valid hexadecimal.
    * anything else -- the name of an importable module that provides
      ``trans`` (a mapping from character sequences to code points) and
      ``maxKeyLen`` (the length of the longest key in ``trans``).  At each
      position the longest matching key wins; characters without a match
      are passed through using their own ordinal.

    Returns a UTF-8 encoded byte string (or s itself when Trans is falsy).
    """
    if not Trans:
        return s
    if Trans == 'unicode':
        # int(..., 16) instead of eval(): s comes straight from the input
        # document and must never be executed as code.
        return _unichr(int(s, 16)).encode('utf-8')

    # Import by name rather than exec()-ing a formatted import statement.
    module = importlib.import_module(Trans)
    trans = module.trans
    maxKeyLen = module.maxKeyLen

    chars = []
    ls = len(s)
    rng = range(maxKeyLen, 0, -1)            # try the longest keys first
    i = 0
    while i < ls:
        uc = None                            # stays None if rng is empty
        for j in rng:
            seq = s[i:i + j]
            uc = trans.get(seq)
            if uc:
                i += j
                break
        if not uc:
            # No table entry matched: keep the character itself.
            uc = ord(s[i])
            i += 1
        chars.append(_unichr(uc))
    return u''.join(chars).encode('utf-8')


def search(a, b):
    """Return the start index of the first match of regex b in a, or -1."""
    match = re.search(b, a)
    if not match:
        return -1
    return match.start()


if __name__ == "__main__":
    main()