# -*- encoding: utf-8 -*- """Convert many unicode characters to ascii characters that are like them. I want to collate names, with the property that a last name starting with O-umlaut will be in with the last name's starting with O. Horrors! So I want that many Latin-1 characters have their umlaute's, etc., stripped. Some of it can be done automatically but some needs to be done by hand, that I can tell. """ __version__='1.0.1' __author__='Jim Hefferon: ftpmaint at tug.ctan.org' __date__='2008-July-15' __notes__="""As sources, used effbot's web site, and http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/251871 and man uni2ascii """ import os, os.path, sys, re import unicodedata # These characters that are not done automatically by NFKD, and # have a name starting with "LATIN". Some of these I found on the interwebs, # but some I did by eye. Corrections or additions appreciated. EXTRA_LATIN_NAMES={ # First are ones I got off the interweb u"\N{LATIN CAPITAL LETTER O WITH STROKE}": u"O", u"\N{LATIN SMALL LETTER A WITH GRAVE}": u"a", u"\N{LATIN SMALL LETTER A WITH ACUTE}": u"a", u"\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}": u"a", u"\N{LATIN SMALL LETTER A WITH TILDE}": u"a", u"\N{LATIN SMALL LETTER A WITH DIAERESIS}": u"ae", u"\N{LATIN SMALL LETTER A WITH RING ABOVE}": u"a", u"\N{LATIN SMALL LETTER C WITH CEDILLA}": u"c", u"\N{LATIN SMALL LETTER E WITH GRAVE}": u"e", u"\N{LATIN SMALL LETTER E WITH ACUTE}": u"e", u"\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}": u"e", u"\N{LATIN SMALL LETTER E WITH DIAERESIS}": u"e", u"\N{LATIN SMALL LETTER I WITH GRAVE}": u"i", u"\N{LATIN SMALL LETTER I WITH ACUTE}": u"i", u"\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}": u"i", u"\N{LATIN SMALL LETTER I WITH DIAERESIS}": u"i", u"\N{LATIN SMALL LETTER N WITH TILDE}": u"n", u"\N{LATIN SMALL LETTER O WITH GRAVE}": u"o", u"\N{LATIN SMALL LETTER O WITH ACUTE}": u"o", u"\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}": u"o", u"\N{LATIN SMALL LETTER O WITH TILDE}": u"o", u"\N{LATIN SMALL LETTER O WITH DIAERESIS}": u"oe", u"\N{LATIN SMALL LETTER O WITH STROKE}": u"o", u"\N{LATIN SMALL LETTER U WITH GRAVE}": u"u", u"\N{LATIN SMALL LETTER U WITH ACUTE}": u"u", u"\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}": u"u", u"\N{LATIN SMALL LETTER U WITH DIAERESIS}": u"ue", u"\N{LATIN SMALL LETTER Y WITH ACUTE}": u"y", u"\N{LATIN SMALL LETTER Y WITH DIAERESIS}": u"y", u"\N{LATIN SMALL LETTER A WITH MACRON}": u"a", u"\N{LATIN SMALL LETTER A WITH BREVE}": u"a", u"\N{LATIN SMALL LETTER C WITH ACUTE}": u"c", u"\N{LATIN SMALL LETTER C WITH CIRCUMFLEX}": u"c", u"\N{LATIN SMALL LETTER E WITH MACRON}": u"e", u"\N{LATIN SMALL LETTER E WITH BREVE}": u"e", u"\N{LATIN SMALL LETTER G WITH CIRCUMFLEX}": u"g", u"\N{LATIN SMALL LETTER G WITH BREVE}": u"g", u"\N{LATIN SMALL LETTER G WITH CEDILLA}": u"g", u"\N{LATIN SMALL LETTER H WITH CIRCUMFLEX}": u"h", u"\N{LATIN SMALL LETTER I WITH TILDE}": u"i", u"\N{LATIN SMALL LETTER I WITH MACRON}": u"i", u"\N{LATIN SMALL LETTER I WITH BREVE}": u"i", u"\N{LATIN SMALL LETTER J WITH CIRCUMFLEX}": u"j", u"\N{LATIN SMALL LETTER K WITH CEDILLA}": u"k", u"\N{LATIN SMALL LETTER L WITH ACUTE}": u"l", u"\N{LATIN SMALL LETTER L WITH CEDILLA}": u"l", u"\N{LATIN CAPITAL LETTER L WITH STROKE}": u"L", u"\N{LATIN SMALL LETTER L WITH STROKE}": u"l", u"\N{LATIN SMALL LETTER N WITH ACUTE}": u"n", u"\N{LATIN SMALL LETTER N WITH CEDILLA}": u"n", u"\N{LATIN SMALL LETTER O WITH MACRON}": u"o", u"\N{LATIN SMALL LETTER O WITH BREVE}": u"o", u"\N{LATIN SMALL LETTER R WITH ACUTE}": u"r", u"\N{LATIN SMALL LETTER R WITH CEDILLA}": u"r", u"\N{LATIN SMALL LETTER S WITH ACUTE}": u"s", u"\N{LATIN SMALL LETTER S WITH CIRCUMFLEX}": u"s", u"\N{LATIN SMALL LETTER S WITH CEDILLA}": u"s", u"\N{LATIN SMALL LETTER T WITH CEDILLA}": u"t", u"\N{LATIN SMALL LETTER U WITH TILDE}": u"u", u"\N{LATIN SMALL LETTER U WITH MACRON}": u"u", u"\N{LATIN SMALL LETTER U WITH BREVE}": u"u", u"\N{LATIN SMALL LETTER U WITH RING ABOVE}": u"u", u"\N{LATIN SMALL LETTER W WITH CIRCUMFLEX}": u"w", u"\N{LATIN SMALL LETTER Y WITH CIRCUMFLEX}": u"y", u"\N{LATIN SMALL LETTER Z WITH ACUTE}": u"z", u"\N{LATIN SMALL LETTER W WITH GRAVE}": u"w", u"\N{LATIN SMALL LETTER W WITH ACUTE}": u"w", u"\N{LATIN SMALL LETTER W WITH DIAERESIS}": u"w", u"\N{LATIN SMALL LETTER Y WITH GRAVE}": u"y", # Below are the ones that failed automated conversion u'\N{LATIN CAPITAL LETTER AE}': u'AE', u'\N{LATIN CAPITAL LETTER ETH}': u'D', u"\N{LATIN CAPITAL LETTER A WITH DIAERESIS}": u"Ae", u"\N{LATIN CAPITAL LETTER O WITH DIAERESIS}": u"Oe", u"\N{LATIN CAPITAL LETTER U WITH DIAERESIS}": u"Ue", u'\N{LATIN CAPITAL LETTER O WITH STROKE}': u'O', u'\N{LATIN CAPITAL LETTER THORN}': u'TH', u'\N{LATIN SMALL LETTER SHARP S}': u'ss', u'\N{LATIN SMALL LETTER AE}': u'ae', u'\N{LATIN SMALL LETTER ETH}': u'd', u'\N{LATIN SMALL LETTER O WITH STROKE}': u'o', u'\N{LATIN SMALL LETTER THORN}': 'th', u'\N{LATIN CAPITAL LETTER D WITH STROKE}': u'D', u'\N{LATIN SMALL LETTER D WITH STROKE}': u'd', u'\N{LATIN CAPITAL LETTER H WITH STROKE}': u'H', u'\N{LATIN SMALL LETTER H WITH STROKE}': u'h', u'\N{LATIN SMALL LETTER DOTLESS I}': u'i', u'\N{LATIN SMALL LETTER KRA}': u'q', u'\N{LATIN CAPITAL LETTER L WITH STROKE}': u'L', u'\N{LATIN SMALL LETTER L WITH STROKE}': u'l', u'\N{LATIN CAPITAL LETTER ENG}': u'N', u'\N{LATIN SMALL LETTER ENG}': u'n', u'\N{LATIN CAPITAL LIGATURE OE}': u'OE', u'\N{LATIN SMALL LIGATURE OE}': u'oe', u'\N{LATIN CAPITAL LETTER T WITH STROKE}': u'T', u'\N{LATIN SMALL LETTER T WITH STROKE}': u't', u'\N{LATIN SMALL LETTER B WITH STROKE}': u'b', u'\N{LATIN CAPITAL LETTER B WITH HOOK}': u'B', u'\N{LATIN CAPITAL LETTER B WITH TOPBAR}': u'B', u'\N{LATIN SMALL LETTER B WITH TOPBAR}': u'b', # u'\N{LATIN CAPITAL LETTER TONE SIX}': u'', # ?B # u'\N{LATIN SMALL LETTER TONE SIX}': u'', # ?b u'\N{LATIN CAPITAL LETTER OPEN O}': u'O', u'\N{LATIN CAPITAL LETTER C WITH HOOK}': u'C', u'\N{LATIN SMALL LETTER C WITH HOOK}': u'c', u'\N{LATIN CAPITAL LETTER AFRICAN D}': u'D', u'\N{LATIN CAPITAL LETTER D WITH HOOK}': u'D', u'\N{LATIN CAPITAL LETTER D WITH TOPBAR}': u'D', u'\N{LATIN SMALL LETTER D WITH TOPBAR}': u'd', # u'\N{LATIN SMALL LETTER TURNED DELTA}': u'', u'\N{LATIN CAPITAL LETTER REVERSED E}': u'E', # u'\N{LATIN CAPITAL LETTER SCHWA}': u'', u'\N{LATIN CAPITAL LETTER OPEN E}': u'E', u'\N{LATIN CAPITAL LETTER F WITH HOOK}': u'F', u'\N{LATIN SMALL LETTER F WITH HOOK}': u'f', u'\N{LATIN CAPITAL LETTER G WITH HOOK}': u'G', # u'\N{LATIN CAPITAL LETTER GAMMA}': u'', u'\N{LATIN SMALL LETTER HV}': u'hv', u'\N{LATIN CAPITAL LETTER IOTA}': u'i', u'\N{LATIN CAPITAL LETTER I WITH STROKE}': u'I', u'\N{LATIN CAPITAL LETTER K WITH HOOK}': u'K', u'\N{LATIN SMALL LETTER K WITH HOOK}': u'k', u'\N{LATIN SMALL LETTER L WITH BAR}': u'l', # u'\N{LATIN SMALL LETTER LAMBDA WITH STROKE}': u'', # u'\N{LATIN CAPITAL LETTER TURNED M}': u'', u'\N{LATIN CAPITAL LETTER N WITH LEFT HOOK}': u'N', u'\N{LATIN SMALL LETTER N WITH LONG RIGHT LEG}': u'N', u'\N{LATIN CAPITAL LETTER O WITH MIDDLE TILDE}': u'O', u'\N{LATIN CAPITAL LETTER OI}': u'OI', u'\N{LATIN SMALL LETTER OI}': u'oi', u'\N{LATIN CAPITAL LETTER P WITH HOOK}': u'P', u'\N{LATIN SMALL LETTER P WITH HOOK}': u'p', # u'\N{LATIN LETTER YR}': u'', # u'\N{LATIN CAPITAL LETTER TONE TWO}': u'', # u'\N{LATIN SMALL LETTER TONE TWO}': u'', u'\N{LATIN CAPITAL LETTER ESH}': u'SH', # u'\N{LATIN LETTER REVERSED ESH LOOP}': u'', u'\N{LATIN SMALL LETTER T WITH PALATAL HOOK}': u't', u'\N{LATIN CAPITAL LETTER T WITH HOOK}': u'T', u'\N{LATIN SMALL LETTER T WITH HOOK}': u't', u'\N{LATIN CAPITAL LETTER T WITH RETROFLEX HOOK}': u'T', # u'\N{LATIN CAPITAL LETTER UPSILON}': u'', u'\N{LATIN CAPITAL LETTER V WITH HOOK}': u'V', u'\N{LATIN CAPITAL LETTER Y WITH HOOK}': u'Y', u'\N{LATIN SMALL LETTER Y WITH HOOK}': u'y', u'\N{LATIN CAPITAL LETTER Z WITH STROKE}': u'Z', u'\N{LATIN SMALL LETTER Z WITH STROKE}': u'z', u'\N{LATIN CAPITAL LETTER EZH}': u'S', # u'\N{LATIN CAPITAL LETTER EZH REVERSED}': u'', # u'\N{LATIN SMALL LETTER EZH REVERSED}': u'', u'\N{LATIN SMALL LETTER EZH WITH TAIL}': u's', # u'\N{LATIN LETTER TWO WITH STROKE}': u'', # u'\N{LATIN CAPITAL LETTER TONE FIVE}': u'', # u'\N{LATIN SMALL LETTER TONE FIVE}': u'', # u'\N{LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE}': u'', u'\N{LATIN LETTER WYNN}': u'w', # u'\N{LATIN LETTER DENTAL CLICK}': u'', # u'\N{LATIN LETTER LATERAL CLICK}': u'', # u'\N{LATIN LETTER ALVEOLAR CLICK}': u'', # u'\N{LATIN LETTER RETROFLEX CLICK}': u'', # u'\N{LATIN SMALL LETTER TURNED E}': u'', u'\N{LATIN CAPITAL LETTER AE WITH MACRON}': u'AE', u'\N{LATIN SMALL LETTER AE WITH MACRON}': u'ae', u'\N{LATIN CAPITAL LETTER G WITH STROKE}': u'G', u'\N{LATIN SMALL LETTER G WITH STROKE}': u'g', u'\N{LATIN CAPITAL LETTER EZH WITH CARON}': u'S', u'\N{LATIN SMALL LETTER EZH WITH CARON}': u's', u'\N{LATIN CAPITAL LETTER HWAIR}': u'HW', u'\N{LATIN CAPITAL LETTER WYNN}': u'W', u'\N{LATIN CAPITAL LETTER AE WITH ACUTE}': u'AE', u'\N{LATIN SMALL LETTER AE WITH ACUTE}': u'AE', u'\N{LATIN CAPITAL LETTER O WITH STROKE AND ACUTE}': u'O', u'\N{LATIN SMALL LETTER O WITH STROKE AND ACUTE}': u'o', u'\N{LATIN CAPITAL LETTER YOGH}': u'J', u'\N{LATIN SMALL LETTER YOGH}': u'j', u'\N{LATIN CAPITAL LETTER N WITH LONG RIGHT LEG}': u'N', u'\N{LATIN SMALL LETTER D WITH CURL}': u'd', u'\N{LATIN CAPITAL LETTER OU}': u'OU', u'\N{LATIN SMALL LETTER OU}': u'ou', u'\N{LATIN CAPITAL LETTER Z WITH HOOK}': u'Z', u'\N{LATIN SMALL LETTER Z WITH HOOK}': u'z', u'\N{LATIN SMALL LETTER L WITH CURL}': u'l', u'\N{LATIN SMALL LETTER N WITH CURL}': u'n', u'\N{LATIN SMALL LETTER T WITH CURL}': u't', u'\N{LATIN SMALL LETTER DOTLESS J}': u'j', u'\N{LATIN SMALL LETTER DB DIGRAPH}': u'db', u'\N{LATIN SMALL LETTER QP DIGRAPH}': u'qp', u'\N{LATIN CAPITAL LETTER A WITH STROKE}': u'A', u'\N{LATIN CAPITAL LETTER C WITH STROKE}': u'C', u'\N{LATIN SMALL LETTER C WITH STROKE}': u'C', u'\N{LATIN CAPITAL LETTER L WITH BAR}': u'L', u'\N{LATIN CAPITAL LETTER T WITH DIAGONAL STROKE}': u'T', u'\N{LATIN SMALL LETTER S WITH SWASH TAIL}': u'S', u'\N{LATIN SMALL LETTER Z WITH SWASH TAIL}': u'Z', # u'\N{LATIN CAPITAL LETTER GLOTTAL STOP}': u'', # u'\N{LATIN SMALL LETTER TURNED A}': u'', # u'\N{LATIN SMALL LETTER ALPHA}': u'', # u'\N{LATIN SMALL LETTER TURNED ALPHA}': u'', u'\N{LATIN SMALL LETTER B WITH HOOK}': u'b', u'\N{LATIN SMALL LETTER OPEN O}': u'o', u'\N{LATIN SMALL LETTER C WITH CURL}': u'c', u'\N{LATIN SMALL LETTER D WITH TAIL}': u'd', u'\N{LATIN SMALL LETTER D WITH HOOK}': u'd', # u'\N{LATIN SMALL LETTER REVERSED E}': u'', # u'\N{LATIN SMALL LETTER SCHWA}': u'', # u'\N{LATIN SMALL LETTER SCHWA WITH HOOK}': u'', u'\N{LATIN SMALL LETTER OPEN E}': u'e', # u'\N{LATIN SMALL LETTER REVERSED OPEN E}': u'', # u'\N{LATIN SMALL LETTER REVERSED OPEN E WITH HOOK}': u'', # u'\N{LATIN SMALL LETTER CLOSED REVERSED OPEN E}': u'', u'\N{LATIN SMALL LETTER DOTLESS J WITH STROKE}': u'j', u'\N{LATIN SMALL LETTER G WITH HOOK}': u'g', u'\N{LATIN SMALL LETTER SCRIPT G}': u'g', u'\N{LATIN LETTER SMALL CAPITAL G}': u'G', # u'\N{LATIN SMALL LETTER GAMMA}': u'', # u'\N{LATIN SMALL LETTER RAMS HORN}': u'', # u'\N{LATIN SMALL LETTER TURNED H}': u'', u'\N{LATIN SMALL LETTER H WITH HOOK}': u'h', u'\N{LATIN SMALL LETTER HENG WITH HOOK}': u'h', u'\N{LATIN SMALL LETTER I WITH STROKE}': u'i', # u'\N{LATIN SMALL LETTER IOTA}': u'', u'\N{LATIN LETTER SMALL CAPITAL I}': u'I', u'\N{LATIN SMALL LETTER L WITH MIDDLE TILDE}': u'L', u'\N{LATIN SMALL LETTER L WITH BELT}': u'L', u'\N{LATIN SMALL LETTER L WITH RETROFLEX HOOK}': u'L', # u'\N{LATIN SMALL LETTER LEZH}': u'', # u'\N{LATIN SMALL LETTER TURNED M}': u'', # u'\N{LATIN SMALL LETTER TURNED M WITH LONG LEG}': u'', u'\N{LATIN SMALL LETTER M WITH HOOK}': u'm', u'\N{LATIN SMALL LETTER N WITH LEFT HOOK}': u'n', u'\N{LATIN SMALL LETTER N WITH RETROFLEX HOOK}': u'n', u'\N{LATIN LETTER SMALL CAPITAL N}': u'N', u'\N{LATIN SMALL LETTER BARRED O}': u'o', u'\N{LATIN LETTER SMALL CAPITAL OE}': u'OE', # u'\N{LATIN SMALL LETTER CLOSED OMEGA}': u'', # u'\N{LATIN SMALL LETTER PHI}': u'', # u'\N{LATIN SMALL LETTER TURNED R}': u'', # u'\N{LATIN SMALL LETTER TURNED R WITH LONG LEG}': u'', # u'\N{LATIN SMALL LETTER TURNED R WITH HOOK}': u'', u'\N{LATIN SMALL LETTER R WITH LONG LEG}': u'r', u'\N{LATIN SMALL LETTER R WITH TAIL}': u'r', u'\N{LATIN SMALL LETTER R WITH FISHHOOK}': u'r', # u'\N{LATIN SMALL LETTER REVERSED R WITH FISHHOOK}': u'', u'\N{LATIN LETTER SMALL CAPITAL R}': u'R', # u'\N{LATIN LETTER SMALL CAPITAL INVERTED R}': u'', u'\N{LATIN SMALL LETTER S WITH HOOK}': u's', u'\N{LATIN SMALL LETTER ESH}': u'sh', u'\N{LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK}': u'j', # u'\N{LATIN SMALL LETTER SQUAT REVERSED ESH}': u'', u'\N{LATIN SMALL LETTER ESH WITH CURL}': u'sh', # u'\N{LATIN SMALL LETTER TURNED T}': u'', u'\N{LATIN SMALL LETTER T WITH RETROFLEX HOOK}': u't', u'\N{LATIN SMALL LETTER U BAR}': u'u', # u'\N{LATIN SMALL LETTER UPSILON}': u'', u'\N{LATIN SMALL LETTER V WITH HOOK}': u'v', # u'\N{LATIN SMALL LETTER TURNED V}': u'', # u'\N{LATIN SMALL LETTER TURNED W}': u'', # u'\N{LATIN SMALL LETTER TURNED Y}': u'', u'\N{LATIN LETTER SMALL CAPITAL Y}': u'Y', u'\N{LATIN SMALL LETTER Z WITH RETROFLEX HOOK}': u'z', u'\N{LATIN SMALL LETTER Z WITH CURL}': u'z', u'\N{LATIN SMALL LETTER EZH}': u's', u'\N{LATIN SMALL LETTER EZH WITH CURL}': u's', # u'\N{LATIN LETTER GLOTTAL STOP}': u'', # u'\N{LATIN LETTER PHARYNGEAL VOICED FRICATIVE}': u'', # u'\N{LATIN LETTER INVERTED GLOTTAL STOP}': u'', u'\N{LATIN LETTER STRETCHED C}': u'c', # u'\N{LATIN LETTER BILABIAL CLICK}': u'', u'\N{LATIN LETTER SMALL CAPITAL B}': u'B', u'\N{LATIN SMALL LETTER CLOSED OPEN E}': u'e', u'\N{LATIN LETTER SMALL CAPITAL G WITH HOOK}': u'G', u'\N{LATIN LETTER SMALL CAPITAL H}': u'H', u'\N{LATIN SMALL LETTER J WITH CROSSED-TAIL}': u'j', # u'\N{LATIN SMALL LETTER TURNED K}': u'', u'\N{LATIN LETTER SMALL CAPITAL L}': u'L', u'\N{LATIN SMALL LETTER Q WITH HOOK}': u'q', # u'\N{LATIN LETTER GLOTTAL STOP WITH STROKE}': u'', # u'\N{LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE}': u'', # u'\N{LATIN SMALL LETTER DZ DIGRAPH}': u'', # u'\N{LATIN SMALL LETTER DEZH DIGRAPH}': u'', # u'\N{LATIN SMALL LETTER DZ DIGRAPH WITH CURL}': u'', # u'\N{LATIN SMALL LETTER TS DIGRAPH}': u'', # u'\N{LATIN SMALL LETTER TESH DIGRAPH}': u'', # u'\N{LATIN SMALL LETTER TC DIGRAPH WITH CURL}': u'', # u'\N{LATIN SMALL LETTER FENG DIGRAPH}': u'', # u'\N{LATIN SMALL LETTER LS DIGRAPH}': u'', # u'\N{LATIN SMALL LETTER LZ DIGRAPH}': u'', # u'\N{LATIN LETTER BILABIAL PERCUSSIVE}': u'', # u'\N{LATIN LETTER BIDENTAL PERCUSSIVE}': u'', # u'\N{LATIN SMALL LETTER TURNED H WITH FISHHOOK}': u'', # u'\N{LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL}': u'', u'\N{LATIN LETTER SMALL CAPITAL A}': u'A', u'\N{LATIN LETTER SMALL CAPITAL AE}': u'AE', # u'\N{LATIN SMALL LETTER TURNED AE}': u'', u'\N{LATIN LETTER SMALL CAPITAL BARRED B}': u'B', u'\N{LATIN LETTER SMALL CAPITAL C}': u'C', u'\N{LATIN LETTER SMALL CAPITAL D}': u'D', u'\N{LATIN LETTER SMALL CAPITAL ETH}': u'D', u'\N{LATIN LETTER SMALL CAPITAL E}': u'E', # u'\N{LATIN SMALL LETTER TURNED OPEN E}': u'', # u'\N{LATIN SMALL LETTER TURNED I}': u'', u'\N{LATIN LETTER SMALL CAPITAL J}': u'J', u'\N{LATIN LETTER SMALL CAPITAL K}': u'K', u'\N{LATIN LETTER SMALL CAPITAL L WITH STROKE}': u'L', u'\N{LATIN LETTER SMALL CAPITAL M}': u'M', # u'\N{LATIN LETTER SMALL CAPITAL REVERSED N}': u'', u'\N{LATIN LETTER SMALL CAPITAL O}': u'O', u'\N{LATIN LETTER SMALL CAPITAL OPEN O}': u'O', # u'\N{LATIN SMALL LETTER SIDEWAYS O}': u'', # u'\N{LATIN SMALL LETTER SIDEWAYS OPEN O}': u'', # u'\N{LATIN SMALL LETTER SIDEWAYS O WITH STROKE}': u'', # u'\N{LATIN SMALL LETTER TURNED OE}': u'', u'\N{LATIN LETTER SMALL CAPITAL OU}': u'OU', # u'\N{LATIN SMALL LETTER TOP HALF O}': u'', # u'\N{LATIN SMALL LETTER BOTTOM HALF O}': u'', u'\N{LATIN LETTER SMALL CAPITAL P}': u'P', # u'\N{LATIN LETTER SMALL CAPITAL REVERSED R}': u'', # u'\N{LATIN LETTER SMALL CAPITAL TURNED R}': u'', u'\N{LATIN LETTER SMALL CAPITAL T}': u'T', u'\N{LATIN LETTER SMALL CAPITAL U}': u'U', # u'\N{LATIN SMALL LETTER SIDEWAYS U}': u'', # u'\N{LATIN SMALL LETTER SIDEWAYS DIAERESIZED U}': u'', # u'\N{LATIN SMALL LETTER SIDEWAYS TURNED M}': u'', u'\N{LATIN LETTER SMALL CAPITAL V}': u'V', u'\N{LATIN LETTER SMALL CAPITAL W}': u'W', u'\N{LATIN LETTER SMALL CAPITAL Z}': u'', u'\N{LATIN LETTER SMALL CAPITAL EZH}': u'S', # u'\N{LATIN LETTER VOICED LARYNGEAL SPIRANT}': u'', # u'\N{LATIN LETTER AIN}': u'', u'\N{LATIN SMALL LETTER UE}': u'ue', u'\N{LATIN SMALL LETTER B WITH MIDDLE TILDE}': u'b', u'\N{LATIN SMALL LETTER D WITH MIDDLE TILDE}': u'd', u'\N{LATIN SMALL LETTER F WITH MIDDLE TILDE}': u'f', u'\N{LATIN SMALL LETTER M WITH MIDDLE TILDE}': u'm', u'\N{LATIN SMALL LETTER N WITH MIDDLE TILDE}': u'n', u'\N{LATIN SMALL LETTER P WITH MIDDLE TILDE}': u'p', u'\N{LATIN SMALL LETTER R WITH MIDDLE TILDE}': u'r', u'\N{LATIN SMALL LETTER R WITH FISHHOOK AND MIDDLE TILDE}': u'r', u'\N{LATIN SMALL LETTER S WITH MIDDLE TILDE}': u's', u'\N{LATIN SMALL LETTER T WITH MIDDLE TILDE}': u't', u'\N{LATIN SMALL LETTER Z WITH MIDDLE TILDE}': u'z', # u'\N{LATIN SMALL LETTER TURNED G}': u'', # u'\N{LATIN SMALL LETTER INSULAR G}': u'', u'\N{LATIN SMALL LETTER TH WITH STRIKETHROUGH}': u'th', u'\N{LATIN SMALL CAPITAL LETTER I WITH STROKE}': u'I', # u'\N{LATIN SMALL LETTER IOTA WITH STROKE}': u'', u'\N{LATIN SMALL LETTER P WITH STROKE}': u'p', u'\N{LATIN SMALL CAPITAL LETTER U WITH STROKE}': u'U', # u'\N{LATIN SMALL LETTER UPSILON WITH STROKE}': u'', u'\N{LATIN SMALL LETTER B WITH PALATAL HOOK}': u'b', u'\N{LATIN SMALL LETTER D WITH PALATAL HOOK}': u'd', u'\N{LATIN SMALL LETTER F WITH PALATAL HOOK}': u'f', u'\N{LATIN SMALL LETTER G WITH PALATAL HOOK}': u'g', u'\N{LATIN SMALL LETTER K WITH PALATAL HOOK}': u'k', u'\N{LATIN SMALL LETTER L WITH PALATAL HOOK}': u'l', u'\N{LATIN SMALL LETTER M WITH PALATAL HOOK}': u'm', u'\N{LATIN SMALL LETTER N WITH PALATAL HOOK}': u'n', u'\N{LATIN SMALL LETTER P WITH PALATAL HOOK}': u'p', u'\N{LATIN SMALL LETTER R WITH PALATAL HOOK}': u'r', u'\N{LATIN SMALL LETTER S WITH PALATAL HOOK}': u's', u'\N{LATIN SMALL LETTER ESH WITH PALATAL HOOK}': u'sh', u'\N{LATIN SMALL LETTER V WITH PALATAL HOOK}': u'v', u'\N{LATIN SMALL LETTER X WITH PALATAL HOOK}': u'x', u'\N{LATIN SMALL LETTER Z WITH PALATAL HOOK}': u'z', u'\N{LATIN SMALL LETTER A WITH RETROFLEX HOOK}': u'a', # u'\N{LATIN SMALL LETTER ALPHA WITH RETROFLEX HOOK}': u'', u'\N{LATIN SMALL LETTER D WITH HOOK AND TAIL}': u'd', u'\N{LATIN SMALL LETTER E WITH RETROFLEX HOOK}': u'e', u'\N{LATIN SMALL LETTER OPEN E WITH RETROFLEX HOOK}': u'e', u'\N{LATIN SMALL LETTER REVERSED OPEN E WITH RETROFLEX HOOK}': u'e', # u'\N{LATIN SMALL LETTER SCHWA WITH RETROFLEX HOOK}': u'', u'\N{LATIN SMALL LETTER I WITH RETROFLEX HOOK}': u'i', u'\N{LATIN SMALL LETTER OPEN O WITH RETROFLEX HOOK}': u'o', u'\N{LATIN SMALL LETTER ESH WITH RETROFLEX HOOK}': u'sh', u'\N{LATIN SMALL LETTER U WITH RETROFLEX HOOK}': u'u', u'\N{LATIN SMALL LETTER EZH WITH RETROFLEX HOOK}': u's', # u'\N{LATIN SUBSCRIPT SMALL LETTER SCHWA}': u'', # u'\N{LATIN CROSS}': u'' } # Additional ones; see "man uni2ascii" UNI2ASCII_CONVERSIONS={ u'\N{NO-BREAK SPACE}': u' ', u'\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}': u'"', u'\N{SOFT HYPHEN}': u'', # Controversial: see http://www.cs.tut.fi/~jkorpela/shy.html u'\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}': u'"', u'\N{ETHIOPIC WORDSPACE}': u' ', u'\N{OGHAM SPACE MARK}': u' ', u'\N{EN QUAD}': u' ', u'\N{EM QUAD}': u' ', u'\N{EN SPACE}': u' ', u'\N{EM SPACE}': u' ', u'\N{THREE-PER-EM SPACE}': u' ', u'\N{FOUR-PER-EM SPACE}': u' ', u'\N{SIX-PER-EM SPACE}': u' ', u'\N{FIGURE SPACE}': u' ', u'\N{PUNCTUATION SPACE}': u' ', u'\N{THIN SPACE}': u' ', u'\N{HAIR SPACE}': u' ', u'\N{ZERO WIDTH SPACE}': u' ', u'\N{ZERO WIDTH NO-BREAK SPACE}': u' ', u'\N{HYPHEN}': u'-', u'\N{NON-BREAKING HYPHEN}': u'-', u'\N{FIGURE DASH}': u'-', u'\N{EN DASH}': u'-', u'\N{EM DASH}': u'-', u'\N{LEFT SINGLE QUOTATION MARK}': u'`', u'\N{RIGHT SINGLE QUOTATION MARK}': u"'", u'\N{SINGLE LOW-9 QUOTATION MARK}': u'`', u'\N{SINGLE HIGH-REVERSED-9 QUOTATION MARK}': u'`', u'\N{LEFT DOUBLE QUOTATION MARK}': u'"', u'\N{RIGHT DOUBLE QUOTATION MARK}': u'"', u'\N{DOUBLE LOW-9 QUOTATION MARK}': u'"', u'\N{DOUBLE HIGH-REVERSED-9 QUOTATION MARK}': u'"', u'\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}': u'`', u'\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}': u"'", u'\N{LOW ASTERISK}': u'*', u'\N{MINUS SIGN}': u'-', u'\N{ASTERISK OPERATOR}': u'*', u'\N{BOX DRAWINGS LIGHT HORIZONTAL}': u'-', u'\N{BOX DRAWINGS HEAVY HORIZONTAL}': u'-', u'\N{BOX DRAWINGS LIGHT VERTICAL}': u'|', u'\N{BOX DRAWINGS HEAVY VERTICAL}': u'|', u'\N{HEAVY ASTERISK}': u'*', u'\N{HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT}': u'"', u'\N{HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT}': u'"', u'\N{IDEOGRAPHIC SPACE}': u' ', u'\N{SMALL AMPERSAND}': u'&', u'\N{SMALL ASTERISK}': u'*', u'\N{SMALL PLUS SIGN}': u'+', u'\N{CENT SIGN}': u'cent', u'\N{POUND SIGN}': u'pound', u'\N{YEN SIGN}': u'yen', u'\N{COPYRIGHT SIGN}': u'(c)', u'\N{REGISTERED SIGN}': u'(R)', u'\N{VULGAR FRACTION ONE QUARTER}': u'1/4', u'\N{VULGAR FRACTION ONE HALF}': u'1/2', u'\N{VULGAR FRACTION THREE QUARTERS}': u'3/4', # u'\N{CAPITAL LETTER ASH}': u'AE', u'\N{LATIN SMALL LETTER SHARP S}': u'ss', # u'\N{SMALL LETTER ASH}': u'ae', u'\N{LATIN CAPITAL LIGATURE IJ}': u'IJ', u'\N{LATIN SMALL LIGATURE IJ}': u'ij', u'\N{LATIN CAPITAL LIGATURE OE}': u'OE', u'\N{LATIN SMALL LIGATURE oe}': u'oe', u'\N{LATIN CAPITAL LETTER DZ}': u'DZ', u'\N{LATIN CAPITAL LETTER DZ WITH CARON}': u'DZ', u'\N{LATIN CAPITAL LETTER D WITH SMALL LETTER Z}': u'Dz', u'\N{LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON}': u'Dz', u'\N{LATIN SMALL LETTER DZ}': u'dz', u'\N{LATIN SMALL LETTER TS DIGRAPH}': u'ts', u'\N{HORIZONTAL ELLIPSIS}': u'...', u'\N{MIDLINE HORIZONTAL ELLIPSIS}': u'...', u'\N{LEFTWARDS ARROW}': u'<-', u'\N{RIGHTWARDS ARROW}': u'->', u'\N{LEFTWARDS DOUBLE ARROW}': u'<=', u'\N{RIGHTWARDS DOUBLE ARROW}': u'=>', } # More from "man uni2ascii", in a different category. EXTRA_CHARACTERS={ u'\N{ACUTE ACCENT}': u"'", u'\N{BROKEN BAR}': u'|', # u'\N{CEDILLA}': u'{cedilla}', u'\N{CENT SIGN}': u' cents ', u'\N{COPYRIGHT SIGN}': u'(C)', u'\N{CURRENCY SIGN}': u' currency ', u'\N{DEGREE SIGN}': u' degrees ', # u'\N{DIAERESIS}': u'{umlaut}', u'\N{DIVISION SIGN}': u'/', # u'\N{FEMININE ORDINAL INDICATOR}': u'{^a}', u'\N{INVERTED EXCLAMATION MARK}': u'!', u'\N{INVERTED QUESTION MARK}': u'?', # wrong? u'\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}': u'<<', u'\N{MACRON}': u'_', # u'\N{MASCULINE ORDINAL INDICATOR}': u'{^o}', u'\N{MICRO SIGN}': u'micro', u'\N{MIDDLE DOT}': u'*', u'\N{MULTIPLICATION SIGN}': u'*', u'\N{NOT SIGN}': u'not', u'\N{PILCROW SIGN}': u'paragraph', u'\N{PLUS-MINUS SIGN}': u'+/-', u'\N{POUND SIGN}': u'pound', u'\N{REGISTERED SIGN}': u'(R)', # wrong? u'\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}': u'>>', u'\N{SECTION SIGN}': u'section', u'\N{SOFT HYPHEN}': u'', u'\N{SUPERSCRIPT ONE}': u'^1', u'\N{SUPERSCRIPT THREE}': u'^3', u'\N{SUPERSCRIPT TWO}': u'^2', u'\N{VULGAR FRACTION ONE HALF}': u'1/2', u'\N{VULGAR FRACTION ONE QUARTER}': u'1/4', u'\N{VULGAR FRACTION THREE QUARTERS}': u'3/4', u'\N{YEN SIGN}': u'yen' } FG_HACKS={ u'\u0082': u'', # "break permitted here" symbol u'\u2022': u'*', # Bullet } def build_dictionary(): 'Return the translation dictionary.' d = dict() # First do what can be done automatically for i in range(0xffff): u=unichr(i) try: n=unicodedata.name(u) if n.startswith('LATIN '): k=unicodedata.normalize('NFKD', u).encode('ASCII', 'ignore') if k: d[i]=unicode(k) # i=ord(u) except ValueError: pass # Next, add some by-hand ones (overlap possible, so order matters) for m in [EXTRA_LATIN_NAMES,EXTRA_CHARACTERS,UNI2ASCII_CONVERSIONS,FG_HACKS]: for i in m: try: d[ord(i)]=unicode(m[i]) except Exception, err: pass return d udict = build_dictionary() convert = lambda s: s.translate(udict) def coroutine(func): def start(*argz, **kwz): cr = func(*argz, **kwz) cr.next() return cr return start @coroutine def co_filter(drain, in_enc='utf-8', out_enc='ascii'): bs = None while True: chunk = (yield bs) bs = drain(convert(unicode(chunk)).encode('utf-8')) def uc_filter(sin, sout, bs=8192, in_enc='utf-8', out_enc='ascii'): sout = co_filter(sout.write, in_enc, out_enc) while True: dta = sin.read(bs) if not dta: break else: sout.send(dta) if __name__ == '__main__': from optparse import OptionParser parser = OptionParser(usage='%prog [options]', description='utf8 stdin -> ascii stdout') parser.add_option('-s', '--src-enc', action='store', type='str', dest='src_enc', metavar='ENC', default='utf-8', help='source encoding (utf-8)') parser.add_option('-d', '--dst-enc', action='store', type='str', dest='dst_enc', metavar='ENC', default='ascii', help='destination encoding (ascii)') parser.add_option('-c', '--chunk', action='store', type='int', dest='bs', metavar='BYTES', default=8192, help='read/write in chunks of a given size (8192)') optz, argz = parser.parse_args() if argz: parser.error('Only stdin -> stdout conversion suported') uc_filter(sys.stdin, sys.stdout, bs=optz.bs, in_enc=optz.src_enc, out_enc=optz.dst_enc)