python - Fail to understand to me strange re behaviour -
can please explain me, why getting different result when remove lines containing '# duplicate'?
import re def nysiis(term: str) -> str: """ returns new york state identification , intelligence algorithm (nysiis) code given term """ if not len(term): return '' else: term = term.upper() table = { r'\w+': '', # remove whitespace , non-word characters r'^mac': 'mcc', r'^kn': 'nn', r'k': 'c', r'ph|pf': 'ff', r'sch': 'sss', r'(ee|ie)$': 'y', r'(dt|nd|nt|rd|rt)$': 'd', # on first letter must no longer change. r'(?<!^)ev': 'af', r'(?<!^)[aeiou]': 'a', r'(?<!^)q': 'g', r'(?<!^)z': 's', r'(?<!^)(?:m|kn)': 'n', r'(?<!^)([^aeiouy])h': r'\1', r'(?<!^)(.)h[^aeiouy]': r'\1', r'(?<!^)([aeiouy])w': r'\1', r'ay$': r'y', r's$': r'', r'(\w)\1+': r'\1', # original r'a+$': r'' # original } k, v in table.items(): term = re.sub(k, v, term) table = { # duplicate r'(\w)\1+': r'\1', # duplicate r'a+$': r'' # duplicate } # duplicate k, v in table.items(): # duplicate term = re.sub(k, v, term) # duplicate return term if __name__ == '__main__': names = [ 'bishop', 'carlson', 'carr', 'chapman', 'franklin', 'greene', 'harper', 'jacobs', 'larson', 'lawrence', 'lawson', 'louis, xvi', 'lynch', 'mackenzie', 'matthews', 'mccormack', 'mcdaniel', 'mcdonald', 'mclaughlin', 'morrison', "o'banion", "o'brien", 'richards', 'silva', 'watkins', 'wheeler', 'willis', 'brown, sr', 'browne, iii', 'browne, iv', 'knight', 'mitchell', "o'daniel", ] name in names: print('%15s: %s' % (name, nysiis(name)))
you don't want use dict substitutions: order of iteration not same order listed them in. if change dicts list of pairs instead, works expect.
table = [ (r'\w+', ''), #... ] k, v in table: ...
Comments
Post a Comment