From 6802a4f89d1431913ce3218f4db9bbe2748115e5 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 22 Sep 2015 14:23:55 -0400 Subject: [PATCH 01/12] fix README conflict Former-commit-id: 5b918e7bb069a9068a5416228dcf5e49b7dd69ac --- README.md | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 3aa0043..b8c8dbb 100644 --- a/README.md +++ b/README.md @@ -232,20 +232,14 @@ sources: - Wikipedia, the free encyclopedia (http://www.wikipedia.org) -<<<<<<< HEAD It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, -SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al. (see citations below) and -available at http://crr.ugent.be/programs-data/subtitle-frequencies. -======= -It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, and -SUBTLEX-CH, created by Marc Brysbaert et al. and available at +SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al. +(see citations below) and available at http://crr.ugent.be/programs-data/subtitle-frequencies. ->>>>>>> greek-and-turkish -I (Robyn Speer) have -obtained permission by e-mail from Marc Brysbaert to distribute these wordlists -in wordfreq, to be used for any purpose, not just for academic use, under these -conditions: +I (Robyn Speer) have obtained permission by e-mail from Marc Brysbaert to +distribute these wordlists in wordfreq, to be used for any purpose, not just +for academic use, under these conditions: - Wordfreq and code derived from it must credit the SUBTLEX authors. - It must remain clear that SUBTLEX is freely available data. @@ -297,4 +291,3 @@ Twitter; it does not display or republish any Twitter content. SUBTLEX-UK: A new and improved word frequency database for British English. The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190. http://www.tandfonline.com/doi/pdf/10.1080/17470218.2013.850521 - From fe8a6b51e718084d9b4bda0c6f83231e4af58f60 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 22 Sep 2015 15:31:27 -0400 Subject: [PATCH 02/12] document what this file is for Former-commit-id: 06f8b299712a5952c5847ece8b13c1a18fcc2ed0 --- scripts/make_chinese_mapping.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/scripts/make_chinese_mapping.py b/scripts/make_chinese_mapping.py index 19b7826..9855e18 100644 --- a/scripts/make_chinese_mapping.py +++ b/scripts/make_chinese_mapping.py @@ -1,3 +1,16 @@ +""" +Generate a Python file, _chinese_mapping.py, that maps Traditional Chinese +characters to their Simplified Chinese equivalents. + +This is meant to be a normalization of text, somewhat like case-folding -- not +an actual translator, a task for which this method would be unsuitable. We +store word frequencies using Simplified Chinese characters so that, in the large +number of cases where a Traditional Chinese word has an obvious Simplified Chinese +mapping, we can get a frequency for it that's the same in Simplified and Traditional +Chinese. + +Generating this mapping requires the external Chinese conversion tool OpenCC. +""" import unicodedata import itertools import os From db30d09947f86ed734385d8de08f2fd3ce7f5c2b Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 22 Sep 2015 16:31:50 -0400 Subject: [PATCH 03/12] load the Chinese character mapping from a .msgpack.gz file Former-commit-id: 6cf4210187430c822297ce921775db6d983c22ab --- scripts/make_chinese_mapping.py | 24 +- wordfreq/__init__.py | 9 +- wordfreq/_chinese_mapping.py | 3275 --------------------- wordfreq/chinese.py | 11 +- wordfreq/data/_chinese_mapping.msgpack.gz | Bin 0 -> 16831 bytes 5 files changed, 25 insertions(+), 3294 deletions(-) delete mode 100644 wordfreq/_chinese_mapping.py create mode 100644 wordfreq/data/_chinese_mapping.msgpack.gz diff --git a/scripts/make_chinese_mapping.py b/scripts/make_chinese_mapping.py index 9855e18..4a17d4f 100644 --- a/scripts/make_chinese_mapping.py +++ b/scripts/make_chinese_mapping.py @@ -1,20 +1,21 @@ """ -Generate a Python file, _chinese_mapping.py, that maps Traditional Chinese -characters to their Simplified Chinese equivalents. +Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional +Chinese characters to their Simplified Chinese equivalents. This is meant to be a normalization of text, somewhat like case-folding -- not an actual translator, a task for which this method would be unsuitable. We -store word frequencies using Simplified Chinese characters so that, in the large -number of cases where a Traditional Chinese word has an obvious Simplified Chinese -mapping, we can get a frequency for it that's the same in Simplified and Traditional -Chinese. +store word frequencies using Simplified Chinese characters so that, in the +large number of cases where a Traditional Chinese word has an obvious +Simplified Chinese mapping, we can get a frequency for it that's the same in +Simplified and Traditional Chinese. Generating this mapping requires the external Chinese conversion tool OpenCC. """ import unicodedata import itertools import os -import pprint +import msgpack +import gzip def make_hanzi_table(filename): @@ -25,7 +26,7 @@ def make_hanzi_table(filename): print('%5X\t%s' % (codept, char), file=out) -def make_hanzi_converter(table_in, python_out): +def make_hanzi_converter(table_in, msgpack_out): table = {} with open(table_in, encoding='utf-8') as infile: for line in infile: @@ -34,15 +35,14 @@ def make_hanzi_converter(table_in, python_out): assert len(char) == 1 if chr(codept) != char: table[codept] = char - with open(python_out, 'w', encoding='utf-8') as outfile: - print('SIMPLIFIED_MAP = ', end='', file=outfile) - pprint.pprint(table, stream=outfile) + with gzip.open(msgpack_out, 'wb') as outfile: + msgpack.dump(table, outfile, encoding='utf-8') def build(): make_hanzi_table('/tmp/han_in.txt') os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt') - make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py') + make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz') if __name__ == '__main__': diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index d5b95a2..4790282 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -85,10 +85,11 @@ def available_languages(wordlist='combined'): """ available = {} for path in DATA_PATH.glob('*.msgpack.gz'): - list_name = path.name.split('.')[0] - name, lang = list_name.split('_') - if name == wordlist: - available[lang] = str(path) + if not path.name.startswith('_'): + list_name = path.name.split('.')[0] + name, lang = list_name.split('_') + if name == wordlist: + available[lang] = str(path) return available diff --git a/wordfreq/_chinese_mapping.py b/wordfreq/_chinese_mapping.py deleted file mode 100644 index 2efac17..0000000 --- a/wordfreq/_chinese_mapping.py +++ /dev/null @@ -1,3275 +0,0 @@ -SIMPLIFIED_MAP = {13423: '㑔', - 13427: '㑇', - 13459: '𠉂', - 13544: '刾', - 13810: '𠵾', - 13850: '㘎', - 14084: '㚯', - 14095: '㛣', - 14114: '𡞱', - 14135: '𡝠', - 14238: '𪨊', - 14351: '㟆', - 14493: '𢋈', - 14702: '㤘', - 14734: '𢛯', - 14940: '㨫', - 14963: '㧐', - 15871: '𤈷', - 16015: '𤠋', - 16359: '𤽯', - 16441: '𥅴', - 16490: '𥇢', - 16507: '䀥', - 16985: '𥬀', - 17004: '𫂈', - 17010: '𥮜', - 17069: '𥺅', - 17079: '䌶', - 17113: '䌺', - 17114: '䌻', - 17147: '䌾', - 17151: '𦈓', - 17160: '𦈖', - 17163: '𦈘', - 17174: '𦈜', - 17181: '𦈟', - 17183: '𦈞', - 17189: '𦈠', - 17200: '𦈙', - 17779: '𦰴', - 17919: '𧉞', - 18017: '䙌', - 18176: '䜧', - 18299: '𧹕', - 18300: '䞍', - 18312: '𧹑', - 18600: '𨑹', - 18759: '䦂', - 18793: '𨱖', - 18801: '䥾', - 18840: '𨸄', - 18843: '䦶', - 18847: '䦷', - 18867: '𨷿', - 18914: '𨸟', - 19087: '𩏼', - 19095: '𩐀', - 19096: '𩏿', - 19188: '𩖗', - 19224: '𩙮', - 19229: '𩙯', - 19230: '𩙧', - 19264: '𩠇', - 19267: '𩠈', - 19327: '𩧭', - 19357: '𩧰', - 19358: '𩨁', - 19360: '𩧿', - 19371: '𩨇', - 19379: '𩨏', - 19390: '𩧪', - 19392: '䯅', - 19518: '鲃', - 19545: '𩾈', - 19564: '𩾊', - 19568: '𩾋', - 19575: '䲣', - 19581: '䲝', - 19585: '鳚', - 19606: '𩾂', - 19632: '𪉂', - 19721: '鹮', - 19756: '𪎈', - 19764: '𪎋', - 19999: '丢', - 20006: '并', - 20094: '干', - 20098: '乱', - 20121: '亘', - 20126: '亚', - 20295: '伫', - 20296: '布', - 20308: '占', - 20341: '并', - 20358: '来', - 20374: '仑', - 20406: '侣', - 20407: '局', - 20417: '俣', - 20418: '系', - 20436: '伣', - 20448: '侠', - 20453: '伡', - 20460: '私', - 20480: '伥', - 20486: '俩', - 20488: '俫', - 20489: '仓', - 20491: '个', - 20497: '们', - 20502: '幸', - 20523: '伦', - 20530: '㑈', - 20553: '伟', - 20561: '㐽', - 20596: '侧', - 20597: '侦', - 20605: '伪', - 20625: '杰', - 20630: '伧', - 20632: '伞', - 20633: '备', - 20642: '家', - 20653: '佣', - 20655: '偬', - 20659: '传', - 20660: '伛', - 20661: '债', - 20663: '伤', - 20670: '倾', - 20674: '偻', - 20677: '仅', - 20681: '佥', - 20689: '侨', - 20693: '仆', - 20702: '伪', - 20709: '侥', - 20712: '偾', - 20721: '雇', - 20729: '价', - 20736: '仪', - 20737: '俊', - 20738: '侬', - 20740: '亿', - 20744: '侩', - 20745: '俭', - 20752: '傧', - 20756: '俦', - 20757: '侪', - 20760: '尽', - 20767: '偿', - 20771: '𠆲', - 20778: '优', - 20786: '储', - 20791: '俪', - 20792: '㑩', - 20794: '傩', - 20795: '傥', - 20796: '俨', - 20807: '凶', - 20812: '兑', - 20818: '儿', - 20823: '兖', - 20839: '内', - 20841: '两', - 20874: '册', - 20906: '幂', - 20936: '净', - 20941: '冻', - 20953: '𪞝', - 20956: '凛', - 20977: '凯', - 21029: '别', - 21034: '删', - 21060: '刭', - 21063: '则', - 21067: '克', - 21070: '刹', - 21079: '刬', - 21083: '刚', - 21085: '剥', - 21102: '剐', - 21108: '剀', - 21109: '创', - 21111: '铲', - 21118: '𠛅', - 21123: '划', - 21127: '剧', - 21129: '刘', - 21130: '刽', - 21132: '刿', - 21133: '剑', - 21135: '㓥', - 21137: '剂', - 21146: '㔉', - 21185: '劲', - 21205: '动', - 21209: '务', - 21211: '勋', - 21213: '胜', - 21214: '劳', - 21218: '势', - 21225: '勚', - 21233: '劢', - 21235: '勋', - 21237: '励', - 21240: '劝', - 21243: '匀', - 21293: '匦', - 21295: '汇', - 21297: '匮', - 21312: '区', - 21332: '协', - 21369: '恤', - 21371: '却', - 21373: '即', - 21401: '厍', - 21408: '厕', - 21412: '历', - 21421: '厌', - 21426: '厉', - 21428: '厣', - 21443: '参', - 21444: '叁', - 21474: '丛', - 21522: '咤', - 21555: '吴', - 21558: '呐', - 21570: '吕', - 21692: '呙', - 21729: '员', - 21743: '𠯟', - 21764: '呗', - 21786: '吣', - 21816: '念', - 21839: '问', - 21843: '启', - 21854: '哑', - 21855: '启', - 21858: '唡', - 21902: '㖞', - 21914: '唤', - 21930: '丧', - 21931: '吃', - 21932: '乔', - 21934: '单', - 21938: '哟', - 21958: '呛', - 21959: '啬', - 21962: '唝', - 21966: '吗', - 21978: '呜', - 21993: '唢', - 22000: '𠮶', - 22006: '哔', - 22009: '𪡏', - 22022: '叹', - 22029: '喽', - 22035: '啯', - 22036: '呕', - 22038: '啧', - 22039: '尝', - 22044: '唛', - 22057: '哗', - 22062: '唠', - 22063: '啸', - 22064: '叽', - 22069: '哓', - 22072: '呒', - 22077: '啴', - 22081: '恶', - 22085: '𠯠', - 22099: '嘘', - 22106: '㖊', - 22109: '咝', - 22112: '哒', - 22117: '哝', - 22118: '哕', - 22127: '嗳', - 22130: '哙', - 22132: '喷', - 22136: '吨', - 22137: '当', - 22144: '咛', - 22151: '吓', - 22156: '哜', - 22160: '尝', - 22165: '噜', - 22169: '啮', - 22181: '咽', - 22182: '呖', - 22184: '咙', - 22190: '向', - 22194: '亸', - 22195: '喾', - 22196: '严', - 22198: '嘤', - 22208: '啭', - 22209: '嗫', - 22210: '嚣', - 22213: '冁', - 22216: '呓', - 22217: '啰', - 22220: '苏', - 22225: '嘱', - 22250: '囱', - 22279: '囵', - 22283: '国', - 22285: '围', - 22290: '园', - 22291: '圆', - 22294: '图', - 22296: '团', - 22302: '𪢮', - 22453: '埯', - 22497: '垭', - 22512: '采', - 22519: '执', - 22533: '坚', - 22538: '垩', - 22550: '垴', - 22557: '埚', - 22575: '尧', - 22577: '报', - 22580: '场', - 22602: '块', - 22603: '茔', - 22607: '垲', - 22610: '埘', - 22615: '涂', - 22618: '冢', - 22626: '坞', - 22628: '埙', - 22645: '尘', - 22649: '堑', - 22666: '垫', - 22684: '坠', - 22702: '堕', - 22704: '坛', - 22707: '坟', - 22710: '垯', - 22715: '墙', - 22718: '垦', - 22727: '坛', - 22728: '𡒄', - 22731: '垱', - 22739: '压', - 22744: '垒', - 22745: '圹', - 22746: '垆', - 22748: '坛', - 22750: '坏', - 22751: '垄', - 22752: '垅', - 22754: '坜', - 22761: '坝', - 22767: '壮', - 22778: '壶', - 22780: '壸', - 22781: '寿', - 22816: '够', - 22818: '梦', - 22821: '伙', - 22846: '夹', - 22864: '奂', - 22887: '奥', - 22889: '奁', - 22890: '夺', - 22892: '奖', - 22894: '奋', - 22908: '姹', - 22941: '妆', - 22989: '姗', - 23014: '奸', - 23067: '娱', - 23105: '娄', - 23142: '妇', - 23149: '娅', - 23207: '娲', - 23215: '妫', - 23216: '㛀', - 23228: '媪', - 23229: '妈', - 23243: '袅', - 23255: '妪', - 23285: '妩', - 23291: '娴', - 23295: '婳', - 23296: '妫', - 23299: '媭', - 23304: '娆', - 23307: '婵', - 23308: '娇', - 23321: '嫱', - 23329: '嫒', - 23338: '嫔', - 23344: '婴', - 23352: '婶', - 23363: '娘', - 23371: '㛤', - 23372: '娈', - 23403: '孙', - 23416: '学', - 23423: '孪', - 23470: '宫', - 23488: '采', - 23522: '寝', - 23526: '实', - 23527: '宁', - 23529: '审', - 23531: '写', - 23532: '宽', - 23541: '宠', - 23542: '宝', - 23559: '将', - 23560: '专', - 23563: '寻', - 23565: '对', - 23566: '导', - 23607: '尴', - 23622: '届', - 23629: '尸', - 23635: '屃', - 23644: '屉', - 23650: '屡', - 23652: '层', - 23656: '屦', - 23657: '𪨗', - 23660: '属', - 23713: '冈', - 23791: '峰', - 23796: '岘', - 23798: '岛', - 23805: '峡', - 23821: '崃', - 23825: '昆', - 23831: '岗', - 23833: '仑', - 23842: '峥', - 23852: '岽', - 23888: '岚', - 23895: '岁', - 23932: '𡶴', - 23937: '嵝', - 23940: '崭', - 23943: '岖', - 23956: '嵚', - 23959: '崂', - 23968: '峤', - 23970: '峣', - 23975: '峄', - 23982: '崄', - 23988: '岙', - 23992: '嵘', - 23994: '岭', - 23996: '屿', - 23997: '岳', - 24011: '岿', - 24018: '峦', - 24020: '巅', - 24022: '岩', - 24048: '巯', - 24057: '卺', - 24101: '帅', - 24107: '师', - 24115: '帐', - 24118: '带', - 24128: '帧', - 24131: '帏', - 24151: '帼', - 24152: '帻', - 24159: '帜', - 24163: '币', - 24171: '帮', - 24172: '帱', - 24185: '干', - 24190: '几', - 24235: '库', - 24257: '厕', - 24258: '厢', - 24260: '厩', - 24264: '厦', - 24277: '荫', - 24282: '厨', - 24285: '厮', - 24287: '庙', - 24288: '厂', - 24289: '庑', - 24290: '废', - 24291: '广', - 24297: '廪', - 24300: '庐', - 24307: '厅', - 24338: '弑', - 24340: '吊', - 24371: '弪', - 24373: '张', - 24375: '强', - 24390: '别', - 24392: '弹', - 24396: '弥', - 24398: '弯', - 24404: '录', - 24409: '汇', - 24414: '彝', - 24416: '彟', - 24421: '彦', - 24427: '雕', - 24434: '彨', - 24447: '佛', - 24460: '后', - 24465: '径', - 24478: '从', - 24480: '徕', - 24489: '复', - 24501: '征', - 24505: '彻', - 24646: '恒', - 24677: '耻', - 24709: '悦', - 24734: '悮', - 24757: '怅', - 24758: '闷', - 24765: '凄', - 24801: '恶', - 24817: '恼', - 24818: '恽', - 24827: '恻', - 24859: '爱', - 24860: '惬', - 24872: '悫', - 24884: '怆', - 24887: '恺', - 24894: '忾', - 24900: '栗', - 24907: '态', - 24909: '愠', - 24920: '惨', - 24922: '惭', - 24927: '恸', - 24931: '惯', - 24932: '悫', - 24938: '怄', - 24939: '怂', - 24942: '虑', - 24947: '悭', - 24950: '庆', - 24956: '戚', - 24958: '欲', - 24962: '忧', - 24970: '惫', - 24976: '怜', - 24977: '凭', - 24978: '愦', - 24986: '惮', - 24996: '愤', - 25003: '悯', - 25006: '怃', - 25010: '宪', - 25014: '忆', - 25024: '𢙓', - 25031: '恳', - 25033: '应', - 25036: '怿', - 25037: '懔', - 25054: '蒙', - 25055: '怼', - 25059: '懑', - 25064: '恹', - 25074: '惩', - 25078: '懒', - 25079: '怀', - 25080: '悬', - 25082: '忏', - 25084: '惧', - 25086: '慑', - 25088: '恋', - 25095: '戆', - 25108: '戋', - 25127: '戗', - 25129: '戬', - 25136: '战', - 25137: '戯', - 25138: '戏', - 25142: '户', - 25291: '抛', - 25385: '捝', - 25393: '挲', - 25406: '挟', - 25448: '舍', - 25451: '扪', - 25457: '挨', - 25458: '卷', - 25475: '扫', - 25476: '抡', - 25478: '㧏', - 25495: '挜', - 25497: '挣', - 25499: '挂', - 25505: '采', - 25536: '拣', - 25562: '扬', - 25563: '换', - 25582: '挥', - 25613: '损', - 25622: '摇', - 25623: '捣', - 25653: '揾', - 25654: '抢', - 25675: '𢫬', - 25681: '掴', - 25692: '掼', - 25695: '搂', - 25711: '挚', - 25715: '抠', - 25718: '抟', - 25722: '折', - 25723: '掺', - 25736: '捞', - 25743: '挦', - 25744: '撑', - 25747: '挠', - 25757: '㧑', - 25759: '挢', - 25763: '掸', - 25765: '拨', - 25771: '抚', - 25778: '扑', - 25779: '揿', - 25787: '挞', - 25790: '挝', - 25791: '捡', - 25793: '拥', - 25796: '掳', - 25799: '择', - 25802: '击', - 25803: '挡', - 25811: '㧟', - 25812: '担', - 25818: '据', - 25824: '挤', - 25825: '抬', - 25827: '捣', - 25836: '拟', - 25839: '摈', - 25840: '拧', - 25841: '搁', - 25842: '掷', - 25844: '扩', - 25847: '撷', - 25850: '摆', - 25851: '擞', - 25852: '撸', - 25853: '㧰', - 25854: '扰', - 25860: '摅', - 25862: '撵', - 25871: '拢', - 25876: '拦', - 25878: '撄', - 25881: '搀', - 25883: '撺', - 25884: '携', - 25885: '摄', - 25890: '攒', - 25891: '挛', - 25892: '摊', - 25898: '搅', - 25900: '揽', - 25934: '教', - 25943: '败', - 25944: '叙', - 25973: '敌', - 25976: '数', - 25986: '敛', - 25987: '毙', - 25989: '𢽾', - 25990: '敩', - 26005: '斓', - 26028: '斩', - 26039: '断', - 26044: '于', - 26050: '旗', - 26083: '既', - 26119: '升', - 26178: '时', - 26185: '晋', - 26205: '昼', - 26248: '晕', - 26249: '晖', - 26264: '旸', - 26274: '畅', - 26283: '暂', - 26308: '晔', - 26310: '历', - 26311: '昙', - 26313: '晓', - 26319: '向', - 26326: '暧', - 26336: '旷', - 26341: '𣆐', - 26344: '昽', - 26348: '晒', - 26360: '书', - 26371: '会', - 26405: '𦛨', - 26407: '胧', - 26414: '术', - 26481: '东', - 26484: '锨', - 26548: '拐', - 26613: '栅', - 26618: '拐', - 26619: '查', - 26751: '杆', - 26772: '栀', - 26776: '枧', - 26781: '条', - 26783: '枭', - 26802: '棁', - 26820: '弃', - 26826: '棋', - 26838: '枨', - 26839: '枣', - 26847: '栋', - 26849: '㭎', - 26855: '栈', - 26866: '栖', - 26870: '梾', - 26895: '桠', - 26930: '㭏', - 26954: '杨', - 26963: '枫', - 26984: '桢', - 26989: '业', - 26997: '极', - 27032: '矩', - 27046: '干', - 27050: '杩', - 27054: '荣', - 27058: '榅', - 27071: '桤', - 27083: '构', - 27085: '枪', - 27091: '杠', - 27108: '梿', - 27111: '椠', - 27112: '椁', - 27123: '桨', - 27126: '椢', - 27132: '椝', - 27137: '桩', - 27138: '乐', - 27141: '枞', - 27153: '梁', - 27155: '楼', - 27161: '标', - 27166: '枢', - 27170: '㭤', - 27171: '样', - 27179: '㭴', - 27187: '桪', - 27192: '朴', - 27193: '树', - 27194: '桦', - 27199: '椫', - 27208: '桡', - 27211: '桥', - 27231: '机', - 27234: '椭', - 27243: '横', - 27265: '檩', - 27273: '柽', - 27284: '档', - 27292: '桧', - 27295: '槚', - 27298: '检', - 27299: '樯', - 27309: '𣘴', - 27310: '梼', - 27311: '台', - 27315: '槟', - 27320: '柠', - 27323: '槛', - 27331: '柜', - 27347: '橹', - 27354: '榈', - 27355: '栉', - 27357: '椟', - 27358: '橼', - 27359: '栎', - 27365: '橱', - 27367: '槠', - 27368: '栌', - 27370: '枥', - 27371: '橥', - 27372: '榇', - 27377: '蘖', - 27379: '栊', - 27384: '榉', - 27387: '樱', - 27396: '栏', - 27397: '榉', - 27402: '权', - 27405: '𣐤', - 27407: '椤', - 27410: '栾', - 27411: '𣗋', - 27414: '榄', - 27422: '棂', - 27453: '钦', - 27470: '叹', - 27472: '欧', - 27487: '欤', - 27489: '欢', - 27506: '岁', - 27511: '历', - 27512: '归', - 27519: '殁', - 27544: '残', - 27550: '殒', - 27556: '殇', - 27560: '㱮', - 27563: '殚', - 27565: '僵', - 27566: '殓', - 27567: '殡', - 27568: '㱩', - 27570: '歼', - 27578: '杀', - 27579: '壳', - 27580: '壳', - 27584: '毁', - 27590: '殴', - 27647: '毵', - 27650: '牦', - 27656: '毡', - 27660: '氇', - 27683: '气', - 27691: '氢', - 27692: '氩', - 27699: '氲', - 27710: '泛', - 27726: '泛', - 27737: '污', - 27770: '决', - 27794: '没', - 27798: '冲', - 27841: '况', - 27869: '溯', - 27945: '泄', - 27958: '汹', - 28025: '浃', - 28039: '泾', - 28092: '凉', - 28114: '凄', - 28122: '泪', - 28133: '渌', - 28136: '净', - 28137: '凌', - 28138: '沦', - 28149: '渊', - 28150: '涞', - 28154: '浅', - 28185: '涣', - 28187: '减', - 28194: '沨', - 28198: '涡', - 28204: '测', - 28222: '浑', - 28234: '凑', - 28254: '浈', - 28263: '涌', - 28271: '汤', - 28296: '沩', - 28310: '准', - 28317: '沟', - 28331: '温', - 28334: '浉', - 28339: '涢', - 28348: '湿', - 28356: '沧', - 28357: '灭', - 28364: '涤', - 28366: '荥', - 28377: '汇', - 28396: '沪', - 28399: '滞', - 28402: '渗', - 28407: '卤', - 28408: '浒', - 28411: '浐', - 28414: '滚', - 28415: '满', - 28417: '渔', - 28426: '溇', - 28442: '沤', - 28450: '汉', - 28451: '涟', - 28460: '渍', - 28466: '涨', - 28469: '溆', - 28472: '渐', - 28479: '浆', - 28481: '颍', - 28497: '泼', - 28500: '洁', - 28505: '沩', - 28507: '潜', - 28516: '润', - 28527: '浔', - 28528: '溃', - 28535: '滗', - 28543: '涠', - 28544: '涩', - 28549: '𣶩', - 28550: '浇', - 28551: '涝', - 28560: '沄', - 28567: '涧', - 28576: '渑', - 28580: '泽', - 28582: '滪', - 28585: '泶', - 28590: '浍', - 28593: '淀', - 28606: '㳠', - 28609: '浊', - 28611: '浓', - 28612: '㳡', - 28614: '𣸣', - 28629: '湿', - 28632: '泞', - 28635: '蒙', - 28636: '浕', - 28639: '济', - 28644: '涛', - 28647: '㳔', - 28651: '滥', - 28656: '潍', - 28657: '滨', - 28666: '溅', - 28668: '泺', - 28670: '滤', - 28674: '澛', - 28675: '𣽷', - 28677: '滢', - 28678: '渎', - 28679: '㲿', - 28681: '泻', - 28683: '沈', - 28687: '浏', - 28693: '濒', - 28696: '泸', - 28701: '沥', - 28703: '潇', - 28704: '潆', - 28710: '潴', - 28711: '泷', - 28712: '濑', - 28720: '弥', - 28722: '潋', - 28734: '澜', - 28739: '沣', - 28740: '滠', - 28753: '洒', - 28757: '漓', - 28760: '滩', - 28761: '𣺼', - 28765: '灏', - 28768: '漤', - 28769: '㳕', - 28771: '湾', - 28772: '滦', - 28775: '滟', - 28777: '滟', - 28797: '灾', - 28858: '为', - 28879: '乌', - 28916: '烃', - 28961: '无', - 29001: '炼', - 29010: '炜', - 29017: '烟', - 29026: '茕', - 29029: '焕', - 29033: '烦', - 29036: '炀', - 29041: '㶽', - 29061: '煴', - 29065: '𤈶', - 29068: '𤇄', - 29074: '荧', - 29075: '𤆡', - 29079: '炝', - 29089: '𤋏', - 29105: '热', - 29106: '颎', - 29118: '炽', - 29121: '烨', - 29128: '灯', - 29129: '炖', - 29138: '烧', - 29145: '烫', - 29148: '焖', - 29151: '营', - 29158: '灿', - 29164: '毁', - 29165: '烛', - 29172: '烩', - 29174: '㶶', - 29179: '熏', - 29180: '烬', - 29182: '焘', - 29188: '𤇃', - 29197: '烁', - 29200: '炉', - 29211: '烂', - 29229: '争', - 29234: '为', - 29242: '爷', - 29246: '尔', - 29248: '床', - 29254: '墙', - 29272: '牍', - 29309: '牵', - 29334: '荦', - 29346: '犊', - 29351: '牺', - 29376: '状', - 29433: '狭', - 29437: '狈', - 29465: '狰', - 29494: '犹', - 29499: '狲', - 29505: '犸', - 29507: '呆', - 29508: '狱', - 29509: '狮', - 29518: '奖', - 29544: '独', - 29546: '狯', - 29547: '猃', - 29550: '狝', - 29552: '狞', - 29553: '㺍', - 29554: '获', - 29557: '猎', - 29559: '犷', - 29560: '兽', - 29562: '獭', - 29563: '献', - 29564: '猕', - 29568: '猡', - 29569: '𤞤', - 29694: '现', - 29745: '雕', - 29754: '珐', - 29759: '珲', - 29771: '玮', - 29778: '玚', - 29795: '琐', - 29796: '瑶', - 29801: '莹', - 29802: '玛', - 29810: '玱', - 29821: '𪻐', - 29833: '琏', - 29857: '琎', - 29859: '玑', - 29862: '瑷', - 29867: '珰', - 29871: '㻅', - 29872: '环', - 29885: '玺', - 29898: '琼', - 29903: '珑', - 29908: '璎', - 29909: '𤦀', - 29914: '瓒', - 29964: '瓯', - 29973: '瓮', - 29986: '产', - 29987: '产', - 29990: '苏', - 29999: '宁', - 30045: '亩', - 30050: '毕', - 30059: '画', - 30064: '异', - 30069: '画', - 30070: '当', - 30087: '畴', - 30090: '叠', - 30169: '痉', - 30176: '酸', - 30206: '疴', - 30210: '痖', - 30219: '疯', - 30221: '疡', - 30227: '痪', - 30238: '瘗', - 30241: '疮', - 30247: '疟', - 30254: '瘆', - 30258: '疭', - 30266: '瘘', - 30267: '瘘', - 30274: '疗', - 30278: '痨', - 30279: '痫', - 30281: '瘅', - 30290: '愈', - 30296: '疠', - 30303: '瘪', - 30305: '痴', - 30306: '痒', - 30308: '疖', - 30309: '症', - 30311: '疬', - 30313: '癞', - 30316: '癣', - 30317: '瘿', - 30318: '瘾', - 30320: '痈', - 30321: '瘫', - 30322: '癫', - 30332: '发', - 30337: '皂', - 30362: '皑', - 30367: '𤾀', - 30384: '疱', - 30392: '皲', - 30394: '皱', - 30403: '杯', - 30428: '盗', - 30430: '盏', - 30433: '尽', - 30435: '监', - 30436: '盘', - 30439: '卢', - 30442: '荡', - 30494: '真', - 30501: '眦', - 30526: '众', - 30541: '𪾢', - 30543: '困', - 30556: '睁', - 30558: '睐', - 30570: '睾', - 30616: '眍', - 30620: '䁖', - 30622: '瞒', - 30628: '𥆧', - 30646: '瞆', - 30652: '睑', - 30663: '蒙', - 30675: '眬', - 30682: '瞩', - 30703: '矫', - 30787: '朱', - 30812: '硁', - 30820: '硖', - 30824: '砗', - 30831: '砚', - 30869: '埼', - 30873: '𥐻', - 30889: '硕', - 30893: '砀', - 30904: '砜', - 30906: '确', - 30908: '码', - 30909: '䂵', - 30929: '硙', - 30938: '砖', - 30944: '硵', - 30947: '碜', - 30951: '碛', - 30959: '矶', - 30973: '硗', - 30980: '硚', - 30982: '硷', - 30990: '础', - 30994: '𥐟', - 31001: '碍', - 31014: '矿', - 31018: '砺', - 31019: '砾', - 31020: '矾', - 31025: '砻', - 31061: '秘', - 31103: '禄', - 31117: '祸', - 31118: '祯', - 31125: '祎', - 31137: '祃', - 31142: '御', - 31146: '禅', - 31150: '礼', - 31152: '祢', - 31153: '祷', - 31167: '秃', - 31176: '籼', - 31237: '税', - 31240: '秆', - 31247: '䅉', - 31260: '棱', - 31263: '禀', - 31278: '种', - 31281: '称', - 31296: '谷', - 31308: '稣', - 31309: '积', - 31310: '颖', - 31328: '秾', - 31329: '穑', - 31330: '秽', - 31337: '稳', - 31339: '获', - 31341: '稆', - 31401: '窝', - 31402: '洼', - 31406: '穷', - 31407: '窑', - 31413: '窎', - 31414: '窭', - 31418: '窥', - 31428: '窜', - 31429: '窍', - 31431: '窦', - 31432: '灶', - 31434: '窃', - 31466: '竖', - 31478: '竞', - 31558: '笔', - 31565: '笋', - 31591: '笕', - 31604: '䇲', - 31623: '个', - 31627: '笺', - 31631: '筝', - 31680: '节', - 31684: '范', - 31689: '筑', - 31691: '箧', - 31700: '筼', - 31704: '𥬠', - 31716: '笃', - 31721: '筛', - 31731: '筚', - 31744: '箦', - 31757: '篓', - 31761: '蓑', - 31774: '箪', - 31777: '简', - 31779: '篑', - 31787: '箫', - 31801: '筜', - 31805: '签', - 31806: '帘', - 31811: '篮', - 31819: '𥬞', - 31820: '筹', - 31828: '䉤', - 31833: '箓', - 31835: '篯', - 31836: '箨', - 31839: '籁', - 31840: '笼', - 31844: '签', - 31849: '笾', - 31850: '簖', - 31852: '篱', - 31854: '箩', - 31858: '吁', - 31925: '粤', - 31965: '糁', - 31966: '粪', - 31975: '粮', - 31984: '团', - 31986: '粝', - 31988: '籴', - 31990: '粜', - 31993: '纟', - 31998: '纠', - 32000: '纪', - 32002: '纣', - 32004: '约', - 32005: '红', - 32006: '纡', - 32007: '纥', - 32008: '纨', - 32009: '纫', - 32011: '纹', - 32013: '纳', - 32016: '纽', - 32019: '纾', - 32020: '纯', - 32021: '纰', - 32022: '纼', - 32023: '纱', - 32024: '纮', - 32025: '纸', - 32026: '级', - 32027: '纷', - 32028: '纭', - 32029: '纴', - 32033: '纺', - 32044: '䌷', - 32046: '扎', - 32048: '细', - 32049: '绂', - 32050: '绁', - 32051: '绅', - 32053: '纻', - 32057: '绍', - 32058: '绀', - 32060: '绋', - 32063: '绐', - 32064: '绌', - 32066: '终', - 32067: '弦', - 32068: '组', - 32069: '䌹', - 32070: '绊', - 32078: '绗', - 32080: '结', - 32085: '绝', - 32091: '绦', - 32093: '绔', - 32094: '绞', - 32097: '络', - 32098: '绚', - 32102: '给', - 32104: '绒', - 32112: '绖', - 32113: '统', - 32114: '丝', - 32115: '绛', - 32118: '绝', - 32121: '绢', - 32122: '𫄨', - 32128: '𦈌', - 32129: '绑', - 32131: '绡', - 32134: '绠', - 32135: '𦈋', - 32136: '绨', - 32137: '绣', - 32140: '绤', - 32143: '绥', - 32144: '䌼', - 32145: '捆', - 32147: '经', - 32156: '综', - 32158: '缍', - 32160: '绿', - 32162: '绸', - 32163: '绻', - 32171: '线', - 32172: '绶', - 32173: '维', - 32175: '绹', - 32176: '绾', - 32177: '纲', - 32178: '网', - 32179: '绷', - 32180: '缀', - 32181: '彩', - 32184: '纶', - 32185: '绺', - 32186: '绮', - 32187: '绽', - 32189: '绰', - 32190: '绫', - 32191: '绵', - 32196: '绲', - 32199: '缁', - 32202: '紧', - 32203: '绯', - 32205: '𦈏', - 32209: '绿', - 32210: '绪', - 32211: '绬', - 32212: '绱', - 32215: '缃', - 32216: '缄', - 32217: '缂', - 32218: '线', - 32221: '缉', - 32222: '缎', - 32224: '缔', - 32225: '缗', - 32227: '缘', - 32230: '缌', - 32232: '编', - 32233: '缓', - 32236: '缅', - 32239: '纬', - 32240: '𦈕', - 32241: '缑', - 32242: '缈', - 32244: '练', - 32246: '缏', - 32247: '𦈉', - 32248: '𦈑', - 32249: '缇', - 32251: '致', - 32264: '萦', - 32265: '缙', - 32266: '缢', - 32267: '缒', - 32270: '𦈔', - 32272: '绉', - 32273: '缣', - 32277: '缊', - 32279: '缞', - 32283: '缚', - 32285: '缜', - 32286: '缟', - 32287: '缛', - 32291: '县', - 32295: '绦', - 32299: '缝', - 32300: '𦈚', - 32301: '缡', - 32302: '缩', - 32305: '纵', - 32306: '缧', - 32307: '䌸', - 32308: '纤', - 32309: '缦', - 32310: '絷', - 32311: '缕', - 32313: '缥', - 32314: '𦈐', - 32317: '总', - 32318: '绩', - 32323: '绷', - 32325: '缫', - 32326: '缪', - 32335: '𦈝', - 32336: '穗', - 32338: '缯', - 32339: '𦈛', - 32340: '织', - 32341: '缮', - 32346: '缭', - 32350: '绕', - 32351: '𦈎', - 32353: '绣', - 32354: '缋', - 32361: '绳', - 32362: '绘', - 32363: '系', - 32365: '茧', - 32366: '缰', - 32367: '缳', - 32368: '缲', - 32371: '缴', - 32376: '䍁', - 32377: '绎', - 32379: '𦈡', - 32380: '继', - 32381: '缤', - 32382: '缱', - 32383: '䍀', - 32385: '𫄸', - 32391: '颣', - 32392: '缬', - 32394: '纩', - 32396: '续', - 32397: '累', - 32399: '缠', - 32403: '缨', - 32404: '才', - 32406: '纤', - 32408: '缵', - 32412: '缆', - 32573: '钵', - 32584: '坛', - 32588: '罂', - 32590: '坛', - 32624: '罚', - 32629: '骂', - 32631: '罢', - 32645: '罗', - 32646: '罴', - 32648: '羁', - 32651: '芈', - 32675: '群', - 32677: '羟', - 32680: '羡', - 32681: '义', - 32694: '膻', - 32722: '习', - 32748: '翚', - 32761: '翘', - 32765: '翙', - 32812: '耧', - 32814: '耢', - 32854: '圣', - 32862: '闻', - 32879: '联', - 32880: '聪', - 32882: '声', - 32883: '耸', - 32885: '聩', - 32886: '聂', - 32887: '职', - 32889: '聍', - 32893: '听', - 32894: '聋', - 32901: '肃', - 33029: '胁', - 33032: '脉', - 33051: '胫', - 33059: '唇', - 33061: '𣍰', - 33065: '修', - 33067: '脱', - 33081: '胀', - 33102: '肾', - 33110: '胨', - 33121: '脶', - 33126: '脑', - 33130: '𣍯', - 33131: '肿', - 33139: '脚', - 33144: '肠', - 33155: '腽', - 33173: '腘', - 33178: '肤', - 33184: '胶', - 33186: '𦝼', - 33193: '腻', - 33213: '胆', - 33214: '脍', - 33215: '脓', - 33225: '脸', - 33229: '脐', - 33231: '膑', - 33239: '𣎑', - 33240: '腊', - 33242: '胪', - 33247: '脏', - 33248: '脔', - 33250: '臜', - 33253: '卧', - 33256: '临', - 33274: '台', - 33287: '与', - 33288: '兴', - 33289: '举', - 33290: '旧', - 33304: '馆', - 33369: '舱', - 33380: '舣', - 33382: '舰', - 33387: '舻', - 33393: '艰', - 33399: '艳', - 33467: '刍', - 33511: '苎', - 33586: '兹', - 33610: '荆', - 33674: '庄', - 33686: '茎', - 33698: '荚', - 33703: '苋', - 33775: '华', - 33780: '庵', - 33784: '烟', - 33799: '苌', - 33802: '莱', - 33836: '万', - 33844: '荝', - 33845: '莴', - 33865: '叶', - 33874: '荭', - 33892: '荮', - 33894: '苇', - 33903: '药', - 33911: '荤', - 33936: '搜', - 33939: '莼', - 33940: '莳', - 33950: '莅', - 33980: '苍', - 33984: '荪', - 33990: '席', - 33995: '盖', - 34030: '莲', - 34031: '苁', - 34036: '莼', - 34045: '荜', - 34068: '卜', - 34072: '参', - 34078: '蒌', - 34083: '蒋', - 34085: '葱', - 34086: '茑', - 34093: '荫', - 34113: '荨', - 34118: '蒇', - 34126: '荞', - 34130: '荬', - 34131: '芸', - 34133: '莸', - 34136: '荛', - 34146: '蒉', - 34153: '荡', - 34154: '芜', - 34157: '萧', - 34167: '蓣', - 34176: '蕰', - 34184: '荟', - 34186: '蓟', - 34188: '芗', - 34193: '姜', - 34196: '蔷', - 34200: '荙', - 34207: '莶', - 34214: '荐', - 34217: '萨', - 34227: '䓕', - 34228: '苧', - 34233: '苔', - 34234: '荠', - 34253: '蓝', - 34254: '荩', - 34269: '艺', - 34277: '药', - 34282: '薮', - 34292: '蕴', - 34294: '苈', - 34297: '蔼', - 34298: '蔺', - 34308: '蕲', - 34310: '芦', - 34311: '苏', - 34314: '蕴', - 34315: '苹', - 34330: '藓', - 34334: '蔹', - 34338: '茏', - 34349: '兰', - 34362: '蓠', - 34367: '萝', - 34374: '蔂', - 34389: '处', - 34395: '虚', - 34396: '虏', - 34399: '号', - 34407: '亏', - 34415: '虬', - 34554: '蛱', - 34555: '蜕', - 34566: '蚬', - 34645: '蚀', - 34655: '猬', - 34662: '虾', - 34680: '蜗', - 34692: '蛳', - 34718: '蚂', - 34722: '萤', - 34734: '䗖', - 34747: '蝼', - 34751: '螀', - 34756: '蛰', - 34760: '蝈', - 34766: '螨', - 34787: '虮', - 34796: '蝉', - 34799: '蛲', - 34802: '虫', - 34806: '蛏', - 34811: '蚁', - 34817: '蚃', - 34821: '蝇', - 34822: '虿', - 34829: '蝎', - 34832: '蛴', - 34833: '蝾', - 34847: '蜡', - 34851: '蛎', - 34856: '蟏', - 34865: '蛊', - 34870: '蚕', - 34875: '蛮', - 34886: '众', - 34890: '蔑', - 34899: '术', - 34901: '同', - 34906: '胡', - 34907: '卫', - 34909: '冲', - 34974: '衮', - 35018: '袅', - 35023: '里', - 35036: '补', - 35037: '装', - 35041: '里', - 35069: '制', - 35079: '复', - 35084: '裈', - 35096: '袆', - 35122: '裤', - 35123: '裢', - 35128: '褛', - 35131: '亵', - 35136: '𫌀', - 35142: '幞', - 35143: '裥', - 35145: '裥', - 35151: '袯', - 35158: '袄', - 35165: '裣', - 35168: '裆', - 35172: '褴', - 35178: '袜', - 35180: '摆', - 35183: '衬', - 35186: '袭', - 35188: '襕', - 35208: '核', - 35211: '见', - 35214: '觃', - 35215: '规', - 35219: '觅', - 35222: '视', - 35224: '觇', - 35233: '觋', - 35237: '觍', - 35238: '觎', - 35242: '亲', - 35244: '觊', - 35247: '觏', - 35250: '觐', - 35255: '觑', - 35258: '觉', - 35260: '𫌨', - 35261: '览', - 35263: '觌', - 35264: '观', - 35316: '觞', - 35318: '觯', - 35320: '触', - 35329: '讠', - 35330: '订', - 35331: '讣', - 35336: '计', - 35338: '讯', - 35340: '讧', - 35342: '讨', - 35344: '讦', - 35345: '𫍙', - 35346: '讱', - 35347: '训', - 35349: '讪', - 35350: '讫', - 35351: '托', - 35352: '记', - 35355: '讹', - 35357: '讶', - 35359: '讼', - 35362: '䜣', - 35363: '诀', - 35365: '讷', - 35369: '讻', - 35370: '访', - 35373: '设', - 35377: '许', - 35380: '诉', - 35382: '诃', - 35386: '诊', - 35387: '注', - 35392: '𧮪', - 35393: '诂', - 35398: '诋', - 35406: '讵', - 35408: '诈', - 35409: '𫍟', - 35410: '诒', - 35412: '诏', - 35413: '评', - 35414: '诐', - 35415: '诇', - 35416: '诎', - 35419: '诅', - 35422: '词', - 35424: '咏', - 35425: '诩', - 35426: '询', - 35427: '诣', - 35430: '试', - 35433: '诗', - 35435: '诧', - 35436: '诟', - 35437: '诡', - 35438: '诠', - 35440: '诘', - 35441: '话', - 35442: '该', - 35443: '详', - 35445: '诜', - 35452: '诙', - 35455: '诖', - 35460: '诔', - 35461: '诛', - 35462: '诓', - 35463: '夸', - 35468: '志', - 35469: '认', - 35473: '诳', - 35474: '诶', - 35477: '诞', - 35480: '诱', - 35482: '诮', - 35486: '语', - 35488: '诚', - 35489: '诫', - 35491: '诬', - 35492: '误', - 35493: '诰', - 35494: '诵', - 35496: '诲', - 35498: '说', - 35500: '说', - 35504: '谁', - 35506: '课', - 35510: '谇', - 35513: '诽', - 35516: '谊', - 35518: '訚', - 35519: '调', - 35522: '谄', - 35524: '谆', - 35527: '谈', - 35529: '诿', - 35531: '请', - 35533: '诤', - 35535: '诹', - 35537: '诼', - 35538: '谅', - 35542: '论', - 35543: '谂', - 35547: '谀', - 35548: '谍', - 35549: '谞', - 35550: '谝', - 35553: '谥', - 35554: '诨', - 35556: '谔', - 35558: '谛', - 35559: '谐', - 35563: '谏', - 35565: '谕', - 35566: '咨', - 35568: '𫍰', - 35569: '讳', - 35571: '谙', - 35574: '谌', - 35575: '讽', - 35576: '诸', - 35578: '谚', - 35580: '谖', - 35582: '诺', - 35584: '谋', - 35585: '谒', - 35586: '谓', - 35588: '誊', - 35589: '诌', - 35594: '谎', - 35598: '谜', - 35599: '𫍲', - 35600: '谧', - 35604: '谑', - 35606: '谡', - 35607: '谤', - 35609: '谦', - 35610: '谥', - 35611: '讲', - 35613: '谢', - 35616: '谣', - 35617: '谣', - 35624: '谟', - 35627: '谪', - 35628: '谬', - 35629: '谫', - 35635: '讴', - 35641: '谨', - 35646: '谩', - 35649: '哗', - 35653: '䜧', - 35657: '证', - 35658: '𫍢', - 35662: '谲', - 35663: '讥', - 35670: '谮', - 35672: '识', - 35673: '谯', - 35674: '谭', - 35676: '谱', - 35679: '噪', - 35691: '谵', - 35693: '毁', - 35695: '译', - 35696: '议', - 35700: '谴', - 35703: '护', - 35704: '诪', - 35709: '誉', - 35710: '谫', - 35712: '读', - 35722: '变', - 35723: '詟', - 35724: '䜩', - 35726: '雠', - 35730: '谗', - 35731: '让', - 35733: '谰', - 35734: '谶', - 35738: '赞', - 35740: '谠', - 35742: '谳', - 35912: '岂', - 35918: '竖', - 35920: '丰', - 35924: '艳', - 35948: '猪', - 35958: '豮', - 35987: '猫', - 35993: '䝙', - 35997: '贝', - 35998: '贞', - 35999: '贠', - 36000: '负', - 36001: '财', - 36002: '贡', - 36007: '贫', - 36008: '货', - 36009: '贩', - 36010: '贪', - 36011: '贯', - 36012: '责', - 36015: '贮', - 36016: '贳', - 36018: '赀', - 36019: '贰', - 36020: '贵', - 36022: '贬', - 36023: '买', - 36024: '贷', - 36026: '贶', - 36027: '费', - 36028: '贴', - 36029: '贻', - 36031: '贸', - 36032: '贺', - 36033: '贲', - 36034: '赂', - 36035: '赁', - 36036: '贿', - 36037: '赅', - 36039: '资', - 36040: '贾', - 36042: '贼', - 36049: '赈', - 36050: '赊', - 36051: '宾', - 36053: '赇', - 36057: '赒', - 36058: '赉', - 36060: '赐', - 36062: '赏', - 36063: '𧹖', - 36064: '赔', - 36065: '赓', - 36066: '贤', - 36067: '卖', - 36068: '贱', - 36070: '赋', - 36071: '赕', - 36074: '质', - 36075: '赍', - 36076: '账', - 36077: '赌', - 36080: '䞐', - 36084: '赖', - 36085: '赗', - 36090: '赚', - 36091: '赙', - 36092: '购', - 36093: '赛', - 36094: '赜', - 36099: '𧹗', - 36100: '贽', - 36101: '赘', - 36103: '赟', - 36104: '赠', - 36106: '赞', - 36107: '赝', - 36109: '赡', - 36111: '赢', - 36112: '赆', - 36115: '赃', - 36116: '赑', - 36118: '赎', - 36119: '赝', - 36123: '赣', - 36124: '赃', - 36204: '赪', - 36245: '赶', - 36249: '赵', - 36264: '趋', - 36274: '趱', - 36321: '迹', - 36368: '践', - 36400: '逾', - 36404: '踊', - 36428: '跄', - 36437: '跸', - 36447: '迹', - 36451: '蹒', - 36452: '踪', - 36474: '跷', - 36475: '𫏋', - 36482: '跶', - 36489: '趸', - 36490: '踌', - 36491: '跻', - 36493: '跃', - 36494: '䟢', - 36497: '踯', - 36498: '跞', - 36499: '踬', - 36501: '蹰', - 36506: '跹', - 36509: '𨅬', - 36513: '蹑', - 36517: '蹿', - 36518: '躜', - 36522: '躏', - 36544: '躯', - 36553: '𨉗', - 36554: '车', - 36555: '轧', - 36556: '轨', - 36557: '军', - 36559: '𫐄', - 36561: '轪', - 36562: '轩', - 36564: '轫', - 36567: '𨐅', - 36571: '轭', - 36575: '软', - 36580: '轷', - 36584: '𫐉', - 36587: '轸', - 36594: '轱', - 36600: '轴', - 36601: '轵', - 36602: '轺', - 36603: '轲', - 36604: '轶', - 36606: '轼', - 36611: '较', - 36612: '𨐈', - 36613: '辂', - 36615: '辁', - 36616: '辀', - 36617: '载', - 36618: '轾', - 36626: '辄', - 36627: '挽', - 36628: '辅', - 36629: '轻', - 36631: '𫐐', - 36635: '辆', - 36636: '辎', - 36637: '辉', - 36638: '辋', - 36639: '辍', - 36645: '辊', - 36646: '辇', - 36649: '辈', - 36650: '轮', - 36652: '辌', - 36654: '𫐓', - 36655: '辑', - 36659: '辏', - 36664: '输', - 36667: '辐', - 36670: '辗', - 36671: '舆', - 36672: '辒', - 36674: '毂', - 36676: '辖', - 36677: '辕', - 36678: '辘', - 36681: '转', - 36685: '辙', - 36686: '轿', - 36692: '辚', - 36703: '轰', - 36705: '辔', - 36706: '轹', - 36707: '𫐆', - 36708: '轳', - 36774: '办', - 36781: '辞', - 36782: '辫', - 36783: '辩', - 36786: '农', - 36852: '回', - 36885: '迳', - 36889: '这', - 36899: '连', - 36913: '周', - 36914: '进', - 36938: '游', - 36939: '运', - 36942: '过', - 36948: '达', - 36949: '违', - 36953: '遥', - 36956: '逊', - 36958: '递', - 36960: '远', - 36961: '溯', - 36969: '适', - 36978: '迟', - 36983: '迁', - 36984: '选', - 36986: '遗', - 36988: '辽', - 36993: '迈', - 36996: '还', - 36999: '迩', - 37002: '边', - 37007: '逻', - 37008: '逦', - 37087: '郏', - 37109: '邮', - 37126: '郓', - 37129: '乡', - 37138: '邹', - 37140: '邬', - 37142: '郧', - 37159: '邓', - 37165: '郑', - 37168: '邻', - 37170: '郸', - 37172: '邺', - 37174: '郐', - 37178: '邝', - 37191: '酂', - 37192: '郦', - 37251: '腌', - 37270: '酝', - 37276: '丑', - 37278: '酝', - 37283: '糖', - 37291: '医', - 37292: '酱', - 37297: '酦', - 37312: '酿', - 37313: '衅', - 37315: '酾', - 37317: '酽', - 37323: '释', - 37328: '厘', - 37330: '钅', - 37331: '钆', - 37332: '钇', - 37333: '钌', - 37335: '钊', - 37336: '钉', - 37337: '钋', - 37341: '针', - 37347: '钓', - 37348: '钐', - 37350: '扣', - 37351: '钏', - 37353: '钒', - 37363: '𨰿', - 37365: '钗', - 37367: '钍', - 37369: '钕', - 37370: '钎', - 37374: '䥺', - 37376: '钯', - 37377: '钫', - 37379: '钘', - 37380: '钭', - 37383: '𫓧', - 37384: '钚', - 37385: '钠', - 37387: '𨱂', - 37389: '钝', - 37390: '钩', - 37392: '钤', - 37393: '钣', - 37394: '钑', - 37396: '钞', - 37397: '钮', - 37406: '钧', - 37408: '𨱁', - 37411: '钙', - 37413: '钬', - 37414: '钛', - 37415: '钪', - 37422: '铌', - 37423: '𨱄', - 37424: '铈', - 37426: '𨱃', - 37427: '钶', - 37428: '铃', - 37431: '钴', - 37432: '钹', - 37433: '铍', - 37434: '钰', - 37437: '钸', - 37438: '铀', - 37439: '钿', - 37440: '钾', - 37441: '𨱅', - 37445: '巨', - 37448: '铊', - 37449: '铉', - 37451: '铇', - 37453: '铋', - 37457: '铂', - 37461: '钷', - 37463: '钳', - 37466: '铆', - 37467: '铅', - 37470: '钺', - 37474: '钵', - 37476: '钩', - 37478: '钲', - 37484: '钼', - 37485: '钽', - 37494: '铏', - 37496: '铰', - 37498: '铒', - 37499: '铬', - 37503: '铪', - 37504: '银', - 37507: '铳', - 37509: '铜', - 37517: '铚', - 37521: '铣', - 37523: '铨', - 37526: '铢', - 37528: '铭', - 37530: '铫', - 37531: '铦', - 37532: '衔', - 37536: '铑', - 37539: '铷', - 37541: '铱', - 37542: '铟', - 37544: '铵', - 37545: '铥', - 37546: '铕', - 37547: '铯', - 37548: '铐', - 37553: '铞', - 37555: '锐', - 37558: '𨱇', - 37559: '销', - 37561: '锈', - 37563: '锑', - 37564: '锉', - 37569: '铝', - 37571: '锒', - 37573: '锌', - 37575: '钡', - 37577: '𨱈', - 37580: '铤', - 37583: '铗', - 37586: '锋', - 37593: '铻', - 37597: '锊', - 37599: '锓', - 37603: '铘', - 37604: '锄', - 37605: '锃', - 37606: '锔', - 37608: '锇', - 37609: '铓', - 37610: '铺', - 37613: '锐', - 37614: '铖', - 37615: '锆', - 37616: '锂', - 37617: '铽', - 37622: '锍', - 37624: '锯', - 37628: '钢', - 37633: '锞', - 37634: '𨱋', - 37636: '录', - 37638: '锖', - 37639: '锫', - 37640: '锩', - 37647: '铔', - 37648: '锥', - 37650: '锕', - 37653: '锟', - 37656: '锤', - 37657: '锱', - 37658: '铮', - 37659: '锛', - 37663: '锬', - 37664: '锭', - 37665: '锜', - 37666: '钱', - 37670: '锦', - 37672: '锚', - 37673: '锠', - 37675: '锡', - 37678: '锢', - 37679: '错', - 37682: '录', - 37683: '锰', - 37686: '表', - 37688: '铼', - 37696: '锝', - 37697: '锨', - 37699: '锪', - 37700: '𨱉', - 37702: '钔', - 37703: '锴', - 37704: '锳', - 37706: '炼', - 37707: '锅', - 37709: '镀', - 37716: '锷', - 37720: '铡', - 37722: '钖', - 37723: '锻', - 37728: '锽', - 37732: '锸', - 37733: '锲', - 37737: '锘', - 37740: '锹', - 37742: '𨱎', - 37744: '锾', - 37749: '键', - 37750: '锶', - 37754: '锗', - 37756: '针', - 37758: '钟', - 37762: '镁', - 37764: '锿', - 37767: '镅', - 37770: '镑', - 37772: '镰', - 37780: '镕', - 37782: '锁', - 37784: '镉', - 37786: '锤', - 37787: '镈', - 37789: '𨱏', - 37793: '镃', - 37794: '钨', - 37795: '蓥', - 37798: '镏', - 37799: '铠', - 37801: '铩', - 37802: '锼', - 37804: '镐', - 37805: '鎮', - 37806: '镇', - 37807: '𨱍', - 37808: '镒', - 37810: '镋', - 37811: '镍', - 37813: '镓', - 37815: '𨰾', - 37816: '镌', - 37823: '镎', - 37827: '镞', - 37830: '𨱌', - 37831: '镟', - 37832: '链', - 37833: '𨱒', - 37836: '镆', - 37837: '镙', - 37840: '镠', - 37841: '镝', - 37847: '铿', - 37848: '锵', - 37850: '戚', - 37852: '镗', - 37853: '镘', - 37854: '镛', - 37855: '铲', - 37857: '镜', - 37858: '镖', - 37860: '镂', - 37862: '𫓩', - 37864: '錾', - 37872: '镚', - 37877: '铧', - 37879: '镤', - 37881: '镪', - 37882: '䥽', - 37885: '锈', - 37891: '铙', - 37892: '𨱑', - 37899: '铴', - 37901: '𫔎', - 37902: '𨱓', - 37903: '𨱔', - 37904: '镣', - 37906: '铹', - 37907: '镦', - 37908: '镡', - 37912: '钟', - 37913: '镫', - 37917: '镢', - 37920: '镨', - 37925: '䦅', - 37926: '锎', - 37927: '锏', - 37928: '镄', - 37931: '镌', - 37934: '镰', - 37935: '䦃', - 37938: '镯', - 37939: '镭', - 37941: '铁', - 37942: '镮', - 37944: '铎', - 37946: '铛', - 37951: '镱', - 37956: '铸', - 37962: '镬', - 37964: '镔', - 37969: '鉴', - 37970: '鉴', - 37972: '镲', - 37973: '锧', - 37982: '镴', - 37984: '铄', - 37987: '镳', - 37989: '镥', - 37997: '镧', - 38000: '钥', - 38001: '镵', - 38002: '镶', - 38007: '镊', - 38009: '镩', - 38012: '锣', - 38013: '钻', - 38014: '銮', - 38015: '凿', - 38017: '镢', - 38239: '旋', - 38263: '长', - 38272: '门', - 38274: '闩', - 38275: '闪', - 38278: '闫', - 38280: '闬', - 38281: '闭', - 38283: '开', - 38284: '闶', - 38285: '𨸂', - 38286: '闳', - 38287: '闰', - 38288: '𨸃', - 38289: '闲', - 38290: '闲', - 38291: '间', - 38292: '闵', - 38296: '闸', - 38305: '阂', - 38307: '阁', - 38308: '合', - 38309: '阀', - 38312: '闺', - 38313: '闽', - 38315: '阃', - 38316: '阆', - 38317: '闾', - 38321: '阅', - 38322: '阅', - 38326: '阊', - 38329: '阉', - 38331: '阎', - 38332: '阏', - 38333: '阍', - 38334: '阈', - 38335: '阌', - 38339: '阒', - 38342: '板', - 38343: '暗', - 38344: '闱', - 38346: '阔', - 38347: '阕', - 38348: '阑', - 38349: '阇', - 38352: '阗', - 38354: '阘', - 38355: '闿', - 38356: '阖', - 38357: '阙', - 38358: '闯', - 38364: '关', - 38366: '阚', - 38368: '阓', - 38369: '阐', - 38370: '辟', - 38372: '阛', - 38373: '闼', - 38488: '陉', - 38493: '陕', - 38494: '升', - 38499: '阵', - 38512: '阴', - 38515: '陈', - 38520: '陆', - 38525: '阳', - 38537: '陧', - 38538: '队', - 38542: '阶', - 38549: '陨', - 38555: '际', - 38568: '随', - 38570: '险', - 38577: '隐', - 38580: '陇', - 38584: '隶', - 38587: '只', - 38603: '隽', - 38614: '虽', - 38617: '双', - 38619: '雏', - 38620: '杂', - 38622: '鸡', - 38626: '离', - 38627: '难', - 38642: '云', - 38651: '电', - 38690: '霡', - 38695: '雾', - 38717: '霁', - 38722: '雳', - 38724: '霭', - 38728: '灵', - 38729: '叆', - 38746: '靓', - 38748: '静', - 38758: '腼', - 38760: '靥', - 38784: '鼗', - 38799: '巩', - 38813: '绱', - 38822: '秋', - 38845: '鞒', - 38849: '缰', - 38851: '鞑', - 38854: '千', - 38857: '鞯', - 38859: '韦', - 38860: '韧', - 38861: '韨', - 38867: '韩', - 38873: '韪', - 38876: '韬', - 38877: '鞲', - 38878: '韫', - 38907: '韵', - 38911: '响', - 38913: '页', - 38914: '顶', - 38915: '顷', - 38917: '项', - 38918: '顺', - 38919: '顸', - 38920: '须', - 38922: '顼', - 38924: '颂', - 38926: '颀', - 38927: '颃', - 38928: '预', - 38929: '顽', - 38930: '颁', - 38931: '顿', - 38935: '颇', - 38936: '领', - 38940: '颌', - 38945: '颉', - 38948: '颐', - 38950: '颏', - 38957: '头', - 38958: '颒', - 38960: '颊', - 38962: '颋', - 38964: '颕', - 38967: '颔', - 38968: '颈', - 38969: '颓', - 38971: '频', - 38973: '颓', - 38979: '𩖖', - 38982: '颗', - 38988: '题', - 38989: '额', - 38990: '颚', - 38991: '颜', - 38994: '颙', - 38995: '颛', - 38996: '颜', - 39000: '愿', - 39001: '颡', - 39003: '颠', - 39006: '类', - 39010: '颟', - 39013: '颢', - 39015: '顾', - 39019: '颤', - 39020: '颥', - 39023: '显', - 39024: '颦', - 39025: '颅', - 39027: '颞', - 39028: '颧', - 39080: '风', - 39085: '飐', - 39086: '飑', - 39087: '飒', - 39088: '𩙥', - 39089: '台', - 39091: '刮', - 39094: '飓', - 39095: '𩙪', - 39096: '飔', - 39098: '飏', - 39099: '飖', - 39100: '飕', - 39102: '𩙫', - 39104: '飗', - 39108: '飘', - 39110: '飙', - 39112: '飚', - 39131: '飞', - 39136: '饣', - 39138: '饥', - 39139: '饤', - 39141: '饦', - 39145: '饨', - 39146: '饪', - 39147: '饫', - 39149: '饬', - 39151: '饭', - 39153: '飧', - 39154: '饮', - 39156: '饴', - 39164: '饲', - 39165: '饱', - 39166: '饰', - 39167: '饳', - 39171: '饺', - 39172: '饸', - 39173: '饼', - 39177: '饷', - 39178: '养', - 39180: '饵', - 39182: '饹', - 39183: '饻', - 39185: '饽', - 39186: '馁', - 39187: '饿', - 39188: '𫗦', - 39189: '馂', - 39190: '饾', - 39191: '𫗧', - 39192: '余', - 39194: '肴', - 39195: '馄', - 39196: '馃', - 39198: '饯', - 39201: '馅', - 39206: '𫗠', - 39208: '馆', - 39213: '𫗮', - 39217: '糇', - 39219: '饧', - 39221: '喂', - 39222: '馉', - 39223: '馇', - 39224: '𩠌', - 39226: '馎', - 39228: '饩', - 39230: '馏', - 39231: '馊', - 39233: '馌', - 39235: '馍', - 39237: '馒', - 39240: '馐', - 39241: '馑', - 39242: '馓', - 39243: '馈', - 39244: '馔', - 39249: '饥', - 39250: '饶', - 39255: '飨', - 39256: '𫗴', - 39260: '餍', - 39262: '馋', - 39266: '馕', - 39340: '马', - 39341: '驭', - 39342: '冯', - 39345: '驮', - 39347: '驰', - 39348: '驯', - 39353: '驲', - 39361: '驳', - 39363: '𫘝', - 39374: '𩧨', - 39376: '驻', - 39377: '驽', - 39378: '驹', - 39380: '驵', - 39381: '驾', - 39384: '骀', - 39385: '驸', - 39386: '𩧫', - 39387: '驶', - 39389: '驼', - 39391: '驷', - 39393: '骂', - 39394: '骈', - 39399: '𩧲', - 39401: '𩧴', - 39405: '骇', - 39408: '骃', - 39409: '骆', - 39414: '𩧺', - 39416: '骎', - 39419: '𫘣', - 39423: '骏', - 39425: '骋', - 39426: '骍', - 39427: '𫘤', - 39429: '骓', - 39436: '骔', - 39437: '骒', - 39438: '骑', - 39439: '骐', - 39444: '𩨀', - 39446: '骛', - 39449: '骗', - 39450: '𩨊', - 39453: '𩨃', - 39455: '𩨈', - 39456: '𫘨', - 39460: '骙', - 39463: '䯄', - 39466: '𩨄', - 39467: '骞', - 39469: '骘', - 39470: '骝', - 39472: '腾', - 39478: '驺', - 39479: '骚', - 39480: '骟', - 39486: '骡', - 39488: '蓦', - 39489: '骜', - 39490: '骖', - 39491: '骠', - 39492: '骢', - 39493: '驱', - 39498: '骅', - 39499: '𩧯', - 39500: '骕', - 39501: '骁', - 39503: '骣', - 39509: '骄', - 39511: '验', - 39514: '惊', - 39515: '驿', - 39519: '骤', - 39522: '驴', - 39524: '骧', - 39525: '骥', - 39526: '骦', - 39530: '骊', - 39531: '骉', - 39599: '肮', - 39631: '髅', - 39634: '脏', - 39636: '体', - 39637: '髌', - 39638: '髋', - 39662: '发', - 39686: '松', - 39693: '胡', - 39706: '须', - 39714: '鬓', - 39717: '斗', - 39719: '闹', - 39720: '哄', - 39721: '阋', - 39726: '阄', - 39729: '郁', - 39737: '鬶', - 39758: '魉', - 39768: '魇', - 39770: '鱼', - 39771: '鱽', - 39775: '𫚉', - 39778: '鱾', - 39781: '𩽹', - 39784: '鲀', - 39791: '鲁', - 39796: '鲂', - 39799: '鱿', - 39802: '鲄', - 39809: '鲅', - 39811: '鲆', - 39812: '𫚒', - 39818: '鲌', - 39819: '鲉', - 39821: '鲏', - 39822: '鲇', - 39824: '鲐', - 39825: '鲍', - 39826: '鲋', - 39827: '鲊', - 39829: '𩾀', - 39834: '鲒', - 39836: '鲘', - 39837: '鲞', - 39838: '鲕', - 39839: '𩽾', - 39843: '䲟', - 39846: '鲖', - 39850: '鲔', - 39851: '鲛', - 39853: '鲑', - 39854: '鲜', - 39856: '𫚔', - 39859: '鲓', - 39862: '鲪', - 39864: '𩾃', - 39866: '鲝', - 39872: '鲧', - 39873: '鲠', - 39876: '𩾁', - 39878: '𫚙', - 39879: '鲩', - 39881: '鲤', - 39882: '鲨', - 39890: '鲬', - 39892: '鲻', - 39893: '鲯', - 39894: '鲭', - 39895: '鲞', - 39899: '鲷', - 39901: '鲴', - 39905: '鲱', - 39906: '鲵', - 39908: '鲲', - 39911: '鲳', - 39912: '鲸', - 39914: '鲮', - 39915: '鲰', - 39920: '鲶', - 39921: '𩾇', - 39924: '鲺', - 39926: '𩽼', - 39927: '鳀', - 39933: '鲫', - 39935: '鳊', - 39937: '鳈', - 39938: '鲗', - 39939: '鳂', - 39942: '䲠', - 39944: '鲽', - 39945: '鳇', - 39948: '䲡', - 39949: '鳅', - 39951: '鲾', - 39952: '鳄', - 39954: '鳆', - 39955: '鳃', - 39964: '鳒', - 39967: '鳑', - 39968: '鳋', - 39971: '鲥', - 39972: '𫚕', - 39973: '鳏', - 39975: '䲢', - 39976: '鳎', - 39977: '鳐', - 39981: '鳍', - 39982: '鳁', - 39985: '鲢', - 39986: '鳌', - 39987: '鳓', - 39989: '鳘', - 39991: '鲦', - 39993: '鲣', - 39994: '鲹', - 39995: '鳗', - 39996: '鳛', - 39998: '鳔', - 40002: '鳉', - 40005: '鳙', - 40007: '𩾌', - 40008: '鳕', - 40009: '鳖', - 40018: '鳟', - 40020: '鳝', - 40022: '鳜', - 40023: '鳞', - 40024: '鲟', - 40029: '鲼', - 40031: '鲎', - 40032: '鲙', - 40035: '鳣', - 40036: '鳡', - 40039: '鳢', - 40040: '鲿', - 40045: '鲚', - 40046: '𫚈', - 40047: '鳠', - 40055: '鳄', - 40056: '鲈', - 40058: '鲡', - 40165: '鸟', - 40167: '凫', - 40169: '鸠', - 40172: '凫', - 40178: '鸤', - 40179: '凤', - 40180: '鸣', - 40182: '鸢', - 40183: '𫛛', - 40188: '𪉃', - 40190: '䴓', - 40195: '𫛞', - 40198: '鸩', - 40199: '鸨', - 40201: '鸦', - 40210: '鸰', - 40213: '鸵', - 40215: '𫁡', - 40219: '鸳', - 40220: '𪉈', - 40221: '鸲', - 40222: '鸮', - 40223: '鸱', - 40227: '鸪', - 40230: '鸯', - 40232: '鸭', - 40239: '鸸', - 40240: '鸹', - 40242: '𪉆', - 40244: '鸻', - 40247: '䴕', - 40251: '鸿', - 40255: '鸽', - 40257: '䴔', - 40258: '鸺', - 40259: '鸼', - 40272: '鹀', - 40273: '鹃', - 40274: '鹆', - 40275: '鹁', - 40282: '𪉍', - 40284: '鹈', - 40285: '鹅', - 40288: '鹄', - 40289: '鹉', - 40298: '鹌', - 40300: '鹏', - 40302: '鹐', - 40303: '鹎', - 40304: '雕', - 40306: '鹊', - 40311: '鹓', - 40318: '鹍', - 40324: '䴖', - 40327: '鸫', - 40329: '鹑', - 40330: '鹒', - 40338: '𫛶', - 40339: '鹋', - 40342: '鹙', - 40343: '𫛸', - 40344: '鹕', - 40346: '鹗', - 40353: '鹖', - 40357: '鹛', - 40361: '鹜', - 40362: '䴗', - 40364: '鸧', - 40367: '莺', - 40370: '鹟', - 40372: '鹤', - 40377: '鹠', - 40378: '鹡', - 40379: '鹘', - 40380: '鹣', - 40383: '鹚', - 40384: '鹚', - 40385: '鹢', - 40386: '鹞', - 40388: '鸡', - 40392: '䴘', - 40394: '鹝', - 40403: '鹧', - 40404: '𪉑', - 40406: '鹥', - 40407: '鸥', - 40409: '鸷', - 40410: '鹨', - 40421: '鸶', - 40422: '鹪', - 40424: '𪉊', - 40427: '鹔', - 40431: '鹩', - 40434: '鹫', - 40435: '鹇', - 40440: '鹬', - 40441: '鹰', - 40442: '鹭', - 40445: '鸴', - 40447: '䴙', - 40450: '㶉', - 40455: '鹯', - 40459: '𫛢', - 40460: '鹱', - 40463: '鹲', - 40469: '鸬', - 40472: '鹴', - 40474: '鹦', - 40475: '鹳', - 40477: '鹂', - 40478: '鸾', - 40565: '卤', - 40569: '咸', - 40570: '鹾', - 40572: '碱', - 40573: '盐', - 40599: '丽', - 40613: '麦', - 40616: '𪎊', - 40617: '麸', - 40618: '面', - 40619: '面', - 40623: '曲', - 40626: '𪎉', - 40627: '𪎌', - 40628: '曲', - 40629: '面', - 40636: '么', - 40637: '么', - 40643: '黄', - 40652: '黉', - 40670: '点', - 40680: '党', - 40690: '黪', - 40692: '霉', - 40694: '黡', - 40695: '黩', - 40701: '黾', - 40703: '鼋', - 40713: '鼍', - 40725: '冬', - 40756: '鼹', - 40775: '齄', - 40778: '齐', - 40779: '斋', - 40782: '赍', - 40783: '齑', - 40786: '齿', - 40788: '龀', - 40789: '龁', - 40791: '龂', - 40793: '龅', - 40796: '龇', - 40799: '龃', - 40800: '龆', - 40801: '龄', - 40803: '出', - 40806: '龈', - 40807: '啮', - 40810: '龊', - 40812: '龉', - 40818: '龋', - 40822: '腭', - 40823: '龌', - 40845: '龙', - 40846: '厐', - 40848: '庞', - 40849: '䶮', - 40852: '龚', - 40853: '龛', - 40860: '龟', - 40877: '𩨎', - 40879: '𨱆', - 131877: '𠆿', - 132066: '𠉗', - 132998: '𠛆', - 133134: '𠚳', - 135444: '𠴢', - 135459: '𠵸', - 135503: '𠲥', - 136301: '𡋗', - 136446: '𡋀', - 137141: '㛟', - 137273: '㛿', - 137347: '㛠', - 138121: '𡭜', - 138147: '𡭬', - 138965: '岁', - 139185: '㟜', - 141530: '𢘝', - 141549: '𢘞', - 142763: '𢫞', - 142830: '𢫊', - 142975: '𢬦', - 144974: '㭣', - 145237: '𣘷', - 145339: '𣘓', - 145458: '𣑶', - 146420: '𣭤', - 147383: '㳢', - 147401: '𣶫', - 147555: '𣺽', - 148622: '𤊀', - 150202: '㻘', - 150249: '㻏', - 150776: '𤳄', - 151083: '𤶧', - 152323: '𥅘', - 152933: '𥐰', - 152965: '𥐯', - 153762: '䅪', - 154128: '𥧂', - 154947: '𥱔', - 154954: '𥭉', - 155168: '𥮋', - 155453: '𥹥', - 155478: '𥺇', - 155594: '𦈈', - 155781: '𦈒', - 155844: '𦈗', - 157832: '𣍨', - 157902: '𦟗', - 158361: '䑽', - 158397: '𦨩', - 161061: '𧒭', - 161559: '䘞', - 161589: '䙊', - 161630: '䘛', - 162215: '𫍟', - 162393: '䜥', - 163039: '𧳕', - 163187: '䞌', - 163220: '𧹓', - 163239: '䞎', - 164131: '𨀱', - 164173: '𨁴', - 164289: '𧿈', - 164318: '𨅫', - 164362: '𨂺', - 164364: '𨄄', - 164528: '䢀', - 164536: '䢁', - 164539: '𨐆', - 164578: '䢂', - 164782: '𨐉', - 164832: '𨐇', - 164837: '𨐊', - 166203: '𨤰', - 166235: '𨱀', - 166315: '䦀', - 166364: '䦁', - 166385: '𨱊', - 166610: '𨱐', - 166786: '𨱕', - 166853: '䥿', - 167121: '𨸁', - 167125: '𨸀', - 167191: '𨸅', - 167273: '𨸆', - 167288: '𨸇', - 167296: '𨸉', - 167311: '𨸊', - 167342: '𨸌', - 167346: '𨸋', - 167410: '𨸎', - 167759: '𨸘', - 168866: '𩏾', - 168938: '𩏽', - 169187: '𩖕', - 169408: '𩙦', - 169441: '𩙧', - 169472: '𩙩', - 169501: '𩙭', - 169529: '𩙨', - 169530: '𩙬', - 169544: '𩙰', - 169627: '𩟿', - 169637: '𩠀', - 169653: '𩠁', - 169670: '𩠂', - 169705: '𩠃', - 169735: '𩠉', - 169766: '𩠆', - 169781: '𩠊', - 169812: '𩠋', - 169860: '𩠎', - 169894: '𩠏', - 169903: '䭪', - 169936: '𩠅', - 170036: '𩠠', - 170106: '𩧦', - 170145: '𩧬', - 170164: '𩧵', - 170168: '𩧳', - 170174: '𩧮', - 170191: '𩧶', - 170193: '䯃', - 170229: '𩧻', - 170234: '𩧼', - 170250: '𩧩', - 170265: '𩨆', - 170290: '𩨉', - 170296: '𩨅', - 170308: '𩨋', - 170311: '𩨍', - 170313: '𩧱', - 170321: '𩨌', - 170438: '𩨐', - 170841: '𩬣', - 170995: '𩯒', - 171008: '𩬤', - 171236: '𩲒', - 171369: '𩽺', - 171385: '𩽻', - 171416: '䲞', - 171440: '𩽿', - 171441: '𩽽', - 171504: '𩾄', - 171523: '𩾅', - 171558: '𩾆', - 171847: '𩾎', - 172010: '𪉄', - 172070: '𪉅', - 172094: '𪉋', - 172104: '𪉉', - 172118: '𪉌', - 172166: '𪉎', - 172237: '𪉐', - 172239: '𪉏', - 172294: '𪉔', - 172309: '𪉒', - 172531: '𪉕', - 172799: '𪎍', - 173365: '𪔭', - 173568: '𪚏', - 173615: '𪚐'} diff --git a/wordfreq/chinese.py b/wordfreq/chinese.py index 63cc84e..03a1ca3 100644 --- a/wordfreq/chinese.py +++ b/wordfreq/chinese.py @@ -1,14 +1,19 @@ from pkg_resources import resource_filename -from wordfreq._chinese_mapping import SIMPLIFIED_MAP import jieba - +import msgpack +import gzip jieba_tokenizer = None +simplified_map = None DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt') +SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz') def simplify_chinese(text): - return text.translate(SIMPLIFIED_MAP).casefold() + global simplified_map + if simplified_map is None: + simplified_map = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8') + return text.translate(simplified_map).casefold() def jieba_tokenize(text): diff --git a/wordfreq/data/_chinese_mapping.msgpack.gz b/wordfreq/data/_chinese_mapping.msgpack.gz new file mode 100644 index 0000000000000000000000000000000000000000..d1313854a74ec9d7a1e8cbcd2b70e6ad7e1deb4c GIT binary patch literal 16831 zcmV)CK*GNtiwFphxB*rI|6^!rZe??2Uu|J041DvVB}TR{uK*Y zu%cjBc(GRm5&Ve=D#9XcVV5l12%BBlY{}MaO*WHcl1$0;$@EFN>21<`0mbg4uU&Z+ zkWp)8Uty<zPF{{-J+UzqI znzC|iVM4cxi_ik`0AfJNTg_^?Q^mVjb4re4cArr3Zl2{-T=KkTd+7|SxRj-9RlFZj zt>QywH9_Ywh772<+~jBKKJj);!I8~z@i-@p_A8o(bmI$+( zdM_*YDvljA8S;xA8|)q0M7!7SG^A#d`HT{^tMERTg$yvn}Q zD#qh#daNt`#kvxvdX~XF{u>K%>+D(Gui}4lB)^LP#o#U#|Hmvlg9>aWWHNf_1uW!I zw9l-?Ypknpu?uKaOVcU z9{L1gQv2B~Q=`Cx=$Tfa*euWFblX;9vcF3EUyaMRtM&%7>O;K}v)W)(jSDsmsd_(R zR<|7>&Z-+!eZ;J~hIHFeW)8B@KC^~r?tdfN)oRNglMiQ<`4-Wi(dMt2d@!ZWUpM(^ zzit~~#*99U= zcS@brJ_u<@#%1dS#d1rw&!??_^*Yaux`K1$|l|Z z3M=Ef{f$rEqk2bFa~(H0t#g-~m0Vc6 zZ=thUC!R8E&MBn^&Duy>HFufS#yahO1iLkAH=*n5)$SvV?AC68xp7jvYv}CL?pn^& zs@-+wLYH4PZoIx$HBDyK>(lNS6Vlq9Kp5>#A*!{z)vVRmtEP<|lB((Av>s)Ch_$U7 z#X2!0uCn`4v+Oa-{K)!VQ2Xy8TTSZ_&OH^=?kCu~U%UU0(^=H4c?yRL)brPio^HR` z;<8X(lec|t?fwJ1cBtl$DD6|tGw3(2n!lTsnr22NUe0!1J*vGFHN2{QInFbzlFM1R zStU3`uwErs(lQEv#H>XU+IuMr)vNY{xIf~0PLWZ`4Q3gjDZx$MZQ2he_T!0nqVt4G z)`1Xx{n~Q5$v4fbglNS-Yd11+01ZBTp2z*2D%no|oJyD*s8%iR+c>CN+}C5z6`!(t zj;fXv*_c!L>&;56S0&}_*Q{EyVPm^W&d`dhEB%SJSN$5YwGh z$q0ABLZet0=>8b1bg1NU*5q7-+0=ka zz)nw(N(h(wS(Okj-d1@?4ml?L5aqa(A29^waDhe(@-5%99^zNYQw+ukBw}~JN-h-c ztmJnL#z24IKqIRCBkDL*a*@t1m5`wWv~Vfkq{5#ttF0XB8BT)&fMsY>CI7?$K(F&= zd61ESWoSyZ|3?3`O2AAoBz=on8;xiW02g#>4>T$?u024J$67A>>*rx%51{DjRQZR^ zO1w^c@T*X*oLAu=)!OqW^r+IFx6nDGJ*4^o;%%(QkhhZryxMcMS&oFX=NgWf(w_g} znOn|l&1!I1&UX`PLG5`z^HAV}T*JJaA411I6zI4C>qG|R{45>aN_n^*mz-Z<*?Kw2 z^0H2}VY+8H`k0*0 zvP*+1{%BTanD-CPGbiUi5i`ovo3*JveIBUqa%EY*^95X9(yw)WoT9s4buVU(5t}Jy zEw*{sIGs>7io(+b}TmSt3T zHNNB5j(-4Y?7wOw$*sB%ksgM$WrxX!GD?kd*16`ao`En z;ZBii?Y+}1Cp@a#Z1U~3s^;chqpI5m-Ak%&JHoHMJ77a|s-u4JPwETBW+~xS-NV=# z^?|lTzk*koH3-VON6k{sslA|BBC5SEM3eT$*^e9rAMaUksOd0_VN845%u3WL$6Me{ zecB5wC973_)T|`>l=>#xw`(sblZa?9DC4WsUQi|ul7ExVu=akBvLUD~rx4v3_qErM zLdK`G_a`jWrM=(^rNLg(iKz={FDxrQue|_EBBab8x#J$~{X53#RO&};Hmf?WGc~Nb z=gew4s??9^%;}C7n5A@=9)6KonoFvli%a(FVVkB?-Fo;vsNSWFi`9Y(UT4;7ru6WK z&2l=Y^oIbLM)ZnY;M7el_%sWSEAwXt_o<$6O;#zjMCXWtH)Fhv>UWtHZ@=ov9*G5| zR(Lk5hmVl4(s~$#j@WZvb(6V}TGX~RsBWw30oim+cbvufrlY#!oLR~?swryL62Qi9 zSz$=^wH$1esh3hqwWR8FM}r+5v}yZNo3>r5Cv{-`dXjsfo&HZ=huMM?3f=*`pnuRT z#hU0(K~8&eAw4_>Fb3%O)H4hmXjlELS%st5!>+Scs(+jvhm?Aj(ew0vgNF^OCl0gr z2#zli2aYlRYU;52GvNY99@LH>nEYs`>VL<0myT>SE7fV$1OKTJHFcXcf~x+{jO^3H ze}$PkRZm=|{CfDGRvkS|xz^mFh8J>}8Z}V0W$M+yoh264a5*hoDPQ&~jF22v!)us2 zriRzjzo3S9V8>Q9TtlZ%4evFp&333JL2j^hP>+!Ovjh5K zHx>>~Tc&iK1tFInA>SK^9F*m?Nfp6w+Y(B@11AmYk;lQ{8a?tYhIs5o-^IV1InMW> zd|sum!Hqif2!WMl_n#sRqu%=j2_T+P!!Otws{^LSsYfV$T0Cm_bz$lwE3_=+zW4Wb za8{3ywp*I?2*phsR{u9NdqM|3V3vp4R0=P|7hPDry-}`LvR;Gsk&|0nwBtu6Kk1a~ zDzn;Fr>0+U=4!dFV@SJPA4jyQ=}B<7RjyB()%KKJp8}o6<+>Hn_g2aE1(To7%5?`r zJLI~vAoktGGTlnw$oV?ux|g-#Xcx>%DkvAlb6c&_H}TBq)D#z#mTL#|j9e5lrcN%% zUi+B#J%Wa9x^tiX1ffYT@@gAH4-`1azJo+>QZ7KUwO@A<%N+}H9c9RXT*u97M^yVr zxmo+^LerWJh8mUsrjVf1&u|_#14>(_+LoBj+P8#%ENUO_)83+efOosKkn2y@#%M@^ z59@Q0oV&8xhcmUe$@O>twC;F zb8twd(888_xyfpS({jHJF`?48vmx69GJ|s}z1my=GE1R_Et7J?2L~q=90qASwSBwE z54+@kKb?yz{Qx7!sz`J-`CoI}wd4r8Y{>$o(M1gz3d} z_Q(w<&9x~&2rs7OCgtU#azjmXF_luccBAM4TI>!(&2deK5ZvTpL|pD82(Fh%%Aq!> zmg`hz5im~5eS%h6+X=|t?wH)?XeQ)7k7!bn^|jP>RbBGIUh#5Q2*Zid&7`QCeBu6=MuFftx_g4dbt+39}TO zmzz+`4XQNFU_1<@&JCy(r0zo57Fr16GuNlmHXyi4rBrO)xMVw>2!b{@BsW2u<8HbU zBXaj5aE<V>oqg9{ul{F&AdzDy1y!!b+5dLvDMldGbRyMqDtf-RMr#57o&1 zWkjt?zrq@6xuF2$PWRUlcoOU97UljnVp2C=4gby#%l$o?1j+XiSoQ}Db*l7-X1RyX zpI~;J=BIR4tCX^_lV$#o7WO6M3^mG4#u?(Cou7?qAcwBc5hW!gz^RM5TGw zWWQxvDV2gMT`ejFRa(6Elxv*QuhQQTq#Y{#En^Wd#m;&Ke_+-^c=8|cU6#9O0|LYT z3FUBtzp!3HcRgbn9-aaj8S={gHy(27z)@~WZdm0|Ql(InZZ!B8ttPqu!?PKAUO>wu z4;VERk_U_$3d=*a(Cd~5_LB3;12_%&wEuGMH_$E*=roj;=haq^5qUtVAu6KRagqh? ze?5a@tak;yL3zNZA;{|X+*)_5mG$y`*5t!6t-IYe8|3+XfpUAu(K&}aciE%1$pbPCp(#mv zutWRTo8^ASfI&l?$%0RvJX>ky-rrab2jy5LagS;LQIj9x8L67;c>l>lA+n#EW1b7E;HF&iR2Gb| z?!2v8*_UEDXFnJ8RNG_~)!j`tL10!lK^d2aMBdXZkC#*S$m26>sTO&FlcAVA&jc}VL+_Cv|cu;tP7+7II!soAcQTy8&Td)1y z=ECrx_ETLACbfUSCt`&;=r8Njat6QrOm(%sP!UOA7MlPEQDD-07WPxvLpALMG&K?~gGvXtee=uXn*4r=V2C8Rt;18w*t{^csdR?*P?VukJoYrPpA2$|)PR(ZoL)29PX1?KN1LvRjn#;lH^ z1$Q`>mA4tuA}^_7tXW=C!&sZV)MjH$??Uv+OBNXGlD7xZBQMEdtXJMXL{46E!kCd4 z{x&u!FG*l*R9?`3yh`3tTAXtXF)lCYKh`hrBy}*IQ?w@Kh1!ix$V){xHZ3opKQ<;W z1aE9sUZ8$#N?r)w*o?f7AvgoTKaN4af^f(S@Q*FZ`&C*_dBOkjT6w7e#}?$}^2g@n z1@OlkYylRpc_#YdV7u+A`##Rtfd7;;1-SXm=YS*-G6Vt*gncJ0Q2#ryw4%HG{AgpK;D1TO34eRkEa>z{jue+ zRcd^Zt+Hck1klIk)d--E$JKZ#qFIe_&+!(0>1BA>pc-FZi0H;wATnyCR2^?s(uyqM2#B3_VJ(^!S->V8n1kQ0w_Li)cDrIkn|;#EHNE?A5LGZFMW`>@ipk^ zbvXD8>NSp7^*s90_4a{B2S0(YbI-rB+S(d_*QeVrc1(9Dvjg3n3f+vEZ8SC70%xjD z=@pZ&?^7e}Y`j;YTg+Ox%0512AJ^;PX|p_Tzkh11vK=mkZe@W59jr$m3l@!j+wWE* zC^tT##sp$qnJdA02dBodCu|Y=p+5WhdsgW`1Jd?0iN--Y&#Oi}d16>6vt}h5(z$2M z%J7&D0#Nl}@kz7nZdYg>O4aM!vjluv>A%^pX;V7*gTmLu>C3RsybeMsMtjwW-z-k+ zAPj-ELmQY_t%IwKtkS_XgjWY41|#;n=nwtDs$?0^+kae03kUzoEVm9)fJ|g{@SmI< zgWb=0N9E(*CdXAxQfe61AShJC=6eoNh|YKn>aW+7&+tDAu^%0GAR&z$$;=v zBRT{pp0FSMcYefvbQ@6S9rSZ+b7tADgt@?-LyvR1E*<(Rbb3I(Z=!-LCLfqK-zgu| zcD_r#?;yJ66X>4OA^XX5hkW0|W`SDyz`gli?Y!OOr~LB$h*qC`KeZnUWALA2a83rr zoo98Z(>(S8z~(Xfs$IY(ADA`IWx`EIxUfH(Y&V9n3z5Se(kRb?;4E)n^ukQZv?f3a1nL4H6B>f#5q z798>eS_@V3zl@3V@&j7)3yMEw);f&*l*kLU@>3#vc?L`^IOV5Oo}ZQ9#&#f}vNxl5 zyZpe_LY@4;)q+ocxaWdP{3!&G%AC?Pfe?g0HGQ#Bj-;v%TJ}e;Fka0h@0=|@Y&QLS!-*Q|29|(#`qkaP4a&p5tW}ZdLgcR?=VYqeYzKJ zF@^cyuJir!e~D)a`Jt}!iwZRo=1G-p0C(o(hiNR(y3Z_+IaIc_5IAMq%<_C%exh`K zNPa5i1+Iijd45{{-S#Z#M6}M^Pjj1YvW&i4@e3T?Mylf)F0E=TQCi)e!+~W%BT`aocBJ(7A>k+Z&qf*swXaz zT*4A(N$K7^tFY(_Yhy*q!PySo`#ZDJ#gL0!P@~HJL?^&q!>LDA2JX)|RrW9T!%Y9- zS-aN1fK@tl_!6_ckkVnefFY0%zl4WvTK`hBwAiD=mzoPBi&}pftv;=P85Rv|{VT`~ zy*m7A6!mNUYw7IQ;WwD&#gNv&k-MDI;j7qbN&!l_#e@#OjeB9Icc5Cm4!;vor@%El z%c}XF0?9c{P5})LzlV7!@m>}SYW)Y$uu~N~ut7wJKU(-6F7Ppv4|Z$)b>>2_K}9b& z`7C;pXcp2sd^;N%9sV3=Xi=sZdf?LfJ6WY!fxEbQR9eTj0R`B0v0m#pmvO#dI2h8PKrJG!Ks};I0n*pPv;uzS%_)_@%}k>h$FIEpD_9M zdSxhwQc&Fl&tsgh}su`_^ciZ#LSZb#A3v@1M z{g;_w>&KHj%~G7P)K;@jt^X#=am(L>5PP-$`y7pH`Z3SyY=wB*R*17|zR#?I(CeRk z{#d^;J)$liGm$&h~1< z3vo@iHeA9A{o3$ivhAXR5S5y+b~sEP^0@);C6YJby^90d@EVh^?o#k|EHv-UIo6^L_Pac)qu_g3Caewb z=QwfgsE2elDfl6l2`cnUichzKWQuB5yN-Rk6}+C-jDnwFNI)BIWP1+rNu0SyMc12r zdPW;=W+eK43elkr|BoTv%JxxCwrRs1FpmiZzsU4X9l6u2RL^O{-6rp}H0!nZq7{+0 zVWY{{E^5R5^f&3qR{Kthj)2nMsE+KR%8hEn!vLC78}@Ot?K(o`SY4$fN6du@P8=q1 zJlY5q^dwofB4U|$SV8;3oL5JjX2m(CBQ8pgh6!zOvl1CCW;yJXj(F*=ueDc>?~P=2 z1P0aEr6Xbb>sl2Az?~T#iLqlwp&+I<3Z_vbsN}2RP5}k8&kt#3hR3g<0a2-gafa}` zHuQop)e82RymwZ?eufU~$S_A7(2)_+pif5@SbJWpuqDMhuA{5h+XSq1s;%~GX75oPxs^C8n zEc2XMsq^c|znIXk;D5|g$BaTR;8{+g7b5x;dJ!V7qc65sSEJCSTxFOzpbiNo6@tu% zS`>PP{j{iBN1^RC4uxJtZ;y_?1`AR`y_S|yh^PwT`Bxya3PHU?gW9ujx{1z49lhDCG)yXFKlYo_(OZb~ zh(ZvL(4vmsYVtE~g+6Onb86$~*m*#q&zt;Yw~l_H@U7%VqOc*M5K$Or=$#0cHm)Pu zJPLh@7Qu55L+sa^&3D*OV|>~%1sLH~o7rVj8wt&ZejP0`E3QVJDlsdqc=!WMt<%PB zJWFch4wH9vC`6_SabXW3C_-lmO;#x>7(cWZL5VX*C&oWOYfhnqhb$M*b#(@U<(i?f_HwMYiL!-7GcA9=XcnIy8nBK zm_k2b3{FCk?(WvcpCdTw|ADu3{?e@0V9O`z994)683}9S)3k^Y;3LfS10N9_Kaar0 zml4=x6@lKMNI0y}Zx9^+_s<{d543PFYLG~sLKhL$+6bb!+qCg7&u{xzT3GK{o;7OY z-)UuZ^dDx$GoX#n0k+uUUmU4TNB>P{gTi1(B&+Zx_JH>DkWDYLFY(6}ezD2VN0s>r zUfH2Axj53K@TG`8g@Kevk1_z~6wko>a75wP&`AsMh(r|zH6qQ*kc6iu6~2mAyN*$6 zdxjK#GZO|Cri_lHlzD<3x)r7rj`S=1PQ-x1??R*%z8YPJ6$VBkc-DInNrm5sXi@kB zh*oX-5U1i9XcB4DrjHb=+fC3*FFRaUI71jLi45u3C%ETnWhe=JGYW$x;W-`q6e^8q z6L{gB(Xr1mWJ<@VoP%`=f6f}krDLCG46A(sWjN;@tTv`Fun`$m_^v|o4zHu-R2bNZ zR_Pe9(b1$$_faGJ6)s|@R)xWZ5GLHl$XOM=#N;zcWw`vQ356e|b6#NpAu^@#9>j=_ zJ;JtFWFM`#j+N5N>DYd=;;m5_l!#30*daPII(8V5)iDzPOrwrLQhk#;c7l;Ug+YnP zgpQr$WOSYa-sZLGv{`m=A7{);oFZ z--(b3qJ4_I4bi3uksa+;+!BREha%USrB0_JHniCUh7j>9LWD3Rw#fX*YP-~2+y2?z_$8 z15s7J!mP}N6!{#_rc^Wm&tsLl7}Brf>#)e6B6rh@D6<}Ds#0WwS#yo)_(q;36~Um< z5k;T?kuJq&%$h-Q@_Q7U5y#|@5^ekS#`o=IbD@7k z{hWOn|BC;hj;BgHKL8VL&}I@;Fs2i)WRILqya6$&&2Q$4jZU#~OOsAq4b$w>iEFH2 zg%j6O0)+;2;@yQeV#D>=$*B_`p^$cH=f_z#pv|8&`NeS^dJCrXtLQ0=@6=AbYN}I@ z{MuX?@TrJ2o=I!-y=E=OmYeLa-E?9HoAhY&PLqezM0ZhOdUWC;$bXcbH}A2LI;0c8 zH|T%ju*uI=>%@uYTjivwu%h;c7Zrt_#kv&*X=6Q#)*yQA3QoI1PSH9#`gEe+tl)6b z23q}!x)7Ype)Zd;sE5_76a|lC6N)xkLsl#5qdBUmA2Fn8!1mTCO6e9GQ8e_tfXT&J z73_|UYf}PzY*ZBNj!kKE(&Pc{qCc9v8C04IWGt*oQsZ2oqF{EMEue65hoXRWpg~cX zTpTsvZm}6f0qfYjHur*OUD{0H7hrcVJ6@;g0M8b*8T4q3Dmq;FrO#%t8U`4hpygH+ z1{ZfJ3UiM86otUW{ff>bP;ddkejs+-zGxAB)Iv-)f6a}+m z!-{^3XAwoI8RKjTa>uzfkUL(l&EG{iMgrh*1S~HeSCrB*#wwJK@g_wn9pec_VMj6U z{^w>T=F;XT@W`~Hl#EXM$!Ywb_G|1pMFIABQqiXnTremTSys7Y(>kI6; zdsKc2>NIJ~OA9}WDF^PFaw@0bYl4MfY5;D_>rEc)+;RmPTmP)FUwQ>x+VV#FyR_v> zim8lFzZgA|`ox>f1&qGstte+-qH8MrylFt?Z#OF#sQg{5JgD;bAey!1eTBCy!W8jQ zkG6c6IW5{kh()Kg<)dgcukz~{IiM{!Ffyzy1Y9trvJ$gA(W!EZI;>s(se;(@Y37Y- z%V+J+CslqM^J3cad5Z`wD*pn<>{j_52>WA(rY~BxhE#qhbDea2;vT})sC=VYu5Hj3 z;*=#5RJntEk~K2P}4D#Ku?Oirb#tX&n4=#<#YiX%FkiBpqeV!eN&8j+y3P`w4hY61}wU21}w#kmx!H!8a(ARs=WErG)Ah6#^oR$HPbpRivjC*EXV z$zV&uKFWP2S)DB*X$hywn}PI&Ds3+@`Qd79=|F>dZRzCh@U(7P8~`GQ`)wI4yfI%+ z5@{M%6VaRuXv+xjhzE>vILt5x3fPYdo5tgEoyoWuphUuaJ^NvMK`|K zTuA10|E0_s*Zr?R%E_@Dq(e~Hse!?|n zzwADFrCEw+b^leg+jRfiL58+L#VDWBtvY$FSxMV3mSgX>JJ;#-$4HN}+IJoC;aABY z&2q3o33yb8&5|c?FiQ!yZoH9nn`u$#F*r>|u}`y>Lnm)3yn!CO4Rehu28T)yDE39O z;uu%V{-Ui;C%?q*3vD`iFUr_&^b?=7j%6b(nDn?b53F{U=?$zSa z$?aw(!!A1!_Pu}Cr@*E5A;tEZRp*>yk1)DJC*e?;1#P{>GBX!Q% z7b#-3b|0T&ev_Z<(aAUlXw=EHS#mTfmN6@{!@4nR-`jNSWIKtqu1hhPP?uFL-fCgH zUw7UC@2u9Lj9DIO*8Srq-<;FQ3A2*5SCF{VjE>wX8W^+*#BDPojUnLw5I<12VvT) z7?pH$uTH{jvOS7XIW+exM&*zlQH%uB+^73-*R1`~U}A$cxK}6th>y~LvG9}1eVfd3 zme$|QvWxAWV-9!rUyz|u%|Wv?F{tq}K@mHF)wra(}*0w6eUyZ0yoI;_kR`J&|WM1*t(Q+tGz0kU-A{;k6ruda~ z)+)3NDzt@E zM4lOflz)_#TSXr;%kB=vsT0~fieHb2Dt;p(qBsRYTZ7^^A;OA(3K3NN(}mWl+s$%% zSaIt8wwQ|Ug6$#JnN=_Is0`ZTic{#fWfZ>$fvPaUwzMLj;HEI*Mp|gL36WC#zQVJj z`$=zDsEAy&sQ6YoDVH7qz1kGtMyo}q@cmXSgzpby*@x(CRuOpH%wp6KZH#;v>n-Ti zBgAh^@%?nRD}De$@qmHD(>hhgGgP9YXzNs*ilWV_IH{#AtN3w5mx@kcrMQw*@4aXS z4z}3u9pY!KM_Zh@ss{9FQ4w?!hdD)pXr5Eab7nc5)v1K#p|dJVW0n?ulH$G1)1Y*T zSsS(g4WX#bEIE4>hxHksPW5mmj1KECTpdNZ!OS@Xmo>x+7-+cg`d$&4A!BKNe9WqW z6GHq9S2Jm|Itx;kn}|-48d|Y2$<8z>4i7X@otigG)y%N34JP^&f0SO#^EiU5`Whmk zqOU){8}!d4b?7@>2W0l!oY2N_{CjrO37z^r%QN_gFkg>~e#T=IB}JsCRRq1u;-zE? zlU30#3hNa;iJ(sS70+sQ>M5q9$kRY+hfZA}DWnyL+jVs6)Cw&WSffQ5{2K(P`Yi&D ze~;+bDI2fE-JjS2Fa9%~9Q7H5rFZeaS{k&lz1^>N(th)V@{`EijI>?yo?s&<&f1`B?#b-ah-k@&#IIlfIH|Uay#ag zz_B_VN)Wjni%MM4ez7mssl*%TtybcV^e!l2U%iNRDRGrqnosETn>cHu5^pxE)niJ$ zg`u@d5Z;|NN>EL9c-fdTFQR2OYe-Ci_{kZgu zPJaNgpwl13NbL9_J7!P`a@lCTf{&TiR;TWxhzqbI)UpZX2*^&C5`=tby$Wfl#jC{4 zG<{0^KlJEP;uczdB|d|2D?vjsGei6{k(Y{DWK#4mM zK_%`&gq2u_Ad`X-oe{ZL%<51;Ti2U>2fA!1_(TG5=$O#y%_JMh=>6Puw-QD6EU3NR zthG1lI2_o|+B;c>tpS40sJ1@LG9B8w58=?(Qmi+ltp^GpG;TeF@aXtwm^Y%WaK)}^ zojwn1snzL5v%=+{_91fGM<#6U(dj5HpSEU9-rBd7R*gF% zaLKQ*c08l>F5uh#PA*}ITX#UGzX9ycX{%5^ro?xxuVWc%upT_|`v^RdQmkiM32LyO zaV4n1dT@lF0b}?q`LPElf5J}eQ-Ub%nNWfR*)ylauMm@}_@G(x;`3Bsz1#><+S{N* z;M#aV`#KABaO*OroKl&6Z;M;DKb7hlQ)FY|6QBf%w3nk0pS^WDWZ%F@Dq(+p*hQ56 zonGAPIeS1O7lrQ3h!R9%uU|X?FE`6G^-7{yZ>!GS zN-#|7fzP55t3w}qo0KHXdRvr)MfSESNsRTNCb87(RC1j)YffkG#zsbGzQmriY{F$C z3AfiMLNOK{RdO>e7A2T^ZT){{t655SC~5x_jax~wYtN!?-fk{*_bG|r^xJxb|x>&*VbXX_6fWYK*6RMa>%FMDisRz0kHKH@*z}}3`^qS?dIYkJn=!DJ?KX~Ms9M6J!V2FfU zrIKS1mKL2ErxUk;YW5E33>nX=GiUO>y?S6Ctr~RZaXQmF^HoHn&U_uwsWbKmI4u1w zS~Ys$+l4palixFIeTW|vej}0m5%?TdlF;e%D)|$HM-TjzHSK3iO(|?nQFG=ug?HAI ze<&245sJhIAeb6jVHEJ8x4k5hlm2}oCb-Deqs!ef)7yk#f?M)`%H>}i~3nX*f zyJ&@U?_*|Z(I|DTS#F9c^&Ybpn^x+*@Yh(1!T9bbzDTdw%yIs2A##}`aMeFbOSEM z*fF)BR1q!vPd8Hb7kT|CL%4j)M8^*WhboRJesvlA6Boi04?G!^gl{&+aT9thUSgO;u^K|wq z<)G84R1Km|sam^nNN4Nqi(U5L54h~VA3y~+&%HY9Ve1*C8fp2J@*(PV)^An@GD-z# z1(XUR8kGtmICdD}Rw`<1%_^mmG($?I5pkt5h^$gs1m|hC$K#mBEKw_^I_Ygvsu#g+ z^dVA8kvK+z+6Hb7_9-=FwT)=k7fpU{T02@yezaXFINt~dv;WRyp+>10ni*|-v{1jK z$RHyvI{P?8yhg5P%_@M9`kGw_8-AUKJv#eM&J$GXTeP}$_Sfxj7_$PZrF5oZB zNGb)%8=28If@`ov+nz;mD}RIGRq0;)H6l8}@J28k31!r+b1&d%ol28eMjI3)`Gq`6 z!}mt3m3}F=(4x{80!uyGei{A{RvNwsPufo4QoxMq+)dantu%aYWLjxzxY3x>&^r4goKF{iA$sl>lb?#KnPPE) zLy&exT{?F=stqblz8Otu`xi`p*sXNo4G(sL*o~qu#BQ`n=hj(=$m$$fXkc2~zr@B< zN<#-mo3(utGi!A2KC|R%SCU8vZPHtKhEgP>(X7&d1p#qxD<=%=+%`7O=^TaLV7Jmc zcs8eVyC6Oeog*Xl&uRNZtl6qGnGI(t+DPdYm;1GrkfG`$JnYmtQc^!>D>W-asCa-y zhm}6aYDt|t#M(Vd11X~|I(HO0?2!BQW_55t>63+vO`l>U6`O^WRynW0yV7!BX_j3U z4=awDrL0$H4wCljw7nGf#~uJCJ+?SCYQhIH;?;d8EYe`X|H@-IAV*Y;;j9%R~1@BlaN_i^S~o%@Gb znQ2z~U-nJy8g2hK^ILW9KlHY!f*_jdRmCL;ql{%`sjMnqOtVuJkh7UKRZuR?w5sA# zM29M%YBSxccp0Kk6|X=5UYDEv8070igh5;tuc4E0f2~=L*C|8zObw~x^>i+%;tE7g z6>mV$`9{Q?c3g?VF;!eeYfwAhRCwXJ;?1@j&Le-Jws+yaRWQdDs!V*8jGre0Gw@91@+2ol`;h2 z)S@c>A1#k6C`M)js<;*5Rpzryh^XQ=S`Dfo`Px^4KS!%p6`zNdPpg6qJZpdSvg0md zj(l5j4;?Y>pnk%!Gt^JhHOdfr)0lM=0_7;9W;5Dxf8l2`nFs78qYq&=+o=rI)O3SN zZw3b0Y$u&rWk7;y_JsJ&rd0v)gTg2Gb8~FCmsUc%g496_d4!fN`7--#a+^^G3OLPC z4j}x>96|(?A<(9MsyNKxCRH3kG^^q$ked;qv@)AgMR`HV4B;^yR>f&rxYZe)t+rb` z&XJ@ixYJ6WqI#A6ipQ%82cz(U8bqsh)Z$=Ve;qBnw%(4x!VNU>MHd3^CCg8TRNv~#Kmn&n1}7P1aLp^7jM{n`;R`Ed-}#KeSl#Mq-+nK-SeG9>%yn2KeV z>$$cx6Ido|R_4&Lh1Q}ntqkGFZM5(mgCMV1Vza0nFkg@+)5DOoGL%ANICvkvgMaih z5+w!@W7+{77{l1`;8}cta&ne1p%$Tbsu(dV^B7^2PGVq;-K$hFZu0Xzs+eNPgleBD zaHbtIH~bI+eYY(K%IKW>y28 zsstD35~_STqD?!-us}jt>e}gHRldq@6HykHL7`mr&;%37RQYUQcJcDnUH3 zs`3+NWqeYVH`0o#65yNbQ02{bwSX!=ZI-KMRe1|r^=Rjdpgr3Q&5_FsE zRi=X@rBwQmS#BRt<-K-3TCJxTjHz-XV{&ruCUatv`1 z{+{Qb-OIC}nn~UBi`sQx;ib&x&9FuL&F!u&_MfKKs~N_;P_12C3vX~Xle{U9cI{+v zM$Nlu)o9m)w3^jiOsiJSIC%YxcI~0FQ_cAKLW7zgE_``i>N3mC^J*q#FSxa9f8m9( zW-|AJOXuKxGdlRRtFF1&U=rSnc&F`Xyx&X4Il-n!t> zd5>9jH)xj^Tl&;YIlYijvyU6$_yMjdtGnX%`^ZH#M|hZ2a}wGt&Rp)!S8JC&|tVgg_;ax`i-Qzok&4uQ)c99CLLK9}S z&i<0;Gn3}RD9TTBFsRxLE8Bm8>A%;qL(A yboOh{J!UzY&}UY_{CWFdY@2^$i4}KErddqtJc(j4uJiv!B>q2VgNf-hTmS%eUL@53 literal 0 HcmV?d00001 From 01f9c07c3323843fbc0f1da1a08dc449f488fe00 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 22 Sep 2015 16:42:13 -0400 Subject: [PATCH 04/12] remove unnecessary delayed loads in wordfreq.chinese Former-commit-id: 4a87890afd81d87b3b30df13008d9013391dc446 --- wordfreq/chinese.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/wordfreq/chinese.py b/wordfreq/chinese.py index 03a1ca3..e7ee371 100644 --- a/wordfreq/chinese.py +++ b/wordfreq/chinese.py @@ -3,22 +3,16 @@ import jieba import msgpack import gzip -jieba_tokenizer = None -simplified_map = None DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt') SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz') +SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8') +JIEBA_TOKENIZER = jieba.Tokenizer(dictionary=DICT_FILENAME) def simplify_chinese(text): - global simplified_map - if simplified_map is None: - simplified_map = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8') - return text.translate(simplified_map).casefold() + return text.translate(SIMPLIFIED_MAP).casefold() def jieba_tokenize(text): - global jieba_tokenizer - if jieba_tokenizer is None: - jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME) - return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False) + return JIEBA_TOKENIZER.lcut(simplify_chinese(text), HMM=False) From 13642d6a4d53d8a00860e7d4bd5b6599c65197cd Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 22 Sep 2015 16:46:07 -0400 Subject: [PATCH 05/12] replace the literal 10 with the constant INFERRED_SPACE_FACTOR Former-commit-id: 7a3ea2bf796c3f31fdf7d1c441b12b8ec52acf50 --- wordfreq/__init__.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 4790282..85e4711 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -21,6 +21,14 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) # for the fact that token boundaries were inferred. SPACELESS_LANGUAGES = {'zh', 'ja'} +# We'll divide the frequency by 10 for each token boundary that was inferred. +# (We determined the factor of 10 empirically by looking at words in the +# Chinese wordlist that weren't common enough to be identified by the +# tokenizer. These words would get split into multiple tokens, and their +# inferred frequency would be on average 9.77 times higher than their actual +# frequency.) +INFERRED_SPACE_FACTOR = 10.0 + # simple_tokenize is imported so that other things can import it from here. # Suppress the pyflakes warning. simple_tokenize = simple_tokenize @@ -190,13 +198,7 @@ def _word_frequency(word, lang, wordlist, minimum): freq = 1.0 / one_over_result if lang in SPACELESS_LANGUAGES: - # Divide the frequency by 10 for each token boundary that was inferred. - # (We determined the factor of 10 empirically by looking at words in - # the Chinese wordlist that weren't common enough to be identified by - # the tokenizer. These words would get split into multiple tokens, and - # their inferred frequency would be on average 9.77 times higher than - # their actual frequency.) - freq /= 10 ** (len(tokens) - 1) + freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1) return max(freq, minimum) From b4628abb388be127426cf79460228d45d426b7de Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 22 Sep 2015 16:54:39 -0400 Subject: [PATCH 06/12] actually, still delay loading the Jieba tokenizer Former-commit-id: 48734d1a6031a500c121e762dd2075dd37aaeb68 --- wordfreq/chinese.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/wordfreq/chinese.py b/wordfreq/chinese.py index e7ee371..c07e77e 100644 --- a/wordfreq/chinese.py +++ b/wordfreq/chinese.py @@ -6,7 +6,7 @@ import gzip DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt') SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz') SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8') -JIEBA_TOKENIZER = jieba.Tokenizer(dictionary=DICT_FILENAME) +jieba_tokenizer = None def simplify_chinese(text): @@ -14,5 +14,7 @@ def simplify_chinese(text): def jieba_tokenize(text): - return JIEBA_TOKENIZER.lcut(simplify_chinese(text), HMM=False) - + global jieba_tokenizer + if jieba_tokenizer is None: + jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME) + return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False) From e6e29a1c03535555d0b18545e1b8a0b759fcb9b4 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 22 Sep 2015 17:19:00 -0400 Subject: [PATCH 07/12] Make the jieba_deps comment make sense Former-commit-id: 7c12f2aca169951f4619deb168b00fe14ff06809 --- wordfreq_builder/wordfreq_builder/ninja.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 2c94f58..80437ff 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -215,7 +215,8 @@ def opensubtitles_deps(dirname_in, languages): def jieba_deps(dirname_in, languages): lines = [] - # Either subtlex_zh is turned off, or it's just in Chinese + # Because there's Chinese-specific handling here, the valid options for + # 'languages' are [] and ['zh']. Make sure it's one of those. if not languages: return lines assert languages == ['zh'] From d215f79ea37caf00fb826239f2888f50d71d8698 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 22 Sep 2015 17:22:38 -0400 Subject: [PATCH 08/12] describe the use of `lang` in `read_values` Former-commit-id: f224b8dbbaf4eb30988def5c35c4f8b204e5823e --- wordfreq_builder/wordfreq_builder/word_counts.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index 1ba7214..a3bf0ae 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -42,6 +42,9 @@ def read_values(filename, cutoff=0, lang=None): If `cutoff` is greater than 0, the csv file must be sorted by value in descending order. + + If `lang` is given, it will apply language-specific tokenization to the + words that it reads. """ values = defaultdict(float) total = 0. From 6b163e577291c1c5f02daad0a82c33e1c2810eaf Mon Sep 17 00:00:00 2001 From: Andrew Lin Date: Wed, 23 Sep 2015 13:02:40 -0400 Subject: [PATCH 09/12] Revert "Remove the no-longer-existent .txt files from the MANIFEST." This reverts commit 20890901514ce950ba1c0cc8fc91d7a77b28e80e [formerly db41bc790271ec2fe6f12f63a0a1d2f7ffed74fc]. Former-commit-id: bb70bdba58441a398aa14d528f96c02da0dd4480 --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 4f20a26..012f4ca 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ recursive-include wordfreq/data *.gz include README.md +recursive-include wordfreq/data *.txt From 4d00f17477e67ef42f98c6cad0b4ae0a83f06da8 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Thu, 24 Sep 2015 12:49:45 -0400 Subject: [PATCH 10/12] don't apply the inferred-space penalty to Japanese Former-commit-id: db5eda605116e5441745cc6712abffea7f59a47b --- tests/test_japanese.py | 4 ++-- wordfreq/__init__.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_japanese.py b/tests/test_japanese.py index af05c2a..9906741 100644 --- a/tests/test_japanese.py +++ b/tests/test_japanese.py @@ -14,10 +14,10 @@ def test_combination(): assert_almost_equal( word_frequency('おはようおはよう', 'ja'), - ohayou_freq / 20 + ohayou_freq / 2 ) assert_almost_equal( 1.0 / word_frequency('おはようございます', 'ja'), - (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq) + (1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq) ) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 85e4711..e6a4849 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -15,11 +15,11 @@ logger = logging.getLogger(__name__) CACHE_SIZE = 100000 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) -# Chinese and Japanese are written without spaces. This means we have to -# run language-specific code to infer token boundaries on them, and also -# that we need to adjust frequencies of multi-token phrases to account -# for the fact that token boundaries were inferred. -SPACELESS_LANGUAGES = {'zh', 'ja'} +# Chinese and Japanese are written without spaces. In Chinese, in particular, +# we have to infer word boundaries from the frequencies of the words they +# would create. When this happens, we should adjust the resulting frequency +# to avoid creating a bias toward improbable word combinations. +INFERRED_SPACE_LANGUAGES = {'zh'} # We'll divide the frequency by 10 for each token boundary that was inferred. # (We determined the factor of 10 empirically by looking at words in the @@ -197,7 +197,7 @@ def _word_frequency(word, lang, wordlist, minimum): freq = 1.0 / one_over_result - if lang in SPACELESS_LANGUAGES: + if lang in INFERRED_SPACE_LANGUAGES: freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1) return max(freq, minimum) From e7d46fb104d9af02da047ce056b2a336b3c57a82 Mon Sep 17 00:00:00 2001 From: Andrew Lin Date: Thu, 24 Sep 2015 13:24:11 -0400 Subject: [PATCH 11/12] Revert a small syntax change introduced by a circular series of changes. Former-commit-id: 09597b7cf33f4c1692f48d08a535bdbc45042cde --- tests/test_japanese.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_japanese.py b/tests/test_japanese.py index 9906741..d5a73b3 100644 --- a/tests/test_japanese.py +++ b/tests/test_japanese.py @@ -18,6 +18,6 @@ def test_combination(): ) assert_almost_equal( 1.0 / word_frequency('おはようございます', 'ja'), - (1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq) + 1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq ) From e27a75029d08525fb56209ff73a5f898c07feda0 Mon Sep 17 00:00:00 2001 From: Andrew Lin Date: Thu, 24 Sep 2015 13:31:34 -0400 Subject: [PATCH 12/12] Revert "Remove the no-longer-existent .txt files from the MANIFEST." This reverts commit 20890901514ce950ba1c0cc8fc91d7a77b28e80e [formerly db41bc790271ec2fe6f12f63a0a1d2f7ffed74fc]. Former-commit-id: cd0797e1c8081acf96f4ff43b5f8ef76f3e10b7a --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 4f20a26..012f4ca 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ recursive-include wordfreq/data *.gz include README.md +recursive-include wordfreq/data *.txt