本文用于整理相关知识,方便后来者参考。
第一步:用 Python 爬取每个搜狗词库的基本信息及其下载链接。
搜狗细胞词库链接:http://pinyin.sogou.com/dict/cate/index/167
爬取后共生成 12 个 CSV 文件(每个类别一个,以类别编号命名,如 1.csv)。用以下代码将这些文件合并为一个文件,并按照类别加入 type 字段,同时对官方推荐的词库进行标记。
#encoding=utf-8 import sys defaultencoding = 'utf-8' if sys.getdefaultencoding() != defaultencoding: reload(sys) sys.setdefaultencoding(defaultencoding) from codecs import open path = 'C:\\Users\\ijiao\\Documents\\Crawler\\projects\\ana_for_word\\sougou dict\\sougouciku_total\\' filename_out = 'C:\\Users\\ijiao\\PycharmProjects\\fatedate\\jinqiao\\sogouciku\\sougouciku_list.txt' f_out = open(filename_out, 'w', 'utf-8', 'ignore') for f in (1,2,3,4,5,6,7,8,9,11,13,14): if f == 1: type = u'城市信息' elif f == 2: type = u'农林渔畜' elif f == 3: type = u'工程应用' elif f == 4: type = u'电子游戏' elif f == 5: type = u'运动休闲' elif f == 6: type = u'娱乐休闲' elif f == 7: type = u'医学医药' elif f == 8: type = u'艺术设计' elif f == 9: type = u'社会科学' elif f == 11: type = u'生活百科' elif f == 13: type = u'自然科学' elif f == 14: type = u'人文科学' full_file = path + str(f) + '.csv' f1= open(full_file,'r','gbk','ignore') f1.readline() for line_s in f1: line =line_s.replace(u'【4字成语大全】 收录成语54,089个 (上)',u'【4字成语大全】(上)').replace(u'【4字成语大全】 收录成语54,089个(下)',u'【4字成语大全】(下)') value = line.split(',') name = value[0].replace('/','').replace('|','').replace('<','').replace('>','') if