文件夹下的文件:
每个文件中的内容:
代码:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import unicodedata
import string
def findFiles(path):
    """Return the list of paths that match the glob pattern ``path``."""
    matches = glob.glob(path)
    return matches
# First look at the raw data: map each category (the file name without its
# extension) to the lines of that file, and remember the category names.
categorty_lines = {}
all_categories = []
for filename in findFiles('E:\\data\\surname\\names\\*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    # readLines() is only defined further down in this file, so calling it
    # here would raise NameError when the script runs top to bottom; read the
    # raw lines directly instead and close the file deterministically.
    with open(filename, encoding='utf-8') as name_file:
        lines = name_file.read().strip().split('\n')
    categorty_lines[category] = lines
print(categorty_lines)
# Characters that survive normalisation: ASCII letters plus " .,;".
all_leters = string.ascii_letters + " .,;"
n_letters = len(all_leters)


def unicodeToAscii(s):
    """Reduce a Unicode string to the characters contained in ``all_leters``.

    The string is decomposed with NFD so that an accented letter becomes its
    base letter followed by combining marks; the combining marks (Unicode
    category ``Mn``) and every character outside ``all_leters`` are dropped.

    Args:
        s: The text to convert.

    Returns:
        A new string made only of characters from ``all_leters``.
    """
    return ''.join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != 'Mn' and c in all_leters
    )


print(unicodeToAscii('Ślusàrski'))
# Start again from empty containers so the loading loop that follows does not
# add to the results of the earlier pass.
categorty_lines = dict()   # category name -> list of lines read for it
all_categories = list()    # category names, in the order they are loaded
def readLines(filename):
    """Read a UTF-8 text file and return its lines passed through unicodeToAscii.

    Leading and trailing whitespace of the whole file is stripped before the
    content is split on ``'\\n'``, so a trailing newline does not produce an
    extra empty entry.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one converted string per line of the file.
    """
    # The context manager closes the file even if reading fails.
    with open(filename, encoding='utf-8') as name_file:
        lines = name_file.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]
# Load every *.txt file: the file name without its extension is the category,
# and the converted lines of the file are the entries for that category.
for filename in findFiles('E:\\data\\surname\\names\\*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    categorty_lines[category] = lines
n_categories = len(all_categories)
结果:
将categorty_lines字典类型数据存入csv文件中:
import csv
# Flatten categorty_lines (category -> list of names) into one CSV row per
# name, with the columns "surname" and "nationality".
# The path is a raw string, so backslashes must not be doubled.
fpath = r'E:\data\surname\names\merge_surname.csv'
fieldnames = ['surname', 'nationality']
res_list = [
    {'surname': surname, 'nationality': nationality}
    for nationality, surnames in categorty_lines.items()
    for surname in surnames
]
# newline='' hands control of line endings to the csv writer, as the csv
# module documentation recommends; the with-block closes the file even if
# writing fails.
with open(fpath, 'w', encoding='utf-8', newline='') as f:
    csvw = csv.DictWriter(f, fieldnames=fieldnames, lineterminator='\n')
    csvw.writeheader()
    csvw.writerows(res_list)
解析:
首先将数据解析为“列名: 值”的形式,即每条记录是一个形如 {'surname': 姓氏, 'nationality': 类别名} 的字典,所有记录依次放入列表 res_list 中。
最终形式为:CSV 文件的第一行是表头 surname,nationality,其后每一行对应一条“姓氏,类别名”记录。