处理中文文本以及 Windows 下含中文的路径时需要特别注意,尤其是编码方式。
ASCII不能存储中文
Unicode 是中文在内存中的表示方式(Python 2 中对应 unicode 类型)
UTF-8 是中文存到硬盘(文件)时使用的编码方式
需要转化尤其是在调用存储的时候
下面的代码先decode的目的在于,将原本存于硬盘的utf-8代码解析成Unicode,然后再转换成utf-8显示
还有就是split对于分词来说十分有用
Python 的下标是从 0 开始的。
# -*- coding: UTF-8 -*-
import os,sys
import re
str2 = 'C:/Users/Hit/Desktop/文本/199801.txt'
# On Python 2 the literal above is a UTF-8 byte string and has to be decoded
# so that a unicode path is handed to open(); on Python 3 it is already text.
path = str2.decode('utf8') if isinstance(str2, bytes) else str2

# Part-of-speech tags that are written out as entity labels.
_ENTITY_TAGS = ('nr', 'ns', 'nz', 'nt')


def tag_line(line):
    """Convert one space-separated ``text/tag`` line to per-character labels.

    The first space-separated field of ``line`` is skipped.  Every remaining
    ``text/tag`` token yields one ``"<char> <label>\\n"`` row per character:

    * tag in ``nr``/``ns``/``nz``/``nt``: the first written character gets
      ``<TAG>-B`` (tag upper-cased), the following ones get ``I``; a leading
      ``[`` on the text is dropped.
    * any other tag: every character gets ``O`` (a leading ``[`` is kept).

    Tokens that contain no ``/`` (for example the empty strings produced by
    consecutive spaces) and tokens whose text is a bare newline are ignored
    instead of raising ``IndexError``.

    Args:
        line: One already-decoded (text) line of the corpus.

    Returns:
        The rows for this line followed by one blank line, as a single
        text string.
    """
    pieces = []
    for item in line.split(u' ')[1:]:
        term = item.split(u'/')
        if term[0] == u'\n' or len(term) < 2:
            continue
        text = term[0]
        # The tag is whitespace-split so that a trailing newline on the last
        # token of the line does not hide the tag.
        for word in term[1].split():
            if word in _ENTITY_TAGS:
                chars = text[1:] if text.startswith(u'[') else text
                for position, char in enumerate(chars):
                    label = word.upper() + u'-B' if position == 0 else u'I'
                    pieces.append(char + u' ' + label + u'\n')
            else:
                pieces.extend(char + u' O\n' for char in text)
    pieces.append(u'\n')
    return u''.join(pieces)


def convert_corpus(src_path, dst_path='new.txt'):
    """Tag every line of ``src_path`` and write the result to ``dst_path``.

    Both files are handled as UTF-8 text, so decoding and encoding happen
    once at the I/O boundary, and both handles are closed even if tagging
    fails part-way through.
    """
    import io

    with io.open(src_path, encoding='utf-8') as src, \
            io.open(dst_path, 'w', encoding='utf-8') as dst:
        for line in src:
            dst.write(tag_line(line))


if __name__ == '__main__':
    convert_corpus(path)
# -*- coding: UTF-8 -*-
import os,sys
import re
str2 = 'C:/Users/Hit/Desktop/文本/199801.txt'
# On Python 2 the literal above is a UTF-8 byte string and has to be decoded
# so that a unicode path is handed to open(); on Python 3 it is already text.
path = str2.decode('utf8') if isinstance(str2, bytes) else str2

# Part-of-speech tags that are written out as entity labels.
_ENTITY_TAGS = ('nr', 'ns', 'nz', 'nt')


def tag_line_merged(line, verbose=False):
    """Label one ``text/tag`` line per character, merging adjacent entities.

    The first space-separated field of ``line`` is skipped.  For every other
    ``text/tag`` token one ``"<char> <label>\\n"`` row is produced per
    character:

    * tag in ``nr``/``ns``/``nz``/``nt`` and different from the remembered
      previous tag: first written character gets ``<TAG>-B``, the rest ``I``.
    * tag in that set and equal to the remembered previous tag: every
      character gets ``I`` (the token continues the previous entity).
    * any other tag: every character gets ``O``.

    A leading ``[`` is dropped from entity tokens only.  Tokens without a
    ``/`` and tokens whose text is a bare newline are ignored instead of
    raising ``IndexError``; they do not change the remembered tag.

    Args:
        line: One already-decoded (text) line of the corpus.
        verbose: When true, print the token count and, per token, its index
            and text (the trace output of the original script).

    Returns:
        The rows for this line followed by one blank line, as a single
        text string.
    """
    tokens = line.split(u' ')
    if verbose:
        print(len(tokens))

    pieces = []
    pre = u''
    for num, item in enumerate(tokens):
        if num == 0:
            continue
        term = item.split(u'/')
        if verbose:
            print("NEW ITERATION : %d" % num)
            print(term[0])
        if term[0] == u'\n' or len(term) < 2:
            continue

        text, word = term[0], term[1]
        # NOTE(review): the tag is compared as-is, so a tag with a newline
        # attached (last token of a line without a trailing space) is
        # treated as a non-entity -- confirm whether it should be stripped.
        if word in _ENTITY_TAGS:
            continues_entity = word == pre
            chars = text[1:] if text.startswith(u'[') else text
            for position, char in enumerate(chars):
                if position == 0 and not continues_entity:
                    label = word.upper() + u'-B'
                else:
                    label = u'I'
                pieces.append(char + u' ' + label + u'\n')
        else:
            pieces.extend(char + u' O\n' for char in text)

        # NOTE(review): the token at index 1 never updates the remembered
        # tag, so it cannot be merged with the token that follows it.  This
        # mirrors the original script -- confirm it is intended.
        if num != 1:
            pre = word

    pieces.append(u'\n')
    return u''.join(pieces)


def convert_single_line(src_path, dst_path='new.txt', line_number=4):
    """Tag only line ``line_number`` (1-based) of ``src_path`` into ``dst_path``.

    Both files are handled as UTF-8 text and are closed even on error.  When
    the source has fewer lines than ``line_number`` the output file is left
    empty.
    """
    import io

    with io.open(src_path, encoding='utf-8') as src, \
            io.open(dst_path, 'w', encoding='utf-8') as dst:
        for count, line in enumerate(src, 1):
            if count == line_number:
                dst.write(tag_line_merged(line, verbose=True))
                # Nothing after the requested line is ever written.
                break


if __name__ == '__main__':
    convert_single_line(path)