额,闲来无事,写了一个文本预处理模板。代码很简单,但为了以后处理文本时不再做重复的事,就整理了一下;随着处理的文本增多,会慢慢更新。
1 #!/usr/bin/env python
2 #-*-coding:UTF-8 -*-
3 #
4 #
5 #Copyright (c)
6 #Laiseek Company 2012
7 #All rights reserved.
8 #
9 #Filename:nearsyn.py
10 #Function:预处理工具
11 #
12 #Current version:1.0
13 #author: Chen Yu
14 #Date: 05/07/2012
15 #
16
class NearSyn:
    """Row-wise word filter used as a text preprocessing step.

    ``load`` reads two text files: one holding rows of space-separated
    words, the other holding, row for row, the words to delete.  ``empty``
    drops blank tokens, ``dete`` removes the listed words from the matching
    rows, and ``output`` writes every row that still has more than one word.

    Attributes:
        syn: list of rows, each row a list of words.
        det: per-row deletion entries; raw line strings after ``load``,
            lists of words after ``dete``.
    """

    def __init__(self):
        # Create the attributes up front so empty()/dete()/output() are safe
        # to call on a fresh instance, before any load().
        self.init()

    def init(self):
        """Reset both tables to empty lists."""
        self.syn = []
        self.det = []

    @staticmethod
    def _read_lines(path):
        """Return the lines of *path* without their line terminators."""
        with open(path, 'r') as src:
            return [line.rstrip('\r\n') for line in src]

    # Load the data.
    def load(self, filename, detfile):
        """Read the word rows from *filename* and the deletion rows from *detfile*.

        Each line of *filename* is split on single spaces into a row of
        ``self.syn``; lines of *detfile* are stored unsplit in ``self.det``.

        Raises:
            IOError/OSError: if either file cannot be opened or read.
        """
        self.syn = [line.split(' ') for line in self._read_lines(filename)]
        self.det = self._read_lines(detfile)

    # Drop blank tokens left behind by splitting on single spaces.
    def empty(self):
        """Remove empty-string and single-space tokens from every row, in place."""
        for row in self.syn:
            # Slice assignment keeps the same list object for each row.
            row[:] = [word for word in row if word and word != " "]

    # Remove the words of D from A.
    def dete(self):
        """Delete, row by row, the words listed in ``self.det`` from ``self.syn``.

        Deletion rows are split on single spaces first (and left split in
        ``self.det``).  Rows are paired by position; if the two tables differ
        in length the unpaired rows are left untouched.  A listed word that
        is not present in its row is skipped; otherwise its first occurrence
        is removed.
        """
        for idx, entry in enumerate(self.det):
            # Already-split rows are kept as they are, so a second call works.
            if not isinstance(entry, list):
                self.det[idx] = entry.split(' ')
        for row, unwanted in zip(self.syn, self.det):
            for word in unwanted:
                if word and word in row:
                    row.remove(word)

    # Write the result.
    def output(self, filename='nearsyn'):
        """Write every row holding more than one word to *filename*.

        Each word is followed by a single space and each row ends with a
        newline.  Rows with zero or one word are skipped entirely.

        Args:
            filename: destination path; defaults to ``'nearsyn'`` in the
                current working directory.
        """
        with open(filename, 'w') as sink:
            for row in self.syn:
                # Single-word rows carry no synonym information; drop them.
                if len(row) > 1:
                    sink.write(''.join(word + ' ' for word in row) + "\n")
62
if __name__ == '__main__':
    # Script entry point: run the full pipeline on word file 'A' with
    # deletion file 'D', both looked up in the current working directory.
    pipeline = NearSyn()
    pipeline.load('A', 'D')  # read the word rows and the deletion rows
    pipeline.empty()         # strip blank tokens from every row
    pipeline.dete()          # remove the listed words row by row
    pipeline.output()        # write the remaining multi-word rows
摘自 sunrise