# 文本分类（power 8算法挑战赛第五期）

# -*- coding: utf-8 -*-
# created by axuanwu 2015.1.25
# key word: hash  count
import numpy as np
import math

def getseed(str1):
    """Compute the hash fingerprint of a term.

    Every item of *str1* is folded into one integer: the accumulator is
    shifted left by 12 bits for code points above 256 and by 6 bits
    otherwise, then the code point is added.  The result is finally folded
    down with XOR until it fits in 256 bits.

    :param str1: the term to hash (text, or a byte string)
    :return: non-negative integer fingerprint smaller than 2 ** 256
    """
    mask = 2 ** 256 - 1
    h = 0
    for x in str1:
        # Iterating Python 3 bytes yields ints; text yields 1-char strings.
        code = x if isinstance(x, int) else ord(x)
        h <<= 12 if code > 256 else 6
        h += code
    # Keep the number from growing without bound: fold the high bits back in.
    while (h >> 256) > 0:
        h = (h & mask) ^ (h >> 256)
    return h

class MCard():
    """Hash-addressed term dictionary stored in one flat integer array.

    Each term is hashed to ``M_num`` positions of ``MCARD``; the term's
    integer id is written into the positions that are still free.  Id 0
    means "absent", so real ids start at 1 and index the private key list.
    """

    def __init__(self):
        self.M_num = 8  # number of hash positions per term
        self.N_max = 16777216  # length of the card array (2 ** nummax2)
        self.nummax2 = 24  # number of bits of a position
        self.MCARD = [0]  # placeholder until set_card() allocates the array
        self.Opath = ""
        self.index = [0] * self.M_num  # positions from the last getindex()
        self.__keys = ['first_NULL']  # id -> term; slot 0 is a dummy
        self.i_key = 1  # the next new term is stored at id i_key
        self.index2 = [0] * self.M_num  # positions from getindex(for_up=True)

    def get_keys(self, iii=-1):
        """Return all stored terms, or the term with id *iii*.

        :param iii: term id; -1 (default) returns every term except the
            dummy entry at id 0
        :return: list of terms, or a single term
        """
        if iii == -1:
            return self.__keys[1:]
        return self.__keys[iii]

    def flush_key(self, iii):
        """Blank out the stored term of id *iii* (the id stays allocated)."""
        self.__keys[iii] = ""

    def getindex(self, str1, for_up=False):
        """Compute the ``M_num`` hash positions of a term.

        :param str1: the term
        :param for_up: store the positions in ``self.index2`` instead of
            ``self.index``
        """
        seed = getseed(str1)
        mask = self.N_max - 1
        for n in range(0, self.M_num):
            a = 0
            k = (n + 1)
            seed1 = seed
            # NOTE(review): seed is never negative, so this branch is dead.
            # It looks like it was meant to spread short seeds; left
            # untouched because changing it moves every hash position.
            if (seed >> 64) < 0:
                seed1 = seed * (n + 15048796327)
            while seed1 > 0:
                a ^= (seed1 & mask) + k
                # rotate left by k bits inside a nummax2-bit word
                a = ((a << k) & mask) | (a >> (self.nummax2 - k))
                seed1 >>= self.nummax2
            if for_up:
                self.index2[n] = a
            else:
                self.index[n] = a

    def update_card(self, str1):
        """Insert a term into the dictionary.

        The term is skipped when no probe round finds a free position.

        :param str1: the term to store
        """
        # NOTE(review): the source had lost this lookup, so the loop below
        # ran on whatever positions the previous call left in self.index.
        if self.read_card(str1, True) != 0:
            return
        # new term: claim every free position of the selected probe round
        for iii in self.index:
            if self.MCARD[iii] == 0:
                self.MCARD[iii] = self.i_key
        if self.i_key % 10000 == 0:
            print(self.i_key)
        self.i_key += 1
        self.__keys.append(str1)

    # NOTE(review): the "def" line of this method was missing from the
    # source; name and signature are reconstructed from the body - confirm.
    def read_card(self, str1, for_up=False):
        """Look a term up, or find a probe round with room for it.

        Up to ten probe rounds are tried, each hashing ``str1 + str(i)``.
        The positions of the last round tried stay in ``self.index``.

        :param str1: the term
        :param for_up: when true, search for a round that still has a free
            position (used before an insert)
        :return: with ``for_up``: 0 if a round with a free position was
            found, -1 otherwise; without: the id of the term, or 0 when it
            is not stored
        """
        if for_up:
            for i in range(0, 10):  # at most 10 attempts
                i_str1 = str1 + str(i)
                if i > 5:
                    print(i)
                self.getindex(i_str1)
                aaa = min(self.MCARD[self.index])
                if aaa == 0:
                    return 0
            return -1
        for i in range(0, 10):  # resolve at most 10 consecutive collisions
            i_str1 = str1 + str(i)
            self.getindex(i_str1)
            aaa = max(self.MCARD[self.index])
            if aaa == 0:  # not stored
                return 0
            elif aaa < self.N_max:
                if str1 == self.__keys[aaa]:
                    return aaa
        # Every round collided with other terms: the card array may have
        # too few buckets.
        return 0

    def setbase(self, num1=16777216, num2=8):
        """Configure the array length and the number of positions per term.

        :param num1: requested array length; rounded up to a power of two
        :param num2: number of hash positions per term
        :raises ValueError: if *num1* is smaller than 1
        """
        if num1 < 1:
            raise ValueError("num1 must be at least 1")
        # Integer arithmetic instead of ceil(log(num1, 2)): the float
        # logarithm of an exact power of two can come out slightly too
        # large, which rounds up one extra bit and doubles the array.
        self.nummax2 = (int(math.ceil(num1)) - 1).bit_length()
        self.N_max = 2 ** self.nummax2
        self.M_num = num2
        self.index = [0] * num2
        self.index2 = [0] * num2

    def set_card(self, kk=-1, dd=8):
        """Allocate a zeroed card array; everything stored so far is lost.

        :param kk: array length parameter; -1 keeps the configured size
        :param dd: positions per term, only used when *kk* is not -1
        :return: 0 when *kk* is -1, otherwise None
        """
        if -1 == kk:
            # The source had an interactive confirmation prompt after this
            # return; it could never run and has been dropped.
            self.MCARD = np.repeat(0, self.N_max)
            return 0
        self.setbase(kk, dd)
        self.MCARD = np.repeat(0, self.N_max)

    def record_num(self):
        """
        :return: number of terms in the dictionary
        """
        return self.i_key - 1

    def card_test(self):
        """Print two base-10 log figures describing hash collisions.

        The first is log10 of the occupied share of the array, the second
        log10 of (records * positions - occupied) / occupied; both are
        multiplied by the number of positions per term.
        """

        def log10(value):
            # math.log raises on non-positive input; report it instead.
            if value > 0:
                return math.log(value, 10)
            return float("-inf") if value == 0 else float("nan")

        aaa = self.record_num()  # the source read a nonexistent self._record
        bbb = self.N_max
        ccc = int(np.count_nonzero(np.asarray(self.MCARD) > 0))
        ddd = self.M_num
        fill = log10(1.0 * ccc / bbb) * ddd
        if ccc:
            overlap = log10((1.0 * aaa * ddd - ccc) / ccc) * ddd
        else:
            overlap = float("nan")
        print("%s %s" % (fill, overlap))



__author__ = 'axuanwu'
# coding=utf8
import re
import sys
import os
import time
import math
import numpy as np
from myclass import *

def __init__(self):
    """Set up the classifier state.

    NOTE(review): the ``class`` line that owns this method is missing from
    the source, so the method sits at column 0 like the original line.
    """
    self.m_card = MCard()
    self.dict_class = {}  # class directory name -> class index
    self.classify_tongji = np.zeros((3, 9))  # per-class result counters
    self.class_str = []  # class index -> class directory name
    self.m_card.set_card(2 ** 27, 6)
    self.mat_row = 3000000
    self.i_file = 0  # running number of the document being classified
    # class_tail[id] holds the number of the last document that counted
    # term `id`, so a term scores at most once per document.
    self.class_tail = np.zeros(self.mat_row)
    # per-class statistics of the most common fragments (mat_row rows)
    self.word_count = np.zeros((self.mat_row, 9), float)
    self.class_score = np.zeros(9)
    self.root_dir = ""
    self.max_word_length = 5
    self.re_ch = re.compile(u"[\u4E00-\u9FA5]+", re.U)
    # Backslash doubled: same pattern text, but no invalid "\." escape.
    self.re_eng = re.compile(u"[a-zA-Z0-9+\\._@]+", re.U)
    self.fazhi = 3

def set_dict_class(self):
    """Number the class directories found under ``<root_dir>/train``.

    Each entry of that directory gets the next integer index; the mapping
    is stored in ``self.dict_class`` and the reverse list in
    ``self.class_str`` (entries are appended to the existing list).
    """
    file_list = os.listdir(os.path.join(self.root_dir, "train"))
    for i, i_dir in enumerate(file_list):
        self.dict_class[i_dir] = i
        self.class_str.append(i_dir)

def set_fazhi(self):
    """Derive the threshold ``self.fazhi`` from ``canshu.txt``.

    The file (in the current working directory) holds one integer per
    line.  The values are scanned from the last slot backwards, each one
    subtracted from a budget of ``self.mat_row``, until a value no longer
    fits into what is left; the slot where the scan stops, but at least 2,
    becomes the threshold.
    """
    count_my = [0] * 200
    with open(os.path.join(os.getcwd(), "canshu.txt"), "r") as o_file:
        i = 0
        for line in o_file:
            text = line.strip()
            if not text:
                continue  # tolerate blank lines
            if i < len(count_my):
                count_my[i] = int(text)
            else:
                count_my.append(int(text))  # more than 200 lines
            i += 1
    i = len(count_my) - 1
    a = self.mat_row
    # Stop at slot 0: without the bound a small file made the scan wrap
    # around through negative indices.
    while i >= 0 and count_my[i] < a:
        a -= count_my[i]
        i -= 1
    self.fazhi = max([2, i])

def set_root(self, path="C:\\Users\\01053185\\Desktop\\yuliao\\yuliao"):
    """Set the corpus root directory.

    :param path: directory whose ``train`` sub-directory is read by the
        other methods
    """
    self.root_dir = path

# NOTE(review): the "def" line of this method and the statement assigning
# `aaa` were missing from the source; both are reconstructed - confirm the
# method name and that the id comes from m_card.read_card().
def read_dict(self):
    """Load the feature dictionary from ``tong_ji2new.txt``.

    The file is read from the current working directory.  Every line is
    tab separated: the term first, then one number per class.  The term is
    inserted into ``self.m_card`` and its numbers are stored in the row of
    ``self.word_count`` selected by the term's id.  Loading stops once the
    last row of ``self.word_count`` has been filled.
    """
    line_dict = max(self.word_count.shape)
    temp_array = np.zeros((1, 9), float)
    with open(os.path.join(os.getcwd(), "tong_ji2new.txt"), "r") as dict_path:
        for line in dict_path:
            line_s = line.strip().split("\t")
            for j in range(1, len(line_s)):
                temp_array[0, j - 1] = float(line_s[j])
            word = line_s[0]
            if isinstance(word, bytes):  # Python 2 reads byte strings
                word = word.decode("utf-8", "ignore")
            self.m_card.update_card(word)  # every line is a new term
            aaa = self.m_card.read_card(word)
            if aaa == 0:
                continue  # insert failed; row 0 is the "absent" row
            self.word_count[aaa,] = temp_array
            if aaa == line_dict - 1:
                break

def cut_classify2(self, sentence):
    """Score *sentence* using every dictionary term it contains.

    Each run of Chinese characters is scanned from its end: for every end
    position all windows of ``max_word_length`` down to 2 characters are
    looked up, and every hit counts (no maximum-matching cut-off).  Runs
    matched by ``self.re_eng`` are looked up as whole tokens.  A hit adds
    the term's ``word_count`` row to ``self.class_score``, at most once per
    document (tracked through ``self.class_tail`` and ``self.i_file``).

    :param sentence: decoded text of one line
    """
    for blk in re.findall(self.re_ch, sentence):
        for i in range(len(blk), 1, -1):
            # window lengths: longest first, never past the block start
            for j in range(min(self.max_word_length, i), 1, -1):
                # NOTE(review): this lookup was missing from the source;
                # reconstructed as the window that ends at i - confirm.
                index_word = self.m_card.read_card(blk[(i - j):i])
                if index_word == 0:
                    continue
                if self.i_file != self.class_tail[index_word]:
                    self.class_score += self.word_count[index_word,]
                    self.class_tail[index_word] = self.i_file
    for blk in re.findall(self.re_eng, sentence):
        # NOTE(review): lookup reconstructed, see above.
        index_word = self.m_card.read_card(blk)
        if index_word == 0:
            continue  # id 0 means "not in the dictionary"
        if self.i_file != self.class_tail[index_word]:
            self.class_score += self.word_count[index_word,]
            self.class_tail[index_word] = self.i_file

def cut_classify3(self, sentence):
    """Score *sentence* with forward maximum matching.

    Each run of Chinese characters is scanned from its start: at every
    position the longest dictionary term of ``max_word_length`` down to 2
    characters is taken and the scan jumps behind it; without a hit it
    moves on by one character.  Runs matched by ``self.re_eng`` are looked
    up as whole tokens.  A hit adds the term's ``word_count`` row to
    ``self.class_score``, at most once per document (tracked through
    ``self.class_tail`` and ``self.i_file``).

    :param sentence: decoded text of one line
    """
    for blk in re.findall(self.re_ch, sentence):
        len_blk = len(blk)
        i = 0
        # "<=": the source used "i < len_blk - 2", which never tried the
        # last two-character window (and skipped 2-character runs).
        while i <= (len_blk - 2):
            step = 1
            # window lengths: longest first, never past the block end
            for j in range(min(self.max_word_length, len_blk - i), 1, -1):
                # NOTE(review): this lookup was missing from the source;
                # reconstructed as the window that starts at i - confirm.
                index_word = self.m_card.read_card(blk[i:(i + j)])
                if index_word == 0:
                    continue
                if self.i_file != self.class_tail[index_word]:
                    self.class_score += self.word_count[index_word,]
                    self.class_tail[index_word] = self.i_file
                step = j
                break
            i += step
    for blk in re.findall(self.re_eng, sentence):
        # NOTE(review): lookup reconstructed, see above.
        index_word = self.m_card.read_card(blk)
        if index_word == 0:
            continue  # id 0 means "not in the dictionary"
        if self.i_file != self.class_tail[index_word]:
            self.class_score += self.word_count[index_word,]
            self.class_tail[index_word] = self.i_file

def cut_classify(self, sentence):
    """Score *sentence* with backward maximum matching.

    Each run of Chinese characters is scanned from its end: at every end
    position the longest dictionary term of ``max_word_length`` down to 2
    characters is taken and the scan jumps in front of it; without a hit
    it moves back by one character.  Runs matched by ``self.re_eng`` are
    looked up as whole tokens.  A hit adds the term's ``word_count`` row to
    ``self.class_score``, at most once per document (tracked through
    ``self.class_tail`` and ``self.i_file``).

    :param sentence: decoded text of one line
    """
    for blk in re.findall(self.re_ch, sentence):
        i = len(blk)
        while i >= 2:
            step = 1
            # window lengths: longest first, never past the block start
            for j in range(min(self.max_word_length, i), 1, -1):
                # NOTE(review): this lookup was missing from the source;
                # reconstructed as the window that ends at i - confirm.
                index_word = self.m_card.read_card(blk[(i - j):i])
                if index_word == 0:
                    continue
                if self.i_file != self.class_tail[index_word]:
                    self.class_score += self.word_count[index_word,]
                    self.class_tail[index_word] = self.i_file
                step = j
                break
            i -= step
    for blk in re.findall(self.re_eng, sentence):
        # NOTE(review): lookup reconstructed, see above.
        index_word = self.m_card.read_card(blk)
        if index_word == 0:
            continue  # id 0 means "not in the dictionary"
        if self.i_file != self.class_tail[index_word]:
            self.class_score += self.word_count[index_word,]
            self.class_tail[index_word] = self.i_file

# NOTE(review): the "def" line of this method was missing from the source;
# the name is reconstructed - confirm.
def classify_read(self):
    """Classify every file under ``<root_dir>/train`` and tally the result.

    Each file is read line by line (decoded as GBK) and scored with
    ``cut_classify3``; the first class with the highest score is the
    prediction.  One line per file - path, predicted class, 1 or 0 for
    right or wrong - is written to ``class_result.txt`` in the current
    working directory.  ``self.classify_tongji`` collects per class the
    number of predictions (row 0), the correct ones (row 1) and their
    ratio (row 2, NaN for a class that was never predicted).
    """
    class_result = os.path.join(os.getcwd(), "class_result.txt")
    train_root = os.path.join(self.root_dir, "train")
    n_class = self.word_count.shape[1]
    with open(class_result, "w") as o_file:
        for sdir in os.listdir(train_root):
            dir_path = os.path.join(train_root, sdir)
            for files in os.listdir(dir_path):
                self.i_file += 1
                file_path = os.path.join(dir_path, files)
                self.class_score = np.zeros(n_class)
                # Binary mode keeps the explicit GBK decode valid on both
                # Python 2 and 3; the handle is closed after every file.
                with open(file_path, "rb") as i_file:
                    for line in i_file:
                        self.cut_classify3(line.decode("gbk", 'replace').strip())
                best = int(np.argmax(self.class_score))  # first maximum
                predicted = self.class_str[best]
                column = self.dict_class[predicted]
                self.classify_tongji[0, column] += 1
                if sdir == predicted:
                    o_file.write(file_path + "\t" + predicted + "\t" + "1\n")
                    self.classify_tongji[1, column] += 1
                else:
                    o_file.write(file_path + "\t" + predicted + "\t" + "0\n")
    # NumPy does not raise on division by zero, so the former bare
    # "except" never ran; silence the warning and keep the NaN results.
    with np.errstate(divide="ignore", invalid="ignore"):
        self.classify_tongji[2,] = self.classify_tongji[1,] / self.classify_tongji[0,]

if __name__ == "__main__":
    # NOTE(review): the statement creating `my_classify` is missing from
    # the source, and the two identical timing lines suggest that calls
    # between them (dictionary loading, classification) were lost as well.
    # Restore them before running this script.
    my_classify.set_root()
    a = time.time()
    my_classify.set_dict_class()
    # set_fazhi() was deliberately left disabled by the author.
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("time is : %s s" % (time.time() - a))
    print("time is : %s s" % (time.time() - a))
    print(my_classify.classify_tongji)


07-16

11-03 814

11-30 9433

06-14 1946

01-09 342

07-16

04-05 237

03-24 9245

#### 面向文本分类的特征工程——kaggle文本分类比赛

©️2020 CSDN 皮肤主题: 编程工作室 设计师: CSDN官方博客

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、C币套餐、付费专栏及课程。