# 1. 频率归一化词库 (frequency-normalized word lexicon)
#!/usr/bin/python
#-*- coding:UTF-8 -*-
from __future__ import division
import MySQLdb as mdb
#import chardet
# Open the MySQL connection and the single cursor that the rest of the
# script uses for every query.
con = mdb.connect(
    host='localhost',
    user='root',
    passwd='zxwxwz',
    db='mysql_test',
    charset='utf8',
)
cur = con.cursor()

# One CREATE TABLE statement per output table.  Each table has the same
# layout: auto-increment integer id, keyword (varchar 128, not null),
# nature (varchar 20) and frequency (float).  "if not exists" makes the
# statements safe to re-run.
_create_statements = (
    "create table if not exists nor_word_ansj(id int(10) not null primary key auto_increment, keyword varchar(128) character set utf8 not null,nature varchar(20) character set utf8,frequency float(20))default charset=utf8;",
    "create table if not exists nor_word_celebrity(id int(10) not null primary key auto_increment,keyword varchar(128) character set utf8 not null,nature varchar(20) character set utf8,frequency float(20))default charset=utf8;",
    "create table if not exists nor_word_sougou(id int(10) not null primary key auto_increment,keyword varchar(128) character set utf8 not null,nature varchar(20) character set utf8,frequency float(20))default charset=utf8;",
)
for _ddl in _create_statements:
    cur.execute(_ddl)
# Normalize the frequencies of every row in sougou_word and store the
# result in nor_word_sougou.  Each row's frequency is divided by the sum of
# all frequencies in the table, so the stored values add up to ~1.
cur.execute("select * from sougou_word")
rows = cur.fetchall()

# Columns are read by position: row[0] = keyword, row[1] = frequency,
# row[2] = nature.
num_freq = sum(row[1] for row in rows)

# When the table is empty or every frequency is zero there is nothing to
# normalize; skipping the inserts avoids a ZeroDivisionError.
if num_freq:
    # Values are bound by the driver instead of being spliced into the SQL
    # text, so keywords containing quotes or backslashes can neither break
    # the statement nor inject SQL, and a NULL nature is stored as NULL.
    insert_sql = ("insert into nor_word_sougou(keyword,nature,frequency) "
                  "values(%s,%s,%s)")
    for row in rows:
        keyword, frequency, nature = row[0], row[1], row[2]
        # True division relies on "from __future__ import division" at the
        # top of the file.  The ratio is rounded to 8 decimal places, the
        # same precision the statement has always written.
        normalized = "%.8f" % (frequency / num_freq)
        cur.execute(insert_sql, (keyword, nature, normalized))
# Source tables to normalize and, at the same index, the table that
# receives the normalized rows.
table_ori = (
    'ansj_seg_default_dic',
    'celebrity',
)
table_new = (
    'nor_word_ansj',
    'nor_word_celebrity',
)

# Width of the id range fetched from a source table in one query.
limit_num = 10000
for i in range(0, 2):
cur.execute("select count(*) from %s " % table_ori[i])
num_line=cur.fetchone()
clc_num=int(num_line[0]/limit_num)
for j in range(0,clc_num):
cur.execute("select * from %s where id>(%d)*(%d)&&id<=(%d)*(%d)" % (table_ori[i],limit_num,j,limit_num,j+1))
rows = cur.fetchall()
num_freq=0;
for row in rows:
keyword=row[0]
nature= row[1]
frequency=row[2]
num_freq=num_freq+frequency
for row in rows:
keyword=row[0]
nature= row[1]
frequency=row[2]
cur.execute("insert into %s(keyword,nature,frequency) values('%s','%s','%.8f')" % (table_new[i],keyword.encode('utf-8'),nature.encode('utf-8'),frequency/num_freq))
con.commit()
cur.execute("select * from %s where id>(%d)*(%d) " % (table_ori[i],clc_num,limit_num))
rows = cur.fetchall()
num_freq=0;