一、算法模块
- 增加了获取某个关键词的数据源分布的方法
- 增加了获取某个关键词的相关关键词的方法
- 增加了无关词词库库容,使分词筛选更加精准
- 优化了用于储存单条数据的数据结构,现在能更好地与数据库模块对接
二、数据库模块
完成了如下基本功能:
- 单条数据(包含url、文本信息、时间戳、分词、来源等)的读写
- 用户信息的读写
- 单条热词数据(包含出现总次数、30日内每日出现次数、12个月内每月出现次数、10年内每年出现次数)的初始化、修改、维护
- 被爬取链接的增删查
代码如下:
import atexit
import os

import pymysql

import MyNLP
# Open the database connection.
# Each setting defaults to the value that used to be hard-coded here, but can
# be overridden through an environment variable so that real credentials do
# not have to live in the source file.
db = pymysql.Connect(
    host=os.environ.get("BS_DB_HOST", "localhost"),
    port=int(os.environ.get("BS_DB_PORT", "3306")),
    user=os.environ.get("BS_DB_USER", "root"),
    passwd=os.environ.get("BS_DB_PASSWORD", "123456"),
    db=os.environ.get("BS_DB_NAME", "bs"),
    # The stored text may be non-ASCII (e.g. Chinese), so request a
    # full-Unicode connection charset explicitly instead of relying on the
    # driver's default.
    charset="utf8mb4",
)
# Create a cursor object with cursor(); every helper in this module shares it.
cursor = db.cursor()
def set_InfoKW(InfoKW):
    """Insert one record into the ``InfoKW`` table and commit.

    Args:
        InfoKW: record object. This function reads its ``url``, ``text``,
            ``time`` and ``source`` attributes and calls its
            ``get_KW_str()`` method; ``time`` is stored formatted as
            ``YYYY-MM-DD``.
    """
    sql = "insert into InfoKW values (%s, %s, %s, %s, %s)"
    params = (
        InfoKW.url,
        InfoKW.text,
        InfoKW.time.strftime("%Y-%m-%d"),
        InfoKW.source,
        InfoKW.get_KW_str(),
    )
    # Let the driver bind the values: text containing quotes no longer breaks
    # the statement and cannot inject SQL.
    cursor.execute(sql, params)
    # Commit on the module-level connection object, which is named ``db``.
    db.commit()
def get_InfoKW(url):
    """Load the ``InfoKW`` record stored for *url*.

    Args:
        url: value matched against the ``url`` column.

    Returns:
        A ``MyNLP.Info_kw`` built from the first five columns of the matching
        row, with the fifth column split on commas, or ``None`` when no row
        matches.
    """
    # (url,) is a real one-element tuple; a bare (url) is just the string.
    cursor.execute("select * from InfoKW where url = %s", (url,))
    # Work on a single row: indexing the fetchall() result would pick rows,
    # not columns.
    row = cursor.fetchone()
    if row is None:
        return None
    # An empty keyword string means "no keywords", not one empty keyword.
    keywords = row[4].split(",") if row[4] else []
    return MyNLP.Info_kw(row[0], row[1], row[2], row[3], keywords)
def set_User(mail, pw):
    """Insert a ``(mail, pw)`` row into the ``User`` table and commit.

    Args:
        mail: value for the first column of ``User``.
        pw: value for the second column of ``User``.
    """
    # NOTE(review): pw is stored exactly as passed in; if callers pass a
    # plaintext password it should be hashed before it reaches this function.
    # Values are bound by the driver rather than spliced into the SQL text.
    cursor.execute("insert into User values (%s, %s)", (mail, pw))
    # Commit on the module-level connection object, which is named ``db``.
    db.commit()
def get_User(mail):
    """Look up the stored ``pw`` for *mail* in the ``User`` table.

    Args:
        mail: value matched against the ``mail`` column.

    Returns:
        The first matching row as produced by the cursor (a one-column row
        holding ``pw``), or ``None`` when no user has that mail.
    """
    # (mail,) is a real one-element tuple; the driver binds and escapes it.
    cursor.execute("select pw from User where mail = %s", (mail,))
    # fetchone() yields the same first row as before, but returns None rather
    # than raising IndexError when the result set is empty.
    return cursor.fetchone()
def init_KW(kw):
    """Create the ``KW`` row for a keyword with every counter set to zero.

    The row holds the keyword, a total of 0 and three comma-separated strings
    of zeros (the day, month and year counters).

    Args:
        kw: the keyword to initialise.
    """
    sql = "insert into KW values (%s, 0, '0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0', '0,0,0,0,0,0,0,0,0,0,0,0,0', '0,0,0,0,0,0,0,0,0,0,0')"
    # Bind the parameter that is actually named ``kw`` (the old body referred
    # to an undefined ``KW``) and let the driver escape it.
    cursor.execute(sql, (kw,))
    # Commit on the module-level connection object, which is named ``db``.
    db.commit()
def _merge_counts(stored, increments, size):
    """Add ``increments[0:size]`` to the comma-separated counters in *stored*.

    Returns the summed counters as a comma-separated string.
    """
    current = stored.split(",")
    merged = [int(current[i]) + int(increments[i]) for i in range(size)]
    # join() emits no trailing separator, so nothing has to be sliced off.
    return ",".join(str(value) for value in merged)


def add_KW(KW, num, days, months, years):
    """Add new occurrence counts to the stored counters of a keyword.

    Reads the ``KW`` row for the keyword, adds the supplied counts to the
    stored total and to each stored day/month/year counter, then writes the
    row back and commits. The argument sequences are not modified.

    Args:
        KW: the keyword whose row is updated.
        num: amount added to the stored total.
        days: at least 31 per-day counts to add.
        months: at least 13 per-month counts to add.
        years: at least 11 per-year counts to add.

    Raises:
        LookupError: if the keyword has no row in ``KW`` yet.
        IndexError: if a sequence is shorter than its counter list.
    """
    cursor.execute("select * from KW where kw = %s", (KW,))
    # Work on a single row: indexing the fetchall() result would pick rows,
    # not columns.
    row = cursor.fetchone()
    if row is None:
        raise LookupError("keyword has no KW row: %r" % (KW,))

    total = num + int(row[1])
    day = _merge_counts(row[2], days, 31)
    month = _merge_counts(row[3], months, 13)
    year = _merge_counts(row[4], years, 11)

    # The former statement ended in a stray ')' and was built with string
    # formatting; bind the values through the driver instead.
    sql = "update KW set num = %s, days = %s, months = %s, years = %s where kw = %s"
    cursor.execute(sql, (total, day, month, year, KW))
    # Commit on the module-level connection object, which is named ``db``.
    db.commit()
#def maintain_KW():
def get_url():
    """Fetch every row of the ``url`` table.

    Returns:
        list: one entry per row, in the order the cursor returns them. Each
        entry is the row object as produced by the cursor (not unpacked).
    """
    cursor.execute("select * from url")
    return [row for row in cursor.fetchall()]
def sef_url(url):
    """Insert *url* as a new row of the ``url`` table and commit.

    NOTE(review): the name looks like a typo of ``set_url``; it is kept
    unchanged so existing callers keep working.

    Args:
        url: the link to store.
    """
    # (url,) is a real one-element tuple; the driver binds and escapes it.
    cursor.execute("insert into url values (%s)", (url,))
    # Commit on the module-level connection object, which is named ``db``.
    db.commit()
def del_url(url):
    """Delete every row of the ``url`` table whose ``url`` equals *url*.

    Args:
        url: the link to remove.
    """
    # (url,) is a real one-element tuple; the driver binds and escapes it.
    cursor.execute("delete from url where url = %s", (url,))
    # Commit on the module-level connection object, which is named ``db``.
    db.commit()
# Close the database connection when the interpreter exits.
# Calling db.close() right here would run as soon as the module is loaded and
# leave every helper above with an already-closed connection.
atexit.register(db.close)
三、web后端
Web 后端将基于 Python 的 socket 库实现。由于以前多用 Java 开发,目前 Python 相关部分还在学习中。