# coding=utf-8
"""
author: lei
function: k-nearest neighbors (Euclidean distance — the straight-line
distance between two points). Features must be standardized first.
"""
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import pandas as pd
def knncls(data_path="./train.csv"):
    """Predict user check-in locations with k-nearest neighbors.

    Loads check-in records from *data_path*, engineers time-based
    features, drops rare target places, standardizes the features and
    fits a grid-searched KNN classifier, printing evaluation results.

    :param data_path: path to the CSV of check-in records
        (expected columns: row_id, x, y, time, place_id).
    :return: None
    """
    # Read the raw data (original passed "./", a directory — now a parameter).
    data = pd.read_csv(data_path)
    print(data.head(10))

    # 1. Shrink the data set: keep only check-ins inside a small square.
    data = data.query("x>1.0 & x<1.25 & y>2.5 & y<2.75")

    # Convert the unix timestamp so calendar features can be derived.
    time_value = pd.to_datetime(data["time"], unit="s")
    print(time_value)

    # Expand the timestamp into extra features.
    time_value = pd.DatetimeIndex(time_value)
    data["day"] = time_value.day
    data["hour"] = time_value.hour       # fixed typo: was "hoor"
    data["weekday"] = time_value.weekday  # fixed: DatetimeIndex has no .weekend

    # Drop the raw timestamp column. pandas drop() is NOT in-place by
    # default, so the result must be reassigned (original discarded it).
    data = data.drop(["time"], axis=1)

    # Remove target places with 3 or fewer check-ins.
    place_count = data.groupby("place_id").count()
    tf = place_count[place_count.row_id > 3].reset_index()
    data = data[data["place_id"].isin(tf.place_id)]

    # Split features and target.
    y = data["place_id"]
    x = data.drop(["place_id"], axis=1)

    # Train/test split. Fixed: original used test_size=0.75, which
    # trained on only 25% of the data; conventional 25% hold-out now.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Standardize train and test features with a scaler fitted on the
    # training data only.
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    # Fixed: original did `y_train = std.transform(y_train)`, scaling the
    # labels and leaving x_test unscaled — predictions would be garbage.
    x_test = std.transform(x_test)

    # Grid-search the number of neighbors with 2-fold cross-validation.
    knn = KNeighborsClassifier()
    param = {"n_neighbors": [3, 5, 10]}
    gc = GridSearchCV(knn, param_grid=param, cv=2)
    gc.fit(x_train, y_train)

    # Report results.
    print("在测试集上的准确率:", gc.score(x_test, y_test))
    print("在交叉验证当中最好的模型:", gc.best_score_)
    print("最好的模型:", gc.best_estimator_)
    print("最好的结果:", gc.cv_results_)
def naviebayes():
“”"
朴素贝叶斯进行文本分类
训练集误差大,结果肯定不好
不需要调参
:return:
“”"
news = fetch_20newsgroups(subset=“all”)
# 进行数据分割
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.75)
# 对数据集进行
tf = TfidfVectorizer()
# 以训练集中的词的列表进行每篇文章重要性统计
x_train = tf.fit_transform(x_train)
x_test = tf.transform(x_test)
# 进行朴素贝叶斯的预测
mlt = MultinomialNB(alpha=1.0)
mlt.fit(x_train, y_train)
print(x_train.toarray())
y_predict = mlt.predict(x_test)
# 预测的结果
print(y_predict)
# 得出准确率
print(mlt.score(x_test, y_test))
# 得到准确率和召回率
print(classification_report(y_test, y_predict, target_names=news.target_names))
return None
# Script entry point. Fixed: original had `if name == 'main':` (missing
# dunder underscores and curly quotes), so the guard never worked.
if __name__ == "__main__":
    # knncls()
    naviebayes()