数据的来源:
爬虫、购买、实时数据、网上公开的数据集
数据处理成指定的格式
筛选数据
输入模型,进行训练
调优
保存模型
load开头的是小数据集(随库自带,本地直接加载)
fetch开头的是大数据集(需联网下载)
make开头的是本地生成的模拟数据集
# scikit-learn dataset loading demo.
#   load_*  -> small datasets bundled with sklearn (loaded locally)
#   fetch_* -> large datasets downloaded over the network on first use
#   make_*  -> synthetic datasets generated locally
# Explicit imports instead of the wildcard `import *` (avoids namespace pollution).
from sklearn.datasets import load_iris, fetch_20newsgroups, make_classification

# Iris dataset: small, bundled with sklearn — no network needed.
data = load_iris()
print(data)
# Feature names
print(data.feature_names)
# Feature array
print(data.data)
# Target (class) names
print(data.target_names)
# Target array
print(data.target)

# 20 newsgroups dataset.
# NOTE: this is a fetch_* dataset — it is downloaded from the internet
# on first call (then cached locally), not a purely local load.
data = fetch_20newsgroups()
print(data)

# Synthetic classification dataset generated locally.
data = make_classification()
print(data[0])  # feature matrix
print(data[1])  # target vector
爬虫实例: 抓取招聘页面并生成岗位要求词云
import requests
from lxml import etree
import matplotlib.pyplot as plt
import jieba
import nltk
from wordcloud import WordCloud, ImageColorGenerator
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from PIL import Image
import numpy as np

# Crawl a recruitment page, extract the job-requirement text, and render
# a word cloud of the most frequent terms on top of a background image.
url = 'http://www.skeyedu.com/gw/recruitment/recruitment.html'
headers = {
    # Send a browser User-Agent so the site does not reject the request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
# GET request. (response.text is the decoded HTML; response.content would be
# the raw bytes — that is what you'd use to download images.)
response = requests.get(url=url, headers=headers)
html_str = response.text
# Parse the HTML string into an lxml document for XPath queries.
html_doc = etree.HTML(html_str)
# Extract the job titles.
data = html_doc.xpath("//span[@class='zhaopin_zw']/text()")
# Extract the job-requirement paragraphs.
data1 = html_doc.xpath("//p[@class='zhaopin_yq']")
context = []
for node in data1:
    text = node.xpath("text()")[0]
    # Strip whitespace/control characters and the literal label '要求'
    # ("requirements") so only the requirement body remains.
    for junk in ('\r', '\n', ' ', '\t', '要求'):
        text = text.replace(junk, '')
    context.append(text)
print(context)

# Tokenize with jieba; keep only tokens longer than one character to drop
# single-character noise words.
list_word = [word for word in jieba.cut(''.join(context)) if len(word) > 1]
# Word-frequency statistics.
freq_dist = nltk.FreqDist(list_word)
print(freq_dist)
top_word = freq_dist.most_common(10)
print(top_word)

# Vectorize the background image; its non-white area becomes the cloud mask.
bg_img = np.array(Image.open('./下载.JPG'))
# Raw string so the Windows path's backslashes are never treated as escape
# sequences (the original relied on '\W', '\F', '\s' being invalid escapes).
wc = WordCloud(font_path=r'C:\Windows\Fonts\simhei.ttf',
               background_color='white',
               max_words=100,
               mask=bg_img,
               max_font_size=80)
# Build the cloud directly from the frequency distribution.
wc.generate_from_frequencies(freq_dist)
# Recolor the cloud using the background image's own colors.
img_color = ImageColorGenerator(bg_img)
plt.figure()
plt.imshow(wc.recolor(color_func=img_color))
plt.show()
# wc.to_file('aaa.jpg')