# 机器学习-实践五:明星图片爬取 (ML Practice 5: celebrity image scraping)
def getPicinfo(url):
    """GET *url* with browser-like headers.

    Returns the response body text on HTTP 200, otherwise None.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
        "Accept": "*/*",
        "Referer": "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=%E4%B8%AD%E5%9B%BD%E8%89%BA%E4%BA%BA&fenlei=256&rsv_pq=cf6f24c500067b9f&rsv_t=c2e724FZlGF9fJYeo9ZV1I0edbhV0Z04aYY%2Fn6U7qaUoH%2B0WbUiKdOr8JO4&rqlang=cn&rsv_dl=ib&rsv_enter=1&rsv_sug3=15&rsv_sug1=6&rsv_sug7=101",
        "Host": "sp0.baidu.com",
        "Connection": "keep-alive",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6",
        "Accept-Encoding": "gzip, deflate"
    }
    # BUG FIX: the original called requests.get(url, headers), which binds the
    # dict to the second positional parameter `params` (query string), so the
    # custom headers were never sent. They must be passed by keyword.
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None
# Directory where downloaded images are stored.
Download_dir = 'picture'
if not os.path.exists(Download_dir):
    os.mkdir(Download_dir)

pn_num = 1   # number of result pages to fetch
rn_num = 10  # results per page

for k in range(pn_num):
    url = "https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?resource_id=28266&from_mid=1&&format=json&ie=utf-8&oe=utf-8&query=%E4%B8%AD%E5%9B%BD%E8%89%BA%E4%BA%BA&sort_key=&sort_type=1&stat0=&stat1=&stat2=&stat3=&pn=" + str(
        k) + "&rn=" + str(rn_num) + "&_=1613785351574"
    res = getPicinfo(url)
    # FIX: getPicinfo returns None on non-200 responses; the original would
    # crash inside json.loads in that case.
    if res is None:
        continue
    json_str = json.loads(res)
    figs = json_str['data'][0]['result']
    for i in figs:
        name = i['ename']
        img_url = i['pic_4n_78']
        img_res = requests.get(img_url)
        if img_res.status_code == 200:
            # Derive the file extension from the Content-Type header
            # (e.g. "image/jpeg" -> "jpeg").
            ext_str_splits = img_res.headers['Content-Type'].split('/')
            ext = ext_str_splits[-1]
            fname = name + "." + ext
            # FIX: close the file handle deterministically instead of
            # open(...).write(...).
            with open(os.path.join(Download_dir, fname), 'wb') as fh:
                fh.write(img_res.content)
            print(name, img_url, "saved")
# 机器学习-实践七:科比职业生涯数据爬取与分析 (ML Practice 7: scraping & analyzing Kobe's career stats)
# Notebook shell commands (run in the AI Studio environment, not plain Python):
# !pip install bs4
# !cp /home/aistudio/work/simhei.ttf /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/
# !cp /home/aistudio/work/simhei.ttf .fonts/
# !rm -rf .cache/matplotlib
import requests
from bs4 import BeautifulSoup
import csv
import matplotlib.pyplot as plt
import pandas as pd
plt.rcParams['font.sans-serif'] = ['simhei']
plt.rcParams['axes.unicode_minus']=False
plt.rcParams['figure.dpi'] = 100
'''
Created on 2021年02月20日
@author: zhongshan
'''
def getKobeList(code):
    """Fetch the raw stat-box HTML page for Kobe (player id 195).

    code selects the table: "season", "playoff" or "allstar".
    """
    page_url = "http://www.stat-nba.com/player/stat_box/195_{}.html".format(code)
    return requests.get(page_url).text
def getRow(resKobe, code):
    """Parse the stat-box HTML into a list of rows (header row first).

    resKobe: HTML text returned by getKobeList.
    code: "season", "playoff" or "allstar" — selects the matching header;
          any other value yields an empty header row.
    Cells whose `rank` attribute is missing or equals "LAL" are skipped.
    """
    # One header per table layout; replaces the original copy-pasted
    # if-chain with a single lookup.
    headers = {
        "season": ["赛季", "出场", "首发", "时间", "投篮", "命中", "出手", "三分", "命中", "出手", "罚球", "命中", "出手", "篮板", "前场", "后场", "助攻",
                   "抢断", "盖帽", "失误", "犯规", "得分", "胜", "负"],
        "playoff": ["赛季", "出场", "时间", "投篮", "命中", "出手", "三分", "命中", "出手", "罚球", "命中", "出手", "篮板", "前场", "后场", "助攻", "抢断",
                    "盖帽", "失误", "犯规", "得分", "胜", "负"],
        "allstar": ["赛季", "首发", "时间", "投篮", "命中", "出手", "三分", "命中", "出手", "罚球", "命中", "出手", "篮板", "前场", "后场", "助攻", "抢断",
                    "盖帽", "失误", "犯规", "得分"],
    }
    soup = BeautifulSoup(resKobe, "html.parser")
    table = soup.find_all(id='stat_box_avg')
    rows = [headers.get(code, [])]
    for tr in table[0].find_all("tr", class_="sort"):
        row = []
        for td in tr.find_all("td"):
            rank = td.get("rank")
            # FIX: compare to None with `is not`; also drop the stray
            # semicolon the original carried.
            if rank is not None and rank != "LAL":
                row.append(td.get_text())
        rows.append(row)
    return rows
def writeCsv(rows, dir):
    """Write *rows* (iterable of row iterables) as CSV to path *dir*.

    utf-8-sig adds a BOM so Excel opens the Chinese headers correctly.
    """
    with open(dir, 'w', encoding='utf-8-sig', newline='') as out:
        csv.writer(out).writerows(rows)
# Scrape each stat table and persist it as CSV. "allstar" is written to
# star.csv to match the file names the plotting code expects.
for code, csv_name in [("season", "season.csv"),
                       ("playoff", "playoff.csv"),
                       ("allstar", "star.csv")]:
    resKobe = getKobeList(code)
    rows = getRow(resKobe, code)
    writeCsv(rows, csv_name)
    print(csv_name, "saved")
def show_score(game_name='season', item='篮板', plot_name='line'):
    """Chart one Kobe stat, or rebounds/assists/points together.

    game_name: which CSV to read ('season', 'playoff' or 'star').
    item: column name to plot, or 'all' for 篮板/助攻/得分 combined.
    plot_name: 'line' or 'bar'; any other value is a silent no-op.
    Saves the figure under work/ and then shows it.
    """
    file_name = game_name+'.csv'
    data = pd.read_csv(file_name)
    # Seasons are stored newest-first; reverse for a chronological x axis.
    X= data['赛季'].values.tolist()
    X.reverse()
    if item=='all':
        Y1 = data['篮板'].values.tolist()
        Y2 = data['助攻'].values.tolist()
        Y3 = data['得分'].values.tolist()
        Y1.reverse()
        Y2.reverse()
        Y3.reverse()
    else:
        Y = data[item].values.tolist()
        Y.reverse()
    if plot_name=='line':
        if item=='all':
            plt.plot(X,Y1,c='r',linestyle="-.")
            plt.plot(X,Y2,c='g',linestyle="--")
            plt.plot(X,Y3,c='b',linestyle="-")
            legend=['篮板','助攻','得分']
        else:
            plt.plot(X,Y,c='g',linestyle="-")
            legend=[item]
    elif plot_name=='bar':
        if item=='all':
            # Three side-by-side bar subplots, one per stat.
            fig = plt.figure(figsize=(15,5))
            ax1 = plt.subplot(131)
            plt.bar(X,Y1,facecolor = '#9999ff',edgecolor = 'white')
            plt.legend(['篮板'])
            plt.title('Kobe职业生涯数据分析:'+game_name)
            plt.xticks(rotation=60)
            plt.ylabel('篮板')
            ax2 = plt.subplot(132)
            plt.bar(X,Y2,facecolor = '#999900',edgecolor = 'white')
            plt.legend(['助攻'])
            plt.title('Kobe职业生涯数据分析:'+game_name)
            plt.xticks(rotation=60)
            plt.ylabel('助攻')
            ax3 = plt.subplot(133)
            plt.bar(X,Y3,facecolor = '#9988ff',edgecolor = 'white')
            # NOTE: unlike the first two subplots, the third one gets its
            # legend/title/labels from the shared code after this if-chain.
            legend=['得分']
        else:
            plt.bar(X,Y,facecolor = '#9900ff',edgecolor = 'white')
            legend=[item]
    else:
        # Unknown plot type: nothing to draw.
        return
    plt.legend(legend)
    plt.title('Kobe职业生涯数据分析:'+game_name)
    plt.xticks(rotation=60)
    plt.xlabel('赛季')
    if item!='all':
        plt.ylabel(item)
    else:
        plt.ylabel('得分')
    plt.savefig('work/Kobe职业生涯数据分析_{}_{}.png'.format(game_name,item))
    plt.show()
# Produce every chart combination for each data set. The original's
# `game_name = 'season'` assignment was dead (immediately overwritten by
# the loop) and the eight explicit calls are replaced by loops in the
# same call order: bar x 3 stats, line x 3 stats, then the combined charts.
for game_name in ['season', 'playoff', 'star']:
    for plot_name in ['bar', 'line']:
        for item in ['篮板', '助攻', '得分']:
            show_score(game_name=game_name, item=item, plot_name=plot_name)
    show_score(game_name=game_name, item='all', plot_name='bar')
    show_score(game_name=game_name, item='all', plot_name='line')
# 机器学习-实践六:股票行情爬取与分析 (ML Practice 6: stock quote scraping & analysis)
'''
Created on 2021年02月20日
@author: zhongshan
'''
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import json
import csv
def getHtml(url):
    """GET *url* with a random browser User-Agent and return the body text."""
    resp = requests.get(url, headers={'User-Agent': UserAgent().random})
    # Let requests sniff the real encoding from the payload — the quote
    # endpoints do not always declare it in the headers.
    resp.encoding = resp.apparent_encoding
    return resp.text
num = 20  # page size requested from the API (matches pz=20 in the URL)
stockUrl = 'http://99.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112408733409809437476_1623137764048&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:80&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1623137764167:formatted'

if __name__ == '__main__':
    responseText = getHtml(stockUrl)
    # The endpoint returns JSONP: jQuery...({...}); strip the callback
    # wrapper to get at the JSON payload.
    jsonText = responseText.split("(")[1].split(")")[0]
    resJson = json.loads(jsonText)
    datas = resJson["data"]["diff"]
    # f12 = stock code, f14 = stock name.
    datalist = [[data["f12"], data["f14"]] for data in datas]
    print(datalist)
    # FIX: use a context manager so the file is closed even on error.
    with open('stock.csv', 'w+', encoding='utf-8', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(('代码', '名称'))
        for data in datalist:
            # The trailing tab keeps numeric codes as text (leading zeros
            # preserved) when the CSV is opened in Excel.
            writer.writerow((data[0] + "\t", data[1] + "\t"))
import csv
import urllib.request as r
import threading
def getStockList():
    """Read stock.csv back as a list of [code, name] rows, header included."""
    with open('stock.csv', 'r', encoding='utf-8') as src:
        return [row for row in csv.reader(src)]
def downloadFile(url, filepath):
    """Best-effort download of *url* to *filepath*.

    Errors are printed rather than raised so one failed stock does not
    abort the whole batch. NOTE(review): the completion message prints
    even after a failure — kept to preserve the original output.
    """
    try:
        r.urlretrieve(url, filepath)
    except Exception as e:
        print(e)
    print(filepath, "is downloaded")
# Limit concurrent downloads to one; raise the count for real parallelism.
sem = threading.Semaphore(1)

def downloadFileSem(url, filepath):
    """Semaphore-guarded wrapper around downloadFile."""
    sem.acquire()
    try:
        downloadFile(url, filepath)
    finally:
        sem.release()
urlStart = 'http://quotes.money.163.com/service/chddata.html?code='
urlEnd = '&end=20210221&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;VOTURNOVER;VATURNOVER'

if __name__ == '__main__':
    stockList = getStockList()
    stockList.pop(0)  # drop the header row
    print(stockList)
    threads = []
    for s in stockList:
        scode = str(s[0].split("\t")[0])
        # Netease prefixes Shanghai codes (6xxxxx) with "0" and others with "1".
        url = urlStart + ("0" if scode.startswith('6') else "1") + scode + urlEnd
        print(url)
        filepath = (str(s[1].split("\t")[0]) + "_" + scode) + ".csv"
        t = threading.Thread(target=downloadFileSem, args=(url, filepath))
        t.start()
        threads.append(t)
    # FIX: wait for every download so the analysis code further down sees
    # complete files instead of racing the worker threads.
    for t in threads:
        t.join()
import pandas as pd
import matplotlib.pyplot as plt
import csv
# Chinese-capable font and correct minus glyphs for the stock charts below.
plt.rcParams['font.sans-serif'] = ['simhei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100
# Populated by get_files_path() with the per-stock CSVs that contain data.
files = []

def read_file(file_name):
    """Load one GBK-encoded stock CSV; return (DataFrame, column-name array)."""
    frame = pd.read_csv(file_name, encoding='gbk')
    return frame, frame.columns.values
def get_files_path():
    """Fill the global `files` list with downloaded CSVs that hold real data.

    File names are rebuilt as "<name>_<code>.csv" from stock.csv; files with
    at most one data row (failed/empty downloads) are excluded.
    """
    stock_list = getStockList()
    for stock in stock_list[1:]:  # skip the header row
        p = stock[1].strip() + "_" + stock[0].strip() + ".csv"
        print(p)
        # FIX: downloads run on worker threads and may have failed; skip
        # missing files instead of crashing (original also kept an unused
        # `paths` list, now removed).
        try:
            data, _ = read_file(p)
        except FileNotFoundError:
            continue
        if len(data) > 1:
            files.append(p)
            print(p)

get_files_path()
print(files)
def get_diff(file_name):
    """Plot daily change (涨跌额) and percent change (涨跌幅) over time.

    Reads the per-stock CSV, reverses rows into chronological order, saves
    the figure under work/ and shows it.
    """
    data, col_name = read_file(file_name)
    index = len(data['日期']) - 1
    # Aim for roughly 15 x-tick labels. NOTE(review): sep is 0 when the
    # file has fewer than 16 rows, which makes range() below raise —
    # confirm inputs always have enough rows.
    sep = index // 15
    plt.figure(figsize=(15, 17))
    # CSV rows are newest-first; reverse to plot oldest → newest.
    x = data['日期'].values.tolist()
    x.reverse()
    xticks = list(range(0, len(x), sep))
    xlabels = [x[i] for i in xticks]
    # NOTE(review): one more tick than labels is passed to the axes below;
    # matplotlib tolerates it but the last tick is unlabeled.
    xticks.append(len(x))
    # The source marks missing values with the string 'None'; map to 0.
    y1 = [float(c) if c != 'None' else 0 for c in data['涨跌额'].values.tolist()]
    y2 = [float(c) if c != 'None' else 0 for c in data['涨跌幅'].values.tolist()]
    y1.reverse()
    y2.reverse()
    ax1 = plt.subplot(211)
    plt.plot(range(1, len(x) + 1), y1, c='r')
    plt.title('{}-涨跌额/涨跌幅'.format(file_name.split('_')[0]), fontsize=20)
    ax1.set_xticks(xticks)
    ax1.set_xticklabels(xlabels, rotation=40)
    plt.ylabel('涨跌额', fontsize=20)
    ax2 = plt.subplot(212)
    plt.plot(range(1, len(x) + 1), y2, c='g')
    ax2.set_xticks(xticks)
    ax2.set_xticklabels(xlabels, rotation=40)
    plt.xlabel('日期', fontsize=20)
    plt.ylabel('涨跌幅', fontsize=20)
    plt.savefig('work/' + file_name.split('.')[0] + '_diff.png')
    plt.show()
def get_max_min(file_name):
    """Plot the daily high (最高价) and low (最低价) prices over time.

    Saves the figure under work/ and shows it.
    """
    data, col_name = read_file(file_name)
    index = len(data['日期']) - 1
    # Roughly 15 x-tick labels; see the NOTE in get_diff about tiny files.
    sep = index // 15
    plt.figure(figsize=(15, 10))
    # Newest-first rows → reverse, then drop the oldest entry so exactly
    # `index` points remain.
    x = data['日期'].values.tolist()
    x.reverse()
    x = x[-index:]
    xticks = list(range(0, len(x), sep))
    xlabels = [x[i] for i in xticks]
    xticks.append(len(x))
    # 'None' marks missing values; map them to 0.
    y1 = [float(c) if c != 'None' else 0 for c in data['最高价'].values.tolist()]
    y2 = [float(c) if c != 'None' else 0 for c in data['最低价'].values.tolist()]
    y1.reverse()
    y2.reverse()
    y1 = y1[-index:]
    y2 = y2[-index:]
    ax = plt.subplot(111)
    plt.plot(range(1, len(x) + 1), y1, c='r', linestyle="-")
    plt.plot(range(1, len(x) + 1), y2, c='g', linestyle="--")
    plt.title('{}-最高价/最低价'.format(file_name.split('_')[0]), fontsize=20)
    ax.set_xticks(xticks)
    ax.set_xticklabels(xlabels, rotation=40)
    plt.xlabel('日期', fontsize=20)
    plt.ylabel('价格', fontsize=20)
    plt.legend(['最高价', '最低价'], fontsize=20)
    plt.savefig('work/' + file_name.split('.')[0] + '_minmax.png')
    plt.show()
def get_deal(file_name):
    """Plot trading volume (成交量) and turnover (成交金额) over time.

    Saves the figure under work/ and shows it.
    """
    data, col_name = read_file(file_name)
    index = len(data['日期']) - 1
    # Roughly 15 x-tick labels; see the NOTE in get_diff about tiny files.
    sep = index // 15
    plt.figure(figsize=(15, 10))
    # Newest-first rows → reverse, then keep the last `index` points.
    x = data['日期'].values.tolist()
    x.reverse()
    x = x[-index:]
    xticks = list(range(0, len(x), sep))
    xlabels = [x[i] for i in xticks]
    xticks.append(len(x))
    # 'None' marks missing values; map them to 0.
    y1 = [float(c) if c != 'None' else 0 for c in data['成交量'].values.tolist()]
    y2 = [float(c) if c != 'None' else 0 for c in data['成交金额'].values.tolist()]
    y1.reverse()
    y2.reverse()
    y1 = y1[-index:]
    y2 = y2[-index:]
    ax = plt.subplot(111)
    plt.plot(range(1, len(x) + 1), y1, c='b', linestyle="-")
    plt.plot(range(1, len(x) + 1), y2, c='r', linestyle="--")
    plt.title('{}-成交量/成交金额'.format(file_name.split('_')[0]), fontsize=20)
    ax.set_xticks(xticks)
    ax.set_xticklabels(xlabels, rotation=40)
    plt.xlabel('日期', fontsize=20)
    plt.legend(['成交量', '成交金额'], fontsize=20)
    plt.savefig('work/' + file_name.split('.')[0] + '_deal.png')
    plt.show()
def get_rel(file_name):
    """Scatter-plot volume against the PREVIOUS day's percent change.

    Saves the figure under work/ and shows it.
    """
    data, col_name = read_file(file_name)
    index = len(data['日期']) - 1
    # sep is computed like in the sibling functions but unused here since
    # a scatter plot needs no date ticks.
    sep = index // 15
    plt.figure(figsize=(15, 10))
    x = data['日期'].values.tolist()
    x.reverse()
    x = x[-index:]
    xticks = list(range(0, len(x), sep))
    xlabels = [x[i] for i in xticks]
    xticks.append(len(x))
    # 'None' marks missing values; map them to 0.
    y1 = [float(c) if c != 'None' else 0 for c in data['成交量'].values.tolist()]
    y2 = [float(c) if c != 'None' else 0 for c in data['涨跌幅'].values.tolist()]
    y1.reverse()
    y2.reverse()
    y1 = y1[-index:]
    y2 = y2[-index:]
    # Shift percent change one day forward so each volume is paired with
    # the previous day's change (day 0 gets a 0 placeholder).
    y2 = [0] + y2[:-1]
    ax = plt.subplot(111)
    plt.scatter(y2, y1)
    plt.title('{}-成交量与前一天涨跌幅的关系'.format(file_name.split('_')[0]), fontsize=20)
    plt.xlabel('前一天涨跌幅', fontsize=20)
    plt.ylabel('成交量', fontsize=20)
    plt.savefig('work/' + file_name.split('.')[0] + '_rel.png')
    plt.show()
print(len(files))
# Produce the four analysis charts, in the original order, for every
# stock that had usable data.
for csv_file in files:
    for plotter in (get_max_min, get_deal, get_diff, get_rel):
        plotter(csv_file)