Source code download:
http://download.csdn.net/download/adam_zs/10224873
from __future__ import absolute_import, division, print_function
# http://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
import sys
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import matplotlib.pylab as plt
import seaborn as sns
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 9999)
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # pandas
np.set_printoptions(precision=5, suppress=True)  # numpy
# seaborn plotting style
sns.set(style='ticks', context='poster')
sentiment = pd.read_csv("data/sentiment.csv", index_col=0, parse_dates=[0])
# print(sentiment.head())
'''
DATE UMCSENT
2000-01-01 112.00000
2000-02-01 111.30000
2000-03-01 107.10000
2000-04-01 109.20000
2000-05-01 110.70000
'''
sentiment_short = sentiment['2005':'2016']
# sentiment_short.plot(figsize=(12, 8))
# plt.title("Consumer Sentiment")
# plt.show()
# sentiment_short = sentiment['2005':'2016']
# sentiment_short['diff_1'] = sentiment_short['UMCSENT'].diff(1)  # first-order difference
# sentiment_short['diff_2'] = sentiment_short['diff_1'].diff(1)  # second-order difference
# sentiment_short.plot(figsize=(18, 12))
# plt.title('diff_1 and diff_2')
# plt.show()
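# A numeric stationarity check to go with the differencing plots above --
# a minimal sketch using statsmodels' augmented Dickey-Fuller test:
# from statsmodels.tsa.stattools import adfuller
# adf_stat, p_value = adfuller(sentiment_short['UMCSENT'])[:2]
# print('ADF statistic: %.4f, p-value: %.4f' % (adf_stat, p_value))  # small p suggests stationarity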
# ACF and PACF plots
# fig = plt.figure(figsize=(12, 8))
#
# ax1 = fig.add_subplot(211)
# fig = sm.graphics.tsa.plot_acf(sentiment_short, lags=20, ax=ax1)
# ax1.xaxis.set_ticks_position('bottom')
# fig.tight_layout()
#
# ax2 = fig.add_subplot(212)
# fig = sm.graphics.tsa.plot_pacf(sentiment_short, lags=20, ax=ax2)
# ax2.xaxis.set_ticks_position('bottom')
# fig.tight_layout()
# plt.show()
# Lag scatter plots are another way to visualize autocorrelation
# lags = 9
# ncols = 3
# nrows = int(np.ceil(lags / ncols))
#
# fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(4 * ncols, 4 * nrows))
#
# for ax, lag in zip(axes.flat, np.arange(1, lags + 1, 1)):
# lag_str = 't-{}'.format(lag)
# X = (pd.concat([sentiment_short, sentiment_short.shift(-lag)], axis=1,
# keys=['y'] + [lag_str]).dropna())
#
# X.plot(ax=ax, kind='scatter', y='y', x=lag_str)
# corr = X.corr().values[0][1]
# ax.set_ylabel('Original')
# ax.set_title('Lag: {} (corr={:.2f})'.format(lag_str, corr))
# ax.set_aspect('equal')
# sns.despine()
#
# fig.tight_layout()
# plt.show()
# A more compact view: series, histogram, ACF and PACF in one figure
def tsplot(y, lags=None, title='', figsize=(14, 8)):
fig = plt.figure(figsize=figsize)
layout = (2, 2)
ts_ax = plt.subplot2grid(layout, (0, 0))
hist_ax = plt.subplot2grid(layout, (0, 1))
acf_ax = plt.subplot2grid(layout, (1, 0))
pacf_ax = plt.subplot2grid(layout, (1, 1))
y.plot(ax=ts_ax)
ts_ax.set_title(title)
y.plot(ax=hist_ax, kind='hist', bins=25)
hist_ax.set_title('Histogram')
smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
    for ax in [acf_ax, pacf_ax]:
        ax.set_xlim(0)
sns.despine()
plt.tight_layout()
return ts_ax, acf_ax, pacf_ax
tsplot(sentiment_short['UMCSENT'], title='Consumer Sentiment', lags=36)
plt.show()
from __future__ import absolute_import, division, print_function
import sys
import os
import pandas as pd
import numpy as np
# TSA from Statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
# Display and Plotting
import matplotlib.pylab as plt
import seaborn as sns
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 9999)
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # pandas
np.set_printoptions(precision=5, suppress=True)  # numpy
# seaborn plotting style
sns.set(style='ticks', context='poster')
ts_df = pd.read_csv('data/series1.csv', index_col=0, parse_dates=[0])
# print(ts_df.head())
# print(ts_df.shape)  # (120, 1)
'''
value
2006-06-01 0.21507
2006-07-01 1.14225
2006-08-01 0.08077
2006-09-01 -0.73952
2006-10-01 0.53552
'''
train_count = int(ts_df.shape[0] * 0.95) + 1  # chronological 95/5 split; no shuffling for time series
X_train = ts_df[:train_count]['value']
y_test = ts_df[train_count:]['value']
# print(X_train.shape)
# print(X_train.tail())
# print(y_test.shape)
# print(y_test.head())
def tsplot(y, lags=None, title='', figsize=(14, 8)):
fig = plt.figure(figsize=figsize)
layout = (2, 2)
ts_ax = plt.subplot2grid(layout, (0, 0))
hist_ax = plt.subplot2grid(layout, (0, 1))
acf_ax = plt.subplot2grid(layout, (1, 0))
pacf_ax = plt.subplot2grid(layout, (1, 1))
y.plot(ax=ts_ax)
ts_ax.set_title(title)
y.plot(ax=hist_ax, kind='hist', bins=25)
hist_ax.set_title('Histogram')
smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
    for ax in [acf_ax, pacf_ax]:
        ax.set_xlim(0)
sns.despine()
fig.tight_layout()
return ts_ax, acf_ax, pacf_ax
# tsplot(X_train, title='A Given Training Series', lags=20)
# plt.show()
arima200 = sm.tsa.SARIMAX(X_train, order=(2, 0, 0))  # start with an AR(2) model
model_results = arima200.fit()
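# Inspecting the fitted AR(2) model -- a minimal sketch; summary() and
# plot_diagnostics() are standard methods on SARIMAX results objects:
# print(model_results.summary())
# model_results.plot_diagnostics(figsize=(14, 8))  # residual plots, Q-Q plot, correlogram
# plt.show()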
import itertools
p_min = 0
d_min = 0
q_min = 0
p_max = 4
d_max = 0
q_max = 4
# Initialize a DataFrame to store the results
results_bic = pd.DataFrame(index=['AR{}'.format(i) for i in range(p_min, p_max + 1)],
columns=['MA{}'.format(i) for i in range(q_min, q_max + 1)])
for p, d, q in itertools.product(range(p_min, p_max + 1),
range(d_min, d_max + 1),
range(q_min, q_max + 1)):
if p == 0 and d == 0 and q == 0:
results_bic.loc['AR{}'.format(p), 'MA{}'.format(q)] = np.nan
continue
try:
model = sm.tsa.SARIMAX(X_train, order=(p, d, q),
# enforce_stationarity=False,
# enforce_invertibility=False,
)
results = model.fit()
results_bic.loc['AR{}'.format(p), 'MA{}'.format(q)] = results.bic
    except Exception:  # some (p, d, q) orders fail to converge; skip them
        continue
results_bic = results_bic.astype(float)
fig, ax = plt.subplots(figsize=(10, 8))
ax = sns.heatmap(results_bic,
mask=results_bic.isnull(),
ax=ax,
annot=True,
fmt='.2f',
)
ax.set_title('BIC')
plt.show()
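# Reading the best (p, q) off the BIC table -- a minimal sketch; stack()
# drops the NaN cell and idxmin() returns the (row, column) label pair:
# print('Lowest BIC:', results_bic.stack().idxmin())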
import numpy as np
import pandas as pd
'''Rolling windows'''
df = pd.Series(np.random.randint(low=1, high=100, size=600),
index=pd.date_range(start='2016-01-07', periods=600, freq='D'))
# print(df.head())
print(df.rolling(window=10))  # a Rolling object; call .mean(), .sum(), ... to aggregate
import matplotlib.pyplot as plt
plt.figure(figsize=(15, 5))
df.plot(style='r--')
df.rolling(window=10).mean().plot(style='b')
plt.show()
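# Rolling objects support other statistics and a min_periods option -- a
# minimal sketch with assumed window sizes:
# print(df.rolling(window=10).std().head(15))                 # rolling standard deviation
# print(df.rolling(window=10, min_periods=1).mean().head())   # no leading NaNs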
import pandas as pd
import numpy as np
rng = pd.date_range('2017-01-05', periods=90, freq='D')
ts = pd.Series(np.random.randint(low=1, high=20, size=90), index=rng)
# print(ts)
# print(ts.resample(rule="M").sum())
day3Ts = ts.resample(rule='3D').sum()
# print(day3Ts.resample(rule='D').asfreq())
'''
ffill: fill gaps with the previous value (forward fill)
bfill: fill gaps with the next value (backward fill)
interpolate: linear interpolation between known values
'''
print(day3Ts.resample(rule='D').ffill(limit=1))  # fill at most 1 consecutive missing value
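# The other fill strategies from the docstring above, on the same
# upsampled series -- a minimal sketch:
# print(day3Ts.resample(rule='D').bfill())                # take the next value
# print(day3Ts.resample(rule='D').interpolate('linear'))  # linear interpolation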
import matplotlib.pylab as plt
import seaborn as sns
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
download_robot_execution_failures()
df, y = load_robot_execution_failures()
# df[df.id == 3][['time', 'F_x', 'F_y', 'F_z', 'T_x', 'T_y', 'T_z']].plot(x='time', title='Success example (id 3)',
# figsize=(12, 6))
# df[df.id == 20][['time', 'F_x', 'F_y', 'F_z', 'T_x', 'T_y', 'T_z']].plot(x='time', title='Failure example (id 20)',
# figsize=(12, 6))
# plt.show()
extraction_settings = ComprehensiveFCParameters()  # full set of tsfresh feature calculators
X = extract_features(df,
column_id='id', column_sort='time',
default_fc_parameters=extraction_settings,
impute_function=impute)
# feature selection: keep only features statistically relevant to y
X_filtered = extract_relevant_features(df, y,
column_id='id', column_sort='time',
default_fc_parameters=extraction_settings)
X_train, X_test, X_filtered_train, X_filtered_test, y_train, y_test = train_test_split(X, X_filtered, y, test_size=.4)
cl = DecisionTreeClassifier()
cl.fit(X_train, y_train)
cl2 = DecisionTreeClassifier()
cl2.fit(X_filtered_train, y_train)
print(classification_report(y_test, cl2.predict(X_filtered_test)))
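# For comparison, score the classifier trained on the full, unfiltered
# feature matrix as well (cl and X_test come from the split above):
print(classification_report(y_test, cl.predict(X_test)))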
'''Wikipedia page-view data'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)
train = pd.read_csv('train_1.csv').fillna(0)  # fill missing values with 0
# print(train.info())
for col in train.columns[1:]:
    train[col] = pd.to_numeric(train[col], downcast='integer')  # shrink dtypes to save memory
# print(train.head())
# print(train.info())
'''
train.head(): one row per Wikipedia page, one integer column per day
(550 daily columns, 2015-07-01 through 2016-12-31; shown abbreviated)
                                                 Page  2015-07-01  2015-07-02  ...  2016-12-31
0             2NE1_zh.wikipedia.org_all-access_spider          18          11  ...          20
1              2PM_zh.wikipedia.org_all-access_spider          11          14  ...          20
2               3C_zh.wikipedia.org_all-access_spider           1           0  ...          17
3          4minute_zh.wikipedia.org_all-access_spider          35          13  ...          11
4   52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...           0           0  ...          10
'''
def get_language(page):
    # extract the two-letter language code from a page name like
    # '2NE1_zh.wikipedia.org_all-access_spider'
    res = re.search(r'[a-z][a-z]\.wikipedia\.org', page)
    if res:
        return res.group()[0:2]
    else:
        return 'na'
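# Quick sanity check on a page name from the data above:
# print(get_language('2NE1_zh.wikipedia.org_all-access_spider'))  # -> 'zh'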
train['lang'] = train['Page'].map(get_language)
# print(train.head())
from collections import Counter
# print(Counter(train['lang']))
# Counter({'zh': 19, 'en': 13, 'ja': 11, 'ru': 10, 'de': 8, 'fr': 7, 'es': 6})
lang_sets = {}
for lang in ['en', 'ja', 'de', 'fr', 'zh', 'ru', 'es']:
    lang_sets[lang] = train[train.lang == lang].iloc[:, :-1]  # drop the 'lang' column
sums = {}
for key in lang_sets:
    # average daily views per page for this language (skip the 'Page' column)
    sums[key] = lang_sets[key].iloc[:, 1:].sum(axis=0) / lang_sets[key].shape[0]
# print(sums)
days = list(range(sums['en'].shape[0]))
# Average page views by language
# fig = plt.figure(1, figsize=[10, 10])
# plt.ylabel('Views per Page')
# plt.xlabel('Day')
# plt.title('Pages in Different Languages')
# labels = {'en': 'English', 'ja': 'Japanese', 'de': 'German', 'fr': 'French',
# 'zh': 'Chinese', 'ru': 'Russian', 'es': 'Spanish'
# }
# for key in sums:
# plt.plot(days, sums[key], label=labels[key])
# plt.legend()
# plt.show()
# Views for individual pages
# def plot_entry(key, idx):
# data = lang_sets[key].iloc[idx, 1:]
# fig = plt.figure(1, figsize=(10, 5))
# plt.plot(days, data)
# plt.xlabel('day')
# plt.ylabel('views')
# plt.title(train.iloc[lang_sets[key].index[idx], 0])
# plt.show()
#
#
# for idx in range(5, 10):
# plot_entry('en', idx)
# Find the hottest pages for each language
top_pages = {}  # index of the top page (by total views) per language
def national_hot(key):
    sum_set = pd.DataFrame(lang_sets[key][['Page']])
    sum_set['total'] = lang_sets[key].iloc[:, 1:].sum(axis=1)  # sum views, skipping the 'Page' column
    sum_set = sum_set.sort_values(by='total', ascending=False)
    top_pages[key] = sum_set.index[0]
    print('-----', key, '-----')
    print(sum_set.head(5))
for key in lang_sets:
national_hot(key)
# Plot the single hottest page for each language
for key in top_pages:
fig = plt.figure(1, figsize=(10, 5))
cols = train.columns
cols = cols[1:-1]
data = train.loc[top_pages[key], cols]
plt.plot(days, data)
plt.xlabel('Days')
plt.ylabel('Views')
plt.title(train.loc[top_pages[key], 'Page'])
plt.show()
import numpy as np
import pandas as pd
# rng = pd.date_range(start='2017-01-01', periods=10, freq='3D')
# print(rng)
# print(np.random.randint(low=1, high=20, size=10))
time = pd.Series(np.random.randint(low=1, high=20, size=10),
index=pd.date_range(start='2017-01-02', periods=10, freq='3D'))
time = time.truncate(before='2017-01-05')
# print(time)
# print(time['2017-01-05':'2017-01-20'])
p1 = pd.period_range('2016-01-01 10:10', freq='10H', periods=10)
# print(p1)
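# A PeriodIndex can be converted back to timestamps or re-expressed at
# another frequency -- a minimal sketch:
# print(p1.to_timestamp())            # start of each 10-hour period as Timestamps
# print(p1.asfreq('H', how='start'))  # same periods at hourly frequency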