AI4Code Detailed EDA📊
介绍
这次竞赛的目标是理解python notebook中code块与markdown块之间的关系。竞赛给定顺序正确的code块以及顺序被打乱的markdown块,我们需要重建每个markdown块在notebook中的位置,从而判断哪些自然语言(markdown块)对应于哪些code块。
导库
import os
import json
import wordcloud
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from tqdm.notebook import tqdm, trange
载入数据
# Load the training tables.
# train.csv is read with its first two columns as a MultiIndex; rows that
# contain any missing value are dropped in place.
df = pd.read_csv("../input/ai4code-train-dataframe/train.csv", index_col=[0, 1])
df.dropna(inplace=True)

# Ancestor table, indexed by the `id` column.
df_ancestors = pd.read_csv("../input/AI4Code/train_ancestors.csv", index_col="id")

# Order table: the single remaining column is squeezed to a Series and each
# value is split on whitespace into a list.
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0,
# so the squeeze is applied on the resulting frame instead.
df_orders = (
    pd.read_csv("../input/AI4Code/train_orders.csv", index_col="id")
    .squeeze("columns")
    .str.split()
)
探索性数据分析(exploratory data analysis,EDA)
观察:
- 训练集中一共有139256个notebook,测试集中一共4个notebook;
- 具有两种cell_type [code,markdown]
- 大约2/3的code,1/3的markdown
# Report how many directory entries the train and test folders contain.
# "\033[94m" is an ANSI colour escape that tints the printed line.
train_notebook_total = len(os.listdir("../input/AI4Code/train"))
print("\033[94mNumber of notebooks present in train set = ", train_notebook_total)
test_notebook_total = len(os.listdir("../input/AI4Code/test"))
print("\033[94mNumber of notebooks present in test set = ", test_notebook_total)
Number of notebooks present in train set = 139256
Number of notebooks present in test set = 4
查看训练数据
# Preview the first five rows of the training frame.
df.head(n=5)
训练数据分布
# Split the cells by type; `code_df` and `mkd_df` are reused by later cells.
cell_type = df["cell_type"]
code_df = df.loc[cell_type == "code"]
mkd_df = df.loc[cell_type == "markdown"]

n_code_cells = len(code_df)
n_markdown_cells = len(mkd_df)
print(f'\033[94mNumber of Code Cells: {n_code_cells}')
print(f'\033[94mNumber of Markdown Cells: {n_markdown_cells}')

# Pie chart of the two cell types; the first slice is pulled out slightly.
labels = ['Code Cells', 'Markdown Cells']
values = [n_code_cells, n_markdown_cells]
colors = ['#DE3163', '#58D68D']

slice_outline = dict(color='#000000', width=2)
slice_marker = dict(colors=colors, line=slice_outline)
cell_type_pie = go.Pie(
    labels=labels,
    values=values,
    pull=[0.1, 0],
    marker=slice_marker,
)
fig = go.Figure(data=[cell_type_pie])
fig.show()
Code块分析
观察
- 代码块的平均长度为25个词
- 代码块最大长度为74589个词
采样Code块
# Emit the ANSI colour escape, then show the source of the first code cell.
print('\033[94m')
first_code_cell = code_df.iloc[0]
print(first_code_cell["source"])
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import uuid
import os
import scipy
import cv2
from tqdm import tqdm
import math
import ast
sns.set()
Code块长度分布
# Word count of every code cell (number of whitespace-separated tokens).
# Vectorised string methods replace `code_df["source"][i]`: an integer key on
# a label-based index only works through pandas' deprecated positional
# fallback, and it costs one Python-level lookup per row.
code_lengths = code_df["source"].str.split().str.len().to_numpy()
print('\033[94m Min Code Cells Length = ', code_lengths.min())
print('\033[94m Mean Code cells Length = ', round(np.mean(code_lengths), 2))
print('\033[94m Max Code Cells Length = ', code_lengths.max())
Min Code Cells Length = 1
Mean Code cells Length = 25.24
Max Code Cells Length = 74589
# Horizontal box plot of the code-cell word counts.
# As the plot shows, the distribution has a large number of outliers.
fig, ax = plt.subplots(figsize=(18, 6))
ax.boxplot(code_lengths, vert=False)
# Axis label typo fixed ("Lenght" -> "Length").
ax.set_xlabel("Length of Code Cells");
Code块词云
# Word cloud built from the first 1000 code cells.
# The cells are joined with a newline: joining on the empty string fused the
# last token of one cell with the first token of the next, producing
# artificial words in the cloud.
code_text = "\n".join(code_df["source"].iloc[:1000])
wordcloud_notes = wordcloud.WordCloud(
    stopwords=wordcloud.STOPWORDS,
    max_font_size=120,
    max_words=5000,
    width=600,
    height=400,
    background_color='white',
).generate(code_text)

# Draw the cloud once; a second `plt.imshow` on the same axes would redraw
# the image and override the bilinear interpolation.
fig, ax = plt.subplots(figsize=(14, 10))
ax.imshow(wordcloud_notes, interpolation='bilinear')
ax.set_axis_off();
Markdown块分析
观察到:
- Markdown块的平均长度约为29个词
- Markdown块的最大长度为38939个词
- 长度分布中具有很多离群点
采样Markdown块
# Emit the ANSI colour escape, then show the source of the markdown cell at
# position 59.
print('\033[94m')
sample_markdown_cell = mkd_df.iloc[59]
print(sample_markdown_cell["source"])
Markdown块长度分布
# Word count of every markdown cell (number of whitespace-separated tokens).
# Vectorised string methods replace `mkd_df["source"][i]`: an integer key on
# a label-based index only works through pandas' deprecated positional
# fallback, and it costs one Python-level lookup per row.
mkd_lengths = mkd_df["source"].str.split().str.len().to_numpy()
print('\033[94m Min Markdown Cells Length = ', mkd_lengths.min())
print('\033[94m Mean Markdown cells Length = ', round(np.mean(mkd_lengths), 2))
print('\033[94m Max Markdown Cells Length = ', mkd_lengths.max())
Min Markdown Cells Length = 1
Mean Markdown cells Length = 29.42
Max Markdown Cells Length = 38939
# Horizontal box plot of the markdown-cell word counts.
fig, ax = plt.subplots(figsize=(18, 6))
ax.boxplot(mkd_lengths, vert=False)
# Axis label typo fixed ("Lenght" -> "Length").
ax.set_xlabel("Length of Markdown Cells");
Markdown词云
# Word cloud built from the first 1000 markdown cells.
# The cells are joined with a newline: joining on the empty string fused the
# last token of one cell with the first token of the next, producing
# artificial words in the cloud.
markdown_text = "\n".join(mkd_df["source"].iloc[:1000])
wordcloud_notes = wordcloud.WordCloud(
    stopwords=wordcloud.STOPWORDS,
    max_font_size=120,
    max_words=5000,
    width=600,
    height=400,
    background_color='white',
).generate(markdown_text)

# Draw the cloud once; a second `plt.imshow` on the same axes would redraw
# the image and override the bilinear interpolation.
fig, ax = plt.subplots(figsize=(14, 10))
ax.imshow(wordcloud_notes, interpolation='bilinear')
ax.set_axis_off();
Notebooks分析
观察到:
- cell块的长度最小为1
- code块个数平均值为30
- markdown块个数平均值为15
- 其中code块和markdown块的最大个数分别为809和537
## Per-notebook code / markdown cell counts
# Notebook ids are the file names of the train directory without extension.
notebook_ids = [
    os.path.splitext(file_name)[0]
    for file_name in os.listdir("../input/AI4Code/train")
]

# Count both cell types in a single grouped pass over the first index level,
# instead of one `df.loc[...]` lookup per notebook.
cell_type = df["cell_type"]
code_per_notebook = (cell_type == "code").groupby(level=0).sum()
markdown_per_notebook = (cell_type == "markdown").groupby(level=0).sum()

# Align the counts with the directory listing. A notebook that has no rows
# left in `df` gets a count of 0 rather than raising KeyError.
code_counts = code_per_notebook.reindex(notebook_ids, fill_value=0).astype(int).tolist()
markdown_counts = markdown_per_notebook.reindex(notebook_ids, fill_value=0).astype(int).tolist()

# One row per notebook. Building the frame from a dict keeps the counts as
# integers, so no string round-trip is needed.
counts_df = pd.DataFrame(
    {
        "notebook_id": notebook_ids,
        "code_count": code_counts,
        "markdown_count": markdown_counts,
    }
)

# Total number of cells per notebook.
counts_df["total_count"] = counts_df["code_count"] + counts_df["markdown_count"]

# Minimum, maximum and mean of the total cell count.
print('\033[94m Minimum Cell count in any notebook', counts_df["total_count"].min())
print('\033[94m Maximum Cell count in any notebook', counts_df["total_count"].max())
print('\033[94m Mean of Cell counts across all notebooks', round(counts_df["total_count"].mean(), 2))
counts_df.head()
Notebooks离群值分析
# Number of largest notebooks to display.
k = 100
# Sort by total cell count in descending order and keep the first k rows.
top_k = counts_df.sort_values(by="total_count", ascending=False).head(k)

# Bar chart of the code and markdown counts for each selected notebook.
fig = px.bar(
    data_frame=top_k,
    x="notebook_id",
    y=["code_count", "markdown_count"],
    color_discrete_sequence=['#DE3163', '#58D68D'],
)
fig.update_layout(
    title={
        # The title follows `k` instead of hard-coding "100", so it stays
        # correct when the selection size changes.
        'text': f"Cell Type Count analysis for top {k} cell count notebooks(OUTLIERS)",
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    xaxis_title="Notebook ID",
    yaxis_title="Count",
    template="plotly_white",
)
fig.update_traces(
    marker_line_color='black',
    marker_line_width=0.9,
    opacity=0.9,
)
fig.show()
Code块个数分析
# Print the minimum, maximum and mean number of code cells per notebook.
code_count_column = counts_df["code_count"]
print('\033[94m Minimum Code Cell count in any notebook', code_count_column.min())
print('\033[94m Maximum Code Cell count in any notebook', code_count_column.max())
print('\033[94m Mean of Code Cell counts across all notebooks', round(code_count_column.mean(), 2))
Code块个数分布
# Two histograms of the per-notebook code-cell count, each with a violin
# marginal: first over all notebooks, then restricted to notebooks with fewer
# than 100 code cells.
code_histogram_specs = [
    (lambda: counts_df, "#DE3163", "Code Cell Count Distribution"),
    (
        lambda: counts_df[counts_df["code_count"] < 100],
        "#58D68D",
        "Code Cell Count Distribution (COUNTS < 100 )",
    ),
]
for select_frame, bar_color, title_text in code_histogram_specs:
    fig = px.histogram(
        data_frame=select_frame(),
        x="code_count",
        color_discrete_sequence=[bar_color],
        marginal="violin",
    )
    fig.update_layout(
        title=dict(text=title_text, y=0.95, x=0.5, xanchor='center', yanchor='top'),
        xaxis_title="Code Cells",
        yaxis_title="Count",
        showlegend=False,
        template="plotly_white",
    )
    fig.show()
Markdown个数分析
# Print the minimum, maximum and mean number of markdown cells per notebook.
markdown_count_column = counts_df["markdown_count"]
print('\033[94m Minimum Markdown Cell count in any notebook', markdown_count_column.min())
print('\033[94m Maximum Markdown Cell count in any notebook', markdown_count_column.max())
print('\033[94m Mean of Markdown Cell counts across all notebooks', round(markdown_count_column.mean(), 2))
Markdown个数分布
# Two histograms of the per-notebook markdown-cell count, each with a violin
# marginal: first over all notebooks, then restricted to notebooks with fewer
# than 100 markdown cells.
markdown_histogram_specs = [
    (lambda: counts_df, "#DE3163", "Markdown Cell Count Distribution"),
    (
        lambda: counts_df[counts_df["markdown_count"] < 100],
        "#58D68D",
        "Markdown Cell Count Distribution (COUNTS < 100 )",
    ),
]
for select_frame, bar_color, title_text in markdown_histogram_specs:
    fig = px.histogram(
        data_frame=select_frame(),
        x="markdown_count",
        color_discrete_sequence=[bar_color],
        marginal="violin",
    )
    fig.update_layout(
        title=dict(text=title_text, y=0.95, x=0.5, xanchor='center', yanchor='top'),
        xaxis_title="Markdown Cells",
        yaxis_title="Count",
        showlegend=False,
        template="plotly_white",
    )
    fig.show()
最小块(Code、Markdown)个数分析
# Notebooks where a cell type occurs exactly once: either type, both types,
# only the code type, and only the markdown type.
has_one_markdown = counts_df["markdown_count"] == 1
has_one_code = counts_df["code_count"] == 1

either_one = counts_df[has_one_markdown | has_one_code]
both_one = counts_df[has_one_markdown & has_one_code]
code_count_one = counts_df[has_one_code]
markdown_count_one = counts_df[has_one_markdown]

print("\033[94mTotal notebook with either 1 code cell or 1 markdown cell = ", len(either_one))
print("\033[94mTotal notebook with both 1 code cell and 1 markdown cell = ", len(both_one))
print("\033[94mNotebook counts with only 1 code cell = ", len(code_count_one))
print("\033[94mNotebook counts with only 1 markdown cell = ", len(markdown_count_one))
Code块个数 v.s. Markdown块个数
# Scatter plot of code-cell count against markdown-cell count per notebook;
# the marker size follows the code-cell count.
fig = px.scatter(
    data_frame=counts_df,
    x="code_count",
    y="markdown_count",
    size="code_count",
    color_discrete_sequence=["#DE3163"],
)

# X = Y reference line. Its end point is derived from the data rather than
# hard-coded to 800, so the line spans the largest count on either axis.
diagonal_end = int(
    max(counts_df["code_count"].max(), counts_df["markdown_count"].max())
)
fig.add_shape(
    type='line',
    x0=0,
    y0=0,
    x1=diagonal_end,
    y1=diagonal_end,
    line=dict(color='Black'),
    xref='x',
    yref='y',
    name="X=Y line",
)
fig.update_layout(
    title={
        'text': "Code Cell Counts vs Markdown Cell Counts",
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    xaxis_title="Code Cell Counts",
    yaxis_title="Markdown Cell Counts",
    showlegend=False,
    template="plotly_white",
)
fig.show()
Reference
- https://www.kaggle.com/code/odins0n/ai4code-detailed-eda
- https://plotly.com/python/plotly-express/