AI4Code Detailed EDA

AI4Code Detailed EDA📊

介绍

这次竞赛的目标是理解python notebooks中code块和markdown块之间的关系。在这次竞赛中,我们需要在给定code块正确顺序的情况下,重建markdown块的顺序,从而判断哪些自然语言(markdown块)对应哪些code块。

导库

import os
import json
import wordcloud 
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from tqdm.notebook import tqdm, trange

载入数据

## train dataframes
# Cell-level table; the first two CSV columns become a two-level index.
df = pd.read_csv("../input/ai4code-train-dataframe/train.csv", index_col=[0, 1])
# Discard every row that contains a missing value.
df.dropna(inplace=True)

df_ancestors = pd.read_csv('../input/AI4Code/train_ancestors.csv', index_col='id')
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the column axis explicitly gives the same Series on every version.
# Each value is then split on whitespace into a list of tokens.
df_orders = (
    pd.read_csv("../input/AI4Code/train_orders.csv", index_col="id")
    .squeeze("columns")
    .str.split()
)

探索性数据分析(exploratory data analysis,EDA)

观察:

  1. 训练集中一共有139256个notebook,测试集中一共4个notebook;
  2. 具有两种cell_type [code,markdown]
  3. 大约2/3的code,1/3的markdown
# Report how many directory entries each split folder holds
# ("\033[94m" is the ANSI escape that switches terminal text to bright blue).
for split_name in ("train", "test"):
    notebook_total = len(os.listdir(f"../input/AI4Code/{split_name}"))
    print(f"\033[94mNumber of notebooks present in {split_name} set  = ", notebook_total)

Number of notebooks present in train set = 139256
Number of notebooks present in test set = 4

查看训练数据

# Preview the first rows of the cell-level dataframe.
df.head()

在这里插入图片描述

训练数据分布
# Partition the cells by their type.
code_df = df.loc[df["cell_type"].eq("code")]
mkd_df = df.loc[df["cell_type"].eq("markdown")]

# Pie-chart inputs: one slice per cell type.
labels = ['Code Cells', 'Markdown Cells']
values = [len(code_df), len(mkd_df)]
colors = ['#DE3163', '#58D68D']

print('\033[94mNumber of Code Cells: {}'.format(values[0]))
print('\033[94mNumber of Markdown Cells: {}'.format(values[1]))

# The first slice (code cells) is pulled out of the pie; slices get a black outline.
slice_style = {"colors": colors, "line": {"color": '#000000', "width": 2}}
pie = go.Pie(labels=labels, values=values, pull=[0.1, 0], marker=slice_style)
fig = go.Figure(data=[pie])
fig.show()

在这里插入图片描述

Code块分析

观察

  1. 代码块的平均长度为25个词
  2. 代码块最大长度为74589个词
采样Code块
# Switch the terminal colour to blue, then show the source of the first code cell.
print('\033[94m')
first_code_source = code_df["source"].iloc[0]
print(first_code_source)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import uuid
import os
import scipy
import cv2
from tqdm import tqdm
import math
import ast
sns.set()

Code块长度分布

# Word count of every code cell (whitespace-delimited tokens).
# Vectorised through the .str accessor instead of `code_df["source"][i]` in a Python loop:
# integer keys on a non-integer index rely on the positional fallback of
# Series.__getitem__, which pandas deprecates, and cost one lookup per row.
# NOTE(review): assumes every remaining "source" value is a string — confirm.
code_lengths = code_df["source"].str.split().str.len().to_numpy()
print(f'\033[94m Min Code Cells Length = ', min(code_lengths))
print(f'\033[94m Mean Code cells Length = ', round(np.mean(code_lengths),2))
print(f'\033[94m Max Code Cells Length = ', max(code_lengths))

Min Code Cells Length = 1
Mean Code cells Length = 25.24
Max Code Cells Length = 74589

# Horizontal box plot of the code-cell word counts.
# The author's observation on the resulting figure: it contains many outliers.
fig, ax = plt.subplots(figsize=(18, 6))
# Draw on the axes created above rather than through pyplot's implicit current axes.
ax.boxplot(code_lengths, vert=False)
# Axis label typo fixed ("Lenght" -> "Length").
ax.set_xlabel("Length of Code Cells");

在这里插入图片描述

Code块词云
# Word cloud built from the first 1000 code cells.
# Cells are joined with a newline: joining with "" fuses the last token of one cell
# with the first token of the next and distorts the word frequencies.
code_corpus = "\n".join(code_df["source"].iloc[:1000])
wordcloud_notes = wordcloud.WordCloud(stopwords=wordcloud.STOPWORDS, max_font_size=120, max_words=5000,
                      width = 600, height = 400,
                      background_color='white').generate(code_corpus)
fig, ax = plt.subplots(figsize=(14,10))
# Draw the image once; a second plt.imshow on the same axes would re-render it
# without the bilinear interpolation requested here.
ax.imshow(wordcloud_notes, interpolation='bilinear')
ax.set_axis_off();

在这里插入图片描述

Markdown块分析

观察到:

  1. Markdown块的平均长度为29
  2. Markdown的最大长度为38939
  3. 长度分布中具有很多离群点
采样Markdown块
# Switch the terminal colour to blue, then show the source of the markdown cell at position 59.
print('\033[94m')
sample_markdown_source = mkd_df["source"].iloc[59]
print(sample_markdown_source)

在这里插入图片描述

Markdown块长度分布
# Word count of every markdown cell (whitespace-delimited tokens).
# Vectorised through the .str accessor instead of `mkd_df["source"][i]` in a Python loop:
# integer keys on a non-integer index rely on the positional fallback of
# Series.__getitem__, which pandas deprecates, and cost one lookup per row.
# NOTE(review): assumes every remaining "source" value is a string — confirm.
mkd_lengths = mkd_df["source"].str.split().str.len().to_numpy()
print(f'\033[94m Min Markdown Cells Length = ', min(mkd_lengths))
print(f'\033[94m Mean Markdown cells Length = ', round(np.mean(mkd_lengths),2))
print(f'\033[94m Max Markdown Cells Length = ', max(mkd_lengths))

Min Markdown Cells Length = 1
Mean Markdown cells Length = 29.42
Max Markdown Cells Length = 38939

# Horizontal box plot of the markdown-cell word counts.
fig, ax = plt.subplots(figsize=(18, 6))
# Draw on the axes created above rather than through pyplot's implicit current axes.
ax.boxplot(mkd_lengths, vert=False)
# Axis label typo fixed ("Lenght" -> "Length").
ax.set_xlabel("Length of Markdown Cells");

在这里插入图片描述

Markdown词云
# Word cloud built from the first 1000 markdown cells.
# Cells are joined with a newline: joining with "" fuses the last token of one cell
# with the first token of the next and distorts the word frequencies.
markdown_corpus = "\n".join(mkd_df["source"].iloc[:1000])
wordcloud_notes = wordcloud.WordCloud(stopwords=wordcloud.STOPWORDS, max_font_size=120, max_words=5000,
                      width = 600, height = 400,
                      background_color='white').generate(markdown_corpus)
fig, ax = plt.subplots(figsize=(14,10))
# Draw the image once; a second plt.imshow on the same axes would re-render it
# without the bilinear interpolation requested here.
ax.imshow(wordcloud_notes, interpolation='bilinear')
ax.set_axis_off();

在这里插入图片描述

Notebooks分析

观察到:

  1. cell块的长度最小为1
  2. code块个数平均值为30
  3. markdown块个数平均值为15
  4. 其中code块和markdown块最大个数分别为809和537
## loading code_cell counts from notebooks
# List the train folder and strip each file's extension to obtain the notebook_id.
notebook_ids = [os.path.splitext(file_name)[0] for file_name in os.listdir("../input/AI4Code/train")]

# Count code and markdown cells per notebook in one grouped pass over the first
# index level, instead of one `df.loc[notebook_id]` lookup per notebook.
cell_type = df["cell_type"]
code_per_notebook = cell_type.eq("code").groupby(level=0).sum()
markdown_per_notebook = cell_type.eq("markdown").groupby(level=0).sum()

# Align the counts with the directory listing. A notebook that has no rows left in
# `df` is reported with 0 cells here (the per-notebook `df.loc` lookup raised KeyError).
code_counts = code_per_notebook.reindex(notebook_ids, fill_value=0).astype(int).tolist()
markdown_counts = markdown_per_notebook.reindex(notebook_ids, fill_value=0).astype(int).tolist()

# Build the frame from a dict so the count columns keep an integer dtype; stacking
# ids and counts in one NumPy array would turn every value into a string.
counts_df = pd.DataFrame(
    {
        "notebook_id": notebook_ids,
        "code_count": code_counts,
        "markdown_count": markdown_counts,
    }
)
# Total number of cells per notebook.
counts_df["total_count"] = counts_df["code_count"] + counts_df["markdown_count"]
# Minimum, maximum and mean of the total cell count.
print(f'\033[94m Minimum Cell count in any notebook', counts_df["total_count"].min())
print(f'\033[94m Maximum Cell count in any notebook', counts_df["total_count"].max())
print(f'\033[94m Mean of Cell counts across all notebooks', round(counts_df["total_count"].mean(), 2 ))
counts_df.head()

在这里插入图片描述

Notebooks离群值分析
# Keep the k notebooks with the largest total cell count (sorted in descending order).
k = 100
top_k = counts_df.sort_values("total_count", ascending=False).iloc[:k]

# One bar series per count column, keyed by notebook id.
fig = px.bar(
    top_k,
    x="notebook_id",
    y=["code_count", "markdown_count"],
    color_discrete_sequence=['#DE3163', '#58D68D'],
)

# Centred title near the top of the figure.
title_cfg = dict(
    text="Cell Type Count analysis for top 100 cell count notebooks(OUTLIERS)",
    x=0.5,
    y=0.95,
    xanchor='center',
    yanchor='top',
)
fig.update_layout(
    title=title_cfg,
    xaxis_title="Notebook ID",
    yaxis_title="Count",
    template="plotly_white",
)
fig.update_traces(marker_line_color='black', marker_line_width=0.9, opacity=0.9)
fig.show()

在这里插入图片描述

Code块个数分析
# 输出code块数量最小、最大值以及平均值
# Minimum, maximum and mean number of code cells per notebook.
code_count_column = counts_df["code_count"]
print('\033[94m Minimum Code Cell count in any notebook', code_count_column.min())
print('\033[94m Maximum Code Cell count in any notebook', code_count_column.max())
print('\033[94m Mean of Code Cell counts across all notebooks', round(code_count_column.mean(), 2))

在这里插入图片描述

Code块个数分布
# 绘制code块个数的直方图分布
# Histogram (with a violin marginal) of code cells per notebook: first over all
# notebooks, then restricted to notebooks with fewer than 100 code cells.
histogram_specs = (
    (counts_df, "Code Cell Count Distribution", "#DE3163"),
    (
        counts_df[counts_df["code_count"] < 100],
        "Code Cell Count Distribution (COUNTS < 100 )",
        "#58D68D",
    ),
)
for frame, title_text, bar_color in histogram_specs:
    fig = px.histogram(
        frame,
        x="code_count",
        color_discrete_sequence=[bar_color],
        marginal="violin",
    )
    fig.update_layout(
        title=dict(text=title_text, y=0.95, x=0.5, xanchor='center', yanchor='top'),
        xaxis_title="Code Cells",
        yaxis_title="Count",
        showlegend=False,
        template="plotly_white",
    )
    fig.show()

在这里插入图片描述

在这里插入图片描述

Markdown个数分析
# 输出markdown块数量最小、最大值以及平均值
# Minimum, maximum and mean number of markdown cells per notebook.
markdown_count_column = counts_df["markdown_count"]
print('\033[94m Minimum Markdown Cell count in any notebook', markdown_count_column.min())
print('\033[94m Maximum Markdown Cell count in any notebook', markdown_count_column.max())
print('\033[94m Mean of Markdown Cell counts across all notebooks', round(markdown_count_column.mean(), 2))

在这里插入图片描述

Markdown个数分布
# 绘制Markdown块个数分布直方图
# Histogram (with a violin marginal) of markdown cells per notebook: first over all
# notebooks, then restricted to notebooks with fewer than 100 markdown cells.
histogram_specs = (
    (counts_df, "Markdown Cell Count Distribution", "#DE3163"),
    (
        counts_df[counts_df["markdown_count"] < 100],
        "Markdown Cell Count Distribution (COUNTS < 100 )",
        "#58D68D",
    ),
)
for frame, title_text, bar_color in histogram_specs:
    fig = px.histogram(
        frame,
        x="markdown_count",
        color_discrete_sequence=[bar_color],
        marginal="violin",
    )
    fig.update_layout(
        title=dict(text=title_text, y=0.95, x=0.5, xanchor='center', yanchor='top'),
        xaxis_title="Markdown Cells",
        yaxis_title="Count",
        showlegend=False,
        template="plotly_white",
    )
    fig.show()

在这里插入图片描述

在这里插入图片描述

最小块(Code、Markdown)个数分析
# 分析notebook中块个数都等于1 或者任意一个等于1 或者单独某个块个数等于1
# Notebooks in which a cell type occurs exactly once: for either type, for both
# types at once, and for each type on its own.
single_code = counts_df["code_count"] == 1
single_markdown = counts_df["markdown_count"] == 1

either_one = counts_df[single_markdown | single_code]
both_one = counts_df[single_markdown & single_code]
code_count_one = counts_df[single_code]
markdown_count_one = counts_df[single_markdown]

print("\033[94mTotal notebook with either 1 code cell or 1 markdown cell = ", len(either_one))
print("\033[94mTotal notebook with both 1 code cell and 1 markdown cell = ", len(both_one))
print("\033[94mNotebook counts with only 1 code cell  = ", len(code_count_one))
print("\033[94mNotebook counts with only 1 markdown cell  = ", len(markdown_count_one))

在这里插入图片描述

Code块个数 v.s. Markdown块个数
# Scatter of code vs. markdown cell counts, one point per notebook;
# the marker size follows code_count.
fig = px.scatter(
    counts_df,
    x="code_count",
    y="markdown_count",
    size="code_count",
    color_discrete_sequence=["#DE3163"],
)

# Reference diagonal from (0, 0) to (800, 800) in data coordinates.
diagonal = dict(
    type='line',
    x0=0,
    y0=0,
    x1=800,
    y1=800,
    line=dict(color='Black'),
    xref='x',
    yref='y',
    name="X=Y line",
)
fig.add_shape(**diagonal)

fig.update_layout(
    title=dict(
        text="Code Cell Counts vs Markdown Cell Counts",
        y=0.95,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    xaxis_title="Code Cell Counts",
    yaxis_title="Markdown Cell Counts",
    showlegend=False,
    template="plotly_white",
)
fig.show()

在这里插入图片描述

Reference

  1. https://www.kaggle.com/code/odins0n/ai4code-detailed-eda
  2. https://plotly.com/python/plotly-express/
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值