## 出发点
原作者发布的代码长期未维护,现已出现问题:无法按指定的语言或指定的 license 筛选下载。
## 安装依赖
```bash
pip install 'huggingface_hub[cli]'
```
## 下载代码
import pandas as pd
# Filter: names of the file types to keep. process_data() compares each
# file's lower-cased extension against this list.
# NOTE(review): the "html+*" entries look like language labels rather than
# file extensions -- confirm they can actually match an extension.
langs = [
    "css",
    "html",
    "html+ecr",
    "html+php",
    "html+razor",
    "html+eex",
    "html+erb",
    "sass",
    "scss",
]
# Data extraction
def process_data(path: str):
    """Collect the rows of one parquet shard whose file extension is in ``langs``.

    Args:
        path: Path of a parquet file providing the ``repo_name``, ``path``
            and ``content`` columns.

    Returns:
        A ``(valid_data, metadata)`` tuple. ``valid_data`` is a list of dicts
        with the keys ``repo_name``, ``path`` and ``content``, one per
        matching row, in file order. ``metadata`` maps every lower-cased
        extension seen in the shard to the number of rows that carry it.
    """
    # Only these three columns are ever used, so do not load the others.
    data = pd.read_parquet(path, columns=['repo_name', 'path', 'content'])
    wanted = set(langs)
    valid_data = []
    metadata = {}
    # Walk the columns in lockstep; iterrows() would build a Series per row.
    for repo_name, file_path, content in zip(
        data['repo_name'], data['path'], data['content']
    ):
        # Text after the last dot of the basename; a basename without a dot
        # is used whole.
        ext = os.path.basename(file_path).rsplit(".", 1)[-1].lower()
        metadata[ext] = metadata.get(ext, 0) + 1
        if ext in wanted:
            valid_data.append(
                {'repo_name': repo_name, 'path': file_path, 'content': content}
            )
    return valid_data, metadata
import subprocess
import json
from tqdm import tqdm
import time
import os
# Shell command that downloads one shard; {file_name} is filled in per shard.
command = 'huggingface-cli download --resume-download codeparrot/github-code --include "*/{file_name}.parquet" --repo-type dataset --local-dir github_code'
os.makedirs('github_code/contents', exist_ok=True)
os.makedirs('github_code/infos', exist_ok=True)
# NOTE(review): the original note "1127" presumably is the shard count of a
# full run (file names end in -of-01126) -- confirm before raising the range.
with tqdm(range(400)) as pbar:
    for idx in pbar:
        file_name = f"train-{idx:05}-of-01126"
        parquet_path = f'./github_code/data/{file_name}.parquet'
        pbar.set_description(f'Download {idx}')
        # Download the shard. Output is captured (and discarded) rather than
        # printed to the terminal.
        run_command = command.format(file_name=file_name)
        completed = subprocess.run(
            run_command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            shell=True,
        )
        exit_code = completed.returncode
        # Stays empty when the download failed, so the info file can still
        # be written below.
        metadata = {}
        if exit_code == 0:
            # Process the shard
            pbar.set_description(f'Process {idx}')
            valid_data, metadata = process_data(parquet_path)
            # Save the filtered rows
            pbar.set_description(f'Save {idx}')
            with open(f'./github_code/contents/{file_name}.json', 'w', encoding='utf-8') as f:
                json.dump(valid_data, f, ensure_ascii=False)
        # Record the outcome of every shard, including failed downloads.
        with open(f'./github_code/infos/{file_name}.json', 'w', encoding='utf-8') as f:
            json.dump(
                {'exit_code': exit_code, 'run_command': run_command, 'metadata': metadata},
                f, ensure_ascii=False, indent=4
            )
        # Delete the downloaded parquet file. It may be absent after a failed
        # download; that must not abort the remaining shards.
        try:
            os.remove(parquet_path)
        except FileNotFoundError:
            pass
## 监控
```bash
# 查看网络流量
$ watch more /proc/net/dev
# 查看 CPU 使用情况
$ top
```