有些 GitHub 仓库非常大。如果网络条件不好，又只想下载其中的部分文件，用 git 无法实现，必须 clone 整个仓库。所以编写了这个脚本，用来提取仓库中的部分文件。
import requests
from bs4 import BeautifulSoup
import os
# Module-level configuration shared by getfile(), getpath() and setrepname().

# URL segment placed between the repository name and the in-repo path when
# browsing a directory on github.com; it embeds the branch name.
tag = "/tree/master/"

# Default "<owner>/<repository>" to read from; setrepname() replaces it.
repname = "mahongquan/github-web-file-download"

# Base URL that getpath() prepends to a directory path to obtain its listing
# page.  It must point at github.com: the raw-content host has no "/tree/"
# pages, so the previous "https://raw.github.com/" prefix could never resolve.
reppath = "https://github.com/" + repname + tag

# Local directory under which getfile() recreates the downloaded tree.
outputpath = "."
def getfile(pathf):
    """Download one repository file and store it below ``outputpath``.

    The file is fetched from the raw-content host using the module globals
    ``repname`` (``"<owner>/<repository>"``) and ``tag`` (which carries the
    branch name), then written to ``outputpath/pathf``; missing parent
    directories are created.

    Args:
        pathf: Path of the file inside the repository, with ``/`` separators
            (for example ``"examples/foo/main.rs"``).

    Returns:
        None.  A non-200 response is reported on stdout and nothing is
        written, so an HTML error page is never saved as file content.
    """
    print("get file:"+pathf)
    # ``tag`` looks like "/tree/<branch>/"; raw URLs need only the branch part.
    branch = tag.strip("/").split("/", 1)[-1]
    # The previous code appended ``pathf`` twice and left out the repository
    # and branch, so the request could never hit the intended file.
    url = "https://raw.githubusercontent.com/" + repname + "/" + branch + "/" + pathf
    print(url)
    res = requests.get(url)
    if res.status_code != 200:
        print("download failed (HTTP %s): %s" % (res.status_code, url))
        return
    target = os.path.join(outputpath, *pathf.split("/"))
    folder = os.path.dirname(target)
    if folder:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(folder, exist_ok=True)
    # The context manager guarantees the handle is flushed and closed.
    with open(target, "wb") as out:
        out.write(res.content)
def _classify_row(row):
    """Return ``("dir" | "file", href)`` for a listing row, or ``None`` to skip it."""
    cells = row.find_all("div", attrs={"role": "gridcell"})
    headers = row.find_all("div", attrs={"role": "rowheader"})
    if not cells or not headers:
        return None
    icon = cells[0].svg
    link = headers[0].a
    if icon is None or link is None:
        return None
    # Tag.get() yields None instead of raising when the attribute is absent.
    classes = icon.get("class") or []
    href = link.get("href")
    if len(classes) < 2 or not href:
        return None
    # The entry type is read from the second CSS class of the row's icon.
    if classes[1] == "octicon-file-directory":
        return "dir", href
    if classes[1] == "octicon-file":
        return "file", href
    return None


def getpath(path):
    """Recursively download every file below a repository directory.

    Fetches the github.com listing page for ``path``, downloads each file
    entry with ``getfile`` and then recurses into each sub-directory entry.
    The listing URL is built from the module globals ``repname`` and ``tag``.

    Args:
        path: Directory path inside the repository, ``/``-separated; the
            empty string selects the repository root.

    Returns:
        None.  If the page contains no ``role="grid"`` element a message is
        printed and the directory is skipped.
    """
    print("getpath:"+path)
    # One URL form serves both the root ("") and sub-directories.  The old
    # root branch produced "https://github.com/tree/tree/<branch>/<repo>".
    url = "https://github.com/" + repname + tag + path
    print(url)
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "html.parser")
    grids = soup.find_all("div", attrs={"role": "grid"})
    if not grids:
        print("no file listing found (HTTP %s): %s" % (res.status_code, url))
        return
    rows = grids[0].find_all("div", attrs={"role": "row"})
    fs = []
    paths = []
    # The first two rows are skipped, as before.
    # NOTE(review): presumably the header row and the ".." parent row - confirm.
    for row in rows[2:]:
        entry = _classify_row(row)
        if entry is None:
            continue
        kind, href = entry
        # Keep everything after the fifth "/"-separated segment of the href.
        # NOTE(review): assumes "/<owner>/<repo>/<tree|blob>/<branch>/<path>",
        # i.e. a branch name without "/" - confirm.
        relpath = "/".join(href.split("/")[5:])
        if kind == "dir":
            print("ispath")
            print(relpath)
            paths.append(relpath)
        else:
            print("is file")
            fs.append(relpath)
    print(fs,paths)
    for f in fs:
        print(f)
        getfile(f)
    for p in paths:
        getpath(p)
def setrepname(nm):
    """Select the repository to download and update the dependent globals.

    Sets ``repname`` to ``nm``, ``outputpath`` to the repository part of
    ``nm`` (the text after the first ``/``) and ``reppath`` to the github.com
    base URL of the repository's directory listings.

    Args:
        nm: Repository identifier in ``"<owner>/<repository>"`` form.

    Raises:
        IndexError: If ``nm`` contains no ``/``.
    """
    global repname
    global reppath
    global outputpath
    repname = nm
    outputpath = nm.split("/")[1]
    # ``tag`` already starts with "/tree/"; prefixing another "/tree" produced
    # ".../tree/tree/<branch>/", which is not a valid listing URL.
    reppath = "https://github.com/" + repname + tag
def main():
    """Download the ``examples`` directory of ``gtk-rs/gtk4-rs``."""
    repository = "gtk-rs/gtk4-rs"
    # Sub-directory to fetch; earlier runs used "js/src",
    # "assets/js/vendor" and "Resources" here instead.
    subdirectory = "examples"
    setrepname(repository)
    getpath(subdirectory)


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()