通过网页抓取github仓库的部分文件

马红权

已于 2024-06-11 14:24:05 修改

阅读量2.1k

点赞数 1

分类专栏： python 文章标签： github python 脚本

于 2014-02-15 10:17:02 首次发布

本文链接：https://blog.csdn.net/mahongquan/article/details/19234627

版权

python 专栏收录该内容

17 篇文章 0 订阅

订阅专栏

有些 GitHub 仓库非常大，如果网络条件不好、只想下载其中的部分文件，用 git 很难做到，通常必须 clone 整个仓库。所以编写了这个脚本，通过抓取网页来提取仓库中的部分文件。

点击打开github

import requests
from bs4 import BeautifulSoup
import os
# Branch selector that the URL-building code splices into GitHub URLs.
tag = "/tree/master/"
# Default repository, written as "<owner>/<name>"; replaced by setrepname().
repname = "mahongquan/github-web-file-download"
# Base URL that getpath() prefixes to a directory path; setrepname()
# overwrites it for the selected repository.
reppath = "{}{}{}".format("https://raw.github.com/", repname, tag)
# Local directory below which getfile() stores the downloaded files.
outputpath = "."
def getfile(pathf):
    """Download one repository file and store it below ``outputpath``.

    The file is requested from ``raw.githubusercontent.com`` using the
    module-level ``repname`` and the branch named by ``tag``, then written
    to ``outputpath/pathf``. Missing parent directories are created.

    Args:
        pathf: Path of the file relative to the repository root, with
            ``/`` separators (for example ``"examples/demo/main.rs"``).

    Returns:
        None. A non-success HTTP response is reported on stdout and the
        file is skipped, so an error page is never saved as file content.
    """
    print("get file:" + pathf)
    # ``tag`` may be spelled "/master/" or "/tree/master/"; a raw-content
    # URL needs only the branch name.
    branch = tag.strip("/")
    if branch.startswith("tree/"):
        branch = branch[len("tree/"):]
    # The URL must contain the repository and branch exactly once; the
    # previous code omitted both and appended ``pathf`` twice.
    url = "https://raw.githubusercontent.com/" + repname + "/" + branch + "/" + pathf
    print(url)
    res = requests.get(url)
    if not res.ok:
        print("skip " + pathf + ": HTTP " + str(res.status_code))
        return
    target = os.path.join(outputpath, *pathf.split("/"))
    # ``or "."`` covers a target without any directory component.
    os.makedirs(os.path.dirname(target) or ".", exist_ok=True)
    # The context manager closes the handle even if the write fails.
    with open(target, "wb") as out:
        out.write(res.content)
def getpath(path):
    """Recursively download every file below one repository directory.

    The GitHub web page listing ``path`` is fetched and parsed. Each row
    whose icon marks a file is passed to ``getfile``; each row whose icon
    marks a directory is processed by a recursive ``getpath`` call.

    Args:
        path: Directory relative to the repository root, with ``/``
            separators. An empty string selects the repository root.

    Returns:
        None. If the page contains no ``role="grid"`` listing, a message
        is printed and nothing is downloaded.
    """
    print("getpath:" + path)
    # ``tag`` may be spelled "/master/" or "/tree/master/"; keep only the
    # branch so the URL contains a single "/tree/" segment.
    branch = tag.strip("/")
    if branch.startswith("tree/"):
        branch = branch[len("tree/"):]
    # Listing pages live at github.com/<owner>/<repo>/tree/<branch>[/<dir>].
    # The URL is built from ``repname`` so it is correct for the root too.
    url = "https://github.com/" + repname + "/tree/" + branch
    path = path.strip("/")
    if path != "":
        url = url + "/" + path
    print(url)
    res = requests.get(url)

    soup = BeautifulSoup(res.content, "html.parser")
    tbs = soup.find_all("div", attrs={"role": "grid"})
    if not tbs:
        # Error page or changed markup: report it instead of raising
        # IndexError on tbs[0].
        print("no file listing found at " + url)
        return
    rs = tbs[0].find_all("div", attrs={"role": "row"})
    print(len(rs))
    fs = []
    paths = []
    # The first two rows are skipped, as the scraper has always done.
    # NOTE(review): presumably the header row and the ".." parent link;
    # confirm against the live markup, especially for the repository root.
    for r in rs[2:]:
        cs = r.find_all("div", attrs={"role": "gridcell"})
        rh = r.find_all("div", attrs={"role": "rowheader"})
        # Rows without an icon cell or without a link carry no entry.
        if not cs or cs[0].svg is None:
            continue
        if not rh or rh[0].a is None:
            continue
        # .get() returns None for a missing attribute instead of raising.
        cls = cs[0].svg.get("class")
        href = rh[0].a.get("href")
        print("class=" + str(cls))
        if not cls or not href:
            continue
        # Membership test instead of cls[1]: no IndexError on a single
        # class, and the marker class may sit at any position.
        if "octicon-file-directory" in cls:
            print("ispath")
            # Drop the leading "/<owner>/<repo>/tree/<branch>/" part.
            childpath = "/".join(href.split("/")[5:])
            print(childpath)
            paths.append(childpath)
        elif "octicon-file" in cls:
            print("is file")
            fs.append(href)
    print(fs, paths)

    for f in fs:
        print(f)
        # Drop the leading "/<owner>/<repo>/blob/<branch>/" part.
        getfile("/".join(f.split("/")[5:]))
    for p in paths:
        getpath(p)
def setrepname(nm):
    """Select the repository to download and update the dependent globals.

    Sets ``repname`` to ``nm``, ``outputpath`` to the repository name (the
    part after the owner), and ``reppath`` to the base URL of the
    repository's directory listing pages, ending in ``/``.

    Args:
        nm: Repository in ``"<owner>/<name>"`` form.

    Raises:
        IndexError: If ``nm`` contains no ``/``.
    """
    global repname
    global reppath
    global outputpath
    repname = nm
    outputpath = nm.split("/")[1]
    # ``tag`` may already start with "tree/"; strip it so the URL has a
    # single "/tree/" segment rather than "/tree/tree/<branch>/".
    branch = tag.strip("/")
    if branch.startswith("tree/"):
        branch = branch[len("tree/"):]
    reppath = "https://github.com/" + repname + "/tree/" + branch + "/"
def main():
    """Fetch the ``examples`` directory of the ``gtk-rs/gtk4-rs`` repository.

    To download something else, change the repository and sub-directory
    below. Directories used with this script before include ``js/src``,
    ``assets/js/vendor`` and ``Resources``.
    """
    repository = "gtk-rs/gtk4-rs"
    subdirectory = "examples"
    setrepname(repository)
    getpath(subdirectory)


if __name__ == "__main__":
    main()

马红权

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
1
评论
通过网页抓取github仓库的部分文件

有些github仓库非常大，如果只想下载部分文件，用 git无法实现。所以编写了这个脚本，提取仓库的部分文件。点击打开githubimport requestsfrom bs4 import BeautifulSoupimport ospath="https://raw.github.com/mahongquan/OpenBird/master/"def savefil
复制链接

扫一扫