有时只想下载 GitHub 上某个大项目下面的一个文件夹,真心麻烦,还要把整个项目 clone 下来,当然也有使用 SVN 项目检出功能的。本人不喜欢安装太多软件,就用 Python 写了一个脚本来完成。
import os
import sys
import time
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup
class DownloadDir(object):
    """Download one directory of a GitHub repository by scraping its web pages.

    Starting from ``repoUrl`` the downloader fetches each page, decides
    whether it is a directory listing or a single-file view, recurses into
    listings and writes file views below ``storDir``.

    NOTE(review): the CSS selectors used here (``.file-navigation
    .breadcrumb``, ``.file .data``, ``.file-wrap .files
    tr.js-navigation-item``, ...) are tied to the page markup this script
    was written against -- confirm they still match the current site.
    """

    def __init__(self, storDir, repoUrl):
        """Remember the target directory and start URL and open an HTTP session.

        Args:
            storDir: Local directory under which downloaded files are written.
            repoUrl: URL of the repository directory page to start from.
        """
        self.repoUrl = repoUrl
        self.storDir = storDir
        self.sess = requests.Session()

    def run(self):
        """Download everything reachable from ``self.repoUrl``."""
        self._run(self.repoUrl)

    def _run(self, url):
        """Fetch ``url`` and either recurse into its entries or save the file.

        Args:
            url: Page URL of a directory listing or of a single-file view.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = self.sess.get(url)
        # Fail loudly instead of trying to parse an error page.
        response.raise_for_status()
        res = self.parseStruct(response.text)
        if isinstance(res, list):
            # Directory listing: visit every entry below this URL.
            base = url.rstrip('/')
            for entry in res:
                # Percent-encode the name so characters such as '#', '?' or
                # spaces do not corrupt the child URL.
                self._run(base + '/' + quote(str(entry[0])))
        else:
            # File view: a one-item dict {path: [fileType, content]}.
            filePath, (fileType, content) = res.popitem()
            self.writeContent(filePath, fileType, content)
        print(url)

    def _breadcrumbText(self, soup):
        """Return the stripped text of the page's breadcrumb element.

        Raises:
            IndexError: If the page has no ``.file-navigation .breadcrumb``.
        """
        crumbs = soup.select(".file-navigation .breadcrumb")
        if not crumbs:
            raise IndexError(
                "no '.file-navigation .breadcrumb' element found in page")
        return crumbs[0].text.strip()

    def _extractContent(self, soup):
        """Build the ``{path: [fileType, content]}`` dict from a parsed file page."""
        filePath = self._breadcrumbText(soup)
        image_nodes = soup.select(".file .data .image")
        if image_nodes:
            # Image view: keep the link target; the bytes are fetched later
            # by writeContent.
            content = image_nodes[0].select("a")[0]["href"]
            fileType = 'url'
        else:
            content = soup.select(".file .data")[0].text
            fileType = 'text'
        return {filePath: [fileType, content]}

    def parseContent(self, html):
        """Parse a single-file page.

        Args:
            html: HTML source of the file view page.

        Returns:
            A one-item dict mapping the breadcrumb text to
            ``[fileType, content]``. ``fileType`` is ``'url'`` when the page
            contains a ``.file .data .image`` element (``content`` is then
            the ``href`` of its first link), otherwise ``'text'`` with
            ``content`` being the text of the first ``.file .data`` element.

        Raises:
            IndexError: If an expected element is missing from the page.
        """
        self.bs = BeautifulSoup(html, "html.parser")
        return self._extractContent(self.bs)

    def parseStruct(self, html):
        """Parse a page that is either a directory listing or a file view.

        Args:
            html: HTML source of the page.

        Returns:
            For a file view (the page has a ``.file .data`` element) the
            dict produced by ``parseContent``. Otherwise a list of
            ``[name, isFile, currDir]`` entries, one per listing row that
            has a ``td.content`` cell.

        Raises:
            IndexError: If the breadcrumb element is missing from the page.
        """
        self.bs = BeautifulSoup(html, "html.parser")
        if self.bs.select(".file .data"):
            # Reuse the soup that was just built instead of parsing twice.
            return self._extractContent(self.bs)
        currDir = self._breadcrumbText(self.bs)
        struct = []
        for node in self.bs.select(".file-wrap .files tr.js-navigation-item"):
            if not node.select("td.content"):
                continue
            try:
                name = node.select(".content")[0].text.strip()
            except (IndexError, AttributeError):
                # Malformed row: skip it, but do not swallow unrelated
                # errors such as KeyboardInterrupt.
                continue
            isFile = not node.select(".octicon-file-directory")
            struct.append([name, isFile, currDir])
        return struct

    def writeContent(self, filePath, fileType, content):
        """Write one downloaded file below ``self.storDir``.

        Args:
            filePath: Path of the file relative to ``self.storDir``.
            fileType: ``'url'`` to download ``content`` and store the raw
                bytes; any other value stores ``content`` as UTF-8 text.
            content: The text to write, or the URL to download from.

        Raises:
            requests.HTTPError: If downloading ``content`` fails.
            OSError: If the directory or file cannot be created.
        """
        filePath = os.path.join(self.storDir, filePath)
        print(filePath)
        data = content
        binary = fileType == 'url'
        if binary:
            # The link may be relative; resolve it against the start URL.
            res = self.sess.get(urljoin(self.repoUrl, content))
            res.raise_for_status()
            data = res.content
        parent = os.path.dirname(filePath)
        if parent:
            # An empty dirname means "current directory": nothing to create.
            os.makedirs(parent, exist_ok=True)
        if binary:
            with open(filePath, 'wb') as f:
                f.write(data)
        else:
            # Explicit encoding so the result does not depend on the
            # platform's default code page.
            with open(filePath, 'w', encoding='utf-8') as f:
                f.write(data)
if __name__ == "__main__":
    # Usage: python <script> [storDir] [repoUrl]
    # Both arguments are optional; without them the script uses the
    # built-in example target directory and repository URL below.
    storDir = sys.argv[1] if len(sys.argv) > 1 else "c:\\"
    repoUrl = (
        sys.argv[2]
        if len(sys.argv) > 2
        else "https://github.com/baoboa/pyqt5/tree/master/examples"
    )
    dd = DownloadDir(storDir, repoUrl)
    dd.run()