不知从何时起,年度报告总能在旧年将近、新年伊始的时候掀起朋友圈的热潮。
近日,网易云音乐、知乎等各大流行app也如期放出了其用户2019年的年度报告,我突发奇想,为何不为自己生成一个GitHub的年度代码报告?
绿油油的GitHub仿佛在诉说着咱们coder的岁岁年年。
本报告的数据来源于我 2019 年全年的 GitHub 代码提交记录,经简单的 Python 网络爬虫抓取与数据分析获得。
爬虫部分
这里放一些核心的代码块,包含按月爬取代码提交情况、按日爬取代码提交情况、获取所有仓库及其对应id,以及按仓库爬取代码提交情况~
def get_month(self):
    """
    Crawl the commit activity month by month and store it.

    For every period returned by ``self.generate()`` the page at
    ``self.url`` is requested with that period as the ``from``/``to``
    range. Repository link texts and contribution-count texts are scraped
    from the response, and one document per scraped count is inserted
    into ``self.db['month']`` with the keys ``month`` (1-based position
    of the period), ``repo`` and ``count``.
    """
    periods = self.generate()
    for month_index, period in enumerate(periods):
        range_start = period[0]
        range_end = period[1]
        query = {
            'from': range_start,
            'to': range_end,
            'tab': 'overview',
            'include_header': 'no',
            'button': '',
            'utf8': '✓'
        }
        response = requests.get(url=self.url, headers=self.headers, params=query)
        page = etree.HTML(response.text)
        link_texts = page.xpath('//a[@data-hovercard-type="repository"]/text()')
        count_texts = page.xpath('//a[@class="f6 muted-link ml-lg-1 mt-1 mt-lg-0 d-block d-lg-inline "]/text()')
        # Drop the first nine characters of every link text
        # (presumably the "owner/" prefix -- TODO confirm).
        repo_names = [text[9:] for text in link_texts]
        # Keep only the first match of self.regex in each count text.
        commit_counts = [self.regex.findall(text)[0] for text in count_texts]
        for position, commits in enumerate(commit_counts):
            record = {
                'month': month_index + 1,
                'repo': repo_names[position],
                'count': commits
            }
            self.db['month'].insert_one(record)
def get_day(self):
    """
    Crawl the per-day commit counts and store them.

    The page at ``self.url`` is fetched once; every ``data-count`` value
    and every ``data-date`` value is pulled out of the raw HTML with a
    regular expression, and the two lists are paired by position. One
    document with the keys ``day`` and ``count`` is inserted into
    ``self.db['day']`` for each extracted count.
    """
    html = requests.get(url=self.url, headers=self.headers).text
    daily_counts = re.findall('data-count="(.*?)" data-date', html)
    days = re.findall('data-date="(.*?)"/>', html)
    for position, commits in enumerate(daily_counts):
        record = {
            'day': days[position],
            'count': commits
        }
        self.db['day'].insert_one(record)
def get_repo_id(self):
    """
    Collect every repository together with its id.

    The ``?tab=repositories`` page of ``self.url`` is scraped for
    repository links. Each repository page is then requested with a
    randomly chosen User-Agent from ``self.user_agent``, and the second
    link in its ``flex-auto f6 mr-3`` div is read.

    Returns:
        A dict mapping ``href[10:-48]`` to ``href[-40:]`` of that link
        (presumably repository name -> 40-character commit hash, with
        the slice tuned to a 10-character owner prefix -- TODO confirm).
        Repositories whose page cannot be parsed are skipped after
        printing the error.
    """
    listing = requests.get(url=self.url + '?tab=repositories', headers=self.headers)
    hrefs = etree.HTML(listing.text).xpath('//div[@class="d-inline-block mb-1"]/h3/a/@href')
    print(hrefs)
    repo_ids = {}
    for repo_url in ['https://github.com' + href for href in hrefs]:
        agent_headers = {'User-Agent': random.choice(self.user_agent)}
        detail = requests.get(url=repo_url, headers=agent_headers)
        page = etree.HTML(detail.text)
        try:
            commit_href = page.xpath('//div[@class="flex-auto f6 mr-3"]/a[2]/@href')[0]
            repo_ids[commit_href[10:-48]] = commit_href[-40:]
        except Exception as err:
            # Best effort: report the failure and move on to the next repository.
            print(err)
        # Pause between repository requests whether or not parsing succeeded.
        time.sleep(5)
    return repo_ids
def get_commit(self):
    """
    Crawl the commit history repository by repository and store it.

    For every (name, id) pair returned by ``self.get_repo_id()`` the
    ``commits/master`` page of that repository is scraped for commit
    timestamps and commit titles. When the first page does not contain
    exactly two matching pagination ``<button>`` elements, follow-up
    pages are requested through the ``after`` query parameter until a
    page's first matching button reads ``Older``. One document per
    collected timestamp is inserted into ``self.db['time']``.
    """
    time_query = '//relative-time[@class="no-wrap"]/@datetime'
    button_query = '//button[@class="btn btn-outline BtnGroup-item"]/text()'
    title_query = '//p[@class="commit-title h5 mb-1 text-gray-dark "]/a/text()'

    def scrape(html):
        # Parse one commits page into (timestamps, button texts, titles).
        page = etree.HTML(html)
        return page.xpath(time_query), page.xpath(button_query), page.xpath(title_query)

    # Mapping of repository name -> 40-character hex id, e.g.
    # {'Emojis': 'fe3b56bc1ebada24502030257fb92c2a46a12969', ...}
    repo_ids = self.get_repo_id()
    for name, head in repo_ids.items():
        commits_url = 'https://github.com/librauee/{}/commits/master'.format(name)
        first_page = requests.get(commits_url, headers=self.headers)
        found_times, buttons, found_titles = scrape(first_page.text)
        timestamps = list(found_times)
        titles = list(found_titles)
        if len(buttons) != 2:
            # NOTE(review): the first follow-up request uses offset
            # 35*0-1 == -1; confirm it does not re-fetch the first page.
            page_no = 0
            while True:
                query = {
                    'after': '{} {}'.format(head, 35 * page_no - 1),
                    '_pjax': '#js-repo-pjax-container'
                }
                next_page = requests.get(commits_url, headers=self.headers, params=query)
                found_times, buttons, found_titles = scrape(next_page.text)
                timestamps.extend(found_times)
                titles.extend(found_titles)
                # Stop once the first matching button on the page is "Older".
                if buttons and buttons[0] == 'Older':
                    break
                page_no += 1
        for position, stamp in enumerate(timestamps):
            # NOTE(review): the key is spelled 'cotent' (probably meant
            # 'content'); kept as-is because stored data depends on it.
            record = {
                'repo': name,
                'commit_time': stamp,
                'cotent': titles[position]
            }
            self.db['time'].insert_one(record)
此报告的生成结合了小李同学的想象力与创造力,并由享誉海内外的高端艺术设计师杰泥亲自PS,在此对她的倾情奉献表示衷心的感谢!
2020年的第一缕阳光已经破云而出,让我们一起携手奋进!