1、前言:最近做项目需要调研GitHub上各项目存在的相关issues,所以根据自己的需要写了一个爬虫,在此记录一下👀
2、环境:python3.7、lxml(支持HTML、XML解析)、requests(HTTP库)
3、思路:
(1)先根据关键词获取相关的项目列表
(2)再获取每个项目的issues列表
(3)最后获取每个issue的内容
4、代码:
# -*- coding: utf-8 -*-
# @Time : 2020/09/25
# @Author : loadding...
# @File : reptile_github.py
# @Software : jupyter
from lxml import etree
import requests
def get_repos_list(key_words, max_pages=99):
    """Collect repository paths from GitHub's repository search pages.

    Args:
        key_words: Search query. It is passed through ``params`` so that
            ``requests`` URL-encodes spaces and characters such as ``&``
            or ``#`` instead of letting them corrupt the query string.
        max_pages: Upper bound on the number of result pages requested.
            Defaults to 99, the number of pages the fixed loop used to visit.

    Returns:
        A list of repository hrefs (for example ``/combust/mleap``) in the
        order they appear in the search results.
    """
    repos_list = []
    for page in range(1, max_pages + 1):
        response = requests.get(
            'https://github.com/search',
            params={'p': page, 'q': key_words, 'type': 'repositories'},
            timeout=30,
        )
        tree = etree.HTML(response.text)
        # Hyperlinks of the repositories listed on this result page.
        hrefs = tree.xpath('//*[@class="f4 text-normal"]/a/@href')
        if not hrefs:
            # No links on this page: later pages will not have any either,
            # so stop instead of issuing the remaining requests.
            break
        repos_list += hrefs
    return repos_list
def get_issues_list(repo_name):
    """Collect the issue links of a single repository.

    Args:
        repo_name: Repository path such as ``/combust/mleap``.

    Returns:
        A ``(number, issues_list)`` tuple. ``number`` is the issue count read
        from the counter on the repository's issues page: 0 when the counter
        is missing or empty, 1000 when it is not a plain integer (the
        abbreviated form shown for large counts). ``issues_list`` holds issue
        hrefs such as ``/combust/mleap/issues/716``.
    """
    per_page = 25  # the listing is paged 25 issues at a time
    base_url = 'https://github.com' + repo_name + '/issues'

    response = requests.get(base_url, timeout=30)
    tree = etree.HTML(response.text)
    counter = tree.xpath('//*[@id="js-repo-pjax-container"]/div[1]/nav/ul/li[2]/a/span[2]')
    # The counter element may be absent, have no text, or carry padding
    # whitespace; all of those must not crash or be mistaken for "1k+".
    raw_count = counter[0].text if counter else None
    raw_count = (raw_count or '').strip() or '0'
    # Anything that is not a plain integer is capped at 1000 issues.
    number = int(raw_count) if raw_count.isdecimal() else 1000
    print(number)

    # Ceiling division in integer arithmetic.
    page_count = (number + per_page - 1) // per_page

    issues_list = []
    for page in range(1, page_count + 1):
        response = requests.get(base_url, params={'page': page}, timeout=30)
        tree = etree.HTML(response.text)
        hrefs = tree.xpath('//*[@class="d-block d-md-none position-absolute top-0 bottom-0 left-0 right-0"]/@href')
        if not hrefs:
            # An empty page means the listing is exhausted (the 1000 cap can
            # overshoot the real count), so skip the remaining requests.
            break
        issues_list += hrefs
    return number, issues_list
def get_issue_content(issue_name):
    """Fetch an issue page and return the text of its first table cell.

    Args:
        issue_name: Issue path such as ``/combust/mleap/issues/716``.

    Returns:
        The concatenated text of the first ``<td>`` found inside a
        ``<table>`` on the page (presumably the opening post of the issue),
        or an empty string when the page contains no such cell.
    """
    response = requests.get('https://github.com' + issue_name, timeout=30)
    tree = etree.HTML(response.text)
    cells = tree.xpath('//table//td')
    if not cells:
        # Error pages have no table; return nothing rather than raising
        # IndexError and aborting the whole crawl.
        return ''
    return cells[0].xpath('string(.)')
if __name__ == '__main__':
    # Manual checks that were used while developing the helpers:
    #   get_repos_list('ML pipeline')
    #   get_issues_list('/combust/mleap')
    #   print(get_issue_content('/rust-lang/rust/issues/76833'))
    host = 'https://github.com'
    opening_rule = '>' * 100
    closing_rule = '<' * 100

    with open(r'D:\Jupyter_workspace\result.md', 'w+', encoding='utf-8') as f:
        key_words = input('please input a keyword:')
        # Each entry looks like /combust/mleap
        for repo in get_repos_list(key_words):
            repos_url = host + repo
            print(repos_url)
            f.write('\n\n' + repos_url + '\n')

            number, issues_list = get_issues_list(repo)
            f.write(str(number) + '\n')

            # Each entry looks like /combust/mleap/issues/716
            for issue in issues_list:
                content = get_issue_content(issue)
                issue_url = host + issue
                print(issue_url)
                record = [issue_url, opening_rule, str(content).strip(), closing_rule]
                f.write('\n'.join(record) + '\n')
                # Push each issue to disk so an interrupted run keeps its output.
                f.flush()
    print('The end!')
5、运行结果:
控制台输出结果:
生成的result.md文件内容如下:
完结撒花🌻
++++++++++++++++++++++++++++++++++++++++++++++分隔符+++++++++++++++++++++++++++++++++++++++++++++
上文爬取了repositories项目中issues的内容
下面是后续做的工作:直接根据关键字爬取issues,并添加了筛选条件(issue评论数、项目star数、issue更新时间)。代码如下,后续如果有别的爬取需求,结合这两个脚本修改应该就可以完成
代码:
# -*- coding: utf-8 -*-
# @Time : 2020/10/18
# @Author : loadding...
# @File : reptile_github_issues.py
# @Software : jupyter
from lxml import etree
import requests
import re
def get_issues_list(key_words, comments_num, star_num, need_datetime,
                    max_pages=10, language='Rust'):
    """Search GitHub issues by keyword and keep those passing three filters.

    An issue is kept when its comment count is greater than ``comments_num``,
    ``check_star`` accepts its repository for ``star_num``, and its date is
    later than ``need_datetime``.

    Args:
        key_words: Search query; it is URL-encoded before being sent.
        comments_num: Minimum comment count (exclusive).
        star_num: Minimum star count (exclusive), forwarded to ``check_star``.
        need_datetime: Date string ``yyyy-mm-dd``. It is compared as text with
            the first 10 characters of the issue's ``datetime`` attribute.
        max_pages: Number of search result pages to scan (10 by default).
        language: Value of the ``l`` search parameter (``'Rust'`` by default).

    Returns:
        A list of issue hrefs such as ``/rust-lang/rust/issues/73904``.
    """
    from urllib.parse import urlencode

    # One <span> per search result that shows a comment count.
    comment_span_xpath = '//*[@class="d-flex text-small text-gray flex-wrap position-relative"]/span'

    issues_list = []
    for page in range(1, max_pages + 1):
        query = urlencode({'l': language, 'p': page, 'q': key_words, 'type': 'Issues'})
        search_url = 'https://github.com/search?' + query
        print("issues_url_list:", search_url)
        response = requests.get(search_url, timeout=30)
        tree = etree.HTML(response.text)

        for span in tree.xpath(comment_span_xpath):
            # Resolve the link relative to this span so that link, comment
            # count and date always belong to the same search result.
            hrefs = span.xpath('../../*[@class="f4 text-normal"]/a/@href')
            count_match = re.search(r'\d[\d,]*', span.text or '')
            if not hrefs or count_match is None:
                continue
            href = hrefs[0]
            comment_text = count_match.group()
            print("issue url:", 'https://github.com' + href)
            print("issue comments:", comment_text)

            if int(comment_text.replace(',', '')) <= comments_num:
                continue
            # check_star downloads the issue page, so it only runs for issues
            # that already passed the cheaper comment filter.
            if not check_star(href, star_num):
                continue

            stamps = span.xpath('../div/relative-time/@datetime')
            if not stamps:
                continue
            issue_datetime = stamps[0][:10]
            print("issue datetime:", issue_datetime)
            print("need_datetime:", need_datetime)
            # Both values are yyyy-mm-dd text, so string order is date order.
            if issue_datetime > need_datetime:
                print("满足条件,爬取!")
                issues_list.append(href)
    return issues_list
def check_comment(issue_name):
    """Placeholder for filtering an issue by its comment count.

    The comment filter is applied inside ``get_issues_list`` so the page
    source does not have to be fetched a second time; this function does
    nothing and returns ``None``.
    """
    return None
def check_star(issues_name, star_num):
    """Tell whether an issue's repository has more than ``star_num`` stars.

    The star count is read with a regular expression from the source of the
    issue page itself.

    Args:
        issues_name: Issue path such as ``/rust-lang/rust/issues/73904``.
        star_num: Minimum star count (exclusive).

    Returns:
        ``True`` when the star count found on the page is greater than
        ``star_num``; ``False`` otherwise, including when no star count can
        be found in the page source.
    """
    response = requests.get('https://github.com' + issues_name, timeout=30)
    # "user[s]?" because the label is singular when there is exactly one star.
    match = re.search(r'"(\d+) user[s]? starred this repository"', response.text)
    if match is None:
        # No star label on the page: treat the repository as not qualifying
        # instead of raising IndexError.
        return False
    star = match.group(1)
    print("repositories star:", star)
    return int(star) > star_num
def check_time(issues_name):
    """Placeholder for filtering an issue by its update time.

    The time filter is applied inside ``get_issues_list`` to keep the run
    fast; this function does nothing and returns ``None``.
    """
    return None
if __name__ == '__main__':
    # Read the search keyword and the three filter thresholds, falling back
    # to a default whenever an input is malformed, then run the search.
    from datetime import datetime

    key_words = input('keyword:')

    # Comment-count threshold. isdecimal() accepts exactly the digit strings
    # that int() can convert.
    comments_num = input('comment number(>):')
    if comments_num.isdecimal():
        comments_num = int(comments_num)
    else:
        comments_num = 5
        print("input error! comments default 5!")

    # Star-count threshold.
    star_num = input('star number(>):')
    if star_num.isdecimal():
        star_num = int(star_num)
    else:
        star_num = 1000
        print("input error! star default 1000!")

    # Date threshold. The regex enforces the zero-padded yyyy-mm-dd shape
    # that the later string comparison relies on; strptime then rejects
    # impossible dates such as 2020-13-45.
    need_datetime = input('after datetime(yyyy-mm-dd):')
    date_is_valid = re.fullmatch(r'[0-9]{4}-[0-9]{2}-[0-9]{2}', need_datetime) is not None
    if date_is_valid:
        try:
            datetime.strptime(need_datetime, '%Y-%m-%d')
        except ValueError:
            date_is_valid = False
    if not date_is_valid:
        need_datetime = '2016-09-01'
        print("input error! default 2016-09-01")

    issues_list = get_issues_list(key_words, comments_num, star_num, need_datetime)
    print('The final issues_list is:')
    if issues_list:
        print("共爬取了"+str(len(issues_list))+"个issues,url如下:")
        for issue in issues_list:
            # Concatenate rather than pass two arguments: print() would put
            # a space between them.
            print('https://github.com' + issue)
    else:
        print("未爬取到issues,请修改筛选条件后重试!")
运行结果:
keyword:unsafe
comment number(>):10
star number(>):30000
after datetime(yyyy-mm-dd):2016-12-12
issues_url_list: https://github.com/search?l=Rust&p=1&q=unsafe&type=Issues
issue url: https://github.com/rust-lang/rust/pull/78002
issue comments: 9
issue url: https://github.com/kaist-cp/rv6/issues/219
issue comments: 2
issue url: https://github.com/sfackler/r2d2/pull/112
issue comments: 5
issue url: https://github.com/rust-lang/rust/pull/76676
issue comments: 30
repositories star: 48961
issue datetime: 2020-09-13
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/rusoto/rusoto/pull/1836
issue comments: 1
issue url: https://github.com/mgeisler/textwrap/issues/210
issue comments: 2
issues_url_list: https://github.com/search?l=Rust&p=2&q=unsafe&type=Issues
issue url: https://github.com/rust-lang/rust/pull/75115
issue comments: 21
repositories star: 48961
issue datetime: 2020-08-03
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/sotrh/learn-wgpu/issues/109
issue comments: 3
issue url: https://github.com/rust-lang/rust/pull/74477
issue comments: 37
repositories star: 48961
issue datetime: 2020-07-18
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/rust-analyzer/rust-analyzer/issues/5996
issue comments: 5
issues_url_list: https://github.com/search?l=Rust&p=3&q=unsafe&type=Issues
issue url: https://github.com/rust-lang/rust/pull/74979
issue comments: 11
repositories star: 48961
issue datetime: 2020-07-31
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/rust-lang/rust/pull/73928
issue comments: 11
repositories star: 48961
issue datetime: 2020-07-01
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/rust-lang/rust/issues/73904
issue comments: 34
repositories star: 48961
issue datetime: 2020-06-30
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/VictorKoenders/factorio_with_physics/issues/1
issue comments: 1
issue url: https://github.com/zkcrypto/ff/pull/41
issue comments: 1
issue url: https://github.com/tikv/tikv/issues/8759
issue comments: 1
issues_url_list: https://github.com/search?l=Rust&p=4&q=unsafe&type=Issues
issue url: https://github.com/tikv/pprof-rs/issues/36
issue comments: 2
issue url: https://github.com/rust-lang/rust/issues/69270
issue comments: 3
issue url: https://github.com/rust-lang/rust/issues/76943
issue comments: 4
issue url: https://github.com/rust-analyzer/rust-analyzer/issues/5677
issue comments: 1
issue url: https://github.com/paritytech/substrate/issues/5418
issue comments: 4
issues_url_list: https://github.com/search?l=Rust&p=5&q=unsafe&type=Issues
issue url: https://github.com/rust-lang/rust/issues/74268
issue comments: 1
issue url: https://github.com/image-rs/image/issues/1340
issue comments: 1
issue url: https://github.com/alexcrichton/jobserver-rs/issues/25
issue comments: 1
issue url: https://github.com/musitdev/portmidi-rs/issues/33
issue comments: 7
issue url: https://github.com/bitflags/bitflags/issues/228
issue comments: 2
issue url: https://github.com/rust-lang/rust/issues/28179
issue comments: 21
repositories star: 48961
issue datetime: 2015-09-02
need_datetime: 2016-12-12
issues_url_list: https://github.com/search?l=Rust&p=6&q=unsafe&type=Issues
issue url: https://github.com/woshilapin/website/pull/1
issue comments: 3
issue url: https://github.com/capnproto/capnproto-rust/issues/78
issue comments: 7
issue url: https://github.com/woboq/qmetaobject-rs/issues/69
issue comments: 1
issue url: https://github.com/rust-lang/rust/issues/71668
issue comments: 24
repositories star: 48961
issue datetime: 2020-04-29
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/bitflags/bitflags/issues/222
issue comments: 1
issues_url_list: https://github.com/search?l=Rust&p=7&q=unsafe&type=Issues
issue url: https://github.com/stepancheg/rust-protobuf/pull/387
issue comments: 15
repositories star: 1419
issue url: https://github.com/rust-analyzer/rust-analyzer/issues/5412
issue comments: 7
issue url: https://github.com/rust-lang/miri/issues/1574
issue comments: 6
issue url: https://github.com/tikv/tikv/issues/8685
issue comments: 2
issue url: https://github.com/droundy/arrayref/issues/18
issue comments: 4
issues_url_list: https://github.com/search?l=Rust&p=8&q=unsafe&type=Issues
issue url: https://github.com/rust-lang/rust/issues/74838
issue comments: 3
issue url: https://github.com/ebfull/pcap/issues/127
issue comments: 1
issue url: https://github.com/Keats/jsonwebtoken/issues/130
issue comments: 4
issue url: https://github.com/jonas-schievink/rubble/issues/122
issue comments: 4
issues_url_list: https://github.com/search?l=Rust&p=9&q=unsafe&type=Issues
issue url: https://github.com/uuid-rs/uuid/issues/488
issue comments: 1
issue url: https://github.com/jackmott/simdeez/issues/34
issue comments: 2
issue url: https://github.com/rust-lang/rust/issues/74840
issue comments: 40
repositories star: 48961
issue datetime: 2020-07-27
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/thaumant/levenshtein-perf-examples/issues/1
issue comments: 2
issue url: https://github.com/stepancheg/rust-protobuf/issues/499
issue comments: 1
issue url: https://github.com/hyperledger/indy-sdk/pull/2115
issue comments: 1
issue url: https://github.com/bodoni/svg/issues/6
issue comments: 10
issue url: https://github.com/rust-lang/rust/issues/59795
issue comments: 3
The final issues_list is:
共爬取了8个issues,url如下:
https://github.com/rust-lang/rust/pull/76676
https://github.com/rust-lang/rust/pull/75115
https://github.com/rust-lang/rust/pull/74477
https://github.com/rust-lang/rust/pull/74979
https://github.com/rust-lang/rust/pull/73928
https://github.com/rust-lang/rust/issues/73904
https://github.com/rust-lang/rust/issues/71668
https://github.com/rust-lang/rust/issues/74840
注:如果不需要多余的输出、想减少运行时间,直接修改脚本(去掉相应的print语句)即可