python3爬取github项目issues

1、前言:最近做项目需要调研github上项目存在的相关issues,所以就根据自己需要写了一个爬虫,仅此记录一下👀

2、环境:python3.7、lxml(支持HTML、XML解析)、requests(HTTP库)

3、思路:1、先根据关键词获取相关的项目列表

                2、再获取每个项目的issues列表

                3、再获取每个issue的内容

4、代码:

# -*- coding: utf-8 -*-
# @Time : 2020/09/25
# @Author : loadding...
# @File : reptile_github.py
# @Software : jupyter
 
from lxml import etree
import requests
 
#根据关键词获取项目列表
def get_repos_list(key_words):
    """Collect repository paths matching a keyword from GitHub search.

    Iterates search-result pages 1..99 and extracts repository links
    via XPath.

    Args:
        key_words: search keyword, inserted into the query URL as-is.

    Returns:
        list of repository href paths, e.g. '/combust/mleap'.
    """
    repos_list = []
    for page in range(1, 100):
        url = ('https://github.com/search?p=' + str(page)
               + '&q=' + key_words + '&type=repositories')
        response = requests.get(url)
        tree = etree.HTML(response.text)
        # Repository links sit under elements with class "f4 text-normal".
        hrefs = tree.xpath('//*[@class="f4 text-normal"]/a/@href')
        repos_list += hrefs
    # BUG FIX: the original `return` was inside the loop, so only the
    # first search page was ever scraped. Return after all pages.
    return repos_list

#获取一个项目的issues列表
def get_issues_list(repo_name):
    """Return (issue_count, issue_paths) for one repository.

    repo_name is a path like '/combust/mleap'. Reads the issue counter
    badge from the repository's issues tab, then walks the paginated
    listing (25 issues per page). Counts over 1K (non-numeric badge
    text) are capped at 1000.
    """
    listing_url = 'https://github.com' + repo_name + '/issues'
    dom = etree.HTML(requests.get(listing_url).text)
    # The issue-counter badge; a missing badge means zero issues.
    badge = dom.xpath('//*[@id="js-repo-pjax-container"]/div[1]/nav/ul/li[2]/a/span[2]')
    count_text = badge[0].text if badge else '0'
    # Non-numeric text (e.g. "1.2k") is treated as 1000 issues.
    number = int(count_text) if count_text.isdigit() else 1000
    print(number)
    # Ceiling division: 25 issues per listing page.
    page_total = (number + 24) // 25
    issue_paths = []
    for page_no in range(1, page_total + 1):
        page_url = 'https://github.com' + repo_name + '/issues?page=' + str(page_no)
        page_dom = etree.HTML(requests.get(page_url).text)
        # Each issue row has an absolutely-positioned overlay link whose
        # href looks like /combust/mleap/issues/716.
        issue_paths += page_dom.xpath('//*[@class="d-block d-md-none position-absolute top-0 bottom-0 left-0 right-0"]/@href')
    # Return the issue count together with the collected paths.
    return number, issue_paths


#获取一个issue的内容及评论
def get_issue_content(issue_name):
    """Fetch the body text of a single issue.

    issue_name is a path like '/combust/mleap/issues/716'. Returns the
    plain-text content of the first <td> cell on the issue page.
    """
    page = requests.get('https://github.com' + issue_name)
    dom = etree.HTML(page.text)
    # string(.) flattens the first table cell (the issue body) to text.
    first_cell = dom.xpath('//table//td')[0]
    return first_cell.xpath('string(.)')

    
if __name__ == '__main__':
    # Crawl repositories matching a keyword and dump every issue body
    # into a markdown report, delimited by '>' / '<' rulers.
    with open(r'D:\Jupyter_workspace\result.md', 'w+', encoding='utf-8') as f:
        key_words = input('please input a keyword:')
        # Repository paths look like /combust/mleap
        for repo in get_repos_list(key_words):
            repos_url = 'https://github.com' + repo
            print(repos_url)
            f.write('\n\n')
            f.write(repos_url)
            f.write('\n')
            # Issue count plus the per-issue paths for this repository.
            number, issues_list = get_issues_list(repo)
            f.write(str(number))
            f.write('\n')
            # Issue paths look like /combust/mleap/issues/716
            for issue in issues_list:
                issue_url = 'https://github.com' + issue
                content = get_issue_content(issue)
                print(issue_url)
                f.write(issue_url)
                f.write('\n')
                f.write('>' * 100)
                f.write('\n')
                f.write(str(content).strip())
                f.write('\n')
                f.write('<' * 100)
                f.write('\n')
                # Flush per issue so partial results survive a crash.
                f.flush()
    print('The end!')
    

5、运行结果:

控制台输出结果:

生成的result.md文件内容如下:

完结撒花🌻

 

++++++++++++++++++++++++++++++++++++++++++++++分隔符+++++++++++++++++++++++++++++++++++++++++++++

上文爬取了repositories项目中issues的内容

下面是后续做的工作,直接关键字爬取issues,并添加了筛选条件(issue评论数、项目star数、issues更新时间),代码如下,后续如果有别的爬取需求结合这两个脚本修改应该就可以完成

代码:

# -*- coding: utf-8 -*-
# @Time : 2020/10/18
# @Author : loadding...
# @File : reptile_github_issues.py
# @Software : jupyter
 
from lxml import etree
import requests
import re

#根据关键词获取issues列表
def get_issues_list(key_words, comments_num, star_num, need_datetime):
    """Search GitHub issues (language filter: Rust) by keyword and filter.

    An issue is kept when it has more than comments_num comments, its
    repository has more than star_num stars (checked via check_star),
    and it was updated after need_datetime ('yyyy-mm-dd').

    Args:
        key_words: search keyword string.
        comments_num: minimum comment count (exclusive).
        star_num: minimum repository star count (exclusive).
        need_datetime: 'yyyy-mm-dd' cutoff; ISO dates compare correctly
            as plain strings.

    Returns:
        list of issue href paths, e.g. '/owner/repo/issues/716'.
    """
    issues_list = []
    # Scrape 10 search-result pages, 10 issues per page.
    # BUG FIX: range(1, 10) only covered 9 pages despite the stated 10.
    for i in range(1, 11):
        url = 'https://github.com/search?l=Rust&p=' + str(i) + '&q=' + key_words + '&type=Issues'
        print("issues_url_list:", url)
        page_source = requests.get(url).text
        tree = etree.HTML(page_source)
        # Issue links that have a comment counter beside them.
        arr = tree.xpath('//*[@class="d-flex text-small text-gray flex-wrap position-relative"]/span/../../*[@class="f4 text-normal"]/a/@href')
        # Last-update timestamps, aligned index-for-index with arr.
        issues_datetime = tree.xpath('//*[@class="d-flex text-small text-gray flex-wrap position-relative"]/span/../div/relative-time/@datetime')
        comments = tree.xpath('//*[@class="d-flex text-small text-gray flex-wrap position-relative"]/span')
        # BUG FIX: zip() replaces manual arr[j]/issues_datetime[j]
        # indexing, which raised IndexError on ragged result lists.
        for span, href, updated in zip(comments, arr, issues_datetime):
            # The comment count is embedded at a fixed offset in the
            # span text — fragile; TODO confirm against current markup.
            pos = span.text[11:15].strip()
            url = 'https://github.com' + href
            print("issue url:", url)
            print("issue comments:", pos)
            # BUG FIX: int(pos) crashed on non-numeric text; skip instead.
            if not pos.isdigit():
                continue
            if int(pos) > comments_num:
                # Repository star filter (fetches the issue page).
                if check_star(href, star_num):
                    issue_datetime = updated[:10]
                    print("issue datetime:", issue_datetime)
                    print("need_datetime:", need_datetime)
                    # ISO 'yyyy-mm-dd' strings compare chronologically.
                    if issue_datetime > need_datetime:
                        print("满足条件,爬取!")
                        issues_list.append(href)
    return issues_list

#根据issues comment数量筛选
def check_comment(issue_name):
    """Placeholder: comment-count filtering.

    Intentionally empty — the check runs inline in get_issues_list so the
    search page does not have to be fetched a second time.
    """
    # Kept inline in get_issues_list to avoid re-fetching the page source.
    pass

#根据项目star数筛选,爬的是issues内容页面
def _extract_star_count(page_source):
    """Parse the repository star count out of an issue page's HTML.

    Looks for the accessibility label '"N user(s) starred this
    repository"'. Returns the count as int, or None when the marker is
    absent from the page.
    """
    # 'user' has no trailing 's' when the count is exactly 1, hence [s]?.
    # Raw string avoids the invalid '\d' escape DeprecationWarning.
    matches = re.findall(r'"(\d+) user[s]? starred this repository"', page_source)
    return int(matches[0]) if matches else None


def check_star(issues_name, star_num):
    """Report whether the repository owning an issue exceeds star_num stars.

    Fetches the issue content page ('https://github.com' + issues_name)
    and parses the star count with a regex (the original XPath approach
    failed against this page).

    Returns:
        True when the star count is strictly greater than star_num,
        False otherwise.
    """
    url = 'https://github.com' + issues_name
    page_source = requests.get(url).text
    star = _extract_star_count(page_source)
    if star is None:
        # BUG FIX: the original did pattern.findall(...)[0] with no
        # empty-result guard and raised IndexError when nothing matched.
        return False
    print("repositories star:", star)
    # BUG FIX: the original returned None (falsy, but not False) when
    # the star threshold was not met.
    return star > star_num
#根据issues的更新时间筛选
def check_time(issues_name):
    """Placeholder: issue update-time filtering.

    Intentionally empty — the check runs inline in get_issues_list to
    avoid an extra page fetch.
    """
    # Kept inline in get_issues_list for speed.
    pass
    
if __name__ == '__main__':
    # Read and validate the filter inputs, then run the issue search.
    key_words = input('keyword:')
    # Minimum comment count; fall back to 5 on malformed input.
    comments_num = input('comment number(>):')
    if comments_num.isdigit():
        comments_num = int(comments_num)
    else:
        comments_num = 5
        print("input error! comments default 5!")
    # Minimum repository star count; fall back to 1000 on malformed input.
    star_num = input('star number(>):')
    if star_num.isdigit():
        star_num = int(star_num)
    else:
        star_num = 1000
        print("input error! star default 1000!")
    need_datetime = input('after datetime(yyyy-mm-dd):')
    # BUG FIX: raw string avoids the invalid '\d' escape
    # DeprecationWarning. Only the shape is checked, not calendar
    # validity (e.g. month <= 12).
    pattern = re.compile(r'^\d{4}-\d{2}-\d{2}$')
    result = pattern.findall(need_datetime)
    if len(result) < 1:
        # Malformed date: fall back to the default cutoff.
        need_datetime = '2016-09-01'
        print("input error! default 2016-09-01")
    # Run the search with all three filters applied.
    issues_list = get_issues_list(key_words, comments_num, star_num, need_datetime)
    print('The final issues_list is:')
    if len(issues_list) > 0:
        print("共爬取了" + str(len(issues_list)) + "个issues,url如下:")
        for i in issues_list:
            # '+' keeps the URL as one string; ',' would insert a space.
            print('https://github.com' + i)
    else:
        print("未爬取到issues,请修改筛选条件后重试!")

运行结果:

keyword:unsafe
comment number(>):10
star number(>):30000
after datetime(yyyy-mm-dd):2016-12-12
issues_url_list: https://github.com/search?l=Rust&p=1&q=unsafe&type=Issues
issue url: https://github.com/rust-lang/rust/pull/78002
issue comments: 9
issue url: https://github.com/kaist-cp/rv6/issues/219
issue comments: 2
issue url: https://github.com/sfackler/r2d2/pull/112
issue comments: 5
issue url: https://github.com/rust-lang/rust/pull/76676
issue comments: 30
repositories star: 48961
issue datetime: 2020-09-13
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/rusoto/rusoto/pull/1836
issue comments: 1
issue url: https://github.com/mgeisler/textwrap/issues/210
issue comments: 2
issues_url_list: https://github.com/search?l=Rust&p=2&q=unsafe&type=Issues
issue url: https://github.com/rust-lang/rust/pull/75115
issue comments: 21
repositories star: 48961
issue datetime: 2020-08-03
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/sotrh/learn-wgpu/issues/109
issue comments: 3
issue url: https://github.com/rust-lang/rust/pull/74477
issue comments: 37
repositories star: 48961
issue datetime: 2020-07-18
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/rust-analyzer/rust-analyzer/issues/5996
issue comments: 5
issues_url_list: https://github.com/search?l=Rust&p=3&q=unsafe&type=Issues
issue url: https://github.com/rust-lang/rust/pull/74979
issue comments: 11
repositories star: 48961
issue datetime: 2020-07-31
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/rust-lang/rust/pull/73928
issue comments: 11
repositories star: 48961
issue datetime: 2020-07-01
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/rust-lang/rust/issues/73904
issue comments: 34
repositories star: 48961
issue datetime: 2020-06-30
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/VictorKoenders/factorio_with_physics/issues/1
issue comments: 1
issue url: https://github.com/zkcrypto/ff/pull/41
issue comments: 1
issue url: https://github.com/tikv/tikv/issues/8759
issue comments: 1
issues_url_list: https://github.com/search?l=Rust&p=4&q=unsafe&type=Issues
issue url: https://github.com/tikv/pprof-rs/issues/36
issue comments: 2
issue url: https://github.com/rust-lang/rust/issues/69270
issue comments: 3
issue url: https://github.com/rust-lang/rust/issues/76943
issue comments: 4
issue url: https://github.com/rust-analyzer/rust-analyzer/issues/5677
issue comments: 1
issue url: https://github.com/paritytech/substrate/issues/5418
issue comments: 4
issues_url_list: https://github.com/search?l=Rust&p=5&q=unsafe&type=Issues
issue url: https://github.com/rust-lang/rust/issues/74268
issue comments: 1
issue url: https://github.com/image-rs/image/issues/1340
issue comments: 1
issue url: https://github.com/alexcrichton/jobserver-rs/issues/25
issue comments: 1
issue url: https://github.com/musitdev/portmidi-rs/issues/33
issue comments: 7
issue url: https://github.com/bitflags/bitflags/issues/228
issue comments: 2
issue url: https://github.com/rust-lang/rust/issues/28179
issue comments: 21
repositories star: 48961
issue datetime: 2015-09-02
need_datetime: 2016-12-12
issues_url_list: https://github.com/search?l=Rust&p=6&q=unsafe&type=Issues
issue url: https://github.com/woshilapin/website/pull/1
issue comments: 3
issue url: https://github.com/capnproto/capnproto-rust/issues/78
issue comments: 7
issue url: https://github.com/woboq/qmetaobject-rs/issues/69
issue comments: 1
issue url: https://github.com/rust-lang/rust/issues/71668
issue comments: 24
repositories star: 48961
issue datetime: 2020-04-29
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/bitflags/bitflags/issues/222
issue comments: 1
issues_url_list: https://github.com/search?l=Rust&p=7&q=unsafe&type=Issues
issue url: https://github.com/stepancheg/rust-protobuf/pull/387
issue comments: 15
repositories star: 1419
issue url: https://github.com/rust-analyzer/rust-analyzer/issues/5412
issue comments: 7
issue url: https://github.com/rust-lang/miri/issues/1574
issue comments: 6
issue url: https://github.com/tikv/tikv/issues/8685
issue comments: 2
issue url: https://github.com/droundy/arrayref/issues/18
issue comments: 4
issues_url_list: https://github.com/search?l=Rust&p=8&q=unsafe&type=Issues
issue url: https://github.com/rust-lang/rust/issues/74838
issue comments: 3
issue url: https://github.com/ebfull/pcap/issues/127
issue comments: 1
issue url: https://github.com/Keats/jsonwebtoken/issues/130
issue comments: 4
issue url: https://github.com/jonas-schievink/rubble/issues/122
issue comments: 4
issues_url_list: https://github.com/search?l=Rust&p=9&q=unsafe&type=Issues
issue url: https://github.com/uuid-rs/uuid/issues/488
issue comments: 1
issue url: https://github.com/jackmott/simdeez/issues/34
issue comments: 2
issue url: https://github.com/rust-lang/rust/issues/74840
issue comments: 40
repositories star: 48961
issue datetime: 2020-07-27
need_datetime: 2016-12-12
满足条件,爬取!
issue url: https://github.com/thaumant/levenshtein-perf-examples/issues/1
issue comments: 2
issue url: https://github.com/stepancheg/rust-protobuf/issues/499
issue comments: 1
issue url: https://github.com/hyperledger/indy-sdk/pull/2115
issue comments: 1
issue url: https://github.com/bodoni/svg/issues/6
issue comments: 10
issue url: https://github.com/rust-lang/rust/issues/59795
issue comments: 3
The final issues_list is:
共爬取了8个issues,url如下:
https://github.com/rust-lang/rust/pull/76676
https://github.com/rust-lang/rust/pull/75115
https://github.com/rust-lang/rust/pull/74477
https://github.com/rust-lang/rust/pull/74979
https://github.com/rust-lang/rust/pull/73928
https://github.com/rust-lang/rust/issues/73904
https://github.com/rust-lang/rust/issues/71668
https://github.com/rust-lang/rust/issues/74840

注:如果不需要多余的输出,减少运行时间,直接修改脚本即可

 

评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

z2bns

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值