Using a multi-task queue crawler to fetch GitHub commits API data and write it to a database

GitHub Repo address: stars are welcome

Requirement: run a Kusto database query from Python to obtain the latest list of specified repos; for every repo in that list, crawl all of its commits for the current month, parse the returned JSON, write it to an intermediate file, and then ingest that file into the database.
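
The whole pipeline hinges on one GitHub REST call: listing a repo's commits filtered by branch and date range, page by page. The snippet below is a minimal, standalone sketch of that call; OWNER, REPO, BRANCH and TOKEN are hypothetical placeholders, and the "token ..." header format assumes a classic personal access token.

import requests

# Hypothetical placeholders -- substitute a real repo, branch and personal access token
OWNER, REPO, BRANCH, TOKEN = "octocat", "Hello-World", "master", "<your token>"

url = "https://api.github.com/repos/{}/{}/commits".format(OWNER, REPO)
params = {
    "sha": BRANCH,                    # branch (or any ref) whose commits are listed
    "since": "2020-05-01T00:00:00Z",  # ISO 8601 lower bound on the commit date
    "until": "2020-05-31T23:59:59Z",  # ISO 8601 upper bound on the commit date
    "page": 1,                        # increment until an empty JSON list comes back
}
resp = requests.get(url, params=params, headers={"Authorization": "token " + TOKEN})
for commit in resp.json():
    print(commit["sha"], commit["commit"]["author"]["date"])

The main program below builds exactly this kind of URL for every (repo, branch) pair returned by Kusto and walks the page parameter until no more commits come back.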

Create a path-management module: path_app_branch.py

import datetime
import time
import os

# Optionally restrict what "from ... import *" exposes by defining __all__
# __all__ = ['path_file_1','path_file_2','path_file_test','start_time','end_time']

def GetDesktopPath():
    """Return the current user's home directory path."""
    return os.path.join(os.path.expanduser("~"))

Total_path = GetDesktopPath().split('\\')[-1] # local Windows user name, taken from the home directory path
today = datetime.date.today()
start_time = datetime.date(today.year, today.month, 1)
# Last day of the current month; "month + 1" would overflow in December, so handle that case separately
if today.month == 12:
    end_time = datetime.date(today.year, 12, 31)
else:
    end_time = datetime.date(today.year, today.month + 1, 1) - datetime.timedelta(1)
StartTime = str(start_time)
EndTime = str(end_time)
path_file_1 = r"C:\Users\{}\tool\Python_Get_Github_api\data_source\data logs\{}_URL_STATUS_CODE_update_at_{}.csv".format(Total_path,StartTime,time.strftime('%Y-%m-%d %H.%M.%S',time.localtime(time.time())))
path_file_2 = r"C:\Users\{}\tool\Python_Get_Github_api\data_source\data logs\{}_URL_DATA_LOGS_update_at_{}.csv".format(Total_path,StartTime,time.strftime('%Y-%m-%d %H.%M.%S',time.localtime(time.time())))
path_file_3 = r"C:\Users\{}\tool\Python_Get_Github_api\data_source\data logs\{}_BRANCH_COMMITS_DATA_update_at_{}.csv".format(Total_path,StartTime,time.strftime('%Y-%m-%d',time.localtime(time.time())))
path_file_4 = r"C:\Users\{}\tool\Python_Get_Github_api\data_source\data logs\{}_TOTAL_REPO_URL_update_at_{}.csv".format(Total_path,StartTime,time.strftime('%Y-%m-%d %H.%M.%S',time.localtime(time.time())))

# Only runs when this module is executed directly
if __name__ == '__main__':
    # Test / demo code goes here; it is not executed when the module is imported
    print(path_file_1)
    print(path_file_2)
    print(path_file_3)
    print(path_file_4)
    print([start_time,end_time])
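
As an aside, the last day of the month can also be computed with the standard-library calendar module, which avoids special-casing December; this is just an equivalent alternative sketch, not what the module above uses.

import calendar
import datetime

today = datetime.date.today()
# monthrange returns (weekday of the 1st, number of days in the month)
last_day = calendar.monthrange(today.year, today.month)[1]
start_time = datetime.date(today.year, today.month, 1)
end_time = datetime.date(today.year, today.month, last_day)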

Main program: remote_BranchCommits.py

import sys
sys.setrecursionlimit(1000000)
from gevent import monkey
monkey.patch_all()  # patch the standard library before requests is imported so its sockets become cooperative
import gevent
from gevent.queue import Queue
from azure.kusto.data.request import KustoClient, KustoConnectionStringBuilder
from azure.kusto.data.helpers import dataframe_from_result_table
from azure.kusto.ingest import (KustoIngestClient,IngestionProperties,FileDescriptor,DataFormat,ReportLevel)
import requests
import json
import csv
import time
from urllib.parse import urlparse
from branch.path_app_branch import *  # brings in path_file_1..path_file_4, StartTime, EndTime

def authenticate_kusto(kusto_cluster):
    """Build a Kusto connection string with AAD device-code authentication and return (client, connection string)."""
    tenant_id = '72f988bf-86f1-41af-91ab-2d7cd011db47'
    KCSB = KustoConnectionStringBuilder.with_aad_device_authentication(kusto_cluster)
    KCSB.authority_id = tenant_id
    return KustoClient(KCSB), KCSB

def query_kusto(client, database, query):
    return client.execute(database, query)

def get_page_views_1(client):
    # The query text and database name are redacted placeholders ('query' / '###')
    kusto_query_1 = """
    query
    """
    kusto_database_1 = '###'
    result_1 = query_kusto(client, kusto_database_1, kusto_query_1)
    df_1 = dataframe_from_result_table(result_1.primary_results[0])
    return df_1

def Ingest(Tag):
    """Ingest the commit CSV into Kusto, tagging the new extents with the month so they can be dropped on a re-run."""
    ingestion_props = IngestionProperties(
        database="###",
        table="###",
        dataFormat=DataFormat.CSV,
        ingestByTags=[Tag],
        dropByTags=[Tag],
        mappingReference="###_CSV_Mapping",
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={'ignoreFirstRecord': 'true'}
    )

    file_descriptor = FileDescriptor(path_file_3, 3333)  # 3333 is the raw size of the data in bytes.
    ls[1].ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)  # ls[1] is the KustoIngestClient created below

    return 1

# Query Kusto
cga_cluster = 'https://cgadataout.kusto.windows.net'
ingest_cluster = "https://ingest-cgadataout.kusto.windows.net"
cga_client = authenticate_kusto(cga_cluster)[0]
ingest_client = KustoIngestClient(authenticate_kusto(ingest_cluster)[1])
ls = [cga_client, ingest_client]  # [0]: query client, [1]: ingest client
current_1 = get_page_views_1(ls[0])  # DataFrame whose first two columns are the repo full name and the branch

# Three output CSVs: URL status log, detailed crawl log, and the commit data that will be ingested into Kusto
File1 = open(path_file_1, "w+", newline='',encoding='utf-8')
File2 = open(path_file_2, "w+", newline='',encoding='utf-8')
File3 = open(path_file_3, "w+", newline='',encoding='utf-8')
output_url_1 = csv.writer(File1)
output_url_1.writerow(['Link','Status'])
output_url_2 = csv.writer(File2)
output_url_2.writerow(['The URL','Link','Page','Information'])
output = csv.writer(File3)
output.writerow(['RepoOwner','RepoName','Branch','CommitSha','CommitAuthorLogin','CommitAuthorId','CommitDate','Data','Tag','Month'])

headers = {
    "Authorization": "your token", # 这里的请求必须加token,否则会返回404
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
    "Cookie": "_octo=GH1.1.54427208.1583984786; _ga=GA1.2.1549066493.1583984788; logged_in=yes; dotcom_user=BaymaxBai01; tz=Asia%2FShanghai; _gid=GA1.2.220945336.1589789000"
}

urls = []

# Build one commits-list URL per (repo, branch) row; the page number is appended to the trailing "page=" later
for i in range(len(current_1)):
    RepoFullName = current_1.iloc[i][0]
    OriginalContentBranch = current_1.iloc[i][1]
    url = "https://api.github.com/repos/{}/commits?sha={}&since={}T00:00:00Z&until={}T23:59:59Z&page=".format(RepoFullName,OriginalContentBranch,StartTime,EndTime)
    urls.append(url)

File4 = open(path_file_4, "w+", newline='',encoding='utf-8')
output_url = csv.writer(File4)
output_url.writerow(['Total {} URL'.format(len(urls))])
for url in urls:
    output_url.writerow([url+"1"])
File4.close()

# Put every URL into a gevent queue; the crawler greenlets drain it concurrently
work = Queue()
for data_url in urls:
    work.put_nowait(data_url)

start = time.time()

def get_each_url_all_page(data_url):
    """Crawl every page of commits for one repo/branch URL and append the parsed rows to the commit CSV."""
    j = 1
    result = requests.get(data_url + str(1), headers=headers)
    html = result.text
    status = result.status_code
    output_url_1.writerow([data_url+"1",status])
    if status == 404:
        print("The URL: {}1 status:{} Not Found in {}!".format(data_url,status,StartTime))
        output_url_2.writerow(["The URL:",data_url+"1","", "Response 404 Not Found in "+StartTime])
    elif status == 403:
        print("The URL: {}1 status:{} Not Found in {}!".format(data_url,status,StartTime))
        output_url_2.writerow(["The URL:",data_url+"1","", "Response 403 Forbidden in "+StartTime])
    elif status == 200 and html == '[\n\n]\n':
        print("The URL: {}1 status:{} Response OK but has no commit data in {}!".format(data_url,status,StartTime))
        output_url_2.writerow(["The URL:",data_url+"1","", "Response 200 OK but has no commit data in "+StartTime])
    else:
        print("The URL: {}1 status:{} Response OK!".format(data_url,status))
        output_url_2.writerow(["The URL:",data_url+"1","", "Response OK"])
        while True:
            jsondata = json.loads(html)
            for row in jsondata:
                RepoOwner = urlparse(data_url).path.split('/')[2]
                RepoName = urlparse(data_url).path.split('/')[3]
                # Extract the branch name from the "sha=<branch>&since=..." query string
                a = []
                for i in range(100):
                    if urlparse(data_url).query[i + 4] == "&":
                        break
                    a.append(urlparse(data_url).query[i + 4])
                Branch = ''.join(a)
                CommitSha = row['sha']
                CommitDate = row['commit']['author']['date']
                try:
                    CommitAuthorLogin = row['author']['login']
                except (TypeError, KeyError):
                    # "author" is null when the commit author is not linked to a GitHub account
                    CommitAuthorLogin = ""
                try:
                    CommitAuthorId = row['author']['id']
                except (TypeError, KeyError):
                    CommitAuthorId = ""
                Data = ""
                Tag = RepoOwner + "/" + RepoName + "-" + Branch + "-" + StartTime
                Month = StartTime
                try:
                    output.writerow([RepoOwner, RepoName, Branch, CommitSha, CommitAuthorLogin, CommitAuthorId, CommitDate, Data, Tag, Month])
                except Exception:
                    print(row.values())
            print("The URL: {}{} is complete!".format(data_url, j))
            output_url_2.writerow(["The URL:",data_url, j ,"is complete!"])
            j += 1
            result = requests.get(data_url + str(j), headers=headers)
            html = result.text
            if html == '[\n\n]\n':  # an empty JSON array response means there are no more pages
                break

def crawler():
    """Worker greenlet: keep pulling URLs off the queue until it is empty."""
    while not work.empty():
        data_url = work.get_nowait()
        try:
            get_each_url_all_page(data_url)
        except Exception:
            print("The URL: {}1 has something wrong!".format(data_url))
            # File4/output_url is already closed at this point, so log failures to the detail log instead
            output_url_2.writerow(["The URL:", data_url, str(1), "has something wrong!"])

# Spawn 10 crawler greenlets and wait for the queue to be drained
tasks_list = []
for x in range(10):
    task = gevent.spawn(crawler)
    tasks_list.append(task)
gevent.joinall(tasks_list)

File1.close()
File2.close()
File3.close()

end = time.time()

print("Take:"+ str(end - start)+"s")

# Drop any extents previously ingested with this month's drop-by tag so the re-ingest below is idempotent
DROP_TABLE_IF_EXIST = """.drop extents <| .show table BranchCommits extents where tags has 'drop-by:{}'""".format(StartTime)
RESPONSE = ls[0].execute_mgmt("DevRelWorkArea", DROP_TABLE_IF_EXIST)

Ingest(StartTime)
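
One practical note: the 403 responses handled above are usually GitHub's API rate limit rather than a permissions problem. An optional pre-flight check like the sketch below (reusing the same headers dict; the /rate_limit endpoint itself does not count against the quota) shows how many requests remain before the crawl starts. It is an add-on idea, not part of the original script.

import requests

# Optional pre-flight check; assumes the same `headers` dict (with the token) defined above
limit = requests.get("https://api.github.com/rate_limit", headers=headers).json()
core = limit["resources"]["core"]
print("Remaining requests: {}, quota resets at epoch {}".format(core["remaining"], core["reset"]))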