Using a multi-task queue crawler to fetch GitHub commits API data and write it to a database

GitHub Repo address: stars are welcome

Requirement: run a Kusto database query from Python to obtain the latest list of specified repos; for every repo in that list, crawl all of its commits for the current month, parse the returned JSON, write it to an intermediate file, and then ingest that file into the database.
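
The whole pipeline hinges on one GitHub REST call: listing a repo's commits filtered by branch and date range, page by page. The snippet below is a minimal, standalone sketch of that call; OWNER, REPO, BRANCH and TOKEN are hypothetical placeholders, and the "token ..." header format assumes a classic personal access token.

import requests

# Hypothetical placeholders -- substitute a real repo, branch and personal access token
OWNER, REPO, BRANCH, TOKEN = "octocat", "Hello-World", "master", "<your token>"

url = "https://api.github.com/repos/{}/{}/commits".format(OWNER, REPO)
params = {
    "sha": BRANCH,                    # branch (or any ref) whose commits are listed
    "since": "2020-05-01T00:00:00Z",  # ISO 8601 lower bound on the commit date
    "until": "2020-05-31T23:59:59Z",  # ISO 8601 upper bound on the commit date
    "page": 1,                        # increment until an empty JSON list comes back
}
resp = requests.get(url, params=params, headers={"Authorization": "token " + TOKEN})
for commit in resp.json():
    print(commit["sha"], commit["commit"]["author"]["date"])

The main program below builds exactly this kind of URL for every (repo, branch) pair returned by Kusto and walks the page parameter until no more commits come back.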

Create a path-management module: path_app_branch.py

import datetime
import time
import os

# Optionally restrict what "from ... import *" exposes by defining __all__
# __all__ = ['path_file_1','path_file_2','path_file_test','start_time','end_time']

def GetDesktopPath():
    """Return the current user's home directory path."""
    return os.path.join(os.path.expanduser("~"))

Total_path = GetDesktopPath().split('\\')[-1] # local Windows user name, taken from the home directory path
today = datetime.date.today()
start_time = datetime.date(today.year, today.month, 1)
# Last day of the current month; "month + 1" would overflow in December, so handle that case separately
if today.month == 12:
    end_time = datetime.date(today.year, 12, 31)
else:
    end_time = datetime.date(today.year, today.month + 1, 1) - datetime.timedelta(1)
StartTime = str(start_time)
EndTime = str(end_time)
path_file_1 = r"C:\Users\{}\tool\Python_Get_Github_api\data_source\data logs\{}_URL_STATUS_CODE_update_at_{}.csv".format(Total_path,StartTime,time.strftime('%Y-%m-%d %H.%M.%S',time.localtime(time.time())))
path_file_2 = r"C:\Users\{}\tool\Python_Get_Github_api\data_source\data logs\{}_URL_DATA_LOGS_update_at_{}.csv".format(Total_path,StartTime,time.strftime('%Y-%m-%d %H.%M.%S',time.localtime(time.time())))
path_file_3 = r"C:\Users\{}\tool\Python_Get_Github_api\data_source\data logs\{}_BRANCH_COMMITS_DATA_update_at_{}.csv".format(Total_path,StartTime,time.strftime('%Y-%m-%d',time.localtime(time.time())))
path_file_4 = r"C:\Users\{}\tool\Python_Get_Github_api\data_source\data logs\{}_TOTAL_REPO_URL_update_at_{}.csv".format(Total_path,StartTime,time.strftime('%Y-%m-%d %H.%M.%S',time.localtime(time.time())))

# Only runs when this module is executed directly
if __name__ == '__main__':
    # Test / demo code goes here; it is not executed when the module is imported
    print(path_file_1)
    print(path_file_2)
    print(path_file_3)
    print(path_file_4)
    print([start_time,end_time])
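
As an aside, the last day of the month can also be computed with the standard-library calendar module, which avoids special-casing December; this is just an equivalent alternative sketch, not what the module above uses.

import calendar
import datetime

today = datetime.date.today()
# monthrange returns (weekday of the 1st, number of days in the month)
last_day = calendar.monthrange(today.year, today.month)[1]
start_time = datetime.date(today.year, today.month, 1)
end_time = datetime.date(today.year, today.month, last_day)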

Main program: remote_BranchCommits.py

import sys
sys.setrecursionlimit(1000000)
from gevent import monkey
monkey.patch_all()  # patch the standard library before requests is imported so its sockets become cooperative
import gevent
from gevent.queue import Queue
from azure.kusto.data.request import KustoClient, KustoConnectionStringBuilder
from azure.kusto.data.helpers import dataframe_from_result_table
from azure.kusto.ingest import (KustoIngestClient,IngestionProperties,FileDescriptor,DataFormat,ReportLevel)
import requests
import json
import csv
import time
from urllib.parse import urlparse
from branch.path_app_branch import *  # brings in path_file_1..path_file_4, StartTime, EndTime

def authenticate_kusto(kusto_cluster):
    """Build a Kusto connection string with AAD device-code authentication and return (client, connection string)."""
    tenant_id = '72f988bf-86f1-41af-91ab-2d7cd011db47'
    KCSB = KustoConnectionStringBuilder.with_aad_device_authentication(kusto_cluster)
    KCSB.authority_id = tenant_id
    return KustoClient(KCSB), KCSB

def query_kusto(client, database, query):
    return client.execute(database, query)

def get_page_views_1(client):
    # The query text and database name are redacted placeholders ('query' / '###')
    kusto_query_1 = """
    query
    """
    kusto_database_1 = '###'
    result_1 = query_kusto(client, kusto_database_1, kusto_query_1)
    df_1 = dataframe_from_result_table(result_1.primary_results[0])
    return df_1

def Ingest(Tag):
    """Ingest the commit CSV into Kusto, tagging the new extents with the month so they can be dropped on a re-run."""
    ingestion_props = IngestionProperties(
        database="###",
        table="###",
        dataFormat=DataFormat.CSV,
        ingestByTags=[Tag],
        dropByTags=[Tag],
        mappingReference="###_CSV_Mapping",
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={'ignoreFirstRecord': 'true'}
    )

    file_descriptor = FileDescriptor(path_file_3, 3333)  # 3333 is the raw size of the data in bytes.
    ls[1].ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)  # ls[1] is the KustoIngestClient created below

    return 1

# Query Kusto
cga_cluster = 'https://cgadataout.kusto.windows.net'
ingest_cluster = "https://ingest-cgadataout.kusto.windows.net"
cga_client = authenticate_kusto(cga_cluster)[0]
ingest_client = KustoIngestClient(authenticate_kusto(ingest_cluster)[1])
ls = [cga_client, ingest_client]  # [0]: query client, [1]: ingest client
current_1 = get_page_views_1(ls[0])  # DataFrame whose first two columns are the repo full name and the branch

# Three output CSVs: URL status log, detailed crawl log, and the commit data that will be ingested into Kusto
File1 = open(path_file_1, "w+", newline='',encoding='utf-8')
File2 = open(path_file_2, "w+", newline='',encoding='utf-8')
File3 = open(path_file_3, "w+", newline='',encoding='utf-8')
output_url_1 = csv.writer(File1)
output_url_1.writerow(['Link','Status'])
output_url_2 = csv.writer(File2)
output_url_2.writerow(['The URL','Link','Page','Information'])
output = csv.writer(File3)
output.writerow(['RepoOwner','RepoName','Branch','CommitSha','CommitAuthorLogin','CommitAuthorId','CommitDate','Data','Tag','Month'])

headers = {
    "Authorization": "your token", # 这里的请求必须加token,否则会返回404
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
    "Cookie": "_octo=GH1.1.54427208.1583984786; _ga=GA1.2.1549066493.1583984788; logged_in=yes; dotcom_user=BaymaxBai01; tz=Asia%2FShanghai; _gid=GA1.2.220945336.1589789000"
}

urls = []

# Build one commits-list URL per (repo, branch) row; the page number is appended to the trailing "page=" later
for i in range(len(current_1)):
    RepoFullName = current_1.iloc[i][0]
    OriginalContentBranch = current_1.iloc[i][1]
    url = "https://api.github.com/repos/{}/commits?sha={}&since={}T00:00:00Z&until={}T23:59:59Z&page=".format(RepoFullName,OriginalContentBranch,StartTime,EndTime)
    urls.append(url)

File4 = open(path_file_4, "w+", newline='',encoding='utf-8')
output_url = csv.writer(File4)
output_url.writerow(['Total {} URL'.format(len(urls))])
for url in urls:
    output_url.writerow([url+"1"])
File4.close()

# Put every URL into a gevent queue; the crawler greenlets drain it concurrently
work = Queue()
for data_url in urls:
    work.put_nowait(data_url)

start = time.time()

def get_each_url_all_page(data_url):
    """Crawl every page of commits for one repo/branch URL and append the parsed rows to the commit CSV."""
    j = 1
    result = requests.get(data_url + str(1), headers=headers)
    html = result.text
    status = result.status_code
    output_url_1.writerow([data_url+"1",status])
    if status == 404:
        print("The URL: {}1 status:{} Not Found in {}!".format(data_url,status,StartTime))
        output_url_2.writerow(["The URL:",data_url+"1","", "Response 404 Not Found in "+StartTime])
    elif status == 403:
        print("The URL: {}1 status:{} Not Found in {}!".format(data_url,status,StartTime))
        output_url_2.writerow(["The URL:",data_url+"1","", "Response 403 Forbidden in "+StartTime])
    elif status == 200 and html == '[\n\n]\n':
        print("The URL: {}1 status:{} Response OK but has no commit data in {}!".format(data_url,status,StartTime))
        output_url_2.writerow(["The URL:",data_url+"1","", "Response 200 OK but has no commit data in "+StartTime])
    else:
        print("The URL: {}1 status:{} Response OK!".format(data_url,status))
        output_url_2.writerow(["The URL:",data_url+"1","", "Response OK"])
        while True:
            jsondata = json.loads(html)
            for row in jsondata:
                RepoOwner = urlparse(data_url).path.split('/')[2]
                RepoName = urlparse(data_url).path.split('/')[3]
                # Extract the branch name from the "sha=<branch>&since=..." query string
                a = []
                for i in range(100):
                    if urlparse(data_url).query[i + 4] == "&":
                        break
                    a.append(urlparse(data_url).query[i + 4])
                Branch = ''.join(a)
                CommitSha = row['sha']
                CommitDate = row['commit']['author']['date']
                try:
                    CommitAuthorLogin = row['author']['login']
                except (TypeError, KeyError):
                    # "author" is null when the commit author is not linked to a GitHub account
                    CommitAuthorLogin = ""
                try:
                    CommitAuthorId = row['author']['id']
                except (TypeError, KeyError):
                    CommitAuthorId = ""
                Data = ""
                Tag = RepoOwner + "/" + RepoName + "-" + Branch + "-" + StartTime
                Month = StartTime
                try:
                    output.writerow([RepoOwner, RepoName, Branch, CommitSha, CommitAuthorLogin, CommitAuthorId, CommitDate, Data, Tag, Month])
                except Exception:
                    print(row.values())
            print("The URL: {}{} is complete!".format(data_url, j))
            output_url_2.writerow(["The URL:",data_url, j ,"is complete!"])
            j += 1
            result = requests.get(data_url + str(j), headers=headers)
            html = result.text
            if html == '[\n\n]\n':  # an empty JSON array response means there are no more pages
                break

def crawler():
    """Worker greenlet: keep pulling URLs off the queue until it is empty."""
    while not work.empty():
        data_url = work.get_nowait()
        try:
            get_each_url_all_page(data_url)
        except Exception:
            print("The URL: {}1 has something wrong!".format(data_url))
            # File4/output_url is already closed at this point, so log failures to the detail log instead
            output_url_2.writerow(["The URL:", data_url, str(1), "has something wrong!"])

# Spawn 10 crawler greenlets and wait for the queue to be drained
tasks_list = []
for x in range(10):
    task = gevent.spawn(crawler)
    tasks_list.append(task)
gevent.joinall(tasks_list)

File1.close()
File2.close()
File3.close()

end = time.time()

print("Take:"+ str(end - start)+"s")

# Drop any extents previously ingested with this month's drop-by tag so the re-ingest below is idempotent
DROP_TABLE_IF_EXIST = """.drop extents <| .show table BranchCommits extents where tags has 'drop-by:{}'""".format(StartTime)
RESPONSE = ls[0].execute_mgmt("DevRelWorkArea", DROP_TABLE_IF_EXIST)

Ingest(StartTime)
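
One practical note: the 403 responses handled above are usually GitHub's API rate limit rather than a permissions problem. An optional pre-flight check like the sketch below (reusing the same headers dict; the /rate_limit endpoint itself does not count against the quota) shows how many requests remain before the crawl starts. It is an add-on idea, not part of the original script.

import requests

# Optional pre-flight check; assumes the same `headers` dict (with the token) defined above
limit = requests.get("https://api.github.com/rate_limit", headers=headers).json()
core = limit["resources"]["core"]
print("Remaining requests: {}, quota resets at epoch {}".format(core["remaining"], core["reset"]))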