python frame框架抓取_python 爬取github框架

最新推荐文章于 2023-12-12 23:14:27 发布

weixin_39620279

最新推荐文章于 2023-12-12 23:14:27 发布

阅读量161

点赞数

文章标签： python frame框架抓取

前言

做 iOS 马甲包时，需要添加混淆代码，其中一部分来自 GitHub，于是抽空用 Scrapy 框架写了个爬取脚本。

一, Item 模型

import scrapy

class GithubItem(scrapy.Item):
    """Data collected for one GitHub repository ("framework")."""

    # Name of the framework.
    name = scrapy.Field()

    # URL of the framework's home page.
    url = scrapy.Field()

    # Star count of the framework.
    star_number = scrapy.Field()

    # Time of the last update.
    update_time = scrapy.Field()

    # Address used to clone the framework.
    clone_url = scrapy.Field()

二, 爬取框架要求

指定语言为 Objective-C（OC）

star 数量至少为 100

from github.items import GithubItem

import scrapy

class GithubSpider(scrapy.Spider):
    """Crawl GitHub search results for Objective-C repositories matching "ios".

    ``parse`` reads one search-result page, builds a ``GithubItem`` for every
    listed repository and requests that repository's home page.
    ``parse_article`` then stores the clone address on the item and yields it.
    """

    name = 'github'
    allowed_domains = ['github.com']

    # Search URL (sorted by stars, descending); the page number is appended.
    url = 'https://github.com/search?l=Objective-C&o=desc&q=ios&s=stars&p='
    offset = 1
    start_urls = [url + str(offset)]

    # Every item built so far, across all parsed pages.
    items = []
    # Clone addresses collected by ``parse_article``.
    clone_urls = []

    @staticmethod
    def _pick_star_number(texts):
        """Return the first stripped text that looks like a star count.

        A text qualifies when it has more than two characters or contains
        ``'k'``; everything else (blank strings, line breaks, short values)
        is ignored.  Presumably this is how the script enforces the
        "at least 100 stars" requirement -- confirm against the live page.

        :param texts: raw text nodes of a repository's ``muted-link`` anchors
        :return: the qualifying text, or ``None`` when there is none
        """
        for text in texts:
            text = text.strip()
            if len(text) > 2 or 'k' in text:
                return text
        return None

    def parse(self, response):
        """Parse one search-result page.

        Each repository entry is read on its own, with XPaths relative to its
        ``<li>``, so that name, URL, star count and update time always belong
        to the same repository even when one of them is missing.

        :param response: response of a search-result page
        :return: generator of requests -- one per repository home page, plus
                 the next search page while fewer than 3 pages were requested
        """
        page_items = []
        for repo in response.xpath('//ul[@class="repo-list"]/li'):
            name = repo.xpath('./div/h3/a/text()').extract_first()
            # The href is site-relative; it is prefixed with the host below.
            href = repo.xpath('./div/h3/a/@href').extract_first()
            star_number = self._pick_star_number(
                repo.xpath('./div//a[@class="muted-link"]/text()').extract()
            )
            # Without a link there is nothing to follow, and without a
            # qualifying star count the entry does not meet the requirement.
            if not href or star_number is None:
                continue

            item = GithubItem()
            item['name'] = (name or '').strip()
            item['url'] = 'https://github.com' + href
            item['star_number'] = star_number
            item['update_time'] = repo.xpath(
                './div//p[@class="f6 text-gray mr-3 mb-0 mt-2"]'
                '/relative-time/text()'
            ).extract_first()
            page_items.append(item)

        self.items.extend(page_items)

        # Only follow the repositories found on THIS page; items of earlier
        # pages were already requested when their page was parsed.
        for item in page_items:
            yield scrapy.Request(url=item['url'], meta={'item': item}, callback=self.parse_article)

        if self.offset < 3:
            self.offset += 1
            # After a page has been handled, request the next result page.
            yield scrapy.Request(self.url+str(self.offset), callback=self.parse)

    def parse_article(self, response):
        """Parse a repository home page and extract its clone address.

        The first ``value`` of an ``<input>`` inside ``div.input-group`` is
        stored as a single string on the item (``None`` when the page has no
        such input).

        :param response: response of the repository home page; its ``meta``
                         carries the item created in ``parse``
        :return: generator yielding the completed item
        """
        item = response.meta['item']
        clone_url = response.xpath('//div[@class="input-group"]/input/@value').extract_first()
        item['clone_url'] = clone_url
        if clone_url:
            self.clone_urls.append(clone_url)
            self.logger.debug('clone_url: %s', clone_url)
        # Hand the finished item to Scrapy so pipelines/exporters receive it.
        yield item

注：修改 parse 方法中的以下判断（此处最多爬取 3 页），即可控制爬取的页数：

if self.offset < 3:

self.offset += 1

三, clone到指定本地路径

import os
import pwd
import shutil
import subprocess

class GitUtil(object):
    """Helpers for cloning a git repository into a local working folder.

    Unless another folder is given, repositories are cloned below
    ``Desktop/tmp_git`` in the current user's home directory, one sub-folder
    per repository.
    """

    # Folder, relative to the user's home directory, that receives the clones.
    _DEFAULT_FOLDER = 'Desktop/tmp_git'

    @classmethod
    def yh_clone_url(cls, url, dst_folder_path=None):
        """Clone ``url`` into its own sub-folder of the destination folder.

        The destination folder is created when missing and clones of other
        repositories inside it are left alone; only a stale copy of this
        repository is removed before cloning.

        :param url: address of the repository to clone
        :param dst_folder_path: folder that receives the clone; defaults to
                                ``~/Desktop/tmp_git``
        :return: None
        :raises ValueError: if no repository name can be derived from ``url``
        """
        # Work out where this repository goes.
        dst_path = cls._dst_path_of_clone(url, dst_folder_path)

        parent_path = os.path.dirname(dst_path)
        if parent_path:
            os.makedirs(parent_path, exist_ok=True)
        if os.path.exists(dst_path):
            shutil.rmtree(dst_path)

        # Run git; it creates ``dst_path`` itself.
        return_code = cls._clone_url_to_dst_path(url, dst_path)
        if return_code != 0:
            print('git clone failed (exit code %s): %s' % (return_code, url))
        print(os.getcwd())

    @classmethod
    def _clone_url_to_dst_path(cls, url, dst_path):
        """Clone a repository to the destination path.

        The command is passed to git as an argument list (no shell), and
        ``--`` stops option parsing, so a scraped URL cannot inject shell
        commands or git options.

        :param url: repository to clone
        :param dst_path: destination path
        :return: exit code of ``git clone``; 127 when git is not installed
        """
        try:
            completed = subprocess.run(['git', 'clone', '--', url, dst_path])
        except FileNotFoundError:
            print('git executable not found; cannot clone %s' % url)
            return 127
        return completed.returncode

    @classmethod
    def _dst_path_of_clone(cls, url, dst_folder_path=None):
        """Return the local path a clone of ``url`` should be placed at.

        The framework name is the last path segment of ``url`` with a
        trailing ``/`` and a ``.git`` suffix removed; other dots in the name
        are kept.  This method only computes the path and touches no files.

        :param url: address of the repository
        :param dst_folder_path: folder that receives the clone; defaults to
                                ``~/Desktop/tmp_git``
        :return: ``dst_folder_path`` joined with the framework name
        :raises ValueError: if ``url`` yields no usable framework name
        """
        framework_name = url.strip().rstrip('/').rsplit('/', 1)[-1]
        if framework_name.endswith('.git'):
            framework_name = framework_name[:-len('.git')]
        # An empty or dot-only name would make the destination the folder
        # itself (or its parent), which yh_clone_url would then delete.
        if framework_name in ('', '.', '..'):
            raise ValueError('cannot derive a repository name from %r' % (url,))

        if dst_folder_path is None:
            home_path = pwd.getpwuid(os.getuid()).pw_dir
            dst_folder_path = os.path.join(home_path, cls._DEFAULT_FOLDER)

        return os.path.join(dst_folder_path, framework_name)

四, main.py 执行此脚本

from scrapy import cmdline

if __name__ == '__main__':
    # Equivalent to running `scrapy crawl github` on the command line; the
    # guard keeps a plain import of this module from starting a crawl.
    cmdline.execute('scrapy crawl github'.split())

weixin_39620279

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python frame框架抓取_python 爬取github框架

前言做iOS马甲包时, 需要添加混淆代码, 一部分便来自github, 抽空写了个脚本, 使用的scrapy框架一, Item 模型import scrapyclass GithubItem(scrapy.Item):# define the fields for your item here like:name = scrapy.Field() # 框架名称url = scrapy.Field(...
复制链接

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。