A Python Crawler for Ethereum Solidity Smart Contracts

Fetches the source code of the latest 10,000 verified Solidity smart contracts from the etherscan site.

The version of the code on GitHub is authoritative.

GitHub code link: SmartContactSpider By Python

Install the third-party libraries the script depends on, e.g.:

pip install requests beautifulsoup4

Runtime environment: Python 3.7
SmartContactSpider_10000.py

The script works in two stages: updatescurl() scrapes the verified-contracts listing pages and saves each contract's URL to address.txt; getsccode() then visits each saved URL and writes the contract source to a .sol file.

# -*- coding: utf8 -*-
# SmartContactSpider.py
import requests
from bs4 import BeautifulSoup
import os
import time


def printtime():
    print(time.strftime("%Y-%m-%d %H:%M:%S:", time.localtime()), end=' ')
    return 0


def getsccodecore(eachLine):
    # Masquerade as a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}

    failedTimes = 100
    while True:  # keep retrying until the request succeeds or the retry budget runs out

        if (failedTimes <= 0):
            printtime()
            print("失败次数过多,请检查网络环境!")
            break

        failedTimes -= 1
        try:
            # The except clauses below catch requests exceptions and wait for
            # the network to recover, keeping the crawler running uninterrupted
            printtime()
            # the first 80 characters of each line hold the full URL (including '#code')
            eachLineurl = eachLine[:80]
            print('Connecting to ' + eachLineurl)
            response = requests.get(eachLineurl, headers=headers, timeout=5)
            break

        except requests.exceptions.ConnectionError:
            printtime()
            print('ConnectionError! Waiting 3 seconds...')
            time.sleep(3)

        except requests.exceptions.ChunkedEncodingError:
            printtime()
            print('ChunkedEncodingError! Waiting 3 seconds...')
            time.sleep(3)

        except:
            printtime()
            print('Unknown error! Waiting 3 seconds...')
            time.sleep(3)

    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, "html.parser")
    # full class attribute on the page: js-sourcecopyarea editor ace_editor ace-dawn
    targetPRE = soup.find_all('pre', 'js-sourcecopyarea editor')
    filepath = "E:\\SmartContract\\ContractCrawler\\code\\"
    # filename is sliced from each address.txt line: characters [33:75] are the
    # 42-character contract address, e.g. '0x5f3ed22b53ac0a001f0feedc2a3985999377c2ab'
    # (len('https://cn.etherscan.com/address/') == 33, and 33 + 42 == 75)
    filename = eachLine[33:75]

    if (os.path.exists(filepath + filename + '.sol')):
        printtime()
        print(filename + '.sol already exists!')
        return 0

    fo = open(filepath + filename + '.sol', "w+", encoding="utf-8")
    for eachpre in targetPRE:
        fo.write(eachpre.text)
    fo.close()
    printtime()
    print(filename + '.sol saved!')

    return 0


def getsccode():
    try:
        SCAddress = open(
            "E:\\SmartContract\\ContractCrawler\\address\\address.txt", "r")

    except:
        printtime()
        print('Failed to open the contract URL list! Check that the file path is correct!')
        # nothing to crawl without the address file
        return 1

    # Read address.txt line by line
    for eachLine in SCAddress:
        # print("test:" + eachLine[:80])
        # first 80 chars of each line: https://cn.etherscan.com/address/0xEeE690AAA67d1eE33365c02C3Bf477A93867052f#code
        getsccodecore(eachLine)  # the core function that fetches the contract source

    SCAddress.close()
    return 0


def getSCAddress(eachurl, filepath):
    # Masquerade as a browser so the server doesn't refuse the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}

    # Maximum number of failed attempts; once exhausted, report the error and stop
    failedTimes = 50

    while True:  # keep retrying until the request succeeds or the budget runs out

        if (failedTimes <= 0):
            printtime()
            print("失败次数过多,请检查网络环境!")
            break

        failedTimes -= 1  # one attempt used
        try:
            # The except clauses below catch requests exceptions and wait for
            # the network to recover, keeping the crawler running uninterrupted
            print('Connecting to ' + eachurl)

            response = requests.get(url=eachurl, headers=headers, timeout=5)

            # reaching this line means the request succeeded, so leave the loop
            break
        except requests.exceptions.ConnectionError:
            printtime()
            print('ConnectionError! Waiting 3 seconds...')
            time.sleep(3)

        except requests.exceptions.ChunkedEncodingError:
            printtime()
            print('ChunkedEncodingError! Waiting 3 seconds...')
            time.sleep(3)

        except:
            printtime()
            print('Unknown error! Waiting 3 seconds...')
            time.sleep(3)

    # decode the response using the detected encoding
    response.encoding = response.apparent_encoding

    # parse the page with BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # the element below contains the URLs of the contract source pages
    # targetDiv = soup.find_all('div', 'table-responsive')

    try:
        targetTBody = soup.find_all('tbody', 'align-middle text-nowrap')[0]

    except:
        printtime()
        print("targetTBody未成功获取!")
        return 1

    # Open the file in append mode:
    # create it if it doesn't exist, otherwise append at the end
    fo = open(filepath + "address.txt", "a")

    # Save every address to the file
    for targetTR in targetTBody:
        # print(targetTR)
        # each table row
        if targetTR.name == 'tr':
            # all columns of the row
            data = targetTR.find_all('td')

            Address = data[0].getText()
            Name = data[1].getText()
            Compiler = data[2].getText()
            Version = data[3].getText()
            Balance = data[4].getText()
            VerifiedTime = data[7].getText()
            if "Solidity".lower() in Compiler.lower():
                fo.write("https://cn.etherscan.com" + targetTR.td.find('a', 'me-1').attrs['href']
                         + ' ' + VerifiedTime + ':' + Name + '==>Version:' + Version + '==>Balance:' + Balance + "\n")

    fo.close()
    return 0
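
# For reference, each line getSCAddress() writes to address.txt has the shape
# below (the timestamp, name, version and balance values are illustrative):
# https://cn.etherscan.com/address/0xEeE690AAA67d1eE33365c02C3Bf477A93867052f#code 11/20/2019:SomeToken==>Version:v0.5.11==>Balance:0 Ether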

# Lists only the latest 500 verified contract sources (5 pages x 100 per page)


def getUrlList500():

    urlList = []

    for i in range(5):
        page = i + 1
        urlList.append(
            "https://cn.etherscan.com/contractsVerified/"+str(page)+"?filter=solc&ps=100")

    return urlList

# Lists only the latest 10,000 verified contract sources with an open-source license (100 pages x 100 per page)


def getUrlList10000():

    urlList = []

    for i in range(100):
        page = i + 1
        urlList.append(
            "https://cn.etherscan.com/contractsVerified/"+str(page)+"?filter=opensourcelicense&ps=100")

    return urlList
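
# e.g. the first URL generated by getUrlList10000():
# https://cn.etherscan.com/contractsVerified/1?filter=opensourcelicense&ps=100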


def updatescurl():
    # choose between getUrlList10000() and getUrlList500() here
    urlList = getUrlList10000()

    # print(urlList)

    # filepath is where the file holding the contract URLs to crawl is stored;
    # change it to whatever path suits your setup
    filepath = 'E:\\SmartContract\\ContractCrawler\\address\\'

    # Clear out the old address file
    try:
        if (os.path.exists(filepath + "address.txt")):
            os.remove(filepath + "address.txt")
            printtime()
            print('Removed the old address file under %s!' % filepath)
    except IOError:

        printtime()
        print("出现一个不能处理的错误,终止程序:IOError!")

        # 函数不正常执行,返回1
        return 1

    # Scrape the contract addresses from every listing URL
    for eachurl in urlList:
        retries = 0
        # getSCAddress() returns 1 on failure; give up on a URL after 10 tries
        while (1 == getSCAddress(eachurl, filepath)):
            retries += 1
            if (retries == 10):
                break

    # normal exit
    return 0


def main():
    # refresh the list of smart-contract URLs to crawl
    updatescurl()

    # fetch each contract's source code from its URL
    getsccode()


if __name__ == '__main__':
    main()
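
The fixed slices eachLine[:80] and eachLine[33:75] work only because len('https://cn.etherscan.com/address/') == 33 and the '#code' suffix ends at position 80; a different domain or path would silently break them. Below is a minimal sketch of a more robust line parser; the regex and the helper name parseline are my own illustration, not part of the original script:

import re

# matches the full '#code' URL and captures the 42-character contract address
ADDRESS_RE = re.compile(r'(https?://[^\s#]+/address/(0x[0-9a-fA-F]{40})#code)')


def parseline(eachLine):
    # return (url, address) for one address.txt line, or None if it doesn't match
    match = ADDRESS_RE.search(eachLine)
    if match is None:
        return None
    return match.group(1), match.group(2)

Inside getsccodecore(), the first element would replace eachLine[:80] as the request URL and the second would replace eachLine[33:75] as the .sol filename.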