Python Notebook Web Scraping Practice Case 3 / yieldwatch

#KEY: NB7CMBBEG2USA8GVHF2DK3K4WGEV4X4NH1

# import the package that we need
from requests import Request, Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
import ast
import datetime as dt
import pandas as pd
import numpy as np
import time
import timeit
from datetime import datetime

# importing the libraries for webscraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests
import csv
from selenium.webdriver.chrome.options import Options  # for suppressing the browser
#import re
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without opening a visible window

import warnings
warnings.filterwarnings('ignore')

For this scraping task we need to open the website, type an address into it, check whether a result comes back, and record the result for every query that does get a response.
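
A minimal sketch of that browser step is shown here first, just to make the flow concrete (assuming chromedriver is available on PATH; the wallet address and the 'CAKE-BNB' pair string are placeholders, not real data). The full version further down adds the button clicks and the result bookkeeping.

# Minimal sketch: open yieldwatch, type in an address, check whether a farm pair appears
sketch_options = webdriver.ChromeOptions()
sketch_options.add_argument('--headless')

sketch_browser = webdriver.Chrome(options=sketch_options)  # assumes chromedriver is on PATH
sketch_browser.get("https://yieldwatch.net")
sketch_browser.implicitly_wait(30)  # wait for the page to load

address_box = sketch_browser.find_elements(by=By.ID, value='addressInputField')
address_box[0].send_keys('0x0000000000000000000000000000000000000000')  # placeholder wallet

# ...click the "Watch" button and wait for the dashboard, exactly as in the full loop below...

found = 'CAKE-BNB' in sketch_browser.page_source  # placeholder token pair
sketch_browser.quit()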

# url for webscraper
url = "https://yieldwatch.net"

# This is the LP token addresses that we want to work on

#LP_Table = address_of_lp_token.dropna()
#LP_Table['YW'] = -999

# This is the final result
total_result = pd.DataFrame(columns= ['LP_address','transfer', 'amount'])

#initiate a session
session = Session()
endpoint_url = 'https://api.bscscan.com/api'
transfer_topic = '0xddf252ad1be2c89b69c2b068fc378daa952ba7f163c4a11628f55a4df523b3ef'
staking_address_v1 = '0x00000000000000000000000073feaa1eE314F8c655E354234017bE2193C9E24E'
staking_address_v2 = '0x000000000000000000000000a5f8C5Dbd5F286960b9d90548680aE5ebFf07652'
staking_address_list = [staking_address_v1,staking_address_v2]
#Array of API Keys
KEY = ['NB7CMBBEG2USA8GVHF2DK3K4WGEV4X4NH1', '5JTRM138T2NH9IKV7FRNIQMBY3G7DVB99A',  '33G82Z1J154EKJ8QNX8VCHW8TYBIN5NWN9', 'NH9EKWTQRGGXY82AJT78J6T6R1I7T5JKKU']

def json_get_load(test_LP_address, staking_address, topic_number):
    #print(staking_address, topic_number)
    nrow = 1999  # dummy value so the while loop runs at least once
    stBlk = 0    # starting block, advanced as we page through the results
    Result = []
    KeyIdx = 0
    while nrow > 999:
        temp_Result = session.get(endpoint_url, params={
            'module': 'logs', 'action': 'getLogs',
            'fromBlock': stBlk, 'toBlock': 9999999999999,
            'address': test_LP_address.lower(),
            'topic0': transfer_topic,
            'topic0_' + topic_number + '_opr': 'and',  # combine the topic0 filter with topic1/topic2
            'topic' + topic_number: staking_address.lower(),
            'apikey': KEY[KeyIdx % 4]  # cycle through the 4 API keys
        })
        KeyIdx = KeyIdx + 1

        if KeyIdx > 100:
            return []
        elif json.loads(temp_Result.text)['status'] == '1': #If everything goes normally
            #print('hello')
            temp_Json = json.loads(temp_Result.text)['result']
            nrow = len(temp_Json)
            if (nrow > 999):
                stBlk = int(temp_Json[999]['blockNumber'], base=16)
            Result = Result + temp_Json

        else: #If something goes wrong
            if json.loads(temp_Result.text)['message'] == "No records found":
                return Result
            elif json.loads(temp_Result.text)['message'] == "Max rate limit reached":
                print('Limit Reached!')
                time.sleep(0.5)
                continue
            else:
                print('Something is really wrong!') #Tell Roy if you see this
                print(json.loads(temp_Result.text)['message'])
                time.sleep(1)
                continue
    return Result

In this part we are not scraping the page; we query the BscScan API directly.

A few things to keep in mind:

The API sometimes returns unexpected responses, so we have to call it correctly and handle the different cases it can reply with (no records found, rate limits, and other errors).

In the vast majority of cases each response is capped at a fixed number of records (1,000 here), so we have to submit the request several times, moving the starting block forward each time.
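
As a quick sanity check, json_get_load can be called on a single pool before running the full loop. The LP token address below is a placeholder; the point is just to show that each BscScan log record carries the counterparty in its topics and the hex-encoded amount in data.

# Example call on a single pool (the LP token address is a placeholder, not a real pool)
sample_lp = '0x0000000000000000000000000000000000000000'

# logs where the v2 staking contract is the sender (topic1) ...
logs_from = json_get_load(sample_lp, staking_address_v2, "1")
# ... and where it is the receiver (topic2)
logs_to = json_get_load(sample_lp, staking_address_v2, "2")

# each record is a dict returned by BscScan; the transfer amount is hex-encoded in 'data'
for log in logs_from[:3]:
    print(log['topics'][2], int(log['data'], 16))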


LP_Table = pd.read_csv('cached.csv')
for L_idx, LP in LP_Table.iterrows():
    start = time.time()

    if LP_Table['YW'][L_idx] < 0:

        lp_token = LP['Address of LP Token']
        print(str(L_idx) + '/' + str(len(LP_Table)) + ' : LP=' + lp_token)
        Result1 = []
        Result2 = []
        Result3 = []
        Result4 = []
        #get FROM v2 address
        result_3_temp = json_get_load(lp_token,staking_address_v2,"1")
        [Result3.append(x) for x in result_3_temp if x not in Result3]
        if len(Result3) > 1:
            #get TO v2 address
            result_4_temp = json_get_load(lp_token,staking_address_v2,"2")
            [Result4.append(x) for x in result_4_temp if x not in Result4]

        #if len(Result3 + Result4) < 3:
            #get FROM v1 address
        result_1_temp = json_get_load(lp_token,staking_address_v1,"1")
        [Result1.append(x) for x in result_1_temp if x not in Result1]
        if len(Result1) > 1:
            #get TO v1 address
            result_2_temp = json_get_load(lp_token,staking_address_v1,"2")
            [Result2.append(x) for x in result_2_temp if x not in Result2]

        result_Json_from = Result1+Result3
        result_Json_to = Result2+Result4

        if len(result_Json_from + result_Json_to) > 1:

            # Sum the transactions
            transactions = {}
            p = 0
            # Grab the data that we need
            for i in result_Json_from:
                #a = i['address']
                #b = i['topics'][1]
                c = i['topics'][2]
                d = int(i['data'], 16)
                transactions[p] = {'acct':c, 'amount': -1*d}
                p = p + 1

            for i in result_Json_to:
                #a = i['address']
                b = i['topics'][1]
                #c = i['topics'][2]
                d = int(i['data'], 16)
                transactions[p] =  {'acct':b, 'amount':d}
                p = p + 1
            transactions = pd.DataFrame.from_dict(transactions, "index")

            transactions_result = transactions.groupby('acct').sum()
            transactions_result = transactions_result[transactions_result['amount'] != 0].sort_values(ascending = False, by = 'amount').reset_index()

            # max_chk = min(len(transactions_result['acct']), 3) #ADD: Change this to 10 or 15 to check up to 10/15

            max_chk = min(len(transactions_result['acct']), 10)

            print('all good in data')

            #Try First Account in Yieldwatch, check if covered, IF NOT COVERED, maybe check next address until max of 10.
            for q in transactions_result['acct'][0:max_chk]:
                if LP_Table['YW'][L_idx] < 1:
                    #Reformat Account
                    Account = '0x' + q[26:]

                    #Launch a new window to prevent issues with stale pages
                    # browser = webdriver.Chrome(executable_path="C:/Users/RoyCh/PycharmProjects/WebScraper/chromedriver.exe")
                    browser = webdriver.Chrome(executable_path="C:/Users/Administrator/Dropbox/work220221005/1014/chromedriver.exe",options=options)
                    browser.get(url) #Load yieldwatch website
                    browser.implicitly_wait(30) #Wait for the page to load

                    #Locate Input Box by ID, type in the Account to check
                    addressBox = browser.find_elements(by=By.ID, value='addressInputField')
                    addressBox[0].clear()
                    addressBox[0].send_keys(Account)

                    #De-select all other buttons (XPATH is the structural location of the button on the page)
                    for g in [1, 2, 3, 5, 7, 8, 21, 25]:
                        MiscButton = browser.find_elements(by=By.XPATH,
                                                           value='/html/body/div/div/div/div[1]/div/div/div[4]/div/div['+ str(g) + ']/div')
                        MiscButton[0].click()

                    #Select the Pancake button
                    CakeButton = browser.find_elements(by=By.XPATH,
                                                        value = '/html/body/div/div/div/div[1]/div/div/div[4]/div/div[4]/div')
                    CakeButton[0].click()

                    #Submit request to look up
                    WatchButton = browser.find_elements(by=By.XPATH,
                                                       value='/html/body/div/div/div/div[1]/div/div/div[1]/div/div[2]/button[1]')
                    WatchButton[0].click()

                    Dummy_Wait = browser.find_elements(by=By.XPATH, #Dummy find command to force selenium to wait for page to load
                                 value = '/html/body/div/div/div/div[2]/div[2]/div')

                    #Try to find 'Token1-Token2' or 'Token2-Token1'
                    content = browser.page_source #Loads the HTML of the webpage

                    #Find farms (ADD: Check for matches ignoring capitalizations)
                    #All token symbols are upper-case on the yieldwatch page, so we don't need to check alternate capitalizations
                    found_pair = (content.find(LP['TK1'] + '-' + LP['TK2']) >= 0) or (content.find(LP['TK2'] + '-' + LP['TK1']) >= 0)
                    LP_Table['YW'][L_idx] = int(found_pair)

                    #Keep track of accounts checked so far for this farm
                    LP_Table['acct'][L_idx] = str(LP_Table['acct'][L_idx]) + ' ' + Account
                    browser.close() # Close the window before launching a new browser
    
    end = time.time()

    print('Time Elapsed: ' + str(end - start))

    pd.DataFrame(LP_Table,columns=['Address of LP Token','TK1','TK2','YW','acct']).to_csv('cached.csv') #Saves progress after every loop
