#KEY: NB7CMBBEG2USA8GVHF2DK3K4WGEV4X4NH1
# import the package that we need
from requests import Request, Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
import ast
import datetime as dt
import pandas as pd
import numpy as np
import time
import timeit
from datetime import datetime
from telnetlib import EC
# importing the libraries for webscraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests
import csv
from selenium.webdriver.chrome.options import Options # for suppressing the browser
#import re
# ---------------------------------------------------------------------------
# Module-level configuration shared by the API client and the Selenium scraper.
# ---------------------------------------------------------------------------

# Run Chrome headless so no browser window pops up during scraping.
options = webdriver.ChromeOptions()
options.add_argument('headless')
import warnings
warnings.filterwarnings('ignore')
# NOTE (translated from the original Chinese): this task logs into a website,
# submits a query, checks whether a reply comes back, and records every entry
# that received a reply.
# url for webscraper (yieldwatch farm-coverage lookup)
url = "https://yieldwatch.net"
# This is the LP token addresses that we want to work on
#LP_Table = address_of_lp_token.dropna()
#LP_Table['YW'] = -999
# This is the final result (accumulator; schema: LP address, direction, amount)
total_result = pd.DataFrame(columns= ['LP_address','transfer', 'amount'])
# Re-usable HTTP session for all BscScan API calls.
session = Session()
endpoint_url = 'https://api.bscscan.com/api'
# keccak256 signature of the ERC-20 Transfer(address,address,uint256) event.
transfer_topic = '0xddf252ad1be2c89b69c2b068fc378daa952ba7f163c4a11628f55a4df523b3ef'
# Staking (MasterChef) contract addresses, left-padded to 32 bytes so they can
# be matched directly against indexed log topics.
staking_address_v1 = '0x00000000000000000000000073feaa1eE314F8c655E354234017bE2193C9E24E'
staking_address_v2 = '0x000000000000000000000000a5f8C5Dbd5F286960b9d90548680aE5ebFf07652'
staking_address_list = [staking_address_v1,staking_address_v2]
# Array of API Keys, cycled through to spread the per-key rate limit.
# SECURITY NOTE(review): keys are hard-coded in source (and one is repeated in
# the header comment) — they should be rotated and loaded from the environment.
KEY = ['NB7CMBBEG2USA8GVHF2DK3K4WGEV4X4NH1', '5JTRM138T2NH9IKV7FRNIQMBY3G7DVB99A', '33G82Z1J154EKJ8QNX8VCHW8TYBIN5NWN9', 'NH9EKWTQRGGXY82AJT78J6T6R1I7T5JKKU']
def json_get_load(test_LP_address, staking_address, topic_number):
    """Fetch every Transfer log between an LP token and a staking contract.

    Repeatedly queries the BscScan ``logs/getLogs`` endpoint, paging by block
    number (the API returns at most 1000 records per call) until a page with
    fewer than 1000 records comes back.

    Parameters
    ----------
    test_LP_address : str
        Address of the LP token contract whose logs are queried.
    staking_address : str
        Staking contract address, left-padded to 32 bytes, matched against an
        indexed topic of the Transfer event.
    topic_number : str
        Which indexed topic to match: ``"1"`` (FROM address) or ``"2"``
        (TO address).

    Returns
    -------
    list
        Concatenated raw log records, or ``[]`` if the attempt budget (100
        requests) is exhausted.
    """
    nrow = 1999        # sentinel > 999 so the loop body runs at least once
    stBlk = 0          # fromBlock for the next page
    Result = []
    KeyIdx = 0         # cycles the API keys and doubles as the attempt counter
    while nrow > 999:
        try:
            temp_Result = session.get(endpoint_url, params={
                'module': 'logs', 'action': 'getLogs',
                'fromBlock': stBlk, 'toBlock': 9999999999999,
                'address': test_LP_address.lower(),
                'topic0': transfer_topic,
                # BUGFIX: the operator parameter must name the topic pair being
                # combined (topic0_1_opr vs topic0_2_opr).  The old code always
                # sent 'topic0_1_opr', which BscScan ignores when the filter is
                # actually on topic2.
                'topic0_' + topic_number + '_opr': 'and',
                'topic' + topic_number: staking_address.lower(),
                # Cycle through all keys (was hard-coded "% 4").
                'apikey': KEY[KeyIdx % len(KEY)],
            })
        except (ConnectionError, Timeout, TooManyRedirects):
            # Transient network failure: count the attempt, back off, retry.
            KeyIdx = KeyIdx + 1
            time.sleep(1)
            continue
        KeyIdx = KeyIdx + 1
        if KeyIdx > 100:
            # Attempt budget exhausted — give up on this LP token.
            return []
        payload = json.loads(temp_Result.text)   # parse once, reuse below
        if payload['status'] == '1':  # normal, successful reply
            temp_Json = payload['result']
            nrow = len(temp_Json)
            if (nrow > 999):
                # Page was full: the next page starts at the block number of
                # the last record (hex-encoded in the reply).
                stBlk = int(temp_Json[999]['blockNumber'], base=16)
            Result = Result + temp_Json
        else:  # error reply from the API
            if payload['message'] == "No records found":
                return Result
            elif payload['message'] == "Max rate limit reached":
                print('Limit Reached!')
                time.sleep(0.5)
                continue
            else:
                # Unexpected API error — log the message and retry.
                print('Something is really wrong!')
                print(payload['message'])
                time.sleep(1)
                continue
    return Result
# For this part we did not use a web scraper; we used the API instead.
# Note:
# The API sometimes returns unusual replies, so we must call it correctly and
# handle the different kinds of responses it can send back.
# In 99% of cases the API caps the number of records per reply, so we need to
# submit the request several times to collect everything.
# ---------------------------------------------------------------------------
# Main driver loop.
# For each LP token in cached.csv that is still unresolved (YW < 0): pull its
# staking Transfer logs from BscScan, net out the per-account amounts, then
# look up the largest accounts on yieldwatch.net to see whether the farm is
# covered there.  Progress is written back to cached.csv after every row so
# the script can be interrupted and resumed.
# ---------------------------------------------------------------------------
LP_Table = pd.read_csv('cached.csv')
for L in LP_Table.iterrows():
    start = time.time()
    L_idx = L[0]   # row index in LP_Table
    LP = L[1]      # row data (pandas Series)
    # YW < 0 marks rows that have not been processed yet.
    if LP_Table['YW'][L_idx] < 0:
        lp_token = LP['Address of LP Token']
        print(str(L_idx) + '/' + str(len(LP_Table)) + ' : LP=' + lp_token)
        Result1 = []
        Result2 = []
        Result3 = []
        Result4 = []
        #get FROM v2 address (topic1); the list-comp dedupes while keeping order
        result_3_temp = json_get_load(lp_token,staking_address_v2,"1")
        [Result3.append(x) for x in result_3_temp if x not in Result3]
        if len(Result3) > 1:
            #get TO v2 address (topic2)
            result_4_temp = json_get_load(lp_token,staking_address_v2,"2")
            [Result4.append(x) for x in result_4_temp if x not in Result4]
        #if len(Result3 + Result4) < 3:
        #get FROM v1 address
        result_1_temp = json_get_load(lp_token,staking_address_v1,"1")
        [Result1.append(x) for x in result_1_temp if x not in Result1]
        if len(Result1) > 1:
            #get TO v1 address
            result_2_temp = json_get_load(lp_token,staking_address_v1,"2")
            [Result2.append(x) for x in result_2_temp if x not in Result2]
        result_Json_from = Result1+Result3   # transfers out of the staking contracts
        result_Json_to = Result2+Result4     # transfers into the staking contracts
        if len(result_Json_from + result_Json_to) > 1:
            # Sum the transactions per account.  Outgoing amounts are negated
            # so that a groupby-sum yields each account's net staked balance.
            transactions = {}
            p = 0
            # Grab the data that we need
            for i in result_Json_from:
                #a = i['address']
                #b = i['topics'][1]
                c = i['topics'][2]       # receiving account (indexed topic2)
                d = int(i['data'], 16)   # transfer amount, hex-encoded
                transactions[p] = {'acct':c, 'amount': -1*d}
                p = p + 1
            for i in result_Json_to:
                #a = i['address']
                b = i['topics'][1]       # sending account (indexed topic1)
                #c = i['topics'][2]
                d = int(i['data'], 16)
                transactions[p] = {'acct':b, 'amount':d}
                p = p + 1
            transactions = pd.DataFrame.from_dict(transactions, "index")
            # Net amount per account, largest stakers first; drop zero balances.
            transactions_result = transactions.groupby('acct').sum()
            transactions_result = transactions_result[transactions_result['amount'] != 0].sort_values(ascending = False, by = 'amount').reset_index()
            # max_chk = min(len(transactions_result['acct']), 3) #ADD: Change this to 10 or 15 to check up to 10/15
            max_chk = min(len(transactions_result['acct']), 10)
            print('all good in data')
            #Try First Account in Yieldwatch, check if covered, IF NOT COVERED, maybe check next address until max of 10.
            for q in transactions_result['acct'][0:max_chk]:
                # Stop checking further accounts once the farm is confirmed (YW == 1).
                if LP_Table['YW'][L_idx] < 1:
                    #Reformat Account: drop the 24 zero-padding hex chars from the 32-byte topic.
                    Account = '0x' + q[26:]
                    #Launch a new window to prevent issues with stale pages
                    # browser = webdriver.Chrome(executable_path="C:/Users/RoyCh/PycharmProjects/WebScraper/chromedriver.exe")
                    browser = webdriver.Chrome(executable_path="C:/Users/Administrator/Dropbox/work220221005/1014/chromedriver.exe",options=options)
                    browser.get(url) #Load yieldwatch website
                    browser.implicitly_wait(30) #Wait for the page to load
                    #Locate Input Box by ID, type in the Account to check
                    addressBox = browser.find_elements(by=By.ID, value=
                        'addressInputField')
                    addressBox[0].clear()
                    addressBox[0].send_keys(Account)
                    #De-select all other buttons (XPATH is the structural location of the button on the page)
                    for g in [1, 2, 3, 5, 7, 8, 21, 25]:
                        MiscButton = browser.find_elements(by=By.XPATH,
                            value='/html/body/div/div/div/div[1]/div/div/div[4]/div/div['+ str(g) + ']/div')
                        MiscButton[0].click()
                    #Select the Pancake button
                    CakeButton = browser.find_elements(by=By.XPATH,
                        value = '/html/body/div/div/div/div[1]/div/div/div[4]/div/div[4]/div')
                    CakeButton[0].click()
                    #Submit request to look up
                    WatchButton = browser.find_elements(by=By.XPATH,
                        value='/html/body/div/div/div/div[1]/div/div/div[1]/div/div[2]/button[1]')
                    WatchButton[0].click()
                    Dummy_Wait = browser.find_elements(by=By.XPATH, #Dummy find command to force selenium to wait for page to load
                        value = '/html/body/div/div/div/div[2]/div[2]/div')
                    #Try to find 'Token1-Token2' or 'Token2-Token1'
                    content = browser.page_source #Loads the HTML of the webpage
                    #Find farms (ADD: Check for matches ignoring capitalizations)
                    #Since all characters are capitalized on the yield farm page, checking other capitalizations should not be needed.
                    # NOTE(review): str.find returns -1 when absent, so with exactly one
                    # match at index 0 or 1 the '> 0' test can misclassify — confirm.
                    # NOTE(review): the chained assignments below can raise pandas
                    # SettingWithCopyWarning; LP_Table.loc[L_idx, 'YW'] would be safer.
                    LP_Table['YW'][L_idx] = int((content.find(LP['TK1'] + '-' + LP['TK2']) + content.find(LP['TK2'] + '-' + LP['TK1'])) > 0)
                    #Keep track of accounts checked so far for this farm
                    LP_Table['acct'][L_idx] = str(LP_Table['acct'][L_idx]) + ' ' + Account
                    browser.close() # Close the window before launching a new browser
    end = time.time()
    print('Time Elapsed: ' + str(end - start))
    pd.DataFrame(LP_Table,columns=['Address of LP Token','TK1','TK2','YW','acct']).to_csv('cached.csv') #Saves progress after every loop