一直想做这个，可是奈何没有时间。周日晚上突然特别想动手，于是一晚上边喝啤酒边敲代码，到凌晨五点多终于写完了这个原型。4个小时大概提交了四百多次，过了一百多个题目，正确率就那样，哈哈。代码没有处理查重问题，而且由于某些因素有时候会中断，之后有时间再做改进。
稍微讲一下原理：登录 -> 获取题目 -> 根据题目用百度搜索并爬取结果url -> 筛选出CSDN的url，用无头浏览器爬取代码 -> 提交代码 -> 判断结果 -> 答案错误则继续爬下一个结果，正确则进入下一题。
学校模拟oj的登录比较简单，就是密码要做MD5加密，另外有个csrf要注意一下。看了好久都没有找到可以越权的地方，比较尴尬。
因为CSDN的内置搜索还没有百度好用，所以登录后先爬到题目和内容，再去爬百度。爬取百度的部分参考了博客上面的代码，但是要注意的是百度给的url是重定向的，要处理一下得到真实url，然后再去爬CSDN。由于CSDN用了js加密，而且蛮复杂，网上也没有找到案例，就用无头浏览器了，没什么技术含量，哈哈。
代码贴上。要运行的话，需要先安装Chrome无头浏览器的驱动（chromedriver）。
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import urllib.parse
import urllib
import hashlib, binascii
import time
import chardet
from lxml import etree
import re
import json
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# 导入chrome选项
from selenium.webdriver.chrome.options import Options
from requests.exceptions import RequestException
from urllib.parse import urljoin
# NOTE(review): a `global` statement at module level has no effect; the
# functions below declare `global cs` themselves and main() assigns it.
global cs
# Judge account credentials; main() fills both fields from user input.
user={'id':'','passw':''}
# Headless Chrome instance used by ans_code() to render blog pages.
# NOTE(review): `chrome_options=` is the older Selenium keyword; newer
# Selenium releases expect `options=` - confirm against the installed version.
chrome_options = Options()  # headless browser
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(chrome_options=chrome_options)
# Cap page loads and script execution at 10 seconds each.
driver.set_page_load_timeout(10)
driver.set_script_timeout(10)
# Shared HTTP session used for every request to the online judge
# (csrf, login_oj, submit_code, get_pro, is_ac).
s=requests.session()
# NOTE(review): not referenced by any function visible in this file;
# get_page() builds its own local `headers` dict.
headers={'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:55.0) Gecko/20100101 Firefox/55.0'}
def saveHtml(file_name, file_content):
    """Dump *file_content* into ``<file_name>.html`` as encoded bytes.

    Every ``/`` in *file_name* is replaced by ``_`` so a URL-like name maps
    to a single file relative to the current working directory.
    """
    target_path = "{}.html".format(file_name.replace('/', '_'))
    with open(target_path, mode="wb") as handle:
        encoded = str.encode(file_content)
        handle.write(encoded)
def format_url(url, params: dict = None) -> str:
    """Append *params* to *url* as a URL-encoded query string.

    Args:
        url: Base URL without a query string.
        params: Mapping of query parameters. ``None`` (the default) or an
            empty mapping means "no query string".

    Returns:
        ``url?key=value&...`` when parameters are given, otherwise *url*
        unchanged.
    """
    # urlencode(None) raises TypeError, so the declared default has to be
    # handled before encoding.
    if not params:
        return url
    query_str = urllib.parse.urlencode(params)
    return f'{url}?{query_str}'
def get_url(keyword):
    """Return the Baidu search URL whose ``wd`` parameter is *keyword*.

    *keyword* is converted with ``str()`` before being URL-encoded by
    ``format_url``.
    """
    query = {'wd': str(keyword)}
    return format_url("https://www.baidu.com/s", query)
def get_page(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200, otherwise ``None``
        (also ``None`` when the request itself fails or times out).
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    }
    try:
        response = requests.get(url=url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    # Force UTF-8, otherwise the decoded text comes out garbled.
    response.encoding = "utf-8"
    print(response.status_code)
    if response.status_code == 200:
        return response.text
    return None
def parse_page(url, page):
    """Yield search results from up to *page* Baidu result pages.

    Args:
        url: URL of the first result page (as built by ``get_url``).
        page: Maximum number of result pages to walk; converted with
            ``int()``.

    Yields:
        One dict per result with the keys ``title``, ``sub_url`` (the
        result's link as a string) and ``abstract``. Results without a link
        are skipped, so ``sub_url`` is never empty.

    The generator stops early when a page cannot be downloaded or parsed, or
    when the current page has no "next page" link.
    """
    results_per_page = 10
    for i in range(1, int(page) + 1):
        print("正在爬取第{}页....".format(i))
        html = get_page(url)
        if not html:
            # get_page returns None on any failure; nothing to parse.
            return
        content = etree.HTML(html)
        if content is None:
            return

        # Result containers carry consecutive numeric ids across pages:
        # 1..10 on the first page, 11..20 on the second, and so on.
        first_id = (i - 1) * results_per_page + 1
        for result_id in range(first_id, first_id + results_per_page):
            links = content.xpath('//*[@id="%d"]/h3/a/@href' % result_id)
            if not links:
                continue

            # Fields are rebuilt for every result so a missing title or
            # abstract never inherits the previous result's value.
            title = ""
            res_title = content.xpath('//*[@id="%d"]/h3/a' % result_id)
            if res_title:
                title = res_title[0].xpath('string(.)')

            abstract = ""
            res_abstract = content.xpath(
                '//*[@id="%d"]/div[@class="c-abstract"]' % result_id)
            if not res_abstract:
                # Alternative layout where the abstract is nested deeper.
                res_abstract = content.xpath(
                    '//*[@id="%d"]/div/div[2]/div[@class="c-abstract"]' % result_id)
            if res_abstract:
                abstract = res_abstract[0].xpath('string(.)')

            yield {'title': title, 'sub_url': links[0], 'abstract': abstract}

        # Position of the "next page" anchor inside the pager: the 10th link
        # on the first page, the 11th afterwards (presumably because later
        # pages also carry a "previous page" link - TODO confirm).
        next_index = 10 if i == 1 else 11
        rel_url = content.xpath('//*[@id="page"]/a[{}]/@href'.format(next_index))
        if not rel_url:
            print("无更多页面!~")
            return
        url = urljoin(url, rel_url[0])
def get_real(o_url, timeout=10):
    """Resolve a redirecting URL to the address it points at.

    Args:
        o_url: URL that may answer with an HTTP redirect (Baidu result
            links do).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``Location`` header of the redirect response, or *o_url*
        unchanged when the response is not a redirect, carries no
        ``Location`` header, *o_url* is empty, or the request fails.
    """
    if not o_url:
        return o_url
    try:
        # Automatic redirects are disabled so the Location header is visible.
        r = requests.get(o_url, allow_redirects=False, timeout=timeout)
    except RequestException:
        # Best effort: an unreachable link is handed back as-is.
        return o_url
    if r.status_code in (301, 302, 303, 307, 308):
        return r.headers.get('location', o_url)
    return o_url
#Repairing a Road CSDN
#https://blog.csdn.net/Yellow_python/article/details/81107273
def ans_code(url):
    """Open a blog post in the headless browser and extract its C++ code.

    Args:
        url: Link to the post; it is first resolved through ``get_real``.

    Returns:
        The text of the first element with class ``language-cpp``, or the
        string ``'nil'`` when the page cannot be opened or no such element
        exists.
    """
    # Local import: the exception classes ship with the selenium package the
    # file already depends on.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    url = get_real(url)
    try:
        driver.get(url)
    except TimeoutException:
        # The page-load timeout fired; whatever has loaded so far may still
        # contain the code block, so keep going.
        pass
    except WebDriverException:
        # The page never opened; scraping now would read the previous page.
        return 'nil'
    else:
        # Give the page's scripts time to render the article.
        time.sleep(3)

    # Expanding the article is optional: short posts have no such button,
    # and its absence must not prevent reading the code.
    try:
        driver.find_element_by_id('btn-readmore').click()
    except WebDriverException:
        pass

    try:
        code = driver.find_element_by_class_name('language-cpp').text
    except WebDriverException:
        return 'nil'
    print(code)
    return code
def get_md5_value(str):
    """Return the MD5 hex digest of *str* (encoded as UTF-8).

    The digest is the 32-character lowercase hexadecimal form; it is also
    printed to stdout before being returned.
    """
    digest = hashlib.md5(str.encode(encoding='utf-8')).hexdigest()
    print(digest)
    return digest
def csrf():
    """Fetch the CSRF token the judge expects with form submissions.

    Returns:
        The value of the ``csrf`` form field found in the response of
        ``csrf.php``, requested through the shared session ``s``.

    Raises:
        RuntimeError: If the response contains no ``csrf`` field.
    """
    firsturl = 'http://acm.hnucm.edu.cn/JudgeOnline/csrf.php'
    html = s.get(firsturl, timeout=10).text
    # [^"]+ stops at the closing quote; a greedy .+ could swallow any further
    # attributes that happen to sit on the same line.
    match = re.search(r'name="csrf" value="([^"]+)"', html)
    if match is None:
        raise RuntimeError('csrf token not found in response from ' + firsturl)
    return match.group(1)
def login_oj(id, pa):
    """Log the shared session ``s`` into the judge.

    Posts the user id, the MD5 hex digest of the password and the
    module-level CSRF token ``cs`` to ``login.php``, then requests a submit
    page through the same session (the response text is read but not used).
    """
    credentials = {
        'user_id': id,
        'password': get_md5_value(pa),
        'csrf': cs,
    }
    s.post('http://acm.hnucm.edu.cn/JudgeOnline/login.php', data=credentials)
    submit_page = s.get('http://acm.hnucm.edu.cn/JudgeOnline/submitpage.php?id=1100&sid=48970')
    _ = submit_page.text
def submit_code(code, id):
    """Submit *code* as a solution to problem *id* via the shared session.

    The form is posted to ``submit.php`` with language ``'1'`` and the
    module-level CSRF token ``cs``; the response is ignored.
    """
    form = {
        'id': id,
        'language': '1',
        'source': code,
        'reverse2': 'reverse',
        'csrf': cs,
    }
    s.post('http://acm.hnucm.edu.cn/JudgeOnline/submit.php', data=form)
def get_pro(id):
    """Build a search keyword from a problem page on the judge.

    Args:
        id: Problem number as a string; it is appended to the problem URL.

    Returns:
        The problem title followed by the text of every styled ``<span>``
        on the page, or the sentinel ``'00'`` when the page cannot be
        fetched or has no title (callers skip such problems).
    """
    url = 'http://acm.hnucm.edu.cn/JudgeOnline/problem.php?id=' + id
    try:
        tt = s.get(url, timeout=10).text
    except RequestException:
        return '00'

    ss = re.compile(r'</title><center><h2>(.+)</h2>', flags=re.DOTALL)
    si = re.compile(r'<span style="(.+?)<', flags=re.DOTALL)

    tit = ss.findall(tt)
    print(tit)
    if not tit:
        # Return the bare sentinel; appending span text to it would defeat
        # the caller's `== '00'` check.
        return '00'

    parts = [tit[0]]
    for i in si.findall(tt):
        print(i)
        # Each match starts right after `<span style="`; the first 18
        # characters are dropped - presumably the remainder of the style
        # attribute, TODO confirm against the page markup.
        parts.append(i[18:])
    pt = ''.join(parts)
    print(pt)
    return pt
def is_ac(user_id=None):
    """Report whether the newest submission on the status page was accepted.

    Args:
        user_id: Account whose status page is checked. Defaults to the
            logged-in account stored in the module-level ``user`` dict, so
            the check follows whoever is actually submitting.

    Returns:
        ``True`` when the first verdict label on the page is ``success``;
        ``False`` otherwise, including when no verdict label is found.
    """
    if user_id is None:
        user_id = user['id']
    query = urllib.parse.urlencode({'user_id': user_id})
    url = 'http://acm.hnucm.edu.cn/JudgeOnline/status.php?' + query
    tt = s.get(url, timeout=10).text
    ss = re.compile(r'class=\'label label-(.*?)\' title=\'', flags=re.DOTALL)
    verdicts = ss.findall(tt)
    if not verdicts:
        # No submissions listed (or the markup changed): not accepted.
        return False
    ff = verdicts[0]
    if ff == 'success':
        print("ACACAC啦!!")
        return True
    print(ff)
    return False
def _try_problem(problem_id):
    """Search for solutions to one problem and submit them until accepted."""
    tit = get_pro(problem_id)
    if tit == '00':
        return
    results = parse_page(get_url(tit), 2)
    for result in results:
        print(result)
        sub_url = result['sub_url']
        # A result without a usable link cannot be resolved or scraped.
        if not isinstance(sub_url, str) or not sub_url:
            continue
        try:
            # Search-result links are redirects; only posts hosted on
            # blog.csdn.net are scraped.
            if not get_real(sub_url).startswith('https://blog.csdn.net'):
                continue
            code = ans_code(sub_url)
            if code == 'nil':
                continue
            submit_code(code, problem_id)
            # Wait for the judge to grade the submission before checking.
            time.sleep(8)
            if is_ac():
                break
        except Exception:
            # Best effort: a failing candidate must not stop the run.
            # (Unlike a bare `except`, this still lets Ctrl-C through.)
            continue


def main():
    """Log in to the judge and try to solve problems 1100-1399.

    Prompts for the account and password, stores them in the module-level
    ``user`` dict, fetches the CSRF token into the global ``cs`` and logs
    in. Each problem is then handled independently; an unexpected error on
    one problem is reported and the run continues with the next. The
    headless browser is shut down when the run ends, however it ends.
    """
    global cs
    user['id'] = input('账号:')
    user['passw'] = input('密码:')
    try:
        cs = csrf()
        login_oj(user['id'], user['passw'])
        for problem_id in range(1100, 1400):
            try:
                _try_problem(str(problem_id))
            except Exception as exc:
                print('problem {} skipped: {!r}'.format(problem_id, exc))
    finally:
        # Without this the headless Chrome process outlives the script.
        driver.quit()
# Run the crawler only when the file is executed as a script.
if "__main__" == __name__:
    main()