网址:http://www.heibanke.com/lesson/crawler_ex00/
大概用了一小天的时间吧,把这五关给过了,还挺好玩的。推荐小白玩玩。直接po代码了。
我装pytesseract总有问题,之后其实可以直接训练一个CNN来识别验证码,但是现在先po代码了,晚上回来再随便弄弄。所以第五关的验证码目前是人工处理的:把图片show出来,人工识别后输入结果,很脑残。不管了。
import requests
from lxml import etree
import re
def level_1():
    """Walk the level 1 chain of numbered pages, printing each URL built.

    Starting from a fixed page, the number announced in the page heading is
    appended to the base URL to form the next page, and so on.  The walk
    stops at the first page whose heading no longer announces a next number,
    or that cannot be fetched or parsed.
    """
    url_row = 'http://www.heibanke.com/lesson/crawler_ex00/'
    url = 'http://www.heibanke.com/lesson/crawler_ex00/10963/'
    heading_path = "//div[@class='col-xs-12 col-sm-10 col-md-8 col-lg-6']//h3"
    # Raw string so that \d is a regex escape rather than a string escape.
    next_number = re.compile(r'下一个你需要输入的数字是(\d+)')
    while True:
        try:
            html = etree.HTML(requests.get(url).text)
        except (requests.RequestException, etree.LxmlError, ValueError) as exc:
            # Report the failure instead of silently swallowing everything.
            print('stopped at', url, '-', exc)
            break
        headings = html.xpath(heading_path) if html is not None else []
        text = headings[0].text if headings else None
        match = next_number.search(text or '')
        if match is None:
            # The heading announces no further number: end of the chain.
            break
        url = url_row + match.group(1) + '/'
        print(url)
def level_2():
    """Brute-force the level 2 form by trying passwords 0 through 39 in order.

    Each candidate is posted together with the user name 'hi'.  The first
    candidate whose response heading differs from the "wrong password"
    message is printed and the search ends.
    """
    target = 'http://www.heibanke.com/lesson/crawler_ex01/'
    heading_path = "//div[@class='col-xs-12 col-sm-10 col-md-8 col-lg-6']//h3"
    wrong_password = '您输入的密码错误, 请重新输入'
    for candidate in range(40):
        form = {'username': 'hi', 'password': candidate}
        reply = requests.post(target, data=form)
        heading = etree.HTML(reply.text).xpath(heading_path)[0].text
        if heading != wrong_password:
            print(candidate)
            break
def level_3():
    """Log in, then brute-force the level 3 form with passwords 20 through 39.

    The site login is done first with a hard-coded account; the URL the
    login response ends up at is used as the target of the password search.
    Every form submission carries the ``csrftoken`` cookie value as
    ``csrfmiddlewaretoken``.  The first candidate whose response heading
    differs from the "wrong password" message is printed and the search ends.
    """
    heading_path = "//div[@class='col-xs-12 col-sm-10 col-md-8 col-lg-6']//h3"
    wrong_password = '您输入的密码错误, 请重新输入'

    username = 'emma'
    password = 'emmaczw'
    url = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex02/'
    r_session = requests.Session()
    resp = r_session.get(url)
    token = resp.cookies.get('csrftoken')
    resp = r_session.post(url, data={'username': username, 'password': password,
                                     'csrfmiddlewaretoken': token})
    # Continue on whatever page the login response ended up at.
    url = resp.url
    print(url)

    username = 'hi'
    # NOTE: the search starts at 20, so candidates 0-19 are never tried.
    password = 20
    while password < 40:
        # Re-read the page before each attempt and take the token from the jar.
        r_session.get(url)
        token = r_session.cookies.get('csrftoken')
        # Post through the session (not the module-level requests.post) so the
        # same cookie jar and connection are used as for the login above.
        resp = r_session.post(url, data={'username': username, 'password': password,
                                         'csrfmiddlewaretoken': token})
        print(resp.url)
        headings = etree.HTML(resp.text).xpath(heading_path)
        if not headings:
            # Unexpected page (for example the login did not succeed): stop
            # with a message instead of failing with an IndexError.
            print('no result heading found at', resp.url)
            break
        if headings[0].text == wrong_password:
            password += 1
        else:
            print('correct password is: ', password)
            break
def level_4():
    """Collect the 100-position password of level 4 and print it.

    Pages 1 to 13 of the password list are requested over and over from
    worker threads.  Each table row read from a page gives a 1-based position
    (first cell) and the value at that position (second cell).  Polling stops
    once at least 99 positions are known; the password is then printed as one
    string, with '{}' standing in for any position still unknown.
    """
    import threading
    import time

    password = [-1] * 100  # -1 marks a position that has not been seen yet
    sum_num = 0  # number of positions filled in so far
    state_lock = threading.Lock()  # guards password and sum_num

    r_session = requests.Session()
    # First request of the session; the csrftoken cookie is read from the
    # session's jar afterwards.
    r_session.get('http://www.heibanke.com/lesson/crawler_ex03/pw_list/?page=1')
    url = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex03/pw_list/?page='

    def get_password(number):
        """Fetch list page *number* and record every new position it shows."""
        nonlocal sum_num
        url_temp = url + str(number)
        print(url_temp)
        try:
            resp = r_session.post(url_temp, data={
                'username': 'emma',
                'password': 'emmaczw',
                'csrfmiddlewaretoken': r_session.cookies.get('csrftoken'),
            })
            html = etree.HTML(resp.text)
        except (requests.RequestException, etree.LxmlError, ValueError) as exc:
            print('page', number, 'failed:', exc)
            return
        if html is None:
            return
        # The first row is skipped (header row).
        pass_list = html.xpath("//table[@class='table table-striped']//tr")[1:]
        for row in pass_list:
            cells = row.xpath(".//td")
            try:
                index = int(cells[0].text)
                num = int(cells[1].text)
            except (IndexError, TypeError, ValueError):
                continue  # malformed row: ignore it
            if not 1 <= index <= len(password):
                continue  # position outside the password: ignore it
            # Check and update under the lock so two threads can never both
            # count the same position.
            with state_lock:
                if password[index - 1] == -1:
                    print(sum_num)
                    sum_num += 1
                    password[index - 1] = num

    while sum_num <= 98:
        for i in range(1, 14):
            # Daemon threads: requests still in flight do not keep the
            # program alive once the result has been printed.
            threading.Thread(target=get_password, args=(i,), daemon=True).start()
            time.sleep(1)
        time.sleep(2)

    # Build the output from a snapshot instead of rewriting the shared list
    # in place, since worker threads may still be running.
    with state_lock:
        snapshot = list(password)
    print(''.join('{}' if value == -1 else str(value) for value in snapshot))
def level_5():
    """Brute-force level 5, with the captcha read by the person running it.

    Logs in with a hard-coded account, then for each candidate password
    (14 through 39) downloads the captcha image found on the current page,
    displays it, reads the answer typed on stdin and submits the form.  The
    search ends at the first response whose heading is not the "wrong
    password" message; that heading and the candidate are printed.

    Needs Pillow (``PIL``) to display the captcha image.
    """
    from urllib.parse import urljoin

    from PIL import Image

    site_root = 'http://www.heibanke.com/'
    url_ = 'http://www.heibanke.com/lesson/crawler_ex04/'
    url = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex04/'
    heading_path = "//div[@class='col-xs-12 col-sm-10 col-md-8 col-lg-6']//h3"
    wrong_password = '您输入的密码错误, 请重新输入'
    max_image_failures = 3  # consecutive failed downloads before giving up

    def get_image(img_url):
        """Show the captcha at *img_url* and return the text typed for it.

        Returns None when the image cannot be downloaded.
        """
        reply = requests.get(img_url)
        if reply.status_code != 200:
            return None
        with open('./img_temp.png', 'wb') as f:
            f.write(reply.content)
        with Image.open('./img_temp.png') as picture:
            picture.show()
        # Manual recognition: the operator types what the image shows.
        answer = input()
        print(answer)
        return answer

    r_session = requests.Session()
    r_session.get(url)
    resp = r_session.post(url, data={'username': 'emma', 'password': 'emmaczw',
                                     'csrfmiddlewaretoken': r_session.cookies.get('csrftoken')})
    password = 14
    image_failures = 0
    while password < 40:
        images = etree.HTML(resp.text).xpath("//div[@class='form-group']//img")
        if not images:
            # No captcha on the page at all: nothing can be submitted.
            print('没有找到图片')
            break
        # urljoin avoids the double slash that plain concatenation produces
        # when the src attribute starts with '/'.
        img_url = urljoin(site_root, images[0].get('src'))
        # The captcha key sent back with the form is the last path segment
        # of the image URL.
        captcha_key = img_url.rstrip('/').rsplit('/', 1)[-1]
        number = get_image(img_url)
        print(img_url)
        print(captcha_key)
        if number is None:
            print('没有找到图片')
            image_failures += 1
            if image_failures >= max_image_failures:
                # Give up rather than retrying the same download forever.
                break
            # Reload the level page before trying again.
            resp = r_session.get(url_)
            continue
        image_failures = 0
        resp = r_session.post(url_, data={'username': 'hi', 'password': password,
                                          'csrfmiddlewaretoken': r_session.cookies.get('csrftoken'),
                                          'captcha_1': number, 'captcha_0': captcha_key})
        headings = etree.HTML(resp.text).xpath(heading_path)
        if not headings:
            print('no result heading found at', resp.url)
            break
        a = headings[0].text
        if a == wrong_password:
            password += 1
        else:
            # NOTE(review): any heading other than the wrong-password text
            # ends the search - presumably including a captcha error.  The
            # heading is printed so the operator can tell the cases apart.
            print(a)
            print(password)
            break