#!/usr/bin/env python
# _*_ coding:utf-8 _*_
#此代码是书上151页获取验证码提交答案
#需要安装pillow库进行图片的转换和像素的变化
#需要安装tesseract进行图片文字的识别
#这段代码的思路是:先打开需要验证码填写的网页,获取必要的信息,将验证码转换为字符串,
#查看当前网页的提交的路径,然后将这些参数传递过去,判断是否成功即可
from urllib.request import urlopen, urlretrieve
import requests
from PIL import Image
from PIL import ImageOps
#利用pillow库对图片进行像素和尺寸的变化
from bs4 import BeautifulSoup
#用于创建子进程
import subprocess
def cleanImage(imagePath, threshold=143, border=20):
    """Binarize the image at *imagePath* and pad it with a white border, in place.

    Every pixel value below ``threshold`` is mapped to 0 and every other
    value to 255. The result is then surrounded by ``border`` pixels of
    white on each side and written back to the same path, overwriting the
    original file.

    Args:
        imagePath: Path of the image file to clean; it is overwritten.
        threshold: Cut-off applied to each pixel value (default 143, the
            value previously hard-coded here).
        border: Width in pixels of the white border added on every side
            (default 20, the value previously hard-coded here).
    """
    # Close the source file deterministically; ``point`` returns a new
    # image, so it remains usable after the ``with`` block ends.
    with Image.open(imagePath) as image:
        binarized = image.point(lambda x: 0 if x < threshold else 255)
    # NOTE(review): presumably the extra margin helps the OCR step that
    # follows -- confirm against the caller's needs.
    borderImage = ImageOps.expand(binarized, border=border, fill='white')
    borderImage.save(imagePath)
# Fetch the CAPTCHA-protected page; the context manager closes the HTTP
# response once it has been parsed.
with urlopen("http://www.pythonscraping.com/humans-only") as html:
    # Name the parser explicitly so the parse result does not depend on
    # which optional parsers happen to be installed.
    bsObj = BeautifulSoup(html, "html.parser")

# Collect the form data that has to be sent back: the CAPTCHA image
# location and the hidden input fields.
imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"]
formBuildId = bsObj.find("input", {"name": "form_build_id"})["value"]
captchaSid = bsObj.find("input", {"name": "captcha_sid"})["value"]
captchaToken = bsObj.find("input", {"name": "captcha_token"})["value"]

# Download the CAPTCHA image and clean it up before OCR.
captchaUrl = "http://pythonscraping.com" + imageLocation
urlretrieve(captchaUrl, "captcha.jpg")
cleanImage("captcha.jpg")

# Run tesseract on the cleaned image; it writes its result to captcha.txt.
# Output is discarded rather than piped so the child can never block on a
# full pipe. If tesseract is not installed, fall back to whatever
# captcha.txt already exists (the behaviour before this call was enabled).
try:
    subprocess.run(
        ["tesseract", "captcha.jpg", "captcha"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=False,
    )
except FileNotFoundError:
    print("tesseract is not installed; using an existing captcha.txt if present")

with open("captcha.txt", "r", encoding="utf-8") as f:
    # Remove every whitespace character (spaces, newlines, carriage
    # returns, form feeds) from the recognised text.
    captchaResponse = "".join(f.read().split())
print("captcha solution attempt: " + captchaResponse)

# Only submit when the recognised text has the expected five characters.
if len(captchaResponse) == 5:
    params = {
        "captcha_token": captchaToken,
        "captcha_sid": captchaSid,
        "form_id": "comment_node_page_form",
        "form_build_id": formBuildId,
        # Was misspelled "captcha_reponse", so the answer never reached
        # the form field it was meant for.
        "captcha_response": captchaResponse,
        "name": "Ryan Mitchell",
        "subject": "I come to check the Grail",
        "comment_body[und][0][value]": "... and I am definitely not a bot",
    }
    r = requests.post("http://www.pythonscraping.com/comment/reply/10", data=params)
    responseObj = BeautifulSoup(r.text, "html.parser")
    # Print the server's status message when the response contains one.
    messages = responseObj.find("div", {"class": "messages"})
    if messages is not None:
        print(messages.get_text())
else:
    print("There is a problem reading the CAPTCHA correctly!")