__author__ = 'Administrator'
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
from img_get import get_img
import selenium
import sys
import time
import re
import csv
import pytesser
if __name__ == "__main__":
    # Scrape (title, court, body text) for consecutive documents and append
    # one CSV row per document. When an expected element is missing, the
    # current URL is handed to get_img() as the captcha page.
    import os
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait

    # newline='' is what the csv module expects for files it writes to; the
    # with-block guarantees the file is flushed and closed even on errors.
    with open("/home/henson/Documents/coding/bill/criminal.csv", "a+",
              encoding='utf-8', newline='') as f:
        # NOTE(review): headers is not referenced anywhere in the visible
        # code — confirm whether it is still needed.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        chromedriver = "/home/henson/Documents/pycharm/webdriver/chromedriver"
        os.environ["webdriver.chrome.driver"] = chromedriver
        driver = webdriver.Chrome(chromedriver)
        # NOTE(review): this scroll runs before any page has been requested —
        # confirm whether it was meant to come after driver.get().
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        driver.get('http://......')
        # Open the first entry of the result list.
        driver.find_element_by_xpath('/html/body/div[3]/ul/li[1]/a/div[1]').click()

        # One writer for the whole run instead of a new one per document.
        writer = csv.writer(f)
        try:
            for _ in range(1, 10000):
                title = driver.find_element_by_xpath('/html/body/div[3]/div[1]/div[1]/div[2]').text
                court = driver.find_element_by_xpath('/html/body/div[3]/div[1]/div[1]/div[3]').text
                text = driver.find_element_by_xpath('/html/body/div[3]/div[1]/div[1]/div[5]').text
                print(title)
                print(court)
                print(text)
                # writerow() emits the three fields as ONE record. The former
                # writerows() call treated each string as its own row and
                # split it into one field per character. Commas inside a
                # field are quoted by the csv writer, so no manual stripping
                # is required.
                writer.writerow((title, court, text))
                # Advance to the next document.
                driver.find_element_by_xpath('//*[@id="linkNextOne"]').click()
        except NoSuchElementException:
            # A missing element is treated as "the captcha page was shown":
            # pass its URL on so the captcha image can be captured.
            verificationCode = driver.current_url
            get_img(verificationCode)
import pytesser
# Open the captcha page: module-level browser setup shared by get_img().
import os
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# NOTE(review): headers is defined but not referenced in the visible code — confirm it is needed.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
# Path to the local chromedriver binary, also exported through the environment.
chromedriver = "/home/henson/Documents/pycharm/webdriver/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
# Browser instance that get_img() drives.
driver = webdriver.Chrome(chromedriver)
# NOTE(review): runs before any page has been requested in this snippet — confirm the intended order.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
def get_img(url):
    """Load *url*, screenshot the page and crop the captcha image out of it.

    Two files are written to the current working directory:
    ``screenshot.png`` (the screenshot returned by the driver) and
    ``code.png`` (the region covered by the element with id ``imgCode``).

    Args:
        url: Address of the page that displays the captcha.
    """
    # Imported locally so the function does not depend on a module-level
    # ``Image`` import that only appears further down, after this definition.
    from PIL import Image

    driver.get(url)
    # Capture the page first; the captcha is cut out of this file below.
    driver.get_screenshot_as_file('screenshot.png')
    # Locate the captcha element and read its geometry once, rather than
    # re-querying location/size for every coordinate.
    element = driver.find_element_by_id('imgCode')
    location = element.location
    size = element.size
    left = int(location['x'])
    top = int(location['y'])
    right = int(location['x'] + size['width'])
    bottom = int(location['y'] + size['height'])
    # NOTE(review): the crop box assumes element coordinates map 1:1 onto
    # screenshot pixels (no scrolling offset or display scaling) — confirm.
    im = Image.open('screenshot.png')
    im = im.crop((left, top, right, bottom))  # crop down to the captcha
    im.save('code.png')
# encoding=utf-8
from PIL import Image
from PIL import ImageEnhance
import sys
# Make the locally unpacked pytesser package importable before importing it.
sys.path.append('/home/henson/Documents/coding/bill/pytesser/')
from pytesser.pytesser import *

# OCR the full screenshot; open 'code.png' here instead to recognise only
# the cropped captcha.
screenshot = Image.open('screenshot.png')
# Raising the contrast (factor 4) is intended to improve the recognition rate.
boosted = ImageEnhance.Contrast(screenshot).enhance(4)
# Run the OCR and print the recognised captcha text.
recognized = image_to_string(boosted)
print(recognized)
不知道为什么无法识别code.png,返回的是Empty page!!Empty page!!
而尝试识别没有裁剪过的截图,返回的是iii/WIFE
Cupd , v
什么鬼?说好的效果不错
连这个都识别不了,好气。
因为在 import pytesser 这一步就弄了好久,明明在同一个目录下却死活不能导入