#!/usr/bin/python
"""Look up an image on Baidu image search and print the tags it reports.

The script loads ``tmp.jpg`` from the working directory, uploads it to
Baidu's image-search upload endpoint, fetches the result page and prints
the ``multitags`` value found in it, joined with ``|``.

A small OCR pipeline (``ocr_question_extract`` and helpers) is also
provided; it needs the optional ``pytesseract`` package.
"""
import json
import re
import ssl
import urllib

import requests
from PIL import Image
from PIL import ImageFilter

# HACK: work around CERTIFICATE_VERIFY_FAILED
# https://github.com/mtschirs/quizduellapi/issues/2
# NOTE(review): this replaces the ssl module's default HTTPS context factory
# with the unverified one, i.e. certificate checks are turned off for code
# that relies on that default (e.g. urllib). Remove it once it is not needed.
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context

# Header *value* only. The original literal started with "User-Agent:", which
# ended up duplicated inside the header value sent to the server.
UA = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36')

pic_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand&0.21191171556711197"

# Seconds to wait for Baidu before giving up; without a timeout `requests`
# waits forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Matches the 'multitags': '...' entry in the result page.
_MULTITAGS_RE = re.compile(r"'multitags':\s*'(.*?)'")

# Disabled downloader kept for reference. NOTE(review): it uses the Python 2
# `urllib.urlopen` API and writes ./tmp.png but then opens ./tmp.jpg.
#
# def get_img():
#     resp = urllib.urlopen(pic_url)
#     raw = resp.read()
#     with open("./tmp.png", 'wb') as fp:
#         fp.write(raw)
#     return Image.open("./tmp.jpg")


def get_img(im):
    """Print the size of *im* and return a crop covering the whole image.

    :param im: a PIL image.
    :return: a new image produced by cropping *im* to its full bounding box.
    """
    width, height = im.size
    # The label text is kept exactly as the script has always printed it.
    print("WITH = %d,HEIGHT = %d" % (width, height))
    return im.crop((0, 0, width, height))


def baidu_image_upload(im):
    """Upload *im* to Baidu image search and return the result-page URL.

    The image is first saved as ``./tmp.png`` (overwriting any existing file)
    and the PNG bytes are posted as a multipart form.

    :param im: a PIL image.
    :return: ``"http://image.baidu.com"`` followed by the response body.
    :raises requests.exceptions.RequestException: on network failure or
        timeout.
    """
    url = "http://image.baidu.com/pictureup/uploadshitu?fr=flash&fm=index&pos=upload"
    im.save("./tmp.png")
    # Context manager so the file handle is closed deterministically.
    with open("./tmp.png", 'rb') as fp:
        raw = fp.read()
    size = str(len(raw))

    files = {
        'fileheight': "0",
        'newfilesize': size,
        'compresstime': "0",
        'Filename': "image.png",
        'filewidth': "0",
        'filesize': size,
        'filetype': 'image/png',
        'Upload': "Submit Query",
        'filedata': ("image.png", raw),
    }
    resp = requests.post(url, files=files, headers={'User-Agent': UA},
                         timeout=REQUEST_TIMEOUT)

    # The response body is appended to the host to form the page to fetch.
    return "http://image.baidu.com" + resp.text


def baidu_stu_lookup(im):
    """Upload *im*, fetch the result page and return the extracted tags.

    :param im: a PIL image.
    :return: the string produced by :func:`baidu_stu_html_extract`.
    :raises requests.exceptions.RequestException: on network failure or
        timeout.
    """
    redirect_url = baidu_image_upload(im)
    resp = requests.get(redirect_url, timeout=REQUEST_TIMEOUT)
    return baidu_stu_html_extract(resp.text)


def baidu_stu_html_extract(html):
    """Extract the ``multitags`` value from a result page.

    Every match is echoed to stdout; only the first one is used for the
    return value.

    :param html: text of the result page.
    :return: the whitespace-separated tags of the first match joined with
        ``|``; ``'[UNKOWN]'`` when that match holds no tags; ``'[ERROR?]'``
        when the page has no ``multitags`` entry at all.
    """
    matches = _MULTITAGS_RE.findall(html)
    if not matches:
        print("Not match")
        return '[ERROR?]'

    print("Match")
    for item in matches:
        print("item = %s" % item)

    # str.split() with no argument splits on any whitespace run (tabs
    # included) and never yields empty strings.
    tags = matches[0].split()
    return '|'.join(tags) if tags else '[UNKOWN]'


def ocr_question_extract(im):
    """Run Chinese (``chi_sim``) OCR on *im* and return the stripped text.

    ``pytesseract`` (git@github.com:madmaze/pytesseract.git) is imported
    lazily and bound as a module-level global.

    :param im: a PIL image.
    :return: the recognised text, or ``None`` (after printing an error) when
        ``pytesseract`` cannot be imported.
    """
    global pytesseract
    try:
        import pytesseract
    except ImportError:
        # Only a failed import is handled here; anything else propagates.
        print("[ERROR] pytesseract not installed")
        return None

    width, height = im.size
    im = im.crop((0, 0, width, height))
    im = pre_ocr_processing(im)
    return pytesseract.image_to_string(im, lang='chi_sim').strip()


def pre_ocr_processing(im):
    """Return a binarized greyscale image derived from *im* for OCR.

    A blurred, max-filtered copy of the image is computed; each output pixel
    is ``min(255, max over channels of (255 + ref - filtered))`` where *ref*
    is the top-left pixel of the RGB image. The result is then thresholded
    at 150.

    :param im: a PIL image.
    :return: a new mode ``'L'`` image containing only 0 and 255.
    """
    im = im.convert("RGB")
    width, height = im.size

    white = im.filter(ImageFilter.BLUR).filter(ImageFilter.MaxFilter(23))
    grey = im.convert('L')
    impix = im.load()
    whitepix = white.load()
    greypix = grey.load()

    if width and height:
        # NOTE(review): the reference is always pixel (0, 0), not (x, y) --
        # confirm this is intended. It is loop-invariant, so read it once.
        ref_r, ref_g, ref_b = impix[0, 0][:3]
        for y in range(height):
            for x in range(width):
                w_r, w_g, w_b = whitepix[x, y][:3]
                greypix[x, y] = min(255, max(255 + ref_r - w_r,
                                             255 + ref_g - w_g,
                                             255 + ref_b - w_b))

    new_im = grey.copy()
    binarize(new_im, 150)
    return new_im


def binarize(im, thresh=120):
    """Threshold the mode ``'L'`` image *im* in place.

    Pixels below *thresh* become 0, all others become 255.

    :param im: a PIL image in mode ``'L'``; it is modified in place.
    :param thresh: threshold, strictly between 0 and 255.
    :raises AssertionError: if *thresh* is out of range or *im* is not
        mode ``'L'``.
    """
    assert 0 < thresh < 255
    assert im.mode == 'L'

    width, height = im.size
    pixels = im.load()
    for y in range(height):
        for x in range(width):
            pixels[x, y] = 0 if pixels[x, y] < thresh else 255


def main():
    """Look up ``tmp.jpg`` on Baidu image search and print the result."""
    # im = get_img()
    im = Image.open("tmp.jpg")

    # Disabled OCR experiment kept for reference:
    # try:
    #     print('OCR Question:', ocr_question_extract(im))
    # except Exception as e:
    #     print('<OCR failed>', e)

    im2 = get_img(im)
    result = baidu_stu_lookup(im2)
    print(result)


if __name__ == '__main__':
    main()
将一张名为tmp.jpg的jpeg图片放在代码目录下:
显示结果如下:
E:\Python35\python.exe E:/PycharmProjs/baidushitu/baidushitu.py
WITH = 500,HEIGHT = 440
Match
item = 小鸡 哈多利博美 荷兰猪 豚鼠 鸡雏 鸡蛋孵小鸡
小鸡|哈多利博美|荷兰猪|豚鼠|鸡雏|鸡蛋孵小鸡

Process finished with exit code 0
本来是想用它来识别12306抢票的验证码,但百度识图的识别率不高;其他方案也试过,目前识别率都不太高。所以想请教一下,有没有更好的、值得研究的方案可以推荐。