首先在 Tesseract-OCR 官网下载 Tesseract-OCR 3.02,以及简体中文数据包 chi_sim.traineddata。
下面是JSP代码:
效果图:
接下来新建一个 Java EE 项目,把 Tesseract-OCR 目录放在项目的 WebRoot 下。
下面是主要代码:
Servlet 接收客户端上传的图片,使用 Tesseract-OCR 识别后,把识别结果返回给前台页面。
- package servlet;
- import java.io.IOException;
- import javax.servlet.ServletConfig;
- import javax.servlet.ServletException;
- import javax.servlet.http.HttpServlet;
- import javax.servlet.http.HttpServletRequest;
- import javax.servlet.http.HttpServletResponse;
- import util.FileUtil;
- import util.OCRUtil;
- import com.jspsmart.upload.File;
- import com.jspsmart.upload.SmartUpload;
- import com.jspsmart.upload.SmartUploadException;
- public class OCRServlet extends HttpServlet {
- public void doPost(HttpServletRequest request, HttpServletResponse response)
- throws ServletException, IOException {
- response.setCharacterEncoding("gbk");
- SmartUpload upload = new SmartUpload();
- ServletConfig sc = this.getServletConfig();
- upload.initialize(sc, request, response);
- File file = null;
- long size = 5*1024*1024;
- upload.setAllowedFilesList("gif,jpg,bmp,png");
- upload.setMaxFileSize(size);
- upload.setCharset("GBK");
- try {
- upload.upload();
- file = upload.getFiles().getFile(0);
- String userPath = "upload\\"+request.getRemoteAddr().replaceAll("\\.", "")+"\\";
- String svpath = userPath+file.getFileName();
- if(!file.isMissing()){
- String realPath = request.getRealPath("/");
- FileUtil.creatPath(realPath+userPath);
- file.saveAs(svpath,SmartUpload.SAVE_VIRTUAL);
- try {
- OCRUtil.runOCR(realPath, realPath+svpath, realPath+userPath+"ocr",true);
- request.setAttribute("txt", FileUtil.read(realPath+userPath+"ocr.txt").trim());
- request.getRequestDispatcher("/index.jsp").forward(request, response);
- } catch (Exception e) {
- e.printStackTrace();
- }
- FileUtil.delete(realPath+userPath);
- }
- } catch (SmartUploadException e) {
- e.printStackTrace();
- }
- }
- }
package servlet;
import java.io.IOException;
import javax.servlet.ServletConfig;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import util.FileUtil;
import util.OCRUtil;
import com.jspsmart.upload.File;
import com.jspsmart.upload.SmartUpload;
import com.jspsmart.upload.SmartUploadException;
public class OCRServlet extends HttpServlet {
public void doPost(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
response.setCharacterEncoding("gbk");
SmartUpload upload = new SmartUpload();
ServletConfig sc = this.getServletConfig();
upload.initialize(sc, request, response);
File file = null;
long size = 5*1024*1024;
upload.setAllowedFilesList("gif,jpg,bmp,png");
upload.setMaxFileSize(size);
upload.setCharset("GBK");
try {
upload.upload();
file = upload.getFiles().getFile(0);
String userPath = "upload\\"+request.getRemoteAddr().replaceAll("\\.", "")+"\\";
String svpath = userPath+file.getFileName();
if(!file.isMissing()){
String realPath = request.getRealPath("/");
FileUtil.creatPath(realPath+userPath);
file.saveAs(svpath,SmartUpload.SAVE_VIRTUAL);
try {
OCRUtil.runOCR(realPath, realPath+svpath, realPath+userPath+"ocr",true);
request.setAttribute("txt", FileUtil.read(realPath+userPath+"ocr.txt").trim());
request.getRequestDispatcher("/index.jsp").forward(request, response);
} catch (Exception e) {
e.printStackTrace();
}
FileUtil.delete(realPath+userPath);
}
} catch (SmartUploadException e) {
e.printStackTrace();
}
}
}
- package util;
- public class OCRUtil {
- public static String chiSIM = "chi_sim";
- public static void runOCR(String realPath,String imagePath,String outPath,boolean isChi) throws Exception{
- Runtime r = Runtime.getRuntime();
- String cmd = "\""+realPath+"Tesseract-OCR\\tesseract.exe\" \""+imagePath+"\" \""+outPath+"\" -l "+(isChi?chiSIM:"");
- r.exec(cmd);
- }
- }
package util;
/**
 * Runs the Tesseract-OCR command-line executable that is bundled under the web root.
 */
public class OCRUtil {
    /** Language code handed to tesseract after {@code -l} when Chinese recognition is requested. */
    public static String chiSIM = "chi_sim";

    /**
     * Runs {@code Tesseract-OCR\tesseract.exe} (located under {@code realPath}) on one image and
     * returns once the process has finished.
     *
     * <p>The command is passed as an argument list, so paths containing spaces or quotes are
     * handed to the executable unchanged. The {@code -l} option is only added when a language
     * is actually requested.
     *
     * @param realPath  directory containing the {@code Tesseract-OCR} folder; must end with a separator
     * @param imagePath path of the image to recognise
     * @param outPath   output base path handed to tesseract (the servlet reads {@code outPath + ".txt"})
     * @param isChi     {@code true} to recognise with the {@link #chiSIM} language data
     * @throws Exception if the process cannot be started, the wait is interrupted,
     *                   or tesseract exits with a non-zero status
     */
    public static void runOCR(String realPath, String imagePath, String outPath, boolean isChi) throws Exception {
        java.util.List<String> command = new java.util.ArrayList<String>();
        command.add(realPath + "Tesseract-OCR\\tesseract.exe");
        command.add(imagePath);
        command.add(outPath);
        if (isChi) {
            command.add("-l");
            command.add(chiSIM);
        }

        ProcessBuilder builder = new ProcessBuilder(command);
        // Merge stderr into stdout so a single loop can drain everything the process prints;
        // an undrained pipe can block the child once the OS buffer fills up.
        builder.redirectErrorStream(true);
        Process process = builder.start();
        process.getOutputStream().close(); // the child gets no input from us

        java.io.ByteArrayOutputStream printed = new java.io.ByteArrayOutputStream();
        java.io.InputStream in = process.getInputStream();
        try {
            byte[] buffer = new byte[4096];
            int count;
            while ((count = in.read(buffer)) != -1) {
                printed.write(buffer, 0, count);
            }
        } finally {
            in.close();
        }

        int exitCode;
        try {
            exitCode = process.waitFor();
        } catch (InterruptedException e) {
            process.destroy();
            Thread.currentThread().interrupt();
            throw e;
        }
        if (exitCode != 0) {
            throw new java.io.IOException(
                    "tesseract exited with status " + exitCode + ": " + printed.toString().trim());
        }
    }
}
- package util;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.io.InputStreamReader;
- public class FileUtil {
- public static String read(String path) throws IOException{
- String txt = "";
- File file = new File(path);
- long timeout = 30*60;
- while(!(file.isFile() && file.exists())){
- file = new File(path);
- try {
- Thread.sleep(100);
- timeout -= 100;
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- if (file.isFile() && file.exists()) {
- InputStreamReader read = new InputStreamReader(new FileInputStream(file), "UTF-8");
- BufferedReader bReader = new BufferedReader(read);
- String temptxt = "";
- txt = "";
- while((temptxt=bReader.readLine())!=null){
- txt += temptxt;
- }
- bReader.close();
- read.close();
- }
- return txt;
- }
- public static void creatPath(String path) throws IOException{
- File file = new File(path);
- file.mkdir();
- }
- public static void delete(String path) throws IOException{
- File file = new File(path);
- String[] list = file.list();
- File tempFile = null;
- for(String temp : list){
- tempFile = new File(path+temp);
- tempFile.delete();
- }
- file.delete();
- }
- }
package util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
/**
 * Small file helpers used by the OCR servlet: waiting for and reading the OCR result,
 * creating the per-request working directory, and removing it again.
 */
public class FileUtil {
    /**
     * Default upper bound for {@link #read(String)}: 30 minutes, reading the original
     * {@code 30*60} figure as seconds.
     */
    private static final long DEFAULT_READ_TIMEOUT_MILLIS = 30L * 60L * 1000L;

    /** Pause between two checks for the file, in milliseconds. */
    private static final long POLL_INTERVAL_MILLIS = 100L;

    /**
     * Waits for {@code path} to appear (up to the default timeout) and returns its content.
     *
     * @param path file to read
     * @return all lines of the file, decoded as UTF-8 and joined without line separators
     * @throws IOException if the file does not appear in time, the wait is interrupted,
     *                     or reading fails
     */
    public static String read(String path) throws IOException {
        return read(path, DEFAULT_READ_TIMEOUT_MILLIS);
    }

    /**
     * Waits for {@code path} to appear and returns its content.
     *
     * <p>The wait only checks that a regular file exists; it cannot tell whether another
     * process is still writing to it.
     *
     * @param path          file to read
     * @param timeoutMillis longest time to wait for the file, in milliseconds;
     *                      {@code 0} checks exactly once
     * @return all lines of the file, decoded as UTF-8 and joined without line separators
     * @throws IOException if the file does not appear in time, the wait is interrupted,
     *                     or reading fails
     */
    public static String read(String path, long timeoutMillis) throws IOException {
        File file = new File(path);
        long start = System.nanoTime();
        while (!file.isFile()) {
            long waitedMillis = (System.nanoTime() - start) / 1000000L;
            if (waitedMillis >= timeoutMillis) {
                throw new java.io.FileNotFoundException(
                        "File did not appear within " + timeoutMillis + " ms: " + path);
            }
            try {
                Thread.sleep(POLL_INTERVAL_MILLIS);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller instead of silently looping on.
                Thread.currentThread().interrupt();
                throw new java.io.InterruptedIOException("Interrupted while waiting for " + path);
            }
        }

        StringBuilder txt = new StringBuilder();
        BufferedReader bReader =
                new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
        try {
            String line;
            while ((line = bReader.readLine()) != null) {
                txt.append(line);
            }
        } finally {
            bReader.close(); // also closes the wrapped reader and stream
        }
        return txt.toString();
    }

    /**
     * Creates the directory {@code path}, including any missing parent directories.
     *
     * @param path directory to create
     * @throws IOException if the directory does not exist afterwards
     */
    public static void creatPath(String path) throws IOException {
        File dir = new File(path);
        if (!dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Could not create directory: " + path);
        }
    }

    /**
     * Deletes the direct children of the directory {@code path} and then the directory itself.
     * Sub-directories that still contain files are not removed. A path that does not exist or
     * is not a directory is tolerated.
     *
     * @param path directory to remove, with or without a trailing separator
     * @throws IOException declared for interface compatibility; failed deletions are not reported
     */
    public static void delete(String path) throws IOException {
        File dir = new File(path);
        String[] names = dir.list(); // null when path is not a directory
        if (names != null) {
            for (String name : names) {
                new File(dir, name).delete();
            }
        }
        dir.delete();
    }
}
下面是JSP代码:
- <%@ page language="java" import="java.util.*" pageEncoding="GBK"%>
- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
- <html>
- <head>
- <title>在线OCR--By Lee</title>
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--a
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
- </head>
- <script type="text/javascript">
- function check(){
- var path = document.getElementById("image").value;
- if(path.length==0){
- alert("请选择要导入的图片!");
- return false;
- }
- if(!(path.match(/.jpg$/i)||path.match(/.bmp$/i)||path.match(/.gif$/i)||path.match(/.png$/i))){
- alert("只支持JPG,BMP,GIF,PNG格式!");
- return false;
- }
- return true;
- }
- </script>
- <body>
- <form enctype="multipart/form-data" method="post" action="OCRServlet" οnsubmit="return check();">
- 选择文件:<input type="file" id="image" name="image"><br/>
- 上传文件:<input type="submit" value="提交上传">
- </form>
- <textarea rows="20" cols="60"><%Object txt = request.getAttribute("txt");
- if(txt!=null&&txt.toString().length()==0){
- out.print("未识别出任何文字!");
- }else if(txt!=null){
- out.print(txt.toString());
- }
- %></textarea>
- </body>
- </html>
<%@ page language="java" import="java.util.*" pageEncoding="GBK"%>
<%-- Upload form for the online OCR demo; also shows the text placed in the "txt" request attribute. --%>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>在线OCR--By Lee</title>
<meta http-equiv="pragma" content="no-cache">
<meta http-equiv="cache-control" content="no-cache">
<meta http-equiv="expires" content="0">
<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
<meta http-equiv="description" content="This is my page">
<!--a
<link rel="stylesheet" type="text/css" href="styles.css">
-->
<script type="text/javascript">
// Client-side pre-check: a file must be chosen and its name must end in one of the
// accepted image extensions. Returning false cancels the form submission.
function check(){
var path = document.getElementById("image").value;
if(path.length==0){
alert("请选择要导入的图片!");
return false;
}
// The dot is escaped so that only a real ".ext" suffix matches.
if(!/\.(jpg|bmp|gif|png)$/i.test(path)){
alert("只支持JPG,BMP,GIF,PNG格式!");
return false;
}
return true;
}
</script>
</head>
<body>
<form enctype="multipart/form-data" method="post" action="OCRServlet" onsubmit="return check();">
选择文件:<input type="file" id="image" name="image"><br/>
上传文件:<input type="submit" value="提交上传">
</form>
<textarea rows="20" cols="60"><%
// "txt" is absent on the first visit, empty when nothing was recognised.
Object txt = request.getAttribute("txt");
if(txt!=null&&txt.toString().length()==0){
out.print("未识别出任何文字!");
}else if(txt!=null){
// Escape markup characters so recognised text cannot close the textarea or inject HTML.
String safe = txt.toString().replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
out.print(safe);
}
%></textarea>
</body>
</html>
效果图:
在图片没做任何预处理的情况下,识别率还是挺低的。