图片转文本

最新推荐文章于 2024-03-19 16:30:00 发布

tianmayuchuan

最新推荐文章于 2024-03-19 16:30:00 发布

阅读量368

点赞数

分类专栏： java 文章标签：图片转换文本已实现

本文链接：https://blog.csdn.net/tianmayuchuan/article/details/79909158

版权

java 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

package com.picture.text;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.StringTokenizer;
import java.io.File;
import com.conn.Sqlcon;
/**
 * Batch driver: runs OCR over every file in the Tesseract image directory,
 * stores the recognized text (and any "https" tokens found in it) in the
 * database, then moves the processed image to the "image_ws" directory.
 *
 * <p>All SQL is executed through parameterized statements, and every JDBC
 * resource is closed once it is no longer needed.
 */
public class ReadMain_ok {
    /** Connection used for the current run; kept public/static for backward compatibility. */
    public static Connection conn = null;
    /** Most recently prepared insert statement; kept public/static for backward compatibility. */
    public static PreparedStatement psmt = null;
    /** Unused by this class; kept public/static for backward compatibility. */
    public static ResultSet rs = null;

    /** Directory scanned for images to recognize. */
    private static final String IMAGE_DIR = "C:\\Program Files (x86)\\Tesseract-OCR\\image";
    /** Directory that processed images are moved into. */
    private static final String PROCESSED_DIR = "C:\\Program Files (x86)\\Tesseract-OCR\\image_ws";

    /** T-SQL batch returning the text after the last '-' of the bound value (truncated to 100 chars). */
    private static final String SQL_LAST_SEGMENT =
            " declare @s varchar(100);  set @s = ? select right(@s, charindex('-',reverse(@s))-1) ";
    private static final String SQL_INSERT_STATUS =
            "insert into sbzt_xx (sbbh,https,sj ,lrrq,zt ,picture ) values (?,?,?,getdate(),?,?)";
    private static final String SQL_INSERT_TEXT =
            "insert into picture_jx (name,time,jxwb,jxrq) values (?,?,?,getdate())";

    /**
     * Processes every regular file found in {@link #IMAGE_DIR}.
     *
     * <p>Any failure aborts the remaining files and is reported on stderr,
     * which matches the original behavior.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            File[] images = new File(IMAGE_DIR).listFiles();
            if (images == null) {
                // listFiles() returns null when the path is missing, unreadable or not a directory.
                System.out.println("Image directory is missing or unreadable: " + IMAGE_DIR);
                return;
            }
            if (images.length == 0) {
                return;
            }

            // One connection for the whole run instead of one leaked connection per image.
            Connection connection = new Sqlcon().openConn();
            conn = connection;
            try {
                int i = 0;
                for (File file : images) {
                    if (!file.isFile()) {
                        continue; // sub-directories cannot be OCR'd
                    }
                    i++;
                    processFile(connection, file, i);
                }
            } finally {
                if (connection != null) {
                    connection.close();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** OCRs one image, persists the results and moves the image to {@link #PROCESSED_DIR}. */
    private static void processFile(Connection connection, File file, int fileIndex) throws Exception {
        System.out.println(file.getName());
        String recognizeText = new OCRHelper().recognizeText(file);

        // Last-modified time of the image, formatted for the database.
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        Calendar cal = Calendar.getInstance();
        cal.setTimeInMillis(file.lastModified());
        String modifiedAt = formatter.format(cal.getTime());
        System.out.println("修改时间[2] " + modifiedAt);

        // Single quotes are folded into double quotes so both act as token
        // delimiters; the same normalized text is what gets stored.
        String normalizedText = recognizeText.replace("'", "\"");

        int ii = 1; // running number of the "https" tokens found in this image
        StringTokenizer token = new StringTokenizer(normalizedText, "\"");
        while (token.hasMoreElements()) {
            String s = token.nextToken();
            System.out.println(fileIndex + "拆分：" + s + " ");
            if (s.contains("https")) {
                storeStatus(connection, ii, s, modifiedAt, file.getName());
                ii++;
            }
        }

        storeRecognizedText(connection, file.getName(), modifiedAt, normalizedText);

        if (file.renameTo(new File(PROCESSED_DIR + "\\" + file.getName()))) {
            System.out.println("File is moved successful!");
        } else {
            System.out.println("File is failed to move!");
        }
    }

    /**
     * Asks the database for the part of {@code url} after its last '-' and
     * inserts one status row per returned value. A failed insert is reported
     * and skipped; a failed lookup propagates to the caller.
     */
    private static void storeStatus(Connection connection, int deviceNo, String url,
                                    String modifiedAt, String pictureName) throws SQLException {
        System.out.println(SQL_LAST_SEGMENT);
        try (PreparedStatement query = connection.prepareStatement(SQL_LAST_SEGMENT)) {
            query.setString(1, url);
            try (ResultSet rows = query.executeQuery()) {
                while (rows.next()) {
                    String zt = rows.getString(1);
                    System.out.println(deviceNo + "状态： " + zt + " ");
                    System.out.println("拆入状态信息：" + SQL_INSERT_STATUS);
                    try (PreparedStatement insert = connection.prepareStatement(SQL_INSERT_STATUS)) {
                        psmt = insert;
                        insert.setString(1, String.valueOf(deviceNo));
                        insert.setString(2, url);
                        insert.setString(3, modifiedAt);
                        insert.setString(4, zt);
                        insert.setString(5, pictureName);
                        insert.execute();
                    } catch (SQLException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }

    /** Saves the full recognized text of one image; a failed insert is reported and skipped. */
    private static void storeRecognizedText(Connection connection, String pictureName,
                                            String modifiedAt, String text) {
        System.out.println(SQL_INSERT_TEXT);
        try (PreparedStatement insert = connection.prepareStatement(SQL_INSERT_TEXT)) {
            psmt = insert;
            insert.setString(1, pictureName);
            insert.setString(2, modifiedAt);
            insert.setString(3, text);
            insert.execute();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}

OCRHelper 类代码：

package com.picture.text;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.jdesktop.swingx.util.OS;
public class OCRHelper {
private final String LANG_OPTION = "-l";
private final String EOL = System.getProperty("line.separator");
/**
* 文件位置我防止在，项目同一路径
*/
// // String tessPath = new File("tesseract").getAbsolutePath();
private String tessPath = "C:\\Program Files (x86)\\Tesseract-OCR";
/**
* @param imageFile
* 传入的图像文件
* @param imageFormat
* 传入的图像格式
* @return 识别后的字符串
*/
public String recognizeText(File imageFile) throws Exception
{
/**
* 设置输出文件的保存的文件目录
*/
File outputFile = new File(imageFile.getParentFile(), "output");
StringBuffer strB = new StringBuffer();
List<String> cmd = new ArrayList<String>();
if (OS.isWindowsXP())
{
cmd.add(tessPath + "\\tesseract");
} else if (OS.isLinux())
{
cmd.add("tesseract");
} else
{
cmd.add(tessPath + "\\tesseract");
}
cmd.add("");
cmd.add(outputFile.getName());
cmd.add(LANG_OPTION);
// cmd.add("chi_sim");
cmd.add("eng");

ProcessBuilder pb = new ProcessBuilder();
/**
*Sets this process builder's working directory.
*/
pb.directory(imageFile.getParentFile());
cmd.set(1, imageFile.getName());
pb.command(cmd);
pb.redirectErrorStream(true);
Process process = pb.start();
// tesseract.exe 1.jpg 1 -l chi_sim
// Runtime.getRuntime().exec("tesseract.exe 1.jpg 1 -l chi_sim");
/**
* the exit value of the process. By convention, 0 indicates normal
* termination.
*/
//System.out.println("cmd内容："+cmd.toString());
int w = process.waitFor();
System.out.println("w的值："+w);
if (w == 0)// 0代表正常退出
{
BufferedReader in = new BufferedReader(new InputStreamReader(
new FileInputStream(outputFile.getAbsolutePath() + ".txt"),
"UTF-8"));
String str;

while ((str = in.readLine()) != null)
{
strB.append(str).append(EOL);
}
in.close();
} else
{
String msg;
switch (w)
{
case 1:
msg = "Errors accessing files. There may be spaces in your image's filename.";
break;
case 29:
msg = "Cannot recognize the image or its selected region.";
break;
case 31:
msg = "Unsupported image format.";
break;
default:
msg = "Errors occurred.";
}
throw new RuntimeException(msg);
}
new File(outputFile.getAbsolutePath() + ".txt").delete();
return strB.toString().replaceAll("\\s*", "");
}
}

注意：该程序需要调用 Tesseract-OCR 工具。

工具下载地址：https://download.csdn.net/download/tianmayuchuan/10343146

tianmayuchuan

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
图片转文本

package com.picture.text;import java.sql.Connection;import java.sql.PreparedStatement;import java.sql.ResultSet;import java.sql.SQLException;import java.sql.Statement;import java.text.SimpleDateFormat...
复制链接

扫一扫