在编写爬虫爬取一个航空公司官网的时候,发现航班信息都是图片,比如航班号、舱位信息、价格、时间等。这些图片相对比较简单,没有干扰线条,文字也是端端正正的,所以可以直接用程序识别处理。
package com.weixuan;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.*;
/**
 * Recognises text that is rendered as a simple, noise-free image by comparing
 * the dot matrix of every glyph with the entries of a feature library.
 *
 * <p>Processing steps: crop the image to its foreground bounding box, split it
 * into glyphs at blank pixel columns, crop every glyph, encode the glyph as a
 * string of {@code '0'}/{@code '1'} characters and look that string up in
 * {@link #prop}.
 *
 * <p>Created by fengtang, 2015/8/25.
 */
public class ImageToString {

    /**
     * Feature library. Each key is the text a glyph stands for and each value
     * is the glyph's dot-matrix string as produced by
     * {@code getSingleBmpCode}. Filled once from
     * {@code FeatureLibrary.properties} when the class is initialised.
     */
    public static Properties prop = new Properties();

    /** Pixels whose red component is below this value count as foreground. */
    private static final int GRAY_THRESHOLD = 128;

    static {
        String fileName = "E:\\IDEA\\ImageToString\\src\\main\\resource\\config\\FeatureLibrary.properties";
        FileInputStream in = null;
        try {
            in = new FileInputStream(new File(fileName));
            prop.load(new InputStreamReader(in, "UTF-8"));
        } catch (Exception e) {
            // Best effort: the library stays empty and every lookup will fail.
            e.printStackTrace();
        } finally {
            // Always release the file handle, even when loading failed.
            if (in != null) {
                try {
                    in.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Converts an image into the text it shows.
     *
     * @param imag the source image
     * @return the recognised text; the library key {@code semicolon} is
     *         rendered as {@code ":"}
     * @throws IllegalArgumentException if the image has no foreground pixel
     * @throws RuntimeException if a glyph has no entry in the feature library
     * @throws Exception declared for backward compatibility with callers
     */
    public static String getString(BufferedImage imag) throws Exception {
        // 1. Crop to the area that actually contains foreground pixels.
        BufferedImage cropped = getPicValidByValue(imag, GRAY_THRESHOLD);
        // 2. Cut the image into one sub-image per glyph.
        BufferedImage[] glyphs = getSplitPics(cropped);
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < glyphs.length; i++) {
            // 3. Crop the glyph so its dot matrix has no empty border.
            BufferedImage glyph = getPicValidByValue(glyphs[i], GRAY_THRESHOLD);
            // 4. Encode the glyph and 5. look it up in the feature library.
            String name = match(getSingleBmpCode(glyph, GRAY_THRESHOLD));
            if (name == null) {
                throw new RuntimeException("匹配出错");
            }
            // 6. Assemble the final text.
            result.append(name);
        }
        // ':' separates key and value in a properties file, so the colon
        // glyph is stored under the key "semicolon".
        return result.toString().replace("semicolon", ":");
    }

    /**
     * Encodes an image as a dot-matrix string, row by row from the top.
     *
     * @param singlepic the image to encode, may be {@code null}
     * @param grayValue threshold between foreground and background
     * @return one character per pixel, {@code '1'} where the red component is
     *         below {@code grayValue} and {@code '0'} elsewhere, or
     *         {@code null} when {@code singlepic} is {@code null}
     */
    private static String getSingleBmpCode(BufferedImage singlepic, int grayValue) {
        if (singlepic == null) {
            return null;
        }
        int width = singlepic.getWidth();
        int height = singlepic.getHeight();
        StringBuilder code = new StringBuilder(width * height);
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                code.append(getPixel(singlepic, x, y)[0] < grayValue ? '1' : '0');
            }
        }
        return code.toString();
    }

    /**
     * Crops an image to the bounding box of its foreground pixels.
     *
     * @param imag     the image to crop
     * @param grayVale threshold between foreground and background
     * @return the sub-image spanning all pixels whose red component is below
     *         {@code grayVale}
     * @throws IllegalArgumentException if no pixel is below the threshold
     */
    private static BufferedImage getPicValidByValue(BufferedImage imag, int grayVale) {
        int minX = imag.getWidth();
        int minY = imag.getHeight();
        int maxX = -1;
        int maxY = -1;
        for (int y = 0; y < imag.getHeight(); y++) {
            for (int x = 0; x < imag.getWidth(); x++) {
                if (getPixel(imag, x, y)[0] < grayVale) {
                    minX = Math.min(minX, x);
                    minY = Math.min(minY, y);
                    maxX = Math.max(maxX, x);
                    maxY = Math.max(maxY, y);
                }
            }
        }
        if (maxX < 0) {
            // Without this check getSubimage would be asked for a negative size.
            throw new IllegalArgumentException("image contains no foreground pixels");
        }
        return imag.getSubimage(minX, minY, maxX - minX + 1, maxY - minY + 1);
    }

    /**
     * Reads the colour components of one pixel.
     *
     * @param imag the source image
     * @param j    x coordinate of the pixel
     * @param i    y coordinate of the pixel
     * @return a three-element array holding red, green and blue, in that order
     */
    private static int[] getPixel(BufferedImage imag, int j, int i) {
        int pixel = imag.getRGB(j, i);
        int[] rgb = new int[3];
        rgb[0] = (pixel & 0xff0000) >> 16;
        rgb[1] = (pixel & 0xff00) >> 8;
        rgb[2] = pixel & 0xff;
        return rgb;
    }

    /**
     * Looks up a dot-matrix string in the feature library.
     *
     * @param picCode the dot-matrix string of one glyph, may be {@code null}
     * @return the key of an entry whose value equals {@code picCode}, or
     *         {@code null} when there is none
     */
    private static String match(String picCode) {
        if (picCode == null) {
            return null;
        }
        for (Map.Entry<Object, Object> entry : prop.entrySet()) {
            if (picCode.equals(entry.getValue().toString())) {
                return entry.getKey().toString();
            }
        }
        return null;
    }

    /**
     * Splits an image into glyphs. A glyph is a maximal run of adjacent pixel
     * columns that each contain at least one foreground pixel; blank columns
     * act as separators and are not part of any glyph.
     *
     * @param img the image to split
     * @return the glyph sub-images from left to right, each as tall as
     *         {@code img}; empty when every column is blank
     */
    private static BufferedImage[] getSplitPics(BufferedImage img) {
        int width = img.getWidth();
        int height = img.getHeight();
        List<BufferedImage> glyphs = new ArrayList<BufferedImage>();
        int runStart = -1; // first column of the glyph being scanned, -1 if none
        for (int x = 0; x < width; x++) {
            boolean blank = isBlankColumn(img, x);
            if (!blank && runStart < 0) {
                runStart = x;
            } else if (blank && runStart >= 0) {
                glyphs.add(img.getSubimage(runStart, 0, x - runStart, height));
                runStart = -1;
            }
        }
        if (runStart >= 0) {
            // The last glyph reaches the right edge of the image.
            glyphs.add(img.getSubimage(runStart, 0, width - runStart, height));
        }
        return glyphs.toArray(new BufferedImage[glyphs.size()]);
    }

    /** Tells whether column {@code x} of {@code img} has no foreground pixel. */
    private static boolean isBlankColumn(BufferedImage img, int x) {
        for (int y = 0; y < img.getHeight(); y++) {
            if (getPixel(img, x, y)[0] < GRAY_THRESHOLD) {
                return false;
            }
        }
        return true;
    }
}
FeatureLibrary.properties
#################图片识别特征库###################
semicolon=111100001111
A=000100000010000010100001010000101000011100010001001000101110111
B=111110010001010001011110010011010001010001010001111110
C=001111010001100000100000100000100000100000010001001110
D=111100010010010001010001010001010001010001010010111100
E=111111010001010000010010011110010010010000010001111111
F=111111010001010000010010011110010010010000010000111000
G=001110010010100000100000100000100111100010010010001100
H=111011101000100100010010001001111100100010010001001000101110111
I=111110010000100001000010000100001000010011111
J=011111000100000100000100000100000100000100000100100100111000
K=111011010010010100011000011100010100010010010010111011
L=111000010000010000010000010000010000010000010001111111
M=111011101101100110110011011001010100101010010101001010101101011
N=111011101100100110010010101001010100101010010011001001101110010
O=001100010010100001100001100001100001100001010010001100
P=111110010001010001010001011110010000010000010000111000
Q=001100010010100001100001100001100001101101010010001110000011
R=111110001000100100010011110001010000100100010010001000101110011
S=011111000110000010000011000001000011000111110
T=111111010100100001000010000100001000010001110
U=111011101000100100010010001001000100100010010001001000100011100
V=111011101000100100010001010000101000010100001010000010000001000
W=110101101010100101010010101001010100110110001010000101000010100
X=110111010010010100001100001000001100010010010010111011
Y=111011101000100010100001010000010000001000000100000010000011100
Z=111111001000010001000010000100010000100111111
0=011101000110001100011000110001100011000101110
1=010110010010010010010010111
2=011101000110001000010001000100010001000011111
3=011101000100001001100000100001000011001101110
4=000010000110001010010010010010100010011110000010000111
5=111111000010000111101000100001100011000101110
6=001110100110000101101100110001100011000101110
7=111111001000010000100010000100001000010000100
8=011101000110001010100111010001100011000101110
9=011101000110001100011001101101000011001011100
舱=001000010000011110011000010010100100011011000010010010000001111110111100010010100100011010100100010110101100010010100010100010100010100110011110
¥=111000111010000010001000100000101000000010000011111110000010000000010000000010000000111000
测试
package com.weixuan;
import org.junit.Test;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
/**
 * Smoke test for {@code ImageToString}: reads one image from disk and prints
 * the recognised text.
 *
 * <p>Created by fengtang, 2015/8/25.
 */
public class TestProcessImage {

    /** Location of the sample image that is fed to the recogniser. */
    public static final String fileName = "C:\\Users\\fengtang\\Desktop\\img\\flightNo.png";

    /**
     * Reads the sample image and prints the text recognised in it. Every
     * failure is rethrown so that the test fails instead of passing silently.
     */
    @Test
    public void processImageTest() {
        BufferedImage imag;
        try {
            imag = ImageIO.read(new File(fileName));
        } catch (IOException e) {
            throw new IllegalStateException("Cannot read test image " + fileName, e);
        }
        if (imag == null) {
            // ImageIO.read returns null when no registered reader can decode the file.
            throw new IllegalStateException("No image reader could decode " + fileName);
        }
        try {
            System.out.println(ImageToString.getString(imag));
        } catch (Exception e) {
            throw new IllegalStateException("Recognition failed for " + fileName, e);
        }
    }
}
测试的航班号和价格