背景
有个朋友写的爬虫遇到一个问题:爬回来的数据文字中有很多表情符号,拿到的 html 文档中又有很多字是乱码,但渲染出来又很正常。
在这样的情况下
- 如果你直接用ocr识别,表情会被解析成乱七八糟的字符
- 如果你直接读取内容,也会有很多乱七八糟的字符
通过观察发现,html的中乱码需要使用该站点提供的字体文件进行渲染,该站点自定义了一套字体和自定义的unicode的规则进行映射。那么字体实际是画出来的图片文件,我提供了如下方案:
- 先借助字体解析工具将文件中的所有文字解析成一张张图片,并使用它对应的unicode码命名。
- 使用离线ocr工具将所有的图片解析成对应的字符,这样就得到一个自定义unicode编码与对应字符的一个映射关系字典。
- 从html对应标签解析出来对应的unicode编码,从字典查询对应的字符即可。
准备工作
对应的字体解析工具开源项目opentype4j
离线 OCR 我们也使用开源项目 tess4j,直接引入 maven 依赖即可:
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>5.4.0</version>
</dependency>
下载对应中文字库ocr字库文件
新建目录存放你下载的字库文件
具体代码实现
路径工具类
/**
 * Path helpers for the test fixtures used by the font/OCR pipeline.
 *
 * <p>Bug fix: {@code TEST_PATH} already ends with a slash, but the original
 * {@code getDirectory()} and {@code assemblyOutFilePath()} prepended another
 * {@code "/"}, yielding paths like {@code src/test/files//tmp/...}. All paths
 * are now assembled from a single {@code TMP_PATH} constant.
 */
public class TestUtils {
    /** Base directory holding the test input files (trailing slash included). */
    static final String TEST_PATH = "src/test/files/";
    /** Scratch directory where glyph images are written. */
    static final String TMP_PATH = TEST_PATH + "tmp/";

    /** Returns the path of an input file under {@link #TEST_PATH}. */
    static String assemblyFilePath(String fileName) {
        return TEST_PATH + fileName;
    }

    /** Returns the scratch directory used for generated glyph images. */
    static String getDirectory() {
        return TMP_PATH;
    }

    /** Returns the output path of a generated file under the scratch directory. */
    static String assemblyOutFilePath(String fileName) {
        return TMP_PATH + fileName;
    }
}
/**
 * Parses the custom WOFF font and renders every glyph to a JPG image in the
 * scratch directory, named after the glyph (its custom unicode name), so the
 * images can later be fed to OCR.
 *
 * @throws IOException if the font file cannot be read or an image cannot be written
 */
public static void getImages() throws IOException {
    Font font = OpenType.parse(TestUtils.assemblyFilePath("83db004a.woff"));
    // Glyph count is loop-invariant; hoist it out of the loop condition.
    int glyphCount = font.getGlyphs().getLength();
    for (int index = 0; index < glyphCount; index++) {
        // One image per glyph: "<glyphName>.jpg" under the tmp directory.
        String imagePath = TestUtils.assemblyOutFilePath(font.getGlyphs().get(index).getName()) + ".jpg";
        font.getGlyphs().get(index).getPath().toImage(new File(imagePath));
    }
}
接着使用ocr工具进行识别,完整代码如下:
/** Absolute paths of every glyph image found under the scratch directory. */
static List<String> allFilePath = new ArrayList<>();

/**
 * Runs OCR over every generated glyph image and builds the mapping from the
 * custom unicode name (taken from the file name, extension stripped) to the
 * recognized character.
 *
 * <p>Fixes over the original: files without an extension no longer throw
 * {@code StringIndexOutOfBoundsException}, and the OCR result is trimmed
 * because tess4j appends a trailing newline to every recognition result.
 */
public static void main(String[] args) throws Exception {
    //getImages();
    File directory = new File(TestUtils.getDirectory());
    if (directory.isDirectory()) {
        getAllFilePath(directory);
    }
    // OCR engine setup
    ITesseract instance = new Tesseract();
    // Location of the downloaded tessdata language files
    instance.setDatapath("D:\\tessdata");
    // Simplified Chinese language model
    instance.setLanguage("chi_sim");
    // Recognize each glyph image and record: unicode name -> character
    Map<String, String> dicMap = new HashMap<>();
    for (String path : allFilePath) {
        File file = new File(path);
        String fileName = file.getName();
        int dotIndex = fileName.lastIndexOf('.');
        if (dotIndex < 0) {
            continue; // not an image we generated; no extension to strip
        }
        // tess4j appends a trailing newline; trim so the dictionary holds the bare character
        String recognized = instance.doOCR(file).trim();
        String unicode = fileName.substring(0, dotIndex);
        dicMap.put(unicode, recognized);
    }
    // At this point the dictionary can be persisted to a .txt file
    System.out.println("xxxx");
}
/**
 * Recursively collects the absolute path of every regular file under
 * {@code srcFile} into the static {@code allFilePath} list.
 *
 * @param srcFile directory (or file) to walk; unreadable directories are skipped
 */
public static void getAllFilePath(File srcFile) {
    File[] children = srcFile.listFiles();
    if (children == null) {
        // srcFile is not a directory or cannot be listed — nothing to do.
        return;
    }
    for (File child : children) {
        if (child.isDirectory()) {
            getAllFilePath(child); // descend into subdirectory
        } else {
            allFilePath.add(child.getAbsoluteFile().getPath());
        }
    }
}
/**
 * Parses the custom WOFF font file and writes each glyph out as a JPG image
 * named after the glyph's name (its custom unicode code point), producing the
 * per-glyph images that the OCR step consumes.
 *
 * NOTE(review): duplicate of the getImages() shown earlier in this article.
 *
 * @throws IOException if the font cannot be read or an image cannot be written
 */
public static void getImages() throws IOException {
// Parse the site's custom font (opentype4j)
Font font = OpenType.parse(TestUtils.assemblyFilePath("83db004a.woff"));
for (int i = 0; i <font.getGlyphs().getLength() ; i++) {
// Output path: tmp dir + glyph name + ".jpg"
String path=TestUtils
.assemblyOutFilePath(font.getGlyphs()
.get(i).getName())+".jpg";
// Render the glyph outline to an image file
font.getGlyphs().get(i)
.getPath().toImage(new File(path));
}
}
此时我们就得到一个自定义 unicode 编码与对应字符的映射字典,再根据这个映射字典去解析 html 中对应标签的内容即可。
是不是so easy!
网页编码到字体编码的映射
该站点做了两层unicode
码映射,有点坑
以上部分解析了字体文件,现在还要解析网页上的u
码到字体u
码的映射。
借助 Google 提供的工具包 sfnttool.jar,源码地址:https://github.com/2229499815/sfntly
拉取源码,使用ant进行编译后生成jar包引入项目即可
ant
下载地址https://ant.apache.org/bindownload.cgi
下载最新版即可
进入 sfntly 目录,使用 ant/bin 下的 ant 命令(全路径)执行即可生成对应的 jar 包:
> cd sfntly
> D:\ant\bin\ant
然后到 \sfntly\java\dist\tools\sfnttool 目录中获取 jar 包,添加到项目中。
接着编写对应的解析类
/**
 * Converts a WOFF 1.0 font file to an in-memory TTF byte array and exposes
 * helpers to read the font's cmap and glyph-name list via sfntly.
 *
 * <p>Fixes over the original:
 * <ul>
 *   <li>the {@code FileInputStream} opened in the constructor is now closed
 *       (try-with-resources);</li>
 *   <li>{@code writeFontData} no longer spins forever when the stream hits
 *       EOF mid-table, and handles {@code skip()} skipping fewer bytes than
 *       requested;</li>
 *   <li>{@code inflateFontData} preserves the {@code DataFormatException}
 *       cause and releases native zlib memory via {@code Inflater.end()}.</li>
 * </ul>
 *
 * <p>Not thread-safe: conversion state ({@code offset}, {@code readOffset},
 * table records) is held in instance fields.
 */
public class WoffConverter {
    private static final Logger logger = LoggerFactory.getLogger(WoffConverter.class);

    /** WOFF header layout: field name -> byte width. Order matters (read sequentially). */
    private static final LinkedHashMap<String, Integer> woffHeaderFormat = new LinkedHashMap<String, Integer>() {
        {
            put("signature", 4);
            put("flavor", 4);
            put("length", 4);
            put("numTables", 2);
            put("reserved", 2);
            put("totalSfntSize", 4);
            put("majorVersion", 2);
            put("minorVersion", 2);
            put("metaOffset", 4);
            put("metaLength", 4);
            put("metaOrigLength", 4);
            put("privOffset", 4);
            put("privOrigLength", 4);
        }
    };

    /** WOFF table-directory entry layout: field name -> byte width (sequential). */
    private static final LinkedHashMap<String, Integer> tableRecordEntryFormat = new LinkedHashMap<String, Integer>() {
        {
            put("tag", 4);
            put("offset", 4);
            put("compLength", 4);
            put("origLength", 4);
            put("origChecksum", 4);
        }
    };

    /** Parsed WOFF header values, keyed by the names in {@link #woffHeaderFormat}. */
    private HashMap<String, Number> woffHeaders = new HashMap<String, Number>();
    /** Parsed table-directory entries, in file order. */
    private ArrayList<HashMap<String, Number>> tableRecordEntries = new ArrayList<HashMap<String, Number>>();
    /** Current write offset into the TTF output being assembled. */
    private int offset = 0;
    /** Number of bytes consumed so far from the WOFF input stream. */
    private int readOffset = 0;
    private File woffFile;
    /** The converted TTF image; populated once by the constructor. */
    private byte[] ttfByteArray;

    private WoffConverter() {}

    /**
     * Reads and converts the given WOFF file immediately.
     *
     * @param woffFile the .woff file to convert
     * @throws InvalidWoffException if the file is not a valid WOFF
     * @throws IOException on read errors
     * @throws DataFormatException declared for compatibility (inflation errors
     *         are rethrown as {@link InvalidWoffException})
     */
    public WoffConverter(File woffFile) throws InvalidWoffException, IOException, DataFormatException {
        this.woffFile = woffFile;
        // try-with-resources: the original leaked this stream
        try (FileInputStream inputStream = new FileInputStream(woffFile)) {
            ByteArrayOutputStream ttfOutputStream = convertToTTFOutputStream(inputStream);
            ttfByteArray = ttfOutputStream.toByteArray();
        }
    }

    /**
     * Returns the converted font as a TTF byte array.
     *
     * @return the TTF bytes produced from the WOFF input
     */
    public byte[] getTTFByteArray() {
        return ttfByteArray;
    }

    /**
     * Extracts the font's cmap (format 12 subtable only): maps each mapped
     * code point to a glyph-name-style string ("uni" + hex for non-ASCII).
     *
     * @return code point -> unicode-name map; empty on error (error is logged)
     */
    public LinkedHashMap<Integer, String> getCmap() {
        LinkedHashMap<Integer, String> ret = new LinkedHashMap<Integer, String>();
        try {
            FontFactory fontFactory = FontFactory.getInstance();
            Font font = fontFactory.loadFonts(ttfByteArray)[0];
            Map<Integer, ? extends Table> tableMap = font.tableMap();
            CMapTable cmapTable = (CMapTable) tableMap.get(Tag.cmap);
            Iterator<CMap> it = cmapTable.iterator();
            while (it.hasNext()) {
                CMap cmap = it.next();
                // Only the format-12 (segmented coverage) subtable is consumed here
                if (cmap instanceof CMapFormat12) {
                    Iterator<Integer> it1 = cmap.iterator();
                    while (it1.hasNext()) {
                        int val = it1.next();
                        // ASCII stays literal; everything else becomes "uni<hex>"
                        String unicode = val < 128 ? String.valueOf((char) val) : ("uni" + Integer.toHexString(val));
                        ret.put(val, unicode);
                    }
                    break;
                }
            }
        } catch (IOException | InvalidWoffException ex) {
            logger.error(ex.getMessage(), ex);
        }
        return ret;
    }

    /**
     * Lists the glyph names of the font's post table that start with "uni".
     *
     * @return list of "uniXXXX"-style glyph names; empty on error (logged)
     */
    public List<String> getUniCodeList() {
        List<String> works = new ArrayList<String>();
        try {
            FontFactory fontFactory = FontFactory.getInstance();
            Font font = fontFactory.loadFonts(ttfByteArray)[0];
            Map<Integer, ? extends Table> tableMap = font.tableMap();
            if (tableMap.containsKey(Tag.CFF)) {
                // CFF-flavored fonts are not handled here
            } else if (tableMap.containsKey(Tag.post)) {
                PostScriptTable postScriptTable = (PostScriptTable) tableMap.get(Tag.post);
                for (int i = 0; i < postScriptTable.numberOfGlyphs(); i++) {
                    String glypName = postScriptTable.glyphName(i);
                    if (!glypName.startsWith("uni")) {
                        continue;
                    }
                    works.add(glypName);
                }
            }
        } catch (IOException | InvalidWoffException ex) {
            logger.error(ex.getMessage(), ex);
        }
        return works;
    }

    /**
     * Drives the WOFF -> TTF conversion: header, offset table, table
     * directory, then the (possibly zlib-compressed) table data.
     */
    private ByteArrayOutputStream convertToTTFOutputStream(InputStream inputStream)
            throws InvalidWoffException, IOException, DataFormatException {
        getHeaders(new DataInputStream(inputStream));
        // 0x774F4646 == "wOFF" magic
        if ((Integer) woffHeaders.get("signature") != 0x774F4646) {
            throw new InvalidWoffException("Invalid woff file");
        }
        ByteArrayOutputStream ttfOutputStream = new ByteArrayOutputStream();
        writeOffsetTable(ttfOutputStream);
        getTableRecordEntries(new DataInputStream(inputStream));
        writeTableRecordEntries(ttfOutputStream);
        writeFontData(inputStream, ttfOutputStream);
        return ttfOutputStream;
    }

    /**
     * Reads the WOFF header fields into {@link #woffHeaders}.
     *
     * @param woffFileStream stream positioned at the start of the file
     * @throws IOException on read errors
     */
    private void getHeaders(DataInputStream woffFileStream) throws IOException {
        readTableData(woffFileStream, woffHeaderFormat, woffHeaders);
    }

    /**
     * Writes the 12-byte TTF offset table (sfnt version, numTables,
     * searchRange, entrySelector, rangeShift).
     *
     * @param ttfOutputStream destination TTF buffer
     * @throws IOException on write errors
     */
    private void writeOffsetTable(ByteArrayOutputStream ttfOutputStream)
            throws IOException {
        ttfOutputStream.write(getBytes((Integer) woffHeaders.get("flavor")));
        int numTables = (Integer) woffHeaders.get("numTables");
        ttfOutputStream.write(getBytes((short) numTables));
        // searchRange = 16 * 2^floor(log2(numTables)); entrySelector = floor(log2(numTables))
        int temp = numTables;
        int searchRange = 16;
        short entrySelector = 0;
        while (temp > 1) {
            temp = temp >> 1;
            entrySelector++;
            searchRange = (searchRange << 1);
        }
        short rangeShift = (short) (numTables * 16 - searchRange);
        ttfOutputStream.write(getBytes((short) searchRange));
        ttfOutputStream.write(getBytes(entrySelector));
        ttfOutputStream.write(getBytes(rangeShift));
        offset += 12; // offset table is always 12 bytes
    }

    /** Reads all table-directory entries from the WOFF stream. */
    private void getTableRecordEntries(DataInputStream woffFileStream)
            throws IOException {
        int numTables = (Integer) woffHeaders.get("numTables");
        for (int i = 0; i < numTables; i++) {
            HashMap<String, Number> tableDirectory = new HashMap<String, Number>();
            readTableData(woffFileStream, tableRecordEntryFormat,
                    tableDirectory);
            offset += 16; // each TTF directory entry is 16 bytes
            tableRecordEntries.add(tableDirectory);
        }
    }

    /**
     * Writes the TTF table directory, computing each table's output offset
     * (4-byte aligned) and remembering it as "outOffset" for later use.
     */
    private void writeTableRecordEntries(ByteArrayOutputStream ttfOutputStream)
            throws IOException {
        for (HashMap<String, Number> tableRecordEntry : tableRecordEntries) {
            ttfOutputStream.write(getBytes((Integer) tableRecordEntry
                    .get("tag")));
            ttfOutputStream.write(getBytes((Integer) tableRecordEntry
                    .get("origChecksum")));
            ttfOutputStream.write(getBytes(offset));
            ttfOutputStream.write(getBytes((Integer) tableRecordEntry
                    .get("origLength")));
            tableRecordEntry.put("outOffset", offset);
            offset += (Integer) tableRecordEntry.get("origLength");
            // TTF tables are long-aligned
            if (offset % 4 != 0) {
                offset += 4 - (offset % 4);
            }
        }
    }

    /**
     * Copies each table's data from the WOFF stream into the TTF buffer,
     * inflating zlib-compressed tables and padding each table to a 4-byte
     * boundary.
     */
    private void writeFontData(InputStream woffFileStream,
            ByteArrayOutputStream ttfOutputStream) throws IOException,
            DataFormatException {
        for (HashMap<String, Number> tableRecordEntry : tableRecordEntries) {
            int tableRecordEntryOffset = (Integer) tableRecordEntry
                    .get("offset");
            // Advance to the table's start; skip() may skip less than asked
            int skipBytes = tableRecordEntryOffset - readOffset;
            while (skipBytes > 0) {
                long skipped = woffFileStream.skip(skipBytes);
                if (skipped <= 0) {
                    throw new InvalidWoffException("Unexpected end of woff file while seeking table data");
                }
                readOffset += (int) skipped;
                skipBytes -= (int) skipped;
            }
            int compressedLength = (Integer) tableRecordEntry.get("compLength");
            int origLength = (Integer) tableRecordEntry.get("origLength");
            byte[] fontData = new byte[compressedLength];
            byte[] inflatedFontData = new byte[origLength];
            // Read exactly compressedLength bytes; bail out on premature EOF
            int readBytes = 0;
            while (readBytes < compressedLength) {
                int n = woffFileStream.read(fontData, readBytes,
                        compressedLength - readBytes);
                if (n < 0) {
                    throw new InvalidWoffException("Unexpected end of woff file in table data");
                }
                readBytes += n;
            }
            readOffset += compressedLength;
            inflatedFontData = inflateFontData(compressedLength,
                    origLength, fontData, inflatedFontData);
            ttfOutputStream.write(inflatedFontData);
            offset = (Integer) tableRecordEntry.get("outOffset")
                    + (Integer) tableRecordEntry.get("origLength");
            // Pad to the next 4-byte boundary with zero bytes
            int padding = 0;
            if (offset % 4 != 0) {
                padding = 4 - (offset % 4);
            }
            ttfOutputStream.write(getBytes(0), 0, padding);
        }
    }

    /**
     * Inflates a table's data when it is zlib-compressed (compLength !=
     * origLength); otherwise returns the raw data unchanged.
     */
    private byte[] inflateFontData(int compressedLength, int origLength,
            byte[] fontData, byte[] inflatedFontData) {
        if (compressedLength != origLength) {
            Inflater decompressor = new Inflater();
            decompressor.setInput(fontData, 0, compressedLength);
            try {
                decompressor.inflate(inflatedFontData, 0, origLength);
            } catch (DataFormatException e) {
                // keep the cause so the malformed region can be diagnosed
                throw new InvalidWoffException("Malformed woff file", e);
            } finally {
                decompressor.end(); // release native zlib memory
            }
        } else {
            inflatedFontData = fontData;
        }
        return inflatedFontData;
    }

    /** Big-endian 4-byte encoding of an int. */
    private byte[] getBytes(int i) {
        return ByteBuffer.allocate(4).putInt(i).array();
    }

    /** Big-endian 2-byte encoding of a short. */
    private byte[] getBytes(short h) {
        return ByteBuffer.allocate(2).putShort(h).array();
    }

    /**
     * Reads a sequence of 2- or 4-byte fields described by {@code formatTable}
     * into {@code table}, advancing {@link #readOffset} accordingly.
     */
    private void readTableData(DataInputStream woffFileStream,
            LinkedHashMap<String, Integer> formatTable,
            HashMap<String, Number> table) throws IOException {
        Iterator<String> headerKeys = formatTable.keySet().iterator();
        while (headerKeys.hasNext()) {
            String key = headerKeys.next();
            int size = formatTable.get(key);
            if (size == 2) {
                table.put(key, woffFileStream.readUnsignedShort());
            } else if (size == 4) {
                table.put(key, woffFileStream.readInt());
            }
            readOffset += size;
        }
    }
}
异常处理
/**
 * Unchecked exception signalling a malformed or unreadable WOFF file.
 *
 * <p>Mirrors the standard {@link RuntimeException} constructor set so callers
 * can attach a message, a cause, or both.
 */
public class InvalidWoffException extends RuntimeException {

    /** Creates an exception with no detail message. */
    public InvalidWoffException() {
        super();
    }

    /**
     * Creates an exception with a detail message.
     *
     * @param message description of the WOFF defect
     */
    public InvalidWoffException(String message) {
        super(message);
    }

    /**
     * Creates an exception with a detail message and an underlying cause.
     *
     * @param message description of the WOFF defect
     * @param cause the lower-level failure that triggered this exception
     */
    public InvalidWoffException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception wrapping an underlying cause.
     *
     * @param cause the lower-level failure that triggered this exception
     */
    public InvalidWoffException(Throwable cause) {
        super(cause);
    }

    /**
     * Full-control constructor for subclasses.
     *
     * @param enableSuppression whether suppression is enabled
     * @param writableStackTrace whether the stack trace should be writable
     */
    protected InvalidWoffException(String message, Throwable cause, boolean enableSuppression,
            boolean writableStackTrace) {
        super(message, cause, enableSuppression, writableStackTrace);
    }
}
解析
/**
 * Demo entry point: converts the sample WOFF file and extracts both the glyph
 * "uniXXXX" name list and the cmap (page code point -> font glyph name), which
 * together give the two-level unicode mapping described in the article.
 */
public static void main(String[] args) throws IOException, DataFormatException {
// NOTE(review): replace "xxx" with the real directory holding the WOFF file
File file = new File("xxx\\83db004a.woff");
WoffConverter converter = new WoffConverter(file);
// Glyph names from the post table ("uni..." entries only)
List<String> uniCodeList = converter.getUniCodeList();
// Page code point -> glyph-name mapping from the format-12 cmap subtable
LinkedHashMap<Integer, String> cmap = converter.getCmap();
}
通过 cmap 可以获取到网页 unicode 码与对应字符 unicode 码的映射。这里解析出来的网页 unicode 码是一个 integer,转成 16 进制后拼接上 \u 前缀即可。