最近在写开题报告,发现从 PDF 中复制出来的文字格式非常凌乱:标点全角、半角混用,还莫名其妙地多出许多空格,太影响复制粘贴了。
用 JS 写了个解决这个问题的小工具,基本思路就是用正则表达式做替换。为了能方便地加入新的标点转换规则,在结构上做了一些优化。
<html>
<!-- Remove all whitespace from the input and convert punctuation between half-width and full-width forms. -->
<head>
<meta charset="UTF-8">
<!-- jQuery 1.x from the Google CDN; fetched over HTTPS so it is not blocked as mixed content on https:// pages. -->
<script type="text/javascript"
src="https://ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js"></script>
<script type="text/javascript">
// Once the DOM is ready, attach each button's click handler.
$(function () {
  // [button selector, click handler] pairs
  var bindings = [
    ['#format', format],
    ['#clear', clear]
  ];
  for (var i = 0; i < bindings.length; i++) {
    $(bindings[i][0]).click(bindings[i][1]);
  }
});
//replace strategy
/**
 * One replacement rule: a search pattern paired with its replacement.
 * Both values are stored on the instance under the numeric slots named by
 * the shared REG and REP constants.
 *
 * @constructor
 * @param {*} reg - search pattern, stored in slot REG
 * @param {*} rep - replacement, stored in slot REP
 */
function Strategy(reg, rep) {
  var patternSlot = this.REG;
  this[patternSlot] = reg;
  var replacementSlot = this.REP;
  this[replacementSlot] = rep;
}
// Slot indices shared by every Strategy instance.
Strategy.prototype.REP = 1;
Strategy.prototype.REG = 0;
//replace utils
/**
 * Builds the replacement text for one match: the leading character,
 * untouched, followed by the converted punctuation mark.
 *
 * @param {string} p1 - text to keep as-is (callers in this file pass the first capture group)
 * @param {string} p2 - punctuation mark to convert; used as the lookup key
 * @param {Object} mapping - table from source punctuation to its replacement
 * @returns {string} p1 followed by mapping[p2], or by p2 itself when
 *     mapping has no own entry for it
 */
function change(p1, p2, mapping) {
  // Own-property check: an unmapped or inherited-only key must not leak
  // the text "undefined" (or an inherited value) into the output.
  var hasEntry = Object.prototype.hasOwnProperty.call(mapping, p2);
  return p1 + (hasEntry ? mapping[p2] : p2);
}
/**
 * Builds a global RegExp with two capture groups: group 1 matches `word`,
 * group 2 matches any single character that is an own key of `mapping`.
 *
 * @param {string} word - regex source for the first group; inserted verbatim (not escaped)
 * @param {Object} mapping - object whose own enumerable keys form the character class;
 *     null/undefined yields an empty class that never matches
 * @returns {RegExp} /(word)([keys])/g
 */
function getRegOf(word, mapping) {
  // Own keys only: inherited enumerable properties must not end up in the class.
  var keys = mapping == null ? [] : Object.keys(mapping);
  var charClass = '';
  for (var i = 0; i < keys.length; i++) {
    // Escape only what is special inside [...]; a blanket backslash would
    // turn keys such as "d" or "b" into the escapes \d and \b.
    charClass += keys[i].replace(/[\\\]\^\-]/g, '\\$&');
  }
  return new RegExp('(' + word + ')([' + charClass + '])', 'g');
}
// Half-width (ASCII) punctuation -> full-width (Chinese) punctuation
/**
 * Replace callback for String.prototype.replace, meant to be paired with
 * the pattern from en2cnChange.prototype.mapping.reg().
 *
 * @param {string} match - whole matched text (unused)
 * @param {string} lead - first capture: the character before the punctuation
 * @param {string} mark - second capture: the half-width punctuation mark
 * @returns {string} whatever change() builds from lead, mark and the table below
 */
function en2cnChange(match, lead, mark) {
  return change(lead, mark, en2cnChange.prototype.mapping.mapping);
}
en2cnChange.prototype.mapping = {
  // Targets are written as \u escapes so they survive tools that normalise
  // punctuation width; with ASCII targets the conversion is a silent no-op.
  mapping: {
    ',': '\uFF0C', // FULLWIDTH COMMA
    '.': '\u3002', // IDEOGRAPHIC FULL STOP
    ';': '\uFF1B', // FULLWIDTH SEMICOLON
    '!': '\uFF01'  // FULLWIDTH EXCLAMATION MARK
  },
  /**
   * Returns the pattern for this conversion: a non-word character (\W)
   * followed by one of the keys above. Built on first use and cached in _reg.
   */
  reg: function () {
    if (this._reg == null) {
      // build once
      this._reg = getRegOf('\\W', this.mapping);
    }
    return this._reg;
  }
};
// Full-width (Chinese) punctuation -> half-width (ASCII) punctuation
/**
 * Replace callback for String.prototype.replace, meant to be paired with
 * the pattern from cn2enChange.prototype.mapping.reg().
 *
 * @param {string} match - whole matched text (unused)
 * @param {string} lead - first capture: the character before the punctuation
 * @param {string} mark - second capture: the full-width punctuation mark
 * @returns {string} whatever change() builds from lead, mark and the table below
 */
function cn2enChange(match, lead, mark) {
  return change(lead, mark, cn2enChange.prototype.mapping.mapping);
}
cn2enChange.prototype.mapping = {
  // Keys are written as \u escapes so they survive tools that normalise
  // punctuation width; with ASCII keys the full-width marks never match.
  mapping: {
    '\uFF0C': ',', // FULLWIDTH COMMA
    '\u3002': '.', // IDEOGRAPHIC FULL STOP
    '\uFF1B': ';', // FULLWIDTH SEMICOLON
    '\uFF01': '!'  // FULLWIDTH EXCLAMATION MARK
  },
  /**
   * Returns the pattern for this conversion: a word character (\w)
   * followed by one of the keys above. Built on first use and cached in _reg.
   */
  reg: function () {
    if (this._reg == null) {
      // build once
      this._reg = getRegOf('\\w', this.mapping);
    }
    return this._reg;
  }
};
// event handler
function format(){
var str = $('#input').val();
var reg = Strategy.prototype.REG;
var rep = Strategy.prototype.REP;
var strategies = format.prototype.strategies;
for(var i in strategies){
var strategy = strategies[i];
str = str.replace(strategy[reg], strategy[rep]);
}
$('#output').val(str);
}
// Replacement pipeline consumed by format(); entries run in array order.
format.prototype.strategies = (function () {
  var en2cn = en2cnChange.prototype.mapping;
  var cn2en = cn2enChange.prototype.mapping;
  function dropMatch() {
    return '';
  }
  return [
    new Strategy(en2cn.reg(), en2cnChange),
    new Strategy(cn2en.reg(), cn2enChange),
    new Strategy(/\s/g, dropMatch) // white space
  ];
}());
/**
 * Click handler for the "clear" button: empties every textarea on the page.
 */
function clear() {
  // .val(value) assigns the value to each matched element
  $('textarea').val('');
}
</script>
<style type="text/css">
/* Shared box sizing for every textarea on the page (#input and #output). */
textarea {
  display: inline-block;
  margin: 1em;
  width: 45%;
  height: 80%;
}
</style>
</head>
<!-- #input holds the text to convert and format() writes the result to #output; the two buttons are bound by id to format() and clear() in the script above. -->
<textarea id="input" placeholder="input"></textarea>
<textarea id="output" placeholder="output"></textarea>
<br/>
<button id="format">format</button>
<button id="clear">clear</button>
</html>