获取请求链接的服务器头链接,无协议,给用户加协议,tomcat下获取webApp路径,传入文件名,删除不是同年月日的所有文件,判断网页的编码,爬虫完美绕过服务器反爬检查代码
获取请求链接的服务器头链接:String hostPort = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort();
// The user supplied a URL without a scheme: prefer https and fall back to http
// when the https probe fails.
// The check looks at the prefix (case-insensitively) instead of contains("http"),
// so a scheme-less URL that merely contains "http" somewhere in its host or path
// (e.g. "example.com/http-intro") still gets a scheme prepended.
if (!url.regionMatches(true, 0, "http://", 0, 7)
        && !url.regionMatches(true, 0, "https://", 0, 8)) {
    try {
        URL url0 = new URL("https://" + url);
        // Probe only: close the stream immediately so the connection is not leaked.
        url0.openStream().close();
        url = "https://" + url;
    } catch (Exception e) {
        // Any failure (malformed URL, TLS error, unreachable host) means https is unusable.
        url = "http://" + url;
    }
}
tomcat下获取webApp路径:
nowpath=System.getProperty("user.dir");
tempdir=nowpath.replace("bin", "webapps"); //把bin 文件夹变到 webapps文件里面
删除文件夹下的文件
/**
 * Deletes every entry in the Tomcat temp directory whose name does not contain
 * the date prefix of {@code fileName}.
 *
 * <p>The date key is the first 10 characters of {@code fileName} (presumably a
 * {@code yyyy-MM-dd} style prefix; confirm against the caller). Entries whose
 * name contains that key, and the fixed file {@code get_xpath.html}, are kept.
 * If the temp directory does not exist or cannot be listed, nothing is deleted.
 *
 * @param fileName name of a current file; must be at least 10 characters long
 * @throws StringIndexOutOfBoundsException if {@code fileName} is shorter than 10 characters
 */
public void deleteOutDateJsp(String fileName) {
    // Working directory of the JVM; when started from Tomcat's bin directory this
    // looks like D:\java\software\apache-tomcat-6.0.14\bin
    String nowpath = System.getProperty("user.dir");
    // Turn the bin directory into the sibling webapps directory.
    // NOTE(review): replace() rewrites every occurrence of "bin" in the path, not
    // only the last segment; verify the install path contains "bin" only once.
    String tempdir = nowpath.replace("bin", "webapps");
    String date = fileName.substring(0, 10);
    // Tomcat layout; under jetty the folder was "src/main/webapp/static/temp/".
    File folder = new File(tempdir + "/datacrawl/static/temp/");
    File[] files = folder.listFiles();
    if (files == null) {
        // listFiles() returns null when the path is not a directory or cannot be
        // read; there is nothing to clean up in that case.
        return;
    }
    for (File candidate : files) {
        String name = candidate.getName();
        boolean keep = name.contains(date) || name.equals("get_xpath.html");
        if (!keep) {
            candidate.delete();
        }
    }
}
判断网页的编码(根据 meta 标签中的 charset 声明):
/**
 * Guesses the character encoding of a saved HTML file from its meta tag.
 *
 * <p>The file is read line by line. The first line that contains both "meta"
 * and "charset" (compared case-insensitively, since HTML tag and attribute
 * names are not case-sensitive) and names one of the known encodings decides
 * the result. Encodings are checked in the order GBK, UTF-8, GB2312.
 *
 * @param htmlfileName path of the HTML file to inspect
 * @return {@code "GBK"}, {@code "UTF-8"} or {@code "GB2312"}; the empty string
 *         when no such declaration is found or the file cannot be read
 */
public String findCharset(String htmlfileName) {
    BufferedReader bufReader = null;
    try {
        bufReader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(htmlfileName))));
        String line;
        while ((line = bufReader.readLine()) != null) {
            // Locale.ROOT keeps the lower-casing independent of the default locale.
            String lower = line.toLowerCase(java.util.Locale.ROOT);
            if (!lower.contains("meta") || !lower.contains("charset")) {
                continue;
            }
            if (lower.contains("gbk")) {
                return "GBK";
            }
            if (lower.contains("utf-8")) {
                return "UTF-8";
            }
            if (lower.contains("gb2312")) {
                return "GB2312";
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // bufReader is still null when opening the file failed; closing it
        // unconditionally used to raise a NullPointerException from this block.
        if (bufReader != null) {
            try {
                bufReader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return "";
}
爬虫绕过服务器反爬检查的代码(伪装浏览器 User-Agent,并信任所有 HTTPS 证书和主机名;会关闭证书校验,存在中间人攻击风险,仅适用于抓取不敏感的公开页面):
// SECURITY NOTE: this verifier accepts every host name, and
// trustAllHttpsCertificates() is called below before it is installed as the
// JVM-wide default for HttpsURLConnection.
HostnameVerifier hv = new HostnameVerifier() {
    public boolean verify(String urlHostName, SSLSession session) {
        String peerHost = session.getPeerHost();
        System.out.println("Warning: URL Host: " + urlHostName + " vs. " + peerHost);
        return true;
    }
};
trustAllHttpsCertificates();
HttpsURLConnection.setDefaultHostnameVerifier(hv);
// Open the target page with a browser-like User-Agent and expose its body,
// decoded as UTF-8, through isr / bufr.
try {
    URLConnection pageConnection = new URL(string).openConnection();
    pageConnection.setRequestProperty(
            "User-Agent",
            "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
    pageConnection.connect();
    isr = new InputStreamReader(pageConnection.getInputStream(), Charset.forName("UTF-8"));
    bufr = new BufferedReader(isr);
} catch (Exception e) {
    // On failure isr / bufr keep whatever value they had before this snippet ran.
    e.printStackTrace();
}
/**
 * Installs a JVM-wide default SSL socket factory for {@code HttpsURLConnection}
 * whose only trust manager is a {@code miTM} instance.
 *
 * <p>SECURITY NOTE: the {@code miTM} trust manager performs no certificate
 * checks, so after this call HTTPS connections made through
 * {@code HttpsURLConnection} are open to man-in-the-middle attacks.
 *
 * <p>If the SSL context cannot be created or initialised, the failure is
 * printed and the current default socket factory is left untouched.
 */
private void trustAllHttpsCertificates() {
    javax.net.ssl.TrustManager[] trustAllCerts =
            new javax.net.ssl.TrustManager[] { new miTM() };
    try {
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("SSL");
        sc.init(null, trustAllCerts, null);
        // Only reached when the context was created and initialised. Previously a
        // failure above was printed and execution continued, dereferencing a null
        // or uninitialised context.
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    } catch (NoSuchAlgorithmException e) {
        e.printStackTrace();
    } catch (KeyManagementException e) {
        e.printStackTrace();
    }
}
/**
 * Trust manager that accepts every certificate chain without inspection.
 *
 * <p>SECURITY NOTE: both {@code check*Trusted} methods return without
 * validating anything, so using this class disables TLS certificate
 * verification. It must not be used for connections that carry sensitive data.
 */
static class miTM implements javax.net.ssl.TrustManager,
        javax.net.ssl.X509TrustManager {

    /**
     * Returns the accepted issuers.
     *
     * @return an empty array; the {@code X509TrustManager} contract requires a
     *         non-null result, so {@code null} is never returned
     */
    public java.security.cert.X509Certificate[] getAcceptedIssuers() {
        return new java.security.cert.X509Certificate[0];
    }

    /**
     * Legacy helper, not part of {@code X509TrustManager}.
     *
     * @param certs ignored
     * @return always {@code true}
     */
    public boolean isServerTrusted(
            java.security.cert.X509Certificate[] certs) {
        return true;
    }

    /**
     * Legacy helper, not part of {@code X509TrustManager}.
     *
     * @param certs ignored
     * @return always {@code true}
     */
    public boolean isClientTrusted(
            java.security.cert.X509Certificate[] certs) {
        return true;
    }

    /** Accepts any server certificate chain; never throws. */
    public void checkServerTrusted(
            java.security.cert.X509Certificate[] certs, String authType)
            throws java.security.cert.CertificateException {
        // Intentionally no validation.
    }

    /** Accepts any client certificate chain; never throws. */
    public void checkClientTrusted(
            java.security.cert.X509Certificate[] certs, String authType)
            throws java.security.cert.CertificateException {
        // Intentionally no validation.
    }
}