我是直接把 process 中的 page.getHtml() 替换为我自己通过 HTTPS 请求得到的 HTML，代码如下：
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
/**
 * WebMagic {@link PageProcessor} demo that ignores the HTML downloaded by the
 * spider and instead fetches {@link #baseUrl} itself over HTTPS, then extracts
 * image URLs from that document.
 *
 * <p>SECURITY NOTE: {@link #getHtmlByHttps(String, String)} deliberately accepts
 * any server certificate and any host name. That is only acceptable for
 * throw-away scraping experiments; the relaxed checks are scoped to the single
 * connection it opens and are not installed as JVM-wide defaults.
 */
public class TestWebMagic implements PageProcessor {

    /** Maximum time to wait for the TCP/TLS connection to be established. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Maximum time to wait for data once connected. */
    private static final int READ_TIMEOUT_MS = 30000;

    /** Host name verifier that accepts every host (see the class-level security note). */
    private static final HostnameVerifier TRUST_ALL_HOSTS = new HostnameVerifier() {
        @Override
        public boolean verify(String hostname, SSLSession session) {
            return true;
        }
    };

    /** URL actually scraped by {@link #process(Page)}; assigned in {@link #main(String[])}. */
    private static String baseUrl;

    /**
     * Fetches {@link #baseUrl} directly and stores the matching image URLs in the
     * page's result items under the key {@code "imgs"}.
     *
     * @param page the page handed over by the spider; its own content is not used
     */
    @Override
    public void process(Page page) {
        // Fetch the page HTML ourselves over HTTPS instead of using page.getHtml().
        String body = getHtmlByHttps(baseUrl, "UTF-8");
        if (body == null) {
            // The download failed (already reported by getHtmlByHttps); there is
            // nothing to extract, so skip this page instead of parsing null.
            page.setSkip(true);
            return;
        }
        Html html = new Html(body);
        System.out.println(html);
        page.putField("imgs", html.$("img", "src-medium").regex(".*800x800.jpg.*").all());
    }

    /**
     * @return the crawl settings: 3 retries and a 1000 ms sleep time
     */
    @Override
    public Site getSite() {
        return Site.me().setRetryTimes(3).setSleepTime(1000);
    }

    /**
     * Downloads the document at {@code u} with a GET request and returns its body
     * decoded with {@code encoding}.
     *
     * <p>For HTTPS URLs, certificate and host name validation are disabled on this
     * one connection only. Line terminators in the response are normalized to
     * {@code '\n'} so that adjacent lines are not glued together.
     *
     * @param u        the URL to fetch
     * @param encoding the charset name used to decode the response body
     * @return the response body, or {@code null} if the request failed for any
     *         reason (the stack trace is printed to standard error)
     */
    public static String getHtmlByHttps(String u, String encoding) {
        HttpURLConnection conn = null;
        try {
            // Use the JDK's registered protocol handler rather than the internal
            // sun.net.www.protocol.https.Handler, which is not accessible on
            // modular JDKs.
            conn = (HttpURLConnection) new URL(u).openConnection();
            if (conn instanceof HttpsURLConnection) {
                HttpsURLConnection secure = (HttpsURLConnection) conn;
                secure.setSSLSocketFactory(newTrustAllContext().getSocketFactory());
                secure.setHostnameVerifier(TRUST_ALL_HOSTS);
            }
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);
            conn.setUseCaches(false);
            conn.setRequestMethod("GET");

            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(conn.getInputStream(), encoding))) {
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line).append('\n');
                }
                return sb.toString();
            }
        } catch (Exception e) {
            // Boundary of the helper: every failure (bad URL, TLS setup, I/O,
            // unsupported charset, non-HTTP URL) maps to the documented null result.
            e.printStackTrace();
            return null;
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Builds an {@link SSLContext} whose trust manager accepts every certificate chain.
     *
     * @return a freshly initialized trust-all context
     * @throws java.security.GeneralSecurityException if the context cannot be
     *         created or initialized
     */
    private static SSLContext newTrustAllContext() throws java.security.GeneralSecurityException {
        TrustManager trustAll = new X509TrustManager() {
            @Override
            public void checkClientTrusted(X509Certificate[] chain, String authType)
                    throws CertificateException {
                // Intentionally empty: accept everything.
            }

            @Override
            public void checkServerTrusted(X509Certificate[] chain, String authType)
                    throws CertificateException {
                // Intentionally empty: accept everything.
            }

            @Override
            public X509Certificate[] getAcceptedIssuers() {
                // The interface contract asks for a non-null (possibly empty) array.
                return new X509Certificate[0];
            }
        };
        SSLContext context = SSLContext.getInstance("TLS");
        context.init(null, new TrustManager[] {trustAll}, new SecureRandom());
        return context;
    }

    /**
     * Sets the URL to scrape and runs the spider once.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        baseUrl = "https://product.suning.com/0070137013/149868717.html";
        Spider.create(new TestWebMagic())
                // Any URL the spider can download works as a seed; process()
                // fetches baseUrl itself and ignores the seed page's content.
                .addUrl("http://fanyi.baidu.com")
                // Start the crawler (blocks until it finishes).
                .run();
    }
}