今天开始正式着手写爬取微博搜索的爬虫,上个星期,找了很多资料,也尝试了很多,想了很多,这个星期开始一点一点的写吧,新手入门,从最简单的开始,先爬取一个微博页面的Html。
package sina_weibo;
//根据Demo4获取微博的Html文件
import java.awt.Desktop;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.Cookie;
import org.apache.http.cookie.CookieOrigin;
import org.apache.http.cookie.CookieSpec;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.cookie.MalformedCookieException;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BestMatchSpecFactory;
import org.apache.http.impl.cookie.BrowserCompatSpec;
import org.apache.http.impl.cookie.BrowserCompatSpecFactory;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
@SuppressWarnings("deprecation")
public class getHtml {
    /**
     * Opens the given URL in the system's default browser.
     * Does nothing when the platform has no desktop / browse support.
     *
     * @param url the address to open
     * @throws URISyntaxException if {@code url} is not a valid URI
     * @throws IOException        if the default browser fails to launch
     */
    public void runBroswer(String url) throws URISyntaxException, IOException {
        // FIX: check isDesktopSupported() BEFORE getDesktop() — the original
        // called getDesktop() first, which throws UnsupportedOperationException
        // on headless JVMs and made the guard useless.
        if (Desktop.isDesktopSupported()) {
            Desktop desktop = Desktop.getDesktop();
            if (desktop.isSupported(Desktop.Action.BROWSE)) {
                desktop.browse(new URI(url));
            }
        }
    }

    /**
     * Fetches the HTML body of the given URL with a 1-second connect/read
     * timeout and a fully lenient cookie policy (Weibo sends cookies that
     * fail strict validation).
     *
     * @param url the page to download
     * @return the page HTML, or the sentinel string "html获取失败" when the
     *         request times out or otherwise fails with an I/O error
     * @throws URISyntaxException     declared for interface compatibility
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException            on I/O errors outside request execution
     */
    public String getHTML(String url) throws URISyntaxException, ClientProtocolException, IOException {
        // Lenient cookie spec: validate() is intentionally empty so no
        // cookie is ever rejected, whatever the server sends back.
        CookieSpecProvider easySpecProvider = new CookieSpecProvider() {
            @SuppressWarnings("deprecation")
            public CookieSpec create(HttpContext context) {
                return new BrowserCompatSpec() {
                    @Override
                    public void validate(Cookie cookie, CookieOrigin origin)
                            throws MalformedCookieException {
                        // accept everything — deliberately no validation
                    }
                };
            }
        };
        @SuppressWarnings("deprecation")
        Registry<CookieSpecProvider> r = RegistryBuilder
                .<CookieSpecProvider> create()
                .register(CookieSpecs.BEST_MATCH, new BestMatchSpecFactory())
                .register(CookieSpecs.BROWSER_COMPATIBILITY,
                        new BrowserCompatSpecFactory())
                .register("easy", easySpecProvider).build();
        RequestConfig requestConfig = RequestConfig.custom()
                .setCookieSpec("easy")
                .setSocketTimeout(1000)  // socket read timeout (ms)
                .setConnectTimeout(1000) // connection timeout (ms)
                .build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(requestConfig);
        String html = "html获取失败"; // sentinel so callers can detect failure
        // FIX: the original never closed the client or the response, leaking a
        // connection per call; try-with-resources closes both deterministically.
        try (CloseableHttpClient httpClient = HttpClients.custom()
                .setDefaultCookieSpecRegistry(r)
                .setDefaultRequestConfig(requestConfig).build()) {
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                html = EntityUtils.toString(response.getEntity());
            } catch (IOException e) {
                // NOTE(review): message claims an automatic reconnect, but no
                // retry is performed — kept byte-for-byte for compatibility.
                System.out.println("****连接超时,程序自动重连****");
            }
        }
        return html;
    }

    /**
     * Opens a fixed sample Weibo page in the default browser and returns
     * its HTML.
     *
     * @return the page HTML, or the failure sentinel from {@link #getHTML}
     * @throws IOException        on I/O failure
     * @throws URISyntaxException never for this hard-coded URL
     */
    public String get() throws IOException, URISyntaxException { // FIX: duplicate IOException removed from throws
        String url = "http://weibo.com/1642088277/C8P1zpVDP";
        // FIX: call our own instance methods directly instead of constructing
        // a redundant second getHtml instance as the original did.
        runBroswer(url);
        return getHTML(url);
    }

    /** Entry point: fetch the sample page and print its HTML to stdout. */
    public static void main(String[] args) throws IOException, URISyntaxException {
        getHtml test = new getHtml();
        System.out.println(test.get());
    }
}
返回的即为一个微博页面的Html,之后对其用jsoup进行解析处理。