开始学习——爬取微博页面的html

今天开始正式着手写爬取微博搜索的爬虫。上个星期找了很多资料,也做了很多尝试和思考;这个星期开始一点一点地写。新手入门,从最简单的开始:先爬取一个微博页面的 HTML。

package sina_weibo;
//根据Demo4获取微博的Html文件
import java.awt.Desktop;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.Cookie;
import org.apache.http.cookie.CookieOrigin;
import org.apache.http.cookie.CookieSpec;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.cookie.MalformedCookieException;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BestMatchSpecFactory;
import org.apache.http.impl.cookie.BrowserCompatSpec;
import org.apache.http.impl.cookie.BrowserCompatSpecFactory;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

/**
 * Minimal demo that opens a Weibo page in the default browser and downloads
 * its HTML with Apache HttpClient.
 *
 * <p>The deprecated cookie-spec classes are used on purpose to build a lenient
 * cookie policy, hence the class-level suppression.
 */
@SuppressWarnings("deprecation")
public class getHtml {
    /** Page fetched by {@link #get()}. */
    private static final String SAMPLE_URL = "http://weibo.com/1642088277/C8P1zpVDP";

    /** Marker returned by {@link #getHTML(String)} when no HTML could be read. */
    private static final String FETCH_FAILED = "html获取失败";

    /** Name under which the lenient cookie spec is registered. */
    private static final String EASY_COOKIE_SPEC = "easy";

    /** Socket and connect timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 1000;

    /** Charset used to decode the body when the response does not declare one. */
    private static final String FALLBACK_CHARSET = "UTF-8";

    /**
     * Opens the given address in the system default browser.
     *
     * <p>Does nothing when the Desktop API or its BROWSE action is unavailable.
     * Support is checked before {@link Desktop#getDesktop()} is called, because
     * that call is documented to throw on unsupported or headless platforms.
     *
     * @param url address to open
     * @throws URISyntaxException if {@code url} is not a valid URI
     * @throws IOException if the browser cannot be launched
     */
    public void runBroswer(String url) throws URISyntaxException, IOException {
        if (!Desktop.isDesktopSupported()) {
            return;
        }
        Desktop desktop = Desktop.getDesktop();
        if (desktop.isSupported(Desktop.Action.BROWSE)) {
            desktop.browse(new URI(url));
        }
    }

    /**
     * Downloads the body of {@code url} as a string.
     *
     * <p>I/O failures are reported on the console and are not propagated; in that
     * case, or when the response has no body, the marker text
     * {@code "html获取失败"} is returned. The HTTP client and the response are
     * always closed before this method returns.
     *
     * @param url address to fetch
     * @return the response body, or the failure marker
     * @throws URISyntaxException declared for backward compatibility
     * @throws ClientProtocolException declared for backward compatibility
     * @throws IOException declared for backward compatibility
     */
    public String getHTML(String url) throws URISyntaxException, ClientProtocolException, IOException {
        // Cookie spec that skips validation so that no cookie is ever rejected.
        CookieSpecProvider easySpecProvider = new CookieSpecProvider() {
            @Override
            public CookieSpec create(HttpContext context) {
                return new BrowserCompatSpec() {
                    @Override
                    public void validate(Cookie cookie, CookieOrigin origin)
                            throws MalformedCookieException {
                        // Intentionally empty: accept every cookie.
                    }
                };
            }
        };
        Registry<CookieSpecProvider> cookieSpecs = RegistryBuilder
                .<CookieSpecProvider> create()
                .register(CookieSpecs.BEST_MATCH, new BestMatchSpecFactory())
                .register(CookieSpecs.BROWSER_COMPATIBILITY, new BrowserCompatSpecFactory())
                .register(EASY_COOKIE_SPEC, easySpecProvider)
                .build();
        RequestConfig requestConfig = RequestConfig.custom()
                .setCookieSpec(EASY_COOKIE_SPEC)
                .setSocketTimeout(TIMEOUT_MILLIS)
                .setConnectTimeout(TIMEOUT_MILLIS)
                .build();

        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(requestConfig);

        String html = FETCH_FAILED;
        // try-with-resources releases the connection and the client's pool
        // on both the success and the failure path.
        try (CloseableHttpClient httpClient = HttpClients.custom()
                     .setDefaultCookieSpecRegistry(cookieSpecs)
                     .setDefaultRequestConfig(requestConfig)
                     .build();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // A response without an entity would make EntityUtils throw.
            if (response.getEntity() != null) {
                html = EntityUtils.toString(response.getEntity(), FALLBACK_CHARSET);
            }
        } catch (IOException e) {
            System.out.println("****连接超时,程序自动重连****");
            // Keep the cause visible; the message above does not say what failed.
            System.err.println("getHTML failed for " + url + ": " + e);
        }
        return html;
    }

    /**
     * Opens the sample Weibo page in the browser and returns its HTML.
     *
     * @return the page HTML, or the failure marker from {@link #getHTML(String)}
     * @throws IOException if the browser cannot be launched
     * @throws URISyntaxException if the sample URL is not a valid URI
     */
    public String get() throws IOException, URISyntaxException {
        runBroswer(SAMPLE_URL);
        return getHTML(SAMPLE_URL);
    }

    /**
     * Runs the demo and prints the fetched HTML to standard output.
     *
     * @param args unused
     * @throws IOException if the browser cannot be launched
     * @throws URISyntaxException if the sample URL is not a valid URI
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        getHtml test = new getHtml();
        System.out.println(test.get());
    }
}

返回的即为该微博页面的 HTML,之后用 jsoup 对其进行解析处理。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值