今天开始正式着手写爬取微博搜索的爬虫,上个星期,找了很多资料,也尝试了很多,想了很多,这个星期开始一点一点的写吧,新手入门,从最简单的开始,先爬取一个微博页面的Html。
package sina_weibo;
//根据Demo4获取微博的Html文件
import java.awt.Desktop;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.Cookie;
import org.apache.http.cookie.CookieOrigin;
import org.apache.http.cookie.CookieSpec;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.cookie.MalformedCookieException;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BestMatchSpecFactory;
import org.apache.http.impl.cookie.BrowserCompatSpec;
import org.apache.http.impl.cookie.BrowserCompatSpecFactory;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
@SuppressWarnings("deprecation")
public class getHtml {
    /**
     * Opens the given URL in the system's default browser.
     * Does nothing when the platform has no desktop / browse support.
     *
     * @param url the address to open
     * @throws URISyntaxException if {@code url} is not a valid URI
     * @throws IOException        if the default browser fails to launch
     */
    public void runBroswer(String url) throws URISyntaxException, IOException {
        // FIX: check isDesktopSupported() BEFORE getDesktop() — the original
        // called getDesktop() first, which throws UnsupportedOperationException
        // on headless JVMs and made the guard useless.
        if (Desktop.isDesktopSupported()) {
            Desktop desktop = Desktop.getDesktop();
            if (desktop.isSupported(Desktop.Action.BROWSE)) {
                desktop.browse(new URI(url));
            }
        }
    }

    /**
     * Fetches the HTML body of the given URL with a 1-second connect/read
     * timeout and a fully lenient cookie policy (Weibo sends cookies that
     * fail strict validation).
     *
     * @param url the page to download
     * @return the page HTML, or the sentinel string "html获取失败" when the
     *         request times out or otherwise fails with an I/O error
     * @throws URISyntaxException     declared for interface compatibility
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException            on I/O errors outside request execution
     */
    public String getHTML(String url) throws URISyntaxException, ClientProtocolException, IOException {
        // Lenient cookie spec: validate() is intentionally empty so no
        // cookie is ever rejected, whatever the server sends back.
        CookieSpecProvider easySpecProvider = new CookieSpecProvider() {
            @SuppressWarnings("deprecation")
            public CookieSpec create(HttpContext context) {
                return new BrowserCompatSpec() {
                    @Override
                    public void validate(Cookie cookie, CookieOrigin origin)
                            throws MalformedCookieException {
                        // accept everything — deliberately no validation
                    }
                };
            }
        };
        @SuppressWarnings("deprecation")
        Registry<CookieSpecProvider> r = RegistryBuilder
                .<CookieSpecProvider> create()
                .register(CookieSpecs.BEST_MATCH, new BestMatchSpecFactory())
                .register(CookieSpecs.BROWSER_COMPATIBILITY,
                        new BrowserCompatSpecFactory())
                .register("easy", easySpecProvider).build();
        RequestConfig requestConfig = RequestConfig.custom()
                .setCookieSpec("easy")
                .setSocketTimeout(1000)  // socket read timeout (ms)
                .setConnectTimeout(1000) // connection timeout (ms)
                .build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(requestConfig);
        String html = "html获取失败"; // sentinel so callers can detect failure
        // FIX: the original never closed the client or the response, leaking a
        // connection per call; try-with-resources closes both deterministically.
        try (CloseableHttpClient httpClient = HttpClients.custom()
                .setDefaultCookieSpecRegistry(r)
                .setDefaultRequestConfig(requestConfig).build()) {
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                html = EntityUtils.toString(response.getEntity());
            } catch (IOException e) {
                // NOTE(review): message claims an automatic reconnect, but no
                // retry is performed — kept byte-for-byte for compatibility.
                System.out.println("****连接超时,程序自动重连****");
            }
        }
        return html;
    }

    /**
     * Opens a fixed sample Weibo page in the default browser and returns
     * its HTML.
     *
     * @return the page HTML, or the failure sentinel from {@link #getHTML}
     * @throws IOException        on I/O failure
     * @throws URISyntaxException never for this hard-coded URL
     */
    public String get() throws IOException, URISyntaxException { // FIX: duplicate IOException removed from throws
        String url = "http://weibo.com/1642088277/C8P1zpVDP";
        // FIX: call our own instance methods directly instead of constructing
        // a redundant second getHtml instance as the original did.
        runBroswer(url);
        return getHTML(url);
    }

    /** Entry point: fetch the sample page and print its HTML to stdout. */
    public static void main(String[] args) throws IOException, URISyntaxException {
        getHtml test = new getHtml();
        System.out.println(test.get());
    }
}
返回的即为一个微博页面的Html,之后对其用jsoup进行解析处理。