Java crawler: scraping news from the Huxiu homepage
I'm new to web crawlers, and this program runs painfully slowly.
How can I optimize this code with multithreading and a queue?
package com.eayon.spider_huxiu;
import java.util.ArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.HttpEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.google.gson.Gson;
public class Spider_HuxiuMain {
    private static final String startwith = "https://www.huxiu.com/article/";
    private static final String endwith = ".html";
    private static String last_dataline;
    private static ArticleDao articleDao = new ArticleDao();
    private static final ExecutorService threadPool = Executors.newFixedThreadPool(10);
    public static void main(String[] args) throws Exception {
        // 1. Fetch the Huxiu homepage
        String url = "https://www.huxiu.com/";
        // 1.1 Fetch the HTML of a single page
        String indexHtml = getOneHtml(url);
        // 1.2 Parse the homepage HTML to get every article's URL (a fixed prefix plus the article id)
        ArrayList<String> articleIds = getarticleId(indexHtml);
        // 1.3 Crawl the news page behind each URL and return the articles as a list
        ArrayList<Article> articles = getArticle(articleIds);
        // 1.4 Save them to the database
        save2db(articles);
        // 2. Paged data: last_dateline has to be sent in the request body
        last_dataline = getValueByIndexHtml(indexHtml);
        // 2.1 Request each following page and save the results right away
        for (int i = 2; i < 1585; i++) {
            requestNextPage(i);
        }
    }
    /**
     * Requests the next page of the article list.
     * @param page the page number to request
     * @throws Exception
     */
    private static void requestNextPage(int page) throws Exception {
        String url = "https://www.huxiu.com/v2_action/article_list";
        HttpPost httpPost = new HttpPost(url);
        // Add the request headers and form parameters
        setHeader(httpPost);
        ArrayList<BasicNameValuePair> arrayList = new ArrayList<BasicNameValuePair>();
        arrayList.add(new BasicNameValuePair("huxiu_hash_code", "353a9683918c807f5f783dc1df116fad"));
        arrayList.add(new BasicNameValuePair("page", page + ""));
        arrayList.add(new BasicNameValuePair("last_dateline", last_dataline));
        httpPost.setEntity(new UrlEncodedFormEntity(arrayList));
        // Send the request; try-with-resources closes the client and response so connections don't leak
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(httpPost)) {
            // The endpoint responds with JSON
            String jsonHtml = EntityUtils.toString(response.getEntity());
            Gson gson = new Gson();
            ResponseBean responseBean = gson.fromJson(jsonHtml, ResponseBean.class);
            String html = responseBean.getData();
            // Keep the new last_dateline from the response for the next request
            last_dataline = responseBean.getLast_dateline();
            System.err.println("Request " + (page - 1) + " done; last_dateline is now " + last_dataline);
            // Extract the article ids (the news hyperlinks) from the returned fragment
            ArrayList<String> articleIds = getarticleId(html);
            // Fetch the article pages for those ids
            ArrayList<Article> articles = getArticle(articleIds);
            // Save them to the database
            save2db(articles);
            System.out.println("Page " + (page - 1) + " saved!");
        }
    }
    /**
     * Gets the initial last_dateline from the homepage HTML.
     * @param indexHtml
     * @return
     */
    private static String getValueByIndexHtml(String indexHtml) {
        Document doc = Jsoup.parse(indexHtml);
        Elements select = doc.select("div[data-last_dateline]");
        return select.attr("data-last_dateline");
    }
    /**
     * Saves the articles to the database.
     * @param articles
     */
    private static void save2db(ArrayList<Article> articles) {
        for (Article article : articles) {
            articleDao.save(article);
        }
    }
    /**
     * Loads the article pages on multiple threads.
     * @param articleIds
     * @return
     * @throws Exception
     */
    public static ArrayList<Article> getArticle(ArrayList<String> articleIds) throws Exception {
        ArrayList<Article> listArticle = new ArrayList<Article>();
        for (String articleId : articleIds) {
            threadPool.execute(new Tread_More(articleId, listArticle));
        }
        // NOTE: this returns before the submitted tasks have finished, so listArticle
        // can still be (partly) empty when save2db runs, and the workers add to a
        // plain ArrayList concurrently. This is the part I most want to restructure.
        return listArticle;
    }
    /**
     * Loads a single article page and extracts the useful fields.
     * @throws Exception
     */
    public static void toLoadPage(ArrayList<Article> listArticle, String articleId) throws Exception {
        Article article = new Article();
        String url = startwith + articleId + endwith;
        // Fetch the article page
        String articleHtml = getOneHtml(url);
        // Parse the page and fill in the Article object
        Document doc = Jsoup.parse(articleHtml);
        article.setId(articleId);
        article.setUrl(url);
        String title = doc.select(".t-h1").get(0).text();
        article.setTitle(title);
        String author = doc.select("span[class=author-name]").get(0).text();
        article.setAuthor(author);
        String createTime = doc.select("span[class^=article-time]").get(0).text();
        article.setCreateTime(createTime);
        // Favourite count ("收藏" is the "favourites" label stripped from the text)
        String share = doc.select("span[class^=article-share]").get(0).text();
        share = share.replace("收藏", "");
        article.setSc(share);
        // Comment count ("评论" is the "comments" label)
        String pl = doc.select("span[class^=article-pl]").get(0).text();
        pl = pl.replace("评论", "");
        article.setPl(pl);
        String content = doc.select("div[class=article-content-wrap]").get(0).text();
        article.setContent(content);
        // Like count
        String zan = doc.select("div[class^=praise-box] span[class=num]").get(0).ownText();
        article.setZan(zan);
        listArticle.add(article);
    }
    /**
     * Extracts the article ids from an HTML fragment.
     * @param html
     * @return
     */
    private static ArrayList<String> getarticleId(String html) {
        if (html != null) {
            ArrayList<String> articleIds = new ArrayList<String>();
            Document doc = Jsoup.parse(html);
            Elements urls = doc.select("div[data-aid]");
            for (Element element : urls) {
                String articleId = element.attr("data-aid");
                articleIds.add(articleId);
            }
            return articleIds;
        }
        return null;
    }
    /**
     * Fetches a single HTML page.
     * @param url
     * @return the page HTML, or null if the response status is not 200
     * @throws Exception
     */
    private static String getOneHtml(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        setHeader(httpGet);
        String html = null;
        // try-with-resources closes the client and response so connections don't leak
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                html = EntityUtils.toString(entity);
            }
        }
        return html;
    }
    /**
     * Adds request headers so the request looks like it comes from a browser.
     * @param request
     */
    private static void setHeader(HttpRequestBase request) {
        request.addHeader("user-agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
    }
}
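
What I have in mind is a producer-consumer rework: the main thread keeps walking the list pages serially (it has to, since each request needs the last_dateline from the previous response), the article fetches run on a thread pool, and every parsed article goes onto a BlockingQueue that a single writer thread drains into the database. Below is a rough sketch of that shape. It reuses my Article and ArticleDao classes from above; to keep it short, the producer only reads the homepage (the paged POST from requestNextPage would feed the pool the same way), and fetchAndParse only fills in the id, url and title (the full Jsoup parsing from toLoadPage would slot in there). Is this the right direction?

package com.eayon.spider_huxiu;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class SpiderPipeline {
    private static final String UA =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";
    // Bounded queue between the fetch workers and the database writer; if the
    // database is the slow side, put() blocks and throttles the fetchers.
    private static final BlockingQueue<Article> articleQueue = new LinkedBlockingQueue<Article>(500);
    // Poison pill telling the writer thread that no more articles are coming.
    private static final Article POISON = new Article();

    public static void main(String[] args) throws Exception {
        final ArticleDao dao = new ArticleDao();
        ExecutorService fetchPool = Executors.newFixedThreadPool(10);

        // Single writer thread: the only thread that touches the DAO, so the
        // database side needs no synchronization at all.
        Thread writer = new Thread(() -> {
            try {
                while (true) {
                    Article a = articleQueue.take();
                    if (a == POISON) {
                        return;
                    }
                    dao.save(a);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        writer.start();

        // Producer: collect article ids and hand each one to the pool; the
        // pool's internal work queue buffers the ids for the workers.
        Document index = Jsoup.connect("https://www.huxiu.com/").userAgent(UA).get();
        for (Element div : index.select("div[data-aid]")) {
            final String id = div.attr("data-aid");
            fetchPool.execute(() -> {
                try {
                    articleQueue.put(fetchAndParse(id));
                } catch (Exception e) {
                    System.err.println("failed on article " + id + ": " + e);
                }
            });
        }

        // Orderly shutdown: stop accepting work, wait for the fetchers, then
        // let the writer drain the queue and stop.
        fetchPool.shutdown();
        fetchPool.awaitTermination(1, TimeUnit.HOURS);
        articleQueue.put(POISON);
        writer.join();
    }

    // Trimmed-down fetch step; the full parsing from toLoadPage (author, time,
    // shares, comments, content, likes) would go here.
    private static Article fetchAndParse(String id) throws Exception {
        String url = "https://www.huxiu.com/article/" + id + ".html";
        Document doc = Jsoup.connect(url).userAgent(UA).get();
        Article article = new Article();
        article.setId(id);
        article.setUrl(url);
        article.setTitle(doc.select(".t-h1").text());
        return article;
    }
}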