Java crawler: scraping news from the Huxiu homepage
I'm new to web crawlers, and this program runs painfully slowly.
How can I optimize this code with multithreading and a queue?
package com.eayon.spider_huxiu;
import java.util.ArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.HttpEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.google.gson.Gson;
public class Spider_HuxiuMain {
    private static final String startwith = "https://www.huxiu.com/article/";
    private static final String endwith = ".html";
    private static String last_dataline;
    private static ArticleDao articleDao = new ArticleDao();
    private static final ExecutorService threadPool = Executors.newFixedThreadPool(10);
    public static void main(String[] args) throws Exception {
        // 1. Fetch the Huxiu homepage
        String url = "https://www.huxiu.com/";
        // 1.1 Fetch the HTML of a single page
        String indexHtml = getOneHtml(url);
        // 1.2 Parse the homepage HTML to get every article's URL (a fixed prefix plus the article id)
        ArrayList<String> articleIds = getarticleId(indexHtml);
        // 1.3 Crawl the news page behind each URL and return the articles as a list
        ArrayList<Article> articles = getArticle(articleIds);
        // 1.4 Save them to the database
        save2db(articles);
        // 2. Paged data: last_dateline has to be sent in the request body
        last_dataline = getValueByIndexHtml(indexHtml);
        // 2.1 Request each following page and save the results right away
        for (int i = 2; i < 1585; i++) {
            requestNextPage(i);
        }
    }
    /**
     * Requests the next page of the article list.
     * @param page the page number to request
     * @throws Exception
     */
    private static void requestNextPage(int page) throws Exception {
        String url = "https://www.huxiu.com/v2_action/article_list";
        HttpPost httpPost = new HttpPost(url);
        // Add the request headers and form parameters
        setHeader(httpPost);
        ArrayList<BasicNameValuePair> arrayList = new ArrayList<BasicNameValuePair>();
        arrayList.add(new BasicNameValuePair("huxiu_hash_code", "353a9683918c807f5f783dc1df116fad"));
        arrayList.add(new BasicNameValuePair("page", page + ""));
        arrayList.add(new BasicNameValuePair("last_dateline", last_dataline));
        httpPost.setEntity(new UrlEncodedFormEntity(arrayList));
        // Send the request; try-with-resources closes the client and response so connections don't leak
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(httpPost)) {
            // The endpoint responds with JSON
            String jsonHtml = EntityUtils.toString(response.getEntity());
            Gson gson = new Gson();
            ResponseBean responseBean = gson.fromJson(jsonHtml, ResponseBean.class);
            String html = responseBean.getData();
            // Keep the new last_dateline from the response for the next request
            last_dataline = responseBean.getLast_dateline();
            System.err.println("Request " + (page - 1) + " done; last_dateline is now " + last_dataline);
            // Extract the article ids (the news hyperlinks) from the returned fragment
            ArrayList<String> articleIds = getarticleId(html);
            // Fetch the article pages for those ids
            ArrayList<Article> articles = getArticle(articleIds);
            // Save them to the database
            save2db(articles);
            System.out.println("Page " + (page - 1) + " saved!");
        }
    }
    /**
     * Gets the initial last_dateline from the homepage HTML.
     * @param indexHtml
     * @return
     */
    private static String getValueByIndexHtml(String indexHtml) {
        Document doc = Jsoup.parse(indexHtml);
        Elements select = doc.select("div[data-last_dateline]");
        return select.attr("data-last_dateline");
    }
    /**
     * Saves the articles to the database.
     * @param articles
     */
    private static void save2db(ArrayList<Article> articles) {
        for (Article article : articles) {
            articleDao.save(article);
        }
    }
    /**
     * Loads the article pages on multiple threads.
     * @param articleIds
     * @return
     * @throws Exception
     */
    public static ArrayList<Article> getArticle(ArrayList<String> articleIds) throws Exception {
        ArrayList<Article> listArticle = new ArrayList<Article>();
        for (String articleId : articleIds) {
            threadPool.execute(new Tread_More(articleId, listArticle));
        }
        // NOTE: this returns before the submitted tasks have finished, so listArticle
        // can still be (partly) empty when save2db runs, and the workers add to a
        // plain ArrayList concurrently. This is the part I most want to restructure.
        return listArticle;
    }
    /**
     * Loads a single article page and extracts the useful fields.
     * @throws Exception
     */
    public static void toLoadPage(ArrayList<Article> listArticle, String articleId) throws Exception {
        Article article = new Article();
        String url = startwith + articleId + endwith;
        // Fetch the article page
        String articleHtml = getOneHtml(url);
        // Parse the page and fill in the Article object
        Document doc = Jsoup.parse(articleHtml);
        article.setId(articleId);
        article.setUrl(url);
        String title = doc.select(".t-h1").get(0).text();
        article.setTitle(title);
        String author = doc.select("span[class=author-name]").get(0).text();
        article.setAuthor(author);
        String createTime = doc.select("span[class^=article-time]").get(0).text();
        article.setCreateTime(createTime);
        // Favourite count ("收藏" is the "favourites" label stripped from the text)
        String share = doc.select("span[class^=article-share]").get(0).text();
        share = share.replace("收藏", "");
        article.setSc(share);
        // Comment count ("评论" is the "comments" label)
        String pl = doc.select("span[class^=article-pl]").get(0).text();
        pl = pl.replace("评论", "");
        article.setPl(pl);
        String content = doc.select("div[class=article-content-wrap]").get(0).text();
        article.setContent(content);
        // Like count
        String zan = doc.select("div[class^=praise-box] span[class=num]").get(0).ownText();
        article.setZan(zan);
        listArticle.add(article);
    }
    /**
     * Extracts the article ids from an HTML fragment.
     * @param html
     * @return
     */
    private static ArrayList<String> getarticleId(String html) {
        if (html != null) {
            ArrayList<String> articleIds = new ArrayList<String>();
            Document doc = Jsoup.parse(html);
            Elements urls = doc.select("div[data-aid]");
            for (Element element : urls) {
                String articleId = element.attr("data-aid");
                articleIds.add(articleId);
            }
            return articleIds;
        }
        return null;
    }
    /**
     * Fetches a single HTML page.
     * @param url
     * @return the page HTML, or null if the response status is not 200
     * @throws Exception
     */
    private static String getOneHtml(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        setHeader(httpGet);
        String html = null;
        // try-with-resources closes the client and response so connections don't leak
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                html = EntityUtils.toString(entity);
            }
        }
        return html;
    }
    /**
     * Adds request headers so the request looks like it comes from a browser.
     * @param request
     */
    private static void setHeader(HttpRequestBase request) {
        request.addHeader("user-agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
    }
}
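
What I have in mind is a producer-consumer rework: the main thread keeps walking the list pages serially (it has to, since each request needs the last_dateline from the previous response), the article fetches run on a thread pool, and every parsed article goes onto a BlockingQueue that a single writer thread drains into the database. Below is a rough sketch of that shape. It reuses my Article and ArticleDao classes from above; to keep it short, the producer only reads the homepage (the paged POST from requestNextPage would feed the pool the same way), and fetchAndParse only fills in the id, url and title (the full Jsoup parsing from toLoadPage would slot in there). Is this the right direction?

package com.eayon.spider_huxiu;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class SpiderPipeline {
    private static final String UA =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";
    // Bounded queue between the fetch workers and the database writer; if the
    // database is the slow side, put() blocks and throttles the fetchers.
    private static final BlockingQueue<Article> articleQueue = new LinkedBlockingQueue<Article>(500);
    // Poison pill telling the writer thread that no more articles are coming.
    private static final Article POISON = new Article();

    public static void main(String[] args) throws Exception {
        final ArticleDao dao = new ArticleDao();
        ExecutorService fetchPool = Executors.newFixedThreadPool(10);

        // Single writer thread: the only thread that touches the DAO, so the
        // database side needs no synchronization at all.
        Thread writer = new Thread(() -> {
            try {
                while (true) {
                    Article a = articleQueue.take();
                    if (a == POISON) {
                        return;
                    }
                    dao.save(a);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        writer.start();

        // Producer: collect article ids and hand each one to the pool; the
        // pool's internal work queue buffers the ids for the workers.
        Document index = Jsoup.connect("https://www.huxiu.com/").userAgent(UA).get();
        for (Element div : index.select("div[data-aid]")) {
            final String id = div.attr("data-aid");
            fetchPool.execute(() -> {
                try {
                    articleQueue.put(fetchAndParse(id));
                } catch (Exception e) {
                    System.err.println("failed on article " + id + ": " + e);
                }
            });
        }

        // Orderly shutdown: stop accepting work, wait for the fetchers, then
        // let the writer drain the queue and stop.
        fetchPool.shutdown();
        fetchPool.awaitTermination(1, TimeUnit.HOURS);
        articleQueue.put(POISON);
        writer.join();
    }

    // Trimmed-down fetch step; the full parsing from toLoadPage (author, time,
    // shares, comments, content, likes) would go here.
    private static Article fetchAndParse(String id) throws Exception {
        String url = "https://www.huxiu.com/article/" + id + ".html";
        Document doc = Jsoup.connect(url).userAgent(UA).get();
        Article article = new Article();
        article.setId(id);
        article.setUrl(url);
        article.setTitle(doc.select(".t-h1").text());
        return article;
    }
}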