Java 实现爬虫（多线程）

最新推荐文章于 2024-07-04 16:04:39 发布

dreamss520

最新推荐文章于 2024-07-04 16:04:39 发布

阅读量3.7k

点赞数 1

分类专栏：java爬虫　文章标签：java、爬虫、多线程爬虫

本文链接：https://blog.csdn.net/dreamss520/article/details/79911229

版权

该博客介绍了如何使用Java实现单线程和多线程爬虫。通过Apache HttpClient库获取网页内容，使用Jsoup解析HTML，提取所需数据。多线程爬虫利用ExecutorService和CountDownLatch进行并发处理，提高爬取效率。

摘要由CSDN通过智能技术生成

1. 单线程爬虫

import java.util.ArrayList;
import java.util.List;

import javax.annotation.Resource;

import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;

import com.bm.entity.metabolize.Food;
import com.bm.service.download.FoodService;
import com.bm.util.HttpUtils;

@Controller
public class DownLoadController {

// Service injected under the resource name "foodService"; execute() passes the
// crawled data to its insertFood method.
@Resource(name = "foodService")
private FoodService foodService;

/**
 * Crawls the food-calorie listing pages and stores the parsed results.
 *
 * <p>Builds a fixed array of category listing URLs, fetches and parses them via
 * {@link #getURLParser(HttpClient, String[])}, and passes the result to
 * {@code foodService.insertFood}. The HTTP client created here is shut down
 * before the method returns, whether or not the crawl succeeded.
 *
 * @param args not used by this method
 * @throws Exception if fetching, parsing, or inserting the data fails
 */
//@RequestMapping("download")
@SuppressWarnings("deprecation")
public void execute(String[] args) throws Exception {
    // The addresses to crawl. These could instead be read from a database and
    // processed in a loop as a URL queue.
    String[] urls = {
            "https://www.cndzys.com/foodcalories/listbytype-%E8%9B%8B%E7%B1%BB.html",
            "https://www.cndzys.com/foodcalories/listbytype-%E8%B1%86%E7%B1%BB.html",
            "https://www.cndzys.com/foodcalories/listbytype-%E8%B0%B7%E7%B1%BB.html",
            "https://www.cndzys.com/foodcalories/listbytype-%E5%9D%9A%E6%9E%9C.html",
            "https://www.cndzys.com/foodcalories/listbytype-%E9%85%92%E9%A5%AE.html",
            "https://www.cndzys.com/foodcalories/listbytype-%E8%8D%AF%E9%A3%9F.html", };

    // Initialise an HttpClient for this crawl.
    HttpClient client = new DefaultHttpClient();
    try {
        // Crawl the pages and collect the parsed data.
        List<List<Food>> fooddatas = getURLParser(client, urls);
        // Insert the crawled data into the database.
        foodService.insertFood(fooddatas);
    } finally {
        // The client is local to this call; shut down its connection manager so
        // the connections it holds are released instead of leaking per invocation.
        client.getConnectionManager().shutdown();
    }
}

/**
 * Fetches every given URL in order and parses the pages that respond with
 * HTTP status 200.
 *
 * <p>Each page is requested through {@code HttpUtils.getRawHtml}. For a 200
 * response the body is read as UTF-8 text and handed to
 * {@link #getData(String)}; responses with any other status contribute no
 * entry. The response entity is consumed after each request in both cases.
 *
 * @param client HTTP client passed to {@code HttpUtils.getRawHtml} for each request
 * @param urls   addresses to fetch, visited in array order
 * @return one parsed list per URL whose response status was 200, in visit order
 * @throws Exception propagated from fetching, reading, or parsing a page
 */
public List<List<Food>> getURLParser(HttpClient client, String urls[])
        throws Exception {
    // Accumulates the parsed result of every successfully fetched page.
    List<List<Food>> parsedPages = new ArrayList<List<Food>>();
    for (String pageUrl : urls) {
        // Fetch the raw response for this address via the HttpUtils helper.
        HttpResponse pageResponse = HttpUtils.getRawHtml(client, pageUrl);
        int statusCode = pageResponse.getStatusLine().getStatusCode();
        if (statusCode == 200) {
            // Only a 200 response is read and parsed.
            String html = EntityUtils.toString(pageResponse.getEntity(), "utf-8");
            parsedPages.add(getData(html));
        }
        // Consume the entity whether or not the page was parsed.
        EntityUtils.consume(pageResponse.getEntity());
    }
    return parsedPages;
}

/**
* 食物能量页面解析获取目标数据
*
* @param html
* @return
* @throws Exception
*/
public List<Food> getData(String html) throws Exception {
// 获取的数据，存放在集合中
List<Food> data = new ArrayList<Food>();
// 采用Jsoup解析
Document doc = Jsoup.parse(html);
// 获取html标签中的内容
Elements elements = doc.select("ul[class=list1]").select("li");
for (Element ele : elements) {
String foodName = ele.select("li").select("h5").select("a").text();
String foodEnargy = ele.select("li").select("p").text();
String energyValue=foodEnargy.substring(3);
energyValue=energyValue.substring(0,energyValue.indexOf("大卡"));
// 对象封装数据
Food food = new Food();
food.setFoodName(foodName);
food.setFoodEnergy(foodEnargy);
food.setEnergyValue(energyValue);
// 将每一个对象的值，保存到List集合中
d