java写网页_java编写网站数据抓取

package com.zyt.creenshot.service.crawlerData.impl;

import com.zyt.creenshot.entity.CarBaseData;
import com.zyt.creenshot.mapper.CarBaseDataMapper;
import com.zyt.creenshot.service.crawlerData.ICrawlerData;
import com.zyt.creenshot.util.DocumentHelper;
import com.zyt.creenshot.util.HttpConnectionManager;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;

/**
 * Crawls a car-listing page, extracts one {@link CarBaseData} row per listed car and
 * hands the rows to {@link CarBaseDataMapper#insertBatch} in batches.
 *
 * @author zhaiyutao
 * @version v1.0
 * @since 2019/7/8 17:48
 */
@Component
@Slf4j
public class CrawlerDataImpl implements ICrawlerData {

    /** Buffered rows are flushed to the mapper as soon as the buffer reaches this size. */
    private static final int BATCH_SIZE = 500;

    @Autowired
    private HttpConnectionManager connectionManager;

    // Injected with required = false, so this field may be null when no mapper bean exists.
    @Autowired(required = false)
    private CarBaseDataMapper carBaseDataMapper;

    /**
     * Downloads the listing page, walks brand -> sub-brand -> car-type sections and
     * persists every car found. Returns without doing anything when the mapper is
     * unavailable or when the page could not be fetched.
     */
    @Override
    public void crawlerCarBaseData() {
        if (carBaseDataMapper == null) {
            // Without a mapper every insertBatch call would fail with a NullPointerException.
            log.error("CarBaseDataMapper is not available, skipping crawl");
            return;
        }

        // Placeholder left by the original author; must be replaced with the real page URL.
        String url = "***********要爬取的网址*************";

        // NOTE(review): "GBK" is presumably the page charset; confirm against DocumentHelper.
        String resultHtml = DocumentHelper.getProxyHttp(url, null, 0, "GBK", connectionManager);
        if (StringUtils.isEmpty(resultHtml)) {
            log.error("没有爬到网站数据");
            // Nothing to parse; previously execution fell through into Jsoup.parse.
            return;
        }

        Document html = Jsoup.parse(resultHtml);

        // One "braRow" div per top-level brand.
        Elements brandList = html.select("div[class=braRow]");
        if (brandList.isEmpty()) {
            return;
        }

        List<CarBaseData> listCar = new ArrayList<>();
        for (Element brand : brandList) {
            // Top-level brand name and logo.
            Elements brandBig = brand.select("div[class=braRow-icon]");
            // NOTE(review): the "?" -> "·" replacement presumably repairs a middle dot
            // lost during charset decoding; confirm against the live page.
            String brandName = brandBig.select("p").text().replace("?", "·");
            // The logo URL is read from the "#src" attribute rather than "src".
            // NOTE(review): presumably a lazy-loading attribute; verify against the markup.
            String brandPic = brandBig.select("img[src]").attr("#src");

            for (Element subBrand : brand.select("div[class=modA noBorder]")) {
                // Sub-brand name.
                String brandSmallName = subBrand.select("div[class=thA]").select("a[href]").text();

                // Car-type sections appear under two different class combinations.
                for (Element section : subBrand.select("div[class=tbA ]")) {
                    dealCarData(listCar, brandName, brandPic, brandSmallName, section);
                }
                for (Element section : subBrand.select("div[class=tbA mt10 noBorder]")) {
                    dealCarData(listCar, brandName, brandPic, brandSmallName, section);
                }
            }
        }

        // Flush whatever is left after the last full batch.
        if (CollectionUtils.isNotEmpty(listCar)) {
            carBaseDataMapper.insertBatch(listCar);
        }
    }

    /**
     * Extracts every car listed in one car-type section and appends it to the buffer,
     * flushing the buffer to the mapper each time it reaches {@link #BATCH_SIZE}.
     *
     * @param listCar        shared buffer of rows waiting to be inserted; cleared on flush
     * @param brandName      top-level brand name copied onto each row
     * @param brandPic       brand logo URL copied onto each row
     * @param brandSmallName sub-brand name copied onto each row
     * @param in             the section element containing the car-type title and the car items
     */
    private void dealCarData(List<CarBaseData> listCar, String brandName, String brandPic,
                             String brandSmallName, Element in) {
        // Keep the text before the first "(" using a literal match. The previous
        // split("(") treated "(" as a regex and threw PatternSyntaxException.
        String carTypeName = StringUtils.substringBefore(in.select("p[class=stit]").text(), "(");

        for (Element item : in.select("li")) {
            Element title = item.select("p[class=tit]").first();
            if (title == null) {
                // No title paragraph means no link to derive the car id from; skip the item.
                continue;
            }

            Elements carLink = title.select("a[href]");
            String href = carLink.attr("href");
            if (StringUtils.isEmpty(href)) {
                continue;
            }

            // A missing price paragraph is stored as empty text instead of aborting the crawl.
            Element price = item.select("p[class=price]").first();
            String priceStr = price == null ? "" : price.text();

            CarBaseData carBaseData = new CarBaseData();
            // The id is the href without its first and last character.
            carBaseData.setCarId(StringUtils.substring(href, 1, href.length() - 1));
            carBaseData.setCarName(carLink.attr("title"));
            carBaseData.setBrandName(brandName);
            carBaseData.setBrandPic(brandPic);
            carBaseData.setSubBrandName(brandSmallName);
            carBaseData.setCarType(carTypeName);
            carBaseData.setCarPrice(priceStr);
            listCar.add(carBaseData);

            if (listCar.size() >= BATCH_SIZE) {
                carBaseDataMapper.insertBatch(listCar);
                listCar.clear();
            }
        }
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值