使用 Java 爬虫 WebCollector + jsoup 抓取商品分类图标

package com.etoak.crawl.main;

import java.io.UnsupportedEncodingException;

import java.net.URLDecoder;

import java.net.URLEncoder;

import java.util.ArrayList;

import java.util.List;

import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;

import com.etoak.crawl.link.LinkFilter;

import com.etoak.crawl.link.Links;

import com.etoak.crawl.page.Page;

import com.etoak.crawl.page.PageParserTool;

import com.etoak.crawl.page.RequestAndResponseTool;

import com.etoak.crawl.util.FileTool;

public class MyCrawler {

/**

* 使用种子初始化 URL 队列

*

* @param seeds 种子 URL

* @return

*/

private void initCrawlerWithSeeds(String[] seeds) {

for (int i = 0; i < seeds.length; i++){

Links.addUnvisitedUrlQueue(seeds[i]);

}

}

/**

* 抓取过程

*

* @param seeds

* @return

*/

public void crawling(String[] seeds , String name) {

//初始化 URL 队列

initCrawlerWithSeeds(seeds);

//定义过滤器,提取以 http://www.baidu.com 开头的链接

LinkFilter filter = new LinkFilter() {

public boolean accept(String url) {

if (url.startsWith("https://www.jd.com"))

return true;

else

return false;

}

};

//循环条件:待抓取的链接不空且抓取的网页不多于 1000

int m = 1;

for (int i = 0; i < m; i++) {

//先从待访问的序列中取出第一个;

String visitUrl = (String) Links.removeHeadOfUnVisitedUrlQueue();

if (visitUrl == null){

continue;

}

//根据URL得到page;

Page page = RequestAndResponseTool.sendRequstAndGetResponse(visitUrl);

//对page进行处理: 访问DOM的某个标签

System.out.println(page);

Elements es = PageParserTool.select(page,"img[src]");

if(!es.isEmpty()){

System.out.println("下面将打印所有img[src]标签: ");

System.out.println(es);

}

//得到超链接

ArrayList  links = PageParserTool.getAttrs(page,"img[src]","abs:src");

m = links.size();

try {

FileTool.saveToLocal(page,URLDecoder.decode(name, "utf-8"));

} catch (UnsupportedEncodingException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

if(links!=null&&links.size()>0){

Links.addUnvisitedUrlQueue(links.get(3));

System.out.println("新增爬取路径: " + links);

}

}

}

//main 方法入口

public static void main(String[] args) {

String as ="['帆布鞋']";

List list =JSON.parseArray(as);

/*JSONObject json = null;

json = RedisUtils.getObject("JDClassCid3");

if(json!=null){

list = (List) json.get("list");

}else{

list = ShoppingGuideUtils.getClasss11();

Map map = new HashMap();

map.put("list", list);

RedisUtils.setObjectMap("JDClassCid3", map, RedisUtils.EXRP_DAY);

}*/

System.out.println(list);

if(list!=null&&list.size()>0){

String name= "";

for (int i = 0; i < list.size(); i++) {

MyCrawler crawler = new MyCrawler();

name = list.get(i).toString();

try {

if(URLDecoder.decode(name, "utf-8").contains("二手")){//去除二手物品

continue;

}

} catch (UnsupportedEncodingException e2) {

// TODO Auto-generated catch block

e2.printStackTrace();

}

try {

name =URLEncoder.encode(name, "utf-8");

} catch (UnsupportedEncodingException e1) {

// TODO Auto-generated catch block

e1.printStackTrace();

}

crawler.crawling(new String[]{"https://so.m.jd.com/ware/search.action?keyword="+name+"&searchFrom=category&sf=1&as=1"},name);

try {

System.out.println("总计"+list.size()+"个===当前分类"+URLDecoder.decode(name, "utf-8")+"爬到"+i+"个");

} catch (UnsupportedEncodingException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

}

}

/*MyCrawler crawler = new MyCrawler();

String name= "ETC";

try {

if(URLDecoder.decode(name, "utf-8").contains("二手")){

System.out.println("jies");

}

} catch (UnsupportedEncodingException e2) {

// TODO Auto-generated catch block

e2.printStackTrace();

}

try {

name =URLEncoder.encode(name, "utf-8");

} catch (UnsupportedEncodingException e1) {

// TODO Auto-generated catch block

e1.printStackTrace();

}

crawler.crawling(new String[]{"https://so.m.jd.com/ware/search.action?keyword="+name+"&searchFrom=category&sf=1&as=1"},name); */

}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值