package com.etoak.crawl.main;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSON;
import com.etoak.crawl.link.LinkFilter;
import com.etoak.crawl.link.Links;
import com.etoak.crawl.page.Page;
import com.etoak.crawl.page.PageParserTool;
import com.etoak.crawl.page.RequestAndResponseTool;
import com.etoak.crawl.util.FileTool;
/**
 * Small crawler driver.
 *
 * <p>For each category name it builds a search URL, pushes it onto the shared
 * {@link Links} queue, fetches the queued pages, prints their {@code img[src]}
 * elements, stores each fetched page through {@link FileTool#saveToLocal} and
 * queues one of the extracted image URLs for a follow-up visit.
 */
public class MyCrawler {

    /** Charset name used when URL-encoding and URL-decoding category names. */
    private static final String UTF_8 = "utf-8";

    /** Zero-based position, in the extracted image-URL list, of the single URL queued next. */
    private static final int NEXT_LINK_INDEX = 3;

    /**
     * Initializes the URL queue with the given seeds.
     *
     * @param seeds seed URLs; each one is handed to {@code Links.addUnvisitedUrlQueue}
     */
    private void initCrawlerWithSeeds(String[] seeds) {
        for (String seed : seeds) {
            Links.addUnvisitedUrlQueue(seed);
        }
    }

    /**
     * Runs the crawl loop.
     *
     * <p>The loop budget starts at one iteration and is reset after every fetched
     * page to the number of image URLs found on that page. Iterations for which
     * the queue yields no URL are skipped.
     *
     * @param seeds seed URLs to start from
     * @param name  URL-encoded (UTF-8) category name; its decoded form is passed
     *              to {@code FileTool.saveToLocal} together with the page
     * @throws IllegalArgumentException if {@code name} contains a malformed
     *                                  percent-escape
     */
    public void crawling(String[] seeds, String name) {
        // Initialize the URL queue.
        initCrawlerWithSeeds(seeds);

        int m = 1;
        for (int i = 0; i < m; i++) {
            // Take the first URL waiting to be visited.
            String visitUrl = (String) Links.removeHeadOfUnVisitedUrlQueue();
            if (visitUrl == null) {
                continue;
            }

            // Fetch the page for that URL.
            Page page = RequestAndResponseTool.sendRequstAndGetResponse(visitUrl);
            System.out.println(page);
            if (page == null) {
                // NOTE(review): presumably the fetch helper yields null when the request
                // fails - confirm. Nothing can be parsed or saved without a page.
                continue;
            }

            // Inspect the DOM: print every <img> element that carries a src attribute.
            Elements es = PageParserTool.select(page, "img[src]");
            if (!es.isEmpty()) {
                System.out.println("下面将打印所有img[src]标签: ");
                System.out.println(es);
            }

            // Collect the absolute image URLs. A null result is treated like an empty
            // list; the size used to be read before the null check.
            List<?> links = PageParserTool.getAttrs(page, "img[src]", "abs:src");
            int linkCount = (links == null) ? 0 : links.size();
            m = linkCount;

            FileTool.saveToLocal(page, decodeUtf8(name));

            // Only one URL per page is queued. Pages with too few image URLs are
            // skipped here instead of failing with IndexOutOfBoundsException.
            if (linkCount > NEXT_LINK_INDEX) {
                Links.addUnvisitedUrlQueue(links.get(NEXT_LINK_INDEX));
                System.out.println("新增爬取路径: " + links);
            }
        }
    }

    /**
     * Program entry point: crawls the search result page of every category in a
     * hard-coded list, skipping categories whose name contains "二手".
     *
     * <p>An earlier, disabled variant of this method read the category list from
     * the Redis key {@code JDClassCid3} with a {@code ShoppingGuideUtils} fallback.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String as = "['帆布鞋']";
        List<?> list = JSON.parseArray(as);
        System.out.println(list);
        if (list == null || list.isEmpty()) {
            return;
        }

        for (int i = 0; i < list.size(); i++) {
            Object entry = list.get(i);
            if (entry == null) {
                continue;
            }
            String name = entry.toString();

            // Skip second-hand goods. The name is not URL-encoded at this point, so it
            // is compared as-is; URL-decoding it would corrupt names containing '+' and
            // fail on names containing '%'.
            if (name.contains("二手")) {
                continue;
            }

            String encodedName = encodeUtf8(name);
            MyCrawler crawler = new MyCrawler();
            crawler.crawling(
                    new String[] {
                        "https://so.m.jd.com/ware/search.action?keyword=" + encodedName
                                + "&searchFrom=category&sf=1&as=1"
                    },
                    encodedName);

            // i is zero-based, so the number of categories handled so far is i + 1.
            System.out.println(
                    "总计" + list.size() + "个===当前分类" + name + "爬到" + (i + 1) + "个");
        }
    }

    /**
     * URL-encodes {@code text} as UTF-8.
     *
     * @param text the text to encode
     * @return the encoded text
     * @throws IllegalStateException if the runtime does not know the UTF-8 charset
     */
    private static String encodeUtf8(String text) {
        try {
            return URLEncoder.encode(text, UTF_8);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("Charset not supported: " + UTF_8, e);
        }
    }

    /**
     * URL-decodes {@code text} as UTF-8.
     *
     * @param text the URL-encoded text
     * @return the decoded text
     * @throws IllegalStateException if the runtime does not know the UTF-8 charset
     */
    private static String decodeUtf8(String text) {
        try {
            return URLDecoder.decode(text, UTF_8);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("Charset not supported: " + UTF_8, e);
        }
    }
}