package com.zg.controller;
import com.zg.jsoup.JsoupList;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.tomcat.util.http.fileupload.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.FutureTask;
public class ZgJsoupDemo {
/**
 * Crawls the school list pages on college.gaokao.com and downloads each school's image.
 *
 * <p>For every list page {@code p0} through {@code p106}, follows each {@code dl>dt>a}
 * link to the linked page, reads the image {@code src} found under the
 * {@code college_msg} / {@code left} elements, and saves that image to
 * {@code D:/jsoupimg/jsoupSchoolName/} under a generated name. The generated name is
 * also written to {@code D:/jsoupimg/uuidName.txt}.
 *
 * <p>Anchors without an {@code href} and blocks without an image {@code src} are skipped.
 * Any {@link IOException} aborts the crawl and its stack trace is printed.
 *
 * @param args unused
 */
public static void main1(String[] args){
    File file = new File("D:/jsoupimg/uuidName.txt");
    // One HTTP client serves every download and is closed when the crawl ends.
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        // NOTE(review): pages are requested starting at p0 - confirm the site's
        // numbering does not actually start at p1.
        for (int i = 0; i < 107; i++) {
            String url = "http://college.gaokao.com/schlist/p" + i + "/";
            Document document = Jsoup.connect(url).get();
            Elements dl1 = document.select("dl>dt>a");
            for (Element element2 : dl1) {
                String href = element2.select("a[href]").attr("href");
                if (href.isEmpty()) {
                    // Jsoup.connect rejects an empty URL with an unchecked exception.
                    continue;
                }
                Document doc = Jsoup.connect(href).get();
                for (Element element3 : doc.getElementsByClass("college_msg")) {
                    String imgs = element3.getElementsByClass("left").select("img[src]").attr("src");
                    if (imgs.isEmpty()) {
                        // No image in this block; nothing to download.
                        continue;
                    }
                    downloadSchoolImage(httpClient, imgs, file);
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

/**
 * Downloads one image and records the file name generated for it.
 *
 * <p>The name is {@code "schoolImg"} + a UUID without dashes + everything from the last
 * {@code '.'} of {@code imgs} (nothing when {@code imgs} contains no dot).
 *
 * @param httpClient client used to fetch the image; not closed here
 * @param imgs       URL of the image to fetch
 * @param nameFile   file that receives the generated image name
 * @throws IOException if the request, the download or either file write fails
 */
private static void downloadSchoolImage(CloseableHttpClient httpClient, String imgs, File nameFile)
        throws IOException {
    int dot = imgs.lastIndexOf('.');
    // Without this check substring(-1) would throw and kill the whole crawl.
    String extension = dot >= 0 ? imgs.substring(dot) : "";
    String imgName = "schoolImg" + UUID.randomUUID().toString().replaceAll("-", "") + extension;
    try (CloseableHttpResponse httpResponse = httpClient.execute(new HttpGet(imgs))) {
        HttpEntity entity = httpResponse.getEntity();
        if (entity == null) {
            // Response carried no body, so there is no image to store.
            return;
        }
        System.out.println(imgName);
        // NOTE(review): the name file is truncated on every write, so it ends up holding
        // only the most recent image name - confirm whether appending was intended.
        try (InputStream is = entity.getContent();
             FileOutputStream out1 = new FileOutputStream("D:/jsoupimg/jsoupSchoolName/" + imgName);
             FileOutputStream out = new FileOutputStream(nameFile)) {
            out.write(imgName.getBytes());
            IOUtils.copy(is, out1);
        }
    }
}
/**
 * Scrapes one Amazon search-results page (keyword "cigarette") into {@link JsoupList} items.
 *
 * <p>Walks {@code s-main-slot} -> {@code sg-col-4-of-12} -> {@code a-spacing-medium} and,
 * for each innermost element, records the image {@code src}, the title text and the price
 * text, then prints a separator line. The collected list is local to this method and is
 * not returned or printed.
 *
 * <p>An {@link IOException} while fetching the page is reported with a stack trace.
 *
 * @param args unused
 */
public static void main2(String[] args) {
    List<JsoupList> jsoupLists = new ArrayList<>();
    String http = "https://www.amazon.com/s?k=cigarette&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_1";
    try {
        Document document = Jsoup.connect(http).get();
        for (Element slot : document.getElementsByClass("s-main-slot")) {
            for (Element column : slot.getElementsByClass("sg-col-4-of-12")) {
                for (Element card : column.getElementsByClass("a-spacing-medium")) {
                    // One object per result card holds the scraped fields.
                    JsoupList jsoupList = new JsoupList();
                    // Cover image URL.
                    jsoupList.setImage(card.getElementsByTag("img").attr("src").trim());
                    // Product description / title.
                    jsoupList.setTitle(card.getElementsByClass("a-size-base-plus").text());
                    // Product price (US dollars).
                    jsoupList.setPrice(card.getElementsByClass("a-offscreen").text());
                    // Other fields present on the card but not captured:
                    //   a-color-information - "Pack of" text, only on some items
                    //   a-icon-alt          - rating, missing on some items
                    System.out.println("----------------------------------------------------------------------------");
                    jsoupLists.add(jsoupList);
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Fetches the first Rakuten search-results page for "Ploom tech" and prints the parsed
 * document followed by a separator line. Any exception is reported with a stack trace.
 *
 * @param args unused
 */
public static void main8(String[] args) {
    final String searchUrl = "https://search.rakuten.co.jp/search/mall/Ploom+tech/?p=1";
    try {
        // Print the whole parsed page as-is.
        System.out.println(Jsoup.connect(searchUrl).get());
        System.out.println("----------------------------------");
    } catch (Exception fetchFailure) {
        fetchFailure.printStackTrace();
    }
}
/**
 * Fetches the public Telegram preview page of the BandwagonHostNews channel and prints
 * the parsed document followed by a separator line. Any exception is reported with a
 * stack trace.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String channelUrl = "https://t.me/s/BandwagonHostNews";
    try {
        // Print the whole parsed page as-is.
        System.out.println(Jsoup.connect(channelUrl).get());
        System.out.println("----------------------------------");
    } catch (Exception fetchFailure) {
        fetchFailure.printStackTrace();
    }
    // 54.254.161.51:63894 (address noted by the original author; not used by this code)
}
}
// Multithreading: asynchronous return values
// First published 2022-02-08 18:40:47