先写一个简单的爬虫项目
package com.kgc;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Minimal jsoup crawler demo.
 *
 * <p>Downloads one Baidu search-result page and, for every element matched by
 * the CSS selector {@code h3.t a}, prints the selector label, the element's
 * {@code href} attribute and its text to standard output.
 */
public class JsoupTest {

    /** Address of the search-result page that is downloaded. */
    private static final String SEARCH_URL = "https://www.baidu.com/s?ie=utf-8&f=3&rsv_bp=1&tn=monline_3_dg&wd=jsoup%E8%A7%A3%E6%9E%90html&oq=httpclient4.4.9&rsv_pq=d7f6243e00006886&rsv_t=1c21FPkhF%2BgQg6I4fQ2ZuApWm%2B5jszdGTEjEmVgQAeQV1%2FQcJwcpl1e9fVIk6IexhrHV&rqlang=cn&rsv_enter=1&inputT=5488&rsv_sug3=34&rsv_sug1=35&rsv_sug7=100&rsv_sug2=1&prefixsug=jsoup&rsp=1&rsv_sug4=6912&rsv_sug=1";

    /** CSS selector used to pick the anchors out of the downloaded page. */
    private static final String LINK_SELECTOR = "h3.t a";

    /**
     * Fetches the page and prints one entry per matched anchor.
     *
     * <p>An {@link IOException} raised while fetching is reported with
     * {@code printStackTrace()} and nothing else is printed.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final Elements links;
        try {
            // Fetch and select in one chain; only the fetch can throw IOException.
            links = Jsoup.connect(SEARCH_URL).get().select(LINK_SELECTOR);
        } catch (IOException fetchError) {
            fetchError.printStackTrace();
            return;
        }

        for (int idx = 0; idx < links.size(); idx++) {
            Element link = links.get(idx);
            String target = link.attr("href");
            String caption = link.text();
            System.out.println("h3.t a:\n" + target + "\n" + caption);
        }
    }
}
用多线程爬取智联招聘的信息
package cn.itrip.test;
import java.io.IOException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import cn.kgc.beans.Recruit;
public class JsoupTest {
/**
 * Program entry point; its only action is to call {@code zhiLianTest()}.
 *
 * @param args command-line arguments (not read by this method)
 */
public static void main(String[] args) {
zhiLianTest();// crawl the Zhaopin recruitment site
}
/**
* 智联招聘
*/
public static void zhiLianTest(){
String url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&kw=java&isadv=0&sg=172c6d7dcafe4755ad7cec36bd1d3683&p=temp";
//创建一个定长线程池,可控制线程最大并发数,超过的线程会在队列中等待。
ExecutorService fixedThreadPool = Executors.newFixedThreadPool(10);
for(int pageii = 0;pageii<50;pageii++){//爬几页
final String listUrl = url.replaceAll("temp", (pageii+1) + "");
final int page = pageii + 1;
fixedThreadPool.execute(new Runnable(){
@Override
public void run() {
// TODO Auto-generated method stub
//线程内代码
try {
Document document = Jsoup.connect(listUrl).get();
String selector = "table[class=newlist]";
Elements elements = document.select(selector);
for(int i = 1;i<elements.size();i++){
Element e = elements.get(i);
//String ss = e.text();
String zwmc = e.select("td[class=zwmc]").text();//职位名称
String gsmc = e.select("td[class=gsmc]").text();//公司名称
String zwyx = e.s