使用Jsoup和Selenium实现网页分析
环境搭建
用 IntelliJ IDEA 创建一个 Maven 项目。
在pom.xml配置文件中导入Jsoup和Selenium相关依赖。
例:
<!-- Selenium Java bindings: used by the sample below to drive Chrome (WebDriver / ChromeDriver). -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<!-- Jsoup: used by the sample below to parse the page source and select elements. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
下载与本机浏览器版本相对应的驱动（例如 Chrome 浏览器需要下载 ChromeDriver）。
[Chrome浏览器部分驱动下载](http://chromedriver.storage.googleapis.com/index.html)
代码示例
package org.example;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import java.util.Scanner;
/**
 * Demo: searches 51job for a keyword typed on the console, using Selenium to
 * drive Chrome and Jsoup to extract the job entries from the rendered page.
 */
public class TestPrseMy {
    /** Path to the chromedriver executable; adjust to the local installation. */
    private static final String CHROME_DRIVER_PATH =
            "D:\\MyExePrograms\\chromedriver_win32\\chromedriver.exe";

    /**
     * Search page opened first. The keyword entered by the user is then typed
     * into this page's search box, replacing the initial "java" query.
     */
    private static final String SEARCH_URL =
            "https://search.51job.com/list/090200,000000,0000,00,9,99,"
            + "java"
            + ",2,50.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
            + "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";

    /** How long to wait, in milliseconds, for the result list to render after clicking search. */
    private static final long RENDER_WAIT_MILLIS = 3000;

    /**
     * Reads a keyword from standard input, runs the search in Chrome and prints
     * one tab-separated line per job entry found on the result page.
     *
     * @param args unused
     * @throws Exception if the browser cannot be driven or the wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        System.out.println("输入招聘查询关键词:");
        String str = new Scanner(System.in).nextLine();

        System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);
        WebDriver driver = new ChromeDriver();
        try {
            driver.get(SEARCH_URL);

            // Replace the pre-filled keyword with the user's input and submit the search.
            WebElement keywordInput = ((ChromeDriver) driver).findElementByCssSelector("#keywordInput");
            keywordInput.clear();
            keywordInput.sendKeys(str);
            WebElement searchButton = ((ChromeDriver) driver).findElementByCssSelector("#search_btn");
            searchButton.click();

            // Fixed pause so the result list can render before the page source is read.
            Thread.sleep(RENDER_WAIT_MILLIS);

            Document document = Jsoup.parse(driver.getPageSource());
            Elements ets = document.select(".e");
            for (int i = 0; i < ets.size(); i++) {
                String title = ets.get(i).getElementsByClass("jname").eq(0).text();
                // Entries without a job title are skipped.
                if (title.isEmpty()) {
                    continue;
                }
                String url1 = ets.get(i).getElementsByTag("a").eq(0).attr("href");
                String time = ets.get(i).getElementsByClass("time").eq(0).text();
                String sal = ets.get(i).getElementsByClass("sal").eq(0).text();
                String address = ets.get(i).getElementsByClass("d").eq(0).text();
                String tag = ets.get(i).getElementsByClass("tags").eq(0).text();
                String cname = ets.get(i).getElementsByClass("cname").eq(0).text();

                System.out.println("职位\t" + title + "\t"
                        + "网址\t" + url1
                        + "\t" + time
                        + "\t薪资: " + sal
                        + "\t相关信息: " + address
                        + "\t福利待遇: " + tag
                        + "\t公司名\t" + cname);
            }
        } finally {
            // Always shut the browser and the chromedriver process down, even on failure.
            driver.quit();
        }
    }
}