Java 实现爬虫常用的第三方包:
- HttpClient:用于发送 HTTP 请求
- jsoup:用于解析 HTML DOM
- Rhino:用于执行 JavaScript
- Jackson:用于处理 JSON
pom.xml摘录
<dependencies>
    <!-- simulate web browser -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.7</version>
    </dependency>

    <!-- parse DOM -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>

    <!-- jackson -->
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.9.8</version>
    </dependency>

    <!-- parse javascript -->
    <dependency>
        <groupId>org.mozilla</groupId>
        <artifactId>rhino</artifactId>
        <version>1.7.10</version>
    </dependency>

    <!-- simulate client action -->
    <dependency>
        <groupId>net.sourceforge.htmlunit</groupId>
        <artifactId>htmlunit</artifactId>
        <version>2.33</version>
    </dependency>

    <!-- upgrade junit to junit4 (default is v3.8.1) -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>

    <!-- log -->
    <!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-api -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.25</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.25</version>
        <!-- <scope>test</scope> -->
    </dependency>
</dependencies>
启用log4j基本配置,在main方法中加入语句:
public static void main(String[] args) { //启用log4j基本配置 //不想去写配置文件,可以用Java基本配置 BasicConfigurator.configure(); //... }