首先建立 Maven 项目,并在 pom.xml 中引入所需的依赖(jar 包)。
pom.xml 内容如下:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.text.jsoup</groupId>
  <artifactId>com.text.jsoup</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <dependencies>
    <!-- HTTP client used to download the pages. -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>
    <!-- HTML parser: https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.11.3</version>
    </dependency>
  </dependencies>

  <!-- Everything below is only needed for packaging. -->
  <build>
    <finalName>import_tool</finalName>
    <plugins>
      <!-- Compile as Java 8 with UTF-8 sources. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
      <!-- Copy the dependency jars into target/lib during the install phase. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-dependency-plugin</artifactId>
        <executions>
          <execution>
            <id>copy</id>
            <phase>install</phase>
            <goals>
              <goal>copy-dependencies</goal>
            </goals>
            <configuration>
              <outputDirectory>${project.build.directory}/lib</outputDirectory>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <!-- Bundle the project and all of its dependencies into one executable jar. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>2.5.5</version>
        <configuration>
          <archive>
            <manifest>
              <mainClass>com.text.jsoup.ThreadTest</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <!-- Also used as the suffix of the generated jar's file name. -->
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <!-- Bind the assembly to the package phase; without an execution the
               bundled jar is only built by invoking assembly:single by hand. -->
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
爬取过程
这个业务是按 id 依次抓取某个网站的页面,取出页面中指定 class 的第一个元素的文本,然后保存到 date.txt 中。这里我使用的是多线程。
package com.text.jsoup;
import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls member pages by consecutive numeric id and appends one {@code id:text} line per page to
 * the output file.
 *
 * <p>One instance is shared by all worker threads. Fetching, parsing, saving and advancing
 * {@link #i} all happen while holding the instance monitor, so at most one request is in flight
 * at a time and lines are appended in strictly increasing id order. {@link FileUtil#getI()}
 * depends on that order: it resumes from the id found on the last line of the file.
 */
public class ThreadTest implements Runnable {
    /** Highest id that is crawled (inclusive). */
    private static final int MAX_ID = 30000000;
    /** Number of worker threads started by {@link #main(String[])}. */
    private static final int THREAD_COUNT = 100;
    /** Pause before every request, in milliseconds. */
    private static final long REQUEST_PAUSE_MS = 10L;
    /** Extra pause after a failed request before the same id is tried again, in milliseconds. */
    private static final long RETRY_PAUSE_MS = 1000L;
    private static final String BASE_URL = "http://www.okooo.com/member/";
    private static final String TARGET_CLASS = "xxx";
    private static final String OUTPUT_FILE = "/home/date.txt";

    /** Next id to crawl; only read or written while holding this instance's monitor. */
    int i = FileUtil.getI();

    /**
     * Starts the worker threads, all sharing one {@code ThreadTest} so that they share the id
     * counter and the monitor.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ThreadTest t = new ThreadTest();
        for (int j = 0; j < THREAD_COUNT; j++) {
            Thread t1 = new Thread(t);
            t1.setName("线程" + j);
            t1.start();
        }
    }

    /**
     * Worker loop: take a turn, crawl the current id, then park until another worker hands the
     * turn back.
     *
     * <p>The loop ends when the id passes {@link #MAX_ID} or when the thread is interrupted. A
     * failed request leaves the id unchanged so that it is retried on a later turn instead of
     * being skipped.
     */
    @Override
    public void run() {
        while (true) {
            synchronized (this) {
                // Wake one parked worker so it can take the next turn once this one parks.
                notify();
                if (!pause(REQUEST_PAUSE_MS) || i > MAX_ID) {
                    // Release every parked worker so each of them can observe the end
                    // condition and finish too, instead of waiting forever.
                    notifyAll();
                    return;
                }
                try {
                    String text = fetchText(i);
                    FileUtil.saveTxt(OUTPUT_FILE, i + ":" + text);
                    i++;
                } catch (IOException e) {
                    // Network or read failure: keep the id so that it is retried.
                    e.printStackTrace();
                    if (!pause(RETRY_PAUSE_MS)) {
                        notifyAll();
                        return;
                    }
                }
                try {
                    wait();
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    notifyAll();
                    return;
                }
            }
        }
    }

    /**
     * Downloads the page for {@code id} and extracts the text of the first element that carries
     * the target CSS class.
     *
     * @param id member id appended to the base URL
     * @return the element's text, or an empty string when the response has no body or the page
     *     contains no matching element
     * @throws IOException if the request fails or the body cannot be read
     */
    private static String fetchText(int id) throws IOException {
        // try-with-resources closes the response first and then the client, also on failure.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(new HttpGet(BASE_URL + id))) {
            HttpEntity httpEntity = response.getEntity();
            if (httpEntity == null) {
                return "";
            }
            Document page = Jsoup.parse(EntityUtils.toString(httpEntity, "utf-8"));
            Elements matches = page.getElementsByClass(TARGET_CLASS);
            if (matches.isEmpty()) {
                return "";
            }
            Element first = matches.get(0);
            return first.text();
        }
    }

    /**
     * Sleeps for the given time.
     *
     * @param millis time to sleep, in milliseconds
     * @return {@code false} if the thread was interrupted (the interrupt flag is restored),
     *     {@code true} otherwise
     */
    private static boolean pause(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }
}
部分工具类
package com.text.jsoup;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
/**
 * File helpers for the crawler: read the last line of a text file, append a line, and work out
 * which id to resume from.
 */
public class FileUtil {
    /**
     * Reads the last line of a file.
     *
     * <p>The returned text starts right after the last newline that is not the final byte of the
     * file and runs to the end of the file, so a trailing line terminator is kept.
     *
     * @param file file to read
     * @param charset name of the charset used to decode the bytes, or {@code null} for the
     *     platform default
     * @return the last line; an empty string for an empty file; {@code null} if the file does not
     *     exist, is a directory or cannot be read
     * @throws IOException if reading fails or {@code charset} is not supported
     */
    public static String readLastLine(File file, String charset) throws IOException {
        if (!file.exists() || file.isDirectory() || !file.canRead()) {
            return null;
        }
        try (RandomAccessFile raf = new RandomAccessFile(file, "r")) {
            long len = raf.length();
            if (len == 0L) {
                return "";
            }
            // Scan backwards for the newline that precedes the last line. The final byte is
            // skipped on purpose so the file's own trailing terminator is not taken for it.
            long start = 0L;
            for (long pos = len - 2; pos >= 0; pos--) {
                raf.seek(pos);
                if (raf.readByte() == '\n') {
                    start = pos + 1;
                    break;
                }
            }
            // Size the buffer from the real start of the line; an oversized buffer would leave
            // a trailing NUL character in the decoded string.
            byte[] bytes = new byte[(int) (len - start)];
            raf.seek(start);
            raf.readFully(bytes);
            if (charset == null) {
                return new String(bytes);
            }
            return new String(bytes, charset);
        } catch (FileNotFoundException ignored) {
            // The file vanished between the checks above and the open: report it as missing.
            return null;
        }
    }

    /**
     * Appends {@code content} followed by {@code "\r\n"} to a file, creating the file if needed.
     * The text is written as UTF-8, the charset {@link #getI()} reads it back with. I/O errors
     * are printed and otherwise ignored.
     *
     * @param fileName path of the file to append to
     * @param content text of the line, without a line terminator
     */
    public static void saveTxt(String fileName, String content) {
        // Open in read/write mode; try-with-resources closes the file even if the write fails.
        try (RandomAccessFile randomFile = new RandomAccessFile(fileName, "rw")) {
            // Move the write position to the end of the file.
            randomFile.seek(randomFile.length());
            randomFile.write((content + "\r\n").getBytes("UTF-8"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Works out the id to continue from, so that an interrupted run (for example after a network
     * failure) carries on after the last line that was saved.
     *
     * @return the id before the first {@code ':'} on the last line of {@code /home/date.txt}
     *     plus one, or {@code 0} when the file is missing, unreadable or empty
     * @throws NumberFormatException if the last line does not start with an integer id
     */
    public static int getI() {
        String readLastLine = null;
        try {
            readLastLine = FileUtil.readLastLine(new File("/home/date.txt"), "utf-8");
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (readLastLine == null || readLastLine.isEmpty()) {
            return 0;
        }
        // Take the prefix by index rather than split(":"), which yields an empty array for a
        // line that consists only of separators.
        int colon = readLastLine.indexOf(':');
        String idText = colon < 0 ? readLastLine : readLastLine.substring(0, colon);
        return Integer.parseInt(idText.trim()) + 1;
    }
}
该项目可以直接打包成可执行 jar,然后运行即可。效率方面一般,还望路过的大神指点。