目录
因为版权问题,不能包含某些词,发了好多次都不通过。
所以就简单发了一下源代码,以下代码都是笔者亲自编写测试过的,有问题的可以留言。
1.网页源代码
URL可以直接替换,会直接爬取相应网站的网页源代码
import java.io.*;
import java.net.URL;
public class RetrivePage {
    /**
     * Downloads the page at {@code url}, saves it to {@code filepath},
     * and returns the page content as a String.
     *
     * @param url      page URL to fetch (any protocol supported by java.net.URL)
     * @param filepath destination file for the downloaded content
     * @return the downloaded page content (UTF-8 decoded)
     * @throws IOException if the URL cannot be read or the file cannot be written
     */
    public static String downloadPage(String url, String filepath) throws IOException {
        URL pageURL = new URL(url);
        StringBuilder pageBuffer = new StringBuilder();
        // try-with-resources guarantees the stream is closed even on failure
        // (the original never closed the reader).
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(pageURL.openStream(), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Keep line terminators: dropping them (as the original did)
                // corrupts pre-formatted text and JavaScript with // comments.
                pageBuffer.append(line).append('\n');
            }
        }
        String html = pageBuffer.toString();

        File file = new File(filepath);
        // Original bug: it called file.mkdirs() only when the FILE already
        // existed, which never created a missing directory and tried to turn
        // the file itself into one. Create the missing PARENT directory instead.
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        try (OutputStream os = new FileOutputStream(file)) {
            // Write with the same encoding we read with, not the platform default.
            os.write(html.getBytes("UTF-8"));
        }
        return html;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(RetrivePage.downloadPage("https://xxxxxxxxxxxxxx/", "D://crawlerdata/page.txt"));
    }
}
2.网站的图片
/**
 * Downloads the image at {@code url} into D:\crawlerdata\图片文件\,
 * renaming it to a random UUID while keeping the original file extension.
 * Network/IO failures are logged, not thrown.
 */
public static void dlImg(String url){
    HttpClient httpClient = HttpClients.createDefault();
    // Repair URLs missing the second slash after the scheme,
    // e.g. "https:/host/x" -> "https://host/x" (insert at index 7).
    // NOTE(review): this also fires for plain "http://" URLs and would
    // corrupt them to "http:///..." — confirm only https URLs reach here.
    if (!url.matches("https://(.*)")) {
        StringBuffer sb = new StringBuffer(url);
        sb.insert(7, "/");
        url = sb.toString();
    }
    HttpGet httpGet = new HttpGet(url);
    CloseableHttpResponse httpResponse = null;
    try {
        // Original bug: execute() was wrapped in its own try/catch that only
        // printed the error, then fell through to getStatusLine() on a null
        // response -> NullPointerException. Let the outer catch handle it.
        httpResponse = (CloseableHttpResponse) httpClient.execute(httpGet);
        if (httpResponse.getStatusLine().getStatusCode() == 200
                && httpResponse.getEntity() != null) {
            // Derive the extension from the URL; guard against URLs with no dot
            // (the original threw StringIndexOutOfBoundsException on those).
            int dot = url.lastIndexOf('.');
            String exName = dot >= 0 ? url.substring(dot) : "";
            // Rename the image to a collision-free random name.
            String picName = UUID.randomUUID().toString() + exName;
            // try-with-resources: the original leaked the output stream.
            try (OutputStream outputStream =
                    new FileOutputStream(new File("D:\\crawlerdata\\图片文件\\" + picName))) {
                httpResponse.getEntity().writeTo(outputStream);
            }
            System.out.println(picName);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // httpClient itself is deliberately left open: its connection pool
        // manages the underlying connections.
        if (httpResponse != null) {
            try {
                httpResponse.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
3.全站最佳小说
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
public class Find {
    /**
     * Fetches {@code url}, extracts the featured list (CSS selector .QZZJ),
     * prints each section heading and linked title, and saves every linked
     * page to D:/crawlerdata/html文件/ as an .html file.
     *
     * @param url the listing page to scrape
     * @throws Exception on any network or I/O failure
     */
    public static void Fbest (String url) throws Exception {
        HttpClient client = HttpClients.createDefault();
        String best = ".QZZJ";
        HttpGet httpGet = new HttpGet(url);
        HttpResponse response = client.execute(httpGet);
        HttpEntity entity = response.getEntity();
        // Decode the page explicitly as UTF-8 rather than the platform default.
        String web = EntityUtils.toString(entity, "UTF-8");
        Document document = Jsoup.parse(web);
        Elements links = document.select(best);
        for (Element link : links) {
            // Re-parse each matched fragment so the inner selectors run on it alone.
            Document doc = Jsoup.parse(link.html());
            System.out.println(doc.select("h2").text() + ": ");
            for (Element link1 : doc.select("dl dd h3 a")) {
                // hrefs are protocol-relative ("//host/..."); prepend the scheme.
                String href = link1.attr("href");
                System.out.println("https:" + href + ": " + link1.text());
                URL pageUrl = new URL("https:" + href);
                HttpURLConnection connection = (HttpURLConnection) pageUrl.openConnection();
                // Original bug: setDoOutput(true) silently turns the request
                // into a POST on HttpURLConnection; this is a plain GET.
                File dest = new File("D://crawlerdata/html文件/" + link1.text() + ".html");
                dest.createNewFile();
                // try-with-resources: the original never closed (or even
                // flushed) the writer, so buffered output could be lost,
                // and it leaked the reader.
                try (BufferedReader br = new BufferedReader(
                             new InputStreamReader(connection.getInputStream(), "utf8"));
                     OutputStreamWriter out = new OutputStreamWriter(
                             new FileOutputStream(dest), "utf8")) {
                    String str;
                    while ((str = br.readLine()) != null) {
                        out.write(str);
                        // Re-append the newline stripped by readLine().
                        out.write('\n');
                    }
                }
            }
        }
    }
    public static void main(String[] args) throws Exception {
        // Fbest is static; no instance needed.
        Find.Fbest("https://xxxxxxxxxxxxxxx/");
    }
}
4.小说内容
实现了自动翻页,下载每一章节的内容
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
public class Downloadtxt {
    // Single base directory for all output. Original bug: the book folder was
    // created under "D:\crawlerdata\" but chapters were written to
    // "D:\学习资料\搜索引擎\crawlerdata\", a folder that was never created,
    // so every chapter write failed. One constant keeps both paths in sync.
    private static final String BASE_DIR = "D:\\crawlerdata\\";

    /**
     * Fetches {@code url} and parses it into a jsoup Document (UTF-8).
     *
     * @param url page to fetch
     * @return parsed document
     * @throws IOException on network failure
     */
    public static Document Ddoc(String url) throws IOException {
        HttpClient client = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        HttpResponse response = client.execute(httpGet);
        HttpEntity entity = response.getEntity();
        // Decode explicitly as UTF-8 rather than the platform default.
        String web = EntityUtils.toString(entity, "UTF-8");
        return Jsoup.parse(web);
    }

    /**
     * Downloads one chapter page and saves its title, update-time line and
     * body paragraphs to {@code filepath}\&lt;chapter title&gt;.txt.
     *
     * @param filepath directory to write the chapter file into
     * @param url      chapter page URL
     * @throws IOException on network or file failure
     */
    public static void dltxt(String filepath, String url) throws IOException {
        Document document = Ddoc(url);
        // Selectors determined by inspecting the page with browser dev tools (F12).
        Elements title = document.select("#readArea [class=readAreaBox content] h1");
        Elements introduce = document.select("#readArea [class=readAreaBox content] .chapter_update_time");
        Elements content = document.select("#readArea [class=readAreaBox content] .p p");
        String chapter = title.text();
        // NOTE(review): chapter titles containing characters illegal in file
        // names (e.g. ? * :) will make this fail — confirm titles are safe.
        // try-with-resources: the original leaked the writer if a write threw.
        try (FileWriter fw = new FileWriter(filepath + "\\" + chapter + ".txt")) {
            fw.write(title.text() + "\r\n" + introduce.text() + "\r\n");
            for (Element e : content) {
                fw.write(e.text() + "\r\n");
            }
        }
        System.out.println(chapter + "下载完成");
    }

    /**
     * Downloads a whole book: resolves the "start reading" link from the book
     * page, creates the book folder, then downloads every chapter via
     * {@link #dltxt}.
     *
     * @param url book info page URL
     * @throws IOException on network or file failure
     */
    public static void dlall(String url) throws IOException {
        Document document = Ddoc(url);
        Elements es = document.select(".Main .bLeft .BookInfo .Props .Bar .read a");
        String link0 = es.attr("href");
        Document doc = Ddoc("https://www.17k.com" + link0);
        Elements bookname = doc.select(".Main h1[class=Title]");
        String bookDir = BASE_DIR + bookname.text();
        File file = new File(bookDir);
        // mkdir() returns false when the folder already exists (or on failure);
        // report the ACTUAL location in both messages (the original printed a
        // different, wrong path).
        if (file.mkdir()) {
            System.out.println(bookname.text() + "文件夹创建成功\n位置:" + bookDir);
        } else {
            System.out.println(bookname.text() + "文件夹已存在\n位置:" + bookDir);
        }
        Elements chapter = doc.select(".Volume dd [target=_blank]");
        for (Element e : chapter) {
            // Chapter links are site-relative; prepend the site root.
            String href = e.attr("href");
            dltxt(bookDir, "https://xxxxxxxxxx" + href);
        }
    }

    public static void main(String[] args) throws IOException {
        Downloadtxt.dlall("https://xxxxxxxxxx//xxxxx.html");
    }
}
最后附上maven配置中的jar依赖
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<dependency>
<groupId>edu.uci.ics</groupId>
<artifactId>crawler4j</artifactId>
<version>4.3</version>
</dependency>