maven httpclient jsoup爬虫入门(二)

对txt操作核心代码

package util;


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Downloads a book from its index page and appends the collected text to a local file.
 *
 * <p>The index page is expected to contain an {@code h1} title, author paragraphs under
 * {@code #info}, an element with id {@code intro}, and chapter links under {@code #list dl dd a}.
 * Each chapter page is expected to contain an {@code h1} title and a {@code #content} element.
 */
public class StoreTxt {

    /** Prefix prepended to every chapter link's {@code href} attribute. */
    private static final String SITE_ROOT = "http://www.biquge.com.tw";

    /** File the collected text is appended to. */
    private static final String OUTPUT_PATH = "D:\\纣临.txt";

    /** Line separator written between fields and paragraphs. */
    private static final String LINE_BREAK = "\r\n";

    /**
     * Fetches the index page at {@code url}, then every chapter it links to, and appends
     * title, author, introduction and all chapter texts to the output file. The same text
     * is also printed to standard output.
     *
     * @param url address of the book's index page
     * @throws IOException if a required element is missing from a fetched page (for example
     *                     because the request failed and an empty page was returned), or if
     *                     writing the output file fails
     */
    public static void operateTxt(String url) throws IOException {
        Document doc = Jsoup.parse(HttpClientUtil.testHttpClient(url));

        // Accumulate in a StringBuilder: repeated String concatenation in the chapter loop
        // would copy the whole book text once per chapter.
        StringBuilder all = new StringBuilder();

        // Book title
        String h1 = requireFirst(doc.getElementsByTag("h1"), "h1", url).text();
        System.out.println("题目:" + h1);
        all.append("书名:").append(h1).append(LINE_BREAK);

        // Author line (first paragraph under #info)
        String author = requireFirst(doc.select("#info p"), "#info p", url).text();
        all.append(author).append(LINE_BREAK);
        System.out.println(author);

        // Introduction
        Element introElement = doc.getElementById("intro");
        if (introElement == null) {
            throw new IOException("No element with id 'intro' found in page " + url);
        }
        String intro = introElement.text();
        System.out.println("简介" + intro);
        all.append("简介").append(intro).append(LINE_BREAK);

        // Chapters, in the order they are linked from the index page
        for (Element link : doc.select("#list dl dd a")) {
            String chapterUrl = SITE_ROOT + link.attr("href");
            Document chapterDoc = Jsoup.parse(HttpClientUtil.testHttpClient(chapterUrl));

            String chapter = requireFirst(chapterDoc.getElementsByTag("h1"), "h1", chapterUrl).text();

            // Turn every run of whitespace into a line break so paragraphs land on separate lines.
            String chapterText = chapterDoc.select("#content").text().replaceAll("\\s+", "\r\n");

            all.append(chapter).append(LINE_BREAK).append(chapterText).append(LINE_BREAK);
        }

        String text = all.toString();
        // try-with-resources closes (and thereby flushes) the writer even if write() throws.
        // The second constructor argument opens the file in append mode.
        try (FileWriter fileWriter = new FileWriter(new File(OUTPUT_PATH), true)) {
            fileWriter.write(text);
        }
        System.out.println(text);
    }

    /**
     * Returns the first matched element, or fails with a descriptive error instead of an
     * {@code IndexOutOfBoundsException} when nothing matched.
     */
    private static Element requireFirst(Elements matches, String selector, String pageUrl)
            throws IOException {
        if (matches.isEmpty()) {
            throw new IOException("No element matching '" + selector + "' found in page " + pageUrl);
        }
        return matches.get(0);
    }
}


httpclient工具类

package util;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Minimal HTTP GET helper built on Apache HttpClient.
 */
class HttpClientUtil {

    /**
     * Single client shared by all requests. Creating a new client per call (as before)
     * leaked a client, and its connections, on every request because none was ever closed.
     * Initialising it eagerly also lets {@link #getResponse(String)} be called on its own.
     */
    private static final CloseableHttpClient httpClient = HttpClients.createDefault();

    /**
     * Performs a GET request and returns the response body decoded as GBK.
     *
     * @param url address to fetch
     * @return the response body, or an empty string if the request fails with an
     *         {@link IOException} or the response carries no body
     */
    public static String testHttpClient(String url) {
        // try-with-resources closes the response on every path, including when
        // decoding the body throws.
        try (CloseableHttpResponse response = getResponse(url)) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return "";
            }
            String content = EntityUtils.toString(entity, "GBK");
            return content != null ? content : "";
        } catch (IOException e) {
            // Best-effort contract kept from the original: report the failure, return "".
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Executes a GET request for {@code url} on the shared client.
     *
     * @param url address to fetch
     * @return the open response; the caller is responsible for closing it
     * @throws IOException if executing the request fails
     */
    public static CloseableHttpResponse getResponse(String url) throws IOException {
        return httpClient.execute(new HttpGet(url));
    }
}


main方法入口

import util.StoreTxt;
import java.io.IOException;

/**
 * Program entry point: runs the scraper against one fixed URL.
 */
public class Main {

    /** URL handed to {@code StoreTxt.operateTxt}. */
    private static final String TARGET_URL = "http://www.biquge.com.tw/17_17380/";

    /**
     * Invokes {@code StoreTxt.operateTxt} and prints the stack trace if it
     * fails with an {@link IOException}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            StoreTxt.operateTxt(TARGET_URL);
        } catch (IOException ioFailure) {
            ioFailure.printStackTrace();
        }
    }
}

出现问题及解决

对txt操作问题:构造 FileWriter 时第二个参数传入 true,表示以追加方式写入,不会覆盖文件中已有的内容:

fileWriter = new FileWriter(file,true);

txt简写:

        // Open the target file in append mode (the second constructor argument is true).
        FileWriter fileWriter = new FileWriter(file, true);
        // Write the whole accumulated text in a single call.
        fileWriter.write(all);
        // Push buffered characters out, then release the file handle.
        fileWriter.flush();
        fileWriter.close();
        // Echo the written text to standard output.
        System.out.println(all);
输入txt简写:
String all="";//统一写入后再到txt
可改用 StringBuilder(单线程下比 StringBuffer 更合适)进行拼接:在循环中用 "+" 拼接 String 每次都会复制已有的全部内容,章节越多越慢。



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值