贴吧抓数据

package com.spider;

import org.apache.http.impl.client.CloseableHttpClient;
import org.jsoup.Jsoup;


/**
 * Command-line scraper that prints the text of one chapter of a web novel, taken from
 * Baidu Tieba, Qidian or Zongheng.
 *
 * <p>Each site has its own private fetch method. {@code main} only holds commented-out
 * sample configurations: uncomment one URL/chapter pair together with the matching call
 * to run it.
 *
 * @author nidayu
 * @date 2015/10/22
 */
public class TieBa {

    /**
     * Entry point. As written every sample configuration and every fetch call is
     * commented out, so running it only obtains the shared HTTP client.
     *
     * @param args unused
     */
    public static void main(String[] args){

        CloseableHttpClient httpClient = HttpClientTestNew.getInstance();

        // Baidu Tieba
        String tiebaUrl = null, tiebaChapter = null;
        // Novel "Ling Yu"
        // tiebaUrl = "http://tieba.baidu.com/f?kw=%E7%81%B5%E5%9F%9F&ie=gbk&tab=good&cid=0&pn=300";tiebaChapter = "一千四百三十四章";
        // Novel "Mo Tian Ji"
        // tiebaUrl = "http://tieba.baidu.com/f?kw=%E9%AD%94%E5%A4%A9%E8%AE%B0&ie=utf-8&tab=good&cid=2";tiebaChapter = "1411";

        // getTieBa(httpClient, tiebaUrl, tiebaChapter);


        // Qidian
        String url = null, chapter = null;
        // Novel "Yi Jian Fei Xian"
        // url = "http://read.qidian.com/BookReader/Rph5iVEas1Q1.aspx"; chapter = "二十二、";
        // Novel "Zhen Wu Shi Jie"
        // url = "http://read.qidian.com/BookReader/SsH0QR3uBSU1.aspx";chapter = "第一章";
        // Novel "Sheng Tang Jue Qi"
        // url = "http://read.qidian.com/BookReader/PHJRvEIGX-Y1.aspx";chapter = "第九章";
        // Novel "Wu Shen Ji"
        // url = "http://read.qidian.com/BookReader/wjGb4uJndg01.aspx";chapter = "第一百零八章";

        // getQiDian(httpClient, url, chapter);



        // Zongheng
        String zonghengUrl = null, zonghengChapter = null;
        // Novel "Zhong Ji Jiao Shi"
        // zonghengUrl = "http://book.zongheng.com/showchapter/347511.html"; zonghengChapter = "第一章";

        // getZongHeng(httpClient, zonghengUrl, zonghengChapter);


    }

    /**
     * Rejects an unconfigured source early. {@code main} leaves both values {@code null}
     * unless a sample line is uncommented, and a null would otherwise only fail later,
     * or be concatenated into a selector as the literal text "null".
     *
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    private static void requireConfigured(String url, String chapter){
        if (url == null || chapter == null){
            throw new IllegalArgumentException(
                    "url and chapter must both be set (url=" + url + ", chapter=" + chapter + ")");
        }
    }

    /**
     * Prints a Tieba thread: finds the first link on the listing page whose title
     * contains {@code chapter}, prints that title, downloads the thread and prints the
     * text of all its {@code div} elements, one space-separated token per line.
     *
     * @param httpClient client handed to {@code HttpClientTestNew.getUrl}
     * @param url        address of the Tieba listing page
     * @param chapter    text that the thread title must contain
     * @throws IllegalArgumentException if {@code url} or {@code chapter} is {@code null}
     * @throws IllegalStateException    if no link with a usable href matches {@code chapter}
     */
    private static void getTieBa(CloseableHttpClient httpClient, String url, String chapter){
        requireConfigured(url, chapter);

        String[][] tiebaHeader = {{"Host", "tieba.baidu.com"}};
        String tiebaHtml = HttpClientTestNew.getUrl(httpClient, url, tiebaHeader);

        // Parse the listing once and take href and title from the same anchor. Without a
        // match the href would be empty and the request below would silently fetch the
        // Tieba home page, so fail with context instead.
        String[] link = Jsoup.parse(tiebaHtml).select("a[title*=" + chapter +"]").stream()
                .filter(a -> !a.attr("href").isEmpty())
                .map(a -> new String[]{a.attr("href"), a.attr("title")})
                .findFirst()
                .orElseThrow(() -> new IllegalStateException(
                        "No Tieba thread whose title contains \"" + chapter + "\" was found at " + url));
        System.out.println(link[1]);

        String threadUrl = link[0];
        if (!threadUrl.startsWith("http")){
            // Listing links are site-relative; make them absolute.
            threadUrl = "http://tieba.baidu.com"+threadUrl;
        }

        String[][] headers = {{"Referer", url}, {"Host", "tieba.baidu.com"}};
        String tiebaInfo = HttpClientTestNew.getUrl(httpClient, threadUrl, headers);
        String text = Jsoup.parse(tiebaInfo).select("div").text();
        for (String token : text.split(" ")) {
            System.out.println(token);
        }
    }


    /**
     * Prints a Qidian chapter: finds the first link on the index page whose text contains
     * {@code chapter}, prints its href, opens that page, then downloads the script whose
     * {@code src} starts with {@code http://files} (decoded as GBK) and prints the text of
     * every {@code p} element in it.
     *
     * @param httpClient client handed to {@code HttpClientTestNew.getUrl}
     * @param url        address of the book's chapter index
     * @param chapter    text that the chapter link must contain
     * @throws IllegalArgumentException if {@code url} or {@code chapter} is {@code null}
     * @throws IllegalStateException    if the chapter link or the content script is missing
     */
    private static void getQiDian(CloseableHttpClient httpClient, String url, String chapter){
        requireConfigured(url, chapter);

        String qidianHtml = HttpClientTestNew.getUrl(httpClient, url, null);
        // orElseThrow instead of an unchecked get(): a missing chapter now reports which
        // chapter and which page were searched.
        String qidianInfo = Jsoup.parse(qidianHtml).select("a").stream()
                .filter(a -> a.text().contains(chapter))
                .map(a -> a.attr("href"))
                .findFirst()
                .orElseThrow(() -> new IllegalStateException(
                        "No Qidian chapter link containing \"" + chapter + "\" was found at " + url));

        System.out.println(qidianInfo);
        qidianHtml = HttpClientTestNew.getUrl(httpClient, qidianInfo, null);

        // The chapter body is not in the page itself; it is referenced as an external script.
        String des = Jsoup.parse(qidianHtml).select("script[src^=http://files]").attr("src");
        if (des.isEmpty()){
            throw new IllegalStateException("No content script (src starting with http://files) found at " + qidianInfo);
        }

        String[][] qidianHeader = {{"Host", "files.qidian.com"}, {"Referer", qidianInfo}, {"Accept-Encoding", "deflate, sdch"}, {"Content-Type", "text/plain"}, {"Accept-Ranges", "bytes"}, {"Content-Encoding", "gzip"}};
        qidianHtml = HttpClientTestNew.getUrl(httpClient, des, qidianHeader, "gbk");
        Jsoup.parse(qidianHtml).select("p").forEach(p -> System.out.println(p.text()));
    }


    /**
     * Prints a Zongheng chapter: reads the {@code chapterId} attribute of the first
     * {@code td} whose {@code chapterName} starts with {@code chapter}, derives the chapter
     * address from the index address, then prints the raw chapter page followed by the
     * text of every {@code p} element in it.
     *
     * @param httpClient client handed to {@code HttpClientTestNew.getUrl}
     * @param url        address of the book's chapter index (a {@code showchapter/<id>.html} page)
     * @param chapter    prefix that the chapter name must start with
     * @throws IllegalArgumentException if {@code url} or {@code chapter} is {@code null}
     * @throws IllegalStateException    if no chapter id matches {@code chapter}
     */
    private static void getZongHeng(CloseableHttpClient httpClient, String url, String chapter){
        requireConfigured(url, chapter);

        String zonghengHtml = HttpClientTestNew.getUrl(httpClient, url, null);
        String chapterId = Jsoup.parse(zonghengHtml).select("td[chapterName^=" + chapter + "]").attr("chapterId");
        if (chapterId.isEmpty()){
            // An empty id would produce ".../<bookId>/.html" and fetch the wrong page.
            throw new IllegalStateException(
                    "No Zongheng chapter whose name starts with \"" + chapter + "\" was found at " + url);
        }

        // ".../showchapter/<bookId>.html" -> ".../chapter/<bookId>/<chapterId>.html".
        // Only strip the suffix when it is really there, rather than chopping 5 characters blindly.
        String suffix = ".html";
        String base = url.endsWith(suffix) ? url.substring(0, url.length() - suffix.length()) : url;
        String zonghengUrlDes = base+"/"+chapterId+".html";

        String[][] zonghengHeader = {{"Referer", url}, {"Host", "book.zongheng.com"}};
        zonghengHtml = HttpClientTestNew.getUrl(httpClient, zonghengUrlDes.replace("showchapter", "chapter"), zonghengHeader);
        // The raw page is dumped before the extracted paragraphs.
        System.out.println(zonghengHtml);
        Jsoup.parse(zonghengHtml).select("p").forEach(p -> System.out.println(p.text()));
    }

}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值