java抓取糗事百科内容

解决思路

  1. 通过URLConnection对象获取糗百网的响应html
  2. 解析html通过正则取出糗事内容
  3. 利用FileWriter对象输出到文件

具体步骤

新建QiuShiBaiKe类,创建一个发送get请求的方法,参数为url和param,返回响应html.
  1. 将url和发送参数param拼接完整(param不为空时才需要拼接"?")
    String urlName = url + "?" + param;
  2. 由完整路径构建URL对象
    URL realUrl = new URL(urlName);
  3. 通过URL对象的openConnection()方法获得URLConnection对象 连接
    URLConnection con = realUrl.openConnection();
  4. 设置request头部参数,可在浏览器开发者工具的Network(网络)面板中查看
    con.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
    con.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
    con.setRequestProperty("Connection", "keep-alive");
    con.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36");

    request请求参数
  5. 连接发送请求
    con.connect();
  6. 通过URLConnection对象的getInputStream()方法获取响应网络流 ,并包装成BufferedReader带有缓冲区的字符流对象
    in = new BufferedReader(new InputStreamReader(con.getInputStream()));
  7. 循环读取流拼接到result字符串中,返回result
    String line;
    while ((line = in.readLine()) != null) {
    result += "\n"+ line;
    }
    return result;
  8. 糗百的地址的规律
    http://www.qiushibaike.com/text/page/后拼接页码,我们可以通过循环获取前10页的内容
    String url = "http://www.qiushibaike.com/text/page/";
    String html = "";
    for (int i = 1; i <= 10; i++) {
    html += qiuShiBaiKe.sendGet(url+i, null);
    }
  9. 糗事内容的规则
    通过分析页面发现内容是被包含在<div class="content"></div>中,可通过正则抓取
    Matcher jokeMatcher = Pattern.compile("<div class=\"content\">[^/]*</div>?").matcher(html);
    使用Matcher对象的find()方法作为循环条件输出捕获到的内容group()
  10. 通过FileWriter的write方法写入到文件,写完后需要调用close()关闭FileWriter,否则缓冲区中的内容可能不会真正写入文件
    FileWriter fw = new FileWriter("hello.txt");
    while (jokeMatcher.find()) {
    String one = jokeMatcher.group();
    String joke = one.replace("<div class=\"content\">", "").replace("</div>", "");
    fw.write(joke+"\n");
    }

效果如下图

这里写图片描述

放马过来

package com.sqq.Internate17.demo;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the first pages of the qiushibaike.com text-joke listing, extracts
 * every {@code <div class="content">} element and writes the extracted text to
 * a local file.
 */
public class QiuShiBaiKe {

    /** Listing address; the 1-based page number is appended to it. */
    private static final String BASE_URL = "http://www.qiushibaike.com/text/page/";

    /** Number of listing pages fetched by {@link #main(String[])}. */
    private static final int PAGE_COUNT = 10;

    /** File the extracted jokes are written to (relative to the working directory). */
    private static final String OUTPUT_FILE = "hello.txt";

    /** Connect/read timeout so that a stalled server cannot block the program forever. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Matches one content div and captures its inner text in group 1.
     * The reluctant {@code .*?} combined with DOTALL lets the body span several
     * lines and contain '/' characters (for example {@code <br/>}) while still
     * stopping at the first closing {@code </div>}.
     */
    private static final Pattern JOKE_PATTERN =
            Pattern.compile("<div class=\"content\">(.*?)</div>", Pattern.DOTALL);

    /**
     * Sends a GET request and returns the response body.
     *
     * @param url   address to request
     * @param param query string without the leading '?'; {@code null} or empty
     *              means that no query string is appended
     * @return every line of the response, each one preceded by {@code "\n"};
     *         if the request fails, whatever was read before the failure
     *         (an empty string when nothing was read)
     */
    public String sendGet(String url, String param) {
        StringBuilder result = new StringBuilder();
        String urlName = buildUrl(url, param);
        try {
            URLConnection con = new URL(urlName).openConnection();
            con.setConnectTimeout(TIMEOUT_MILLIS);
            con.setReadTimeout(TIMEOUT_MILLIS);

            // Browser-like request headers.
            con.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            con.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
            con.setRequestProperty("Connection", "keep-alive");
            con.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36");

            con.connect();

            // try-with-resources closes the stream even when reading fails.
            // The charset is explicit so decoding does not depend on the platform default.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    result.append('\n').append(line);
                }
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException, so bad addresses end up here too.
            // Callers get the partial (possibly empty) body instead of an exception.
            System.err.println("GET " + urlName + " failed: " + e);
        }
        return result.toString();
    }

    /**
     * Joins an address and an optional query string.
     *
     * @param url   address to request
     * @param param query string without the leading '?', may be {@code null} or empty
     * @return {@code url} unchanged when there is no query string,
     *         otherwise {@code url + "?" + param}
     */
    static String buildUrl(String url, String param) {
        if (param == null || param.isEmpty()) {
            return url;
        }
        return url + "?" + param;
    }

    /**
     * Extracts the inner text of every {@code <div class="content">} element.
     *
     * @param html markup to scan, may be {@code null}
     * @return the captured texts in document order, without the surrounding
     *         div tags and otherwise unmodified; empty when nothing matches
     */
    public static List<String> extractJokes(String html) {
        List<String> jokes = new ArrayList<>();
        if (html == null) {
            return jokes;
        }
        Matcher jokeMatcher = JOKE_PATTERN.matcher(html);
        while (jokeMatcher.find()) {
            jokes.add(jokeMatcher.group(1));
        }
        return jokes;
    }

    /**
     * Fetches the listing pages, prints the raw HTML to standard output and
     * writes the extracted jokes to {@code hello.txt}.
     *
     * @param args ignored
     * @throws IOException if the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        QiuShiBaiKe qiuShiBaiKe = new QiuShiBaiKe();
        StringBuilder html = new StringBuilder();
        for (int page = 1; page <= PAGE_COUNT; page++) {
            html.append(qiuShiBaiKe.sendGet(BASE_URL + page, null));
        }
        System.out.println(html);

        // Closing the writer flushes it; without that the file may stay empty or truncated.
        try (BufferedWriter fw = Files.newBufferedWriter(Paths.get(OUTPUT_FILE), StandardCharsets.UTF_8)) {
            fw.write("************** 把快乐建立在别人的痛苦上:糗事百科 *************\n");
            for (String joke : extractJokes(html.toString())) {
                fw.write(joke + "\n");
                fw.write("========== 华丽的分割线 ==========\n");
            }
        }
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值