java简单爬取网页内容实例

10 篇文章 0 订阅

一、获取响应返回的整个页面内容，并输出到文件：

package cn.chance.mavenTest;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * First web-scraping test: requests a page and saves the raw response body to a local file.
 *
 * <p>2017.12.12, chance
 */
public class App {
    private static final String url = "http://www.80s.tw/"; // URL to request
    private static final String outPath = "C:/Users/chanc/Desktop/data.html"; // output file path

    /**
     * Sends a POST request to {@code url} and, when the server answers 200 OK, copies the
     * response body byte-for-byte into {@code outPath}. Any other status code is reported on
     * standard output and no file is written.
     *
     * @param args unused
     * @throws HttpException if the HTTP exchange violates the protocol
     * @throws IOException if the request fails or the output file cannot be written
     */
    public static void main(String[] args) throws HttpException, IOException {
        HttpClient httpClient = new HttpClient();
        PostMethod post = new PostMethod(url);

        // To submit form fields, add them to the request before executing it, e.g.:
        //   NameValuePair[] postData = new NameValuePair[2];
        //   postData[0] = new NameValuePair("user", "chance");
        //   postData[1] = new NameValuePair("password", "123456");
        //   post.addParameters(postData);

        try {
            int resultCode = httpClient.executeMethod(post); // status code of the response
            if (resultCode != HttpStatus.SC_OK) {
                System.out.println("failed! resultCode = " + resultCode);
                return;
            }

            // try-with-resources closes both streams even if the copy fails midway.
            try (InputStream in = post.getResponseBodyAsStream();
                    FileOutputStream out = new FileOutputStream(new File(outPath))) {
                // The body stream may be null when the response carries no body;
                // in that case an empty file is produced instead of crashing.
                if (in != null) {
                    byte[] buff = new byte[8192];
                    int length;
                    while ((length = in.read(buff)) != -1) {
                        out.write(buff, 0, length);
                    }
                }
            }
            System.out.println("succeed!");
        } finally {
            // Always hand the connection back to the client's connection manager.
            post.releaseConnection();
        }
    }
}

二、提取返回数据中的链接（href），并输出到文件：

package test;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * Second web-scraping test: requests a page, extracts the value of every {@code href="..."}
 * attribute from the response text and writes the links to a local text file.
 *
 * <p>2017.12.14, chance
 */
public class test1 {
    private static final String url = "https://www.80s.tt/"; // URL to request
    private static final String outPath = "C:/Users/chanc/Desktop/data.txt"; // output file path

    /** Matches {@code href="..."}; capture group 1 is the attribute value without quotes. */
    private static final Pattern HREF_PATTERN = Pattern.compile("href=\"(.+?)\"");

    /** Number of links written on each output line. */
    private static final int LINKS_PER_LINE = 4;

    /**
     * Sends a POST request to {@code url} and, when the server answers 200 OK, writes the
     * extracted links to {@code outPath}. Any other status code is reported on standard
     * output and no file is written.
     *
     * @param args unused
     * @throws HttpException if the HTTP exchange violates the protocol
     * @throws IOException if the request fails or the output file cannot be written
     */
    public static void main(String[] args) throws HttpException, IOException {
        HttpClient httpClient = new HttpClient();
        PostMethod post = new PostMethod(url);

        try {
            int resultCode = httpClient.executeMethod(post); // status code of the response
            if (resultCode != HttpStatus.SC_OK) {
                System.out.println("failed! resultCode = " + resultCode);
                return;
            }

            String links = formatLinks(post.getResponseBodyAsString());

            // Encode explicitly so the file content does not depend on the platform charset.
            byte[] bytes = links.getBytes("UTF-8");
            try (FileOutputStream out = new FileOutputStream(new File(outPath))) {
                out.write(bytes, 0, bytes.length);
            }
            System.out.println("succeed!");
        } finally {
            // Always hand the connection back to the client's connection manager.
            post.releaseConnection();
        }
    }

    /**
     * Collects every href value found in {@code html} into a tab-separated table with
     * {@code LINKS_PER_LINE} links per CRLF-terminated row.
     *
     * @param html response text to scan; {@code null} is treated as empty
     * @return the formatted link list, or an empty string when nothing matches
     */
    private static String formatLinks(String html) {
        if (html == null) {
            return "";
        }
        StringBuilder buffer = new StringBuilder();
        Matcher matcher = HREF_PATTERN.matcher(html);
        int column = 0;
        while (matcher.find()) {
            // Group 1 is the bare URL, so no second pass is needed to strip href= and quotes.
            buffer.append(matcher.group(1));
            column++;
            if (column == LINKS_PER_LINE) {
                buffer.append("\r\n"); // end of row
                column = 0;
            } else {
                buffer.append("\t"); // column separator
            }
        }
        return buffer.toString();
    }
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值