Java爬取网页数据

要爬的网页:http://quote.eastmoney.com/zs000001.html
要爬这部分数据:
在这里插入图片描述

要的数据在源代码这部分:
在这里插入图片描述
首先定义数据:

/**
 * Plain mutable holder for one scraped table row.
 *
 * <p>All values are kept as the raw strings taken from the page; nothing is
 * parsed or validated here. Fields are package-private and are also exposed
 * through conventional getter/setter pairs.
 */
public class Information {

    /** Row category (printed under the "type" label by the scraper). */
    String type;
    /** Trading volume, as text. */
    String volume;
    /** Turnover amount, as text. */
    String money;
    /** Total market value, as text. */
    String market_value;
    /** Count column (printed as the number of listed companies), as text. */
    String number;
    /** Average price/earnings ratio, as text. */
    String price_earnings_ratio;

    // ---- getters -----------------------------------------------------

    /** @return the stored category, or {@code null} if never set */
    public String getType() {
        return this.type;
    }

    /** @return the stored trading volume, or {@code null} if never set */
    public String getVolume() {
        return this.volume;
    }

    /** @return the stored turnover amount, or {@code null} if never set */
    public String getMoney() {
        return this.money;
    }

    /** @return the stored total market value, or {@code null} if never set */
    public String getMarket_value() {
        return this.market_value;
    }

    /** @return the stored count value, or {@code null} if never set */
    public String getNumber() {
        return this.number;
    }

    /** @return the stored price/earnings ratio, or {@code null} if never set */
    public String getPrice_earnings_ratio() {
        return this.price_earnings_ratio;
    }

    // ---- setters -----------------------------------------------------

    /** @param type category text to store */
    public void setType(String type) {
        this.type = type;
    }

    /** @param volume trading volume text to store */
    public void setVolume(String volume) {
        this.volume = volume;
    }

    /** @param money turnover amount text to store */
    public void setMoney(String money) {
        this.money = money;
    }

    /** @param market_value total market value text to store */
    public void setMarket_value(String market_value) {
        this.market_value = market_value;
    }

    /** @param number count text to store */
    public void setNumber(String number) {
        this.number = number;
    }

    /** @param price_earnings_ratio price/earnings ratio text to store */
    public void setPrice_earnings_ratio(String price_earnings_ratio) {
        this.price_earnings_ratio = price_earnings_ratio;
    }
}

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Small scraper: downloads a page over HTTP, extracts table rows with a
 * regular expression into {@link Information} objects, and writes a textual
 * report to a file.
 */
public class AlmanacUtil {

    /**
     * Matches seven consecutive {@code <td>...</td>} cells of one table row.
     * {@code DOTALL} lets a cell's content span line breaks. Group 5 is
     * captured but not used by {@link #getDataStructure(String)}.
     */
    static final Pattern proInfo = Pattern.compile("<td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td>" +
            "<td>(.*?)</td><td>(.*?)</td>", Pattern.DOTALL);

    public AlmanacUtil() {

    }

    /**
     * Fetches the body of {@code url} with an HTTP GET.
     *
     * @param url address of the page to download
     * @return the response body as text, or an empty string when the request
     *         fails, the status is not 200, or the response has no body
     */
    public String getdata(String url) {
        HttpClient client = new HttpClient();
        GetMethod getMethod = new GetMethod(url);
        // The HTTP header name is "User-Agent" (hyphen); an underscore variant is not
        // recognised as the user agent by servers.
        getMethod.setRequestHeader("User-Agent", "Mozilla/5.0(Windows NT 6.1;Win64;x64;rv:39.0) Gecko/20100101 Firefox/39.0");
        // Use the library's default retry strategy.
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
        try {
            int statusCode = client.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.out.println("Wrong");
                // Treat a non-200 answer like the other failure paths instead of
                // parsing an error page.
                return "";
            }
            byte[] responseBody = getMethod.getResponseBody();
            if (responseBody == null) {
                // No body available; avoid new String(null).
                return "";
            }
            // NOTE(review): decodes with the platform default charset, so the result
            // depends on the machine; confirm the page's real encoding and pass it
            // explicitly.
            return new String(responseBody);
        } catch (HttpException e) {
            System.out.println("Please check your provided http address!");
            e.printStackTrace();
            return "";
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        } finally {
            // Always hand the connection back, on success and on failure.
            getMethod.releaseConnection();
        }
    }

    /**
     * Splits the page on {@code </tr>} and converts every chunk that matches
     * {@link #proInfo} into an {@link Information}.
     *
     * @param str page source; {@code null} or empty yields an empty list
     * @return the parsed rows in page order, never {@code null}
     */
    private static List<Information> getDataStructure(String str) {
        List<Information> list = new ArrayList<Information>();
        if (str == null || str.length() == 0) {
            return list;
        }
        // Each "</tr>"-terminated chunk is one candidate table row.
        for (String row : str.split("</tr>")) {
            Matcher m = proInfo.matcher(row);
            if (!m.find()) {
                continue;
            }
            Information information = new Information();
            information.setType(m.group(1).trim());
            information.setVolume(m.group(2).trim());
            information.setMoney(m.group(3).trim());
            information.setMarket_value(m.group(4).trim());
            // Group 5 is skipped, as in the original column mapping.
            information.setNumber(m.group(6).trim());
            information.setPrice_earnings_ratio(m.group(7).trim());
            list.add(information);
        }
        return list;
    }

    /**
     * Downloads the quote page, prints the extracted rows, and saves them to a file.
     *
     * @param args optional; {@code args[0]} overrides the output file path,
     *             which otherwise is {@code gupiao.txt} on drive D:
     * @throws IOException if the output file cannot be opened or written
     */
    public static void main(String[] args) throws IOException {
        AlmanacUtil almanacUtil = new AlmanacUtil();
        String html = almanacUtil.getdata("http://quote.eastmoney.com/zs000001.html");
        List<Information> list = getDataStructure(html);

        // Build the report in one buffer instead of re-copying a String per row.
        StringBuilder report = new StringBuilder();
        for (Information info : list) {
            report.append("类别:").append(info.getType()).append(" ")
                    .append("成交量:").append(info.getVolume()).append(" ")
                    .append("成交金额:").append(info.getMoney()).append(" ")
                    .append("总市值:").append(info.getMarket_value()).append(" ")
                    .append("上市公司(家):").append(info.getNumber()).append(" ")
                    .append("平均市盈率:").append(info.getPrice_earnings_ratio()).append("\n");
        }
        String string = report.toString();
        System.out.println(string);

        // Default target is gupiao.txt on drive D:, unless a path is given.
        File f = (args != null && args.length > 0)
                ? new File(args[0])
                : new File("D:" + File.separator + "gupiao.txt");
        OutputStream out = new FileOutputStream(f);
        try {
            // NOTE(review): encodes with the platform default charset.
            out.write(string.getBytes());
        } finally {
            // Close even if the write fails so the file handle is not leaked.
            out.close();
        }
    }
}
  • 6
    点赞
  • 29
    收藏
    觉得还不错? 一键收藏
  • 7
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值