Java 获取亚马逊 Best Sellers 前 100 名数据

代码是赶出来的,没有整理,可优化空间很大(据了解大约半年才使用一次,且手头还有其他工作,就不再优化了)。下面以美国站 Wallpaper 类目前 100 名为例,更换链接即可抓取其他产品类目。缺少的 jar 包(如 jsoup)可以自行下载,或者留言,我发给你。

// 请自行建一个方法,把下面的代码放进方法体中

  // Scrapes the Amazon US "Wallpaper" best-seller list and, for every listed product,
  // its detail page. One plsj record per product is appended to plsjList.
  // Missing page sections are skipped instead of aborting the whole run.
  Document document;
  Document document1;

  List<plsj> plsjList = new ArrayList<plsj>();
  int num = 1;

  try {
      // Each best-seller page lists 50 products, so pages 1 and 2 cover the top 100.
      pages:
      for (int m = 1; m < 3; m++) {
          document = Jsoup.connect("https://www.amazon.com/Best-Sellers-Home-Improvement-Wallpaper/zgbs/hi/2242314011/ref=zg_bs_pg_" + m + "?_encoding=UTF8&pg=" + m).userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36").get();

          Elements listItems = document.getElementsByClass("a-list-item");

          for (int i = 0; i < listItems.size(); i++) {

              Document parse = Jsoup.parse(listItems.get(i).toString());

              // Read the link through the DOM so HTML entities (&amp;) are decoded.
              // attr() returns "" when nothing matched, which marks a non-product list item.
              String href = parse.getElementsByClass("a-link-normal a-text-normal").select("[href]").attr("href");
              if (href.isEmpty()) {
                  continue;
              }

              // Avoid "https://www.amazon.com//..." when the href is already root-relative.
              String productUrl;
              if (href.startsWith("http")) {
                  productUrl = href;
              } else if (href.startsWith("/")) {
                  productUrl = "https://www.amazon.com" + href;
              } else {
                  productUrl = "https://www.amazon.com/" + href;
              }

              plsj product = new plsj();

              product.setPm(OtherUtil.getTextFromHTML(parse.getElementsByClass("zg-badge-text").toString().replace("#", "")));

              // Keep only the first four characters of the rating text, guarding short/empty text.
              String rating = OtherUtil.getTextFromHTML(parse.getElementsByClass("a-icon-alt").toString());
              product.setXj(rating.length() > 4 ? rating.substring(0, 4) : rating);

              product.setPls(OtherUtil.getTextFromHTML(parse.getElementsByClass("a-size-small a-link-normal").toString()));
              product.setJg(OtherUtil.getTextFromHTML(parse.getElementsByClass("a-size-base a-color-price").toString()));

              // The user-agent value must not contain the "User-Agent:" header name itself.
              document1 = Jsoup.connect(productUrl).userAgent("Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50").get();

              // Brand: look for the row labelled "Brand" in the overview table.
              Elements overviewRows = document1.getElementsByClass("a-normal a-spacing-micro").select("table").select("tr");
              for (int k = 0; k < overviewRows.size(); k++) {
                  Document row = Jsoup.parse(overviewRows.get(k).toString());
                  String label = OtherUtil.getTextFromHTML(row.getElementsByClass("a-size-base a-text-bold").toString());
                  if ("Brand".equals(label)) {
                      product.setBrand(OtherUtil.getTextFromHTML(row.getElementsByClass("a-size-base").toString()));
                  }
              }

              // Store / byline. select("#id") yields an empty result instead of null when absent.
              Elements byline = document1.select("#bylineInfo");
              if (!byline.isEmpty()) {
                  product.setDp(OtherUtil.getTextFromHTML(byline.toString()));
              }

              // Shipping method: the "Ships from" row of the buy box, when the buy box exists.
              Elements buyboxRows = document1.select("#tabular-buybox-container").select("table").select("tr");
              for (int l = 0; l < buyboxRows.size(); l++) {
                  Document row = Jsoup.parse(buyboxRows.get(l).toString());
                  String label = OtherUtil.getTextFromHTML(row.getElementsByClass("a-color-tertiary tabular-buybox-label").toString());
                  if ("Ships from".equals(label)) {
                      product.setPsfs(OtherUtil.getTextFromHTML(row.getElementsByClass("tabular-buybox-text").toString()));
                  }
              }

              // Product details table: dimensions, ASIN and weight. The loop is a no-op
              // when the page has no "prodDetails" section.
              Elements detailRows = document1.select("#prodDetails").select("table").select("tr");
              for (int j = 0; j < detailRows.size(); j++) {
                  String rowHtml = detailRows.get(j).toString();

                  if (rowHtml.contains("Package Dimensions")) {
                      product.setCc(OtherUtil.getTextFromHTML(rowHtml.replace("Package Dimensions", "")));
                  }
                  if (rowHtml.contains("ASIN")) {
                      product.setAsin(OtherUtil.getTextFromHTML(rowHtml.replace("ASIN", "")));
                  }
                  if (rowHtml.contains("Item Weight")) {
                      product.setZl(OtherUtil.getTextFromHTML(rowHtml.replace("Item Weight", "")));
                  }
              }
              plsjList.add(product);

              System.out.println("第 " + num + " 个");
              num++;

              // Random pause of up to 5 seconds between products to reduce the chance of
              // triggering Amazon's bot detection (captcha handling is not implemented).
              try {
                  Thread.sleep((int) (Math.random() * 5000));
              } catch (InterruptedException e) {
                  // Restore the interrupt flag and stop scraping; results so far stay in plsjList.
                  Thread.currentThread().interrupt();
                  break pages;
              }
          }
      }

  } catch (IOException e) {
      // A failed request ends the run; records collected so far remain in plsjList.
      e.printStackTrace();
  }

 

 

  

//去除标签的方法

/**
 * Strips markup from an HTML string and returns its text with whitespace normalized.
 *
 * <p>The input is parsed with jsoup and reduced to its text. Every character that
 * {@link Character#isSpaceChar(char)} or {@link Character#isWhitespace(char)} reports
 * as a space is then turned into a plain space, runs of spaces are collapsed to one,
 * and the result is trimmed.
 *
 * @param htmlStr HTML (or plain text) to clean
 * @return the text content with single spaces and no leading/trailing whitespace
 */
public static String getTextFromHTML(String htmlStr) {
    char[] chars = Jsoup.parse(htmlStr).text().toCharArray();
    // Map every kind of space (tabs, line breaks, non-breaking spaces, ...) to ' '.
    for (int pos = 0; pos < chars.length; pos++) {
        char current = chars[pos];
        if (Character.isWhitespace(current) || Character.isSpaceChar(current)) {
            chars[pos] = ' ';
        }
    }
    return new String(chars).replaceAll(" +", " ").trim();
}

//封装的正则表达

/**
 * Returns the groups of the first match of {@code pattern} in {@code str}.
 *
 * <p>Index 0 of the result is the whole match; indexes 1..n are the pattern's capturing
 * groups in order. A group that did not take part in the match is stored as {@code null}.
 * Only the first match is examined; later matches are ignored.
 *
 * @param pattern regular expression to compile
 * @param str     text to search
 * @return the groups of the first match, or an empty list when there is no match
 * @throws java.util.regex.PatternSyntaxException if {@code pattern} is not a valid regex
 */
public static List<String> regularexpression(String pattern, String str) {

    List<String> list = new ArrayList<String>();

    Matcher matcher = Pattern.compile(pattern).matcher(str);

    if (matcher.find()) {
        // groupCount() excludes group 0, hence the inclusive upper bound.
        int groupCount = matcher.groupCount();
        for (int i = 0; i <= groupCount; i++) {
            list.add(matcher.group(i));
        }
    }
    return list;
}

//实体类的 

/**
 * Plain data holder for one scraped product record. All values are kept as raw strings;
 * every field starts out {@code null} until its setter is called.
 */
public class plsj {

    // NOTE(review): presumably the star rating (xj = "xingji") -- confirm against the caller.
    private String xj;

    public String getXj() {
        return this.xj;
    }

    public void setXj(String xj) {
        this.xj = xj;
    }

    // NOTE(review): meaning not documented in the original -- confirm.
    private String sj;

    public String getSj() {
        return this.sj;
    }

    public void setSj(String sj) {
        this.sj = sj;
    }

    // NOTE(review): presumably the package dimensions (cc = "chicun") -- confirm against the caller.
    private String cc;

    public String getCc() {
        return this.cc;
    }

    public void setCc(String cc) {
        this.cc = cc;
    }

    // NOTE(review): meaning not documented in the original -- confirm.
    private String ms;

    public String getMs() {
        return this.ms;
    }

    public void setMs(String ms) {
        this.ms = ms;
    }

    // NOTE(review): presumably the Amazon ASIN of the product -- confirm.
    private String asin;

    public String getAsin() {
        return this.asin;
    }

    public void setAsin(String asin) {
        this.asin = asin;
    }

    /**
     * Brand.
     */
    private String brand;

    public String getBrand() {
        return this.brand;
    }

    public void setBrand(String brand) {
        this.brand = brand;
    }

    /**
     * Store (seller).
     */
    private String dp;

    public String getDp() {
        return this.dp;
    }

    public void setDp(String dp) {
        this.dp = dp;
    }

    /**
     * Shipping / delivery method.
     */
    private String psfs;

    public String getPsfs() {
        return this.psfs;
    }

    public void setPsfs(String psfs) {
        this.psfs = psfs;
    }

    /**
     * Price.
     */
    private String jg;

    public String getJg() {
        return this.jg;
    }

    public void setJg(String jg) {
        this.jg = jg;
    }

    /**
     * Rank.
     */
    private String pm;

    public String getPm() {
        return this.pm;
    }

    public void setPm(String pm) {
        this.pm = pm;
    }

    /**
     * Number of reviews.
     */
    private String pls;

    public String getPls() {
        return this.pls;
    }

    public void setPls(String pls) {
        this.pls = pls;
    }

    /**
     * Weight.
     */
    private String zl;

    public String getZl() {
        return this.zl;
    }

    public void setZl(String zl) {
        this.zl = zl;
    }
}
//最后数据的部分展示

 

 

  • 4
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值