爬取京东品牌和分类信息

爬取并存入数据库

/**
 * Scrapes category and brand information from JD.com and persists it through
 * {@code CategoryService} and {@code BrandService}.
 *
 * <p>Requires the jsoup HTML parser on the classpath:
 *
 * <pre>{@code
 * <dependency>
 *   <groupId>org.jsoup</groupId>
 *   <artifactId>jsoup</artifactId>
 *   <version>1.11.3</version>
 * </dependency>
 * }</pre>
 */
@RestController
public class DemoController {

    @Autowired
    CategoryService categoryService;

    @Autowired
    BrandService brandService;

    /**
     * Fetches the JD.com category index page and saves its three-level category tree.
     *
     * <p>Every {@code category-item} element becomes a level-1 category, every {@code <dl>}
     * inside it a level-2 category (named after the first link of its {@code <dt>}), and every
     * link in the first {@code <dd>} of that {@code <dl>} a level-3 category. Groups that lack
     * the expected markup are skipped instead of aborting the whole import.
     *
     * @return a success result once the page has been processed
     * @throws IOException if the page cannot be fetched
     */
    @GetMapping("getCategoryFromJD")
    public AxiosResult<Void> setData() throws IOException {
        Document document = Jsoup.connect("https://www.jd.com/allSort.aspx").get();

        for (Element group : document.getElementsByClass("category-items")) {
            for (Element item : group.getElementsByClass("category-item")) {
                // Level 1: combined text of the <span> elements inside the item.
                String firstName = item.getElementsByTag("span").text();
                Category first = saveCategory(firstName, 1, 0L);

                for (Element dl : item.getElementsByTag("dl")) {
                    // Level 2: first link inside the first <dt>; skip the group if absent.
                    Element dt = dl.getElementsByTag("dt").first();
                    Element titleLink = dt == null ? null : dt.getElementsByTag("a").first();
                    if (titleLink == null) {
                        continue;
                    }
                    Category second = saveCategory(titleLink.text(), 2, first.getId());

                    // Level 3: every link inside the first <dd>, if there is one.
                    Element dd = dl.getElementsByTag("dd").first();
                    if (dd == null) {
                        continue;
                    }
                    for (Element link : dd.getElementsByTag("a")) {
                        saveCategory(link.text(), 3, second.getId());
                    }
                }
            }
        }
        return AxiosResult.success();
    }

    /**
     * Fetches the JD.com brand page and saves one brand per list entry.
     *
     * <p>The brand name is the text of the first link in the entry's second {@code <span>};
     * the logo and description come from the {@code src} and {@code alt} attributes of the
     * entry's first {@code <img>}. Entries missing either element are skipped.
     *
     * @return a success result once the page has been processed
     * @throws Exception if the page cannot be fetched
     */
    @GetMapping("getBrandFromJD")
    public AxiosResult<Void> getBrandFromJd() throws Exception {
        Document document = Jsoup.connect("https://www.jd.com/brand.aspx").get();

        for (Element list : document.getElementsByClass("brandslist")) {
            for (Element li : list.getElementsByTag("li")) {
                Element img = li.getElementsByTag("img").first();
                Elements spans = li.getElementsByTag("span");
                Element nameLink = spans.size() > 1
                        ? spans.get(1).getElementsByTag("a").first()
                        : null;
                if (img == null || nameLink == null) {
                    continue;
                }

                Brand brand = new Brand();
                brand.setBrandName(nameLink.text());
                brand.setBrandDesc(img.attr("alt"));
                brand.setBrandLogo(img.attr("src"));
                // Hard-coded placeholder; no site URL is read from the page.
                brand.setBrandSite("http://www.baidu.com");
                brandService.save(brand);
            }
        }
        return AxiosResult.success();
    }

    /**
     * Builds a category with the given name, level and parent id and saves it.
     *
     * <p>NOTE(review): callers read {@code getId()} from the returned object, so this assumes
     * {@code categoryService.save} populates the generated id on the entity - confirm.
     */
    private Category saveCategory(String name, int level, Long parentId) {
        Category category = new Category();
        category.setCatetoryName(name);
        category.setCategoryLevel(level);
        category.setParentId(parentId);
        categoryService.save(category);
        return category;
    }
}

 爬取并输出到 txt 文件

 /**
  * Scrapes the JD.com category index page and writes the third-level category names
  * to a text file.
  *
  * <p>Each {@code <dl>} group inside a {@code category-item} element produces one line:
  * the text of every link in the group's first {@code <dd>}, each followed by a space,
  * terminated by {@code "\r\n"}. The same names are echoed to standard output. Groups
  * without a {@code <dd>} are skipped, and an empty file is written when nothing is found.
  *
  * @throws IOException if the page cannot be fetched or the output file cannot be written
  */
 public static void setData() throws IOException {
     Document document = Jsoup.connect("https://www.jd.com/allSort.aspx").get();
     StringBuilder lines = new StringBuilder();

     for (Element group : document.getElementsByClass("category-items")) {
         for (Element item : group.getElementsByClass("category-item")) {
             for (Element dl : item.getElementsByTag("dl")) {
                 Element dd = dl.getElementsByTag("dd").first();
                 if (dd == null) {
                     continue;
                 }
                 for (Element link : dd.getElementsByTag("a")) {
                     String text = link.text();
                     System.out.print(text + " ");
                     lines.append(text).append(' ');
                 }
                 System.out.println("\n");
                 lines.append("\r\n");
             }
         }
     }

     byte[] bytes = lines.toString().getBytes(StandardCharsets.UTF_8);
     // try-with-resources closes the stream even when write() fails.
     try (FileOutputStream stream = new FileOutputStream("C://Users//Desktop//京东分类目录.txt")) {
         stream.write(bytes);
     }
 }

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值