java爬取网页的数据并存入数据库

1 篇文章 0 订阅
1 篇文章 0 订阅

这里使用Jsoup来实现该功能。
demo用到的技术为 Spring Boot + Jsoup + MySQL + MyBatis-Plus

1.首先导入jsoup依赖

<!-- jsoup HTML parser. Use 1.14.2 or later: earlier releases are affected by
     CVE-2021-37714 (parser may hang on crafted HTML), which matters when
     parsing pages fetched from third-party sites. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.2</version>
</dependency>

2.新建实体类(存放网页的数据,字段属性根据需求来定)

/**
 * Entity holding one recipe scraped from a web page with Jsoup.
 * <p>
 * Persisted to the {@code menu} table; each field below is bound to the
 * column named in its annotation. Accessors are generated by Lombok's
 * {@code @Data}.
 *
 * @author  Mr. Dong
 * @create  2021/9/2 15:56
 * @desc    Data scraped from web pages with Jsoup
 **/
@Data
@TableName("menu")
public class Menu extends Model<Menu> {

    /** Row identifier, stored in column {@code id}. */
    @TableId(value = "id")
    private String id;

    /** Recipe title, stored in column {@code title}. */
    @TableField("title")
    private String title;

    /** Image reference for the recipe, stored in column {@code img}. */
    @TableField("img")
    private String img;

    /**
     * Recipe description, stored in column {@code des}.
     * NOTE(review): the column name differs from the field name, presumably
     * because DESCRIBE is a reserved word in MySQL - confirm.
     */
    @TableField("des")
    private String describe;

    /** Main ingredients text, stored in column {@code main_material}. */
    @TableField("main_material")
    private String mainMaterial;

    /** Preparation steps text, stored in column {@code step}. */
    @TableField("step")
    private String step;

    /** Finished-dish information, stored in column {@code finished_product}. */
    @TableField("finished_product")
    private String finishedProduct;

    /** Cooking tips text, stored in column {@code skill}. */
    @TableField("skill")
    private String skill;
}

3.接口

import com.dongbing.demo.modules.system.entity.Menu;
import com.dongbing.demo.modules.system.mapper.MenuMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.net.URL;
import java.util.*;
/**
 * REST controller that scrapes a recipe search-result page with jsoup, follows
 * each result to its detail page, and stores one {@link Menu} row per recipe
 * through {@link MenuMapper}.
 *
 * @author  Mr. Dong
 * @create  2021/9/2 15:56
 * @desc    Scrapes web page data with jsoup
 **/
@RestController
public class Test {

    /**
     * Timeout, in milliseconds, passed to jsoup for every page fetch.
     * The previous values (3,000,000 ms and 996,000,000 ms) would have let a
     * single stalled connection block the request thread for hours or days.
     */
    private static final int FETCH_TIMEOUT_MILLIS = 30_000;

    /** Search-result page that is crawled on every call to {@link #getData()}. */
    private static final String SEARCH_URL =
            "https://so.meishi.cc/?q=%E9%B1%BC%E9%A6%99%E8%82%89%E4%B8%9D&kw=168&sort=time&page=1";

    @Autowired
    private MenuMapper menuMapper;

    /**
     * @deprecated No longer used by this class. A single shared, mutable entity
     *             was overwritten by every loop iteration and every concurrent
     *             request; {@link #getData()} now creates a fresh {@link Menu}
     *             per recipe. Kept only so existing references still compile.
     */
    @Deprecated
    public static Menu menu = new Menu();

    /**
     * Crawls the search-result page, fetches every linked recipe detail page
     * and inserts one {@link Menu} row per recipe.
     * <p>
     * Result items that carry no usable link to a detail page are skipped.
     *
     * @return a single-element list containing a fixed status message
     * @throws Exception if a page cannot be fetched or parsed, or a link is
     *                   not a valid URL
     */
    @RequestMapping("/getData")
    public List<Map<String,String>> getData() throws Exception {
        // Fetching by URL makes that URL the document's base URI, which is
        // what allows "abs:href" below to resolve relative links.
        Document searchPage = Jsoup.parse(new URL(SEARCH_URL), FETCH_TIMEOUT_MILLIS);

        for (Element item : searchPage.getElementsByClass("search2015_cpitem")) {
            // "abs:" resolves relative and protocol-relative links against the
            // page URL; it yields "" when the item has no usable link.
            String detailUrl = item.getElementsByTag("a").attr("abs:href");
            if (detailUrl.isEmpty()) {
                // Without a detail page there is nothing to scrape for this item.
                continue;
            }

            Document detailPage = Jsoup.parse(new URL(detailUrl), FETCH_TIMEOUT_MILLIS);
            menuMapper.insert(buildMenu(item, detailPage));
        }

        Map<String,String> status = new HashMap<>();
        status.put("Jsoup","获取网页数据呀~");
        List<Map<String,String>> result = new ArrayList<>();
        result.add(status);
        return result;
    }

    /**
     * Builds a new {@link Menu} from one search-result item and its detail page.
     *
     * @param item       the search-result element the title and image are read from
     * @param detailPage the parsed recipe detail page
     * @return a freshly created entity with a random 32-character hex id
     */
    private static Menu buildMenu(Element item, Document detailPage) {
        String title = item.getElementsByClass("img").attr("title");
        // NOTE(review): the original comment here said the images are
        // lazy-loaded and that "data-lazy-img" should be read instead of
        // "src", yet the code reads "src". Behaviour is left unchanged;
        // confirm against the live page markup.
        String image = item.getElementsByTag("img").attr("src");

        // For the single-valued fields below the last matching element wins;
        // the value stays "" when nothing matches.
        String describe = "";
        for (Element info : detailPage.getElementsByClass("info2")) {
            describe = info.getElementsByTag("em").text() + ";" + info.getElementsByTag("strong").text();
        }

        String mainMaterial = "";
        for (Element ingredients : detailPage.getElementsByClass("recipe_ingredientsw")) {
            mainMaterial = ingredients.getElementsByTag("strong").text() + ingredients.getElementsByTag("a").text();
        }

        List<String> stepTexts = new ArrayList<>();
        for (Element stepNum : detailPage.getElementsByClass("recipe_step_num")) {
            stepTexts.add(stepNum.getElementsByTag("strong").text() + stepNum.getElementsByTag("p").text());
        }

        List<String> stepImages = new ArrayList<>();
        for (Element stepContent : detailPage.getElementsByClass("step_content")) {
            stepImages.add(stepContent.getElementsByTag("p").text()
                    + ";图片地址" + stepContent.getElementsByTag("img").attr("src"));
        }

        String finishedProduct = "";
        for (Element finished : detailPage.getElementsByClass("recipe_finish_box")) {
            Elements images = finished.getElementsByTag("img");
            finishedProduct = images.attr("title") + ":" + images.attr("src") + ";";
        }

        String skill = "";
        for (Element tips : detailPage.getElementsByClass("recipe_tips_words")) {
            skill = tips.getElementsByTag("p").text();
        }

        Menu recipe = new Menu();
        recipe.setId(UUID.randomUUID().toString().replace("-",""));
        recipe.setTitle(title);
        recipe.setImg(image);
        recipe.setDescribe(describe);
        recipe.setMainMaterial(mainMaterial);
        // Stored as the two lists' toString() forms concatenated, as before.
        recipe.setStep(stepTexts.toString() + stepImages.toString());
        recipe.setFinishedProduct(finishedProduct);
        recipe.setSkill("烹饪技巧: " + skill);
        return recipe;
    }

}

4.数据库
在这里插入图片描述
在这里插入图片描述
demo码云链接https://gitee.com/dongbingya/springboot/tree/master/Jsoup

  • 5
    点赞
  • 56
    收藏
    觉得还不错? 一键收藏
  • 7
    评论
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值