Java+xpath爬虫实现食物数据抓取

package com.lenovo.lhp.food;

import com.lenovo.lhp.food.entity.Food;
import com.lenovo.lhp.food.entity.FoodEnergyComposition;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;

import java.math.BigDecimal;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Random;

public class FoodFetch {

    private static String driver;//连接数据库的驱动
    private static String url;
    private static String username;
    private static String password;

    static {
        driver = "com.mysql.cj.jdbc.Driver";//需要的数据库驱动
        url = "jdbc:mysql://10.121.121.216:30040/lhp_defaults";//数据库名路径
        username = "root";
        password = "passw0rd";
    }

    /**
     * Opens a JDBC connection using the class-level {@code driver}, {@code url},
     * {@code username} and {@code password} settings.
     *
     * @return the new connection, or {@code null} when loading the driver class or connecting
     *         throws; in that case a message is printed and the stack trace is dumped
     */
    public static Connection open() {
        Connection connection = null;
        try {
            // Load the driver class by name before asking DriverManager for a connection.
            Class.forName(driver);
            connection = DriverManager.getConnection(url, username, password);
        } catch (Exception e) {
            System.out.println("数据库连接失败!");
            e.printStackTrace();
        }
        return connection;
    }

    /**
     * Closes the given database connection.
     *
     * <p>A {@code null} argument is ignored. A {@link SQLException} raised while closing is
     * not rethrown; its stack trace is printed instead.
     *
     * @param conn the connection to close, may be {@code null}
     */
    public static void close(Connection conn) {
        if (conn == null) {
            return;
        }
        try {
            conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Inserts a row into {@code com_food}, or updates the existing row when the insert hits a
     * duplicate key.
     *
     * <p>The statement is closed before returning, whether or not execution succeeds.
     *
     * @param id                      primary key of the food row; must not be {@code null}
     *                                because it is unboxed for {@code setLong}
     * @param chineseName             value for {@code chinese_name}
     * @param primaryClassification   value for {@code primary_classification}
     * @param secondaryClassification value for {@code secondary_classification}, may be {@code null}
     * @param conn                    open connection to run the statement on
     * @return the value reported by {@code executeUpdate()}, or {@code 0} when a
     *         {@link SQLException} occurred (its stack trace is printed)
     */
    static int insertFood(Long id, String chineseName, String primaryClassification, String secondaryClassification, Connection conn) {
        String sql = "insert into com_food (id,chinese_name,primary_classification,secondary_classification)\n" +
                "        values (?,?,?,?)\n" +
                "            on duplicate key update\n" +
                "                                 id = ?,\n" +
                "                                 chinese_name = ?,\n" +
                "                                 primary_classification = ?,\n" +
                "                                 secondary_classification = ?";

        int i = 0;
        // try-with-resources releases the statement even if binding or execution throws.
        try (PreparedStatement pstmt = conn.prepareStatement(sql)) {
            // Placeholders 1-4 feed the VALUES list; 5-8 repeat the same values for the UPDATE part.
            pstmt.setLong(1, id);
            pstmt.setString(2, chineseName);
            pstmt.setString(3, primaryClassification);
            pstmt.setString(4, secondaryClassification);
            pstmt.setLong(5, id);
            pstmt.setString(6, chineseName);
            pstmt.setString(7, primaryClassification);
            pstmt.setString(8, secondaryClassification);
            i = pstmt.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return i;
    }

    /**
     * Inserts a row into {@code com_food_energy_composition}, or updates the existing row when
     * the insert hits a duplicate key.
     *
     * <p>The statement is closed before returning, whether or not execution succeeds.
     *
     * @param foodEnergyComposition the values to store; its id and food id are unboxed for
     *                              {@code setLong}
     * @param conn                  open connection to run the statement on
     * @return the value reported by {@code executeUpdate()}, or {@code 0} when a
     *         {@link SQLException} occurred (its stack trace is printed)
     */
    static int insertFoodEnergyComposition(FoodEnergyComposition foodEnergyComposition, Connection conn) {
        String sql = "insert into com_food_energy_composition (id,food_id,edible,edible_unit,moisture_content,moisture_content_unit,energy,\n" +
                "            energy_unit,protein,protein_unit,fat,fat_unit,cholesterol,cholesterol_unit,ash_content,ash_content_unit,\n" +
                "            carbohydrate,carbohydrate_unit,dietary_fiber,dietary_fiber_unit)\n" +
                "        values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)\n" +
                "        on duplicate key update\n" +
                "             id = ?,\n" +
                "             food_id = ?,\n" +
                "             edible = ?,\n" +
                "             edible_unit = ?,\n" +
                "             moisture_content = ?,\n" +
                "             moisture_content_unit = ?,\n" +
                "             energy = ?,\n" +
                "             energy_unit = ?,\n" +
                "             protein = ?,\n" +
                "             protein_unit = ?,\n" +
                "             fat = ?,\n" +
                "             fat_unit = ?,\n" +
                "             cholesterol =?,\n" +
                "             cholesterol_unit = ?,\n" +
                "             ash_content = ?,\n" +
                "             ash_content_unit = ?,\n" +
                "             carbohydrate = ?,\n" +
                "             carbohydrate_unit = ?,\n" +
                "             dietary_fiber = ?,\n" +
                "             dietary_fiber_unit = ?";
        int i = 0;
        // try-with-resources releases the statement even if binding or execution throws.
        try (PreparedStatement pstmt = conn.prepareStatement(sql)) {
            // Placeholders 1-20 feed the VALUES list; 21-40 repeat them for the UPDATE part.
            bindEnergyComposition(pstmt, foodEnergyComposition, 0);
            bindEnergyComposition(pstmt, foodEnergyComposition, 20);
            i = pstmt.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return i;
    }

    /**
     * Binds the twenty column values of {@code composition} to placeholders
     * {@code offset + 1} through {@code offset + 20}, in the column order used by the SQL above.
     */
    private static void bindEnergyComposition(PreparedStatement pstmt, FoodEnergyComposition composition, int offset) throws SQLException {
        pstmt.setLong(offset + 1, composition.getId());
        pstmt.setLong(offset + 2, composition.getFoodId());
        pstmt.setBigDecimal(offset + 3, composition.getEdible());
        pstmt.setString(offset + 4, composition.getEdibleUnit());
        pstmt.setBigDecimal(offset + 5, composition.getMoistureContent());
        pstmt.setString(offset + 6, composition.getMoistureContentUnit());
        pstmt.setBigDecimal(offset + 7, composition.getEnergy());
        pstmt.setString(offset + 8, composition.getEnergyUnit());
        pstmt.setBigDecimal(offset + 9, composition.getProtein());
        pstmt.setString(offset + 10, composition.getProteinUnit());
        pstmt.setBigDecimal(offset + 11, composition.getFat());
        pstmt.setString(offset + 12, composition.getFatUnit());
        pstmt.setBigDecimal(offset + 13, composition.getCholesterol());
        pstmt.setString(offset + 14, composition.getCholesterolUnit());
        pstmt.setBigDecimal(offset + 15, composition.getAshContent());
        pstmt.setString(offset + 16, composition.getAshContentUnit());
        pstmt.setBigDecimal(offset + 17, composition.getCarbohydrate());
        pstmt.setString(offset + 18, composition.getCarbohydrateUnit());
        pstmt.setBigDecimal(offset + 19, composition.getDietaryFiber());
        pstmt.setString(offset + 20, composition.getDietaryFiberUnit());
    }

    /**
     * Drives a Chrome browser through the food list at {@code yycx.yybq.net}, opens each
     * entry's detail page, reads its energy/composition table and stores the result through
     * {@link #insertFood} and {@link #insertFoodEnergyComposition}.
     *
     * <p>The list is processed in windows of ten entries (positions 1-10, 11-20, ...). After
     * each window the page is scrolled and the next window is queried. The crawl stops at the
     * first window that yields no entries; the browser and the database connection are always
     * released on the way out.
     *
     * @param args unused
     * @throws Exception if the crawl is interrupted, the database connection cannot be opened,
     *                   or Selenium reports an error
     */
    public static void main(String[] args) throws Exception {

        // Tell Selenium where the ChromeDriver executable is installed.
        System.setProperty("webdriver.chrome.driver", "D:\\software\\chromedriver_win32\\chromedriver.exe");
        WebDriver driver = new ChromeDriver();
        Connection conn = null;
        try {
            JavascriptExecutor executor = (JavascriptExecutor) driver;
            driver.get("http://yycx.yybq.net/searchlist__1.htm");
            driver.manage().window().maximize();
            Thread.sleep(3000);

            // Click the search button so the result list is shown.
            driver.findElement(By.xpath("/html/body/div[2]/div[1]/button")).click();
            Thread.sleep(3000);
            conn = FoodFetch.open();
            if (conn == null) {
                // open() already reported the cause; crawling without a database is pointless.
                throw new IllegalStateException("Database connection could not be opened");
            }

            // 1. take ten entries from the list at a time
            // 2. scroll so the list is refreshed before the next window
            int i = 1;
            while (true) {
                // positions 1-10, then 11-20, ...
                List<WebElement> elements = driver.findElements(By.xpath("//div[@class='public_margin_top ysq_div_list']/div[position()>=" + i + " and position()<=" + (i + 9) + "]/a"));
                Thread.sleep(3000);

                if (elements.isEmpty()) {
                    // Nothing at these positions: treat it as the end of the list instead of
                    // scrolling forever.
                    break;
                }

                for (WebElement element : elements) {
                    scrapeAndStore(driver, executor, element, conn);
                }
                executor.executeScript("window.scrollBy(0,5000)");
                Thread.sleep(4000);
                i = i + 10;
            }
        } finally {
            close(conn);
            // Shut the browser down.
            driver.quit();
        }
    }

    /**
     * Opens the detail page behind one list entry, reads its composition table, navigates
     * back, then reads the entry's name and classification from the list and stores both records.
     */
    private static void scrapeAndStore(WebDriver driver, JavascriptExecutor executor, WebElement element, Connection conn) {
        Food food = new Food();
        food.setId(Long.valueOf(getRandomUUID()));

        FoodEnergyComposition foodEnergyComposition = new FoodEnergyComposition();
        foodEnergyComposition.setId(Long.valueOf(getRandomUUID()));
        foodEnergyComposition.setFoodId(food.getId());

        // Enter the detail page.
        executor.executeScript("arguments[0].click();", element);
        pause(3000);

        // Rows of the first details table; row 0 is skipped, rows 1-9 hold the values read below.
        List<WebElement> rows = driver.findElements(By.xpath("//div[@class='details_table'][1]//tr"));

        foodEnergyComposition.setEdible(readAmount(rows, 1, "%"));          // edible part
        foodEnergyComposition.setEdibleUnit("%");

        foodEnergyComposition.setMoistureContent(readAmount(rows, 2, "g")); // moisture
        foodEnergyComposition.setMoistureContentUnit("g");

        foodEnergyComposition.setEnergy(readAmount(rows, 3, "KJ"));         // energy
        foodEnergyComposition.setEnergyUnit("KJ");

        foodEnergyComposition.setProtein(readAmount(rows, 4, "g"));         // protein
        foodEnergyComposition.setProteinUnit("g");

        foodEnergyComposition.setFat(readAmount(rows, 5, "g"));             // fat
        foodEnergyComposition.setFatUnit("g");

        foodEnergyComposition.setCholesterol(readAmount(rows, 6, "g"));     // cholesterol
        foodEnergyComposition.setCholesterolUnit("g");

        foodEnergyComposition.setAshContent(readAmount(rows, 7, "g"));      // ash
        foodEnergyComposition.setAshContentUnit("g");

        foodEnergyComposition.setCarbohydrate(readAmount(rows, 8, "g"));    // carbohydrate
        foodEnergyComposition.setCarbohydrateUnit("g");

        foodEnergyComposition.setDietaryFiber(readAmount(rows, 9, "g"));    // total dietary fiber
        foodEnergyComposition.setDietaryFiberUnit("g");

        // Go back to the list page once the details have been read.
        driver.navigate().back();
        pause(10000);

        // NOTE(review): `element` was located before leaving the list page; whether the
        // reference is still usable after back() depends on how the site/browser restores
        // the page - confirm.
        for (WebElement child : element.findElements(By.xpath("./*"))) {
            if (!"weui_media_bd".equals(child.getAttribute("class"))) {
                continue;
            }
            for (WebElement field : child.findElements(By.xpath("./*"))) {
                String cssClass = field.getAttribute("class");
                if ("list_title".equals(cssClass)) {
                    String title = field.getText();
                    food.setChineseName(title);
                    System.out.println(title);
                } else if ("list_type".equals(cssClass)) {
                    String typeText = field.getText();
                    // "primary-secondary"; any other shape leaves the secondary classification null.
                    String[] classify = typeText.split("-");
                    food.setPrimaryClassification(classify[0]);
                    if (classify.length == 2) {
                        food.setSecondaryClassification(classify[1]);
                    } else {
                        food.setSecondaryClassification(null);
                    }

                    // Store the composition only when the food row itself was written.
                    if (insertFood(food.getId(), food.getChineseName(), food.getPrimaryClassification(), food.getSecondaryClassification(), conn) > 0) {
                        insertFoodEnergyComposition(foodEnergyComposition, conn);
                    }
                    System.out.println(typeText);
                }
            }
        }
    }

    /**
     * Reads the second cell of table row {@code rowIndex} and parses the text before
     * {@code unit} as a number; returns {@code 0.0} when the cell is missing or unparsable.
     */
    private static BigDecimal readAmount(List<WebElement> rows, int rowIndex, String unit) {
        // A missing row is not caught here, so it aborts the crawl.
        List<WebElement> cells = rows.get(rowIndex).findElements(By.xpath("./*"));
        try {
            return new BigDecimal(cells.get(1).getText().split(unit)[0]);
        } catch (Exception e) {
            return new BigDecimal("0.0");
        }
    }

    /** Sleeps for {@code millis} milliseconds; on interruption logs it and restores the interrupt flag. */
    private static void pause(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            e.printStackTrace();
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Builds an 18-character numeric identifier: the current time formatted as
     * {@code yyyyMMddHHmmss} followed by four random decimal digits.
     *
     * <p>Despite the name this is not an RFC 4122 UUID, and uniqueness is not guaranteed:
     * two calls within the same second share the timestamp part and differ only in the four
     * random digits.
     *
     * @return the timestamp-plus-random identifier
     */
    public static String getRandomUUID() {
        // 1. timestamp prefix
        StringBuilder sb = new StringBuilder(new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()));
        // 2. four random digits; nextInt(10) is always in 0-9, unlike Math.abs(nextInt()) % 10,
        //    which is negative when nextInt() returns Integer.MIN_VALUE
        Random rd = new Random();
        for (int k = 0; k < 4; k++) {
            sb.append(rd.nextInt(10));
        }
        // 3. return the assembled code
        return sb.toString();
    }
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值