Java爬虫获取有道词典专业名词翻译结果,EasyExcel写入

之前在写毕业论文的时候,帮同门处理过一个专业名称翻译的问题,从结果中筛选目标基因。想必 生信 的小伙伴都有这个烦恼吧~~

代码如下:先用 EasyExcel 读取数据,再用 Selenium 访问有道翻译页面并控制访问频率,把数据逐条填入输入框,抓取翻译结果后写入本地文件。

友情提示:需要使用谷歌浏览器(Chrome),并下载与浏览器版本匹配的 ChromeDriver 哦。



import com.alibaba.excel.annotation.ExcelProperty;
/**
 * Row model for the input workbook that is read with EasyExcel.
 *
 * <p>Each field carries an {@code @ExcelProperty} annotation naming the header
 * of the column it maps to. The field names are deliberately left unchanged:
 * the Excel library accesses this class reflectively, so renaming them could
 * affect column binding (NOTE(review): confirm before renaming).
 */
public class DataR {
    @ExcelProperty("Gene Family")
    private String GeneFamily;
    @ExcelProperty("B")
    private String B;
    @ExcelProperty("Module")
    private String Module;
    @ExcelProperty("Pathway")
    private String Pathway;
    @ExcelProperty("Name")
    private String Name;
    @ExcelProperty("EC")
    private String EC;
    @ExcelProperty("Description")
    private String Description;

    public String getGeneFamily() {
        return GeneFamily;
    }

    public void setGeneFamily(String geneFamily) {
        this.GeneFamily = geneFamily;
    }

    public String getB() {
        return B;
    }

    public void setB(String b) {
        this.B = b;
    }

    public String getModule() {
        return Module;
    }

    public void setModule(String module) {
        this.Module = module;
    }

    public String getPathway() {
        return Pathway;
    }

    public void setPathway(String pathway) {
        this.Pathway = pathway;
    }

    public String getName() {
        return Name;
    }

    public void setName(String name) {
        this.Name = name;
    }

    public String getEC() {
        return EC;
    }

    /**
     * Sets the EC value.
     *
     * @param EC value of the "EC" column
     */
    public void setEC(String EC) {
        // The parameter has the same name as the field and shadows it, so the
        // field must be qualified with "this"; an unqualified "EC = EC" only
        // assigns the parameter to itself and leaves the field unset.
        this.EC = EC;
    }

    public String getDescription() {
        return Description;
    }

    public void setDescription(String description) {
        this.Description = description;
    }

    /** Renders all column values, used for console output of matching rows. */
    @Override
    public String toString() {
        return "DataR{" +
                "GeneFamily='" + GeneFamily + '\'' +
                ", B='" + B + '\'' +
                ", Module='" + Module + '\'' +
                ", Pathway='" + Pathway + '\'' +
                ", Name='" + Name + '\'' +
                ", EC='" + EC + '\'' +
                ", Description='" + Description + '\'' +
                '}';
    }
}

import com.alibaba.excel.annotation.ExcelProperty;
/**
 * Row model for the output workbooks written with EasyExcel.
 *
 * <p>Carries the same seven columns as the input row plus a {@code Text}
 * column that holds the translated description. The field declaration order
 * is kept exactly as it was (NOTE(review): the writer presumably derives the
 * column order from it — confirm before reordering).
 */
public class DataW {

    // ---- columns copied from the input row ----
    @ExcelProperty("Gene Family")
    private String GeneFamily;
    @ExcelProperty("B")
    private String B;
    @ExcelProperty("Module")
    private String Module;
    @ExcelProperty("Pathway")
    private String Pathway;
    @ExcelProperty("Name")
    private String Name;
    @ExcelProperty("EC")
    private String EC;
    @ExcelProperty("Description")
    private String Description;

    // ---- column added by the translation step ----
    @ExcelProperty("Text")
    private String Text;

    // ---- getters ----

    public String getGeneFamily() {
        return this.GeneFamily;
    }

    public String getB() {
        return this.B;
    }

    public String getModule() {
        return this.Module;
    }

    public String getPathway() {
        return this.Pathway;
    }

    public String getName() {
        return this.Name;
    }

    public String getEC() {
        return this.EC;
    }

    public String getDescription() {
        return this.Description;
    }

    public String getText() {
        return this.Text;
    }

    // ---- setters ----

    public void setGeneFamily(String geneFamily) {
        this.GeneFamily = geneFamily;
    }

    public void setB(String b) {
        this.B = b;
    }

    public void setModule(String module) {
        this.Module = module;
    }

    public void setPathway(String pathway) {
        this.Pathway = pathway;
    }

    public void setName(String name) {
        this.Name = name;
    }

    public void setEC(String EC) {
        this.EC = EC;
    }

    public void setDescription(String description) {
        this.Description = description;
    }

    public void setText(String text) {
        this.Text = text;
    }
}

import com.alibaba.excel.context.AnalysisContext;
import com.alibaba.excel.metadata.CellExtra;
import com.alibaba.excel.read.listener.ReadListener;
import com.alibaba.excel.util.ListUtils;

import java.util.List;
import java.util.Map;


/**
 * EasyExcel read listener that translates the {@code Description} of every
 * input row and collects the results in the static lists of
 * {@link Spider4EngClass}.
 *
 * <p>Every row with a non-empty translation is added to
 * {@code Spider4EngClass.listAll}; rows whose translation contains "锑"
 * (antimony) or "硫" (sulfur) are additionally printed and added to
 * {@code Spider4EngClass.list}.
 *
 * <p>The optional callbacks of {@code ReadListener} (onException, invokeHead,
 * extra, hasNext) are intentionally not overridden: the previous overrides
 * only delegated to the interface defaults, which are inherited anyway.
 */
public class DemoDataListener implements ReadListener<DataR> {

    /**
     * Number of rows between two progress messages.
     */
    private static final int BATCH_COUNT = 100;

    /**
     * Rows seen since the last progress message. Only its size is used; it is
     * replaced by a fresh list after each message so memory stays bounded.
     */
    private List<DataR> cachedDataList = ListUtils.newArrayListWithExpectedSize(BATCH_COUNT);

    public DemoDataListener() {
        // Nothing to initialise: results go to the static lists of Spider4EngClass.
    }

    /**
     * Handles one parsed row: translates its description and records the result.
     *
     * @param data            the parsed row
     * @param analysisContext EasyExcel read context (unused)
     * @throws RuntimeException if the thread is interrupted while translating,
     *                          or if translation fails twice in a row
     */
    @Override
    public void invoke(DataR data, AnalysisContext analysisContext) {
        cachedDataList.add(data);

        String description = data.getDescription();
        if (description == null || description.trim().isEmpty()) {
            // An empty Description cell has nothing to translate. Previously
            // this caused a NullPointerException that aborted the whole read.
            System.out.println("Description 为空,跳过: " + data);
        } else {
            String chinese = translateWithRetry(description);
            if (chinese != null && !chinese.isEmpty()) {
                record(data, chinese);
            }
        }

        // Report progress every BATCH_COUNT rows and start a new counting list.
        if (cachedDataList.size() >= BATCH_COUNT) {
            saveData();
            cachedDataList = ListUtils.newArrayListWithExpectedSize(BATCH_COUNT);
        }
    }

    /**
     * Called once after all rows were parsed. Nothing to do here: the collected
     * lists are written out by {@code Spider4EngClass.main} after reading.
     *
     * @param context EasyExcel read context (unused)
     */
    @Override
    public void doAfterAllAnalysed(AnalysisContext context) {
    }

    /**
     * Translates a description, retrying exactly once if the first attempt
     * fails with a runtime error (e.g. the page was not ready yet).
     */
    private static String translateWithRetry(String description) {
        try {
            try {
                return Spider4EngClass.getChinese(description, Spider4EngClass.chromeDriver);
            } catch (RuntimeException firstFailure) {
                System.err.println("翻译失败,重试一次: " + firstFailure);
                return Spider4EngClass.getChinese(description, Spider4EngClass.chromeDriver);
            }
        } catch (InterruptedException e) {
            // Do not retry on interruption; restore the flag so callers can see it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * Builds the output row and adds it to the result lists.
     */
    private static void record(DataR source, String chinese) {
        DataW row = new DataW();
        row.setGeneFamily(source.getGeneFamily());
        row.setB(source.getB());
        row.setModule(source.getModule());
        row.setPathway(source.getPathway());
        row.setEC(source.getEC());
        row.setName(source.getName());
        row.setText(chinese);
        row.setDescription(source.getDescription());
        Spider4EngClass.listAll.add(row);

        // Rows mentioning antimony or sulfur are the ones of interest.
        if (chinese.contains("锑") || chinese.contains("硫")) {
            System.out.println(source.toString());
            Spider4EngClass.list.add(row);
        }
    }

    /**
     * Prints the progress message for a completed batch.
     */
    private void saveData() {
        System.out.println("已经处理了" + BATCH_COUNT+ "条数据");
    }
}

import com.alibaba.excel.EasyExcel;
import com.alibaba.excel.util.ListUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

import java.util.ArrayList;

/**
 * Translates the descriptions of a microbial-metabolism gene table through the
 * Youdao Translate web page (driven by headless Chrome) and writes the results
 * to two Excel files: one with the rows of interest, one with all rows.
 */
public class Spider4EngClass {

    static String e2cUrl = "https://fanyi.youdao.com/index.html#/"; // Youdao Translate page
    static String pathAAA = "D:\\aaa.xlsx"; // input workbook
    static String pathBBB = "D:\\bbb.xlsx"; // output: rows added to "list"
    static String pathAll = "D:\\ccc.xlsx"; // output: rows added to "listAll"
    /** Browser session shared with DemoDataListener; replaced when the browser is restarted. */
    static ChromeDriver chromeDriver = null;
    static ArrayList<DataW> list = ListUtils.newArrayListWithExpectedSize(6000);
    static ArrayList<DataW> listAll = ListUtils.newArrayListWithExpectedSize(6000);

    private static final String CHROME_DRIVER_PATH = "D:\\chromedriver-win64\\chromedriver.exe";
    /** Number of open window handles at which the browser is restarted. */
    private static final int MAX_WINDOW_HANDLES = 20;
    /** Fixed wait for the page to load and for the translation to appear. */
    private static final long PAGE_SETTLE_MILLIS = 2000;

    /**
     * Reads the input workbook (each row is translated by DemoDataListener),
     * then writes both result workbooks.
     *
     * @param args unused
     * @throws InterruptedException kept for signature compatibility
     */
    public static void main(String[] args) throws InterruptedException {
        chromeDriver = newDriver();
        try {
            EasyExcel.read(pathAAA, DataR.class, new DemoDataListener()).sheet().doRead();
            EasyExcel.write(pathBBB, DataW.class).sheet("结果").doWrite(list);
            EasyExcel.write(pathAll, DataW.class).sheet("全部").doWrite(listAll);
        } finally {
            // Always close the browser, even when reading or writing fails;
            // otherwise the Chrome process is left running.
            chromeDriver.quit();
        }
    }

    /**
     * Translates {@code text} by typing it into the Youdao Translate page and
     * reading the output element.
     *
     * <p>If the given driver has reached {@code MAX_WINDOW_HANDLES} open
     * windows it is quit and replaced by a new session. When the given driver
     * is the shared static {@code chromeDriver}, the static field is updated
     * to the new session as well, so later calls do not use a closed browser.
     *
     * @param text         text to translate; {@code null} or empty yields ""
     * @param chromeDriver browser session to use
     * @return the text of the output element, "无法翻译" if it could not be
     *         read, or "" when there was nothing to translate
     * @throws InterruptedException if interrupted during one of the fixed waits
     */
    public static String getChinese(String text, ChromeDriver chromeDriver) throws InterruptedException {
        if (text == null || text.isEmpty()) {
            return "";
        }

        if (chromeDriver.getWindowHandles().size() >= MAX_WINDOW_HANDLES) {
            boolean isSharedDriver = chromeDriver == Spider4EngClass.chromeDriver;
            chromeDriver.quit();
            chromeDriver = newDriver();
            if (isSharedDriver) {
                Spider4EngClass.chromeDriver = chromeDriver;
            }
        }

        chromeDriver.get(e2cUrl);
        Thread.sleep(PAGE_SETTLE_MILLIS); // required: without it the page content is still empty

        By.ById inputBox = new By.ById("js_fanyi_input");
        By.ById outputBox = new By.ById("js_fanyi_output_resultOutput");

        // A space is inserted after every hyphen before typing
        // (NOTE(review): presumably so hyphenated terms are translated word by word).
        String query = text.replace("-", "- ");

        // The input element is looked up again after clearing, as before.
        chromeDriver.findElement(inputBox).clear();
        chromeDriver.findElement(inputBox).sendKeys(query);
        Thread.sleep(PAGE_SETTLE_MILLIS);

        try {
            WebElement output = chromeDriver.findElement(outputBox);
            return output.getText();
        } catch (RuntimeException outputUnavailable) {
            // Deliberate best effort: a missing/unreadable output element is
            // reported as "cannot translate" instead of failing the run.
            return "无法翻译";
        }
    }

    /**
     * Starts a new headless Chrome session with the options this tool needs.
     */
    private static ChromeDriver newDriver() {
        System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--remote-allow-origins=*");
        options.addArguments("--headless=new");
        return new ChromeDriver(options);
    }
}

下一步升级思路:调用本地大模型进行翻译,替代远程访问,提高代码性能,有望将耗时从几小时压缩到几分钟。:)

希望对你有所帮助,欢迎留言。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值