之前在写毕业论文的时候,帮同门处理过一个专业名称翻译的问题,从结果中筛选目标基因。想必 生信 的小伙伴都有这个烦恼吧~~
代码如下:主要是用 EasyExcel 读取数据,再用 Selenium 访问页面并控制访问频率,然后将数据逐条填入 有道翻译 的输入框中,抓取翻译结果后写入本地文件。
友情提示:需要使用 谷歌浏览器(Chrome),并下载与浏览器版本匹配的 ChromeDriver 哦。
import com.alibaba.excel.annotation.ExcelProperty;
/**
 * Row model used when reading the input spreadsheet with EasyExcel.
 *
 * <p>Every field carries an {@code @ExcelProperty} annotation naming the
 * column header it corresponds to. Field names keep their original
 * capitalisation because {@link #toString()} prints them verbatim.
 */
public class DataR {
    @ExcelProperty("Gene Family")
    private String GeneFamily;
    @ExcelProperty("B")
    private String B;
    @ExcelProperty("Module")
    private String Module;
    @ExcelProperty("Pathway")
    private String Pathway;
    @ExcelProperty("Name")
    private String Name;
    @ExcelProperty("EC")
    private String EC;
    @ExcelProperty("Description")
    private String Description;

    /** @return the value of the "Gene Family" column */
    public String getGeneFamily() {
        return GeneFamily;
    }

    /** @param geneFamily the value of the "Gene Family" column */
    public void setGeneFamily(String geneFamily) {
        this.GeneFamily = geneFamily;
    }

    /** @return the value of the "B" column */
    public String getB() {
        return B;
    }

    /** @param b the value of the "B" column */
    public void setB(String b) {
        this.B = b;
    }

    /** @return the value of the "Module" column */
    public String getModule() {
        return Module;
    }

    /** @param module the value of the "Module" column */
    public void setModule(String module) {
        this.Module = module;
    }

    /** @return the value of the "Pathway" column */
    public String getPathway() {
        return Pathway;
    }

    /** @param pathway the value of the "Pathway" column */
    public void setPathway(String pathway) {
        this.Pathway = pathway;
    }

    /** @return the value of the "Name" column */
    public String getName() {
        return Name;
    }

    /** @param name the value of the "Name" column */
    public void setName(String name) {
        this.Name = name;
    }

    /** @return the value of the "EC" column */
    public String getEC() {
        return EC;
    }

    /** @param EC the value of the "EC" column */
    public void setEC(String EC) {
        // The parameter shadows the field, so the assignment must be qualified
        // with "this"; an unqualified "EC = EC" only reassigns the parameter
        // and leaves the field unset.
        this.EC = EC;
    }

    /** @return the value of the "Description" column */
    public String getDescription() {
        return Description;
    }

    /** @param description the value of the "Description" column */
    public void setDescription(String description) {
        this.Description = description;
    }

    /** Renders all column values in a single line, mainly for console output. */
    @Override
    public String toString() {
        return "DataR{" +
                "GeneFamily='" + GeneFamily + '\'' +
                ", B='" + B + '\'' +
                ", Module='" + Module + '\'' +
                ", Pathway='" + Pathway + '\'' +
                ", Name='" + Name + '\'' +
                ", EC='" + EC + '\'' +
                ", Description='" + Description + '\'' +
                '}';
    }
}
import com.alibaba.excel.annotation.ExcelProperty;
/**
 * Row model used when writing result spreadsheets with EasyExcel.
 *
 * <p>It has the same columns as the read model plus a {@code Text} column
 * that holds the translated description. Each field is annotated with the
 * column header it is written under.
 */
public class DataW {

    @ExcelProperty("Gene Family")
    private String GeneFamily;

    @ExcelProperty("B")
    private String B;

    @ExcelProperty("Module")
    private String Module;

    @ExcelProperty("Pathway")
    private String Pathway;

    @ExcelProperty("Name")
    private String Name;

    @ExcelProperty("EC")
    private String EC;

    @ExcelProperty("Description")
    private String Description;

    @ExcelProperty("Text")
    private String Text;

    /** @return the "Gene Family" column value */
    public String getGeneFamily() {
        return this.GeneFamily;
    }

    /** @param geneFamily the "Gene Family" column value */
    public void setGeneFamily(final String geneFamily) {
        this.GeneFamily = geneFamily;
    }

    /** @return the "B" column value */
    public String getB() {
        return this.B;
    }

    /** @param b the "B" column value */
    public void setB(final String b) {
        this.B = b;
    }

    /** @return the "Module" column value */
    public String getModule() {
        return this.Module;
    }

    /** @param module the "Module" column value */
    public void setModule(final String module) {
        this.Module = module;
    }

    /** @return the "Pathway" column value */
    public String getPathway() {
        return this.Pathway;
    }

    /** @param pathway the "Pathway" column value */
    public void setPathway(final String pathway) {
        this.Pathway = pathway;
    }

    /** @return the "Name" column value */
    public String getName() {
        return this.Name;
    }

    /** @param name the "Name" column value */
    public void setName(final String name) {
        this.Name = name;
    }

    /** @return the "EC" column value */
    public String getEC() {
        return this.EC;
    }

    /** @param EC the "EC" column value */
    public void setEC(final String EC) {
        this.EC = EC;
    }

    /** @return the "Description" column value */
    public String getDescription() {
        return this.Description;
    }

    /** @param description the "Description" column value */
    public void setDescription(final String description) {
        this.Description = description;
    }

    /** @return the "Text" column value (the translated description) */
    public String getText() {
        return this.Text;
    }

    /** @param text the "Text" column value (the translated description) */
    public void setText(final String text) {
        this.Text = text;
    }
}
import com.alibaba.excel.context.AnalysisContext;
import com.alibaba.excel.metadata.CellExtra;
import com.alibaba.excel.read.listener.ReadListener;
import com.alibaba.excel.util.ListUtils;
import java.util.List;
import java.util.Map;
/**
 * EasyExcel read listener that translates the description of every row and
 * collects the results in the static lists of {@code Spider4EngClass}.
 *
 * <p>For each row the description is sent to
 * {@code Spider4EngClass.getChinese}. Rows with a non-empty translation are
 * appended to {@code Spider4EngClass.listAll}; those whose translation
 * contains one of the keyword characters are also printed and appended to
 * {@code Spider4EngClass.list}.
 *
 * <p>Only the two callbacks that carry logic are overridden; the remaining
 * {@code ReadListener} callbacks keep the interface defaults.
 */
public class DemoDataListener implements ReadListener<DataR> {
    /**
     * Number of rows after which {@link #saveData()} is called and the row
     * cache is replaced by a fresh list, so it does not grow without bound.
     */
    private static final int BATCH_COUNT = 100;

    /** Chinese character for antimony; a translation containing it is kept as a hit. */
    private static final String KEYWORD_ANTIMONY = "锑";

    /** Chinese character for sulfur; a translation containing it is kept as a hit. */
    private static final String KEYWORD_SULFUR = "硫";

    /** Rows received since the last batch boundary. */
    private List<DataR> cachedDataList = ListUtils.newArrayListWithExpectedSize(BATCH_COUNT);

    /** Creates a listener with an empty row cache. */
    public DemoDataListener() {
    }

    /**
     * Handles one parsed row: translates its description, records the result
     * and triggers the batch hook once {@link #BATCH_COUNT} rows are cached.
     *
     * @param data            the row that was just read
     * @param analysisContext EasyExcel read context (unused)
     * @throws RuntimeException if the thread is interrupted while translating,
     *                          or if the translation fails twice in a row
     */
    @Override
    public void invoke(DataR data, AnalysisContext analysisContext) {
        cachedDataList.add(data);

        String chinese = translateWithRetry(data.getDescription());
        if (chinese != null && !chinese.isEmpty()) {
            DataW row = toOutputRow(data, chinese);
            Spider4EngClass.listAll.add(row);
            if (chinese.contains(KEYWORD_ANTIMONY) || chinese.contains(KEYWORD_SULFUR)) {
                System.out.println(data.toString());
                Spider4EngClass.list.add(row);
            }
        }

        // Start a new cache at every batch boundary so the list stays small
        // even for very large sheets.
        if (cachedDataList.size() >= BATCH_COUNT) {
            saveData();
            cachedDataList = ListUtils.newArrayListWithExpectedSize(BATCH_COUNT);
        }
    }

    /**
     * Called once after the last row has been parsed. Nothing is flushed
     * here: the result lists are written by the code that started the read.
     *
     * @param context EasyExcel read context (unused)
     */
    @Override
    public void doAfterAllAnalysed(AnalysisContext context) {
    }

    /**
     * Translates a description, retrying exactly once when the first attempt
     * fails with an unchecked exception. A failure of the second attempt
     * propagates to the caller.
     */
    private static String translateWithRetry(String description) {
        try {
            try {
                return Spider4EngClass.getChinese(description, Spider4EngClass.chromeDriver);
            } catch (RuntimeException firstFailure) {
                System.err.println("Translation failed, retrying once: " + firstFailure);
                return Spider4EngClass.getChinese(description, Spider4EngClass.chromeDriver);
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can see
            // that the thread was asked to stop.
            Thread.currentThread().interrupt();
            throw new RuntimeException("Interrupted while translating: " + description, e);
        }
    }

    /** Copies all columns of a source row into an output row and attaches the translation. */
    private static DataW toOutputRow(DataR source, String translation) {
        DataW row = new DataW();
        row.setGeneFamily(source.getGeneFamily());
        row.setB(source.getB());
        row.setModule(source.getModule());
        row.setPathway(source.getPathway());
        row.setEC(source.getEC());
        row.setName(source.getName());
        row.setText(translation);
        row.setDescription(source.getDescription());
        return row;
    }

    /**
     * Batch hook. Nothing is persisted here; it only reports on the console
     * that another {@link #BATCH_COUNT} rows have been processed.
     */
    private void saveData() {
        System.out.println("已经处理了" + BATCH_COUNT+ "条数据");
    }
}
import com.alibaba.excel.EasyExcel;
import com.alibaba.excel.util.ListUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import java.util.ArrayList;
/**
 * Translates microbial metabolism names from a spreadsheet into Chinese by
 * driving the Youdao Translate web page through a headless Chrome session.
 *
 * <p>{@code main} reads {@link #pathAAA} with a {@code DemoDataListener},
 * then writes {@link #list} to {@link #pathBBB} and {@link #listAll} to
 * {@link #pathAll}.
 */
public class Spider4EngClass {
    /** Youdao Translate page that is opened for every translation. */
    static String e2cUrl = "https://fanyi.youdao.com/index.html#/";
    /** Input workbook. */
    static String pathAAA = "D:\\aaa.xlsx";
    /** Output workbook that receives the rows in {@link #list}. */
    static String pathBBB = "D:\\bbb.xlsx";
    /** Output workbook that receives the rows in {@link #listAll}. */
    static String pathAll = "D:\\ccc.xlsx";
    /** Shared browser session; created in {@code main} and replaced when the session is restarted. */
    static ChromeDriver chromeDriver = null;
    /** Rows written to the "结果" sheet. */
    static ArrayList<DataW> list = ListUtils.newArrayListWithExpectedSize(6000);
    /** Rows written to the "全部" sheet. */
    static ArrayList<DataW> listAll = ListUtils.newArrayListWithExpectedSize(6000);

    /** Location of the chromedriver executable. */
    private static final String CHROME_DRIVER_PATH = "D:\\chromedriver-win64\\chromedriver.exe";
    /** Number of open window handles at which the browser session is restarted. */
    private static final int MAX_WINDOW_HANDLES = 20;
    /** Fixed wait, in milliseconds, after loading the page and after typing the text. */
    private static final long SETTLE_MILLIS = 2000L;
    /** Marker returned when no translation could be obtained. */
    private static final String UNTRANSLATABLE = "无法翻译";

    /**
     * Runs the whole pipeline: start the browser, read and translate the
     * input workbook, write both result workbooks, and close the browser.
     *
     * @param args unused
     * @throws InterruptedException kept for signature compatibility
     */
    public static void main(String[] args) throws InterruptedException {
        chromeDriver = newDriver();
        try {
            EasyExcel.read(pathAAA, DataR.class, new DemoDataListener()).sheet().doRead();
            EasyExcel.write(pathBBB, DataW.class).sheet("结果").doWrite(list);
            EasyExcel.write(pathAll, DataW.class).sheet("全部").doWrite(listAll);
        } finally {
            // Always close the browser, also when reading or writing fails;
            // the field is read again here because a restart may have
            // replaced the session.
            chromeDriver.quit();
        }
    }

    /**
     * Translates one piece of text by typing it into the Youdao Translate
     * page and reading the output element.
     *
     * <p>When the given session has {@value #MAX_WINDOW_HANDLES} or more
     * window handles open it is quit and a new one is started. If the given
     * session is the shared {@link #chromeDriver}, the shared field is
     * updated to the new session so later callers do not use the closed one.
     * A caller that passes its own session cannot observe the replacement.
     *
     * @param text         the text to translate; {@code null} is reported as
     *                     untranslatable without touching the browser
     * @param chromeDriver the browser session to use
     * @return the text of the output element, or {@code "无法翻译"} when the
     *         input is {@code null} or the output cannot be read
     * @throws InterruptedException if the thread is interrupted during one of
     *                              the fixed waits
     */
    public static String getChinese(String text, ChromeDriver chromeDriver) throws InterruptedException {
        if (text == null) {
            return UNTRANSLATABLE;
        }

        if (chromeDriver.getWindowHandles().size() >= MAX_WINDOW_HANDLES) {
            boolean isSharedSession = chromeDriver == Spider4EngClass.chromeDriver;
            chromeDriver.quit();
            chromeDriver = newDriver();
            if (isSharedSession) {
                Spider4EngClass.chromeDriver = chromeDriver;
            }
        }

        chromeDriver.get(e2cUrl);
        // Required: give the page time to load, otherwise its content is empty.
        Thread.sleep(SETTLE_MILLIS);

        By inputLocator = By.id("js_fanyi_input");
        By outputLocator = By.id("js_fanyi_output_resultOutput");

        // A space is inserted after every hyphen before the text is typed.
        // NOTE(review): presumably this helps the page split hyphenated names; confirm.
        String query = text.replace("-", "- ");

        chromeDriver.findElement(inputLocator).clear();
        chromeDriver.findElement(inputLocator).sendKeys(query);
        Thread.sleep(SETTLE_MILLIS);

        try {
            return chromeDriver.findElement(outputLocator).getText();
        } catch (RuntimeException e) {
            // Best effort: a missing or unreadable output element is reported
            // with the marker instead of aborting the whole run.
            return UNTRANSLATABLE;
        }
    }

    /** Starts a new headless Chrome session with the options this tool relies on. */
    private static ChromeDriver newDriver() {
        System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--remote-allow-origins=*");
        options.addArguments("--headless=new");
        return new ChromeDriver(options);
    }
}