Java Invokes a Python Crawler on Linux and Stores the Data in Elasticsearch -- (Java Backend Code)

This post covers the Java code only. For the corresponding shell script and the Java-to-Elasticsearch utility class, please see the previous post (https://www.cnblogs.com/chenyuanbo/p/9973685.html).

1. Utility class that connects to the server and executes Linux scripts

package com.yjlc.platform.utils.Elasticsearch;

import ch.ethz.ssh2.Connection;
import ch.ethz.ssh2.StreamGobbler;

import java.io.*;
/**
 * --------------------------------------------------------------
 * CopyRights(c)2018,YJLC
 * All Rights Reserved
 * <p>
 * FileName: SingletonUtil.java
 * Description:
 * Author: cyb
 * CreateDate: 2018-11-15
 * --------------------------------------------------------------
 */
public class SingletonUtil {
    private volatile static SingletonUtil instance;
    //default character encoding is UTF-8
    public static String DEFAULTCHART = "UTF-8";
    public static Connection conn;
    private String ip;
    private String userName;
    private String userPwd;
    public static Boolean flag = false;

    //private constructor: instances are only created through getInstance()
    private SingletonUtil(String ip, String userName, String userPwd) {
        this.ip = ip;
        this.userName = userName;
        this.userPwd = userPwd;
    }

    public static SingletonUtil getInstance(String ip, String userName, String userPwd) {
        if (instance == null) {
            synchronized (SingletonUtil.class) {
                //double-checked locking: prevents multiple threads from creating more than one instance
                if (instance == null) {
                    instance = new SingletonUtil(ip, userName, userPwd);
                }
            }
        }
        flag = instance.login(); //connect and authenticate
        return instance;
    }
    //log in: connect to the server and authenticate with the password
    public Boolean login() {
        boolean flg = false;
        try {
            System.out.println("connecting...");
            conn = new Connection(ip);
            conn.connect(); //open the connection
            flg = conn.authenticateWithPassword(userName, userPwd); //authenticate
            if (flg) {
                System.out.println("authentication succeeded!");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return flg;
    }

    /**
     * @description: read a stream and return its contents as plain text
     * @author: cyb
     * @date: 2018-11-15 16:56
     * @param: in
     * @param: charset
     * @return: java.lang.String
     */
    public static String processStdout(InputStream in, String charset) {
        InputStream stdout = new StreamGobbler(in);
        StringBuilder buffer = new StringBuilder();
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(stdout, charset));
            String line;
            while ((line = br.readLine()) != null) {
                buffer.append(line).append("\n");
            }
        } catch (IOException e) {
            //UnsupportedEncodingException is a subclass of IOException, so one catch block covers both
            e.printStackTrace();
        }
        return buffer.toString();
    }
}
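The Connection and StreamGobbler classes above (and the Session class used later in the service layer) come from the Ganymed SSH-2 library. If your project does not already pull it in, a Maven dependency along the following lines should work; the version shown is only an assumption, use whichever release your build already standardizes on:

<dependency>
    <groupId>ch.ethz.ganymed</groupId>
    <artifactId>ganymed-ssh2</artifactId>
    <!-- assumed version; any recent release of Ganymed SSH-2 should do -->
    <version>262</version>
</dependency>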

2. Controller layer

    /**
     * @description: start the crawler
     * @author: cyb
     * @date: 2018-11-14 15:59
     * @param: id
     * @param: execute
     * @return: java.util.Map<java.lang.String, java.lang.Object>
     */
    @RequestMapping("openTask")
    @ResponseBody
    public Map<String, Object> openTask(String id, Boolean execute) {
        Map<String, Object> map = new HashMap<>();
        //look up the task details by id
        BsKnowledgeInfoDTO knowledgeInfoDTO = knolegeService.getDataInfoById(id);
        execute = !execute; //toggle the task status (start / stop)
        knowledgeInfoDTO.setExecute(execute);
        int k = knolegeService.updateDataInfo(knowledgeInfoDTO);
//        StringBuilder url = new StringBuilder(knowledgeInfoDTO.getPath()); //crawl target path
        StringBuilder url = new StringBuilder("https://mil.news.sina.com.cn/");
        StringBuilder reptileMethod = new StringBuilder("http://192.168.200.8:8000/news"); //crawler endpoint http://192.168.200.8:8000/news
        StringBuilder themeid = new StringBuilder("hottopic"); //name of the Elasticsearch index used for storage
        //expected form: http://192.168.200.8:8000/news?themeid=hottopic&url=https://mil.news.sina.com.cn/history/2018-11-15/doc-ihmutuec0443667.shtml
        StringBuilder path = reptileMethod.append("?").append("themeid=").append(themeid).append("&").append("url=").append(url);
        String ip = "192.168.200.8"; //Linux server IP
        String userName = "root";
        String userPwd = "yjlc20148";
        int w = knolegeService.reptile(path.toString(), ip, userName, userPwd);
        if (w == 200) {
            map.put("code", 200);
            map.put("message", "crawl succeeded!");
        } else if (w == 206) {
            map.put("code", 206);
            map.put("message", "connection failed!");
        }
        return map;
    }
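Note that the target url is appended to the query string as-is, so characters such as ":" and "/" in it are not escaped; whether that matters depends on how the Python side parses the parameter. Purely as an optional, illustrative sketch (the class and method names below are made up for the example and are not part of the original project), the parameter could be URL-encoded before it is appended:

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

public class PathBuilderSketch {
    //hypothetical helper, not part of the original project:
    //encodes the crawl target before appending it as a query parameter
    public static String buildPath(String reptileMethod, String themeid, String url) throws UnsupportedEncodingException {
        return reptileMethod + "?themeid=" + themeid + "&url=" + URLEncoder.encode(url, "UTF-8");
    }

    public static void main(String[] args) throws Exception {
        System.out.println(buildPath("http://192.168.200.8:8000/news", "hottopic", "https://mil.news.sina.com.cn/"));
    }
}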

3. Service layer (the service interface layer is omitted here)

/**
 * @description: run the crawler
 * @author: cyb
 * @date: 2018-11-15 20:52
 * @param: path crawler endpoint + ES index name + crawl target url, assembled into one request path
 * @param: ip server IP address
 * @param: userName user name
 * @param: userPwd user password
 * @return: int
 */
@Override
public int reptile(String path, String ip, String userName, String userPwd) {
    SingletonUtil singletonUtil = SingletonUtil.getInstance(ip, userName, userPwd);
    Boolean b = SingletonUtil.flag; //whether the connection and authentication succeeded
    if (b) {
        System.out.println("===== step one =====");
        Session session = null; //session for running the remote command
        try {
            session = SingletonUtil.conn.openSession();
            session.execCommand("sh /opt/zc/linux_sina.sh"); //run the crawler shell script
            //TODO: support multiple commands
            String result = SingletonUtil.processStdout(session.getStdout(), SingletonUtil.DEFAULTCHART);
            //an empty standard output means the script failed, so read stderr instead
            if (StringUtils.isBlank(result)) {
                System.out.println("script failed");
                result = SingletonUtil.processStdout(session.getStderr(), SingletonUtil.DEFAULTCHART);
            }
            System.out.println("step one: script finished, output = " + result);
            ConnectNetworkUtil connectNetworkUtil = new ConnectNetworkUtil();
            connectNetworkUtil.ConnectNetwork(path); //call the crawler HTTP endpoint
            System.out.println("crawl succeeded!");
        } catch (IOException e) {
            e.printStackTrace();
            return 206; //treat the failure as a connection error
        } finally {
            if (session != null) {
                session.close(); //close the session
            }
            SingletonUtil.conn.close(); //close the SSH connection
        }
        return 200; //crawl succeeded
    } else {
        return 206; //connection failed
    }
}
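ConnectNetworkUtil is the HTTP helper from the previous post: it simply issues a GET request to the assembled crawler path. Its real implementation is not reproduced here; purely as a rough sketch of what such a call can look like (the class name and details below are assumptions, the actual utility may differ):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

//minimal sketch of an HTTP GET helper; the real ConnectNetworkUtil may differ
public class ConnectNetworkUtilSketch {

    public String connectNetwork(String path) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(path).openConnection();
        conn.setRequestMethod("GET");
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(60000); //the crawl itself may take a while
        StringBuilder body = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line).append("\n");
            }
        } finally {
            conn.disconnect();
        }
        return body.toString(); //response body returned by the crawler service
    }
}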

 

The service interface layer and the Java-to-Elasticsearch utility class are omitted above (both were covered in the previous post). The code is for reference only; if anything in it is unreasonable or non-standard, please point it out. Technology grows through exchange!

 

Reposted from: https://www.cnblogs.com/chenyuanbo/p/9973769.html
